diff --git a/.github/actions/test/action.yml b/.github/actions/test/action.yml index 4a4a29c52..6b2ed9e9d 100644 --- a/.github/actions/test/action.yml +++ b/.github/actions/test/action.yml @@ -23,7 +23,7 @@ runs: with: venv: ${{ inputs.venv }} name: compressed - extra: "[dev,accelerate]" + extra: "[dev]" - name: clean up run: | diff --git a/.github/workflows/test-check.yaml b/.github/workflows/test-check.yaml index 2407b9eaa..f3cc04f93 100644 --- a/.github/workflows/test-check.yaml +++ b/.github/workflows/test-check.yaml @@ -30,7 +30,7 @@ jobs: - name: Set Env run: pip3 install --upgrade pip setuptools - name: "⚙️ Install dependencies" - run: pip3 install .[dev,accelerate] + run: pip3 install .[dev] - name: clean up run: | echo "cleaning up disk space as GHA runner has limited disk size." diff --git a/setup.py b/setup.py index a37bd9d2d..fc59b3d84 100644 --- a/setup.py +++ b/setup.py @@ -92,7 +92,7 @@ def _setup_install_requires() -> List: def _setup_extras() -> Dict: return { - "dev": ["black==22.12.0", "isort==5.8.0", "wheel>=0.36.2", "flake8>=3.8.3", "pytest>=6.0.0", "nbconvert>=7.16.3", "transformers<5.0"], + "dev": ["black==22.12.0", "isort==5.8.0", "wheel>=0.36.2", "flake8>=3.8.3", "pytest>=6.0.0", "nbconvert>=7.16.3", "transformers<5.0", "accelerate"], "accelerate": ["accelerate"] } diff --git a/src/compressed_tensors/linear/compressed_linear.py b/src/compressed_tensors/linear/compressed_linear.py index d24df2fcd..014aeef9d 100644 --- a/src/compressed_tensors/linear/compressed_linear.py +++ b/src/compressed_tensors/linear/compressed_linear.py @@ -87,12 +87,6 @@ def from_linear( # mark module as compressed module.quantization_status = QuantizationStatus.COMPRESSED - # handles case where forward is wrapped in new_forward by accelerate hooks - if hasattr(module, "_old_forward"): - module._old_forward = CompressedLinear.forward.__get__( - module, CompressedLinear - ) - return module def forward(self, input: Tensor) -> Tensor: diff --git a/src/compressed_tensors/offload/__init__.py b/src/compressed_tensors/offload/__init__.py index 072dbdf7a..ab86b2d00 100644 --- a/src/compressed_tensors/offload/__init__.py +++ b/src/compressed_tensors/offload/__init__.py @@ -135,9 +135,7 @@ def register_offload_module(base: torch.nn.Module, name: str, module: torch.nn.M """ cache = base._parameters if isinstance(cache, OffloadCache): - offload_module( - module, cache.onload_device, cache.offload_device, no_split=False - ) + offload_module(module, cache.onload_device, cache.offload_device) base.register_module(name, module) @@ -178,9 +176,12 @@ def align_module_device( if isinstance(module._parameters, OffloadCache): assert isinstance(module._buffers, OffloadCache) with module._parameters.disable_offloading(): - with patch_attr( - module._parameters, "onload_device", execution_device - ), patch_attr(module._buffers, "onload_device", execution_device): + if execution_device is not None: + with patch_attr( + module._parameters, "onload_device", execution_device + ), patch_attr(module._buffers, "onload_device", execution_device): + yield + else: yield else: diff --git a/src/compressed_tensors/offload/dispatch.py b/src/compressed_tensors/offload/dispatch.py index 5206e10f4..1f5e4fbf2 100644 --- a/src/compressed_tensors/offload/dispatch.py +++ b/src/compressed_tensors/offload/dispatch.py @@ -39,7 +39,7 @@ def offload_model( model: ModelType, onload_device: torch.device | str, - offload_device: Optional[torch.device | str | Literal["disk"]] = None, + offload_device: torch.device | str | Literal["disk"] = torch.device("cpu"), ) -> ModelType: """ Offload a model to the `offload_device`. During forward passes, model weights will diff --git a/src/compressed_tensors/quantization/lifecycle/initialize.py b/src/compressed_tensors/quantization/lifecycle/initialize.py index 8c1b251c5..f8ef79dcf 100644 --- a/src/compressed_tensors/quantization/lifecycle/initialize.py +++ b/src/compressed_tensors/quantization/lifecycle/initialize.py @@ -23,6 +23,7 @@ QuantizedAttentionImpl, QuantizedKVCache, ) +from compressed_tensors.offload import unwrap_offload_forward from compressed_tensors.quantization import ( ActivationOrdering, DynamicType, @@ -37,7 +38,6 @@ ) from compressed_tensors.quantization.utils import strategy_cdiv from compressed_tensors.utils import ( - disable_hf_hook, get_execution_device, get_head_dim, get_num_attn_heads, @@ -134,7 +134,7 @@ def initialize_module_for_quantization( force_zero_point=force_zero_point, ) - with disable_hf_hook(module): + with unwrap_offload_forward(module): # wrap forward call of module to perform # quantized actions based on calltime status wrap_module_forward_quantized(module, scheme) diff --git a/src/compressed_tensors/transform/apply.py b/src/compressed_tensors/transform/apply.py index ade267234..e247e7029 100644 --- a/src/compressed_tensors/transform/apply.py +++ b/src/compressed_tensors/transform/apply.py @@ -12,12 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Dict - import torch from compressed_tensors import TRANSFORM_CONFIG_NAME from compressed_tensors.transform import TransformConfig, TransformFactory -from compressed_tensors.utils.offload import has_offloaded_params __all__ = ["apply_transform_config"] @@ -37,35 +34,3 @@ def apply_transform_config(model: torch.nn.Module, config: TransformConfig): # attach config to model for compression/serialization setattr(model, TRANSFORM_CONFIG_NAME, config) - - # ensure that tied weight transforms can be serialized without aliases - # In the future, this could be done by transformers or model compressor - # which would make this more robust to changing dispatches after transforms - _tie_offloaded_tensors(model) - - -def _tie_offloaded_tensors(model: torch.nn.Module): - """ - When accelerate replaces tensors with meta tensors during offloading, the meta - tensors may not be identical, even if the offloaded values are identical. - - However, transformers can only serialize correctly if meta tensors are identical - (see transformers#39263). - - This function collects all meta tensors which have shared offloaded values and sets - those tensors to be identical so that they can be removed during serialization - - :param model: model potentially containing offloaded meta tensors to fix - """ - - # ensure that if a location shares an offloaded tensor pointers, that the - # meta tensor is also identical (assigned to the first instance of parameter) - ptr_to_meta: Dict[int, torch.nn.Parameter] = dict() - for module in model.modules(): - if has_offloaded_params(module): - for key, _ in module.named_parameters(recurse=False): - offloaded_ptr = module._hf_hook.weights_map[key].data_ptr() - - if offloaded_ptr not in ptr_to_meta: - ptr_to_meta[offloaded_ptr] = getattr(module, key) - setattr(module, key, ptr_to_meta[offloaded_ptr]) diff --git a/src/compressed_tensors/transform/factory/base.py b/src/compressed_tensors/transform/factory/base.py index 96f15c9da..c54328928 100644 --- a/src/compressed_tensors/transform/factory/base.py +++ b/src/compressed_tensors/transform/factory/base.py @@ -26,6 +26,7 @@ initialize_hooked_kv_cache, register_key_hook, ) +from compressed_tensors.offload import OffloadCache from compressed_tensors.registry.registry import RegistryMixin, T from compressed_tensors.transform import ( TransformArgs, @@ -34,8 +35,6 @@ ) from compressed_tensors.utils import ( align_module_device, - delete_offload_module, - has_offloaded_params, match_named_modules, patch_attr, register_offload_module, @@ -116,13 +115,6 @@ def _apply_to_module(self, model: Module, module: Module, args: TransformArgs): :param module: target module to apply transforms to :param args: defines how the transform will be applied to the target module """ - if has_offloaded_params(module): - if module._hf_hook.place_submodules: - raise NotImplementedError( - "Applying transforms to offloaded submodules with " - "`place_submodules=True` is not supported" - ) - # create transform as submodule transform_name = f"{self.name}_{args.location}" transform = self.create_transform(module, args) @@ -150,13 +142,13 @@ def input_hook(_, args): if self.scheme.requires_grad: # for training, the weight changes with every forward pass # so we can leverage parametrization to propagate the gradient - if has_offloaded_params(module): + if isinstance(module._parameters, OffloadCache): raise ValueError("Offloaded training is not supported") P.register_parametrization(module, "weight", transform) else: # transform is no longer needed (unfusing is not supported) - delete_offload_module(module, transform_name) + delattr(module, transform_name) # register output transformation hook elif args.location == TransformLocation.OUTPUT: diff --git a/src/compressed_tensors/utils/offload.py b/src/compressed_tensors/utils/offload.py index 01d833a77..e5c9bf13d 100644 --- a/src/compressed_tensors/utils/offload.py +++ b/src/compressed_tensors/utils/offload.py @@ -12,59 +12,31 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Utilities associated with offloading functionality provided by `accelerate`. +Utilities associated with offloading functionality | ------------------------------------------------------------------------------------------------------ | # noqa: E501 | Operation | Without offloading support | With offloading support | # noqa: E501 | ---------- | -------------------------------------- | ------------------------------------------------ | # noqa: E501 -| Add | module.register_parameter(name, param) | register_offload_parameter(module, name, param) | # noqa: E501 -| Check | N/A | has_offloaded_params(module) | # noqa: E501 -| Onload | N/A | with align_module_device(module) | # noqa: E501 | Update | module.name.data.copy_(new_data) | update_offload_parameter(module, name, new_data) | # noqa: E501 -| Delete | del module.name | delete_offload_parameter(module, name) | # noqa: E501 -| Add Module | module.register_module(name, child) | register_offload_module(name, child) | # noqa: E501 -| Del Module | del module.name | delete_offload_module(module, name) | # noqa: E501 | ------------------------------------------------------------------------------------------------------ | # noqa: E501 """ import contextlib -import warnings -from functools import wraps -from operator import attrgetter -from typing import Any, Callable, Dict, Iterable, Literal, Optional, Tuple, Union +from typing import Literal, Optional import torch -from compressed_tensors.utils import patch_attr - - -try: - from accelerate.hooks import ( - AlignDevicesHook, - add_hook_to_module, - attach_align_device_hook, - named_module_tensors, - remove_hook_from_module, - ) - from accelerate.utils import ( - OffloadedWeightsLoader, - PrefixedDataset, - find_tied_parameters, - set_module_tensor_to_device, - ) - - _has_accelerate = True - -except ImportError: - _has_accelerate = False - AlignDevicesHook = None - add_hook_to_module = None - remove_hook_from_module = None - OffloadedWeightsLoader = None - PrefixedDataset = None - set_module_tensor_to_device = None - named_module_tensors = None - attach_align_device_hook = None - find_tied_parameters = None +from compressed_tensors.offload import ( + align_module_device, + align_modules, + disable_offloading, + get_execution_device, + get_offloaded_device, + offload_model, + register_offload_module, + remove_dispatch, + update_offload_parameter, +) +from compressed_tensors.utils.helpers import deprecated __all__ = [ @@ -85,51 +57,11 @@ "disable_offloading", "remove_dispatch", "cast_to_device", + "offload_to_weights_map", + "delete_from_weights_map", ] -def check_accelerate(fallback: Any): - def decorator(func: Callable[[Any], Any]): - if not _has_accelerate: - if fallback == "error": - - @wraps(func) - def fallback_fn(*args, **kwargs): - raise ValueError( - "Please install `accelerate` in order to use this function" - ) - - else: - - @wraps(func) - def fallback_fn(*args, **kwargs): - return fallback - - return fallback_fn - - return func - - return decorator - - -""" Candidates for Depreciation """ - - -def get_offloaded_device(module: torch.nn.Module) -> torch.device: - """ - :param module: module to check - :return: device module is offloaded to onto after forward pass - """ - if has_offloaded_params(module): - first_key = list(module._hf_hook.weights_map.keys())[0] - prefix_dataset = module._hf_hook.weights_map.dataset - return prefix_dataset[first_key].device - else: - # if the module is not offloaded, then any addded weights - # should be placed the module's execution device - return get_execution_device(module) - - def update_parameter_data( module: torch.nn.Module, new_param_data: torch.Tensor, param_name: str ): @@ -147,7 +79,8 @@ def update_parameter_data( """ Candidates for Upstreaming """ -def cast_to_device(device_spec: Union[int, torch.device]) -> torch.device: +@deprecated() +def cast_to_device(device_spec: int | torch.device) -> torch.device: """ Convert an integer device index or torch.device into a torch.device object. @@ -160,31 +93,12 @@ def cast_to_device(device_spec: Union[int, torch.device]) -> torch.device: return device_spec -def get_execution_device(module: torch.nn.Module) -> torch.device: - """ - Get the device which inputs should be moved to before module execution. - Assume that modules execute in the same order as returned by `model.modules()` - - :param module: module to check, may be offloaded - :return: onload device of module - """ - for submodule in module.modules(): - if has_offloaded_params(submodule): - return cast_to_device(submodule._hf_hook.execution_device) - - param = next(submodule.parameters(recurse=False), None) - if param is not None: - return param.device - - warnings.warn(f"Unable to get execution device of {module}, falling back to CPU") - return torch.device("cpu") - - +@deprecated("module.register_parameter(name, parameter)") def register_offload_parameter( module: torch.nn.Module, name: str, parameter: torch.nn.Parameter, - offload_device: Optional[Union[torch.device, Literal["disk"]]] = None, + offload_device: Optional[torch.device | Literal["disk"]] = None, ): """ Register a parameter to the given module which may be offloaded @@ -195,64 +109,13 @@ def register_offload_parameter( :param offload_device: device on which weight will be offloaded to. If None is provided, then infer device from parameters on module """ - has_onload = any(p.device != torch.device("meta") for p in module.parameters()) - module.register_parameter(name, parameter) - - # do everything AlignDevicesHook.init_hook does - # https://github.com/huggingface/accelerate/blob/main/src/accelerate/hooks.py#L281 - if has_offloaded_params(module): - hook: AlignDevicesHook = module._hf_hook - assert hook.weights_map is not None - - # append to original_devices - hook.original_devices[name] = parameter.device - - # append to weights map - offload_to_weights_map(hook.weights_map, name, parameter.data, offload_device) - - # append to tied_params_map - offloaded = hook.weights_map[name] - if hook.tied_params_map is not None: - hook.tied_params_map[offloaded.data_ptr()] = {} # (1) - - # perform offloading - if not has_onload: - set_module_tensor_to_device(module, name, "meta") - - -def update_offload_parameter( - module: torch.nn.Module, - name: str, - data: torch.Tensor, - offload_device: Optional[Union[torch.device, Literal["disk"]]] = None, -): - """ - Update the data of an existing parameter and its offload dict. Supports both - parameters of offloaded modules and non-offloaded modules - - :param module: module containing the parameter to update - :param name: name of module parameter to update - :param data: tensor to update parameter with - :param offload_device: device on which weight will be offloaded to. If None is - provided, then infer device from parameters on module - """ - param: torch.nn.Parameter = getattr(module, name) - if param.data.shape != data.shape: - warnings.warn( - f"Shape of parameter being updated {param.data.shape} does not match shape " - f"of update data {data.shape}" - ) - - # copy data into onloaded parameter if applicable - if param.device != torch.device("meta") and data is not param.data: - param.data.copy_(data) + if offload_device == "disk": + raise NotImplementedError("Disk offloading is not currently supported") - # update offload dict - if has_offloaded_params(module): - weights_map = module._hf_hook.weights_map - offload_to_weights_map(weights_map, name, data, offload_device) + module.register_parameter(name, parameter) +@deprecated("delattr(module, name)") def delete_offload_parameter(module: torch.nn.Module, name: str): """ Delete a parameter from a module which may be offloaded, @@ -263,347 +126,60 @@ def delete_offload_parameter(module: torch.nn.Module, name: str): """ delattr(module, name) - if has_offloaded_params(module): - weights_map = module._hf_hook.weights_map - delete_from_weights_map(weights_map, name) - module._hf_hook.tied_params_names -= set(name) - if name in module._hf_hook.original_devices: - del module._hf_hook.original_devices[name] - if name in module._hf_hook.param_original_devices: - del module._hf_hook.param_original_devices[name] - if name in module._hf_hook.buffer_original_devices: - del module._hf_hook.param_original_devices[name] - - -@check_accelerate(fallback=contextlib.nullcontext()) +@deprecated("compressed_tensors.offload::unwrap_offload") @contextlib.contextmanager def disable_hf_hook(module: torch.nn.Module): - hooks = {} - - def collect_hooks(module): - if hasattr(module, "_hf_hook"): - hooks[module] = module._hf_hook - remove_hook_from_module(module) - - module.apply(collect_hooks) - - yield - - for submodule, hook in hooks.items(): - add_hook_to_module(submodule, hook) - - -@check_accelerate(fallback=None) -def offload_to_weights_map( - weights_map: Union[PrefixedDataset, Dict, OffloadedWeightsLoader], - key: str, - value: torch.Tensor, - offload_device: Optional[Union[torch.device, Literal["disk"]]] = None, -): - """ - Helper function which implements offloaded item assignment for PrefixedDataset, - OffloadedWeightsLoader, and Dict types. - - :param weights_map: weight map to be updated with offload information - :param key: key used to identify weight location - :param value: weight being offloaded - :param offload_device: device on which weight will be offloaded to. If None is - provided, then infer device from parameters in weights_map - """ - if isinstance(weights_map, PrefixedDataset): - if offload_device == "disk": - raise ValueError(f"Cannot offload to disk with type {type(weights_map)}") - - dataset = weights_map.dataset - key = f"{weights_map.prefix}{key}" - offload_to_weights_map(dataset, key, value, offload_device) - - elif isinstance(weights_map, OffloadedWeightsLoader): - if key not in weights_map.all_keys: - weights_map.all_keys.append(key) - - if len(weights_map.index) <= 0 and offload_device != "disk": - offload_to_weights_map(weights_map.state_dict, key, value, offload_device) - - else: - raise NotImplementedError( - "Updating weights_map with disk offloading is not implemented yet" - ) - - elif isinstance(weights_map, dict): - if offload_device == "disk": - raise ValueError(f"Cannot offload to disk with type {type(weights_map)}") - - # infer offload device - if offload_device is None: - if key in weights_map: - offload_device = weights_map[key].device - else: - tens = next(iter(weights_map.values()), None) - if tens is None: - raise ValueError( - "Cannot infer offload device from empty weights_map" - ) - offload_device = tens.device - - weights_map[key] = value.to(device=offload_device) - - else: - raise NotImplementedError( - "Updating offload data not implemented for weights_map of type " - f"{type(weights_map)}" - ) - - -@check_accelerate(fallback=None) -def delete_from_weights_map( - weights_map: Union[PrefixedDataset, Dict, OffloadedWeightsLoader], - key: str, -): - if isinstance(weights_map, PrefixedDataset): - dataset = weights_map.dataset - key = f"{weights_map.prefix}{key}" - delete_from_weights_map(dataset, key) - - elif isinstance(weights_map, OffloadedWeightsLoader): - if len(weights_map.index) <= 0: - delete_from_weights_map(weights_map.state_dict, key) - - else: - raise NotImplementedError( - "Delete from weights_map with disk offloading is not implemented yet" - ) - - elif isinstance(weights_map, dict): - del weights_map[key] - - else: - raise NotImplementedError( - "Updating offload data not implemented for weights_map of type " - f"{type(weights_map)}" - ) - - -@check_accelerate(fallback=contextlib.nullcontext()) -@contextlib.contextmanager -def disable_offload(module: torch.nn.Module): - """ - Context manager to disable module onloading and offloading. Parameters will stay on - their current device - - :param module: module to disable offloading for - """ - if has_offloaded_params(module): - module._hf_hook.offload = False - yield - module._hf_hook.offload = True - else: - yield - - -@check_accelerate(fallback=contextlib.nullcontext()) -@contextlib.contextmanager -def align_modules( - modules: Union[torch.nn.Module, Iterable[torch.nn.Module]], - execution_device: Optional[torch.device] = None, -): - """ - Context manager for onloading modules to a device, and disabling onload and offload - attempts triggered by forward calls. Used for sequential onloading of layers - - :param modules: `torch.nn.Module` or iterable of `torch.nn.Module`s to onload - :param execution_device: device to onload to - """ - modules = (modules,) if isinstance(modules, torch.nn.Module) else modules - - with contextlib.ExitStack() as stack: - for module in modules: - stack.enter_context(align_module_device(module, execution_device)) - stack.enter_context(disable_offload(module)) # disable redundant onloading - yield - - -def register_offload_module(base: torch.nn.Module, name: str, module: torch.nn.Module): - """ - Register a submodule with offloading if the parent module is offloaded - - :param base: module to attach submodule to - :param name: name of submodule - :param module: submodule to attach - """ - - if has_offloaded_params(base): - hook: AlignDevicesHook = base._hf_hook - assert hook.offload - assert hook.weights_map is not None - - # offloading kwargs for submodule - place_submodules = False - offload_buffers = True - - # copy device offloading arguments from parent - current_device = next(base.parameters()).device # assume base has parameters - offload_device = get_offloaded_device(base) - - # offload parameters to weights map - for param_name, param in named_module_tensors( - module, include_buffers=offload_buffers, recurse=place_submodules - ): - offloaded = param.to(offload_device) - if hook.tied_params_map is not None: - hook.tied_params_map[offloaded.data_ptr()] = {} # (1) - offload_to_weights_map(hook.weights_map, f"{name}.{param_name}", offloaded) - - # if the parent places submodules, offload here - if hook.place_submodules: - set_module_tensor_to_device(module, param_name, current_device) - - # if the parent does not place submodules, then add a hook - # parameters are offloaded by `add_hook_to_module` - if not hook.place_submodules: - weights_map = PrefixedDataset( - hook.weights_map.dataset, prefix=f"{hook.weights_map.prefix}{name}." - ) - - submodule_hook = AlignDevicesHook( - execution_device=hook.execution_device, - offload=hook.offload, - io_same_device=False, - weights_map=weights_map, - offload_buffers=offload_buffers, - place_submodules=place_submodules, - skip_keys=None, - tied_params_map=hook.tied_params_map, - ) - add_hook_to_module(module, submodule_hook) - - base.register_module(name, module) + raise ValueError() +@deprecated("delattr(base, name)") def delete_offload_module(base: torch.nn.Module, name: str): """ Delete a submodule from a model which may contain offloading :param base: parent module to delete submodule from :param name: name of submodule on parent """ - module: torch.nn.Module = getattr(base, name) - - for param_name, _ in list(module.named_parameters()): - delete_offload_parameter(module, param_name) - delattr(base, name) -@check_accelerate(fallback="error") +@deprecated("compressed_tensors.offload::offload_model") def offloaded_dispatch( module: torch.nn.Module, execution_device: torch.device, - offload_device: Union[torch.device, Literal["disk"]] = torch.device("cpu"), + offload_device: Optional[torch.device | Literal["disk"]] = None, ) -> torch.nn.Module: """ - Unlike `dispatch_model`, this function forces a module (and its submodules) to - offload all parameters and replace them with meta tensors, utiliizing the - `AlignDevicesHook` to control onloading and offloading. + Dispatch a model, keeping device parameters offloaded on their current device :param module: module containing parameters to offload :param execution_device: device that modules will onload and execute on :param offload_device: device that module parameters will offload to :return: module with offloading device hooks """ - if offload_device == "disk": - raise NotImplementedError("Disk offloading is not currently supported") - - # remove any existing hooks - remove_dispatch(module) - - # create weights map - state_dict = module.state_dict() - state_dict = {key: val.to(offload_device) for key, val in state_dict.items()} - weights_map = OffloadedWeightsLoader(state_dict=state_dict, device=offload_device) - - # create tied params map - tied_params = find_tied_parameters(module) - tied_params_map = {} - for group in tied_params: - for param_name in group: - data_ptr = attrgetter(param_name)(module).data_ptr() - tied_params_map[data_ptr] = {} - - # recursively attaches hooks to all submodules - attach_align_device_hook( - module, - execution_device=execution_device, - offload=True, - weights_map=weights_map, - tied_params_map=tied_params_map, - ) - - # when saving a model, `PretrainedModel.save_pretrained` will only - # onload weights if the following requirements are met - # if ( - # hasattr(self, "hf_device_map") - # and len(set(self.hf_device_map.values())) > 1 - # and ("cpu" in self.hf_device_map.values() - # or "disk" in self.hf_device_map.values()) - # ): - # because this function always offloads, disregard actual devices and - # always use `cpu` and `cuda:0` to guarantee this condition passes - setattr(module, "hf_device_map", {"fake_offload": "cpu", "fake_exec": "cuda:0"}) - - return module - - -def remove_dispatch(module: torch.nn.Module) -> torch.nn.Module: - """ - Remove any existing dispatches from module - - :param module: module which may be dispatched with hf hooks - :return: module without dispatch - """ - remove_hook_from_module(module, recurse=True) - if hasattr(module, "hf_device_map"): - delattr(module, "hf_device_map") - module.to("cpu") - - return module - + if offload_device is not None: + raise ValueError( + "Passing offload_device to offloaded_dispatch is no longer supported" + ) + offload_model(module, execution_device) -@contextlib.contextmanager -def disable_offloading(): - """ - Keep modules onloaded and disable offloading until this context exits. - Affects modules which have been hooked with accelerate's `AlignDevicesHook` - """ - original_pre_forward = AlignDevicesHook.pre_forward - onloaded_modules: Dict[torch.nn.Module, Tuple[AlignDevicesHook, bool]] = dict() - # onload once and disable any future onloading/offloading steps - def keep_onload_pre_forward(self: AlignDevicesHook, module, *args, **kwargs): - ret = original_pre_forward(self, module, *args, **kwargs) - if module not in onloaded_modules: - onloaded_modules[module] = (self, self.offload) - self.offload = False - return ret +@deprecated("compressed_tensors.offload::align_module_device") +def disable_offload(module: torch.nn.Module): + raise ValueError() - # use the patched pre_forward function within the context - with patch_attr(AlignDevicesHook, "pre_forward", keep_onload_pre_forward): - yield - # manually offload all modules that were onloaded - # update any parameters which may have changed - for module, (hook, offload) in onloaded_modules.items(): - hook.offload = offload - for name, param in module.named_parameters(recurse=False): - update_offload_parameter(module, name, param.data) - hook.post_forward(module, None) +@deprecated() +def offload_to_weights_map(*args, **kwargs): + raise ValueError() -""" Upstreamed Functions """ +@deprecated() +def delete_from_weights_map(*args, **kwargs): + raise ValueError() -# introduced in accelerate v1.1.0 -@check_accelerate(fallback=False) +@deprecated() def has_offloaded_params(module: torch.nn.Module) -> bool: """ Checks if a module has offloaded parameters by checking if the given module has a @@ -616,57 +192,4 @@ def has_offloaded_params(module: torch.nn.Module) -> bool: bool: `True` if the module has an offload hook and offloading is enabled, `False` otherwise. """ - return ( - hasattr(module, "_hf_hook") - and isinstance(module._hf_hook, AlignDevicesHook) - and module._hf_hook.offload - ) - - -# introduced in accelerate v1.1.0 -@check_accelerate(fallback=contextlib.nullcontext()) -@contextlib.contextmanager -def align_module_device( - module: torch.nn.Module, execution_device: Optional[torch.device] = None -): - """ - Context manager that moves a module's parameters to the specified execution device. - - Args: - module (`torch.nn.Module`): - Module with parameters to align. - execution_device (`torch.device`, *optional*): - If provided, overrides the module's execution device within the context. - Otherwise, use hook execution device or pass - """ - if has_offloaded_params(module): - if execution_device is not None: - original_device = module._hf_hook.execution_device - module._hf_hook.execution_device = execution_device - - try: - module._hf_hook.pre_forward(module) - yield - finally: - module._hf_hook.post_forward(module, None) - if execution_device is not None: - module._hf_hook.execution_device = original_device - - elif execution_device is not None: - devices = { - name: param.device for name, param in module.named_parameters(recurse=False) - } - try: - for name in devices: - set_module_tensor_to_device(module, name, execution_device) - yield - finally: - for name, device in devices.items(): - set_module_tensor_to_device(module, name, device) - - else: - yield - - -# (1): Since we cannot know which pointers are shared when we add parameters in an -# online way, assume that all pointers are shared. This has virtually no runtime cost + return False diff --git a/tests/test_quantization/lifecycle/test_apply.py b/tests/test_quantization/lifecycle/test_apply.py index caf679781..d18b08c46 100644 --- a/tests/test_quantization/lifecycle/test_apply.py +++ b/tests/test_quantization/lifecycle/test_apply.py @@ -32,7 +32,6 @@ ) from compressed_tensors.quantization.lifecycle import apply_quantization_config from compressed_tensors.utils import is_match, match_named_modules -from tests.testing_utils import requires_accelerate from transformers import AutoModelForCausalLM @@ -322,7 +321,6 @@ def get_sample_tinyllama_quant_config( return QuantizationConfig.model_validate(config_dict) -@requires_accelerate() @pytest.mark.parametrize( "target,should_raise_warning", [ @@ -462,12 +460,8 @@ def test_multi_apply_quantization_config(): ) -@requires_accelerate() def test_apply_kv_cache(): - from accelerate import init_empty_weights - - with init_empty_weights(): - model = AutoModelForCausalLM.from_pretrained("nm-testing/llama2.c-stories15M") + model = AutoModelForCausalLM.from_pretrained("nm-testing/llama2.c-stories15M") args = QuantizationArgs( num_bits=8, @@ -486,12 +480,8 @@ def test_apply_kv_cache(): assert hasattr(layer.self_attn, "v_scale") -@requires_accelerate() def test_apply_attention(): - from accelerate import init_empty_weights - - with init_empty_weights(): - model = AutoModelForCausalLM.from_pretrained("nm-testing/llama2.c-stories15M") + model = AutoModelForCausalLM.from_pretrained("nm-testing/llama2.c-stories15M") scheme = QuantizationScheme( targets=["LlamaAttention"], diff --git a/tests/test_quantization/lifecycle/test_initialize.py b/tests/test_quantization/lifecycle/test_initialize.py index e463ea2c2..3bb8d407e 100644 --- a/tests/test_quantization/lifecycle/test_initialize.py +++ b/tests/test_quantization/lifecycle/test_initialize.py @@ -17,6 +17,7 @@ import pytest import torch +from compressed_tensors.offload import offload_model from compressed_tensors.quantization import ( FP8_E4M3_DATA, ActivationOrdering, @@ -28,7 +29,7 @@ from compressed_tensors.quantization.lifecycle.initialize import ( initialize_module_for_quantization, ) -from tests.testing_utils import requires_accelerate +from tests.testing_utils import requires_gpu from torch.nn import Linear @@ -98,7 +99,7 @@ def test_initialize_module_for_quantization( assert layer.quantization_status == QuantizationStatus.INITIALIZED -@requires_accelerate() +@requires_gpu @pytest.mark.parametrize( "weights,input_activations", [ @@ -119,9 +120,7 @@ def test_initialize_module_for_quantization( def test_initialize_module_for_quantization_offloaded( create_quantization_scheme, weights, input_activations, layer ): - from accelerate.hooks import attach_align_device_hook - - attach_align_device_hook(layer, offload=True) + offload_model(layer, "cuda:0") test_initialize_module_for_quantization( create_quantization_scheme, diff --git a/tests/test_transform/factory/test_correctness.py b/tests/test_transform/factory/test_correctness.py index 1fdbc3a00..0c6a7bc28 100644 --- a/tests/test_transform/factory/test_correctness.py +++ b/tests/test_transform/factory/test_correctness.py @@ -14,6 +14,7 @@ import pytest import torch +from compressed_tensors.offload import offload_model from compressed_tensors.transform import ( TransformArgs, TransformConfig, @@ -21,9 +22,8 @@ TransformScheme, apply_transform_config, ) -from compressed_tensors.utils import offloaded_dispatch from tests.test_transform.conftest import MockAttention, MockAttentionModel -from tests.testing_utils import requires_accelerate, requires_gpu +from tests.testing_utils import requires_gpu @pytest.mark.parametrize("type", ("hadamard", "random-hadamard", "random-matrix")) @@ -89,16 +89,16 @@ def test_correctness_embedding(type, randomize, embed_loc, linear_loc): assert torch.allclose(true_output, output, atol=1e-5, rtol=0.0) +@requires_gpu @pytest.mark.parametrize("type", ("hadamard", "random-hadamard", "random-matrix")) @pytest.mark.parametrize("randomize", (True, False)) @pytest.mark.parametrize("input_batch_size", (1, 5, 17)) -def test_correctness_model( - type, randomize, input_batch_size, model_apply, offload=False -): +@pytest.mark.parametrize("offload", (True, False)) +def test_correctness_model(type, randomize, input_batch_size, model_apply, offload): # load model model = model_apply[0] if offload: - model = offloaded_dispatch(model, torch.device("cuda")) + offload_model(model, torch.device("cuda")) # get output input = torch.rand((input_batch_size, 5, model.fcs[0].in_features)) @@ -119,15 +119,6 @@ def test_correctness_model( assert torch.allclose(true_output, output, atol=1e-5, rtol=0.0) -@requires_gpu -@requires_accelerate() -@pytest.mark.parametrize("type", ("hadamard", "random-hadamard", "random-matrix")) -@pytest.mark.parametrize("randomize", (True, False)) -@pytest.mark.parametrize("input_batch_size", (1, 5, 17)) -def test_correctness_model_offload(type, randomize, input_batch_size, model_apply): - test_correctness_model(type, randomize, input_batch_size, model_apply, offload=True) - - @pytest.mark.parametrize("type", ("hadamard", "random-hadamard", "random-matrix")) @pytest.mark.parametrize("randomize", (True, False)) @pytest.mark.parametrize("head_dim", (4, 8)) diff --git a/tests/test_transform/factory/test_memory.py b/tests/test_transform/factory/test_memory.py index 64a068c98..e373a752d 100644 --- a/tests/test_transform/factory/test_memory.py +++ b/tests/test_transform/factory/test_memory.py @@ -16,6 +16,11 @@ import pytest import torch +from compressed_tensors.offload import ( + disable_offloading, + disable_onloading, + offload_model, +) from compressed_tensors.transform import ( TransformArgs, TransformBase, @@ -23,19 +28,21 @@ TransformScheme, apply_transform_config, ) -from compressed_tensors.utils import align_modules, offloaded_dispatch from tests.test_transform.conftest import TransformableModel -from tests.testing_utils import requires_accelerate, requires_gpu +from tests.testing_utils import requires_gpu +@requires_gpu @pytest.mark.parametrize("type", ("hadamard", "random-hadamard")) @pytest.mark.parametrize("randomize", (True, False)) @pytest.mark.parametrize("requires_grad", (True, False)) -def test_memory_sharing(type, randomize, requires_grad, offload=False): +# @pytest.mark.parametrize("offload", (True, False)) +@pytest.mark.parametrize("offload", (True,)) +def test_memory_sharing(type, randomize, requires_grad, offload): # load model (maybe with offloading) model = TransformableModel(2, 2, 4, 4, 8, 8) if offload: - offloaded_dispatch(model, torch.device("cuda")) + offload_model(model, torch.device("cuda")) # add transforms to model config = TransformConfig( @@ -53,40 +60,15 @@ def test_memory_sharing(type, randomize, requires_grad, offload=False): ) apply_transform_config(model, config) - # check that memory is shared when onloaded - with align_modules(model.modules()): - weights = [m.weight for m in model.modules() if isinstance(m, TransformBase)] - weight_to_count = Counter(weights) - size_to_weight = {weight.size(0): weight for weight in weight_to_count} - - assert len(weight_to_count) == len(size_to_weight) == 3 - assert weight_to_count[size_to_weight[2]] == 3 - assert weight_to_count[size_to_weight[4]] == 4 - assert weight_to_count[size_to_weight[8]] == 3 - - # check that memory is shared in offloaded dict - if offload: - weights_map = dict(model.fcs[0]._hf_hook.weights_map.dataset) - offloaded_weights = [ - value - for name, value in weights_map.items() - if name.endswith("_input.weight") or name.endswith("_output.weight") - ] - weight_to_count = Counter(offloaded_weights) - size_to_weight = {weight.size(0): weight for weight in weight_to_count} - - assert len(weight_to_count) == len(size_to_weight) == 3 - assert weight_to_count[size_to_weight[2]] == 3 - assert weight_to_count[size_to_weight[4]] == 4 - assert weight_to_count[size_to_weight[8]] == 3 - + for context in disable_onloading, disable_offloading: + with context(): + weights = [ + m.weight for m in model.modules() if isinstance(m, TransformBase) + ] + weight_to_count = Counter(weights) + size_to_weight = {weight.size(0): weight for weight in weight_to_count} -@requires_gpu -@requires_accelerate() -@pytest.mark.parametrize("type", ("hadamard", "random-hadamard")) -@pytest.mark.parametrize("randomize", (True, False)) -def test_memory_sharing_offload( - type, - randomize, -): - test_memory_sharing(type, randomize, requires_grad=False, offload=True) + assert len(weight_to_count) == len(size_to_weight) == 3 + assert weight_to_count[size_to_weight[2]] == 3 + assert weight_to_count[size_to_weight[4]] == 4 + assert weight_to_count[size_to_weight[8]] == 3 diff --git a/tests/test_transform/factory/test_serialization.py b/tests/test_transform/factory/test_serialization.py index 15fa240ba..9adeb4cf8 100644 --- a/tests/test_transform/factory/test_serialization.py +++ b/tests/test_transform/factory/test_serialization.py @@ -16,24 +16,25 @@ import pytest import torch +from compressed_tensors.offload import offload_model from compressed_tensors.transform import ( TransformConfig, TransformScheme, apply_transform_config, ) -from compressed_tensors.utils import offloaded_dispatch from safetensors import safe_open -from tests.testing_utils import requires_accelerate, requires_gpu +from tests.testing_utils import requires_gpu from transformers import AutoModelForCausalLM, AutoTokenizer @pytest.mark.parametrize("type", ("hadamard", "random-hadamard")) @pytest.mark.parametrize("randomize", (True, False)) -def test_serialization(type, randomize, model_apply, tmp_path, offload=False): +@pytest.mark.parametrize("offload", (True, False)) +def test_serialization(type, randomize, model_apply, tmp_path, offload): # get model, maybe offload model, apply = model_apply if offload: - offloaded_dispatch(model, torch.device("cuda")) + offload_model(model, torch.device("cuda")) # apply transforms to model config = TransformConfig( @@ -48,7 +49,8 @@ def test_serialization(type, randomize, model_apply, tmp_path, offload=False): # check that saved values match model values # note that shared weights are only serialized once safetensors_path = os.path.join(model_path, "model.safetensors") - with safe_open(safetensors_path, framework="pt", device="cpu") as file: + device = "cuda:0" if offload else "cpu" + with safe_open(safetensors_path, framework="pt", device=device) as file: saved_keys = set(file.keys()) assert { "fcs.0.weight", @@ -60,17 +62,7 @@ def test_serialization(type, randomize, model_apply, tmp_path, offload=False): for key in saved_keys: param = model.get_parameter(key) saved_param = file.get_tensor(key) - - if param.device.type != "meta": # skip testing values in offload case - assert torch.equal(param, saved_param) - - -@requires_gpu -@requires_accelerate() -@pytest.mark.parametrize("type", ("hadamard", "random-hadamard")) -@pytest.mark.parametrize("randomize", (True, False)) -def test_serialization_offload(type, randomize, model_apply, tmp_path): - test_serialization(type, randomize, model_apply, tmp_path, offload=True) + assert torch.equal(param, saved_param) @pytest.mark.skip("Requires transformers#40673") diff --git a/tests/test_utils/test_match.py b/tests/test_utils/test_match.py index 86bf639be..467d74e0c 100644 --- a/tests/test_utils/test_match.py +++ b/tests/test_utils/test_match.py @@ -43,41 +43,35 @@ class DummyModel(nn.Module): """Test model for unit tests. Weights are initialized on meta device""" def __init__(self): - try: - from accelerate import init_empty_weights - except ImportError: - pytest.skip("Skipping weight init requires accelerate") - super().__init__() - with init_empty_weights(): - self.layer1 = nn.Linear(10, 20) - self.layer2 = nn.Linear(20, 30) - self.norm = nn.LayerNorm(30) - self.attention = nn.MultiheadAttention(30, 2) - - # Create nested structure - self.transformer = nn.ModuleDict( - { - "layers": nn.ModuleList( - [ - nn.ModuleDict( - { - "self_attn": nn.ModuleDict( - { - "q_proj": nn.Linear(30, 30), - "k_proj": nn.Linear(30, 30), - "v_proj": nn.Linear(30, 30), - } - ), - "norm": nn.LayerNorm(30), - "mlp": nn.Linear(30, 30), - } - ) - for _ in range(3) - ] - ) - } - ) + self.layer1 = nn.Linear(10, 20) + self.layer2 = nn.Linear(20, 30) + self.norm = nn.LayerNorm(30) + self.attention = nn.MultiheadAttention(30, 2) + + # Create nested structure + self.transformer = nn.ModuleDict( + { + "layers": nn.ModuleList( + [ + nn.ModuleDict( + { + "self_attn": nn.ModuleDict( + { + "q_proj": nn.Linear(30, 30), + "k_proj": nn.Linear(30, 30), + "v_proj": nn.Linear(30, 30), + } + ), + "norm": nn.LayerNorm(30), + "mlp": nn.Linear(30, 30), + } + ) + for _ in range(3) + ] + ) + } + ) class DummyMoEModel(nn.Module): diff --git a/tests/test_utils/test_offload.py b/tests/test_utils/test_offload.py deleted file mode 100644 index aed0186b2..000000000 --- a/tests/test_utils/test_offload.py +++ /dev/null @@ -1,540 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import pytest -import torch -from compressed_tensors.utils import ( - align_module_device, - align_modules, - delete_offload_module, - delete_offload_parameter, - disable_hf_hook, - disable_offloading, - get_execution_device, - has_offloaded_params, - offloaded_dispatch, - register_offload_module, - register_offload_parameter, - update_offload_parameter, -) -from compressed_tensors.utils.offload import offload_to_weights_map -from tests.testing_utils import requires_accelerate, requires_gpu - - -class ExampleModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.a = torch.nn.Parameter(torch.tensor(0).float()) - self.b = torch.nn.Parameter(torch.tensor(0).float()) - - def forward(self, x): - return x * self.a + self.b - - -class ExampleModel(torch.nn.Module): - def __init__(self): - super().__init__() - self.linear = torch.nn.Linear(1, 2) - - def forward(self, x): - return self.linear(x) - - -@requires_accelerate() -def test_has_offloaded_params(): - from accelerate.hooks import attach_align_device_hook, remove_hook_from_module - - module = ExampleModule() - assert not has_offloaded_params(module) - - attach_align_device_hook(module, offload=False) - assert not has_offloaded_params(module) - - remove_hook_from_module(module) - attach_align_device_hook(module, offload=True, weights_map=module.state_dict()) - assert has_offloaded_params(module) - - -@requires_gpu -@requires_accelerate() -def test_get_execution_device(): - from accelerate import init_empty_weights - from accelerate.big_modeling import attach_align_device_hook - - # no offloading - module = ExampleModule() - assert get_execution_device(module) == torch.device("cpu") - - # with offloading - attach_align_device_hook(module, torch.device("cuda:0")) - assert get_execution_device(module) == torch.device("cuda:0") - - # in meta context - with torch.device("meta"): - module = ExampleModule() - assert get_execution_device(module) == torch.device("meta") - - # offloaded in meta context - module = ExampleModule() - attach_align_device_hook(module, torch.device("cuda:0")) - with torch.device("meta"): - assert get_execution_device(module) == torch.device("cuda:0") - - # in empty weights context - with init_empty_weights(): - module = ExampleModule() - assert get_execution_device(module) == torch.device("meta") - - # offloaded in empty weights context - module = ExampleModule() - attach_align_device_hook(module, torch.device("cuda:0")) - with init_empty_weights(): - assert get_execution_device(module) == torch.device("cuda:0") - - -@requires_gpu -@requires_accelerate() -def test_get_execution_device_model(): - class Model(torch.nn.Module): - def __init__(self): - super().__init__() - self.a = torch.nn.Linear(1, 2) - self.b = torch.nn.Linear(2, 2, device="cuda:0") - - def forward(self, x): - return self.b(self.a(x).to("cuda:0")) - - model = Model() - assert get_execution_device(model) == torch.device("cpu") - - offloaded_dispatch(model.a, torch.device("cuda:0")) - assert get_execution_device(model) == torch.device("cuda:0") - - -@requires_accelerate() -def test_register_offload_parameter(): - from accelerate import init_empty_weights - from accelerate.hooks import attach_align_device_hook - - module = ExampleModule() - parameter = torch.nn.Parameter(torch.tensor(1.0)) - - # register a param prior to offloading - register_offload_parameter(module, "c", parameter) - assert module.c == parameter - - # offloading, check that added param was offloaded - attach_align_device_hook(module, offload=True, weights_map=module.state_dict()) - assert "c" in module._hf_hook.weights_map - - # register a param after offloading, check that added param was offloaded - register_offload_parameter(module, "d", parameter) - assert module.d.device == torch.device("meta") - assert module._hf_hook.weights_map["d"].device == torch.device("cpu") - - # added parameters can be onloaded and offloaded - with align_module_device(module, execution_device="cpu"): - assert module.c.device == torch.device("cpu") - assert module.d.device == torch.device("cpu") - assert module.c.device == torch.device("meta") - assert module.d.device == torch.device("meta") - - # parameters can be added during onload - with align_module_device(module, execution_device="cpu"): - register_offload_parameter(module, "e", parameter) - assert module.e.device == torch.device("cpu") - - # parameters can be added before onload and with explicit offload - register_offload_parameter(module, "f", parameter, offload_device="cpu") - assert module._hf_hook.weights_map["f"].device == torch.device("cpu") - with align_module_device(module, execution_device="cpu"): - assert module.f.device == torch.device("cpu") - assert module._hf_hook.weights_map["f"].device == torch.device("cpu") - - # parameters registered in the empty init context are still empty - with init_empty_weights(): - module = ExampleModule() - register_offload_parameter(module, "c", parameter) - assert module.a.device == module.b.device == module.c.device == torch.device("meta") - - -@requires_accelerate() -@requires_gpu -def test_register_offload_parameter_hook_replacement(): - module = ExampleModule() - parameter_c = torch.nn.Parameter(torch.tensor(1.0, device="cuda")) - parameter_d = torch.nn.Parameter(torch.tensor(1.0, device="cpu")) - - offloaded_dispatch(module, "cuda") - register_offload_parameter(module, "c", parameter_c) - register_offload_parameter(module, "d", parameter_d) - - with disable_hf_hook(module): - assert module.a.device == torch.device("cpu") - assert module.b.device == torch.device("cpu") - assert module.c.device == torch.device("cuda:0") - assert module.d.device == torch.device("cpu") - - assert module.a.device == torch.device("meta") - assert module.b.device == torch.device("meta") - assert module.c.device == torch.device("meta") - assert module.d.device == torch.device("meta") - assert module._hf_hook.weights_map["a"].device == torch.device("cpu") - assert module._hf_hook.weights_map["b"].device == torch.device("cpu") - assert module._hf_hook.weights_map["c"].device == torch.device("cpu") - assert module._hf_hook.weights_map["d"].device == torch.device("cpu") - - -@requires_accelerate() -@requires_gpu -def test_register_offload_parameter_shared(): - module = ExampleModule() - parameter = torch.nn.Parameter(torch.tensor(1.0)) - - offloaded_dispatch(module, "cuda") - register_offload_parameter(module, "c", parameter) - register_offload_parameter(module, "d", parameter) - - with align_module_device(module): - assert module.c is module.d - - -@requires_accelerate() -def test_update_offload_parameter(): - from accelerate.hooks import attach_align_device_hook - - module = ExampleModule() - tensor_a = torch.tensor(1.0) - tensor_b = torch.tensor(2.0) - - # can update modules which are not offloaded - update_offload_parameter(module, "a", tensor_a) - assert module.a == tensor_a - - # can update modules which are offloaded - attach_align_device_hook(module, offload=True, weights_map=module.state_dict()) - update_offload_parameter(module, "b", tensor_b) - assert module.b.device == torch.device("meta") - assert module._hf_hook.weights_map["b"] == tensor_b - - # data persists across onloading - with align_module_device(module, execution_device="cpu"): - assert module.a.data == tensor_a - assert module.b.data == tensor_b - assert module._hf_hook.weights_map["a"] == tensor_a - assert module._hf_hook.weights_map["b"] == tensor_b - - # data persists across offloading - assert module.a.device == torch.device("meta") - assert module.b.device == torch.device("meta") - assert module._hf_hook.weights_map["a"] == tensor_a - assert module._hf_hook.weights_map["b"] == tensor_b - - # can update with differnt shape with warning - with pytest.warns(): - new_data = torch.tensor([3.0]) - update_offload_parameter(module, "a", new_data) - assert module._hf_hook.weights_map["a"] == new_data - - -@requires_accelerate() -def test_delete_offload_parameter(): - from accelerate.hooks import attach_align_device_hook - - module = ExampleModule() - param_c = torch.nn.Parameter(torch.tensor(1.0)) - param_d = torch.nn.Parameter(torch.tensor(2.0)) - register_offload_parameter(module, "c", param_c) - register_offload_parameter(module, "d", param_d) - - # parameters are deleted - delete_offload_parameter(module, "a") - delete_offload_parameter(module, "c") - assert not hasattr(module, "a") - assert hasattr(module, "b") - assert not hasattr(module, "c") - assert hasattr(module, "d") - - # parameters and their offload are deleted - attach_align_device_hook(module, offload=True, weights_map=module.state_dict()) - delete_offload_parameter(module, "b") - delete_offload_parameter(module, "d") - assert not hasattr(module, "a") - assert not hasattr(module, "b") - assert not hasattr(module, "c") - assert not hasattr(module, "d") - assert "a" not in module._hf_hook.weights_map - assert "b" not in module._hf_hook.weights_map - assert "c" not in module._hf_hook.weights_map - assert "d" not in module._hf_hook.weights_map - - -@requires_accelerate() -def test_disable_hf_hook(): - from accelerate.hooks import attach_align_device_hook - - module = ExampleModule() - - def custom_forward(): - pass - - attach_align_device_hook(module, offload=True, weights_map=module.state_dict()) - with disable_hf_hook(module): - assert not hasattr(module, "_hf_hook") - module.forward = custom_forward - - assert hasattr(module, "_hf_hook") - assert module._old_forward == custom_forward - - -@requires_accelerate() -def test_disable_hf_hook_model_recurse(): - from accelerate.hooks import attach_align_device_hook - - module0 = ExampleModule() - module1 = ExampleModule() - module2 = ExampleModule() - model = torch.nn.Sequential(module0, torch.nn.Sequential(module1, module2)) - attach_align_device_hook(model, offload=True, weights_map=model.state_dict()) - - with disable_hf_hook(model): - assert not hasattr(module0, "_hf_hook") - assert not hasattr(module1, "_hf_hook") - assert not hasattr(module2, "_hf_hook") - - assert hasattr(module0, "_hf_hook") - assert hasattr(module1, "_hf_hook") - assert hasattr(module2, "_hf_hook") - - -@requires_accelerate() -def test_align_modules(): - from accelerate.hooks import attach_align_device_hook - - module0 = ExampleModule() - module1 = ExampleModule() - module2 = ExampleModule() - model = torch.nn.Sequential(module0, torch.nn.Sequential(module1, module2)) - attach_align_device_hook( - model, - execution_device=torch.device("cpu"), - offload=True, - weights_map=model.state_dict(), - ) - - assert module0.a.device == torch.device("meta") - assert module1.a.device == torch.device("meta") - assert module2.a.device == torch.device("meta") - - with align_modules((module0, module1)): - assert module0.a.device != torch.device("meta") - assert module1.a.device != torch.device("meta") - assert module2.a.device == torch.device("meta") - - assert module0.a.device == torch.device("meta") - assert module1.a.device == torch.device("meta") - assert module2.a.device == torch.device("meta") - - -@requires_accelerate() -def test_offload_to_weights_map(): - from accelerate.utils import OffloadedWeightsLoader, PrefixedDataset - - name = "name" - old_value = torch.tensor(0.0) - new_value = torch.tensor(1.0) - prefix = "prefix" - - # Dict empty - weights_map = {} - with pytest.raises(ValueError): - offload_to_weights_map(weights_map, name, new_value) - offload_to_weights_map(weights_map, name, new_value, offload_device="cpu") - assert weights_map[name] == new_value - - # Dict populated - weights_map = {name: old_value} - offload_to_weights_map(weights_map, name, new_value) - assert weights_map[name] == new_value - - # OffloadedWeightsLoader[Dict] empty - weights_map = OffloadedWeightsLoader({}) - with pytest.raises(ValueError): - offload_to_weights_map(weights_map, name, new_value) - offload_to_weights_map(weights_map, name, new_value, offload_device="cpu") - assert weights_map[name] == new_value - - # OffloadedWeightsLoader[Dict] populated - weights_map = OffloadedWeightsLoader({name: old_value}) - offload_to_weights_map(weights_map, name, new_value) - assert weights_map[name] == new_value - - # PrefixedDataset[Dict] empty - weights_map = PrefixedDataset({}, prefix) - with pytest.raises(ValueError): - offload_to_weights_map(weights_map, name, new_value) - offload_to_weights_map(weights_map, name, new_value, offload_device="cpu") - assert weights_map[name] == new_value - - # PrefixedDataset[Dict] populated - weights_map = PrefixedDataset({name: old_value}, prefix) - offload_to_weights_map(weights_map, name, new_value) - assert weights_map[name] == new_value - - # PrefixedDataset[OffloadedWeightsLoader[Dict]] empty - weights_map = PrefixedDataset(OffloadedWeightsLoader({}), prefix) - with pytest.raises(ValueError): - offload_to_weights_map(weights_map, name, new_value) - offload_to_weights_map(weights_map, name, new_value, offload_device="cpu") - assert weights_map[name] == new_value - - # PrefixedDataset[OffloadedWeightsLoader[Dict]] populated - weights_map = PrefixedDataset(OffloadedWeightsLoader({name: old_value}), prefix) - offload_to_weights_map(weights_map, name, new_value) - assert weights_map[name] == new_value - - -@requires_gpu -@requires_accelerate() -@pytest.mark.parametrize("exec_device", [torch.device("cpu"), torch.device("cuda")]) -def test_register_offload_module(exec_device): - # no offloading - model = ExampleModel() - child = torch.nn.Linear(2, 3) - register_offload_module(model, "child", child) - register_offload_module(model.linear, "child", child) - assert child in model.children() - assert child in model.linear.children() - - # with offloading - model = ExampleModel() - child = torch.nn.Linear(2, 3) - offloaded_dispatch(model, exec_device) - register_offload_module(model, "child", child) - register_offload_module(model.linear, "child", child) - assert child in model.children() - assert child in model.linear.children() - - # can run modules - model(torch.empty(1)) - child(torch.empty(2, device=exec_device)) - - -@requires_gpu -@requires_accelerate() -@pytest.mark.parametrize("exec_device", [torch.device("cpu"), torch.device("cuda")]) -def test_delete_offload_module(exec_device): - # no offloading - model = ExampleModel() - child = torch.nn.Linear(2, 3) - register_offload_module(model, "child", child) - register_offload_module(model.linear, "child", child) - delete_offload_module(model, "child") - delete_offload_module(model.linear, "child") - assert child not in model.children() - assert child not in model.linear.children() - - # with offloading - model = ExampleModel() - child = torch.nn.Linear(2, 3) - offloaded_dispatch(model, exec_device) - register_offload_module(model, "child", child) - register_offload_module(model.linear, "child", child) - delete_offload_module(model, "child") - delete_offload_module(model.linear, "child") - assert child not in model.children() - assert child not in model.linear.children() - - -@requires_gpu -@requires_accelerate() -@pytest.mark.parametrize( - "exec_device,offload_device", - [ - (torch.device("cpu"), torch.device("cpu")), - (torch.device("cpu"), torch.device("cuda:0")), - (torch.device("cuda:0"), torch.device("cpu")), - (torch.device("cuda:0"), torch.device("cuda:0")), - ], -) -def test_offloaded_dispatch(exec_device, offload_device): - # single module - module = torch.nn.Linear(1, 2, device=offload_device) - module = offloaded_dispatch(module, exec_device, offload_device) - assert has_offloaded_params(module) - assert module._hf_hook.offload - assert module.weight.device == torch.device("meta") - assert module._hf_hook.weights_map["weight"].device == offload_device - assert module._hf_hook.tied_params_map is not None - - # can run - module(torch.empty(1, device=exec_device)) - - # model - model = ExampleModel() - model = offloaded_dispatch(model, exec_device, offload_device) - assert not has_offloaded_params(model) - - assert has_offloaded_params(model.linear) - assert model.linear._hf_hook.offload - assert model.linear.weight.device == torch.device("meta") - assert model.linear._hf_hook.weights_map["weight"].device == offload_device - assert model.linear._hf_hook.tied_params_map is not None - - # can run - model(torch.empty(1, device=exec_device)) - - # can add new params - parameter = torch.nn.Parameter(torch.tensor(1.0)) - register_offload_parameter(module, "new_param", parameter) - assert module.new_param.device == torch.device("meta") - assert module._hf_hook.weights_map["new_param"].device == offload_device - - -@requires_gpu -@requires_accelerate() -@pytest.mark.parametrize( - "exec_device,offload_device", - [ - (torch.device("cpu"), torch.device("cpu")), - (torch.device("cpu"), torch.device("cuda:0")), - (torch.device("cuda:0"), torch.device("cpu")), - (torch.device("cuda:0"), torch.device("cuda:0")), - ], -) -def test_disable_offloading(exec_device, offload_device): - module = torch.nn.Linear(1, 2, device=exec_device) - - # non-offloaded modules are unaffected - with disable_offloading(): - output = module(torch.empty(1, device=exec_device)) - assert module.weight.device == exec_device - assert output.device == exec_device - - # offloaded modules stay on device until context exit - offloaded_dispatch(module, exec_device, offload_device) - assert module.weight.device == torch.device("meta") - assert module._hf_hook.weights_map["weight"].device == offload_device - - with disable_offloading(): - assert module.weight.device == torch.device("meta") - output = module(torch.empty(1, device=exec_device)) - assert module.weight.device == exec_device - assert output.device == exec_device - - output = module(torch.empty(1, device=exec_device)) - assert module.weight.device == exec_device - assert output.device == exec_device - - assert module.weight.device == torch.device("meta") - assert module._hf_hook.weights_map["weight"].device == offload_device diff --git a/tests/testing_utils.py b/tests/testing_utils.py index 40a9dc2fb..b28af78ae 100644 --- a/tests/testing_utils.py +++ b/tests/testing_utils.py @@ -28,18 +28,7 @@ def compressed_tensors_config_available(): return False -def accelerate_availabe(): - try: - import accelerate # noqa: F401 - - return True - - except ImportError: - return False - - _is_compressed_tensors_config_available = compressed_tensors_config_available() -_is_accelerate_available = accelerate_availabe() def requires_hf_quantizer(): @@ -49,13 +38,6 @@ def requires_hf_quantizer(): ) -def requires_accelerate(): - return pytest.mark.skipif( - not _is_accelerate_available, - reason="requires accelerate", - ) - - def get_random_mat(M, K, dtype) -> "torch.Tensor": """ :param M: number of rows