From 53bc4018778086589dd36d6aaaf3691cde2e9e5c Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Thu, 11 Aug 2022 14:25:48 -0700 Subject: [PATCH 01/13] first batch of changes to sub configs --- deepspeed/comm/config.py | 32 +- deepspeed/compression/config.py | 639 ++++-------------- deepspeed/monitor/config.py | 61 +- deepspeed/monitor/csv_monitor.py | 10 +- deepspeed/monitor/monitor.py | 8 +- deepspeed/monitor/tensorboard.py | 10 +- deepspeed/monitor/wandb.py | 12 +- .../activation_checkpointing/config.py | 106 +-- deepspeed/runtime/config.py | 12 +- 9 files changed, 226 insertions(+), 664 deletions(-) diff --git a/deepspeed/comm/config.py b/deepspeed/comm/config.py index d0238331de9e..7cdd47fca1a2 100644 --- a/deepspeed/comm/config.py +++ b/deepspeed/comm/config.py @@ -3,29 +3,19 @@ Licensed under the MIT license. """ -from pydantic import BaseModel -from .constants import * +from deepspeed.runtime.config_utils import DeepSpeedConfigModel +COMMS_LOGGER = "comms_logger" -class CommsConfig(BaseModel): - class Config: - validate_all = True - validate_assignment = True - use_enum_values = True - extra = 'forbid' +def get_comms_config(param_dict): + comms_config_dict = param_dict.get(COMMS_LOGGER, {}) + return DeepSpeedCommsConfig(**comms_config_dict) -class CommsLoggerConfig(CommsConfig): - enabled: bool = COMMS_LOGGER_ENABLED_DEFAULT - prof_all: bool = COMMS_LOGGER_PROF_ALL_DEFAULT - prof_ops: list = COMMS_LOGGER_PROF_OPS_DEFAULT - verbose: bool = COMMS_LOGGER_VERBOSE_DEFAULT - debug: bool = COMMS_LOGGER_DEBUG_DEFAULT - -class DeepSpeedCommsConfig: - def __init__(self, ds_config): - self.comms_logger_enabled = 'comms_logger' in ds_config - - if self.comms_logger_enabled: - self.comms_logger = CommsLoggerConfig(**ds_config['comms_logger']) +class DeepSpeedCommsConfig(DeepSpeedConfigModel): + enabled: bool = False + prof_all: bool = True + prof_ops: list = [] + verbose: bool = False + debug: bool = False diff --git a/deepspeed/compression/config.py b/deepspeed/compression/config.py index d53246e2ed87..89bb1ca2c44f 100644 --- a/deepspeed/compression/config.py +++ b/deepspeed/compression/config.py @@ -1,490 +1,149 @@ -from .constants import * -import copy -from ..runtime.config_utils import get_scalar_param - - -def get_compression_config(param_dict): - # - output = {} - - if COMPRESSION_TRAINING not in param_dict.keys(): - param_dict[COMPRESSION_TRAINING] = {} - sub_param_dict = param_dict[COMPRESSION_TRAINING] - output[WEIGHT_QUANTIZATION] = get_weight_quantization(sub_param_dict) - output[ACTIVATION_QUANTIZATION] = get_activation_quantization(sub_param_dict) - output[SPARSE_PRUNING] = get_sparse_pruning(sub_param_dict) - output[ROW_PRUNING] = get_row_pruning(sub_param_dict) - output[HEAD_PRUNING] = get_head_pruning(sub_param_dict) - output[CHANNEL_PRUNING] = get_channel_pruning(sub_param_dict) - - output[LAYER_REDUCTION] = get_layer_reduction(sub_param_dict) - - return output - - -def get_layer_reduction(param_dict): - output = {} - output[LAYER_REDUCTION_ENABLED] = LAYER_REDUCTION_ENABLED_DEFAULT - if get_layer_reduction_enabled(param_dict): - output[LAYER_REDUCTION_ENABLED] = get_layer_reduction_enabled(param_dict) - for key, val in get_layer_reduction_params(param_dict).items(): - output[key] = val - return output - - -def get_layer_reduction_enabled(param_dict): - if LAYER_REDUCTION in param_dict.keys(): - return get_scalar_param(param_dict[LAYER_REDUCTION], - LAYER_REDUCTION_ENABLED, - LAYER_REDUCTION_ENABLED_DEFAULT) - else: - return False - - -def 
get_layer_reduction_params(param_dict): - if LAYER_REDUCTION in param_dict.keys(): - layer_reduction_params = copy.copy(param_dict[LAYER_REDUCTION]) - layer_reduction_params.pop(LAYER_REDUCTION_ENABLED) - return layer_reduction_params - else: - return False - - -def get_quantize_enabled(param_dict): - if COMPRESSION_TRAINING not in param_dict.keys(): - return False - - sub_param_dict = param_dict[COMPRESSION_TRAINING] - output = get_weight_quantization_shared_parameters(sub_param_dict) - return output[WEIGHT_QUANTIZE_ENABLED] - - -def get_weight_quantization(param_dict): - output = {} - if WEIGHT_QUANTIZATION not in param_dict.keys(): - param_dict[WEIGHT_QUANTIZATION] = {SHARED_PARAMETERS: {}, DIFFERENT_GROUPS: {}} - sub_param_dict = param_dict[WEIGHT_QUANTIZATION] - # shared parameters - output[SHARED_PARAMETERS] = get_weight_quantization_shared_parameters(sub_param_dict) - # each sub-groups - if output[SHARED_PARAMETERS][WEIGHT_QUANTIZE_ENABLED]: - assert DIFFERENT_GROUPS in sub_param_dict.keys(), f"Weigh Quantization is enabled, {DIFFERENT_GROUPS} must be specified" - output[DIFFERENT_GROUPS] = get_weight_quantization_different_groups(sub_param_dict) - return output - - -def get_weight_quantization_shared_parameters(param_dict): - output = {} - if SHARED_PARAMETERS in param_dict.keys(): - sub_param_dict = param_dict[SHARED_PARAMETERS] - output[WEIGHT_QUANTIZE_ENABLED] = get_scalar_param( - sub_param_dict, - WEIGHT_QUANTIZE_ENABLED, - WEIGHT_QUANTIZE_ENABLED_DEFAULT) - output[WEIGHT_QUANTIZE_KERNEL] = get_scalar_param( - sub_param_dict, - WEIGHT_QUANTIZE_KERNEL, - WEIGHT_QUANTIZE_KERNEL_DEFAULT) - output[WEIGHT_QUANTIZE_SCHEDULE_OFFSET] = get_scalar_param( - sub_param_dict, - WEIGHT_QUANTIZE_SCHEDULE_OFFSET, - WEIGHT_QUANTIZE_SCHEDULE_OFFSET_DEFAULT) - output[WEIGHT_QUANTIZE_GROUPS] = get_scalar_param( - sub_param_dict, - WEIGHT_QUANTIZE_GROUPS, - WEIGHT_QUANTIZE_GROUPS_DEFAULT) - output[WEIGHT_QUANTIZE_VERBOSE] = get_scalar_param( - sub_param_dict, - WEIGHT_QUANTIZE_VERBOSE, - WEIGHT_QUANTIZE_VERBOSE_DEFAULT) - output[WEIGHT_QUANTIZE_TYPE] = get_scalar_param(sub_param_dict, - WEIGHT_QUANTIZE_TYPE, - WEIGHT_QUANTIZE_TYPE_DEFAULT) - output[WEIGHT_QUANTIZE_IN_FORWARD_ENABLED] = get_scalar_param( - sub_param_dict, - WEIGHT_QUANTIZE_IN_FORWARD_ENABLED, - WEIGHT_QUANTIZE_IN_FORWARD_ENABLED_DEFAULT) - assert output[WEIGHT_QUANTIZE_TYPE] in [WEIGHT_QUANTIZE_SYMMETRIC, WEIGHT_QUANTIZE_ASYMMETRIC], f"Invalid weight quantize type. Supported types: [{WEIGHT_QUANTIZE_SYMMETRIC}, {WEIGHT_QUANTIZE_ASYMMETRIC}]" - output[WEIGHT_QUANTIZE_ROUNDING] = get_scalar_param( - sub_param_dict, - WEIGHT_QUANTIZE_ROUNDING, - WEIGHT_QUANTIZE_ROUNDING_DEFAULT) - assert output[WEIGHT_QUANTIZE_ROUNDING] in [WEIGHT_QUANTIZE_NEAREST_ROUNDING, WEIGHT_QUANTIZE_STOCHASTIC_ROUNDING], f"Invalid weight quantize rounding. 
Supported types: [{WEIGHT_QUANTIZE_NEAREST_ROUNDING}, {WEIGHT_QUANTIZE_STOCHASTIC_ROUNDING}]" - if WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE in sub_param_dict.keys(): - output[WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE] = get_scalar_param( - sub_param_dict[WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE], - WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE_ENABLED, - WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE_ENABLED_DEFAULT) - output[WEIGHT_QUANTIZE_CHANGE_RATIO] = get_scalar_param( - sub_param_dict[WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE], - WEIGHT_QUANTIZE_CHANGE_RATIO, - WEIGHT_QUANTIZE_CHANGE_RATIO_DEFAULT) - else: - output[ - WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE] = WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE_ENABLED_DEFAULT - output[WEIGHT_QUANTIZE_CHANGE_RATIO] = WEIGHT_QUANTIZE_CHANGE_RATIO_DEFAULT - else: - output[WEIGHT_QUANTIZE_ENABLED] = WEIGHT_QUANTIZE_ENABLED_DEFAULT - output[WEIGHT_QUANTIZE_KERNEL] = WEIGHT_QUANTIZE_KERNEL_DEFAULT - output[WEIGHT_QUANTIZE_SCHEDULE_OFFSET] = WEIGHT_QUANTIZE_SCHEDULE_OFFSET_DEFAULT - output[WEIGHT_QUANTIZE_GROUPS] = WEIGHT_QUANTIZE_GROUPS_DEFAULT - output[WEIGHT_QUANTIZE_VERBOSE] = WEIGHT_QUANTIZE_VERBOSE_DEFAULT - output[WEIGHT_QUANTIZE_TYPE] = WEIGHT_QUANTIZE_TYPE_DEFAULT - output[WEIGHT_QUANTIZE_ROUNDING] = WEIGHT_QUANTIZE_ROUNDING_DEFAULT - output[ - WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE] = WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE_ENABLED_DEFAULT - output[WEIGHT_QUANTIZE_CHANGE_RATIO] = WEIGHT_QUANTIZE_CHANGE_RATIO_DEFAULT - return output - - -def get_weight_quantization_different_groups(param_dict): - output = {} - sub_param_dict = param_dict[DIFFERENT_GROUPS] - - def get_params(name, group_dict): - assert WEIGHT_QUANTIZE_START_BITS in group_dict.keys(), f"{WEIGHT_QUANTIZE_START_BITS} must be specified for weight quantization group {name}" - assert WEIGHT_QUANTIZE_TARGET_BITS in group_dict.keys(), f"{WEIGHT_QUANTIZE_TARGET_BITS} must be specified for weight quantization group {name}" - group_dict[WEIGHT_QUANTIZATION_PERIOD] = get_scalar_param( - group_dict, - WEIGHT_QUANTIZATION_PERIOD, - WEIGHT_QUANTIZATION_PERIOD_DEFAULT) - return group_dict - - for k, v in sub_param_dict.items(): - output[k] = {} - output[k][DIFFERENT_GROUPS_PARAMETERS] = get_params( - k, - sub_param_dict[k][DIFFERENT_GROUPS_PARAMETERS]) - output[k][DIFFERENT_GROUPS_MODULE_SCOPE] = get_scalar_param( - sub_param_dict[k], - DIFFERENT_GROUPS_MODULE_SCOPE, - DIFFERENT_GROUPS_MODULE_SCOPE_DEFAULT) - output[k][DIFFERENT_GROUPS_RELATED_MODULE_SCOPE] = get_scalar_param( - sub_param_dict[k], - DIFFERENT_GROUPS_RELATED_MODULE_SCOPE, - DIFFERENT_GROUPS_RELATED_MODULE_SCOPE_DEFAULT) - - return output - - -def get_activation_quantization(param_dict): - output = {} - if ACTIVATION_QUANTIZATION not in param_dict.keys(): - param_dict[ACTIVATION_QUANTIZATION] = { - SHARED_PARAMETERS: {}, - DIFFERENT_GROUPS: {} - } - sub_param_dict = param_dict[ACTIVATION_QUANTIZATION] - # shared parameters - output[SHARED_PARAMETERS] = get_activation_quantization_shared_parameters( - sub_param_dict) - # each sub-groups - if output[SHARED_PARAMETERS][ACTIVATION_QUANTIZATION_ENABLED]: - assert DIFFERENT_GROUPS in sub_param_dict.keys(), f"Activation Quantization is enabled, {DIFFERENT_GROUPS} must be specified" - output[DIFFERENT_GROUPS] = get_activation_quantization_different_groups( - sub_param_dict) - return output - - -def get_activation_quantization_shared_parameters(param_dict): - output = {} - if SHARED_PARAMETERS in param_dict.keys(): - sub_param_dict = param_dict[SHARED_PARAMETERS] - output[ACTIVATION_QUANTIZATION_ENABLED] = get_scalar_param( - sub_param_dict, 
- ACTIVATION_QUANTIZATION_ENABLED, - ACTIVATION_QUANTIZATION_ENABLED_DEFAULT) - output[ACTIVATION_QUANTIZE_TYPE] = get_scalar_param( - sub_param_dict, - ACTIVATION_QUANTIZE_TYPE, - ACTIVATION_QUANTIZE_TYPE_DEFAULT) - assert output[ACTIVATION_QUANTIZE_TYPE] in [ACTIVATION_QUANTIZE_SYMMETRIC, ACTIVATION_QUANTIZE_ASYMMETRIC], f"Invalid activation quantize type. Supported types: [{ACTIVATION_QUANTIZE_SYMMETRIC}, {ACTIVATION_QUANTIZE_ASYMMETRIC}]" - output[ACTIVATION_QUANTIZE_RANGE] = get_scalar_param( - sub_param_dict, - ACTIVATION_QUANTIZE_RANGE, - ACTIVATION_QUANTIZE_RANGE_DEFAULT) - assert output[ACTIVATION_QUANTIZE_RANGE] in [ACTIVATION_QUANTIZE_RANGE_DYNAMIC, ACTIVATION_QUANTIZE_RANGE_STATIC], f"Invalid activation quantize range calibration. Supported types: [{ACTIVATION_QUANTIZE_RANGE_DYNAMIC}, {ACTIVATION_QUANTIZE_RANGE_STATIC}]" - output[ACTIVATION_QUANTIZE_SCHEDULE_OFFSET] = get_scalar_param( - sub_param_dict, - ACTIVATION_QUANTIZE_SCHEDULE_OFFSET, - ACTIVATION_QUANTIZE_SCHEDULE_OFFSET_DEFAULT) - else: - output[ACTIVATION_QUANTIZATION_ENABLED] = ACTIVATION_QUANTIZATION_ENABLED_DEFAULT - output[ACTIVATION_QUANTIZE_TYPE] = ACTIVATION_QUANTIZE_TYPE_DEFAULT - output[ACTIVATION_QUANTIZE_RANGE] = ACTIVATION_QUANTIZE_RANGE_DEFAULT - output[ - ACTIVATION_QUANTIZE_SCHEDULE_OFFSET] = ACTIVATION_QUANTIZE_SCHEDULE_OFFSET_DEFAULT - return output - - -def get_activation_quantization_different_groups(param_dict): - output = {} - sub_param_dict = param_dict[DIFFERENT_GROUPS] - - def get_params(name, group_dict): - assert ACTIVATION_QUANTIZE_BITS in group_dict.keys(), f"{ACTIVATION_QUANTIZE_BITS} must be specified for activation quantization group {name}" - return group_dict - - for k, v in sub_param_dict.items(): - output[k] = {} - output[k][DIFFERENT_GROUPS_PARAMETERS] = get_params( - k, - sub_param_dict[k][DIFFERENT_GROUPS_PARAMETERS]) - output[k][DIFFERENT_GROUPS_MODULE_SCOPE] = get_scalar_param( - sub_param_dict[k], - DIFFERENT_GROUPS_MODULE_SCOPE, - DIFFERENT_GROUPS_MODULE_SCOPE_DEFAULT) - output[k][DIFFERENT_GROUPS_RELATED_MODULE_SCOPE] = get_scalar_param( - sub_param_dict[k], - DIFFERENT_GROUPS_RELATED_MODULE_SCOPE, - DIFFERENT_GROUPS_RELATED_MODULE_SCOPE_DEFAULT) - - return output - - -def get_sparse_pruning(param_dict): - output = {} - if SPARSE_PRUNING not in param_dict.keys(): - param_dict[SPARSE_PRUNING] = {SHARED_PARAMETERS: {}, DIFFERENT_GROUPS: {}} - sub_param_dict = param_dict[SPARSE_PRUNING] - # shared parameters - output[SHARED_PARAMETERS] = get_sparse_pruning_shared_parameters(sub_param_dict) - # each sub-groups - if output[SHARED_PARAMETERS][SPARSE_PRUNING_ENABLED]: - assert DIFFERENT_GROUPS in sub_param_dict.keys(), f"Sparse Pruning is enabled, {DIFFERENT_GROUPS} must be specified" - output[DIFFERENT_GROUPS] = get_sparse_pruning_different_groups(sub_param_dict) - return output - - -def get_sparse_pruning_shared_parameters(param_dict): - output = {} - if SHARED_PARAMETERS in param_dict.keys(): - sub_param_dict = param_dict[SHARED_PARAMETERS] - output[SPARSE_PRUNING_ENABLED] = get_scalar_param( - sub_param_dict, - SPARSE_PRUNING_ENABLED, - SPARSE_PRUNING_ENABLED_DEFAULT) - output[SPARSE_PRUNING_METHOD] = get_scalar_param(sub_param_dict, - SPARSE_PRUNING_METHOD, - SPARSE_PRUNING_METHOD_DEFAULT) - assert output[SPARSE_PRUNING_METHOD] in [SPARSE_PRUNING_METHOD_L1, SPARSE_PRUNING_METHOD_TOPK], f"Invalid sparse pruning method. 
Supported types: [{SPARSE_PRUNING_METHOD_L1}, {SPARSE_PRUNING_METHOD_TOPK}]" - output[SPARSE_PRUNING_SCHEDULE_OFFSET] = get_scalar_param( - sub_param_dict, - SPARSE_PRUNING_SCHEDULE_OFFSET, - SPARSE_PRUNING_SCHEDULE_OFFSET_DEFAULT) - else: - output[SPARSE_PRUNING_ENABLED] = SPARSE_PRUNING_ENABLED_DEFAULT - output[SPARSE_PRUNING_METHOD] = SPARSE_PRUNING_METHOD_DEFAULT - output[SPARSE_PRUNING_SCHEDULE_OFFSET] = SPARSE_PRUNING_SCHEDULE_OFFSET_DEFAULT - return output - - -def get_sparse_pruning_different_groups(param_dict): - output = {} - sub_param_dict = param_dict[DIFFERENT_GROUPS] - - def get_params(name, group_dict): - assert SPARSE_PRUNING_DENSE_RATIO in group_dict.keys(), f"{SPARSE_PRUNING_DENSE_RATIO} must be specified for sparse pruning group {name}" - return group_dict - - for k, v in sub_param_dict.items(): - output[k] = {} - output[k][DIFFERENT_GROUPS_PARAMETERS] = get_params( - k, - sub_param_dict[k][DIFFERENT_GROUPS_PARAMETERS]) - output[k][DIFFERENT_GROUPS_MODULE_SCOPE] = get_scalar_param( - sub_param_dict[k], - DIFFERENT_GROUPS_MODULE_SCOPE, - DIFFERENT_GROUPS_MODULE_SCOPE_DEFAULT) - output[k][DIFFERENT_GROUPS_RELATED_MODULE_SCOPE] = get_scalar_param( - sub_param_dict[k], - DIFFERENT_GROUPS_RELATED_MODULE_SCOPE, - DIFFERENT_GROUPS_RELATED_MODULE_SCOPE_DEFAULT) - - return output - - -def get_row_pruning(param_dict): - output = {} - if ROW_PRUNING not in param_dict.keys(): - param_dict[ROW_PRUNING] = {SHARED_PARAMETERS: {}, DIFFERENT_GROUPS: {}} - sub_param_dict = param_dict[ROW_PRUNING] - # shared parameters - output[SHARED_PARAMETERS] = get_row_pruning_shared_parameters(sub_param_dict) - # each sub-groups - if output[SHARED_PARAMETERS][ROW_PRUNING_ENABLED]: - assert DIFFERENT_GROUPS in sub_param_dict.keys(), f"Row Pruning is enabled, {DIFFERENT_GROUPS} must be specified" - output[DIFFERENT_GROUPS] = get_row_pruning_different_groups(sub_param_dict) - return output - - -def get_row_pruning_shared_parameters(param_dict): - output = {} - if SHARED_PARAMETERS in param_dict.keys(): - sub_param_dict = param_dict[SHARED_PARAMETERS] - output[ROW_PRUNING_ENABLED] = get_scalar_param(sub_param_dict, - ROW_PRUNING_ENABLED, - ROW_PRUNING_ENABLED_DEFAULT) - output[ROW_PRUNING_METHOD] = get_scalar_param(sub_param_dict, - ROW_PRUNING_METHOD, - ROW_PRUNING_METHOD_DEFAULT) - assert output[ROW_PRUNING_METHOD] in [ROW_PRUNING_METHOD_L1, ROW_PRUNING_METHOD_TOPK], f"Invalid row pruning method. 
Supported types: [{ROW_PRUNING_METHOD_L1}, {ROW_PRUNING_METHOD_TOPK}]" - output[ROW_PRUNING_SCHEDULE_OFFSET] = get_scalar_param( - sub_param_dict, - ROW_PRUNING_SCHEDULE_OFFSET, - ROW_PRUNING_SCHEDULE_OFFSET_DEFAULT) - else: - output[ROW_PRUNING_ENABLED] = ROW_PRUNING_ENABLED_DEFAULT - output[ROW_PRUNING_METHOD] = ROW_PRUNING_METHOD_DEFAULT - output[ROW_PRUNING_SCHEDULE_OFFSET] = ROW_PRUNING_SCHEDULE_OFFSET_DEFAULT - return output - - -def get_row_pruning_different_groups(param_dict): - output = {} - sub_param_dict = param_dict[DIFFERENT_GROUPS] - - def get_params(name, group_dict): - assert ROW_PRUNING_DENSE_RATIO in group_dict.keys(), f"{ROW_PRUNING_DENSE_RATIO} must be specified for row pruning group {name}" - return group_dict - - for k, v in sub_param_dict.items(): - output[k] = {} - output[k][DIFFERENT_GROUPS_PARAMETERS] = get_params( - k, - sub_param_dict[k][DIFFERENT_GROUPS_PARAMETERS]) - output[k][DIFFERENT_GROUPS_MODULE_SCOPE] = get_scalar_param( - sub_param_dict[k], - DIFFERENT_GROUPS_MODULE_SCOPE, - DIFFERENT_GROUPS_MODULE_SCOPE_DEFAULT) - output[k][DIFFERENT_GROUPS_RELATED_MODULE_SCOPE] = get_scalar_param( - sub_param_dict[k], - DIFFERENT_GROUPS_RELATED_MODULE_SCOPE, - DIFFERENT_GROUPS_RELATED_MODULE_SCOPE_DEFAULT) - return output - - -def get_head_pruning(param_dict): - output = {} - if HEAD_PRUNING not in param_dict.keys(): - param_dict[HEAD_PRUNING] = {SHARED_PARAMETERS: {}, DIFFERENT_GROUPS: {}} - sub_param_dict = param_dict[HEAD_PRUNING] - # shared parameters - output[SHARED_PARAMETERS] = get_head_pruning_shared_parameters(sub_param_dict) - # each sub-groups - if output[SHARED_PARAMETERS][HEAD_PRUNING_ENABLED]: - assert DIFFERENT_GROUPS in sub_param_dict.keys(), f"Head Pruning is enabled, {DIFFERENT_GROUPS} must be specified" - output[DIFFERENT_GROUPS] = get_head_pruning_different_groups(sub_param_dict) - return output - - -def get_head_pruning_shared_parameters(param_dict): - output = {} - if SHARED_PARAMETERS in param_dict.keys(): - sub_param_dict = param_dict[SHARED_PARAMETERS] - output[HEAD_PRUNING_ENABLED] = get_scalar_param(sub_param_dict, - HEAD_PRUNING_ENABLED, - HEAD_PRUNING_ENABLED_DEFAULT) - output[HEAD_PRUNING_METHOD] = get_scalar_param(sub_param_dict, - HEAD_PRUNING_METHOD, - HEAD_PRUNING_METHOD_DEFAULT) - assert output[HEAD_PRUNING_METHOD] in [HEAD_PRUNING_METHOD_L1, HEAD_PRUNING_METHOD_TOPK], f"Invalid head pruning method. 
Supported types: [{HEAD_PRUNING_METHOD_L1}, {HEAD_PRUNING_METHOD_TOPK}]" - output[HEAD_PRUNING_SCHEDULE_OFFSET] = get_scalar_param( - sub_param_dict, - HEAD_PRUNING_SCHEDULE_OFFSET, - HEAD_PRUNING_SCHEDULE_OFFSET_DEFAULT) - if output[HEAD_PRUNING_ENABLED]: - assert HEAD_PRUNING_NUM_HEADS in sub_param_dict.keys(), f"{HEAD_PRUNING_NUM_HEADS} must be specified for head pruning" - output[HEAD_PRUNING_NUM_HEADS] = sub_param_dict[HEAD_PRUNING_NUM_HEADS] - else: - output[HEAD_PRUNING_ENABLED] = HEAD_PRUNING_ENABLED_DEFAULT - output[HEAD_PRUNING_METHOD] = HEAD_PRUNING_METHOD_DEFAULT - output[HEAD_PRUNING_SCHEDULE_OFFSET] = HEAD_PRUNING_SCHEDULE_OFFSET_DEFAULT - return output - - -def get_head_pruning_different_groups(param_dict): - output = {} - sub_param_dict = param_dict[DIFFERENT_GROUPS] - - def get_params(name, group_dict): - assert HEAD_PRUNING_DENSE_RATIO in group_dict.keys(), f"dense_ratio must be specified for head pruning group {name}" - return group_dict - - for k, v in sub_param_dict.items(): - output[k] = {} - output[k][DIFFERENT_GROUPS_PARAMETERS] = get_params( - k, - sub_param_dict[k][DIFFERENT_GROUPS_PARAMETERS]) - output[k][DIFFERENT_GROUPS_MODULE_SCOPE] = get_scalar_param( - sub_param_dict[k], - DIFFERENT_GROUPS_MODULE_SCOPE, - DIFFERENT_GROUPS_MODULE_SCOPE_DEFAULT) - output[k][DIFFERENT_GROUPS_RELATED_MODULE_SCOPE] = get_scalar_param( - sub_param_dict[k], - DIFFERENT_GROUPS_RELATED_MODULE_SCOPE, - DIFFERENT_GROUPS_RELATED_MODULE_SCOPE_DEFAULT) - return output - - -def get_channel_pruning(param_dict): - output = {} - if CHANNEL_PRUNING not in param_dict.keys(): - param_dict[CHANNEL_PRUNING] = {SHARED_PARAMETERS: {}, DIFFERENT_GROUPS: {}} - sub_param_dict = param_dict[CHANNEL_PRUNING] - # shared parameters - output[SHARED_PARAMETERS] = get_channel_pruning_shared_parameters(sub_param_dict) - # each sub-groups - if output[SHARED_PARAMETERS][CHANNEL_PRUNING_ENABLED]: - assert DIFFERENT_GROUPS in sub_param_dict.keys(), f"Sparse Pruning is enabled, {DIFFERENT_GROUPS} must be specified" - output[DIFFERENT_GROUPS] = get_channel_pruning_different_groups(sub_param_dict) - return output - - -def get_channel_pruning_shared_parameters(param_dict): - output = {} - if SHARED_PARAMETERS in param_dict.keys(): - sub_param_dict = param_dict[SHARED_PARAMETERS] - output[CHANNEL_PRUNING_ENABLED] = get_scalar_param( - sub_param_dict, - CHANNEL_PRUNING_ENABLED, - CHANNEL_PRUNING_ENABLED_DEFAULT) - output[CHANNEL_PRUNING_METHOD] = get_scalar_param( - sub_param_dict, - CHANNEL_PRUNING_METHOD, - CHANNEL_PRUNING_METHOD_DEFAULT) - assert output[CHANNEL_PRUNING_METHOD] in [CHANNEL_PRUNING_METHOD_L1, CHANNEL_PRUNING_METHOD_TOPK], f"Invalid channel pruning method. 
Supported types: [{CHANNEL_PRUNING_METHOD_L1}, {CHANNEL_PRUNING_METHOD_TOPK}]" - output[CHANNEL_PRUNING_SCHEDULE_OFFSET] = get_scalar_param( - sub_param_dict, - CHANNEL_PRUNING_SCHEDULE_OFFSET, - CHANNEL_PRUNING_SCHEDULE_OFFSET_DEFAULT) - else: - output[CHANNEL_PRUNING_ENABLED] = CHANNEL_PRUNING_ENABLED_DEFAULT - output[CHANNEL_PRUNING_METHOD] = CHANNEL_PRUNING_METHOD_DEFAULT - output[CHANNEL_PRUNING_SCHEDULE_OFFSET] = CHANNEL_PRUNING_SCHEDULE_OFFSET_DEFAULT - return output - - -def get_channel_pruning_different_groups(param_dict): - output = {} - sub_param_dict = param_dict[DIFFERENT_GROUPS] - - def get_params(name, group_dict): - assert CHANNEL_PRUNING_DENSE_RATIO in group_dict.keys(), f"{CHANNEL_PRUNING_DENSE_RATIO} must be specified for channel pruning group {name}" - return group_dict - - for k, v in sub_param_dict.items(): - output[k] = {} - output[k][DIFFERENT_GROUPS_PARAMETERS] = get_params( - k, - sub_param_dict[k][DIFFERENT_GROUPS_PARAMETERS]) - output[k][DIFFERENT_GROUPS_MODULE_SCOPE] = get_scalar_param( - sub_param_dict[k], - DIFFERENT_GROUPS_MODULE_SCOPE, - DIFFERENT_GROUPS_MODULE_SCOPE_DEFAULT) - output[k][DIFFERENT_GROUPS_RELATED_MODULE_SCOPE] = get_scalar_param( - sub_param_dict[k], - DIFFERENT_GROUPS_RELATED_MODULE_SCOPE, - DIFFERENT_GROUPS_RELATED_MODULE_SCOPE_DEFAULT) - - return output +from pydantic import Field +from typing import Dict, List, Any +from enum import Enum +from deepspeed.runtime.config_utils import DeepSpeedConfigModel + + +class QuantizationTypeEnum(str, Enum): + symmetric = "symmetric" + asymmetric = "asymmetric" + + +class QuantizationRoundingEnum(str, Enum): + nearest = "nearest" + stochastic = "stochastic" + + +class QuantizationRangeEnum(str, Enum): + dynamic = "dynamic" + static = "static" + + +class PruningMethodEnum(str, Enum): + l1 = "l1" + topk = "topk" + + +""" Weights """ + + +class FP16MixedQuantizeConfig(DeepSpeedConfigModel): + enabled: bool = False + quantize_change_ratio: float = Field(0.001, ge=0) + + +class WeightQuantizationSharedParamsConfig(DeepSpeedConfigModel): + enabled: bool = False + quantizer_kernel: bool = False + schedule_offset: int = Field(0, ge=0) + quantize_groups: int = Field(1, gt=0) + quantize_verbose: bool = False + quantization_type: QuantizationTypeEnum = QuantizationTypeEnum.symmetric + quantize_weight_in_forward: bool = False + rounding: QuantizationRoundingEnum = QuantizationRoundingEnum.nearest + fp16_mixed_quantize: FP16MixedQuantizeConfig = {} + + +class WeightQuantizationDifferentGroupsParamsConfig(DeepSpeedConfigModel): + start_bits: int + target_bits: int + quantization_period: int = Field(1, ge=0) + + +class WeightQuantizationDifferentGroupsConfig(DeepSpeedConfigModel): + params: WeightQuantizationDifferentGroupsParamsConfig + modules: List[str] = ["*"] + related_modules: List[str] = None + + +class WeightQuantizationConfig(DeepSpeedConfigModel): + shared_parameters: WeightQuantizationSharedParamsConfig = {} + different_groups: Dict[str, WeightQuantizationDifferentGroupsConfig] = {} + + +""" Activation """ + + +class ActivationQuantizationSharedParamsConfig(DeepSpeedConfigModel): + enabled: bool = False + quantization_type: QuantizationTypeEnum = QuantizationTypeEnum.symmetric + range_calibration: QuantizationRangeEnum = QuantizationRangeEnum.dynamic + schedule_offset: int = Field(1000, ge=0) + + +class ActivationQuantizationDifferentGroupsParamConfig(DeepSpeedConfigModel): + bits: int + + +class ActivationQuantizationDifferentGroupsConfig(DeepSpeedConfigModel): + params: 
ActivationQuantizationDifferentGroupsParamConfig + modules: List[str] = ["*"] + related_modules: Any = None + + +class ActivationQuantizationConfig(DeepSpeedConfigModel): + shared_parameters: ActivationQuantizationSharedParamsConfig = {} + different_groups: Dict[str, ActivationQuantizationDifferentGroupsConfig] = {} + + +""" Pruning """ + + +class PruningSharedParamsConfig(DeepSpeedConfigModel): + enabled: bool = False + method: PruningMethodEnum = PruningMethodEnum.l1 + schedule_offset: int = Field(1000, ge=0) + + +class PruningDifferentGroupsParamConfig(DeepSpeedConfigModel): + dense_ratio: float + + +class PruningDifferentGroupsConfig(DeepSpeedConfigModel): + params: PruningDifferentGroupsParamConfig + modules: List[str] = ["*"] + related_modules: Any = None + + +class PruningConfig(DeepSpeedConfigModel): + shared_parameters: PruningSharedParamsConfig = {} + different_groups: Dict[str, PruningDifferentGroupsConfig] = {} + + +# Head pruning is slightly different: + + +class HeadPruningSharedParamsConfig(DeepSpeedConfigModel): + enabled: bool = False + method: PruningMethodEnum = PruningMethodEnum.l1 + schedule_offset: int = Field(1000, ge=0) + num_heads: int = Field(None, ge=0) + + +class HeadPruningConfig(DeepSpeedConfigModel): + shared_parameters: HeadPruningSharedParamsConfig = {} + different_groups: Dict[str, PruningDifferentGroupsConfig] = {} + + +""" Layer Reduction """ + + +class LayerReductionConfig(DeepSpeedConfigModel): + enabled: bool = False + keep_number_layer: int = Field(None, ge=0) + module_name_prefix: str = "" + teacher_layer: List[int] = [] + other_module_name: List[str] = [] + + +""" Compression Config """ + + +class DeepSpeedCompressionConfig(DeepSpeedConfigModel): + weight_quantization: WeightQuantizationConfig = {} + activation_quantization: ActivationQuantizationConfig = {} + sparse_pruning: PruningConfig = {} + row_pruning: PruningConfig = {} + head_pruning: HeadPruningConfig = {} + channel_pruning: PruningConfig = {} + layer_reduction: LayerReductionConfig = {} diff --git a/deepspeed/monitor/config.py b/deepspeed/monitor/config.py index 709830f27e98..555be7d27624 100644 --- a/deepspeed/monitor/config.py +++ b/deepspeed/monitor/config.py @@ -3,46 +3,39 @@ Licensed under the MIT license. 
""" -from pydantic import BaseModel -from .constants import * +from types import SimpleNamespace +from deepspeed.runtime.config_utils import DeepSpeedConfigModel +TENSORBOARD = "tensorboard" +WANDB = "wandb" +CSV_MONITOR = "csv_monitor" -class MonitorConfig(BaseModel): - class Config: - validate_all = True - validate_assignment = True - use_enum_values = True - extra = 'forbid' +def get_monitor_config(param_dict): + tensorboard_config_dict = param_dict.get(TENSORBOARD, {}) + wandb_config_dict = param_dict.get(WANDB, {}) + csv_monitor_config_dict = param_dict.get(CSV_MONITOR, {}) + monitor_config = SimpleNamespace( + tensorboard=TensorBoardConfig(**tensorboard_config_dict), + wandb=WandbConfig(**wandb_config_dict), + csv_monitor=CSVMonitorConfig(**csv_monitor_config_dict)) + return monitor_config -class TensorBoardConfig(MonitorConfig): - enabled: bool = TENSORBOARD_ENABLED_DEFAULT - output_path: str = TENSORBOARD_OUTPUT_PATH_DEFAULT - job_name: str = TENSORBOARD_JOB_NAME_DEFAULT +class TensorBoardConfig(DeepSpeedConfigModel): + enabled: bool = False + output_path: str = "" + job_name: str = "DeepSpeedJobName" -class WandbConfig(MonitorConfig): - enabled: bool = WANDB_ENABLED_DEFAULT - group: str = WANDB_GROUP_NAME_DEFAULT - team: str = WANDB_TEAM_NAME_DEFAULT - project: str = WANDB_PROJECT_NAME_DEFAULT +class WandbConfig(DeepSpeedConfigModel): + enabled: bool = False + group: str = None + team: str = None + project: str = "deepspeed" -class CSVConfig(MonitorConfig): - enabled: bool = CSV_MONITOR_ENABLED_DEFAULT - output_path: str = CSV_MONITOR_OUTPUT_PATH_DEFAULT - job_name: str = CSV_MONITOR_JOB_NAME_DEFAULT - -class DeepSpeedMonitorConfig: - def __init__(self, ds_config): - self.tensorboard_enabled = 'tensorboard' in ds_config - self.wandb_enabled = 'wandb' in ds_config - self.csv_monitor_enabled = 'csv_monitor' in ds_config - - if self.tensorboard_enabled: - self.tensorboard_config = TensorBoardConfig(**ds_config['tensorboard']) - if self.wandb_enabled: - self.wandb_config = WandbConfig(**ds_config['wandb']) - if self.csv_monitor_enabled: - self.csv_monitor_config = CSVConfig(**ds_config['csv_monitor']) +class CSVMonitorConfig(DeepSpeedConfigModel): + enabled: bool = False + output_path: str = "" + job_name: str = "DeepSpeedJobName" diff --git a/deepspeed/monitor/csv_monitor.py b/deepspeed/monitor/csv_monitor.py index 1425f1d56f66..67b8d6da0d1f 100644 --- a/deepspeed/monitor/csv_monitor.py +++ b/deepspeed/monitor/csv_monitor.py @@ -5,12 +5,12 @@ class csvMonitor(Monitor): - def __init__(self, monitor_config): - super().__init__(monitor_config) + def __init__(self, csv_monitor_config): + super().__init__(csv_monitor_config) self.filenames = [] - self.enabled = monitor_config.csv_monitor_config.enabled - self.output_path = monitor_config.csv_monitor_config.output_path - self.job_name = monitor_config.csv_monitor_config.job_name + self.enabled = csv_monitor_config.enabled + self.output_path = csv_monitor_config.output_path + self.job_name = csv_monitor_config.job_name self.log_dir = self.setup_log_dir() def setup_log_dir(self, base=os.path.join(os.path.expanduser("~"), "csv_monitor")): diff --git a/deepspeed/monitor/monitor.py b/deepspeed/monitor/monitor.py index a5ac271861ff..6cc1731a8f1b 100644 --- a/deepspeed/monitor/monitor.py +++ b/deepspeed/monitor/monitor.py @@ -27,15 +27,15 @@ def __init__(self, monitor_config): self.tb_monitor = None self.wandb_monitor = None self.csv_monitor = None - self.enabled = monitor_config.tensorboard_enabled or monitor_config.csv_monitor_enabled or 
monitor_config.wandb_enabled + self.enabled = monitor_config.tensorboard.enabled or monitor_config.csv_monitor.enabled or monitor_config.wandb.enabled if dist.get_rank() == 0: if monitor_config.tensorboard_enabled: - self.tb_monitor = TensorBoardMonitor(monitor_config) + self.tb_monitor = TensorBoardMonitor(monitor_config.tensorboard) if monitor_config.wandb_enabled: - self.wandb_monitor = WandbMonitor(monitor_config) + self.wandb_monitor = WandbMonitor(monitor_config.wandb) if monitor_config.csv_monitor_enabled: - self.csv_monitor = csvMonitor(monitor_config) + self.csv_monitor = csvMonitor(monitor_config.csv_monitor) def write_events(self, event_list): if dist.get_rank() == 0: diff --git a/deepspeed/monitor/tensorboard.py b/deepspeed/monitor/tensorboard.py index 447143e53b05..8a2a777b4c37 100644 --- a/deepspeed/monitor/tensorboard.py +++ b/deepspeed/monitor/tensorboard.py @@ -6,14 +6,14 @@ class TensorBoardMonitor(Monitor): - def __init__(self, monitor_config): - super().__init__(monitor_config) + def __init__(self, tensorboard_config): + super().__init__(tensorboard_config) check_tb_availability() self.summary_writer = None - self.enabled = monitor_config.tensorboard_config.enabled - self.output_path = monitor_config.tensorboard_config.output_path - self.job_name = monitor_config.tensorboard_config.job_name + self.enabled = tensorboard_config.enabled + self.output_path = tensorboard_config.output_path + self.job_name = tensorboard_config.job_name if self.enabled and dist.get_rank() == 0: self.get_summary_writer() diff --git a/deepspeed/monitor/wandb.py b/deepspeed/monitor/wandb.py index 63f5879633b5..85c9c23c821b 100644 --- a/deepspeed/monitor/wandb.py +++ b/deepspeed/monitor/wandb.py @@ -5,15 +5,15 @@ class WandbMonitor(Monitor): - def __init__(self, monitor_config): - super().__init__(monitor_config) + def __init__(self, wandb_config): + super().__init__(wandb_config) check_wandb_availability() import wandb - self.enabled = monitor_config.wandb_config.enabled - self.group = monitor_config.wandb_config.group - self.team = monitor_config.wandb_config.team - self.project = monitor_config.wandb_config.project + self.enabled = wandb_config.enabled + self.group = wandb_config.group + self.team = wandb_config.team + self.project = wandb_config.project if self.enabled and dist.get_rank() == 0: wandb.init(project=self.project, group=self.group, entity=self.team) diff --git a/deepspeed/runtime/activation_checkpointing/config.py b/deepspeed/runtime/activation_checkpointing/config.py index 0ab59ac64eea..7a61aa482b09 100755 --- a/deepspeed/runtime/activation_checkpointing/config.py +++ b/deepspeed/runtime/activation_checkpointing/config.py @@ -3,101 +3,21 @@ Licensed under the MIT license. """ -from deepspeed.runtime.config_utils import get_scalar_param, DeepSpeedConfigObject +from pydantic import Field +from deepspeed.runtime.config_utils import DeepSpeedConfigModel -######################################### -# DeepSpeed Activation Checkpointing -######################################### -# Activation Checkpointing Allows to save memory by only keeping a select few -#activations for the backpropagation. 
-ACTIVATION_CHKPT_FORMAT = ''' -Activation Checkpointing should be configured as: -"session_params": { - "activation_checkpointing": { - "partitioned_activations": [true|false], - "number_checkpoints": 100, - "contiguous_memory_optimization": [true|false], - "cpu_checkpointing": [true|false] - "profile": [true|false], - "synchronize_checkpoint_boundary": [true|false], - } -} -''' +ACT_CHKPT = "activation_checkpointing" -ACT_CHKPT_PARTITION_ACTIVATIONS = 'partition_activations' -ACT_CHKPT_PARTITION_ACTIVATIONS_DEFAULT = False -ACT_CHKPT_NUMBER_CHECKPOINTS = 'number_checkpoints' -ACT_CHKPT_NUMBER_CHECKPOINTS_DEFAULT = None +def get_activation_checkpointing_config(param_dict): + act_chkpt_config_dict = param_dict.get(ACT_CHKPT, {}) + return DeepSpeedActivationCheckpointingConfig(**act_chkpt_config_dict) -ACT_CHKPT_CONTIGUOUS_MEMORY_OPTIMIZATION = 'contiguous_memory_optimization' -ACT_CHKPT_CONTIGUOUS_MEMORY_OPTIMIZATION_DEFAULT = False -ACT_CHKPT_SYNCHRONIZE_CHECKPOINT_BOUNDARY = 'synchronize_checkpoint_boundary' -ACT_CHKPT_SYNCHRONIZE_CHECKPOINT_BOUNDARY_DEFAULT = False - -ACT_CHKPT_PROFILE = 'profile' -ACT_CHKPT_PROFILE_DEFAULT = False - -ACT_CHKPT_CPU_CHECKPOINTING = 'cpu_checkpointing' -ACT_CHKPT_CPU_CHECKPOINTING_DEFAULT = False - -ACT_CHKPT = 'activation_checkpointing' - -ACT_CHKPT_DEFAULT = { - ACT_CHKPT_PARTITION_ACTIVATIONS: ACT_CHKPT_PARTITION_ACTIVATIONS_DEFAULT, - ACT_CHKPT_NUMBER_CHECKPOINTS: ACT_CHKPT_NUMBER_CHECKPOINTS_DEFAULT, - ACT_CHKPT_CONTIGUOUS_MEMORY_OPTIMIZATION: - ACT_CHKPT_CONTIGUOUS_MEMORY_OPTIMIZATION_DEFAULT, - ACT_CHKPT_SYNCHRONIZE_CHECKPOINT_BOUNDARY: - ACT_CHKPT_SYNCHRONIZE_CHECKPOINT_BOUNDARY_DEFAULT, - ACT_CHKPT_PROFILE: ACT_CHKPT_PROFILE_DEFAULT, - ACT_CHKPT_CPU_CHECKPOINTING: ACT_CHKPT_CPU_CHECKPOINTING_DEFAULT -} - - -class DeepSpeedActivationCheckpointingConfig(DeepSpeedConfigObject): - def __init__(self, param_dict): - super(DeepSpeedActivationCheckpointingConfig, self).__init__() - - self.partition_activations = None - self.contiguous_memory_optimization = None - self.cpu_checkpointing = None - self.number_checkpoints = None - self.synchronize_checkpoint_boundary = None - self.profile = None - - if ACT_CHKPT in param_dict.keys(): - act_chkpt_config_dict = param_dict[ACT_CHKPT] - else: - act_chkpt_config_dict = ACT_CHKPT_DEFAULT - - self._initialize(act_chkpt_config_dict) - - def _initialize(self, act_chkpt_config_dict): - self.partition_activations = get_scalar_param( - act_chkpt_config_dict, - ACT_CHKPT_PARTITION_ACTIVATIONS, - ACT_CHKPT_PARTITION_ACTIVATIONS_DEFAULT) - - self.contiguous_memory_optimization = get_scalar_param( - act_chkpt_config_dict, - ACT_CHKPT_CONTIGUOUS_MEMORY_OPTIMIZATION, - ACT_CHKPT_CONTIGUOUS_MEMORY_OPTIMIZATION_DEFAULT) - - self.cpu_checkpointing = get_scalar_param(act_chkpt_config_dict, - ACT_CHKPT_CPU_CHECKPOINTING, - ACT_CHKPT_CPU_CHECKPOINTING_DEFAULT) - - self.number_checkpoints = get_scalar_param(act_chkpt_config_dict, - ACT_CHKPT_NUMBER_CHECKPOINTS, - ACT_CHKPT_NUMBER_CHECKPOINTS_DEFAULT) - - self.profile = get_scalar_param(act_chkpt_config_dict, - ACT_CHKPT_PROFILE, - ACT_CHKPT_PROFILE_DEFAULT) - - self.synchronize_checkpoint_boundary = get_scalar_param( - act_chkpt_config_dict, - ACT_CHKPT_SYNCHRONIZE_CHECKPOINT_BOUNDARY, - ACT_CHKPT_SYNCHRONIZE_CHECKPOINT_BOUNDARY_DEFAULT) +class DeepSpeedActivationCheckpointingConfig(DeepSpeedConfigModel): + partition_activations: bool = False + contiguous_memory_optimization: bool = False + cpu_checkpointing: bool = False + number_checkpoints: int = Field(None, gt=0) + 
synchronize_checkpoint_boundary: bool = False + profile: bool = False diff --git a/deepspeed/runtime/config.py b/deepspeed/runtime/config.py index 6d377e5dc6a2..6ad58fc781ef 100755 --- a/deepspeed/runtime/config.py +++ b/deepspeed/runtime/config.py @@ -22,9 +22,9 @@ ScientificNotationEncoder, ) from .zero.config import get_zero_config, ZeroStageEnum -from .activation_checkpointing.config import DeepSpeedActivationCheckpointingConfig -from ..comm.config import DeepSpeedCommsConfig -from ..monitor.config import DeepSpeedMonitorConfig +from .activation_checkpointing.config import get_activation_checkpointing_config +from ..comm.config import get_comms_config +from ..monitor.config import get_monitor_config from deepspeed import comm as dist @@ -818,11 +818,11 @@ def _initialize_params(self, param_dict): self.zero_optimization_stage = self.zero_config.stage self.zero_enabled = self.zero_optimization_stage > 0 - self.activation_checkpointing_config = DeepSpeedActivationCheckpointingConfig( + self.activation_checkpointing_config = get_activation_checkpointing_config( param_dict) - self.comms_config = DeepSpeedCommsConfig(param_dict) - self.monitor_config = DeepSpeedMonitorConfig(param_dict) + self.comms_config = get_comms_config(param_dict) + self.monitor_config = get_monitor_config(param_dict) self.gradient_clipping = get_gradient_clipping(param_dict) self.fp16_enabled = get_fp16_enabled(param_dict) From e3f1f132eb495764c078e85cc2fc0d212ba207d8 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Thu, 11 Aug 2022 14:31:09 -0700 Subject: [PATCH 02/13] fix for compression configs --- deepspeed/compression/config.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/deepspeed/compression/config.py b/deepspeed/compression/config.py index 89bb1ca2c44f..31aef72f9171 100644 --- a/deepspeed/compression/config.py +++ b/deepspeed/compression/config.py @@ -3,6 +3,13 @@ from enum import Enum from deepspeed.runtime.config_utils import DeepSpeedConfigModel +COMPRESSION_TRAINING = "compression_training" + + +def get_compression_config(param_dict): + compression_config_dict = param_dict.get(COMPRESSION_TRAINING, {}) + return DeepSpeedCompressionConfig(**compression_config_dict) + class QuantizationTypeEnum(str, Enum): symmetric = "symmetric" From 609d180521d0ad7ba1e50b865d7fc3557568a7e8 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Thu, 11 Aug 2022 15:12:33 -0700 Subject: [PATCH 03/13] more compression fixes --- deepspeed/compression/compress.py | 31 +++++++++++++++---------------- deepspeed/runtime/config.py | 9 +++++---- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/deepspeed/compression/compress.py b/deepspeed/compression/compress.py index 84ee53aab921..13ce66dc179d 100644 --- a/deepspeed/compression/compress.py +++ b/deepspeed/compression/compress.py @@ -1,8 +1,8 @@ import re from .helper import compression_preparation, fix_compression, recursive_getattr, is_module_compressible -from .config import get_compression_config from ..runtime.config_utils import dict_raise_error_on_duplicate_keys from .constants import * +from .config import get_compression_config import os import json @@ -49,21 +49,20 @@ def get_module_name(group_name, def get_compress_methods(model, compress_methods, mpu=None): # extract the compression module for each method in compress_methods layer_added_compress_methods = [] - for method, method_content in compress_methods.items(): + for method, method_content in compress_methods: if LAYER_REDUCTION in method: continue # for loop different methods, i.e., weight 
quantization, activation quantization etc exist_module_name = set() - shared_parameters = method_content[ - SHARED_PARAMETERS] # get all the shared parameters - for group_name, method_parameters in method_content[DIFFERENT_GROUPS].items(): + shared_parameters = method_content.shared_parameters # get all the shared parameters + for group_name, method_parameters in method_content.different_groups.items(): # for loop different groups, i.e., weight quantization group 1, weight quantization group 2 etc module_name_list = [] related_module_name_list = [] - if method_parameters[DIFFERENT_GROUPS_RELATED_MODULE_SCOPE]: + if method_parameters.related_modules: # this is used for head/row/channel pruning, if users provide the related module scope, we can shrink the layer dim for them # otherwise we just mask those as zeros - for key_word, related_key_words in zip(method_parameters[DIFFERENT_GROUPS_MODULE_SCOPE], method_parameters[DIFFERENT_GROUPS_RELATED_MODULE_SCOPE]): + for key_word, related_key_words in zip(method_parameters.modules, method_parameters.related_modules): module_name, exist_module_name = get_module_name(group_name, model, key_word, exist_module_name, mpu=mpu) module_name_list.append(module_name) tmp_related_module_name_list = [] @@ -73,15 +72,15 @@ def get_compress_methods(model, compress_methods, mpu=None): tmp_related_module_name_list.append(module_name) related_module_name_list.append(tmp_related_module_name_list) else: - for key_word in method_parameters[DIFFERENT_GROUPS_MODULE_SCOPE]: + for key_word in method_parameters.modules: module_name, exist_module_name = get_module_name(group_name, model, key_word, exist_module_name, mpu=mpu) module_name_list.append(module_name) if module_name_list: # combine shared parameters with each group combined_method_parameters = { - **(method_parameters.copy().pop(DIFFERENT_GROUPS_PARAMETERS)), - **shared_parameters + **method_parameters.dict().pop(DIFFERENT_GROUPS_PARAMETERS), + **shared_parameters.dict() } compression_item = [ module_name_list, @@ -112,7 +111,7 @@ def init_compression(model, deepspeed_config, teacher_model=None, mpu=None): c_model = model # For layer reduction - if compress_methods[LAYER_REDUCTION][LAYER_REDUCTION_ENABLED]: + if compress_methods.layer_reduction.enabled: assert teacher_model is not None, "Teacher model is required for layer reduction" student_initialization(c_model, teacher_model, deepspeed_config) @@ -135,7 +134,7 @@ def redundancy_clean(model, deepspeed_config, mpu=None): mpu The mpu module for Row/Column parallelism """ - compress_methods = get_compression_config(check_deepspeed_config(deepspeed_config)) + compress_methods = deepspeed_config.commpression_config if hasattr(model, 'module'): c_model = model.module else: @@ -191,12 +190,12 @@ def student_initialization(student_model, teacher_model, deepspeed_config): The path of ds_config ''' config = get_compression_config(check_deepspeed_config(deepspeed_config)) - compress_methods = config[LAYER_REDUCTION] + compress_methods = config.layer_reduction - module_name_prefix = compress_methods[MODULE_NAME_PREFIX] - teacher_layer = compress_methods[TEACHER_LAYER] + module_name_prefix = compress_methods.module_name_prefix + teacher_layer = compress_methods.teacher_layer student_layer = [i for i in range(len(teacher_layer))] - other_module_name = compress_methods[OTHER_MODULE_NAME] + other_module_name = compress_methods.other_module_name ''' name_prefix (`str`) The prefix name before the layer #. 
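For reference, a minimal sketch of how the converted compression config is meant to be consumed after this change. It assumes DeepSpeedConfigModel keeps the pydantic BaseModel behavior of the config classes it replaces (attribute access, iteration yielding (field, value) pairs, validated defaults); the JSON values below are invented for illustration only.

from deepspeed.compression.config import get_compression_config

ds_config = {
    "compression_training": {
        "weight_quantization": {
            "shared_parameters": {"enabled": True, "quantize_groups": 4},
            "different_groups": {
                "wq1": {
                    "params": {"start_bits": 8, "target_bits": 4},
                    "modules": ["attention.self"],
                },
            },
        },
    },
}

compression_config = get_compression_config(ds_config)

# Dict-style lookups such as config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS] become
# attribute access on nested sub-configs.
shared = compression_config.weight_quantization.shared_parameters
print(shared.enabled, shared.quantize_groups, shared.quantization_type)

group = compression_config.weight_quantization.different_groups["wq1"]
print(group.params.target_bits, group.modules)

# Iterating the top-level model yields (field_name, sub_config) pairs, which is the
# pattern get_compress_methods() and the compression scheduler now rely on.
for method, method_content in compression_config:
    print(method, type(method_content).__name__)

Keeping defaults and range checks inside the model classes is what lets this diff delete the parallel *_DEFAULT constants and the repeated get_scalar_param calls.
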
diff --git a/deepspeed/runtime/config.py b/deepspeed/runtime/config.py index 6ad58fc781ef..bc1311a884a1 100755 --- a/deepspeed/runtime/config.py +++ b/deepspeed/runtime/config.py @@ -51,7 +51,7 @@ from ..autotuning.config import DeepSpeedAutotuningConfig from ..nebula.config import DeepSpeedNebulaConfig -from ..compression.config import get_compression_config, get_quantize_enabled +from ..compression.config import get_compression_config from ..compression.constants import * from .swap_tensor.aio_config import get_aio_config @@ -551,8 +551,8 @@ def get_memory_breakdown(param_dict): return get_scalar_param(param_dict, MEMORY_BREAKDOWN, MEMORY_BREAKDOWN_DEFAULT) -def get_eigenvalue_config(param_dict): - if get_quantize_enabled(param_dict): +def get_eigenvalue_config(param_dict, compression_config): + if compression_config.weight_quantization.shared_parameters.enabled: param_dict = param_dict[QUANTIZE_TRAINING] assert not get_eigenvalue_enabled(param_dict), "Eigenvalue based MoQ is temporarily disabled" return ( @@ -868,7 +868,8 @@ def _initialize_params(self, param_dict): self.eigenvalue_gas_boundary_resolution, self.eigenvalue_layer_name, self.eigenvalue_layer_num, - ) = get_eigenvalue_config(param_dict) + ) = get_eigenvalue_config(param_dict, + self.compression_config) self.sparse_attention = get_sparse_attention(param_dict) self.pipeline = get_pipeline_config(param_dict) From 64ebfd10cf79d34a38ba1940fb98f9273fe4d706 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Thu, 11 Aug 2022 16:00:53 -0700 Subject: [PATCH 04/13] converted more configs --- deepspeed/autotuning/config.py | 185 +++++++++-------------------- deepspeed/comm/comm.py | 2 +- deepspeed/nebula/config.py | 56 ++------- deepspeed/profiling/config.py | 52 ++------ deepspeed/runtime/config.py | 12 +- tests/unit/monitor/test_monitor.py | 12 +- 6 files changed, 95 insertions(+), 224 deletions(-) diff --git a/deepspeed/autotuning/config.py b/deepspeed/autotuning/config.py index dea36f0377dc..d43d99e8d502 100644 --- a/deepspeed/autotuning/config.py +++ b/deepspeed/autotuning/config.py @@ -3,132 +3,59 @@ Licensed under the MIT license. 
""" -from deepspeed.runtime.config_utils import get_scalar_param, get_dict_param, DeepSpeedConfigObject -from deepspeed.autotuning.constants import * - - -class DeepSpeedAutotuningConfig(DeepSpeedConfigObject): - def __init__(self, param_dict): - super(DeepSpeedAutotuningConfig, self).__init__() - - self.enabled = None - self.start_step = None - self.end_step = None - self.metric_path = None - self.arg_mappings = None - self.metric = None - self.model_info = None - self.results_dir = None - self.exps_dir = None - self.overwrite = None - - if param_dict and AUTOTUNING in param_dict.keys(): - autotuning_dict = param_dict[AUTOTUNING] - else: - autotuning_dict = {} - - self._initialize(autotuning_dict) - - def _initialize(self, autotuning_dict): - self.enabled = get_scalar_param(autotuning_dict, - AUTOTUNING_ENABLED, - AUTOTUNING_ENABLED_DEFAULT) - - self.fast = get_scalar_param(autotuning_dict, - AUTOTUNING_FAST, - AUTOTUNING_FAST_DEFAULT) - - self.results_dir = get_scalar_param(autotuning_dict, - AUTOTUNING_RESULTS_DIR, - AUTOTUNING_RESULTS_DIR_DEFAULT) - - self.exps_dir = get_scalar_param(autotuning_dict, - AUTOTUNING_EXPS_DIR, - AUTOTUNING_EXPS_DIR_DEFAULT) - - self.overwrite = get_scalar_param(autotuning_dict, - AUTOTUNING_OVERWRITE, - AUTOTUNING_OVERWRITE_DEFAULT) - - self.start_profile_step = get_scalar_param( - autotuning_dict, - AUTOTUNING_START_PROFILE_STEP, - AUTOTUNING_START_PROFILE_STEP_DEFAULT) - - self.end_profile_step = get_scalar_param(autotuning_dict, - AUTOTUNING_END_PROFILE_STEP, - AUTOTUNING_END_PROFILE_STEP_DEFAULT) - - self.metric = get_scalar_param(autotuning_dict, - AUTOTUNING_METRIC, - AUTOTUNING_METRIC_DEFAULT) - - self.metric_path = get_scalar_param(autotuning_dict, - AUTOTUNING_METRIC_PATH, - AUTOTUNING_METRIC_PATH_DEFAULT) - - self.tuner_type = get_scalar_param(autotuning_dict, - AUTOTUNING_TUNER_TYPE, - AUTOTUNING_TUNER_TYPE_DEFAULT) - - self.tuner_early_stopping = get_scalar_param( - autotuning_dict, - AUTOTUNING_TUNER_EARLY_STOPPING, - AUTOTUNING_TUNER_EARLY_STOPPING_DEFAULT) - - self.tuner_num_trials = get_scalar_param(autotuning_dict, - AUTOTUNING_TUNER_NUM_TRIALS, - AUTOTUNING_TUNER_NUM_TRIALS_DEFAULT) - - self.arg_mappings = get_dict_param(autotuning_dict, - AUTOTUNING_ARG_MAPPINGS, - AUTOTUNING_ARG_MAPPINGS_DEFAULT) - - self.model_info = get_model_info_config(autotuning_dict) - - self.model_info_path = get_scalar_param(autotuning_dict, - AUTOTUNING_MODEL_INFO_PATH, - AUTOTUNING_MODEL_INFO_PATH_DEFAULT) - self.mp_size = get_scalar_param(autotuning_dict, - AUTOTUNING_MP_SIZE, - AUTOTUNING_MP_SIZE_DEFAULT) - - self.max_train_batch_size = get_dict_param( - autotuning_dict, - AUTOTUNING_MAX_TRAIN_BATCH_SIZE, - AUTOTUNING_MAX_TRAIN_BATCH_SIZE_DEFAULT) - - self.min_train_batch_size = get_dict_param( - autotuning_dict, - AUTOTUNING_MIN_TRAIN_BATCH_SIZE, - AUTOTUNING_MIN_TRAIN_BATCH_SIZE_DEFAULT) - - self.max_train_micro_batch_size_per_gpu = get_dict_param( - autotuning_dict, - AUTOTUNING_MAX_TRAIN_MICRO_BATCH_SIZE_PER_GPU, - AUTOTUNING_MAX_TRAIN_MICRO_BATCH_SIZE_PER_GPU_DEFAULT) - - self.min_train_micro_batch_size_per_gpu = get_dict_param( - autotuning_dict, - AUTOTUNING_MIN_TRAIN_MICRO_BATCH_SIZE_PER_GPU, - AUTOTUNING_MIN_TRAIN_MICRO_BATCH_SIZE_PER_GPU_DEFAULT) - - self.num_tuning_micro_batch_sizes = get_dict_param( - autotuning_dict, - AUTOTUNING_NUM_TUNING_MICRO_BATCH_SIZES, - AUTOTUNING_NUM_TUNING_MICRO_BATCH_SIZES_DEFAULT) - - -def get_model_info_config(param_dict): - if MODEL_INFO in param_dict and param_dict[MODEL_INFO] is not None: - model_info_config = {} 
- for key, default_value in MODEL_INFO_KEY_DEFAULT_DICT.items(): - model_info_config[key] = get_scalar_param(param_dict[MODEL_INFO], - key, - default_value) - return model_info_config - return None - - -def get_default_model_info_config(): - return MODEL_INFO_KEY_DEFAULT_DICT +from pydantic import Field +from deepspeed.runtime.config_utils import DeepSpeedConfigModel +from enum import Enum + +AUTOTUNING = "autotuning" + + +def get_autotuning_config(param_dict): + autotuning_config_dict = param_dict.get(AUTOTUNING, {}) + return DeepSpeedAutotuningConfig(**autotuning_config_dict) + + +class MetricEnum(str, Enum): + latency = "latency" + throughput = "throughput" + flops = "flops" + forward = "forward" + backward = "backward" + steps = "steps" + + +class TunerTypeEnum(str, Enum): + gridsearch = "gridsearch" + random = "random" + model_based = "model_based" + + +class ModelInfoConfig(DeepSpeedConfigModel): + profile: bool = False + num_params: int = Field(None, ge=0) + hidden_size: int = Field(None, ge=0) + num_layers: int = Field(None, ge=0) + + +class DeepSpeedAutotuningConfig(DeepSpeedConfigModel): + enabled: bool = False + fast: bool = True + results_dir: str = None # Should this be Path dtype? + exps_dir: str = None + overwrite: bool = True + start_step: int = Field(3, ge=0, alias="start_profile_step") + end_step: int = Field(5, ge=0, alias="end_profile_step") + metric: MetricEnum = MetricEnum.throughput + metric_path: str = None + tuner_type: TunerTypeEnum = TunerTypeEnum.gridsearch + tuner_early_stopping: int = Field(5, ge=0) + tuner_num_trials: int = Field(50, gt=0) + arg_mapping: str = None + model_info: ModelInfoConfig = ModelInfoConfig() + model_info_path: str = None + mp_size: int = Field(1, gt=0) + max_train_batch_size: int = Field(None, gt=0) + min_train_batch_size: int = Field(1, gt=0) + max_train_micro_batch_size_per_gpu: int = Field(1024, gt=0) + min_train_micro_batch_size_per_gpu: int = Field(1, gt=0) + num_tuning_micro_batch_sizes: int = Field(3, gt=0) diff --git a/deepspeed/comm/comm.py b/deepspeed/comm/comm.py index ac7ea8f55c90..a1f5b7a0e83e 100644 --- a/deepspeed/comm/comm.py +++ b/deepspeed/comm/comm.py @@ -75,7 +75,7 @@ class ReduceOp(Enum): def _configure_using_config_file(config): - if config.comms_logger_enabled: + if config.enabled: comms_logger.configure(config) diff --git a/deepspeed/nebula/config.py b/deepspeed/nebula/config.py index f9928d66147c..80d7db109b8f 100644 --- a/deepspeed/nebula/config.py +++ b/deepspeed/nebula/config.py @@ -3,51 +3,21 @@ Licensed under the MIT license. 
""" -from deepspeed.runtime.config_utils import get_scalar_param, DeepSpeedConfigObject -from deepspeed.nebula.constants import * +from pydantic import Field +from deepspeed.runtime.config_utils import DeepSpeedConfigModel +NEBULA = "nebula" -class DeepSpeedNebulaConfig(DeepSpeedConfigObject): - def __init__(self, param_dict): - super(DeepSpeedNebulaConfig, self).__init__() - self.enabled = None - self.persistent_storage_path = None - self.persistent_time_interval = None - self.num_of_version_in_retention = None - self.enable_nebula_load = None +def get_nebula_config(param_dict): + nebula_config_dict = param_dict.get(NEBULA, {}) + return DeepSpeedNebulaConfig(**nebula_config_dict) - if NEBULA in param_dict.keys(): - nebula_dict = param_dict[NEBULA] - else: - nebula_dict = {} - self._initialize(nebula_dict) - - def _initialize(self, nebula_dict): - self.enabled = get_scalar_param(nebula_dict, - NEBULA_ENABLED, - NEBULA_ENABLED_DEFAULT) - - self.load_path = get_scalar_param(nebula_dict, - NEBULA_LOAD_PATH, - NEBULA_LOAD_PATH_DEFAULT) - - self.enable_nebula_load = get_scalar_param(nebula_dict, - NEBULA_ENABLE_NEBULA_LOAD, - NEBULA_ENABLE_NEBULA_LOAD_DEFAULT) - - self.persistent_storage_path = get_scalar_param( - nebula_dict, - NEBULA_PERSISTENT_STORAGE_PATH, - NEBULA_PERSISTENT_STORAGE_PATH_DEFAULT) - - self.persistent_time_interval = get_scalar_param( - nebula_dict, - NEBULA_PERSISTENT_TIME_INTERVAL, - NEBULA_PERSISTENT_TIME_INTERVAL_DEFAULT) - - self.num_of_version_in_retention = get_scalar_param( - nebula_dict, - NEBULA_NUM_OF_VERSION_IN_RETENTION, - NEBULA_NUM_OF_VERSION_IN_RETENTION_DEFAULT) +class DeepSpeedNebulaConfig(DeepSpeedConfigModel): + enabled: bool = False + load_path: str = None + enable_nebula_load: bool = True + persistent_storage_path: str = None # Should this be Path dtype? + persistent_time_interval: int = Field(100, gt=0) + num_of_version_in_retention: int = Field(2, ge=0) diff --git a/deepspeed/profiling/config.py b/deepspeed/profiling/config.py index 0671211132c6..68d60ea13c6b 100644 --- a/deepspeed/profiling/config.py +++ b/deepspeed/profiling/config.py @@ -3,47 +3,21 @@ Licensed under the MIT license. 
""" -from deepspeed.runtime.config_utils import get_scalar_param, DeepSpeedConfigObject -from deepspeed.profiling.constants import * +from pydantic import Field +from deepspeed.runtime.config_utils import DeepSpeedConfigModel +FLOPS_PROFILER = "flops_profiler" -class DeepSpeedFlopsProfilerConfig(DeepSpeedConfigObject): - def __init__(self, param_dict): - super(DeepSpeedFlopsProfilerConfig, self).__init__() - self.enabled = None - self.profile_step = None - self.module_depth = None - self.top_modules = None +def get_flops_profiler_config(param_dict): + flops_profiler_config_dict = param_dict.get(FLOPS_PROFILER, {}) + return DeepSpeedFlopsProfilerConfig(**flops_profiler_config_dict) - if FLOPS_PROFILER in param_dict.keys(): - flops_profiler_dict = param_dict[FLOPS_PROFILER] - else: - flops_profiler_dict = {} - self._initialize(flops_profiler_dict) - - def _initialize(self, flops_profiler_dict): - self.enabled = get_scalar_param(flops_profiler_dict, - FLOPS_PROFILER_ENABLED, - FLOPS_PROFILER_ENABLED_DEFAULT) - - self.profile_step = get_scalar_param(flops_profiler_dict, - FLOPS_PROFILER_PROFILE_STEP, - FLOPS_PROFILER_PROFILE_STEP_DEFAULT) - - self.module_depth = get_scalar_param(flops_profiler_dict, - FLOPS_PROFILER_MODULE_DEPTH, - FLOPS_PROFILER_MODULE_DEPTH_DEFAULT) - - self.top_modules = get_scalar_param(flops_profiler_dict, - FLOPS_PROFILER_TOP_MODULES, - FLOPS_PROFILER_TOP_MODULES_DEFAULT) - - self.detailed = get_scalar_param(flops_profiler_dict, - FLOPS_PROFILER_DETAILED, - FLOPS_PROFILER_DETAILED_DEFAULT) - - self.output_file = get_scalar_param(flops_profiler_dict, - FLOPS_PROFILER_OUTPUT_FILE, - FLOPS_PROFILER_OUTPUT_FILE_DEFAULT) +class DeepSpeedFlopsProfilerConfig(DeepSpeedConfigModel): + enabled: bool = False + profile_step: int = Field(1, ge=1) + module_depth: int = -1 + top_modules: int = 1 + detailed: bool = True + output_file: str = None # Should this be Path dtype? 
diff --git a/deepspeed/runtime/config.py b/deepspeed/runtime/config.py index bc1311a884a1..4d3150a6d7a6 100755 --- a/deepspeed/runtime/config.py +++ b/deepspeed/runtime/config.py @@ -47,9 +47,9 @@ NUM_GPUS_PER_NODE_DEFAULT, ) -from ..profiling.config import DeepSpeedFlopsProfilerConfig -from ..autotuning.config import DeepSpeedAutotuningConfig -from ..nebula.config import DeepSpeedNebulaConfig +from ..profiling.config import get_flops_profiler_config +from ..autotuning.config import get_autotuning_config +from ..nebula.config import get_nebula_config from ..compression.config import get_compression_config from ..compression.constants import * @@ -853,11 +853,11 @@ def _initialize_params(self, param_dict): self.scheduler_name = get_scheduler_name(param_dict) self.scheduler_params = get_scheduler_params(param_dict) - self.flops_profiler_config = DeepSpeedFlopsProfilerConfig(param_dict) + self.flops_profiler_config = get_flops_profiler_config(param_dict) self.wall_clock_breakdown = (get_wall_clock_breakdown(param_dict) | self.flops_profiler_config.enabled) self.memory_breakdown = get_memory_breakdown(param_dict) - self.autotuning_config = DeepSpeedAutotuningConfig(param_dict) + self.autotuning_config = get_autotuning_config(param_dict) ( self.eigenvalue_enabled, @@ -893,7 +893,7 @@ def _initialize_params(self, param_dict): self.dataloader_drop_last = get_dataloader_drop_last(param_dict) - self.nebula_config = DeepSpeedNebulaConfig(param_dict) + self.nebula_config = get_nebula_config(param_dict) def _batch_assertion(self): diff --git a/tests/unit/monitor/test_monitor.py b/tests/unit/monitor/test_monitor.py index 674a8d7ce841..2109a4785c68 100644 --- a/tests/unit/monitor/test_monitor.py +++ b/tests/unit/monitor/test_monitor.py @@ -21,7 +21,7 @@ def test_tensorboard(self): } } ds_config = DeepSpeedConfig(config_dict) - tb_monitor = TensorBoardMonitor(ds_config.monitor_config) + tb_monitor = TensorBoardMonitor(ds_config.monitor_config.tensorboard) assert tb_monitor.enabled == True assert tb_monitor.output_path == "test_output/ds_logs/" assert tb_monitor.job_name == "test" @@ -29,7 +29,7 @@ def test_tensorboard(self): def test_empty_tensorboard(self): config_dict = {"train_batch_size": 2, "tensorboard": {}} ds_config = DeepSpeedConfig(config_dict) - tb_monitor = TensorBoardMonitor(ds_config.monitor_config) + tb_monitor = TensorBoardMonitor(ds_config.monitor_config.tensorboard) assert tb_monitor.enabled == TENSORBOARD_ENABLED_DEFAULT assert tb_monitor.output_path == TENSORBOARD_OUTPUT_PATH_DEFAULT assert tb_monitor.job_name == TENSORBOARD_JOB_NAME_DEFAULT @@ -49,7 +49,7 @@ def test_wandb(self): } } ds_config = DeepSpeedConfig(config_dict) - wandb_monitor = WandbMonitor(ds_config.monitor_config) + wandb_monitor = WandbMonitor(ds_config.monitor_config.wandb) assert wandb_monitor.enabled == False assert wandb_monitor.group == "my_group" assert wandb_monitor.team == "my_team" @@ -58,7 +58,7 @@ def test_wandb(self): def test_empty_wandb(self): config_dict = {"train_batch_size": 2, "wandb": {}} ds_config = DeepSpeedConfig(config_dict) - wandb_monitor = WandbMonitor(ds_config.monitor_config) + wandb_monitor = WandbMonitor(ds_config.monitor_config.wandb) assert wandb_monitor.enabled == WANDB_ENABLED_DEFAULT assert wandb_monitor.group == WANDB_GROUP_NAME_DEFAULT assert wandb_monitor.team == WANDB_TEAM_NAME_DEFAULT @@ -78,7 +78,7 @@ def test_csv_monitor(self): } } ds_config = DeepSpeedConfig(config_dict) - csv_monitor = csvMonitor(ds_config.monitor_config) + csv_monitor = 
csvMonitor(ds_config.monitor_config.csv_monitor) assert csv_monitor.enabled == True assert csv_monitor.output_path == "test_output/ds_logs/" assert csv_monitor.job_name == "test" @@ -86,7 +86,7 @@ def test_csv_monitor(self): def test_empty_csv_monitor(self): config_dict = {"train_batch_size": 2, "csv_monitor": {}} ds_config = DeepSpeedConfig(config_dict) - csv_monitor = csvMonitor(ds_config.monitor_config) + csv_monitor = csvMonitor(ds_config.monitor_config.csv_monitor) assert csv_monitor.enabled == CSV_MONITOR_ENABLED_DEFAULT assert csv_monitor.output_path == CSV_MONITOR_OUTPUT_PATH_DEFAULT assert csv_monitor.job_name == CSV_MONITOR_JOB_NAME_DEFAULT From 7466c8dcced744865997b85088dd6f8721227c77 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Thu, 11 Aug 2022 16:20:33 -0700 Subject: [PATCH 05/13] fix for broken tests --- deepspeed/autotuning/config.py | 2 +- deepspeed/monitor/monitor.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/deepspeed/autotuning/config.py b/deepspeed/autotuning/config.py index d43d99e8d502..d33028eb2eb4 100644 --- a/deepspeed/autotuning/config.py +++ b/deepspeed/autotuning/config.py @@ -50,7 +50,7 @@ class DeepSpeedAutotuningConfig(DeepSpeedConfigModel): tuner_type: TunerTypeEnum = TunerTypeEnum.gridsearch tuner_early_stopping: int = Field(5, ge=0) tuner_num_trials: int = Field(50, gt=0) - arg_mapping: str = None + arg_mappings: str = None model_info: ModelInfoConfig = ModelInfoConfig() model_info_path: str = None mp_size: int = Field(1, gt=0) diff --git a/deepspeed/monitor/monitor.py b/deepspeed/monitor/monitor.py index 6cc1731a8f1b..35f368768fd0 100644 --- a/deepspeed/monitor/monitor.py +++ b/deepspeed/monitor/monitor.py @@ -30,11 +30,11 @@ def __init__(self, monitor_config): self.enabled = monitor_config.tensorboard.enabled or monitor_config.csv_monitor.enabled or monitor_config.wandb.enabled if dist.get_rank() == 0: - if monitor_config.tensorboard_enabled: + if monitor_config.tensorboard.enabled: self.tb_monitor = TensorBoardMonitor(monitor_config.tensorboard) - if monitor_config.wandb_enabled: + if monitor_config.wandb.enabled: self.wandb_monitor = WandbMonitor(monitor_config.wandb) - if monitor_config.csv_monitor_enabled: + if monitor_config.csv_monitor.enabled: self.csv_monitor = csvMonitor(monitor_config.csv_monitor) def write_events(self, event_list): From 5c8ee6098042ec8878d56169e925f016528dcca3 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Thu, 11 Aug 2022 16:31:25 -0700 Subject: [PATCH 06/13] fix compression schedule to work with configs --- deepspeed/compression/scheduler.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/deepspeed/compression/scheduler.py b/deepspeed/compression/scheduler.py index 02c8fb904af8..e4f951e2cc59 100644 --- a/deepspeed/compression/scheduler.py +++ b/deepspeed/compression/scheduler.py @@ -26,7 +26,7 @@ def __init__(self, model, compression_config): def make_init(self): self.different_compression_methods = {} - for method, method_content in self.compression_config.items(): + for method, method_content in self.compression_config: if LAYER_REDUCTION in method: continue self.different_compression_methods[method] = { @@ -35,22 +35,22 @@ def make_init(self): DIFFERENT_GROUPS: [] } exist_module_name = set() - shared_parameters = method_content[SHARED_PARAMETERS] + shared_parameters = method_content.shared_parameters self.different_compression_methods[method][ - TECHNIQUE_ENABLED] = shared_parameters[TECHNIQUE_ENABLED] + TECHNIQUE_ENABLED] = shared_parameters.enabled 
self.different_compression_methods[method][ SHARED_PARAMETERS] = shared_parameters - for group_name, method_parameters in method_content[DIFFERENT_GROUPS].items(): + for group_name, method_parameters in method_content.different_groups.items(): module_name_list = [] - for key_word in method_parameters[DIFFERENT_GROUPS_MODULE_SCOPE]: + for key_word in method_parameters.modules: module_name, exist_module_name = get_module_name(group_name, self.model, key_word, exist_module_name, verbose=False) module_name_list.extend(module_name) if module_name_list: self.different_compression_methods[method][DIFFERENT_GROUPS].append([ group_name, module_name_list, - method_parameters.copy().pop('params') + method_parameters.dict().pop('params') ]) def check_weight_quantization(self): From c8f5c9f486e2c2415d5e59ca0f431a0bd92182e5 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Fri, 12 Aug 2022 13:06:12 -0700 Subject: [PATCH 07/13] fix for compression config usage --- deepspeed/runtime/engine.py | 47 ++++++++++++++----------------------- 1 file changed, 18 insertions(+), 29 deletions(-) diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index 13d58ff11438..2aac004022aa 100644 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -39,17 +39,6 @@ PLD_THETA, PLD_GAMMA, BFLOAT16, FP16 from deepspeed.runtime.zero.config import ZeroStageEnum from deepspeed.compression import compression_scheduler -from deepspeed.compression.constants import \ - WEIGHT_QUANTIZE_IN_FORWARD_ENABLED, \ - WEIGHT_QUANTIZATION, SHARED_PARAMETERS, \ - WEIGHT_QUANTIZE_ENABLED, \ - WEIGHT_QUANTIZE_GROUPS, \ - WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE, \ - WEIGHT_QUANTIZE_CHANGE_RATIO, \ - WEIGHT_QUANTIZE_TYPE, \ - WEIGHT_QUANTIZE_ROUNDING, \ - WEIGHT_QUANTIZE_VERBOSE, \ - WEIGHT_QUANTIZE_KERNEL from deepspeed.checkpoint.constants import OPTIMIZER_STATE_DICT from deepspeed.runtime.sparse_tensor import SparseTensor @@ -607,24 +596,24 @@ def scheduler_params(self): def quantize_training(self): return ( - self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS] - [WEIGHT_QUANTIZE_IN_FORWARD_ENABLED], - self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS] - [WEIGHT_QUANTIZE_ENABLED], - self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS] - [WEIGHT_QUANTIZE_GROUPS], - self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS] - [WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE], - self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS] - [WEIGHT_QUANTIZE_CHANGE_RATIO], - self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS] - [WEIGHT_QUANTIZE_TYPE], - self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS] - [WEIGHT_QUANTIZE_ROUNDING], - self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS] - [WEIGHT_QUANTIZE_VERBOSE], - self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS] - [WEIGHT_QUANTIZE_KERNEL], + self._config.compression_config.weight_quantization.shared_parameters. + quantize_weight_in_forward, + self._config.compression_config.weight_quantization.shared_parameters. + enabled, + self._config.compression_config.weight_quantization.shared_parameters. + quantize_groups, + self._config.compression_config.weight_quantization.shared_parameters. + fp16_mixed_quantize.enabled, + self._config.compression_config.weight_quantization.shared_parameters. + fp16_mixed_quantize.quantize_change_ratio, + self._config.compression_config.weight_quantization.shared_parameters. 
+ quantization_type, + self._config.compression_config.weight_quantization.shared_parameters. + rounding, + self._config.compression_config.weight_quantization.shared_parameters. + quantize_verbose, + self._config.compression_config.weight_quantization.shared_parameters. + quantizer_kernel, ) def zero_optimization(self): From f72e98f3784ab4ba252016272b5b1f955c919eca Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Fri, 12 Aug 2022 16:39:36 -0700 Subject: [PATCH 08/13] fix broken test --- tests/unit/profiling/flops_profiler/test_flops_profiler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/profiling/flops_profiler/test_flops_profiler.py b/tests/unit/profiling/flops_profiler/test_flops_profiler.py index 734e2996fa80..ecd23f106938 100644 --- a/tests/unit/profiling/flops_profiler/test_flops_profiler.py +++ b/tests/unit/profiling/flops_profiler/test_flops_profiler.py @@ -40,7 +40,7 @@ def test(self): }, "flops_profiler": { "enabled": True, - "step": 1, + "profile_step": 1, "module_depth": -1, "top_modules": 3, }, From 3aebde21337a9060a5825aa2221b718cc12c2008 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Mon, 15 Aug 2022 11:45:44 -0700 Subject: [PATCH 09/13] removed autotuner constants --- deepspeed/autotuning/autotuner.py | 12 +- deepspeed/autotuning/constants.py | 211 ------------------ deepspeed/autotuning/scheduler.py | 9 +- deepspeed/autotuning/tuner/base_tuner.py | 4 +- .../autotuning/tuner/model_based_tuner.py | 9 +- 5 files changed, 15 insertions(+), 230 deletions(-) delete mode 100644 deepspeed/autotuning/constants.py diff --git a/deepspeed/autotuning/autotuner.py b/deepspeed/autotuning/autotuner.py index b8a67075b55e..fae621f82f83 100755 --- a/deepspeed/autotuning/autotuner.py +++ b/deepspeed/autotuning/autotuner.py @@ -11,8 +11,7 @@ from ..runtime.zero.config import DeepSpeedZeroConfig, ZERO_OPTIMIZATION, ZeroStageEnum from ..utils import logger -from .config import DeepSpeedAutotuningConfig -from .constants import * +from .config import AUTOTUNING, DeepSpeedAutotuningConfig, TunerTypeEnum from .scheduler import ResourceManager from .tuner import GridSearchTuner, RandomTuner, ModelBasedTuner from .utils import * @@ -598,9 +597,9 @@ def tune_space(self, exps = self._generate_experiments(tuning_space, max_train_batch_size_per_gpu) logger.info(f'Tuner type is {self.autotuning_config.tuner_type}') - if self.autotuning_config.tuner_type == AUTOTUNING_TUNER_MODELBASED: + if self.autotuning_config.tuner_type == TunerTypeEnum.model_based: t = ModelBasedTuner(exps, self.rm, self.metric(), tuning_space) - elif self.autotuning_config.tuner_type == AUTOTUNING_TUNER_RANDOM: + elif self.autotuning_config.tuner_type == TunerTypeEnum.random: t = RandomTuner(exps, self.rm, self.metric()) else: t = GridSearchTuner(exps, self.rm, self.metric()) @@ -672,13 +671,14 @@ def model_info_profile_run(self): model_info_path = os.path.join(self.results_dir, "profile_model_info", "model_info.json") - ds_config[AUTOTUNING] = { + at_config_dict = { "enabled": True, "model_info_path": model_info_path, "model_info": { "profile": True } } + ds_config[AUTOTUNING] = DeepSpeedAutotuningConfig(**at_config_dict) exp_config = {} exp_name = "profile_model_info" @@ -781,7 +781,7 @@ def run_tuning_micro_batch_sizes(self, self.rm.run() for exp_id, (exp, err) in self.rm.finished_experiments.items(): if exp: - metric_file = exp[DS_CONFIG][AUTOTUNING][AUTOTUNING_METRIC_PATH] + metric_file = exp[DS_CONFIG][AUTOTUNING].metric_path if os.path.exists(metric_file): with open(metric_file, 'r') as 
f: diff --git a/deepspeed/autotuning/constants.py b/deepspeed/autotuning/constants.py deleted file mode 100644 index 3bfcd2725f90..000000000000 --- a/deepspeed/autotuning/constants.py +++ /dev/null @@ -1,211 +0,0 @@ -""" -Copyright (c) Microsoft Corporation -Licensed under the MIT license. -""" - -######################################### -# autotunner implementation constants -######################################### - -import os - -DEFAULT_TEMPLATE_PATH_ZERO_0 = os.path.join(os.path.dirname(os.path.realpath(__file__)), - "config_templates", - "template_zero0.json") -DEFAULT_TEMPLATE_PATH_ZERO_1 = os.path.join(os.path.dirname(os.path.realpath(__file__)), - "config_templates", - "template_zero1.json") -DEFAULT_TEMPLATE_PATH_ZERO_2 = os.path.join(os.path.dirname(os.path.realpath(__file__)), - "config_templates", - "template_zero2.json") -DEFAULT_TEMPLATE_PATH_ZERO_3 = os.path.join(os.path.dirname(os.path.realpath(__file__)), - "config_templates", - "template_zero3.json") - -DEFAULT_EXPRS_DIR = os.path.join(os.getcwd(), "autotuning_exps") -DEFAULT_RESULTS_DIR = os.path.join(os.getcwd(), "autotuning_results") - -METRIC_PERCENT_DIFF_CONST = 0.05 -DS_CONFIG = "ds_config" -BUFSIZE = 1 # line buffer size for writing files - -######################################### -# autotuner configuration constants -######################################### -# Autotuner. By default, this feature is not enabled. -# Users can configure in ds_config.json as below example: -AUTOTUNING_FORMAT = """ -autotuner should be enabled as: -"session_params": { - "autotuning": { - "enabled": true, - "start_step": 5, - "end_step": 15 - } -} -""" - -AUTOTUNING = "autotuning" - -AUTOTUNING_ENABLED = "enabled" -AUTOTUNING_ENABLED_DEFAULT = False - -AUTOTUNING_FAST = "fast" -AUTOTUNING_FAST_DEFAULT = True - -AUTOTUNING_RESULTS_DIR = "results_dir" -AUTOTUNING_RESULTS_DIR_DEFAULT = None - -AUTOTUNING_EXPS_DIR = "exps_dir" -AUTOTUNING_EXPS_DIR_DEFAULT = None - -AUTOTUNING_OVERWRITE = "overwrite" -AUTOTUNING_OVERWRITE_DEFAULT = True - -AUTOTUNING_START_PROFILE_STEP = "start_profile_step" -AUTOTUNING_START_PROFILE_STEP_DEFAULT = 3 - -AUTOTUNING_END_PROFILE_STEP = "end_profile_step" -AUTOTUNING_END_PROFILE_STEP_DEFAULT = 5 -AUTOTUNING_METRIC_PATH = "metric_path" -AUTOTUNING_METRIC_PATH_DEFAULT = None - -AUTOTUNING_TUNER_TYPE = "tuner_type" -AUTOTUNING_TUNER_GRIDSEARCH = "gridsearch" -AUTOTUNING_TUNER_RANDOM = "random" -AUTOTUNING_TUNER_MODELBASED = "model_based" -AUTOTUNING_TUNER_TYPE_DEFAULT = AUTOTUNING_TUNER_GRIDSEARCH -AUTOTUNING_TUNER_EARLY_STOPPING = "tuner_early_stopping" -AUTOTUNING_TUNER_EARLY_STOPPING_DEFAULT = 5 -AUTOTUNING_TUNER_NUM_TRIALS = "tuner_num_trials" -AUTOTUNING_TUNER_NUM_TRIALS_DEFAULT = 50 - -AUTOTUNING_ARG_MAPPINGS = "arg_mappings" -AUTOTUNING_ARG_MAPPINGS_DEFAULT = None - -AUTOTUNING_MAX_TRAIN_BATCH_SIZE = "max_train_batch_size" -AUTOTUNING_MAX_TRAIN_BATCH_SIZE_DEFAULT = None -AUTOTUNING_MIN_TRAIN_BATCH_SIZE = "min_train_batch_size" -AUTOTUNING_MIN_TRAIN_BATCH_SIZE_DEFAULT = 1 -AUTOTUNING_MAX_TRAIN_MICRO_BATCH_SIZE_PER_GPU = "max_train_micro_batch_size_per_gpu" -AUTOTUNING_MAX_TRAIN_MICRO_BATCH_SIZE_PER_GPU_DEFAULT = 1024 -AUTOTUNING_MIN_TRAIN_MICRO_BATCH_SIZE_PER_GPU = "min_train_micro_batch_size_per_gpu" -AUTOTUNING_MIN_TRAIN_MICRO_BATCH_SIZE_PER_GPU_DEFAULT = 1 -AUTOTUNING_NUM_TUNING_MICRO_BATCH_SIZES = "num_tuning_micro_batch_sizes" -AUTOTUNING_NUM_TUNING_MICRO_BATCH_SIZES_DEFAULT = 3 - -AUTOTUNING_MP_SIZE = "mp_size" -AUTOTUNING_MP_SIZE_DEFAULT = 1 - -AUTOTUNING_METRIC = "metric" 
-AUTOTUNING_METRIC_LATENCY = "latency" -AUTOTUNING_METRIC_THROUGHPUT = "throughput" -AUTOTUNING_METRIC_FLOPS = "flops" -AUTOTUNING_METRIC_FORWARD = "forward" -AUTOTUNING_METRIC_BACKWRAD = "flops" -AUTOTUNING_METRIC_STEPS = "step" -AUTOTUNING_METRIC_DEFAULT = AUTOTUNING_METRIC_THROUGHPUT - -######################################### -# MODEL INFO -######################################### -AUTOTUNING_MODEL_INFO_PATH = "model_info_path" -AUTOTUNING_MODEL_INFO_PATH_DEFAULT = None - -MODEL_INFO_FORMAT = ''' -"model_info": { - "num_params": 1000000000, - "hidden_size": 10, - "num_layers": 12, -} -''' -MODEL_INFO = "model_info" -MODEL_INFO_PROFILE = "profile" -MODEL_INFO_PROFILE_DEFAULT = False -MODEL_INFO_NUM_PARAMS = "num_params" -MODEL_INFO_NUM_PARAMS_DEFAULT = None -MODEL_INFO_HIDDEN_SIZE = "hideen_size" -MODEL_INFO_HIDDEN_SIZE_DEFAULT = None -MODEL_INFO_NUM_LAYERS = "num_layers" -MODEL_INFO_NUM_LAYERS_DEFAULT = None - -MODEL_INFO_KEY_DEFAULT_DICT = { - MODEL_INFO_PROFILE: MODEL_INFO_PROFILE_DEFAULT, - MODEL_INFO_NUM_PARAMS: MODEL_INFO_NUM_PARAMS_DEFAULT, - MODEL_INFO_HIDDEN_SIZE: MODEL_INFO_HIDDEN_SIZE_DEFAULT, - MODEL_INFO_NUM_LAYERS: MODEL_INFO_NUM_LAYERS_DEFAULT -} - -######################################### -# autotunner search space constants -######################################### - -DEFAULT_HF_CONFIG = { - "train_batch_size": "auto", - "train_micro_batch_size_per_gpu": "auto", - "gradient_accumulation_steps": "auto", -} - -DEFAULT_MIN_MEM_CONFIG = { - "train_micro_batch_size_per_gpu": 1, - "zero_optimization": { - "stage": 3 - }, - "memory_break_down": False -} - -DEFAULT_TUNING_SPACE_ZERO_0 = {"zero_optimization": {"stage": 0}} - -DEFAULT_TUNING_SPACE_ZERO_1 = { - "zero_optimization": { - "stage": 1, - "reduce_bucket_size": [5e7, - 5e8, - 1e9], - "allgather_bucket_size": [5e7, - 5e8, - 1e9], - } -} - -DEFAULT_TUNING_SPACE_ZERO_2 = { - "zero_optimization": { - "stage": 2, - "overlap_comm": [True, - False], - "reduce_scatter": [False, - True], - "reduce_bucket_size": [5e7, - 5e8, - 1e9], - "allgather_bucket_size": [5e7, - 5e8, - 1e9], - "contiguous_gradients": [False, - True] - }, -} - -DEFAULT_TUNING_SPACE_ZERO_3 = { - "zero_optimization": { - "stage": 3, - "overlap_comm": [True, - False], - "reduce_scatter": [False, - True], - "reduce_bucket_size": [5e7, - 5e8, - 1e9], - "allgather_partitions": [True, - False], - "allgather_bucket_size": [5e7, - 5e8, - 1e9], - "contiguous_gradients": [False, - True] - }, -} - -GLOBAL_TUNING_SPACE = 'global' -# TUNING_MICRO_BATCH_SIZE_PREFIX="tune_micro_batch_size_z" -TUNING_MICRO_BATCH_SIZE_PREFIX = "z" diff --git a/deepspeed/autotuning/scheduler.py b/deepspeed/autotuning/scheduler.py index 4f91f3cc98df..5eb428cc61fa 100755 --- a/deepspeed/autotuning/scheduler.py +++ b/deepspeed/autotuning/scheduler.py @@ -3,6 +3,7 @@ from numpy import BUFSIZE import json import subprocess +import os import sys import threading import time @@ -11,8 +12,7 @@ from tqdm import tqdm from ..utils import logger -from .constants import * -from .constants import AUTOTUNING, AUTOTUNING_METRIC_PATH +from .config import AUTOTUNING from .utils import get_val_by_key, search_error, was_interruptted """ thread-0: loop over experiment queue dispatching experiments if they become available @@ -71,8 +71,7 @@ def schedule_experiments(self, exp_paths): exp['name']) if AUTOTUNING in exp["ds_config"]: metric_file = os.path.join(result_dir, "metrics.json") - exp["ds_config"][AUTOTUNING][ - AUTOTUNING_METRIC_PATH] = metric_file + exp["ds_config"][AUTOTUNING].metric_path = metric_file 
stderr_file = os.path.join(result_dir, "stderr.log") model_info_file = os.path.join(result_dir, "model_info.json") metric_file = os.path.join(result_dir, "metrics.json") @@ -237,7 +236,7 @@ def parse_results(self, metric): ) continue - metric_file = exp["ds_config"][AUTOTUNING][AUTOTUNING_METRIC_PATH] + metric_file = exp["ds_config"][AUTOTUNING].metric_path if os.path.exists(metric_file): with open(metric_file, 'r') as f: diff --git a/deepspeed/autotuning/tuner/base_tuner.py b/deepspeed/autotuning/tuner/base_tuner.py index fe00e27457e7..75910361e79e 100755 --- a/deepspeed/autotuning/tuner/base_tuner.py +++ b/deepspeed/autotuning/tuner/base_tuner.py @@ -1,6 +1,6 @@ import sys -from deepspeed.autotuning.constants import * +from deepspeed.autotuning.config import DeepSpeedAutotuningConfig from deepspeed.autotuning.utils import write_experiments from deepspeed.utils import logger @@ -12,7 +12,7 @@ def __init__(self, exps, resource_manager, metric): self.best_iter = 0 self.best_exp = None self.best_metric_val = None - self.metric = metric if metric else AUTOTUNING_METRIC_DEFAULT + self.metric = metric if metric else DeepSpeedAutotuningConfig().metric logger.info(f"total number of exps = {len(self.all_exps)}") def has_next(self): diff --git a/deepspeed/autotuning/tuner/model_based_tuner.py b/deepspeed/autotuning/tuner/model_based_tuner.py index c3e0b1ac3fd1..2574d7231e3f 100755 --- a/deepspeed/autotuning/tuner/model_based_tuner.py +++ b/deepspeed/autotuning/tuner/model_based_tuner.py @@ -1,12 +1,11 @@ import hjson -from ..constants import AUTOTUNING, AUTOTUNING_METRIC_PATH +from ..config import AUTOTUNING, MetricEnum from .base_tuner import BaseTuner from .cost_model import XGBoostCostModel from .utils import * from ..utils import * import numbers -from ..constants import AUTOTUNING_METRIC_LATENCY INIT_NUM = 2 @@ -69,9 +68,7 @@ def find_estimated_top_configs(self): n = len(estimates) top_idx = np.argsort(estimates) - top_idx_ret = top_idx if self.metric == AUTOTUNING_METRIC_LATENCY else top_idx[:: - -1][: - n] + top_idx_ret = top_idx if self.metric == MetricEnum.latency else top_idx[::-1][:n] # top_configs = [self.all_configs[i] for i in top_idx] @@ -129,7 +126,7 @@ def update(self): self.evaluated_perf.append(0.0) continue - p = exp["ds_config"][AUTOTUNING][AUTOTUNING_METRIC_PATH] + p = exp["ds_config"][AUTOTUNING].metric_path with open(p, 'r') as f: results = hjson.load(f) curr_iter = results[self.metric] From f671bcaf254546f75e9cc5d30e212d6e4f35bc99 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Mon, 15 Aug 2022 11:54:56 -0700 Subject: [PATCH 10/13] removed comms constants --- deepspeed/comm/constants.py | 44 -------------------------------- deepspeed/utils/comms_logging.py | 13 +++++----- 2 files changed, 7 insertions(+), 50 deletions(-) delete mode 100644 deepspeed/comm/constants.py diff --git a/deepspeed/comm/constants.py b/deepspeed/comm/constants.py deleted file mode 100644 index b3a526a5afbc..000000000000 --- a/deepspeed/comm/constants.py +++ /dev/null @@ -1,44 +0,0 @@ -NCCL_BACKEND = 'nccl' -MPI_BACKEND = 'mpi' -GLOO_BACKEND = 'gloo' -SCCL_BACKEND = 'sccl' - -DEFAULT_AML_MASTER_PORT = "54965" -DEFAULT_AML_NCCL_SOCKET_IFNAME = "^docker0,lo" - -######################################### -# Comms Logger -######################################### -# Comms Logger. By default, this feature is not enabled. 
-# Users can configure in ds_config.json as below example: -COMMS_LOGGER_FORMAT = ''' -The Comms Logger can be specified as: -"comms_logger": { - "enabled": true, - "verbose": false, - "prof_all": true, - "debug": false, - "prof_ops": ["all_reduce", "custom_all_reduce_name"] -} -''' -COMMS_LOGGER = "comms_logger" - -# Comms logger enable signal -COMMS_LOGGER_ENABLED = "enabled" -COMMS_LOGGER_ENABLED_DEFAULT = False - -# Comms logger verbose signal -COMMS_LOGGER_VERBOSE = "verbose" -COMMS_LOGGER_VERBOSE_DEFAULT = False - -# comms logger profile all ops signal -COMMS_LOGGER_PROF_ALL = "prof_all" -COMMS_LOGGER_PROF_ALL_DEFAULT = True - -# comms logger show all ops signal -COMMS_LOGGER_DEBUG = "debug" -COMMS_LOGGER_DEBUG_DEFAULT = False - -# comms logger profile specific ops in list -COMMS_LOGGER_PROF_OPS = "prof_ops" -COMMS_LOGGER_PROF_OPS_DEFAULT = [] diff --git a/deepspeed/utils/comms_logging.py b/deepspeed/utils/comms_logging.py index 34f8e2e2e1dc..dcac8be16b37 100644 --- a/deepspeed/utils/comms_logging.py +++ b/deepspeed/utils/comms_logging.py @@ -1,5 +1,6 @@ import math from deepspeed.utils import log_dist +from deepspeed.comm.config import DeepSpeedCommsConfig def get_caller_func(frame=3): @@ -55,13 +56,13 @@ def calc_bw_log(comm_op, size, duration): class CommsLogger: def __init__(self): - from deepspeed.comm.constants import COMMS_LOGGER_VERBOSE_DEFAULT, COMMS_LOGGER_DEBUG_DEFAULT, COMMS_LOGGER_PROF_OPS_DEFAULT, COMMS_LOGGER_PROF_ALL_DEFAULT, COMMS_LOGGER_ENABLED_DEFAULT self.comms_dict = {} - self.verbose = COMMS_LOGGER_VERBOSE_DEFAULT - self.debug = COMMS_LOGGER_DEBUG_DEFAULT - self.prof_ops = COMMS_LOGGER_PROF_OPS_DEFAULT - self.prof_all = COMMS_LOGGER_PROF_ALL_DEFAULT - self.enabled = COMMS_LOGGER_ENABLED_DEFAULT + self.comms_config = DeepSpeedCommsConfig() + self.verbose = self.comms_config.verbose + self.debug = self.comms_config.debug + self.prof_ops = self.comms_config.prof_ops + self.prof_all = self.comms_config.prof_all + self.enabled = self.comms_config.enabled def configure(self, comms_config): self.enabled = comms_config.comms_logger_enabled From 8d25c5546395e0b22f5b3f8067a4dee3b06f8f6c Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Fri, 9 Sep 2022 16:33:39 -0700 Subject: [PATCH 11/13] avoid circular import --- deepspeed/utils/comms_logging.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/deepspeed/utils/comms_logging.py b/deepspeed/utils/comms_logging.py index dcac8be16b37..8c6da4aed384 100644 --- a/deepspeed/utils/comms_logging.py +++ b/deepspeed/utils/comms_logging.py @@ -1,6 +1,5 @@ import math from deepspeed.utils import log_dist -from deepspeed.comm.config import DeepSpeedCommsConfig def get_caller_func(frame=3): @@ -56,6 +55,8 @@ def calc_bw_log(comm_op, size, duration): class CommsLogger: def __init__(self): + from deepspeed.comm.config import DeepSpeedCommsConfig + self.comms_dict = {} self.comms_config = DeepSpeedCommsConfig() self.verbose = self.comms_config.verbose From 0aaf80cf55401168e4c2463f03cb3f96ed432a26 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Fri, 9 Sep 2022 17:00:20 -0700 Subject: [PATCH 12/13] remove comm.constants --- deepspeed/comm/comm.py | 1 - 1 file changed, 1 deletion(-) diff --git a/deepspeed/comm/comm.py b/deepspeed/comm/comm.py index b098e3d29d01..43ad826b486a 100644 --- a/deepspeed/comm/comm.py +++ b/deepspeed/comm/comm.py @@ -30,7 +30,6 @@ import os from ..constants import TORCH_DISTRIBUTED_DEFAULT_PORT, default_pg_timeout -from .constants import * class ReduceOp(Enum): From 
a8cbc37ef86c89cea742ac61d9c16838ace97023 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Mon, 12 Sep 2022 09:51:11 -0700 Subject: [PATCH 13/13] added back some necessary autotuner constants --- deepspeed/autotuning/autotuner.py | 1 + deepspeed/autotuning/constants.py | 133 ++++++++++++++++++++++++++++++ 2 files changed, 134 insertions(+) create mode 100644 deepspeed/autotuning/constants.py diff --git a/deepspeed/autotuning/autotuner.py b/deepspeed/autotuning/autotuner.py index fae621f82f83..25d0cfd5f1a9 100755 --- a/deepspeed/autotuning/autotuner.py +++ b/deepspeed/autotuning/autotuner.py @@ -12,6 +12,7 @@ from ..runtime.zero.config import DeepSpeedZeroConfig, ZERO_OPTIMIZATION, ZeroStageEnum from ..utils import logger from .config import AUTOTUNING, DeepSpeedAutotuningConfig, TunerTypeEnum +from .constants import * from .scheduler import ResourceManager from .tuner import GridSearchTuner, RandomTuner, ModelBasedTuner from .utils import * diff --git a/deepspeed/autotuning/constants.py b/deepspeed/autotuning/constants.py new file mode 100644 index 000000000000..30330f1d7b53 --- /dev/null +++ b/deepspeed/autotuning/constants.py @@ -0,0 +1,133 @@ +""" +Copyright (c) Microsoft Corporation +Licensed under the MIT license. +""" + +######################################### +# autotunner implementation constants +######################################### + +import os + +DEFAULT_TEMPLATE_PATH_ZERO_0 = os.path.join(os.path.dirname(os.path.realpath(__file__)), + "config_templates", + "template_zero0.json") +DEFAULT_TEMPLATE_PATH_ZERO_1 = os.path.join(os.path.dirname(os.path.realpath(__file__)), + "config_templates", + "template_zero1.json") +DEFAULT_TEMPLATE_PATH_ZERO_2 = os.path.join(os.path.dirname(os.path.realpath(__file__)), + "config_templates", + "template_zero2.json") +DEFAULT_TEMPLATE_PATH_ZERO_3 = os.path.join(os.path.dirname(os.path.realpath(__file__)), + "config_templates", + "template_zero3.json") + +DEFAULT_EXPRS_DIR = os.path.join(os.getcwd(), "autotuning_exps") +DEFAULT_RESULTS_DIR = os.path.join(os.getcwd(), "autotuning_results") + +METRIC_PERCENT_DIFF_CONST = 0.05 +DS_CONFIG = "ds_config" +BUFSIZE = 1 # line buffer size for writing files + +######################################### +# MODEL INFO +######################################### +AUTOTUNING_MODEL_INFO_PATH = "model_info_path" +AUTOTUNING_MODEL_INFO_PATH_DEFAULT = None + +MODEL_INFO_FORMAT = ''' +"model_info": { + "num_params": 1000000000, + "hidden_size": 10, + "num_layers": 12, +} +''' +MODEL_INFO = "model_info" +MODEL_INFO_PROFILE = "profile" +MODEL_INFO_PROFILE_DEFAULT = False +MODEL_INFO_NUM_PARAMS = "num_params" +MODEL_INFO_NUM_PARAMS_DEFAULT = None +MODEL_INFO_HIDDEN_SIZE = "hideen_size" +MODEL_INFO_HIDDEN_SIZE_DEFAULT = None +MODEL_INFO_NUM_LAYERS = "num_layers" +MODEL_INFO_NUM_LAYERS_DEFAULT = None + +MODEL_INFO_KEY_DEFAULT_DICT = { + MODEL_INFO_PROFILE: MODEL_INFO_PROFILE_DEFAULT, + MODEL_INFO_NUM_PARAMS: MODEL_INFO_NUM_PARAMS_DEFAULT, + MODEL_INFO_HIDDEN_SIZE: MODEL_INFO_HIDDEN_SIZE_DEFAULT, + MODEL_INFO_NUM_LAYERS: MODEL_INFO_NUM_LAYERS_DEFAULT +} + +######################################### +# autotunner search space constants +######################################### + +DEFAULT_HF_CONFIG = { + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "gradient_accumulation_steps": "auto", +} + +DEFAULT_MIN_MEM_CONFIG = { + "train_micro_batch_size_per_gpu": 1, + "zero_optimization": { + "stage": 3 + }, + "memory_break_down": False +} + +DEFAULT_TUNING_SPACE_ZERO_0 = {"zero_optimization": 
{"stage": 0}} + +DEFAULT_TUNING_SPACE_ZERO_1 = { + "zero_optimization": { + "stage": 1, + "reduce_bucket_size": [5e7, + 5e8, + 1e9], + "allgather_bucket_size": [5e7, + 5e8, + 1e9], + } +} + +DEFAULT_TUNING_SPACE_ZERO_2 = { + "zero_optimization": { + "stage": 2, + "overlap_comm": [True, + False], + "reduce_scatter": [False, + True], + "reduce_bucket_size": [5e7, + 5e8, + 1e9], + "allgather_bucket_size": [5e7, + 5e8, + 1e9], + "contiguous_gradients": [False, + True] + }, +} + +DEFAULT_TUNING_SPACE_ZERO_3 = { + "zero_optimization": { + "stage": 3, + "overlap_comm": [True, + False], + "reduce_scatter": [False, + True], + "reduce_bucket_size": [5e7, + 5e8, + 1e9], + "allgather_partitions": [True, + False], + "allgather_bucket_size": [5e7, + 5e8, + 1e9], + "contiguous_gradients": [False, + True] + }, +} + +GLOBAL_TUNING_SPACE = 'global' +TUNING_MICRO_BATCH_SIZE_PREFIX = "z"