Impl adam_w #2957

Merged
17 commits merged on Jun 26, 2024
61 changes: 35 additions & 26 deletions driver/adam_driver.hpp
@@ -62,6 +62,7 @@ void mloAdamRunHost(miopenTensorDescriptor_t paramDesc,
float eps,
bool amsgrad,
bool maximize,
bool adamw,
bool is_amp,
int32_t grad_scale,
bool found_inf)
@@ -86,7 +87,12 @@ void mloAdamRunHost(miopenTensorDescriptor_t paramDesc,
float bias_correction2 = 1 - pow(beta2, step);

if(weight_decay != 0)
grad += param * weight_decay;
{
if(adamw)
param -= lr * weight_decay * param;
else
grad += param * weight_decay;
}

exp_avg = exp_avg * beta1 + grad * (1 - beta1);
exp_avg_sq = exp_avg_sq * beta2 + grad * grad * (1 - beta2);
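
Note: the branch above is the behavioural core of this PR in the host reference: classic Adam folds the decay into the gradient (L2 regularization), while AdamW decays the parameter directly before the moment updates. As a standalone illustration of just that branch (the function name is a sketch, not the driver's API):

// Illustrative sketch of the two weight-decay styles handled above.
void apply_weight_decay(float& param, float& grad,
                        float lr, float weight_decay, bool adamw)
{
    if(weight_decay == 0)
        return;
    if(adamw)
        param -= lr * weight_decay * param; // decoupled decay: shrink the parameter directly
    else
        grad += param * weight_decay;       // L2 decay: fold the penalty into the gradient
}
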
@@ -114,11 +120,11 @@ void mloAdamRunHost(miopenTensorDescriptor_t paramDesc,

#endif

template <typename Tgpu, typename Tref = Tgpu, bool is_amp = false, typename Tgrad = Tgpu>
template <typename Tgpu, typename Tref = Tgpu, typename Tgrad = Tgpu>
class AdamDriver : public Driver
{
public:
AdamDriver() : Driver()
AdamDriver(bool adamw_ = false, bool is_amp_ = false) : Driver(), adamw(adamw_), is_amp(is_amp_)
{
miopenCreateTensorDescriptor(&paramDesc);
miopenCreateTensorDescriptor(&gradDesc);
@@ -219,14 +225,16 @@ class AdamDriver : public Driver
bool amsgrad = false;
bool maximize = false;
bool found_inf = false;
bool adamw = false;
bool is_amp = false;
int grad_scale = 1;
int iter = 0;

miopenDataType_t grad_type;
};

template <typename Tgpu, typename Tref, bool is_amp, typename Tgrad>
int AdamDriver<Tgpu, Tref, is_amp, Tgrad>::ParseCmdLineArgs(int argc, char* argv[])
template <typename Tgpu, typename Tref, typename Tgrad>
int AdamDriver<Tgpu, Tref, Tgrad>::ParseCmdLineArgs(int argc, char* argv[])
{
inflags.Parse(argc, argv);

@@ -237,8 +245,8 @@ int AdamDriver<Tgpu, Tref, is_amp, Tgrad>::ParseCmdLineArgs(int argc, char* argv
return miopenStatusSuccess;
}

template <typename Tgpu, typename Tref, bool is_amp, typename Tgrad>
int AdamDriver<Tgpu, Tref, is_amp, Tgrad>::GetandSetData()
template <typename Tgpu, typename Tref, typename Tgrad>
int AdamDriver<Tgpu, Tref, Tgrad>::GetandSetData()
{
auto param_len = GetInputTensorLengthsFromCmdLine();
lr = inflags.GetValueDouble("lr");
@@ -280,8 +288,8 @@ int AdamDriver<Tgpu, Tref, is_amp, Tgrad>::GetandSetData()
return 0;
}

template <typename Tgpu, typename Tref, bool is_amp, typename Tgrad>
int AdamDriver<Tgpu, Tref, is_amp, Tgrad>::AddCmdLineArgs()
template <typename Tgpu, typename Tref, typename Tgrad>
int AdamDriver<Tgpu, Tref, Tgrad>::AddCmdLineArgs()
{
inflags.AddInputFlag("forw", 'F', "1", "Run only Forward GroupNorm (Default=1)", "int");
inflags.AddTensorFlag("dims", 'd', "64x32x128", "params tensor dims (Default=64x32x128)");
@@ -309,8 +317,8 @@ int AdamDriver<Tgpu, Tref, is_amp, Tgrad>::AddCmdLineArgs()
return miopenStatusSuccess;
}

template <typename Tgpu, typename Tref, bool is_amp, typename Tgrad>
std::vector<int> AdamDriver<Tgpu, Tref, is_amp, Tgrad>::GetInputTensorLengthsFromCmdLine()
template <typename Tgpu, typename Tref, typename Tgrad>
std::vector<int> AdamDriver<Tgpu, Tref, Tgrad>::GetInputTensorLengthsFromCmdLine()
{
std::vector<int> ret;
auto tensor = inflags.GetValueTensor("dims");
@@ -319,8 +327,8 @@ std::vector<int> AdamDriver<Tgpu, Tref, is_amp, Tgrad>::GetInputTensorLengthsFro
return ret;
}

template <typename Tgpu, typename Tref, bool is_amp, typename Tgrad>
int AdamDriver<Tgpu, Tref, is_amp, Tgrad>::AllocateBuffersAndCopy()
template <typename Tgpu, typename Tref, typename Tgrad>
int AdamDriver<Tgpu, Tref, Tgrad>::AllocateBuffersAndCopy()
{
size_t param_sz = GetTensorSize(paramDesc);

@@ -425,8 +433,8 @@ int AdamDriver<Tgpu, Tref, is_amp, Tgrad>::AllocateBuffersAndCopy()
return miopenStatusSuccess;
}

template <typename Tgpu, typename Tref, bool is_amp, typename Tgrad>
int AdamDriver<Tgpu, Tref, is_amp, Tgrad>::RunForwardGPU()
template <typename Tgpu, typename Tref, typename Tgrad>
int AdamDriver<Tgpu, Tref, Tgrad>::RunForwardGPU()
{
float kernel_total_time = 0;
float kernel_first_time = 0;
@@ -474,7 +482,7 @@ int AdamDriver<Tgpu, Tref, is_amp, Tgrad>::RunForwardGPU()
eps,
amsgrad,
maximize,
false, // adamw
adamw,
gradScaleDesc,
grad_scale_ptr,
foundInfDesc,
@@ -505,8 +513,8 @@ int AdamDriver<Tgpu, Tref, is_amp, Tgrad>::RunForwardGPU()
return miopenStatusSuccess;
}

template <typename Tgpu, typename Tref, bool is_amp, typename Tgrad>
int AdamDriver<Tgpu, Tref, is_amp, Tgrad>::RunForwardCPU()
template <typename Tgpu, typename Tref, typename Tgrad>
int AdamDriver<Tgpu, Tref, Tgrad>::RunForwardCPU()
{
mloAdamRunHost<Tref>(paramDesc,
param_host.data(),
@@ -522,21 +530,22 @@ int AdamDriver<Tgpu, Tref, is_amp, Tgrad>::RunForwardCPU()
eps,
amsgrad,
maximize,
adamw,
is_amp,
grad_scale,
found_inf);

return miopenStatusSuccess;
}

template <typename Tgpu, typename Tref, bool is_amp, typename Tgrad>
int AdamDriver<Tgpu, Tref, is_amp, Tgrad>::RunBackwardGPU()
template <typename Tgpu, typename Tref, typename Tgrad>
int AdamDriver<Tgpu, Tref, Tgrad>::RunBackwardGPU()
{
return miopenStatusSuccess;
}

template <typename Tgpu, typename Tref, bool is_amp, typename Tgrad>
Tref AdamDriver<Tgpu, Tref, is_amp, Tgrad>::GetTolerance()
template <typename Tgpu, typename Tref, typename Tgrad>
Tref AdamDriver<Tgpu, Tref, Tgrad>::GetTolerance()
{
if(data_type == miopenHalf)
{
@@ -557,8 +566,8 @@ Tref AdamDriver<Tgpu, Tref, is_amp, Tgrad>::GetTolerance()
return 0;
}

template <typename Tgpu, typename Tref, bool is_amp, typename Tgrad>
int AdamDriver<Tgpu, Tref, is_amp, Tgrad>::VerifyForward()
template <typename Tgpu, typename Tref, typename Tgrad>
int AdamDriver<Tgpu, Tref, Tgrad>::VerifyForward()
{
RunForwardCPU();
const Tref tolerance = GetTolerance();
@@ -575,8 +584,8 @@ int AdamDriver<Tgpu, Tref, is_amp, Tgrad>::VerifyForward()
return miopenStatusSuccess;
}

template <typename Tgpu, typename Tref, bool is_amp, typename Tgrad>
int AdamDriver<Tgpu, Tref, is_amp, Tgrad>::VerifyBackward()
template <typename Tgpu, typename Tref, typename Tgrad>
int AdamDriver<Tgpu, Tref, Tgrad>::VerifyBackward()
{
return miopenStatusSuccess;
}
8 changes: 7 additions & 1 deletion driver/dm_adam.cpp
@@ -33,7 +33,13 @@ static Driver* makeDriver(const std::string& base_arg)
else if(base_arg == "adamfp16")
return new AdamDriver<float16, float>();
else if(base_arg == "ampadam")
return new AdamDriver<float, float, true, float16>();
return new AdamDriver<float, float, float16>(false, true);
else if(base_arg == "adamw")
return new AdamDriver<float, float>(true);
else if(base_arg == "adamwfp16")
return new AdamDriver<float16, float>(true);
else if(base_arg == "ampadamw")
return new AdamDriver<float, float, float16>(true, true);
return nullptr;
}

6 changes: 4 additions & 2 deletions driver/driver.hpp
@@ -174,7 +174,8 @@ inline void PadBufferSize(size_t& sz, int datatype_sz)
"activ[fp16], softmax[fp16], bnorm[fp16], rnn[fp16], gemm[fp16], ctc, dropout[fp16], "
"tensorop[fp16], reduce[fp16|fp64], layernorm[bfp16|fp16], sum[bfp16|fp16], "
"groupnorm[bfp16|fp16], cat[bfp16|fp16], addlayernorm[bfp16|fp16], "
"t5layernorm[bfp16|fp16], adam[fp16], ampadam, reduceextreme[bfp16|fp16]\n");
"t5layernorm[bfp16|fp16], adam[fp16], ampadam, reduceextreme[bfp16|fp16], "
"adamw[fp16], ampadamw\n");
exit(0); // NOLINT (concurrency-mt-unsafe)
}

@@ -202,7 +203,8 @@ inline std::string ParseBaseArg(int argc, char* argv[])
arg != "addlayernorm" && arg != "addlayernormfp16" && arg != "addlayernormbfp16" &&
arg != "t5layernorm" && arg != "t5layernormfp16" && arg != "t5layernormbfp16" &&
arg != "adam" && arg != "adamfp16" && arg != "ampadam" && arg != "reduceextreme" &&
arg != "reduceextremefp16" && arg != "reduceextremebfp16" && arg != "--version")
arg != "reduceextremefp16" && arg != "reduceextremebfp16" && arg != "adamw" &&
arg != "adamwfp16" && arg != "ampadamw" && arg != "--version")
{
printf("FAILED: Invalid Base Input Argument\n");
Usage();
40 changes: 38 additions & 2 deletions include/miopen/miopen.h
@@ -6916,6 +6916,14 @@ MIOPEN_EXPORT miopenStatus_t miopenBackendInitialize(miopenBackendDescriptor_t d
* gradients. Fused Adam optimization efficiently combines multiple operations into a single kernel,
* reducing memory access overhead and improving performance.
*
 * Additionally, Fused Adam can be used in AdamW mode and with Automatic Mixed Precision (AMP),
 * enabling faster model training and reduced memory consumption. AMP performs the model
 * computation in a mixture of FP32 and FP16 precision to improve training speed. When AMP is
 * used, the FoundInf, ScaleGrad, and step tensors should be supplied. In AMP mode, whether the
 * Adam update is applied is determined by the FoundInf value. State Step accepts either an int
 * value or an int tensor. If a step tensor is supplied, the step passed as an int is ignored,
 * and whenever the Adam update is applied the step tensor is incremented by 1.
*
* @code
* // Execute Adam
* miopenFusedAdam(handle,
@@ -6945,6 +6953,34 @@ MIOPEN_EXPORT miopenStatus_t miopenBackendInitialize(miopenBackendDescriptor_t d
* NULL, // Unused foundInf Tensor because not amp
* NULL);
*
* // Execute AdamW
* miopenFusedAdam(handle,
* paramDesc,
* param,
* gradDesc,
* grad,
* expAvgDesc,
* expAvg,
* expAvgSqDesc,
* expAvgSq,
* NULL, // Unused maxExpAvgSqDesc because amsgrad is false
* NULL,
* NULL, // Unused stateStep Tensor because use step integer argument
* NULL,
* step,
* lr,
* beta1,
* beta2,
* weight_decay,
* eps,
* false, // amsgrad
* false, // maximize
* true, // adamw
* NULL, // Unused gradScale Tensor because not amp
* NULL,
* NULL, // Unused foundInf Tensor because not amp
* NULL);
*
* // Execute AMP Adam
* miopenFusedAdam(handle,
* paramDesc,
@@ -7002,7 +7038,7 @@ MIOPEN_EXPORT miopenStatus_t miopenBackendInitialize(miopenBackendDescriptor_t d
* @param amsgrad Flag indicating whether to use the AMSGrad variant of Adam (input)
* @param maximize Flag indicating whether to maximize the objective with respect to the
* parameters (input)
* @param adamw If true, the operation becomes AdamW (input) (not supported)
* @param adamw If true, the operation becomes AdamW (input)
* @param gradScaleDesc Tensor descriptor for the input grad scale tensor (input, optional)
* @param gradScale Input grad scale tensor (input, optional)
* @param foundInfDesc Tensor descriptor for the input found inf tensor (input, optional)
@@ -7169,7 +7205,7 @@ MIOPEN_EXPORT miopenStatus_t miopenFusedAdam(miopenHandle_t handle,
* @param amsgrad Flag indicating whether to use the AMSGrad variant of Adam (input)
* @param maximize Flag indicating whether to maximize the objective with respect to the
* parameters (input)
* @param adamw If it is true, the operation becomes AdamW (input) (not supported)
* @param adamw If it is true, the operation becomes AdamW (input)
* @param gradScaleDesc Tensor descriptor for the input grad scale tensor (input, optional)
* @param gradScale Input grad scale tensor (input, optional)
* @param foundInfDesc Tensor descriptor for the input found inf tensor (input, optional)
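Note: the doc comment above states that in AMP mode the update is gated by the FoundInf value and that a supplied step tensor is incremented only when the update actually runs. A minimal host-side sketch of that behaviour follows; the grad-scale handling and all names are illustrative assumptions, not the MIOpen API:

#include <cmath>

// Illustrative sketch of the AMP gating described in the doc comment above.
// Names and the grad-scale handling are assumptions for illustration only.
void amp_fused_adam_host(float& param, float grad, float& exp_avg, float& exp_avg_sq,
                         int& step, bool found_inf, float grad_scale,
                         float lr, float beta1, float beta2, float eps,
                         float weight_decay, bool adamw)
{
    if(found_inf)        // an inf/nan was detected in the scaled gradients: skip the update
        return;

    step += 1;           // the step tensor advances only when the update is applied
    grad /= grad_scale;  // assumed: unscale the gradient produced under AMP loss scaling

    if(weight_decay != 0)
    {
        if(adamw)
            param -= lr * weight_decay * param; // decoupled decay (AdamW)
        else
            grad += param * weight_decay;       // L2 decay folded into the gradient (Adam)
    }

    exp_avg    = exp_avg * beta1 + grad * (1 - beta1);
    exp_avg_sq = exp_avg_sq * beta2 + grad * grad * (1 - beta2);

    const float bias_correction1 = 1 - std::pow(beta1, step);
    const float bias_correction2 = 1 - std::pow(beta2, step);
    const float denom            = std::sqrt(exp_avg_sq / bias_correction2) + eps;
    param -= lr * (exp_avg / bias_correction1) / denom;
}
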
4 changes: 2 additions & 2 deletions src/adam/problem_description.cpp
@@ -45,8 +45,8 @@ NetworkConfig ProblemDescription::MakeNetworkConfig() const
ss << kernel;
if(IsAdamW())
ss << "w";
if(IsAllPacked())
ss << "packed";
if(IsAllContiguous())
ss << "cont";
ss << "step" << step_ind;
ss << "dtype" << dtype;
if(IsAmp())
6 changes: 3 additions & 3 deletions src/include/miopen/adam/problem_description.hpp
@@ -138,10 +138,10 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase
bool ExistStepTensor() const { return !stepInDesc.GetLengths().empty(); }
bool IsAmp() const { return is_amp; }
bool IsAdamW() const { return adamw; }
bool IsAllPacked() const
bool IsAllContiguous() const
{
if(!(paramInDesc.IsPacked() && gradInDesc.IsPacked() && expAvgInDesc.IsPacked() &&
expAvgSqInDesc.IsPacked()))
if(!(paramInDesc.IsContiguous() && gradInDesc.IsContiguous() &&
expAvgInDesc.IsContiguous() && expAvgSqInDesc.IsContiguous()))
return false;
return true;
}
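
Note: TensorDescriptor::IsContiguous is only declared in tensor.hpp below; its body is not part of this diff. As a rough sketch of the meaning the rename implies (an assumption, not the actual MIOpen implementation), a descriptor is contiguous when its strides are exactly the dense row-major strides derived from its lengths:

#include <cstddef>
#include <vector>

// Assumption for illustration only; the real TensorDescriptor::IsContiguous is not shown here.
bool is_contiguous_sketch(const std::vector<std::size_t>& lens,
                          const std::vector<std::size_t>& strides)
{
    std::size_t expected = 1;
    for(std::size_t i = lens.size(); i-- > 0;) // walk from the innermost dimension outwards
    {
        if(strides[i] != expected)
            return false;
        expected *= lens[i];
    }
    return true;
}
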
1 change: 1 addition & 0 deletions src/include/miopen/tensor.hpp
@@ -216,6 +216,7 @@ struct MIOPEN_INTERNALS_EXPORT TensorDescriptor : miopenTensorDescriptor
}

bool IsPacked() const;
bool IsContiguous() const;
/// Checks all lengths and strides.
bool AllDimsFitIntoInt() const;
/// Checks only lengths.
Expand Down