Impl adam_w #2957

Merged
17 commits merged on Jun 26, 2024
61 changes: 35 additions & 26 deletions driver/adam_driver.hpp
@@ -62,6 +62,7 @@ void mloAdamRunHost(miopenTensorDescriptor_t paramDesc,
float eps,
bool amsgrad,
bool maximize,
bool adamw,
bool is_amp,
int32_t grad_scale,
bool found_inf)
@@ -86,7 +87,12 @@ void mloAdamRunHost(miopenTensorDescriptor_t paramDesc,
float bias_correction2 = 1 - pow(beta2, step);

if(weight_decay != 0)
grad += param * weight_decay;
{
if(adamw)
param -= lr * weight_decay * param;
else
grad += param * weight_decay;
}

exp_avg = exp_avg * beta1 + grad * (1 - beta1);
exp_avg_sq = exp_avg_sq * beta2 + grad * grad * (1 - beta2);
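
Note: the branch above is the behavioural core of this PR in the host reference: classic Adam folds the decay into the gradient (L2 regularization), while AdamW decays the parameter directly before the moment updates. As a standalone illustration of just that branch (the function name is a sketch, not the driver's API):

// Illustrative sketch of the two weight-decay styles handled above.
void apply_weight_decay(float& param, float& grad,
                        float lr, float weight_decay, bool adamw)
{
    if(weight_decay == 0)
        return;
    if(adamw)
        param -= lr * weight_decay * param; // decoupled decay: shrink the parameter directly
    else
        grad += param * weight_decay;       // L2 decay: fold the penalty into the gradient
}
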
@@ -114,11 +120,11 @@ void mloAdamRunHost(miopenTensorDescriptor_t paramDesc,

#endif

template <typename Tgpu, typename Tref = Tgpu, bool is_amp = false, typename Tgrad = Tgpu>
template <typename Tgpu, typename Tref = Tgpu, typename Tgrad = Tgpu>
class AdamDriver : public Driver
{
public:
AdamDriver() : Driver()
AdamDriver(bool adamw_ = false, bool is_amp_ = false) : Driver(), adamw(adamw_), is_amp(is_amp_)
{
miopenCreateTensorDescriptor(&paramDesc);
miopenCreateTensorDescriptor(&gradDesc);
@@ -219,14 +225,16 @@ class AdamDriver : public Driver
bool amsgrad = false;
bool maximize = false;
bool found_inf = false;
bool adamw = false;
bool is_amp = false;
int grad_scale = 1;
int iter = 0;

miopenDataType_t grad_type;
};

template <typename Tgpu, typename Tref, bool is_amp, typename Tgrad>
int AdamDriver<Tgpu, Tref, is_amp, Tgrad>::ParseCmdLineArgs(int argc, char* argv[])
template <typename Tgpu, typename Tref, typename Tgrad>
int AdamDriver<Tgpu, Tref, Tgrad>::ParseCmdLineArgs(int argc, char* argv[])
{
inflags.Parse(argc, argv);

@@ -237,8 +245,8 @@ int AdamDriver<Tgpu, Tref, is_amp, Tgrad>::ParseCmdLineArgs(int argc, char* argv
return miopenStatusSuccess;
}

template <typename Tgpu, typename Tref, bool is_amp, typename Tgrad>
int AdamDriver<Tgpu, Tref, is_amp, Tgrad>::GetandSetData()
template <typename Tgpu, typename Tref, typename Tgrad>
int AdamDriver<Tgpu, Tref, Tgrad>::GetandSetData()
{
auto param_len = GetInputTensorLengthsFromCmdLine();
lr = inflags.GetValueDouble("lr");
@@ -280,8 +288,8 @@ int AdamDriver<Tgpu, Tref, is_amp, Tgrad>::GetandSetData()
return 0;
}

template <typename Tgpu, typename Tref, bool is_amp, typename Tgrad>
int AdamDriver<Tgpu, Tref, is_amp, Tgrad>::AddCmdLineArgs()
template <typename Tgpu, typename Tref, typename Tgrad>
int AdamDriver<Tgpu, Tref, Tgrad>::AddCmdLineArgs()
{
inflags.AddInputFlag("forw", 'F', "1", "Run only Forward GroupNorm (Default=1)", "int");
inflags.AddTensorFlag("dims", 'd', "64x32x128", "params tensor dims (Default=64x32x128)");
@@ -309,8 +317,8 @@ int AdamDriver<Tgpu, Tref, is_amp, Tgrad>::AddCmdLineArgs()
return miopenStatusSuccess;
}

template <typename Tgpu, typename Tref, bool is_amp, typename Tgrad>
std::vector<int> AdamDriver<Tgpu, Tref, is_amp, Tgrad>::GetInputTensorLengthsFromCmdLine()
template <typename Tgpu, typename Tref, typename Tgrad>
std::vector<int> AdamDriver<Tgpu, Tref, Tgrad>::GetInputTensorLengthsFromCmdLine()
{
std::vector<int> ret;
auto tensor = inflags.GetValueTensor("dims");
@@ -319,8 +327,8 @@ std::vector<int> AdamDriver<Tgpu, Tref, is_amp, Tgrad>::GetInputTensorLengthsFro
return ret;
}

template <typename Tgpu, typename Tref, bool is_amp, typename Tgrad>
int AdamDriver<Tgpu, Tref, is_amp, Tgrad>::AllocateBuffersAndCopy()
template <typename Tgpu, typename Tref, typename Tgrad>
int AdamDriver<Tgpu, Tref, Tgrad>::AllocateBuffersAndCopy()
{
size_t param_sz = GetTensorSize(paramDesc);

@@ -425,8 +433,8 @@ int AdamDriver<Tgpu, Tref, is_amp, Tgrad>::AllocateBuffersAndCopy()
return miopenStatusSuccess;
}

template <typename Tgpu, typename Tref, bool is_amp, typename Tgrad>
int AdamDriver<Tgpu, Tref, is_amp, Tgrad>::RunForwardGPU()
template <typename Tgpu, typename Tref, typename Tgrad>
int AdamDriver<Tgpu, Tref, Tgrad>::RunForwardGPU()
{
float kernel_total_time = 0;
float kernel_first_time = 0;
@@ -474,7 +482,7 @@ int AdamDriver<Tgpu, Tref, is_amp, Tgrad>::RunForwardGPU()
eps,
amsgrad,
maximize,
false, // adamw
adamw,
gradScaleDesc,
grad_scale_ptr,
foundInfDesc,
@@ -505,8 +513,8 @@ int AdamDriver<Tgpu, Tref, is_amp, Tgrad>::RunForwardGPU()
return miopenStatusSuccess;
}

template <typename Tgpu, typename Tref, bool is_amp, typename Tgrad>
int AdamDriver<Tgpu, Tref, is_amp, Tgrad>::RunForwardCPU()
template <typename Tgpu, typename Tref, typename Tgrad>
int AdamDriver<Tgpu, Tref, Tgrad>::RunForwardCPU()
{
mloAdamRunHost<Tref>(paramDesc,
param_host.data(),
@@ -522,21 +530,22 @@ int AdamDriver<Tgpu, Tref, is_amp, Tgrad>::RunForwardCPU()
eps,
amsgrad,
maximize,
adamw,
is_amp,
grad_scale,
found_inf);

return miopenStatusSuccess;
}

template <typename Tgpu, typename Tref, bool is_amp, typename Tgrad>
int AdamDriver<Tgpu, Tref, is_amp, Tgrad>::RunBackwardGPU()
template <typename Tgpu, typename Tref, typename Tgrad>
int AdamDriver<Tgpu, Tref, Tgrad>::RunBackwardGPU()
{
return miopenStatusSuccess;
}

template <typename Tgpu, typename Tref, bool is_amp, typename Tgrad>
Tref AdamDriver<Tgpu, Tref, is_amp, Tgrad>::GetTolerance()
template <typename Tgpu, typename Tref, typename Tgrad>
Tref AdamDriver<Tgpu, Tref, Tgrad>::GetTolerance()
{
if(data_type == miopenHalf)
{
@@ -557,8 +566,8 @@ Tref AdamDriver<Tgpu, Tref, is_amp, Tgrad>::GetTolerance()
return 0;
}

template <typename Tgpu, typename Tref, bool is_amp, typename Tgrad>
int AdamDriver<Tgpu, Tref, is_amp, Tgrad>::VerifyForward()
template <typename Tgpu, typename Tref, typename Tgrad>
int AdamDriver<Tgpu, Tref, Tgrad>::VerifyForward()
{
RunForwardCPU();
const Tref tolerance = GetTolerance();
@@ -575,8 +584,8 @@ int AdamDriver<Tgpu, Tref, is_amp, Tgrad>::VerifyForward()
return miopenStatusSuccess;
}

template <typename Tgpu, typename Tref, bool is_amp, typename Tgrad>
int AdamDriver<Tgpu, Tref, is_amp, Tgrad>::VerifyBackward()
template <typename Tgpu, typename Tref, typename Tgrad>
int AdamDriver<Tgpu, Tref, Tgrad>::VerifyBackward()
{
return miopenStatusSuccess;
}
8 changes: 7 additions & 1 deletion driver/dm_adam.cpp
@@ -33,7 +33,13 @@ static Driver* makeDriver(const std::string& base_arg)
else if(base_arg == "adamfp16")
return new AdamDriver<float16, float>();
else if(base_arg == "ampadam")
return new AdamDriver<float, float, true, float16>();
return new AdamDriver<float, float, float16>(false, true);
else if(base_arg == "adamw")
return new AdamDriver<float, float>(true);
else if(base_arg == "adamwfp16")
return new AdamDriver<float16, float>(true);
else if(base_arg == "ampadamw")
return new AdamDriver<float, float, float16>(true, true);
return nullptr;
}

6 changes: 4 additions & 2 deletions driver/driver.hpp
@@ -174,7 +174,8 @@ inline void PadBufferSize(size_t& sz, int datatype_sz)
"activ[fp16], softmax[fp16], bnorm[fp16], rnn[fp16], gemm[fp16], ctc, dropout[fp16], "
"tensorop[fp16], reduce[fp16|fp64], layernorm[bfp16|fp16], sum[bfp16|fp16], "
"groupnorm[bfp16|fp16], cat[bfp16|fp16], addlayernorm[bfp16|fp16], "
"t5layernorm[bfp16|fp16], adam[fp16], ampadam, reduceextreme[bfp16|fp16]\n");
"t5layernorm[bfp16|fp16], adam[fp16], ampadam, reduceextreme[bfp16|fp16], "
"adamw[fp16], ampadamw\n");
exit(0); // NOLINT (concurrency-mt-unsafe)
}

@@ -202,7 +203,8 @@ inline std::string ParseBaseArg(int argc, char* argv[])
arg != "addlayernorm" && arg != "addlayernormfp16" && arg != "addlayernormbfp16" &&
arg != "t5layernorm" && arg != "t5layernormfp16" && arg != "t5layernormbfp16" &&
arg != "adam" && arg != "adamfp16" && arg != "ampadam" && arg != "reduceextreme" &&
arg != "reduceextremefp16" && arg != "reduceextremebfp16" && arg != "--version")
arg != "reduceextremefp16" && arg != "reduceextremebfp16" && arg != "adamw" &&
arg != "adamwfp16" && arg != "ampadamw" && arg != "--version")
{
printf("FAILED: Invalid Base Input Argument\n");
Usage();
40 changes: 38 additions & 2 deletions include/miopen/miopen.h
@@ -6916,6 +6916,14 @@ MIOPEN_EXPORT miopenStatus_t miopenBackendInitialize(miopenBackendDescriptor_t d
* gradients. Fused Adam optimization efficiently combines multiple operations into a single kernel,
* reducing memory access overhead and improving performance.
*
 * Additionally, Fused Adam can be used in AdamW mode and with Automatic Mixed Precision (AMP),
 * enabling faster model training and reduced memory consumption. AMP performs the model
 * computation in a mixture of FP32 and FP16 precision to improve training speed. When AMP is
 * used, the FoundInf, ScaleGrad, and step tensors should be supplied. In AMP mode, whether the
 * Adam update is applied is determined by the FoundInf value. State Step accepts either an int
 * value or an int tensor. If a step tensor is supplied, the step passed as an int is ignored,
 * and whenever the Adam update is applied the step tensor is incremented by 1.
*
* @code
* // Execute Adam
* miopenFusedAdam(handle,
@@ -6945,6 +6953,34 @@ MIOPEN_EXPORT miopenStatus_t miopenBackendInitialize(miopenBackendDescriptor_t d
* NULL, // Unused foundInf Tensor because not amp
* NULL);
*
* // Execute AdamW
* miopenFusedAdam(handle,
* paramDesc,
* param,
* gradDesc,
* grad,
* expAvgDesc,
* expAvg,
* expAvgSqDesc,
* expAvgSq,
* NULL, // Unused maxExpAvgSqDesc because amsgrad is false
* NULL,
* NULL, // Unused stateStep Tensor because use step integer argument
* NULL,
* step,
* lr,
* beta1,
* beta2,
* weight_decay,
* eps,
* false, // amsgrad
* false, // maximize
* true, // adamw
* NULL, // Unused gradScale Tensor because not amp
* NULL,
* NULL, // Unused foundInf Tensor because not amp
* NULL);
*
* // Execute AMP Adam
* miopenFusedAdam(handle,
* paramDesc,
@@ -7002,7 +7038,7 @@ MIOPEN_EXPORT miopenStatus_t miopenBackendInitialize(miopenBackendDescriptor_t d
* @param amsgrad Flag indicating whether to use the AMSGrad variant of Adam (input)
* @param maximize Flag indicating whether to maximize the objective with respect to the
* parameters (input)
* @param adamw If true, the operation becomes AdamW (input) (not supported)
* @param adamw If true, the operation becomes AdamW (input)
* @param gradScaleDesc Tensor descriptor for the input grad scale tensor (input, optional)
* @param gradScale Input grad scale tensor (input, optional)
* @param foundInfDesc Tensor descriptor for the input found inf tensor (input, optional)
@@ -7169,7 +7205,7 @@ MIOPEN_EXPORT miopenStatus_t miopenFusedAdam(miopenHandle_t handle,
* @param amsgrad Flag indicating whether to use the AMSGrad variant of Adam (input)
* @param maximize Flag indicating whether to maximize the objective with respect to the
* parameters (input)
* @param adamw If it is true, the operation becomes AdamW (input) (not supported)
* @param adamw If it is true, the operation becomes AdamW (input)
* @param gradScaleDesc Tensor descriptor for the input grad scale tensor (input, optional)
* @param gradScale Input grad scale tensor (input, optional)
* @param foundInfDesc Tensor descriptor for the input found inf tensor (input, optional)
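Note: the doc comment above states that in AMP mode the update is gated by the FoundInf value and that a supplied step tensor is incremented only when the update actually runs. A minimal host-side sketch of that behaviour follows; the grad-scale handling and all names are illustrative assumptions, not the MIOpen API:

#include <cmath>

// Illustrative sketch of the AMP gating described in the doc comment above.
// Names and the grad-scale handling are assumptions for illustration only.
void amp_fused_adam_host(float& param, float grad, float& exp_avg, float& exp_avg_sq,
                         int& step, bool found_inf, float grad_scale,
                         float lr, float beta1, float beta2, float eps,
                         float weight_decay, bool adamw)
{
    if(found_inf)        // an inf/nan was detected in the scaled gradients: skip the update
        return;

    step += 1;           // the step tensor advances only when the update is applied
    grad /= grad_scale;  // assumed: unscale the gradient produced under AMP loss scaling

    if(weight_decay != 0)
    {
        if(adamw)
            param -= lr * weight_decay * param; // decoupled decay (AdamW)
        else
            grad += param * weight_decay;       // L2 decay folded into the gradient (Adam)
    }

    exp_avg    = exp_avg * beta1 + grad * (1 - beta1);
    exp_avg_sq = exp_avg_sq * beta2 + grad * grad * (1 - beta2);

    const float bias_correction1 = 1 - std::pow(beta1, step);
    const float bias_correction2 = 1 - std::pow(beta2, step);
    const float denom            = std::sqrt(exp_avg_sq / bias_correction2) + eps;
    param -= lr * (exp_avg / bias_correction1) / denom;
}
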
4 changes: 2 additions & 2 deletions src/adam/problem_description.cpp
@@ -45,8 +45,8 @@ NetworkConfig ProblemDescription::MakeNetworkConfig() const
ss << kernel;
if(IsAdamW())
ss << "w";
if(IsAllPacked())
ss << "packed";
if(IsAllContiguous())
ss << "cont";
ss << "step" << step_ind;
ss << "dtype" << dtype;
if(IsAmp())
6 changes: 3 additions & 3 deletions src/include/miopen/adam/problem_description.hpp
@@ -138,10 +138,10 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase
bool ExistStepTensor() const { return !stepInDesc.GetLengths().empty(); }
bool IsAmp() const { return is_amp; }
bool IsAdamW() const { return adamw; }
bool IsAllPacked() const
bool IsAllContiguous() const
{
if(!(paramInDesc.IsPacked() && gradInDesc.IsPacked() && expAvgInDesc.IsPacked() &&
expAvgSqInDesc.IsPacked()))
if(!(paramInDesc.IsContiguous() && gradInDesc.IsContiguous() &&
expAvgInDesc.IsContiguous() && expAvgSqInDesc.IsContiguous()))
return false;
return true;
}
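
Note: TensorDescriptor::IsContiguous is only declared in tensor.hpp below; its body is not part of this diff. As a rough sketch of the meaning the rename implies (an assumption, not the actual MIOpen implementation), a descriptor is contiguous when its strides are exactly the dense row-major strides derived from its lengths:

#include <cstddef>
#include <vector>

// Assumption for illustration only; the real TensorDescriptor::IsContiguous is not shown here.
bool is_contiguous_sketch(const std::vector<std::size_t>& lens,
                          const std::vector<std::size_t>& strides)
{
    std::size_t expected = 1;
    for(std::size_t i = lens.size(); i-- > 0;) // walk from the innermost dimension outwards
    {
        if(strides[i] != expected)
            return false;
        expected *= lens[i];
    }
    return true;
}
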
1 change: 1 addition & 0 deletions src/include/miopen/tensor.hpp
@@ -216,6 +216,7 @@ struct MIOPEN_INTERNALS_EXPORT TensorDescriptor : miopenTensorDescriptor
}

bool IsPacked() const;
bool IsContiguous() const;
/// Checks all lengths and strides.
bool AllDimsFitIntoInt() const;
/// Checks only lengths.
Expand Down