example/quadratic.f90 (45 additions, 2 deletions)
@@ -4,10 +4,10 @@ program quadratic_fit
! descent.
use nf, only: dense, input, network
use nf_dense_layer, only: dense_layer
use nf_optimizers, only: sgd, rmsprop, adam
use nf_optimizers, only: sgd, rmsprop, adam, adagrad

implicit none
type(network) :: net(9)
type(network) :: net(10)

! Training parameters
integer, parameter :: num_epochs = 1000
@@ -95,6 +95,11 @@ program quadratic_fit
beta1, beta2, epsilon, weight_decay_decoupled=1e-5 &
)

! Adagrad optimizer
call adagrad_optimizer( &
net(10), x, y, xtest, ytest, learning_rate, num_epochs, epsilon &
)

contains

real elemental function quadratic(x) result(y)
@@ -358,6 +363,44 @@ subroutine adam_optimizer( &

end subroutine adam_optimizer

subroutine adagrad_optimizer( &
net, x, y, xtest, ytest, learning_rate, num_epochs, epsilon &
)
! Adagrad optimizer for updating weights using the adaptive gradient algorithm
type(network), intent(inout) :: net
real, intent(in) :: x(:), y(:)
real, intent(in) :: xtest(:), ytest(:)
real, intent(in) :: learning_rate, epsilon
integer, intent(in) :: num_epochs
integer :: i, n
real, allocatable :: ypred(:)

print '(a)', 'Adagrad optimizer'
print '(34("-"))'

do n = 1, num_epochs
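! Run forward and backward passes over all training samples to compute the gradients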

do i = 1, size(x)
call net % forward([x(i)])
call net % backward([y(i)])
end do
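! Update the network parameters with the Adagrad rule, using the gradients from the backward passes above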

call net % update( &
adagrad(learning_rate=learning_rate, epsilon=epsilon) &
)

if (mod(n, num_epochs / 10) == 0) then
ypred = [(net % predict([xtest(i)]), i = 1, size(xtest))]
print '("Epoch: ", i4,"/",i4,", RMSE = ", f9.6)', &
n, num_epochs, sqrt(sum((ypred - ytest)**2) / size(ytest))
end if

end do

print *, ''

end subroutine adagrad_optimizer

subroutine shuffle(arr)
! Shuffle an array using the Fisher-Yates algorithm.
integer, intent(inout) :: arr(:)
src/nf/nf_optimizers.f90 (40 additions, 2 deletions)
@@ -13,7 +13,7 @@ module nf_optimizers
implicit none

private
public :: optimizer_base_type, sgd, rmsprop, adam
public :: optimizer_base_type, sgd, rmsprop, adam, adagrad

type, abstract :: optimizer_base_type
real :: learning_rate = 0.01
@@ -87,6 +87,20 @@ end subroutine minimize
procedure :: minimize => minimize_adam
end type adam

type, extends(optimizer_base_type) :: adagrad
!! Adagrad optimizer by Duchi et al. (2011)
!!
!! Duchi, J., Hazan, E. and Singer, Y., 2011. Adaptive subgradient
!! methods for online learning and stochastic optimization. Journal
!! of Machine Learning Research, 12(Jul), pp.2121-2159.
!! http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf
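!!
!! Update rule, as implemented in minimize_adagrad below:
!!
!!   sum_squared_gradient = sum_squared_gradient + gradient**2
!!   param = param - learning_rate * gradient
!!             / (sqrt(sum_squared_gradient) + epsilon)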
real :: epsilon = 1e-8
real, allocatable, private :: sum_squared_gradient(:)
contains
procedure :: init => init_adagrad
procedure :: minimize => minimize_adagrad
end type adagrad

contains

impure elemental subroutine init_sgd(self, num_params)
Expand Down Expand Up @@ -187,10 +201,34 @@ pure subroutine minimize_adam(self, param, gradient)
! Update parameters.
param = param &
- self % learning_rate * m_hat / (sqrt(v_hat) + self % epsilon) &
- self % weight_decay_decoupled * param
- self % learning_rate * self % weight_decay_decoupled * param

end associate

end subroutine minimize_adam

impure elemental subroutine init_adagrad(self, num_params)
class(adagrad), intent(inout) :: self
integer, intent(in) :: num_params
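! Allocate and zero the running sum of squared gradients on the first call only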
if (.not. allocated(self % sum_squared_gradient)) then
allocate(self % sum_squared_gradient(num_params))
self % sum_squared_gradient = 0
end if
end subroutine init_adagrad

pure subroutine minimize_adagrad(self, param, gradient)
!! Concrete implementation of an Adagrad optimizer update rule.
class(adagrad), intent(inout) :: self
real, intent(inout) :: param(:)
real, intent(in) :: gradient(:)

! Update the sum of squared gradients using the Adagrad rule
self % sum_squared_gradient = self % sum_squared_gradient + gradient**2

! Update the network parameters based on the new sum of squared gradients
param = param - self % learning_rate &
/ (sqrt(self % sum_squared_gradient) + self % epsilon) * gradient

end subroutine minimize_adagrad

end module nf_optimizers
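
As a quick sanity check of the new type, here is a minimal standalone sketch that drives adagrad directly, outside the network workflow shown in example/quadratic.f90. It relies only on the public interface visible in this diff (the learning_rate and epsilon constructor arguments and the init and minimize bindings); the program name, parameter values, and toy gradient are illustrative and not part of the library.

program adagrad_demo
  use nf_optimizers, only: adagrad
  implicit none
  type(adagrad) :: opt
  real :: param(3), gradient(3)
  integer :: step

  ! Start from arbitrary parameter values; the toy "loss" is sum(param**2),
  ! so its gradient is simply 2 * param.
  param = [1.0, -2.0, 0.5]

  opt = adagrad(learning_rate=0.1, epsilon=1e-8)
  call opt % init(size(param))

  do step = 1, 5
    gradient = 2 * param
    call opt % minimize(param, gradient)
    print '("step ", i0, ": param =", 3(1x, f9.6))', step, param
  end do

end program adagrad_demo

In normal use, the example above simply passes the constructor result to net % update, and the network presumably takes care of calling init and minimize on the optimizer for each layer's parameters.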