example/quadratic.f90 (45 additions, 2 deletions)
@@ -4,10 +4,10 @@ program quadratic_fit
! descent.
use nf, only: dense, input, network
use nf_dense_layer, only: dense_layer
use nf_optimizers, only: sgd, rmsprop, adam
use nf_optimizers, only: sgd, rmsprop, adam, adagrad

implicit none
type(network) :: net(9)
type(network) :: net(10)

! Training parameters
integer, parameter :: num_epochs = 1000
@@ -95,6 +95,11 @@ program quadratic_fit
beta1, beta2, epsilon, weight_decay_decoupled=1e-5 &
)

! Adagrad optimizer
call adagrad_optimizer( &
net(10), x, y, xtest, ytest, learning_rate, num_epochs, epsilon &
)

contains

real elemental function quadratic(x) result(y)
@@ -358,6 +363,44 @@ subroutine adam_optimizer( &

end subroutine adam_optimizer

subroutine adagrad_optimizer( &
net, x, y, xtest, ytest, learning_rate, num_epochs, epsilon &
)
! Adagrad optimizer for updating weights using the adaptive gradient algorithm
type(network), intent(inout) :: net
real, intent(in) :: x(:), y(:)
real, intent(in) :: xtest(:), ytest(:)
real, intent(in) :: learning_rate, epsilon
integer, intent(in) :: num_epochs
integer :: i, n
real, allocatable :: ypred(:)

print '(a)', 'Adagrad optimizer'
print '(34("-"))'

do n = 1, num_epochs
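! Run forward and backward passes over all training samples to compute the gradients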

do i = 1, size(x)
call net % forward([x(i)])
call net % backward([y(i)])
end do
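! Update the network parameters with the Adagrad rule, using the gradients from the backward passes above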

call net % update( &
adagrad(learning_rate=learning_rate, epsilon=epsilon) &
)

if (mod(n, num_epochs / 10) == 0) then
ypred = [(net % predict([xtest(i)]), i = 1, size(xtest))]
print '("Epoch: ", i4,"/",i4,", RMSE = ", f9.6)', &
n, num_epochs, sqrt(sum((ypred - ytest)**2) / size(ytest))
end if

end do

print *, ''

end subroutine adagrad_optimizer

subroutine shuffle(arr)
! Shuffle an array using the Fisher-Yates algorithm.
integer, intent(inout) :: arr(:)
src/nf/nf_optimizers.f90 (40 additions, 2 deletions)
@@ -13,7 +13,7 @@ module nf_optimizers
implicit none

private
public :: optimizer_base_type, sgd, rmsprop, adam
public :: optimizer_base_type, sgd, rmsprop, adam, adagrad

type, abstract :: optimizer_base_type
real :: learning_rate = 0.01
@@ -87,6 +87,20 @@ end subroutine minimize
procedure :: minimize => minimize_adam
end type adam

type, extends(optimizer_base_type) :: adagrad
!! Adagrad optimizer by Duchi et al. (2011)
!!
!! Duchi, J., Hazan, E. and Singer, Y., 2011. Adaptive subgradient
!! methods for online learning and stochastic optimization. Journal
!! of Machine Learning Research, 12(Jul), pp.2121-2159.
!! http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf
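!!
!! Update rule, as implemented in minimize_adagrad below:
!!
!!   sum_squared_gradient = sum_squared_gradient + gradient**2
!!   param = param - learning_rate * gradient
!!             / (sqrt(sum_squared_gradient) + epsilon)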
real :: epsilon = 1e-8
real, allocatable, private :: sum_squared_gradient(:)
contains
procedure :: init => init_adagrad
procedure :: minimize => minimize_adagrad
end type adagrad

contains

impure elemental subroutine init_sgd(self, num_params)
Expand Down Expand Up @@ -187,10 +201,34 @@ pure subroutine minimize_adam(self, param, gradient)
! Update parameters.
param = param &
- self % learning_rate * m_hat / (sqrt(v_hat) + self % epsilon) &
- self % weight_decay_decoupled * param
- self % learning_rate * self % weight_decay_decoupled * param

end associate

end subroutine minimize_adam

impure elemental subroutine init_adagrad(self, num_params)
class(adagrad), intent(inout) :: self
integer, intent(in) :: num_params
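! Allocate and zero the running sum of squared gradients on the first call only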
if (.not. allocated(self % sum_squared_gradient)) then
allocate(self % sum_squared_gradient(num_params))
self % sum_squared_gradient = 0
end if
end subroutine init_adagrad

pure subroutine minimize_adagrad(self, param, gradient)
!! Concrete implementation of an Adagrad optimizer update rule.
class(adagrad), intent(inout) :: self
real, intent(inout) :: param(:)
real, intent(in) :: gradient(:)

! Update the sum of squared gradients using the Adagrad rule
self % sum_squared_gradient = self % sum_squared_gradient + gradient**2

! Update the network parameters based on the new sum of squared gradients
param = param - self % learning_rate &
/ (sqrt(self % sum_squared_gradient) + self % epsilon) * gradient

end subroutine minimize_adagrad

end module nf_optimizers
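
As a quick sanity check of the new type, here is a minimal standalone sketch that drives adagrad directly, outside the network workflow shown in example/quadratic.f90. It relies only on the public interface visible in this diff (the learning_rate and epsilon constructor arguments and the init and minimize bindings); the program name, parameter values, and toy gradient are illustrative and not part of the library.

program adagrad_demo
  use nf_optimizers, only: adagrad
  implicit none
  type(adagrad) :: opt
  real :: param(3), gradient(3)
  integer :: step

  ! Start from arbitrary parameter values; the toy "loss" is sum(param**2),
  ! so its gradient is simply 2 * param.
  param = [1.0, -2.0, 0.5]

  opt = adagrad(learning_rate=0.1, epsilon=1e-8)
  call opt % init(size(param))

  do step = 1, 5
    gradient = 2 * param
    call opt % minimize(param, gradient)
    print '("step ", i0, ": param =", 3(1x, f9.6))', step, param
  end do

end program adagrad_demo

In normal use, the example above simply passes the constructor result to net % update, and the network presumably takes care of calling init and minimize on the optimizer for each layer's parameters.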