diff --git a/config_src/drivers/timing_tests/time_MOM_ANN.F90 b/config_src/drivers/timing_tests/time_MOM_ANN.F90
new file mode 100644
index 0000000000..a399835d89
--- /dev/null
+++ b/config_src/drivers/timing_tests/time_MOM_ANN.F90
@@ -0,0 +1,189 @@
+program time_MOM_ANN
+
+! This file is part of MOM6. See LICENSE.md for the license.
+
+use MOM_ANN, only : ANN_CS
+use MOM_ANN, only : ANN_allocate, ANN_apply, ANN_end
+use MOM_ANN, only : ANN_apply_vector_orig, ANN_apply_vector_oi
+use MOM_ANN, only : ANN_apply_array_sio
+use MOM_ANN, only : ANN_random
+
+implicit none
+
+! Command line options
+integer :: nargs ! Number of command line arguments
+character(len=12) :: cmd_ln_arg !< Command line argument (if any)
+
+! ANN parameters
+integer :: nlayers ! Number of layers
+integer :: nin ! Number of inputs
+integer :: layer_width ! Width of hidden layers
+integer :: nout ! Number of outputs
+! Timing parameters
+integer :: nsamp ! Number of measurements
+integer :: nits ! Number of calls to time
+integer :: nxy ! Spatial dimension
+
+nlayers = 7; nin = 4; layer_width = 16; nout = 1 ! Deep network
+!nlayers = 4; nin = 4; layer_width = 48; nout = 1 ! Shallow-wide network
+!nlayers = 3; nin = 4; layer_width = 20; nout = 1 ! Small network
+
+nsamp = 100
+nits = 20000
+!nits = 300000 ! Needed for robust measurements on small networks
+nxy = 100 ! larger array
+!nxy = 10 ! small array
+
+! Optionally grab ANN and timing parameters from the command line
+nargs = command_argument_count()
+if (nargs==7) then
+  call get_command_argument(1, cmd_ln_arg)
+  read(cmd_ln_arg,*) nlayers
+  call get_command_argument(2, cmd_ln_arg)
+  read(cmd_ln_arg,*) nin
+  call get_command_argument(3, cmd_ln_arg)
+  read(cmd_ln_arg,*) layer_width
+  call get_command_argument(4, cmd_ln_arg)
+  read(cmd_ln_arg,*) nout
+  call get_command_argument(5, cmd_ln_arg)
+  read(cmd_ln_arg,*) nsamp
+  call get_command_argument(6, cmd_ln_arg)
+  read(cmd_ln_arg,*) nits
+  call get_command_argument(7, cmd_ln_arg)
+  read(cmd_ln_arg,*) nxy
+endif
+
+! Fastest variants on Intel Xeon W-2223 CPU @ 3.60GHz (gfortran-13.2 -O3)
+!                   | vector(nxy=1)  |   nxy = 10   |   nxy = 100
+! ----------------------------------------------------------------------------
+! Small ANN         |   vector_oi    |   array_soi  |   array_sio
+! Shallow-wide ANN  |   vector_oi    |   array_ois  |   array_sio
+! Deep ANN          |   vector_oi    |   array_ois  |   array_sio
+
+write(*,'(a)') "{"
+
+call time_ANN(nlayers, nin, layer_width, nout, nsamp, nits, nxy, &
+              0, "MOM_ANN:ANN_apply(vector)")
+write(*,"(',')")
+call time_ANN(nlayers, nin, layer_width, nout, nsamp, nits, nxy, &
+              1, "MOM_ANN:ANN_apply_vector_orig(array)")
+write(*,"(',')")
+call time_ANN(nlayers, nin, layer_width, nout, nsamp, nits, nxy, &
+              2, "MOM_ANN:ANN_apply_vector_oi(array)")
+write(*,"(',')")
+call time_ANN(nlayers, nin, layer_width, nout, nsamp, nits, nxy, &
+              12, "MOM_ANN:ANN_apply_array_sio(array)")
+write(*,"()")
+
+write(*,'(a)') "}"
+
+contains
+
+!> Time ANN inference.
+!!
+!! Times are measured over the "nits effective calls" and appropriately scaled to the
+!! time per call per single vector of input features. For array inputs, the number of
+!! actual calls is reduced by the size of the array.  The timing measurement is repeated
+!! "nsamp" times, to check the statistics of the timing measurement.
+subroutine time_ANN(nlayers, nin, width, nout, nsamp, nits, nxy, impl, label)
+  integer,          intent(in)  :: nlayers !< Number of layers
+  integer,          intent(in)  :: nin     !< Number of inputs
+  integer,          intent(in)  :: width   !< Width of hidden layers
+  integer,          intent(in)  :: nout    !< Number of outputs
+  integer,          intent(in)  :: nsamp   !< Number of measurements
+  integer,          intent(in)  :: nits    !< Number of calls to time
+  integer,          intent(in)  :: nxy     !< Spatial dimension
+  integer,          intent(in)  :: impl    !< Implementation to time
+  character(len=*), intent(in)  :: label   !< Label for YAML output
+  ! Local variables
+  type(ANN_CS) :: ANN ! ANN
+  integer :: widths(nlayers) ! Width of each layer
+  real :: x_s(nin) ! Inputs (just features) [nondim]
+  real :: y_s(nin) ! Outputs (just features) [nondim]
+  real :: x_fs(nin,nxy) ! Inputs (feature, space) [nondim]
+  real :: y_fs(nin,nxy) ! Outputs (feature, space) [nondim]
+  real :: x_sf(nin,nxy) ! Inputs (space, feature) [nondim]
+  real :: y_sf(nin,nxy) ! Outputs (space, feature) [nondim]
+  integer :: iter, samp ! Loop counters
+  integer :: ij ! Horizontal loop index
+  real :: start, finish, timing ! CPU times [s]
+  real :: tmin, tmax, tmean, tstd ! Min, max, mean, and standard deviation, of CPU times [s]
+  integer :: asamp ! Actual samples of timings
+  integer :: aits ! Actual iterations
+  real :: words_per_sec ! Operations per sec estimated from parameters [# s-1]
+
+  widths(:) = width
+  widths(1) = nin
+  widths(nlayers) = nout
+
+  call ANN_random(ANN, nlayers, widths)
+  call random_number(x_fs)
+  call random_number(x_sf)
+
+
+  tmin = 1e9
+  tmax = 0.
+  tmean = 0.
+  tstd = 0.
+  asamp = nits ! Most cases below use this
+  aits = nits / nxy ! Most cases below use this
+
+  do samp = 1, nsamp
+    select case (impl)
+      case (0)
+        aits = nits
+        call cpu_time(start)
+        do iter = 1, nits ! Make many passes to reduce sampling error
+          call ANN_apply(x_s, y_s, ANN)
+        enddo
+        call cpu_time(finish)
+      case (1)
+        call cpu_time(start)
+        do iter = 1, aits ! Make many passes to reduce sampling error
+          do ij = 1, nxy
+            call ANN_apply_vector_orig(x_fs(:,ij), y_fs(:,ij), ANN)
+          enddo
+        enddo
+        call cpu_time(finish)
+      case (2)
+        call cpu_time(start)
+        do iter = 1, aits ! Make many passes to reduce sampling error
+          do ij = 1, nxy
+            call ANN_apply_vector_oi(x_fs(:,ij), y_fs(:,ij), ANN)
+          enddo
+        enddo
+        call cpu_time(finish)
+      case (12)
+        call cpu_time(start)
+        do iter = 1, aits ! Make many passes to reduce sampling error
+          call ANN_apply_array_sio(nxy, x_sf(:,:), y_sf(:,:), ANN)
+        enddo
+        call cpu_time(finish)
+        asamp = nsamp * aits ! Account for working on whole arrays
+    end select
+
+    timing = ( finish - start ) / real(nits) ! Average time per call
+
+    tmin = min( tmin, timing )
+    tmax = max( tmax, timing )
+    tmean = tmean + timing
+    tstd = tstd + timing**2
+  enddo
+
+  tmean = tmean / real(nsamp)
+  tstd = tstd / real(nsamp) ! convert to mean of squares
+  tstd = tstd - tmean**2  ! convert to variance
+  tstd = sqrt( tstd * real(nsamp) / real(nsamp-1) ) ! convert to standard deviation
+  words_per_sec = ANN%parameters / ( tmean * 1024 * 1024 )
+
+  write(*,"(2x,3a)") '"', trim(label), '": {'
+  write(*,"(4x,a,1pe11.4,',')") '"min": ', tmin
+  write(*,"(4x,a,1pe11.4,',')") '"mean":', tmean
+  write(*,"(4x,a,1pe11.4,',')") '"std": ', tstd
+  write(*,"(4x,a,i0,',')") '"n_samples": ', asamp
+  write(*,"(4x,a,1pe11.4,',')") '"max": ', tmax
+  write(*,"(4x,a,1pe11.4,'}')", advance="no") '"MBps": ', words_per_sec
+
+end subroutine time_ANN
+
+end program time_MOM_ANN
diff --git a/config_src/drivers/unit_tests/test_MOM_ANN.F90 b/config_src/drivers/unit_tests/test_MOM_ANN.F90
new file mode 100644
index 0000000000..2bdf7c14fe
--- /dev/null
+++ b/config_src/drivers/unit_tests/test_MOM_ANN.F90
@@ -0,0 +1,10 @@
+program test_MOM_ANN
+
+use MOM_ANN, only : ANN_unit_tests
+use MOM_error_handler, only : set_skip_mpi
+
+call set_skip_mpi(.true.) ! This unit tests is not expecting MPI to be used
+
+if ( ANN_unit_tests(.true.) ) stop 1
+
+end program test_MOM_ANN
diff --git a/src/framework/MOM_ANN.F90 b/src/framework/MOM_ANN.F90
new file mode 100644
index 0000000000..56cafcf7d5
--- /dev/null
+++ b/src/framework/MOM_ANN.F90
@@ -0,0 +1,734 @@
+!> Implements the general purpose ANN.
+module MOM_ANN
+
+! This file is part of MOM6. See LICENSE.md for the license
+
+use MOM_io, only : MOM_read_data, field_exists
+use MOM_error_handler, only : MOM_error, FATAL, MOM_mesg
+use numerical_testing_type, only : testing
+
+implicit none ; private
+
+!#include <MOM_memory.h>
+
+public ANN_init, ANN_allocate, ANN_apply, ANN_end, ANN_unit_tests
+public ANN_apply_vector_orig, ANN_apply_vector_oi, ANN_apply_array_sio
+public set_layer, set_input_normalization, set_output_normalization
+public ANN_random, randomize_layer
+
+!> Applies ANN to x, returning results in y
+interface ANN_apply
+  module procedure ANN_apply_vector_oi
+  module procedure ANN_apply_array_sio
+end interface ANN_apply
+
+!> Type for a single Linear layer of ANN,
+!! i.e. stores the matrix A and bias b
+!! for matrix-vector multiplication
+!! y = A*x + b.
+type, private :: layer_type; private
+  integer :: output_width        !< Number of rows in matrix A
+  integer :: input_width         !< Number of columns in matrix A
+  logical :: activation = .True. !< If true, apply the default activation function
+
+  real, allocatable :: A(:,:) !< Matrix in column-major order
+                              !! of size A(output_width, input_width) [nondim]
+  real, allocatable :: b(:)   !< bias vector of size output_width [nondim]
+end type layer_type
+
+!> Control structure/type for ANN
+type, public :: ANN_CS ; private
+  ! Parameters
+  integer :: num_layers          !< Number of layers in the ANN, including the input and output.
+                                 !! For example, for ANN with one hidden layer, num_layers = 3.
+  integer, allocatable &
+          :: layer_sizes(:)      !< Array of length num_layers, storing the number of neurons in
+                                 !! each layer.
+
+  type(layer_type), allocatable &
+          :: layers(:)           !< Array of length num_layers-1, where each element is the Linear
+                                 !! transformation between layers defined by Matrix A and vias b.
+
+  real, allocatable :: &
+    input_means(:), &  !< Array of length layer_sizes(1) containing the mean of each input feature
+                       !! prior to normalization by input_norms [arbitrary].
+    input_norms(:), &  !< Array of length layer_sizes(1) containing the *inverse* of the standard
+                       !! deviation for each input feature used to normalize (multiply) before
+                       !! feeding into the ANN [arbitrary]
+    output_means(:), & !< Array of length layer_sizes(num_layers) containing the mean of each
+                       !! output prior to normalization by output_norms [arbitrary].
+    output_norms(:)    !< Array of length layer_sizes(num_layers) containing the standard deviation
+                       !! each output of the ANN will be multiplied [arbitrary]
+
+  integer, public :: parameters = 0 !< Count of number of parameters
+end type ANN_CS
+
+contains
+
+!> Initialization of ANN. Allocates memory and reads ANN parameters from NetCDF file.
+!! The NetCDF file must contain:
+!! Integer num_layers.
+!! Integer arrays: layer_sizes, input_norms, output_norms
+!! Matrices and biases for Linear layers can be Real(4) or Real(8) and
+!! are named as: A0, b0 for the first layer; A1, b1 for the second layer and so on.
+subroutine ANN_init(CS, NNfile)
+  type(ANN_CS), intent(inout)  :: CS     !< ANN control structure.
+  character(*), intent(in)     :: NNfile !< The name of NetCDF file having neural network parameters
+  ! Local variables
+  integer :: i
+  integer :: num_layers ! Number of layers, including input and output layers
+  integer, allocatable :: layer_sizes(:) ! Number of neurons in each layer
+  character(len=1) :: layer_num_str
+  character(len=3) :: fieldname
+
+  call MOM_mesg('ANN: init from ' // trim(NNfile), 2)
+
+  ! Read the number of layers
+  call MOM_read_data(NNfile, "num_layers", num_layers)
+
+  ! Read size of layers
+  allocate( layer_sizes(num_layers) )
+  call MOM_read_data(NNfile, "layer_sizes", layer_sizes)
+
+  ! Allocates the memory for storing normalization, weights and biases
+  call ANN_allocate(CS, num_layers, layer_sizes)
+  deallocate( layer_sizes )
+
+  ! Read normalization factors
+  if (field_exists(NNfile, 'input_means')) &
+    call MOM_read_data(NNfile, 'input_means', CS%input_means)
+  if (field_exists(NNfile, 'input_norms')) then
+    call MOM_read_data(NNfile, 'input_norms', CS%input_norms)
+    ! We calculate the reciprocal here to avoid repeated divisions later
+    CS%input_norms(:) = 1.  / CS%input_norms(:)
+  endif
+  if (field_exists(NNfile, 'output_means')) &
+    call MOM_read_data(NNfile, 'output_means', CS%output_means)
+  if (field_exists(NNfile, 'output_norms')) &
+    call MOM_read_data(NNfile, 'output_norms', CS%output_norms)
+
+  ! Allocate and read matrix A and bias b for each layer
+  do i = 1,CS%num_layers-1
+    CS%layers(i)%input_width = CS%layer_sizes(i)
+    CS%layers(i)%output_width = CS%layer_sizes(i+1)
+
+    ! Reading matrix A
+    write(layer_num_str, '(I0)') i-1
+    fieldname = trim('A') // trim(layer_num_str)
+    call MOM_read_data(NNfile, fieldname, CS%layers(i)%A, &
+                        (/1,1,1,1/),(/CS%layers(i)%output_width,CS%layers(i)%input_width,1,1/))
+
+    ! Reading bias b
+    fieldname = trim('b') // trim(layer_num_str)
+    call MOM_read_data(NNfile, fieldname, CS%layers(i)%b)
+  enddo
+
+  ! No activation function for the last layer
+  CS%layers(CS%num_layers-1)%activation = .False.
+
+  if (field_exists(NNfile, 'x_test') .and. field_exists(NNfile, 'y_test') ) &
+  call ANN_test(CS, NNfile)
+
+  call MOM_mesg('ANN: have been read from ' // trim(NNfile), 2)
+
+end subroutine ANN_init
+
+!> Allocate an ANN
+!!
+!! This creates the memory for storing weights and intermediate work arrays, but does not set
+!! the values of weights or biases (not even initializing with zeros).
+subroutine ANN_allocate(CS, num_layers, layer_sizes)
+  type(ANN_CS), intent(inout) :: CS !< ANN control structure
+  integer,      intent(in)    :: num_layers !< The number of layers, including the input and output layer
+  integer,      intent(in)    :: layer_sizes(num_layers) !< The number of neurons in each layer
+  ! Local variables
+  integer :: l ! Layer number
+
+  ! Assert that there is always an input and output layer
+  if (num_layers < 2) call MOM_error(FATAL, "The number of layers in an ANN must be >=2")
+
+  CS%num_layers = num_layers
+
+  ! Layers
+  allocate( CS%layer_sizes(CS%num_layers) )
+  CS%layer_sizes(:) = layer_sizes(:)
+
+  ! Input and output normalization values
+  allocate( CS%input_means(CS%layer_sizes(1)), source=0. ) ! Assume zero mean by default
+  allocate( CS%input_norms(CS%layer_sizes(1)), source=1. ) ! Assume unit variance by default
+  allocate( CS%output_means(CS%layer_sizes(CS%num_layers)), source=0. ) ! Assume zero mean by default
+  allocate( CS%output_norms(CS%layer_sizes(CS%num_layers)), source=1. ) ! Assume unit variance by default
+
+  ! Allocate the Linear transformations between layers
+  allocate(CS%layers(CS%num_layers-1))
+  CS%parameters = 2 * CS%layer_sizes(1) ! For input normalization
+
+  ! Allocate matrix A and bias b for each layer
+  do l = 1, CS%num_layers-1
+    CS%layers(l)%input_width = CS%layer_sizes(l)
+    CS%layers(l)%output_width = CS%layer_sizes(l+1)
+
+    allocate( CS%layers(l)%A(CS%layers(l)%output_width, CS%layers(l)%input_width) )
+    allocate( CS%layers(l)%b(CS%layers(l)%output_width) )
+
+    CS%parameters = CS%parameters &
+       + CS%layer_sizes(l) * CS%layer_sizes(l+1) & ! For weights
+       + CS%layer_sizes(l+1) ! For bias
+  enddo
+  CS%parameters = CS%parameters &
+     + 2 * CS%layer_sizes(CS%num_layers) ! For output normalization
+
+end subroutine ANN_allocate
+
+!> Test ANN by comparing the prediction with the test data.
+subroutine ANN_test(CS, NNfile)
+  type(ANN_CS), intent(inout) :: CS     !< ANN control structure.
+  character(*), intent(in)    :: NNfile !< The name of NetCDF file having neural network parameters
+  ! Local variables
+  real, dimension(:), allocatable :: x_test, y_test, y_pred ! [arbitrary]
+  real :: relative_error ! [arbitrary]
+  character(len=200) :: relative_error_str
+
+  ! Allocate data
+  allocate(x_test(CS%layer_sizes(1)))
+  allocate(y_test(CS%layer_sizes(CS%num_layers)))
+  allocate(y_pred(CS%layer_sizes(CS%num_layers)))
+
+  ! Read test vectors
+  call MOM_read_data(NNfile, 'x_test', x_test)
+  call MOM_read_data(NNfile, 'y_test', y_test)
+
+  ! Compute prediction
+  call ANN_apply_vector_oi(x_test, y_pred, CS)
+
+  relative_error = maxval(abs(y_pred(:) - y_test(:))) / maxval(abs(y_test(:)))
+
+  if (relative_error > 1e-5) then
+    write(relative_error_str, '(ES12.4)') relative_error
+    call MOM_error(FATAL, 'Relative error in ANN prediction is too large: ' // trim(relative_error_str))
+  endif
+
+  deallocate(x_test)
+  deallocate(y_test)
+  deallocate(y_pred)
+end subroutine ANN_test
+
+!> Deallocates memory of ANN
+subroutine ANN_end(CS)
+  type(ANN_CS), intent(inout) :: CS !< ANN control structure.
+  ! Local variables
+  integer :: i
+
+  deallocate(CS%layer_sizes)
+  deallocate(CS%input_means)
+  deallocate(CS%input_norms)
+  deallocate(CS%output_means)
+  deallocate(CS%output_norms)
+
+  do i = 1, CS%num_layers-1
+    deallocate(CS%layers(i)%A)
+    deallocate(CS%layers(i)%b)
+  enddo
+  deallocate(CS%layers)
+
+end subroutine ANN_end
+
+!> The default activation function
+pure elemental function activation_fn(x) result (y)
+  real, intent(in) :: x !< Scalar input value [nondim]
+  real             :: y !< Scalar output value [nondim]
+
+  y = max(x, 0.0) ! ReLU activation
+
+end function activation_fn
+
+!> Single application of ANN inference using vector input and output
+!!
+!! This implementation is the simplest using allocation and de-allocation
+!! of temporary arrays
+subroutine ANN_apply_vector_orig(x, y, CS)
+  type(ANN_CS), intent(in)    :: CS                               !< ANN instance
+  real,         intent(in)    :: x(CS%layer_sizes(1))             !< Inputs [arbitrary]
+  real,         intent(inout) :: y(CS%layer_sizes(CS%num_layers)) !< Outputs [arbitrary]
+  ! Local variables
+  real, allocatable :: x_1(:), x_2(:) ! intermediate states [nondim]
+  integer :: i, o ! Input, output indices
+
+  ! Normalize input
+  allocate(x_1(CS%layer_sizes(1)))
+  do i = 1,CS%layer_sizes(1)
+    x_1(i) = ( x(i) - CS%input_means(i) ) * CS%input_norms(i)
+  enddo
+
+  ! Apply Linear layers
+  do i = 1, CS%num_layers-1
+    allocate(x_2(CS%layer_sizes(i+1)))
+    call layer_apply_orig(x_1, x_2, CS%layers(i))
+    deallocate(x_1)
+    allocate(x_1(CS%layer_sizes(i+1)))
+    x_1(:) = x_2(:)
+    deallocate(x_2)
+  enddo
+
+  ! Un-normalize output
+  do o = 1, CS%layer_sizes(CS%num_layers)
+    y(o) = ( x_1(o) * CS%output_norms(o) ) + CS%output_means(o)
+  enddo
+
+  deallocate(x_1)
+
+  contains
+
+  !> Applies linear layer to input data x and stores the result in y with
+  !! y = A*x + b with optional application of the activation function so the
+  !! overall operations is ReLU(A*x + b)
+  subroutine layer_apply_orig(x, y, layer)
+    type(layer_type), intent(in)    :: layer                 !< Linear layer
+    real,             intent(in)    :: x(layer%input_width)  !< Input vector [nondim]
+    real,             intent(inout) :: y(layer%output_width) !< Output vector [nondim]
+    ! Local variables
+    integer :: i, o ! Input, output indices
+
+    ! Add bias
+    y(:) = layer%b(:)
+    ! Multiply by kernel
+    do i=1,layer%input_width
+      do o=1,layer%output_width
+        y(o) = y(o) + x(i) * layer%A(o, i)
+      enddo
+    enddo
+    ! Apply activation function
+    if (layer%activation) y(:) = activation_fn(y(:))
+
+  end subroutine layer_apply_orig
+end subroutine ANN_apply_vector_orig
+
+!> Single application of ANN inference using vector input and output
+!!
+!! This implementation avoids repeated reallocation of work arrays and uses the
+!! output index for the fastest (inner-most) loop in the layer matrix multiply.
+subroutine ANN_apply_vector_oi(x, y, CS)
+  type(ANN_CS), intent(in)    :: CS                               !< ANN instance
+  real,         intent(in)    :: x(CS%layer_sizes(1))             !< Inputs [arbitrary]
+  real,         intent(inout) :: y(CS%layer_sizes(CS%num_layers)) !< Outputs [arbitrary]
+  ! Local variables
+  real, allocatable :: x_1(:), x_2(:) ! intermediate states [nondim]
+  integer :: i, o ! Input, output indices
+
+  allocate( x_1( maxval( CS%layer_sizes(:) ) ) )
+  allocate( x_2( maxval( CS%layer_sizes(:) ) ) )
+
+  ! Normalize input
+  do i = 1,CS%layer_sizes(1)
+    x_1(i) = ( x(i) - CS%input_means(i) ) * CS%input_norms(i)
+  enddo
+
+  ! Apply Linear layers
+  do i = 1, CS%num_layers-2, 2
+    call layer_apply_oi(x_1, x_2, CS%layers(i))
+    call layer_apply_oi(x_2, x_1, CS%layers(i+1))
+  enddo
+  if (mod(CS%num_layers,2)==0) then
+    call layer_apply_oi(x_1, x_2, CS%layers(CS%num_layers-1))
+    ! Un-normalize output
+    do o = 1, CS%layer_sizes(CS%num_layers)
+      y(o) = x_2(o) * CS%output_norms(o) + CS%output_means(o)
+    enddo
+  else
+    ! Un-normalize output
+    do o = 1, CS%layer_sizes(CS%num_layers)
+      y(o) = x_1(o) * CS%output_norms(o) + CS%output_means(o)
+    enddo
+  endif
+
+  deallocate(x_1, x_2)
+
+  contains
+
+  !> Applies linear layer to input data x and stores the result in y with
+  !! y = A*x + b with optional application of the activation function so the
+  !! overall operations is ReLU(A*x + b)
+  subroutine layer_apply_oi(x, y, layer)
+    type(layer_type), intent(in)    :: layer                 !< Linear layer
+    real,             intent(in)    :: x(layer%input_width)  !< Input vector [nondim]
+    real,             intent(inout) :: y(layer%output_width) !< Output vector [nondim]
+    ! Local variables
+    integer :: i, o ! Input, output indices
+
+    ! Add bias
+    y(:) = layer%b(:)
+    ! Multiply by kernel
+    do i=1,layer%input_width
+      do o=1,layer%output_width
+        y(o) = y(o) + x(i) * layer%A(o, i)
+      enddo
+    enddo
+    ! Apply activation function
+    if (layer%activation) y(:) = activation_fn(y(:))
+
+  end subroutine layer_apply_oi
+end subroutine ANN_apply_vector_oi
+
+!> Single application of ANN inference using array input and output
+!! with (space,feature) indexing
+!!
+!! This implementation uses the space index for the fastest (inner-most) loop
+!! in the layer matrix multiply, with the input index as the next fastest loop,
+!! and uses the weights matrix A(output,index). It also applies the activation
+!! function within the outer loop of the matrix multiply.
+subroutine ANN_apply_array_sio(nij, x, y, CS)
+  type(ANN_CS), intent(in)    :: CS !< ANN control structure
+  integer,      intent(in)    :: nij !< Size of spatial dimension
+  real,         intent(in)    :: x(nij, CS%layer_sizes(1)) !< input [arbitrary]
+  real,         intent(inout) :: y(nij, CS%layer_sizes(CS%num_layers)) !< output [arbitrary]
+  ! Local variables
+  real, allocatable :: x_1(:,:), x_2(:,:) ! intermediate states [nondim]
+  integer :: l, i, o ! Layer, input, output index
+
+  allocate( x_1( nij, maxval( CS%layer_sizes(:) ) ) )
+  allocate( x_2( nij, maxval( CS%layer_sizes(:) ) ) )
+
+  ! Normalize input
+  do i = 1, CS%layer_sizes(1)
+    x_1(:,i) = ( x(:,i) - CS%input_means(i) ) * CS%input_norms(i)
+  enddo
+
+  ! Apply Linear layers
+  do l = 1, CS%num_layers-2, 2
+    call layer_apply_sio(nij, x_1, x_2, CS%layers(l))
+    call layer_apply_sio(nij, x_2, x_1, CS%layers(l+1))
+  enddo
+  if (mod(CS%num_layers,2)==0) then
+    call layer_apply_sio(nij, x_1, x_2, CS%layers(CS%num_layers-1))
+    ! Un-normalize output
+    do o = 1, CS%layer_sizes(CS%num_layers)
+      y(:,o) = x_2(:,o) * CS%output_norms(o) + CS%output_means(o)
+    enddo
+  else
+    ! Un-normalize output
+    do o = 1, CS%layer_sizes(CS%num_layers)
+      y(:,o) = x_1(:,o) * CS%output_norms(o) + CS%output_means(o)
+    enddo
+  endif
+
+  deallocate(x_1, x_2)
+
+  contains
+
+  !> Applies linear layer to input data x and stores the result in y with
+  !! y = A*x + b with optional application of the activation function so the
+  !! overall operations is ReLU(A*x + b)
+  subroutine layer_apply_sio(nij, x, y, layer)
+    type(layer_type), intent(in)    :: layer !< Linear layer
+    integer,          intent(in)    :: nij   !< Size of spatial dimension
+    real,             intent(in)    :: x(nij, layer%input_width) !< Input vector [nondim]
+    real,             intent(inout) :: y(nij, layer%output_width) !< Output vector [nondim]
+    ! Local variables
+    integer :: i, o ! Input, output indices
+
+    do o = 1, layer%output_width
+      ! Add bias
+      y(:,o) = layer%b(o)
+      ! Multiply by kernel
+      do i = 1, layer%input_width
+        y(:,o) = y(:,o) + x(:,i) * layer%A(o, i)
+      enddo
+      ! Apply activation function
+      if (layer%activation) y(:,o) = activation_fn(y(:,o))
+    enddo
+
+  end subroutine layer_apply_sio
+end subroutine ANN_apply_array_sio
+
+!> Sets weights and bias for a single layer
+subroutine set_layer(ANN, layer, weights, biases, activation)
+  type(ANN_CS), intent(inout) :: ANN !< ANN control structure
+  integer,      intent(in)    :: layer !< The number of the layer being adjusted
+  real,         intent(in)    :: weights(:,:) !< The weights to assign
+  real,         intent(in)    :: biases(:) !< The biases to assign
+  logical,      intent(in)    :: activation !< Turn on the activation function
+
+  if ( layer >= ANN%num_layers ) &
+      call MOM_error(FATAL, "MOM_ANN, set_layer: layer is out of range")
+  if ( layer < 1 ) &
+      call MOM_error(FATAL, "MOM_ANN, set_layer: layer should be >= 1")
+
+  if ( size(biases) /= size(ANN%layers(layer)%b) ) &
+      call MOM_error(FATAL, "MOM_ANN, set_layer: mismatch in size of biases")
+  ANN%layers(layer)%b(:) = biases(:)
+
+  if ( size(weights,1) /= size(ANN%layers(layer)%A,1) ) &
+      call MOM_error(FATAL, "MOM_ANN, set_layer: mismatch in size of weights (first dim)")
+  if ( size(weights,2) /= size(ANN%layers(layer)%A,2) ) &
+      call MOM_error(FATAL, "MOM_ANN, set_layer: mismatch in size of weights (second dim)")
+  ANN%layers(layer)%A(:,:) = weights(:,:)
+
+  ANN%layers(layer)%activation = activation
+end subroutine set_layer
+
+!> Sets input normalization
+subroutine set_input_normalization(ANN, means, norms)
+  type(ANN_CS),   intent(inout) :: ANN !< ANN control structure
+  real, optional, intent(in)    :: means(:) !< The mean of each input
+  real, optional, intent(in)    :: norms(:) !< The standard deviation of each input
+
+  if (present(means)) then
+    if ( size(means) /= size(ANN%input_means) ) &
+        call MOM_error(FATAL, "MOM_ANN, set_input_normalization: mismatch in size of means")
+    ANN%input_means(:) = means(:)
+  endif
+
+  if (present(norms)) then
+    if ( size(norms) /= size(ANN%input_norms) ) &
+        call MOM_error(FATAL, "MOM_ANN, set_input_normalization: mismatch in size of norms")
+    ANN%input_norms(:) = norms(:)
+  endif
+
+end subroutine set_input_normalization
+
+!> Sets output normalization
+subroutine set_output_normalization(ANN, means, norms)
+  type(ANN_CS),   intent(inout) :: ANN !< ANN control structure
+  real, optional, intent(in)    :: means(:) !< The mean of each output
+  real, optional, intent(in)    :: norms(:) !< The standard deviation of each output
+
+  if (present(means)) then
+    if ( size(means) /= size(ANN%output_means) ) &
+        call MOM_error(FATAL, "MOM_ANN, set_output_normalization: mismatch in size of means")
+    ANN%output_means(:) = means(:)
+  endif
+
+  if (present(norms)) then
+    if ( size(norms) /= size(ANN%output_norms) ) &
+        call MOM_error(FATAL, "MOM_ANN, set_output_normalization: mismatch in size of norms")
+    ANN%output_norms(:) = norms(:)
+  endif
+
+end subroutine set_output_normalization
+
+!> Create a random ANN
+subroutine ANN_random(ANN, nlayers, widths)
+  type(ANN_CS), intent(inout) :: ANN !< ANN control structure
+  integer,      intent(in)    :: nlayers !< Number of layers
+  integer,      intent(in)    :: widths(nlayers) !< Width of each layer
+  ! Local variables
+  integer :: l
+
+  call ANN_allocate(ANN, nlayers, widths)
+
+  do l = 1, nlayers-1
+    call randomize_layer(ANN, nlayers, l, widths)
+  enddo
+
+end subroutine ANN_random
+
+!> Fill a layer with random numbers
+subroutine randomize_layer(ANN, nlayers, layer, widths)
+  type(ANN_CS), intent(inout) :: ANN !< ANN control structure
+  integer,      intent(in)    :: nlayers !< Number of layers
+  integer,      intent(in)    :: layer !< Layer number to randomize
+  integer,      intent(in)    :: widths(nlayers) !< Width of each layer
+  ! Local variables
+  real :: weights(widths(layer+1),widths(layer)) ! Weights
+  real :: biases(widths(layer+1)) ! Biases
+
+  call random_number(weights)
+  weights(:,:) = 2. * weights(:,:) - 1.
+
+  call random_number(biases)
+  biases(:) = 2. * biases(:) - 1.
+
+  call set_layer(ANN, layer, weights, biases, layer<nlayers-1)
+
+end subroutine randomize_layer
+
+!> Runs unit tests on ANN functions.
+!!
+!! Should only be called from a single/root thread.
+!! Returns True if a test fails, otherwise False.
+logical function ANN_unit_tests(verbose)
+  logical, intent(in) :: verbose !< If true, write results to stdout
+  ! Local variables
+  type(ANN_CS) :: ANN ! An ANN
+  type(testing) :: test ! Manage tests
+  real, allocatable :: x(:), y(:), y_good(:), x2(:,:), y2(:,:) ! Inputs, outputs [arbitrary]
+  integer, parameter :: max_rand_nlay = 10 ! Deepest random ANN to generate
+  integer :: widths(max_rand_nlay) ! Number of layers for random ANN
+  integer :: nlay ! Number of layers for random ANN
+  integer :: i, iter ! Loop counters
+  logical :: rand_res ! Status of random tests
+
+  ANN_unit_tests = .false. ! Start by assuming all is well
+  call test%set(verbose=verbose) ! Pass verbose mode to test
+
+  ! Identity ANN for one input
+  allocate( y(1) )
+  call ANN_allocate(ANN, 2, [1,1])
+  call set_layer(ANN, 1, reshape([1.],[1,1]), [0.], .false.)
+  call ANN_apply([1.], y, ANN)
+  call test%real_scalar(y(1), 1., 'Scalar identity')
+  deallocate( y )
+  call ANN_end(ANN)
+
+  ! Summation ANN
+  allocate( y(1) )
+  call ANN_allocate(ANN, 2, [4,1])
+  call set_layer(ANN, 1, reshape([1.,1.,1.,1.], [1,4]), [0.], .false.)
+  call ANN_apply([-1.,0.,1.,2.], y, ANN)
+  call test%real_scalar(y(1), 2., 'Summation')
+  deallocate( y )
+  call ANN_end(ANN)
+
+  ! Identity ANN for vector input/output
+  call ANN_allocate(ANN, 2, [3,3])
+  allocate( y(3) )
+  call set_layer(ANN, 1, reshape([1.,0.,0., &
+                                  0.,1.,0., &
+                                  0.,0.,1.], [3,3]), [0.,0.,0.], .false.)
+  call ANN_apply([-1.,0.,1.], y, ANN)
+  call test%real_arr(3, y, [-1.,0.,1.], 'Vector identity')
+  deallocate( y )
+  call ANN_end(ANN)
+
+  ! Rectifying ANN for vector input/output
+  allocate( y(3) )
+  call ANN_allocate(ANN, 2, [3,3])
+  call set_layer(ANN, 1, reshape([1.,0.,0., &
+                                  0.,1.,0., &
+                                  0.,0.,1.], [3,3]), [0.,0.,0.], .true.)
+  call ANN_apply([-1.,0.,1.], y, ANN)
+  call test%real_arr(3, y, [0.,0.,1.], 'Rectifier')
+  deallocate( y )
+  call ANN_end(ANN)
+
+  ! The next 3 tests re-use the same network with 4 inputs, a 4-wide hidden layer, and one output
+  allocate( y(1) )
+  call ANN_allocate(ANN, 3, [4,4,1])
+
+  ! 1 hidden layer: rectifier followed by summation
+  ! Inputs: [-1,0,1,2]
+  ! Rectified: [0,0,1,2]
+  ! Sum: 3
+  ! Outputs: 3
+  call set_layer(ANN, 1, reshape([1.,0.,0.,0., &
+                                  0.,1.,0.,0., &
+                                  0.,0.,1.,0., &
+                                  0.,0.,0.,1.], [4,4]), [0.,0.,0.,0.], .true.)
+  call set_layer(ANN, 2, reshape([1.,1.,1.,1.], [1,4]), [0.], .false.)
+  call ANN_apply_vector_orig([-1.,0.,1.,2.], y, ANN)
+  call test%real_scalar(y(1), 3., 'Rectifier+summation')
+
+  ! as above but with biases
+  ! Inputs: [-2,-1,0,1]
+  ! After bias: [-1,0,1,2] with b=1
+  ! Rectified: [0,0,1,2]
+  ! Sum: 3
+  ! After bias: 6 with b=3
+  ! Outputs: 6
+  call set_layer(ANN, 1, reshape([1.,0.,0.,0., &
+                                  0.,1.,0.,0., &
+                                  0.,0.,1.,0., &
+                                  0.,0.,0.,1.], [4,4]), [1.,1.,1.,1.], .true.)
+  call set_layer(ANN, 2, reshape([1.,1.,1.,1.], [1,4]), [3.], .false.)
+  call ANN_apply_vector_orig([-2.,-1.,0.,1.], y, ANN)
+  call test%real_scalar(y(1), 6., 'Rectifier+summation+bias')
+
+  ! as above but with normalization of inputs and outputs
+  ! Inputs: [0,2,4,6]
+  ! Normalized inputs: [-2,-1,0,1] (using mean=-4, norm=2)
+  ! Normalized outputs: 6
+  ! De-normalized output: 2 (using mean=-10, norm=2)
+  call set_input_normalization(ANN, means=[4.,4.,4.,4.], norms=[0.5,0.5,0.5,0.5])
+  call set_output_normalization(ANN, norms=[2.], means=[-10.])
+  call ANN_apply_vector_orig([0.,2.,4.,6.], y, ANN)
+  call test%real_scalar(y(1), 2., 'Rectifier+summation+bias+norms')
+
+  deallocate( y )
+  call ANN_end(ANN)
+
+  ! as above with a 1x1 4th identity layer (to check loop combinations)
+  allocate( y(1) )
+  call ANN_allocate(ANN, 4, [4,4,1,1])
+  call set_layer(ANN, 1, reshape([1.,0.,0.,0., &
+                                  0.,1.,0.,0., &
+                                  0.,0.,1.,0., &
+                                  0.,0.,0.,1.], [4,4]), [1.,1.,1.,1.], .true.)
+  call set_layer(ANN, 2, reshape([1.,1.,1.,1.], [1,4]), [3.], .false.)
+  call set_layer(ANN, 3, reshape([1.],[1,1]), [0.], .false.)
+  call set_input_normalization(ANN, means=[4.,4.,4.,4.], norms=[0.5,0.5,0.5,0.5])
+  call set_output_normalization(ANN, norms=[2.], means=[-10.])
+  call ANN_apply_vector_orig([0.,2.,4.,6.], y, ANN)
+  call test%real_scalar(y(1), 2., 'Rectifier+summation+bias+norms 4-layer')
+
+  ! as above with v2 of ANN_apply
+  call ANN_apply_vector_oi([0.,2.,4.,6.], y, ANN)
+  call test%real_scalar(y(1), 2., 'Rectifier+summation+bias+norms 4-layer v2')
+  deallocate( y )
+
+  allocate( y2(1,2) )
+  ! as above with v5 of ANN_apply applied to 2d inputs, x(space,feature)
+  call ANN_apply_array_sio(2, reshape([0.,1.,2.,3.,4.,5.,6.,7.],[2,4]), y2, ANN)
+  call test%real_arr(2, y2, [2.,5.], 'Rectifier+summation+bias+norms 4-layer array v2')
+  deallocate( y2 )
+
+  call ANN_end(ANN)
+
+  ! The following block checks that for random ANN (weights and layers widths)
+  ! each of the various implementations of inference give identical results.
+  ! This helped catch loop and allocation errors.
+  rand_res = .false.
+  do iter = 1, 1000
+    allocate( y(max_rand_nlay+1) )
+    call random_number(y) ! Vector of random numbers 0..1
+    nlay = 2 + floor( y(max_rand_nlay+1) * ( max_rand_nlay - 1 ) ) ! 2 < nlay < max_rand_nlay
+    widths(:) = 1 + floor( y(1:nlay) * 8 ) ! 1 < layer width < 8
+    deallocate( y )
+    call ANN_random(ANN, nlay, widths)
+    allocate( x(widths(1)), y(widths(nlay)), y_good(widths(nlay)) )
+    call ANN_apply_vector_orig(x, y_good, ANN)
+    call ANN_apply_vector_oi(x, y, ANN)
+    rand_res = rand_res .or. maxval( abs( y(:) - y_good(:) ) ) > 0. ! Check results from v2 = v1
+    allocate( x2(20,widths(1)), y2(20,widths(nlay)) ) ! 2D input, output
+    do i = 1, 20
+      x2(i,:) = x(:)
+    enddo
+    call ANN_apply_array_sio(20, x2, y2, ANN)
+    rand_res = rand_res .or. maxval( abs( maxval(y2(:,:),1) - y_good(:) ) ) > 0. ! Check results from array v2 = v1
+    rand_res = rand_res .or. maxval( abs( minval(y2(:,:),1) - y_good(:) ) ) > 0. ! Check results from array v2 = v1
+    deallocate( x, y, y_good, x2, y2 )
+    call ANN_end(ANN)
+  enddo
+  call test%test(rand_res, 'Equivalence between inference variants with random results')
+
+  ANN_unit_tests = test%summarize('ANN_unit_tests')
+
+end function ANN_unit_tests
+
+!> \namespace mom_ann
+!!
+!! The mom_ann module is a pure fortran implementation of fully-connected feed-forward
+!! networks to facilitate easy evaluation of data-driven functions in MOM6. For performant
+!! implementations or for novel architectires, using machine-learning libraries (e.g. via
+!! mom_database_comms) are necessary, or at least likely to be more efficient.
+!!
+!! The artificial neural network (ANN) understood by this MOM6 module has \f$ N \f$ layers,
+!! including the input-layer and output-layer, thus requireing \f$ N \geq 2\f$.
+!!
+!! The output values (neurons or nodes) of any layer other than the input layer (i.e. \f$ l>1 \f$) are
+!! \f[
+!! y_{l,j} = f_l( b_{l,j} + A_{l,j,i} x_{l-1,i} )
+!! \f]
+!! where \f$ f(x) = max(0, x) \f$ is the ReLU activation function, \f$b_{l,j}\f$ is a bias for each neuron,
+!! $\f$A_{l,j,i}\f$ are a rectangular matrix of weights for each layer, and \f$x_{l-1,i}\f$ are the outputs
+!! of the previous layer, \f$l-1\f$. The subscript on \f$ f_l() \f$ indicates the activation function is
+!! optional for each layer.
+!!
+!! Currently, the performance of various implementations is dependent on the shape/size of the network and
+!! the size of input data. For this reason we provide several versions that all yield the same result but
+!! for differently shaped inputs.
+!!
+!! \image html https://upload.wikimedia.org/wikipedia/commons/4/46/Colored_neural_network.svg
+!! Fig: A three layer network with 3 inputs, 2 outputs, and 1 hidden layer. There are two rectanglar
+!! matrices of weights (black arrows). The bias for each neuron is implied."
+
+end module MOM_ANN
diff --git a/src/parameterizations/lateral/MOM_ANN.F90 b/src/parameterizations/lateral/MOM_ANN.F90
deleted file mode 100644
index 89963fd218..0000000000
--- a/src/parameterizations/lateral/MOM_ANN.F90
+++ /dev/null
@@ -1,238 +0,0 @@
-!> Implements the general purpose ANN.
-module MOM_ANN
-
-! This file is part of MOM6. See LICENSE.md for the license
-
-use MOM_diag_mediator, only : diag_ctrl, time_type
-use MOM_io, only : MOM_read_data
-use MOM_error_handler, only : MOM_error, FATAL, MOM_mesg
-!
-implicit none ; private
-
-#include <MOM_memory.h>
-
-public ANN_init, ANN_apply, ANN_end
-
-!> Type for a single Linear layer of ANN,
-!! i.e. stores the matrix A and bias b
-!! for matrix-vector multiplication
-!! y = A*x + b.
-type, private :: layer_type; private
-  integer :: output_width        !< Number of rows in matrix A
-  integer :: input_width         !< Number of columns in matrix A
-  logical :: activation = .True. !< If true, apply the default activation function
-
-  real, allocatable :: A(:,:) !< Matrix in column-major order
-                              !! of size A(output_width, input_width) [nondim]
-  real, allocatable :: b(:)   !< bias vector of size output_width [nondim]
-end type layer_type
-
-!> Control structure/type for ANN
-type, public :: ANN_CS ; private
-  ! Parameters
-  integer :: num_layers          !< Number of layers in the ANN, including the input and output.
-                                 !! For example, for ANN with one hidden layer, num_layers = 3.
-  integer, allocatable &
-          :: layer_sizes(:)      !< Array of length num_layers, storing the number of neurons in
-                                 !! each layer.
-
-  type(layer_type), allocatable &
-          :: layers(:)           !< Array of length num_layers-1, where each element is the Linear
-                                 !! transformation between layers defined by Matrix A and vias b.
-
-  real, allocatable :: &
-    input_norms(:), & !< Array of length layer_sizes(1). By these values
-                      !! each input feature will be divided before feeding into the ANN [arbitrary]
-    output_norms(:)   !< Array of length layer_sizes(num_layers). By these values
-                      !! each output of the ANN will be multiplied [arbitrary]
-end type ANN_CS
-
-contains
-
-!> Initialization of ANN. Allocates memory and reads ANN parameters from NetCDF file.
-!! The NetCDF file must contain:
-!! Integer num_layers.
-!! Integer arrays: layer_sizes, input_norms, output_norms
-!! Matrices and biases for Linear layers can be Real(4) or Real(8) and
-!! are named as: A0, b0 for the first layer; A1, b1 for the second layer and so on.
-subroutine ANN_init(CS, NNfile)
-  type(ANN_CS), intent(inout)  :: CS     !< ANN control structure.
-  character(*), intent(in)     :: NNfile !< The name of NetCDF file having neural network parameters
-
-  integer :: i
-  character(len=1) :: layer_num_str
-  character(len=3) :: fieldname
-
-  call MOM_mesg('ANN: init from ' // trim(NNfile), 2)
-
-  ! Read the number of layers
-  call MOM_read_data(NNfile, "num_layers", CS%num_layers)
-
-  ! Read size of layers
-  allocate(CS%layer_sizes(CS%num_layers))
-  call MOM_read_data(NNfile, "layer_sizes", CS%layer_sizes)
-
-  ! Read normalization factors
-  allocate(CS%input_norms(CS%layer_sizes(1)))
-  allocate(CS%output_norms(CS%layer_sizes(CS%num_layers)))
-
-  call MOM_read_data(NNfile, 'input_norms', CS%input_norms)
-  call MOM_read_data(NNfile, 'output_norms', CS%output_norms)
-
-  ! Allocate the Linear transformations between layers.
-  allocate(CS%layers(CS%num_layers-1))
-
-  ! Allocate and read matrix A and bias b for each layer
-  do i = 1,CS%num_layers-1
-    CS%layers(i)%input_width = CS%layer_sizes(i)
-    CS%layers(i)%output_width = CS%layer_sizes(i+1)
-
-    allocate(CS%layers(i)%A(CS%layers(i)%output_width, CS%layers(i)%input_width), source=0.)
-    ! Reading matrix A
-    write(layer_num_str, '(I0)') i-1
-    fieldname = trim('A') // trim(layer_num_str)
-    call MOM_read_data(NNfile, fieldname, CS%layers(i)%A, &
-                        (/1,1,1,1/),(/CS%layers(i)%output_width,CS%layers(i)%input_width,1,1/))
-
-    allocate(CS%layers(i)%b(CS%layers(i)%output_width), source=0.)
-    ! Reading bias b
-    fieldname = trim('b') // trim(layer_num_str)
-    call MOM_read_data(NNfile, fieldname, CS%layers(i)%b)
-  enddo
-
-  ! No activation function for the last layer
-  CS%layers(CS%num_layers-1)%activation = .False.
-
-  call ANN_test(CS, NNfile)
-
-  call MOM_mesg('ANN: have been read from ' // trim(NNfile), 2)
-
-end subroutine ANN_init
-
-!> Test ANN by comparing the prediction with the test data.
-subroutine ANN_test(CS, NNfile)
-  type(ANN_CS), intent(inout)  :: CS     !< ANN control structure.
-  character(*), intent(in)     :: NNfile !< The name of NetCDF file having neural network parameters
-
-  real, dimension(:), allocatable :: x_test, y_test, y_pred ! [arbitrary]
-  real :: relative_error ! [arbitrary]
-  character(len=200) :: relative_error_str
-
-  ! Allocate data
-  allocate(x_test(CS%layer_sizes(1)))
-  allocate(y_test(CS%layer_sizes(CS%num_layers)))
-  allocate(y_pred(CS%layer_sizes(CS%num_layers)))
-
-  ! Read test vectors
-  call MOM_read_data(NNfile, 'x_test', x_test)
-  call MOM_read_data(NNfile, 'y_test', y_test)
-
-  ! Compute prediction
-  call ANN_apply(x_test, y_pred, CS)
-
-  relative_error = maxval(abs(y_pred - y_test)) / maxval(abs(y_test))
-
-  if (relative_error > 1e-5) then
-    write(relative_error_str, '(ES12.4)') relative_error
-    call MOM_error(FATAL, 'Relative error in ANN prediction is too large: ' // trim(relative_error_str))
-  endif
-
-  deallocate(x_test)
-  deallocate(y_test)
-  deallocate(y_pred)
-end subroutine ANN_test
-
-!> Deallocates memory of ANN
-subroutine ANN_end(CS)
-  type(ANN_CS), intent(inout) :: CS !< ANN control structure.
-
-  integer :: i
-
-  deallocate(CS%layer_sizes)
-  deallocate(CS%input_norms)
-  deallocate(CS%output_norms)
-
-  do i = 1, CS%num_layers-1
-    deallocate(CS%layers(i)%A)
-    deallocate(CS%layers(i)%b)
-  enddo
-  deallocate(CS%layers)
-
-end subroutine ANN_end
-
-!> Main ANN function: normalizes input vector x, applies Linear layers, and
-!! un-normalizes the output.
-subroutine ANN_apply(x, y, CS)
-  type(ANN_CS), intent(in) :: CS !< ANN control structure
-
-  real, dimension(CS%layer_sizes(1)), &
-                  intent(in)  :: x !< input [arbitrary]
-  real, dimension(CS%layer_sizes(CS%num_layers)), &
-                  intent(out) :: y !< output [arbitrary]
-
-  real, allocatable :: x_1(:), x_2(:) ! intermediate states [nondim]
-  integer :: i
-
-  ! Normalize input
-  allocate(x_1(CS%layer_sizes(1)))
-  do i = 1,CS%layer_sizes(1)
-      x_1(i) = x(i) / CS%input_norms(i)
-  enddo
-
-  ! Apply Linear layers
-  do i = 1, CS%num_layers-1
-    allocate(x_2(CS%layer_sizes(i+1)))
-    call Layer_apply(x_1, x_2, CS%layers(i))
-    deallocate(x_1)
-    allocate(x_1(CS%layer_sizes(i+1)))
-    x_1 = x_2
-    deallocate(x_2)
-  enddo
-
-  ! Un-normalize output
-  do i = 1, CS%layer_sizes(CS%num_layers)
-    y(i) = x_1(i) * CS%output_norms(i)
-  enddo
-
-  deallocate(x_1)
-end subroutine ANN_apply
-
-!> The default activation function
-pure function activation_fn(x) result (y)
-  real, intent(in)  :: x !< Scalar input value [nondim]
-  real :: y !< Scalar output value [nondim]
-
-  y = max(x, 0.0) ! ReLU activation
-
-end function activation_fn
-
-!> Applies linear layer to input data x and stores the result in y with
-!! y = A*x + b with optional application of the activation function.
-subroutine Layer_apply(x, y, layer)
-  type(layer_type), intent(in)  :: layer !< Linear layer
-  real, dimension(layer%input_width), &
-                    intent(in)  :: x     !< Input vector [nondim]
-  real, dimension(layer%output_width), &
-                    intent(out) :: y     !< Output vector [nondim]
-
-  integer :: i, j
-
-  y(:) = 0.
-  do i=1,layer%input_width
-    do j=1,layer%output_width
-      ! Multiply by kernel
-      y(j) = y(j) + ( x(i) * layer%A(j, i) )
-    enddo
-  enddo
-
-  do j=1,layer%output_width
-    ! Add bias
-    y(j) = y(j) + layer%b(j)
-    ! Apply activation function
-    if (layer%activation) then
-      y(j) = activation_fn(y(j))
-    endif
-  enddo
-end subroutine Layer_apply
-
-end module MOM_ANN
\ No newline at end of file