
Commit 110cda9

multihead_attention: use pure and elemental where necessary
1 parent 2731d63 commit 110cda9

5 files changed: +48 -48 lines changed

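For reference, a minimal standalone sketch (not part of this commit) of what these attributes ask for in Fortran: a `pure` procedure must declare an explicit `intent` for every dummy argument and may not have side effects beyond its `intent(out)`/`intent(in out)` arguments, while an `elemental` procedure is additionally limited to scalar dummies and can then be applied elementwise to arrays. The module and names below (`purity_demo`, `scale_in_place`, `square`) are illustrative only.

module purity_demo
  implicit none
contains

  ! A pure subroutine: every dummy carries an explicit intent, and only
  ! the intent(in out) argument is modified.
  pure subroutine scale_in_place(x, factor)
    real, intent(in out) :: x(:)
    real, intent(in) :: factor
    x = x * factor
  end subroutine scale_in_place

  ! An elemental function: implicitly pure, scalar dummies, callable with
  ! scalar or array actual arguments.
  elemental function square(x) result(y)
    real, intent(in) :: x
    real :: y
    y = x * x
  end function square

end module purity_demo

program demo
  use purity_demo
  implicit none
  real :: v(3) = [1., 2., 3.]
  call scale_in_place(v, 2.)   ! v is now [2., 4., 6.]
  print *, square(v)           ! elemental reference on an array: 4. 16. 36.
end program demo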

src/nf/nf_cross_attention_layer.f90

Lines changed: 2 additions & 2 deletions
@@ -35,7 +35,7 @@ module function cross_attention_layer_cons(n_heads) result(res)
     res % n_heads = n_heads
   end function cross_attention_layer_cons
 
-  module subroutine backward(self, input, gradient)
+  pure module subroutine backward(self, input, gradient)
     !! Cross Attention Back propagation
     class(cross_attention_layer), intent(in out) :: self
     real, intent(in) :: input(:, :, :)
@@ -46,7 +46,7 @@ module subroutine backward(self, input, gradient)
     self % gradient(2, :, :) = self % key_layer % gradient + self % value_layer % gradient
   end subroutine backward
 
-  module subroutine forward(self, input)
+  pure module subroutine forward(self, input)
     !! Cross Attention Forward propagation
     !! Input Shape (kind, sequence_length, model_dimension)
     !! where kind is 1 for Query and 2 for Key-Value
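One consequence worth noting: a pure procedure may only reference other pure (or elemental, or intrinsic) procedures, so making `common_forward`/`common_backward` pure is what allows thin wrappers such as this layer's `forward`/`backward` to carry the `pure` prefix as well. A minimal sketch of that rule, with hypothetical names (`inner`, `outer`):

module purity_chain_demo
  implicit none
contains

  pure subroutine inner(x)
    real, intent(in out) :: x
    x = 2. * x
  end subroutine inner

  ! Legal only because inner is itself pure: a pure procedure may not
  ! reference an impure procedure.
  pure subroutine outer(x)
    real, intent(in out) :: x
    call inner(x)
  end subroutine outer

end module purity_chain_demo

program demo
  use purity_chain_demo
  implicit none
  real :: x = 1.5
  call outer(x)
  print *, x   ! 3.0
end program demo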

src/nf/nf_multihead_attention.f90

Lines changed: 21 additions & 21 deletions
@@ -60,7 +60,7 @@ end function multihead_attention_layer_cons
 
   interface
 
-    module subroutine common_backward(self, input, gradient)
+    pure module subroutine common_backward(self, input, gradient)
       !! General backprop for MultiHead Attention mechanism
       !! Might be used for both Self and Cross Attention
       !! Self Attention: sum output gradients
@@ -70,7 +70,7 @@ module subroutine common_backward(self, input, gradient)
       real, intent(in) :: gradient(:, :)
     end subroutine common_backward
 
-    module subroutine common_forward(self, query, key, value)
+    pure module subroutine common_forward(self, query, key, value)
       !! General forward propagation for MultiHead Attention Mechanism
       !! Might be used for both Self and Cross Attention
       !! Self Attention: pass the same value thrice
@@ -79,63 +79,63 @@ module subroutine common_forward(self, query, key, value)
       real, intent(in) :: query(:, :), key(:, :), value(:, :)
     end subroutine common_forward
 
-    module subroutine init(self, input_shape)
+    pure module subroutine init(self, input_shape)
       !! Initialize the layer data structures.
       !!
       !! This is a deferred procedure from the `base_layer` abstract type.
       class(multihead_attention_layer), intent(in out) :: self
       integer, intent(in) :: input_shape(:)
     end subroutine init
 
-    module function split_heads(self, input) result(output)
+    pure module function split_heads(self, input) result(output)
       !! Split inputs into heads
       !!
       !! Example with two heads:
       !! input (3, 4)
       !! output (3, 2, 2)
-      class(multihead_attention_layer) :: self
-      real :: input(:, :)
+      class(multihead_attention_layer), intent(in) :: self
+      real, intent(in) :: input(:, :)
       real :: output(self % sequence_length, self % head_size, self % n_heads)
     end function split_heads
 
-    module subroutine create_attention_matrix(self, query, key)
+    pure module subroutine create_attention_matrix(self, query, key)
       !! Create attention matrix for query and key
       !! Output dimensions: sequence_length, sequence_length, n_heads
-      class(multihead_attention_layer) :: self
-      real :: query(:, :, :)
-      real :: key(:, :, :)
+      class(multihead_attention_layer), intent(in out) :: self
+      real, intent(in) :: query(:, :, :)
+      real, intent(in) :: key(:, :, :)
      integer :: head
     end subroutine create_attention_matrix
 
-    module subroutine normalize_attention_matrix(self, attention_mask)
+    pure module subroutine normalize_attention_matrix(self, attention_mask)
       !! Create attention matrix for query and key
       !! Output dims: sequence_length, sequence_length, n_heads
-      class(multihead_attention_layer) :: self
+      class(multihead_attention_layer), intent(in out) :: self
       !! (sequence_length, sequence_length, n_heads)
-      real, optional :: attention_mask(:, :, :)
+      real, optional, intent(in) :: attention_mask(:, :, :)
       !! (sequence_length, sequence_length, n_heads)
       real, allocatable :: output(:, :, :)
       integer :: head, seq
     end subroutine normalize_attention_matrix
 
-    module subroutine scaled_dot_product_attention(self, value)
+    pure module subroutine scaled_dot_product_attention(self, value)
       !! Create scaled dot product attention
       !! Output dims: sequence_length, head_size, n_heads
-      class(multihead_attention_layer) :: self
-      real :: value(:, :, :)
+      class(multihead_attention_layer), intent(in out) :: self
+      real, intent(in) :: value(:, :, :)
       integer :: head
     end subroutine scaled_dot_product_attention
 
-    module function combine_heads(self, input) result(output)
-      class(multihead_attention_layer) :: self
-      real :: input(:, :, :)
+    pure module function combine_heads(self, input) result(output)
+      class(multihead_attention_layer), intent(in) :: self
+      real, intent(in) :: input(:, :, :)
       !! (sequence_length, head_size, n_heads)
       real :: output(self % sequence_length, self % model_dimension)
       integer :: seq
     end function combine_heads
 
-    module function get_num_params(self) result(num_params)
-      class(multihead_attention_layer) :: self
+    elemental module function get_num_params(self) result(num_params)
+      class(multihead_attention_layer), intent(in) :: self
       integer :: num_params
     end function get_num_params
 
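The `elemental` attribute on `get_num_params`, together with `intent(in)` on `self`, means the function can in principle be referenced with an array of layers and return a conforming array of counts. A standalone sketch of that behaviour, using a hypothetical `toy_layer` type rather than the real `multihead_attention_layer`:

module layer_count_demo
  implicit none

  ! Hypothetical stand-in for a layer type; not the real
  ! multihead_attention_layer from this repository.
  type :: toy_layer
    integer :: n_weights = 0
  end type toy_layer

contains

  ! Mirrors the shape of the change above: elemental + intent(in) self,
  ! so the function applies to a scalar layer or to an array of layers.
  elemental function get_num_params(self) result(num_params)
    type(toy_layer), intent(in) :: self
    integer :: num_params
    num_params = self % n_weights
  end function get_num_params

end module layer_count_demo

program demo
  use layer_count_demo
  implicit none
  type(toy_layer) :: layers(3)
  layers % n_weights = [10, 20, 30]
  print *, get_num_params(layers)        ! one elemental reference: 10 20 30
  print *, sum(get_num_params(layers))   ! total parameter count: 60
end program demo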

src/nf/nf_multihead_attention_submodule.f90

Lines changed: 20 additions & 20 deletions
@@ -14,7 +14,7 @@ module function multihead_attention_layer_cons(n_heads) result(res)
     res % n_heads = n_heads
   end function multihead_attention_layer_cons
 
-  module subroutine common_backward(self, input, gradient)
+  pure module subroutine common_backward(self, input, gradient)
     class(multihead_attention_layer), intent(in out) :: self
     real, intent(in) :: input(:, :)
     real, intent(in) :: gradient(:, :)
@@ -112,7 +112,7 @@ module subroutine common_backward(self, input, gradient)
     deallocate(dk)
   end subroutine common_backward
 
-  module subroutine common_forward(self, query, key, value)
+  pure module subroutine common_forward(self, query, key, value)
     class(multihead_attention_layer), intent(in out) :: self
     real, intent(in) :: query(:, :), key(:, :), value(:, :)
 
@@ -156,27 +156,27 @@ module subroutine common_forward(self, query, key, value)
     deallocate(v)
   end subroutine common_forward
 
-  module function split_heads(self, input) result(output)
-    class(multihead_attention_layer) :: self
-    real :: input(:, :)
+  pure module function split_heads(self, input) result(output)
+    class(multihead_attention_layer), intent(in) :: self
+    real, intent(in) :: input(:, :)
     real :: output(self % sequence_length, self % head_size, self % n_heads)
     output = reshape(input, [self % sequence_length, self % head_size, self % n_heads])
   end function split_heads
 
-  module subroutine create_attention_matrix(self, query, key)
-    class(multihead_attention_layer) :: self
-    real :: query(:, :, :)
-    real :: key(:, :, :)
+  pure module subroutine create_attention_matrix(self, query, key)
+    class(multihead_attention_layer), intent(in out) :: self
+    real, intent(in) :: query(:, :, :)
+    real, intent(in) :: key(:, :, :)
     integer :: head
     ! create attention matrix for each sequence in each batch
     do concurrent(head = 1: self % n_heads)
       self % attention_matrix(:, :, head) = matmul(query(:, :, head), transpose(key(:, :, head)))
     end do
   end subroutine create_attention_matrix
 
-  module subroutine normalize_attention_matrix(self, attention_mask)
-    class(multihead_attention_layer) :: self
-    real, optional :: attention_mask(:, :, :)
+  pure module subroutine normalize_attention_matrix(self, attention_mask)
+    class(multihead_attention_layer), intent(in out) :: self
+    real, optional, intent(in) :: attention_mask(:, :, :)
     real, allocatable :: output(:, :, :)
     integer :: head, seq
 
@@ -198,19 +198,19 @@ module subroutine normalize_attention_matrix(self, attention_mask)
     deallocate(output)
   end subroutine normalize_attention_matrix
 
-  module subroutine scaled_dot_product_attention(self, value)
-    class(multihead_attention_layer) :: self
-    real :: value(:, :, :)
+  pure module subroutine scaled_dot_product_attention(self, value)
+    class(multihead_attention_layer), intent(in out) :: self
+    real, intent(in) :: value(:, :, :)
     integer :: head
 
     do concurrent(head = 1: self % n_heads)
       self % sdpa(:, :, head) = matmul(self % attention_matrix(:, :, head), value(:, :, head))
     end do
   end subroutine scaled_dot_product_attention
 
-  module function combine_heads(self, input) result(output)
-    class(multihead_attention_layer) :: self
-    real :: input(:, :, :)
+  pure module function combine_heads(self, input) result(output)
+    class(multihead_attention_layer), intent(in) :: self
+    real, intent(in) :: input(:, :, :)
     real :: output(self % sequence_length, self % model_dimension)
     integer :: seq
 
@@ -219,8 +219,8 @@ module function combine_heads(self, input) result(output)
     end do
   end function combine_heads
 
-  module function get_num_params(self) result(num_params)
-    class(multihead_attention_layer) :: self
+  elemental module function get_num_params(self) result(num_params)
+    class(multihead_attention_layer), intent(in) :: self
     integer :: num_params
 
     num_params = &
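Note that the `do concurrent` loops in this submodule are compatible with the new `pure` prefixes: `do concurrent` is permitted inside a pure procedure as long as its body sticks to pure operations such as the intrinsics `matmul` and `transpose`. A self-contained sketch of the same per-head pattern, with made-up names and toy array shapes:

module head_matmul_demo
  implicit none
contains

  ! A pure subroutine containing a do concurrent loop, the same pattern
  ! the submodule uses in create_attention_matrix; shapes are toy values.
  pure subroutine batched_qk(query, key, scores)
    real, intent(in) :: query(:, :, :)    ! (seq, head_size, n_heads)
    real, intent(in) :: key(:, :, :)      ! (seq, head_size, n_heads)
    real, intent(out) :: scores(:, :, :)  ! (seq, seq, n_heads)
    integer :: head
    do concurrent(head = 1:size(query, 3))
      scores(:, :, head) = matmul(query(:, :, head), transpose(key(:, :, head)))
    end do
  end subroutine batched_qk

end module head_matmul_demo

program demo
  use head_matmul_demo
  implicit none
  real :: q(3, 2, 2), k(3, 2, 2), s(3, 3, 2)
  call random_number(q)
  call random_number(k)
  call batched_qk(q, k, s)
  print *, shape(s)   ! 3 3 2
end program demo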

src/nf/nf_self_attention_layer.f90

Lines changed: 2 additions & 2 deletions
@@ -35,7 +35,7 @@ module function self_attention_layer_cons(n_heads) result(res)
     res % n_heads = n_heads
   end function self_attention_layer_cons
 
-  module subroutine backward(self, input, gradient)
+  pure module subroutine backward(self, input, gradient)
     !! Self Attention back propagation
     !! Returns sum of Query, Key and Value gradients
     class(self_attention_layer), intent(in out) :: self
@@ -49,7 +49,7 @@ module subroutine backward(self, input, gradient)
         + self % value_layer % gradient
   end subroutine backward
 
-  module subroutine forward(self, input)
+  pure module subroutine forward(self, input)
     !! Cross Attention forward propagation
     !! Passes input three times into MultiHead Attention
     !! Input Shape: (sequence_length, model_dimension)

test/test_multihead_attention_layer.f90

Lines changed: 3 additions & 3 deletions
@@ -68,7 +68,7 @@ subroutine test_multihead_attention_split_heads(attention, input, ok, output)
   end subroutine test_multihead_attention_split_heads
 
   subroutine test_multihead_attention_create_attention_matrix(attention, input, ok)
-    type(multihead_attention_layer), intent(in) :: attention
+    type(multihead_attention_layer), intent(in out) :: attention
     real, intent(in) :: input(:, :, :)
     logical, intent(in out) :: ok
     real :: attention_matrix_shape(3)
@@ -95,7 +95,7 @@ subroutine test_multihead_attention_create_attention_matrix(attention, input, ok)
   end subroutine test_multihead_attention_create_attention_matrix
 
   subroutine test_multihead_attention_normalization(attention, ok)
-    type(multihead_attention_layer), intent(in) :: attention
+    type(multihead_attention_layer), intent(in out) :: attention
     logical, intent(in out) :: ok
     real :: output_flat(18)
     real :: expected_output_flat(18) = [&
@@ -114,7 +114,7 @@ subroutine test_multihead_attention_normalization(attention, ok)
   end subroutine test_multihead_attention_normalization
 
   subroutine test_multihead_attention_scaled_dot_product_attention(attention, value, ok)
-    type(multihead_attention_layer), intent(in) :: attention
+    type(multihead_attention_layer), intent(in out) :: attention
     real, intent(in) :: value(:, :, :)
     logical, intent(in out) :: ok
     real :: output_flat(12)
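The test-side changes follow from the now-explicit `intent(in out)` on `self` in the attention procedures: an actual argument associated with an `intent(in out)` dummy must be definable, so the test subroutines can no longer declare `attention` as `intent(in)`. A minimal sketch of that constraint, again with a hypothetical `toy_layer` type:

module intent_demo
  implicit none

  ! Hypothetical stand-in type; not the real multihead_attention_layer.
  type :: toy_layer
    real :: scratch = 0.
  end type toy_layer

contains

  pure subroutine update(self)
    type(toy_layer), intent(in out) :: self
    self % scratch = self % scratch + 1.
  end subroutine update

  subroutine test_update(layer, ok)
    ! Must be intent(in out): an intent(in) dummy is not definable and
    ! could not be passed on to update's intent(in out) dummy.
    type(toy_layer), intent(in out) :: layer
    logical, intent(in out) :: ok
    call update(layer)
    ok = ok .and. (layer % scratch > 0.)
  end subroutine test_update

end module intent_demo

program demo
  use intent_demo
  implicit none
  type(toy_layer) :: layer
  logical :: ok = .true.
  call test_update(layer, ok)
  print *, ok   ! T
end program demo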
