-
Notifications
You must be signed in to change notification settings - Fork 17
/
Copy pathexample_example.h90
231 lines (204 loc) · 9.99 KB
/
example_example.h90
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
!
! ===============================
! C O N G R A T U L A T I O N S !
! ===============================
!
! You made it through the Getting Started to see your first Hybrid Fortran code example.
! Thank you for checking this out. This example will guide you through the most important
! features in Hybrid Fortran. Just follow the numbers in brackets (1), (2), ...
! For more information there is a full documentation available here:
! https://github.com/muellermichel/Hybrid-Fortran/blob/master/doc/Documentation.pdf?raw=true
module example
contains
!
! (1) As you can see below, this subroutine encapsules all the computations done in this example
!
subroutine data_region(a, b, c, d, sum_c, sum_d)
implicit none
real, dimension(NX, NY, NZ), intent(in) :: a, b
real, dimension(NX, NY, NZ), intent(out) :: c, d
real, intent(out) :: sum_c, sum_d
!
! (2) The @domainDependant directive is one of two language additions you get with Hybrid Fortran.
! It is used to define the symbol names that Hybrid Fortran is supposed to touch in order to
! make them available on accelerators and to give these symbols additional attributes.
! You can have as many @domainDependant directives per subroutine as you want, but they all need to be
! located in between the specification part and the implementation part of your Fortran code.
!
! (3) In this particular example we see two attributes being added to the symbols a, b, c, d:
! - autoDom: Lets Hybrid Fortran handle the domains of these symbols according to their Fortran specification.
! In this particular case, this will only take care of the storage order in the specification part.
! Since GPUs generally like different storage orders than CPUs, Hybrid Fortran introduces 'DOM' storage order macros,
! e.g. 'dimension(NX, NY, NZ)' becomes 'dimension(DOM(NX, NY, NZ))'. This macro is defined by the programmer using the central
! 'storage_order.F90' facility. It is also possible to override the macro names, see the documentation for this.
! - transferHere: Lets Hybrid Fortran transfer a, b, c, d to and/or from the GPU at the boundaries of this subroutine.
! Together with the 'present' attribute in the callees, this oversteers the default behaviour of Hybrid Fortran:
! By default it will always transfer in 'kernel wrapper subroutines'[1].
! Plese note: Hybrid Fortran will use the 'intent' information that you give in the Fortran specification part to determine,
! whether a symbol needs to be copied and whether the copy operation needs to go both ways. So, in Hybrid Fortran it is
! mandatory to specify the intents of your symbols in all subroutines (which is a good practice anyways).
!
! [1] All subroutines that call another subroutine with an @parallelRegion that applies to GPUs (either implicitely or explicitely), are
! called 'kernel wrapper subroutines'. Here, this applies to the 'data_region' and 'run' subroutines.
!
@domainDependant{attribute(autoDom,transferHere)}
a, b, c, d
@end domainDependant
call run(a, b, c, d)
call reduce_to_sum(c, sum_c)
call reduce_to_sum(d, sum_d)
end subroutine
subroutine run(a, b, c, d)
implicit none
real, dimension(NX, NY, NZ), intent(in) :: a, b
real, dimension(NX, NY, NZ), intent(out) :: c, d
!
! (4) Here we see the counterpart to the 'transferHere' attribute: The 'present' attribute. If you use this, there must be a 'transferHere'
! specified at a higher point in the callgraph.
!
@domainDependant{attribute(autoDom,present)}
a, b, c, d
@end domainDependant
!
! (5) Here you see your first @parallelRegion, the second addition to the Fortran language introduced with Hybrid Fortran. It tells the
! framework that you want to run code in parallel. In this particular case it is only applied to the CPU version of the code. We separate CPU from
! GPU parallelization this way because the CPU prefers a coarse-grained parallelization, as opposed to the GPU that preferes fine-grained. For the CPU the
! 'add' and 'mult' are 1D-subroutines. By calling them inside this parallel region, Hybrid Fortran will introduce an OpenMP directive over
! loops going from x=1 to NX and y=1 to NY. Only the outermost loop (the y-loop here) will be parallelized, since this is usually optimal for
! multicore, as long as your y-dimension has enough granularity. a, b, c, d will be passed to the subroutines using only their remaining dimension
! 'z' - e.g. 'add(a, ...' becomes 'add(a(:,x,y), ...' since the CPU storage order is defined as 'z, x, y' in storage_order.F90.
!
@parallelRegion{appliesTo(CPU), domName(x,y), domSize(NX, NY)}
call add(a, b, c)
call mult(a, b, d)
@end parallelRegion
end subroutine
subroutine add(a, b, c)
implicit none
real, dimension(NZ), intent(in) :: a, b
real, dimension(NZ), intent(out) :: c
integer :: z
!
! (6) For the GPU version we want to keep the full 3D version of a, b, c available to the kernel (GPU code is best when it can deal with 'tight' parallel
! regions over data regions with operations over large vectors). So in addition to 'autoDom' we also specify a, b, c to be dependant of the domains 'x'
! and 'y'. In other words, a, b, c are at some point part of a larger data structure that spans over 'x' and 'y' as well, not just 'z' as specified above.
! Hybrid Fortran uses this information in order to re-insert these dimensions before 'z' for a, b and c, because we're still outside the GPU-parallelRegion
! in the callgraph at this point.
!
@domainDependant{attribute(autoDom, present), domName(x,y), domSize(NX,NY)}
a, b, c
@end domainDependant
!
! (7) And here goes the parallelization for GPU, now as a tight region that is very close to the actual computations, since GPU prefers a very fine-grained
! parallelization as opposed to the CPU.
!
@parallelRegion{appliesTo(GPU), domName(x,y), domSize(NX, NY)}
do z=1,NZ
c(z) = a(z) + b(z)
end do
@end parallelRegion
end subroutine
!
! (8) And the same for the second kernel...
!
subroutine mult(a, b, d)
implicit none
real, dimension(NZ), intent(in) :: a, b
real, dimension(NZ), intent(out) :: d
integer :: z
@domainDependant{attribute(autoDom, present), domName(x,y), domSize(NX,NY)}
a, b, d
@end domainDependant
@parallelRegion{appliesTo(GPU), domName(x,y), domSize(NX, NY)}
do z=1,NZ
d(z) = a(z) * b(z)
end do
@end parallelRegion
end subroutine
! (9) Using the @scheme directive and defining the implementation in MakesettingsGeneral, as shown in this example, we can override the default implementation and mix in different ones.
! Note that the template can span over multiple parts of your file, such as multiple subroutines or even modules.
! In this case we use it to enable reduction support (which CUDA Fortran doesn't have), so we can configure the rest of the program to use CUDA as well.
@scheme{name(REDUCTION_SUPPORT)}
subroutine reduce_to_sum(a, result)
implicit none
real, dimension(NZ), intent(in) :: a
real, intent(out) :: result
integer :: z
@domainDependant{attribute(autoDom, present), domName(x,y), domSize(NX,NY)}
a
@end domainDependant
result = 0.0d0
!
! (10) The only thing special here is the 'reduction' attribute that makes Hybrid Fortran sum over all private copies of 'result' as the final step of this
! parallel region. It works exactly the same as reductions in OpenMP and OpenACC, the only difference is that the right-hand side of the colon needs to
! be a single entry rather than a list (but you can still define as many reduction attributes as you want).
! Please note: Reductions are *NOT* implemented for the CUDA Fortran backend implementation - in that case you will have to deal with them manually using
! 'cublas', see the poisson2d example.
!
@parallelRegion{domName(x,y), domSize(NX, NY), reduction(+:result)}
do z=1,NZ
result = result + a(z)
end do
@end parallelRegion
end subroutine
@end scheme
end module example
!
! (11) Make sure you have PGI Accelerator Version 15.7 installed and running. Type './configure && make tests' in the example's root directory and you should see 'test ok',
! once for the CPU and once for the GPU version.
!
! (12) Have a look at the cpu and gpu folders inside the build directory, especially example.f90 - this will show you what Hybrid Fortran does in the backend.
!
program main
use example
implicit none
real, dimension(DOM(NX, NY, NZ)) :: a, b, c, d
real :: sum_c, sum_d, expected_sum
integer :: x, y, z
integer :: fail_x, fail_y, fail_z
logical :: test, testSum
a(:,:,:) = 1.0d0
b(:,:,:) = 2.0d0
c(:,:,:) = 0.0d0
d(:,:,:) = 0.0d0
test = .TRUE.
testSum = .TRUE.
call data_region(a, b, c, d, sum_c, sum_d)
write(6,*) "calculation complete"
do y=1,NY
do x=1,NX
do z=1,NZ
if (test .EQ. .TRUE. .AND. c(AT(x,y,z)) /= 3.0d0) then
test = .FALSE.
fail_x = x
fail_y = y
fail_z = z
end if
if (test .EQ. .TRUE. .AND. d(AT(x,y,z)) /= 2.0d0) then
test = .FALSE.
fail_x = x
fail_y = y
fail_z = z
end if
end do
end do
end do
expected_sum = 3.0d0 * NX * NY * NZ
if ( abs(sum_c - expected_sum) > 1E-5 ) then
write(6,*) "sum c failed: ", sum_c, "; expected: ", expected_sum
testSum = .FALSE.
end if
expected_sum = 2.0d0 * NX * NY * NZ
if ( abs(sum_d - expected_sum) > 1E-5 ) then
write(6,*) "sum d failed: ", sum_d, "; expected: ", expected_sum
testSum = .FALSE.
end if
if (test .EQ. .TRUE. .AND. testSum .EQ. .TRUE.) then
write(6,*) "test ok"
else
write(6,*) "test failed"
write(6,*) "fails at", fail_x, fail_y, fail_z, "C:", c(AT(fail_x,fail_y,fail_z)), "D:", d(AT(fail_x,fail_y,fail_z))
stop 2
end if
stop
end program main