@@ -150,11 +150,11 @@ def _map(
        in_strides: Strides,
    ) -> None:

-        out_index = cuda.local.array(MAX_DIMS, numba.int32)
-        in_index = cuda.local.array(MAX_DIMS, numba.int32)
-        i = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
+        out_index = cuda.local.array(MAX_DIMS, numba.int32)  # noqa: F841
+        in_index = cuda.local.array(MAX_DIMS, numba.int32)  # noqa: F841
+        i = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x  # noqa: F841
        # TODO: Implement for Task 3.3.
-        raise NotImplementedError('Need to implement for Task 3.3')
+        raise NotImplementedError("Need to implement for Task 3.3")

    return cuda.jit()(_map)  # type: ignore

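The `_map` body above stays a Task 3.3 stub; the `# noqa: F841` markers only silence lint warnings about the unused scaffold locals. For reference, a minimal sketch of one way the kernel body could be completed is shown below. It assumes the usual minitorch device helpers `to_index`, `broadcast_index`, and `index_to_position`, the closed-over `fn`, and the `out`, `out_shape`, `out_strides`, `out_size`, `in_storage`, `in_shape` parameters from the full signature, none of which are visible in this hunk:

```python
out_index = cuda.local.array(MAX_DIMS, numba.int32)
in_index = cuda.local.array(MAX_DIMS, numba.int32)
i = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
if i < out_size:
    # Turn the flat thread ordinal into a multidimensional output index.
    to_index(i, out_shape, out_index)
    # Broadcast that index back onto the input's shape.
    broadcast_index(out_index, out_shape, in_shape, in_index)
    # Read, apply fn, and write to the matching output position.
    out[index_to_position(out_index, out_strides)] = fn(
        in_storage[index_to_position(in_index, in_strides)]
    )
```

Each thread owns one output ordinal, and the `i < out_size` guard keeps the spare threads in the last block from writing out of range.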
@@ -190,13 +190,13 @@ def _zip(
        b_strides: Strides,
    ) -> None:

-        out_index = cuda.local.array(MAX_DIMS, numba.int32)
-        a_index = cuda.local.array(MAX_DIMS, numba.int32)
-        b_index = cuda.local.array(MAX_DIMS, numba.int32)
-        i = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
+        out_index = cuda.local.array(MAX_DIMS, numba.int32)  # noqa: F841
+        a_index = cuda.local.array(MAX_DIMS, numba.int32)  # noqa: F841
+        b_index = cuda.local.array(MAX_DIMS, numba.int32)  # noqa: F841
+        i = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x  # noqa: F841

        # TODO: Implement for Task 3.3.
-        raise NotImplementedError('Need to implement for Task 3.3')
+        raise NotImplementedError("Need to implement for Task 3.3")

    return cuda.jit()(_zip)  # type: ignore

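`_zip` follows the same one-thread-per-output pattern, just broadcasting the output index onto two inputs. A sketch under the same assumptions about the indexing helpers and the unseen `a_storage`/`b_storage`/`out_size` parameters:

```python
out_index = cuda.local.array(MAX_DIMS, numba.int32)
a_index = cuda.local.array(MAX_DIMS, numba.int32)
b_index = cuda.local.array(MAX_DIMS, numba.int32)
i = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
if i < out_size:
    to_index(i, out_shape, out_index)
    # Broadcast the output index onto each input's shape.
    broadcast_index(out_index, out_shape, a_shape, a_index)
    broadcast_index(out_index, out_shape, b_shape, b_index)
    out[index_to_position(out_index, out_strides)] = fn(
        a_storage[index_to_position(a_index, a_strides)],
        b_storage[index_to_position(b_index, b_strides)],
    )
```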
@@ -224,12 +224,12 @@ def _sum_practice(out: Storage, a: Storage, size: int) -> None:
    """
    BLOCK_DIM = 32

-    cache = cuda.shared.array(BLOCK_DIM, numba.float64)
-    i = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
-    pos = cuda.threadIdx.x
+    cache = cuda.shared.array(BLOCK_DIM, numba.float64)  # noqa: F841
+    i = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x  # noqa: F841
+    pos = cuda.threadIdx.x  # noqa: F841

    # TODO: Implement for Task 3.3.
-    raise NotImplementedError('Need to implement for Task 3.3')
+    raise NotImplementedError("Need to implement for Task 3.3")


jit_sum_practice = cuda.jit()(_sum_practice)
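`_sum_practice` is a warm-up for reduction: each block folds `BLOCK_DIM` elements of `a` into one partial sum using shared memory. Its full signature is visible in the hunk header, so one possible sketch of the whole function (a sketch, not the committed solution) is:

```python
def _sum_practice(out: Storage, a: Storage, size: int) -> None:
    BLOCK_DIM = 32

    cache = cuda.shared.array(BLOCK_DIM, numba.float64)
    i = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    pos = cuda.threadIdx.x

    # Each thread stages one element of `a` (or 0.0 past the end) in shared memory.
    cache[pos] = a[i] if i < size else 0.0
    cuda.syncthreads()

    # Pairwise tree reduction: the active stride doubles every round.
    stride = 1
    while stride < BLOCK_DIM:
        if pos % (2 * stride) == 0:
            cache[pos] += cache[pos + stride]
        cuda.syncthreads()
        stride *= 2

    # One partial sum per block.
    if pos == 0:
        out[cuda.blockIdx.x] = cache[0]
```

Because every block writes exactly one value to `out[blockIdx.x]`, the caller only needs `out` to have one slot per launched block.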
@@ -273,13 +273,13 @@ def _reduce(
        reduce_value: float,
    ) -> None:
        BLOCK_DIM = 1024
-        cache = cuda.shared.array(BLOCK_DIM, numba.float64)
-        out_index = cuda.local.array(MAX_DIMS, numba.int32)
-        out_pos = cuda.blockIdx.x
-        pos = cuda.threadIdx.x
+        cache = cuda.shared.array(BLOCK_DIM, numba.float64)  # noqa: F841
+        out_index = cuda.local.array(MAX_DIMS, numba.int32)  # noqa: F841
+        out_pos = cuda.blockIdx.x  # noqa: F841
+        pos = cuda.threadIdx.x  # noqa: F841

        # TODO: Implement for Task 3.3.
-        raise NotImplementedError('Need to implement for Task 3.3')
+        raise NotImplementedError("Need to implement for Task 3.3")

    return cuda.jit()(_reduce)  # type: ignore

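The general `_reduce` stub assigns one block per output position (`out_pos = cuda.blockIdx.x`) and one thread per slot along the reduced dimension. A hedged sketch of the body is below; it assumes the standard scaffold parameters `out_size`, `out_shape`, `out_strides`, `a_storage`, `a_shape`, `a_strides`, and `reduce_dim`, which are not shown in this hunk, and it only folds up to `BLOCK_DIM` elements of the reduced dimension in one pass:

```python
BLOCK_DIM = 1024
cache = cuda.shared.array(BLOCK_DIM, numba.float64)
out_index = cuda.local.array(MAX_DIMS, numba.int32)
out_pos = cuda.blockIdx.x
pos = cuda.threadIdx.x

if out_pos < out_size:
    # Every thread in this block works on the same output element.
    to_index(out_pos, out_shape, out_index)
    o = index_to_position(out_index, out_strides)

    # Thread `pos` pulls in the `pos`-th slot along the reduced dimension,
    # or the identity value if that slot does not exist.
    cache[pos] = reduce_value
    if pos < a_shape[reduce_dim]:
        out_index[reduce_dim] = pos
        cache[pos] = a_storage[index_to_position(out_index, a_strides)]
    cuda.syncthreads()

    # Combine pairs with fn until the whole block is folded into cache[0].
    stride = 1
    while stride < BLOCK_DIM:
        if pos % (2 * stride) == 0:
            cache[pos] = fn(cache[pos], cache[pos + stride])
        cuda.syncthreads()
        stride *= 2

    if pos == 0:
        out[o] = cache[0]
```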
@@ -314,9 +314,9 @@ def _mm_practice(out: Storage, a: Storage, b: Storage, size: int) -> None:
        b (Storage): storage for `b` tensor.
        size (int): size of the square
    """
-    BLOCK_DIM = 32
+    BLOCK_DIM = 32  # noqa: F841
    # TODO: Implement for Task 3.3.
-    raise NotImplementedError('Need to implement for Task 3.3')
+    raise NotImplementedError("Need to implement for Task 3.3")


jit_mm_practice = cuda.jit()(_mm_practice)
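`_mm_practice` multiplies two square matrices small enough (`size <= BLOCK_DIM`) for a single thread block, so both operands can be staged in shared memory once and reused for every dot product. A possible sketch of the body, assuming row-major contiguous storage for `a`, `b`, and `out`:

```python
BLOCK_DIM = 32
a_shared = cuda.shared.array((BLOCK_DIM, BLOCK_DIM), numba.float64)
b_shared = cuda.shared.array((BLOCK_DIM, BLOCK_DIM), numba.float64)
i = cuda.threadIdx.x
j = cuda.threadIdx.y

if i < size and j < size:
    # Stage both inputs in shared memory (row-major flat storage assumed).
    a_shared[i, j] = a[i * size + j]
    b_shared[i, j] = b[i * size + j]
cuda.syncthreads()

if i < size and j < size:
    # Dot product of row i of `a` with column j of `b`.
    acc = 0.0
    for k in range(size):
        acc += a_shared[i, k] * b_shared[k, j]
    out[i * size + j] = acc
```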
@@ -363,30 +363,30 @@ def _tensor_matrix_multiply(
    Returns:
        None : Fills in `out`
    """
-    a_batch_stride = a_strides[0] if a_shape[0] > 1 else 0
-    b_batch_stride = b_strides[0] if b_shape[0] > 1 else 0
+    a_batch_stride = a_strides[0] if a_shape[0] > 1 else 0  # noqa: F841
+    b_batch_stride = b_strides[0] if b_shape[0] > 1 else 0  # noqa: F841
    # Batch dimension - fixed
-    batch = cuda.blockIdx.z
+    batch = cuda.blockIdx.z  # noqa: F841

    BLOCK_DIM = 32
-    a_shared = cuda.shared.array((BLOCK_DIM, BLOCK_DIM), numba.float64)
-    b_shared = cuda.shared.array((BLOCK_DIM, BLOCK_DIM), numba.float64)
+    a_shared = cuda.shared.array((BLOCK_DIM, BLOCK_DIM), numba.float64)  # noqa: F841
+    b_shared = cuda.shared.array((BLOCK_DIM, BLOCK_DIM), numba.float64)  # noqa: F841

    # The final position c[i, j]
-    i = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
-    j = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.y
+    i = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x  # noqa: F841
+    j = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.y  # noqa: F841

    # The local position in the block.
-    pi = cuda.threadIdx.x
-    pj = cuda.threadIdx.y
+    pi = cuda.threadIdx.x  # noqa: F841
+    pj = cuda.threadIdx.y  # noqa: F841

    # Code Plan:
    # 1) Move across shared dimension by block dim.
    #    a) Copy into shared memory for a matrix.
    #    b) Copy into shared memory for b matrix
    #    c) Compute the dot produce for position c[i, j]
    # TODO: Implement for Task 3.4.
-    raise NotImplementedError('Need to implement for Task 3.4')
+    raise NotImplementedError("Need to implement for Task 3.4")


tensor_matrix_multiply = cuda.jit(_tensor_matrix_multiply)
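The Code Plan comments describe a tiled matmul: walk the shared dimension one `BLOCK_DIM`-wide tile at a time, stage tiles of `a` and `b` in shared memory, and accumulate the dot product for `out[batch, i, j]`. A sketch of that plan is below; the names `a_storage`, `b_storage`, `out_shape`, and `out_strides` come from the usual full signature and are assumed here, since only part of it is visible in this diff:

```python
a_batch_stride = a_strides[0] if a_shape[0] > 1 else 0
b_batch_stride = b_strides[0] if b_shape[0] > 1 else 0
batch = cuda.blockIdx.z

BLOCK_DIM = 32
a_shared = cuda.shared.array((BLOCK_DIM, BLOCK_DIM), numba.float64)
b_shared = cuda.shared.array((BLOCK_DIM, BLOCK_DIM), numba.float64)

i = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
j = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.y
pi = cuda.threadIdx.x
pj = cuda.threadIdx.y

# 1) Move across the shared dimension one tile at a time.
acc = 0.0
for k_start in range(0, a_shape[2], BLOCK_DIM):
    # a) Copy one tile of a into shared memory (zero-fill past the edge).
    k = k_start + pj
    if i < a_shape[1] and k < a_shape[2]:
        a_shared[pi, pj] = a_storage[
            batch * a_batch_stride + i * a_strides[1] + k * a_strides[2]
        ]
    else:
        a_shared[pi, pj] = 0.0
    # b) Copy one tile of b into shared memory.
    k = k_start + pi
    if k < b_shape[1] and j < b_shape[2]:
        b_shared[pi, pj] = b_storage[
            batch * b_batch_stride + k * b_strides[1] + j * b_strides[2]
        ]
    else:
        b_shared[pi, pj] = 0.0
    cuda.syncthreads()

    # c) Partial dot product for c[i, j] over this tile.
    for k_local in range(BLOCK_DIM):
        acc += a_shared[pi, k_local] * b_shared[k_local, pj]
    cuda.syncthreads()

# Single global write per thread, guarding out-of-range positions.
if i < out_shape[1] and j < out_shape[2]:
    out[batch * out_strides[0] + i * out_strides[1] + j * out_strides[2]] = acc
```

The two `cuda.syncthreads()` calls matter: the first keeps threads from reading a tile before it is fully loaded, and the second keeps the next iteration from overwriting shared memory while some threads are still accumulating.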