@@ -150,11 +150,11 @@ def _map(
        in_strides: Strides,
    ) -> None:

-        out_index = cuda.local.array(MAX_DIMS, numba.int32)
-        in_index = cuda.local.array(MAX_DIMS, numba.int32)
-        i = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
+        out_index = cuda.local.array(MAX_DIMS, numba.int32)  # noqa: F841
+        in_index = cuda.local.array(MAX_DIMS, numba.int32)  # noqa: F841
+        i = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x  # noqa: F841
        # TODO: Implement for Task 3.3.
-        raise NotImplementedError('Need to implement for Task 3.3')
+        raise NotImplementedError("Need to implement for Task 3.3")

    return cuda.jit()(_map)  # type: ignore

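The `_map` body above stays a Task 3.3 stub; the `# noqa: F841` markers only silence lint warnings about the unused scaffold locals. For reference, a minimal sketch of one way the kernel body could be completed is shown below. It assumes the usual minitorch device helpers `to_index`, `broadcast_index`, and `index_to_position`, the closed-over `fn`, and the `out`, `out_shape`, `out_strides`, `out_size`, `in_storage`, `in_shape` parameters from the full signature, none of which are visible in this hunk:

```python
out_index = cuda.local.array(MAX_DIMS, numba.int32)
in_index = cuda.local.array(MAX_DIMS, numba.int32)
i = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
if i < out_size:
    # Turn the flat thread ordinal into a multidimensional output index.
    to_index(i, out_shape, out_index)
    # Broadcast that index back onto the input's shape.
    broadcast_index(out_index, out_shape, in_shape, in_index)
    # Read, apply fn, and write to the matching output position.
    out[index_to_position(out_index, out_strides)] = fn(
        in_storage[index_to_position(in_index, in_strides)]
    )
```

Each thread owns one output ordinal, and the `i < out_size` guard keeps the spare threads in the last block from writing out of range.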
@@ -190,13 +190,13 @@ def _zip(
        b_strides: Strides,
    ) -> None:

-        out_index = cuda.local.array(MAX_DIMS, numba.int32)
-        a_index = cuda.local.array(MAX_DIMS, numba.int32)
-        b_index = cuda.local.array(MAX_DIMS, numba.int32)
-        i = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
+        out_index = cuda.local.array(MAX_DIMS, numba.int32)  # noqa: F841
+        a_index = cuda.local.array(MAX_DIMS, numba.int32)  # noqa: F841
+        b_index = cuda.local.array(MAX_DIMS, numba.int32)  # noqa: F841
+        i = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x  # noqa: F841

        # TODO: Implement for Task 3.3.
-        raise NotImplementedError('Need to implement for Task 3.3')
+        raise NotImplementedError("Need to implement for Task 3.3")

    return cuda.jit()(_zip)  # type: ignore

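`_zip` follows the same one-thread-per-output pattern, just broadcasting the output index onto two inputs. A sketch under the same assumptions about the indexing helpers and the unseen `a_storage`/`b_storage`/`out_size` parameters:

```python
out_index = cuda.local.array(MAX_DIMS, numba.int32)
a_index = cuda.local.array(MAX_DIMS, numba.int32)
b_index = cuda.local.array(MAX_DIMS, numba.int32)
i = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
if i < out_size:
    to_index(i, out_shape, out_index)
    # Broadcast the output index onto each input's shape.
    broadcast_index(out_index, out_shape, a_shape, a_index)
    broadcast_index(out_index, out_shape, b_shape, b_index)
    out[index_to_position(out_index, out_strides)] = fn(
        a_storage[index_to_position(a_index, a_strides)],
        b_storage[index_to_position(b_index, b_strides)],
    )
```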
@@ -224,12 +224,12 @@ def _sum_practice(out: Storage, a: Storage, size: int) -> None:
    """
    BLOCK_DIM = 32

-    cache = cuda.shared.array(BLOCK_DIM, numba.float64)
-    i = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
-    pos = cuda.threadIdx.x
+    cache = cuda.shared.array(BLOCK_DIM, numba.float64)  # noqa: F841
+    i = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x  # noqa: F841
+    pos = cuda.threadIdx.x  # noqa: F841

    # TODO: Implement for Task 3.3.
-    raise NotImplementedError('Need to implement for Task 3.3')
+    raise NotImplementedError("Need to implement for Task 3.3")


jit_sum_practice = cuda.jit()(_sum_practice)
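`_sum_practice` is a warm-up for reduction: each block folds `BLOCK_DIM` elements of `a` into one partial sum using shared memory. Its full signature is visible in the hunk header, so one possible sketch of the whole function (a sketch, not the committed solution) is:

```python
def _sum_practice(out: Storage, a: Storage, size: int) -> None:
    BLOCK_DIM = 32

    cache = cuda.shared.array(BLOCK_DIM, numba.float64)
    i = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    pos = cuda.threadIdx.x

    # Each thread stages one element of `a` (or 0.0 past the end) in shared memory.
    cache[pos] = a[i] if i < size else 0.0
    cuda.syncthreads()

    # Pairwise tree reduction: the active stride doubles every round.
    stride = 1
    while stride < BLOCK_DIM:
        if pos % (2 * stride) == 0:
            cache[pos] += cache[pos + stride]
        cuda.syncthreads()
        stride *= 2

    # One partial sum per block.
    if pos == 0:
        out[cuda.blockIdx.x] = cache[0]
```

Because every block writes exactly one value to `out[blockIdx.x]`, the caller only needs `out` to have one slot per launched block.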
@@ -273,13 +273,13 @@ def _reduce(
        reduce_value: float,
    ) -> None:
        BLOCK_DIM = 1024
-        cache = cuda.shared.array(BLOCK_DIM, numba.float64)
-        out_index = cuda.local.array(MAX_DIMS, numba.int32)
-        out_pos = cuda.blockIdx.x
-        pos = cuda.threadIdx.x
+        cache = cuda.shared.array(BLOCK_DIM, numba.float64)  # noqa: F841
+        out_index = cuda.local.array(MAX_DIMS, numba.int32)  # noqa: F841
+        out_pos = cuda.blockIdx.x  # noqa: F841
+        pos = cuda.threadIdx.x  # noqa: F841

        # TODO: Implement for Task 3.3.
-        raise NotImplementedError('Need to implement for Task 3.3')
+        raise NotImplementedError("Need to implement for Task 3.3")

    return cuda.jit()(_reduce)  # type: ignore

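The general `_reduce` stub assigns one block per output position (`out_pos = cuda.blockIdx.x`) and one thread per slot along the reduced dimension. A hedged sketch of the body is below; it assumes the standard scaffold parameters `out_size`, `out_shape`, `out_strides`, `a_storage`, `a_shape`, `a_strides`, and `reduce_dim`, which are not shown in this hunk, and it only folds up to `BLOCK_DIM` elements of the reduced dimension in one pass:

```python
BLOCK_DIM = 1024
cache = cuda.shared.array(BLOCK_DIM, numba.float64)
out_index = cuda.local.array(MAX_DIMS, numba.int32)
out_pos = cuda.blockIdx.x
pos = cuda.threadIdx.x

if out_pos < out_size:
    # Every thread in this block works on the same output element.
    to_index(out_pos, out_shape, out_index)
    o = index_to_position(out_index, out_strides)

    # Thread `pos` pulls in the `pos`-th slot along the reduced dimension,
    # or the identity value if that slot does not exist.
    cache[pos] = reduce_value
    if pos < a_shape[reduce_dim]:
        out_index[reduce_dim] = pos
        cache[pos] = a_storage[index_to_position(out_index, a_strides)]
    cuda.syncthreads()

    # Combine pairs with fn until the whole block is folded into cache[0].
    stride = 1
    while stride < BLOCK_DIM:
        if pos % (2 * stride) == 0:
            cache[pos] = fn(cache[pos], cache[pos + stride])
        cuda.syncthreads()
        stride *= 2

    if pos == 0:
        out[o] = cache[0]
```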
@@ -314,9 +314,9 @@ def _mm_practice(out: Storage, a: Storage, b: Storage, size: int) -> None:
        b (Storage): storage for `b` tensor.
        size (int): size of the square
    """
-    BLOCK_DIM = 32
+    BLOCK_DIM = 32  # noqa: F841
    # TODO: Implement for Task 3.3.
-    raise NotImplementedError('Need to implement for Task 3.3')
+    raise NotImplementedError("Need to implement for Task 3.3")


jit_mm_practice = cuda.jit()(_mm_practice)
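`_mm_practice` multiplies two square matrices small enough (`size <= BLOCK_DIM`) for a single thread block, so both operands can be staged in shared memory once and reused for every dot product. A possible sketch of the body, assuming row-major contiguous storage for `a`, `b`, and `out`:

```python
BLOCK_DIM = 32
a_shared = cuda.shared.array((BLOCK_DIM, BLOCK_DIM), numba.float64)
b_shared = cuda.shared.array((BLOCK_DIM, BLOCK_DIM), numba.float64)
i = cuda.threadIdx.x
j = cuda.threadIdx.y

if i < size and j < size:
    # Stage both inputs in shared memory (row-major flat storage assumed).
    a_shared[i, j] = a[i * size + j]
    b_shared[i, j] = b[i * size + j]
cuda.syncthreads()

if i < size and j < size:
    # Dot product of row i of `a` with column j of `b`.
    acc = 0.0
    for k in range(size):
        acc += a_shared[i, k] * b_shared[k, j]
    out[i * size + j] = acc
```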
@@ -363,30 +363,30 @@ def _tensor_matrix_multiply(
    Returns:
        None : Fills in `out`
    """
-    a_batch_stride = a_strides[0] if a_shape[0] > 1 else 0
-    b_batch_stride = b_strides[0] if b_shape[0] > 1 else 0
+    a_batch_stride = a_strides[0] if a_shape[0] > 1 else 0  # noqa: F841
+    b_batch_stride = b_strides[0] if b_shape[0] > 1 else 0  # noqa: F841
    # Batch dimension - fixed
-    batch = cuda.blockIdx.z
+    batch = cuda.blockIdx.z  # noqa: F841

    BLOCK_DIM = 32
-    a_shared = cuda.shared.array((BLOCK_DIM, BLOCK_DIM), numba.float64)
-    b_shared = cuda.shared.array((BLOCK_DIM, BLOCK_DIM), numba.float64)
+    a_shared = cuda.shared.array((BLOCK_DIM, BLOCK_DIM), numba.float64)  # noqa: F841
+    b_shared = cuda.shared.array((BLOCK_DIM, BLOCK_DIM), numba.float64)  # noqa: F841

    # The final position c[i, j]
-    i = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
-    j = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.y
+    i = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x  # noqa: F841
+    j = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.y  # noqa: F841

    # The local position in the block.
-    pi = cuda.threadIdx.x
-    pj = cuda.threadIdx.y
+    pi = cuda.threadIdx.x  # noqa: F841
+    pj = cuda.threadIdx.y  # noqa: F841

    # Code Plan:
    # 1) Move across shared dimension by block dim.
    #    a) Copy into shared memory for a matrix.
    #    b) Copy into shared memory for b matrix
    #    c) Compute the dot produce for position c[i, j]
    # TODO: Implement for Task 3.4.
-    raise NotImplementedError('Need to implement for Task 3.4')
+    raise NotImplementedError("Need to implement for Task 3.4")


tensor_matrix_multiply = cuda.jit(_tensor_matrix_multiply)
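The Code Plan comments describe a tiled matmul: walk the shared dimension one `BLOCK_DIM`-wide tile at a time, stage tiles of `a` and `b` in shared memory, and accumulate the dot product for `out[batch, i, j]`. A sketch of that plan is below; the names `a_storage`, `b_storage`, `out_shape`, and `out_strides` come from the usual full signature and are assumed here, since only part of it is visible in this diff:

```python
a_batch_stride = a_strides[0] if a_shape[0] > 1 else 0
b_batch_stride = b_strides[0] if b_shape[0] > 1 else 0
batch = cuda.blockIdx.z

BLOCK_DIM = 32
a_shared = cuda.shared.array((BLOCK_DIM, BLOCK_DIM), numba.float64)
b_shared = cuda.shared.array((BLOCK_DIM, BLOCK_DIM), numba.float64)

i = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
j = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.y
pi = cuda.threadIdx.x
pj = cuda.threadIdx.y

# 1) Move across the shared dimension one tile at a time.
acc = 0.0
for k_start in range(0, a_shape[2], BLOCK_DIM):
    # a) Copy one tile of a into shared memory (zero-fill past the edge).
    k = k_start + pj
    if i < a_shape[1] and k < a_shape[2]:
        a_shared[pi, pj] = a_storage[
            batch * a_batch_stride + i * a_strides[1] + k * a_strides[2]
        ]
    else:
        a_shared[pi, pj] = 0.0
    # b) Copy one tile of b into shared memory.
    k = k_start + pi
    if k < b_shape[1] and j < b_shape[2]:
        b_shared[pi, pj] = b_storage[
            batch * b_batch_stride + k * b_strides[1] + j * b_strides[2]
        ]
    else:
        b_shared[pi, pj] = 0.0
    cuda.syncthreads()

    # c) Partial dot product for c[i, j] over this tile.
    for k_local in range(BLOCK_DIM):
        acc += a_shared[pi, k_local] * b_shared[k_local, pj]
    cuda.syncthreads()

# Single global write per thread, guarding out-of-range positions.
if i < out_shape[1] and j < out_shape[2]:
    out[batch * out_strides[0] + i * out_strides[1] + j * out_strides[2]] = acc
```

The two `cuda.syncthreads()` calls matter: the first keeps threads from reading a tile before it is fully loaded, and the second keeps the next iteration from overwriting shared memory while some threads are still accumulating.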