diff --git a/65691.pbs111.OU b/65691.pbs111.OU new file mode 100644 index 0000000..ddf58cd --- /dev/null +++ b/65691.pbs111.OU @@ -0,0 +1,64 @@ +/var/spool/pbs/mom_priv/jobs/65691.pbs111.SC: line 10: deactivate: command not found +wandb: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information. +wandb: Currently logged in as: runjiachen (runjiachen-nus). Use `wandb login --relogin` to force relogin +wandb: Tracking run with wandb version 0.18.7 +wandb: Run data is saved locally in /home/users/nus/e1113744/native-sparse-attention-pytorch/wandb/run-20250707_181358-5mxyov4r +wandb: Run `wandb offline` to turn off syncing. +wandb: Syncing run summer-butterfly-2 +wandb: ⭐️ View project at https://wandb.ai/runjiachen-nus/native-sparse-attention +wandb: 🚀 View run at https://wandb.ai/runjiachen-nus/native-sparse-attention/runs/5mxyov4r +wandb: WARNING Calling wandb.run.save without any arguments is deprecated.Changes to attributes are automatically persisted. + training: 0%| | 0/100000 [00:00 + loss = model(data, return_loss = True) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/users/nus/e1113744/llm-foundry/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/users/nus/e1113744/llm-foundry/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl + return forward_call(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/users/nus/e1113744/native-sparse-attention-pytorch/native_sparse_attention_pytorch/transformer.py", line 308, in forward + attn_out, layer_cache = attn( + ^^^^^ + File "/home/users/nus/e1113744/llm-foundry/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/users/nus/e1113744/llm-foundry/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl + return forward_call(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +TypeError: Attention.forward() got an unexpected keyword argument 'cache' + +real 0m28.958s +user 0m12.499s +sys 0m5.281s +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Resource Usage on 2025-07-07 18:14:13.967119: +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + JobId: 65691.pbs111 + Project: 71001002 + Exit Status: 1 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + NCPUs: Requested(14), Used(14) + CPU Time Used: 00:00:17 + Memory: Requested(235gb), Used(1829988kb) + Vmem Used: 522802812kb + Walltime: Requested(12:00:00), Used(00:00:30) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Execution Nodes Used: (a2ap-dgx034:ngpus=1:ncpus=14:mem=246415360kb) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + GPU Duration: 34.13secs + GPU Power Consumed: 60.97W + GPU Max GPU Memory Used: 704.0MB + Memory Throughput Rate (Average): a2ap-dgx034:(gpu7:0%) + Memory Throughput Rate (Max): a2ap-dgx034:(gpu7:0%) + Memory Throughput Rate (Min): a2ap-dgx034:(gpu7:0%) + GPU SM Utilization (Average): a2ap-dgx034:(gpu7:0%) + GPU SM Utilization (Max): a2ap-dgx034:(gpu7:0%) + GPU SM Utilization (Min): a2ap-dgx034:(gpu7:0%) 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Warning: All GPUs have a percentage of 0 utilisation. +GPU application profile: Idle +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + diff --git a/65693.pbs111.OU b/65693.pbs111.OU new file mode 100644 index 0000000..0fae911 --- /dev/null +++ b/65693.pbs111.OU @@ -0,0 +1,64 @@ +/var/spool/pbs/mom_priv/jobs/65693.pbs111.SC: line 10: deactivate: command not found +wandb: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information. +wandb: Currently logged in as: runjiachen (runjiachen-nus). Use `wandb login --relogin` to force relogin +wandb: Tracking run with wandb version 0.18.7 +wandb: Run data is saved locally in /home/users/nus/e1113744/native-sparse-attention-pytorch/wandb/run-20250707_181437-cjdzttf7 +wandb: Run `wandb offline` to turn off syncing. +wandb: Syncing run morning-paper-3 +wandb: ⭐️ View project at https://wandb.ai/runjiachen-nus/native-sparse-attention +wandb: 🚀 View run at https://wandb.ai/runjiachen-nus/native-sparse-attention/runs/cjdzttf7 +wandb: WARNING Calling wandb.run.save without any arguments is deprecated.Changes to attributes are automatically persisted. + training: 0%| | 0/100000 [00:00 + loss = model(data, return_loss = True) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/users/nus/e1113744/llm-foundry/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/users/nus/e1113744/llm-foundry/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl + return forward_call(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/users/nus/e1113744/native-sparse-attention-pytorch/native_sparse_attention_pytorch/transformer.py", line 308, in forward + attn_out, layer_cache = attn( + ^^^^^ + File "/home/users/nus/e1113744/llm-foundry/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/users/nus/e1113744/llm-foundry/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl + return forward_call(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +TypeError: Attention.forward() got an unexpected keyword argument 'cache' + +real 0m14.447s +user 0m12.093s +sys 0m3.989s +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Resource Usage on 2025-07-07 18:14:45.053803: +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + JobId: 65693.pbs111 + Project: 71001002 + Exit Status: 1 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + NCPUs: Requested(14), Used(14) + CPU Time Used: 00:00:16 + Memory: Requested(235gb), Used(1141256kb) + Vmem Used: 521964248kb + Walltime: Requested(12:00:00), Used(00:00:16) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Execution Nodes Used: (a2ap-dgx034:ngpus=1:ncpus=14:mem=246415360kb) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + GPU Duration: 17.99secs + GPU Power Consumed: 56.66W + GPU Max GPU Memory Used: 704.0MB + Memory Throughput Rate (Average): a2ap-dgx034:(gpu7:0%) + Memory Throughput Rate (Max): a2ap-dgx034:(gpu7:0%) + 
Memory Throughput Rate (Min): a2ap-dgx034:(gpu7:0%)
+ GPU SM Utilization (Average): a2ap-dgx034:(gpu7:0%)
+ GPU SM Utilization (Max): a2ap-dgx034:(gpu7:0%)
+ GPU SM Utilization (Min): a2ap-dgx034:(gpu7:0%)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Warning: All GPUs have a percentage of 0 utilisation.
+GPU application profile: Idle
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
diff --git a/native_sparse_attention_pytorch/benchmark.py b/native_sparse_attention_pytorch/benchmark.py
new file mode 100644
index 0000000..bdf38db
--- /dev/null
+++ b/native_sparse_attention_pytorch/benchmark.py
@@ -0,0 +1,109 @@
+import time
+import torch
+import numpy as np
+
+# Import your modules
+from native_sparse_attention import SparseAttention
+from transformer import Attention as StandardAttention
+
+
+def benchmark_module(module, x, runs=50, warmups=5):
+    """
+    Benchmark forward and backward pass of a module.
+    Returns forward and backward times as numpy arrays (in seconds).
+    """
+    # Warm-up to stabilize JIT/CUDA and trigger Triton compilation
+    for _ in range(warmups):
+        out = module(x)
+        loss = out.sum()
+        loss.backward()
+        module.zero_grad()
+
+    # Forward timing
+    fwd_times = []
+    for _ in range(runs):
+        if torch.cuda.is_available():
+            torch.cuda.synchronize()
+        t0 = time.perf_counter()
+        out = module(x)
+        if torch.cuda.is_available():
+            torch.cuda.synchronize()
+        fwd_times.append(time.perf_counter() - t0)
+
+    # Backward timing
+    bwd_times = []
+    for _ in range(runs):
+        out = module(x)
+        if torch.cuda.is_available():
+            torch.cuda.synchronize()
+        t0 = time.perf_counter()
+        out.sum().backward()
+        if torch.cuda.is_available():
+            torch.cuda.synchronize()
+        bwd_times.append(time.perf_counter() - t0)
+        module.zero_grad()
+
+    return np.array(fwd_times), np.array(bwd_times)
+
+
+def main():
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    # Hyperparameters
+    batch_size = 128
+    seq_len = 1024
+    d_model = 512
+    n_heads = 8
+    d_head = d_model // n_heads
+
+    # NSA-specific hyperparameters
+    sliding_window_size = 32
+    compress_block_size = 4
+    compress_block_sliding_stride = 4
+    selection_block_size = 16
+    num_selected_blocks = 4
+
+    # Create input tensor
+    x = torch.randn(batch_size, seq_len, d_model, device=device)
+
+    # Instantiate modules with identical hyperparameters
+    std_attn = StandardAttention(
+        dim=d_model,
+        dim_head=d_head,
+        heads=n_heads,
+        causal=True
+    ).to(device)
+
+    nsa_attn = SparseAttention(
+        d_model,
+        d_head,
+        n_heads,
+        sliding_window_size,
+        compress_block_size,
+        compress_block_sliding_stride,
+        selection_block_size,
+        num_selected_blocks,
+        use_triton_kernel=True
+    ).to(device)
+
+    # Run benchmarks
+    runs = 5000
+    warmups = 500
+    std_fwd, std_bwd = benchmark_module(std_attn, x, runs, warmups)
+    nsa_fwd, nsa_bwd = benchmark_module(nsa_attn, x, runs, warmups)
+
+    # Report results
+    print(f"{'Module':<25}{'Fwd Mean (ms)':>15}{'Fwd Std (ms)':>15}{'Bwd Mean (ms)':>15}{'Bwd Std (ms)':>15}")
+    for name, fwd, bwd in [
+        ("StandardAttention", std_fwd, std_bwd),
+        ("SparseAttention (NSA)", nsa_fwd, nsa_bwd),
+    ]:
+        print(f"{name:<25}{fwd.mean()*1000:>15.3f}{fwd.std()*1000:>15.3f}{bwd.mean()*1000:>15.3f}{bwd.std()*1000:>15.3f}")
+
+    print(nsa_attn.timer)
+
+
+if __name__ == "__main__":
+    main()
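A note on benchmark.py above: its imports (`from native_sparse_attention import SparseAttention`, `from transformer import Attention`) are package-local, so the script is presumably meant to be run from inside the native_sparse_attention_pytorch/ directory. The host-side perf_counter timing brackets every call with torch.cuda.synchronize(), which is correct; a common alternative that keeps the synchronization out of the measured region is CUDA events. A minimal sketch (illustrative only, not part of this diff; assumes a CUDA device and the same `module`/`x` as above):

    import torch

    def forward_times_with_cuda_events(module, x, runs=50):
        # record a start/end event pair around each forward call
        starts = [torch.cuda.Event(enable_timing=True) for _ in range(runs)]
        ends = [torch.cuda.Event(enable_timing=True) for _ in range(runs)]
        for i in range(runs):
            starts[i].record()
            module(x)
            ends[i].record()
        torch.cuda.synchronize()  # wait for all recorded events before reading them
        return [s.elapsed_time(e) / 1000.0 for s, e in zip(starts, ends)]  # seconds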
diff --git a/native_sparse_attention_pytorch/native_sparse_attention.py b/native_sparse_attention_pytorch/native_sparse_attention.py
index 7c95716..b41ab00 100644
--- a/native_sparse_attention_pytorch/native_sparse_attention.py
+++ b/native_sparse_attention_pytorch/native_sparse_attention.py
@@ -19,6 +19,8 @@ from einops import einsum, repeat, rearrange, reduce, pack, unpack
 
 from einops.layers.torch import Rearrange
 
+import time
+
 # b - batch
 # h - heads
 # qh - grouped query heads
@@ -185,6 +187,42 @@ def attend(
 
 # classes
 
+class Timer:
+
+    def __init__(self):
+        self.t1 = 0
+        self.t2 = 0
+        self.t3 = 0
+        self.t4 = 0
+        self.counter = 0
+
+    def add_t1(self, val):
+        self.t1 += val
+        ##self.counter += 1
+
+    def add_t2(self, val):
+        self.t2 += val
+        ##self.counter += 1
+
+    def add_t3(self, val):
+        self.t3 += val
+        ##self.counter += 1
+
+    def add_t4(self, val):
+        self.t4 += val
+        ##self.counter += 1
+
+    def __str__(self):
+        return (
+            f"time spent in t1 is: {self.t1}, "
+            f"time spent in t2 is: {self.t2}, "
+            f"time spent in t3 is: {self.t3}, "
+            f"time spent in t4 is: {self.t4}, "
+            f"counter is: {self.counter}"
+        )
+
+
 class SparseAttention(Module):
     def __init__(
         self,
@@ -205,10 +243,12 @@ def __init__(
         query_heads_share_selected_kv = True, # if set to True, importance score is averaged across query heads to select top-n buckets of kv per kv head - but can be set to False for each query head within a group to look at different sets of kv buckets. will be more memory and compute of course
         compress_mlp: Module | None = None,
         compress_mlp_expand_factor = 1.,
-        strategy_combine_mlp: Module | None = None
+        strategy_combine_mlp: Module | None = None,
     ):
         super().__init__()
 
+        self.timer = Timer()
+
         # attention heads
         # handling gqa if `kv_heads` is set
@@ -247,6 +287,8 @@ def __init__(
 
         # sliding window strategy
 
+
+
         self.sliding_window = LocalAttention(
             dim = dim_head,
             window_size = sliding_window_size,
@@ -553,8 +595,9 @@ def forward(
         disable_triton_kernel = False,
         sliding_window_flex_mask = None,
         fine_selection_flex_mask = None,
-        return_cache = False
+        return_cache = False,
     ):
+        ##start = time.perf_counter()
         is_inferencing = exists(cache)
 
         if is_inferencing:
@@ -618,6 +661,7 @@ def forward(
 
         # 1. coarse attention over compressed
 
+        start = time.perf_counter()
         mem_ck, mem_cv = repeat(self.compress_mem_kv, 'kv ... -> kv b ...', b = batch)
 
         num_mem_compress_kv = mem_ck.shape[-2]
@@ -636,7 +680,9 @@ def forward(
 
         cmask = einx.less('j, i -> i j', ck_seq, cq_seq)
 
+
         compressed_attn_out, csim = attend(cq, ck, cv, mask = cmask, return_sim = True)
+        self.timer.add_t1(time.perf_counter() - start)
 
         # for 2. and 3., will give them relative positions with rotary - compressed needs to be handled separately (even if they already have intra block absolute positions)
@@ -706,6 +752,8 @@ def forward(
             remainder = fine_divisible_seq_len - seq_len
             pad_to_multiple = partial(pad_at_dim, pad = (0, remainder), dim = -2)
 
+        start = time.perf_counter()
+
         if has_selected_kv_for_fine_attn:
 
             # get the top-n kv segments for fine attention
@@ -716,10 +764,11 @@ def forward(
 
             if self.use_triton_kernel and not disable_triton_kernel:
 
-                from native_sparse_attention_pytorch.triton_native_sparse_attention import native_sparse_attend
+                from triton_native_sparse_attention import native_sparse_attend
 
                 fmask = selected_importance_values > 1e-10
 
+
                 fine_attn_out = native_sparse_attend(
                     fq, fk, fv,
                     self.selection_block_size,
@@ -728,6 +777,7 @@ def forward(
                     sel_scale = gates,
                     include_block_causal = self.causal
                 )
+
             elif exists(fine_selection_flex_mask):
                 assert not self.use_diff_topk, 'differential topk is not available for flex attention'
@@ -836,6 +886,8 @@ def forward(
             fine_attn_out = rearrange(fine_attn_out, '(b w) h n d -> b h (w n) d', b = batch)
             fine_attn_out = fine_attn_out[..., :seq_len, :]
 
+        self.timer.add_t2(time.perf_counter() - start)
+
         # 3. overlapping sliding window, this is unsurprising and expected - `s` for sliding
 
         sq = q
@@ -847,7 +899,9 @@ def forward(
         else:
             sk, sv = tuple(repeat(t, 'b h ... -> b (h num_grouped_queries) ...', num_grouped_queries = self.num_grouped_queries) for t in (sk, sv))
 
+        start = time.perf_counter()
         sliding_window_attn_out = self.sliding_window(sq, sk, sv)
+        self.timer.add_t3(time.perf_counter() - start)
 
         # combine strategies
@@ -863,5 +917,7 @@ def forward(
 
         if not return_cache:
             return out
+
+        ##self.timer.add_t4(time.perf_counter() - start)
 
         return out, (cache_kv, cache_compressed_kv)
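A caveat on the timing hunks above: CUDA kernels launch asynchronously, so the bare time.perf_counter() deltas around attend(...), the fine-attention branch, and self.sliding_window(...) mostly capture kernel-launch and Python overhead rather than device time, since only the outer benchmark synchronizes. A minimal sketch of a synchronizing read (an illustrative helper, not part of this diff) that the Timer calls could use:

    import time
    import torch

    def synced_perf_counter():
        # flush pending CUDA work so the host clock reflects completed device time
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        return time.perf_counter()

    # usage sketch inside forward():
    #   start = synced_perf_counter()
    #   compressed_attn_out, csim = attend(cq, ck, cv, mask = cmask, return_sim = True)
    #   self.timer.add_t1(synced_perf_counter() - start)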
diff --git a/native_sparse_attention_pytorch/transformer.py b/native_sparse_attention_pytorch/transformer.py
index cd2a662..94a1b92 100644
--- a/native_sparse_attention_pytorch/transformer.py
+++ b/native_sparse_attention_pytorch/transformer.py
@@ -11,7 +11,7 @@ from rotary_embedding_torch import RotaryEmbedding
 
-from native_sparse_attention_pytorch.native_sparse_attention import (
+from native_sparse_attention import (
     SparseAttention,
     create_compress_mask,
     create_fine_mask,
@@ -62,68 +62,85 @@ def top_k(logits, thres = 0.9):
 
 # attention
 
-class Attention(Module):
+class Attention(nn.Module):
     def __init__(
         self,
         dim,
-        dim_head = 64,
-        heads = 8,
-        causal = True,
-        kv_heads = None
+        dim_head=64,
+        heads=8,
+        causal=True,
+        kv_heads=None
    ):
         super().__init__()
         self.norm = RMSNorm(dim)
 
         self.heads = heads
-        kv_heads = default(kv_heads, heads)
-
-        dim_inner = heads * dim_head
-        dim_kv_inner = kv_heads * dim_head
-
+        kv_heads = kv_heads or heads
         self.kv_heads = kv_heads
         self.causal = causal
 
+        dim_inner = heads * dim_head
+        dim_kv_inner = kv_heads * dim_head
+
         self.rotary_embed = RotaryEmbedding(dim_head)
 
-        self.to_q = nn.Linear(dim, dim_inner, bias = False)
-        self.to_k = nn.Linear(dim, dim_kv_inner, bias = False)
-        self.to_v = nn.Linear(dim, dim_kv_inner, bias = False)
+        self.to_q = nn.Linear(dim, dim_inner, bias=False)
+        self.to_k = nn.Linear(dim, dim_kv_inner, bias=False)
+        self.to_v = nn.Linear(dim, dim_kv_inner, bias=False)
 
-        self.split_heads = Rearrange('b n (h d) -> b h n d', d = dim_head)
-        self.merge_heads = Rearrange('b h n d -> b n (h d)')
+        self.split_heads = lambda x: rearrange(x, 'b n (h d) -> b h n d', d=dim_head)
+        self.merge_heads = lambda x: rearrange(x, 'b h n d -> b n (h d)')
 
-        self.to_out = nn.Linear(dim_inner, dim, bias = False)
+        self.to_out = nn.Linear(dim_inner, dim, bias=False)
 
     def forward(
         self,
-        x
+        x,
+        cache=None,
+        return_cache=False
     ):
-
+        # Layer normalization
         x = self.norm(x)
 
+        # Project to Q, K, V
         q = self.to_q(x)
         k = self.to_k(x)
         v = self.to_v(x)
 
+        # Split into heads
        q, k, v = map(self.split_heads, (q, k, v))
 
-        # relative positions
-
+        # Apply rotary embeddings
         q, k = self.rotary_embed.rotate_queries_with_cached_keys(q, k)
 
-        # naive gqa
+        # Group key and value heads for attention
+        k_grouped = repeat(k, 'b h n d -> b (g h) n d', g=self.heads // self.kv_heads)
+        v_grouped = repeat(v, 'b h n d -> b (g h) n d', g=self.heads // self.kv_heads)
 
-        k, v = tuple(repeat(t, 'b h ... -> b (g h) ...', g = self.heads // self.kv_heads) for t in (k, v))
-
-        # attention branch
+        # Prepend past key/value if provided
+        if cache is not None:
+            past_k, past_v = cache
+            k_grouped = torch.cat([past_k, k_grouped], dim=2)
+            v_grouped = torch.cat([past_v, v_grouped], dim=2)
 
+        # Compute attention (q still has original head count)
         out = F.scaled_dot_product_attention(
-            q, k, v,
-            is_causal = self.causal
+            q,
+            k_grouped,
+            v_grouped,
+            is_causal=self.causal
         )
 
+        # Merge heads and project out
         out = self.merge_heads(out)
+        out = self.to_out(out)
+
+        # Return output and new cache if requested
+        if return_cache:
+            return out, (k_grouped, v_grouped)
+        return out
+
-        return self.to_out(out)
 
 # feedforward
 
@@ -309,7 +326,6 @@ def forward(
             tokens,
             cache = next(iter_cache, None),
             return_cache = True,
-            **attn_kwargs
         )
 
         next_cache.append(layer_cache)
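The transformer.py hunks above are what the failed jobs in 65691.pbs111.OU and 65693.pbs111.OU were missing: their tracebacks end in `TypeError: Attention.forward() got an unexpected keyword argument 'cache'`, and this change adds the `cache`/`return_cache` path to the full-attention baseline. A rough sketch of the new calling convention (illustrative only, not part of this diff; run from inside the package directory like benchmark.py, hyperparameters chosen arbitrarily):

    import torch
    from transformer import Attention

    attn = Attention(dim=512, dim_head=64, heads=8, kv_heads=4)

    # prefill: process the prompt once and keep the grouped key/value cache
    prompt = torch.randn(1, 16, 512)
    out, kv_cache = attn(prompt, return_cache=True)

    # single-token step: pass only the new token plus the cached keys/values
    step = torch.randn(1, 1, 512)
    out_step, kv_cache = attn(step, cache=kv_cache, return_cache=True)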
diff --git a/native_sparse_attention_pytorch/triton_native_sparse_attention.py b/native_sparse_attention_pytorch/triton_native_sparse_attention.py
index 9069683..64f1383 100644
--- a/native_sparse_attention_pytorch/triton_native_sparse_attention.py
+++ b/native_sparse_attention_pytorch/triton_native_sparse_attention.py
@@ -1,5 +1,5 @@
 from __future__ import annotations
-from native_sparse_attention_pytorch.tensor_typing import Float, Int, Bool
+from tensor_typing import Float, Int, Bool
 
 # taken from https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/flash_attn_triton.py
 # with fixes for triton 2.3
@@ -1835,6 +1835,7 @@ def forward(
         block_dk_dv_use_dot,
         return_sliding_window_out
     ):
+        ##print("triton called")
         dtype = fq.dtype
 
         q_heads, kv_heads = fq.shape[1], fk.shape[1]
diff --git a/normal.txt b/normal.txt
new file mode 100644
index 0000000..32a475f
--- /dev/null
+++ b/normal.txt
@@ -0,0 +1,3 @@
+using custom triton kernel
+wandb: 🚀 View run morning-paper-3 at: https://wandb.ai/runjiachen-nus/native-sparse-attention/runs/cjdzttf7
+wandb: Find logs at: wandb/run-20250707_181437-cjdzttf7/logs
diff --git a/normal_run.pbs b/normal_run.pbs
new file mode 100644
index 0000000..0784c06
--- /dev/null
+++ b/normal_run.pbs
@@ -0,0 +1,15 @@
+#!/bin/bash
+#PBS -l select=1:ngpus=1
+#PBS -l walltime=12:00:00
+#PBS -o ./
+#PBS -j oe
+#PBS -N submit
+#PBS -q ic102
+#PBS -P 71001002
+
+deactivate
+cd llm-foundry
+source .venv/bin/activate
+cd ~
+cd native-sparse-attention-pytorch
+time python train.py >> normal.txt
\ No newline at end of file
diff --git a/out_triton.txt b/out_triton.txt
new file mode 100644
index 0000000..3d0cbeb
--- /dev/null
+++ b/out_triton.txt
@@ -0,0 +1,3 @@
+using custom triton kernel
+wandb: 🚀 View run summer-butterfly-2 at: https://wandb.ai/runjiachen-nus/native-sparse-attention/runs/5mxyov4r
+wandb: Find logs at: wandb/run-20250707_181358-5mxyov4r/logs
diff --git a/train.py b/train.py
index 0efdf57..9bee49d 100644
--- a/train.py
+++
b/train.py @@ -31,8 +31,8 @@ HEADS = 8 KV_HEADS = 4 -USE_SPARSE_ATTN = True -USE_TRITON_NSA = True +USE_SPARSE_ATTN = False +USE_TRITON_NSA = False USE_FLEX_FOR_FINE_SELECTION = False # will push flex a bit, won't be efficient as each layer needs sparsity dynmically generated, but may be enough just to compare to full attention before going all-in on triton kernels QUERY_HEADS_SHARE_SELECTION = True # if set to False, each query head can look at a different segment of their corresponding key / value head in GQA @@ -53,7 +53,7 @@ PROJECT_NAME = 'native-sparse-attention' RUN_NAME = 'baseline' if not USE_SPARSE_ATTN else f'sparse-attn: compress size {COMPRESS_BLOCK_SIZE} | fine size {FINE_BLOCK_SIZE} | {NUM_FINE_SELECTED} selected' -WANDB_ONLINE = False # turn this on to pipe experiment to cloud +WANDB_ONLINE = True # turn this on to pipe experiment to cloud # helpers diff --git a/triton_run.pbs b/triton_run.pbs new file mode 100644 index 0000000..a02d3cb --- /dev/null +++ b/triton_run.pbs @@ -0,0 +1,15 @@ +#!/bin/bash +#PBS -l select=1:ngpus=1 +#PBS -l walltime=12:00:00 +#PBS -o ./ +#PBS -j oe +#PBS -N submit +#PBS -q ic102 +#PBS -P 71001002 + +deactivate +cd llm-foundry +source .venv/bin/activate +cd ~ +cd native-sparse-attention-pytorch +time python train.py >> out_triton.txt \ No newline at end of file diff --git a/wandb/latest-run b/wandb/latest-run new file mode 120000 index 0000000..5bd4cc5 --- /dev/null +++ b/wandb/latest-run @@ -0,0 +1 @@ +run-20250708_100933-5h0en8f2 \ No newline at end of file diff --git a/wandb/run-20250707_171826-rb9axy3v/files/requirements.txt b/wandb/run-20250707_171826-rb9axy3v/files/requirements.txt new file mode 100644 index 0000000..dbe4370 --- /dev/null +++ b/wandb/run-20250707_171826-rb9axy3v/files/requirements.txt @@ -0,0 +1,215 @@ +docker-pycreds==0.4.0 +onnxruntime==1.22.0 +google-api-core==2.25.1 +googleapis-common-protos==1.70.0 +opentelemetry-sdk==1.34.1 +zstd==1.5.7.2 +jmespath==1.0.1 +s3transfer==0.13.0 +huggingface-hub==0.33.2 +pytz==2025.2 +xxhash==3.5.0 +certifi==2025.6.15 +attrs==25.3.0 +opentelemetry-api==1.34.1 +validators==0.35.0 +pyasn1==0.6.1 +prompt_toolkit==3.0.51 +sentry-sdk==2.30.0 +python-snappy==0.7.3 +cachetools==5.5.2 +wcwidth==0.2.13 +itsdangerous==2.2.0 +nvidia-cudnn-cu12==9.5.1.17 +tenacity==9.1.2 +nvidia-cusparselt-cu12==0.6.3 +torchmetrics==1.7.1 +pandas==2.3.0 +tabulate==0.9.0 +apache-libcloud==3.8.0 +tzdata==2025.2 +shellingham==1.5.4 +pfzy==0.3.4 +setproctitle==1.3.6 +catalogue==2.0.10 +argcomplete==3.6.2 +nvidia-cusolver-cu12==11.7.1.2 +llm-foundry==0.22.0.dev0 +wheel==0.45.1 +typing-inspection==0.4.1 +GitPython==3.1.44 +isodate==0.7.2 +typer==0.16.0 +annotated-types==0.7.0 +anyio==4.9.0 +iniconfig==2.1.0 +networkx==3.5 +mlflow-skinny==2.21.3 +filelock==3.18.0 +coloredlogs==15.0.1 +databricks-sdk==0.57.0 +cycler==0.12.1 +pynvml==11.5.3 +soupsieve==2.7 +six==1.17.0 +mdurl==0.1.2 +circuitbreaker==2.1.3 +packaging==25.0 +boto3==1.38.42 +websockets==11.0.3 +nvidia-cufile-cu12==1.11.1.6 +sqlparse==0.5.3 +gitdb==4.0.12 +proto-plus==1.26.1 +torchvision==0.22.0 +nvidia-cuda-nvrtc-cu12==12.6.77 +ruamel.yaml.clib==0.2.12 +sniffio==1.3.1 +pycparser==2.22 +questionary==2.1.0 +idna==3.10 +azure-storage-blob==12.25.1 +oci==2.154.3 +pillow==11.2.1 +smmap==5.0.2 +google-cloud-core==2.4.3 +azure-identity==1.23.0 +click==8.2.1 +py-cpuinfo==9.0.0 +mlflow==2.21.3 +jaxtyping==0.3.2 +bcrypt==4.3.0 +torch-optimizer==0.3.0 +safetensors==0.5.3 +Pygments==2.19.2 +pytorch-ranger==0.1.1 +pyparsing==3.2.3 +importlib_metadata==8.7.0 +aiosignal==1.3.2 
+contourpy==1.3.2 +urllib3==2.5.0 +einops==0.8.1 +triton==3.3.0 +PyJWT==2.10.1 +humanfriendly==10.0 +nvidia-curand-cu12==10.3.7.77 +aiohttp==3.12.13 +cffi==1.17.1 +starlette==0.46.2 +Flask==3.1.1 +Brotli==1.1.0 +aiohappyeyeballs==2.6.1 +mosaicml-streaming==0.12.0 +pydantic==2.11.7 +threadpoolctl==3.6.0 +markdown-it-py==3.0.0 +types-python-dateutil==2.9.0.20250516 +nvidia-nccl-cu12==2.26.2 +python-dateutil==2.9.0.post0 +scipy==1.16.0 +greenlet==3.2.3 +wadler_lindig==0.1.7 +zipp==3.23.0 +Jinja2==3.1.6 +dill==0.3.8 +hyper-connections==0.2.1 +rsa==4.9.1 +wandb==0.18.7 +kiwisolver==1.4.8 +protobuf==5.29.5 +propcache==0.3.2 +fsspec==2025.5.1 +pyarrow==19.0.1 +sympy==1.14.0 +sentencepiece==0.2.0 +rotary-embedding-torch==0.8.6 +local-attention==1.11.1 +nvidia-cusparse-cu12==12.5.4.2 +yarl==1.20.1 +graphql-relay==3.2.0 +mpmath==1.3.0 +pytest==8.4.1 +joblib==1.5.1 +tqdm==4.67.1 +frozenlist==1.7.0 +Markdown==3.8.2 +accelerate==1.7.0 +google-resumable-media==2.7.2 +nvidia-nvjitlink-cu12==12.6.85 +numpy==2.1.3 +pip==24.0 +Mako==1.3.10 +multiprocess==0.70.16 +psutil==7.0.0 +datasets==3.6.0 +nvidia-cuda-runtime-cu12==12.6.77 +mosaicml==0.31.0 +nvidia-cublas-cu12==12.6.4.1 +uvicorn==0.34.3 +inquirerpy==0.3.4 +msal==1.32.3 +coolname==2.2.0 +cramjam==2.10.0 +platformdirs==4.3.8 +flatbuffers==25.2.10 +scikit-learn==1.7.0 +cloudpickle==3.1.1 +requests==2.32.4 +h11==0.16.0 +omegaconf==2.3.0 +ruamel.yaml==0.18.14 +pyasn1_modules==0.4.2 +tokenizers==0.21.2 +arrow==1.3.0 +slack_sdk==3.35.0 +gunicorn==23.0.0 +pydantic_core==2.33.2 +google-auth==2.40.3 +charset-normalizer==3.4.2 +azure-storage-file-datalake==12.20.0 +nvidia-cufft-cu12==11.3.0.4 +blinker==1.9.0 +torch==2.7.0 +fastapi==0.115.13 +fonttools==4.58.4 +regex==2024.11.6 +paramiko==3.5.1 +einx==0.3.0 +msal-extensions==1.3.1 +PyNaCl==1.5.0 +docker==7.1.0 +azure-core==1.34.0 +rich==14.0.0 +typing_extensions==4.14.0 +opentelemetry-semantic-conventions==0.55b1 +MarkupSafe==3.0.2 +graphql-core==3.2.6 +termcolor==3.1.0 +nvidia-cuda-cupti-cu12==12.6.80 +flash_attn==2.7.4.post1 +Werkzeug==3.1.3 +onnx==1.18.0 +lightning-utilities==0.14.3 +PyYAML==6.0.2 +setuptools==80.9.0 +cryptography==44.0.3 +antlr4-python3-runtime==4.9.3 +google-crc32c==1.7.1 +botocore==1.38.42 +nvidia-nvtx-cu12==12.6.77 +backoff==2.2.1 +graphene==3.4.3 +mosaicml-cli==0.7.3 +matplotlib==3.10.3 +beautifulsoup4==4.13.4 +transformers==4.51.3 +alembic==1.16.2 +multidict==6.5.0 +google-cloud-storage==2.10.0 +pyOpenSSL==24.3.0 +gql==3.5.3 +hf_transfer==0.1.9 +frozendict==2.4.6 +SQLAlchemy==2.0.41 +pluggy==1.6.0 +hf-xet==1.1.5 diff --git a/wandb/run-20250707_171826-rb9axy3v/files/wandb-metadata.json b/wandb/run-20250707_171826-rb9axy3v/files/wandb-metadata.json new file mode 100644 index 0000000..184e965 --- /dev/null +++ b/wandb/run-20250707_171826-rb9axy3v/files/wandb-metadata.json @@ -0,0 +1,43 @@ +{ + "os": "Linux-5.15.0-1076-nvidia-x86_64-with-glibc2.35", + "python": "3.11.13", + "startedAt": "2025-07-07T09:18:26.333954Z", + "program": "/home/users/nus/e1113744/native-sparse-attention-pytorch/train.py", + "codePath": "train.py", + "git": { + "remote": "git@github.com:RunjiaChen/native-sparse-attention-pytorch.git", + "commit": "8fe8317a961a9e897f67cdc1dc3b94d811ecfe71" + }, + "email": "runjia@u.nus.edu", + "root": "/home/users/nus/e1113744/native-sparse-attention-pytorch", + "host": "a2ap-dgx033", + "username": "e1113744", + "executable": "/home/users/nus/e1113744/llm-foundry/.venv/bin/python", + "codePathLocal": "train.py", + "cpu_count": 112, + "cpu_count_logical": 224, + "gpu": "NVIDIA H100 80GB 
HBM3", + "gpu_count": 1, + "disk": { + "/": { + "total": "1888556142592", + "used": "28870660096" + } + }, + "memory": { + "total": "2164194279424" + }, + "cpu": { + "count": 112, + "countLogical": 224 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + } + ], + "cudaVersion": "12.4" +} \ No newline at end of file diff --git a/wandb/run-20250707_171826-rb9axy3v/run-rb9axy3v.wandb b/wandb/run-20250707_171826-rb9axy3v/run-rb9axy3v.wandb new file mode 100644 index 0000000..c600475 Binary files /dev/null and b/wandb/run-20250707_171826-rb9axy3v/run-rb9axy3v.wandb differ diff --git a/wandb/run-20250707_181358-5mxyov4r/files/config.yaml b/wandb/run-20250707_181358-5mxyov4r/files/config.yaml new file mode 100644 index 0000000..34ed405 --- /dev/null +++ b/wandb/run-20250707_181358-5mxyov4r/files/config.yaml @@ -0,0 +1,24 @@ +_wandb: + value: + cli_version: 0.18.7 + m: [] + python_version: 3.11.13 + t: + "1": + - 1 + - 55 + "2": + - 1 + - 55 + "3": + - 17 + - 23 + - 55 + "4": 3.11.13 + "5": 0.18.7 + "8": + - 5 + "10": + - 3 + "12": 0.18.7 + "13": linux-x86_64 diff --git a/wandb/run-20250707_181358-5mxyov4r/files/wandb-metadata.json b/wandb/run-20250707_181358-5mxyov4r/files/wandb-metadata.json new file mode 100644 index 0000000..ec35301 --- /dev/null +++ b/wandb/run-20250707_181358-5mxyov4r/files/wandb-metadata.json @@ -0,0 +1,43 @@ +{ + "os": "Linux-5.15.0-1076-nvidia-x86_64-with-glibc2.35", + "python": "3.11.13", + "startedAt": "2025-07-07T10:13:58.986561Z", + "program": "/home/users/nus/e1113744/native-sparse-attention-pytorch/train.py", + "codePath": "train.py", + "git": { + "remote": "git@github.com:RunjiaChen/native-sparse-attention-pytorch.git", + "commit": "8fe8317a961a9e897f67cdc1dc3b94d811ecfe71" + }, + "email": "runjia@u.nus.edu", + "root": "/home/users/nus/e1113744/native-sparse-attention-pytorch", + "host": "a2ap-dgx034", + "username": "e1113744", + "executable": "/home/users/nus/e1113744/llm-foundry/.venv/bin/python", + "codePathLocal": "train.py", + "cpu_count": 112, + "cpu_count_logical": 224, + "gpu": "NVIDIA H100 80GB HBM3", + "gpu_count": 1, + "disk": { + "/": { + "total": "1888556142592", + "used": "28806520832" + } + }, + "memory": { + "total": "2164194275328" + }, + "cpu": { + "count": 112, + "countLogical": 224 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + } + ], + "cudaVersion": "12.4" +} \ No newline at end of file diff --git a/wandb/run-20250707_181358-5mxyov4r/files/wandb-summary.json b/wandb/run-20250707_181358-5mxyov4r/files/wandb-summary.json new file mode 100644 index 0000000..4e355fc --- /dev/null +++ b/wandb/run-20250707_181358-5mxyov4r/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":1}} \ No newline at end of file diff --git a/wandb/run-20250707_181358-5mxyov4r/run-5mxyov4r.wandb b/wandb/run-20250707_181358-5mxyov4r/run-5mxyov4r.wandb new file mode 100644 index 0000000..db0aaac Binary files /dev/null and b/wandb/run-20250707_181358-5mxyov4r/run-5mxyov4r.wandb differ diff --git a/wandb/run-20250707_181437-cjdzttf7/files/config.yaml b/wandb/run-20250707_181437-cjdzttf7/files/config.yaml new file mode 100644 index 0000000..34ed405 --- /dev/null +++ b/wandb/run-20250707_181437-cjdzttf7/files/config.yaml @@ -0,0 +1,24 @@ +_wandb: + value: + cli_version: 0.18.7 + m: [] + python_version: 3.11.13 + t: + "1": + - 1 + - 55 + "2": + - 1 + - 55 + "3": + - 17 + - 23 + - 55 + 
"4": 3.11.13 + "5": 0.18.7 + "8": + - 5 + "10": + - 3 + "12": 0.18.7 + "13": linux-x86_64 diff --git a/wandb/run-20250707_181437-cjdzttf7/files/wandb-metadata.json b/wandb/run-20250707_181437-cjdzttf7/files/wandb-metadata.json new file mode 100644 index 0000000..9e64558 --- /dev/null +++ b/wandb/run-20250707_181437-cjdzttf7/files/wandb-metadata.json @@ -0,0 +1,43 @@ +{ + "os": "Linux-5.15.0-1076-nvidia-x86_64-with-glibc2.35", + "python": "3.11.13", + "startedAt": "2025-07-07T10:14:37.136627Z", + "program": "/home/users/nus/e1113744/native-sparse-attention-pytorch/train.py", + "codePath": "train.py", + "git": { + "remote": "git@github.com:RunjiaChen/native-sparse-attention-pytorch.git", + "commit": "8fe8317a961a9e897f67cdc1dc3b94d811ecfe71" + }, + "email": "runjia@u.nus.edu", + "root": "/home/users/nus/e1113744/native-sparse-attention-pytorch", + "host": "a2ap-dgx034", + "username": "e1113744", + "executable": "/home/users/nus/e1113744/llm-foundry/.venv/bin/python", + "codePathLocal": "train.py", + "cpu_count": 112, + "cpu_count_logical": 224, + "gpu": "NVIDIA H100 80GB HBM3", + "gpu_count": 1, + "disk": { + "/": { + "total": "1888556142592", + "used": "28806852608" + } + }, + "memory": { + "total": "2164194275328" + }, + "cpu": { + "count": 112, + "countLogical": 224 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + } + ], + "cudaVersion": "12.4" +} \ No newline at end of file diff --git a/wandb/run-20250707_181437-cjdzttf7/files/wandb-summary.json b/wandb/run-20250707_181437-cjdzttf7/files/wandb-summary.json new file mode 100644 index 0000000..6c37fe1 --- /dev/null +++ b/wandb/run-20250707_181437-cjdzttf7/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":0}} \ No newline at end of file diff --git a/wandb/run-20250707_181437-cjdzttf7/run-cjdzttf7.wandb b/wandb/run-20250707_181437-cjdzttf7/run-cjdzttf7.wandb new file mode 100644 index 0000000..dd2cb1a Binary files /dev/null and b/wandb/run-20250707_181437-cjdzttf7/run-cjdzttf7.wandb differ diff --git a/wandb/run-20250708_084054-hxxnj5bd/files/config.yaml b/wandb/run-20250708_084054-hxxnj5bd/files/config.yaml new file mode 100644 index 0000000..34ed405 --- /dev/null +++ b/wandb/run-20250708_084054-hxxnj5bd/files/config.yaml @@ -0,0 +1,24 @@ +_wandb: + value: + cli_version: 0.18.7 + m: [] + python_version: 3.11.13 + t: + "1": + - 1 + - 55 + "2": + - 1 + - 55 + "3": + - 17 + - 23 + - 55 + "4": 3.11.13 + "5": 0.18.7 + "8": + - 5 + "10": + - 3 + "12": 0.18.7 + "13": linux-x86_64 diff --git a/wandb/run-20250708_084054-hxxnj5bd/files/wandb-metadata.json b/wandb/run-20250708_084054-hxxnj5bd/files/wandb-metadata.json new file mode 100644 index 0000000..39559a3 --- /dev/null +++ b/wandb/run-20250708_084054-hxxnj5bd/files/wandb-metadata.json @@ -0,0 +1,43 @@ +{ + "os": "Linux-5.15.0-1076-nvidia-x86_64-with-glibc2.35", + "python": "3.11.13", + "startedAt": "2025-07-08T00:40:54.087515Z", + "program": "/home/users/nus/e1113744/native-sparse-attention-pytorch/train.py", + "codePath": "train.py", + "git": { + "remote": "git@github.com:RunjiaChen/native-sparse-attention-pytorch.git", + "commit": "8fe8317a961a9e897f67cdc1dc3b94d811ecfe71" + }, + "email": "runjia@u.nus.edu", + "root": "/home/users/nus/e1113744/native-sparse-attention-pytorch", + "host": "a2ap-dgx034", + "username": "e1113744", + "executable": "/home/users/nus/e1113744/llm-foundry/.venv/bin/python", + "codePathLocal": "train.py", + "cpu_count": 112, + "cpu_count_logical": 224, + "gpu": 
"NVIDIA H100 80GB HBM3", + "gpu_count": 1, + "disk": { + "/": { + "total": "1888556142592", + "used": "29075877888" + } + }, + "memory": { + "total": "2164194275328" + }, + "cpu": { + "count": 112, + "countLogical": 224 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + } + ], + "cudaVersion": "12.4" +} \ No newline at end of file diff --git a/wandb/run-20250708_084054-hxxnj5bd/files/wandb-summary.json b/wandb/run-20250708_084054-hxxnj5bd/files/wandb-summary.json new file mode 100644 index 0000000..c437ff1 --- /dev/null +++ b/wandb/run-20250708_084054-hxxnj5bd/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":7}} \ No newline at end of file diff --git a/wandb/run-20250708_084054-hxxnj5bd/run-hxxnj5bd.wandb b/wandb/run-20250708_084054-hxxnj5bd/run-hxxnj5bd.wandb new file mode 100644 index 0000000..6b3370b Binary files /dev/null and b/wandb/run-20250708_084054-hxxnj5bd/run-hxxnj5bd.wandb differ diff --git a/wandb/run-20250708_084403-zrhsz2wi/files/config.yaml b/wandb/run-20250708_084403-zrhsz2wi/files/config.yaml new file mode 100644 index 0000000..ca5f6f6 --- /dev/null +++ b/wandb/run-20250708_084403-zrhsz2wi/files/config.yaml @@ -0,0 +1,25 @@ +_wandb: + value: + cli_version: 0.18.7 + m: [] + python_version: 3.11.13 + t: + "1": + - 1 + - 55 + "2": + - 1 + - 55 + "3": + - 17 + - 23 + - 55 + - 61 + "4": 3.11.13 + "5": 0.18.7 + "8": + - 5 + "10": + - 3 + "12": 0.18.7 + "13": linux-x86_64 diff --git a/wandb/run-20250708_084403-zrhsz2wi/files/wandb-metadata.json b/wandb/run-20250708_084403-zrhsz2wi/files/wandb-metadata.json new file mode 100644 index 0000000..184d77c --- /dev/null +++ b/wandb/run-20250708_084403-zrhsz2wi/files/wandb-metadata.json @@ -0,0 +1,43 @@ +{ + "os": "Linux-5.15.0-1076-nvidia-x86_64-with-glibc2.35", + "python": "3.11.13", + "startedAt": "2025-07-08T00:44:03.665695Z", + "program": "/home/users/nus/e1113744/native-sparse-attention-pytorch/train.py", + "codePath": "train.py", + "git": { + "remote": "git@github.com:RunjiaChen/native-sparse-attention-pytorch.git", + "commit": "8fe8317a961a9e897f67cdc1dc3b94d811ecfe71" + }, + "email": "runjia@u.nus.edu", + "root": "/home/users/nus/e1113744/native-sparse-attention-pytorch", + "host": "a2ap-dgx034", + "username": "e1113744", + "executable": "/home/users/nus/e1113744/llm-foundry/.venv/bin/python", + "codePathLocal": "train.py", + "cpu_count": 112, + "cpu_count_logical": 224, + "gpu": "NVIDIA H100 80GB HBM3", + "gpu_count": 1, + "disk": { + "/": { + "total": "1888556142592", + "used": "29077024768" + } + }, + "memory": { + "total": "2164194275328" + }, + "cpu": { + "count": 112, + "countLogical": 224 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + } + ], + "cudaVersion": "12.4" +} \ No newline at end of file diff --git a/wandb/run-20250708_084403-zrhsz2wi/files/wandb-summary.json b/wandb/run-20250708_084403-zrhsz2wi/files/wandb-summary.json new file mode 100644 index 0000000..95ea6e2 --- /dev/null +++ b/wandb/run-20250708_084403-zrhsz2wi/files/wandb-summary.json @@ -0,0 +1 @@ +{"_timestamp":1.7519359383420506e+09,"valid_loss":1.3284306526184082,"_runtime":494.768991449,"_step":1969,"_wandb":{"runtime":494},"loss":1.6118727922439575} \ No newline at end of file diff --git a/wandb/run-20250708_084403-zrhsz2wi/run-zrhsz2wi.wandb b/wandb/run-20250708_084403-zrhsz2wi/run-zrhsz2wi.wandb new file mode 100644 index 0000000..2173c70 Binary 
files /dev/null and b/wandb/run-20250708_084403-zrhsz2wi/run-zrhsz2wi.wandb differ diff --git a/wandb/run-20250708_085118-aw5dl503/files/config.yaml b/wandb/run-20250708_085118-aw5dl503/files/config.yaml new file mode 100644 index 0000000..34ed405 --- /dev/null +++ b/wandb/run-20250708_085118-aw5dl503/files/config.yaml @@ -0,0 +1,24 @@ +_wandb: + value: + cli_version: 0.18.7 + m: [] + python_version: 3.11.13 + t: + "1": + - 1 + - 55 + "2": + - 1 + - 55 + "3": + - 17 + - 23 + - 55 + "4": 3.11.13 + "5": 0.18.7 + "8": + - 5 + "10": + - 3 + "12": 0.18.7 + "13": linux-x86_64 diff --git a/wandb/run-20250708_085118-aw5dl503/files/wandb-metadata.json b/wandb/run-20250708_085118-aw5dl503/files/wandb-metadata.json new file mode 100644 index 0000000..7ae3f5c --- /dev/null +++ b/wandb/run-20250708_085118-aw5dl503/files/wandb-metadata.json @@ -0,0 +1,43 @@ +{ + "os": "Linux-5.15.0-1076-nvidia-x86_64-with-glibc2.35", + "python": "3.11.13", + "startedAt": "2025-07-08T00:51:18.131816Z", + "program": "/home/users/nus/e1113744/native-sparse-attention-pytorch/train.py", + "codePath": "train.py", + "git": { + "remote": "git@github.com:RunjiaChen/native-sparse-attention-pytorch.git", + "commit": "8fe8317a961a9e897f67cdc1dc3b94d811ecfe71" + }, + "email": "runjia@u.nus.edu", + "root": "/home/users/nus/e1113744/native-sparse-attention-pytorch", + "host": "a2ap-dgx034", + "username": "e1113744", + "executable": "/home/users/nus/e1113744/llm-foundry/.venv/bin/python", + "codePathLocal": "train.py", + "cpu_count": 112, + "cpu_count_logical": 224, + "gpu": "NVIDIA H100 80GB HBM3", + "gpu_count": 1, + "disk": { + "/": { + "total": "1888556142592", + "used": "29079195648" + } + }, + "memory": { + "total": "2164194275328" + }, + "cpu": { + "count": 112, + "countLogical": 224 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + } + ], + "cudaVersion": "12.4" +} \ No newline at end of file diff --git a/wandb/run-20250708_085118-aw5dl503/files/wandb-summary.json b/wandb/run-20250708_085118-aw5dl503/files/wandb-summary.json new file mode 100644 index 0000000..6c37fe1 --- /dev/null +++ b/wandb/run-20250708_085118-aw5dl503/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":0}} \ No newline at end of file diff --git a/wandb/run-20250708_085118-aw5dl503/run-aw5dl503.wandb b/wandb/run-20250708_085118-aw5dl503/run-aw5dl503.wandb new file mode 100644 index 0000000..faabfcb Binary files /dev/null and b/wandb/run-20250708_085118-aw5dl503/run-aw5dl503.wandb differ diff --git a/wandb/run-20250708_085151-792413q0/files/config.yaml b/wandb/run-20250708_085151-792413q0/files/config.yaml new file mode 100644 index 0000000..34ed405 --- /dev/null +++ b/wandb/run-20250708_085151-792413q0/files/config.yaml @@ -0,0 +1,24 @@ +_wandb: + value: + cli_version: 0.18.7 + m: [] + python_version: 3.11.13 + t: + "1": + - 1 + - 55 + "2": + - 1 + - 55 + "3": + - 17 + - 23 + - 55 + "4": 3.11.13 + "5": 0.18.7 + "8": + - 5 + "10": + - 3 + "12": 0.18.7 + "13": linux-x86_64 diff --git a/wandb/run-20250708_085151-792413q0/files/wandb-metadata.json b/wandb/run-20250708_085151-792413q0/files/wandb-metadata.json new file mode 100644 index 0000000..efb4a86 --- /dev/null +++ b/wandb/run-20250708_085151-792413q0/files/wandb-metadata.json @@ -0,0 +1,43 @@ +{ + "os": "Linux-5.15.0-1076-nvidia-x86_64-with-glibc2.35", + "python": "3.11.13", + "startedAt": "2025-07-08T00:51:51.673787Z", + "program": "/home/users/nus/e1113744/native-sparse-attention-pytorch/train.py", + 
"codePath": "train.py", + "git": { + "remote": "git@github.com:RunjiaChen/native-sparse-attention-pytorch.git", + "commit": "8fe8317a961a9e897f67cdc1dc3b94d811ecfe71" + }, + "email": "runjia@u.nus.edu", + "root": "/home/users/nus/e1113744/native-sparse-attention-pytorch", + "host": "a2ap-dgx034", + "username": "e1113744", + "executable": "/home/users/nus/e1113744/llm-foundry/.venv/bin/python", + "codePathLocal": "train.py", + "cpu_count": 112, + "cpu_count_logical": 224, + "gpu": "NVIDIA H100 80GB HBM3", + "gpu_count": 1, + "disk": { + "/": { + "total": "1888556142592", + "used": "29079429120" + } + }, + "memory": { + "total": "2164194275328" + }, + "cpu": { + "count": 112, + "countLogical": 224 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + } + ], + "cudaVersion": "12.4" +} \ No newline at end of file diff --git a/wandb/run-20250708_085151-792413q0/files/wandb-summary.json b/wandb/run-20250708_085151-792413q0/files/wandb-summary.json new file mode 100644 index 0000000..6c37fe1 --- /dev/null +++ b/wandb/run-20250708_085151-792413q0/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":0}} \ No newline at end of file diff --git a/wandb/run-20250708_085151-792413q0/run-792413q0.wandb b/wandb/run-20250708_085151-792413q0/run-792413q0.wandb new file mode 100644 index 0000000..b1eb7d9 Binary files /dev/null and b/wandb/run-20250708_085151-792413q0/run-792413q0.wandb differ diff --git a/wandb/run-20250708_085331-fdnrdel8/files/requirements.txt b/wandb/run-20250708_085331-fdnrdel8/files/requirements.txt new file mode 100644 index 0000000..dbe4370 --- /dev/null +++ b/wandb/run-20250708_085331-fdnrdel8/files/requirements.txt @@ -0,0 +1,215 @@ +docker-pycreds==0.4.0 +onnxruntime==1.22.0 +google-api-core==2.25.1 +googleapis-common-protos==1.70.0 +opentelemetry-sdk==1.34.1 +zstd==1.5.7.2 +jmespath==1.0.1 +s3transfer==0.13.0 +huggingface-hub==0.33.2 +pytz==2025.2 +xxhash==3.5.0 +certifi==2025.6.15 +attrs==25.3.0 +opentelemetry-api==1.34.1 +validators==0.35.0 +pyasn1==0.6.1 +prompt_toolkit==3.0.51 +sentry-sdk==2.30.0 +python-snappy==0.7.3 +cachetools==5.5.2 +wcwidth==0.2.13 +itsdangerous==2.2.0 +nvidia-cudnn-cu12==9.5.1.17 +tenacity==9.1.2 +nvidia-cusparselt-cu12==0.6.3 +torchmetrics==1.7.1 +pandas==2.3.0 +tabulate==0.9.0 +apache-libcloud==3.8.0 +tzdata==2025.2 +shellingham==1.5.4 +pfzy==0.3.4 +setproctitle==1.3.6 +catalogue==2.0.10 +argcomplete==3.6.2 +nvidia-cusolver-cu12==11.7.1.2 +llm-foundry==0.22.0.dev0 +wheel==0.45.1 +typing-inspection==0.4.1 +GitPython==3.1.44 +isodate==0.7.2 +typer==0.16.0 +annotated-types==0.7.0 +anyio==4.9.0 +iniconfig==2.1.0 +networkx==3.5 +mlflow-skinny==2.21.3 +filelock==3.18.0 +coloredlogs==15.0.1 +databricks-sdk==0.57.0 +cycler==0.12.1 +pynvml==11.5.3 +soupsieve==2.7 +six==1.17.0 +mdurl==0.1.2 +circuitbreaker==2.1.3 +packaging==25.0 +boto3==1.38.42 +websockets==11.0.3 +nvidia-cufile-cu12==1.11.1.6 +sqlparse==0.5.3 +gitdb==4.0.12 +proto-plus==1.26.1 +torchvision==0.22.0 +nvidia-cuda-nvrtc-cu12==12.6.77 +ruamel.yaml.clib==0.2.12 +sniffio==1.3.1 +pycparser==2.22 +questionary==2.1.0 +idna==3.10 +azure-storage-blob==12.25.1 +oci==2.154.3 +pillow==11.2.1 +smmap==5.0.2 +google-cloud-core==2.4.3 +azure-identity==1.23.0 +click==8.2.1 +py-cpuinfo==9.0.0 +mlflow==2.21.3 +jaxtyping==0.3.2 +bcrypt==4.3.0 +torch-optimizer==0.3.0 +safetensors==0.5.3 +Pygments==2.19.2 +pytorch-ranger==0.1.1 +pyparsing==3.2.3 +importlib_metadata==8.7.0 +aiosignal==1.3.2 +contourpy==1.3.2 
+urllib3==2.5.0 +einops==0.8.1 +triton==3.3.0 +PyJWT==2.10.1 +humanfriendly==10.0 +nvidia-curand-cu12==10.3.7.77 +aiohttp==3.12.13 +cffi==1.17.1 +starlette==0.46.2 +Flask==3.1.1 +Brotli==1.1.0 +aiohappyeyeballs==2.6.1 +mosaicml-streaming==0.12.0 +pydantic==2.11.7 +threadpoolctl==3.6.0 +markdown-it-py==3.0.0 +types-python-dateutil==2.9.0.20250516 +nvidia-nccl-cu12==2.26.2 +python-dateutil==2.9.0.post0 +scipy==1.16.0 +greenlet==3.2.3 +wadler_lindig==0.1.7 +zipp==3.23.0 +Jinja2==3.1.6 +dill==0.3.8 +hyper-connections==0.2.1 +rsa==4.9.1 +wandb==0.18.7 +kiwisolver==1.4.8 +protobuf==5.29.5 +propcache==0.3.2 +fsspec==2025.5.1 +pyarrow==19.0.1 +sympy==1.14.0 +sentencepiece==0.2.0 +rotary-embedding-torch==0.8.6 +local-attention==1.11.1 +nvidia-cusparse-cu12==12.5.4.2 +yarl==1.20.1 +graphql-relay==3.2.0 +mpmath==1.3.0 +pytest==8.4.1 +joblib==1.5.1 +tqdm==4.67.1 +frozenlist==1.7.0 +Markdown==3.8.2 +accelerate==1.7.0 +google-resumable-media==2.7.2 +nvidia-nvjitlink-cu12==12.6.85 +numpy==2.1.3 +pip==24.0 +Mako==1.3.10 +multiprocess==0.70.16 +psutil==7.0.0 +datasets==3.6.0 +nvidia-cuda-runtime-cu12==12.6.77 +mosaicml==0.31.0 +nvidia-cublas-cu12==12.6.4.1 +uvicorn==0.34.3 +inquirerpy==0.3.4 +msal==1.32.3 +coolname==2.2.0 +cramjam==2.10.0 +platformdirs==4.3.8 +flatbuffers==25.2.10 +scikit-learn==1.7.0 +cloudpickle==3.1.1 +requests==2.32.4 +h11==0.16.0 +omegaconf==2.3.0 +ruamel.yaml==0.18.14 +pyasn1_modules==0.4.2 +tokenizers==0.21.2 +arrow==1.3.0 +slack_sdk==3.35.0 +gunicorn==23.0.0 +pydantic_core==2.33.2 +google-auth==2.40.3 +charset-normalizer==3.4.2 +azure-storage-file-datalake==12.20.0 +nvidia-cufft-cu12==11.3.0.4 +blinker==1.9.0 +torch==2.7.0 +fastapi==0.115.13 +fonttools==4.58.4 +regex==2024.11.6 +paramiko==3.5.1 +einx==0.3.0 +msal-extensions==1.3.1 +PyNaCl==1.5.0 +docker==7.1.0 +azure-core==1.34.0 +rich==14.0.0 +typing_extensions==4.14.0 +opentelemetry-semantic-conventions==0.55b1 +MarkupSafe==3.0.2 +graphql-core==3.2.6 +termcolor==3.1.0 +nvidia-cuda-cupti-cu12==12.6.80 +flash_attn==2.7.4.post1 +Werkzeug==3.1.3 +onnx==1.18.0 +lightning-utilities==0.14.3 +PyYAML==6.0.2 +setuptools==80.9.0 +cryptography==44.0.3 +antlr4-python3-runtime==4.9.3 +google-crc32c==1.7.1 +botocore==1.38.42 +nvidia-nvtx-cu12==12.6.77 +backoff==2.2.1 +graphene==3.4.3 +mosaicml-cli==0.7.3 +matplotlib==3.10.3 +beautifulsoup4==4.13.4 +transformers==4.51.3 +alembic==1.16.2 +multidict==6.5.0 +google-cloud-storage==2.10.0 +pyOpenSSL==24.3.0 +gql==3.5.3 +hf_transfer==0.1.9 +frozendict==2.4.6 +SQLAlchemy==2.0.41 +pluggy==1.6.0 +hf-xet==1.1.5 diff --git a/wandb/run-20250708_085331-fdnrdel8/files/wandb-metadata.json b/wandb/run-20250708_085331-fdnrdel8/files/wandb-metadata.json new file mode 100644 index 0000000..de789f7 --- /dev/null +++ b/wandb/run-20250708_085331-fdnrdel8/files/wandb-metadata.json @@ -0,0 +1,43 @@ +{ + "os": "Linux-5.15.0-1076-nvidia-x86_64-with-glibc2.35", + "python": "3.11.13", + "startedAt": "2025-07-08T00:53:31.228466Z", + "program": "/home/users/nus/e1113744/native-sparse-attention-pytorch/train.py", + "codePath": "train.py", + "git": { + "remote": "git@github.com:RunjiaChen/native-sparse-attention-pytorch.git", + "commit": "8fe8317a961a9e897f67cdc1dc3b94d811ecfe71" + }, + "email": "runjia@u.nus.edu", + "root": "/home/users/nus/e1113744/native-sparse-attention-pytorch", + "host": "a2ap-dgx034", + "username": "e1113744", + "executable": "/home/users/nus/e1113744/llm-foundry/.venv/bin/python", + "codePathLocal": "train.py", + "cpu_count": 112, + "cpu_count_logical": 224, + "gpu": "NVIDIA H100 80GB HBM3", + 
"gpu_count": 1, + "disk": { + "/": { + "total": "1888556142592", + "used": "29080027136" + } + }, + "memory": { + "total": "2164194275328" + }, + "cpu": { + "count": 112, + "countLogical": 224 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + } + ], + "cudaVersion": "12.4" +} \ No newline at end of file diff --git a/wandb/run-20250708_085331-fdnrdel8/run-fdnrdel8.wandb b/wandb/run-20250708_085331-fdnrdel8/run-fdnrdel8.wandb new file mode 100644 index 0000000..e18f2b4 Binary files /dev/null and b/wandb/run-20250708_085331-fdnrdel8/run-fdnrdel8.wandb differ diff --git a/wandb/run-20250708_085421-g296eupq/files/config.yaml b/wandb/run-20250708_085421-g296eupq/files/config.yaml new file mode 100644 index 0000000..34ed405 --- /dev/null +++ b/wandb/run-20250708_085421-g296eupq/files/config.yaml @@ -0,0 +1,24 @@ +_wandb: + value: + cli_version: 0.18.7 + m: [] + python_version: 3.11.13 + t: + "1": + - 1 + - 55 + "2": + - 1 + - 55 + "3": + - 17 + - 23 + - 55 + "4": 3.11.13 + "5": 0.18.7 + "8": + - 5 + "10": + - 3 + "12": 0.18.7 + "13": linux-x86_64 diff --git a/wandb/run-20250708_085421-g296eupq/files/wandb-metadata.json b/wandb/run-20250708_085421-g296eupq/files/wandb-metadata.json new file mode 100644 index 0000000..3937348 --- /dev/null +++ b/wandb/run-20250708_085421-g296eupq/files/wandb-metadata.json @@ -0,0 +1,43 @@ +{ + "os": "Linux-5.15.0-1076-nvidia-x86_64-with-glibc2.35", + "python": "3.11.13", + "startedAt": "2025-07-08T00:54:21.844685Z", + "program": "/home/users/nus/e1113744/native-sparse-attention-pytorch/train.py", + "codePath": "train.py", + "git": { + "remote": "git@github.com:RunjiaChen/native-sparse-attention-pytorch.git", + "commit": "8fe8317a961a9e897f67cdc1dc3b94d811ecfe71" + }, + "email": "runjia@u.nus.edu", + "root": "/home/users/nus/e1113744/native-sparse-attention-pytorch", + "host": "a2ap-dgx034", + "username": "e1113744", + "executable": "/home/users/nus/e1113744/llm-foundry/.venv/bin/python", + "codePathLocal": "train.py", + "cpu_count": 112, + "cpu_count_logical": 224, + "gpu": "NVIDIA H100 80GB HBM3", + "gpu_count": 1, + "disk": { + "/": { + "total": "1888556142592", + "used": "29080342528" + } + }, + "memory": { + "total": "2164194275328" + }, + "cpu": { + "count": 112, + "countLogical": 224 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + } + ], + "cudaVersion": "12.4" +} \ No newline at end of file diff --git a/wandb/run-20250708_085421-g296eupq/files/wandb-summary.json b/wandb/run-20250708_085421-g296eupq/files/wandb-summary.json new file mode 100644 index 0000000..6c37fe1 --- /dev/null +++ b/wandb/run-20250708_085421-g296eupq/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":0}} \ No newline at end of file diff --git a/wandb/run-20250708_085421-g296eupq/run-g296eupq.wandb b/wandb/run-20250708_085421-g296eupq/run-g296eupq.wandb new file mode 100644 index 0000000..7634122 Binary files /dev/null and b/wandb/run-20250708_085421-g296eupq/run-g296eupq.wandb differ diff --git a/wandb/run-20250708_091756-wmmb56r6/files/config.yaml b/wandb/run-20250708_091756-wmmb56r6/files/config.yaml new file mode 100644 index 0000000..34ed405 --- /dev/null +++ b/wandb/run-20250708_091756-wmmb56r6/files/config.yaml @@ -0,0 +1,24 @@ +_wandb: + value: + cli_version: 0.18.7 + m: [] + python_version: 3.11.13 + t: + "1": + - 1 + - 55 + "2": + - 1 + - 55 + "3": + - 17 + - 23 + - 55 + "4": 
3.11.13 + "5": 0.18.7 + "8": + - 5 + "10": + - 3 + "12": 0.18.7 + "13": linux-x86_64 diff --git a/wandb/run-20250708_091756-wmmb56r6/files/wandb-metadata.json b/wandb/run-20250708_091756-wmmb56r6/files/wandb-metadata.json new file mode 100644 index 0000000..e1d4879 --- /dev/null +++ b/wandb/run-20250708_091756-wmmb56r6/files/wandb-metadata.json @@ -0,0 +1,43 @@ +{ + "os": "Linux-5.15.0-1076-nvidia-x86_64-with-glibc2.35", + "python": "3.11.13", + "startedAt": "2025-07-08T01:17:56.452867Z", + "program": "/home/users/nus/e1113744/native-sparse-attention-pytorch/train.py", + "codePath": "train.py", + "git": { + "remote": "git@github.com:RunjiaChen/native-sparse-attention-pytorch.git", + "commit": "8fe8317a961a9e897f67cdc1dc3b94d811ecfe71" + }, + "email": "runjia@u.nus.edu", + "root": "/home/users/nus/e1113744/native-sparse-attention-pytorch", + "host": "a2ap-dgx034", + "username": "e1113744", + "executable": "/home/users/nus/e1113744/llm-foundry/.venv/bin/python", + "codePathLocal": "train.py", + "cpu_count": 112, + "cpu_count_logical": 224, + "gpu": "NVIDIA H100 80GB HBM3", + "gpu_count": 1, + "disk": { + "/": { + "total": "1888556142592", + "used": "29087944704" + } + }, + "memory": { + "total": "2164194275328" + }, + "cpu": { + "count": 112, + "countLogical": 224 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + } + ], + "cudaVersion": "12.4" +} \ No newline at end of file diff --git a/wandb/run-20250708_091756-wmmb56r6/files/wandb-summary.json b/wandb/run-20250708_091756-wmmb56r6/files/wandb-summary.json new file mode 100644 index 0000000..4e355fc --- /dev/null +++ b/wandb/run-20250708_091756-wmmb56r6/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":1}} \ No newline at end of file diff --git a/wandb/run-20250708_091756-wmmb56r6/run-wmmb56r6.wandb b/wandb/run-20250708_091756-wmmb56r6/run-wmmb56r6.wandb new file mode 100644 index 0000000..7dc9b1b Binary files /dev/null and b/wandb/run-20250708_091756-wmmb56r6/run-wmmb56r6.wandb differ diff --git a/wandb/run-20250708_091817-s4wd9h25/files/config.yaml b/wandb/run-20250708_091817-s4wd9h25/files/config.yaml new file mode 100644 index 0000000..34ed405 --- /dev/null +++ b/wandb/run-20250708_091817-s4wd9h25/files/config.yaml @@ -0,0 +1,24 @@ +_wandb: + value: + cli_version: 0.18.7 + m: [] + python_version: 3.11.13 + t: + "1": + - 1 + - 55 + "2": + - 1 + - 55 + "3": + - 17 + - 23 + - 55 + "4": 3.11.13 + "5": 0.18.7 + "8": + - 5 + "10": + - 3 + "12": 0.18.7 + "13": linux-x86_64 diff --git a/wandb/run-20250708_091817-s4wd9h25/files/wandb-metadata.json b/wandb/run-20250708_091817-s4wd9h25/files/wandb-metadata.json new file mode 100644 index 0000000..6166e43 --- /dev/null +++ b/wandb/run-20250708_091817-s4wd9h25/files/wandb-metadata.json @@ -0,0 +1,43 @@ +{ + "os": "Linux-5.15.0-1076-nvidia-x86_64-with-glibc2.35", + "python": "3.11.13", + "startedAt": "2025-07-08T01:18:17.113494Z", + "program": "/home/users/nus/e1113744/native-sparse-attention-pytorch/train.py", + "codePath": "train.py", + "git": { + "remote": "git@github.com:RunjiaChen/native-sparse-attention-pytorch.git", + "commit": "8fe8317a961a9e897f67cdc1dc3b94d811ecfe71" + }, + "email": "runjia@u.nus.edu", + "root": "/home/users/nus/e1113744/native-sparse-attention-pytorch", + "host": "a2ap-dgx034", + "username": "e1113744", + "executable": "/home/users/nus/e1113744/llm-foundry/.venv/bin/python", + "codePathLocal": "train.py", + "cpu_count": 112, + "cpu_count_logical": 224, + "gpu": "NVIDIA 
H100 80GB HBM3", + "gpu_count": 1, + "disk": { + "/": { + "total": "1888556142592", + "used": "29088096256" + } + }, + "memory": { + "total": "2164194275328" + }, + "cpu": { + "count": 112, + "countLogical": 224 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + } + ], + "cudaVersion": "12.4" +} \ No newline at end of file diff --git a/wandb/run-20250708_091817-s4wd9h25/files/wandb-summary.json b/wandb/run-20250708_091817-s4wd9h25/files/wandb-summary.json new file mode 100644 index 0000000..4e355fc --- /dev/null +++ b/wandb/run-20250708_091817-s4wd9h25/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":1}} \ No newline at end of file diff --git a/wandb/run-20250708_091817-s4wd9h25/run-s4wd9h25.wandb b/wandb/run-20250708_091817-s4wd9h25/run-s4wd9h25.wandb new file mode 100644 index 0000000..aab06a9 Binary files /dev/null and b/wandb/run-20250708_091817-s4wd9h25/run-s4wd9h25.wandb differ diff --git a/wandb/run-20250708_092024-mxbeuycs/files/config.yaml b/wandb/run-20250708_092024-mxbeuycs/files/config.yaml new file mode 100644 index 0000000..34ed405 --- /dev/null +++ b/wandb/run-20250708_092024-mxbeuycs/files/config.yaml @@ -0,0 +1,24 @@ +_wandb: + value: + cli_version: 0.18.7 + m: [] + python_version: 3.11.13 + t: + "1": + - 1 + - 55 + "2": + - 1 + - 55 + "3": + - 17 + - 23 + - 55 + "4": 3.11.13 + "5": 0.18.7 + "8": + - 5 + "10": + - 3 + "12": 0.18.7 + "13": linux-x86_64 diff --git a/wandb/run-20250708_092024-mxbeuycs/files/wandb-metadata.json b/wandb/run-20250708_092024-mxbeuycs/files/wandb-metadata.json new file mode 100644 index 0000000..170b0f2 --- /dev/null +++ b/wandb/run-20250708_092024-mxbeuycs/files/wandb-metadata.json @@ -0,0 +1,43 @@ +{ + "os": "Linux-5.15.0-1076-nvidia-x86_64-with-glibc2.35", + "python": "3.11.13", + "startedAt": "2025-07-08T01:20:24.559917Z", + "program": "/home/users/nus/e1113744/native-sparse-attention-pytorch/train.py", + "codePath": "train.py", + "git": { + "remote": "git@github.com:RunjiaChen/native-sparse-attention-pytorch.git", + "commit": "8fe8317a961a9e897f67cdc1dc3b94d811ecfe71" + }, + "email": "runjia@u.nus.edu", + "root": "/home/users/nus/e1113744/native-sparse-attention-pytorch", + "host": "a2ap-dgx034", + "username": "e1113744", + "executable": "/home/users/nus/e1113744/llm-foundry/.venv/bin/python", + "codePathLocal": "train.py", + "cpu_count": 112, + "cpu_count_logical": 224, + "gpu": "NVIDIA H100 80GB HBM3", + "gpu_count": 1, + "disk": { + "/": { + "total": "1888556142592", + "used": "29088919552" + } + }, + "memory": { + "total": "2164194275328" + }, + "cpu": { + "count": 112, + "countLogical": 224 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + } + ], + "cudaVersion": "12.4" +} \ No newline at end of file diff --git a/wandb/run-20250708_092024-mxbeuycs/files/wandb-summary.json b/wandb/run-20250708_092024-mxbeuycs/files/wandb-summary.json new file mode 100644 index 0000000..4e355fc --- /dev/null +++ b/wandb/run-20250708_092024-mxbeuycs/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":1}} \ No newline at end of file diff --git a/wandb/run-20250708_092024-mxbeuycs/run-mxbeuycs.wandb b/wandb/run-20250708_092024-mxbeuycs/run-mxbeuycs.wandb new file mode 100644 index 0000000..5f2a215 Binary files /dev/null and b/wandb/run-20250708_092024-mxbeuycs/run-mxbeuycs.wandb differ diff --git a/wandb/run-20250708_092048-k4evr4pi/files/config.yaml 
b/wandb/run-20250708_092048-k4evr4pi/files/config.yaml new file mode 100644 index 0000000..34ed405 --- /dev/null +++ b/wandb/run-20250708_092048-k4evr4pi/files/config.yaml @@ -0,0 +1,24 @@ +_wandb: + value: + cli_version: 0.18.7 + m: [] + python_version: 3.11.13 + t: + "1": + - 1 + - 55 + "2": + - 1 + - 55 + "3": + - 17 + - 23 + - 55 + "4": 3.11.13 + "5": 0.18.7 + "8": + - 5 + "10": + - 3 + "12": 0.18.7 + "13": linux-x86_64 diff --git a/wandb/run-20250708_092048-k4evr4pi/files/wandb-metadata.json b/wandb/run-20250708_092048-k4evr4pi/files/wandb-metadata.json new file mode 100644 index 0000000..6b90c94 --- /dev/null +++ b/wandb/run-20250708_092048-k4evr4pi/files/wandb-metadata.json @@ -0,0 +1,43 @@ +{ + "os": "Linux-5.15.0-1076-nvidia-x86_64-with-glibc2.35", + "python": "3.11.13", + "startedAt": "2025-07-08T01:20:48.673820Z", + "program": "/home/users/nus/e1113744/native-sparse-attention-pytorch/train.py", + "codePath": "train.py", + "git": { + "remote": "git@github.com:RunjiaChen/native-sparse-attention-pytorch.git", + "commit": "8fe8317a961a9e897f67cdc1dc3b94d811ecfe71" + }, + "email": "runjia@u.nus.edu", + "root": "/home/users/nus/e1113744/native-sparse-attention-pytorch", + "host": "a2ap-dgx034", + "username": "e1113744", + "executable": "/home/users/nus/e1113744/llm-foundry/.venv/bin/python", + "codePathLocal": "train.py", + "cpu_count": 112, + "cpu_count_logical": 224, + "gpu": "NVIDIA H100 80GB HBM3", + "gpu_count": 1, + "disk": { + "/": { + "total": "1888556142592", + "used": "29089042432" + } + }, + "memory": { + "total": "2164194275328" + }, + "cpu": { + "count": 112, + "countLogical": 224 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + } + ], + "cudaVersion": "12.4" +} \ No newline at end of file diff --git a/wandb/run-20250708_092048-k4evr4pi/files/wandb-summary.json b/wandb/run-20250708_092048-k4evr4pi/files/wandb-summary.json new file mode 100644 index 0000000..1d52051 --- /dev/null +++ b/wandb/run-20250708_092048-k4evr4pi/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":2}} \ No newline at end of file diff --git a/wandb/run-20250708_092048-k4evr4pi/run-k4evr4pi.wandb b/wandb/run-20250708_092048-k4evr4pi/run-k4evr4pi.wandb new file mode 100644 index 0000000..21c36b6 Binary files /dev/null and b/wandb/run-20250708_092048-k4evr4pi/run-k4evr4pi.wandb differ diff --git a/wandb/run-20250708_092146-lw8l9lnj/files/config.yaml b/wandb/run-20250708_092146-lw8l9lnj/files/config.yaml new file mode 100644 index 0000000..ca5f6f6 --- /dev/null +++ b/wandb/run-20250708_092146-lw8l9lnj/files/config.yaml @@ -0,0 +1,25 @@ +_wandb: + value: + cli_version: 0.18.7 + m: [] + python_version: 3.11.13 + t: + "1": + - 1 + - 55 + "2": + - 1 + - 55 + "3": + - 17 + - 23 + - 55 + - 61 + "4": 3.11.13 + "5": 0.18.7 + "8": + - 5 + "10": + - 3 + "12": 0.18.7 + "13": linux-x86_64 diff --git a/wandb/run-20250708_092146-lw8l9lnj/files/wandb-metadata.json b/wandb/run-20250708_092146-lw8l9lnj/files/wandb-metadata.json new file mode 100644 index 0000000..c767ccf --- /dev/null +++ b/wandb/run-20250708_092146-lw8l9lnj/files/wandb-metadata.json @@ -0,0 +1,43 @@ +{ + "os": "Linux-5.15.0-1076-nvidia-x86_64-with-glibc2.35", + "python": "3.11.13", + "startedAt": "2025-07-08T01:21:46.340269Z", + "program": "/home/users/nus/e1113744/native-sparse-attention-pytorch/train.py", + "codePath": "train.py", + "git": { + "remote": "git@github.com:RunjiaChen/native-sparse-attention-pytorch.git", + "commit": 
"8fe8317a961a9e897f67cdc1dc3b94d811ecfe71" + }, + "email": "runjia@u.nus.edu", + "root": "/home/users/nus/e1113744/native-sparse-attention-pytorch", + "host": "a2ap-dgx034", + "username": "e1113744", + "executable": "/home/users/nus/e1113744/llm-foundry/.venv/bin/python", + "codePathLocal": "train.py", + "cpu_count": 112, + "cpu_count_logical": 224, + "gpu": "NVIDIA H100 80GB HBM3", + "gpu_count": 1, + "disk": { + "/": { + "total": "1888556142592", + "used": "29089415168" + } + }, + "memory": { + "total": "2164194275328" + }, + "cpu": { + "count": 112, + "countLogical": 224 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + } + ], + "cudaVersion": "12.4" +} \ No newline at end of file diff --git a/wandb/run-20250708_092146-lw8l9lnj/files/wandb-summary.json b/wandb/run-20250708_092146-lw8l9lnj/files/wandb-summary.json new file mode 100644 index 0000000..31a72e0 --- /dev/null +++ b/wandb/run-20250708_092146-lw8l9lnj/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":117},"_step":1883,"loss":1.665515422821045,"_timestamp":1.7519378234756608e+09,"valid_loss":1.703580379486084,"_runtime":117.163514931} \ No newline at end of file diff --git a/wandb/run-20250708_092146-lw8l9lnj/run-lw8l9lnj.wandb b/wandb/run-20250708_092146-lw8l9lnj/run-lw8l9lnj.wandb new file mode 100644 index 0000000..9d37459 Binary files /dev/null and b/wandb/run-20250708_092146-lw8l9lnj/run-lw8l9lnj.wandb differ diff --git a/wandb/run-20250708_092417-72fudjdt/files/config.yaml b/wandb/run-20250708_092417-72fudjdt/files/config.yaml new file mode 100644 index 0000000..ca5f6f6 --- /dev/null +++ b/wandb/run-20250708_092417-72fudjdt/files/config.yaml @@ -0,0 +1,25 @@ +_wandb: + value: + cli_version: 0.18.7 + m: [] + python_version: 3.11.13 + t: + "1": + - 1 + - 55 + "2": + - 1 + - 55 + "3": + - 17 + - 23 + - 55 + - 61 + "4": 3.11.13 + "5": 0.18.7 + "8": + - 5 + "10": + - 3 + "12": 0.18.7 + "13": linux-x86_64 diff --git a/wandb/run-20250708_092417-72fudjdt/files/wandb-metadata.json b/wandb/run-20250708_092417-72fudjdt/files/wandb-metadata.json new file mode 100644 index 0000000..cdc0cfc --- /dev/null +++ b/wandb/run-20250708_092417-72fudjdt/files/wandb-metadata.json @@ -0,0 +1,43 @@ +{ + "os": "Linux-5.15.0-1076-nvidia-x86_64-with-glibc2.35", + "python": "3.11.13", + "startedAt": "2025-07-08T01:24:17.901516Z", + "program": "/home/users/nus/e1113744/native-sparse-attention-pytorch/train.py", + "codePath": "train.py", + "git": { + "remote": "git@github.com:RunjiaChen/native-sparse-attention-pytorch.git", + "commit": "8fe8317a961a9e897f67cdc1dc3b94d811ecfe71" + }, + "email": "runjia@u.nus.edu", + "root": "/home/users/nus/e1113744/native-sparse-attention-pytorch", + "host": "a2ap-dgx034", + "username": "e1113744", + "executable": "/home/users/nus/e1113744/llm-foundry/.venv/bin/python", + "codePathLocal": "train.py", + "cpu_count": 112, + "cpu_count_logical": 224, + "gpu": "NVIDIA H100 80GB HBM3", + "gpu_count": 1, + "disk": { + "/": { + "total": "1888556142592", + "used": "29098737664" + } + }, + "memory": { + "total": "2164194275328" + }, + "cpu": { + "count": 112, + "countLogical": 224 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + } + ], + "cudaVersion": "12.4" +} \ No newline at end of file diff --git a/wandb/run-20250708_092417-72fudjdt/files/wandb-summary.json b/wandb/run-20250708_092417-72fudjdt/files/wandb-summary.json new file 
mode 100644 index 0000000..35319c5 --- /dev/null +++ b/wandb/run-20250708_092417-72fudjdt/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":9184},"valid_loss":1.009443759918213,"_runtime":9184.695781432,"_step":99999,"loss":1.03251051902771,"_timestamp":1.751947042589338e+09} \ No newline at end of file diff --git a/wandb/run-20250708_092417-72fudjdt/run-72fudjdt.wandb b/wandb/run-20250708_092417-72fudjdt/run-72fudjdt.wandb new file mode 100644 index 0000000..b13b6fa Binary files /dev/null and b/wandb/run-20250708_092417-72fudjdt/run-72fudjdt.wandb differ diff --git a/wandb/run-20250708_100655-xbr2eet2/files/config.yaml b/wandb/run-20250708_100655-xbr2eet2/files/config.yaml new file mode 100644 index 0000000..34ed405 --- /dev/null +++ b/wandb/run-20250708_100655-xbr2eet2/files/config.yaml @@ -0,0 +1,24 @@ +_wandb: + value: + cli_version: 0.18.7 + m: [] + python_version: 3.11.13 + t: + "1": + - 1 + - 55 + "2": + - 1 + - 55 + "3": + - 17 + - 23 + - 55 + "4": 3.11.13 + "5": 0.18.7 + "8": + - 5 + "10": + - 3 + "12": 0.18.7 + "13": linux-x86_64 diff --git a/wandb/run-20250708_100655-xbr2eet2/files/wandb-metadata.json b/wandb/run-20250708_100655-xbr2eet2/files/wandb-metadata.json new file mode 100644 index 0000000..d68610c --- /dev/null +++ b/wandb/run-20250708_100655-xbr2eet2/files/wandb-metadata.json @@ -0,0 +1,43 @@ +{ + "os": "Linux-5.15.0-1076-nvidia-x86_64-with-glibc2.35", + "python": "3.11.13", + "startedAt": "2025-07-08T02:06:55.561010Z", + "program": "/home/users/nus/e1113744/native-sparse-attention-pytorch/train.py", + "codePath": "train.py", + "git": { + "remote": "git@github.com:RunjiaChen/native-sparse-attention-pytorch.git", + "commit": "8fe8317a961a9e897f67cdc1dc3b94d811ecfe71" + }, + "email": "runjia@u.nus.edu", + "root": "/home/users/nus/e1113744/native-sparse-attention-pytorch", + "host": "a2ap-dgx034", + "username": "e1113744", + "executable": "/home/users/nus/e1113744/llm-foundry/.venv/bin/python", + "codePathLocal": "train.py", + "cpu_count": 112, + "cpu_count_logical": 224, + "gpu": "NVIDIA H100 80GB HBM3", + "gpu_count": 1, + "disk": { + "/": { + "total": "1888556142592", + "used": "29111660544" + } + }, + "memory": { + "total": "2164194275328" + }, + "cpu": { + "count": 112, + "countLogical": 224 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + } + ], + "cudaVersion": "12.4" +} \ No newline at end of file diff --git a/wandb/run-20250708_100655-xbr2eet2/files/wandb-summary.json b/wandb/run-20250708_100655-xbr2eet2/files/wandb-summary.json new file mode 100644 index 0000000..6c37fe1 --- /dev/null +++ b/wandb/run-20250708_100655-xbr2eet2/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":0}} \ No newline at end of file diff --git a/wandb/run-20250708_100655-xbr2eet2/run-xbr2eet2.wandb b/wandb/run-20250708_100655-xbr2eet2/run-xbr2eet2.wandb new file mode 100644 index 0000000..7cfcf4c Binary files /dev/null and b/wandb/run-20250708_100655-xbr2eet2/run-xbr2eet2.wandb differ diff --git a/wandb/run-20250708_100743-bp26doib/files/config.yaml b/wandb/run-20250708_100743-bp26doib/files/config.yaml new file mode 100644 index 0000000..ca5f6f6 --- /dev/null +++ b/wandb/run-20250708_100743-bp26doib/files/config.yaml @@ -0,0 +1,25 @@ +_wandb: + value: + cli_version: 0.18.7 + m: [] + python_version: 3.11.13 + t: + "1": + - 1 + - 55 + "2": + - 1 + - 55 + "3": + - 17 + - 23 + - 55 + - 61 + "4": 3.11.13 + "5": 0.18.7 + "8": + - 5 + "10": + - 3 + "12": 0.18.7 + "13": 
linux-x86_64 diff --git a/wandb/run-20250708_100743-bp26doib/files/wandb-metadata.json b/wandb/run-20250708_100743-bp26doib/files/wandb-metadata.json new file mode 100644 index 0000000..06b7260 --- /dev/null +++ b/wandb/run-20250708_100743-bp26doib/files/wandb-metadata.json @@ -0,0 +1,43 @@ +{ + "os": "Linux-5.15.0-1076-nvidia-x86_64-with-glibc2.35", + "python": "3.11.13", + "startedAt": "2025-07-08T02:07:43.207774Z", + "program": "/home/users/nus/e1113744/native-sparse-attention-pytorch/train.py", + "codePath": "train.py", + "git": { + "remote": "git@github.com:RunjiaChen/native-sparse-attention-pytorch.git", + "commit": "8fe8317a961a9e897f67cdc1dc3b94d811ecfe71" + }, + "email": "runjia@u.nus.edu", + "root": "/home/users/nus/e1113744/native-sparse-attention-pytorch", + "host": "a2ap-dgx034", + "username": "e1113744", + "executable": "/home/users/nus/e1113744/llm-foundry/.venv/bin/python", + "codePathLocal": "train.py", + "cpu_count": 112, + "cpu_count_logical": 224, + "gpu": "NVIDIA H100 80GB HBM3", + "gpu_count": 1, + "disk": { + "/": { + "total": "1888556142592", + "used": "29111173120" + } + }, + "memory": { + "total": "2164194275328" + }, + "cpu": { + "count": 112, + "countLogical": 224 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + } + ], + "cudaVersion": "12.4" +} \ No newline at end of file diff --git a/wandb/run-20250708_100743-bp26doib/files/wandb-summary.json b/wandb/run-20250708_100743-bp26doib/files/wandb-summary.json new file mode 100644 index 0000000..a55553d --- /dev/null +++ b/wandb/run-20250708_100743-bp26doib/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":2},"_runtime":2.200631544,"_step":0,"loss":5.773008346557617,"_timestamp":1.7519404648534203e+09,"valid_loss":5.134904861450195} \ No newline at end of file diff --git a/wandb/run-20250708_100743-bp26doib/run-bp26doib.wandb b/wandb/run-20250708_100743-bp26doib/run-bp26doib.wandb new file mode 100644 index 0000000..e5ef9f2 Binary files /dev/null and b/wandb/run-20250708_100743-bp26doib/run-bp26doib.wandb differ diff --git a/wandb/run-20250708_100933-5h0en8f2/files/config.yaml b/wandb/run-20250708_100933-5h0en8f2/files/config.yaml new file mode 100644 index 0000000..ca5f6f6 --- /dev/null +++ b/wandb/run-20250708_100933-5h0en8f2/files/config.yaml @@ -0,0 +1,25 @@ +_wandb: + value: + cli_version: 0.18.7 + m: [] + python_version: 3.11.13 + t: + "1": + - 1 + - 55 + "2": + - 1 + - 55 + "3": + - 17 + - 23 + - 55 + - 61 + "4": 3.11.13 + "5": 0.18.7 + "8": + - 5 + "10": + - 3 + "12": 0.18.7 + "13": linux-x86_64 diff --git a/wandb/run-20250708_100933-5h0en8f2/files/wandb-metadata.json b/wandb/run-20250708_100933-5h0en8f2/files/wandb-metadata.json new file mode 100644 index 0000000..ae378a4 --- /dev/null +++ b/wandb/run-20250708_100933-5h0en8f2/files/wandb-metadata.json @@ -0,0 +1,43 @@ +{ + "os": "Linux-5.15.0-1076-nvidia-x86_64-with-glibc2.35", + "python": "3.11.13", + "startedAt": "2025-07-08T02:09:33.037948Z", + "program": "/home/users/nus/e1113744/native-sparse-attention-pytorch/train.py", + "codePath": "train.py", + "git": { + "remote": "git@github.com:RunjiaChen/native-sparse-attention-pytorch.git", + "commit": "8fe8317a961a9e897f67cdc1dc3b94d811ecfe71" + }, + "email": "runjia@u.nus.edu", + "root": "/home/users/nus/e1113744/native-sparse-attention-pytorch", + "host": "a2ap-dgx034", + "username": "e1113744", + "executable": "/home/users/nus/e1113744/llm-foundry/.venv/bin/python", + "codePathLocal": "train.py", + 
"cpu_count": 112, + "cpu_count_logical": 224, + "gpu": "NVIDIA H100 80GB HBM3", + "gpu_count": 1, + "disk": { + "/": { + "total": "1888556142592", + "used": "29111791616" + } + }, + "memory": { + "total": "2164194275328" + }, + "cpu": { + "count": 112, + "countLogical": 224 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + } + ], + "cudaVersion": "12.4" +} \ No newline at end of file diff --git a/wandb/run-20250708_100933-5h0en8f2/files/wandb-summary.json b/wandb/run-20250708_100933-5h0en8f2/files/wandb-summary.json new file mode 100644 index 0000000..9133f62 --- /dev/null +++ b/wandb/run-20250708_100933-5h0en8f2/files/wandb-summary.json @@ -0,0 +1 @@ +{"loss":0.7260929346084595,"_timestamp":1.7519495866363919e+09,"_wandb":{"runtime":9013},"valid_loss":0.73642897605896,"_runtime":9013.610222683,"_step":99999} \ No newline at end of file diff --git a/wandb/run-20250708_100933-5h0en8f2/run-5h0en8f2.wandb b/wandb/run-20250708_100933-5h0en8f2/run-5h0en8f2.wandb new file mode 100644 index 0000000..5830bb2 Binary files /dev/null and b/wandb/run-20250708_100933-5h0en8f2/run-5h0en8f2.wandb differ