diff --git a/65691.pbs111.OU b/65691.pbs111.OU new file mode 100644 index 0000000..ddf58cd --- /dev/null +++ b/65691.pbs111.OU @@ -0,0 +1,64 @@ +/var/spool/pbs/mom_priv/jobs/65691.pbs111.SC: line 10: deactivate: command not found +wandb: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information. +wandb: Currently logged in as: runjiachen (runjiachen-nus). Use `wandb login --relogin` to force relogin +wandb: Tracking run with wandb version 0.18.7 +wandb: Run data is saved locally in /home/users/nus/e1113744/native-sparse-attention-pytorch/wandb/run-20250707_181358-5mxyov4r +wandb: Run `wandb offline` to turn off syncing. +wandb: Syncing run summer-butterfly-2 +wandb: ⭐️ View project at https://wandb.ai/runjiachen-nus/native-sparse-attention +wandb: 🚀 View run at https://wandb.ai/runjiachen-nus/native-sparse-attention/runs/5mxyov4r +wandb: WARNING Calling wandb.run.save without any arguments is deprecated.Changes to attributes are automatically persisted. + training: 0%| | 0/100000 [00:00 + loss = model(data, return_loss = True) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/users/nus/e1113744/llm-foundry/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/users/nus/e1113744/llm-foundry/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl + return forward_call(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/users/nus/e1113744/native-sparse-attention-pytorch/native_sparse_attention_pytorch/transformer.py", line 308, in forward + attn_out, layer_cache = attn( + ^^^^^ + File "/home/users/nus/e1113744/llm-foundry/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/users/nus/e1113744/llm-foundry/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl + return forward_call(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +TypeError: Attention.forward() got an unexpected keyword argument 'cache' + +real 0m28.958s +user 0m12.499s +sys 0m5.281s +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Resource Usage on 2025-07-07 18:14:13.967119: +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + JobId: 65691.pbs111 + Project: 71001002 + Exit Status: 1 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + NCPUs: Requested(14), Used(14) + CPU Time Used: 00:00:17 + Memory: Requested(235gb), Used(1829988kb) + Vmem Used: 522802812kb + Walltime: Requested(12:00:00), Used(00:00:30) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Execution Nodes Used: (a2ap-dgx034:ngpus=1:ncpus=14:mem=246415360kb) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + GPU Duration: 34.13secs + GPU Power Consumed: 60.97W + GPU Max GPU Memory Used: 704.0MB + Memory Throughput Rate (Average): a2ap-dgx034:(gpu7:0%) + Memory Throughput Rate (Max): a2ap-dgx034:(gpu7:0%) + Memory Throughput Rate (Min): a2ap-dgx034:(gpu7:0%) + GPU SM Utilization (Average): a2ap-dgx034:(gpu7:0%) + GPU SM Utilization (Max): a2ap-dgx034:(gpu7:0%) + GPU SM Utilization (Min): a2ap-dgx034:(gpu7:0%) 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Warning: All GPUs have a percentage of 0 utilisation. +GPU application profile: Idle +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + diff --git a/65693.pbs111.OU b/65693.pbs111.OU new file mode 100644 index 0000000..0fae911 --- /dev/null +++ b/65693.pbs111.OU @@ -0,0 +1,64 @@ +/var/spool/pbs/mom_priv/jobs/65693.pbs111.SC: line 10: deactivate: command not found +wandb: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information. +wandb: Currently logged in as: runjiachen (runjiachen-nus). Use `wandb login --relogin` to force relogin +wandb: Tracking run with wandb version 0.18.7 +wandb: Run data is saved locally in /home/users/nus/e1113744/native-sparse-attention-pytorch/wandb/run-20250707_181437-cjdzttf7 +wandb: Run `wandb offline` to turn off syncing. +wandb: Syncing run morning-paper-3 +wandb: ⭐️ View project at https://wandb.ai/runjiachen-nus/native-sparse-attention +wandb: 🚀 View run at https://wandb.ai/runjiachen-nus/native-sparse-attention/runs/cjdzttf7 +wandb: WARNING Calling wandb.run.save without any arguments is deprecated.Changes to attributes are automatically persisted. + training: 0%| | 0/100000 [00:00 + loss = model(data, return_loss = True) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/users/nus/e1113744/llm-foundry/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/users/nus/e1113744/llm-foundry/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl + return forward_call(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/users/nus/e1113744/native-sparse-attention-pytorch/native_sparse_attention_pytorch/transformer.py", line 308, in forward + attn_out, layer_cache = attn( + ^^^^^ + File "/home/users/nus/e1113744/llm-foundry/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/users/nus/e1113744/llm-foundry/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl + return forward_call(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +TypeError: Attention.forward() got an unexpected keyword argument 'cache' + +real 0m14.447s +user 0m12.093s +sys 0m3.989s +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Resource Usage on 2025-07-07 18:14:45.053803: +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + JobId: 65693.pbs111 + Project: 71001002 + Exit Status: 1 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + NCPUs: Requested(14), Used(14) + CPU Time Used: 00:00:16 + Memory: Requested(235gb), Used(1141256kb) + Vmem Used: 521964248kb + Walltime: Requested(12:00:00), Used(00:00:16) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Execution Nodes Used: (a2ap-dgx034:ngpus=1:ncpus=14:mem=246415360kb) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + GPU Duration: 17.99secs + GPU Power Consumed: 56.66W + GPU Max GPU Memory Used: 704.0MB + Memory Throughput Rate (Average): a2ap-dgx034:(gpu7:0%) + Memory Throughput Rate (Max): a2ap-dgx034:(gpu7:0%) + 
Memory Throughput Rate (Min): a2ap-dgx034:(gpu7:0%)
+ GPU SM Utilization (Average): a2ap-dgx034:(gpu7:0%)
+ GPU SM Utilization (Max): a2ap-dgx034:(gpu7:0%)
+ GPU SM Utilization (Min): a2ap-dgx034:(gpu7:0%)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Warning: All GPUs have a percentage of 0 utilisation.
+GPU application profile: Idle
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
diff --git a/native_sparse_attention_pytorch/benchmark.py b/native_sparse_attention_pytorch/benchmark.py
new file mode 100644
index 0000000..bdf38db
--- /dev/null
+++ b/native_sparse_attention_pytorch/benchmark.py
@@ -0,0 +1,109 @@
+import time
+import torch
+import numpy as np
+
+# Import your modules
+from native_sparse_attention import SparseAttention
+from transformer import Attention as StandardAttention
+
+
+def benchmark_module(module, x, runs=50, warmups=5):
+    """
+    Benchmark forward and backward pass of a module.
+    Returns forward and backward times as numpy arrays (in seconds).
+    """
+    # Warm-up to stabilize JIT/CUDA and trigger Triton compilation
+    for _ in range(warmups):
+        out = module(x)
+        loss = out.sum()
+        loss.backward()
+        module.zero_grad()
+
+    # Forward timing
+    fwd_times = []
+    for _ in range(runs):
+        if torch.cuda.is_available():
+            torch.cuda.synchronize()
+        t0 = time.perf_counter()
+        out = module(x)
+        if torch.cuda.is_available():
+            torch.cuda.synchronize()
+        fwd_times.append(time.perf_counter() - t0)
+
+    # Backward timing
+    bwd_times = []
+    for _ in range(runs):
+        out = module(x)
+        if torch.cuda.is_available():
+            torch.cuda.synchronize()
+        t0 = time.perf_counter()
+        out.sum().backward()
+        if torch.cuda.is_available():
+            torch.cuda.synchronize()
+        bwd_times.append(time.perf_counter() - t0)
+        module.zero_grad()
+
+    return np.array(fwd_times), np.array(bwd_times)
+
+
+def main():
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    # Hyperparameters
+    batch_size = 128
+    seq_len = 1024
+    d_model = 512
+    n_heads = 8
+    d_head = d_model // n_heads
+
+    # NSA-specific hyperparameters
+    sliding_window_size = 32
+    compress_block_size = 4
+    compress_block_sliding_stride = 4
+    selection_block_size = 16
+    num_selected_blocks = 4
+
+    # Create input tensor
+    x = torch.randn(batch_size, seq_len, d_model, device=device)
+
+    # Instantiate modules with identical hyperparameters
+    std_attn = StandardAttention(
+        dim=d_model,
+        dim_head=d_head,
+        heads=n_heads,
+        causal=True
+    ).to(device)
+
+    nsa_attn = SparseAttention(
+        d_model,
+        d_head,
+        n_heads,
+        sliding_window_size,
+        compress_block_size,
+        compress_block_sliding_stride,
+        selection_block_size,
+        num_selected_blocks,
+        use_triton_kernel=True
+    ).to(device)
+
+    # Run benchmarks
+    runs = 5000
+    warmups = 500
+    std_fwd, std_bwd = benchmark_module(std_attn, x, runs, warmups)
+    nsa_fwd, nsa_bwd = benchmark_module(nsa_attn, x, runs, warmups)
+
+    # Report results
+    print(f"{'Module':<25}{'Fwd Mean (ms)':>15}{'Fwd Std (ms)':>15}{'Bwd Mean (ms)':>15}{'Bwd Std (ms)':>15}")
+    for name, fwd, bwd in [
+        ("StandardAttention", std_fwd, std_bwd),
+        ("SparseAttention (NSA)", nsa_fwd, nsa_bwd),
+    ]:
+        print(f"{name:<25}{fwd.mean()*1000:>15.3f}{fwd.std()*1000:>15.3f}{bwd.mean()*1000:>15.3f}{bwd.std()*1000:>15.3f}")
+
+    print(nsa_attn.timer)
+
+
+if __name__ == "__main__":
+    main()
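A note on benchmark.py above: its imports (`from native_sparse_attention import SparseAttention`, `from transformer import Attention`) are package-local, so the script is presumably meant to be run from inside the native_sparse_attention_pytorch/ directory. The host-side perf_counter timing brackets every call with torch.cuda.synchronize(), which is correct; a common alternative that keeps the synchronization out of the measured region is CUDA events. A minimal sketch (illustrative only, not part of this diff; assumes a CUDA device and the same `module`/`x` as above):

    import torch

    def forward_times_with_cuda_events(module, x, runs=50):
        # record a start/end event pair around each forward call
        starts = [torch.cuda.Event(enable_timing=True) for _ in range(runs)]
        ends = [torch.cuda.Event(enable_timing=True) for _ in range(runs)]
        for i in range(runs):
            starts[i].record()
            module(x)
            ends[i].record()
        torch.cuda.synchronize()  # wait for all recorded events before reading them
        return [s.elapsed_time(e) / 1000.0 for s, e in zip(starts, ends)]  # seconds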
diff --git a/native_sparse_attention_pytorch/native_sparse_attention.py b/native_sparse_attention_pytorch/native_sparse_attention.py
index 7c95716..b41ab00 100644
--- a/native_sparse_attention_pytorch/native_sparse_attention.py
+++ b/native_sparse_attention_pytorch/native_sparse_attention.py
@@ -19,6 +19,8 @@ from einops import einsum, repeat, rearrange, reduce, pack, unpack
 
 from einops.layers.torch import Rearrange
 
+import time
+
 # b - batch
 # h - heads
 # qh - grouped query heads
@@ -185,6 +187,42 @@ def attend(
 
 # classes
 
+class Timer:
+
+    def __init__(self):
+        self.t1 = 0
+        self.t2 = 0
+        self.t3 = 0
+        self.t4 = 0
+        self.counter = 0
+
+    def add_t1(self, val):
+        self.t1 += val
+        ##self.counter += 1
+
+    def add_t2(self, val):
+        self.t2 += val
+        ##self.counter += 1
+
+    def add_t3(self, val):
+        self.t3 += val
+        ##self.counter += 1
+
+    def add_t4(self, val):
+        self.t4 += val
+        ##self.counter += 1
+
+    def __str__(self):
+        return (
+            f"time spent in t1 is: {self.t1}, "
+            f"time spent in t2 is: {self.t2}, "
+            f"time spent in t3 is: {self.t3}, "
+            f"time spent in t4 is: {self.t4}, "
+            f"counter is: {self.counter}"
+        )
+
+
 class SparseAttention(Module):
     def __init__(
         self,
@@ -205,10 +243,12 @@ def __init__(
         query_heads_share_selected_kv = True, # if set to True, importance score is averaged across query heads to select top-n buckets of kv per kv head - but can be set to False for each query head within a group to look at different sets of kv buckets. will be more memory and compute of course
         compress_mlp: Module | None = None,
         compress_mlp_expand_factor = 1.,
-        strategy_combine_mlp: Module | None = None
+        strategy_combine_mlp: Module | None = None,
     ):
         super().__init__()
 
+        self.timer = Timer()
+
         # attention heads
         # handling gqa if `kv_heads` is set
@@ -247,6 +287,8 @@ def __init__(
 
         # sliding window strategy
 
+
+
         self.sliding_window = LocalAttention(
             dim = dim_head,
             window_size = sliding_window_size,
@@ -553,8 +595,9 @@ def forward(
         disable_triton_kernel = False,
         sliding_window_flex_mask = None,
         fine_selection_flex_mask = None,
-        return_cache = False
+        return_cache = False,
     ):
+        ##start = time.perf_counter()
         is_inferencing = exists(cache)
 
         if is_inferencing:
@@ -618,6 +661,7 @@ def forward(
 
         # 1. coarse attention over compressed
 
+        start = time.perf_counter()
         mem_ck, mem_cv = repeat(self.compress_mem_kv, 'kv ... -> kv b ...', b = batch)
 
         num_mem_compress_kv = mem_ck.shape[-2]
@@ -636,7 +680,9 @@ def forward(
 
         cmask = einx.less('j, i -> i j', ck_seq, cq_seq)
 
+
         compressed_attn_out, csim = attend(cq, ck, cv, mask = cmask, return_sim = True)
+        self.timer.add_t1(time.perf_counter() - start)
 
         # for 2. and 3., will give them relative positions with rotary - compressed needs to be handled separately (even if they already have intra block absolute positions)
@@ -706,6 +752,8 @@ def forward(
             remainder = fine_divisible_seq_len - seq_len
             pad_to_multiple = partial(pad_at_dim, pad = (0, remainder), dim = -2)
 
+        start = time.perf_counter()
+
         if has_selected_kv_for_fine_attn:
 
             # get the top-n kv segments for fine attention
@@ -716,10 +764,11 @@ def forward(
 
             if self.use_triton_kernel and not disable_triton_kernel:
 
-                from native_sparse_attention_pytorch.triton_native_sparse_attention import native_sparse_attend
+                from triton_native_sparse_attention import native_sparse_attend
 
                 fmask = selected_importance_values > 1e-10
 
+
                 fine_attn_out = native_sparse_attend(
                     fq, fk, fv,
                     self.selection_block_size,
@@ -728,6 +777,7 @@ def forward(
                     sel_scale = gates,
                     include_block_causal = self.causal
                 )
+
             elif exists(fine_selection_flex_mask):
                 assert not self.use_diff_topk, 'differential topk is not available for flex attention'
@@ -836,6 +886,8 @@ def forward(
             fine_attn_out = rearrange(fine_attn_out, '(b w) h n d -> b h (w n) d', b = batch)
             fine_attn_out = fine_attn_out[..., :seq_len, :]
 
+        self.timer.add_t2(time.perf_counter() - start)
+
         # 3. overlapping sliding window, this is unsurprising and expected - `s` for sliding
 
         sq = q
@@ -847,7 +899,9 @@ def forward(
         else:
             sk, sv = tuple(repeat(t, 'b h ... -> b (h num_grouped_queries) ...', num_grouped_queries = self.num_grouped_queries) for t in (sk, sv))
 
+        start = time.perf_counter()
         sliding_window_attn_out = self.sliding_window(sq, sk, sv)
+        self.timer.add_t3(time.perf_counter() - start)
 
         # combine strategies
@@ -863,5 +917,7 @@ def forward(
 
         if not return_cache:
             return out
+
+        ##self.timer.add_t4(time.perf_counter() - start)
 
         return out, (cache_kv, cache_compressed_kv)
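A caveat on the timing hunks above: CUDA kernels launch asynchronously, so the bare time.perf_counter() deltas around attend(...), the fine-attention branch, and self.sliding_window(...) mostly capture kernel-launch and Python overhead rather than device time, since only the outer benchmark synchronizes. A minimal sketch of a synchronizing read (an illustrative helper, not part of this diff) that the Timer calls could use:

    import time
    import torch

    def synced_perf_counter():
        # flush pending CUDA work so the host clock reflects completed device time
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        return time.perf_counter()

    # usage sketch inside forward():
    #   start = synced_perf_counter()
    #   compressed_attn_out, csim = attend(cq, ck, cv, mask = cmask, return_sim = True)
    #   self.timer.add_t1(synced_perf_counter() - start)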
diff --git a/native_sparse_attention_pytorch/transformer.py b/native_sparse_attention_pytorch/transformer.py
index cd2a662..94a1b92 100644
--- a/native_sparse_attention_pytorch/transformer.py
+++ b/native_sparse_attention_pytorch/transformer.py
@@ -11,7 +11,7 @@ from rotary_embedding_torch import RotaryEmbedding
 
-from native_sparse_attention_pytorch.native_sparse_attention import (
+from native_sparse_attention import (
     SparseAttention,
     create_compress_mask,
     create_fine_mask,
@@ -62,68 +62,85 @@ def top_k(logits, thres = 0.9):
 
 # attention
 
-class Attention(Module):
+class Attention(nn.Module):
     def __init__(
         self,
         dim,
-        dim_head = 64,
-        heads = 8,
-        causal = True,
-        kv_heads = None
+        dim_head=64,
+        heads=8,
+        causal=True,
+        kv_heads=None
    ):
         super().__init__()
         self.norm = RMSNorm(dim)
 
         self.heads = heads
-        kv_heads = default(kv_heads, heads)
-
-        dim_inner = heads * dim_head
-        dim_kv_inner = kv_heads * dim_head
-
+        kv_heads = kv_heads or heads
         self.kv_heads = kv_heads
         self.causal = causal
 
+        dim_inner = heads * dim_head
+        dim_kv_inner = kv_heads * dim_head
+
         self.rotary_embed = RotaryEmbedding(dim_head)
 
-        self.to_q = nn.Linear(dim, dim_inner, bias = False)
-        self.to_k = nn.Linear(dim, dim_kv_inner, bias = False)
-        self.to_v = nn.Linear(dim, dim_kv_inner, bias = False)
+        self.to_q = nn.Linear(dim, dim_inner, bias=False)
+        self.to_k = nn.Linear(dim, dim_kv_inner, bias=False)
+        self.to_v = nn.Linear(dim, dim_kv_inner, bias=False)
 
-        self.split_heads = Rearrange('b n (h d) -> b h n d', d = dim_head)
-        self.merge_heads = Rearrange('b h n d -> b n (h d)')
+        self.split_heads = lambda x: rearrange(x, 'b n (h d) -> b h n d', d=dim_head)
+        self.merge_heads = lambda x: rearrange(x, 'b h n d -> b n (h d)')
 
-        self.to_out = nn.Linear(dim_inner, dim, bias = False)
+        self.to_out = nn.Linear(dim_inner, dim, bias=False)
 
     def forward(
         self,
-        x
+        x,
+        cache=None,
+        return_cache=False
     ):
-
+        # Layer normalization
         x = self.norm(x)
 
+        # Project to Q, K, V
         q = self.to_q(x)
         k = self.to_k(x)
         v = self.to_v(x)
 
+        # Split into heads
        q, k, v = map(self.split_heads, (q, k, v))
 
-        # relative positions
-
+        # Apply rotary embeddings
         q, k = self.rotary_embed.rotate_queries_with_cached_keys(q, k)
 
-        # naive gqa
+        # Group key and value heads for attention
+        k_grouped = repeat(k, 'b h n d -> b (g h) n d', g=self.heads // self.kv_heads)
+        v_grouped = repeat(v, 'b h n d -> b (g h) n d', g=self.heads // self.kv_heads)
 
-        k, v = tuple(repeat(t, 'b h ... -> b (g h) ...', g = self.heads // self.kv_heads) for t in (k, v))
-
-        # attention branch
+        # Prepend past key/value if provided
+        if cache is not None:
+            past_k, past_v = cache
+            k_grouped = torch.cat([past_k, k_grouped], dim=2)
+            v_grouped = torch.cat([past_v, v_grouped], dim=2)
 
+        # Compute attention (q still has original head count)
         out = F.scaled_dot_product_attention(
-            q, k, v,
-            is_causal = self.causal
+            q,
+            k_grouped,
+            v_grouped,
+            is_causal=self.causal
         )
 
+        # Merge heads and project out
         out = self.merge_heads(out)
+        out = self.to_out(out)
+
+        # Return output and new cache if requested
+        if return_cache:
+            return out, (k_grouped, v_grouped)
+        return out
+
-        return self.to_out(out)
 
 # feedforward
 
@@ -309,7 +326,6 @@ def forward(
             tokens,
             cache = next(iter_cache, None),
             return_cache = True,
-            **attn_kwargs
         )
 
         next_cache.append(layer_cache)
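The transformer.py hunks above are what the failed jobs in 65691.pbs111.OU and 65693.pbs111.OU were missing: their tracebacks end in `TypeError: Attention.forward() got an unexpected keyword argument 'cache'`, and this change adds the `cache`/`return_cache` path to the full-attention baseline. A rough sketch of the new calling convention (illustrative only, not part of this diff; run from inside the package directory like benchmark.py, hyperparameters chosen arbitrarily):

    import torch
    from transformer import Attention

    attn = Attention(dim=512, dim_head=64, heads=8, kv_heads=4)

    # prefill: process the prompt once and keep the grouped key/value cache
    prompt = torch.randn(1, 16, 512)
    out, kv_cache = attn(prompt, return_cache=True)

    # single-token step: pass only the new token plus the cached keys/values
    step = torch.randn(1, 1, 512)
    out_step, kv_cache = attn(step, cache=kv_cache, return_cache=True)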
diff --git a/native_sparse_attention_pytorch/triton_native_sparse_attention.py b/native_sparse_attention_pytorch/triton_native_sparse_attention.py
index 9069683..64f1383 100644
--- a/native_sparse_attention_pytorch/triton_native_sparse_attention.py
+++ b/native_sparse_attention_pytorch/triton_native_sparse_attention.py
@@ -1,5 +1,5 @@
 from __future__ import annotations
-from native_sparse_attention_pytorch.tensor_typing import Float, Int, Bool
+from tensor_typing import Float, Int, Bool
 
 # taken from https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/flash_attn_triton.py
 # with fixes for triton 2.3
@@ -1835,6 +1835,7 @@ def forward(
         block_dk_dv_use_dot,
         return_sliding_window_out
     ):
+        ##print("triton called")
         dtype = fq.dtype
 
         q_heads, kv_heads = fq.shape[1], fk.shape[1]
diff --git a/normal.txt b/normal.txt
new file mode 100644
index 0000000..32a475f
--- /dev/null
+++ b/normal.txt
@@ -0,0 +1,3 @@
+using custom triton kernel
+wandb: 🚀 View run morning-paper-3 at: https://wandb.ai/runjiachen-nus/native-sparse-attention/runs/cjdzttf7
+wandb: Find logs at: wandb/run-20250707_181437-cjdzttf7/logs
diff --git a/normal_run.pbs b/normal_run.pbs
new file mode 100644
index 0000000..0784c06
--- /dev/null
+++ b/normal_run.pbs
@@ -0,0 +1,15 @@
+#!/bin/bash
+#PBS -l select=1:ngpus=1
+#PBS -l walltime=12:00:00
+#PBS -o ./
+#PBS -j oe
+#PBS -N submit
+#PBS -q ic102
+#PBS -P 71001002
+
+deactivate
+cd llm-foundry
+source .venv/bin/activate
+cd ~
+cd native-sparse-attention-pytorch
+time python train.py >> normal.txt
\ No newline at end of file
diff --git a/out_triton.txt b/out_triton.txt
new file mode 100644
index 0000000..3d0cbeb
--- /dev/null
+++ b/out_triton.txt
@@ -0,0 +1,3 @@
+using custom triton kernel
+wandb: 🚀 View run summer-butterfly-2 at: https://wandb.ai/runjiachen-nus/native-sparse-attention/runs/5mxyov4r
+wandb: Find logs at: wandb/run-20250707_181358-5mxyov4r/logs
diff --git a/train.py b/train.py
index 0efdf57..9bee49d 100644
--- a/train.py
+++
b/train.py @@ -31,8 +31,8 @@ HEADS = 8 KV_HEADS = 4 -USE_SPARSE_ATTN = True -USE_TRITON_NSA = True +USE_SPARSE_ATTN = False +USE_TRITON_NSA = False USE_FLEX_FOR_FINE_SELECTION = False # will push flex a bit, won't be efficient as each layer needs sparsity dynmically generated, but may be enough just to compare to full attention before going all-in on triton kernels QUERY_HEADS_SHARE_SELECTION = True # if set to False, each query head can look at a different segment of their corresponding key / value head in GQA @@ -53,7 +53,7 @@ PROJECT_NAME = 'native-sparse-attention' RUN_NAME = 'baseline' if not USE_SPARSE_ATTN else f'sparse-attn: compress size {COMPRESS_BLOCK_SIZE} | fine size {FINE_BLOCK_SIZE} | {NUM_FINE_SELECTED} selected' -WANDB_ONLINE = False # turn this on to pipe experiment to cloud +WANDB_ONLINE = True # turn this on to pipe experiment to cloud # helpers diff --git a/triton_run.pbs b/triton_run.pbs new file mode 100644 index 0000000..a02d3cb --- /dev/null +++ b/triton_run.pbs @@ -0,0 +1,15 @@ +#!/bin/bash +#PBS -l select=1:ngpus=1 +#PBS -l walltime=12:00:00 +#PBS -o ./ +#PBS -j oe +#PBS -N submit +#PBS -q ic102 +#PBS -P 71001002 + +deactivate +cd llm-foundry +source .venv/bin/activate +cd ~ +cd native-sparse-attention-pytorch +time python train.py >> out_triton.txt \ No newline at end of file diff --git a/wandb/latest-run b/wandb/latest-run new file mode 120000 index 0000000..5bd4cc5 --- /dev/null +++ b/wandb/latest-run @@ -0,0 +1 @@ +run-20250708_100933-5h0en8f2 \ No newline at end of file diff --git a/wandb/run-20250707_171826-rb9axy3v/files/requirements.txt b/wandb/run-20250707_171826-rb9axy3v/files/requirements.txt new file mode 100644 index 0000000..dbe4370 --- /dev/null +++ b/wandb/run-20250707_171826-rb9axy3v/files/requirements.txt @@ -0,0 +1,215 @@ +docker-pycreds==0.4.0 +onnxruntime==1.22.0 +google-api-core==2.25.1 +googleapis-common-protos==1.70.0 +opentelemetry-sdk==1.34.1 +zstd==1.5.7.2 +jmespath==1.0.1 +s3transfer==0.13.0 +huggingface-hub==0.33.2 +pytz==2025.2 +xxhash==3.5.0 +certifi==2025.6.15 +attrs==25.3.0 +opentelemetry-api==1.34.1 +validators==0.35.0 +pyasn1==0.6.1 +prompt_toolkit==3.0.51 +sentry-sdk==2.30.0 +python-snappy==0.7.3 +cachetools==5.5.2 +wcwidth==0.2.13 +itsdangerous==2.2.0 +nvidia-cudnn-cu12==9.5.1.17 +tenacity==9.1.2 +nvidia-cusparselt-cu12==0.6.3 +torchmetrics==1.7.1 +pandas==2.3.0 +tabulate==0.9.0 +apache-libcloud==3.8.0 +tzdata==2025.2 +shellingham==1.5.4 +pfzy==0.3.4 +setproctitle==1.3.6 +catalogue==2.0.10 +argcomplete==3.6.2 +nvidia-cusolver-cu12==11.7.1.2 +llm-foundry==0.22.0.dev0 +wheel==0.45.1 +typing-inspection==0.4.1 +GitPython==3.1.44 +isodate==0.7.2 +typer==0.16.0 +annotated-types==0.7.0 +anyio==4.9.0 +iniconfig==2.1.0 +networkx==3.5 +mlflow-skinny==2.21.3 +filelock==3.18.0 +coloredlogs==15.0.1 +databricks-sdk==0.57.0 +cycler==0.12.1 +pynvml==11.5.3 +soupsieve==2.7 +six==1.17.0 +mdurl==0.1.2 +circuitbreaker==2.1.3 +packaging==25.0 +boto3==1.38.42 +websockets==11.0.3 +nvidia-cufile-cu12==1.11.1.6 +sqlparse==0.5.3 +gitdb==4.0.12 +proto-plus==1.26.1 +torchvision==0.22.0 +nvidia-cuda-nvrtc-cu12==12.6.77 +ruamel.yaml.clib==0.2.12 +sniffio==1.3.1 +pycparser==2.22 +questionary==2.1.0 +idna==3.10 +azure-storage-blob==12.25.1 +oci==2.154.3 +pillow==11.2.1 +smmap==5.0.2 +google-cloud-core==2.4.3 +azure-identity==1.23.0 +click==8.2.1 +py-cpuinfo==9.0.0 +mlflow==2.21.3 +jaxtyping==0.3.2 +bcrypt==4.3.0 +torch-optimizer==0.3.0 +safetensors==0.5.3 +Pygments==2.19.2 +pytorch-ranger==0.1.1 +pyparsing==3.2.3 +importlib_metadata==8.7.0 +aiosignal==1.3.2 
+contourpy==1.3.2 +urllib3==2.5.0 +einops==0.8.1 +triton==3.3.0 +PyJWT==2.10.1 +humanfriendly==10.0 +nvidia-curand-cu12==10.3.7.77 +aiohttp==3.12.13 +cffi==1.17.1 +starlette==0.46.2 +Flask==3.1.1 +Brotli==1.1.0 +aiohappyeyeballs==2.6.1 +mosaicml-streaming==0.12.0 +pydantic==2.11.7 +threadpoolctl==3.6.0 +markdown-it-py==3.0.0 +types-python-dateutil==2.9.0.20250516 +nvidia-nccl-cu12==2.26.2 +python-dateutil==2.9.0.post0 +scipy==1.16.0 +greenlet==3.2.3 +wadler_lindig==0.1.7 +zipp==3.23.0 +Jinja2==3.1.6 +dill==0.3.8 +hyper-connections==0.2.1 +rsa==4.9.1 +wandb==0.18.7 +kiwisolver==1.4.8 +protobuf==5.29.5 +propcache==0.3.2 +fsspec==2025.5.1 +pyarrow==19.0.1 +sympy==1.14.0 +sentencepiece==0.2.0 +rotary-embedding-torch==0.8.6 +local-attention==1.11.1 +nvidia-cusparse-cu12==12.5.4.2 +yarl==1.20.1 +graphql-relay==3.2.0 +mpmath==1.3.0 +pytest==8.4.1 +joblib==1.5.1 +tqdm==4.67.1 +frozenlist==1.7.0 +Markdown==3.8.2 +accelerate==1.7.0 +google-resumable-media==2.7.2 +nvidia-nvjitlink-cu12==12.6.85 +numpy==2.1.3 +pip==24.0 +Mako==1.3.10 +multiprocess==0.70.16 +psutil==7.0.0 +datasets==3.6.0 +nvidia-cuda-runtime-cu12==12.6.77 +mosaicml==0.31.0 +nvidia-cublas-cu12==12.6.4.1 +uvicorn==0.34.3 +inquirerpy==0.3.4 +msal==1.32.3 +coolname==2.2.0 +cramjam==2.10.0 +platformdirs==4.3.8 +flatbuffers==25.2.10 +scikit-learn==1.7.0 +cloudpickle==3.1.1 +requests==2.32.4 +h11==0.16.0 +omegaconf==2.3.0 +ruamel.yaml==0.18.14 +pyasn1_modules==0.4.2 +tokenizers==0.21.2 +arrow==1.3.0 +slack_sdk==3.35.0 +gunicorn==23.0.0 +pydantic_core==2.33.2 +google-auth==2.40.3 +charset-normalizer==3.4.2 +azure-storage-file-datalake==12.20.0 +nvidia-cufft-cu12==11.3.0.4 +blinker==1.9.0 +torch==2.7.0 +fastapi==0.115.13 +fonttools==4.58.4 +regex==2024.11.6 +paramiko==3.5.1 +einx==0.3.0 +msal-extensions==1.3.1 +PyNaCl==1.5.0 +docker==7.1.0 +azure-core==1.34.0 +rich==14.0.0 +typing_extensions==4.14.0 +opentelemetry-semantic-conventions==0.55b1 +MarkupSafe==3.0.2 +graphql-core==3.2.6 +termcolor==3.1.0 +nvidia-cuda-cupti-cu12==12.6.80 +flash_attn==2.7.4.post1 +Werkzeug==3.1.3 +onnx==1.18.0 +lightning-utilities==0.14.3 +PyYAML==6.0.2 +setuptools==80.9.0 +cryptography==44.0.3 +antlr4-python3-runtime==4.9.3 +google-crc32c==1.7.1 +botocore==1.38.42 +nvidia-nvtx-cu12==12.6.77 +backoff==2.2.1 +graphene==3.4.3 +mosaicml-cli==0.7.3 +matplotlib==3.10.3 +beautifulsoup4==4.13.4 +transformers==4.51.3 +alembic==1.16.2 +multidict==6.5.0 +google-cloud-storage==2.10.0 +pyOpenSSL==24.3.0 +gql==3.5.3 +hf_transfer==0.1.9 +frozendict==2.4.6 +SQLAlchemy==2.0.41 +pluggy==1.6.0 +hf-xet==1.1.5 diff --git a/wandb/run-20250707_171826-rb9axy3v/files/wandb-metadata.json b/wandb/run-20250707_171826-rb9axy3v/files/wandb-metadata.json new file mode 100644 index 0000000..184e965 --- /dev/null +++ b/wandb/run-20250707_171826-rb9axy3v/files/wandb-metadata.json @@ -0,0 +1,43 @@ +{ + "os": "Linux-5.15.0-1076-nvidia-x86_64-with-glibc2.35", + "python": "3.11.13", + "startedAt": "2025-07-07T09:18:26.333954Z", + "program": "/home/users/nus/e1113744/native-sparse-attention-pytorch/train.py", + "codePath": "train.py", + "git": { + "remote": "git@github.com:RunjiaChen/native-sparse-attention-pytorch.git", + "commit": "8fe8317a961a9e897f67cdc1dc3b94d811ecfe71" + }, + "email": "runjia@u.nus.edu", + "root": "/home/users/nus/e1113744/native-sparse-attention-pytorch", + "host": "a2ap-dgx033", + "username": "e1113744", + "executable": "/home/users/nus/e1113744/llm-foundry/.venv/bin/python", + "codePathLocal": "train.py", + "cpu_count": 112, + "cpu_count_logical": 224, + "gpu": "NVIDIA H100 80GB 
HBM3", + "gpu_count": 1, + "disk": { + "/": { + "total": "1888556142592", + "used": "28870660096" + } + }, + "memory": { + "total": "2164194279424" + }, + "cpu": { + "count": 112, + "countLogical": 224 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + } + ], + "cudaVersion": "12.4" +} \ No newline at end of file diff --git a/wandb/run-20250707_171826-rb9axy3v/run-rb9axy3v.wandb b/wandb/run-20250707_171826-rb9axy3v/run-rb9axy3v.wandb new file mode 100644 index 0000000..c600475 Binary files /dev/null and b/wandb/run-20250707_171826-rb9axy3v/run-rb9axy3v.wandb differ diff --git a/wandb/run-20250707_181358-5mxyov4r/files/config.yaml b/wandb/run-20250707_181358-5mxyov4r/files/config.yaml new file mode 100644 index 0000000..34ed405 --- /dev/null +++ b/wandb/run-20250707_181358-5mxyov4r/files/config.yaml @@ -0,0 +1,24 @@ +_wandb: + value: + cli_version: 0.18.7 + m: [] + python_version: 3.11.13 + t: + "1": + - 1 + - 55 + "2": + - 1 + - 55 + "3": + - 17 + - 23 + - 55 + "4": 3.11.13 + "5": 0.18.7 + "8": + - 5 + "10": + - 3 + "12": 0.18.7 + "13": linux-x86_64 diff --git a/wandb/run-20250707_181358-5mxyov4r/files/wandb-metadata.json b/wandb/run-20250707_181358-5mxyov4r/files/wandb-metadata.json new file mode 100644 index 0000000..ec35301 --- /dev/null +++ b/wandb/run-20250707_181358-5mxyov4r/files/wandb-metadata.json @@ -0,0 +1,43 @@ +{ + "os": "Linux-5.15.0-1076-nvidia-x86_64-with-glibc2.35", + "python": "3.11.13", + "startedAt": "2025-07-07T10:13:58.986561Z", + "program": "/home/users/nus/e1113744/native-sparse-attention-pytorch/train.py", + "codePath": "train.py", + "git": { + "remote": "git@github.com:RunjiaChen/native-sparse-attention-pytorch.git", + "commit": "8fe8317a961a9e897f67cdc1dc3b94d811ecfe71" + }, + "email": "runjia@u.nus.edu", + "root": "/home/users/nus/e1113744/native-sparse-attention-pytorch", + "host": "a2ap-dgx034", + "username": "e1113744", + "executable": "/home/users/nus/e1113744/llm-foundry/.venv/bin/python", + "codePathLocal": "train.py", + "cpu_count": 112, + "cpu_count_logical": 224, + "gpu": "NVIDIA H100 80GB HBM3", + "gpu_count": 1, + "disk": { + "/": { + "total": "1888556142592", + "used": "28806520832" + } + }, + "memory": { + "total": "2164194275328" + }, + "cpu": { + "count": 112, + "countLogical": 224 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + } + ], + "cudaVersion": "12.4" +} \ No newline at end of file diff --git a/wandb/run-20250707_181358-5mxyov4r/files/wandb-summary.json b/wandb/run-20250707_181358-5mxyov4r/files/wandb-summary.json new file mode 100644 index 0000000..4e355fc --- /dev/null +++ b/wandb/run-20250707_181358-5mxyov4r/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":1}} \ No newline at end of file diff --git a/wandb/run-20250707_181358-5mxyov4r/run-5mxyov4r.wandb b/wandb/run-20250707_181358-5mxyov4r/run-5mxyov4r.wandb new file mode 100644 index 0000000..db0aaac Binary files /dev/null and b/wandb/run-20250707_181358-5mxyov4r/run-5mxyov4r.wandb differ diff --git a/wandb/run-20250707_181437-cjdzttf7/files/config.yaml b/wandb/run-20250707_181437-cjdzttf7/files/config.yaml new file mode 100644 index 0000000..34ed405 --- /dev/null +++ b/wandb/run-20250707_181437-cjdzttf7/files/config.yaml @@ -0,0 +1,24 @@ +_wandb: + value: + cli_version: 0.18.7 + m: [] + python_version: 3.11.13 + t: + "1": + - 1 + - 55 + "2": + - 1 + - 55 + "3": + - 17 + - 23 + - 55 + 
"4": 3.11.13 + "5": 0.18.7 + "8": + - 5 + "10": + - 3 + "12": 0.18.7 + "13": linux-x86_64 diff --git a/wandb/run-20250707_181437-cjdzttf7/files/wandb-metadata.json b/wandb/run-20250707_181437-cjdzttf7/files/wandb-metadata.json new file mode 100644 index 0000000..9e64558 --- /dev/null +++ b/wandb/run-20250707_181437-cjdzttf7/files/wandb-metadata.json @@ -0,0 +1,43 @@ +{ + "os": "Linux-5.15.0-1076-nvidia-x86_64-with-glibc2.35", + "python": "3.11.13", + "startedAt": "2025-07-07T10:14:37.136627Z", + "program": "/home/users/nus/e1113744/native-sparse-attention-pytorch/train.py", + "codePath": "train.py", + "git": { + "remote": "git@github.com:RunjiaChen/native-sparse-attention-pytorch.git", + "commit": "8fe8317a961a9e897f67cdc1dc3b94d811ecfe71" + }, + "email": "runjia@u.nus.edu", + "root": "/home/users/nus/e1113744/native-sparse-attention-pytorch", + "host": "a2ap-dgx034", + "username": "e1113744", + "executable": "/home/users/nus/e1113744/llm-foundry/.venv/bin/python", + "codePathLocal": "train.py", + "cpu_count": 112, + "cpu_count_logical": 224, + "gpu": "NVIDIA H100 80GB HBM3", + "gpu_count": 1, + "disk": { + "/": { + "total": "1888556142592", + "used": "28806852608" + } + }, + "memory": { + "total": "2164194275328" + }, + "cpu": { + "count": 112, + "countLogical": 224 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + } + ], + "cudaVersion": "12.4" +} \ No newline at end of file diff --git a/wandb/run-20250707_181437-cjdzttf7/files/wandb-summary.json b/wandb/run-20250707_181437-cjdzttf7/files/wandb-summary.json new file mode 100644 index 0000000..6c37fe1 --- /dev/null +++ b/wandb/run-20250707_181437-cjdzttf7/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":0}} \ No newline at end of file diff --git a/wandb/run-20250707_181437-cjdzttf7/run-cjdzttf7.wandb b/wandb/run-20250707_181437-cjdzttf7/run-cjdzttf7.wandb new file mode 100644 index 0000000..dd2cb1a Binary files /dev/null and b/wandb/run-20250707_181437-cjdzttf7/run-cjdzttf7.wandb differ diff --git a/wandb/run-20250708_084054-hxxnj5bd/files/config.yaml b/wandb/run-20250708_084054-hxxnj5bd/files/config.yaml new file mode 100644 index 0000000..34ed405 --- /dev/null +++ b/wandb/run-20250708_084054-hxxnj5bd/files/config.yaml @@ -0,0 +1,24 @@ +_wandb: + value: + cli_version: 0.18.7 + m: [] + python_version: 3.11.13 + t: + "1": + - 1 + - 55 + "2": + - 1 + - 55 + "3": + - 17 + - 23 + - 55 + "4": 3.11.13 + "5": 0.18.7 + "8": + - 5 + "10": + - 3 + "12": 0.18.7 + "13": linux-x86_64 diff --git a/wandb/run-20250708_084054-hxxnj5bd/files/wandb-metadata.json b/wandb/run-20250708_084054-hxxnj5bd/files/wandb-metadata.json new file mode 100644 index 0000000..39559a3 --- /dev/null +++ b/wandb/run-20250708_084054-hxxnj5bd/files/wandb-metadata.json @@ -0,0 +1,43 @@ +{ + "os": "Linux-5.15.0-1076-nvidia-x86_64-with-glibc2.35", + "python": "3.11.13", + "startedAt": "2025-07-08T00:40:54.087515Z", + "program": "/home/users/nus/e1113744/native-sparse-attention-pytorch/train.py", + "codePath": "train.py", + "git": { + "remote": "git@github.com:RunjiaChen/native-sparse-attention-pytorch.git", + "commit": "8fe8317a961a9e897f67cdc1dc3b94d811ecfe71" + }, + "email": "runjia@u.nus.edu", + "root": "/home/users/nus/e1113744/native-sparse-attention-pytorch", + "host": "a2ap-dgx034", + "username": "e1113744", + "executable": "/home/users/nus/e1113744/llm-foundry/.venv/bin/python", + "codePathLocal": "train.py", + "cpu_count": 112, + "cpu_count_logical": 224, + "gpu": 
"NVIDIA H100 80GB HBM3", + "gpu_count": 1, + "disk": { + "/": { + "total": "1888556142592", + "used": "29075877888" + } + }, + "memory": { + "total": "2164194275328" + }, + "cpu": { + "count": 112, + "countLogical": 224 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + } + ], + "cudaVersion": "12.4" +} \ No newline at end of file diff --git a/wandb/run-20250708_084054-hxxnj5bd/files/wandb-summary.json b/wandb/run-20250708_084054-hxxnj5bd/files/wandb-summary.json new file mode 100644 index 0000000..c437ff1 --- /dev/null +++ b/wandb/run-20250708_084054-hxxnj5bd/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":7}} \ No newline at end of file diff --git a/wandb/run-20250708_084054-hxxnj5bd/run-hxxnj5bd.wandb b/wandb/run-20250708_084054-hxxnj5bd/run-hxxnj5bd.wandb new file mode 100644 index 0000000..6b3370b Binary files /dev/null and b/wandb/run-20250708_084054-hxxnj5bd/run-hxxnj5bd.wandb differ diff --git a/wandb/run-20250708_084403-zrhsz2wi/files/config.yaml b/wandb/run-20250708_084403-zrhsz2wi/files/config.yaml new file mode 100644 index 0000000..ca5f6f6 --- /dev/null +++ b/wandb/run-20250708_084403-zrhsz2wi/files/config.yaml @@ -0,0 +1,25 @@ +_wandb: + value: + cli_version: 0.18.7 + m: [] + python_version: 3.11.13 + t: + "1": + - 1 + - 55 + "2": + - 1 + - 55 + "3": + - 17 + - 23 + - 55 + - 61 + "4": 3.11.13 + "5": 0.18.7 + "8": + - 5 + "10": + - 3 + "12": 0.18.7 + "13": linux-x86_64 diff --git a/wandb/run-20250708_084403-zrhsz2wi/files/wandb-metadata.json b/wandb/run-20250708_084403-zrhsz2wi/files/wandb-metadata.json new file mode 100644 index 0000000..184d77c --- /dev/null +++ b/wandb/run-20250708_084403-zrhsz2wi/files/wandb-metadata.json @@ -0,0 +1,43 @@ +{ + "os": "Linux-5.15.0-1076-nvidia-x86_64-with-glibc2.35", + "python": "3.11.13", + "startedAt": "2025-07-08T00:44:03.665695Z", + "program": "/home/users/nus/e1113744/native-sparse-attention-pytorch/train.py", + "codePath": "train.py", + "git": { + "remote": "git@github.com:RunjiaChen/native-sparse-attention-pytorch.git", + "commit": "8fe8317a961a9e897f67cdc1dc3b94d811ecfe71" + }, + "email": "runjia@u.nus.edu", + "root": "/home/users/nus/e1113744/native-sparse-attention-pytorch", + "host": "a2ap-dgx034", + "username": "e1113744", + "executable": "/home/users/nus/e1113744/llm-foundry/.venv/bin/python", + "codePathLocal": "train.py", + "cpu_count": 112, + "cpu_count_logical": 224, + "gpu": "NVIDIA H100 80GB HBM3", + "gpu_count": 1, + "disk": { + "/": { + "total": "1888556142592", + "used": "29077024768" + } + }, + "memory": { + "total": "2164194275328" + }, + "cpu": { + "count": 112, + "countLogical": 224 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + } + ], + "cudaVersion": "12.4" +} \ No newline at end of file diff --git a/wandb/run-20250708_084403-zrhsz2wi/files/wandb-summary.json b/wandb/run-20250708_084403-zrhsz2wi/files/wandb-summary.json new file mode 100644 index 0000000..95ea6e2 --- /dev/null +++ b/wandb/run-20250708_084403-zrhsz2wi/files/wandb-summary.json @@ -0,0 +1 @@ +{"_timestamp":1.7519359383420506e+09,"valid_loss":1.3284306526184082,"_runtime":494.768991449,"_step":1969,"_wandb":{"runtime":494},"loss":1.6118727922439575} \ No newline at end of file diff --git a/wandb/run-20250708_084403-zrhsz2wi/run-zrhsz2wi.wandb b/wandb/run-20250708_084403-zrhsz2wi/run-zrhsz2wi.wandb new file mode 100644 index 0000000..2173c70 Binary 
files /dev/null and b/wandb/run-20250708_084403-zrhsz2wi/run-zrhsz2wi.wandb differ diff --git a/wandb/run-20250708_085118-aw5dl503/files/config.yaml b/wandb/run-20250708_085118-aw5dl503/files/config.yaml new file mode 100644 index 0000000..34ed405 --- /dev/null +++ b/wandb/run-20250708_085118-aw5dl503/files/config.yaml @@ -0,0 +1,24 @@ +_wandb: + value: + cli_version: 0.18.7 + m: [] + python_version: 3.11.13 + t: + "1": + - 1 + - 55 + "2": + - 1 + - 55 + "3": + - 17 + - 23 + - 55 + "4": 3.11.13 + "5": 0.18.7 + "8": + - 5 + "10": + - 3 + "12": 0.18.7 + "13": linux-x86_64 diff --git a/wandb/run-20250708_085118-aw5dl503/files/wandb-metadata.json b/wandb/run-20250708_085118-aw5dl503/files/wandb-metadata.json new file mode 100644 index 0000000..7ae3f5c --- /dev/null +++ b/wandb/run-20250708_085118-aw5dl503/files/wandb-metadata.json @@ -0,0 +1,43 @@ +{ + "os": "Linux-5.15.0-1076-nvidia-x86_64-with-glibc2.35", + "python": "3.11.13", + "startedAt": "2025-07-08T00:51:18.131816Z", + "program": "/home/users/nus/e1113744/native-sparse-attention-pytorch/train.py", + "codePath": "train.py", + "git": { + "remote": "git@github.com:RunjiaChen/native-sparse-attention-pytorch.git", + "commit": "8fe8317a961a9e897f67cdc1dc3b94d811ecfe71" + }, + "email": "runjia@u.nus.edu", + "root": "/home/users/nus/e1113744/native-sparse-attention-pytorch", + "host": "a2ap-dgx034", + "username": "e1113744", + "executable": "/home/users/nus/e1113744/llm-foundry/.venv/bin/python", + "codePathLocal": "train.py", + "cpu_count": 112, + "cpu_count_logical": 224, + "gpu": "NVIDIA H100 80GB HBM3", + "gpu_count": 1, + "disk": { + "/": { + "total": "1888556142592", + "used": "29079195648" + } + }, + "memory": { + "total": "2164194275328" + }, + "cpu": { + "count": 112, + "countLogical": 224 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + } + ], + "cudaVersion": "12.4" +} \ No newline at end of file diff --git a/wandb/run-20250708_085118-aw5dl503/files/wandb-summary.json b/wandb/run-20250708_085118-aw5dl503/files/wandb-summary.json new file mode 100644 index 0000000..6c37fe1 --- /dev/null +++ b/wandb/run-20250708_085118-aw5dl503/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":0}} \ No newline at end of file diff --git a/wandb/run-20250708_085118-aw5dl503/run-aw5dl503.wandb b/wandb/run-20250708_085118-aw5dl503/run-aw5dl503.wandb new file mode 100644 index 0000000..faabfcb Binary files /dev/null and b/wandb/run-20250708_085118-aw5dl503/run-aw5dl503.wandb differ diff --git a/wandb/run-20250708_085151-792413q0/files/config.yaml b/wandb/run-20250708_085151-792413q0/files/config.yaml new file mode 100644 index 0000000..34ed405 --- /dev/null +++ b/wandb/run-20250708_085151-792413q0/files/config.yaml @@ -0,0 +1,24 @@ +_wandb: + value: + cli_version: 0.18.7 + m: [] + python_version: 3.11.13 + t: + "1": + - 1 + - 55 + "2": + - 1 + - 55 + "3": + - 17 + - 23 + - 55 + "4": 3.11.13 + "5": 0.18.7 + "8": + - 5 + "10": + - 3 + "12": 0.18.7 + "13": linux-x86_64 diff --git a/wandb/run-20250708_085151-792413q0/files/wandb-metadata.json b/wandb/run-20250708_085151-792413q0/files/wandb-metadata.json new file mode 100644 index 0000000..efb4a86 --- /dev/null +++ b/wandb/run-20250708_085151-792413q0/files/wandb-metadata.json @@ -0,0 +1,43 @@ +{ + "os": "Linux-5.15.0-1076-nvidia-x86_64-with-glibc2.35", + "python": "3.11.13", + "startedAt": "2025-07-08T00:51:51.673787Z", + "program": "/home/users/nus/e1113744/native-sparse-attention-pytorch/train.py", + 
"codePath": "train.py", + "git": { + "remote": "git@github.com:RunjiaChen/native-sparse-attention-pytorch.git", + "commit": "8fe8317a961a9e897f67cdc1dc3b94d811ecfe71" + }, + "email": "runjia@u.nus.edu", + "root": "/home/users/nus/e1113744/native-sparse-attention-pytorch", + "host": "a2ap-dgx034", + "username": "e1113744", + "executable": "/home/users/nus/e1113744/llm-foundry/.venv/bin/python", + "codePathLocal": "train.py", + "cpu_count": 112, + "cpu_count_logical": 224, + "gpu": "NVIDIA H100 80GB HBM3", + "gpu_count": 1, + "disk": { + "/": { + "total": "1888556142592", + "used": "29079429120" + } + }, + "memory": { + "total": "2164194275328" + }, + "cpu": { + "count": 112, + "countLogical": 224 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + } + ], + "cudaVersion": "12.4" +} \ No newline at end of file diff --git a/wandb/run-20250708_085151-792413q0/files/wandb-summary.json b/wandb/run-20250708_085151-792413q0/files/wandb-summary.json new file mode 100644 index 0000000..6c37fe1 --- /dev/null +++ b/wandb/run-20250708_085151-792413q0/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":0}} \ No newline at end of file diff --git a/wandb/run-20250708_085151-792413q0/run-792413q0.wandb b/wandb/run-20250708_085151-792413q0/run-792413q0.wandb new file mode 100644 index 0000000..b1eb7d9 Binary files /dev/null and b/wandb/run-20250708_085151-792413q0/run-792413q0.wandb differ diff --git a/wandb/run-20250708_085331-fdnrdel8/files/requirements.txt b/wandb/run-20250708_085331-fdnrdel8/files/requirements.txt new file mode 100644 index 0000000..dbe4370 --- /dev/null +++ b/wandb/run-20250708_085331-fdnrdel8/files/requirements.txt @@ -0,0 +1,215 @@ +docker-pycreds==0.4.0 +onnxruntime==1.22.0 +google-api-core==2.25.1 +googleapis-common-protos==1.70.0 +opentelemetry-sdk==1.34.1 +zstd==1.5.7.2 +jmespath==1.0.1 +s3transfer==0.13.0 +huggingface-hub==0.33.2 +pytz==2025.2 +xxhash==3.5.0 +certifi==2025.6.15 +attrs==25.3.0 +opentelemetry-api==1.34.1 +validators==0.35.0 +pyasn1==0.6.1 +prompt_toolkit==3.0.51 +sentry-sdk==2.30.0 +python-snappy==0.7.3 +cachetools==5.5.2 +wcwidth==0.2.13 +itsdangerous==2.2.0 +nvidia-cudnn-cu12==9.5.1.17 +tenacity==9.1.2 +nvidia-cusparselt-cu12==0.6.3 +torchmetrics==1.7.1 +pandas==2.3.0 +tabulate==0.9.0 +apache-libcloud==3.8.0 +tzdata==2025.2 +shellingham==1.5.4 +pfzy==0.3.4 +setproctitle==1.3.6 +catalogue==2.0.10 +argcomplete==3.6.2 +nvidia-cusolver-cu12==11.7.1.2 +llm-foundry==0.22.0.dev0 +wheel==0.45.1 +typing-inspection==0.4.1 +GitPython==3.1.44 +isodate==0.7.2 +typer==0.16.0 +annotated-types==0.7.0 +anyio==4.9.0 +iniconfig==2.1.0 +networkx==3.5 +mlflow-skinny==2.21.3 +filelock==3.18.0 +coloredlogs==15.0.1 +databricks-sdk==0.57.0 +cycler==0.12.1 +pynvml==11.5.3 +soupsieve==2.7 +six==1.17.0 +mdurl==0.1.2 +circuitbreaker==2.1.3 +packaging==25.0 +boto3==1.38.42 +websockets==11.0.3 +nvidia-cufile-cu12==1.11.1.6 +sqlparse==0.5.3 +gitdb==4.0.12 +proto-plus==1.26.1 +torchvision==0.22.0 +nvidia-cuda-nvrtc-cu12==12.6.77 +ruamel.yaml.clib==0.2.12 +sniffio==1.3.1 +pycparser==2.22 +questionary==2.1.0 +idna==3.10 +azure-storage-blob==12.25.1 +oci==2.154.3 +pillow==11.2.1 +smmap==5.0.2 +google-cloud-core==2.4.3 +azure-identity==1.23.0 +click==8.2.1 +py-cpuinfo==9.0.0 +mlflow==2.21.3 +jaxtyping==0.3.2 +bcrypt==4.3.0 +torch-optimizer==0.3.0 +safetensors==0.5.3 +Pygments==2.19.2 +pytorch-ranger==0.1.1 +pyparsing==3.2.3 +importlib_metadata==8.7.0 +aiosignal==1.3.2 +contourpy==1.3.2 
+urllib3==2.5.0 +einops==0.8.1 +triton==3.3.0 +PyJWT==2.10.1 +humanfriendly==10.0 +nvidia-curand-cu12==10.3.7.77 +aiohttp==3.12.13 +cffi==1.17.1 +starlette==0.46.2 +Flask==3.1.1 +Brotli==1.1.0 +aiohappyeyeballs==2.6.1 +mosaicml-streaming==0.12.0 +pydantic==2.11.7 +threadpoolctl==3.6.0 +markdown-it-py==3.0.0 +types-python-dateutil==2.9.0.20250516 +nvidia-nccl-cu12==2.26.2 +python-dateutil==2.9.0.post0 +scipy==1.16.0 +greenlet==3.2.3 +wadler_lindig==0.1.7 +zipp==3.23.0 +Jinja2==3.1.6 +dill==0.3.8 +hyper-connections==0.2.1 +rsa==4.9.1 +wandb==0.18.7 +kiwisolver==1.4.8 +protobuf==5.29.5 +propcache==0.3.2 +fsspec==2025.5.1 +pyarrow==19.0.1 +sympy==1.14.0 +sentencepiece==0.2.0 +rotary-embedding-torch==0.8.6 +local-attention==1.11.1 +nvidia-cusparse-cu12==12.5.4.2 +yarl==1.20.1 +graphql-relay==3.2.0 +mpmath==1.3.0 +pytest==8.4.1 +joblib==1.5.1 +tqdm==4.67.1 +frozenlist==1.7.0 +Markdown==3.8.2 +accelerate==1.7.0 +google-resumable-media==2.7.2 +nvidia-nvjitlink-cu12==12.6.85 +numpy==2.1.3 +pip==24.0 +Mako==1.3.10 +multiprocess==0.70.16 +psutil==7.0.0 +datasets==3.6.0 +nvidia-cuda-runtime-cu12==12.6.77 +mosaicml==0.31.0 +nvidia-cublas-cu12==12.6.4.1 +uvicorn==0.34.3 +inquirerpy==0.3.4 +msal==1.32.3 +coolname==2.2.0 +cramjam==2.10.0 +platformdirs==4.3.8 +flatbuffers==25.2.10 +scikit-learn==1.7.0 +cloudpickle==3.1.1 +requests==2.32.4 +h11==0.16.0 +omegaconf==2.3.0 +ruamel.yaml==0.18.14 +pyasn1_modules==0.4.2 +tokenizers==0.21.2 +arrow==1.3.0 +slack_sdk==3.35.0 +gunicorn==23.0.0 +pydantic_core==2.33.2 +google-auth==2.40.3 +charset-normalizer==3.4.2 +azure-storage-file-datalake==12.20.0 +nvidia-cufft-cu12==11.3.0.4 +blinker==1.9.0 +torch==2.7.0 +fastapi==0.115.13 +fonttools==4.58.4 +regex==2024.11.6 +paramiko==3.5.1 +einx==0.3.0 +msal-extensions==1.3.1 +PyNaCl==1.5.0 +docker==7.1.0 +azure-core==1.34.0 +rich==14.0.0 +typing_extensions==4.14.0 +opentelemetry-semantic-conventions==0.55b1 +MarkupSafe==3.0.2 +graphql-core==3.2.6 +termcolor==3.1.0 +nvidia-cuda-cupti-cu12==12.6.80 +flash_attn==2.7.4.post1 +Werkzeug==3.1.3 +onnx==1.18.0 +lightning-utilities==0.14.3 +PyYAML==6.0.2 +setuptools==80.9.0 +cryptography==44.0.3 +antlr4-python3-runtime==4.9.3 +google-crc32c==1.7.1 +botocore==1.38.42 +nvidia-nvtx-cu12==12.6.77 +backoff==2.2.1 +graphene==3.4.3 +mosaicml-cli==0.7.3 +matplotlib==3.10.3 +beautifulsoup4==4.13.4 +transformers==4.51.3 +alembic==1.16.2 +multidict==6.5.0 +google-cloud-storage==2.10.0 +pyOpenSSL==24.3.0 +gql==3.5.3 +hf_transfer==0.1.9 +frozendict==2.4.6 +SQLAlchemy==2.0.41 +pluggy==1.6.0 +hf-xet==1.1.5 diff --git a/wandb/run-20250708_085331-fdnrdel8/files/wandb-metadata.json b/wandb/run-20250708_085331-fdnrdel8/files/wandb-metadata.json new file mode 100644 index 0000000..de789f7 --- /dev/null +++ b/wandb/run-20250708_085331-fdnrdel8/files/wandb-metadata.json @@ -0,0 +1,43 @@ +{ + "os": "Linux-5.15.0-1076-nvidia-x86_64-with-glibc2.35", + "python": "3.11.13", + "startedAt": "2025-07-08T00:53:31.228466Z", + "program": "/home/users/nus/e1113744/native-sparse-attention-pytorch/train.py", + "codePath": "train.py", + "git": { + "remote": "git@github.com:RunjiaChen/native-sparse-attention-pytorch.git", + "commit": "8fe8317a961a9e897f67cdc1dc3b94d811ecfe71" + }, + "email": "runjia@u.nus.edu", + "root": "/home/users/nus/e1113744/native-sparse-attention-pytorch", + "host": "a2ap-dgx034", + "username": "e1113744", + "executable": "/home/users/nus/e1113744/llm-foundry/.venv/bin/python", + "codePathLocal": "train.py", + "cpu_count": 112, + "cpu_count_logical": 224, + "gpu": "NVIDIA H100 80GB HBM3", + 
"gpu_count": 1, + "disk": { + "/": { + "total": "1888556142592", + "used": "29080027136" + } + }, + "memory": { + "total": "2164194275328" + }, + "cpu": { + "count": 112, + "countLogical": 224 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + } + ], + "cudaVersion": "12.4" +} \ No newline at end of file diff --git a/wandb/run-20250708_085331-fdnrdel8/run-fdnrdel8.wandb b/wandb/run-20250708_085331-fdnrdel8/run-fdnrdel8.wandb new file mode 100644 index 0000000..e18f2b4 Binary files /dev/null and b/wandb/run-20250708_085331-fdnrdel8/run-fdnrdel8.wandb differ diff --git a/wandb/run-20250708_085421-g296eupq/files/config.yaml b/wandb/run-20250708_085421-g296eupq/files/config.yaml new file mode 100644 index 0000000..34ed405 --- /dev/null +++ b/wandb/run-20250708_085421-g296eupq/files/config.yaml @@ -0,0 +1,24 @@ +_wandb: + value: + cli_version: 0.18.7 + m: [] + python_version: 3.11.13 + t: + "1": + - 1 + - 55 + "2": + - 1 + - 55 + "3": + - 17 + - 23 + - 55 + "4": 3.11.13 + "5": 0.18.7 + "8": + - 5 + "10": + - 3 + "12": 0.18.7 + "13": linux-x86_64 diff --git a/wandb/run-20250708_085421-g296eupq/files/wandb-metadata.json b/wandb/run-20250708_085421-g296eupq/files/wandb-metadata.json new file mode 100644 index 0000000..3937348 --- /dev/null +++ b/wandb/run-20250708_085421-g296eupq/files/wandb-metadata.json @@ -0,0 +1,43 @@ +{ + "os": "Linux-5.15.0-1076-nvidia-x86_64-with-glibc2.35", + "python": "3.11.13", + "startedAt": "2025-07-08T00:54:21.844685Z", + "program": "/home/users/nus/e1113744/native-sparse-attention-pytorch/train.py", + "codePath": "train.py", + "git": { + "remote": "git@github.com:RunjiaChen/native-sparse-attention-pytorch.git", + "commit": "8fe8317a961a9e897f67cdc1dc3b94d811ecfe71" + }, + "email": "runjia@u.nus.edu", + "root": "/home/users/nus/e1113744/native-sparse-attention-pytorch", + "host": "a2ap-dgx034", + "username": "e1113744", + "executable": "/home/users/nus/e1113744/llm-foundry/.venv/bin/python", + "codePathLocal": "train.py", + "cpu_count": 112, + "cpu_count_logical": 224, + "gpu": "NVIDIA H100 80GB HBM3", + "gpu_count": 1, + "disk": { + "/": { + "total": "1888556142592", + "used": "29080342528" + } + }, + "memory": { + "total": "2164194275328" + }, + "cpu": { + "count": 112, + "countLogical": 224 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + } + ], + "cudaVersion": "12.4" +} \ No newline at end of file diff --git a/wandb/run-20250708_085421-g296eupq/files/wandb-summary.json b/wandb/run-20250708_085421-g296eupq/files/wandb-summary.json new file mode 100644 index 0000000..6c37fe1 --- /dev/null +++ b/wandb/run-20250708_085421-g296eupq/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":0}} \ No newline at end of file diff --git a/wandb/run-20250708_085421-g296eupq/run-g296eupq.wandb b/wandb/run-20250708_085421-g296eupq/run-g296eupq.wandb new file mode 100644 index 0000000..7634122 Binary files /dev/null and b/wandb/run-20250708_085421-g296eupq/run-g296eupq.wandb differ diff --git a/wandb/run-20250708_091756-wmmb56r6/files/config.yaml b/wandb/run-20250708_091756-wmmb56r6/files/config.yaml new file mode 100644 index 0000000..34ed405 --- /dev/null +++ b/wandb/run-20250708_091756-wmmb56r6/files/config.yaml @@ -0,0 +1,24 @@ +_wandb: + value: + cli_version: 0.18.7 + m: [] + python_version: 3.11.13 + t: + "1": + - 1 + - 55 + "2": + - 1 + - 55 + "3": + - 17 + - 23 + - 55 + "4": 
3.11.13 + "5": 0.18.7 + "8": + - 5 + "10": + - 3 + "12": 0.18.7 + "13": linux-x86_64 diff --git a/wandb/run-20250708_091756-wmmb56r6/files/wandb-metadata.json b/wandb/run-20250708_091756-wmmb56r6/files/wandb-metadata.json new file mode 100644 index 0000000..e1d4879 --- /dev/null +++ b/wandb/run-20250708_091756-wmmb56r6/files/wandb-metadata.json @@ -0,0 +1,43 @@ +{ + "os": "Linux-5.15.0-1076-nvidia-x86_64-with-glibc2.35", + "python": "3.11.13", + "startedAt": "2025-07-08T01:17:56.452867Z", + "program": "/home/users/nus/e1113744/native-sparse-attention-pytorch/train.py", + "codePath": "train.py", + "git": { + "remote": "git@github.com:RunjiaChen/native-sparse-attention-pytorch.git", + "commit": "8fe8317a961a9e897f67cdc1dc3b94d811ecfe71" + }, + "email": "runjia@u.nus.edu", + "root": "/home/users/nus/e1113744/native-sparse-attention-pytorch", + "host": "a2ap-dgx034", + "username": "e1113744", + "executable": "/home/users/nus/e1113744/llm-foundry/.venv/bin/python", + "codePathLocal": "train.py", + "cpu_count": 112, + "cpu_count_logical": 224, + "gpu": "NVIDIA H100 80GB HBM3", + "gpu_count": 1, + "disk": { + "/": { + "total": "1888556142592", + "used": "29087944704" + } + }, + "memory": { + "total": "2164194275328" + }, + "cpu": { + "count": 112, + "countLogical": 224 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + } + ], + "cudaVersion": "12.4" +} \ No newline at end of file diff --git a/wandb/run-20250708_091756-wmmb56r6/files/wandb-summary.json b/wandb/run-20250708_091756-wmmb56r6/files/wandb-summary.json new file mode 100644 index 0000000..4e355fc --- /dev/null +++ b/wandb/run-20250708_091756-wmmb56r6/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":1}} \ No newline at end of file diff --git a/wandb/run-20250708_091756-wmmb56r6/run-wmmb56r6.wandb b/wandb/run-20250708_091756-wmmb56r6/run-wmmb56r6.wandb new file mode 100644 index 0000000..7dc9b1b Binary files /dev/null and b/wandb/run-20250708_091756-wmmb56r6/run-wmmb56r6.wandb differ diff --git a/wandb/run-20250708_091817-s4wd9h25/files/config.yaml b/wandb/run-20250708_091817-s4wd9h25/files/config.yaml new file mode 100644 index 0000000..34ed405 --- /dev/null +++ b/wandb/run-20250708_091817-s4wd9h25/files/config.yaml @@ -0,0 +1,24 @@ +_wandb: + value: + cli_version: 0.18.7 + m: [] + python_version: 3.11.13 + t: + "1": + - 1 + - 55 + "2": + - 1 + - 55 + "3": + - 17 + - 23 + - 55 + "4": 3.11.13 + "5": 0.18.7 + "8": + - 5 + "10": + - 3 + "12": 0.18.7 + "13": linux-x86_64 diff --git a/wandb/run-20250708_091817-s4wd9h25/files/wandb-metadata.json b/wandb/run-20250708_091817-s4wd9h25/files/wandb-metadata.json new file mode 100644 index 0000000..6166e43 --- /dev/null +++ b/wandb/run-20250708_091817-s4wd9h25/files/wandb-metadata.json @@ -0,0 +1,43 @@ +{ + "os": "Linux-5.15.0-1076-nvidia-x86_64-with-glibc2.35", + "python": "3.11.13", + "startedAt": "2025-07-08T01:18:17.113494Z", + "program": "/home/users/nus/e1113744/native-sparse-attention-pytorch/train.py", + "codePath": "train.py", + "git": { + "remote": "git@github.com:RunjiaChen/native-sparse-attention-pytorch.git", + "commit": "8fe8317a961a9e897f67cdc1dc3b94d811ecfe71" + }, + "email": "runjia@u.nus.edu", + "root": "/home/users/nus/e1113744/native-sparse-attention-pytorch", + "host": "a2ap-dgx034", + "username": "e1113744", + "executable": "/home/users/nus/e1113744/llm-foundry/.venv/bin/python", + "codePathLocal": "train.py", + "cpu_count": 112, + "cpu_count_logical": 224, + "gpu": "NVIDIA 
H100 80GB HBM3", + "gpu_count": 1, + "disk": { + "/": { + "total": "1888556142592", + "used": "29088096256" + } + }, + "memory": { + "total": "2164194275328" + }, + "cpu": { + "count": 112, + "countLogical": 224 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + } + ], + "cudaVersion": "12.4" +} \ No newline at end of file diff --git a/wandb/run-20250708_091817-s4wd9h25/files/wandb-summary.json b/wandb/run-20250708_091817-s4wd9h25/files/wandb-summary.json new file mode 100644 index 0000000..4e355fc --- /dev/null +++ b/wandb/run-20250708_091817-s4wd9h25/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":1}} \ No newline at end of file diff --git a/wandb/run-20250708_091817-s4wd9h25/run-s4wd9h25.wandb b/wandb/run-20250708_091817-s4wd9h25/run-s4wd9h25.wandb new file mode 100644 index 0000000..aab06a9 Binary files /dev/null and b/wandb/run-20250708_091817-s4wd9h25/run-s4wd9h25.wandb differ diff --git a/wandb/run-20250708_092024-mxbeuycs/files/config.yaml b/wandb/run-20250708_092024-mxbeuycs/files/config.yaml new file mode 100644 index 0000000..34ed405 --- /dev/null +++ b/wandb/run-20250708_092024-mxbeuycs/files/config.yaml @@ -0,0 +1,24 @@ +_wandb: + value: + cli_version: 0.18.7 + m: [] + python_version: 3.11.13 + t: + "1": + - 1 + - 55 + "2": + - 1 + - 55 + "3": + - 17 + - 23 + - 55 + "4": 3.11.13 + "5": 0.18.7 + "8": + - 5 + "10": + - 3 + "12": 0.18.7 + "13": linux-x86_64 diff --git a/wandb/run-20250708_092024-mxbeuycs/files/wandb-metadata.json b/wandb/run-20250708_092024-mxbeuycs/files/wandb-metadata.json new file mode 100644 index 0000000..170b0f2 --- /dev/null +++ b/wandb/run-20250708_092024-mxbeuycs/files/wandb-metadata.json @@ -0,0 +1,43 @@ +{ + "os": "Linux-5.15.0-1076-nvidia-x86_64-with-glibc2.35", + "python": "3.11.13", + "startedAt": "2025-07-08T01:20:24.559917Z", + "program": "/home/users/nus/e1113744/native-sparse-attention-pytorch/train.py", + "codePath": "train.py", + "git": { + "remote": "git@github.com:RunjiaChen/native-sparse-attention-pytorch.git", + "commit": "8fe8317a961a9e897f67cdc1dc3b94d811ecfe71" + }, + "email": "runjia@u.nus.edu", + "root": "/home/users/nus/e1113744/native-sparse-attention-pytorch", + "host": "a2ap-dgx034", + "username": "e1113744", + "executable": "/home/users/nus/e1113744/llm-foundry/.venv/bin/python", + "codePathLocal": "train.py", + "cpu_count": 112, + "cpu_count_logical": 224, + "gpu": "NVIDIA H100 80GB HBM3", + "gpu_count": 1, + "disk": { + "/": { + "total": "1888556142592", + "used": "29088919552" + } + }, + "memory": { + "total": "2164194275328" + }, + "cpu": { + "count": 112, + "countLogical": 224 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + } + ], + "cudaVersion": "12.4" +} \ No newline at end of file diff --git a/wandb/run-20250708_092024-mxbeuycs/files/wandb-summary.json b/wandb/run-20250708_092024-mxbeuycs/files/wandb-summary.json new file mode 100644 index 0000000..4e355fc --- /dev/null +++ b/wandb/run-20250708_092024-mxbeuycs/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":1}} \ No newline at end of file diff --git a/wandb/run-20250708_092024-mxbeuycs/run-mxbeuycs.wandb b/wandb/run-20250708_092024-mxbeuycs/run-mxbeuycs.wandb new file mode 100644 index 0000000..5f2a215 Binary files /dev/null and b/wandb/run-20250708_092024-mxbeuycs/run-mxbeuycs.wandb differ diff --git a/wandb/run-20250708_092048-k4evr4pi/files/config.yaml 
b/wandb/run-20250708_092048-k4evr4pi/files/config.yaml new file mode 100644 index 0000000..34ed405 --- /dev/null +++ b/wandb/run-20250708_092048-k4evr4pi/files/config.yaml @@ -0,0 +1,24 @@ +_wandb: + value: + cli_version: 0.18.7 + m: [] + python_version: 3.11.13 + t: + "1": + - 1 + - 55 + "2": + - 1 + - 55 + "3": + - 17 + - 23 + - 55 + "4": 3.11.13 + "5": 0.18.7 + "8": + - 5 + "10": + - 3 + "12": 0.18.7 + "13": linux-x86_64 diff --git a/wandb/run-20250708_092048-k4evr4pi/files/wandb-metadata.json b/wandb/run-20250708_092048-k4evr4pi/files/wandb-metadata.json new file mode 100644 index 0000000..6b90c94 --- /dev/null +++ b/wandb/run-20250708_092048-k4evr4pi/files/wandb-metadata.json @@ -0,0 +1,43 @@ +{ + "os": "Linux-5.15.0-1076-nvidia-x86_64-with-glibc2.35", + "python": "3.11.13", + "startedAt": "2025-07-08T01:20:48.673820Z", + "program": "/home/users/nus/e1113744/native-sparse-attention-pytorch/train.py", + "codePath": "train.py", + "git": { + "remote": "git@github.com:RunjiaChen/native-sparse-attention-pytorch.git", + "commit": "8fe8317a961a9e897f67cdc1dc3b94d811ecfe71" + }, + "email": "runjia@u.nus.edu", + "root": "/home/users/nus/e1113744/native-sparse-attention-pytorch", + "host": "a2ap-dgx034", + "username": "e1113744", + "executable": "/home/users/nus/e1113744/llm-foundry/.venv/bin/python", + "codePathLocal": "train.py", + "cpu_count": 112, + "cpu_count_logical": 224, + "gpu": "NVIDIA H100 80GB HBM3", + "gpu_count": 1, + "disk": { + "/": { + "total": "1888556142592", + "used": "29089042432" + } + }, + "memory": { + "total": "2164194275328" + }, + "cpu": { + "count": 112, + "countLogical": 224 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + } + ], + "cudaVersion": "12.4" +} \ No newline at end of file diff --git a/wandb/run-20250708_092048-k4evr4pi/files/wandb-summary.json b/wandb/run-20250708_092048-k4evr4pi/files/wandb-summary.json new file mode 100644 index 0000000..1d52051 --- /dev/null +++ b/wandb/run-20250708_092048-k4evr4pi/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":2}} \ No newline at end of file diff --git a/wandb/run-20250708_092048-k4evr4pi/run-k4evr4pi.wandb b/wandb/run-20250708_092048-k4evr4pi/run-k4evr4pi.wandb new file mode 100644 index 0000000..21c36b6 Binary files /dev/null and b/wandb/run-20250708_092048-k4evr4pi/run-k4evr4pi.wandb differ diff --git a/wandb/run-20250708_092146-lw8l9lnj/files/config.yaml b/wandb/run-20250708_092146-lw8l9lnj/files/config.yaml new file mode 100644 index 0000000..ca5f6f6 --- /dev/null +++ b/wandb/run-20250708_092146-lw8l9lnj/files/config.yaml @@ -0,0 +1,25 @@ +_wandb: + value: + cli_version: 0.18.7 + m: [] + python_version: 3.11.13 + t: + "1": + - 1 + - 55 + "2": + - 1 + - 55 + "3": + - 17 + - 23 + - 55 + - 61 + "4": 3.11.13 + "5": 0.18.7 + "8": + - 5 + "10": + - 3 + "12": 0.18.7 + "13": linux-x86_64 diff --git a/wandb/run-20250708_092146-lw8l9lnj/files/wandb-metadata.json b/wandb/run-20250708_092146-lw8l9lnj/files/wandb-metadata.json new file mode 100644 index 0000000..c767ccf --- /dev/null +++ b/wandb/run-20250708_092146-lw8l9lnj/files/wandb-metadata.json @@ -0,0 +1,43 @@ +{ + "os": "Linux-5.15.0-1076-nvidia-x86_64-with-glibc2.35", + "python": "3.11.13", + "startedAt": "2025-07-08T01:21:46.340269Z", + "program": "/home/users/nus/e1113744/native-sparse-attention-pytorch/train.py", + "codePath": "train.py", + "git": { + "remote": "git@github.com:RunjiaChen/native-sparse-attention-pytorch.git", + "commit": 
"8fe8317a961a9e897f67cdc1dc3b94d811ecfe71" + }, + "email": "runjia@u.nus.edu", + "root": "/home/users/nus/e1113744/native-sparse-attention-pytorch", + "host": "a2ap-dgx034", + "username": "e1113744", + "executable": "/home/users/nus/e1113744/llm-foundry/.venv/bin/python", + "codePathLocal": "train.py", + "cpu_count": 112, + "cpu_count_logical": 224, + "gpu": "NVIDIA H100 80GB HBM3", + "gpu_count": 1, + "disk": { + "/": { + "total": "1888556142592", + "used": "29089415168" + } + }, + "memory": { + "total": "2164194275328" + }, + "cpu": { + "count": 112, + "countLogical": 224 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + } + ], + "cudaVersion": "12.4" +} \ No newline at end of file diff --git a/wandb/run-20250708_092146-lw8l9lnj/files/wandb-summary.json b/wandb/run-20250708_092146-lw8l9lnj/files/wandb-summary.json new file mode 100644 index 0000000..31a72e0 --- /dev/null +++ b/wandb/run-20250708_092146-lw8l9lnj/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":117},"_step":1883,"loss":1.665515422821045,"_timestamp":1.7519378234756608e+09,"valid_loss":1.703580379486084,"_runtime":117.163514931} \ No newline at end of file diff --git a/wandb/run-20250708_092146-lw8l9lnj/run-lw8l9lnj.wandb b/wandb/run-20250708_092146-lw8l9lnj/run-lw8l9lnj.wandb new file mode 100644 index 0000000..9d37459 Binary files /dev/null and b/wandb/run-20250708_092146-lw8l9lnj/run-lw8l9lnj.wandb differ diff --git a/wandb/run-20250708_092417-72fudjdt/files/config.yaml b/wandb/run-20250708_092417-72fudjdt/files/config.yaml new file mode 100644 index 0000000..ca5f6f6 --- /dev/null +++ b/wandb/run-20250708_092417-72fudjdt/files/config.yaml @@ -0,0 +1,25 @@ +_wandb: + value: + cli_version: 0.18.7 + m: [] + python_version: 3.11.13 + t: + "1": + - 1 + - 55 + "2": + - 1 + - 55 + "3": + - 17 + - 23 + - 55 + - 61 + "4": 3.11.13 + "5": 0.18.7 + "8": + - 5 + "10": + - 3 + "12": 0.18.7 + "13": linux-x86_64 diff --git a/wandb/run-20250708_092417-72fudjdt/files/wandb-metadata.json b/wandb/run-20250708_092417-72fudjdt/files/wandb-metadata.json new file mode 100644 index 0000000..cdc0cfc --- /dev/null +++ b/wandb/run-20250708_092417-72fudjdt/files/wandb-metadata.json @@ -0,0 +1,43 @@ +{ + "os": "Linux-5.15.0-1076-nvidia-x86_64-with-glibc2.35", + "python": "3.11.13", + "startedAt": "2025-07-08T01:24:17.901516Z", + "program": "/home/users/nus/e1113744/native-sparse-attention-pytorch/train.py", + "codePath": "train.py", + "git": { + "remote": "git@github.com:RunjiaChen/native-sparse-attention-pytorch.git", + "commit": "8fe8317a961a9e897f67cdc1dc3b94d811ecfe71" + }, + "email": "runjia@u.nus.edu", + "root": "/home/users/nus/e1113744/native-sparse-attention-pytorch", + "host": "a2ap-dgx034", + "username": "e1113744", + "executable": "/home/users/nus/e1113744/llm-foundry/.venv/bin/python", + "codePathLocal": "train.py", + "cpu_count": 112, + "cpu_count_logical": 224, + "gpu": "NVIDIA H100 80GB HBM3", + "gpu_count": 1, + "disk": { + "/": { + "total": "1888556142592", + "used": "29098737664" + } + }, + "memory": { + "total": "2164194275328" + }, + "cpu": { + "count": 112, + "countLogical": 224 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + } + ], + "cudaVersion": "12.4" +} \ No newline at end of file diff --git a/wandb/run-20250708_092417-72fudjdt/files/wandb-summary.json b/wandb/run-20250708_092417-72fudjdt/files/wandb-summary.json new file 
mode 100644 index 0000000..35319c5 --- /dev/null +++ b/wandb/run-20250708_092417-72fudjdt/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":9184},"valid_loss":1.009443759918213,"_runtime":9184.695781432,"_step":99999,"loss":1.03251051902771,"_timestamp":1.751947042589338e+09} \ No newline at end of file diff --git a/wandb/run-20250708_092417-72fudjdt/run-72fudjdt.wandb b/wandb/run-20250708_092417-72fudjdt/run-72fudjdt.wandb new file mode 100644 index 0000000..b13b6fa Binary files /dev/null and b/wandb/run-20250708_092417-72fudjdt/run-72fudjdt.wandb differ diff --git a/wandb/run-20250708_100655-xbr2eet2/files/config.yaml b/wandb/run-20250708_100655-xbr2eet2/files/config.yaml new file mode 100644 index 0000000..34ed405 --- /dev/null +++ b/wandb/run-20250708_100655-xbr2eet2/files/config.yaml @@ -0,0 +1,24 @@ +_wandb: + value: + cli_version: 0.18.7 + m: [] + python_version: 3.11.13 + t: + "1": + - 1 + - 55 + "2": + - 1 + - 55 + "3": + - 17 + - 23 + - 55 + "4": 3.11.13 + "5": 0.18.7 + "8": + - 5 + "10": + - 3 + "12": 0.18.7 + "13": linux-x86_64 diff --git a/wandb/run-20250708_100655-xbr2eet2/files/wandb-metadata.json b/wandb/run-20250708_100655-xbr2eet2/files/wandb-metadata.json new file mode 100644 index 0000000..d68610c --- /dev/null +++ b/wandb/run-20250708_100655-xbr2eet2/files/wandb-metadata.json @@ -0,0 +1,43 @@ +{ + "os": "Linux-5.15.0-1076-nvidia-x86_64-with-glibc2.35", + "python": "3.11.13", + "startedAt": "2025-07-08T02:06:55.561010Z", + "program": "/home/users/nus/e1113744/native-sparse-attention-pytorch/train.py", + "codePath": "train.py", + "git": { + "remote": "git@github.com:RunjiaChen/native-sparse-attention-pytorch.git", + "commit": "8fe8317a961a9e897f67cdc1dc3b94d811ecfe71" + }, + "email": "runjia@u.nus.edu", + "root": "/home/users/nus/e1113744/native-sparse-attention-pytorch", + "host": "a2ap-dgx034", + "username": "e1113744", + "executable": "/home/users/nus/e1113744/llm-foundry/.venv/bin/python", + "codePathLocal": "train.py", + "cpu_count": 112, + "cpu_count_logical": 224, + "gpu": "NVIDIA H100 80GB HBM3", + "gpu_count": 1, + "disk": { + "/": { + "total": "1888556142592", + "used": "29111660544" + } + }, + "memory": { + "total": "2164194275328" + }, + "cpu": { + "count": 112, + "countLogical": 224 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + } + ], + "cudaVersion": "12.4" +} \ No newline at end of file diff --git a/wandb/run-20250708_100655-xbr2eet2/files/wandb-summary.json b/wandb/run-20250708_100655-xbr2eet2/files/wandb-summary.json new file mode 100644 index 0000000..6c37fe1 --- /dev/null +++ b/wandb/run-20250708_100655-xbr2eet2/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":0}} \ No newline at end of file diff --git a/wandb/run-20250708_100655-xbr2eet2/run-xbr2eet2.wandb b/wandb/run-20250708_100655-xbr2eet2/run-xbr2eet2.wandb new file mode 100644 index 0000000..7cfcf4c Binary files /dev/null and b/wandb/run-20250708_100655-xbr2eet2/run-xbr2eet2.wandb differ diff --git a/wandb/run-20250708_100743-bp26doib/files/config.yaml b/wandb/run-20250708_100743-bp26doib/files/config.yaml new file mode 100644 index 0000000..ca5f6f6 --- /dev/null +++ b/wandb/run-20250708_100743-bp26doib/files/config.yaml @@ -0,0 +1,25 @@ +_wandb: + value: + cli_version: 0.18.7 + m: [] + python_version: 3.11.13 + t: + "1": + - 1 + - 55 + "2": + - 1 + - 55 + "3": + - 17 + - 23 + - 55 + - 61 + "4": 3.11.13 + "5": 0.18.7 + "8": + - 5 + "10": + - 3 + "12": 0.18.7 + "13": 
linux-x86_64 diff --git a/wandb/run-20250708_100743-bp26doib/files/wandb-metadata.json b/wandb/run-20250708_100743-bp26doib/files/wandb-metadata.json new file mode 100644 index 0000000..06b7260 --- /dev/null +++ b/wandb/run-20250708_100743-bp26doib/files/wandb-metadata.json @@ -0,0 +1,43 @@ +{ + "os": "Linux-5.15.0-1076-nvidia-x86_64-with-glibc2.35", + "python": "3.11.13", + "startedAt": "2025-07-08T02:07:43.207774Z", + "program": "/home/users/nus/e1113744/native-sparse-attention-pytorch/train.py", + "codePath": "train.py", + "git": { + "remote": "git@github.com:RunjiaChen/native-sparse-attention-pytorch.git", + "commit": "8fe8317a961a9e897f67cdc1dc3b94d811ecfe71" + }, + "email": "runjia@u.nus.edu", + "root": "/home/users/nus/e1113744/native-sparse-attention-pytorch", + "host": "a2ap-dgx034", + "username": "e1113744", + "executable": "/home/users/nus/e1113744/llm-foundry/.venv/bin/python", + "codePathLocal": "train.py", + "cpu_count": 112, + "cpu_count_logical": 224, + "gpu": "NVIDIA H100 80GB HBM3", + "gpu_count": 1, + "disk": { + "/": { + "total": "1888556142592", + "used": "29111173120" + } + }, + "memory": { + "total": "2164194275328" + }, + "cpu": { + "count": 112, + "countLogical": 224 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + } + ], + "cudaVersion": "12.4" +} \ No newline at end of file diff --git a/wandb/run-20250708_100743-bp26doib/files/wandb-summary.json b/wandb/run-20250708_100743-bp26doib/files/wandb-summary.json new file mode 100644 index 0000000..a55553d --- /dev/null +++ b/wandb/run-20250708_100743-bp26doib/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":2},"_runtime":2.200631544,"_step":0,"loss":5.773008346557617,"_timestamp":1.7519404648534203e+09,"valid_loss":5.134904861450195} \ No newline at end of file diff --git a/wandb/run-20250708_100743-bp26doib/run-bp26doib.wandb b/wandb/run-20250708_100743-bp26doib/run-bp26doib.wandb new file mode 100644 index 0000000..e5ef9f2 Binary files /dev/null and b/wandb/run-20250708_100743-bp26doib/run-bp26doib.wandb differ diff --git a/wandb/run-20250708_100933-5h0en8f2/files/config.yaml b/wandb/run-20250708_100933-5h0en8f2/files/config.yaml new file mode 100644 index 0000000..ca5f6f6 --- /dev/null +++ b/wandb/run-20250708_100933-5h0en8f2/files/config.yaml @@ -0,0 +1,25 @@ +_wandb: + value: + cli_version: 0.18.7 + m: [] + python_version: 3.11.13 + t: + "1": + - 1 + - 55 + "2": + - 1 + - 55 + "3": + - 17 + - 23 + - 55 + - 61 + "4": 3.11.13 + "5": 0.18.7 + "8": + - 5 + "10": + - 3 + "12": 0.18.7 + "13": linux-x86_64 diff --git a/wandb/run-20250708_100933-5h0en8f2/files/wandb-metadata.json b/wandb/run-20250708_100933-5h0en8f2/files/wandb-metadata.json new file mode 100644 index 0000000..ae378a4 --- /dev/null +++ b/wandb/run-20250708_100933-5h0en8f2/files/wandb-metadata.json @@ -0,0 +1,43 @@ +{ + "os": "Linux-5.15.0-1076-nvidia-x86_64-with-glibc2.35", + "python": "3.11.13", + "startedAt": "2025-07-08T02:09:33.037948Z", + "program": "/home/users/nus/e1113744/native-sparse-attention-pytorch/train.py", + "codePath": "train.py", + "git": { + "remote": "git@github.com:RunjiaChen/native-sparse-attention-pytorch.git", + "commit": "8fe8317a961a9e897f67cdc1dc3b94d811ecfe71" + }, + "email": "runjia@u.nus.edu", + "root": "/home/users/nus/e1113744/native-sparse-attention-pytorch", + "host": "a2ap-dgx034", + "username": "e1113744", + "executable": "/home/users/nus/e1113744/llm-foundry/.venv/bin/python", + "codePathLocal": "train.py", + 
"cpu_count": 112, + "cpu_count_logical": 224, + "gpu": "NVIDIA H100 80GB HBM3", + "gpu_count": 1, + "disk": { + "/": { + "total": "1888556142592", + "used": "29111791616" + } + }, + "memory": { + "total": "2164194275328" + }, + "cpu": { + "count": 112, + "countLogical": 224 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + } + ], + "cudaVersion": "12.4" +} \ No newline at end of file diff --git a/wandb/run-20250708_100933-5h0en8f2/files/wandb-summary.json b/wandb/run-20250708_100933-5h0en8f2/files/wandb-summary.json new file mode 100644 index 0000000..9133f62 --- /dev/null +++ b/wandb/run-20250708_100933-5h0en8f2/files/wandb-summary.json @@ -0,0 +1 @@ +{"loss":0.7260929346084595,"_timestamp":1.7519495866363919e+09,"_wandb":{"runtime":9013},"valid_loss":0.73642897605896,"_runtime":9013.610222683,"_step":99999} \ No newline at end of file diff --git a/wandb/run-20250708_100933-5h0en8f2/run-5h0en8f2.wandb b/wandb/run-20250708_100933-5h0en8f2/run-5h0en8f2.wandb new file mode 100644 index 0000000..5830bb2 Binary files /dev/null and b/wandb/run-20250708_100933-5h0en8f2/run-5h0en8f2.wandb differ