Skip to content
This repository was archived by the owner on Apr 20, 2026. It is now read-only.
16 changes: 9 additions & 7 deletions analysis/dashboard/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,8 +102,8 @@ def render_sidebar(logs_dir, runs):
with st.sidebar.expander("📊 ISL/OSL", expanded=False):
isl_osl_pairs = set()
for run in sorted_runs:
if run.profiler.isl and run.profiler.osl:
isl_osl_pairs.add(f"{run.profiler.isl}/{run.profiler.osl}")
if run.profiler_metadata.isl and run.profiler_metadata.osl:
isl_osl_pairs.add(f"{run.profiler_metadata.isl}/{run.profiler_metadata.osl}")

if isl_osl_pairs:
pair_options = sorted(isl_osl_pairs)
Expand All @@ -115,7 +115,9 @@ def render_sidebar(logs_dir, runs):
)

if selected_pairs:
sorted_runs = [r for r in sorted_runs if f"{r.profiler.isl}/{r.profiler.osl}" in selected_pairs]
sorted_runs = [
r for r in sorted_runs if f"{r.profiler_metadata.isl}/{r.profiler_metadata.osl}" in selected_pairs
]
else:
st.caption("No ISL/OSL information available")

Expand Down Expand Up @@ -176,8 +178,8 @@ def render_sidebar(logs_dir, runs):

for run in sorted_runs:
topology = run.metadata.topology_label
isl = run.profiler.isl
osl = run.profiler.osl
isl = run.profiler_metadata.isl
osl = run.profiler_metadata.osl
gpu_type = run.metadata.gpu_type
gpu_suffix = f" [{gpu_type}]" if gpu_type else ""
# Include job ID to ensure unique labels
Expand Down Expand Up @@ -284,7 +286,7 @@ def render_sidebar(logs_dir, runs):
f"{run.job_id} | "
f"{run.metadata.agg_workers}A | "
f"{total_gpus} GPUs | "
f"{run.profiler.isl}/{run.profiler.osl}"
f"{run.profiler_metadata.isl}/{run.profiler_metadata.osl}"
)
else:
run_id = (
Expand All @@ -298,7 +300,7 @@ def render_sidebar(logs_dir, runs):
f"{run.job_id} | "
f"{run.metadata.prefill_workers}P{run.metadata.decode_workers}D | "
f"{prefill_gpus}/{decode_gpus} | "
f"{run.profiler.isl}/{run.profiler.osl}"
f"{run.profiler_metadata.isl}/{run.profiler_metadata.osl}"
)

if run.metadata.gpu_type:
Expand Down
6 changes: 5 additions & 1 deletion analysis/dashboard/components.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,11 @@ def _node_to_dict(node) -> dict:
Temporary converter for compatibility with existing visualization code.
"""
return {
"node_info": node.node_info,
"node_info": {
"node": node.node_name,
"worker_type": node.worker_type,
"worker_id": node.worker_id,
},
"prefill_batches": [_batch_to_dict(b) for b in node.batches],
"memory_snapshots": [_memory_to_dict(m) for m in node.memory_snapshots],
"config": node.config,
Expand Down
4 changes: 2 additions & 2 deletions analysis/dashboard/config_tab.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,10 @@ def render(filtered_runs: list):
with col2:
st.metric("GPU", config_data["summary"]["gpu_type"])
with col3:
st.metric("ISL/OSL", f"{run.profiler.isl}/{run.profiler.osl}")
st.metric("ISL/OSL", f"{run.profiler_metadata.isl}/{run.profiler_metadata.osl}")
with col4:
gpu_type_suffix = f" ({run.metadata.gpu_type})" if run.metadata.gpu_type else ""
st.metric("Profiler", f"{run.profiler.profiler_type}{gpu_type_suffix}")
st.metric("Profiler", f"{run.profiler_metadata.profiler_type}{gpu_type_suffix}")

st.caption(f"Model: {config_data['summary']['model']}")
st.divider()
Expand Down
20 changes: 10 additions & 10 deletions analysis/dashboard/node_metrics_tab.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,16 @@
import streamlit as st

from analysis.dashboard.components import (
load_node_metrics,
create_node_throughput_graph,
create_kv_cache_utilization_graph,
create_queue_depth_graph,
create_node_inflight_requests_graph,
create_decode_running_requests_graph,
create_decode_disagg_stacked_graph,
create_decode_gen_throughput_graph,
create_decode_transfer_req_graph,
create_decode_prealloc_req_graph,
create_decode_disagg_stacked_graph,
create_decode_running_requests_graph,
create_decode_transfer_req_graph,
create_kv_cache_utilization_graph,
create_node_inflight_requests_graph,
create_node_throughput_graph,
create_queue_depth_graph,
load_node_metrics,
)
from analysis.srtlog.visualizations import aggregate_all_nodes, group_nodes_by_dp

Expand Down Expand Up @@ -72,8 +72,8 @@ def render(filtered_runs: list, logs_dir: str):
"agg_workers": run.metadata.agg_workers,
"gpus_per_node": run.metadata.gpus_per_node,
"total_gpus": run.total_gpus,
"isl": run.profiler.isl,
"osl": run.profiler.osl,
"isl": run.profiler_metadata.isl,
"osl": run.profiler_metadata.osl,
"gpu_type": run.metadata.gpu_type,
}
all_node_metrics.extend(node_metrics)
Expand Down
56 changes: 49 additions & 7 deletions analysis/dashboard/rate_match_tab.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,48 @@
from analysis.dashboard.components import load_node_metrics


def _parse_timestamp(timestamp: str) -> datetime:
    """Parse a log timestamp written in one of several known layouts.

    Supports:
    - YYYY-MM-DD HH:MM:SS
    - ISO 8601 with a ``T`` separator, optional fractional seconds and an
      optional trailing ``Z``: 2025-12-30T15:52:38.206058Z
    - MM/DD/YYYY-HH:MM:SS (TRTLLM format)

    Leading and trailing whitespace (e.g. a newline left over from a log
    line) is ignored. A trailing ``Z`` is dropped, not converted, so the
    layouts listed above all produce naive datetimes that can be subtracted
    from one another.

    Purely numeric fractional seconds are normalised to exactly six digits
    (padded with zeros or truncated) before being handed to
    ``datetime.fromisoformat``, so that parsing does not depend on how many
    fractional digits the running Python version tolerates.

    Args:
        timestamp: Timestamp string in one of the supported formats

    Returns:
        datetime object

    Raises:
        TypeError: If timestamp is not a string
        ValueError: If timestamp format is not recognized
    """
    if not isinstance(timestamp, str):
        # Fail with a clear message instead of an error from deep inside strptime.
        raise TypeError(f"timestamp must be a str, not {type(timestamp).__name__}")

    text = timestamp.strip()

    # Try YYYY-MM-DD HH:MM:SS format first (most common)
    try:
        return datetime.strptime(text, "%Y-%m-%d %H:%M:%S")
    except ValueError:
        pass

    # Try ISO 8601 format (SGLang)
    iso_text = text.rstrip("Z")
    whole, dot, fraction = iso_text.partition(".")
    try:
        if dot:
            if fraction.isdigit():
                # Normalise to microsecond precision: 1-9+ digits -> exactly 6.
                iso_text = f"{whole}.{fraction[:6].ljust(6, '0')}"
            return datetime.fromisoformat(iso_text)
        return datetime.strptime(iso_text, "%Y-%m-%dT%H:%M:%S")
    except ValueError:
        pass

    # Try MM/DD/YYYY-HH:MM:SS format (TRTLLM)
    try:
        return datetime.strptime(text, "%m/%d/%Y-%H:%M:%S")
    except ValueError:
        pass

    raise ValueError(f"Unable to parse timestamp: {timestamp}")


def render(filtered_runs: list, logs_dir: str):
"""Render rate match analysis.

Expand Down Expand Up @@ -84,11 +126,11 @@ def render(filtered_runs: list, logs_dir: str):
decode_gpus = run.metadata.decode_nodes * run.metadata.gpus_per_node
st.metric("GPU Split", f"{prefill_gpus} / {decode_gpus}")
with col4:
st.metric("ISL/OSL", f"{run.profiler.isl}/{run.profiler.osl}")
st.metric("ISL/OSL", f"{run.profiler_metadata.isl}/{run.profiler_metadata.osl}")

# Create rate match graph
isl = int(run.profiler.isl) if run.profiler.isl else None
osl = int(run.profiler.osl) if run.profiler.osl else None
isl = int(run.profiler_metadata.isl) if run.profiler_metadata.isl else None
osl = int(run.profiler_metadata.osl) if run.profiler_metadata.osl else None
Comment on lines +129 to +133
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

# First, find the ProfilerMetadata class definition
rg -t py "class ProfilerMetadata" --max-count 5

Repository: ishandhanani/srt-slurm

Length of output: 115


🏁 Script executed:

# Look at the full context of rate_match_tab.py around lines 129-133
fd -t f "rate_match_tab.py" -x wc -l {}

Repository: ishandhanani/srt-slurm

Length of output: 108


🏁 Script executed:

# Check the file size first
wc -l analysis/dashboard/rate_match_tab.py 2>/dev/null || fd -t f "rate_match_tab.py"

Repository: ishandhanani/srt-slurm

Length of output: 106


🏁 Script executed:

# Read ProfilerMetadata class definition
cat -n analysis/srtlog/models.py | grep -A 50 "class ProfilerMetadata"

Repository: ishandhanani/srt-slurm

Length of output: 2092


🏁 Script executed:

# Read the rate_match_tab.py context
sed -n '120,145p' analysis/dashboard/rate_match_tab.py

Repository: ishandhanani/srt-slurm

Length of output: 1421


🏁 Script executed:

# Search for test data or JSON examples showing isl/osl values
fd -t f "\\.json" | head -10

Repository: ishandhanani/srt-slurm

Length of output: 92


🏁 Script executed:

# Look at _create_rate_match_graph to see how isl/osl are used
sed -n '140,180p' analysis/dashboard/rate_match_tab.py

Repository: ishandhanani/srt-slurm

Length of output: 1682


🏁 Script executed:

# Check if there are any tests or data samples
find . -type f \( -name "*.json" -o -name "*test*" \) | grep -i "profiler\|metadata\|isl\|osl" | head -20

Repository: ishandhanani/srt-slurm

Length of output: 48


🏁 Script executed:

# Check the config file to see if there are examples
cat configs/deepep_config.json

Repository: ishandhanani/srt-slurm

Length of output: 559


🏁 Script executed:

# Search for where isl/osl values are set or used
rg "isl|osl" -t py -B 2 -A 2 | head -100

Repository: ishandhanani/srt-slurm

Length of output: 5297


🏁 Script executed:

# Look for any test files that might show expected values
find . -type f -name "*test*.py" | xargs rg -l "isl\|osl" 2>/dev/null

Repository: ishandhanani/srt-slurm

Length of output: 50


🏁 Script executed:

# Check the actual JSON source to see if isl/osl can be non-numeric
rg "\"isl\"|\"osl\"" -t py -B 5 -A 5 | grep -A 10 "get"

Repository: ishandhanani/srt-slurm

Length of output: 1565


🏁 Script executed:

# Look for any real-world JSON samples or where the JSON data comes from
rg "benchmark.*json\|\.json.*benchmark" -t py -B 3 -A 3

Repository: ishandhanani/srt-slurm

Length of output: 48


🏁 Script executed:

# Check if there's any existing error handling for int() conversions in the file
rg "int\(" analysis/dashboard/rate_match_tab.py -B 2 -A 2

Repository: ishandhanani/srt-slurm

Length of output: 440


Guard ISL/OSL int conversion against non-numeric strings.

ProfilerMetadata.isl and ProfilerMetadata.osl are stored as strings without validation. Non-numeric values (such as "N/A") will raise ValueError and crash the dashboard. Wrap conversions in try-except blocks to safely handle invalid inputs.

🛠️ Proposed fix
-            isl = int(run.profiler_metadata.isl) if run.profiler_metadata.isl else None
-            osl = int(run.profiler_metadata.osl) if run.profiler_metadata.osl else None
+            try:
+                isl = int(run.profiler_metadata.isl) if run.profiler_metadata.isl else None
+            except (TypeError, ValueError):
+                isl = None
+            try:
+                osl = int(run.profiler_metadata.osl) if run.profiler_metadata.osl else None
+            except (TypeError, ValueError):
+                osl = None
🤖 Prompt for AI Agents
In `@analysis/dashboard/rate_match_tab.py` around lines 129 - 133, The int
conversions for run.profiler_metadata.isl and .osl can raise ValueError for
non-numeric strings; update the block that assigns isl and osl (currently
creating variables isl and osl used for the rate match graph) to wrap each
int(...) conversion in a try/except (ValueError, TypeError) and fall back to
None on failure, leaving st.metric display unchanged; ensure you reference
run.profiler_metadata.isl and run.profiler_metadata.osl and assign valid
integers or None so downstream graph code handles missing values safely.

rate_fig = _create_rate_match_graph(
prefill_nodes, decode_nodes, run.job_id, show_request_rate=show_request_rate, isl=isl, osl=osl
)
Expand Down Expand Up @@ -139,8 +181,8 @@ def _create_rate_match_graph(prefill_nodes, decode_nodes, job_id="", show_reques
avg_input_tps.append(avg / prefill_divisor)

if timestamps:
first_time = datetime.strptime(timestamps[0], "%Y-%m-%d %H:%M:%S")
elapsed = [(datetime.strptime(ts, "%Y-%m-%d %H:%M:%S") - first_time).total_seconds() for ts in timestamps]
first_time = _parse_timestamp(timestamps[0])
elapsed = [(_parse_timestamp(ts) - first_time).total_seconds() for ts in timestamps]

unit = "req/s" if show_request_rate else "tok/s"
rate_fig.add_trace(
Expand Down Expand Up @@ -175,8 +217,8 @@ def _create_rate_match_graph(prefill_nodes, decode_nodes, job_id="", show_reques
avg_gen_tps.append(avg / decode_divisor)

if timestamps:
first_time = datetime.strptime(timestamps[0], "%Y-%m-%d %H:%M:%S")
elapsed = [(datetime.strptime(ts, "%Y-%m-%d %H:%M:%S") - first_time).total_seconds() for ts in timestamps]
first_time = _parse_timestamp(timestamps[0])
elapsed = [(_parse_timestamp(ts) - first_time).total_seconds() for ts in timestamps]

unit = "req/s" if show_request_rate else "tok/s"
rate_fig.add_trace(
Expand Down
Loading
Loading