Skip to content

Commit 9ffdd76

Browse files
authored
Update Ray Data histograms to show percentiles in data dashboard (#58650)
## Description

By default, the Data Grafana dashboard should work well when viewing across all operators, so the histogram panels are replaced with percentile (P50/P90/P99) graphs that are grouped by operator. The histogram bar charts are moved into the "Operator Panels" row instead, which is useful in places where we filter by a single operator by default (like in the data dashboard).

<img width="1163" height="379" alt="Screenshot 2025-11-14 at 2 33 16 PM" src="https://github.com/user-attachments/assets/fd4a3d0c-4a60-4bb9-a803-859b3ed14a59" />
<img width="1158" height="430" alt="Screenshot 2025-11-14 at 2 33 08 PM" src="https://github.com/user-attachments/assets/a78b4dde-8066-478b-86b4-b838761431f2" />

---------

Signed-off-by: Alan Guo <[email protected]>
1 parent 219fb67 commit 9ffdd76

File tree

3 files changed

+245
-37
lines changed

3 files changed

+245
-37
lines changed

python/ray/dashboard/modules/metrics/dashboards/common.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -500,6 +500,7 @@ class Panel:
500500
linewidth: int = 1
501501
grid_pos: Optional[GridPos] = None
502502
template: Optional[PanelTemplate] = PanelTemplate.GRAPH
503+
hideXAxis: bool = False
503504

504505

505506
@DeveloperAPI

python/ray/dashboard/modules/metrics/dashboards/data_dashboard_panels.py

Lines changed: 241 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -477,72 +477,184 @@
477477
)
478478

479479
# Task Completion Time Percentiles
480-
TASK_COMPLETION_TIME_PANEL = Panel(
480+
TASK_COMPLETION_TIME_P50_PANEL = Panel(
481481
id=38,
482-
title="Task Completion Time Histogram (s)",
483-
description="Time (in seconds) spent (including backpressure) running tasks to completion. Larger bars means more tasks finished within that duration range.",
482+
title="P50 Task Completion Time",
483+
description="P50 time (in seconds) spent (including backpressure) running tasks to completion.",
484484
targets=[
485485
Target(
486-
expr='sum by (le) (max_over_time(ray_data_task_completion_time_bucket{{{global_filters}, operator=~"$Operator", le!="+Inf"}}[$__range]))',
487-
legend="{{le}} s",
488-
template=TargetTemplate.HISTOGRAM_BAR_CHART,
486+
expr='histogram_quantile(0.5, sum by (operator, le) (rate(ray_data_task_completion_time_bucket{{{global_filters}, operator=~"$Operator"}}[$__rate_interval])))',
487+
legend="{{operator}}",
489488
),
490489
],
491-
unit="short",
490+
unit="s",
492491
fill=0,
493492
stack=False,
494-
template=PanelTemplate.BAR_CHART,
495493
)
496494

497-
BLOCK_COMPLETION_TIME_PANEL = Panel(
495+
TASK_COMPLETION_TIME_P90_PANEL = Panel(
496+
id=82,
497+
title="P90 Task Completion Time",
498+
description="P90 time (in seconds) spent (including backpressure) running tasks to completion.",
499+
targets=[
500+
Target(
501+
expr='histogram_quantile(0.9, sum by (operator, le) (rate(ray_data_task_completion_time_bucket{{{global_filters}, operator=~"$Operator"}}[$__rate_interval])))',
502+
legend="{{operator}}",
503+
),
504+
],
505+
unit="s",
506+
fill=0,
507+
stack=False,
508+
)
509+
510+
TASK_COMPLETION_TIME_P99_PANEL = Panel(
511+
id=83,
512+
title="P99 Task Completion Time",
513+
description="P99 time (in seconds) spent (including backpressure) running tasks to completion.",
514+
targets=[
515+
Target(
516+
expr='histogram_quantile(0.99, sum by (operator, le) (rate(ray_data_task_completion_time_bucket{{{global_filters}, operator=~"$Operator"}}[$__rate_interval])))',
517+
legend="{{operator}}",
518+
),
519+
],
520+
unit="s",
521+
fill=0,
522+
stack=False,
523+
)
524+
525+
BLOCK_COMPLETION_TIME_P50_PANEL = Panel(
526+
id=84,
527+
title="P50 Block Completion Time",
528+
description="P50 time (in seconds) spent processing blocks to completion. If multiple blocks are generated per task, this is approximated by assuming each block took an equal amount of time to process.",
529+
targets=[
530+
Target(
531+
expr='histogram_quantile(0.5, sum by (operator, le) (rate(ray_data_block_completion_time_bucket{{{global_filters}, operator=~"$Operator"}}[$__rate_interval])))',
532+
legend="{{operator}}",
533+
),
534+
],
535+
unit="s",
536+
fill=0,
537+
stack=False,
538+
)
539+
540+
BLOCK_COMPLETION_TIME_P90_PANEL = Panel(
498541
id=61,
499-
title="Block Completion Time Histogram (s)",
500-
description="Time (in seconds) spent processing blocks to completion. If multiple blocks are generated per task, this is approximated by assuming each block took an equal amount of time to process. Larger bars means more blocks finished within that duration range.",
542+
title="P90 Block Completion Time",
543+
description="P90 time (in seconds) spent processing blocks to completion. If multiple blocks are generated per task, this is approximated by assuming each block took an equal amount of time to process.",
501544
targets=[
502545
Target(
503-
expr='sum by (le) (max_over_time(ray_data_block_completion_time_bucket{{{global_filters}, operator=~"$Operator", le!="+Inf"}}[$__range]))',
504-
legend="{{le}} s",
505-
template=TargetTemplate.HISTOGRAM_BAR_CHART,
546+
expr='histogram_quantile(0.9, sum by (operator, le) (rate(ray_data_block_completion_time_bucket{{{global_filters}, operator=~"$Operator"}}[$__rate_interval])))',
547+
legend="{{operator}}",
506548
),
507549
],
508-
unit="short",
550+
unit="s",
551+
fill=0,
552+
stack=False,
553+
)
554+
555+
BLOCK_COMPLETION_TIME_P99_PANEL = Panel(
556+
id=85,
557+
title="P99 Block Completion Time",
558+
description="P99 time (in seconds) spent processing blocks to completion. If multiple blocks are generated per task, this is approximated by assuming each block took an equal amount of time to process.",
559+
targets=[
560+
Target(
561+
expr='histogram_quantile(0.99, sum by (operator, le) (rate(ray_data_block_completion_time_bucket{{{global_filters}, operator=~"$Operator"}}[$__rate_interval])))',
562+
legend="{{operator}}",
563+
),
564+
],
565+
unit="s",
509566
fill=0,
510567
stack=False,
511-
template=PanelTemplate.BAR_CHART,
512568
)
513569

514-
BLOCK_SIZE_BYTES_PANEL = Panel(
570+
BLOCK_SIZE_BYTES_P50_PANEL = Panel(
571+
id=86,
572+
title="P50 Block Size (Bytes)",
573+
description="P50 size (in bytes) per block.",
574+
targets=[
575+
Target(
576+
expr='histogram_quantile(0.5, sum by (operator, le) (rate(ray_data_block_size_bytes_bucket{{{global_filters}, operator=~"$Operator"}}[$__rate_interval])))',
577+
legend="{{operator}}",
578+
),
579+
],
580+
unit="bytes",
581+
fill=0,
582+
stack=False,
583+
)
584+
585+
BLOCK_SIZE_BYTES_P90_PANEL = Panel(
515586
id=62,
516-
title="Block Size (Bytes) Histogram",
517-
description="Size (in bytes) per block. Larger bars means more blocks are within that size range.",
587+
title="P90 Block Size (Bytes)",
588+
description="P90 size (in bytes) per block.",
518589
targets=[
519590
Target(
520-
expr='sum by (le) (max_over_time(ray_data_block_size_bytes_bucket{{{global_filters}, operator=~"$Operator", le!="+Inf"}}[$__range]))',
521-
legend="{{le}} bytes",
522-
template=TargetTemplate.HISTOGRAM_BAR_CHART,
591+
expr='histogram_quantile(0.9, sum by (operator, le) (rate(ray_data_block_size_bytes_bucket{{{global_filters}, operator=~"$Operator"}}[$__rate_interval])))',
592+
legend="{{operator}}",
523593
),
524594
],
525-
unit="short",
595+
unit="bytes",
596+
fill=0,
597+
stack=False,
598+
)
599+
600+
BLOCK_SIZE_BYTES_P99_PANEL = Panel(
601+
id=87,
602+
title="P99 Block Size (Bytes)",
603+
description="P99 size (in bytes) per block.",
604+
targets=[
605+
Target(
606+
expr='histogram_quantile(0.99, sum by (operator, le) (rate(ray_data_block_size_bytes_bucket{{{global_filters}, operator=~"$Operator"}}[$__rate_interval])))',
607+
legend="{{operator}}",
608+
),
609+
],
610+
unit="bytes",
526611
fill=0,
527612
stack=False,
528-
template=PanelTemplate.BAR_CHART,
529613
)
530614

531-
BLOCK_SIZE_ROWS_PANEL = Panel(
615+
BLOCK_SIZE_ROWS_P50_PANEL = Panel(
616+
id=88,
617+
title="P50 Block Size (Rows)",
618+
description="P50 number of rows per block.",
619+
targets=[
620+
Target(
621+
expr='histogram_quantile(0.5, sum by (operator, le) (rate(ray_data_block_size_rows_bucket{{{global_filters}, operator=~"$Operator"}}[$__rate_interval])))',
622+
legend="{{operator}}",
623+
),
624+
],
625+
unit="rows",
626+
fill=0,
627+
stack=False,
628+
)
629+
630+
BLOCK_SIZE_ROWS_P90_PANEL = Panel(
532631
id=63,
533-
title="Block Size (Rows) Histogram",
534-
description="Number of rows per block. Larger bars means more blocks are within that number of rows range.",
632+
title="P90 Block Size (Rows)",
633+
description="P90 number of rows per block.",
535634
targets=[
536635
Target(
537-
expr='sum by (le) (max_over_time(ray_data_block_size_rows_bucket{{{global_filters}, operator=~"$Operator", le!="+Inf"}}[$__range]))',
538-
legend="{{le}} rows",
539-
template=TargetTemplate.HISTOGRAM_BAR_CHART,
636+
expr='histogram_quantile(0.9, sum by (operator, le) (rate(ray_data_block_size_rows_bucket{{{global_filters}, operator=~"$Operator"}}[$__rate_interval])))',
637+
legend="{{operator}}",
540638
),
541639
],
542-
unit="short",
640+
unit="rows",
641+
fill=0,
642+
stack=False,
643+
)
644+
645+
BLOCK_SIZE_ROWS_P99_PANEL = Panel(
646+
id=89,
647+
title="P99 Block Size (Rows)",
648+
description="P99 number of rows per block.",
649+
targets=[
650+
Target(
651+
expr='histogram_quantile(0.99, sum by (operator, le) (rate(ray_data_block_size_rows_bucket{{{global_filters}, operator=~"$Operator"}}[$__rate_interval])))',
652+
legend="{{operator}}",
653+
),
654+
],
655+
unit="rows",
543656
fill=0,
544657
stack=False,
545-
template=PanelTemplate.BAR_CHART,
546658
)
547659

548660
TASK_OUTPUT_BACKPRESSURE_TIME_PANEL = Panel(
@@ -1051,10 +1163,88 @@
10511163
stack=False,
10521164
)
10531165

1166+
OPERATOR_TASK_COMPLETION_TIME_PANEL = Panel(
1167+
id=78,
1168+
title="Task Completion Time Histogram (s)",
1169+
description="Time (in seconds) spent (including backpressure) running tasks to completion. Larger bars means more tasks finished within that duration range.",
1170+
targets=[
1171+
Target(
1172+
expr='sum by (le) (max_over_time(ray_data_task_completion_time_bucket{{{global_filters}, operator=~"$Operator", le!="+Inf"}}[$__range]))',
1173+
legend="{{le}} s",
1174+
template=TargetTemplate.HISTOGRAM_BAR_CHART,
1175+
),
1176+
],
1177+
unit="short",
1178+
fill=0,
1179+
stack=False,
1180+
template=PanelTemplate.BAR_CHART,
1181+
)
1182+
1183+
OPERATOR_BLOCK_COMPLETION_TIME_PANEL = Panel(
1184+
id=79,
1185+
title="Block Completion Time Histogram (s)",
1186+
description="Time (in seconds) spent processing blocks to completion. If multiple blocks are generated per task, this is approximated by assuming each block took an equal amount of time to process. Larger bars means more blocks finished within that duration range.",
1187+
targets=[
1188+
Target(
1189+
expr='sum by (le) (max_over_time(ray_data_block_completion_time_bucket{{{global_filters}, operator=~"$Operator", le!="+Inf"}}[$__range]))',
1190+
legend="{{le}} s",
1191+
template=TargetTemplate.HISTOGRAM_BAR_CHART,
1192+
),
1193+
],
1194+
unit="short",
1195+
fill=0,
1196+
stack=False,
1197+
template=PanelTemplate.BAR_CHART,
1198+
)
1199+
1200+
OPERATOR_BLOCK_SIZE_BYTES_PANEL = Panel(
1201+
id=80,
1202+
title="Block Size (Bytes) Histogram",
1203+
description="Size (in bytes) per block. Larger bars means more blocks are within that size range.",
1204+
targets=[
1205+
Target(
1206+
expr='sum by (le) (max_over_time(ray_data_block_size_bytes_bucket{{{global_filters}, operator=~"$Operator", le!="+Inf"}}[$__range]))',
1207+
legend="{{le}} bytes",
1208+
template=TargetTemplate.HISTOGRAM_BAR_CHART,
1209+
),
1210+
],
1211+
unit="short",
1212+
fill=0,
1213+
stack=False,
1214+
template=PanelTemplate.BAR_CHART,
1215+
# We hide the X axis because the values are too large to fit and they are not useful.
1216+
# We also cannot format it to higher units so it has too many digits.
1217+
hideXAxis=True,
1218+
)
1219+
1220+
OPERATOR_BLOCK_SIZE_ROWS_PANEL = Panel(
1221+
id=81,
1222+
title="Block Size (Rows) Histogram",
1223+
description="Number of rows per block. Larger bars means more blocks are within that number of rows range.",
1224+
targets=[
1225+
Target(
1226+
expr='sum by (le) (max_over_time(ray_data_block_size_rows_bucket{{{global_filters}, operator=~"$Operator", le!="+Inf"}}[$__range]))',
1227+
legend="{{le}} rows",
1228+
template=TargetTemplate.HISTOGRAM_BAR_CHART,
1229+
),
1230+
],
1231+
unit="short",
1232+
fill=0,
1233+
stack=False,
1234+
template=PanelTemplate.BAR_CHART,
1235+
# We hide the X axis because the values are too large to fit and they are not useful.
1236+
# We also cannot format it to higher units so it has too many digits.
1237+
hideXAxis=True,
1238+
)
1239+
10541240
OPERATOR_PANELS = [
10551241
ROWS_OUTPUT_PER_SECOND_PANEL,
10561242
ALL_RESOURCES_UTILIZATION_PANEL,
10571243
COMBINED_INQUEUE_BLOCKS_PANEL,
1244+
OPERATOR_TASK_COMPLETION_TIME_PANEL,
1245+
OPERATOR_BLOCK_COMPLETION_TIME_PANEL,
1246+
OPERATOR_BLOCK_SIZE_BYTES_PANEL,
1247+
OPERATOR_BLOCK_SIZE_ROWS_PANEL,
10581248
]
10591249

10601250
DATA_GRAFANA_ROWS = [
@@ -1117,8 +1307,12 @@
11171307
title="Outputs",
11181308
id=103,
11191309
panels=[
1120-
BLOCK_SIZE_BYTES_PANEL,
1121-
BLOCK_SIZE_ROWS_PANEL,
1310+
BLOCK_SIZE_BYTES_P50_PANEL,
1311+
BLOCK_SIZE_BYTES_P90_PANEL,
1312+
BLOCK_SIZE_BYTES_P99_PANEL,
1313+
BLOCK_SIZE_ROWS_P50_PANEL,
1314+
BLOCK_SIZE_ROWS_P90_PANEL,
1315+
BLOCK_SIZE_ROWS_P99_PANEL,
11221316
OUTPUT_BLOCKS_TAKEN_PANEL,
11231317
OUTPUT_BYTES_TAKEN_PANEL,
11241318
OUTPUT_BYTES_BY_NODE_PANEL,
@@ -1136,8 +1330,12 @@
11361330
title="Tasks",
11371331
id=104,
11381332
panels=[
1139-
TASK_COMPLETION_TIME_PANEL,
1140-
BLOCK_COMPLETION_TIME_PANEL,
1333+
TASK_COMPLETION_TIME_P50_PANEL,
1334+
TASK_COMPLETION_TIME_P90_PANEL,
1335+
TASK_COMPLETION_TIME_P99_PANEL,
1336+
BLOCK_COMPLETION_TIME_P50_PANEL,
1337+
BLOCK_COMPLETION_TIME_P90_PANEL,
1338+
BLOCK_COMPLETION_TIME_P99_PANEL,
11411339
TASK_COMPLETION_TIME_WITHOUT_BACKPRESSURE_PANEL,
11421340
TASK_OUTPUT_BACKPRESSURE_TIME_PANEL,
11431341
TASK_SUBMISSION_BACKPRESSURE_PANEL,
@@ -1199,7 +1397,13 @@
11991397
Row(
12001398
title="Operator Panels",
12011399
id=108,
1202-
panels=[ALL_RESOURCES_UTILIZATION_PANEL],
1400+
panels=[
1401+
ALL_RESOURCES_UTILIZATION_PANEL,
1402+
OPERATOR_TASK_COMPLETION_TIME_PANEL,
1403+
OPERATOR_BLOCK_COMPLETION_TIME_PANEL,
1404+
OPERATOR_BLOCK_SIZE_BYTES_PANEL,
1405+
OPERATOR_BLOCK_SIZE_ROWS_PANEL,
1406+
],
12031407
collapsed=True,
12041408
),
12051409
]

python/ray/dashboard/modules/metrics/grafana_dashboard_factory.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,9 @@ def _generate_panel_template(
229229
template["stack"] = panel.stack
230230
template["linewidth"] = panel.linewidth
231231

232+
if panel.hideXAxis:
233+
template.setdefault("xaxis", {})["show"] = False
234+
232235
# Handle stacking visualization
233236
if panel.stack is True:
234237
template["nullPointMode"] = "connected"

0 commit comments

Comments
 (0)