|
477 | 477 | ) |
478 | 478 |
|
479 | 479 | # Task Completion Time Percentiles |
480 | | -TASK_COMPLETION_TIME_PANEL = Panel( |
| 480 | +TASK_COMPLETION_TIME_P50_PANEL = Panel( |
481 | 481 | id=38, |
482 | | - title="Task Completion Time Histogram (s)", |
483 | | - description="Time (in seconds) spent (including backpressure) running tasks to completion. Larger bars means more tasks finished within that duration range.", |
| 482 | + title="P50 Task Completion Time", |
| 483 | + description="P50 time (in seconds) spent (including backpressure) running tasks to completion.", |
484 | 484 | targets=[ |
485 | 485 | Target( |
486 | | - expr='sum by (le) (max_over_time(ray_data_task_completion_time_bucket{{{global_filters}, operator=~"$Operator", le!="+Inf"}}[$__range]))', |
487 | | - legend="{{le}} s", |
488 | | - template=TargetTemplate.HISTOGRAM_BAR_CHART, |
| 486 | + expr='histogram_quantile(0.5, sum by (operator, le) (rate(ray_data_task_completion_time_bucket{{{global_filters}, operator=~"$Operator"}}[$__rate_interval])))', |
| 487 | + legend="{{operator}}", |
489 | 488 | ), |
490 | 489 | ], |
491 | | - unit="short", |
| 490 | + unit="s", |
492 | 491 | fill=0, |
493 | 492 | stack=False, |
494 | | - template=PanelTemplate.BAR_CHART, |
495 | 493 | ) |
496 | 494 |
|
497 | | -BLOCK_COMPLETION_TIME_PANEL = Panel( |
| 495 | +TASK_COMPLETION_TIME_P90_PANEL = Panel( |
| 496 | + id=82, |
| 497 | + title="P90 Task Completion Time", |
| 498 | + description="P90 time (in seconds) spent (including backpressure) running tasks to completion.", |
| 499 | + targets=[ |
| 500 | + Target( |
| 501 | + expr='histogram_quantile(0.9, sum by (operator, le) (rate(ray_data_task_completion_time_bucket{{{global_filters}, operator=~"$Operator"}}[$__rate_interval])))', |
| 502 | + legend="{{operator}}", |
| 503 | + ), |
| 504 | + ], |
| 505 | + unit="s", |
| 506 | + fill=0, |
| 507 | + stack=False, |
| 508 | +) |
| 509 | + |
| 510 | +TASK_COMPLETION_TIME_P99_PANEL = Panel( |
| 511 | + id=83, |
| 512 | + title="P99 Task Completion Time", |
| 513 | + description="P99 time (in seconds) spent (including backpressure) running tasks to completion.", |
| 514 | + targets=[ |
| 515 | + Target( |
| 516 | + expr='histogram_quantile(0.99, sum by (operator, le) (rate(ray_data_task_completion_time_bucket{{{global_filters}, operator=~"$Operator"}}[$__rate_interval])))', |
| 517 | + legend="{{operator}}", |
| 518 | + ), |
| 519 | + ], |
| 520 | + unit="s", |
| 521 | + fill=0, |
| 522 | + stack=False, |
| 523 | +) |
| 524 | + |
| 525 | +BLOCK_COMPLETION_TIME_P50_PANEL = Panel( |
| 526 | + id=84, |
| 527 | + title="P50 Block Completion Time", |
| 528 | + description="P50 time (in seconds) spent processing blocks to completion. If multiple blocks are generated per task, this is approximated by assuming each block took an equal amount of time to process.", |
| 529 | + targets=[ |
| 530 | + Target( |
| 531 | + expr='histogram_quantile(0.5, sum by (operator, le) (rate(ray_data_block_completion_time_bucket{{{global_filters}, operator=~"$Operator"}}[$__rate_interval])))', |
| 532 | + legend="{{operator}}", |
| 533 | + ), |
| 534 | + ], |
| 535 | + unit="s", |
| 536 | + fill=0, |
| 537 | + stack=False, |
| 538 | +) |
| 539 | + |
| 540 | +BLOCK_COMPLETION_TIME_P90_PANEL = Panel( |
498 | 541 | id=61, |
499 | | - title="Block Completion Time Histogram (s)", |
500 | | - description="Time (in seconds) spent processing blocks to completion. If multiple blocks are generated per task, this is approximated by assuming each block took an equal amount of time to process. Larger bars means more blocks finished within that duration range.", |
| 542 | + title="P90 Block Completion Time", |
| 543 | + description="P90 time (in seconds) spent processing blocks to completion. If multiple blocks are generated per task, this is approximated by assuming each block took an equal amount of time to process.", |
501 | 544 | targets=[ |
502 | 545 | Target( |
503 | | - expr='sum by (le) (max_over_time(ray_data_block_completion_time_bucket{{{global_filters}, operator=~"$Operator", le!="+Inf"}}[$__range]))', |
504 | | - legend="{{le}} s", |
505 | | - template=TargetTemplate.HISTOGRAM_BAR_CHART, |
| 546 | + expr='histogram_quantile(0.9, sum by (operator, le) (rate(ray_data_block_completion_time_bucket{{{global_filters}, operator=~"$Operator"}}[$__rate_interval])))', |
| 547 | + legend="{{operator}}", |
506 | 548 | ), |
507 | 549 | ], |
508 | | - unit="short", |
| 550 | + unit="s", |
| 551 | + fill=0, |
| 552 | + stack=False, |
| 553 | +) |
| 554 | + |
| 555 | +BLOCK_COMPLETION_TIME_P99_PANEL = Panel( |
| 556 | + id=85, |
| 557 | + title="P99 Block Completion Time", |
| 558 | + description="P99 time (in seconds) spent processing blocks to completion. If multiple blocks are generated per task, this is approximated by assuming each block took an equal amount of time to process.", |
| 559 | + targets=[ |
| 560 | + Target( |
| 561 | + expr='histogram_quantile(0.99, sum by (operator, le) (rate(ray_data_block_completion_time_bucket{{{global_filters}, operator=~"$Operator"}}[$__rate_interval])))', |
| 562 | + legend="{{operator}}", |
| 563 | + ), |
| 564 | + ], |
| 565 | + unit="s", |
509 | 566 | fill=0, |
510 | 567 | stack=False, |
511 | | - template=PanelTemplate.BAR_CHART, |
512 | 568 | ) |
513 | 569 |
|
514 | | -BLOCK_SIZE_BYTES_PANEL = Panel( |
| 570 | +BLOCK_SIZE_BYTES_P50_PANEL = Panel( |
| 571 | + id=86, |
| 572 | + title="P50 Block Size (Bytes)", |
| 573 | + description="P50 size (in bytes) per block.", |
| 574 | + targets=[ |
| 575 | + Target( |
| 576 | + expr='histogram_quantile(0.5, sum by (operator, le) (rate(ray_data_block_size_bytes_bucket{{{global_filters}, operator=~"$Operator"}}[$__rate_interval])))', |
| 577 | + legend="{{operator}}", |
| 578 | + ), |
| 579 | + ], |
| 580 | + unit="bytes", |
| 581 | + fill=0, |
| 582 | + stack=False, |
| 583 | +) |
| 584 | + |
| 585 | +BLOCK_SIZE_BYTES_P90_PANEL = Panel( |
515 | 586 | id=62, |
516 | | - title="Block Size (Bytes) Histogram", |
517 | | - description="Size (in bytes) per block. Larger bars means more blocks are within that size range.", |
| 587 | + title="P90 Block Size (Bytes)", |
| 588 | + description="P90 size (in bytes) per block.", |
518 | 589 | targets=[ |
519 | 590 | Target( |
520 | | - expr='sum by (le) (max_over_time(ray_data_block_size_bytes_bucket{{{global_filters}, operator=~"$Operator", le!="+Inf"}}[$__range]))', |
521 | | - legend="{{le}} bytes", |
522 | | - template=TargetTemplate.HISTOGRAM_BAR_CHART, |
| 591 | + expr='histogram_quantile(0.9, sum by (operator, le) (rate(ray_data_block_size_bytes_bucket{{{global_filters}, operator=~"$Operator"}}[$__rate_interval])))', |
| 592 | + legend="{{operator}}", |
523 | 593 | ), |
524 | 594 | ], |
525 | | - unit="short", |
| 595 | + unit="bytes", |
| 596 | + fill=0, |
| 597 | + stack=False, |
| 598 | +) |
| 599 | + |
| 600 | +BLOCK_SIZE_BYTES_P99_PANEL = Panel( |
| 601 | + id=87, |
| 602 | + title="P99 Block Size (Bytes)", |
| 603 | + description="P99 size (in bytes) per block.", |
| 604 | + targets=[ |
| 605 | + Target( |
| 606 | + expr='histogram_quantile(0.99, sum by (operator, le) (rate(ray_data_block_size_bytes_bucket{{{global_filters}, operator=~"$Operator"}}[$__rate_interval])))', |
| 607 | + legend="{{operator}}", |
| 608 | + ), |
| 609 | + ], |
| 610 | + unit="bytes", |
526 | 611 | fill=0, |
527 | 612 | stack=False, |
528 | | - template=PanelTemplate.BAR_CHART, |
529 | 613 | ) |
530 | 614 |
|
531 | | -BLOCK_SIZE_ROWS_PANEL = Panel( |
| 615 | +BLOCK_SIZE_ROWS_P50_PANEL = Panel( |
| 616 | + id=88, |
| 617 | + title="P50 Block Size (Rows)", |
| 618 | + description="P50 number of rows per block.", |
| 619 | + targets=[ |
| 620 | + Target( |
| 621 | + expr='histogram_quantile(0.5, sum by (operator, le) (rate(ray_data_block_size_rows_bucket{{{global_filters}, operator=~"$Operator"}}[$__rate_interval])))', |
| 622 | + legend="{{operator}}", |
| 623 | + ), |
| 624 | + ], |
| 625 | + unit="rows", |
| 626 | + fill=0, |
| 627 | + stack=False, |
| 628 | +) |
| 629 | + |
| 630 | +BLOCK_SIZE_ROWS_P90_PANEL = Panel( |
532 | 631 | id=63, |
533 | | - title="Block Size (Rows) Histogram", |
534 | | - description="Number of rows per block. Larger bars means more blocks are within that number of rows range.", |
| 632 | + title="P90 Block Size (Rows)", |
| 633 | + description="P90 number of rows per block.", |
535 | 634 | targets=[ |
536 | 635 | Target( |
537 | | - expr='sum by (le) (max_over_time(ray_data_block_size_rows_bucket{{{global_filters}, operator=~"$Operator", le!="+Inf"}}[$__range]))', |
538 | | - legend="{{le}} rows", |
539 | | - template=TargetTemplate.HISTOGRAM_BAR_CHART, |
| 636 | + expr='histogram_quantile(0.9, sum by (operator, le) (rate(ray_data_block_size_rows_bucket{{{global_filters}, operator=~"$Operator"}}[$__rate_interval])))', |
| 637 | + legend="{{operator}}", |
540 | 638 | ), |
541 | 639 | ], |
542 | | - unit="short", |
| 640 | + unit="rows", |
| 641 | + fill=0, |
| 642 | + stack=False, |
| 643 | +) |
| 644 | + |
| 645 | +BLOCK_SIZE_ROWS_P99_PANEL = Panel( |
| 646 | + id=89, |
| 647 | + title="P99 Block Size (Rows)", |
| 648 | + description="P99 number of rows per block.", |
| 649 | + targets=[ |
| 650 | + Target( |
| 651 | + expr='histogram_quantile(0.99, sum by (operator, le) (rate(ray_data_block_size_rows_bucket{{{global_filters}, operator=~"$Operator"}}[$__rate_interval])))', |
| 652 | + legend="{{operator}}", |
| 653 | + ), |
| 654 | + ], |
| 655 | + unit="rows", |
543 | 656 | fill=0, |
544 | 657 | stack=False, |
545 | | - template=PanelTemplate.BAR_CHART, |
546 | 658 | ) |
547 | 659 |
|
548 | 660 | TASK_OUTPUT_BACKPRESSURE_TIME_PANEL = Panel( |
|
1051 | 1163 | stack=False, |
1052 | 1164 | ) |
1053 | 1165 |
|
| 1166 | +OPERATOR_TASK_COMPLETION_TIME_PANEL = Panel( |
| 1167 | + id=78, |
| 1168 | + title="Task Completion Time Histogram (s)", |
| 1169 | + description="Time (in seconds) spent (including backpressure) running tasks to completion. Larger bars means more tasks finished within that duration range.", |
| 1170 | + targets=[ |
| 1171 | + Target( |
| 1172 | + expr='sum by (le) (max_over_time(ray_data_task_completion_time_bucket{{{global_filters}, operator=~"$Operator", le!="+Inf"}}[$__range]))', |
| 1173 | + legend="{{le}} s", |
| 1174 | + template=TargetTemplate.HISTOGRAM_BAR_CHART, |
| 1175 | + ), |
| 1176 | + ], |
| 1177 | + unit="short", |
| 1178 | + fill=0, |
| 1179 | + stack=False, |
| 1180 | + template=PanelTemplate.BAR_CHART, |
| 1181 | +) |
| 1182 | + |
| 1183 | +OPERATOR_BLOCK_COMPLETION_TIME_PANEL = Panel( |
| 1184 | + id=79, |
| 1185 | + title="Block Completion Time Histogram (s)", |
| 1186 | + description="Time (in seconds) spent processing blocks to completion. If multiple blocks are generated per task, this is approximated by assuming each block took an equal amount of time to process. Larger bars means more blocks finished within that duration range.", |
| 1187 | + targets=[ |
| 1188 | + Target( |
| 1189 | + expr='sum by (le) (max_over_time(ray_data_block_completion_time_bucket{{{global_filters}, operator=~"$Operator", le!="+Inf"}}[$__range]))', |
| 1190 | + legend="{{le}} s", |
| 1191 | + template=TargetTemplate.HISTOGRAM_BAR_CHART, |
| 1192 | + ), |
| 1193 | + ], |
| 1194 | + unit="short", |
| 1195 | + fill=0, |
| 1196 | + stack=False, |
| 1197 | + template=PanelTemplate.BAR_CHART, |
| 1198 | +) |
| 1199 | + |
| 1200 | +OPERATOR_BLOCK_SIZE_BYTES_PANEL = Panel( |
| 1201 | + id=80, |
| 1202 | + title="Block Size (Bytes) Histogram", |
| 1203 | + description="Size (in bytes) per block. Larger bars means more blocks are within that size range.", |
| 1204 | + targets=[ |
| 1205 | + Target( |
| 1206 | + expr='sum by (le) (max_over_time(ray_data_block_size_bytes_bucket{{{global_filters}, operator=~"$Operator", le!="+Inf"}}[$__range]))', |
| 1207 | + legend="{{le}} bytes", |
| 1208 | + template=TargetTemplate.HISTOGRAM_BAR_CHART, |
| 1209 | + ), |
| 1210 | + ], |
| 1211 | + unit="short", |
| 1212 | + fill=0, |
| 1213 | + stack=False, |
| 1214 | + template=PanelTemplate.BAR_CHART, |
| 1215 | + # We hide the X axis because the values are too large to fit and they are not useful. |
| 1216 | + # We also cannot format it to higher units so it has too many digits. |
| 1217 | + hideXAxis=True, |
| 1218 | +) |
| 1219 | + |
| 1220 | +OPERATOR_BLOCK_SIZE_ROWS_PANEL = Panel( |
| 1221 | + id=81, |
| 1222 | + title="Block Size (Rows) Histogram", |
| 1223 | + description="Number of rows per block. Larger bars means more blocks are within that number of rows range.", |
| 1224 | + targets=[ |
| 1225 | + Target( |
| 1226 | + expr='sum by (le) (max_over_time(ray_data_block_size_rows_bucket{{{global_filters}, operator=~"$Operator", le!="+Inf"}}[$__range]))', |
| 1227 | + legend="{{le}} rows", |
| 1228 | + template=TargetTemplate.HISTOGRAM_BAR_CHART, |
| 1229 | + ), |
| 1230 | + ], |
| 1231 | + unit="short", |
| 1232 | + fill=0, |
| 1233 | + stack=False, |
| 1234 | + template=PanelTemplate.BAR_CHART, |
| 1235 | + # We hide the X axis because the values are too large to fit and they are not useful. |
| 1236 | + # We also cannot format it to higher units so it has too many digits. |
| 1237 | + hideXAxis=True, |
| 1238 | +) |
| 1239 | + |
1054 | 1240 | OPERATOR_PANELS = [ |
1055 | 1241 | ROWS_OUTPUT_PER_SECOND_PANEL, |
1056 | 1242 | ALL_RESOURCES_UTILIZATION_PANEL, |
1057 | 1243 | COMBINED_INQUEUE_BLOCKS_PANEL, |
| 1244 | + OPERATOR_TASK_COMPLETION_TIME_PANEL, |
| 1245 | + OPERATOR_BLOCK_COMPLETION_TIME_PANEL, |
| 1246 | + OPERATOR_BLOCK_SIZE_BYTES_PANEL, |
| 1247 | + OPERATOR_BLOCK_SIZE_ROWS_PANEL, |
1058 | 1248 | ] |
1059 | 1249 |
|
1060 | 1250 | DATA_GRAFANA_ROWS = [ |
|
1117 | 1307 | title="Outputs", |
1118 | 1308 | id=103, |
1119 | 1309 | panels=[ |
1120 | | - BLOCK_SIZE_BYTES_PANEL, |
1121 | | - BLOCK_SIZE_ROWS_PANEL, |
| 1310 | + BLOCK_SIZE_BYTES_P50_PANEL, |
| 1311 | + BLOCK_SIZE_BYTES_P90_PANEL, |
| 1312 | + BLOCK_SIZE_BYTES_P99_PANEL, |
| 1313 | + BLOCK_SIZE_ROWS_P50_PANEL, |
| 1314 | + BLOCK_SIZE_ROWS_P90_PANEL, |
| 1315 | + BLOCK_SIZE_ROWS_P99_PANEL, |
1122 | 1316 | OUTPUT_BLOCKS_TAKEN_PANEL, |
1123 | 1317 | OUTPUT_BYTES_TAKEN_PANEL, |
1124 | 1318 | OUTPUT_BYTES_BY_NODE_PANEL, |
|
1136 | 1330 | title="Tasks", |
1137 | 1331 | id=104, |
1138 | 1332 | panels=[ |
1139 | | - TASK_COMPLETION_TIME_PANEL, |
1140 | | - BLOCK_COMPLETION_TIME_PANEL, |
| 1333 | + TASK_COMPLETION_TIME_P50_PANEL, |
| 1334 | + TASK_COMPLETION_TIME_P90_PANEL, |
| 1335 | + TASK_COMPLETION_TIME_P99_PANEL, |
| 1336 | + BLOCK_COMPLETION_TIME_P50_PANEL, |
| 1337 | + BLOCK_COMPLETION_TIME_P90_PANEL, |
| 1338 | + BLOCK_COMPLETION_TIME_P99_PANEL, |
1141 | 1339 | TASK_COMPLETION_TIME_WITHOUT_BACKPRESSURE_PANEL, |
1142 | 1340 | TASK_OUTPUT_BACKPRESSURE_TIME_PANEL, |
1143 | 1341 | TASK_SUBMISSION_BACKPRESSURE_PANEL, |
|
1199 | 1397 | Row( |
1200 | 1398 | title="Operator Panels", |
1201 | 1399 | id=108, |
1202 | | - panels=[ALL_RESOURCES_UTILIZATION_PANEL], |
| 1400 | + panels=[ |
| 1401 | + ALL_RESOURCES_UTILIZATION_PANEL, |
| 1402 | + OPERATOR_TASK_COMPLETION_TIME_PANEL, |
| 1403 | + OPERATOR_BLOCK_COMPLETION_TIME_PANEL, |
| 1404 | + OPERATOR_BLOCK_SIZE_BYTES_PANEL, |
| 1405 | + OPERATOR_BLOCK_SIZE_ROWS_PANEL, |
| 1406 | + ], |
1203 | 1407 | collapsed=True, |
1204 | 1408 | ), |
1205 | 1409 | ] |
|
0 commit comments