-
Notifications
You must be signed in to change notification settings - Fork 7k
[Data] Make test_dataset_throughput deterministic and refactor throughput stats
#58693
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from 1 commit
Commits
Show all changes
2 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1202,42 +1202,33 @@ def to_string( | |
| out += "\nDataset memory:\n" | ||
| out += "* Spilled to disk: {}MB\n".format(dataset_mb_spilled) | ||
|
|
||
| # For throughput, we compute both an observed Ray Data dataset throughput | ||
| # and an estimated single node dataset throughput. | ||
|
|
||
| # The observed dataset throughput is computed by dividing the total number | ||
| # of rows produced by the total wall time of the dataset (i.e. from start to | ||
| # finish how long did the dataset take to be processed). With the recursive | ||
| # nature of the DatasetStatsSummary, we use get_total_wall_time to determine | ||
| # the total wall time (this finds the difference between the earliest start | ||
| # and latest end for any block in any operator). | ||
|
|
||
| # The estimated single node dataset throughput is computed by dividing the | ||
| # total number of rows produced the sum of the wall times across all blocks | ||
| # of all operators. This assumes that on a single node the work done would | ||
| # be equivalent, with no concurrency. | ||
| output_num_rows = self.operators_stats[-1].output_num_rows | ||
| total_num_out_rows = output_num_rows["sum"] if output_num_rows else 0 | ||
| wall_time = self.get_total_wall_time() | ||
| total_time_all_blocks = self.get_total_time_all_blocks() | ||
| if total_num_out_rows and wall_time and total_time_all_blocks: | ||
| if self.num_rows_per_s: | ||
| out += "\n" | ||
| out += "Dataset throughput:\n" | ||
| out += ( | ||
| "\t* Ray Data throughput:" | ||
| f" {total_num_out_rows / wall_time} " | ||
| "rows/s\n" | ||
| ) | ||
| out += ( | ||
| "\t* Estimated single node throughput:" | ||
| f" {total_num_out_rows / total_time_all_blocks} " | ||
| "rows/s\n" | ||
| ) | ||
| out += "\t* Ray Data throughput:" f" {self.num_rows_per_s} " "rows/s\n" | ||
| if verbose_stats_logs and add_global_stats: | ||
| out += "\n" + self.runtime_metrics() | ||
|
|
||
| return out | ||
|
|
||
| @property | ||
dancingactor marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| def num_rows_per_s(self) -> float: | ||
| """Calculates the throughput in rows per second for the entire dataset.""" | ||
| # The observed dataset throughput is computed by dividing the total number | ||
| # of rows produced by the total wall time of the dataset (i.e. from start to | ||
| # finish how long did the dataset take to be processed). With the recursive | ||
| # nature of the DatasetStatsSummary, we use get_total_wall_time to determine | ||
| # the total wall time (this finds the difference between the earliest start | ||
| # and latest end for any block in any operator). | ||
| output_num_rows = ( | ||
| self.operators_stats[-1].output_num_rows if self.operators_stats else 0 | ||
| ) | ||
| total_num_out_rows = output_num_rows["sum"] if output_num_rows else 0 | ||
| wall_time = self.get_total_wall_time() | ||
| if not total_num_out_rows or not wall_time: | ||
| return 0.0 | ||
| return total_num_out_rows / wall_time | ||
|
|
||
| @staticmethod | ||
| def _collect_dataset_stats_summaries( | ||
| curr: "DatasetStatsSummary", | ||
|
|
@@ -1386,6 +1377,26 @@ class OperatorStatsSummary: | |
| node_count: Optional[Dict[str, float]] = None | ||
| task_rows: Optional[Dict[str, float]] = None | ||
|
|
||
| @property | ||
| def num_rows_per_s(self) -> float: | ||
| # The observed Ray Data operator throughput is computed by dividing the | ||
| # total number of rows produced by the wall time of the operator, | ||
| # time_total_s. | ||
| if not self.output_num_rows or not self.time_total_s: | ||
| return 0.0 | ||
| return self.output_num_rows["sum"] / self.time_total_s | ||
|
|
||
| @property | ||
| def num_rows_per_task_s(self) -> float: | ||
| """Calculates the estimated single-task throughput in rows per second.""" | ||
| # The estimated single task operator throughput is computed by dividing the | ||
| # total number of rows produced by the sum of the wall times across all | ||
| # blocks of the operator. This assumes that on a single task the work done | ||
| # would be equivalent, with no concurrency. | ||
| if not self.output_num_rows or not self.wall_time or not self.wall_time["sum"]: | ||
| return 0.0 | ||
| return self.output_num_rows["sum"] / self.wall_time["sum"] | ||
cursor[bot] marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
cursor[bot] marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| @classmethod | ||
| def from_block_metadata( | ||
| cls, | ||
|
|
@@ -1630,18 +1641,7 @@ def __str__(self) -> str: | |
| node_count_stats["mean"], | ||
| node_count_stats["count"], | ||
| ) | ||
| if output_num_rows_stats and self.time_total_s and wall_time_stats: | ||
| # For throughput, we compute both an observed Ray Data operator throughput | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. These comments were moved to
|
||
| # and an estimated single node operator throughput. | ||
|
|
||
| # The observed Ray Data operator throughput is computed by dividing the | ||
| # total number of rows produced by the wall time of the operator, | ||
| # time_total_s. | ||
|
|
||
| # The estimated single node operator throughput is computed by dividing the | ||
| # total number of rows produced by the sum of the wall times across all | ||
| # blocks of the operator. This assumes that on a single node the work done | ||
| # would be equivalent, with no concurrency. | ||
| if self.num_rows_per_s and self.num_rows_per_task_s: | ||
| total_num_in_rows = ( | ||
| self.total_input_num_rows if self.total_input_num_rows else 0 | ||
| ) | ||
|
|
@@ -1656,12 +1656,12 @@ def __str__(self) -> str: | |
| ) | ||
| out += ( | ||
| indent + "\t* Ray Data throughput:" | ||
| f" {total_num_out_rows / self.time_total_s} " | ||
| f" {self.num_rows_per_s} " | ||
| "rows/s\n" | ||
| ) | ||
| out += ( | ||
| indent + "\t* Estimated single node throughput:" | ||
| f" {total_num_out_rows / wall_time_stats['sum']} " | ||
| indent + "\t* Estimated single task throughput:" | ||
dancingactor marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| f" {self.num_rows_per_task_s} " | ||
| "rows/s\n" | ||
| ) | ||
| return out | ||
|
|
||
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These comments were moved to https://github.com/ray-project/ray/pull/58693/files#diff-4dba40d789c60bfba4ae769f109b39979aa7d6977390329e7e2bb0e666569009R1221-R1226
The comment for "estimated single node" was removed since we removed this part from the `Dataset` class.