[Data] Make throughput stats tests deterministic by replacing test_dataset_throughput with mock-based unit tests

dancingactor · dancingactor · commit 1b087b344fb0 · 2025-11-24T14:58:31.000+08:00
Signed-off-by: dancingactor <s990346@gmail.com>
diff --git a/python/ray/data/_internal/stats.py b/python/ray/data/_internal/stats.py
@@ -1230,6 +1230,18 @@ def to_string(
 
         return out
 
+    @property
+    def num_rows_per_s(self) -> float:
+        """Calculates the throughput in rows per second for the entire dataset."""
+        output_num_rows = (
+            self.operators_stats[-1].output_num_rows if self.operators_stats else 0
+        )
+        total_num_out_rows = output_num_rows["sum"] if output_num_rows else 0
+        wall_time = self.get_total_wall_time()
+        if not total_num_out_rows or not wall_time:
+            return 0.0
+        return total_num_out_rows / wall_time
+
     @staticmethod
     def _collect_dataset_stats_summaries(
         curr: "DatasetStatsSummary",
@@ -1378,6 +1390,20 @@ class OperatorStatsSummary:
     node_count: Optional[Dict[str, float]] = None
     task_rows: Optional[Dict[str, float]] = None
 
+    @property
+    def num_rows_per_s(self) -> float:
+        """Calculates the overall throughput in rows per second for this operator."""
+        if not self.output_num_rows or not self.time_total_s:
+            return 0.0
+        return self.output_num_rows["sum"] / self.time_total_s
+
+    @property
+    def num_rows_per_task_s(self) -> float:
+        """Calculates the estimated single-task throughput in rows per second."""
+        if not self.output_num_rows or not self.wall_time or not self.wall_time["sum"]:
+            return 0.0
+        return self.output_num_rows["sum"] / self.wall_time["sum"]
+
     @classmethod
     def from_block_metadata(
         cls,
diff --git a/python/ray/data/tests/test_stats.py b/python/ray/data/tests/test_stats.py
@@ -1744,35 +1744,73 @@ def test_task_duration_stats():
     )  # ddof=1 for sample standard deviation
 
 
-# NOTE: All tests above share a Ray cluster, while the tests below do not. These
-# tests should only be carefully reordered to retain this invariant!
+def test_dataset_throughput_calculation(ray_start_regular_shared):
+    """Test throughput calculations using mock block stats."""
+    from ray.data._internal.stats import DatasetStats
+    from ray.data.block import BlockExecStats, BlockStats
+
+    def create_block_stats(start_time, end_time, num_rows):
+        exec_stats = BlockExecStats()
+        exec_stats.start_time_s = start_time
+        exec_stats.end_time_s = end_time
+        exec_stats.wall_time_s = end_time - start_time
+        exec_stats.cpu_time_s = exec_stats.wall_time_s
+        return BlockStats(num_rows=num_rows, size_bytes=None, exec_stats=exec_stats)
+
+    blocks = [
+        create_block_stats(0.0, 2.0, 100),
+        create_block_stats(0.5, 2.5, 100),
+        create_block_stats(1.0, 3.0, 100),
+    ]
 
+    stats = DatasetStats(metadata={"Map": blocks}, parent=None)
+    summary = stats.to_summary()
 
-def test_dataset_throughput(shutdown_only):
-    ray.shutdown()
-    ray.init(num_cpus=2)
+    # Throughput: total rows / total execution duration
+    # Total rows = 300
+    # Duration = max end_time - min start_time = 3.0s
+    # 300 rows / 3s = 100 rows/s
+    assert summary.num_rows_per_s == 100
 
-    f = dummy_map_batches_sleep(0.01)
-    ds = ray.data.range(100).map(f).materialize().map(f).materialize()
 
-    operator_pattern = re.compile(
-        r"Operator (\d+).*?\* Operator throughput:\s*.*?\* Ray Data throughput: (\d+\.\d+) rows/s.*?\* Estimated single node throughput: (\d+\.\d+) rows/s",
-        re.DOTALL,
-    )
+def test_operator_throughput_calculation(ray_start_regular_shared):
+    """Test operator throughput calculations using mock BlockStats."""
+    from ray.data._internal.stats import OperatorStatsSummary
+    from ray.data.block import BlockExecStats, BlockStats
 
-    # Ray data throughput should always be better than single node throughput for
-    # multi-cpu case.
-    for match in operator_pattern.findall(ds.stats()):
-        assert float(match[1]) >= float(match[2])
+    def create_block_stats(start_time, end_time, num_rows, task_idx):
+        exec_stats = BlockExecStats()
+        exec_stats.start_time_s = start_time
+        exec_stats.end_time_s = end_time
+        exec_stats.wall_time_s = end_time - start_time
+        exec_stats.cpu_time_s = exec_stats.wall_time_s
+        exec_stats.task_idx = task_idx
 
-    # Pattern to match dataset throughput
-    dataset_pattern = re.compile(
-        r"Dataset throughput:.*?Ray Data throughput: (\d+\.\d+) rows/s.*?Estimated single node throughput: (\d+\.\d+) rows/s",  # noqa: E501
-        re.DOTALL,
+        return BlockStats(num_rows=num_rows, size_bytes=None, exec_stats=exec_stats)
+
+    blocks = [
+        create_block_stats(0.0, 2.0, 100, 1),
+        create_block_stats(0.0, 2.0, 100, 2),
+    ]
+
+    summary = OperatorStatsSummary.from_block_metadata(
+        operator_name="MockOperator",
+        block_stats=blocks,
+        is_sub_operator=False,
     )
 
-    dataset_match = dataset_pattern.search(ds.stats())
-    assert float(dataset_match[1]) >= float(dataset_match[2])
+    # Total rows = 200
+    # Total operator wall time (from earliest start to latest end) = 2.0s
+    # Sum of individual task wall times = 2.0s + 2.0s = 4.0s
+    # Overall throughput: Total rows / Total operator wall time
+    assert summary.num_rows_per_s == 200 / (2.0 - 0.0)
+
+    # Estimated single task throughput: Total rows / Sum of individual task wall times
+    assert summary.num_rows_per_task_s == 200 / (2.0 + 2.0)
+
+
+# NOTE: All tests above share a Ray cluster, while the tests below do not. These
+# tests should only be carefully reordered to retain this invariant!
 
 
 def test_individual_operator_num_rows(shutdown_only):