diff --git a/lib/compiler/include/compiler/cost_estimator/op_cost_estimate_key.h b/lib/compiler/include/compiler/cost_estimator/op_cost_estimate_key.h
new file mode 100644
index 0000000000..93a1143cde
--- /dev/null
+++ b/lib/compiler/include/compiler/cost_estimator/op_cost_estimate_key.h
@@ -0,0 +1,19 @@
+#ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_OP_COST_ESTIMATE_KEY_H
+#define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_OP_COST_ESTIMATE_KEY_H
+
+#include "compiler/cost_estimator/op_cost_estimate_key.dtg.h"
+#include "pcg/device_id_t.dtg.h"
+#include "pcg/machine_specification.dtg.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h"
+#include "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h"
+
+namespace FlexFlow {
+
+OpCostEstimateKey get_mapped_op_cost_estimate_key_for_layer(
+    ParallelComputationGraph const &pcg,
+    parallel_layer_guid_t const &layer,
+    MachineView const &machine_view);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/compiler/include/compiler/cost_estimator/op_cost_metrics.struct.toml b/lib/compiler/include/compiler/cost_estimator/op_cost_metrics.struct.toml
index d2ff3f42e7..5e81d6c10e 100644
--- a/lib/compiler/include/compiler/cost_estimator/op_cost_metrics.struct.toml
+++ b/lib/compiler/include/compiler/cost_estimator/op_cost_metrics.struct.toml
@@ -11,7 +11,11 @@ includes = [
 ]
 
 [[fields]]
-name = "runtime"
+name = "forward_runtime"
+type = "float"
+
+[[fields]]
+name = "backward_runtime"
 type = "float"
 
 [[fields]]
diff --git a/lib/compiler/include/compiler/cost_estimator/tensor_set_movement.h b/lib/compiler/include/compiler/cost_estimator/tensor_set_movement.h
new file mode 100644
index 0000000000..34188ff97c
--- /dev/null
+++ b/lib/compiler/include/compiler/cost_estimator/tensor_set_movement.h
@@ -0,0 +1,19 @@
+#ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_TENSOR_SET_MOVEMENT_H
+#define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_TENSOR_SET_MOVEMENT_H
+
+#include "compiler/cost_estimator/tensor_set_movement.dtg.h"
+#include "pcg/machine_view.dtg.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph_edge.dtg.h"
+
+namespace FlexFlow {
+
+TensorSetMovement get_tensor_set_movement_from_pcg_edge(
+    ParallelComputationGraphEdge const &edge,
+    ParallelComputationGraph const &pcg,
+    MachineView const &src_mv,
+    MachineView const &dst_mv);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping.h b/lib/compiler/include/compiler/machine_mapping/machine_mapping.h
index 06cbbf942d..7375cde985 100644
--- a/lib/compiler/include/compiler/machine_mapping/machine_mapping.h
+++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping.h
@@ -2,6 +2,10 @@
 #define _FLEXFLOW_COMPILER_MACHINE_MAPPING_H
 
 #include "compiler/machine_mapping/machine_mapping.dtg.h"
+#include "pcg/device_id_t.dtg.h"
+#include "pcg/machine_specification.dtg.h"
+#include "pcg/operator_task_space.dtg.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h"
 
 namespace FlexFlow {
 
diff --git a/lib/compiler/include/compiler/machine_mapping/unstructured_device_mapping.h b/lib/compiler/include/compiler/machine_mapping/unstructured_device_mapping.h
new file mode 100644
index 0000000000..0fb31210fd
--- /dev/null
+++ b/lib/compiler/include/compiler/machine_mapping/unstructured_device_mapping.h
@@ -0,0 +1,18 @@
+#ifndef _FLEXFLOW_COMPILER_MACHINE_MAPPING_UNSTRUCTURED_DEVICE_MAPPING_H
+#define _FLEXFLOW_COMPILER_MACHINE_MAPPING_UNSTRUCTURED_DEVICE_MAPPING_H
+
+#include "compiler/machine_mapping/machine_mapping.dtg.h"
+#include "compiler/machine_mapping/unstructured_device_mapping.dtg.h"
+#include "pcg/machine_specification.dtg.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h"
+
+namespace FlexFlow {
+
+UnstructuredDeviceMapping
+    get_unstructured_device_mapping(MachineMapping const &machine_mapping,
+                                    MachineSpecification const &machine_spec,
+                                    ParallelComputationGraph const &pcg);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/compiler/include/compiler/machine_mapping/unstructured_device_mapping.struct.toml b/lib/compiler/include/compiler/machine_mapping/unstructured_device_mapping.struct.toml
new file mode 100644
index 0000000000..ae38a37292
--- /dev/null
+++ b/lib/compiler/include/compiler/machine_mapping/unstructured_device_mapping.struct.toml
@@ -0,0 +1,26 @@
+namespace = "FlexFlow"
+name = "UnstructuredDeviceMapping"
+features = [
+  "eq",
+  # "ord",
+  "hash",
+  # "json",
+  # "rapidcheck",
+  "fmt",
+]
+
+includes = [
+  "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h",
+  "pcg/device_id_t.dtg.h"
+]  
+
+src_includes = [   
+  "utils/hash/unordered_map.h",
+  "utils/fmt/unordered_map.h",  
+  "utils/hash/unordered_set.h",  
+  "utils/fmt/unordered_set.h" 
+]
+
+[[fields]]
+name = "raw_device_map"
+type = "std::unordered_map<::FlexFlow::parallel_layer_guid_t, std::unordered_set<::FlexFlow::device_id_t>>"
diff --git a/lib/compiler/include/compiler/task_graph_simulator/in_progress_task.struct.toml b/lib/compiler/include/compiler/task_graph_simulator/in_progress_task.struct.toml
new file mode 100644
index 0000000000..71e0e17f5e
--- /dev/null
+++ b/lib/compiler/include/compiler/task_graph_simulator/in_progress_task.struct.toml
@@ -0,0 +1,26 @@
+namespace = "FlexFlow"
+name = "InProgressTask"
+
+features = [
+  "eq",
+  "hash",
+  "fmt",
+  "ord"
+]
+
+includes = [
+  "utils/graph/node/node.dtg.h"
+]
+
+
+[[fields]]
+name = "start_time"
+type = "float"
+
+[[fields]]
+name = "end_time"
+type = "float"
+
+[[fields]]
+name = "node"
+type = "::FlexFlow::Node"
diff --git a/lib/compiler/include/compiler/task_graph_simulator/in_progress_task_comparator.h b/lib/compiler/include/compiler/task_graph_simulator/in_progress_task_comparator.h
new file mode 100644
index 0000000000..ed509cb7be
--- /dev/null
+++ b/lib/compiler/include/compiler/task_graph_simulator/in_progress_task_comparator.h
@@ -0,0 +1,13 @@
+#ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_IN_PROGRESS_TASK_COMPARATOR_H
+#define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_IN_PROGRESS_TASK_COMPARATOR_H
+
+#include "compiler/task_graph_simulator/in_progress_task.dtg.h"
+#include <tuple>
+
+namespace FlexFlow {
+struct InProgressTaskComparator {
+  bool operator()(InProgressTask const &lhs, InProgressTask const &rhs) const;
+};
+} // namespace FlexFlow
+
+#endif // _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_IN_PROGRESS_TASK_COMPARATOR_H
diff --git a/lib/compiler/include/compiler/task_graph_simulator/pcg_task.variant.toml b/lib/compiler/include/compiler/task_graph_simulator/pcg_task.variant.toml
new file mode 100644
index 0000000000..13f2f17652
--- /dev/null
+++ b/lib/compiler/include/compiler/task_graph_simulator/pcg_task.variant.toml
@@ -0,0 +1,20 @@
+namespace = "FlexFlow"
+name = "PCGTask"
+features = [
+  "eq",
+  "hash",
+  "fmt",
+]
+
+includes = [
+  "compiler/cost_estimator/op_cost_estimate_key.dtg.h",
+  "compiler/cost_estimator/tensor_set_movement.dtg.h",
+]
+
+[[values]]
+type = "::FlexFlow::OpCostEstimateKey"
+key = "operator"
+
+[[values]]
+type = "::FlexFlow::TensorSetMovement"
+key = "tensor_movement"
diff --git a/lib/compiler/include/compiler/task_graph_simulator/pcg_task_graph.h b/lib/compiler/include/compiler/task_graph_simulator/pcg_task_graph.h
new file mode 100644
index 0000000000..2c6d6514e8
--- /dev/null
+++ b/lib/compiler/include/compiler/task_graph_simulator/pcg_task_graph.h
@@ -0,0 +1,17 @@
+#ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_PCG_TASK_GRAPH_H
+#define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_PCG_TASK_GRAPH_H
+
+#include "compiler/machine_mapping/machine_mapping.dtg.h"
+#include "compiler/task_graph_simulator/pcg_task_graph.dtg.h"
+#include "pcg/machine_specification.dtg.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h"
+
+namespace FlexFlow {
+
+PCGTaskGraph get_pcg_task_graph(ParallelComputationGraph const &pcg,
+                                MachineMapping const &machine_mapping,
+                                MachineSpecification const &machine_spec);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/compiler/include/compiler/task_graph_simulator/pcg_task_graph.struct.toml b/lib/compiler/include/compiler/task_graph_simulator/pcg_task_graph.struct.toml
new file mode 100644
index 0000000000..099f44c564
--- /dev/null
+++ b/lib/compiler/include/compiler/task_graph_simulator/pcg_task_graph.struct.toml
@@ -0,0 +1,34 @@
+namespace = "FlexFlow"
+name = "PCGTaskGraph"
+
+features = [
+]
+
+includes = [
+  "utils/graph/digraph/digraph_view.h",
+  "utils/bidict/bidict.h",
+  "compiler/task_graph_simulator/pcg_task.dtg.h",
+  "pcg/device_id_t.dtg.h",
+  "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h",
+  "<unordered_set>",
+  "<unordered_map>"
+]
+
+src_includes = [
+  "utils/fmt/unordered_set.h",
+  "utils/hash/unordered_set.h",
+  "utils/fmt/unordered_map.h",
+  "utils/hash/unordered_map.h"
+]
+
+[[fields]]
+name = "graph"
+type = "::FlexFlow::DiGraphView"
+
+[[fields]]
+name = "node_to_task"
+type = "::FlexFlow::bidict<::FlexFlow::Node, ::FlexFlow::PCGTask>"
+
+[[fields]]
+name = "node_to_devices"
+type = "std::unordered_map<::FlexFlow::Node, std::unordered_set<::FlexFlow::device_id_t>>"
diff --git a/lib/compiler/include/compiler/task_graph_simulator/simulate_task_graph_execution.h b/lib/compiler/include/compiler/task_graph_simulator/simulate_task_graph_execution.h
new file mode 100644
index 0000000000..424e65f9df
--- /dev/null
+++ b/lib/compiler/include/compiler/task_graph_simulator/simulate_task_graph_execution.h
@@ -0,0 +1,17 @@
+#ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_SIMULATE_TASK_GRAPH_EXECUTION_H
+#define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_SIMULATE_TASK_GRAPH_EXECUTION_H
+
+#include "compiler/task_graph_simulator/task_execution_constraint.dtg.h"
+#include "compiler/task_graph_simulator/task_graph_execution_trace.dtg.h"
+#include "utils/graph/digraph/digraph_view.h"
+#include <functional>
+namespace FlexFlow {
+
+TaskGraphExecutionTrace simulate_task_graph_execution(
+    DiGraphView const &task_graph,
+    std::function<float(Node const &)> cost_function,
+    TaskExecutionConstraint const &constraint);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/compiler/include/compiler/task_graph_simulator/task_execution_constraint.struct.toml b/lib/compiler/include/compiler/task_graph_simulator/task_execution_constraint.struct.toml
new file mode 100644
index 0000000000..004655b5ec
--- /dev/null
+++ b/lib/compiler/include/compiler/task_graph_simulator/task_execution_constraint.struct.toml
@@ -0,0 +1,15 @@
+namespace = "FlexFlow"
+name = "TaskExecutionConstraint"
+features = [
+]
+
+includes = [
+  "utils/graph/node/node.dtg.h",
+  "<functional>",
+  "<unordered_set>"
+]
+
+
+[[fields]]
+name = "is_satisfied"
+type = "std::function<bool(Node const &, std::unordered_set<Node> const &, std::unordered_set<Node> const &)>"
diff --git a/lib/compiler/include/compiler/task_graph_simulator/task_graph_execution_state.struct.toml b/lib/compiler/include/compiler/task_graph_simulator/task_graph_execution_state.struct.toml
new file mode 100644
index 0000000000..b96d7264b9
--- /dev/null
+++ b/lib/compiler/include/compiler/task_graph_simulator/task_graph_execution_state.struct.toml
@@ -0,0 +1,40 @@
+namespace = "FlexFlow"
+name = "TaskGraphExecutionState"
+
+features = [
+]
+
+includes = [
+  "utils/deduplicated_priority_queue.h",
+  "utils/graph/node/node.dtg.h",
+  "compiler/task_graph_simulator/in_progress_task.dtg.h", 
+  "compiler/task_graph_simulator/in_progress_task_comparator.h",
+  "<unordered_set>",
+  "<set>",
+  "<functional>"
+]
+
+src_includes = [
+  "utils/hash/unordered_set.h", 
+  "utils/fmt/unordered_set.h",
+  "utils/hash/set.h", 
+  "utils/fmt/set.h",
+  "utils/fmt/vector.h", 
+  "utils/hash/vector.h"
+]
+
+[[fields]]
+name = "ready_tasks"
+type = "std::set<::FlexFlow::Node>"
+
+[[fields]]
+name = "in_progress_tasks"
+type = "::FlexFlow::DeduplicatedPriorityQueue<::FlexFlow::InProgressTask, std::vector<::FlexFlow::InProgressTask>, ::FlexFlow::InProgressTaskComparator>"
+
+[[fields]]
+name = "finished_tasks"
+type = "std::unordered_set<::FlexFlow::Node>"
+
+[[fields]]
+name = "current_time"
+type = "float"
diff --git a/lib/compiler/include/compiler/task_graph_simulator/task_graph_execution_trace.h b/lib/compiler/include/compiler/task_graph_simulator/task_graph_execution_trace.h
new file mode 100644
index 0000000000..0ad5b4824b
--- /dev/null
+++ b/lib/compiler/include/compiler/task_graph_simulator/task_graph_execution_trace.h
@@ -0,0 +1,12 @@
+#ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_TASK_GRAPH_SIMULATOR_TASK_GRAPH_EXECUTION_TRACE_H
+#define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_TASK_GRAPH_SIMULATOR_TASK_GRAPH_EXECUTION_TRACE_H
+
+#include "compiler/task_graph_simulator/task_graph_execution_trace.dtg.h"
+
+namespace FlexFlow {
+
+float get_total_execution_time(TaskGraphExecutionTrace const &trace);
+
+} // namespace FlexFlow
+
+#endif // _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_TASK_GRAPH_SIMULATOR_TASK_GRAPH_EXECUTION_TRACE_H
diff --git a/lib/compiler/include/compiler/task_graph_simulator/task_graph_execution_trace.struct.toml b/lib/compiler/include/compiler/task_graph_simulator/task_graph_execution_trace.struct.toml
new file mode 100644
index 0000000000..3003e5a157
--- /dev/null
+++ b/lib/compiler/include/compiler/task_graph_simulator/task_graph_execution_trace.struct.toml
@@ -0,0 +1,23 @@
+namespace = "FlexFlow"
+name = "TaskGraphExecutionTrace"
+
+features = [
+  "hash",
+  "fmt",
+  "eq"
+]
+
+includes = [
+  "compiler/task_graph_simulator/task_profile.dtg.h",
+  "<unordered_set>"
+]
+
+src_includes = [
+  "utils/fmt/unordered_set.h",
+  "utils/hash/unordered_set.h"
+]
+
+
+[[fields]]
+name = "task_profiles"
+type = "std::unordered_set<::FlexFlow::TaskProfile>"
diff --git a/lib/compiler/include/compiler/task_graph_simulator/task_profile.struct.toml b/lib/compiler/include/compiler/task_graph_simulator/task_profile.struct.toml
new file mode 100644
index 0000000000..1a47acfa0e
--- /dev/null
+++ b/lib/compiler/include/compiler/task_graph_simulator/task_profile.struct.toml
@@ -0,0 +1,26 @@
+namespace = "FlexFlow"
+name = "TaskProfile"
+
+features = [
+  "eq",
+  "hash",
+  "fmt",
+  "ord"
+]
+
+includes = [
+  "utils/graph/node/node.dtg.h"
+]
+
+
+[[fields]]
+name = "node"
+type = "::FlexFlow::Node"
+
+[[fields]]
+name = "start_time"
+type = "float"
+
+[[fields]]
+name = "end_time"
+type = "float"
diff --git a/lib/compiler/include/compiler/task_graph_simulator/task_simulator.h b/lib/compiler/include/compiler/task_graph_simulator/task_simulator.h
new file mode 100644
index 0000000000..b35733e419
--- /dev/null
+++ b/lib/compiler/include/compiler/task_graph_simulator/task_simulator.h
@@ -0,0 +1,18 @@
+#ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_TASK_SIMULATOR_H
+#define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_TASK_SIMULATOR_H
+
+#include "compiler/cost_estimator/cost_estimator.h"
+#include "compiler/machine_mapping/machine_mapping.dtg.h"
+#include "pcg/machine_specification.dtg.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h"
+
+namespace FlexFlow {
+float task_simulator_estimate_forward_pass_time(
+    ParallelComputationGraph const &pcg,
+    CostEstimator const &estimator,
+    MachineMapping const &machine_mapping,
+    MachineSpecification const &machine_spec);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/compiler/src/compiler/allowed_machine_views.cc b/lib/compiler/src/compiler/allowed_machine_views.cc
index 1c226f79b0..db7477b460 100644
--- a/lib/compiler/src/compiler/allowed_machine_views.cc
+++ b/lib/compiler/src/compiler/allowed_machine_views.cc
@@ -24,6 +24,10 @@ namespace FlexFlow {
 bool is_valid_machine_view(MachineView const &mv,
                            OperatorTaskSpace const &task,
                            MachineSpecification const &ms) {
+  if (num_dims(mv) != num_dims(task)) {
+    return false;
+  }
+
   std::optional<MachineSpaceCoordinate> maximum_device_coord =
       get_machine_space_coordinate(
           task, mv, get_task_space_maximum_coordinate(task), ms);
diff --git a/lib/compiler/src/compiler/cost_estimator/op_cost_estimate_key.cc b/lib/compiler/src/compiler/cost_estimator/op_cost_estimate_key.cc
new file mode 100644
index 0000000000..ef5775851f
--- /dev/null
+++ b/lib/compiler/src/compiler/cost_estimator/op_cost_estimate_key.cc
@@ -0,0 +1,23 @@
+#include "compiler/cost_estimator/op_cost_estimate_key.h"
+#include "compiler/cost_estimator/op_cost_estimate_key.dtg.h"
+#include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h"
+#include "op-attrs/parallel_tensor_shape.dtg.h"
+#include "pcg/device_id_t.dtg.h"
+#include "pcg/machine_specification.dtg.h"
+#include "pcg/machine_view.dtg.h"
+#include "pcg/machine_view.h"
+#include "pcg/operator_task_space.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h"
+#include <unordered_set>
+
+namespace FlexFlow {
+
+OpCostEstimateKey get_mapped_op_cost_estimate_key_for_layer(
+    ParallelComputationGraph const &pcg,
+    parallel_layer_guid_t const &layer,
+    MachineView const &machine_view) {
+  return map_unmapped_op_cost_estimate_key(
+      get_unmapped_op_cost_estimate_key_for_layer(pcg, layer), machine_view);
+}
+
+} // namespace FlexFlow
diff --git a/lib/compiler/src/compiler/cost_estimator/tensor_set_movement.cc b/lib/compiler/src/compiler/cost_estimator/tensor_set_movement.cc
new file mode 100644
index 0000000000..8f2ab84b84
--- /dev/null
+++ b/lib/compiler/src/compiler/cost_estimator/tensor_set_movement.cc
@@ -0,0 +1,16 @@
+#include "compiler/cost_estimator/tensor_set_movement.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph.h"
+namespace FlexFlow {
+
+TensorSetMovement get_tensor_set_movement_from_pcg_edge(
+    ParallelComputationGraphEdge const &edge,
+    ParallelComputationGraph const &pcg,
+    MachineView const &src_mv,
+    MachineView const &dst_mv) {
+  ParallelTensorShape tensor_shape =
+      get_parallel_tensor_shape(pcg, parallel_tensor_guid_t{edge.raw_edge.src});
+  return TensorSetMovement{
+      {SingleTensorMovement{tensor_shape, {src_mv}, {dst_mv}}}};
+}
+
+} // namespace FlexFlow
diff --git a/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc b/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc
index 5bdd8645a5..49d528e4ab 100644
--- a/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc
+++ b/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc
@@ -1,4 +1,5 @@
 #include "compiler/machine_mapping/get_optimal_machine_mapping.h"
+#include "compiler/cost_estimator/op_cost_metrics.dtg.h"
 #include "compiler/machine_mapping/abstracted_tensor_set_movement/abstracted_tensor_set_movement.h"
 #include "compiler/machine_mapping/get_machine_resource_splits.h"
 #include "compiler/machine_mapping/machine_mapping_cache.h"
@@ -240,8 +241,8 @@ MachineMappingResult
   auto get_mapping_result = [&](MachineView const &machine_view) {
     OpCostEstimateKey mapped =
         map_unmapped_op_cost_estimate_key(leaf, machine_view);
-    float cost = context.cost_estimator.estimate_cost(mapped).runtime;
-
+    OpCostMetrics metrics = context.cost_estimator.estimate_cost(mapped);
+    float cost = metrics.forward_runtime + metrics.backward_runtime;
     return make_singleton_machine_mapping_result(cost, machine_view);
   };
 
diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc
index 57e82684e9..fc3a58995c 100644
--- a/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc
+++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc
@@ -1,13 +1,20 @@
 #include "compiler/machine_mapping/machine_mapping.h"
+#include "pcg/machine_specification.h"
+#include "pcg/machine_view.h"
+#include "pcg/operator_task_space.dtg.h"
+#include "pcg/operator_task_space.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph.h"
 #include "utils/containers/are_disjoint.h"
+#include "utils/containers/get_one_of.h"
 #include "utils/containers/keys.h"
+#include "utils/containers/map_values.h"
 #include "utils/containers/merge_maps.h"
 
 namespace FlexFlow {
 
-MachineMapping combine_disjoint_mappings(MachineMapping const &s1,
-                                         MachineMapping const &s2) {
-  return MachineMapping{merge_maps(s1.machine_views, s2.machine_views)};
+MachineMapping combine_disjoint_mappings(MachineMapping const &m1,
+                                         MachineMapping const &m2) {
+  return MachineMapping{merge_maps(m1.machine_views, m2.machine_views)};
 }
 
 bool nodes_are_disjoint(MachineMapping const &m1, MachineMapping const &m2) {
diff --git a/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.cc b/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.cc
index a6c2d1ed04..9b4a1fd6fe 100644
--- a/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.cc
+++ b/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.cc
@@ -30,7 +30,9 @@ MachineMappingWithMemoryResult remove_non_pareto_optimal_machine_mapping_result(
     bool is_pareto_optimal = true;
     for (MachineMappingForSingleLayer const &other_mapping :
          result.machine_mappings) {
-      if (mapping.cost.runtime >= other_mapping.cost.runtime &&
+      if (mapping.cost.forward_runtime >= other_mapping.cost.forward_runtime &&
+          mapping.cost.backward_runtime >=
+              other_mapping.cost.backward_runtime &&
           mapping.cost.memory >= other_mapping.cost.memory &&
           mapping != other_mapping) {
         is_pareto_optimal = false;
@@ -54,7 +56,10 @@ MachineMappingWithMemoryResult
       [&](MachineMappingForSingleLayer const &pre_mm,
           MachineMappingForSingleLayer const &post_mm) {
         OpCostMetrics cost = OpCostMetrics{
-            pre_mm.cost.runtime + comm_cost + post_mm.cost.runtime,
+            pre_mm.cost.forward_runtime + comm_cost +
+                post_mm.cost.forward_runtime,
+            pre_mm.cost.backward_runtime + comm_cost +
+                post_mm.cost.backward_runtime,
             pre_mm.cost.memory + post_mm.cost.memory,
         };
 
@@ -93,7 +98,9 @@ MachineMappingWithMemoryResult
       [&](MachineMappingForSingleLayer const &lhs_mm,
           MachineMappingForSingleLayer const &rhs_mm) {
         OpCostMetrics cost = OpCostMetrics{
-            std::max(lhs_mm.cost.runtime, rhs_mm.cost.runtime),
+            std::max(lhs_mm.cost.forward_runtime, rhs_mm.cost.forward_runtime),
+            std::max(lhs_mm.cost.backward_runtime,
+                     rhs_mm.cost.backward_runtime), //(@wmdi) is this correct?
             std::max(lhs_mm.cost.memory, rhs_mm.cost.memory),
         };
 
diff --git a/lib/compiler/src/compiler/machine_mapping/unstructured_device_mapping.cc b/lib/compiler/src/compiler/machine_mapping/unstructured_device_mapping.cc
new file mode 100644
index 0000000000..63e359d9ac
--- /dev/null
+++ b/lib/compiler/src/compiler/machine_mapping/unstructured_device_mapping.cc
@@ -0,0 +1,28 @@
+
+#include "compiler/machine_mapping/unstructured_device_mapping.h"
+#include "compiler/machine_mapping/unstructured_device_mapping.dtg.h"
+#include "pcg/machine_specification.h"
+#include "pcg/machine_view.h"
+#include "pcg/operator_task_space.dtg.h"
+#include "pcg/operator_task_space.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph.h"
+#include "utils/containers/keys.h"
+#include "utils/containers/map_values.h"
+
+namespace FlexFlow {
+
+UnstructuredDeviceMapping
+    get_unstructured_device_mapping(MachineMapping const &machine_mapping,
+                                    MachineSpecification const &machine_spec,
+                                    ParallelComputationGraph const &pcg) {
+  std::unordered_map<parallel_layer_guid_t, std::unordered_set<device_id_t>>
+      device_mapping;
+  for (auto const &[layer, machine_view] : machine_mapping.machine_views) {
+    OperatorTaskSpace op = get_operator_task_space(pcg, layer);
+    device_mapping.insert(
+        {layer, get_device_ids(op, machine_view, machine_spec)});
+  }
+  return UnstructuredDeviceMapping{device_mapping};
+}
+
+} // namespace FlexFlow
diff --git a/lib/compiler/src/compiler/task_graph_simulator/in_progress_task_comparator.cc b/lib/compiler/src/compiler/task_graph_simulator/in_progress_task_comparator.cc
new file mode 100644
index 0000000000..2064c56a52
--- /dev/null
+++ b/lib/compiler/src/compiler/task_graph_simulator/in_progress_task_comparator.cc
@@ -0,0 +1,11 @@
+#include "compiler/task_graph_simulator/in_progress_task_comparator.h"
+#include <tuple>
+
+namespace FlexFlow {
+
+bool InProgressTaskComparator::operator()(InProgressTask const &lhs,
+                                          InProgressTask const &rhs) const {
+  return std::tie(lhs.end_time, lhs.node) > std::tie(rhs.end_time, rhs.node);
+}
+
+} // namespace FlexFlow
diff --git a/lib/compiler/src/compiler/task_graph_simulator/pcg_task_graph.cc b/lib/compiler/src/compiler/task_graph_simulator/pcg_task_graph.cc
new file mode 100644
index 0000000000..539c44a963
--- /dev/null
+++ b/lib/compiler/src/compiler/task_graph_simulator/pcg_task_graph.cc
@@ -0,0 +1,59 @@
+#include "compiler/task_graph_simulator/pcg_task_graph.h"
+#include "compiler/cost_estimator/op_cost_estimate_key.h"
+#include "compiler/cost_estimator/tensor_set_movement.h"
+#include "compiler/machine_mapping/machine_mapping.dtg.h"
+#include "pcg/device_id_t.dtg.h"
+#include "pcg/machine_specification.dtg.h"
+#include "pcg/machine_view.dtg.h"
+#include "pcg/machine_view.h"
+#include "pcg/operator_task_space.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph_edge.dtg.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph_edge.h"
+#include "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h"
+#include "utils/bidict/bidict.h"
+#include "utils/graph/instances/adjacency_digraph.h"
+#include <unordered_map>
+#include <unordered_set>
+
+namespace FlexFlow {
+
+PCGTaskGraph get_pcg_task_graph(ParallelComputationGraph const &pcg,
+                                MachineMapping const &machine_mapping,
+                                MachineSpecification const &machine_spec) {
+  DiGraph digraph = DiGraph::create<AdjacencyDiGraph>();
+  bidict<Node, PCGTask> node_to_task;
+  bidict<Node, parallel_layer_guid_t> node_to_layer;
+  std::unordered_map<Node, std::unordered_set<device_id_t>> node_to_devices;
+
+  for (parallel_layer_guid_t const &layer : get_parallel_layers(pcg)) {
+    MachineView mv = machine_mapping.machine_views.at(layer);
+    OpCostEstimateKey op_key =
+        get_mapped_op_cost_estimate_key_for_layer(pcg, layer, mv);
+    Node node = digraph.add_node();
+    node_to_task.equate(node, PCGTask{op_key});
+    node_to_layer.equate(node, layer);
+    node_to_devices[node] =
+        get_device_ids(get_operator_task_space(pcg, layer),
+                       machine_mapping.machine_views.at(layer),
+                       machine_spec);
+  }
+
+  for (ParallelComputationGraphEdge const &edge : get_edges(pcg)) {
+    MachineView src_mv = machine_mapping.machine_views.at(get_src_layer(edge));
+    MachineView dst_mv = machine_mapping.machine_views.at(get_dst_layer(edge));
+    TensorSetMovement movement =
+        get_tensor_set_movement_from_pcg_edge(edge, pcg, src_mv, dst_mv);
+    Node node = digraph.add_node();
+    node_to_task.equate(node, PCGTask{movement});
+    node_to_devices[node] = {};
+    Node src_node = node_to_layer.at_r(get_src_layer(edge));
+    Node dst_node = node_to_layer.at_r(get_dst_layer(edge));
+
+    digraph.add_edge(DirectedEdge{src_node, node});
+    digraph.add_edge(DirectedEdge{node, dst_node});
+  }
+
+  return PCGTaskGraph{digraph, node_to_task, node_to_devices};
+}
+} // namespace FlexFlow
diff --git a/lib/compiler/src/compiler/task_graph_simulator/simulate_task_graph_execution.cc b/lib/compiler/src/compiler/task_graph_simulator/simulate_task_graph_execution.cc
new file mode 100644
index 0000000000..974a70ddf5
--- /dev/null
+++ b/lib/compiler/src/compiler/task_graph_simulator/simulate_task_graph_execution.cc
@@ -0,0 +1,107 @@
+#include "compiler/task_graph_simulator/simulate_task_graph_execution.h"
+#include "compiler/task_graph_simulator/in_progress_task.dtg.h"
+#include "compiler/task_graph_simulator/task_graph_execution_state.dtg.h"
+#include "compiler/task_graph_simulator/task_graph_execution_trace.dtg.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph.h"
+#include "utils/containers/filtrans.h"
+#include "utils/containers/is_subseteq_of.h"
+#include "utils/containers/set_of.h"
+#include "utils/containers/sorted.h"
+#include "utils/exception.h"
+#include "utils/graph/digraph/algorithms.h"
+#include "utils/graph/digraph/algorithms/get_predecessors.h"
+#include "utils/graph/digraph/algorithms/get_successors.h"
+#include "utils/graph/digraph/algorithms/is_acyclic.h"
+#include "utils/graph/digraph/digraph_view.h"
+#include "utils/graph/node/algorithms.h"
+#include "utils/overload.h"
+#include <functional>
+#include <unordered_set>
+
+namespace FlexFlow {
+
+TaskGraphExecutionTrace simulate_task_graph_execution(
+    DiGraphView const &task_graph,
+    std::function<float(Node const &)> cost_function,
+    TaskExecutionConstraint const &constraint) {
+  if (!is_acyclic(task_graph)) {
+    throw mk_runtime_error(
+        "simulate_task_graph_execution cannot simulate cyclic directed graphs");
+  }
+
+  TaskGraphExecutionState execution_state =
+      TaskGraphExecutionState{/*ready_tasks=*/set_of(get_sources(task_graph)),
+                              /*in_progress_tasks=*/{},
+                              /*finished_tasks=*/{},
+                              /*current_time=*/0.0};
+
+  std::unordered_set<TaskProfile> task_profiles;
+
+  auto start_task_processing = [&](Node const &task) {
+    float cost = cost_function(task);
+    execution_state.in_progress_tasks.push(
+        InProgressTask{execution_state.current_time,
+                       execution_state.current_time + cost,
+                       task});
+    execution_state.ready_tasks.erase(task);
+  };
+
+  auto dependencies_are_satisfied = [&](Node const &task) {
+    std::unordered_set<Node> incoming_dependencies =
+        get_predecessors(task_graph, task);
+    return is_subseteq_of(incoming_dependencies,
+                          execution_state.finished_tasks);
+  };
+
+  auto finish_task_processing = [&](InProgressTask const &in_progress_task) {
+    execution_state.finished_tasks.insert(in_progress_task.node);
+    for (Node const &task : get_successors(task_graph, in_progress_task.node)) {
+      if (dependencies_are_satisfied(task)) {
+        execution_state.ready_tasks.insert(task);
+      }
+    }
+    task_profiles.insert(TaskProfile{in_progress_task.node,
+                                     in_progress_task.start_time,
+                                     in_progress_task.end_time});
+    execution_state.current_time = in_progress_task.end_time;
+  };
+
+  auto is_processing_done = [&]() {
+    return execution_state.ready_tasks.empty() &&
+           execution_state.in_progress_tasks.empty();
+  };
+
+  auto get_next_task_to_finish = [&]() {
+    InProgressTask task = execution_state.in_progress_tasks.top();
+    execution_state.in_progress_tasks.pop();
+    return task;
+  };
+
+  while (!is_processing_done()) {
+    auto ready_tasks_copy = execution_state.ready_tasks;
+    for (Node const &task : ready_tasks_copy) {
+      std::unordered_set<Node> raw_in_progress_tasks = transform(
+          unordered_set_of(execution_state.in_progress_tasks.contents()),
+          [](InProgressTask const &t) { return t.node; });
+
+      if (constraint.is_satisfied(
+              task, raw_in_progress_tasks, execution_state.finished_tasks)) {
+        start_task_processing(task);
+      }
+    }
+
+    if (!execution_state.in_progress_tasks.empty()) {
+      InProgressTask next_task = get_next_task_to_finish();
+      finish_task_processing(next_task);
+    } else {
+      throw mk_runtime_error("Constraints cannot be satisfied");
+    }
+  }
+  if (execution_state.finished_tasks.size() != num_nodes(task_graph)) {
+    throw mk_runtime_error("Failed to execute all tasks in given graph");
+  }
+
+  return TaskGraphExecutionTrace{task_profiles};
+}
+
+} // namespace FlexFlow
diff --git a/lib/compiler/src/compiler/task_graph_simulator/task_graph_execution_trace.cc b/lib/compiler/src/compiler/task_graph_simulator/task_graph_execution_trace.cc
new file mode 100644
index 0000000000..716a7afe15
--- /dev/null
+++ b/lib/compiler/src/compiler/task_graph_simulator/task_graph_execution_trace.cc
@@ -0,0 +1,27 @@
+#include "compiler/task_graph_simulator/task_graph_execution_trace.h"
+#include "utils/containers/maximum.h"
+#include "utils/containers/minimum.h"
+#include "utils/containers/transform.h"
+#include "utils/exception.h"
+#include "utils/fmt/unordered_set.h"
+
+namespace FlexFlow {
+
+float get_total_execution_time(TaskGraphExecutionTrace const &trace) {
+  if (trace.task_profiles.empty()) {
+    throw mk_runtime_error(
+        fmt::format("TaskGraphExecutionTrace {} is empty", trace));
+  }
+  float end_time =
+      maximum(transform(trace.task_profiles, [](TaskProfile const &profile) {
+        return profile.end_time;
+      }));
+  float start_time =
+      minimum(transform(trace.task_profiles, [](TaskProfile const &profile) {
+        return profile.start_time;
+      }));
+
+  return end_time - start_time;
+}
+
+} // namespace FlexFlow
diff --git a/lib/compiler/src/compiler/task_graph_simulator/task_simulator.cc b/lib/compiler/src/compiler/task_graph_simulator/task_simulator.cc
new file mode 100644
index 0000000000..ab204e7d71
--- /dev/null
+++ b/lib/compiler/src/compiler/task_graph_simulator/task_simulator.cc
@@ -0,0 +1,71 @@
+#include "compiler/task_graph_simulator/task_simulator.h"
+#include "compiler/cost_estimator/cost_estimator.h"
+#include "compiler/cost_estimator/op_cost_estimate_key.h"
+#include "compiler/machine_mapping/unstructured_device_mapping.dtg.h"
+#include "compiler/machine_mapping/unstructured_device_mapping.h"
+#include "compiler/task_graph_simulator/pcg_task.dtg.h"
+#include "compiler/task_graph_simulator/pcg_task_graph.h"
+#include "compiler/task_graph_simulator/simulate_task_graph_execution.h"
+#include "compiler/task_graph_simulator/task_execution_constraint.dtg.h"
+#include "compiler/task_graph_simulator/task_graph_execution_trace.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph.h"
+#include "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h"
+#include "utils/containers/filtrans.h"
+#include "utils/containers/set_union.h"
+#include "utils/containers/transform.h"
+#include "utils/graph/digraph/digraph.h"
+#include "utils/hash/unordered_set.h"
+#include <unordered_set>
+
+namespace FlexFlow {
+
+float task_simulator_estimate_forward_pass_time(
+    ParallelComputationGraph const &pcg,
+    CostEstimator const &estimator,
+    MachineMapping const &machine_mapping,
+    MachineSpecification const &machine_spec) {
+
+  PCGTaskGraph task_graph =
+      get_pcg_task_graph(pcg, machine_mapping, machine_spec);
+
+  auto cost_function = [&](Node const &node) -> float {
+    PCGTask task = task_graph.node_to_task.at_l(node);
+    if (task.is_operator()) {
+      return estimator.estimate_cost(task.require_operator()).forward_runtime;
+    } else {
+      return estimator.estimate_cost(task.require_tensor_movement());
+    }
+  };
+
+  auto is_allowed_to_run =
+      [&](Node const &task,
+          std::unordered_set<Node> const &in_progress_tasks,
+          std::unordered_set<Node> const &finished_tasks) -> bool {
+    PCGTask current_task = task_graph.node_to_task.at_l(task);
+
+    UnstructuredDeviceMapping device_map =
+        get_unstructured_device_mapping(machine_mapping, machine_spec, pcg);
+
+    if (current_task.is_tensor_movement()) {
+      return true;
+    }
+    assert(current_task.is_operator());
+
+    auto get_devices = [&](Node const &n) {
+      return task_graph.node_to_devices.at(n);
+    };
+
+    std::unordered_set<device_id_t> devices_occupied =
+        set_union(transform(in_progress_tasks, get_devices));
+    std::unordered_set<device_id_t> required_devices = get_devices(task);
+    return intersection(devices_occupied, required_devices).empty();
+  };
+
+  TaskExecutionConstraint constraint =
+      TaskExecutionConstraint{is_allowed_to_run};
+
+  return get_total_execution_time(simulate_task_graph_execution(
+      task_graph.graph, cost_function, constraint));
+}
+
+} // namespace FlexFlow
diff --git a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.cc b/lib/compiler/test/src/compiler/cost_estimator_for_test.cc
similarity index 72%
rename from lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.cc
rename to lib/compiler/test/src/compiler/cost_estimator_for_test.cc
index 0431104878..48e6f5e561 100644
--- a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.cc
+++ b/lib/compiler/test/src/compiler/cost_estimator_for_test.cc
@@ -1,6 +1,8 @@
 #include "./cost_estimator_for_test.h"
+#include "compiler/cost_estimator/op_cost_metrics.dtg.h"
 #include "compiler/machine_mapping/abstracted_tensor_set_movement/abstracted_tensor_set_movement.h"
 #include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h"
+#include "utils/nonnegative_int/nonnegative_int.h"
 
 namespace FlexFlow {
 
@@ -40,4 +42,15 @@ CostEstimator make_fake_cost_estimator(
       });
 }
 
+CostEstimator make_fake_constant_cost_estimator(float forward_op_cost,
+                                                float backward_op_cost,
+                                                float comm_cost,
+                                                nonnegative_int memory_cost) {
+  return make_fake_cost_estimator(
+      [=](OpCostEstimateKey const &op) {
+        return OpCostMetrics{forward_op_cost, backward_op_cost, memory_cost};
+      },
+      [=](TensorSetMovement const &op) { return comm_cost; });
+}
+
 } // namespace FlexFlow
diff --git a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.h b/lib/compiler/test/src/compiler/cost_estimator_for_test.h
similarity index 77%
rename from lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.h
rename to lib/compiler/test/src/compiler/cost_estimator_for_test.h
index 16ea3a85bc..1e8ce83caf 100644
--- a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.h
+++ b/lib/compiler/test/src/compiler/cost_estimator_for_test.h
@@ -1,5 +1,5 @@
-#ifndef _FLEXFLOW_TEST_COST_ESTIMATOR_H
-#define _FLEXFLOW_TEST_COST_ESTIMATOR_H
+#ifndef _FLEXFLOW_TEST_COST_ESTIMATOR_FOR_TEST_H
+#define _FLEXFLOW_TEST_COST_ESTIMATOR_FOR_TEST_H
 
 #include "compiler/cost_estimator/cost_estimator.h"
 #include "compiler/cost_estimator/op_cost_estimate_key.dtg.h"
@@ -7,6 +7,7 @@
 #include "compiler/machine_mapping/abstracted_tensor_set_movement/abstracted_tensor_set_movement.dtg.h"
 #include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.dtg.h"
 #include "compiler/machine_mapping/parallel_layer_guid_oblivious_machine_mapping.dtg.h"
+#include "utils/nonnegative_int/nonnegative_int.h"
 
 namespace FlexFlow {
 
@@ -34,6 +35,11 @@ CostEstimator make_fake_cost_estimator(
     std::unordered_map<OpCostEstimateKey, OpCostMetrics> const &op_cost_map,
     std::unordered_map<TensorSetMovement, float> const &comm_cost_map);
 
+CostEstimator make_fake_constant_cost_estimator(float forward_op_cost,
+                                                float backward_op_cost,
+                                                float comm_cost,
+                                                nonnegative_int memory_cost);
+
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc b/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc
index ac180cd079..542edd9fa9 100644
--- a/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc
+++ b/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc
@@ -1,5 +1,5 @@
 #include "compiler/machine_mapping/get_optimal_machine_mapping.h"
-#include "./cost_estimator_for_test.h"
+#include "../cost_estimator_for_test.h"
 #include "compiler/machine_mapping/abstracted_tensor_set_movement/abstracted_tensor_set_movement.h"
 #include "compiler/machine_mapping/machine_mapping_cache.h"
 #include "compiler/machine_mapping/machine_mapping_constraints.h"
@@ -9,6 +9,7 @@
 #include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h"
 #include "utils/containers/get_only.h"
 #include "utils/full_binary_tree/binary_tree_path.h"
+#include "utils/nonnegative_int/nonnegative_int.h"
 #include <doctest/doctest.h>
 
 using namespace FlexFlow;
@@ -146,13 +147,21 @@ TEST_SUITE(FF_TEST_SUITE) {
 
     auto map1 = std::unordered_map<OpCostEstimateKey, OpCostMetrics>{{
         {map_unmapped_op_cost_estimate_key(k1, mv1),
-         OpCostMetrics{/*runtime=*/1.0, /*memory=*/nonnegative_int{0}}},
+         OpCostMetrics{/*forward_runtime=*/0.5,
+                       /*backward_runtime=*/0.5,
+                       /*memory=*/nonnegative_int{0}}},
         {map_unmapped_op_cost_estimate_key(k2, mv1),
-         OpCostMetrics{/*runtime=*/2.0, /*memory=*/nonnegative_int{0}}},
+         OpCostMetrics{/*forward_runtime=*/1.0,
+                       /*backward_runtime=*/1.0,
+                       /*memory=*/nonnegative_int{0}}},
         {map_unmapped_op_cost_estimate_key(k1, mv2),
-         OpCostMetrics{/*runtime=*/1.5, /*memory=*/nonnegative_int{0}}},
+         OpCostMetrics{/*forward_runtime=*/0.75,
+                       /*backward_runtime=*/0.75,
+                       /*memory=*/nonnegative_int{0}}},
         {map_unmapped_op_cost_estimate_key(k2, mv2),
-         OpCostMetrics{/*runtime=*/2.5, /*memory=*/nonnegative_int{0}}},
+         OpCostMetrics{/*forward_runtime=*/1.25,
+                       /*backward_runtime=*/1.25,
+                       /*memory=*/nonnegative_int{0}}},
     }};
 
     CostEstimator cost_estimator = make_fake_cost_estimator(
diff --git a/lib/compiler/test/src/compiler/machine_mapping/get_tensor_set_movement_across_split.cc b/lib/compiler/test/src/compiler/machine_mapping/get_tensor_set_movement_across_split.cc
index e22f715d82..52ad82595d 100644
--- a/lib/compiler/test/src/compiler/machine_mapping/get_tensor_set_movement_across_split.cc
+++ b/lib/compiler/test/src/compiler/machine_mapping/get_tensor_set_movement_across_split.cc
@@ -1,5 +1,5 @@
 #include "compiler/machine_mapping/get_tensor_set_movement_across_split.h"
-#include "./cost_estimator_for_test.h"
+#include "../cost_estimator_for_test.h"
 #include "compiler/machine_mapping/transitive_reduced_pcg.h"
 #include "pcg/machine_view.h"
 #include "pcg/parallel_computation_graph/parallel_computation_graph.h"
diff --git a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping.cc b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping.cc
index 221cca3ae1..304034f9be 100644
--- a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping.cc
+++ b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping.cc
@@ -1,5 +1,4 @@
 #include "compiler/machine_mapping/machine_mapping.h"
-#include "cost_estimator_for_test.h"
 #include "doctest/doctest.h"
 #include "pcg/machine_view.h"
 
diff --git a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc
index 9706f1c75f..8612017705 100644
--- a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc
+++ b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc
@@ -1,5 +1,5 @@
 #include "compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.h"
-#include "../cost_estimator_for_test.h"
+#include "../../cost_estimator_for_test.h"
 #include "compiler/machine_mapping/abstracted_tensor_set_movement/abstracted_tensor_set_movement.h"
 #include "compiler/machine_mapping/machine_mapping_constraints.h"
 #include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h"
@@ -9,6 +9,7 @@
 #include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h"
 #include "utils/containers/get_only.h"
 #include "utils/full_binary_tree/binary_tree_path.h"
+#include "utils/nonnegative_int/nonnegative_int.h"
 #include <doctest/doctest.h>
 
 using namespace FlexFlow;
@@ -147,24 +148,32 @@ TEST_SUITE(FF_TEST_SUITE) {
     CostEstimator cost_estimator = make_fake_cost_estimator(
         std::unordered_map<OpCostEstimateKey, OpCostMetrics>{{
             {map_unmapped_op_cost_estimate_key(k1, mv1),
-             OpCostMetrics{1.0, nonnegative_int{2}}},
+             OpCostMetrics{/*forward_runtime=*/1.0,
+                           /*backward_runtime=*/1.0,
+                           /*memory=*/nonnegative_int{2}}},
             {map_unmapped_op_cost_estimate_key(k2, mv1),
-             OpCostMetrics{2.0, nonnegative_int{3}}},
+             OpCostMetrics{/*forward_runtime=*/2.0,
+                           /*backward_runtime=*/2.0,
+                           /*memory=*/nonnegative_int{3}}},
             {map_unmapped_op_cost_estimate_key(k1, mv2),
-             OpCostMetrics{1.5, nonnegative_int{1}}},
+             OpCostMetrics{/*forward_runtime=*/1.5,
+                           /*backward_runtime=*/1.5,
+                           /*memory=*/nonnegative_int{1}}},
             {map_unmapped_op_cost_estimate_key(k2, mv2),
-             OpCostMetrics{2.5, nonnegative_int{2}}},
+             OpCostMetrics{/*forward_runtime=*/2.5,
+                           /*backward_runtime=*/2.5,
+                           /*memory=*/nonnegative_int{2}}},
         }},
         std::unordered_map<TensorSetMovement, float>{{
-            {TensorSetMovement{{}}, 0.0},
+            {TensorSetMovement{/*movements=*/{}}, /*cost=*/0.0},
             {concretize_abstracted_tensor_set_movement(movement1, mm1, mm1),
-             0.1},
+             /*cost=*/0.1},
             {concretize_abstracted_tensor_set_movement(movement1, mm2, mm2),
-             0.2},
+             /*cost=*/0.2},
             {concretize_abstracted_tensor_set_movement(movement1, mm1, mm2),
-             0.3},
+             /*cost=*/0.3},
             {concretize_abstracted_tensor_set_movement(movement1, mm2, mm1),
-             0.4},
+             /*cost=*/0.4},
         }});
 
     MachineMappingContext context = MachineMappingContext{
@@ -187,13 +196,17 @@ TEST_SUITE(FF_TEST_SUITE) {
               cache, context, problem_tree, full_machine_spec, constraints);
       MachineMappingWithMemoryResult correct = MachineMappingWithMemoryResult{{
           MachineMappingForSingleLayer{
-              OpCostMetrics{1.0, nonnegative_int{2}},
+              OpCostMetrics{/*forward_runtime=*/1.0,
+                            /*backward_runtime=*/1.0,
+                            /*memory=*/nonnegative_int{2}},
               ParallelLayerGuidObliviousMachineMapping{{
                   {binary_tree_root_path(), mv1},
               }},
           },
           MachineMappingForSingleLayer{
-              OpCostMetrics{1.5, nonnegative_int{1}},
+              OpCostMetrics{/*forward_runtime=*/1.5,
+                            /*backward_runtime=*/1.5,
+                            /*memory=*/nonnegative_int{1}},
               ParallelLayerGuidObliviousMachineMapping{{
                   {binary_tree_root_path(), mv2},
               }},
@@ -217,7 +230,8 @@ TEST_SUITE(FF_TEST_SUITE) {
       MachineMappingWithMemoryResult correct = MachineMappingWithMemoryResult{{
           MachineMappingForSingleLayer{
               OpCostMetrics{
-                  /*runtime=*/1.0 + 2.0 + 0.1,
+                  /*forward_runtime=*/1.0 + 2.0 + 0.1,
+                  /*backward_runtime=*/1.0 + 2.0 + 0.1,
                   /*memory=*/nonnegative_int{2 + 3},
               },
               ParallelLayerGuidObliviousMachineMapping{{
@@ -236,7 +250,9 @@ TEST_SUITE(FF_TEST_SUITE) {
               }},
           },
           MachineMappingForSingleLayer{
-              OpCostMetrics{1.5 + 2.5 + 0.1, nonnegative_int{1 + 2}},
+              OpCostMetrics{/*forward_runtime=*/1.5 + 2.5 + 0.1,
+                            /*backward_runtime=*/1.5 + 2.5 + 0.1,
+                            /*memory=*/nonnegative_int{1 + 2}},
               ParallelLayerGuidObliviousMachineMapping{{
                   {
                       BinaryTreePath{{
@@ -270,7 +286,9 @@ TEST_SUITE(FF_TEST_SUITE) {
               cache, context, problem_tree, full_machine_spec, constraints);
       MachineMappingWithMemoryResult correct =
           MachineMappingWithMemoryResult{{MachineMappingForSingleLayer{
-              OpCostMetrics{2.5, nonnegative_int{2}},
+              OpCostMetrics{/*forward_runtime=*/2.5,
+                            /*backward_runtime=*/2.5,
+                            /*memory=*/nonnegative_int{2}},
               ParallelLayerGuidObliviousMachineMapping{{
                   {
                       BinaryTreePath{{
diff --git a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc
index ecfb7cfeb3..1f3b7545a8 100644
--- a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc
+++ b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc
@@ -1,5 +1,6 @@
 #include "compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.h"
 #include "pcg/machine_view.h"
+#include "utils/nonnegative_int/nonnegative_int.h"
 #include <doctest/doctest.h>
 
 using namespace FlexFlow;
@@ -52,15 +53,20 @@ TEST_SUITE(FF_TEST_SUITE) {
     };
 
     OpCostMetrics cost1 = OpCostMetrics{
-        /*runtime=*/2.0,
+        /*forward_runtime=*/2.0,
+        /*backward_runtime=*/2.0,
         /*memory=*/nonnegative_int{2},
     };
+
     OpCostMetrics cost2 = OpCostMetrics{
-        /*runtime=*/4.0,
+        /*forward_runtime=*/4.0,
+        /*backward_runtime=*/4.0,
         /*memory=*/nonnegative_int{1},
     };
+
     OpCostMetrics cost3 = OpCostMetrics{
-        /*runtime=*/2.0,
+        /*forward_runtime=*/2.0,
+        /*backward_runtime=*/2.0,
         /*memory=*/nonnegative_int{3},
     };
 
@@ -182,7 +188,8 @@ TEST_SUITE(FF_TEST_SUITE) {
     };
 
     OpCostMetrics pre_cost = OpCostMetrics{
-        /*runtime=*/2.0,
+        /*forward_runtime=*/2.0,
+        /*backward_runtime=*/2.0,
         /*memory=*/nonnegative_int{2},
     };
     MachineMappingWithMemoryResult pre = MachineMappingWithMemoryResult{{
@@ -208,7 +215,8 @@ TEST_SUITE(FF_TEST_SUITE) {
     }};
 
     OpCostMetrics post_cost = OpCostMetrics{
-        /*runtime=*/4.0,
+        /*forward_runtime=*/4.0,
+        /*backward_runtime=*/4.0,
         /*memory=*/nonnegative_int{1},
     };
 
@@ -253,8 +261,10 @@ TEST_SUITE(FF_TEST_SUITE) {
               {
                   MachineMappingForSingleLayer{
                       /*cost=*/OpCostMetrics{
-                          /*runtime=*/pre_cost.runtime + comm_cost +
-                              post_cost.runtime,
+                          /*forward_runtime=*/pre_cost.forward_runtime +
+                              comm_cost + post_cost.forward_runtime,
+                          /*backward_runtime=*/pre_cost.backward_runtime +
+                              comm_cost + post_cost.backward_runtime,
                           /*memory=*/pre_cost.memory + post_cost.memory,
                       },
                       /*machine_mapping=*/
@@ -307,8 +317,10 @@ TEST_SUITE(FF_TEST_SUITE) {
             {
                 MachineMappingForSingleLayer{
                     /*cost=*/OpCostMetrics{
-                        /*runtime=*/pre_cost.runtime + comm_cost +
-                            post_cost.runtime,
+                        /*forward_runtime=*/pre_cost.forward_runtime +
+                            comm_cost + post_cost.forward_runtime,
+                        /*backward_runtime=*/pre_cost.backward_runtime +
+                            comm_cost + post_cost.backward_runtime,
                         /*memory=*/pre_cost.memory + post_cost.memory,
                     },
                     /*machine_mapping=*/
@@ -377,7 +389,8 @@ TEST_SUITE(FF_TEST_SUITE) {
     };
 
     OpCostMetrics lhs_cost = OpCostMetrics{
-        /*runtime=*/2.0,
+        /*forward_runtime=*/2.0,
+        /*backward_runtime=*/2.0,
         /*memory=*/nonnegative_int{2},
     };
     MachineMappingWithMemoryResult lhs = MachineMappingWithMemoryResult{{
@@ -403,7 +416,8 @@ TEST_SUITE(FF_TEST_SUITE) {
     }};
 
     OpCostMetrics rhs_cost = OpCostMetrics{
-        /*runtime=*/4.0,
+        /*forward_runtime=*/4.0,
+        /*backward_runtime=*/4.0,
         /*memory=*/nonnegative_int{1},
     };
     MachineMappingWithMemoryResult rhs = MachineMappingWithMemoryResult{{
@@ -442,7 +456,11 @@ TEST_SUITE(FF_TEST_SUITE) {
       MachineMappingWithMemoryResult correct = MachineMappingWithMemoryResult{{
           MachineMappingForSingleLayer{
               /*cost=*/OpCostMetrics{
-                  /*runtime=*/std::max(lhs_cost.runtime, rhs_cost.runtime),
+                  /*forward_runtime=*/std::max(lhs_cost.forward_runtime,
+                                               rhs_cost.forward_runtime),
+                  /*backward_runtime=*/
+                  std::max(lhs_cost.backward_runtime,
+                           rhs_cost.backward_runtime),
                   /*memory=*/std::max(lhs_cost.memory, rhs_cost.memory),
               },
               /*machine_mapping=*/
@@ -518,15 +536,18 @@ TEST_SUITE(FF_TEST_SUITE) {
     };
 
     OpCostMetrics cost1 = OpCostMetrics{
-        /*runtime=*/2.0,
+        /*forward_runtime=*/2.0,
+        /*backward_runtime=*/2.0,
         /*memory=*/nonnegative_int{2},
     };
     OpCostMetrics cost2 = OpCostMetrics{
-        /*runtime=*/4.0,
+        /*forward_runtime=*/4.0,
+        /*backward_runtime=*/4.0,
         /*memory=*/nonnegative_int{1},
     };
     OpCostMetrics cost3 = OpCostMetrics{
-        /*runtime=*/2.0,
+        /*forward_runtime=*/2.0,
+        /*backward_runtime=*/2.0,
         /*memory=*/nonnegative_int{3},
     };
 
diff --git a/lib/compiler/test/src/compiler/task_graph_simulator/simulate_task_graph_execution.cc b/lib/compiler/test/src/compiler/task_graph_simulator/simulate_task_graph_execution.cc
new file mode 100644
index 0000000000..e88f2b7840
--- /dev/null
+++ b/lib/compiler/test/src/compiler/task_graph_simulator/simulate_task_graph_execution.cc
@@ -0,0 +1,211 @@
+#include "compiler/task_graph_simulator/simulate_task_graph_execution.h"
+#include "compiler/task_graph_simulator/task_graph_execution_state.dtg.h"
+#include "compiler/task_graph_simulator/task_graph_execution_trace.dtg.h"
+#include "utils/containers/lookup_in_map.h"
+#include "utils/graph/algorithms.h"
+#include "utils/graph/digraph/directed_edge.dtg.h"
+#include "utils/graph/instances/adjacency_digraph.h"
+#include <doctest/doctest.h>
+#include <optional>
+
+namespace FlexFlow {
+
+TEST_SUITE(FF_TEST_SUITE) {
+  TEST_CASE("simulate_task_graph_execution") {
+    DiGraph g = DiGraph::create<AdjacencyDiGraph>();
+    SUBCASE("linear graph") {
+      std::vector<Node> n = add_nodes(g, 4);
+      add_edges(g,
+                {
+                    DirectedEdge{n.at(0), n.at(1)},
+                    DirectedEdge{n.at(1), n.at(2)},
+                    DirectedEdge{n.at(2), n.at(3)},
+                });
+
+      auto cost_function = lookup_in_map<Node, float>(
+          {{n.at(0), 1}, {n.at(1), 10}, {n.at(2), 100}, {n.at(3), 1000}});
+
+      auto is_allowed_to_run =
+          [&](Node const &n,
+              std::unordered_set<Node> const &in_progress_tasks,
+              std::unordered_set<Node> const &finished_tasks) { return true; };
+
+      TaskExecutionConstraint constraint =
+          TaskExecutionConstraint{is_allowed_to_run};
+
+      TaskGraphExecutionTrace result =
+          simulate_task_graph_execution(g, cost_function, constraint);
+      TaskGraphExecutionTrace correct = TaskGraphExecutionTrace{{
+          TaskProfile{n.at(0), 0, 1},
+          TaskProfile{n.at(1), 1, 11},
+          TaskProfile{n.at(2), 11, 111},
+          TaskProfile{n.at(3), 111, 1111},
+      }};
+      CHECK(correct == result);
+    }
+
+    SUBCASE("rhomboidal graph") {
+      std::vector<Node> n = add_nodes(g, 4);
+
+      add_edges(g,
+                {DirectedEdge{n.at(0), n.at(1)},
+                 DirectedEdge{n.at(0), n.at(2)},
+                 DirectedEdge{n.at(1), n.at(3)},
+                 DirectedEdge{n.at(2), n.at(3)}});
+
+      auto cost_function = lookup_in_map<Node, float>(
+          {{n.at(0), 10}, {n.at(1), 15}, {n.at(2), 20}, {n.at(3), 25}});
+
+      SUBCASE("no processing constraints") {
+        auto is_allowed_to_run =
+            [&](Node const &n,
+                std::unordered_set<Node> const &in_progress_tasks,
+                std::unordered_set<Node> const &finished_tasks) {
+              return true;
+            };
+
+        TaskExecutionConstraint constraint =
+            TaskExecutionConstraint{is_allowed_to_run};
+        TaskGraphExecutionTrace result =
+            simulate_task_graph_execution(g, cost_function, constraint);
+        TaskGraphExecutionTrace correct = TaskGraphExecutionTrace{{
+            TaskProfile{n.at(0), 0, 10},
+            TaskProfile{n.at(1), 10, 25},
+            TaskProfile{n.at(2), 10, 30},
+            TaskProfile{n.at(3), 30, 55},
+        }};
+        CHECK(correct == result);
+      }
+
+      SUBCASE("one node at a time") {
+        auto is_allowed_to_run =
+            [&](Node const &n,
+                std::unordered_set<Node> const &in_progress_tasks,
+                std::unordered_set<Node> const &finished_tasks) {
+              return in_progress_tasks.size() == 0;
+            };
+
+        TaskExecutionConstraint constraint =
+            TaskExecutionConstraint{is_allowed_to_run};
+        TaskGraphExecutionTrace result =
+            simulate_task_graph_execution(g, cost_function, constraint);
+        TaskGraphExecutionTrace correct = TaskGraphExecutionTrace{{
+            TaskProfile{n.at(0), 0, 10},
+            TaskProfile{n.at(1), 10, 25},
+            TaskProfile{n.at(2), 25, 45},
+            TaskProfile{n.at(3), 45, 70},
+        }};
+        CHECK(correct == result);
+      }
+    }
+
+    SUBCASE("diamond graph with crossing") {
+      std::vector<Node> n = add_nodes(g, 6);
+
+      add_edges(g,
+                {
+                    DirectedEdge{n.at(0), n.at(1)},
+                    DirectedEdge{n.at(0), n.at(2)},
+                    DirectedEdge{n.at(1), n.at(3)},
+                    DirectedEdge{n.at(2), n.at(3)},
+                    DirectedEdge{n.at(2), n.at(4)},
+                    DirectedEdge{n.at(3), n.at(5)},
+                    DirectedEdge{n.at(4), n.at(5)},
+                });
+
+      auto cost_function = lookup_in_map<Node, float>({{n.at(0), 10},
+                                                       {n.at(1), 15},
+                                                       {n.at(2), 20},
+                                                       {n.at(3), 25},
+                                                       {n.at(4), 30},
+                                                       {n.at(5), 35}});
+
+      SUBCASE("no processing constraints") {
+        auto is_allowed_to_run =
+            [&](Node const &n,
+                std::unordered_set<Node> const &in_progress_tasks,
+                std::unordered_set<Node> const &finished_tasks) {
+              return true;
+            };
+
+        TaskExecutionConstraint constraint =
+            TaskExecutionConstraint{is_allowed_to_run};
+        TaskGraphExecutionTrace result =
+            simulate_task_graph_execution(g, cost_function, constraint);
+        TaskGraphExecutionTrace correct = TaskGraphExecutionTrace{{
+            TaskProfile{n.at(0), 0, 10},
+            TaskProfile{n.at(1), 10, 25},
+            TaskProfile{n.at(2), 10, 30},
+            TaskProfile{n.at(3), 30, 55},
+            TaskProfile{n.at(4), 30, 60},
+            TaskProfile{n.at(5), 60, 95},
+        }};
+        CHECK(correct == result);
+      }
+
+      SUBCASE("one node at a time") {
+        auto is_allowed_to_run =
+            [&](Node const &n,
+                std::unordered_set<Node> const &in_progress_tasks,
+                std::unordered_set<Node> const &finished_tasks) {
+              return in_progress_tasks.size() == 0;
+            };
+
+        TaskExecutionConstraint constraint =
+            TaskExecutionConstraint{is_allowed_to_run};
+        TaskGraphExecutionTrace result =
+            simulate_task_graph_execution(g, cost_function, constraint);
+        TaskGraphExecutionTrace correct = TaskGraphExecutionTrace{{
+            TaskProfile{n.at(0), 0, 10},
+            TaskProfile{n.at(1), 10, 25},
+            TaskProfile{n.at(2), 25, 45},
+            TaskProfile{n.at(3), 45, 70},
+            TaskProfile{n.at(4), 70, 100},
+            TaskProfile{n.at(5), 100, 135},
+        }};
+        CHECK(correct == result);
+      }
+    }
+
+    SUBCASE("all-to-all intermediate") {
+      std::vector<Node> n = add_nodes(g, 5);
+
+      add_edges(g,
+                {DirectedEdge{n.at(0), n.at(1)},
+                 DirectedEdge{n.at(0), n.at(2)},
+                 DirectedEdge{n.at(0), n.at(3)},
+                 DirectedEdge{n.at(1), n.at(4)},
+                 DirectedEdge{n.at(2), n.at(4)},
+                 DirectedEdge{n.at(3), n.at(4)}});
+
+      auto cost_function = lookup_in_map<Node, float>({{n.at(0), 10},
+                                                       {n.at(1), 100},
+                                                       {n.at(2), 100},
+                                                       {n.at(3), 100},
+                                                       {n.at(4), 20}});
+
+      SUBCASE("at most two nodes at a time") {
+        auto is_allowed_to_run =
+            [&](Node const &n,
+                std::unordered_set<Node> const &in_progress_tasks,
+                std::unordered_set<Node> const &finished_tasks) {
+              return in_progress_tasks.size() < 2;
+            };
+
+        TaskExecutionConstraint constraint =
+            TaskExecutionConstraint{is_allowed_to_run};
+        TaskGraphExecutionTrace result =
+            simulate_task_graph_execution(g, cost_function, constraint);
+        TaskGraphExecutionTrace correct = TaskGraphExecutionTrace{{
+            TaskProfile{n.at(0), 0, 10},
+            TaskProfile{n.at(1), 10, 110},
+            TaskProfile{n.at(2), 10, 110},
+            TaskProfile{n.at(3), 110, 210},
+            TaskProfile{n.at(4), 210, 230},
+        }};
+        CHECK(correct == result);
+      }
+    }
+  }
+}
+} // namespace FlexFlow
diff --git a/lib/compiler/test/src/compiler/task_graph_simulator/task_simulator.cc b/lib/compiler/test/src/compiler/task_graph_simulator/task_simulator.cc
new file mode 100644
index 0000000000..e278338440
--- /dev/null
+++ b/lib/compiler/test/src/compiler/task_graph_simulator/task_simulator.cc
@@ -0,0 +1,265 @@
+#include "compiler/task_graph_simulator/task_simulator.h"
+#include "../cost_estimator_for_test.h"
+#include "compiler/cost_estimator/cost_estimator.h"
+#include "compiler/cost_estimator/op_cost_metrics.dtg.h"
+#include "compiler/machine_mapping/machine_mapping.dtg.h"
+#include "compiler/machine_mapping/machine_mapping.h"
+#include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h"
+#include "op-attrs/ops/input_attrs.dtg.h"
+#include "op-attrs/parallel_tensor_dims.dtg.h"
+#include "op-attrs/parallel_tensor_shape.dtg.h"
+#include "op-attrs/parallel_tensor_shape.h"
+#include "pcg/device_id.h"
+#include "pcg/device_type.dtg.h"
+#include "pcg/machine_space_coordinate.dtg.h"
+#include "pcg/machine_specification.h"
+#include "pcg/machine_specification_dimension.dtg.h"
+#include "pcg/machine_view.dtg.h"
+#include "pcg/machine_view.h"
+#include "pcg/machine_view_dimension.dtg.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h"
+#include "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h"
+#include "pcg/parallel_computation_graph/parallel_tensor_guid_t.h"
+#include "pcg/stride_t.dtg.h"
+#include "substitutions/sub_parallel_computation_graph.dtg.h"
+#include "substitutions/sub_parallel_computation_graph.h"
+#include "utils/containers/get_only.h"
+#include "utils/deduplicated_priority_queue.h"
+#include "utils/graph/open_dataflow_graph/algorithms/get_source_nodes.h"
+#include "utils/nonnegative_int/nonnegative_int.h"
+#include <doctest/doctest.h>
+#include <optional>
+#include <unordered_map>
+#include <unordered_set>
+
+namespace FlexFlow {
+
+TEST_SUITE(FF_TEST_SUITE) {
+  TEST_CASE("task_simulator_estimate_forward_pass_time") {
+    MachineSpecification machine_spec =
+        MachineSpecification{/*num_nodes=*/3,
+                             /*num_cpus_per_node=*/3,
+                             /*num_gpus_per_node=*/3,
+                             /*inter_node_bandwidth=*/1.0f,
+                             /*intra_node_bandwidth=*/1.0f};
+
+    SUBCASE("linear graph") {
+      ParallelComputationGraphBuilder b;
+      ParallelTensorShape input_shape = ParallelTensorShape{
+          ParallelTensorDims{
+              FFOrdered<ShardParallelDim>{},
+              ReplicaParallelDimSet{
+                  SumDegree{1},
+                  DiscardCopyDegree{1},
+              },
+          },
+          DataType::FLOAT,
+      };
+      parallel_tensor_guid_t tensor0 = b.create_input_tensor(input_shape);
+      parallel_tensor_guid_t tensor1 = b.relu(tensor0);
+
+      parallel_layer_guid_t layer0 = get_source_layer(tensor0);
+      parallel_layer_guid_t layer1 = get_source_layer(tensor1);
+
+      std::vector<MachineViewDimension> dims = {
+          MachineViewDimension{stride_t{1},
+                               MachineSpecificationDimension::INTER_NODE},
+          MachineViewDimension{stride_t{1},
+                               MachineSpecificationDimension::INTER_NODE},
+      };
+      ParallelComputationGraph pcg = b.pcg;
+      MachineView mv1 =
+          MachineView{MachineSpaceCoordinate{0, 0, DeviceType::GPU}, dims};
+      MachineView mv2 =
+          MachineView{MachineSpaceCoordinate{0, 1, DeviceType::GPU}, dims};
+
+      MachineMapping device_mapping = MachineMapping{{
+          {layer0, mv1},
+          {layer1, mv2},
+      }};
+
+      SUBCASE("constant op, comm cost") {
+        CostEstimator estimator = make_fake_constant_cost_estimator(
+            /*forward_op_cost=*/10.0f,
+            /*backward_op_cost=*/10.0f,
+            /*comm_cost=*/1.0f,
+            /*memory_cost=*/nonnegative_int{0});
+
+        float result = task_simulator_estimate_forward_pass_time(
+            pcg, estimator, device_mapping, machine_spec);
+
+        float correct = 10 + 1 + 10;
+        CHECK(result == correct);
+      }
+
+      SUBCASE("variable op, comm cost") {
+        CostEstimator cost_estimator = make_fake_cost_estimator(
+            [](OpCostEstimateKey const &op) {
+              if (op.op_attrs.has<InputAttrs>()) {
+                return OpCostMetrics{/*forward_runtime=*/10.0f,
+                                     /*backward_runtime=*/10.0f,
+                                     /*memory=*/nonnegative_int{0}}; // layer0
+              }
+              if (op.op_attrs.has<ElementUnaryAttrs>()) {
+                return OpCostMetrics{/*forward_runtime=*/1.0f,
+                                     /*backward_runtime=*/1.0f,
+                                     /*memory=*/nonnegative_int{0}}; // layer1
+              }
+              return OpCostMetrics{/*forward_runtime=*/0.0f,
+                                   /*backward_runtime=*/0.0f,
+                                   /*memory=*/nonnegative_int{0}};
+            },
+            [](TensorSetMovement const &comm) { return 5.0f; });
+
+        float result = task_simulator_estimate_forward_pass_time(
+            pcg, cost_estimator, device_mapping, machine_spec);
+        float correct = 10 + 5 + 1;
+        CHECK(result == correct);
+      }
+    }
+
+    SUBCASE("rhomboidal graph") {
+      ParallelComputationGraphBuilder b;
+
+      ParallelTensorShape input_shape = ParallelTensorShape{
+          ParallelTensorDims{
+              FFOrdered<ShardParallelDim>{ShardParallelDim{10, 1}},
+              ReplicaParallelDimSet{
+                  SumDegree{1},
+                  DiscardCopyDegree{1},
+              },
+          },
+          DataType::FLOAT,
+      };
+
+      parallel_tensor_guid_t tensor0 = b.create_input_tensor(input_shape);
+      parallel_tensor_guid_t tensor1 = b.relu(tensor0);
+      parallel_tensor_guid_t tensor2 = b.relu(tensor0);
+      parallel_tensor_guid_t tensor3 = b.add(tensor1, tensor2);
+
+      parallel_layer_guid_t layer0 = get_source_layer(tensor0);
+      parallel_layer_guid_t layer1 = get_source_layer(tensor1);
+      parallel_layer_guid_t layer2 = get_source_layer(tensor2);
+      parallel_layer_guid_t layer3 = get_source_layer(tensor3);
+
+      ParallelComputationGraph pcg = b.pcg;
+      std::vector<MachineViewDimension> dims = {
+          MachineViewDimension{stride_t{1},
+                               MachineSpecificationDimension::INTER_NODE},
+          MachineViewDimension{stride_t{1},
+                               MachineSpecificationDimension::INTER_NODE},
+          MachineViewDimension{stride_t{1},
+                               MachineSpecificationDimension::INTER_NODE},
+      };
+
+      SUBCASE("all different devices") {
+        MachineView mv0 =
+            MachineView{MachineSpaceCoordinate{0, 0, DeviceType::GPU}, dims};
+        MachineView mv1 =
+            MachineView{MachineSpaceCoordinate{0, 1, DeviceType::GPU}, dims};
+        MachineView mv2 =
+            MachineView{MachineSpaceCoordinate{1, 0, DeviceType::GPU}, dims};
+        MachineView mv3 =
+            MachineView{MachineSpaceCoordinate{1, 1, DeviceType::GPU}, dims};
+
+        MachineMapping device_mapping = MachineMapping{{
+            {layer0, mv0},
+            {layer1, mv1},
+            {layer2, mv2},
+            {layer3, mv3},
+        }};
+        SUBCASE("constant op, comm cost") {
+          CostEstimator estimator = make_fake_constant_cost_estimator(
+              /*forward_op_cost=*/10.0f,
+              /*backward_op_cost=*/10.0f,
+              /*comm_cost=*/1.0f,
+              /*memory_cost=*/nonnegative_int{0});
+
+          float result = task_simulator_estimate_forward_pass_time(
+              pcg, estimator, device_mapping, machine_spec);
+          float correct = 10 + 1 + 10 + 1 + 10;
+          CHECK(result == correct);
+        }
+        SUBCASE("variable op, comm cost") {
+          CostEstimator cost_estimator = make_fake_cost_estimator(
+              [](OpCostEstimateKey const &op) {
+                if (op.op_attrs.has<InputAttrs>()) {
+                  return OpCostMetrics{/*forward_runtime=*/10.0f,
+                                       /*backward_runtime=*/10.0f,
+                                       /*memory=*/nonnegative_int{0}}; // layer0
+                }
+                if (op.op_attrs.has<ElementUnaryAttrs>()) {
+                  return OpCostMetrics{
+                      /*forward_runtime=*/1.0f,
+                      /*backward_runtime=*/1.0f,
+                      /*memory=*/nonnegative_int{0}}; // layers 1, 2
+                }
+                if (op.op_attrs.has<ElementBinaryAttrs>()) {
+                  return OpCostMetrics{/*forward_runtime=*/2.0f,
+                                       /*backward_runtime=*/2.0f,
+                                       /*memory=*/nonnegative_int{0}}; // layer3
+                }
+                return OpCostMetrics{/*forward_runtime=*/0.0f,
+                                     /*backward_runtime=*/0.0f,
+                                     /*memory=*/nonnegative_int{0}};
+              },
+              [](TensorSetMovement const &comm) { return 5.0f; });
+        }
+      }
+
+      SUBCASE("all the same device") {
+        MachineView mv =
+            MachineView{MachineSpaceCoordinate{0, 0, DeviceType::GPU}, dims};
+        MachineMapping device_mapping = MachineMapping{{
+            {layer0, mv},
+            {layer1, mv},
+            {layer2, mv},
+            {layer3, mv},
+        }};
+        SUBCASE("constant op, cost cost") {
+          CostEstimator cost_estimator = make_fake_constant_cost_estimator(
+              /*forward_op_cost=*/10.0f,
+              /*backward_op_cost=*/10.0f,
+              /*comm_cost=*/1.0f,
+              /*memory_cost=*/nonnegative_int{0});
+
+          float result = task_simulator_estimate_forward_pass_time(
+              pcg, cost_estimator, device_mapping, machine_spec);
+          float correct = 10 + 10 + 10 + 10 + 1 + 1;
+          CHECK(result == correct);
+        }
+        SUBCASE("variable op, cost cost") {
+          CostEstimator cost_estimator = make_fake_cost_estimator(
+              [](OpCostEstimateKey const &op) {
+                if (op.op_attrs.has<InputAttrs>()) {
+                  return OpCostMetrics{/*forward_runtime=*/10.0f,
+                                       /*backward_runtime=*/10.0f,
+                                       /*memory=*/nonnegative_int{0}}; // layer0
+                }
+                if (op.op_attrs.has<ElementUnaryAttrs>()) {
+                  return OpCostMetrics{
+                      /*forward_runtime=*/1.0f,
+                      /*backward_runtime=*/1.0f,
+                      /*memory=*/nonnegative_int{0}}; // layers 1, 2
+                }
+                if (op.op_attrs.has<ElementBinaryAttrs>()) {
+                  return OpCostMetrics{/*forward_runtime=*/2.0f,
+                                       /*backward_runtime=*/2.0f,
+                                       /*memory=*/nonnegative_int{0}}; // layer3
+                }
+                return OpCostMetrics{/*forward_runtime=*/0.0f,
+                                     /*backward_runtime=*/0.0f,
+                                     /*memory=*/nonnegative_int{0}};
+              },
+              [](TensorSetMovement const &comm) { return 5.0f; });
+          float result = task_simulator_estimate_forward_pass_time(
+              pcg, cost_estimator, device_mapping, machine_spec);
+          float correct = 10 + 5 + (1 + 1) + 5 + 2;
+          CHECK(result == correct);
+        }
+      }
+    }
+  }
+}
+} // namespace FlexFlow
diff --git a/lib/pcg/include/pcg/machine_specification.h b/lib/pcg/include/pcg/machine_specification.h
index 6ffa9900c2..39591e8a70 100644
--- a/lib/pcg/include/pcg/machine_specification.h
+++ b/lib/pcg/include/pcg/machine_specification.h
@@ -20,6 +20,7 @@ bool is_valid_machine_space_coordinate(MachineSpecification const &ms,
 
 device_id_t get_device_id(MachineSpecification const &ms,
                           MachineSpaceCoordinate const &coord);
+
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/pcg/include/pcg/machine_view.h b/lib/pcg/include/pcg/machine_view.h
index 293227b7a1..f72b2359dc 100644
--- a/lib/pcg/include/pcg/machine_view.h
+++ b/lib/pcg/include/pcg/machine_view.h
@@ -37,6 +37,14 @@ std::unordered_set<MachineSpaceCoordinate>
                                   MachineView const &mv,
                                   MachineSpecification const &ms);
 
+std::unordered_set<device_id_t> get_device_ids(OperatorTaskSpace const &task,
+                                               MachineView const &mv,
+                                               MachineSpecification const &ms);
+
+MachineView make_1d_machine_view(MachineSpaceCoordinate const &start,
+                                 MachineSpecificationDimension const &dim,
+                                 stride_t stride);
+
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/pcg/include/pcg/operator_task_space.h b/lib/pcg/include/pcg/operator_task_space.h
index 61cab4eff1..1a19397c72 100644
--- a/lib/pcg/include/pcg/operator_task_space.h
+++ b/lib/pcg/include/pcg/operator_task_space.h
@@ -2,6 +2,8 @@
 #define _FLEXFLOW_PCG_INCLUDE_OPERATOR_TASK_SPACE_H
 
 #include "pcg/operator_task_space.dtg.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h"
+#include "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h"
 #include "pcg/task_space_coordinate.dtg.h"
 #include <cstddef>
 #include <unordered_set>
@@ -17,6 +19,9 @@ TaskSpaceCoordinate
 size_t num_dims(OperatorTaskSpace const &task);
 size_t num_tasks(OperatorTaskSpace const &task);
 
+OperatorTaskSpace get_operator_task_space(ParallelComputationGraph const &pcg,
+                                          parallel_layer_guid_t const &layer);
+
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h
index c740e1ffd2..f7567b5025 100644
--- a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h
+++ b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h
@@ -6,6 +6,7 @@
 #include "pcg/parallel_computation_graph/parallel_layer_added_result.dtg.h"
 #include "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h"
 #include "pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.h"
+#include <unordered_set>
 
 namespace FlexFlow {
 
@@ -31,6 +32,20 @@ std::unordered_set<ParallelComputationGraphEdge>
                                       parallel_layer_guid_t const &,
                                       parallel_layer_guid_t const &);
 
+std::unordered_set<ParallelComputationGraphEdge>
+    get_edges(ParallelComputationGraph const &);
+
+std::unordered_set<ParallelComputationGraphEdge>
+    get_outgoing_edges(ParallelComputationGraph const &,
+                       parallel_layer_guid_t const &);
+
+std::unordered_set<ParallelComputationGraphEdge>
+    get_incoming_edges(ParallelComputationGraph const &,
+                       parallel_layer_guid_t const &);
+
+std::unordered_set<parallel_layer_guid_t>
+    get_initial_layers(ParallelComputationGraph const &);
+
 std::vector<parallel_tensor_guid_t>
     get_incoming_tensors(ParallelComputationGraph const &,
                          parallel_layer_guid_t const &);
@@ -45,6 +60,9 @@ std::vector<parallel_tensor_guid_t>
     get_incoming_weights(ParallelComputationGraph const &,
                          parallel_layer_guid_t const &);
 
+parallel_layer_guid_t get_source_layer(ParallelComputationGraph const &g,
+                                       parallel_tensor_guid_t const &t);
+
 ParallelLayerAttrs get_parallel_layer_attrs(ParallelComputationGraph const &,
                                             parallel_layer_guid_t const &);
 PCGOperatorAttrs pcg_get_op_attrs(ParallelComputationGraph const &,
diff --git a/lib/pcg/src/pcg/machine_specification.cc b/lib/pcg/src/pcg/machine_specification.cc
index ca5b8ba047..19ff50b4b7 100644
--- a/lib/pcg/src/pcg/machine_specification.cc
+++ b/lib/pcg/src/pcg/machine_specification.cc
@@ -1,5 +1,6 @@
 #include "pcg/machine_specification.h"
 #include "pcg/device_id.h"
+#include "utils/containers/transform.h"
 #include "utils/exception.h"
 namespace FlexFlow {
 
diff --git a/lib/pcg/src/pcg/machine_view.cc b/lib/pcg/src/pcg/machine_view.cc
index 18f6cacb7e..cc42ad83b2 100644
--- a/lib/pcg/src/pcg/machine_view.cc
+++ b/lib/pcg/src/pcg/machine_view.cc
@@ -1,14 +1,21 @@
 #include "pcg/machine_view.h"
+#include "pcg/machine_space_coordinate.dtg.h"
+#include "pcg/machine_specification.dtg.h"
 #include "pcg/machine_specification.h"
+#include "pcg/machine_specification_dimension.dtg.h"
+#include "pcg/machine_view_dimension.dtg.h"
+#include "pcg/operator_task_space.dtg.h"
 #include "pcg/operator_task_space.h"
+#include "pcg/stride_t.dtg.h"
 #include "utils/containers/contains.h"
 #include "utils/containers/count.h"
 #include "utils/containers/filter.h"
+#include "utils/containers/get_only.h"
 #include "utils/containers/scanl.h"
 #include "utils/containers/sum.h"
 #include "utils/containers/transform.h"
 #include "utils/containers/zip.h"
-
+#include "utils/exception.h"
 namespace FlexFlow {
 
 size_t num_dims(MachineView const &mv) {
@@ -35,6 +42,13 @@ MachineView machine_view_from_strides_and_machine_spec_dimensions(
     MachineSpaceCoordinate const &start,
     std::vector<stride_t> const &strides,
     std::vector<MachineSpecificationDimension> const &dims) {
+  if (strides.size() != dims.size()) {
+    throw mk_runtime_error(fmt::format(
+        "Length of strides ({}) and dims ({}) must match when calling "
+        "machine_view_from_strides_and_machine_spec_dimensions",
+        start,
+        strides));
+  }
   std::vector<MachineViewDimension> dimensions =
       transform(zip(strides, dims), [&](auto const &p) {
         return MachineViewDimension{p.first, p.second};
@@ -48,6 +62,14 @@ std::optional<MachineSpaceCoordinate> get_machine_space_coordinate(
     TaskSpaceCoordinate const &coord,
     MachineSpecification const &machine_specification) {
 
+  if (num_dims(machine_view) != task.degrees.size()) {
+    throw mk_runtime_error(
+        fmt::format("Dimension of machine_view ({}) must match dimension of "
+                    "task ({}) when computing machine space coordinate",
+                    machine_view,
+                    task.degrees));
+  }
+
   auto get_dimension_indices_for_dimension =
       [&](MachineSpecificationDimension dimension) {
         std::vector<MachineSpecificationDimension> mv_dimensions =
@@ -106,10 +128,37 @@ std::unordered_set<MachineSpaceCoordinate> get_machine_space_coordinates(
     MachineSpecification const &machine_specification) {
   return transform(
       get_task_space_coordinates(task), [&](TaskSpaceCoordinate const &coord) {
-        return get_machine_space_coordinate(
-                   task, machine_view, coord, machine_specification)
-            .value();
+        std::optional<MachineSpaceCoordinate> maybe_coordinate =
+            get_machine_space_coordinate(
+                task, machine_view, coord, machine_specification);
+        if (!maybe_coordinate.has_value()) {
+          throw mk_runtime_error(
+              fmt::format("In get_machine_space_coordinates, the given "
+                          "OperatorTaskSpace {} and MachineView {} are not "
+                          "compatible with the given MachineSpecification {}",
+                          task,
+                          machine_view,
+                          machine_specification));
+        }
+        return maybe_coordinate.value();
       });
 }
 
+std::unordered_set<device_id_t> get_device_ids(OperatorTaskSpace const &task,
+                                               MachineView const &mv,
+                                               MachineSpecification const &ms) {
+  return transform(get_machine_space_coordinates(task, mv, ms),
+                   [&](MachineSpaceCoordinate const &coord) {
+                     return get_device_id(ms, coord);
+                   });
+}
+
+MachineView make_1d_machine_view(MachineSpaceCoordinate const &start,
+                                 MachineSpecificationDimension const &dim,
+                                 stride_t stride) {
+
+  return machine_view_from_strides_and_machine_spec_dimensions(
+      start, {stride}, {dim});
+}
+
 } // namespace FlexFlow
diff --git a/lib/pcg/src/pcg/operator_task_space.cc b/lib/pcg/src/pcg/operator_task_space.cc
index 2538cb4ea0..7157b75082 100644
--- a/lib/pcg/src/pcg/operator_task_space.cc
+++ b/lib/pcg/src/pcg/operator_task_space.cc
@@ -1,12 +1,19 @@
 #include "pcg/operator_task_space.h"
+#include "op-attrs/parallel_tensor_shape.dtg.h"
+#include "op-attrs/parallel_tensor_shape.h"
+#include "pcg/operator_task_space.dtg.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph.h"
+#include "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h"
+#include "pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.h"
 #include "utils/containers/cartesian_product.h"
+#include "utils/containers/extend.h"
 #include "utils/containers/maximum.h"
 #include "utils/containers/product.h"
 #include "utils/containers/range.h"
 #include "utils/containers/transform.h"
 #include "utils/containers/unordered_set_of.h"
+#include "utils/containers/vector_of.h"
 #include "utils/fmt/unordered_set.h"
-
 namespace FlexFlow {
 
 std::unordered_set<TaskSpaceCoordinate>
@@ -36,4 +43,16 @@ size_t num_tasks(OperatorTaskSpace const &task) {
   return product(task.degrees);
 }
 
+OperatorTaskSpace get_operator_task_space(ParallelComputationGraph const &pcg,
+                                          parallel_layer_guid_t const &layer) {
+  parallel_tensor_guid_t out_tensor = get_layer_outputs(pcg, layer).at(0);
+  ParallelTensorShape shape = get_parallel_tensor_shape(pcg, out_tensor);
+
+  std::vector<int> degrees;
+  extend(degrees, vector_of(ff_ordered_shard_degrees(shape)));
+  degrees.push_back(get_sum_degree(shape));
+  degrees.push_back(get_discard_copy_degree(shape));
+  return OperatorTaskSpace{degrees};
+}
+
 } // namespace FlexFlow
diff --git a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph.cc b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph.cc
index 781c44640c..4cc0500fa2 100644
--- a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph.cc
+++ b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph.cc
@@ -1,15 +1,25 @@
 #include "pcg/parallel_computation_graph/parallel_computation_graph.h"
 #include "op-attrs/get_incoming_tensor_roles.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph_edge.dtg.h"
+#include "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h"
 #include "utils/containers/filtrans.h"
 #include "utils/containers/get_only.h"
 #include "utils/containers/transform.h"
+#include "utils/containers/unordered_set_of.h"
 #include "utils/graph/dataflow_graph/algorithms.h"
 #include "utils/graph/dataflow_graph/algorithms/get_dataflow_edges_from_node_to_node.h"
+#include "utils/graph/dataflow_graph/algorithms/get_incoming_edges.h"
+#include "utils/graph/dataflow_graph/algorithms/get_outgoing_edges.h"
+#include "utils/graph/dataflow_graph/dataflow_edge.dtg.h"
+#include "utils/graph/digraph/algorithms.h"
 #include "utils/graph/digraph/algorithms/get_topological_ordering.h"
 #include "utils/graph/instances/unordered_set_labelled_open_dataflow_graph.h"
 #include "utils/graph/labelled_dataflow_graph/algorithms/find_isomorphism.h"
 #include "utils/graph/labelled_dataflow_graph/algorithms/rewrite_node_labels.h"
 #include "utils/graph/node/algorithms.h"
+#include "utils/graph/node/node.dtg.h"
+#include <unordered_set>
 
 namespace FlexFlow {
 
@@ -66,6 +76,13 @@ ParallelLayerAddedResult
                             /*output_labels=*/{tensor_attrs});
 }
 
+std::unordered_set<ParallelComputationGraphEdge>
+    get_edges(ParallelComputationGraph const &pcg) {
+  return transform(get_edges(pcg.raw_graph), [](DataflowEdge const &e) {
+    return ParallelComputationGraphEdge{e};
+  });
+}
+
 std::unordered_set<ParallelComputationGraphEdge>
     get_pcg_edges_from_layer_to_layer(ParallelComputationGraph const &pcg,
                                       parallel_layer_guid_t const &src,
@@ -78,6 +95,33 @@ std::unordered_set<ParallelComputationGraphEdge>
   });
 }
 
+std::unordered_set<ParallelComputationGraphEdge>
+    get_outgoing_edges(ParallelComputationGraph const &pcg,
+                       parallel_layer_guid_t const &l) {
+  std::unordered_set<DataflowEdge> raw_edges =
+      get_outgoing_edges(pcg.raw_graph, l.raw_graph_node);
+  return transform(raw_edges, [](DataflowEdge const &e) {
+    return ParallelComputationGraphEdge{e};
+  });
+}
+
+std::unordered_set<ParallelComputationGraphEdge>
+    get_incoming_edges(ParallelComputationGraph const &pcg,
+                       parallel_layer_guid_t const &l) {
+  std::unordered_set<DataflowEdge> raw_edges =
+      unordered_set_of(get_incoming_edges(pcg.raw_graph, l.raw_graph_node));
+  return transform(raw_edges, [](DataflowEdge const &e) {
+    return ParallelComputationGraphEdge{e};
+  });
+}
+
+std::unordered_set<parallel_layer_guid_t>
+    get_initial_layers(ParallelComputationGraph const &pcg) {
+  std::unordered_set<Node> raw_sources = get_sources(pcg.raw_graph);
+  return transform(raw_sources,
+                   [](Node const &n) { return parallel_layer_guid_t{n}; });
+}
+
 std::vector<parallel_tensor_guid_t>
     get_incoming_tensors(ParallelComputationGraph const &pcg,
                          parallel_layer_guid_t const &l) {
diff --git a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_edge.cc b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_edge.cc
index dca8154eb4..d30739486e 100644
--- a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_edge.cc
+++ b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_edge.cc
@@ -1,4 +1,5 @@
 #include "pcg/parallel_computation_graph/parallel_computation_graph_edge.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph_edge.dtg.h"
 
 namespace FlexFlow {
 
diff --git a/lib/pcg/test/src/pcg/machine_view.cc b/lib/pcg/test/src/pcg/machine_view.cc
index dcf22d6c00..3e9d48fac3 100644
--- a/lib/pcg/test/src/pcg/machine_view.cc
+++ b/lib/pcg/test/src/pcg/machine_view.cc
@@ -1,4 +1,5 @@
 #include "pcg/machine_view.h"
+#include "pcg/gpu_id_t.dtg.h"
 #include "test/utils/doctest/fmt/optional.h"
 #include "utils/containers/transform.h"
 #include "utils/fmt/unordered_set.h"
@@ -298,4 +299,94 @@ TEST_SUITE(FF_TEST_SUITE) {
       }
     }
   }
+
+  TEST_CASE("get_device_ids") {
+
+    SUBCASE("1D machine view") {
+
+      // This operator has shape (3,), and thus 3 tasks.
+      // The (only) dimension is projected onto the INTRA (device) dimension
+      // with a stride of 2. The start of the projection defined by MachineView
+      // is at MachineSpaceCoordinate (0, 1), and the machine space has 1 node
+      // and 6 devices per node.
+
+      /**
+       * The tasks will thus be distributed like this:
+       *  +-------+-------+-------+-------+-------+-------+
+       *  |   0   | ((1)) |   2   | ((3)) |   4   | ((5)) |
+       *  +-------+-------+-------+-------+-------+-------+
+       * Where the integers are the device ids and ((x)) are the devices we
+       * select
+       */
+      MachineSpecification ms =
+          MachineSpecification{/*num_nodes=*/1,
+                               /*num_cpus_per_node=*/6,
+                               /*num_gpus_per_node=*/6,
+                               /*inter_node_bandwidth=*/0,
+                               /*intra_node_bandwidth=*/0};
+
+      OperatorTaskSpace task = OperatorTaskSpace{{3}};
+      MachineView mv = MachineView{
+          MachineSpaceCoordinate{
+              /*node_idx=*/0, /*device_idx=*/1, DeviceType::GPU},
+          {MachineViewDimension{stride_t{2},
+                                MachineSpecificationDimension::INTRA_NODE}}};
+
+      std::unordered_set<device_id_t> correct = {
+          device_id_t{gpu_id_t{1}},
+          device_id_t{gpu_id_t{3}},
+          device_id_t{gpu_id_t{5}},
+      };
+      std::unordered_set<device_id_t> result = get_device_ids(task, mv, ms);
+      CHECK(result == correct);
+    }
+
+    SUBCASE("2D machine view") {
+      // This operator has shape (2, 2), and thus 2 * 2 = 4 tasks.
+      // - The first dimension is projected onto the INTER (node) dimension with
+      // stride 1,
+      // - The second dimension is projected onto the INTRA (device) dimension
+      // with stride 2. The start of the projection defined by MachineView is at
+      // MachineSpaceCoordinate (1, 2), and the machine space has 3 nodes and 5
+      // devices per node.
+
+      /**
+       * The tasks will thus be distributed like this:
+       *  +-------+-------+-------+-------+-------+
+       *  |   0   |   1   |   2   |   3   |   4   |
+       *  +-------+-------+-------+-------+-------+
+       *  |   5   |   6   | ((7)) |   8   | ((9)) |
+       *  +-------+-------+-------+-------+-------+
+       *  |   10  |   11  | ((12))|  13   | ((14))|
+       *  +-------+-------+-------+-------+-------+
+       * Where the integers are the device ids and ((x)) are the devices we
+       * select
+       */
+
+      MachineSpecification ms =
+          MachineSpecification{/*num_nodes=*/3,
+                               /*num_cpus_per_node=*/5,
+                               /*num_gpus_per_node=*/5,
+                               /*inter_node_bandwidth=*/0,
+                               /*intra_node_bandwidth=*/0};
+
+      OperatorTaskSpace task = OperatorTaskSpace{{2, 2}};
+      MachineView mv = MachineView{
+          MachineSpaceCoordinate{
+              /*node_idx=*/1, /*device_idx=*/2, DeviceType::GPU},
+          {MachineViewDimension{stride_t{1},
+                                MachineSpecificationDimension::INTER_NODE},
+           MachineViewDimension{stride_t{2},
+                                MachineSpecificationDimension::INTRA_NODE}}};
+
+      std::unordered_set<device_id_t> correct = {
+          device_id_t{gpu_id_t{7}},
+          device_id_t{gpu_id_t{9}},
+          device_id_t{gpu_id_t{12}},
+          device_id_t{gpu_id_t{14}},
+      };
+      std::unordered_set<device_id_t> result = get_device_ids(task, mv, ms);
+      CHECK(result == correct);
+    }
+  }
 }
diff --git a/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph.cc b/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph.cc
index fc07edf5b3..dd8308561f 100644
--- a/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph.cc
+++ b/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph.cc
@@ -36,8 +36,8 @@ TEST_SUITE(FF_TEST_SUITE) {
     parallel_tensor_guid_t tensor3 = get_only(layer3_added.outputs);
 
     std::vector<parallel_layer_guid_t> result = topological_ordering(pcg);
-    // std::vector<parallel_layer_guid_t> correct = {layer1, layer2, layer3};
-    // CHECK(result == correct);
+    std::vector<parallel_layer_guid_t> correct = {layer1, layer2, layer3};
+    CHECK(result == correct);
   }
 
   TEST_CASE(
@@ -105,6 +105,82 @@ TEST_SUITE(FF_TEST_SUITE) {
     }
   }
 
+  TEST_CASE(
+      "get_source_layer(ParallelComputationGraph, parallel_tensor_guid_t)") {
+    ParallelTensorShape tensor_shape = ParallelTensorShape{
+        ParallelTensorDims{
+            FFOrdered<ShardParallelDim>{
+                ShardParallelDim{10, 2},
+                ShardParallelDim{12, 1},
+            },
+            ReplicaParallelDimSet{
+                SumDegree{1},
+                DiscardCopyDegree{1},
+            },
+        },
+        DataType::FLOAT,
+    };
+
+    ParallelComputationGraph pcg = empty_parallel_computation_graph();
+
+    ParallelLayerAttrs layer_label = some<ParallelLayerAttrs>();
+    ParallelTensorAttrs tensor_label = some<ParallelTensorAttrs>();
+
+    SUBCASE("single layer") {
+      ParallelLayerAddedResult layer1_added =
+          add_parallel_layer(pcg, layer_label, {}, {tensor_label});
+      parallel_layer_guid_t layer1 = layer1_added.parallel_layer;
+      parallel_tensor_guid_t tensor1 = get_only(layer1_added.outputs);
+
+      parallel_layer_guid_t result = get_source_layer(pcg, tensor1);
+      parallel_layer_guid_t correct = layer1;
+      CHECK(result == correct);
+    }
+
+    SUBCASE("two connected layers") {
+      ParallelLayerAddedResult layer1_added =
+          add_parallel_layer(pcg, layer_label, {}, {tensor_label});
+      parallel_layer_guid_t layer1 = layer1_added.parallel_layer;
+      parallel_tensor_guid_t tensor1 = get_only(layer1_added.outputs);
+
+      ParallelLayerAddedResult layer2_added =
+          add_parallel_layer(pcg, layer_label, {tensor1}, {tensor_label});
+      parallel_layer_guid_t layer2 = layer2_added.parallel_layer;
+
+      parallel_layer_guid_t result = get_source_layer(pcg, tensor1);
+      parallel_layer_guid_t correct = layer1;
+      CHECK(result == correct);
+    }
+
+    SUBCASE("three layers in series") {
+      ParallelLayerAddedResult layer1_added =
+          add_parallel_layer(pcg, layer_label, {}, {tensor_label});
+      parallel_layer_guid_t layer1 = layer1_added.parallel_layer;
+      parallel_tensor_guid_t tensor1 = get_only(layer1_added.outputs);
+
+      ParallelLayerAddedResult layer2_added =
+          add_parallel_layer(pcg, layer_label, {tensor1}, {tensor_label});
+      parallel_layer_guid_t layer2 = layer2_added.parallel_layer;
+      parallel_tensor_guid_t tensor2 = get_only(layer2_added.outputs);
+
+      ParallelLayerAddedResult layer3_added =
+          add_parallel_layer(pcg, layer_label, {tensor2}, {tensor_label});
+      parallel_layer_guid_t layer3 = layer3_added.parallel_layer;
+
+      SUBCASE("tensor 1") {
+        parallel_layer_guid_t result = get_source_layer(pcg, tensor1);
+        parallel_layer_guid_t correct = layer1;
+        CHECK(result == correct);
+      }
+
+      SUBCASE("tensor 2") {
+        parallel_layer_guid_t result = get_source_layer(pcg, tensor2);
+        parallel_layer_guid_t correct = layer2;
+        CHECK(result == correct);
+      }
+    }
+  }
+
   TEST_CASE(
       "get_incoming_weights(ParallelComputationGraph, parallel_layer_guid_t)") {
     ParallelTensorShape input_shape = ParallelTensorShape{
diff --git a/lib/runtime/src/parallel_compuation_graph.cc b/lib/runtime/src/parallel_compuation_graph.cc
deleted file mode 100644
index ebc5ac1e8e..0000000000
--- a/lib/runtime/src/parallel_compuation_graph.cc
+++ /dev/null
@@ -1,7 +0,0 @@
-#include "parallel_computation_graph.h"
-
-namespace FlexFlow {
-
-ParallelTensor ParallelComputationGraph::{}
-
-} // namespace FlexFlow
diff --git a/lib/utils/include/utils/archetypes/value_type.h b/lib/utils/include/utils/archetypes/value_type.h
index 1635747612..e45b8fda7e 100644
--- a/lib/utils/include/utils/archetypes/value_type.h
+++ b/lib/utils/include/utils/archetypes/value_type.h
@@ -2,7 +2,10 @@
 #define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_ARCHETYPES_VALUE_TYPE_H
 
 #include <cassert>
+#include <fmt/format.h>
 #include <functional>
+#include <ostream>
+#include <sstream>
 
 namespace FlexFlow {
 
@@ -32,6 +35,16 @@ struct value_type {
   }
 };
 
+template <int TAG>
+std::string format_as(value_type<TAG> const &) {
+  assert(false);
+}
+
+template <int TAG>
+std::ostream &operator<<(std::ostream &s, value_type<TAG> const &x) {
+  assert(false);
+}
+
 } // namespace FlexFlow
 
 namespace std {
diff --git a/lib/utils/include/utils/containers/lookup_in_map.h b/lib/utils/include/utils/containers/lookup_in_map.h
new file mode 100644
index 0000000000..946fc589db
--- /dev/null
+++ b/lib/utils/include/utils/containers/lookup_in_map.h
@@ -0,0 +1,27 @@
+#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_LOOKUP_IN_MAP_H
+#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_LOOKUP_IN_MAP_H
+
+#include "utils/containers/contains.h"
+#include "utils/containers/keys.h"
+#include "utils/exception.h"
+#include "utils/fmt/unordered_map.h"
+#include <functional>
+#include <string>
+#include <unordered_map>
+
+namespace FlexFlow {
+
+template <typename K, typename V>
+std::function<V(K const &)> lookup_in_map(std::unordered_map<K, V> const &map) {
+  return [map](K const &key) -> V {
+    if (!contains(keys(map), key)) {
+      throw mk_runtime_error(fmt::format(
+          "Key {} is not present in the underlying map {}", key, map));
+    }
+    return map.at(key);
+  };
+}
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/utils/include/utils/containers/minimum.h b/lib/utils/include/utils/containers/minimum.h
new file mode 100644
index 0000000000..8bdd6ea985
--- /dev/null
+++ b/lib/utils/include/utils/containers/minimum.h
@@ -0,0 +1,21 @@
+#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_MINIMUM_H
+#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_MINIMUM_H
+
+#include "utils/exception.h"
+#include <algorithm>
+
+namespace FlexFlow {
+
+template <typename C>
+typename C::value_type minimum(C const &c) {
+  if (c.empty()) {
+    throw mk_runtime_error(
+        fmt::format("minimum expected non-empty container but received {}", c));
+  }
+
+  return *std::min_element(c.begin(), c.end());
+}
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/utils/include/utils/deduplicated_priority_queue.h b/lib/utils/include/utils/deduplicated_priority_queue.h
index 66f6e524d4..afad3f5889 100644
--- a/lib/utils/include/utils/deduplicated_priority_queue.h
+++ b/lib/utils/include/utils/deduplicated_priority_queue.h
@@ -3,6 +3,7 @@
 
 #include "utils/containers/contains.h"
 #include <queue>
+#include <set>
 #include <unordered_set>
 #include <vector>
 
@@ -38,6 +39,16 @@ class DeduplicatedPriorityQueue {
     impl.pop();
   }
 
+  std::set<Elem, Compare> contents() const {
+    auto temp = impl;
+    std::set<Elem, Compare> result;
+    while (!temp.empty()) {
+      result.insert(temp.top());
+      temp.pop();
+    }
+    return result;
+  }
+
 private:
   std::priority_queue<Elem, Container, Compare> impl;
   std::unordered_set<Elem, Hash> hashmap;
diff --git a/lib/utils/include/utils/graph/dataflow_graph/algorithms/get_outgoing_edges.h b/lib/utils/include/utils/graph/dataflow_graph/algorithms/get_outgoing_edges.h
new file mode 100644
index 0000000000..a8b5efe66e
--- /dev/null
+++ b/lib/utils/include/utils/graph/dataflow_graph/algorithms/get_outgoing_edges.h
@@ -0,0 +1,16 @@
+#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_GRAPH_DATAFLOW_GRAPH_ALGORITHMS_GET_OUTGOING_EDGES_H
+#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_GRAPH_DATAFLOW_GRAPH_ALGORITHMS_GET_OUTGOING_EDGES_H
+
+#include "utils/graph/dataflow_graph/dataflow_graph_view.h"
+
+namespace FlexFlow {
+
+std::unordered_set<DataflowEdge> get_outgoing_edges(DataflowGraphView const &,
+                                                    Node const &);
+std::unordered_set<DataflowEdge>
+    get_outgoing_edges(DataflowGraphView const &,
+                       std::unordered_set<Node> const &);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/utils/src/utils/containers/lookup_in_map.cc b/lib/utils/src/utils/containers/lookup_in_map.cc
new file mode 100644
index 0000000000..a0d7db8e82
--- /dev/null
+++ b/lib/utils/src/utils/containers/lookup_in_map.cc
@@ -0,0 +1,12 @@
+#include "utils/containers/lookup_in_map.h"
+#include "utils/archetypes/value_type.h"
+
+namespace FlexFlow {
+
+using K = value_type<0>;
+using V = value_type<1>;
+
+template std::function<V(K const &)>
+    lookup_in_map(std::unordered_map<K, V> const &map);
+
+} // namespace FlexFlow
diff --git a/lib/utils/src/utils/containers/minimum.cc b/lib/utils/src/utils/containers/minimum.cc
new file mode 100644
index 0000000000..c9bbc7706f
--- /dev/null
+++ b/lib/utils/src/utils/containers/minimum.cc
@@ -0,0 +1 @@
+#include "utils/containers/minimum.h"
diff --git a/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_outgoing_edges.cc b/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_outgoing_edges.cc
new file mode 100644
index 0000000000..2376e4897f
--- /dev/null
+++ b/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_outgoing_edges.cc
@@ -0,0 +1,28 @@
+#include "utils/graph/dataflow_graph/algorithms/get_outgoing_edges.h"
+#include "utils/containers/sorted_by.h"
+
+namespace FlexFlow {
+
+std::unordered_set<DataflowEdge> get_outgoing_edges(DataflowGraphView const &g,
+                                                    Node const &n) {
+  return g.query_edges(DataflowEdgeQuery{
+      {n},
+      query_set<int>::matchall(),
+      query_set<Node>::matchall(),
+      query_set<int>::matchall(),
+  });
+}
+
+std::unordered_set<DataflowEdge>
+    get_outgoing_edges(DataflowGraphView const &g,
+                       std::unordered_set<Node> const &ns) {
+  DataflowEdgeQuery query = DataflowEdgeQuery{
+      query_set<Node>{ns},
+      query_set<int>::matchall(),
+      query_set<Node>::matchall(),
+      query_set<int>::matchall(),
+  };
+  return g.query_edges(query);
+}
+
+} // namespace FlexFlow
diff --git a/lib/utils/test/src/utils/containers/lookup_in_map.cc b/lib/utils/test/src/utils/containers/lookup_in_map.cc
new file mode 100644
index 0000000000..9ca356ee4b
--- /dev/null
+++ b/lib/utils/test/src/utils/containers/lookup_in_map.cc
@@ -0,0 +1,31 @@
+#include "utils/containers/lookup_in_map.h"
+#include <doctest/doctest.h>
+#include <functional>
+#include <string>
+
+using namespace FlexFlow;
+
+TEST_SUITE(FF_TEST_SUITE) {
+
+  TEST_CASE("lookup_in_map") {
+
+    std::unordered_map<std::string, int> map = {{"a", 1}, {"b", 2}};
+
+    SUBCASE("existing keys") {
+      std::function<int(std::string const &)> func = lookup_in_map(map);
+      CHECK(func("a") == 1);
+      CHECK(func("b") == 2);
+    }
+
+    SUBCASE("missing key") {
+      std::function<int(std::string const &)> func = lookup_in_map(map);
+      CHECK_THROWS(func("c"));
+    }
+
+    SUBCASE("empty map") {
+      std::unordered_map<std::string, int> map = {};
+      std::function<int(std::string const &)> func = lookup_in_map(map);
+      CHECK_THROWS(func("a"));
+    }
+  }
+}
diff --git a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_incoming_edges.cc b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_incoming_edges.cc
new file mode 100644
index 0000000000..86e4802cdb
--- /dev/null
+++ b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_incoming_edges.cc
@@ -0,0 +1,51 @@
+#include "utils/graph/dataflow_graph/algorithms/get_incoming_edges.h"
+#include "utils/containers/get_only.h"
+#include "utils/graph/dataflow_graph/dataflow_graph.h"
+#include "utils/graph/instances/unordered_set_dataflow_graph.h"
+#include <doctest/doctest.h>
+
+using namespace ::FlexFlow;
+
+TEST_SUITE(FF_TEST_SUITE) {
+  TEST_CASE("get_incoming_edges(DataflowGraphView, Node)") {
+    DataflowGraph g = DataflowGraph::create<UnorderedSetDataflowGraph>();
+
+    NodeAddedResult n1_added = g.add_node({}, 1);
+    Node n1 = n1_added.node;
+    DataflowOutput o1 = get_only(n1_added.outputs);
+
+    NodeAddedResult n2_added = g.add_node({}, 1);
+    Node n2 = n2_added.node;
+    DataflowOutput o2 = get_only(n2_added.outputs);
+
+    NodeAddedResult n3_added = g.add_node({o2}, 1);
+    Node n3 = n3_added.node;
+    DataflowOutput o3 = get_only(n3_added.outputs);
+
+    NodeAddedResult n4_added = g.add_node({o2, o3}, 1);
+    Node n4 = n4_added.node;
+    DataflowOutput o4 = get_only(n4_added.outputs);
+
+    SUBCASE("n4 - multiple incoming edges") {
+      std::vector<DataflowEdge> result = get_incoming_edges(g, n4);
+      std::vector<DataflowEdge> correct = {
+          DataflowEdge{o2, DataflowInput{n4, 0}},
+          DataflowEdge{o3, DataflowInput{n4, 1}}};
+      CHECK(result == correct);
+    }
+
+    SUBCASE("n3- single incoming edge") {
+      std::vector<DataflowEdge> result = get_incoming_edges(g, n3);
+      std::vector<DataflowEdge> correct = {
+          DataflowEdge{o2, DataflowInput{n3, 0}},
+      };
+      CHECK(result == correct);
+    }
+
+    SUBCASE("n1- no incoming edges") {
+      std::vector<DataflowEdge> result = get_incoming_edges(g, n1);
+      std::vector<DataflowEdge> correct = {};
+      CHECK(result == correct);
+    }
+  }
+}
diff --git a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_outgoing_edges.cc b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_outgoing_edges.cc
new file mode 100644
index 0000000000..be874b7e29
--- /dev/null
+++ b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_outgoing_edges.cc
@@ -0,0 +1,90 @@
+#include "utils/graph/dataflow_graph/algorithms/get_outgoing_edges.h"
+#include "utils/containers/get_only.h"
+#include "utils/graph/dataflow_graph/dataflow_graph.h"
+#include "utils/graph/instances/unordered_set_dataflow_graph.h"
+#include <doctest/doctest.h>
+
+using namespace ::FlexFlow;
+
+TEST_SUITE(FF_TEST_SUITE) {
+  TEST_CASE("get_outgoing_edges(DataflowGraphView, Node)") {
+    DataflowGraph g = DataflowGraph::create<UnorderedSetDataflowGraph>();
+
+    NodeAddedResult n1_added = g.add_node({}, 1);
+    Node n1 = n1_added.node;
+    DataflowOutput o1 = get_only(n1_added.outputs);
+
+    NodeAddedResult n2_added = g.add_node({o1}, 1);
+    Node n2 = n2_added.node;
+    DataflowOutput o2 = get_only(n2_added.outputs);
+
+    NodeAddedResult n3_added = g.add_node({o1}, 1);
+    Node n3 = n3_added.node;
+    DataflowOutput o3 = get_only(n3_added.outputs);
+
+    NodeAddedResult n4_added = g.add_node({o2}, 1);
+    Node n4 = n4_added.node;
+    DataflowOutput o4 = get_only(n4_added.outputs);
+
+    SUBCASE("n2 - single outgoing edge") {
+      std::unordered_set<DataflowEdge> result = get_outgoing_edges(g, n2);
+      std::unordered_set<DataflowEdge> correct = {
+          DataflowEdge{o2, DataflowInput{n4, 0}},
+      };
+      CHECK(result == correct);
+    }
+
+    SUBCASE("n1 - multiple outgoing edges") {
+      std::unordered_set<DataflowEdge> result = get_outgoing_edges(g, n1);
+      std::unordered_set<DataflowEdge> correct = {
+          DataflowEdge{o1, DataflowInput{n2, 0}},
+          DataflowEdge{o1, DataflowInput{n3, 0}},
+      };
+      CHECK(result == correct);
+    }
+
+    SUBCASE("n4 - no outgoing edges") {
+      std::unordered_set<DataflowEdge> result = get_outgoing_edges(g, n4);
+      std::unordered_set<DataflowEdge> correct = {};
+      CHECK(result == correct);
+    }
+  }
+
+  TEST_CASE("get_outgoing_edges(DataflowGraphView, std::unordered_set<Node>)") {
+    DataflowGraph g = DataflowGraph::create<UnorderedSetDataflowGraph>();
+
+    NodeAddedResult n1_added = g.add_node({}, 1);
+    Node n1 = n1_added.node;
+    DataflowOutput o1 = get_only(n1_added.outputs);
+
+    NodeAddedResult n2_added = g.add_node({o1}, 1);
+    Node n2 = n2_added.node;
+    DataflowOutput o2 = get_only(n2_added.outputs);
+
+    NodeAddedResult n3_added = g.add_node({o1}, 1);
+    Node n3 = n3_added.node;
+    DataflowOutput o3 = get_only(n3_added.outputs);
+
+    NodeAddedResult n4_added = g.add_node({o2}, 1);
+    Node n4 = n4_added.node;
+    DataflowOutput o4 = get_only(n4_added.outputs);
+
+    SUBCASE("multiple nodes - combined outgoing edges") {
+      std::unordered_set<Node> nodes = {n1, n2};
+      std::unordered_set<DataflowEdge> result = get_outgoing_edges(g, nodes);
+      std::unordered_set<DataflowEdge> correct = {
+          DataflowEdge{o1, DataflowInput{n2, 0}},
+          DataflowEdge{o1, DataflowInput{n3, 0}},
+          DataflowEdge{o2, DataflowInput{n4, 0}},
+      };
+      CHECK(result == correct);
+    }
+
+    SUBCASE("multiple nodes - no outgoing edges") {
+      std::unordered_set<Node> nodes = {n3, n4};
+      std::unordered_set<DataflowEdge> result = get_outgoing_edges(g, nodes);
+      std::unordered_set<DataflowEdge> correct = {};
+      CHECK(result == correct);
+    }
+  }
+}