diff --git a/lib/compiler/include/compiler/cost_estimator/op_cost_estimate_key.h b/lib/compiler/include/compiler/cost_estimator/op_cost_estimate_key.h new file mode 100644 index 0000000000..93a1143cde --- /dev/null +++ b/lib/compiler/include/compiler/cost_estimator/op_cost_estimate_key.h @@ -0,0 +1,19 @@ +#ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_OP_COST_ESTIMATE_KEY_H +#define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_OP_COST_ESTIMATE_KEY_H + +#include "compiler/cost_estimator/op_cost_estimate_key.dtg.h" +#include "pcg/device_id_t.dtg.h" +#include "pcg/machine_specification.dtg.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h" +#include "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h" + +namespace FlexFlow { + +OpCostEstimateKey get_mapped_op_cost_estimate_key_for_layer( + ParallelComputationGraph const &pcg, + parallel_layer_guid_t const &layer, + MachineView const &machine_view); + +} // namespace FlexFlow + +#endif diff --git a/lib/compiler/include/compiler/cost_estimator/op_cost_metrics.struct.toml b/lib/compiler/include/compiler/cost_estimator/op_cost_metrics.struct.toml index d2ff3f42e7..5e81d6c10e 100644 --- a/lib/compiler/include/compiler/cost_estimator/op_cost_metrics.struct.toml +++ b/lib/compiler/include/compiler/cost_estimator/op_cost_metrics.struct.toml @@ -11,7 +11,11 @@ includes = [ ] [[fields]] -name = "runtime" +name = "forward_runtime" +type = "float" + +[[fields]] +name = "backward_runtime" type = "float" [[fields]] diff --git a/lib/compiler/include/compiler/cost_estimator/tensor_set_movement.h b/lib/compiler/include/compiler/cost_estimator/tensor_set_movement.h new file mode 100644 index 0000000000..34188ff97c --- /dev/null +++ b/lib/compiler/include/compiler/cost_estimator/tensor_set_movement.h @@ -0,0 +1,19 @@ +#ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_TENSOR_SET_MOVEMENT_H +#define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_TENSOR_SET_MOVEMENT_H + +#include "compiler/cost_estimator/tensor_set_movement.dtg.h" +#include "pcg/machine_view.dtg.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph_edge.dtg.h" + +namespace FlexFlow { + +TensorSetMovement get_tensor_set_movement_from_pcg_edge( + ParallelComputationGraphEdge const &edge, + ParallelComputationGraph const &pcg, + MachineView const &src_mv, + MachineView const &dst_mv); + +} // namespace FlexFlow + +#endif diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping.h b/lib/compiler/include/compiler/machine_mapping/machine_mapping.h index 06cbbf942d..7375cde985 100644 --- a/lib/compiler/include/compiler/machine_mapping/machine_mapping.h +++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping.h @@ -2,6 +2,10 @@ #define _FLEXFLOW_COMPILER_MACHINE_MAPPING_H #include "compiler/machine_mapping/machine_mapping.dtg.h" +#include "pcg/device_id_t.dtg.h" +#include "pcg/machine_specification.dtg.h" +#include "pcg/operator_task_space.dtg.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h" namespace FlexFlow { diff --git a/lib/compiler/include/compiler/machine_mapping/unstructured_device_mapping.h b/lib/compiler/include/compiler/machine_mapping/unstructured_device_mapping.h new file mode 100644 index 0000000000..0fb31210fd --- /dev/null +++ b/lib/compiler/include/compiler/machine_mapping/unstructured_device_mapping.h @@ -0,0 +1,18 @@ +#ifndef _FLEXFLOW_COMPILER_MACHINE_MAPPING_UNSTRUCTURED_DEVICE_MAPPING_H +#define _FLEXFLOW_COMPILER_MACHINE_MAPPING_UNSTRUCTURED_DEVICE_MAPPING_H + +#include "compiler/machine_mapping/machine_mapping.dtg.h" +#include "compiler/machine_mapping/unstructured_device_mapping.dtg.h" +#include "pcg/machine_specification.dtg.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h" + +namespace FlexFlow { + +UnstructuredDeviceMapping + get_unstructured_device_mapping(MachineMapping const &machine_mapping, + MachineSpecification const &machine_spec, + ParallelComputationGraph const &pcg); + +} // namespace FlexFlow + +#endif diff --git a/lib/compiler/include/compiler/machine_mapping/unstructured_device_mapping.struct.toml b/lib/compiler/include/compiler/machine_mapping/unstructured_device_mapping.struct.toml new file mode 100644 index 0000000000..ae38a37292 --- /dev/null +++ b/lib/compiler/include/compiler/machine_mapping/unstructured_device_mapping.struct.toml @@ -0,0 +1,26 @@ +namespace = "FlexFlow" +name = "UnstructuredDeviceMapping" +features = [ + "eq", + # "ord", + "hash", + # "json", + # "rapidcheck", + "fmt", +] + +includes = [ + "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h", + "pcg/device_id_t.dtg.h" +] + +src_includes = [ + "utils/hash/unordered_map.h", + "utils/fmt/unordered_map.h", + "utils/hash/unordered_set.h", + "utils/fmt/unordered_set.h" +] + +[[fields]] +name = "raw_device_map" +type = "std::unordered_map<::FlexFlow::parallel_layer_guid_t, std::unordered_set<::FlexFlow::device_id_t>>" diff --git a/lib/compiler/include/compiler/task_graph_simulator/in_progress_task.struct.toml b/lib/compiler/include/compiler/task_graph_simulator/in_progress_task.struct.toml new file mode 100644 index 0000000000..71e0e17f5e --- /dev/null +++ b/lib/compiler/include/compiler/task_graph_simulator/in_progress_task.struct.toml @@ -0,0 +1,26 @@ +namespace = "FlexFlow" +name = "InProgressTask" + +features = [ + "eq", + "hash", + "fmt", + "ord" +] + +includes = [ + "utils/graph/node/node.dtg.h" +] + + +[[fields]] +name = "start_time" +type = "float" + +[[fields]] +name = "end_time" +type = "float" + +[[fields]] +name = "node" +type = "::FlexFlow::Node" diff --git a/lib/compiler/include/compiler/task_graph_simulator/in_progress_task_comparator.h b/lib/compiler/include/compiler/task_graph_simulator/in_progress_task_comparator.h new file mode 100644 index 0000000000..ed509cb7be --- /dev/null +++ b/lib/compiler/include/compiler/task_graph_simulator/in_progress_task_comparator.h @@ -0,0 +1,13 @@ +#ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_IN_PROGRESS_TASK_COMPARATOR_H +#define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_IN_PROGRESS_TASK_COMPARATOR_H + +#include "compiler/task_graph_simulator/in_progress_task.dtg.h" +#include + +namespace FlexFlow { +struct InProgressTaskComparator { + bool operator()(InProgressTask const &lhs, InProgressTask const &rhs) const; +}; +} // namespace FlexFlow + +#endif // _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_IN_PROGRESS_TASK_COMPARATOR_H diff --git a/lib/compiler/include/compiler/task_graph_simulator/pcg_task.variant.toml b/lib/compiler/include/compiler/task_graph_simulator/pcg_task.variant.toml new file mode 100644 index 0000000000..13f2f17652 --- /dev/null +++ b/lib/compiler/include/compiler/task_graph_simulator/pcg_task.variant.toml @@ -0,0 +1,20 @@ +namespace = "FlexFlow" +name = "PCGTask" +features = [ + "eq", + "hash", + "fmt", +] + +includes = [ + "compiler/cost_estimator/op_cost_estimate_key.dtg.h", + "compiler/cost_estimator/tensor_set_movement.dtg.h", +] + +[[values]] +type = "::FlexFlow::OpCostEstimateKey" +key = "operator" + +[[values]] +type = "::FlexFlow::TensorSetMovement" +key = "tensor_movement" diff --git a/lib/compiler/include/compiler/task_graph_simulator/pcg_task_graph.h b/lib/compiler/include/compiler/task_graph_simulator/pcg_task_graph.h new file mode 100644 index 0000000000..2c6d6514e8 --- /dev/null +++ b/lib/compiler/include/compiler/task_graph_simulator/pcg_task_graph.h @@ -0,0 +1,17 @@ +#ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_PCG_TASK_GRAPH_H +#define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_PCG_TASK_GRAPH_H + +#include "compiler/machine_mapping/machine_mapping.dtg.h" +#include "compiler/task_graph_simulator/pcg_task_graph.dtg.h" +#include "pcg/machine_specification.dtg.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h" + +namespace FlexFlow { + +PCGTaskGraph get_pcg_task_graph(ParallelComputationGraph const &pcg, + MachineMapping const &machine_mapping, + MachineSpecification const &machine_spec); + +} // namespace FlexFlow + +#endif diff --git a/lib/compiler/include/compiler/task_graph_simulator/pcg_task_graph.struct.toml b/lib/compiler/include/compiler/task_graph_simulator/pcg_task_graph.struct.toml new file mode 100644 index 0000000000..099f44c564 --- /dev/null +++ b/lib/compiler/include/compiler/task_graph_simulator/pcg_task_graph.struct.toml @@ -0,0 +1,34 @@ +namespace = "FlexFlow" +name = "PCGTaskGraph" + +features = [ +] + +includes = [ + "utils/graph/digraph/digraph_view.h", + "utils/bidict/bidict.h", + "compiler/task_graph_simulator/pcg_task.dtg.h", + "pcg/device_id_t.dtg.h", + "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h", + "", + "" +] + +src_includes = [ + "utils/fmt/unordered_set.h", + "utils/hash/unordered_set.h", + "utils/fmt/unordered_map.h", + "utils/hash/unordered_map.h" +] + +[[fields]] +name = "graph" +type = "::FlexFlow::DiGraphView" + +[[fields]] +name = "node_to_task" +type = "::FlexFlow::bidict<::FlexFlow::Node, ::FlexFlow::PCGTask>" + +[[fields]] +name = "node_to_devices" +type = "std::unordered_map<::FlexFlow::Node, std::unordered_set<::FlexFlow::device_id_t>>" diff --git a/lib/compiler/include/compiler/task_graph_simulator/simulate_task_graph_execution.h b/lib/compiler/include/compiler/task_graph_simulator/simulate_task_graph_execution.h new file mode 100644 index 0000000000..424e65f9df --- /dev/null +++ b/lib/compiler/include/compiler/task_graph_simulator/simulate_task_graph_execution.h @@ -0,0 +1,17 @@ +#ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_SIMULATE_TASK_GRAPH_EXECUTION_H +#define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_SIMULATE_TASK_GRAPH_EXECUTION_H + +#include "compiler/task_graph_simulator/task_execution_constraint.dtg.h" +#include "compiler/task_graph_simulator/task_graph_execution_trace.dtg.h" +#include "utils/graph/digraph/digraph_view.h" +#include +namespace FlexFlow { + +TaskGraphExecutionTrace simulate_task_graph_execution( + DiGraphView const &task_graph, + std::function cost_function, + TaskExecutionConstraint const &constraint); + +} // namespace FlexFlow + +#endif diff --git a/lib/compiler/include/compiler/task_graph_simulator/task_execution_constraint.struct.toml b/lib/compiler/include/compiler/task_graph_simulator/task_execution_constraint.struct.toml new file mode 100644 index 0000000000..004655b5ec --- /dev/null +++ b/lib/compiler/include/compiler/task_graph_simulator/task_execution_constraint.struct.toml @@ -0,0 +1,15 @@ +namespace = "FlexFlow" +name = "TaskExecutionConstraint" +features = [ +] + +includes = [ + "utils/graph/node/node.dtg.h", + "", + "" +] + + +[[fields]] +name = "is_satisfied" +type = "std::function const &, std::unordered_set const &)>" diff --git a/lib/compiler/include/compiler/task_graph_simulator/task_graph_execution_state.struct.toml b/lib/compiler/include/compiler/task_graph_simulator/task_graph_execution_state.struct.toml new file mode 100644 index 0000000000..b96d7264b9 --- /dev/null +++ b/lib/compiler/include/compiler/task_graph_simulator/task_graph_execution_state.struct.toml @@ -0,0 +1,40 @@ +namespace = "FlexFlow" +name = "TaskGraphExecutionState" + +features = [ +] + +includes = [ + "utils/deduplicated_priority_queue.h", + "utils/graph/node/node.dtg.h", + "compiler/task_graph_simulator/in_progress_task.dtg.h", + "compiler/task_graph_simulator/in_progress_task_comparator.h", + "", + "", + "" +] + +src_includes = [ + "utils/hash/unordered_set.h", + "utils/fmt/unordered_set.h", + "utils/hash/set.h", + "utils/fmt/set.h", + "utils/fmt/vector.h", + "utils/hash/vector.h" +] + +[[fields]] +name = "ready_tasks" +type = "std::set<::FlexFlow::Node>" + +[[fields]] +name = "in_progress_tasks" +type = "::FlexFlow::DeduplicatedPriorityQueue<::FlexFlow::InProgressTask, std::vector<::FlexFlow::InProgressTask>, ::FlexFlow::InProgressTaskComparator>" + +[[fields]] +name = "finished_tasks" +type = "std::unordered_set<::FlexFlow::Node>" + +[[fields]] +name = "current_time" +type = "float" diff --git a/lib/compiler/include/compiler/task_graph_simulator/task_graph_execution_trace.h b/lib/compiler/include/compiler/task_graph_simulator/task_graph_execution_trace.h new file mode 100644 index 0000000000..0ad5b4824b --- /dev/null +++ b/lib/compiler/include/compiler/task_graph_simulator/task_graph_execution_trace.h @@ -0,0 +1,12 @@ +#ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_TASK_GRAPH_SIMULATOR_TASK_GRAPH_EXECUTION_TRACE_H +#define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_TASK_GRAPH_SIMULATOR_TASK_GRAPH_EXECUTION_TRACE_H + +#include "compiler/task_graph_simulator/task_graph_execution_trace.dtg.h" + +namespace FlexFlow { + +float get_total_execution_time(TaskGraphExecutionTrace const &trace); + +} // namespace FlexFlow + +#endif // _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_TASK_GRAPH_SIMULATOR_TASK_GRAPH_EXECUTION_TRACE_H diff --git a/lib/compiler/include/compiler/task_graph_simulator/task_graph_execution_trace.struct.toml b/lib/compiler/include/compiler/task_graph_simulator/task_graph_execution_trace.struct.toml new file mode 100644 index 0000000000..3003e5a157 --- /dev/null +++ b/lib/compiler/include/compiler/task_graph_simulator/task_graph_execution_trace.struct.toml @@ -0,0 +1,23 @@ +namespace = "FlexFlow" +name = "TaskGraphExecutionTrace" + +features = [ + "hash", + "fmt", + "eq" +] + +includes = [ + "compiler/task_graph_simulator/task_profile.dtg.h", + "" +] + +src_includes = [ + "utils/fmt/unordered_set.h", + "utils/hash/unordered_set.h" +] + + +[[fields]] +name = "task_profiles" +type = "std::unordered_set<::FlexFlow::TaskProfile>" diff --git a/lib/compiler/include/compiler/task_graph_simulator/task_profile.struct.toml b/lib/compiler/include/compiler/task_graph_simulator/task_profile.struct.toml new file mode 100644 index 0000000000..1a47acfa0e --- /dev/null +++ b/lib/compiler/include/compiler/task_graph_simulator/task_profile.struct.toml @@ -0,0 +1,26 @@ +namespace = "FlexFlow" +name = "TaskProfile" + +features = [ + "eq", + "hash", + "fmt", + "ord" +] + +includes = [ + "utils/graph/node/node.dtg.h" +] + + +[[fields]] +name = "node" +type = "::FlexFlow::Node" + +[[fields]] +name = "start_time" +type = "float" + +[[fields]] +name = "end_time" +type = "float" diff --git a/lib/compiler/include/compiler/task_graph_simulator/task_simulator.h b/lib/compiler/include/compiler/task_graph_simulator/task_simulator.h new file mode 100644 index 0000000000..b35733e419 --- /dev/null +++ b/lib/compiler/include/compiler/task_graph_simulator/task_simulator.h @@ -0,0 +1,18 @@ +#ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_TASK_SIMULATOR_H +#define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_TASK_SIMULATOR_H + +#include "compiler/cost_estimator/cost_estimator.h" +#include "compiler/machine_mapping/machine_mapping.dtg.h" +#include "pcg/machine_specification.dtg.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h" + +namespace FlexFlow { +float task_simulator_estimate_forward_pass_time( + ParallelComputationGraph const &pcg, + CostEstimator const &estimator, + MachineMapping const &machine_mapping, + MachineSpecification const &machine_spec); + +} // namespace FlexFlow + +#endif diff --git a/lib/compiler/src/compiler/allowed_machine_views.cc b/lib/compiler/src/compiler/allowed_machine_views.cc index 1c226f79b0..db7477b460 100644 --- a/lib/compiler/src/compiler/allowed_machine_views.cc +++ b/lib/compiler/src/compiler/allowed_machine_views.cc @@ -24,6 +24,10 @@ namespace FlexFlow { bool is_valid_machine_view(MachineView const &mv, OperatorTaskSpace const &task, MachineSpecification const &ms) { + if (num_dims(mv) != num_dims(task)) { + return false; + } + std::optional maximum_device_coord = get_machine_space_coordinate( task, mv, get_task_space_maximum_coordinate(task), ms); diff --git a/lib/compiler/src/compiler/cost_estimator/op_cost_estimate_key.cc b/lib/compiler/src/compiler/cost_estimator/op_cost_estimate_key.cc new file mode 100644 index 0000000000..ef5775851f --- /dev/null +++ b/lib/compiler/src/compiler/cost_estimator/op_cost_estimate_key.cc @@ -0,0 +1,23 @@ +#include "compiler/cost_estimator/op_cost_estimate_key.h" +#include "compiler/cost_estimator/op_cost_estimate_key.dtg.h" +#include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h" +#include "op-attrs/parallel_tensor_shape.dtg.h" +#include "pcg/device_id_t.dtg.h" +#include "pcg/machine_specification.dtg.h" +#include "pcg/machine_view.dtg.h" +#include "pcg/machine_view.h" +#include "pcg/operator_task_space.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h" +#include + +namespace FlexFlow { + +OpCostEstimateKey get_mapped_op_cost_estimate_key_for_layer( + ParallelComputationGraph const &pcg, + parallel_layer_guid_t const &layer, + MachineView const &machine_view) { + return map_unmapped_op_cost_estimate_key( + get_unmapped_op_cost_estimate_key_for_layer(pcg, layer), machine_view); +} + +} // namespace FlexFlow diff --git a/lib/compiler/src/compiler/cost_estimator/tensor_set_movement.cc b/lib/compiler/src/compiler/cost_estimator/tensor_set_movement.cc new file mode 100644 index 0000000000..8f2ab84b84 --- /dev/null +++ b/lib/compiler/src/compiler/cost_estimator/tensor_set_movement.cc @@ -0,0 +1,16 @@ +#include "compiler/cost_estimator/tensor_set_movement.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph.h" +namespace FlexFlow { + +TensorSetMovement get_tensor_set_movement_from_pcg_edge( + ParallelComputationGraphEdge const &edge, + ParallelComputationGraph const &pcg, + MachineView const &src_mv, + MachineView const &dst_mv) { + ParallelTensorShape tensor_shape = + get_parallel_tensor_shape(pcg, parallel_tensor_guid_t{edge.raw_edge.src}); + return TensorSetMovement{ + {SingleTensorMovement{tensor_shape, {src_mv}, {dst_mv}}}}; +} + +} // namespace FlexFlow diff --git a/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc b/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc index 5bdd8645a5..49d528e4ab 100644 --- a/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc +++ b/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc @@ -1,4 +1,5 @@ #include "compiler/machine_mapping/get_optimal_machine_mapping.h" +#include "compiler/cost_estimator/op_cost_metrics.dtg.h" #include "compiler/machine_mapping/abstracted_tensor_set_movement/abstracted_tensor_set_movement.h" #include "compiler/machine_mapping/get_machine_resource_splits.h" #include "compiler/machine_mapping/machine_mapping_cache.h" @@ -240,8 +241,8 @@ MachineMappingResult auto get_mapping_result = [&](MachineView const &machine_view) { OpCostEstimateKey mapped = map_unmapped_op_cost_estimate_key(leaf, machine_view); - float cost = context.cost_estimator.estimate_cost(mapped).runtime; - + OpCostMetrics metrics = context.cost_estimator.estimate_cost(mapped); + float cost = metrics.forward_runtime + metrics.backward_runtime; return make_singleton_machine_mapping_result(cost, machine_view); }; diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc index 57e82684e9..fc3a58995c 100644 --- a/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc +++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc @@ -1,13 +1,20 @@ #include "compiler/machine_mapping/machine_mapping.h" +#include "pcg/machine_specification.h" +#include "pcg/machine_view.h" +#include "pcg/operator_task_space.dtg.h" +#include "pcg/operator_task_space.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph.h" #include "utils/containers/are_disjoint.h" +#include "utils/containers/get_one_of.h" #include "utils/containers/keys.h" +#include "utils/containers/map_values.h" #include "utils/containers/merge_maps.h" namespace FlexFlow { -MachineMapping combine_disjoint_mappings(MachineMapping const &s1, - MachineMapping const &s2) { - return MachineMapping{merge_maps(s1.machine_views, s2.machine_views)}; +MachineMapping combine_disjoint_mappings(MachineMapping const &m1, + MachineMapping const &m2) { + return MachineMapping{merge_maps(m1.machine_views, m2.machine_views)}; } bool nodes_are_disjoint(MachineMapping const &m1, MachineMapping const &m2) { diff --git a/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.cc b/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.cc index a6c2d1ed04..9b4a1fd6fe 100644 --- a/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.cc +++ b/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.cc @@ -30,7 +30,9 @@ MachineMappingWithMemoryResult remove_non_pareto_optimal_machine_mapping_result( bool is_pareto_optimal = true; for (MachineMappingForSingleLayer const &other_mapping : result.machine_mappings) { - if (mapping.cost.runtime >= other_mapping.cost.runtime && + if (mapping.cost.forward_runtime >= other_mapping.cost.forward_runtime && + mapping.cost.backward_runtime >= + other_mapping.cost.backward_runtime && mapping.cost.memory >= other_mapping.cost.memory && mapping != other_mapping) { is_pareto_optimal = false; @@ -54,7 +56,10 @@ MachineMappingWithMemoryResult [&](MachineMappingForSingleLayer const &pre_mm, MachineMappingForSingleLayer const &post_mm) { OpCostMetrics cost = OpCostMetrics{ - pre_mm.cost.runtime + comm_cost + post_mm.cost.runtime, + pre_mm.cost.forward_runtime + comm_cost + + post_mm.cost.forward_runtime, + pre_mm.cost.backward_runtime + comm_cost + + post_mm.cost.backward_runtime, pre_mm.cost.memory + post_mm.cost.memory, }; @@ -93,7 +98,9 @@ MachineMappingWithMemoryResult [&](MachineMappingForSingleLayer const &lhs_mm, MachineMappingForSingleLayer const &rhs_mm) { OpCostMetrics cost = OpCostMetrics{ - std::max(lhs_mm.cost.runtime, rhs_mm.cost.runtime), + std::max(lhs_mm.cost.forward_runtime, rhs_mm.cost.forward_runtime), + std::max(lhs_mm.cost.backward_runtime, + rhs_mm.cost.backward_runtime), //(@wmdi) is this correct? std::max(lhs_mm.cost.memory, rhs_mm.cost.memory), }; diff --git a/lib/compiler/src/compiler/machine_mapping/unstructured_device_mapping.cc b/lib/compiler/src/compiler/machine_mapping/unstructured_device_mapping.cc new file mode 100644 index 0000000000..63e359d9ac --- /dev/null +++ b/lib/compiler/src/compiler/machine_mapping/unstructured_device_mapping.cc @@ -0,0 +1,28 @@ + +#include "compiler/machine_mapping/unstructured_device_mapping.h" +#include "compiler/machine_mapping/unstructured_device_mapping.dtg.h" +#include "pcg/machine_specification.h" +#include "pcg/machine_view.h" +#include "pcg/operator_task_space.dtg.h" +#include "pcg/operator_task_space.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph.h" +#include "utils/containers/keys.h" +#include "utils/containers/map_values.h" + +namespace FlexFlow { + +UnstructuredDeviceMapping + get_unstructured_device_mapping(MachineMapping const &machine_mapping, + MachineSpecification const &machine_spec, + ParallelComputationGraph const &pcg) { + std::unordered_map> + device_mapping; + for (auto const &[layer, machine_view] : machine_mapping.machine_views) { + OperatorTaskSpace op = get_operator_task_space(pcg, layer); + device_mapping.insert( + {layer, get_device_ids(op, machine_view, machine_spec)}); + } + return UnstructuredDeviceMapping{device_mapping}; +} + +} // namespace FlexFlow diff --git a/lib/compiler/src/compiler/task_graph_simulator/in_progress_task_comparator.cc b/lib/compiler/src/compiler/task_graph_simulator/in_progress_task_comparator.cc new file mode 100644 index 0000000000..2064c56a52 --- /dev/null +++ b/lib/compiler/src/compiler/task_graph_simulator/in_progress_task_comparator.cc @@ -0,0 +1,11 @@ +#include "compiler/task_graph_simulator/in_progress_task_comparator.h" +#include + +namespace FlexFlow { + +bool InProgressTaskComparator::operator()(InProgressTask const &lhs, + InProgressTask const &rhs) const { + return std::tie(lhs.end_time, lhs.node) > std::tie(rhs.end_time, rhs.node); +} + +} // namespace FlexFlow diff --git a/lib/compiler/src/compiler/task_graph_simulator/pcg_task_graph.cc b/lib/compiler/src/compiler/task_graph_simulator/pcg_task_graph.cc new file mode 100644 index 0000000000..539c44a963 --- /dev/null +++ b/lib/compiler/src/compiler/task_graph_simulator/pcg_task_graph.cc @@ -0,0 +1,59 @@ +#include "compiler/task_graph_simulator/pcg_task_graph.h" +#include "compiler/cost_estimator/op_cost_estimate_key.h" +#include "compiler/cost_estimator/tensor_set_movement.h" +#include "compiler/machine_mapping/machine_mapping.dtg.h" +#include "pcg/device_id_t.dtg.h" +#include "pcg/machine_specification.dtg.h" +#include "pcg/machine_view.dtg.h" +#include "pcg/machine_view.h" +#include "pcg/operator_task_space.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph_edge.dtg.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph_edge.h" +#include "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h" +#include "utils/bidict/bidict.h" +#include "utils/graph/instances/adjacency_digraph.h" +#include +#include + +namespace FlexFlow { + +PCGTaskGraph get_pcg_task_graph(ParallelComputationGraph const &pcg, + MachineMapping const &machine_mapping, + MachineSpecification const &machine_spec) { + DiGraph digraph = DiGraph::create(); + bidict node_to_task; + bidict node_to_layer; + std::unordered_map> node_to_devices; + + for (parallel_layer_guid_t const &layer : get_parallel_layers(pcg)) { + MachineView mv = machine_mapping.machine_views.at(layer); + OpCostEstimateKey op_key = + get_mapped_op_cost_estimate_key_for_layer(pcg, layer, mv); + Node node = digraph.add_node(); + node_to_task.equate(node, PCGTask{op_key}); + node_to_layer.equate(node, layer); + node_to_devices[node] = + get_device_ids(get_operator_task_space(pcg, layer), + machine_mapping.machine_views.at(layer), + machine_spec); + } + + for (ParallelComputationGraphEdge const &edge : get_edges(pcg)) { + MachineView src_mv = machine_mapping.machine_views.at(get_src_layer(edge)); + MachineView dst_mv = machine_mapping.machine_views.at(get_dst_layer(edge)); + TensorSetMovement movement = + get_tensor_set_movement_from_pcg_edge(edge, pcg, src_mv, dst_mv); + Node node = digraph.add_node(); + node_to_task.equate(node, PCGTask{movement}); + node_to_devices[node] = {}; + Node src_node = node_to_layer.at_r(get_src_layer(edge)); + Node dst_node = node_to_layer.at_r(get_dst_layer(edge)); + + digraph.add_edge(DirectedEdge{src_node, node}); + digraph.add_edge(DirectedEdge{node, dst_node}); + } + + return PCGTaskGraph{digraph, node_to_task, node_to_devices}; +} +} // namespace FlexFlow diff --git a/lib/compiler/src/compiler/task_graph_simulator/simulate_task_graph_execution.cc b/lib/compiler/src/compiler/task_graph_simulator/simulate_task_graph_execution.cc new file mode 100644 index 0000000000..974a70ddf5 --- /dev/null +++ b/lib/compiler/src/compiler/task_graph_simulator/simulate_task_graph_execution.cc @@ -0,0 +1,107 @@ +#include "compiler/task_graph_simulator/simulate_task_graph_execution.h" +#include "compiler/task_graph_simulator/in_progress_task.dtg.h" +#include "compiler/task_graph_simulator/task_graph_execution_state.dtg.h" +#include "compiler/task_graph_simulator/task_graph_execution_trace.dtg.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph.h" +#include "utils/containers/filtrans.h" +#include "utils/containers/is_subseteq_of.h" +#include "utils/containers/set_of.h" +#include "utils/containers/sorted.h" +#include "utils/exception.h" +#include "utils/graph/digraph/algorithms.h" +#include "utils/graph/digraph/algorithms/get_predecessors.h" +#include "utils/graph/digraph/algorithms/get_successors.h" +#include "utils/graph/digraph/algorithms/is_acyclic.h" +#include "utils/graph/digraph/digraph_view.h" +#include "utils/graph/node/algorithms.h" +#include "utils/overload.h" +#include +#include + +namespace FlexFlow { + +TaskGraphExecutionTrace simulate_task_graph_execution( + DiGraphView const &task_graph, + std::function cost_function, + TaskExecutionConstraint const &constraint) { + if (!is_acyclic(task_graph)) { + throw mk_runtime_error( + "simulate_task_graph_execution cannot simulate cyclic directed graphs"); + } + + TaskGraphExecutionState execution_state = + TaskGraphExecutionState{/*ready_tasks=*/set_of(get_sources(task_graph)), + /*in_progress_tasks=*/{}, + /*finished_tasks=*/{}, + /*current_time=*/0.0}; + + std::unordered_set task_profiles; + + auto start_task_processing = [&](Node const &task) { + float cost = cost_function(task); + execution_state.in_progress_tasks.push( + InProgressTask{execution_state.current_time, + execution_state.current_time + cost, + task}); + execution_state.ready_tasks.erase(task); + }; + + auto dependencies_are_satisfied = [&](Node const &task) { + std::unordered_set incoming_dependencies = + get_predecessors(task_graph, task); + return is_subseteq_of(incoming_dependencies, + execution_state.finished_tasks); + }; + + auto finish_task_processing = [&](InProgressTask const &in_progress_task) { + execution_state.finished_tasks.insert(in_progress_task.node); + for (Node const &task : get_successors(task_graph, in_progress_task.node)) { + if (dependencies_are_satisfied(task)) { + execution_state.ready_tasks.insert(task); + } + } + task_profiles.insert(TaskProfile{in_progress_task.node, + in_progress_task.start_time, + in_progress_task.end_time}); + execution_state.current_time = in_progress_task.end_time; + }; + + auto is_processing_done = [&]() { + return execution_state.ready_tasks.empty() && + execution_state.in_progress_tasks.empty(); + }; + + auto get_next_task_to_finish = [&]() { + InProgressTask task = execution_state.in_progress_tasks.top(); + execution_state.in_progress_tasks.pop(); + return task; + }; + + while (!is_processing_done()) { + auto ready_tasks_copy = execution_state.ready_tasks; + for (Node const &task : ready_tasks_copy) { + std::unordered_set raw_in_progress_tasks = transform( + unordered_set_of(execution_state.in_progress_tasks.contents()), + [](InProgressTask const &t) { return t.node; }); + + if (constraint.is_satisfied( + task, raw_in_progress_tasks, execution_state.finished_tasks)) { + start_task_processing(task); + } + } + + if (!execution_state.in_progress_tasks.empty()) { + InProgressTask next_task = get_next_task_to_finish(); + finish_task_processing(next_task); + } else { + throw mk_runtime_error("Constraints cannot be satisfied"); + } + } + if (execution_state.finished_tasks.size() != num_nodes(task_graph)) { + throw mk_runtime_error("Failed to execute all tasks in given graph"); + } + + return TaskGraphExecutionTrace{task_profiles}; +} + +} // namespace FlexFlow diff --git a/lib/compiler/src/compiler/task_graph_simulator/task_graph_execution_trace.cc b/lib/compiler/src/compiler/task_graph_simulator/task_graph_execution_trace.cc new file mode 100644 index 0000000000..716a7afe15 --- /dev/null +++ b/lib/compiler/src/compiler/task_graph_simulator/task_graph_execution_trace.cc @@ -0,0 +1,27 @@ +#include "compiler/task_graph_simulator/task_graph_execution_trace.h" +#include "utils/containers/maximum.h" +#include "utils/containers/minimum.h" +#include "utils/containers/transform.h" +#include "utils/exception.h" +#include "utils/fmt/unordered_set.h" + +namespace FlexFlow { + +float get_total_execution_time(TaskGraphExecutionTrace const &trace) { + if (trace.task_profiles.empty()) { + throw mk_runtime_error( + fmt::format("TaskGraphExecutionTrace {} is empty", trace)); + } + float end_time = + maximum(transform(trace.task_profiles, [](TaskProfile const &profile) { + return profile.end_time; + })); + float start_time = + minimum(transform(trace.task_profiles, [](TaskProfile const &profile) { + return profile.start_time; + })); + + return end_time - start_time; +} + +} // namespace FlexFlow diff --git a/lib/compiler/src/compiler/task_graph_simulator/task_simulator.cc b/lib/compiler/src/compiler/task_graph_simulator/task_simulator.cc new file mode 100644 index 0000000000..ab204e7d71 --- /dev/null +++ b/lib/compiler/src/compiler/task_graph_simulator/task_simulator.cc @@ -0,0 +1,71 @@ +#include "compiler/task_graph_simulator/task_simulator.h" +#include "compiler/cost_estimator/cost_estimator.h" +#include "compiler/cost_estimator/op_cost_estimate_key.h" +#include "compiler/machine_mapping/unstructured_device_mapping.dtg.h" +#include "compiler/machine_mapping/unstructured_device_mapping.h" +#include "compiler/task_graph_simulator/pcg_task.dtg.h" +#include "compiler/task_graph_simulator/pcg_task_graph.h" +#include "compiler/task_graph_simulator/simulate_task_graph_execution.h" +#include "compiler/task_graph_simulator/task_execution_constraint.dtg.h" +#include "compiler/task_graph_simulator/task_graph_execution_trace.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph.h" +#include "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h" +#include "utils/containers/filtrans.h" +#include "utils/containers/set_union.h" +#include "utils/containers/transform.h" +#include "utils/graph/digraph/digraph.h" +#include "utils/hash/unordered_set.h" +#include + +namespace FlexFlow { + +float task_simulator_estimate_forward_pass_time( + ParallelComputationGraph const &pcg, + CostEstimator const &estimator, + MachineMapping const &machine_mapping, + MachineSpecification const &machine_spec) { + + PCGTaskGraph task_graph = + get_pcg_task_graph(pcg, machine_mapping, machine_spec); + + auto cost_function = [&](Node const &node) -> float { + PCGTask task = task_graph.node_to_task.at_l(node); + if (task.is_operator()) { + return estimator.estimate_cost(task.require_operator()).forward_runtime; + } else { + return estimator.estimate_cost(task.require_tensor_movement()); + } + }; + + auto is_allowed_to_run = + [&](Node const &task, + std::unordered_set const &in_progress_tasks, + std::unordered_set const &finished_tasks) -> bool { + PCGTask current_task = task_graph.node_to_task.at_l(task); + + UnstructuredDeviceMapping device_map = + get_unstructured_device_mapping(machine_mapping, machine_spec, pcg); + + if (current_task.is_tensor_movement()) { + return true; + } + assert(current_task.is_operator()); + + auto get_devices = [&](Node const &n) { + return task_graph.node_to_devices.at(n); + }; + + std::unordered_set devices_occupied = + set_union(transform(in_progress_tasks, get_devices)); + std::unordered_set required_devices = get_devices(task); + return intersection(devices_occupied, required_devices).empty(); + }; + + TaskExecutionConstraint constraint = + TaskExecutionConstraint{is_allowed_to_run}; + + return get_total_execution_time(simulate_task_graph_execution( + task_graph.graph, cost_function, constraint)); +} + +} // namespace FlexFlow diff --git a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.cc b/lib/compiler/test/src/compiler/cost_estimator_for_test.cc similarity index 72% rename from lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.cc rename to lib/compiler/test/src/compiler/cost_estimator_for_test.cc index 0431104878..48e6f5e561 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.cc +++ b/lib/compiler/test/src/compiler/cost_estimator_for_test.cc @@ -1,6 +1,8 @@ #include "./cost_estimator_for_test.h" +#include "compiler/cost_estimator/op_cost_metrics.dtg.h" #include "compiler/machine_mapping/abstracted_tensor_set_movement/abstracted_tensor_set_movement.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h" +#include "utils/nonnegative_int/nonnegative_int.h" namespace FlexFlow { @@ -40,4 +42,15 @@ CostEstimator make_fake_cost_estimator( }); } +CostEstimator make_fake_constant_cost_estimator(float forward_op_cost, + float backward_op_cost, + float comm_cost, + nonnegative_int memory_cost) { + return make_fake_cost_estimator( + [=](OpCostEstimateKey const &op) { + return OpCostMetrics{forward_op_cost, backward_op_cost, memory_cost}; + }, + [=](TensorSetMovement const &op) { return comm_cost; }); +} + } // namespace FlexFlow diff --git a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.h b/lib/compiler/test/src/compiler/cost_estimator_for_test.h similarity index 77% rename from lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.h rename to lib/compiler/test/src/compiler/cost_estimator_for_test.h index 16ea3a85bc..1e8ce83caf 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.h +++ b/lib/compiler/test/src/compiler/cost_estimator_for_test.h @@ -1,5 +1,5 @@ -#ifndef _FLEXFLOW_TEST_COST_ESTIMATOR_H -#define _FLEXFLOW_TEST_COST_ESTIMATOR_H +#ifndef _FLEXFLOW_TEST_COST_ESTIMATOR_FOR_TEST_H +#define _FLEXFLOW_TEST_COST_ESTIMATOR_FOR_TEST_H #include "compiler/cost_estimator/cost_estimator.h" #include "compiler/cost_estimator/op_cost_estimate_key.dtg.h" @@ -7,6 +7,7 @@ #include "compiler/machine_mapping/abstracted_tensor_set_movement/abstracted_tensor_set_movement.dtg.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.dtg.h" #include "compiler/machine_mapping/parallel_layer_guid_oblivious_machine_mapping.dtg.h" +#include "utils/nonnegative_int/nonnegative_int.h" namespace FlexFlow { @@ -34,6 +35,11 @@ CostEstimator make_fake_cost_estimator( std::unordered_map const &op_cost_map, std::unordered_map const &comm_cost_map); +CostEstimator make_fake_constant_cost_estimator(float forward_op_cost, + float backward_op_cost, + float comm_cost, + nonnegative_int memory_cost); + } // namespace FlexFlow #endif diff --git a/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc b/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc index ac180cd079..542edd9fa9 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc @@ -1,5 +1,5 @@ #include "compiler/machine_mapping/get_optimal_machine_mapping.h" -#include "./cost_estimator_for_test.h" +#include "../cost_estimator_for_test.h" #include "compiler/machine_mapping/abstracted_tensor_set_movement/abstracted_tensor_set_movement.h" #include "compiler/machine_mapping/machine_mapping_cache.h" #include "compiler/machine_mapping/machine_mapping_constraints.h" @@ -9,6 +9,7 @@ #include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h" #include "utils/containers/get_only.h" #include "utils/full_binary_tree/binary_tree_path.h" +#include "utils/nonnegative_int/nonnegative_int.h" #include using namespace FlexFlow; @@ -146,13 +147,21 @@ TEST_SUITE(FF_TEST_SUITE) { auto map1 = std::unordered_map{{ {map_unmapped_op_cost_estimate_key(k1, mv1), - OpCostMetrics{/*runtime=*/1.0, /*memory=*/nonnegative_int{0}}}, + OpCostMetrics{/*forward_runtime=*/0.5, + /*backward_runtime=*/0.5, + /*memory=*/nonnegative_int{0}}}, {map_unmapped_op_cost_estimate_key(k2, mv1), - OpCostMetrics{/*runtime=*/2.0, /*memory=*/nonnegative_int{0}}}, + OpCostMetrics{/*forward_runtime=*/1.0, + /*backward_runtime=*/1.0, + /*memory=*/nonnegative_int{0}}}, {map_unmapped_op_cost_estimate_key(k1, mv2), - OpCostMetrics{/*runtime=*/1.5, /*memory=*/nonnegative_int{0}}}, + OpCostMetrics{/*forward_runtime=*/0.75, + /*backward_runtime=*/0.75, + /*memory=*/nonnegative_int{0}}}, {map_unmapped_op_cost_estimate_key(k2, mv2), - OpCostMetrics{/*runtime=*/2.5, /*memory=*/nonnegative_int{0}}}, + OpCostMetrics{/*forward_runtime=*/1.25, + /*backward_runtime=*/1.25, + /*memory=*/nonnegative_int{0}}}, }}; CostEstimator cost_estimator = make_fake_cost_estimator( diff --git a/lib/compiler/test/src/compiler/machine_mapping/get_tensor_set_movement_across_split.cc b/lib/compiler/test/src/compiler/machine_mapping/get_tensor_set_movement_across_split.cc index e22f715d82..52ad82595d 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/get_tensor_set_movement_across_split.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/get_tensor_set_movement_across_split.cc @@ -1,5 +1,5 @@ #include "compiler/machine_mapping/get_tensor_set_movement_across_split.h" -#include "./cost_estimator_for_test.h" +#include "../cost_estimator_for_test.h" #include "compiler/machine_mapping/transitive_reduced_pcg.h" #include "pcg/machine_view.h" #include "pcg/parallel_computation_graph/parallel_computation_graph.h" diff --git a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping.cc b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping.cc index 221cca3ae1..304034f9be 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping.cc @@ -1,5 +1,4 @@ #include "compiler/machine_mapping/machine_mapping.h" -#include "cost_estimator_for_test.h" #include "doctest/doctest.h" #include "pcg/machine_view.h" diff --git a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc index 9706f1c75f..8612017705 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc @@ -1,5 +1,5 @@ #include "compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.h" -#include "../cost_estimator_for_test.h" +#include "../../cost_estimator_for_test.h" #include "compiler/machine_mapping/abstracted_tensor_set_movement/abstracted_tensor_set_movement.h" #include "compiler/machine_mapping/machine_mapping_constraints.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h" @@ -9,6 +9,7 @@ #include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h" #include "utils/containers/get_only.h" #include "utils/full_binary_tree/binary_tree_path.h" +#include "utils/nonnegative_int/nonnegative_int.h" #include using namespace FlexFlow; @@ -147,24 +148,32 @@ TEST_SUITE(FF_TEST_SUITE) { CostEstimator cost_estimator = make_fake_cost_estimator( std::unordered_map{{ {map_unmapped_op_cost_estimate_key(k1, mv1), - OpCostMetrics{1.0, nonnegative_int{2}}}, + OpCostMetrics{/*forward_runtime=*/1.0, + /*backward_runtime=*/1.0, + /*memory=*/nonnegative_int{2}}}, {map_unmapped_op_cost_estimate_key(k2, mv1), - OpCostMetrics{2.0, nonnegative_int{3}}}, + OpCostMetrics{/*forward_runtime=*/2.0, + /*backward_runtime=*/2.0, + /*memory=*/nonnegative_int{3}}}, {map_unmapped_op_cost_estimate_key(k1, mv2), - OpCostMetrics{1.5, nonnegative_int{1}}}, + OpCostMetrics{/*forward_runtime=*/1.5, + /*backward_runtime=*/1.5, + /*memory=*/nonnegative_int{1}}}, {map_unmapped_op_cost_estimate_key(k2, mv2), - OpCostMetrics{2.5, nonnegative_int{2}}}, + OpCostMetrics{/*forward_runtime=*/2.5, + /*backward_runtime=*/2.5, + /*memory=*/nonnegative_int{2}}}, }}, std::unordered_map{{ - {TensorSetMovement{{}}, 0.0}, + {TensorSetMovement{/*movements=*/{}}, /*cost=*/0.0}, {concretize_abstracted_tensor_set_movement(movement1, mm1, mm1), - 0.1}, + /*cost=*/0.1}, {concretize_abstracted_tensor_set_movement(movement1, mm2, mm2), - 0.2}, + /*cost=*/0.2}, {concretize_abstracted_tensor_set_movement(movement1, mm1, mm2), - 0.3}, + /*cost=*/0.3}, {concretize_abstracted_tensor_set_movement(movement1, mm2, mm1), - 0.4}, + /*cost=*/0.4}, }}); MachineMappingContext context = MachineMappingContext{ @@ -187,13 +196,17 @@ TEST_SUITE(FF_TEST_SUITE) { cache, context, problem_tree, full_machine_spec, constraints); MachineMappingWithMemoryResult correct = MachineMappingWithMemoryResult{{ MachineMappingForSingleLayer{ - OpCostMetrics{1.0, nonnegative_int{2}}, + OpCostMetrics{/*forward_runtime=*/1.0, + /*backward_runtime=*/1.0, + /*memory=*/nonnegative_int{2}}, ParallelLayerGuidObliviousMachineMapping{{ {binary_tree_root_path(), mv1}, }}, }, MachineMappingForSingleLayer{ - OpCostMetrics{1.5, nonnegative_int{1}}, + OpCostMetrics{/*forward_runtime=*/1.5, + /*backward_runtime=*/1.5, + /*memory=*/nonnegative_int{1}}, ParallelLayerGuidObliviousMachineMapping{{ {binary_tree_root_path(), mv2}, }}, @@ -217,7 +230,8 @@ TEST_SUITE(FF_TEST_SUITE) { MachineMappingWithMemoryResult correct = MachineMappingWithMemoryResult{{ MachineMappingForSingleLayer{ OpCostMetrics{ - /*runtime=*/1.0 + 2.0 + 0.1, + /*forward_runtime=*/1.0 + 2.0 + 0.1, + /*backward_runtime=*/1.0 + 2.0 + 0.1, /*memory=*/nonnegative_int{2 + 3}, }, ParallelLayerGuidObliviousMachineMapping{{ @@ -236,7 +250,9 @@ TEST_SUITE(FF_TEST_SUITE) { }}, }, MachineMappingForSingleLayer{ - OpCostMetrics{1.5 + 2.5 + 0.1, nonnegative_int{1 + 2}}, + OpCostMetrics{/*forward_runtime=*/1.5 + 2.5 + 0.1, + /*backward_runtime=*/1.5 + 2.5 + 0.1, + /*memory=*/nonnegative_int{1 + 2}}, ParallelLayerGuidObliviousMachineMapping{{ { BinaryTreePath{{ @@ -270,7 +286,9 @@ TEST_SUITE(FF_TEST_SUITE) { cache, context, problem_tree, full_machine_spec, constraints); MachineMappingWithMemoryResult correct = MachineMappingWithMemoryResult{{MachineMappingForSingleLayer{ - OpCostMetrics{2.5, nonnegative_int{2}}, + OpCostMetrics{/*forward_runtime=*/2.5, + /*backward_runtime=*/2.5, + /*memory=*/nonnegative_int{2}}, ParallelLayerGuidObliviousMachineMapping{{ { BinaryTreePath{{ diff --git a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc index ecfb7cfeb3..1f3b7545a8 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc @@ -1,5 +1,6 @@ #include "compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.h" #include "pcg/machine_view.h" +#include "utils/nonnegative_int/nonnegative_int.h" #include using namespace FlexFlow; @@ -52,15 +53,20 @@ TEST_SUITE(FF_TEST_SUITE) { }; OpCostMetrics cost1 = OpCostMetrics{ - /*runtime=*/2.0, + /*forward_runtime=*/2.0, + /*backward_runtime=*/2.0, /*memory=*/nonnegative_int{2}, }; + OpCostMetrics cost2 = OpCostMetrics{ - /*runtime=*/4.0, + /*forward_runtime=*/4.0, + /*backward_runtime=*/4.0, /*memory=*/nonnegative_int{1}, }; + OpCostMetrics cost3 = OpCostMetrics{ - /*runtime=*/2.0, + /*forward_runtime=*/2.0, + /*backward_runtime=*/2.0, /*memory=*/nonnegative_int{3}, }; @@ -182,7 +188,8 @@ TEST_SUITE(FF_TEST_SUITE) { }; OpCostMetrics pre_cost = OpCostMetrics{ - /*runtime=*/2.0, + /*forward_runtime=*/2.0, + /*backward_runtime=*/2.0, /*memory=*/nonnegative_int{2}, }; MachineMappingWithMemoryResult pre = MachineMappingWithMemoryResult{{ @@ -208,7 +215,8 @@ TEST_SUITE(FF_TEST_SUITE) { }}; OpCostMetrics post_cost = OpCostMetrics{ - /*runtime=*/4.0, + /*forward_runtime=*/4.0, + /*backward_runtime=*/4.0, /*memory=*/nonnegative_int{1}, }; @@ -253,8 +261,10 @@ TEST_SUITE(FF_TEST_SUITE) { { MachineMappingForSingleLayer{ /*cost=*/OpCostMetrics{ - /*runtime=*/pre_cost.runtime + comm_cost + - post_cost.runtime, + /*forward_runtime=*/pre_cost.forward_runtime + + comm_cost + post_cost.forward_runtime, + /*backward_runtime=*/pre_cost.backward_runtime + + comm_cost + post_cost.backward_runtime, /*memory=*/pre_cost.memory + post_cost.memory, }, /*machine_mapping=*/ @@ -307,8 +317,10 @@ TEST_SUITE(FF_TEST_SUITE) { { MachineMappingForSingleLayer{ /*cost=*/OpCostMetrics{ - /*runtime=*/pre_cost.runtime + comm_cost + - post_cost.runtime, + /*forward_runtime=*/pre_cost.forward_runtime + + comm_cost + post_cost.forward_runtime, + /*backward_runtime=*/pre_cost.backward_runtime + + comm_cost + post_cost.backward_runtime, /*memory=*/pre_cost.memory + post_cost.memory, }, /*machine_mapping=*/ @@ -377,7 +389,8 @@ TEST_SUITE(FF_TEST_SUITE) { }; OpCostMetrics lhs_cost = OpCostMetrics{ - /*runtime=*/2.0, + /*forward_runtime=*/2.0, + /*backward_runtime=*/2.0, /*memory=*/nonnegative_int{2}, }; MachineMappingWithMemoryResult lhs = MachineMappingWithMemoryResult{{ @@ -403,7 +416,8 @@ TEST_SUITE(FF_TEST_SUITE) { }}; OpCostMetrics rhs_cost = OpCostMetrics{ - /*runtime=*/4.0, + /*forward_runtime=*/4.0, + /*backward_runtime=*/4.0, /*memory=*/nonnegative_int{1}, }; MachineMappingWithMemoryResult rhs = MachineMappingWithMemoryResult{{ @@ -442,7 +456,11 @@ TEST_SUITE(FF_TEST_SUITE) { MachineMappingWithMemoryResult correct = MachineMappingWithMemoryResult{{ MachineMappingForSingleLayer{ /*cost=*/OpCostMetrics{ - /*runtime=*/std::max(lhs_cost.runtime, rhs_cost.runtime), + /*forward_runtime=*/std::max(lhs_cost.forward_runtime, + rhs_cost.forward_runtime), + /*backward_runtime=*/ + std::max(lhs_cost.backward_runtime, + rhs_cost.backward_runtime), /*memory=*/std::max(lhs_cost.memory, rhs_cost.memory), }, /*machine_mapping=*/ @@ -518,15 +536,18 @@ TEST_SUITE(FF_TEST_SUITE) { }; OpCostMetrics cost1 = OpCostMetrics{ - /*runtime=*/2.0, + /*forward_runtime=*/2.0, + /*backward_runtime=*/2.0, /*memory=*/nonnegative_int{2}, }; OpCostMetrics cost2 = OpCostMetrics{ - /*runtime=*/4.0, + /*forward_runtime=*/4.0, + /*backward_runtime=*/4.0, /*memory=*/nonnegative_int{1}, }; OpCostMetrics cost3 = OpCostMetrics{ - /*runtime=*/2.0, + /*forward_runtime=*/2.0, + /*backward_runtime=*/2.0, /*memory=*/nonnegative_int{3}, }; diff --git a/lib/compiler/test/src/compiler/task_graph_simulator/simulate_task_graph_execution.cc b/lib/compiler/test/src/compiler/task_graph_simulator/simulate_task_graph_execution.cc new file mode 100644 index 0000000000..e88f2b7840 --- /dev/null +++ b/lib/compiler/test/src/compiler/task_graph_simulator/simulate_task_graph_execution.cc @@ -0,0 +1,211 @@ +#include "compiler/task_graph_simulator/simulate_task_graph_execution.h" +#include "compiler/task_graph_simulator/task_graph_execution_state.dtg.h" +#include "compiler/task_graph_simulator/task_graph_execution_trace.dtg.h" +#include "utils/containers/lookup_in_map.h" +#include "utils/graph/algorithms.h" +#include "utils/graph/digraph/directed_edge.dtg.h" +#include "utils/graph/instances/adjacency_digraph.h" +#include +#include + +namespace FlexFlow { + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("simulate_task_graph_execution") { + DiGraph g = DiGraph::create(); + SUBCASE("linear graph") { + std::vector n = add_nodes(g, 4); + add_edges(g, + { + DirectedEdge{n.at(0), n.at(1)}, + DirectedEdge{n.at(1), n.at(2)}, + DirectedEdge{n.at(2), n.at(3)}, + }); + + auto cost_function = lookup_in_map( + {{n.at(0), 1}, {n.at(1), 10}, {n.at(2), 100}, {n.at(3), 1000}}); + + auto is_allowed_to_run = + [&](Node const &n, + std::unordered_set const &in_progress_tasks, + std::unordered_set const &finished_tasks) { return true; }; + + TaskExecutionConstraint constraint = + TaskExecutionConstraint{is_allowed_to_run}; + + TaskGraphExecutionTrace result = + simulate_task_graph_execution(g, cost_function, constraint); + TaskGraphExecutionTrace correct = TaskGraphExecutionTrace{{ + TaskProfile{n.at(0), 0, 1}, + TaskProfile{n.at(1), 1, 11}, + TaskProfile{n.at(2), 11, 111}, + TaskProfile{n.at(3), 111, 1111}, + }}; + CHECK(correct == result); + } + + SUBCASE("rhomboidal graph") { + std::vector n = add_nodes(g, 4); + + add_edges(g, + {DirectedEdge{n.at(0), n.at(1)}, + DirectedEdge{n.at(0), n.at(2)}, + DirectedEdge{n.at(1), n.at(3)}, + DirectedEdge{n.at(2), n.at(3)}}); + + auto cost_function = lookup_in_map( + {{n.at(0), 10}, {n.at(1), 15}, {n.at(2), 20}, {n.at(3), 25}}); + + SUBCASE("no processing constraints") { + auto is_allowed_to_run = + [&](Node const &n, + std::unordered_set const &in_progress_tasks, + std::unordered_set const &finished_tasks) { + return true; + }; + + TaskExecutionConstraint constraint = + TaskExecutionConstraint{is_allowed_to_run}; + TaskGraphExecutionTrace result = + simulate_task_graph_execution(g, cost_function, constraint); + TaskGraphExecutionTrace correct = TaskGraphExecutionTrace{{ + TaskProfile{n.at(0), 0, 10}, + TaskProfile{n.at(1), 10, 25}, + TaskProfile{n.at(2), 10, 30}, + TaskProfile{n.at(3), 30, 55}, + }}; + CHECK(correct == result); + } + + SUBCASE("one node at a time") { + auto is_allowed_to_run = + [&](Node const &n, + std::unordered_set const &in_progress_tasks, + std::unordered_set const &finished_tasks) { + return in_progress_tasks.size() == 0; + }; + + TaskExecutionConstraint constraint = + TaskExecutionConstraint{is_allowed_to_run}; + TaskGraphExecutionTrace result = + simulate_task_graph_execution(g, cost_function, constraint); + TaskGraphExecutionTrace correct = TaskGraphExecutionTrace{{ + TaskProfile{n.at(0), 0, 10}, + TaskProfile{n.at(1), 10, 25}, + TaskProfile{n.at(2), 25, 45}, + TaskProfile{n.at(3), 45, 70}, + }}; + CHECK(correct == result); + } + } + + SUBCASE("diamond graph with crossing") { + std::vector n = add_nodes(g, 6); + + add_edges(g, + { + DirectedEdge{n.at(0), n.at(1)}, + DirectedEdge{n.at(0), n.at(2)}, + DirectedEdge{n.at(1), n.at(3)}, + DirectedEdge{n.at(2), n.at(3)}, + DirectedEdge{n.at(2), n.at(4)}, + DirectedEdge{n.at(3), n.at(5)}, + DirectedEdge{n.at(4), n.at(5)}, + }); + + auto cost_function = lookup_in_map({{n.at(0), 10}, + {n.at(1), 15}, + {n.at(2), 20}, + {n.at(3), 25}, + {n.at(4), 30}, + {n.at(5), 35}}); + + SUBCASE("no processing constraints") { + auto is_allowed_to_run = + [&](Node const &n, + std::unordered_set const &in_progress_tasks, + std::unordered_set const &finished_tasks) { + return true; + }; + + TaskExecutionConstraint constraint = + TaskExecutionConstraint{is_allowed_to_run}; + TaskGraphExecutionTrace result = + simulate_task_graph_execution(g, cost_function, constraint); + TaskGraphExecutionTrace correct = TaskGraphExecutionTrace{{ + TaskProfile{n.at(0), 0, 10}, + TaskProfile{n.at(1), 10, 25}, + TaskProfile{n.at(2), 10, 30}, + TaskProfile{n.at(3), 30, 55}, + TaskProfile{n.at(4), 30, 60}, + TaskProfile{n.at(5), 60, 95}, + }}; + CHECK(correct == result); + } + + SUBCASE("one node at a time") { + auto is_allowed_to_run = + [&](Node const &n, + std::unordered_set const &in_progress_tasks, + std::unordered_set const &finished_tasks) { + return in_progress_tasks.size() == 0; + }; + + TaskExecutionConstraint constraint = + TaskExecutionConstraint{is_allowed_to_run}; + TaskGraphExecutionTrace result = + simulate_task_graph_execution(g, cost_function, constraint); + TaskGraphExecutionTrace correct = TaskGraphExecutionTrace{{ + TaskProfile{n.at(0), 0, 10}, + TaskProfile{n.at(1), 10, 25}, + TaskProfile{n.at(2), 25, 45}, + TaskProfile{n.at(3), 45, 70}, + TaskProfile{n.at(4), 70, 100}, + TaskProfile{n.at(5), 100, 135}, + }}; + CHECK(correct == result); + } + } + + SUBCASE("all-to-all intermediate") { + std::vector n = add_nodes(g, 5); + + add_edges(g, + {DirectedEdge{n.at(0), n.at(1)}, + DirectedEdge{n.at(0), n.at(2)}, + DirectedEdge{n.at(0), n.at(3)}, + DirectedEdge{n.at(1), n.at(4)}, + DirectedEdge{n.at(2), n.at(4)}, + DirectedEdge{n.at(3), n.at(4)}}); + + auto cost_function = lookup_in_map({{n.at(0), 10}, + {n.at(1), 100}, + {n.at(2), 100}, + {n.at(3), 100}, + {n.at(4), 20}}); + + SUBCASE("at most two nodes at a time") { + auto is_allowed_to_run = + [&](Node const &n, + std::unordered_set const &in_progress_tasks, + std::unordered_set const &finished_tasks) { + return in_progress_tasks.size() < 2; + }; + + TaskExecutionConstraint constraint = + TaskExecutionConstraint{is_allowed_to_run}; + TaskGraphExecutionTrace result = + simulate_task_graph_execution(g, cost_function, constraint); + TaskGraphExecutionTrace correct = TaskGraphExecutionTrace{{ + TaskProfile{n.at(0), 0, 10}, + TaskProfile{n.at(1), 10, 110}, + TaskProfile{n.at(2), 10, 110}, + TaskProfile{n.at(3), 110, 210}, + TaskProfile{n.at(4), 210, 230}, + }}; + CHECK(correct == result); + } + } + } +} +} // namespace FlexFlow diff --git a/lib/compiler/test/src/compiler/task_graph_simulator/task_simulator.cc b/lib/compiler/test/src/compiler/task_graph_simulator/task_simulator.cc new file mode 100644 index 0000000000..e278338440 --- /dev/null +++ b/lib/compiler/test/src/compiler/task_graph_simulator/task_simulator.cc @@ -0,0 +1,265 @@ +#include "compiler/task_graph_simulator/task_simulator.h" +#include "../cost_estimator_for_test.h" +#include "compiler/cost_estimator/cost_estimator.h" +#include "compiler/cost_estimator/op_cost_metrics.dtg.h" +#include "compiler/machine_mapping/machine_mapping.dtg.h" +#include "compiler/machine_mapping/machine_mapping.h" +#include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h" +#include "op-attrs/ops/input_attrs.dtg.h" +#include "op-attrs/parallel_tensor_dims.dtg.h" +#include "op-attrs/parallel_tensor_shape.dtg.h" +#include "op-attrs/parallel_tensor_shape.h" +#include "pcg/device_id.h" +#include "pcg/device_type.dtg.h" +#include "pcg/machine_space_coordinate.dtg.h" +#include "pcg/machine_specification.h" +#include "pcg/machine_specification_dimension.dtg.h" +#include "pcg/machine_view.dtg.h" +#include "pcg/machine_view.h" +#include "pcg/machine_view_dimension.dtg.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h" +#include "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h" +#include "pcg/parallel_computation_graph/parallel_tensor_guid_t.h" +#include "pcg/stride_t.dtg.h" +#include "substitutions/sub_parallel_computation_graph.dtg.h" +#include "substitutions/sub_parallel_computation_graph.h" +#include "utils/containers/get_only.h" +#include "utils/deduplicated_priority_queue.h" +#include "utils/graph/open_dataflow_graph/algorithms/get_source_nodes.h" +#include "utils/nonnegative_int/nonnegative_int.h" +#include +#include +#include +#include + +namespace FlexFlow { + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("task_simulator_estimate_forward_pass_time") { + MachineSpecification machine_spec = + MachineSpecification{/*num_nodes=*/3, + /*num_cpus_per_node=*/3, + /*num_gpus_per_node=*/3, + /*inter_node_bandwidth=*/1.0f, + /*intra_node_bandwidth=*/1.0f}; + + SUBCASE("linear graph") { + ParallelComputationGraphBuilder b; + ParallelTensorShape input_shape = ParallelTensorShape{ + ParallelTensorDims{ + FFOrdered{}, + ReplicaParallelDimSet{ + SumDegree{1}, + DiscardCopyDegree{1}, + }, + }, + DataType::FLOAT, + }; + parallel_tensor_guid_t tensor0 = b.create_input_tensor(input_shape); + parallel_tensor_guid_t tensor1 = b.relu(tensor0); + + parallel_layer_guid_t layer0 = get_source_layer(tensor0); + parallel_layer_guid_t layer1 = get_source_layer(tensor1); + + std::vector dims = { + MachineViewDimension{stride_t{1}, + MachineSpecificationDimension::INTER_NODE}, + MachineViewDimension{stride_t{1}, + MachineSpecificationDimension::INTER_NODE}, + }; + ParallelComputationGraph pcg = b.pcg; + MachineView mv1 = + MachineView{MachineSpaceCoordinate{0, 0, DeviceType::GPU}, dims}; + MachineView mv2 = + MachineView{MachineSpaceCoordinate{0, 1, DeviceType::GPU}, dims}; + + MachineMapping device_mapping = MachineMapping{{ + {layer0, mv1}, + {layer1, mv2}, + }}; + + SUBCASE("constant op, comm cost") { + CostEstimator estimator = make_fake_constant_cost_estimator( + /*forward_op_cost=*/10.0f, + /*backward_op_cost=*/10.0f, + /*comm_cost=*/1.0f, + /*memory_cost=*/nonnegative_int{0}); + + float result = task_simulator_estimate_forward_pass_time( + pcg, estimator, device_mapping, machine_spec); + + float correct = 10 + 1 + 10; + CHECK(result == correct); + } + + SUBCASE("variable op, comm cost") { + CostEstimator cost_estimator = make_fake_cost_estimator( + [](OpCostEstimateKey const &op) { + if (op.op_attrs.has()) { + return OpCostMetrics{/*forward_runtime=*/10.0f, + /*backward_runtime=*/10.0f, + /*memory=*/nonnegative_int{0}}; // layer0 + } + if (op.op_attrs.has()) { + return OpCostMetrics{/*forward_runtime=*/1.0f, + /*backward_runtime=*/1.0f, + /*memory=*/nonnegative_int{0}}; // layer1 + } + return OpCostMetrics{/*forward_runtime=*/0.0f, + /*backward_runtime=*/0.0f, + /*memory=*/nonnegative_int{0}}; + }, + [](TensorSetMovement const &comm) { return 5.0f; }); + + float result = task_simulator_estimate_forward_pass_time( + pcg, cost_estimator, device_mapping, machine_spec); + float correct = 10 + 5 + 1; + CHECK(result == correct); + } + } + + SUBCASE("rhomboidal graph") { + ParallelComputationGraphBuilder b; + + ParallelTensorShape input_shape = ParallelTensorShape{ + ParallelTensorDims{ + FFOrdered{ShardParallelDim{10, 1}}, + ReplicaParallelDimSet{ + SumDegree{1}, + DiscardCopyDegree{1}, + }, + }, + DataType::FLOAT, + }; + + parallel_tensor_guid_t tensor0 = b.create_input_tensor(input_shape); + parallel_tensor_guid_t tensor1 = b.relu(tensor0); + parallel_tensor_guid_t tensor2 = b.relu(tensor0); + parallel_tensor_guid_t tensor3 = b.add(tensor1, tensor2); + + parallel_layer_guid_t layer0 = get_source_layer(tensor0); + parallel_layer_guid_t layer1 = get_source_layer(tensor1); + parallel_layer_guid_t layer2 = get_source_layer(tensor2); + parallel_layer_guid_t layer3 = get_source_layer(tensor3); + + ParallelComputationGraph pcg = b.pcg; + std::vector dims = { + MachineViewDimension{stride_t{1}, + MachineSpecificationDimension::INTER_NODE}, + MachineViewDimension{stride_t{1}, + MachineSpecificationDimension::INTER_NODE}, + MachineViewDimension{stride_t{1}, + MachineSpecificationDimension::INTER_NODE}, + }; + + SUBCASE("all different devices") { + MachineView mv0 = + MachineView{MachineSpaceCoordinate{0, 0, DeviceType::GPU}, dims}; + MachineView mv1 = + MachineView{MachineSpaceCoordinate{0, 1, DeviceType::GPU}, dims}; + MachineView mv2 = + MachineView{MachineSpaceCoordinate{1, 0, DeviceType::GPU}, dims}; + MachineView mv3 = + MachineView{MachineSpaceCoordinate{1, 1, DeviceType::GPU}, dims}; + + MachineMapping device_mapping = MachineMapping{{ + {layer0, mv0}, + {layer1, mv1}, + {layer2, mv2}, + {layer3, mv3}, + }}; + SUBCASE("constant op, comm cost") { + CostEstimator estimator = make_fake_constant_cost_estimator( + /*forward_op_cost=*/10.0f, + /*backward_op_cost=*/10.0f, + /*comm_cost=*/1.0f, + /*memory_cost=*/nonnegative_int{0}); + + float result = task_simulator_estimate_forward_pass_time( + pcg, estimator, device_mapping, machine_spec); + float correct = 10 + 1 + 10 + 1 + 10; + CHECK(result == correct); + } + SUBCASE("variable op, comm cost") { + CostEstimator cost_estimator = make_fake_cost_estimator( + [](OpCostEstimateKey const &op) { + if (op.op_attrs.has()) { + return OpCostMetrics{/*forward_runtime=*/10.0f, + /*backward_runtime=*/10.0f, + /*memory=*/nonnegative_int{0}}; // layer0 + } + if (op.op_attrs.has()) { + return OpCostMetrics{ + /*forward_runtime=*/1.0f, + /*backward_runtime=*/1.0f, + /*memory=*/nonnegative_int{0}}; // layers 1, 2 + } + if (op.op_attrs.has()) { + return OpCostMetrics{/*forward_runtime=*/2.0f, + /*backward_runtime=*/2.0f, + /*memory=*/nonnegative_int{0}}; // layer3 + } + return OpCostMetrics{/*forward_runtime=*/0.0f, + /*backward_runtime=*/0.0f, + /*memory=*/nonnegative_int{0}}; + }, + [](TensorSetMovement const &comm) { return 5.0f; }); + } + } + + SUBCASE("all the same device") { + MachineView mv = + MachineView{MachineSpaceCoordinate{0, 0, DeviceType::GPU}, dims}; + MachineMapping device_mapping = MachineMapping{{ + {layer0, mv}, + {layer1, mv}, + {layer2, mv}, + {layer3, mv}, + }}; + SUBCASE("constant op, cost cost") { + CostEstimator cost_estimator = make_fake_constant_cost_estimator( + /*forward_op_cost=*/10.0f, + /*backward_op_cost=*/10.0f, + /*comm_cost=*/1.0f, + /*memory_cost=*/nonnegative_int{0}); + + float result = task_simulator_estimate_forward_pass_time( + pcg, cost_estimator, device_mapping, machine_spec); + float correct = 10 + 10 + 10 + 10 + 1 + 1; + CHECK(result == correct); + } + SUBCASE("variable op, cost cost") { + CostEstimator cost_estimator = make_fake_cost_estimator( + [](OpCostEstimateKey const &op) { + if (op.op_attrs.has()) { + return OpCostMetrics{/*forward_runtime=*/10.0f, + /*backward_runtime=*/10.0f, + /*memory=*/nonnegative_int{0}}; // layer0 + } + if (op.op_attrs.has()) { + return OpCostMetrics{ + /*forward_runtime=*/1.0f, + /*backward_runtime=*/1.0f, + /*memory=*/nonnegative_int{0}}; // layers 1, 2 + } + if (op.op_attrs.has()) { + return OpCostMetrics{/*forward_runtime=*/2.0f, + /*backward_runtime=*/2.0f, + /*memory=*/nonnegative_int{0}}; // layer3 + } + return OpCostMetrics{/*forward_runtime=*/0.0f, + /*backward_runtime=*/0.0f, + /*memory=*/nonnegative_int{0}}; + }, + [](TensorSetMovement const &comm) { return 5.0f; }); + float result = task_simulator_estimate_forward_pass_time( + pcg, cost_estimator, device_mapping, machine_spec); + float correct = 10 + 5 + (1 + 1) + 5 + 2; + CHECK(result == correct); + } + } + } + } +} +} // namespace FlexFlow diff --git a/lib/pcg/include/pcg/machine_specification.h b/lib/pcg/include/pcg/machine_specification.h index 6ffa9900c2..39591e8a70 100644 --- a/lib/pcg/include/pcg/machine_specification.h +++ b/lib/pcg/include/pcg/machine_specification.h @@ -20,6 +20,7 @@ bool is_valid_machine_space_coordinate(MachineSpecification const &ms, device_id_t get_device_id(MachineSpecification const &ms, MachineSpaceCoordinate const &coord); + } // namespace FlexFlow #endif diff --git a/lib/pcg/include/pcg/machine_view.h b/lib/pcg/include/pcg/machine_view.h index 293227b7a1..f72b2359dc 100644 --- a/lib/pcg/include/pcg/machine_view.h +++ b/lib/pcg/include/pcg/machine_view.h @@ -37,6 +37,14 @@ std::unordered_set MachineView const &mv, MachineSpecification const &ms); +std::unordered_set get_device_ids(OperatorTaskSpace const &task, + MachineView const &mv, + MachineSpecification const &ms); + +MachineView make_1d_machine_view(MachineSpaceCoordinate const &start, + MachineSpecificationDimension const &dim, + stride_t stride); + } // namespace FlexFlow #endif diff --git a/lib/pcg/include/pcg/operator_task_space.h b/lib/pcg/include/pcg/operator_task_space.h index 61cab4eff1..1a19397c72 100644 --- a/lib/pcg/include/pcg/operator_task_space.h +++ b/lib/pcg/include/pcg/operator_task_space.h @@ -2,6 +2,8 @@ #define _FLEXFLOW_PCG_INCLUDE_OPERATOR_TASK_SPACE_H #include "pcg/operator_task_space.dtg.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h" +#include "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h" #include "pcg/task_space_coordinate.dtg.h" #include #include @@ -17,6 +19,9 @@ TaskSpaceCoordinate size_t num_dims(OperatorTaskSpace const &task); size_t num_tasks(OperatorTaskSpace const &task); +OperatorTaskSpace get_operator_task_space(ParallelComputationGraph const &pcg, + parallel_layer_guid_t const &layer); + } // namespace FlexFlow #endif diff --git a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h index c740e1ffd2..f7567b5025 100644 --- a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h +++ b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h @@ -6,6 +6,7 @@ #include "pcg/parallel_computation_graph/parallel_layer_added_result.dtg.h" #include "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h" #include "pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.h" +#include namespace FlexFlow { @@ -31,6 +32,20 @@ std::unordered_set parallel_layer_guid_t const &, parallel_layer_guid_t const &); +std::unordered_set + get_edges(ParallelComputationGraph const &); + +std::unordered_set + get_outgoing_edges(ParallelComputationGraph const &, + parallel_layer_guid_t const &); + +std::unordered_set + get_incoming_edges(ParallelComputationGraph const &, + parallel_layer_guid_t const &); + +std::unordered_set + get_initial_layers(ParallelComputationGraph const &); + std::vector get_incoming_tensors(ParallelComputationGraph const &, parallel_layer_guid_t const &); @@ -45,6 +60,9 @@ std::vector get_incoming_weights(ParallelComputationGraph const &, parallel_layer_guid_t const &); +parallel_layer_guid_t get_source_layer(ParallelComputationGraph const &g, + parallel_tensor_guid_t const &t); + ParallelLayerAttrs get_parallel_layer_attrs(ParallelComputationGraph const &, parallel_layer_guid_t const &); PCGOperatorAttrs pcg_get_op_attrs(ParallelComputationGraph const &, diff --git a/lib/pcg/src/pcg/machine_specification.cc b/lib/pcg/src/pcg/machine_specification.cc index ca5b8ba047..19ff50b4b7 100644 --- a/lib/pcg/src/pcg/machine_specification.cc +++ b/lib/pcg/src/pcg/machine_specification.cc @@ -1,5 +1,6 @@ #include "pcg/machine_specification.h" #include "pcg/device_id.h" +#include "utils/containers/transform.h" #include "utils/exception.h" namespace FlexFlow { diff --git a/lib/pcg/src/pcg/machine_view.cc b/lib/pcg/src/pcg/machine_view.cc index 18f6cacb7e..cc42ad83b2 100644 --- a/lib/pcg/src/pcg/machine_view.cc +++ b/lib/pcg/src/pcg/machine_view.cc @@ -1,14 +1,21 @@ #include "pcg/machine_view.h" +#include "pcg/machine_space_coordinate.dtg.h" +#include "pcg/machine_specification.dtg.h" #include "pcg/machine_specification.h" +#include "pcg/machine_specification_dimension.dtg.h" +#include "pcg/machine_view_dimension.dtg.h" +#include "pcg/operator_task_space.dtg.h" #include "pcg/operator_task_space.h" +#include "pcg/stride_t.dtg.h" #include "utils/containers/contains.h" #include "utils/containers/count.h" #include "utils/containers/filter.h" +#include "utils/containers/get_only.h" #include "utils/containers/scanl.h" #include "utils/containers/sum.h" #include "utils/containers/transform.h" #include "utils/containers/zip.h" - +#include "utils/exception.h" namespace FlexFlow { size_t num_dims(MachineView const &mv) { @@ -35,6 +42,13 @@ MachineView machine_view_from_strides_and_machine_spec_dimensions( MachineSpaceCoordinate const &start, std::vector const &strides, std::vector const &dims) { + if (strides.size() != dims.size()) { + throw mk_runtime_error(fmt::format( + "Length of strides ({}) and dims ({}) must match when calling " + "machine_view_from_strides_and_machine_spec_dimensions", + start, + strides)); + } std::vector dimensions = transform(zip(strides, dims), [&](auto const &p) { return MachineViewDimension{p.first, p.second}; @@ -48,6 +62,14 @@ std::optional get_machine_space_coordinate( TaskSpaceCoordinate const &coord, MachineSpecification const &machine_specification) { + if (num_dims(machine_view) != task.degrees.size()) { + throw mk_runtime_error( + fmt::format("Dimension of machine_view ({}) must match dimension of " + "task ({}) when computing machine space coordinate", + machine_view, + task.degrees)); + } + auto get_dimension_indices_for_dimension = [&](MachineSpecificationDimension dimension) { std::vector mv_dimensions = @@ -106,10 +128,37 @@ std::unordered_set get_machine_space_coordinates( MachineSpecification const &machine_specification) { return transform( get_task_space_coordinates(task), [&](TaskSpaceCoordinate const &coord) { - return get_machine_space_coordinate( - task, machine_view, coord, machine_specification) - .value(); + std::optional maybe_coordinate = + get_machine_space_coordinate( + task, machine_view, coord, machine_specification); + if (!maybe_coordinate.has_value()) { + throw mk_runtime_error( + fmt::format("In get_machine_space_coordinates, the given " + "OperatorTaskSpace {} and MachineView {} are not " + "compatible with the given MachineSpecification {}", + task, + machine_view, + machine_specification)); + } + return maybe_coordinate.value(); }); } +std::unordered_set get_device_ids(OperatorTaskSpace const &task, + MachineView const &mv, + MachineSpecification const &ms) { + return transform(get_machine_space_coordinates(task, mv, ms), + [&](MachineSpaceCoordinate const &coord) { + return get_device_id(ms, coord); + }); +} + +MachineView make_1d_machine_view(MachineSpaceCoordinate const &start, + MachineSpecificationDimension const &dim, + stride_t stride) { + + return machine_view_from_strides_and_machine_spec_dimensions( + start, {stride}, {dim}); +} + } // namespace FlexFlow diff --git a/lib/pcg/src/pcg/operator_task_space.cc b/lib/pcg/src/pcg/operator_task_space.cc index 2538cb4ea0..7157b75082 100644 --- a/lib/pcg/src/pcg/operator_task_space.cc +++ b/lib/pcg/src/pcg/operator_task_space.cc @@ -1,12 +1,19 @@ #include "pcg/operator_task_space.h" +#include "op-attrs/parallel_tensor_shape.dtg.h" +#include "op-attrs/parallel_tensor_shape.h" +#include "pcg/operator_task_space.dtg.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph.h" +#include "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h" +#include "pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.h" #include "utils/containers/cartesian_product.h" +#include "utils/containers/extend.h" #include "utils/containers/maximum.h" #include "utils/containers/product.h" #include "utils/containers/range.h" #include "utils/containers/transform.h" #include "utils/containers/unordered_set_of.h" +#include "utils/containers/vector_of.h" #include "utils/fmt/unordered_set.h" - namespace FlexFlow { std::unordered_set @@ -36,4 +43,16 @@ size_t num_tasks(OperatorTaskSpace const &task) { return product(task.degrees); } +OperatorTaskSpace get_operator_task_space(ParallelComputationGraph const &pcg, + parallel_layer_guid_t const &layer) { + parallel_tensor_guid_t out_tensor = get_layer_outputs(pcg, layer).at(0); + ParallelTensorShape shape = get_parallel_tensor_shape(pcg, out_tensor); + + std::vector degrees; + extend(degrees, vector_of(ff_ordered_shard_degrees(shape))); + degrees.push_back(get_sum_degree(shape)); + degrees.push_back(get_discard_copy_degree(shape)); + return OperatorTaskSpace{degrees}; +} + } // namespace FlexFlow diff --git a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph.cc b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph.cc index 781c44640c..4cc0500fa2 100644 --- a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph.cc +++ b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph.cc @@ -1,15 +1,25 @@ #include "pcg/parallel_computation_graph/parallel_computation_graph.h" #include "op-attrs/get_incoming_tensor_roles.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph_edge.dtg.h" +#include "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h" #include "utils/containers/filtrans.h" #include "utils/containers/get_only.h" #include "utils/containers/transform.h" +#include "utils/containers/unordered_set_of.h" #include "utils/graph/dataflow_graph/algorithms.h" #include "utils/graph/dataflow_graph/algorithms/get_dataflow_edges_from_node_to_node.h" +#include "utils/graph/dataflow_graph/algorithms/get_incoming_edges.h" +#include "utils/graph/dataflow_graph/algorithms/get_outgoing_edges.h" +#include "utils/graph/dataflow_graph/dataflow_edge.dtg.h" +#include "utils/graph/digraph/algorithms.h" #include "utils/graph/digraph/algorithms/get_topological_ordering.h" #include "utils/graph/instances/unordered_set_labelled_open_dataflow_graph.h" #include "utils/graph/labelled_dataflow_graph/algorithms/find_isomorphism.h" #include "utils/graph/labelled_dataflow_graph/algorithms/rewrite_node_labels.h" #include "utils/graph/node/algorithms.h" +#include "utils/graph/node/node.dtg.h" +#include namespace FlexFlow { @@ -66,6 +76,13 @@ ParallelLayerAddedResult /*output_labels=*/{tensor_attrs}); } +std::unordered_set + get_edges(ParallelComputationGraph const &pcg) { + return transform(get_edges(pcg.raw_graph), [](DataflowEdge const &e) { + return ParallelComputationGraphEdge{e}; + }); +} + std::unordered_set get_pcg_edges_from_layer_to_layer(ParallelComputationGraph const &pcg, parallel_layer_guid_t const &src, @@ -78,6 +95,33 @@ std::unordered_set }); } +std::unordered_set + get_outgoing_edges(ParallelComputationGraph const &pcg, + parallel_layer_guid_t const &l) { + std::unordered_set raw_edges = + get_outgoing_edges(pcg.raw_graph, l.raw_graph_node); + return transform(raw_edges, [](DataflowEdge const &e) { + return ParallelComputationGraphEdge{e}; + }); +} + +std::unordered_set + get_incoming_edges(ParallelComputationGraph const &pcg, + parallel_layer_guid_t const &l) { + std::unordered_set raw_edges = + unordered_set_of(get_incoming_edges(pcg.raw_graph, l.raw_graph_node)); + return transform(raw_edges, [](DataflowEdge const &e) { + return ParallelComputationGraphEdge{e}; + }); +} + +std::unordered_set + get_initial_layers(ParallelComputationGraph const &pcg) { + std::unordered_set raw_sources = get_sources(pcg.raw_graph); + return transform(raw_sources, + [](Node const &n) { return parallel_layer_guid_t{n}; }); +} + std::vector get_incoming_tensors(ParallelComputationGraph const &pcg, parallel_layer_guid_t const &l) { diff --git a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_edge.cc b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_edge.cc index dca8154eb4..d30739486e 100644 --- a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_edge.cc +++ b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_edge.cc @@ -1,4 +1,5 @@ #include "pcg/parallel_computation_graph/parallel_computation_graph_edge.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph_edge.dtg.h" namespace FlexFlow { diff --git a/lib/pcg/test/src/pcg/machine_view.cc b/lib/pcg/test/src/pcg/machine_view.cc index dcf22d6c00..3e9d48fac3 100644 --- a/lib/pcg/test/src/pcg/machine_view.cc +++ b/lib/pcg/test/src/pcg/machine_view.cc @@ -1,4 +1,5 @@ #include "pcg/machine_view.h" +#include "pcg/gpu_id_t.dtg.h" #include "test/utils/doctest/fmt/optional.h" #include "utils/containers/transform.h" #include "utils/fmt/unordered_set.h" @@ -298,4 +299,94 @@ TEST_SUITE(FF_TEST_SUITE) { } } } + + TEST_CASE("get_device_ids") { + + SUBCASE("1D machine view") { + + // This operator has shape (3,), and thus 3 tasks. + // The (only) dimension is projected onto the INTRA (device) dimension + // with a stride of 2. The start of the projection defined by MachineView + // is at MachineSpaceCoordinate (0, 1), and the machine space has 1 node + // and 6 devices per node. + + /** + * The tasks will thus be distributed like this: + * +-------+-------+-------+-------+-------+-------+ + * | 0 | ((1)) | 2 | ((3)) | 4 | ((5)) | + * +-------+-------+-------+-------+-------+-------+ + * Where the integers are the device ids and ((x)) are the devices we + * select + */ + MachineSpecification ms = + MachineSpecification{/*num_nodes=*/1, + /*num_cpus_per_node=*/6, + /*num_gpus_per_node=*/6, + /*inter_node_bandwidth=*/0, + /*intra_node_bandwidth=*/0}; + + OperatorTaskSpace task = OperatorTaskSpace{{3}}; + MachineView mv = MachineView{ + MachineSpaceCoordinate{ + /*node_idx=*/0, /*device_idx=*/1, DeviceType::GPU}, + {MachineViewDimension{stride_t{2}, + MachineSpecificationDimension::INTRA_NODE}}}; + + std::unordered_set correct = { + device_id_t{gpu_id_t{1}}, + device_id_t{gpu_id_t{3}}, + device_id_t{gpu_id_t{5}}, + }; + std::unordered_set result = get_device_ids(task, mv, ms); + CHECK(result == correct); + } + + SUBCASE("2D machine view") { + // This operator has shape (2, 2), and thus 2 * 2 = 4 tasks. + // - The first dimension is projected onto the INTER (node) dimension with + // stride 1, + // - The second dimension is projected onto the INTRA (device) dimension + // with stride 2. The start of the projection defined by MachineView is at + // MachineSpaceCoordinate (1, 2), and the machine space has 3 nodes and 5 + // devices per node. + + /** + * The tasks will thus be distributed like this: + * +-------+-------+-------+-------+-------+ + * | 0 | 1 | 2 | 3 | 4 | + * +-------+-------+-------+-------+-------+ + * | 5 | 6 | ((7)) | 8 | ((9)) | + * +-------+-------+-------+-------+-------+ + * | 10 | 11 | ((12))| 13 | ((14))| + * +-------+-------+-------+-------+-------+ + * Where the integers are the device ids and ((x)) are the devices we + * select + */ + + MachineSpecification ms = + MachineSpecification{/*num_nodes=*/3, + /*num_cpus_per_node=*/5, + /*num_gpus_per_node=*/5, + /*inter_node_bandwidth=*/0, + /*intra_node_bandwidth=*/0}; + + OperatorTaskSpace task = OperatorTaskSpace{{2, 2}}; + MachineView mv = MachineView{ + MachineSpaceCoordinate{ + /*node_idx=*/1, /*device_idx=*/2, DeviceType::GPU}, + {MachineViewDimension{stride_t{1}, + MachineSpecificationDimension::INTER_NODE}, + MachineViewDimension{stride_t{2}, + MachineSpecificationDimension::INTRA_NODE}}}; + + std::unordered_set correct = { + device_id_t{gpu_id_t{7}}, + device_id_t{gpu_id_t{9}}, + device_id_t{gpu_id_t{12}}, + device_id_t{gpu_id_t{14}}, + }; + std::unordered_set result = get_device_ids(task, mv, ms); + CHECK(result == correct); + } + } } diff --git a/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph.cc b/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph.cc index fc07edf5b3..dd8308561f 100644 --- a/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph.cc +++ b/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph.cc @@ -36,8 +36,8 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_tensor_guid_t tensor3 = get_only(layer3_added.outputs); std::vector result = topological_ordering(pcg); - // std::vector correct = {layer1, layer2, layer3}; - // CHECK(result == correct); + std::vector correct = {layer1, layer2, layer3}; + CHECK(result == correct); } TEST_CASE( @@ -105,6 +105,82 @@ TEST_SUITE(FF_TEST_SUITE) { } } + TEST_CASE( + "get_source_layer(ParallelComputationGraph, parallel_tensor_guid_t)") { + ParallelTensorShape tensor_shape = ParallelTensorShape{ + ParallelTensorDims{ + FFOrdered{ + ShardParallelDim{10, 2}, + ShardParallelDim{12, 1}, + }, + ReplicaParallelDimSet{ + SumDegree{1}, + DiscardCopyDegree{1}, + }, + }, + DataType::FLOAT, + }; + + ParallelComputationGraph pcg = empty_parallel_computation_graph(); + + ParallelLayerAttrs layer_label = some(); + ParallelTensorAttrs tensor_label = some(); + + SUBCASE("single layer") { + ParallelLayerAddedResult layer1_added = + add_parallel_layer(pcg, layer_label, {}, {tensor_label}); + parallel_layer_guid_t layer1 = layer1_added.parallel_layer; + parallel_tensor_guid_t tensor1 = get_only(layer1_added.outputs); + + parallel_layer_guid_t result = get_source_layer(pcg, tensor1); + parallel_layer_guid_t correct = layer1; + CHECK(result == correct); + } + + SUBCASE("two connected layers") { + ParallelLayerAddedResult layer1_added = + add_parallel_layer(pcg, layer_label, {}, {tensor_label}); + parallel_layer_guid_t layer1 = layer1_added.parallel_layer; + parallel_tensor_guid_t tensor1 = get_only(layer1_added.outputs); + + ParallelLayerAddedResult layer2_added = + add_parallel_layer(pcg, layer_label, {tensor1}, {tensor_label}); + parallel_layer_guid_t layer2 = layer2_added.parallel_layer; + + parallel_layer_guid_t result = get_source_layer(pcg, tensor1); + parallel_layer_guid_t correct = layer1; + CHECK(result == correct); + } + + SUBCASE("three layers in series") { + ParallelLayerAddedResult layer1_added = + add_parallel_layer(pcg, layer_label, {}, {tensor_label}); + parallel_layer_guid_t layer1 = layer1_added.parallel_layer; + parallel_tensor_guid_t tensor1 = get_only(layer1_added.outputs); + + ParallelLayerAddedResult layer2_added = + add_parallel_layer(pcg, layer_label, {tensor1}, {tensor_label}); + parallel_layer_guid_t layer2 = layer2_added.parallel_layer; + parallel_tensor_guid_t tensor2 = get_only(layer2_added.outputs); + + ParallelLayerAddedResult layer3_added = + add_parallel_layer(pcg, layer_label, {tensor2}, {tensor_label}); + parallel_layer_guid_t layer3 = layer3_added.parallel_layer; + + SUBCASE("tensor 1") { + parallel_layer_guid_t result = get_source_layer(pcg, tensor1); + parallel_layer_guid_t correct = layer1; + CHECK(result == correct); + } + + SUBCASE("tensor 2") { + parallel_layer_guid_t result = get_source_layer(pcg, tensor2); + parallel_layer_guid_t correct = layer2; + CHECK(result == correct); + } + } + } + TEST_CASE( "get_incoming_weights(ParallelComputationGraph, parallel_layer_guid_t)") { ParallelTensorShape input_shape = ParallelTensorShape{ diff --git a/lib/runtime/src/parallel_compuation_graph.cc b/lib/runtime/src/parallel_compuation_graph.cc deleted file mode 100644 index ebc5ac1e8e..0000000000 --- a/lib/runtime/src/parallel_compuation_graph.cc +++ /dev/null @@ -1,7 +0,0 @@ -#include "parallel_computation_graph.h" - -namespace FlexFlow { - -ParallelTensor ParallelComputationGraph::{} - -} // namespace FlexFlow diff --git a/lib/utils/include/utils/archetypes/value_type.h b/lib/utils/include/utils/archetypes/value_type.h index 1635747612..e45b8fda7e 100644 --- a/lib/utils/include/utils/archetypes/value_type.h +++ b/lib/utils/include/utils/archetypes/value_type.h @@ -2,7 +2,10 @@ #define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_ARCHETYPES_VALUE_TYPE_H #include +#include #include +#include +#include namespace FlexFlow { @@ -32,6 +35,16 @@ struct value_type { } }; +template +std::string format_as(value_type const &) { + assert(false); +} + +template +std::ostream &operator<<(std::ostream &s, value_type const &x) { + assert(false); +} + } // namespace FlexFlow namespace std { diff --git a/lib/utils/include/utils/containers/lookup_in_map.h b/lib/utils/include/utils/containers/lookup_in_map.h new file mode 100644 index 0000000000..946fc589db --- /dev/null +++ b/lib/utils/include/utils/containers/lookup_in_map.h @@ -0,0 +1,27 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_LOOKUP_IN_MAP_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_LOOKUP_IN_MAP_H + +#include "utils/containers/contains.h" +#include "utils/containers/keys.h" +#include "utils/exception.h" +#include "utils/fmt/unordered_map.h" +#include +#include +#include + +namespace FlexFlow { + +template +std::function lookup_in_map(std::unordered_map const &map) { + return [map](K const &key) -> V { + if (!contains(keys(map), key)) { + throw mk_runtime_error(fmt::format( + "Key {} is not present in the underlying map {}", key, map)); + } + return map.at(key); + }; +} + +} // namespace FlexFlow + +#endif diff --git a/lib/utils/include/utils/containers/minimum.h b/lib/utils/include/utils/containers/minimum.h new file mode 100644 index 0000000000..8bdd6ea985 --- /dev/null +++ b/lib/utils/include/utils/containers/minimum.h @@ -0,0 +1,21 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_MINIMUM_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_MINIMUM_H + +#include "utils/exception.h" +#include + +namespace FlexFlow { + +template +typename C::value_type minimum(C const &c) { + if (c.empty()) { + throw mk_runtime_error( + fmt::format("minimum expected non-empty container but received {}", c)); + } + + return *std::min_element(c.begin(), c.end()); +} + +} // namespace FlexFlow + +#endif diff --git a/lib/utils/include/utils/deduplicated_priority_queue.h b/lib/utils/include/utils/deduplicated_priority_queue.h index 66f6e524d4..afad3f5889 100644 --- a/lib/utils/include/utils/deduplicated_priority_queue.h +++ b/lib/utils/include/utils/deduplicated_priority_queue.h @@ -3,6 +3,7 @@ #include "utils/containers/contains.h" #include +#include #include #include @@ -38,6 +39,16 @@ class DeduplicatedPriorityQueue { impl.pop(); } + std::set contents() const { + auto temp = impl; + std::set result; + while (!temp.empty()) { + result.insert(temp.top()); + temp.pop(); + } + return result; + } + private: std::priority_queue impl; std::unordered_set hashmap; diff --git a/lib/utils/include/utils/graph/dataflow_graph/algorithms/get_outgoing_edges.h b/lib/utils/include/utils/graph/dataflow_graph/algorithms/get_outgoing_edges.h new file mode 100644 index 0000000000..a8b5efe66e --- /dev/null +++ b/lib/utils/include/utils/graph/dataflow_graph/algorithms/get_outgoing_edges.h @@ -0,0 +1,16 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_GRAPH_DATAFLOW_GRAPH_ALGORITHMS_GET_OUTGOING_EDGES_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_GRAPH_DATAFLOW_GRAPH_ALGORITHMS_GET_OUTGOING_EDGES_H + +#include "utils/graph/dataflow_graph/dataflow_graph_view.h" + +namespace FlexFlow { + +std::unordered_set get_outgoing_edges(DataflowGraphView const &, + Node const &); +std::unordered_set + get_outgoing_edges(DataflowGraphView const &, + std::unordered_set const &); + +} // namespace FlexFlow + +#endif diff --git a/lib/utils/src/utils/containers/lookup_in_map.cc b/lib/utils/src/utils/containers/lookup_in_map.cc new file mode 100644 index 0000000000..a0d7db8e82 --- /dev/null +++ b/lib/utils/src/utils/containers/lookup_in_map.cc @@ -0,0 +1,12 @@ +#include "utils/containers/lookup_in_map.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using K = value_type<0>; +using V = value_type<1>; + +template std::function + lookup_in_map(std::unordered_map const &map); + +} // namespace FlexFlow diff --git a/lib/utils/src/utils/containers/minimum.cc b/lib/utils/src/utils/containers/minimum.cc new file mode 100644 index 0000000000..c9bbc7706f --- /dev/null +++ b/lib/utils/src/utils/containers/minimum.cc @@ -0,0 +1 @@ +#include "utils/containers/minimum.h" diff --git a/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_outgoing_edges.cc b/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_outgoing_edges.cc new file mode 100644 index 0000000000..2376e4897f --- /dev/null +++ b/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_outgoing_edges.cc @@ -0,0 +1,28 @@ +#include "utils/graph/dataflow_graph/algorithms/get_outgoing_edges.h" +#include "utils/containers/sorted_by.h" + +namespace FlexFlow { + +std::unordered_set get_outgoing_edges(DataflowGraphView const &g, + Node const &n) { + return g.query_edges(DataflowEdgeQuery{ + {n}, + query_set::matchall(), + query_set::matchall(), + query_set::matchall(), + }); +} + +std::unordered_set + get_outgoing_edges(DataflowGraphView const &g, + std::unordered_set const &ns) { + DataflowEdgeQuery query = DataflowEdgeQuery{ + query_set{ns}, + query_set::matchall(), + query_set::matchall(), + query_set::matchall(), + }; + return g.query_edges(query); +} + +} // namespace FlexFlow diff --git a/lib/utils/test/src/utils/containers/lookup_in_map.cc b/lib/utils/test/src/utils/containers/lookup_in_map.cc new file mode 100644 index 0000000000..9ca356ee4b --- /dev/null +++ b/lib/utils/test/src/utils/containers/lookup_in_map.cc @@ -0,0 +1,31 @@ +#include "utils/containers/lookup_in_map.h" +#include +#include +#include + +using namespace FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + + TEST_CASE("lookup_in_map") { + + std::unordered_map map = {{"a", 1}, {"b", 2}}; + + SUBCASE("existing keys") { + std::function func = lookup_in_map(map); + CHECK(func("a") == 1); + CHECK(func("b") == 2); + } + + SUBCASE("missing key") { + std::function func = lookup_in_map(map); + CHECK_THROWS(func("c")); + } + + SUBCASE("empty map") { + std::unordered_map map = {}; + std::function func = lookup_in_map(map); + CHECK_THROWS(func("a")); + } + } +} diff --git a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_incoming_edges.cc b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_incoming_edges.cc new file mode 100644 index 0000000000..86e4802cdb --- /dev/null +++ b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_incoming_edges.cc @@ -0,0 +1,51 @@ +#include "utils/graph/dataflow_graph/algorithms/get_incoming_edges.h" +#include "utils/containers/get_only.h" +#include "utils/graph/dataflow_graph/dataflow_graph.h" +#include "utils/graph/instances/unordered_set_dataflow_graph.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("get_incoming_edges(DataflowGraphView, Node)") { + DataflowGraph g = DataflowGraph::create(); + + NodeAddedResult n1_added = g.add_node({}, 1); + Node n1 = n1_added.node; + DataflowOutput o1 = get_only(n1_added.outputs); + + NodeAddedResult n2_added = g.add_node({}, 1); + Node n2 = n2_added.node; + DataflowOutput o2 = get_only(n2_added.outputs); + + NodeAddedResult n3_added = g.add_node({o2}, 1); + Node n3 = n3_added.node; + DataflowOutput o3 = get_only(n3_added.outputs); + + NodeAddedResult n4_added = g.add_node({o2, o3}, 1); + Node n4 = n4_added.node; + DataflowOutput o4 = get_only(n4_added.outputs); + + SUBCASE("n4 - multiple incoming edges") { + std::vector result = get_incoming_edges(g, n4); + std::vector correct = { + DataflowEdge{o2, DataflowInput{n4, 0}}, + DataflowEdge{o3, DataflowInput{n4, 1}}}; + CHECK(result == correct); + } + + SUBCASE("n3- single incoming edge") { + std::vector result = get_incoming_edges(g, n3); + std::vector correct = { + DataflowEdge{o2, DataflowInput{n3, 0}}, + }; + CHECK(result == correct); + } + + SUBCASE("n1- no incoming edges") { + std::vector result = get_incoming_edges(g, n1); + std::vector correct = {}; + CHECK(result == correct); + } + } +} diff --git a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_outgoing_edges.cc b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_outgoing_edges.cc new file mode 100644 index 0000000000..be874b7e29 --- /dev/null +++ b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_outgoing_edges.cc @@ -0,0 +1,90 @@ +#include "utils/graph/dataflow_graph/algorithms/get_outgoing_edges.h" +#include "utils/containers/get_only.h" +#include "utils/graph/dataflow_graph/dataflow_graph.h" +#include "utils/graph/instances/unordered_set_dataflow_graph.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("get_outgoing_edges(DataflowGraphView, Node)") { + DataflowGraph g = DataflowGraph::create(); + + NodeAddedResult n1_added = g.add_node({}, 1); + Node n1 = n1_added.node; + DataflowOutput o1 = get_only(n1_added.outputs); + + NodeAddedResult n2_added = g.add_node({o1}, 1); + Node n2 = n2_added.node; + DataflowOutput o2 = get_only(n2_added.outputs); + + NodeAddedResult n3_added = g.add_node({o1}, 1); + Node n3 = n3_added.node; + DataflowOutput o3 = get_only(n3_added.outputs); + + NodeAddedResult n4_added = g.add_node({o2}, 1); + Node n4 = n4_added.node; + DataflowOutput o4 = get_only(n4_added.outputs); + + SUBCASE("n2 - single outgoing edge") { + std::unordered_set result = get_outgoing_edges(g, n2); + std::unordered_set correct = { + DataflowEdge{o2, DataflowInput{n4, 0}}, + }; + CHECK(result == correct); + } + + SUBCASE("n1 - multiple outgoing edges") { + std::unordered_set result = get_outgoing_edges(g, n1); + std::unordered_set correct = { + DataflowEdge{o1, DataflowInput{n2, 0}}, + DataflowEdge{o1, DataflowInput{n3, 0}}, + }; + CHECK(result == correct); + } + + SUBCASE("n4 - no outgoing edges") { + std::unordered_set result = get_outgoing_edges(g, n4); + std::unordered_set correct = {}; + CHECK(result == correct); + } + } + + TEST_CASE("get_outgoing_edges(DataflowGraphView, std::unordered_set)") { + DataflowGraph g = DataflowGraph::create(); + + NodeAddedResult n1_added = g.add_node({}, 1); + Node n1 = n1_added.node; + DataflowOutput o1 = get_only(n1_added.outputs); + + NodeAddedResult n2_added = g.add_node({o1}, 1); + Node n2 = n2_added.node; + DataflowOutput o2 = get_only(n2_added.outputs); + + NodeAddedResult n3_added = g.add_node({o1}, 1); + Node n3 = n3_added.node; + DataflowOutput o3 = get_only(n3_added.outputs); + + NodeAddedResult n4_added = g.add_node({o2}, 1); + Node n4 = n4_added.node; + DataflowOutput o4 = get_only(n4_added.outputs); + + SUBCASE("multiple nodes - combined outgoing edges") { + std::unordered_set nodes = {n1, n2}; + std::unordered_set result = get_outgoing_edges(g, nodes); + std::unordered_set correct = { + DataflowEdge{o1, DataflowInput{n2, 0}}, + DataflowEdge{o1, DataflowInput{n3, 0}}, + DataflowEdge{o2, DataflowInput{n4, 0}}, + }; + CHECK(result == correct); + } + + SUBCASE("multiple nodes - no outgoing edges") { + std::unordered_set nodes = {n3, n4}; + std::unordered_set result = get_outgoing_edges(g, nodes); + std::unordered_set correct = {}; + CHECK(result == correct); + } + } +}