PaddlePaddle · JZ-LIANG · Nov 30, 2021 · Nov 10, 2021 · Nov 10, 2021 · Nov 10, 2021
diff --git a/python/paddle/distributed/auto_parallel/mapper.py b/python/paddle/distributed/auto_parallel/mapper.py
@@ -0,0 +1,294 @@
+#   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License
+
+import operator
+import functools
+import json
+import paddle
+from collections import deque
+from .graph import Node
+from .graph import Edge
+from .graph import Graph
+from .cluster import DeviceType
+from .process_group import get_process_group
+
+
+def is_collective_comm_op(op):
+    comm_list = [
+        "c_allreduce_sum", "c_allreduce_min", "c_allreduce_max",
+        "c_allreduce_prod", "c_reduce_sum", "c_reduce_min", "c_reduce_max",
+        "c_reduce_prod", "c_broadcast", "c_allgather"
+    ]
+    if op.type in comm_list:
+        return True
+    else:
+        return False
+
+
+def is_p2p_comm_op(op):
+    comm_list = ["send_v2", "recv_v2"]
+    if op.type in comm_list:
+        return True
+    else:
+        return False
+
+
+def get_dtype_bytes(dtype):
+    num_bytes = 0
+    if dtype == paddle.float64:
+        num_bytes = 8
+    elif dtype == paddle.float32:
+        num_bytes = 4
+    elif dtype == paddle.float16:
+        num_bytes = 2
+    elif dtype == paddle.bfloat16:
+        num_bytes = 2
+    elif dtype == paddle.int64:
+        num_bytes = 8
+    elif dtype == paddle.int32:
+        num_bytes = 4
+    elif dtype == paddle.int16:
+        num_bytes = 2
+    elif dtype == paddle.int8:
+        num_bytes = 1
+    elif dtype == paddle.uint8:
+        num_bytes = 1
+    else:
+        raise ValueError("Unrecognized dtype {}.".format(dtype))
+    return num_bytes
+
+
+def get_comm_volume(comm_op, src_rank, tgt_rank):
+    comm_volume = None
+    if src_rank == tgt_rank:
+        return comm_volume
+    comm_op_type = comm_op.type
+    if comm_op_type != "recv_v2":
+        tensor_name = comm_op.input_arg_names[0]
+    else:
+        tensor_name = comm_op.output_arg_names[0]
+    tensor = comm_op.block._find_var_recursive(tensor_name)
+    assert tensor is not None
+    tensor_shape = tensor.shape
+    # Skip the batch dim
+    new_tensor_shape = []
+    for val in tensor_shape:
+        if val == -1:
+            print("Warning: -1 in the tensor shape.")
+            new_tensor_shape.append(1)
+        else:
+            new_tensor_shape.append(val)
+    tensor_size = functools.reduce(operator.mul, new_tensor_shape, 1)
+    tensor_bytes = tensor_size * get_dtype_bytes(tensor.dtype)
+    if "c_allreduce" in comm_op_type:
+        comm_volume = 2 * tensor_bytes
+    elif "c_allgather" in comm_op_type:
+        comm_volume = tensor_bytes
+    elif "c_broadcast" in comm_op_type:
+        if comm_op.attr("root") == src_rank:
+            comm_volume = tensor_bytes
+        else:
+            comm_volume = None
+    elif "c_reduce" in comm_op_type:
+        if comm_op.attr("root_id") == src_rank:
+            comm_volume = None
+        else:
+            comm_volume = tensor_bytes
+    elif "send_v2" in comm_op_type:
+        if comm_op.attr("peer") == tgt_rank:
+            comm_volume = tensor_bytes
+        else:
+            comm_volume = None
+    elif "recv_v2" in comm_op_type:
+        comm_volume = None
+    else:
+        raise ValueError("Unrecognized communication operator.")
+    return comm_volume
+
+
+def analyze_comm_requirements_from_op(op, rank):
+    comm_requirements_to_ranks = {}
+    if is_collective_comm_op(op):
+        process_group_id = op.attr("ring_id")
+        process_group = get_process_group(process_group_id)
+        if rank not in process_group.ranks:
+            return comm_requirements_to_ranks
+        for tgt_rank in process_group.ranks:
+            comm_volume = get_comm_volume(op, rank, tgt_rank)
+            if comm_volume is not None:
+                comm_requirements_to_ranks[tgt_rank] = {}
+                comm_requirements_to_ranks[tgt_rank][
+                    "comm_volume"] = comm_volume
+    elif is_p2p_comm_op(op):
+        tgt_rank = op.attr("peer")
+        comm_volume = get_comm_volume(op, rank, tgt_rank)
+        if comm_volume is not None:
+            comm_requirements_to_ranks[tgt_rank] = {}
+            comm_requirements_to_ranks[tgt_rank]["comm_volume"] = comm_volume
+    else:
+        comm_requirements_to_ranks = {}
+    return comm_requirements_to_ranks
+
+
+def analyze_requirements_for_program(program, rank):
+    resource_requirements = {}
+    comm_requirements_to_ranks = {}
+    # only support device_type and only support GPU for now
+    resource_requirements["device_type"] = DeviceType.GPU
+    for block in program.blocks:
+        for op in block.ops:
+            cur_comm_requirements_to_ranks = analyze_comm_requirements_from_op(
+                op, rank)
+            for tgt_rank, link_info in cur_comm_requirements_to_ranks.items():
+                if tgt_rank in comm_requirements_to_ranks:
+                    comm_requirements_to_ranks[tgt_rank][
+                        "comm_volume"] += link_info["comm_volume"]
+                else:
+                    comm_requirements_to_ranks[tgt_rank] = {}
+                    comm_requirements_to_ranks[tgt_rank][
+                        "comm_volume"] = link_info["comm_volume"]
+    return resource_requirements, comm_requirements_to_ranks
+
+
+def build_process_graph(distributed_program):
+    graph = Graph()
+    for src_rank, src_program in distributed_program.items():
+        resource_requirements, comm_requirements_to_ranks = analyze_requirements_for_program(
+            src_program, src_rank)
+        graph.add_node(src_rank, resource_requirements=resource_requirements)
+        for tgt_rank, comm_requirements in comm_requirements_to_ranks.items():
+            graph.add_edge(
+                src_rank, tgt_rank, comm_requirements=comm_requirements)
+    return graph
+
+
+def build_cluster_graph(cluster):
+    graph = Graph()
+    for machine in cluster.machines.values():
+        for device in machine.devices.values():
+            graph.add_node(device.global_id, device=device)
+        for link in machine.links.values():
+            graph.add_edge(
+                link.source.global_id, link.target.global_id, link=link)
+    return graph
+
+
+def mapping(distributed_program, cluster):
+    # A very simple mapping algorithm only for GPUs.
+    # Here we assume one process will be mapped to one GPU.
+    # In the future, more mapping configurations and algorithms will be supported.
+    process_graph = build_process_graph(distributed_program)
+
+    cluster_graph = build_cluster_graph(cluster)
+
+    for cur_rank_node in process_graph:
+        cur_rank_node["visited"] = False
+
+    for cur_device_node in cluster_graph:
+        cur_device_node["occupied"] = False
+
+    def sort_by_comm_volume(rank_edge):
+        return rank_edge["comm_requirements"]["comm_volume"]
+
+    def sort_by_comm_bandwidth(device_edge):
+        return device_edge["link"].bandwidth
+
+    def select_unvisited_rank_node(rank_node_list):
+        selected_rank_node = None
+        for rank_node in rank_node_list:
+            if rank_node["visited"] is False:
+                selected_rank_node = rank_node
+        return selected_rank_node
+
+    queue = deque()
+    root_rank_node = select_unvisited_rank_node(
+        list(process_graph.nodes.values()))
+    while root_rank_node is not None:
+        queue.append(root_rank_node)
+        while queue:
+            cur_rank_node = queue.popleft()
+            if cur_rank_node["visited"]:
+                continue
+            device_type = cur_rank_node["resource_requirements"]["device_type"]
+            cur_device_node = None
+            for device_node in cluster_graph.nodes.values():
+                if (device_node["device"].type == device_type) and (
+                        not device_node["occupied"]):
+                    device_node["occupied"] = True
+                    cur_rank_node["visited"] = True
+                    cur_rank_node["device"] = device_node["device"]
+                    cur_device_node = device_node
+                    break
+            assert cur_device_node, "Cannot find a device to satisfy the requirement."
+
+            nbr_rank_edges = []
+            for nbr_rank_node_id, nbr_rank_edge in process_graph.adjs[
+                    cur_rank_node.id].items():
+                assert nbr_rank_edge.src_id == cur_rank_node.id and nbr_rank_edge.tgt_id == nbr_rank_node_id
+                queue.append(process_graph.nodes[nbr_rank_node_id])
+                nbr_rank_edges.append(nbr_rank_edge)
+            nbr_rank_edges.sort(key=sort_by_comm_volume)
+
+            nbr_device_edges = []
+            for nbr_device_edge in cluster_graph.adjs[
+                    cur_device_node.id].values():
+                nbr_device_edges.append(nbr_device_edge)
+            nbr_device_edges.sort(key=sort_by_comm_bandwidth)
+
+            for nbr_rank_edge in nbr_rank_edges:
+                src_rank_node = process_graph.nodes[nbr_rank_edge.src_id][
+                    "visited"]
+                if src_rank_node:
+                    continue
+                device_type = src_rank_node["resource_requirements"][
+                    "device_type"]
+                nbr_rank_node = process_graph.nodes[nbr_rank_edge.tgt_id]
+                for nbr_device_edge in nbr_device_edges:
+                    nbr_device_node = cluster_graph.nodes[
+                        nbr_device_edge.tgt_id]
+                    if (nbr_device_node["device"].type == device_type) and (
+                            not nbr_device_node["occupied"]):
+                        nbr_device_node["occupied"] = True
+                        nbr_rank_node["visited"] = True
+                        nbr_rank_node["device"] = nbr_device_node["device"]
+                        break
+        root_rank_node = select_unvisited_rank_node(
+            list(process_graph.nodes.values()))
+
+    rank_mapping = {}
+    for rank, rank_node in process_graph.nodes.items():
+        device = rank_node["device"]
+        machine = device.machine
+        if machine.id in rank_mapping:
+            rank_mapping[machine.id]["hostname"] = machine.hostname
+            rank_mapping[machine.id]["addr"] = machine.addr
+            rank_mapping[machine.id]["port"] = machine.port
+            if rank not in rank_mapping[machine.id]["ranks"]:
+                rank_mapping[machine.id]["ranks"][rank] = []
+                rank_mapping[machine.id]["ranks"][rank].append(device.local_id)
+            else:
+                rank_mapping[machine.id]["ranks"][rank].append(device.local_id)
+        else:
+            rank_mapping[machine.id] = {}
+            rank_mapping[machine.id]["hostname"] = machine.hostname
+            rank_mapping[machine.id]["addr"] = machine.addr
+            rank_mapping[machine.id]["port"] = machine.port
+            rank_mapping[machine.id]["ranks"] = {}
+            rank_mapping[machine.id]["ranks"][rank] = []
+            rank_mapping[machine.id]["ranks"][rank].append(device.local_id)
+    for machine_mapping in rank_mapping.values():
+        for rank_devices in machine_mapping["ranks"].values():
+            rank_devices.sort()
+
+    return rank_mapping
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -144,6 +144,7 @@ if(((NOT WITH_ROCM) AND (NOT WITH_GPU)) OR WIN32)
     LIST(REMOVE_ITEM TEST_OPS test_disable_signal_handler)
     LIST(REMOVE_ITEM TEST_OPS test_fleet_executor)
     LIST(REMOVE_ITEM TEST_OPS test_fleet_executor_multi_devices)
+    LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_mapper)
     LIST(REMOVE_ITEM TEST_OPS test_fleet_executor_task_node)
 endif()