diff --git a/examples/gaudi_spawn.py b/examples/gaudi_spawn.py index 4e6d23eab5..c0c8014076 100644 --- a/examples/gaudi_spawn.py +++ b/examples/gaudi_spawn.py @@ -56,6 +56,7 @@ def parse_args(): parser.add_argument("--hostfile", type=str, default=None, help="Path to the file where hosts are specified.") parser.add_argument("--use_mpi", action="store_true", help="Use MPI for distributed training") parser.add_argument("--use_deepspeed", action="store_true", help="Use DeepSpeed for distributed training") + parser.add_argument("--master_port", type=int, default=29500, help="Master port used by DeepSpeed and MPI") # positional parser.add_argument( @@ -99,6 +100,7 @@ def main(): hostfile=args.hostfile, use_mpi=args.use_mpi, use_deepspeed=args.use_deepspeed, + master_port=args.master_port, ) ret_code = distributed_runner.run() diff --git a/examples/text-generation/README.md b/examples/text-generation/README.md index cafe16d897..bc57fe30f3 100644 --- a/examples/text-generation/README.md +++ b/examples/text-generation/README.md @@ -42,6 +42,15 @@ To run generation with DeepSpeed-inference, you must launch the script as follow python ../gaudi_spawn.py --use_deepspeed --world_size number_of_devices run_generation.py ARGS ``` +To run multiple DeepSpeed tasks simultaneously, launch each one with its own `--master_port` and its own set of [`HABANA_VISIBLE_MODULES`](https://docs.habana.ai/en/latest/PyTorch/PT_Multiple_Tenants_on_HPU/Multiple_Dockers_each_with_Single_Workload.html#running-distributed-workload-inside-the-docker-container), for example: + +```bash +# the following tasks can run simultaneously in a container with 8 HPUs +HABANA_VISIBLE_MODULES="0,1" python ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py ARGS # using the default master_port=29500 +HABANA_VISIBLE_MODULES="2,3,4,5" python ../gaudi_spawn.py --use_deepspeed --world_size 4 --master_port 29501 run_generation.py ARGS +HABANA_VISIBLE_MODULES="6,7" python ../gaudi_spawn.py --use_deepspeed
--world_size 2 --master_port 29502 run_generation.py ARGS +``` + Without DeepSpeed-inference, you can run the script with: ```bash diff --git a/optimum/habana/distributed/distributed_runner.py b/optimum/habana/distributed/distributed_runner.py index 251a8f3fc4..91911b826c 100644 --- a/optimum/habana/distributed/distributed_runner.py +++ b/optimum/habana/distributed/distributed_runner.py @@ -41,6 +41,7 @@ def __init__( hostfile: Union[str, Path] = None, use_mpi: bool = False, use_deepspeed: bool = False, + master_port: int = 29500, use_env: bool = False, map_by: bool = "socket", multi_hls=None, @@ -68,6 +69,7 @@ def __init__( self._world_size = world_size self._hostfile = hostfile self._map_by = map_by + self._master_port = master_port self._use_env = use_env self._interpreter = f"{sys.executable} " @@ -99,7 +101,7 @@ def __init__( elif use_mpi: # Single-node multi-card run with MPI self._model_env_vars["MASTER_ADDR"] = "localhost" - self._model_env_vars["MASTER_PORT"] = "12345" + self._model_env_vars["MASTER_PORT"] = self._master_port self.create_single_node_setup_mpirun() else: # Single-node multi-card run with torch.distributed @@ -148,7 +150,7 @@ def create_single_card_setup(self, use_deepspeed=False): """ if use_deepspeed: - self._interpreter = "deepspeed --num_gpus 1 " + self._interpreter = f"deepspeed --num_gpus 1 --master_port {self._master_port} " else: self._interpreter = f"{sys.executable} " @@ -168,7 +170,9 @@ def create_single_node_setup_deepspeed(self): Single-node multi-card configuration setup for DeepSpeed. 
""" - self._interpreter = f"deepspeed --num_nodes 1 --num_gpus {self._world_size} --no_local_rank " + self._interpreter = ( + f"deepspeed --num_nodes 1 --num_gpus {self._world_size} --no_local_rank --master_port {self._master_port} " + ) def create_single_node_setup(self): """ @@ -187,7 +191,7 @@ def create_multi_node_setup(self): """ master_addr = self.process_hostfile() - self._interpreter = f"deepspeed --hostfile {self._hostfile} --master_addr {master_addr} --no_local_rank " + self._interpreter = f"deepspeed --hostfile {self._hostfile} --master_addr {master_addr} --no_local_rank --master_port {self._master_port} " def run(self): """