diff --git a/.gitignore b/.gitignore index cfd32b72b2..6dd8f9f0e6 100644 --- a/.gitignore +++ b/.gitignore @@ -187,6 +187,9 @@ test/filegen test/iostress test/spawn_multiple test/clichk +test/chkfs +test/chkfs +test/spawn_timeout test/mpi/spawn_multiple docs/_build diff --git a/docs/Makefile.am b/docs/Makefile.am index 1d0918189a..171bcfe17e 100644 --- a/docs/Makefile.am +++ b/docs/Makefile.am @@ -2,7 +2,7 @@ # Copyright (c) 2022-2023 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2023 Jeffrey M. Squyres. All rights reserved. # -# Copyright (c) 2023-2024 Nanook Consulting All rights reserved. +# Copyright (c) 2023-2025 Nanook Consulting All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -39,6 +39,7 @@ RST_SOURCE_FILES = \ $(srcdir)/prrte-rst-content/*.rst \ $(srcdir)/placement/*.rst \ $(srcdir)/hosts/*.rst \ + $(srcdir)/launching-apps/*.rst \ $(srcdir)/how-things-work/*.rst \ $(srcdir)/developers/*.rst \ $(srcdir)/man/*.rst \ diff --git a/docs/index.rst b/docs/index.rst index 8cd90ad858..082ec17957 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -34,6 +34,7 @@ Table of contents how-things-work/index hosts/index placement/index + launching-apps/index notifications session-directory developers/index diff --git a/docs/launching-apps/gridengine.rst b/docs/launching-apps/gridengine.rst new file mode 100644 index 0000000000..23105f1713 --- /dev/null +++ b/docs/launching-apps/gridengine.rst @@ -0,0 +1,293 @@ +Launching with Grid Engine +========================== + +PRRTE supports the family of run-time schedulers including the Sun +Grid Engine (SGE), Oracle Grid Engine (OGE), Grid Engine (GE), Son of +Grid Engine, Open Cluster Scheduler (OCS), Gridware Cluster Scheduler (GCS) +and others. + +This documentation will collectively refer to all of them as "Grid +Engine", unless a referring to a specific flavor of the Grid Engine +family. + +Verify Grid Engine support +-------------------------- + +.. important:: To build Grid Engine support in PRRTE, you will need + to explicitly request the SGE support with the ``--with-sge`` + command line switch to PRRTE's ``configure`` script. + +To verify if support for Grid Engine is configured into your PRRTE +installation, run ``prte_info`` as shown below and look for +``gridengine``. + +.. code-block:: + + shell$ prte_info | grep gridengine + MCA ras: gridengine (MCA v2.0, API v2.0, Component v1.3) + + +Launching +--------- + +When Grid Engine support is included, PRRTE will automatically +detect when it is running inside SGE and will just "do the Right +Thing." + +Specifically, if you execute an ``prterun`` command in a Grid Engine +job, it will automatically use the Grid Engine mechanisms to launch +and kill processes. There is no need to specify what nodes to run on +|mdash| PRRTE will obtain this information directly from Grid +Engine and default to a number of processes equal to the slot count +specified. For example, this will run 4 application processes on the nodes +that were allocated by Grid Engine: + +.. code-block:: sh + + # Get the environment variables for Grid Engine + + # (Assuming Grid Engine is installed at /opt/sge and $Grid + # Engine_CELL is 'default' in your environment) + shell$ . /opt/sge/default/common/settings.sh + + # Allocate an Grid Engine interactive job with 4 slots from a + # parallel environment (PE) named 'foo' and run a 4-process job + shell$ qrsh -pe foo 4 -b y prterun -n 4 mpi-hello-world + +There are also other ways to submit jobs under Grid Engine: + +.. 
code-block:: sh + + # Submit a batch job with the 'prterun' command embedded in a script + shell$ qsub -pe foo 4 my_prterun_job.csh + + # Submit a Grid Engine and application job and prterun in one line + shell$ qrsh -V -pe foo 4 prterun hostname + + # Use qstat(1) to show the status of Grid Engine jobs and queues + shell$ qstat -f + +In reference to the setup, be sure you have a Parallel Environment +(PE) defined for submitting parallel jobs. You don't have to name your +PE "foo". The following example shows a PE named "foo" that would +look like: + +.. code-block:: + + shell$ qconf -sp foo + pe_name foo + slots 99999 + user_lists NONE + xuser_lists NONE + start_proc_args NONE + stop_proc_args NONE + allocation_rule $fill_up + control_slaves TRUE + job_is_first_task FALSE + urgency_slots min + accounting_summary FALSE + qsort_args NONE + +.. note:: ``qsort_args`` is necessary with the Son of Grid Engine + distribution, version 8.1.1 and later, and probably only applicable + to it. + +.. note:: For very old versions of Sun Grid Engine, omit + ``accounting_summary`` too. + +.. note:: For Open Cluster Scheduler / Gridware Cluster Scheduler it is + necessary to set ``ign_sreq_on_mhost`` (ignoring slave resource requests + on the master node) to ``FALSE``. + +You may want to alter other parameters, but the important one is +``control_slaves``, specifying that the environment has "tight +integration". Note also the lack of a start or stop procedure. The +tight integration means that mpirun automatically picks up the slot +count to use as a default in place of the ``-n`` argument, picks up a +host file, spawns remote processes via ``qrsh`` so that Grid Engine +can control and monitor them, and creates and destroys a per-job +temporary directory (``$TMPDIR``), in which PRTE's directory will +be created (by default). + +Be sure the queue will make use of the PE that you specified: + +.. code-block:: + + shell$ qconf -sq all.q + [...snipped...] + pe_list make cre foo + [...snipped...] + +To determine whether the Grid Engine parallel job is successfully +launched to the remote nodes, you can pass in the MCA parameter +``--prtemca plm_base_verbose 1`` to ``prterun``. + +This will add in a ``-verbose`` flag to the ``qrsh -inherit`` command +that is used to send parallel tasks to the remote Grid Engine +execution hosts. It will show whether the connections to the remote +hosts are established successfully or not. + +Various Grid Engine documentation with pointers to more used to be available +at `the Son of GridEngine site `_, and +configuration instructions were found at `the Son of GridEngine +configuration how-to site +`_. This may no longer +be true. + +An actively developed (2024, 2025) open source successor of Sun Grid Engine is +`Open Cluster Scheduler `_. +It maintains backward compatibility with SGE and provides many new features. +An MPI parallel environment setup for OpenMPI is available in +`the Open Cluster Scheduler GitHub repository +`_. + +Grid Engine tight integration support of the ``qsub -notify`` flag +------------------------------------------------------------------ + +If you are running SGE 6.2 Update 3 or later, then the ``-notify`` +flag is supported. If you are running earlier versions, then the +``-notify`` flag will not work and using it will cause the job to be +killed. + +To use ``-notify``, one has to be careful. First, let us review what +``-notify`` does. Here is an excerpt from the qsub man page for the +``-notify`` flag. 
+ + The ``-notify`` flag, when set causes Sun Grid Engine to send + warning signals to a running job prior to sending the signals + themselves. If a SIGSTOP is pending, the job will receive a SIGUSR1 + several seconds before the SIGSTOP. If a SIGKILL is pending, the + job will receive a SIGUSR2 several seconds before the SIGKILL. The + amount of time delay is controlled by the notify parameter in each + queue configuration. + +Let us assume the reason you want to use the ``-notify`` flag is to +get the SIGUSR1 signal prior to getting the SIGTSTP signal. PRRTE forwards +some signals by default, but others need to be specifically requested. +The following MCA param controls this behavior: + +.. code-block:: + + prte_ess_base_forward_signals: Comma-delimited list of additional signals (names or integers) to forward to + application processes [\"none\" => forward nothing]. Signals provided by + default include SIGTSTP, SIGUSR1, SIGUSR2, SIGABRT, SIGALRM, and SIGCONT + +Within that constraint, something like this batch script can be used: + +.. code-block:: sh + + #! /bin/bash + #$ -S /bin/bash + #$ -V + #$ -cwd + #$ -N Job1 + #$ -pe foo 16 + #$ -j y + #$ -l h_rt=00:20:00 + prterun -n 16 mpi-hello-world + +However, one has to make one of two changes to this script for things +to work properly. By default, a SIGUSR1 signal will kill a shell +script. So we have to make sure that does not happen. Here is one way +to handle it: + +.. code-block:: sh + + #! /bin/bash + #$ -S /bin/bash + #$ -V + #$ -cwd + #$ -N Job1 + #$ -pe ompi 16 + #$ -j y + #$ -l h_rt=00:20:00 + exec prterun -n 16 mpi-hello-world + +Alternatively, one can catch the signals in the script instead of doing +an exec on the mpirun: + +.. code-block:: sh + + #! /bin/bash + #$ -S /bin/bash + #$ -V + #$ -cwd + #$ -N Job1 + #$ -pe ompi 16 + #$ -j y + #$ -l h_rt=00:20:00 + + function sigusr1handler() + { + echo "SIGUSR1 caught by shell script" 1>&2 + } + + function sigusr2handler() + { + echo "SIGUSR2 caught by shell script" 1>&2 + } + + trap sigusr1handler SIGUSR1 + trap sigusr2handler SIGUSR2 + + prterun -n 16 mpi-hello-world + +Grid Engine job suspend / resume support +---------------------------------------- + +To suspend the job, you send a SIGTSTP (not SIGSTOP) signal to +``prterun``. ``prterun`` will catch this signal and forward it to the +``mpi-hello-world`` as a SIGSTOP signal. To resume the job, you send +a SIGCONT signal to ``prterun`` which will be caught and forwarded to +the ``mpi-hello-world``. + +Here is an example on Solaris: + +.. code-block:: sh + + shell$ prterun -n 2 mpi-hello-world + +In another window, we suspend and continue the job: + +.. 
code-block:: sh + + shell$ prstat -p 15301,15303,15305 + PID USERNAME SIZE RSS STATE PRI NICE TIME CPU PROCESS/NLWP + 15305 rolfv 158M 22M cpu1 0 0 0:00:21 5.9% mpi-hello-world/1 + 15303 rolfv 158M 22M cpu2 0 0 0:00:21 5.9% mpi-hello-world/1 + 15301 rolfv 8128K 5144K sleep 59 0 0:00:00 0.0% mpirun/1 + + shell$ kill -TSTP 15301 + shell$ prstat -p 15301,15303,15305 + PID USERNAME SIZE RSS STATE PRI NICE TIME CPU PROCESS/NLWP + 15303 rolfv 158M 22M stop 30 0 0:01:44 21% mpi-hello-world/1 + 15305 rolfv 158M 22M stop 20 0 0:01:44 21% mpi-hello-world/1 + 15301 rolfv 8128K 5144K sleep 59 0 0:00:00 0.0% mpirun/1 + + shell$ kill -CONT 15301 + shell$ prstat -p 15301,15303,15305 + PID USERNAME SIZE RSS STATE PRI NICE TIME CPU PROCESS/NLWP + 15305 rolfv 158M 22M cpu1 0 0 0:02:06 17% mpi-hello-world/1 + 15303 rolfv 158M 22M cpu3 0 0 0:02:06 17% mpi-hello-world/1 + 15301 rolfv 8128K 5144K sleep 59 0 0:00:00 0.0% mpirun/1 + +Note that all this does is stop the ``mpi-hello-world`` processes. It +does not, for example, free any pinned memory when the job is in the +suspended state. + +To get this to work under the Grid Engine environment, you have to +change the ``suspend_method`` entry in the queue. It has to be set to +SIGTSTP. Here is an example of what a queue should look like. + +.. code-block:: sh + + shell$ qconf -sq all.q + qname all.q + [...snipped...] + starter_method NONE + suspend_method SIGTSTP + resume_method NONE + +Note that if you need to suspend other types of jobs with SIGSTOP +(instead of SIGTSTP) in this queue then you need to provide a script +that can implement the correct signals for each job type. diff --git a/docs/launching-apps/index.rst b/docs/launching-apps/index.rst new file mode 100644 index 0000000000..adea45f2c7 --- /dev/null +++ b/docs/launching-apps/index.rst @@ -0,0 +1,47 @@ +.. _label-running-applications: + +Launching applications +====================== + +PRRTE can launch processes in a wide variety of environments, +but they can generally be broken down into two categories: + +#. Scheduled environments: these are systems where a resource manager + and/or scheduler are used to control access to the compute nodes. + Popular resource managers include Slurm, PBS/Pro/Torque, and LSF. +#. Non-scheduled environments: these are systems where resource + managers are not used. Launches are typically local (e.g., on a + single laptop or workstation) or via ``ssh`` (e.g., across a small + number of nodes). + +PRRTE provides two commands for starting applications: + +#. ``prun`` - submits the specified application to an existing persistent DVM + for execution. The DVM continues execution once the application has + completed. The prun command will remain active until the application + completes. All application and error output will flow through prun. +#. ``prterun`` - starts a DVM instance and submits the specified application + to it for execution. The DVM is terminated once the application completes. + All application and error output will flow through prterun. + +The rest of this section usually refers only to ``prterun``, even though the +same discussions also apply to ``prun`` because the command line syntax +is identical. + + +.. 
toctree:: + :maxdepth: 1 + + quickstart + prerequisites + scheduling + + localhost + ssh + slurm + lsf + tm + gridengine + + unusual + troubleshooting diff --git a/docs/launching-apps/localhost.rst b/docs/launching-apps/localhost.rst new file mode 100644 index 0000000000..885883b239 --- /dev/null +++ b/docs/launching-apps/localhost.rst @@ -0,0 +1,23 @@ +Launching only on the local node +================================ + +It is common to develop applications on a single workstation or +laptop, and then move to a larger parallel / HPC environment once the +application is ready. + +PRRTE supports running multi-process jobs on a single machine. +In such cases, you can simply avoid listing a hostfile or remote +hosts, and simply list a number of processes to launch. For +example: + +.. code-block:: sh + + shell$ prterun -n 6 mpi-hello-world + Hello world, I am 0 of 6 (running on my-laptop)) + Hello world, I am 1 of 6 (running on my-laptop) + ... + Hello world, I am 5 of 6 (running on my-laptop) + +If you do not specify the ``-n`` option, ``prterun`` will default to +launching as many processes as there are processor cores (not +hyperthreads) on the machine. diff --git a/docs/launching-apps/lsf.rst b/docs/launching-apps/lsf.rst new file mode 100644 index 0000000000..f90d035a05 --- /dev/null +++ b/docs/launching-apps/lsf.rst @@ -0,0 +1,50 @@ +Launching with LSF +================== + +PRRTE supports the LSF resource manager. + +Verify LSF support +------------------ + +The ``prte_info`` command can be used to determine whether or not an +installed Open MPI includes LSF support: + +.. code-block:: + + shell$ prte_info | grep lsf + +If the PRRTE installation includes support for LSF, you +should see a line similar to that below. Note the MCA version +information varies depending on which version of PRRTE is +installed. + +.. code-block:: + + MCA ras: lsf (MCA v2.1.0, API v2.0.0, Component v3.0.0) + +Launching +--------- + +When properly configured, PRRTE obtains both the list of hosts and +how many processes to start on each host from LSF directly. Hence, it +is unnecessary to specify the ``--hostfile``, ``--host``, or ``-n`` +options to ``mpirun``. PRRTE will use LSF-native mechanisms +to launch and kill processes (``ssh`` is not required). + +For example: + +.. code-block:: sh + + # Allocate a job using 4 nodes with 2 processors per node and run the job on the nodes allocated by LSF + shell$ bsub -n 8 -R "span[ptile=2]" "prterun mpi-hello-world" + + +This will run the processes on the nodes that were allocated by +LSF. Or, if submitting a script: + +.. code-block:: sh + + shell$ cat my_script.sh + #!/bin/sh + prterun mpi-hello-world + shell$ bsub -n 8 -R "span[ptile=2]" < my_script.sh diff --git a/docs/launching-apps/prerequisites.rst b/docs/launching-apps/prerequisites.rst new file mode 100644 index 0000000000..c27bbe82d2 --- /dev/null +++ b/docs/launching-apps/prerequisites.rst @@ -0,0 +1,239 @@ +.. _running-prerequisites-label: + +Prerequisites +============= + +Successful launch of jobs by PRRTE requires the ability to +find the PRRTE daemon executables and shared libraries on all nodes at run +time. + +In general, if the PRRTE executables and libraries can be found via in system-default +search paths (i.e., without the +user needing to set or modify ``PATH`` or ``LD_LIBRARY_PATH``), then +nothing additional needs to be done. However, if the PRRTE binaries +and libraries are not found, the instructions below may be used to locate them. 
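+
+As a quick sanity check, you can confirm that the PRRTE daemon and its
+libraries resolve on a remote node. This is only a minimal sketch:
+``othernode`` is a placeholder hostname, and the paths shown assume an
+installation prefix of ``/opt/prrte``.
+
+.. code-block:: sh
+
+   # Verify the PRRTE daemon is found via the non-interactive search path
+   shell$ ssh othernode which prted
+   /opt/prrte/bin/prted
+
+   # Verify that all of the daemon's shared library dependencies resolve
+   # (no output from grep means nothing is missing)
+   shell$ ssh othernode ldd /opt/prrte/bin/prted | grep "not found"
+   shell$
+
+If either command fails, the remainder of this section describes how to
+make the PRRTE executables and libraries findable.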
+ +In general, PRRTE requires that its executables are in your +``PATH`` on every node on which you will run and if PRRTE was +compiled as dynamic libraries (which is the default), the directory +where its libraries are located must be in your ``LD_LIBRARY_PATH`` on +every node. +For example: + +* If PRRTE is installed in ``/usr/bin`` and ``/usr/lib``), that is + usually sufficient, and the user does not need to do anything extra. +* If PRRTE is installed in a location that is not searched by + default, users may need to add ``$prefix/bin`` to their ``PATH`` and + ``$libdir`` (which defaults to ``$prefix/lib``) to their + ``LD_LIBRARY_PATH``. + + .. caution:: In scheduled environments, ensuring PRRTE's + executables and libraries can be found on the node that + executes ``prterun`` may be + sufficient. + + In non-scheduled environments, users may need to set + the ``PATH`` and ``LD_LIBRARY_PATH`` environment + variables in their shell setup files (e.g., + ``$HOME/.bashrc``) so that non-interactive + ``ssh``-based logins will be able to find the PRRTE + executables and libraries. + + For example, if PRRTE was installed with a prefix of + ``/opt/prrte``, then the following should be in your + ``PATH`` and ``LD_LIBRARY_PATH`` + + .. list-table:: + :header-rows: 1 + + * - Environment variable + - Value to add + + * - ``PATH`` + - ``/opt/prrte/bin`` + + * - ``LD_LIBRARY_PATH`` + - ``/opt/prrte/lib`` + + Depending on your environment, you may need to set these + values in your shell startup files (e.g., ``.bashrc``, + ``.cshrc``, etc.). + +Additionally, PRRTE requires that jobs can be started on remote +nodes without any input from the keyboard. For example, if using +``ssh`` as the remote agent, you must have your environment setup to +allow execution on remote nodes without entering a password or +passphrase. + +Adding PRRTE to ``PATH`` and ``LD_LIBRARY_PATH`` +--------------------------------------------------- + +PRRTE *must* be able to find its executables in your ``PATH`` +on every node (if PRRTE was compiled as dynamic libraries, then its +library path must appear in ``LD_LIBRARY_PATH`` as well). As such, your +configuration/initialization files need to add PRRTE to your ``PATH`` +/ ``LD_LIBRARY_PATH`` properly. + +How to do this may be highly dependent upon your local configuration; +you may need to consult with your local system administrator. Some +system administrators take care of these details for you, some don't. +Some common examples are included below, however. + +You must have at least a minimum understanding of how your shell works +to get PRRTE in your ``PATH`` / ``LD_LIBRARY_PATH`` properly. Note +that PRRTE must be added to your ``PATH`` and ``LD_LIBRARY_PATH`` +in the following situations: + +#. When you login to an interactive shell + + If your interactive login environment is not configured properly, + executables like ``prterun`` will not be found, and it is typically + obvious what is wrong. The PRRTE executable directory can + manually be added to the ``PATH``, or the user's startup files can + be modified such that the PRRTE executables are added to the + ``PATH`` every login. This latter approach is preferred. + + All shells have some kind of script file that is executed at login + time to set things like ``PATH`` and ``LD_LIBRARY_PATH`` and + perform other environmental setup tasks. This startup file is the + one that needs to be edited to add PRRTE to the ``PATH`` and + ``LD_LIBRARY_PATH``. 
Consult the manual page for your shell for + specific details (some shells are picky about the permissions of + the startup file, for example). The table below lists some common + shells and the startup files that they read/execute upon login: + + .. list-table:: + :header-rows: 1 + :widths: 10 90 + + * - Shell + - Interactive login startup files + + * - ``bash`` + - ``.bash_profile`` if it exists, or ``.bash_login`` if it + exists, or ``.profile`` if it exists + + (in that order). Note that some Linux distributions + automatically come with + + ``.bash_profile`` scripts for users that automatically + execute ``.bashrc`` as well. + + Consult the ``bash(1)`` man page for more information. + + * - ``zsh`` + - ``.zshrc`` followed by ``.zshenv`` + + * - ``sh`` (or Bash + + named ``sh``) + - ``.profile`` + + * - ``csh`` + - ``.cshrc`` followed by ``.login`` + + * - ``tcsh`` + - ``.tcshrc`` if it exists, ``.cshrc`` if it does not, followed by + ``.login`` + +#. When you login to non-interactive shells on remote nodes + + If your non-interactive remote environment is not configured + properly, executables like ``prterun`` will not function properly, + and it can be somewhat confusing to figure out. + + The startup files in question here are the ones that are + automatically executed for a non-interactive login on a remote node + (e.g., ``ssh othernode ps``). Note that not all shells support + this, and that some shells use different files for this than listed + for interactive logins. Some shells will supersede non-interactive + login startup files with files for interactive logins. That is, + running non-interactive login startup file *may* automatically + invoke interactive login startup file. The following table lists + some common shells and the startup file that is automatically + executed, either by PRRTE or by the shell itself: + + .. list-table:: + :header-rows: 1 + :widths: 10 90 + + * - Shell + - Non-interactive login startup files + + * - ``bash`` + - ``.bashrc`` if it exists + + * - ``zsh`` + - ``.zshrc`` followed by ``.zshenv`` + + * - ``sh`` (or Bash + + named ``sh``) + - This shell does not execute any file automatically, + + so PRRTE will execute the ``.profile`` script + + before invoking PRRTE executables on remote nodes + + * - ``csh`` + - ``.cshrc`` + + * - ``tcsh`` + - ``.tcshrc`` if it exists, ``.cshrc`` if it does not + + +Using the ``--prefix`` option with prterun +------------------------------------------ + +If users are unable to add the relevant directories to ``PATH`` and +``LD_LIBRARY_PATH``, the ``prterun`` ``--prefix`` +option *may* be sufficient. + +There are some situations where you cannot modify the ``PATH`` or +``LD_LIBRARY_PATH`` |mdash| e.g., some ISV applications prefer to hide +all parallelism from the user, and therefore do not want to make the +user modify their shell startup files. + +In such cases, you can use the ``prterun````--prefix`` command line +option, which takes as an argument the +top-level directory where PRRTE was installed. While relative +directory names are possible, they can become ambiguous depending on +the job launcher used; using absolute directory names is strongly +recommended. + +For example, say that PRRTE was installed into +``/opt/prrte-VERSION``. You would use the ``--prefix`` option +thusly: + +.. 
code-block:: + + shell$ prterun --prefix /opt/prrte-VERSION -n 4 a.out + +This will prefix the ``PATH`` and ``LD_LIBRARY_PATH`` on both the +local and remote hosts with ``/opt/prrte-VERSION/bin`` and +``/opt/prrte-VERSION/lib``, respectively. This is *usually* +unnecessary when using resource managers to launch jobs (e.g., Slurm, +Torque, etc.) because they tend to copy the entire local environment +|mdash| to include the ``PATH`` and ``LD_LIBRARY_PATH`` |mdash| to +remote nodes before execution. As such, if ``PATH`` and +``LD_LIBRARY_PATH`` are set properly on the local node, the resource +manager will automatically propagate those values out to remote nodes. +The ``--prefix`` option is therefore usually most useful in +``ssh``-based environments (or similar), OR when the cluster has been +configured with PRRTE located in a different location on the +remote nodes. + +It is possible to make this the default behavior by passing to +``configure`` the flag ``--enable-prterun-prefix-by-default``. This +will make ``prterun`` behave exactly the same as +``prterun --prefix $prefix ...``, where ``$prefix`` is the value given +to ``--prefix`` in ``configure``. + +Finally, note that specifying the absolute pathname to ``prterun`` is +equivalent to using the ``--prefix`` argument. For +example, the following is equivalent to the above command line that +uses ``--prefix``: + +.. code-block:: + + shell$ /opt/prrte-VERSION/bin/prterun -n 4 a.out diff --git a/docs/launching-apps/quickstart.rst b/docs/launching-apps/quickstart.rst new file mode 100644 index 0000000000..c11a065be7 --- /dev/null +++ b/docs/launching-apps/quickstart.rst @@ -0,0 +1,223 @@ +.. _label-quickstart-launching-apps: + +Quick start: Launching applications +=================================== + +Although this section skips many details, it offers examples that will +probably work in many environments. + +.. caution:: Note that this section is a "Quick start" |mdash| it does + not attempt to be comprehensive or describe how to build Open MPI + in all supported environments. The examples below may therefore + not work exactly as shown in your environment. + + Please consult the other sections in this chapter for more details, + if necessary. + +Using ``prterun`` to launch applications +---------------------------------------- + +PRRTE supports both ``prterun`` and +``prun`` to +launch applications. For example: + +.. code-block:: sh + + shell$ prterun -n 2 mpi-hello-world + # or + shell$ prte & prun -n 2 mpi-hello-world + # or + shell$ prterun -n 1 mpi-hello-world : -n 1 mpi-hello-world + +are all equivalent. For simplicity, the rest of this documentation +will simply refer to ``prterun``. + +Other ``prterun`` options +^^^^^^^^^^^^^^^^^^^^^^^^^ + +``prterun`` supports the ``--help`` option which provides a usage +message and a summary of the options that it supports. It should be +considered the definitive list of what options are provided. + +Several notable options are: + +* ``--hostfile``: Specify a hostfile for launchers (such as the + ``rsh`` launcher) that need to be told on which hosts to start + parallel applications. Note that for compatibility with other + launchers, *--machinefile* is a synonym for ``--hostfile``. +* ``--host``: Specify a host or list of hosts to run on, including + support for relative index syntax. +* ``-n``: Indicate the number of processes to start. +* ``--prtemca`` or ``--pmixmca``: Set MCA parameters for either + PRRTE or the underlying PMIx library. 
+* ``--wdir DIRECTORY``: Set the working directory of the started + applications. If not supplied, the current working directory is + assumed (or ``$HOME``, if the current working directory does not + exist on all nodes). +* ``-x ENV_VARIABLE_NAME``: The name of an environment variable to + export to the parallel application. The ``-x`` option can be + specified multiple times to export multiple environment variables to + the parallel application. + +Note that the ``prterun`` command supports a +*large* number of options. Detailed help on any option can be obtained +using the hierarchical help system - e.g., ``prterun --help map-by``. + +Launching on a single host +-------------------------- + +It is common to develop applications on a single laptop or +workstation. In such simple "single program, multiple data (SPMD)" cases, +use ``prterun`` and +specify how many processes you want to launch via the ``-n`` +option: + +.. code-block:: sh + + shell$ prterun -n 6 mpi-hello-world + Hello world, I am 0 of 6 (running on my-laptop)) + Hello world, I am 1 of 6 (running on my-laptop) + ... + Hello world, I am 5 of 6 (running on my-laptop) + +This starts a six-process parallel application, running six copies +of the executable named ``mpi-hello-world``. + +If you do not specify the ``-n`` option, ``prterun`` will +default to launching as many processes as +there are processor cores (not hyperthreads) on the machine. + +Launching in a non-scheduled environments (via ``ssh``) +------------------------------------------------------- + +In general, PRRTE requires the following to launch and run +applications: + +#. You must be able to login to remote nodes non-interactively (e.g., + without entering a password or passphrase). +#. PRRTE's daemon executable must be findable (e.g., in your ``PATH``). +#. PRRTE's libraries must be findable (e.g., in your + ``LD_LIBRARY_PATH``). + +``prterun`` accepts a ``--hostfile`` option (and its +synonym, the ``--machinefile`` option) to specify a hostfile containing one +hostname per line: + +.. code-block:: sh + + shell$ cat my-hostfile.txt + node1.example.com + node2.example.com + node3.example.com slots=2 + node4.example.com slots=10 + +The optional ``slots`` attribute tells PRRTE the *maximum* number +of processes that can be allocated to that node. If ``slots`` is not +provided, PRRTE |mdash| by default |mdash| uses the number of +processor cores (not hyperthreads) on that node. + +Assuming that each of the 4 nodes in `my-hostfile.txt` have 16 cores: + +.. code-block:: sh + + shell$ prterun --hostfile my-hostfile.txt mpi-hello-world + Hello world, I am 0 of 44 (running on node1.example.com) + Hello world, I am 1 of 44 (running on node1.example.com) + ... + Hello world, I am 15 of 44 (running on node1.example.com) + Hello world, I am 16 of 44 (running on node2.example.com) + Hello world, I am 17 of 44 (running on node2.example.com) + ... + Hello world, I am 31 of 44 (running on node2.example.com) + Hello world, I am 32 of 44 (running on node3.example.com) + Hello world, I am 33 of 44 (running on node3.example.com) + Hello world, I am 34 of 44 (running on node4.example.com) + ... 
+ Hello world, I am 43 of 44 (running on node4.example.com) + +You can see the breakdown of how many processes PRRTE launched on +each node: + +* node1: 16, because no ``slots`` was specified +* node2: 16, because no ``slots`` was specified +* node3: 2, because ``slots=2`` was specified +* node2: 10, because ``slots=10`` was specified + +Note, however, that not all environments require a hostfile. For +example, PRRTE will automatically detect when it is running in +batch / scheduled environments (such as Slurm, PBS/Torque, SGE, +LoadLeveler), and will use host information provided by those systems. + +Also note that if using a launcher that requires a hostfile and no +hostfile is specified, all processes are launched on the local host. + +Launching in scheduled environments +----------------------------------- + +In scheduled environments (e.g., in a Slurm job, or PBS/Pro, or LSF, +or any other schedule), the user tells the scheduler how many MPI +processes to launch, and the scheduler decides which hosts to use. +The scheduler then passes both pieces of information (the number of +processes and the hosts to use) to PRRTE. + +There are two ways to launch in a scheduled environment. Nominally, +they both achieve the same thing: they launch processes. The +main user-observable difference between the two methods is that +``prterun`` has *many* more features than scheduler +direct launchers. + +Using PRRTE's ``prterun`` +^^^^^^^^^^^^^^^^^^^^^^^^^ + +When using the full-featured ``prterun`` in a +scheduled environment, there is no need to specify a hostfile or +number of processes to launch. ``prterun`` +will receive this information directly from the scheduler. Hence, if +you want to launch a job that completely "fills" your scheduled +allocation (i.e., one process for each slot in the scheduled +allocation), you can simply: + +.. code-block:: sh + + # Write a script that runs your application + shell$ cat my-slurm-script.sh + #!/bin/sh + # There is no need to specify -n or --hostfile because that + # information will automatically be provided by Slurm. + prterun mpi-hello-world + +You then submit the ``my-slurm-script.sh`` script to Slurm for +execution: + +.. code-block:: sh + + # Use -n to indicate how many processes you want to run. + # Slurm will pick the specific hosts which will be used. + shell$ sbatch -n 40 my-slurm-script.sh + Submitted batch job 1234 + shell$ + +After Slurm job 1234 completes, you can look at the output file to see +what happened: + +.. code-block:: sh + + shell$ cat slurm-1234.out + Hello world, I am 0 of 40 (running on node37.example.com) + Hello world, I am 1 of 40 (running on node37.example.com) + Hello world, I am 2 of 40 (running on node37.example.com) + ... + Hello world, I am 39 of 40 (running on node19.example.com) + +Note that the Slurm scheduler picked the hosts on which the processes +ran. + +The above example shows that simply invoking ``mpirun +mpi-hello-world`` |mdash| with no other CLI options |mdash| obtains +the number of processes to run and hosts to use from the scheduler. + +``prterun`` has many more features not described in +this Quick Start section. For example, while uncommon in scheduled +environments, you can use ``-n`` and/or ``--hostfile`` to launch in +subsets of the overall scheduler allocation. See the ``prterun`` +help system for more details. 
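+
+For instance, here is a minimal sketch (assuming an interactive Slurm
+allocation with 40 slots is already active, and that
+``mpi-hello-world`` is in your ``PATH``) that launches only a subset of
+the allocation:
+
+.. code-block:: sh
+
+   # Inside a 40-slot allocation, run only 8 processes; the
+   # remaining 32 slots are simply left idle
+   shell$ prterun -n 8 mpi-hello-world
+
+Because ``-n`` was given explicitly, ``prterun`` does not fill the
+entire allocation as it otherwise would by default.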
diff --git a/docs/launching-apps/scheduling.rst b/docs/launching-apps/scheduling.rst new file mode 100644 index 0000000000..75221523f5 --- /dev/null +++ b/docs/launching-apps/scheduling.rst @@ -0,0 +1,11 @@ +Placing processes across hosts +============================== + +PRRTE provides many options for placing application processes across +hosts, including oversubscribing processes to processors. This section +describes how to define that mapping. + +Placement overview +------------------ + +.. include:: /prrte-rst-content/detail-placement-fundamentals.rst diff --git a/docs/launching-apps/slurm.rst b/docs/launching-apps/slurm.rst new file mode 100644 index 0000000000..888b63cc98 --- /dev/null +++ b/docs/launching-apps/slurm.rst @@ -0,0 +1,56 @@ +Launching with Slurm +==================== + +PRRTE supports two modes of launching parallel jobs under +Slurm: + +#. Using PRRTE's full-features ``prterun`` launcher. +#. Using Slurm's "direct launch" capability. + +Unless there is a strong reason to use ``srun`` for direct launch, the +PRRTE team recommends using ``prterun`` for launching under Slurm jobs. + +Using ``prterun`` +----------------- + +When ``prterun`` is launched in a Slurm job, ``prterun`` will +automatically utilize the Slurm infrastructure for launching and +controlling the individual processes. +Hence, it is unnecessary to specify the ``--hostfile``, +``--host``, or ``-n`` options to ``prterun``. + +.. note:: Using ``prterun`` is the recommended method for launching + applications in Slurm jobs. + + ``prterun``'s Slurm support should always be available, regardless + of how PRRTE or Slurm was installed. + +For example: + +.. code-block:: sh + + # Allocate a Slurm job with 4 slots + shell$ salloc -n 4 + salloc: Granted job allocation 1234 + + # Now run an Open MPI job on all the slots allocated by Slurm + shell$ prterun mpi-hello-world + +This will run the 4 processes on the node(s) that were allocated +by Slurm. + +Or, if submitting a script: + +.. code-block:: sh + + shell$ cat my_script.sh + #!/bin/sh + prterun mpi-hello-world + shell$ sbatch -n 4 my_script.sh + srun: jobid 1235 submitted + shell$ + +Similar to the ``salloc`` case, no command line options specifying +number of processes were necessary, since PRRTE will obtain +that information directly from Slurm at run time. + diff --git a/docs/launching-apps/ssh.rst b/docs/launching-apps/ssh.rst new file mode 100644 index 0000000000..017291d991 --- /dev/null +++ b/docs/launching-apps/ssh.rst @@ -0,0 +1,233 @@ +Launching with SSH +================== + +When launching jobs in a non-scheduled environment, ``ssh`` +is typically used to launch commands on remote nodes. As listed in +the :doc:`quick start section `, +successfully launching MPI applications with ``ssh`` requires the +following: + +#. You must be able to non-interactively login |mdash| without + entering a password or passphrase |mdash| to all remote nodes from + all remotes nodes. +#. PRRTE's daemon executablesmust be findable (e.g., in your ``PATH``). +#. PRRTE's libraries must be findable (e.g., in your + ``LD_LIBRARY_PATH``). + +Specifying the hosts for a job +------------------------------ + +There are three mechanisms for specifying the hosts that an job will run on: + +#. The ``--hostfile`` option to ``prterun``. + + Use this option to specify a list of hosts on which to run. Note + that for compatibility with other launchers, + ``--machinefile`` is a synonym for ``--hostfile``. + +#. The ``--host`` option to ``prterun``. 
+ + This option can be used to specify a list of hosts on which to run + on the command line. + +#. Running in a scheduled environment. + + If you are running in a scheduled environment (e.g., in a Slurm, + Torque, or LSF job), PRRTE will automatically get the lists of + hosts from the scheduler. See the next subsections for details about + launching jobs in supported scheduled environements. + +.. important:: The specification of hosts using any of the above + methods has nothing to do with the network interfaces + that are used for application traffic. The list of hosts is + *only* used for specifying which hosts on which to + launch processes. + +Non-interactive ``ssh`` logins +------------------------------ + +SSH keys must be setup such that the following can be executed without +being prompted for password or passphrase: + +.. code-block:: sh + + shell$ ssh othernode echo hello + hello + shell$ + +Consult instructions and tutorials from around the internet to learn +how to setup SSH keys. Try Google search terms like "passwordless +SSH" or "SSH key authentication". + +For simplicity, it may be desirable to configure your SSH keys +without passphrases. This adds some risk, however (e.g., if your SSH +keys are compromised). But it simplifies your SSH setup because you +will not need to use ``ssh-agent``. Evaluate the risk level you are +comfortable with. + +.. important:: PRRTE uses a tree-based pattern to launch processes + on remote nodes. This means that PRRTE must be able to + non-interactively login |mdash| without being prompted for password + or passphrase |mdash| *to any node* in the host list *from any + node* in the host list. + + It may *not* be sufficient to only setup an SSH key from the node + where you are invoking ``prterun`` to all other + nodes. + +If you have a shared ``$HOME`` filesystem between your nodes, you can +setup a single SSH key that is used to login to all nodes. + +Finding the PRRTE daemon executable and libraries +------------------------------------------------- + +Once PRRTE is able to use ``ssh`` to invoke executables on a remote +node, it must be able to find its daemon executable and shared +libraries on that remote node. + +If PRRTE is installed in a system-level folder (e.g., in +``/usr/bin``), PRRTE will likely be able to find its daemon +and libraries on the remote node with no additional assistance. + +If, however, PRRTE is installed into a path that is not searched by +default, you will need to provide assistance so that PRRTE can find +its daemon and libraries. + +.. important:: For simplicity, it is *strongly* recommended that you + install PRRTE in the same location on all nodes in your job. + +You can do this in one of two ways. + +Use "prefix" behavior +^^^^^^^^^^^^^^^^^^^^^ + +.. note:: "Prefix" behavior is only available with ``prterun``; it is not + available via resource manager direct + launch mechanisms. However, this section is about using ``ssh`` to + launch jobs, which means that there is no resource manager, and + therefore there is no direct launch mechanism available. + +When "prefix" behavior is enabled, PRRTE will automatically set the +``$PATH`` and ``$LD_LIBRARY_PATH`` on remote nodes before executing +remote commands. + +.. important:: PRRTE assumes that the installation ``prefix``, + ``bindir``, and ``libdir`` are the same on the remote node as they + are on the local node. If they are not, *then you should not use + the "prefix" behavior.* + +You can enable "prefix" behavior in one of three ways: + +#. 
Use an absolute path name to invoke ``prterun``. + + .. code-block:: sh + + shell$ $HOME/my-prrte/bin/prterun --hostfile my-hostfile.txt mpi-hello-world + + Simply using the absolute path name to ``prterun`` tells PRRTE to enable "prefix" mode. + + +#. Use the ``--prefix`` option to ``prterun``. + + .. code-block:: sh + + shell$ $HOME/my-prrte/bin/prterun --hostfile my-hostfile.txt \ + --prefix $HOME/my-prrte \ + mpi-hello-world + + The ``--prefix`` option takes a single argument: the prefix path to + use for the bindir and libdir on the remote node. + +#. Configure PRRTE with ``--enable-prterun-prefix-by-default``. + + If PRRTE is built this way, ``prterun`` will + always enable "prefix" behavior. + +Set the ``PATH`` and ``LD_LIBRARY_PATH`` in your shell startup files +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Consider the case where PRRTE was configured with: + +.. code-block:: sh + + shell$ ./configure --prefix=$HOME/my-prrte ... + +In this cause, PRRTE will be installed into ``$HOME/my-prrte``. +This path is almost certainly not in any system-default search paths, +so it must be added to the ``$PATH`` and ``$LD_LIBRARY_PATH`` +environment variables. + +Specifically: the goal is that the following non-interactive commands +must be able to execute without error: + +.. code-block:: sh + + # First, ensure that this command returns the correct prte_info + # instance (i.e., $HOME/my-prrte/bin/prte_info). + shell$ ssh remotenode which prte_info + /home/myusername/my-prrte/bin/prte_info + + # Next, ensure that you can run that prte_info command without + # error + shell$ ssh remotenode prte_info + + # ... lots of output ... + +Ensure that you do not see any errors about libraries that cannot be +found. + +All shells have some kind of script file that is executed at login +time perform environmental setup tasks. This startup file is the one +that needs to be edited to: + +#. Add PRRTE's daemon executable path (which is likely ``$prefix/bin``, or + ``$HOME/my-prrte/bin`` in this example) to the ``$PATH`` + environment variable. +#. Add PRRTE's library path (which is likely ``$prefix/lib``, or + ``$HOME/my-prrte/lib`` in this example) to the + ``$LD_LIBRARY_PATH`` environment variable. + +You probably want to add PRRTE's libraries to the *front* of +``$PATH`` and ``$LD_LIBRARY_PATH`` to ensure that this PRRTE +installation's files are found *first*. + +Consult the manual page for your shell for specific details (some +shells are picky about the permissions of the startup file, for +example). The list below contains some common shells and the startup +files that they read/execute upon login: + +.. list-table:: + :header-rows: 1 + + * - Shell + - Non-interactive login + - Interactive login + + * - ``bash`` or ``zsh`` + - ``$HOME/.bashrc`` if it exists. + - #. ``$HOME/.bash_profile`` if it exists, or + #. ``$HOME/.bash_login`` if it exists, or + #. ``$HOME/.profile`` if it exists (in that order). + + Note that some Linux distributions automatically come + with ``$HOME/.bash_profile`` scripts for users that + automatically execute ``$HOME/.bashrc`` as well. Consult the + bash man page for more information. + + * - ``sh`` + - This shell does not execute any file automatically, so PRRTE + will execute the ``$HOME/.profile`` script before invoking PRRTE + executables on remote nodes + - ``$HOME/.profile`` + + * - ``csh`` + - ``$HOME/.cshrc`` + - ``$HOME/.cshrc`` followed by ``$HOME/.login`` + + * - ``tcsh`` + - #. ``$HOME/.tcshrc`` if it exists, or + #. 
``$HOME/.cshrc`` if it does not + - #. ``$HOME/.tcshrc`` if it exists, or + #. ``$HOME/.cshrc`` if it does not + + Afterwards, execute ``$HOME/.login`` diff --git a/docs/launching-apps/tm.rst b/docs/launching-apps/tm.rst new file mode 100644 index 0000000000..13a5ce2c36 --- /dev/null +++ b/docs/launching-apps/tm.rst @@ -0,0 +1,64 @@ +Launching with PBS / Torque +=========================== + +PRRTE supports PBS, PBS Pro, Torque, and other related resource +managers. + +Verify PBS/Torque support +------------------------- + +The ``prte_info`` command can be used to determine whether or not an +installed Open MPI includes Torque/PBS Pro support: + +.. code-block:: + + shell$ prte_info | grep ras + +If the PRRTE installation includes support for PBS/Torque, you +should see a line similar to that below. Note the MCA version +information varies depending on which version of Open MPI is +installed. + +.. code-block:: + + MCA ras: tm (MCA v2.1.0, API v2.0.0, Component v3.0.0) + +Launching +--------- + +When properly configured, PRRTE obtains both the list of hosts and +how many processes to start on each host from Torque / PBS Pro +directly. Hence, it is unnecessary to specify the ``--hostfile``, +``--host``, or ``-n`` options to ``prterun``. PRRTE will use +PBS/Torque-native mechanisms to launch and kill processes (``ssh`` is +not required). + +For example: + +.. code-block:: sh + + # Allocate a PBS job with 4 nodes + shell$ qsub -I -lnodes=4 + + # Now run a job on all the nodes allocated by PBS/Torque + shell$ prterun mpi-hello-world + +This will run the application processes on the nodes that were allocated by +PBS/Torque. Or, if submitting a script: + +.. code-block:: sh + + shell$ cat my_script.sh + #!/bin/sh + prterun mpi-hello-world + shell$ qsub -l nodes=4 my_script.sh + +.. warning:: Do not modify ``$PBS_NODEFILE``! + + We've had reports from some sites that system administrators modify + the ``$PBS_NODEFILE`` in each job according to local policies. + This will currently cause PRRTE to behave in an unpredictable + fashion. As long as no new hosts are added to the hostfile, it + *usually* means that PRRTE will incorrectly map processes to + hosts, but in some cases it can cause PRRTE to fail to launch + processes altogether. diff --git a/docs/launching-apps/troubleshooting.rst b/docs/launching-apps/troubleshooting.rst new file mode 100644 index 0000000000..0edc72c976 --- /dev/null +++ b/docs/launching-apps/troubleshooting.rst @@ -0,0 +1,167 @@ +Troubleshooting +=============== + +Launching applications can be a complex process that involves many moving parts. +This section attempts to provide solutions to some of the most common +problems users encounter. + +Errors about missing libraries +------------------------------ + +When building PRRTE with the compilers that have libraries in +non-default search path locations, you may see errors about those +compiler's support libraries when trying to launch applications if +their corresponding environments were not setup properly. + +For example, you may see warnings similar to the following: + +.. code-block:: sh + + # With the Intel compiler suite + shell$ prterun -n 1 --host node1.example.com hello + prted: error while loading shared libraries: libimf.so: cannot open shared object file: No such file or directory + -------------------------------------------------------------------------- + A daemon (pid 11893) died unexpectedly with status 127 while + attempting to launch so we are aborting. + ...more error messages... 
+ + # With the PGI compiler suite + shell$ prterun -n 1 --host node1.example.com hello + prted: error while loading shared libraries: libpgcc.so: cannot open shared object file: No such file or directory + ...more error messages... + + # With the PathScale compiler suite + shell$ prterun -n 1 --host node1.example.com hello + prted: error while loading shared libraries: libmv.so: cannot open shared object file: No such file or directory + ...more error messages... + +Specifically, PRRTE first attempts to launch a "helper" daemon +``prted`` on ``node1.example.com``, but it failed because one of +``prted``'s dependent libraries was not able to be found. The +libraries shown above (``libimf.so``, ``libpgcc.so``, and +``libmv.so``) are specific to their compiler suites (Intel, PGI, and +PathScale, respectively). As such, it is likely that the user did not +setup the compiler library in their environment properly on this node. + +Double check that you have setup the appropriate user environment +on the target node, for both interactive and non-interactive logins. + +.. note:: It is a common error to ensure that the user environment + is setup properly for *interactive* logins, but not for + *non-interactive* logins. + +Here's an example of a user-compiled MPI application working fine +locally, but failing when invoked non-interactively on a remote node: + +.. code-block:: sh + + # Compile a trivial MPI application + head_node$ cd $HOME + head_node$ mpicc mpi_hello.c -o mpi_hello + + # Run it locally; it works fine + head_node$ ./mpi_hello + Hello world, I am 0 of 1. + + # Run it remotely interactively; it works fine + head_node$ ssh node2.example.com + + Welcome to node2. + node2$ ./mpi_hello + Hello world, I am 0 of 1. + node2$ exit + + # Run it remotely *NON*-interactively; it fails + head_node$ ssh node2.example.com $HOME/mpi_hello + mpi_hello: error while loading shared libraries: libimf.so: cannot open shared object file: No such file or directory + +In cases like this, check your shell script startup files and verify +that the appropriate compiler environment is setup properly for +non-interactive logins. + +Problems when running across multiple hosts +------------------------------------------- + +When you are able to run jobs on a single host, but fail to run +them across multiple hosts, try the following: + +#. Ensure that your launcher is able to launch across multiple hosts. + For example, if you are using ``ssh``, try to ``ssh`` to each + remote host and ensure that you are not prompted for a password. + For example: + + .. code-block:: + + shell$ ssh remotehost hostname + remotehost + + If you are unable to launch across multiple hosts, check that your + SSH keys are setup properly. Or, if you are running in a managed + environment, such as in a Slurm, Torque, or other job launcher, + check that you have reserved enough hosts, are running in an + allocated job, etc. + +#. Ensure that your ``PATH`` and ``LD_LIBRARY_PATH`` are set correctly + on each remote host on which you are trying to run. For example, + with ``ssh``: + + .. code-block:: + + shell$ ssh remotehost env | grep -i path + PATH=...path on the remote host... + LD_LIBRARY_PATH=...LD library path on the remote host... + + If your ``PATH`` or ``LD_LIBRARY_PATH`` are not set properly, see + :ref:`this section ` for + the correct values. 
Keep in mind that it is fine to have multiple + PRRTE installations installed on a machine; the *first* PRRTE + installation found by ``PATH`` and ``LD_LIBARY_PATH`` is the one + that matters. + +#. Run a simple operating system job across multiple hosts. This verifies + that the PRRTE run-time system is functioning properly across + multiple hosts. For example, try running the ``hostname`` command: + + .. code-block:: + + shell$ prterun --host remotehost hostname + remotehost + shell$ prterun --host remotehost,otherhost hostname + remotehost + otherhost + + If you are unable to run operating system jobs across multiple hosts, check + for common problems such as: + + #. Check your non-interactive shell setup on each remote host to + ensure that it is setting up the ``PATH`` and + ``LD_LIBRARY_PATH`` properly. + #. Check that PRRTE is finding and launching the correct + version of PRRTE on the remote hosts. + #. Ensure that you have firewalling disabled between hosts (PRRTE + opens random TCP and sometimes random UDP ports between + hosts in a single MPI job). + #. Try running with the ``plm_base_verbose`` MCA parameter at level + 10, which will enable extra debugging output to see how PRRTE + launches on remote hosts. For example: + + .. code-block:: + + prterun --prtemca plm_base_verbose 10 --host remotehost hostname`` + +#. Now run a simple PMIx-based job across multiple hosts that does not + involve inter-process communications. The ``hello_c`` program in the + ``examples`` directory in the PRRTE distribution is a good + choice. This verifies that the PMIx subsystem is able to initialize + and terminate properly. For example: + + .. code-block:: + + shell$ prterun --host remotehost,otherhost hello_c + Hello, world, I am 0 of 1, (PRRTE VERSION, package: PRRTE jsquyres@example.com Distribution, ident: VERSION, DATE) + Hello, world, I am 1 of 1, (PRRTE VERSION, package: PRRTE jsquyres@example.com Distribution, ident: VERSION, DATE) + + If you are unable to run simple, non-communication jobs, this + can indicate that your PRRTE installation is unable to + initialize properly on remote hosts. Double check your + non-interactive login setup on remote hosts. diff --git a/docs/launching-apps/unusual.rst b/docs/launching-apps/unusual.rst new file mode 100644 index 0000000000..bd0efa9355 --- /dev/null +++ b/docs/launching-apps/unusual.rst @@ -0,0 +1,166 @@ +Unusual jobs +============ + +PRRTE can run many types of applications, including non-MPI programs. +This section describes some of the less common kinds of programs that can +be executed. In the following, the prterun and prun commands will be +used interchangeably for convenience - the documentation applies +equally when running a one-shot PRRTE instance or a persistent DVM. + + +Running non-MPI programs +------------------------ + +Non-MPI programs can be launched with prterun or prun, +for example: + +.. code-block:: + + shell$ prterun -n 2 --host a,b uptime + +This will launch a copy of the Unix command ``uptime`` on the hosts ``a`` +and ``b``. + +prterun and prun work equally well for MPI and non-MPI +applications. + +Running GUI applications +------------------------ + +Running GUI applications depends on your local setup and may require additional +setup. + +You will need to have graphics forwarding (e.g., X11 +forwarding) enabled from the remote processes to the display where you +want output to appear. 
In a secure environment, you can simply allow +all X requests to be shown on the target display and set the +``DISPLAY`` environment variable in all application processes' environments to +the target display, perhaps something like this: + +.. code-block:: + + shell$ hostname + my_desktop.secure-cluster.example.com + shell$ xhost + + shell$ prun -n 4 -x DISPLAY=my_desktop.secure-cluster.example.com a.out + +However, this technique is not generally suitable for unsecure +environments (because it allows anyone to read and write to your +display). A slightly more secure way is to only allow X connections +from the nodes where your application will be running: + +.. code-block:: + + shell$ hostname + my_desktop.secure-cluster.example.com + shell$ xhost +compute1 +compute2 +compute3 +compute4 + compute1 being added to access control list + compute2 being added to access control list + compute3 being added to access control list + compute4 being added to access control list + shell$ prun -n 4 -x DISPLAY=my_desktop.secure-cluster.example.com a.out + +(assuming that the four nodes you are running on are ``compute1`` +through ``compute4``). + +Other methods are available, but they involve sophisticated X +forwarding through prterun and are generally +more complicated than desirable. + +Running curses-based applications +--------------------------------- + +PRRTE provides fairly sophisticated stdin / stdout / stderr +forwarding. However, it does not work well with curses, ncurses, +readline, or other sophisticated I/O packages that generally require +direct control of the terminal. + +Every application and I/O library is different |mdash| you should try to +see if yours is supported. But chances are that it won't work. + +Launching an MPMD job +--------------------- + +PRRTE supports multiple program, multiple data (MPMD) style launches, +either from the command line or from a file. For example: + +.. code-block:: + + shell$ prterun -n 2 a.out : -n 2 b.out + +This will launch a single parallel application, but the first two +processes will be instances of the ``a.out`` executable, and the +second two processes will be instances of the ``b.out`` executable. +In MPI terms, this will be a single ``MPI_COMM_WORLD``, but the +``a.out`` processes will be ranks 0 and 1 in ``MPI_COMM_WORLD``, while +the ``b.out`` processes will be ranks 2 and 3 in ``MPI_COMM_WORLD``. + +prterun can also accept a parallel application +specified in a file instead of on the command line. For example: + +.. code-block:: + + shell$ prterun --app my_appfile + +where the file ``my_appfile`` contains the following: + +.. code-block:: sh + + # Comments are supported; comments begin with # + # Application context files specify each sub-application in the + # parallel job, one per line. The first sub-application is the 2 + # a.out processes: + -n 2 a.out + # The second sub-application is the 2 b.out processes: + -n 2 b.out + +This will result in the same behavior as running ``a.out`` and ``b.out`` +from the command line. + +Connecting independent MPI applications +--------------------------------------- + +In certain environments, Open MPI supports connecting multiple, +independent MPI applications using mechanism defined in the MPI +specification such as ``MPI_Comm_connect() / MPI_Comm_accept()`` and +publishing connection information using ``MPI_Publish_name() / +MPI_Lookup_name()``. These mechanisms require a centralized service +to exchange contact information across multiple jobs. 
+ +Beginning with Open MPI v5.0.0, this can be achieved by starting an +instance of the prte server with the ``--report-uri`` option to +display the server's contact information. This information +can then be used for launching subsequent MPI applications. + +The following commands show an example of launching two MPI jobs +that will connect to each other at runtime using this MPI-2 +functionality. + + +Step 1: Start the standalone prte server + +.. code-block:: + + user@myhost:~/ompi-install/bin$ ./prte --report-uri + DVM ready + +Step 2: Launch the first MPI application, providing the URI of the +prte server + +.. code-block:: + + user@myhost:~/app1-dir$ mpiexec --dvm file: -np 4 ./mpi_app_1 + +Step 3: Launch the second MPI application, again providing the URI of the +prte server + +.. code-block:: + + user@myhost:~/app2-dir$ mpiexec --dvm file: -np 4 ./mpi_app_2 + + +If the prte server has been started as a system server using the +``--system-server`` argument (e.g., the nodes used by the MPI +applications are not shared by multiple jobs), the sequence can be +simplified by using ``mpiexec --dvm system`` or ``mpiexec --dvm +system-first`` instead of the URI of the prte server. diff --git a/examples/colocate.c b/examples/colocate.c index 259565cdfc..80be06bf6f 100644 --- a/examples/colocate.c +++ b/examples/colocate.c @@ -15,7 +15,7 @@ * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. * Copyright (c) 2013-2020 Intel, Inc. All rights reserved. * Copyright (c) 2015 Mellanox Technologies, Inc. All rights reserved. - * Copyright (c) 2021-2022 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2025 Nanook Consulting All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -118,6 +118,7 @@ static void printusage(void) fprintf(stderr, "Usage: colocate [options]\n"); fprintf(stderr, "\t--cmd foo : spawn the foo executable\n"); fprintf(stderr, "\t-n/--np/-np N : number of procs to spawn\n"); + fprintf(stderr, "\t-perproc/--perproc : colocate with each proc\n"); } int main(int argc, char **argv) @@ -140,6 +141,7 @@ int main(int argc, char **argv) size_t dninfo; pmix_status_t code = PMIX_EVENT_JOB_END; char *cmd = "hostname"; + bool perproc = false; pid = getpid(); @@ -159,6 +161,9 @@ int main(int argc, char **argv) exit(1); } np = strtol(argv[n+1], NULL, 10); + } else if (0 == strcmp(argv[n], "--perproc") || + 0 == strcmp(argv[n], "-perproc")) { + perproc = true; } else if (0 == strcmp(argv[n], "--help") || 0 == strcmp(argv[n], "-h")) { printusage(); @@ -205,7 +210,11 @@ int main(int argc, char **argv) PMIX_LOAD_PROCID(&pptr[0], myproc.nspace, PMIX_RANK_WILDCARD); PMIX_INFO_LOAD(&jinfo[0], PMIX_COLOCATE_PROCS, &darray, PMIX_DATA_ARRAY); PMIX_INFO_CONSTRUCT(&jinfo[1]); - PMIX_INFO_LOAD(&jinfo[1], PMIX_COLOCATE_NPERNODE, &np, PMIX_UINT16); + if (perproc) { + PMIX_INFO_LOAD(&jinfo[1], PMIX_COLOCATE_NPERPROC, &np, PMIX_UINT16); + } else { + PMIX_INFO_LOAD(&jinfo[1], PMIX_COLOCATE_NPERNODE, &np, PMIX_UINT16); + } fprintf(stderr, "Client %s:%u: calling PMIx_Spawn\n", myproc.nspace, myproc.rank); diff --git a/examples/debugger/direct-multi.c b/examples/debugger/direct-multi.c index 99719b90d4..e7f11d18ea 100644 --- a/examples/debugger/direct-multi.c +++ b/examples/debugger/direct-multi.c @@ -564,7 +564,7 @@ static pmix_status_t spawn_app(void) } else { PMIX_INFO_LIST_ADD(rc, tinfo, PMIX_DEBUG_STOP_IN_INIT, NULL, PMIX_BOOL); // All procs stop in PMIx_Init } - sprintf(map_str, "ppr:%d:node", app_npernode); + snprintf(map_str, 30, "ppr:%d:node", 
app_npernode); PMIX_INFO_LIST_ADD(rc, tinfo, PMIX_MAPBY, map_str, PMIX_STRING); // app procs/node PMIX_INFO_LIST_ADD(rc, tinfo, PMIX_RANKBY, "slot", PMIX_STRING); // match baseline PMIX_INFO_LIST_ADD(rc, tinfo, PMIX_FWD_STDOUT, NULL, PMIX_BOOL); // forward stdout to me diff --git a/examples/debugger/direct.c b/examples/debugger/direct.c index 8edffc96fb..c36f0412ad 100644 --- a/examples/debugger/direct.c +++ b/examples/debugger/direct.c @@ -338,7 +338,7 @@ static int cospawn_launch(myrel_t *myrel) /* Process that is spawning processes is a tool process */ PMIX_INFO_LIST_ADD(rc, dirs, PMIX_REQUESTOR_IS_TOOL, NULL, PMIX_BOOL); /* Map spawned processes by slot */ - sprintf(map_str, "ppr:%d:node", app_npernode); + snprintf(map_str, 128, "ppr:%d:node", app_npernode); PMIX_INFO_LIST_ADD(rc, dirs, PMIX_MAPBY, map_str, PMIX_STRING); PMIX_INFO_LIST_CONVERT(rc, dirs, &darray); PMIX_INFO_LIST_RELEASE(dirs); @@ -824,7 +824,7 @@ int main(int argc, char **argv) // procs are to pause in PMIx_Init for debugger attach PMIX_INFO_LIST_ADD(rc, dirs, PMIX_DEBUG_STOP_IN_INIT, NULL, PMIX_BOOL); } - sprintf(map_str, "ppr:%d:node", app_npernode); + snprintf(map_str, 128, "ppr:%d:node", app_npernode); PMIX_INFO_LIST_ADD(rc, dirs, PMIX_MAPBY, map_str, PMIX_STRING); // 1 per node PMIX_INFO_LIST_ADD(rc, dirs, PMIX_FWD_STDOUT, NULL, PMIX_BOOL); // forward stdout to me PMIX_INFO_LIST_ADD(rc, dirs, PMIX_FWD_STDERR, NULL, PMIX_BOOL); // forward stderr to me diff --git a/src/docs/prrte-rst-content/Makefile.am b/src/docs/prrte-rst-content/Makefile.am index 426c15e0ed..1ece65998d 100644 --- a/src/docs/prrte-rst-content/Makefile.am +++ b/src/docs/prrte-rst-content/Makefile.am @@ -46,6 +46,7 @@ dist_rst_DATA = \ cli-no-app-prefix.rst \ cli-rank-by.rst \ cli-runtime-options.rst \ + cli-set-env.rst \ cli-stream-buffering.rst \ cli-tune.rst \ cli-unset-env.rst \ diff --git a/src/docs/prrte-rst-content/cli-set-env.rst b/src/docs/prrte-rst-content/cli-set-env.rst new file mode 100644 index 0000000000..77873adb1d --- /dev/null +++ b/src/docs/prrte-rst-content/cli-set-env.rst @@ -0,0 +1,16 @@ +.. -*- rst -*- + + Copyright (c) 2022-2025 Nanook Consulting All rights reserved. + Copyright (c) 2023 Jeffrey M. Squyres. All rights reserved. + + $COPYRIGHT$ + + Additional copyrights may follow + + $HEADER$ + +.. The following line is included so that Sphinx won't complain + about this file not being directly included in some toctree + +Set the named environment variable to the specified value. This will overwrite the +existing value, if it exists. Equivalent to the "-x foo=val" option. diff --git a/src/docs/show-help-files/help-dash-host.txt b/src/docs/show-help-files/help-dash-host.txt index 5667922110..b8e41a7494 100644 --- a/src/docs/show-help-files/help-dash-host.txt +++ b/src/docs/show-help-files/help-dash-host.txt @@ -56,7 +56,9 @@ A relative host was improperly specified — the value provided was. --host: %s You may have forgotten to preface a node with "N" or "n", or used the -"e" or "E" to indicate empty nodes. +"e" or "E" to indicate empty nodes, or you ended the value with a +colon but forgot to include the number of empty nodes you were +requesting. Re-run this command with "--help hosts" for further information. diff --git a/src/docs/show-help-files/help-prterun.txt b/src/docs/show-help-files/help-prterun.txt index c2c968ffe7..b39ec6520d 100644 --- a/src/docs/show-help-files/help-prterun.txt +++ b/src/docs/show-help-files/help-prterun.txt @@ -155,6 +155,12 @@ option to the help request as "--help