diff --git a/doc/api/smartsim_api.rst b/doc/api/smartsim_api.rst index adf7081ec..787ef182e 100644 --- a/doc/api/smartsim_api.rst +++ b/doc/api/smartsim_api.rst @@ -1,17 +1,15 @@ - ************* SmartSim API ************* - .. _experiment_api: Experiment ========== - .. currentmodule:: smartsim.experiment +.. _exp_init: .. autosummary:: Experiment.__init__ @@ -34,6 +32,8 @@ Experiment :members: +.. _settings-info: + Settings ======== @@ -377,23 +377,47 @@ container. :undoc-members: :members: +.. _orc_api: Orchestrator ============ .. currentmodule:: smartsim.database -.. _orc_api: +.. autosummary:: + + Orchestrator.__init__ + Orchestrator.db_identifier + Orchestrator.num_shards + Orchestrator.db_nodes + Orchestrator.hosts + Orchestrator.reset_hosts + Orchestrator.remove_stale_files + Orchestrator.get_address + Orchestrator.is_active + Orchestrator.set_cpus + Orchestrator.set_walltime + Orchestrator.set_hosts + Orchestrator.set_batch_arg + Orchestrator.set_run_arg + Orchestrator.enable_checkpoints + Orchestrator.set_max_memory + Orchestrator.set_eviction_strategy + Orchestrator.set_max_clients + Orchestrator.set_max_message_size + Orchestrator.set_db_conf Orchestrator ------------ +.. _orchestrator_api: .. autoclass:: Orchestrator :members: :inherited-members: :undoc-members: +.. _model_api: Model ===== @@ -417,17 +441,17 @@ Model Model.disable_key_prefixing Model.query_key_prefixing +Model +----- + .. autoclass:: Model :members: :show-inheritance: :inherited-members: -.. _ensemble_api: - Ensemble ======== - .. currentmodule:: smartsim.entity.ensemble .. autosummary:: @@ -443,6 +467,11 @@ Ensemble Ensemble.query_key_prefixing Ensemble.register_incoming_entity +Ensemble +-------- + +.. _ensemble_api: + .. 
autoclass:: Ensemble :members: :show-inheritance: @@ -461,7 +490,6 @@ SmartSim includes built-in utilities for supporting TensorFlow, Keras, and Pytor TensorFlow ---------- - SmartSim includes built-in utilities for supporting TensorFlow and Keras in training and inference. .. currentmodule:: smartsim.ml.tf.utils @@ -510,7 +538,6 @@ SmartSim includes built-in utilities for supporting PyTorch in training and infe Slurm ===== - .. currentmodule:: smartsim.slurm .. autosummary:: diff --git a/doc/batch_settings.rst b/doc/batch_settings.rst new file mode 100644 index 000000000..07cef4c95 --- /dev/null +++ b/doc/batch_settings.rst @@ -0,0 +1,127 @@ +.. _batch_settings_doc: + +************** +Batch Settings +************** +======== +Overview +======== +SmartSim provides functionality to launch entities (``Model`` or ``Ensemble``) +as batch jobs supported by the ``BatchSettings`` base class. While the ``BatchSettings`` base +class is not intended for direct use by users, its derived child classes offer batch +launching capabilities tailored for specific workload managers (WLMs). Each SmartSim +`launcher` interfaces with a ``BatchSettings`` subclass specific to a system's WLM: + +- The Slurm `launcher` supports: + - :ref:`SbatchSettings` +- The PBS Pro `launcher` supports: + - :ref:`QsubBatchSettings` +- The LSF `launcher` supports: + - :ref:`BsubBatchSettings` + +.. note:: + The local `launcher` does not support batch jobs. + +After creating a ``BatchSettings`` instance, users gain access to the methods +of the associated child class, providing them with the ability to further configure the batch +settings for jobs. + +In the following :ref:`Examples` subsection, we demonstrate the initialization +and configuration of a batch settings object. + +.. _batch_settings_ex: + +======== +Examples +======== +A ``BatchSettings`` child class is created using the ``Experiment.create_batch_settings`` +factory method. 
When the user initializes the ``Experiment`` at the beginning of the Python driver script, +they may specify a `launcher` argument. SmartSim will then register or detect the `launcher` and return the +corresponding supported child class when ``Experiment.create_batch_settings`` is called. This +design allows SmartSim driver scripts utilizing ``BatchSettings`` to be portable between systems, +requiring only a change in the specified `launcher` during ``Experiment`` initialization. + +Below are examples of how to initialize a ``BatchSettings`` object per `launcher`. + +.. tabs:: + + .. group-tab:: Slurm + To instantiate the ``SbatchSettings`` object, which interfaces with the Slurm job scheduler, specify + `launcher="slurm"` when initializing the ``Experiment``. Upon calling ``create_batch_settings``, + SmartSim will detect the job scheduler and return the appropriate batch settings object. + + .. code-block:: python + + from smartsim import Experiment + + # Initialize the experiment and provide launcher Slurm + exp = Experiment("name-of-experiment", launcher="slurm") + + # Initialize a SbatchSettings object + sbatch_settings = exp.create_batch_settings(nodes=1, time="10:00:00") + # Set the account for the slurm batch job + sbatch_settings.set_account("12345-Cray") + # Set the partition for the slurm batch job + sbatch_settings.set_queue("default") + + The initialized ``SbatchSettings`` instance can now be passed to a SmartSim entity + (``Model`` or ``Ensemble``) via the `batch_settings` argument in ``create_batch_settings``. + + .. note:: + If `launcher="auto"`, SmartSim will detect that the ``Experiment`` is running on a Slurm based + machine and set the launcher to `"slurm"`. + + .. group-tab:: PBS Pro + To instantiate the ``QsubBatchSettings`` object, which interfaces with the PBS Pro job scheduler, specify + `launcher="pbs"` when initializing the ``Experiment``. 
Upon calling ``create_batch_settings``, + SmartSim will detect the job scheduler and return the appropriate batch settings object. + + .. code-block:: python + + from smartsim import Experiment + + # Initialize the experiment and provide launcher PBS Pro + exp = Experiment("name-of-experiment", launcher="pbs") + + # Initialize a QsubBatchSettings object + qsub_batch_settings = exp.create_batch_settings(nodes=1, time="10:00:00") + # Set the account for the PBS Pro batch job + qsub_batch_settings.set_account("12345-Cray") + # Set the partition for the PBS Pro batch job + qsub_batch_settings.set_queue("default") + + The initialized ``QsubBatchSettings`` instance can now be passed to a SmartSim entity + (``Model`` or ``Ensemble``) via the `batch_settings` argument in ``create_batch_settings``. + + .. note:: + If `launcher="auto"`, SmartSim will detect that the ``Experiment`` is running on a PBS Pro based + machine and set the launcher to `"pbs"`. + + .. group-tab:: LSF + To instantiate the ``BsubBatchSettings`` object, which interfaces with the LSF job scheduler, specify + `launcher="lsf"` when initializing the ``Experiment``. Upon calling ``create_batch_settings``, + SmartSim will detect the job scheduler and return the appropriate batch settings object. + + .. code-block:: python + + from smartsim import Experiment + + # Initialize the experiment and provide launcher LSF + exp = Experiment("name-of-experiment", launcher="lsf") + + # Initialize a BsubBatchSettings object + bsub_batch_settings = exp.create_batch_settings(nodes=1, time="10:00:00", batch_args={"ntasks": 1}) + # Set the account for the lsf batch job + bsub_batch_settings.set_account("12345-Cray") + # Set the partition for the lsf batch job + bsub_batch_settings.set_queue("default") + + The initialized ``BsubBatchSettings`` instance can now be passed to a SmartSim entity + (``Model`` or ``Ensemble``) via the `batch_settings` argument in ``create_batch_settings``. + + .. 
note:: + If `launcher="auto"`, SmartSim will detect that the ``Experiment`` is running on a LSF based + machine and set the launcher to `"lsf"`. + +.. warning:: + Note that initialization values provided (e.g., `nodes`, `time`, etc) will overwrite the same arguments in `batch_args` if present. \ No newline at end of file diff --git a/doc/changelog.rst b/doc/changelog.rst index 11c4a6da7..a45a30850 100644 --- a/doc/changelog.rst +++ b/doc/changelog.rst @@ -18,12 +18,15 @@ To be released at some future point in time Description +- SmartSim Documentation refactor - Update the version of Redis from `7.0.4` to `7.2.4` - Update Experiment API typing - Fix publishing of development docs Detailed Notes +- Implemented new structure of SmartSim documentation. Added examples + images and further detail of SmartSim components. - Update Redis version to `7.2.4`. This change fixes an issue in the Redis build scripts causing failures on Apple Silicon hosts. (SmartSim-PR507_) - The container which builds the documentation for every merge to develop @@ -33,6 +36,7 @@ Detailed Notes (SmartSim-PR-PR504_) - Update the generic `t.Any` typehints in Experiment API. (SmartSim-PR501_) +.. _SmartSim-PR463: https://github.com/CrayLabs/SmartSim/pull/463 .. _SmartSim-PR507: https://github.com/CrayLabs/SmartSim/pull/507 .. _SmartSim-PR504: https://github.com/CrayLabs/SmartSim/pull/504 .. _SmartSim-PR501: https://github.com/CrayLabs/SmartSim/pull/501 diff --git a/doc/conf.py b/doc/conf.py index e489fd797..0739047fc 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -52,9 +52,11 @@ 'breathe', 'nbsphinx', 'sphinx_copybutton', - 'sphinx_tabs.tabs' + 'sphinx_tabs.tabs', + 'sphinx_design', ] +autodoc_mock_imports = ["smartredis.smartredisPy"] suppress_warnings = ['autosectionlabel'] # Add any paths that contain templates here, relative to this directory. @@ -82,7 +84,6 @@ # a list of builtin themes. 
html_theme = "sphinx_book_theme" - # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". @@ -104,8 +105,31 @@ # white background with dark themes. If sphinx-tabs updates its # static/tabs.css, this may need to be updated. html_css_files = ['custom_tab_style.css'] - autoclass_content = 'both' add_module_names = False nbsphinx_execute = 'never' + +from inspect import getsourcefile + +# Get path to directory containing this file, conf.py. +DOCS_DIRECTORY = os.path.dirname(os.path.abspath(getsourcefile(lambda: 0))) + +def ensure_pandoc_installed(_): + import pypandoc + + # Download pandoc if necessary. If pandoc is already installed and on + # the PATH, the installed version will be used. Otherwise, we will + # download a copy of pandoc into docs/bin/ and add that to our PATH. + pandoc_dir = os.path.join(DOCS_DIRECTORY, "bin") + # Add dir containing pandoc binary to the PATH environment variable + if pandoc_dir not in os.environ["PATH"].split(os.pathsep): + os.environ["PATH"] += os.pathsep + pandoc_dir + pypandoc.ensure_pandoc_installed( + targetfolder=pandoc_dir, + delete_installer=True, + ) + + +def setup(app): + app.connect("builder-inited", ensure_pandoc_installed) \ No newline at end of file diff --git a/doc/ensemble.rst b/doc/ensemble.rst new file mode 100644 index 000000000..93019d18d --- /dev/null +++ b/doc/ensemble.rst @@ -0,0 +1,1214 @@ +.. _ensemble_doc: + +******** +Ensemble +******** +======== +Overview +======== +A SmartSim ``Ensemble`` enables users to run a **group** of computational tasks together in an +``Experiment`` workflow. An ``Ensemble`` is comprised of multiple ``Model`` objects, +where each ``Ensemble`` member (SmartSim ``Model``) represents an individual application. 
+An ``Ensemble`` can be managed as a single entity and +launched with other :ref:`Model's` and :ref:`Orchestrators` to construct AI-enabled workflows. + +The :ref:`Ensemble API` offers key features, including methods to: + +- :ref:`Attach Configuration Files` for use at ``Ensemble`` runtime. +- :ref:`Load AI Models` (TF, TF-lite, PT, or ONNX) into the ``Orchestrator`` at ``Ensemble`` runtime. +- :ref:`Load TorchScripts` into the ``Orchestrator`` at ``Ensemble`` runtime. +- :ref:`Prevent Data Collisions` within the ``Ensemble``, which allows for reuse of application code. + +To create a SmartSim ``Ensemble``, use the ``Experiment.create_ensemble`` API function. When +initializing an ``Ensemble``, consider one of the **three** creation strategies explained +in the :ref:`Initialization` section. + +SmartSim manages ``Ensemble`` instances through the :ref:`Experiment API` by providing functions to +launch, monitor, and stop applications. + +.. _init_ensemble_strategies: + +============== +Initialization +============== +Overview +======== +The :ref:`Experiment API` is responsible for initializing all workflow entities. +An ``Ensemble`` is created using the ``Experiment.create_ensemble`` factory method, and users can customize the +``Ensemble`` creation via the factory method parameters. + +The factory method arguments for ``Ensemble`` creation can be found in the :ref:`Experiment API` +under the ``create_ensemble`` docstring. + +By using specific combinations of the factory method arguments, users can tailor +the creation of an ``Ensemble`` to align with one of the following creation strategies: + +1. :ref:`Parameter Expansion`: Generate a variable-sized set of unique simulation instances + configured with user-defined input parameters. +2. :ref:`Replica Creation`: Generate a specified number of ``Model`` replicas. +3. :ref:`Manually`: Attach pre-configured ``Model``'s to an ``Ensemble`` to manage as a single unit. + +.. 
_param_expansion_init: + +Parameter Expansion +=================== +Parameter expansion is a technique that allows users to set parameter values per ``Ensemble`` member. +This is done by specifying input to the `params` and `perm_strategy` factory method arguments during +``Ensemble`` creation (``Experiment.create_ensemble``). Users may control how the `params` values +are applied to the ``Ensemble`` through the `perm_strategy` argument. The `perm_strategy` argument +accepts three values listed below. + +**Parameter Expansion Strategy Options:** + +- `"all_perm"`: Generate all possible parameter permutations for an exhaustive exploration. This + means that every possible combination of parameters will be used in the ``Ensemble``. +- `"step"`: Create parameter sets by collecting identically indexed values across parameter lists. + This allows for discrete combinations of parameters for ``Model``'s. +- `"random"`: Enable random selection from predefined parameter spaces, offering a stochastic approach. + This means that the parameters will be chosen randomly for each ``Model``, which can be useful + for exploring a wide range of possibilities. + +-------- +Examples +-------- +This subsection contains two examples of ``Ensemble`` parameter expansion. The +:ref:`first example` illustrates parameter expansion using two parameters +while the :ref:`second example` demonstrates parameter expansion with two +parameters along with the launch of the ``Ensemble`` as a batch workload. + +.. _param_first_ex: + +Example 1 : Parameter Expansion Using `all_perm` Strategy + + In this example an ``Ensemble`` of four ``Model`` entities is created by expanding two parameters + using the `all_perm` strategy. All of the ``Model``'s in the ``Ensemble`` share the same ``RunSettings`` + and only differ in the value of the `params` assigned to each member. The source code example + is available in the dropdown below for convenient execution and customization. + + .. 
dropdown:: Example Driver Script Source Code + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/param_expansion_1.py + + Begin by initializing a ``RunSettings`` object to apply to + all ``Ensemble`` members: + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/param_expansion_1.py + :language: python + :linenos: + :lines: 6-7 + + Next, define the parameters that will be applied to the ``Ensemble``: + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/param_expansion_1.py + :language: python + :linenos: + :lines: 9-13 + + Finally, initialize an ``Ensemble`` by specifying the ``RunSettings``, `params` and `perm_strategy="all_perm"`: + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/param_expansion_1.py + :language: python + :linenos: + :lines: 15-16 + + By specifying `perm_strategy="all_perm"`, all permutations of the `params` will + be calculated and distributed across ``Ensemble`` members. Here there are four permutations of the `params` values: + + .. code-block:: bash + + ensemble member 1: ["Ellie", 2] + ensemble member 2: ["Ellie", 11] + ensemble member 3: ["John", 2] + ensemble member 4: ["John", 11] + +.. _param_second_ex: + +Example 2 : Parameter Expansion Using `step` Strategy with the ``Ensemble`` Configured For Batch Launching + + In this example an ``Ensemble`` of two ``Model`` entities is created by expanding two parameters + using the `step` strategy. All of the ``Model``'s in the ``Ensemble`` share the same ``RunSettings`` + and only differ in the value of the `params` assigned to each member. Lastly, the ``Ensemble`` is + submitted as a batch workload. The source code example is available in the dropdown below for + convenient execution and customization. + + .. dropdown:: Example Driver Script source code + + .. 
literalinclude:: tutorials/doc_examples/ensemble_doc_examples/param_expansion_2.py + + Begin by initializing and configuring a ``BatchSettings`` object to + run the ``Ensemble`` instance: + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/param_expansion_2.py + :language: python + :linenos: + :lines: 6-8 + + The above ``BatchSettings`` object will instruct SmartSim to run the ``Ensemble`` on two + nodes with a timeout of `10 hours`. + + Next initialize a ``RunSettings`` object to apply to all ``Ensemble`` members: + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/param_expansion_2.py + :language: python + :linenos: + :lines: 10-12 + + Next, define the parameters to include in ``Ensemble``: + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/param_expansion_2.py + :language: python + :linenos: + :lines: 14-18 + + Finally, initialize an ``Ensemble`` by passing in the ``RunSettings``, `params` and `perm_strategy="step"`: + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/param_expansion_2.py + :language: python + :linenos: + :lines: 20-21 + + When specifying `perm_strategy="step"`, the `params` sets are created by collecting identically + indexed values across the `param` value lists. + + .. code-block:: bash + + ensemble member 1: ["Ellie", 2] + ensemble member 2: ["John", 11] + +.. _replicas_init: + +Replicas +======== +A replica strategy involves the creation of identical ``Model``'s within an ``Ensemble``. +This strategy is particularly useful for applications that have some inherent randomness. +Users may use the `replicas` factory method argument to create a specified number of identical +``Model`` members during ``Ensemble`` creation (``Experiment.create_ensemble``). + +-------- +Examples +-------- +This subsection contains two examples of using the replicas creation strategy. 
The +:ref:`first example` illustrates creating four ``Ensemble`` member clones +while the :ref:`second example` demonstrates creating four ``Ensemble`` +member clones along with the launch of the ``Ensemble`` as a batch workload. + +.. _replicas_first_ex: + +Example 1 : ``Ensemble`` creation with replicas strategy + + In this example an ``Ensemble`` of four identical ``Model`` members is created by + specifying the number of clones to create via the `replicas` argument. + All of the ``Model``'s in the ``Ensemble`` share the same ``RunSettings``. + The source code example is available in the dropdown below for convenient execution + and customization. + + .. dropdown:: Example Driver Script Source Code + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/replicas_1.py + + To create an ``Ensemble`` of identical ``Model``'s, begin by initializing a ``RunSettings`` + object: + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/replicas_1.py + :language: python + :linenos: + :lines: 6-7 + + Initialize the ``Ensemble`` by specifying the ``RunSettings`` object and number of clones to `replicas`: + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/replicas_1.py + :language: python + :linenos: + :lines: 9-10 + + By passing in `replicas=4`, four identical ``Ensemble`` members will be initialized. + +.. _replicas_second_ex: + +Example 2 : ``Ensemble`` Creation with Replicas Strategy and ``Ensemble`` Batch Launching + + In this example an ``Ensemble`` of four ``Model`` entities is created by specifying + the number of clones to create via the `replicas` argument. All of the ``Model``'s in + the ``Ensemble`` share the same ``RunSettings`` and the ``Ensemble`` is + submitted as a batch workload. The source code example is available in the dropdown below for + convenient execution and customization. + + .. dropdown:: Example Driver Script Source Code + + .. 
literalinclude:: tutorials/doc_examples/ensemble_doc_examples/replicas_2.py + + To launch the ``Ensemble`` of identical ``Model``'s as a batch job, begin by initializing a ``BatchSettings`` + object: + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/replicas_2.py + :language: python + :linenos: + :lines: 6-9 + + The above ``BatchSettings`` object will instruct SmartSim to run the ``Ensemble`` on four + nodes with a timeout of `10 hours`. + + Next, create a ``RunSettings`` object to apply to all ``Model`` replicas: + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/replicas_2.py + :language: python + :linenos: + :lines: 10-12 + + Initialize the ``Ensemble`` by specifying the ``RunSettings`` object, ``BatchSettings`` object + and number of clones to `replicas`: + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/replicas_2.py + :language: python + :linenos: + :lines: 14-15 + + By passing in `replicas=4`, four identical ``Ensemble`` members will be initialized. + +.. _append_init: + +Manually Append +=============== +Manually appending ``Model``'s to an ``Ensemble`` offers an in-depth level of customization in ``Ensemble`` design. +This approach is favorable when users have distinct requirements for individual ``Model``'s, such as variations +in parameters, run settings, or different types of simulations. + +-------- +Examples +-------- +This subsection contains an example of creating an ``Ensemble`` by manually appending ``Model``'s. +The example illustrates attaching two SmartSim ``Model``'s to the ``Ensemble``. +The ``Ensemble`` is submitted as a batch workload. + +Example 1 : Append ``Model``'s to an ``Ensemble`` and Launch as a Batch Job + + In this example, we append ``Model``'s to an ``Ensemble`` for batch job execution. To do + this, we first initialize an Ensemble with a ``BatchSettings`` object. Then, manually + create ``Model``'s and add each to the ``Ensemble`` using the ``Ensemble.add_model`` function. 
+ The source code example is available in the dropdown below for convenient execution and customization. + + .. dropdown:: Example Driver Script Source Code + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/manual_append_ensemble.py + + To create an empty ``Ensemble`` to append ``Model``'s, initialize the ``Ensemble`` with + a batch settings object: + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/manual_append_ensemble.py + :language: python + :linenos: + :lines: 6-11 + + Next, create the ``Model``'s to append to the ``Ensemble``: + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/manual_append_ensemble.py + :language: python + :linenos: + :lines: 13-20 + + Finally, append the ``Model`` objects to the ``Ensemble``: + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/manual_append_ensemble.py + :language: python + :linenos: + :lines: 22-25 + + The new ``Ensemble`` is comprised of two appended ``Model`` members. + +.. _attach_files_ensemble: + +===== +Files +===== +Overview +======== +``Ensemble`` members often depend on external files (e.g. training datasets, evaluation datasets, etc) +to operate as intended. Users can instruct SmartSim to copy, symlink, or manipulate external files +prior to an ``Ensemble`` launch via the ``Ensemble.attach_generator_files`` function. Attached files +will be applied to all ``Ensemble`` members. + +.. note:: + Multiple calls to ``Ensemble.attach_generator_files`` will overwrite previous file configurations + on the ``Ensemble``. + +To attach a file to an ``Ensemble`` for use at runtime, provide one of the following arguments to the +``Ensemble.attach_generator_files`` function: + +* `to_copy` (t.Optional[t.List[str]] = None): Files that are copied into the path of the ``Ensemble`` members. +* `to_symlink` (t.Optional[t.List[str]] = None): Files that are symlinked into the path of the ``Ensemble`` members. 
+ A symlink, or symbolic link, is a file that points to another file or directory, allowing you to access that file + as if it were located in the same directory as the symlink. + +To specify a template file in order to programmatically replace specified parameters during generation +of ``Ensemble`` member directories, pass the following value to the ``Ensemble.attach_generator_files`` function: + +* `to_configure` (t.Optional[t.List[str]] = None): This parameter is designed for text-based ``Ensemble`` + member input files. During directory generation for ``Ensemble`` members, the linked files are parsed and replaced with + the `params` values applied to each ``Ensemble`` member. To further explain, the ``Ensemble`` + creation strategy is considered when replacing the tagged parameters in the input files. + These tagged parameters are placeholders in the text that are replaced with the actual + parameter values during the directory generation process. The default tag is a semicolon + (e.g., THERMO = ;THERMO;). + +In the :ref:`Example` subsection, we provide an example using the value `to_configure` +within ``Ensemble.attach_generator_files``. + +.. seealso:: + To add a file to a single ``Model`` that will be appended to an ``Ensemble``, refer to the :ref:`Files` + section of the ``Model`` documentation. + +.. _files_example_doc_ensem: + +Example +======= +This example demonstrates how to attach a text file to an ``Ensemble`` for parameter replacement. +This is accomplished using the `params` function parameter in +the ``Experiment.create_ensemble`` factory function and the `to_configure` function parameter +in ``Ensemble.attach_generator_files``. The source code example is available in the dropdown below for +convenient execution and customization. + +.. dropdown:: Example Driver Script Source Code + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/file_attach.py + +In this example, we have a text file named `params_inputs.txt`. 
Within the text is the parameter `THERMO` +that is required by each ``Ensemble`` member at runtime: + +.. code-block:: bash + + THERMO = ;THERMO; + +In order to have the tagged parameter `;THERMO;` replaced with a usable value at runtime, two steps are required: + +1. The `THERMO` variable must be included in the ``Experiment.create_ensemble`` factory method as + part of the `params` parameter. +2. The file containing the tagged parameter `;THERMO;`, `params_inputs.txt`, must be attached to the ``Ensemble`` + via the ``Ensemble.attach_generator_files`` method as part of the `to_configure` parameter. + +To encapsulate our application within an ``Ensemble``, we must create an ``Experiment`` instance +to gain access to the ``Experiment`` factory method that creates the ``Ensemble``. +Begin by importing the ``Experiment`` module and initializing an ``Experiment``: + +.. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/file_attach.py + :language: python + :linenos: + :lines: 1-4 + +To create our ``Ensemble``, we are using the `replicas` initialization strategy. +Begin by creating a simple ``RunSettings`` object to specify the path to +the executable simulation as an executable: + +.. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/file_attach.py + :language: python + :linenos: + :lines: 6-7 + +Next, initialize an ``Ensemble`` object with ``Experiment.create_ensemble`` +by passing in `ensemble_settings`, `params={"THERMO":1}` and `replicas=2`: + +.. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/file_attach.py + :language: python + :linenos: + :lines: 9-10 + +We now have an ``Ensemble`` instance named `example_ensemble`. Attach the above text file +to the ``Ensemble`` for use at entity runtime. To do so, we use the +``Ensemble.attach_generator_files`` function and specify the `to_configure` +parameter with the path to the text file, `params_inputs.txt`: + +.. 
literalinclude:: tutorials/doc_examples/ensemble_doc_examples/file_attach.py + :language: python + :linenos: + :lines: 12-13 + +To create an isolated directory for the ``Ensemble`` member outputs and configuration files, invoke ``Experiment.generate`` via the +``Experiment`` instance `exp` with `example_ensemble` as an input parameter: + +.. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/file_attach.py + :language: python + :linenos: + :lines: 15-16 + +After invoking ``Experiment.generate``, the attached generator files will be available for the +application when ``exp.start(example_ensemble)`` is called. + +.. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/file_attach.py + :language: python + :linenos: + :lines: 18-19 + +The contents of `params_inputs.txt` after ``Ensemble`` completion are: + +.. code-block:: bash + + THERMO = 1 + +.. _ensemble_ml_model_script: + +===================== +ML Models and Scripts +===================== +Overview +======== +SmartSim users have the capability to load ML models and TorchScripts into an ``Orchestrator`` +within the ``Experiment`` script for use within ``Ensemble`` members. Functions +accessible through an ``Ensemble`` object support loading ML models (TensorFlow, TensorFlow-lite, +PyTorch, and ONNX) and TorchScripts into standalone or colocated ``Orchestrators`` before +application runtime. + +.. seealso:: + To add an ML model or TorchScript to a single ``Model`` that will be appended to an + ``Ensemble``, refer to the :ref:`ML Models and Scripts` + section of the ``Model`` documentation. + +Depending on the planned storage method of the **ML model**, there are **two** distinct +approaches to load it into the ``Orchestrator``: + +- :ref:`From Memory` +- :ref:`From File` + +.. warning:: + Uploading an ML model :ref:`from memory` is solely supported for + standalone ``Orchestrators``. 
To upload an ML model to a colocated ``Orchestrator``, users + must save the ML model to disk and upload :ref:`from file`. + +Depending on the planned storage method of the **TorchScript**, there are **three** distinct +approaches to load it into the ``Orchestrator``: + +- :ref:`From Memory` +- :ref:`From File` +- :ref:`From String` + +.. warning:: + Uploading a TorchScript :ref:`from memory` is solely supported for + standalone ``Orchestrators``. To upload a TorchScript to a colocated ``Orchestrator``, users + must upload :ref:`from file` or :ref:`from string`. + +Once an ML model or TorchScript is loaded into the ``Orchestrator``, ``Ensemble`` members can +leverage ML capabilities by utilizing the SmartSim client (:ref:`SmartRedis`) +to execute the stored ML models or TorchScripts. + +.. _ai_model_ensemble_doc: + +AI Models +========= +When configuring an ``Ensemble``, users can instruct SmartSim to load +Machine Learning (ML) models dynamically to the ``Orchestrator`` (colocated or standalone). ML models added +are loaded into the ``Orchestrator`` prior to the execution of the ``Ensemble``. To load an ML model +to the ``Orchestrator``, SmartSim users can serialize and provide the ML model **in-memory** or specify the **file path** +via the ``Ensemble.add_ml_model`` function. The supported ML frameworks are TensorFlow, +TensorFlow-lite, PyTorch, and ONNX. + +Users must **serialize TensorFlow ML models** before sending to an ``Orchestrator`` from memory +or from file. To save a TensorFlow model to memory, SmartSim offers the ``serialize_model`` +function. This function returns the TF model as a byte string with the names of the +input and output layers, which will be required upon uploading. To save a TF model to disk, +SmartSim offers the ``freeze_model`` function which returns the path to the serialized +TF model file with the names of the input and output layers. 
Additional TF model serialization +information and examples can be found in the :ref:`ML Features` section of SmartSim. + +.. note:: + Uploading an ML model from memory is only supported for standalone ``Orchestrators``. + +When attaching an ML model using ``Ensemble.add_ml_model``, the +following arguments are offered to customize storage and execution: + +- `name` (str): name to reference the ML model in the ``Orchestrator``. +- `backend` (str): name of the backend (TORCH, TF, TFLITE, ONNX). +- `model` (t.Optional[str] = None): An ML model in memory (only supported for non-colocated ``Orchestrators``). +- `model_path` (t.Optional[str] = None): serialized ML model. +- `device` (t.Literal["CPU", "GPU"] = "CPU"): name of device for execution, defaults to “CPU”. +- `devices_per_node` (int = 1): The number of GPU devices available on the host. This parameter only applies to GPU devices and will be ignored if device is specified as CPU. +- `first_device` (int = 0): The first GPU device to use on the host. This parameter only applies to GPU devices and will be ignored if device is specified as CPU. +- `batch_size` (int = 0): batch size for execution, defaults to 0. +- `min_batch_size` (int = 0): minimum batch size for ML model execution, defaults to 0. +- `min_batch_timeout` (int = 0): time to wait for minimum batch size, defaults to 0. +- `tag` (str = ""): additional tag for ML model information, defaults to “”. +- `inputs` (t.Optional[t.List[str]] = None): ML model inputs (TF only), defaults to None. +- `outputs` (t.Optional[t.List[str]] = None): ML model outputs (TF only), defaults to None. + +.. seealso:: + To add an ML model to a single ``Model`` that will be appended to an + ``Ensemble``, refer to the :ref:`AI Models` + section of the ``Model`` documentation. + +.. 
_in_mem_ML_model_ensemble_ex: + +------------------------------------- +Example: Attach an In-Memory ML Model +------------------------------------- +This example demonstrates how to attach an in-memory ML model to a SmartSim ``Ensemble`` +to load into an ``Orchestrator`` at ``Ensemble`` runtime. The source code example is +available in the dropdown below for convenient execution and customization. + +.. dropdown:: Experiment Driver Script Source Code + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/ensemble_ml_model_mem.py + +.. note:: + This example assumes: + + - an ``Orchestrator`` is launched prior to the ``Ensemble`` execution + - an initialized ``Ensemble`` named `ensemble_instance` exists within the ``Experiment`` workflow + - a Tensorflow-based ML model was serialized using ``serialize_model`` which returns the + ML model as a byte string with the names of the input and output layers + +**Attach the ML Model to a SmartSim Ensemble** + +In this example, we have a serialized Tensorflow-based ML model that was saved to a byte string stored under `model`. +Additionally, the ``serialize_model`` function returned the names of the input and output layers stored under +`inputs` and `outputs`. Assuming an initialized ``Ensemble`` named `ensemble_instance` exists, we add the byte string TensorFlow model using +``Ensemble.add_ml_model``: + +.. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/ensemble_ml_model_mem.py + :language: python + :linenos: + :lines: 39-40 + +In the above ``ensemble_instance.add_ml_model`` code snippet, we offer the following arguments: + +- `name` ("cnn"): A name to reference the ML model in the ``Orchestrator``. +- `backend` ("TF"): Indicating that the ML model is a TensorFlow model. +- `model` (model): The in-memory representation of the TensorFlow model. +- `device` ("GPU"): Specifying the device for ML model execution. +- `devices_per_node` (2): Use two GPUs per node. 
+
+- `first_device` (0): Start with 0 index GPU.
+- `inputs` (inputs): The name of the ML model input nodes (TensorFlow only).
+- `outputs` (outputs): The name of the ML model output nodes (TensorFlow only).
+
+.. warning::
+    Calling `exp.start(ensemble_instance)` prior to the launch of an ``Orchestrator`` will result in
+    a failed attempt to load the ML model to a non-existent standalone ``Orchestrator``.
+
+When the ``Ensemble`` is started via ``Experiment.start``, the ML model will be loaded to the
+launched standalone ``Orchestrator``. The ML model can then be executed on the ``Orchestrator`` via a SmartSim
+client (:ref:`SmartRedis`) within the application code.
+
+.. _from_file_ML_model_ensemble_ex:
+
+-------------------------------------
+Example: Attach an ML Model From File
+-------------------------------------
+This example demonstrates how to attach an ML model from file to a SmartSim ``Ensemble``
+to load into an ``Orchestrator`` at ``Ensemble`` runtime. The source code example is
+available in the dropdown below for convenient execution and customization.
+
+.. dropdown:: Experiment Driver Script Source Code
+
+    .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/ensemble_ml_model_file.py
+
+.. note::
+    This example assumes:
+
+    - a standalone ``Orchestrator`` is launched prior to ``Ensemble`` execution
+    - an initialized ``Ensemble`` named `ensemble_instance` exists within the ``Experiment`` workflow
+    - a Tensorflow-based ML model was serialized using ``freeze_model`` which returns the
+      path to the serialized model file and the names of the input and output layers
+
+**Attach the ML Model to a SmartSim Ensemble**
+
+In this example, we have a serialized Tensorflow-based ML model that was saved to disk and stored under `model`.
+Additionally, the ``freeze_model`` function returned the names of the input and output layers stored under
+`inputs` and `outputs`. 
Assuming an initialized ``Ensemble`` named `ensemble_instance` exists, we add a TensorFlow model using +the ``Ensemble.add_ml_model`` function and specify the ML model path to the parameter `model_path`: + +.. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/ensemble_ml_model_file.py + :language: python + :linenos: + :lines: 39-40 + +In the above ``ensemble_instance.add_ml_model`` code snippet, we offer the following arguments: + +- `name` ("cnn"): A name to reference the ML model in the ``Orchestrator``. +- `backend` ("TF"): Indicating that the ML model is a TensorFlow model. +- `model_path` (model_file): The path to the ML model script. +- `device` ("GPU"): Specifying the device for ML model execution. +- `devices_per_node` (2): Use two GPUs per node. +- `first_device` (0): Start with 0 index GPU. +- `inputs` (inputs): The name of the ML model input nodes (TensorFlow only). +- `outputs` (outputs): The name of the ML model output nodes (TensorFlow only). + +.. warning:: + Calling `exp.start(ensemble_instance)` prior to instantiation of an ``Orchestrator`` will result in + a failed attempt to load the ML model to a non-existent ``Orchestrator``. + +When the ``Ensemble`` is started via ``Experiment.start``, the ML model will be loaded to the +launched ``Orchestrator``. The ML model can then be executed on the ``Orchestrator`` via a SmartSim +client (:ref:`SmartRedis`) within the application executable. + +.. _TS_ensemble_doc: + +TorchScripts +============ +When configuring an ``Ensemble``, users can instruct SmartSim to load TorchScripts dynamically +to the ``Orchestrator``. The TorchScripts become available for each ``Ensemble`` member upon being loaded +into the ``Orchestrator`` prior to the execution of the ``Ensemble``. SmartSim users may upload +a single TorchScript function via ``Ensemble.add_function`` or alternatively upload a script +containing multiple functions via ``Ensemble.add_script``. 
To load a TorchScript to the +``Orchestrator``, SmartSim users can follow one of the following processes: + +- :ref:`Define a TorchScript Function In-Memory` + Use the ``Ensemble.add_function`` to instruct SmartSim to load an in-memory TorchScript to the ``Orchestrator``. +- :ref:`Define Multiple TorchScript Functions From File` + Provide file path to ``Ensemble.add_script`` to instruct SmartSim to load the TorchScript from file to the ``Orchestrator``. +- :ref:`Define a TorchScript Function as String` + Provide function string to ``Ensemble.add_script`` to instruct SmartSim to load a raw string as a TorchScript function to the ``Orchestrator``. + +.. note:: + Uploading a TorchScript :ref:`from memory` using ``Ensemble.add_function`` + is only supported for standalone ``Orchestrators``. Users uploading + TorchScripts to colocated ``Orchestrators`` should instead use the function ``Ensemble.add_script`` + to upload :ref:`from file` or as a :ref:`string`. + +Each function also provides flexible device selection, allowing users to choose between which device the TorchScript is executed on, `"GPU"` or `"CPU"`. +In environments with multiple devices, specific device numbers can be specified using the +`devices_per_node` parameter. + +.. note:: + If `device=GPU` is specified when attaching a TorchScript function to an ``Ensemble``, this instructs + SmartSim to execute the TorchScript on GPU nodes. However, TorchScripts loaded to an ``Orchestrator`` are + executed on the ``Orchestrator`` compute resources. Therefore, users must make sure that the device + specified is included in the ``Orchestrator`` compute resources. To further explain, if a user + specifies `device=GPU`, however, initializes ``Orchestrator`` on only CPU nodes, + the TorchScript will not run on GPU nodes as advised. 
+ +Continue or select the respective process link to learn more on how each function (``Ensemble.add_script`` and ``Ensemble.add_function``) +dynamically loads TorchScripts to the ``Orchestrator``. + +.. seealso:: + To add a TorchScript to a single ``Model`` that will be appended to an + ``Ensemble``, refer to the :ref:`TorchScripts` + section of the ``Model`` documentation. + +.. _in_mem_TF_ensemble_doc: + +------------------------------- +Attach an In-Memory TorchScript +------------------------------- +Users can define TorchScript functions within the ``Experiment`` driver script +to attach to an ``Ensemble``. This feature is supported by ``Ensemble.add_function``. + +.. warning:: + ``Ensemble.add_function`` does **not** support loading in-memory TorchScript functions to a colocated ``Orchestrator``. + If you would like to load a TorchScript function to a colocated ``Orchestrator``, define the function + as a :ref:`raw string` or :ref:`load from file`. + +When specifying an in-memory TF function using ``Ensemble.add_function``, the +following arguments are offered: + +- `name` (str): reference name for the script inside of the ``Orchestrator``. +- `function` (t.Optional[str] = None): TorchScript function code. +- `device` (t.Literal["CPU", "GPU"] = "CPU"): device for script execution, defaults to “CPU”. +- `devices_per_node` (int = 1): The number of GPU devices available on the host. This parameter only applies to GPU devices and will be ignored if device is specified as CPU. +- `first_device` (int = 0): The first GPU device to use on the host. This parameter only applies to GPU devices and will be ignored if device is specified as CPU. + +.. _in_mem_TF_ex: + +Example: Load a In-Memory TorchScript Function +---------------------------------------------- +This example walks through the steps of instructing SmartSim to load an in-memory TorchScript function +to a standalone ``Orchestrator``. 
The source code example is available in the dropdown below for +convenient execution and customization. + +.. dropdown:: Experiment Driver Script Source Code + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/ensemble_torchscript_mem.py + +.. note:: + The example assumes: + + - a standalone ``Orchestrator`` is launched prior to ``Ensemble`` execution + - an initialized ``Ensemble`` named `ensemble_instance` exists within the ``Experiment`` workflow + +**Define an In-Memory TF Function** + +To begin, define an in-memory TorchScript function within the Python driver script. +For the purpose of the example, we add a simple TorchScript function, `timestwo`: + +.. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/ensemble_torchscript_mem.py + :language: python + :linenos: + :lines: 3-4 + +**Attach the In-Memory TorchScript Function to a SmartSim Ensemble** + +We use the ``Ensemble.add_function`` function to instruct SmartSim to load the TorchScript function `timestwo` +onto the launched standalone ``Orchestrator``. Specify the function `timestwo` to the `function` +parameter: + +.. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/ensemble_torchscript_mem.py + :language: python + :linenos: + :lines: 15-16 + +In the above ``ensemble_instance.add_function`` code snippet, we offer the following arguments: + +- `name` ("example_func"): A name to uniquely identify the TorchScript within the ``Orchestrator``. +- `function` (timestwo): Name of the TorchScript function defined in the Python driver script. +- `device` ("GPU"): Specifying the device for TorchScript execution. +- `devices_per_node` (2): Use two GPUs per node. +- `first_device` (0): Start with 0 index GPU. + +.. warning:: + Calling `exp.start(ensemble_instance)` prior to instantiation of an ``Orchestrator`` will result in + a failed attempt to load the TorchScript to a non-existent ``Orchestrator``. 
+ +When the ``Ensemble`` is started via ``Experiment.start``, the TF function will be loaded to the +standalone ``Orchestrator``. The function can then be executed on the ``Orchestrator`` via a SmartSim +client (:ref:`SmartRedis`) within the application code. + +.. _TS_from_file_ensemble: + +------------------------------ +Attach a TorchScript From File +------------------------------ +Users can attach TorchScript functions from a file to an ``Ensemble`` and upload them to a +colocated or standalone ``Orchestrator``. This functionality is supported by the ``Ensemble.add_script`` +function's `script_path` parameter. + +When specifying a TorchScript using ``Ensemble.add_script``, the +following arguments are offered: + +- `name` (str): Reference name for the script inside of the ``Orchestrator``. +- `script` (t.Optional[str] = None): TorchScript code (only supported for non-colocated ``Orchestrators``). +- `script_path` (t.Optional[str] = None): path to TorchScript code. +- `device` (t.Literal["CPU", "GPU"] = "CPU"): device for script execution, defaults to “CPU”. +- `devices_per_node` (int = 1): The number of GPU devices available on the host. This parameter only applies to GPU devices and will be ignored if device is specified as CPU. +- `first_device` (int = 0): The first GPU device to use on the host. This parameter only applies to GPU devices and will be ignored if device is specified as CPU. + +Example: Loading a TorchScript From File +---------------------------------------- +This example walks through the steps of instructing SmartSim to load a TorchScript from file +to an ``Orchestrator``. The source code example is available in the dropdown below for +convenient execution and customization. + +.. dropdown:: Experiment Driver Script Source Code + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/ensemble_torchscript_file.py + +.. 
note::
+    This example assumes:
+
+    - an ``Orchestrator`` is launched prior to ``Ensemble`` execution
+    - an initialized ``Ensemble`` named `ensemble_instance` exists within the ``Experiment`` workflow
+
+**Define a TorchScript Script**
+
+For the example, we create the Python script `torchscript.py`. The file contains multiple
+simple torch functions, shown below:
+
+.. code-block:: python
+
+    def negate(x):
+        return torch.neg(x)
+
+    def random(x, y):
+        return torch.randn(x, y)
+
+    def pos(z):
+        return torch.positive(z)
+
+**Attach the TorchScript Script to a SmartSim Ensemble**
+
+Assuming an initialized ``Ensemble`` named `ensemble_instance` exists, we add a TorchScript script using
+the ``Ensemble.add_script`` function and specify the script path to the parameter `script_path`:
+
+.. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/ensemble_torchscript_file.py
+    :language: python
+    :linenos:
+    :lines: 12-13
+
+In the above ``ensemble_instance.add_script`` code snippet, we offer the following arguments:
+
+- `name` ("example_script"): Reference name for the script inside of the ``Orchestrator``.
+- `script_path` ("path/to/torchscript.py"): Path to the script file.
+- `device` ("GPU"): device for script execution.
+- `devices_per_node` (2): Use two GPUs per node.
+- `first_device` (0): Start with 0 index GPU.
+
+.. warning::
+    Calling `exp.start(ensemble_instance)` prior to instantiation of an ``Orchestrator`` will result in
+    a failed attempt to load the TorchScript to a non-existent ``Orchestrator``.
+
+When `ensemble_instance` is started via ``Experiment.start``, the TorchScript will be loaded from file to the
+``Orchestrator`` that is launched prior to the start of `ensemble_instance`.
+
+.. _TS_raw_string_ensemble:
+
+---------------------------------
+Define TorchScripts as Raw String
+---------------------------------
+Users can upload TorchScript functions from string to send to a colocated or
+standalone ``Orchestrator``. 
This feature is supported by the +``Ensemble.add_script`` function's `script` parameter. + +When specifying a TorchScript using ``Ensemble.add_script``, the +following arguments are offered: + +- `name` (str): Reference name for the script inside of the ``Orchestrator``. +- `script` (t.Optional[str] = None): String of function code (e.g. TorchScript code string). +- `script_path` (t.Optional[str] = None): path to TorchScript code. +- `device` (t.Literal["CPU", "GPU"] = "CPU"): device for script execution, defaults to “CPU”. +- `devices_per_node` (int = 1): The number of GPU devices available on the host. This parameter only applies to GPU devices and will be ignored if device is specified as CPU. +- `first_device` (int = 0): The first GPU device to use on the host. This parameter only applies to GPU devices and will be ignored if device is specified as CPU. + +Example: Load a TorchScript From String +--------------------------------------- +This example walks through the steps of instructing SmartSim to load a TorchScript function +from string to an ``Orchestrator`` before the execution of the associated ``Ensemble``. +The source code example is available in the dropdown below for convenient execution and customization. + +.. dropdown:: Experiment Driver Script Source Code + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/ensemble_torchscript_string.py + +.. note:: + This example assumes: + + - an ``Orchestrator`` is launched prior to ``Ensemble`` execution + - an initialized ``Ensemble`` named `ensemble_instance` exists within the ``Experiment`` workflow + +**Define a String TorchScript** + +Define the TorchScript code as a variable in the Python driver script: + +.. 
literalinclude:: tutorials/doc_examples/ensemble_doc_examples/ensemble_torchscript_string.py
+    :language: python
+    :linenos:
+    :lines: 12-13
+
+**Attach the TorchScript Function to a SmartSim Ensemble**
+
+Assuming an initialized ``Ensemble`` named `ensemble_instance` exists, we add a TorchScript using
+the ``Ensemble.add_script`` function and specify the variable `torch_script_str` to the parameter
+`script`:
+
+.. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/ensemble_torchscript_string.py
+    :language: python
+    :linenos:
+    :lines: 15-16
+
+In the above ``ensemble_instance.add_script`` code snippet, we offer the following arguments:
+
+- `name` ("example_script"): key to store script under.
+- `script` (torch_script_str): TorchScript code.
+- `device` ("GPU"): device for script execution.
+- `devices_per_node` (2): Use two GPUs per node.
+- `first_device` (0): Start with 0 index GPU.
+
+.. warning::
+    Calling `exp.start(ensemble_instance)` prior to instantiation of an ``Orchestrator`` will result in
+    a failed attempt to load the TorchScript to a non-existent ``Orchestrator``.
+
+When the ``Ensemble`` is started via ``Experiment.start``, the TorchScript will be loaded to the
+``Orchestrator`` that is launched prior to the start of the ``Ensemble``.
+
+.. _prefix_ensemble:
+
+=========================
+Data Collision Prevention
+=========================
+Overview
+========
+When multiple ``Ensemble`` members use the same code to send and access their respective data
+in the ``Orchestrator``, key overlapping can occur, leading to inadvertent data access
+between ``Ensemble`` members. To address this, SmartSim supports key prefixing
+through ``Ensemble.enable_key_prefixing`` which enables key prefixing for all
+``Ensemble`` members. For example, during an ``Ensemble`` simulation with prefixing enabled, SmartSim will add
+the ``Ensemble`` member `name` as a prefix to the keys sent to the ``Orchestrator``. 
+Enabling key prefixing eliminates issues related to key overlapping, allowing ``Ensemble`` +members to use the same code without issue. + +The key components of SmartSim ``Ensemble`` prefixing functionality include: + +1. **Sending Data to the Orchestrator**: Users can send data to an ``Orchestrator`` + with the ``Ensemble`` member name prepended to the data name by utilizing SmartSim :ref:`Ensemble functions`. +2. **Retrieving Data From the Orchestrator**: Users can instruct a ``Client`` to prepend a + ``Ensemble`` member name to a key during data retrieval, polling, or check for existence on the ``Orchestrator`` + through SmartRedis :ref:`Client functions`. However, entity interaction + must be registered using :ref:`Ensemble` or :ref:`Model` functions. + +.. seealso:: + For information on prefixing ``Client`` functions, visit the :ref:`Client functions` page of the ``Model`` + documentation. + +For example, assume you have an ``Ensemble`` that was initialized using the :ref:`replicas` creation strategy. +Two identical ``Model`` were created named `ensemble_0` and `ensemble_1` that use the same executable application +within an ``Ensemble`` named `ensemble`. In the application code you use the function ``Client.put_tensor("tensor_0", data)``. +Without key prefixing enabled, the slower member will overwrite the data from the faster simulation. +With ``Ensemble`` key prefixing turned on, `ensemble_0` and `ensemble_1` can access +their tensor `"tensor_0"` by name without overwriting or accessing the other ``Model``'s `"tensor_0"` tensor. +In this scenario, the two tensors placed in the ``Orchestrator`` are named `ensemble_0.tensor_0` and `ensemble_1.tensor_0`. + +.. _model_prefix_func_ensemble: + +------------------ +Ensemble Functions +------------------ +An ``Ensemble`` object supports two prefixing functions: ``Ensemble.enable_key_prefixing`` and +``Ensemble.register_incoming_entity``. 
For more information on each function, reference the
+:ref:`Ensemble API docs`.
+
+To enable prefixing on an ``Ensemble``, users must use the ``Ensemble.enable_key_prefixing``
+function in the ``Experiment`` driver script. This function activates prefixing for tensors,
+``Datasets``, and lists sent to an ``Orchestrator`` for all ``Ensemble`` members. This function
+also enables access to prefixing ``Client`` functions within the ``Ensemble`` members. This excludes
+the ``Client.set_data_source`` function, where ``enable_key_prefixing`` is not required for access.
+
+.. note::
+    ML model and script prefixing is not automatically enabled through ``Ensemble.enable_key_prefixing``.
+    Prefixing must be enabled within the ``Ensemble`` by calling the ``use_model_ensemble_prefix`` method
+    on the ``Client`` embedded within the member application.
+
+Users can enable the SmartRedis ``Client`` to interact with prefixed data, ML models and TorchScripts
+using the ``Client.set_data_source``. However, for SmartSim to recognize the producer entity name
+passed to the function within an application, the producer entity must be registered on the consumer
+entity using ``Ensemble.register_incoming_entity``.
+
+If a consumer ``Ensemble`` member requests data sent to the ``Orchestrator`` by other ``Ensemble`` members, the producer members must be
+registered on the consumer member. To access ``Ensemble`` members, SmartSim offers the attribute ``Ensemble.models`` that returns
+a list of ``Ensemble`` members. Below we demonstrate registering producer members on a consumer member:
+
+.. 
code-block:: python + + # list of producer Ensemble members + list_of_ensemble_names = ["producer_0", "producer_1", "producer_2"] + + # Grab the consumer Ensemble member + ensemble_member = ensemble.models.get("producer_3") + # Register the producer members on the consumer member + for name in list_of_ensemble_names: + ensemble_member.register_incoming_entity(ensemble.models.get(name)) + +For examples demonstrating how to retrieve data within the entity application that produced +the data, visit the ``Model`` :ref:`Copy/Rename/Delete Operations` subsection. + +Example: Ensemble Key Prefixing +=============================== +In this example, we create an ``Ensemble`` comprised of two ``Model``'s that use identical code +to send data to a standalone ``Orchestrator``. To prevent key collisions and ensure data +integrity, we enable key prefixing on the ``Ensemble`` which automatically +appends the ``Ensemble`` member `name` to the data sent to the ``Orchestrator``. After the +``Ensemble`` completes, we launch a consumer ``Model`` within the ``Experiment`` driver script +to demonstrate accessing prefixed data sent to the ``Orchestrator`` by ``Ensemble`` members. + +This example consists of **three** Python scripts: + +1. :ref:`Application Producer Script`: This script is encapsulated + in a SmartSim ``Ensemble`` within the ``Experiment`` driver script. Prefixing is enabled + on the ``Ensemble``. The producer script puts NumPy tensors on an ``Orchestrator`` + launched in the ``Experiment`` driver script. The ``Ensemble`` creates two + identical ``Ensemble`` members. The producer script is executed + in both ``Ensemble`` members to send two prefixed tensors to the ``Orchestrator``. + The source code example is available in the dropdown below for convenient customization. + +.. dropdown:: Application Producer Script Source Code + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/application_producer_script.py + +1. 
:ref:`Application Consumer Script`: This script is encapsulated + within a SmartSim ``Model`` in the ``Experiment`` driver script. The script requests the + prefixed tensors placed by the producer script. The source code example is available in + the dropdown below for convenient customization. + +.. dropdown:: Application Consumer Script Source Code + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/application_consumer_script.py + +1. :ref:`Experiment Driver Script`: The driver script launches the + ``Orchestrator``, the ``Ensemble`` (which sends prefixed keys to the ``Orchestrator``), + and the ``Model`` (which requests prefixed keys from the ``Orchestrator``). The + ``Experiment`` driver script is the centralized spot that controls the workflow. + The source code example is available in the dropdown below for convenient execution and + customization. + +.. dropdown:: Experiment Driver Script Source Code + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/experiment_driver.py + +.. _app_prod_prefix_ensemble: + +------------------------------- +The Application Producer Script +------------------------------- +In the ``Experiment`` driver script, we instruct SmartSim to create an ``Ensemble`` comprised of +two duplicate members that execute this producer script. In the producer script, a SmartRedis ``Client`` sends a +tensor to the ``Orchestrator``. Since the ``Ensemble`` members are identical and therefore use the same +application code, two tensors are sent to the ``Orchestrator``. Without prefixing enabled on the ``Ensemble`` +the keys can be overwritten. To prevent this, we enable key prefixing on the ``Ensemble`` in the driver script +via ``Ensemble.enable_key_prefixing``. When the producer script is executed by each ``Ensemble`` member, a +tensor is sent to the ``Orchestrator`` with the ``Ensemble`` member `name` prepended to the tensor `name`. 
+ +Here we provide the producer script that is applied to the ``Ensemble`` members: + +.. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/application_producer_script.py + :language: python + :linenos: + +After the completion of ``Ensemble`` members `producer_0` and `producer_1`, the contents of the ``Orchestrator`` are: + +.. code-block:: bash + + 1) "producer_0.tensor" + 2) "producer_1.tensor" + +.. _app_con_prefix_ensemble: + +------------------------------- +The Application Consumer Script +------------------------------- +In the ``Experiment`` driver script, we initialize a consumer ``Model`` that encapsulates +the consumer application to request the tensors produced from the ``Ensemble``. To do +so, we use SmartRedis key prefixing functionality to instruct the SmartRedis ``Client`` +to append the name of an ``Ensemble`` member to the key `name`. + +.. seealso:: + For more information on ``Client`` prefixing functions, visit the :ref:`Client functions` + subsection of the ``Model`` documentation. + +To begin, specify the imports and initialize a SmartRedis ``Client``: + +.. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/application_consumer_script.py + :language: python + :linenos: + :lines: 1-4 + +To retrieve the tensor from the first ``Ensemble`` member named `producer_0`, use +``Client.set_data_source``. Specify the name of the first ``Ensemble`` member +as an argument to the function. This instructs SmartSim to append the ``Ensemble`` member name to the data +search on the ``Orchestrator``. When ``Client.poll_tensor`` is executed, +the SmartRedis `client` will poll for key, `producer_0.tensor`: + +.. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/application_consumer_script.py + :language: python + :linenos: + :lines: 6-9 + +Follow the same steps above, however, change the data source `name` to the `name` +of the second ``Ensemble`` member (`producer_1`): + +.. 
literalinclude:: tutorials/doc_examples/ensemble_doc_examples/application_consumer_script.py + :language: python + :linenos: + :lines: 11-14 + +We print the boolean return to verify that the tensors were found: + +.. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/application_consumer_script.py + :language: python + :linenos: + :lines: 16-17 + +When the ``Experiment`` driver script is executed, the following output will appear in `consumer.out`: + +.. code-block:: bash + + Default@11-46-05:producer_0.tensor was found: True + Default@11-46-05:producer_1.tensor was found: True + +.. warning:: + For SmartSim to recognize the ``Ensemble`` member names as a valid data source + to ``Client.set_data_source``, you must register each ``Ensemble`` member + on the consumer ``Model`` in the driver script via ``Model.register_incoming_entity``. + We demonstrate this in the ``Experiment`` driver script section of the example. + +.. _exp_prefix_ensemble: + +--------------------- +The Experiment Script +--------------------- +The ``Experiment`` driver script manages all workflow components and utilizes the producer and consumer +application scripts. In the example, the ``Experiment``: + +- launches standalone ``Orchestrator`` +- launches an ``Ensemble`` via the replicas initialization strategy +- launches a consumer ``Model`` +- clobbers the ``Orchestrator`` + +To begin, add the necessary imports, initialize an ``Experiment`` instance and initialize the +standalone ``Orchestrator``: + +.. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/experiment_driver.py + :language: python + :linenos: + :lines: 1-9 + +We are now setup to discuss key prefixing within the ``Experiment`` driver script. +To create an ``Ensemble`` using the replicas strategy, begin by initializing a ``RunSettings`` +object to apply to all ``Ensemble`` members. Specify the path to the application +producer script: + +.. 
literalinclude:: tutorials/doc_examples/ensemble_doc_examples/experiment_driver.py + :language: python + :linenos: + :lines: 11-12 + +Next, initialize an ``Ensemble`` by specifying `ensemble_settings` and the number of ``Model`` `replicas` to create: + +.. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/experiment_driver.py + :language: python + :linenos: + :lines: 14-15 + +Instruct SmartSim to prefix all tensors sent to the ``Orchestrator`` from the ``Ensemble`` via ``Ensemble.enable_key_prefixing``: + +.. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/experiment_driver.py + :language: python + :linenos: + :lines: 17-18 + +Next, initialize the consumer ``Model``. The consumer ``Model`` application requests +the prefixed tensors produced by the ``Ensemble``: + +.. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/experiment_driver.py + :language: python + :linenos: + :lines: 20-23 + +Next, organize the SmartSim entity output files into a single ``Experiment`` folder: + +.. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/experiment_driver.py + :language: python + :linenos: + :lines: 25-26 + +Launch the ``Orchestrator``: + +.. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/experiment_driver.py + :language: python + :linenos: + :lines: 28-29 + +Launch the ``Ensemble``: + +.. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/experiment_driver.py + :language: python + :linenos: + :lines: 31-32 + +Set `block=True` so that ``Experiment.start`` waits until the last ``Ensemble`` member has finished before continuing. + +The consumer ``Model`` application script uses ``Client.set_data_source`` which +accepts the ``Ensemble`` member names when searching for prefixed +keys in the ``Orchestrator``. In order for SmartSim to recognize the ``Ensemble`` +member names as a valid data source in the consumer ``Model``, we must register +the entity interaction: + +.. 
literalinclude:: tutorials/doc_examples/ensemble_doc_examples/experiment_driver.py + :language: python + :linenos: + :lines: 34-36 + +Launch the consumer ``Model``: + +.. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/experiment_driver.py + :language: python + :linenos: + :lines: 38-39 + +To finish, tear down the standalone ``Orchestrator``: + +.. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/experiment_driver.py + :language: python + :linenos: + :lines: 41-42 \ No newline at end of file diff --git a/doc/experiment.rst b/doc/experiment.rst index 986db4cad..9936f49a9 100644 --- a/doc/experiment.rst +++ b/doc/experiment.rst @@ -1,326 +1,356 @@ - *********** Experiments *********** +======== +Overview +======== +SmartSim helps automate the deployment of AI-enabled workflows on HPC systems. With SmartSim, users +can describe and launch combinations of applications and AI/ML infrastructure to produce novel and +scalable workflows. SmartSim supports launching these workflows on a diverse set of systems, including +local environments such as Mac or Linux, as well as HPC job schedulers (e.g. Slurm, PBS Pro, and LSF). -The Experiment acts as both a factory class for constructing the stages of an -experiment (``Model``, ``Ensemble``, ``Orchestrator``, etc.) as well as an -interface to interact with the entities created by the experiment. - -Users can initialize an :ref:`Experiment ` at the beginning of a -Jupyter notebook, interactive python session, or Python file and use the -``Experiment`` to iteratively create, configure and launch computational kernels -on the system through the specified launcher. - -.. |SmartSim Architecture| image:: images/ss-arch-overview.png - :width: 700 - :alt: Alternative text - -|SmartSim Architecture| - - -The interface was designed to be simple, with as little complexity as possible, -and agnostic to the backend launching mechanism (local, Slurm, PBSPro, etc.). 
- -Model -===== - -``Model(s)`` are subclasses of ``SmartSimEntity(s)`` and are created through the -Experiment API. Models represent any computational kernel. Models are flexible -enough to support many different applications, however, to be used with our -clients (SmartRedis) the application will have to be written in Python, C, C++, -or Fortran. +The ``Experiment`` API is SmartSim's top level API that provides users with methods for creating, combining, +configuring, launching and monitoring :ref:`entities` in an AI-enabled workflow. More specifically, the +``Experiment`` API offers three customizable workflow components that are created and initialized via factory +methods: -Models are given :ref:`RunSettings ` objects that specify how a kernel -should be executed with regard to the workload manager (e.g. Slurm) and the -available compute resources on the system. +* :ref:`Orchestrator` +* :ref:`Model` +* :ref:`Ensemble` -Each launcher supports specific types of ``RunSettings``. +Settings are given to ``Model`` and ``Ensemble`` objects to provide parameters for how the job should be executed. The +:ref:`Experiment API` offers two customizable Settings objects that are created via the factory methods: - - :ref:`SrunSettings ` for Slurm - - :ref:`AprunSettings ` for PBSPro - - :ref:`MpirunSettings ` for OpenMPI with `mpirun` on PBSPro, LSF, and Slurm - - :ref:`JsrunSettings ` for LSF +* :ref:`RunSettings` +* :ref:`BatchSettings` -These settings can be manually specified by the user, or auto-detected by the -SmartSim Experiment through the ``Experiment.create_run_settings`` method. +Once a workflow component is initialized (e.g. ``Orchestrator``, ``Model`` or ``Ensemble``), a user has access +to the associated entity API which supports configuring and retrieving the entities' information: -A simple example of using the Experiment API to create a model and run it -locally: +* :ref:`Orchestrator API` +* :ref:`Model API` +* :ref:`Ensemble API` -.. 
code-block:: Python +There is no limit to the number of SmartSim entities a user can +initialize within an ``Experiment``. - from smartsim import Experiment +.. figure:: images/Experiment.png - exp = Experiment("simple", launcher="local") + Sample ``Experiment`` showing a user application leveraging + machine learning infrastructure launched by SmartSim and connected + to online analysis and visualization via the in-memory ``Orchestrator``. - settings = exp.create_run_settings("echo", exe_args="Hello World") - model = exp.create_model("hello_world", settings) +Find an example of the ``Experiment`` class and factory methods used within a +workflow in the :ref:`Example` section of this page. - exp.start(model, block=True) - print(exp.get_status(model)) +.. _launcher_exp_docs: -If the launcher has been specified, or auto-detected through setting -``launcher=auto`` in the Experiment initialization, the ``create_run_settings`` -method will automatically create the appropriate ``RunSettings`` object and -return it. +========= +Launchers +========= +SmartSim supports launching AI-enabled workflows on a wide variety of systems, including locally on a Mac or +Linux machine or on HPC machines with a job scheduler (e.g. Slurm, PBS Pro, and LSF). When creating a SmartSim +``Experiment``, the user has the opportunity to specify the `launcher` type or defer to automatic `launcher` selection. +`Launcher` selection determines how SmartSim translates entity configurations into system calls to launch, +manage, and monitor. Currently, SmartSim supports 6 `launcher` options: -For example with Slurm +1. ``local`` **[default]**: for single-node, workstation, or laptop +2. ``slurm``: for systems using the Slurm scheduler +3. ``pbs``: for systems using the PBS Pro scheduler +4. ``pals``: for systems using the PALS scheduler +5. ``lsf``: for systems using the LSF scheduler +6. ``auto``: have SmartSim auto-detect the launcher to use -.. 
code-block:: Python +If the systems `launcher` cannot be found or no `launcher` argument is provided, the default value of +`"local"` will be assigned which will start all ``Experiment`` launched entities on the +localhost. - from smartsim import Experiment +For examples specifying a `launcher` during ``Experiment`` initialization, navigate to the +``Experiment`` :ref:`__init__ special method` in the ``Experiment`` API docstring. - exp = Experiment("hello_world_exp", launcher="slurm") - srun = exp.create_run_settings(exe="echo", exe_args="Hello World!") +.. _entities_exp_docs: - # helper methods for configuring run settings are available in - # each of the implementations of RunSettings - srun.set_nodes(1) - srun.set_tasks(32) +======== +Entities +======== +Entities are SmartSim API objects that can be launched and +managed on the compute system through the ``Experiment`` API. +The SmartSim entities include: + +* ``Orchestrator`` +* ``Model`` +* ``Ensemble`` + +While the ``Experiment`` object is intended to be instantiated once in the +Python driver script, there is no limit to the number of SmartSim entities +within the ``Experiment``. In the following subsections, we define the +general purpose of the three entities that can be created through the +``Experiment``. + +To create a reference to a newly instantiated entity object, use the +associated ``Experiment.create_...`` factory method shown below. + +.. list-table:: Experiment API Entity Creation + :widths: 20 65 25 + :header-rows: 1 + + * - Factory Method + - Example + - Return Type + * - ``create_database`` + - ``orch = exp.create_database([port, db_nodes, ...])`` + - :ref:`Orchestrator ` + * - ``create_model`` + - ``model = exp.create_model(name, run_settings)`` + - :ref:`Model ` + * - ``create_ensemble`` + - ``ensemble = exp.create_ensemble(name[, params, ...])`` + - :ref:`Ensemble ` + +After initialization, each entity can be started, monitored, and stopped using +the ``Experiment`` post-creation methods. 
+ +.. list-table:: Interact with Entities During the Experiment + :widths: 25 55 25 + :header-rows: 1 + + * - Factory Method + - Example + - Desc + * - ``start`` + - ``exp.start(*args[, block, summary, ...])`` + - Launch an Entity + * - ``stop`` + - ``exp.stop(*args)`` + - Stop an Entity + * - ``get_status`` + - ``exp.get_status(*args)`` + - Retrieve Entity Status + +.. _orchestrator_exp_docs: + +Orchestrator +============ +The :ref:`Orchestrator` is an in-memory database built for +a wide variety of AI-enabled workflows. The ``Orchestrator`` can be thought of as a general +feature store for numerical data, ML models, and scripts. The ``Orchestrator`` is capable +of performing inference and script evaluation using data in the feature store. +Any SmartSim ``Model`` or ``Ensemble`` member can connect to the +``Orchestrator`` via the :ref:`SmartRedis` +``Client`` library to transmit data, execute ML models, and execute scripts. + +**SmartSim Offers Two Types of Orchestrator Deployments:** + +* :ref:`Standalone Orchestrator Deployment` +* :ref:`Colocated Orchestrator Deployment` + +To create a standalone ``Orchestrator`` that does not share compute resources with other +SmartSim entities, use the ``Experiment.create_database`` factory method which +returns an ``Orchestrator`` object. To create a colocated ``Orchestrator`` that +shares compute resources with a ``Model``, use the ``Model.colocate_db_tcp`` +or ``Model.colocate_db_uds`` member functions accessible after a +``Model`` object has been initialized. The functions instruct +SmartSim to launch an ``Orchestrator`` on the application compute nodes. An ``Orchestrator`` object is not +returned from a ``Model.colocate_db`` instruction, and subsequent interactions with the +colocated ``Orchestrator`` are handled through the :ref:`Model API`. + +SmartSim supports :ref:`multi-database` functionality, enabling an ``Experiment`` to have +several concurrently launched ``Orchestrator(s)``. 
If there is a need to launch more than +one ``Orchestrator``, the ``Experiment.create_database`` and ``Model.colocate..`` +functions mandate the specification of a unique ``Orchestrator`` identifier, denoted +by the `db_identifier` argument for each ``Orchestrator``. The `db_identifier` is used +in an application script by a SmartRedis ``Client`` to connect to a specific ``Orchestrator``. + +.. _model_exp_docs: - model = exp.create_model("hello_world", srun) - exp.start(model, block=True, summary=True) +Model +===== +:ref:`Model(s)` represent a simulation model or any computational kernel, +including applications, scripts, or generally, a program. They can +interact with other SmartSim entities via data transmitted to/from +SmartSim ``Orchestrator(s)`` using a SmartRedis ``Client``. - print(exp.get_status(model)) +A ``Model`` is created through the factory method: ``Experiment.create_model``. +``Model(s)`` are initialized with ``RunSettings`` objects that specify +how a ``Model`` should be launched by a workload manager +(e.g., Slurm) and the compute resources required. +Optionally, the user may also specify a ``BatchSettings`` object if +the ``Model`` should be launched as a batch job on the WLM system. +The ``create_model`` factory method returns an initialized ``Model`` object that +gives you access to functions associated with the :ref:`Model API`. -The above will run ``srun -n 32 -N 1 echo Hello World!``, monitor its -execution, and inform the user when it is completed. This driver script can be -executed in an interactive allocation, or placed into a batch script as follows: +A ``Model`` supports key features, including methods to: -.. code-block:: bash +- :ref:`Attach configuration files` for use at ``Model`` runtime. +- :ref:`Colocate an Orchestrator` to a SmartSim ``Model``. +- :ref:`Load an ML model` into the ``Orchestrator`` at ``Model`` runtime. +- :ref:`Load a TorchScript function` into the ``Orchestrator`` at ``Model`` runtime. 
+- :ref:`Enable data collision prevention` which allows + for reuse of key names in different ``Model`` applications. - #!/bin/bash - #SBATCH --exclusive - #SBATCH --nodes=1 - #SBATCH --ntasks-per-node=32 - #SBATCH --time=00:10:00 +Visit the respective links for more information on each topic. - python /path/to/script.py +.. _ensemble_exp_docs: Ensemble ======== +In addition to a single ``Model``, SmartSim allows users to create, +configure, and launch an :ref:`Ensemble` of ``Model`` objects. +``Ensemble(s)`` can be given parameters and a permutation strategy that define how the +``Ensemble`` will create the underlying ``Model`` objects. Users may also +manually create and append ``Model(s)`` to an ``Ensemble``. For information +and examples on ``Ensemble`` creation strategies, visit the :ref:`Initialization` +section within the ``Ensemble`` documentation. + +An ``Ensemble`` supports key features, including methods to: + +- :ref:`Attach configuration files` for use at ``Ensemble`` runtime. +- :ref:`Load an ML model` (TF, TF-lite, PT, or ONNX) into the ``Orchestrator`` at ``Ensemble`` runtime. +- :ref:`Load a TorchScript function` into the ``Orchestrator`` at ``Ensemble`` runtime. +- :ref:`Prevent data collisions` within the ``Ensemble``, which allows for reuse of application code. + +Visit the respective links for more information on each topic. + +.. _exp_example: + +======= +Example +======= +.. compound:: + In the following section, we provide an example of using SmartSim to automate the + deployment of an HPC workflow consisting of a ``Model`` and standalone ``Orchestrator``. 
+ The example demonstrates: + + *Initializing* + - a workflow (``Experiment``) + - an in-memory database (standalone ``Orchestrator``) + - an application (``Model``) + *Generating* + - the ``Orchestrator`` output directory + - the ``Model`` output directory + *Starting* + - an in-memory database (standalone ``Orchestrator``) + - an application (``Model``) + *Stopping* + - an in-memory database (standalone ``Orchestrator``) + + The example source code is available in the dropdown below for convenient execution + and customization. + + .. dropdown:: Example Driver Script Source Code + + .. literalinclude:: tutorials/doc_examples/experiment_doc_examples/exp.py + +Initializing +============ +.. compound:: + To create a workflow, *initialize* an ``Experiment`` object + at the start of the Python driver script. This involves specifying + a name and the system launcher that will execute all entities. + Set the `launcher` argument to `auto` to instruct SmartSim to attempt + to find the machine's WLM. + + .. literalinclude:: tutorials/doc_examples/experiment_doc_examples/exp.py + :language: python + :linenos: + :lines: 1-7 + + We also initialize a SmartSim :ref:`logger`. We will use the logger to log the ``Experiment`` + summary. + +.. compound:: + Next, launch an in-memory database, referred to as an ``Orchestrator``. + To *initialize* an ``Orchestrator`` object, use the ``Experiment.create_database`` + factory method. Create a multi-sharded ``Orchestrator`` by setting the argument `db_nodes` to three. + SmartSim will assign a `port` to the ``Orchestrator`` and attempt to detect your machine's + network interface if not provided. + + .. literalinclude:: tutorials/doc_examples/experiment_doc_examples/exp.py + :language: python + :linenos: + :lines: 9-10 + +.. compound:: + Before invoking the factory method to create a ``Model``, + first create a ``RunSettings`` object. ``RunSettings`` hold the + information needed to execute the ``Model`` on the machine. 
The ``RunSettings`` + object is initialized using the ``Experiment.create_run_settings`` method. + Specify the executable to run and arguments to pass to the executable. + + The example ``Model`` is a simple `Hello World` program + that echoes `Hello World` to stdout. + + .. literalinclude:: tutorials/doc_examples/experiment_doc_examples/exp.py + :language: python + :linenos: + :lines: 12-13 + + After creating the ``RunSettings`` object, initialize the ``Model`` object by passing the `name` + and `settings` to ``create_model``. + + .. literalinclude:: tutorials/doc_examples/experiment_doc_examples/exp.py + :language: python + :linenos: + :lines: 14-15 + +Generating +========== +.. compound:: + Next we generate the file structure for the ``Experiment``. A call to ``Experiment.generate`` + instructs SmartSim to create directories within the ``Experiment`` folder for each instance passed in. + We organize the ``Orchestrator`` and ``Model`` output files within the ``Experiment`` folder by + passing the ``Orchestrator`` and ``Model`` instances to ``exp.generate``: + + .. literalinclude:: tutorials/doc_examples/experiment_doc_examples/exp.py + :language: python + :linenos: + :lines: 17-18 + + `overwrite=True` instructs SmartSim to overwrite entity contents if files and subdirectories + already exist within the ``Experiment`` directory. + + .. note:: + If files or folders are attached to a ``Model`` or ``Ensemble`` member through ``Model.attach_generator_files`` + or ``Ensemble.attach_generator_files``, the attached files or directories will be symlinked, copied, or configured and + written into the created directory for that instance. + + The ``Experiment.generate`` call places the `.err` and `.out` log files in the entity + subdirectories within the main ``Experiment`` directory. + +Starting +======== +.. compound:: + Next, launch the components of the ``Experiment`` (``Orchestrator`` and ``Model``). 
+ To do so, use the ``Experiment.start`` factory method and pass in the previous + ``Orchestrator`` and ``Model`` instances. -In addition to a single model, SmartSim has the ability to launch an -``Ensemble`` of ``Model`` applications simultaneously. - -An ``Ensemble`` can be constructed in three ways: - 1. Parameter expansion (by specifying ``params`` and ``perm_strat`` argument) - 2. Replica creation (by specifying ``replicas`` argument) - 3. Manually (by adding created ``Model`` objects) if launching as a batch job - -Ensembles can be given parameters and permutation strategies that define how the -``Ensemble`` will create the underlying model objects. - -Three strategies are built in: - 1. ``all_perm``: for generating all permutations of model parameters - 2. ``step``: for creating one set of parameters for each element in `n` arrays - 3. ``random``: for random selection from predefined parameter spaces - -Here is an example that uses the ``random`` strategy to intialize four models -with random parameters within a set range. We use the ``params_as_args`` field -to specify that the randomly selected learning rate parameter should be passed -to the created models as a executable argument. - -.. code-block:: bash - - import numpy as np - from smartsim import Experiment - - exp = Experiment("Training-Run", launcher="auto") - - # setup ensemble parameter space - learning_rate = list(np.linspace(.01, .5)) - train_params = {"LR": learning_rate} - - # define how each member should run - run = exp.create_run_settings(exe="python", - exe_args="./train-model.py") - - ensemble = exp.create_ensemble("Training-Ensemble", - params=train_params, - params_as_args=["LR"], - run_settings=run, - perm_strategy="random", - n_models=4) - exp.start(ensemble, summary=True) - - -A callable function can also be supplied for custom permutation strategies. The -function should take two arguments: a list of parameter names, and a list of -lists of potential parameter values. 
The function should return a list of -dictionaries that will be supplied as model parameters. The length of the list -returned will determine how many ``Model`` instances are created. - -For example, the following is the built-in strategy ``all_perm``: - -.. code-block:: python - - from itertools import product - - def create_all_permutations(param_names, param_values): - perms = list(product(*param_values)) - all_permutations = [] - for p in perms: - temp_model = dict(zip(param_names, p)) - all_permutations.append(temp_model) - return all_permutations - - -After ``Ensemble`` initialization, ``Ensemble`` instances can be -passed as arguments to ``Experiment.generate()`` to write assigned -parameter values into attached and tagged configuration files. - -Launching Ensembles -------------------- - -Ensembles can be launched in previously obtained interactive allocations -and as a batch. Similar to ``RunSettings``, ``BatchSettings`` specify how -an application(s) in a batch job should be executed with regards to the system -workload manager and available compute resources. - - - :ref:`SbatchSettings ` for Slurm - - :ref:`QsubBatchSettings ` for PBSPro - - :ref:`BsubBatchSettings ` for LSF - -If it only passed ``RunSettings``, ``Ensemble``, objects will require either -a ``replicas`` argument or a ``params`` argument to expand parameters -into ``Model`` instances. At launch, the ``Ensemble`` will look for -interactive allocations to launch models in. - -If it passed ``BatchSettings`` without other arguments, an empty ``Ensemble`` -will be created that ``Model`` objects can be added to manually. All ``Model`` -objects added to the ``Ensemble`` will be launched in a single batch. - -If it passed ``BatchSettings`` and ``RunSettings``, the ``BatchSettings`` will -determine the allocation settings for the entire batch, and the ``RunSettings`` -will determine how each individual ``Model`` instance is executed within -that batch. 
- -This is the same example as above, but tailored towards a running as a batch job -on a slurm system: - -.. code-block:: bash - - import numpy as np - from smartsim import Experiment - - exp = Experiment("Training-Run", launcher="slurm") - - # setup ensemble parameter space - learning_rate = list(np.linspace(.01, .5)) - train_params = {"LR": learning_rate} - - # define resources for all ensemble members - sbatch = exp.create_batch_settings(nodes=4, - time="01:00:00", - account="12345-Cray", - queue="gpu") - - # define how each member should run - srun = exp.create_run_settings(exe="python", - exe_args="./train-model.py") - srun.set_nodes(1) - srun.set_tasks(24) - - ensemble = exp.create_ensemble("Training-Ensemble", - params=train_params, - params_as_args=["LR"], - batch_settings=sbatch, - run_settings=srun, - perm_strategy="random", - n_models=4) - exp.start(ensemble, summary=True) - - -This will generate and execute a batch script that looks something like -the following: - -.. code-block:: bash - - # GENERATED - - #!/bin/bash - - #SBATCH --output=/lus/smartsim/Training-Ensemble.out - #SBATCH --error=/lus/smartsim/Training-Ensemble.err - #SBATCH --job-name=Training-Ensemble-CHTN0UI2DORX - #SBATCH --nodes=4 - #SBATCH --time=01:00:00 - #SBATCH --partition=gpu - #SBATCH --account=12345-Cray - - cd /scratch/smartsim/Training-Run ; /usr/bin/srun --output /scratch/smartsim/Training-Run/Training-Ensemble_0.out --error /scratch/smartsim/Training-Ensemble_0.err --job-name Training-Ensemble_0-CHTN0UI2E5DX --nodes=1 --ntasks=24 /scratch/pyenvs/smartsim/bin/python ./train-model.py --LR=0.17 & - - cd /scratch/smartsim/Training-Run ; /usr/bin/srun --output /scratch/smartsim/Training-Run/Training-Ensemble_1.out --error /scratch/smartsim/Training-Ensemble_1.err --job-name Training-Ensemble_1-CHTN0UI2JQR5 --nodes=1 --ntasks=24 /scratch/pyenvs/smartsim/bin/python ./train-model.py --LR=0.32 & - - cd /scratch/smartsim/Training-Run ; /usr/bin/srun --output 
/scratch/smartsim/Training-Run/Training-Ensemble_2.out --error /scratch/smartsim/Training-Ensemble_2.err --job-name Training-Ensemble_2-CHTN0UI2P2AR --nodes=1 --ntasks=24 /scratch/pyenvs/smartsim/bin/python ./train-model.py --LR=0.060000000000000005 & - - cd /scratch/smartsim/Training-Run ; /usr/bin/srun --output /scratch/smartsim/Training-Run/Training-Ensemble_3.out --error /scratch/smartsim/Training-Ensemble_3.err --job-name Training-Ensemble_3-CHTN0UI2TRE7 --nodes=1 --ntasks=24 /scratch/pyenvs/smartsim/bin/python ./train-model.py --LR=0.35000000000000003 & - - wait - -Prefixing Keys in the Orchestrator ----------------------------------- - -If each of multiple ensemble members attempt to use the same code to access their respective models -in the Orchestrator, the keys by which they do this will overlap and they can end up accessing each -others' data inadvertently. To prevent this situation, the SmartSim Entity object supports key -prefixing, which automatically prepends the name of the model to the keys by which it is accessed. -With this enabled, key overlapping is no longer an issue and ensemble members can use the same code. - -Under the hood, calling ensemble.enable_key_prefixing() causes the SSKEYOUT environment variable to -be set, which in turn causes all keys generated by an ensemble member to be prefixed with its model -name. Similarly, if the model for the ensemble member has incoming entities (such as those set via -model.register_incoming_entity() or ensemble.register_incoming_entity()), the SSKEYIN environment -variable will be set and the keys associated with those inputs will be automatically prefixed. Note -that entities must register themselves as this is not done by default. - -Finally, please note that while prefixing is enabled by default for tensors, datasets, and aggregated -lists of datasets, a SmartRedis client must manually call Client.use_model_ensemble_prefix() to -ensure that prefixes are used with models and scripts. 
- -We modify the example above to enable key prefixing as follows: - -.. code-block:: bash - - import numpy as np - from smartsim import Experiment - - exp = Experiment("Training-Run", launcher="slurm") - - # setup ensemble parameter space - learning_rate = list(np.linspace(.01, .5)) - train_params = {"LR": learning_rate} - - # define resources for all ensemble members - sbatch = exp.create_batch_settings(nodes=4, - time="01:00:00", - account="12345-Cray", - queue="gpu") - - # define how each member should run - srun = exp.create_run_settings(exe="python", - exe_args="./train-model.py") - srun.set_nodes(1) - srun.set_tasks(24) + .. literalinclude:: tutorials/doc_examples/experiment_doc_examples/exp.py + :language: python + :linenos: + :lines: 20-21 - ensemble = exp.create_ensemble("Training-Ensemble", - params=train_params, - params_as_args=["LR"], - batch_settings=sbatch, - run_settings=srun, - perm_strategy="random", - n_models=4) +Stopping +======== +.. compound:: + Lastly, to clean up the ``Experiment``, tear down the launched ``Orchestrator`` + using the ``Experiment.stop`` factory method. - # Enable key prefixing -- note that this should be done - # before starting the experiment - ensemble.enable_key_prefixing() + .. literalinclude:: tutorials/doc_examples/experiment_doc_examples/exp.py + :language: python + :linenos: + :lines: 23-26 - exp.start(ensemble, summary=True) + Notice that we use the ``Experiment.summary`` function to print + the summary of the workflow. 
+When you run the experiment, the following output will appear:: -Further Information -------------------- + | | Name | Entity-Type | JobID | RunID | Time | Status | Returncode | + |----|----------------|---------------|-------------|---------|---------|-----------|--------------| + | 0 | hello_world | Model | 1778304.4 | 0 | 10.0657 | Completed | 0 | + | 1 | orchestrator_0 | DBNode | 1778304.3+2 | 0 | 43.4797 | Cancelled | 0 | -For more informtion about Ensembles, please refer to the :ref:`Ensemble API documentation `. \ No newline at end of file +.. note:: + Failure to tear down the ``Orchestrator`` at the end of an ``Experiment`` + may lead to ``Orchestrator`` launch failures if another ``Experiment`` is + started on the same node. \ No newline at end of file diff --git a/doc/images/Experiment.png b/doc/images/Experiment.png new file mode 100644 index 000000000..a103dd6dd Binary files /dev/null and b/doc/images/Experiment.png differ diff --git a/doc/images/clustered_orchestrator-1.png b/doc/images/clustered_orchestrator-1.png new file mode 100644 index 000000000..996d55e85 Binary files /dev/null and b/doc/images/clustered_orchestrator-1.png differ diff --git a/doc/images/colocated_orchestrator-1.png b/doc/images/colocated_orchestrator-1.png new file mode 100644 index 000000000..0da5d0609 Binary files /dev/null and b/doc/images/colocated_orchestrator-1.png differ diff --git a/doc/index.rst b/doc/index.rst index 91a7ee1ba..7e7d9c2d6 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -30,8 +30,12 @@ :caption: SmartSim experiment + run_settings + batch_settings + model + ensemble orchestrator - launchers + ss_logger ml_features api/smartsim_api diff --git a/doc/installation_instructions/basic.rst b/doc/installation_instructions/basic.rst index 2f43db50f..75b099ad5 100644 --- a/doc/installation_instructions/basic.rst +++ b/doc/installation_instructions/basic.rst @@ -1,3 +1,5 @@ +.. 
_basic_install_SS: + ****************** Basic Installation ****************** @@ -27,7 +29,7 @@ The base prerequisites to install SmartSim and SmartRedis are: - git - `git-lfs`_ -.. _git-lfs: https://github.com/git-lfs/git-lfs?utm_source=gitlfs_site&utm_medium=installation_link&utm_campaign=gitlfs#installing +.. _git-lfs: https://github.com/git-lfs/git-lfs?utm_source=gitlfs_site&utm_medium=installation_link&utm_campaign=gitlfs .. note:: @@ -48,7 +50,7 @@ The machine-learning backends have additional requirements in order to use GPUs for inference - `CUDA Toolkit 11 (tested with 11.8) `_ - - `cuDNN 8 (tested with 8.9.1) `_ + - `cuDNN 8 (tested with 8.9.1) `_ - OS: Linux - GPU: Nvidia diff --git a/doc/installation_instructions/platform/nonroot-linux.rst b/doc/installation_instructions/platform/nonroot-linux.rst index 2c8f7933a..3070a871a 100644 --- a/doc/installation_instructions/platform/nonroot-linux.rst +++ b/doc/installation_instructions/platform/nonroot-linux.rst @@ -13,6 +13,6 @@ a user is possible. ./cuda_11.4.4_470.82.01_linux.run --toolkit --silent --toolkitpath=/path/to/install/location/ For cuDNN, follow `Nvidia's instructions -`_, +`_, and copy the cuDNN libraries to the `lib64` directory at the CUDA Toolkit location specified above. \ No newline at end of file diff --git a/doc/installation_instructions/platform/olcf-summit.rst b/doc/installation_instructions/platform/olcf-summit.rst index 5727ae8fe..6268584cc 100644 --- a/doc/installation_instructions/platform/olcf-summit.rst +++ b/doc/installation_instructions/platform/olcf-summit.rst @@ -6,7 +6,7 @@ Since SmartSim does not have a built PowerPC build, the build steps for an IBM system are slightly different than other systems. Luckily for us, a conda channel with all relevant packages is maintained as part -of the `OpenCE `_ initiative. Users can follow these +of the `OpenCE `_ initiative. Users can follow these instructions to get a working SmartSim build with PyTorch and TensorFlow for GPU on Summit. 
Note that SmartSim and SmartRedis will be downloaded to the working directory from which these instructions are executed. diff --git a/doc/launchers.rst b/doc/launchers.rst deleted file mode 100644 index 22425071e..000000000 --- a/doc/launchers.rst +++ /dev/null @@ -1,248 +0,0 @@ - -********* -Launchers -********* - -SmartSim interfaces with a number of backends called `launchers` that -are responsible for constructing jobs based on run parameters and -launching them onto a system. - -The `launchers` allow SmartSim users to interact with their system -programmatically through a python interface. -Because of this, SmartSim users do not have to leave the Jupyter Notebook, -Python REPL, or Python script to launch, query, and interact with their jobs. - -SmartSim currently supports 5 `launchers`: - 1. ``local``: for single-node, workstation, or laptop - 2. ``slurm``: for systems using the Slurm scheduler - 3. ``pbs``: for systems using the PBSpro scheduler - 4. ``lsf``: for systems using the LSF scheduler - 5. ``auto``: have SmartSim auto-detect the launcher to use. - -To specify a specific launcher, one argument needs to be provided -to the ``Experiment`` initialization. - -.. code-block:: python - - from smartsim import Experiment - - exp = Experiment("name-of-experiment", launcher="local") # local launcher - exp = Experiment("name-of-experiment", launcher="slurm") # Slurm launcher - exp = Experiment("name-of-experiment", launcher="pbs") # PBSpro launcher - exp = Experiment("name-of-experiment", launcher="lsf") # LSF launcher - exp = Experiment("name-of-experiment", launcher="auto") # auto-detect launcher - -------------------------------------------------------------------------- - -Local -===== - - -The local launcher can be used on laptops, workstations and single -nodes of supercomputer and cluster systems. Through -launching locally, users can prototype workflows and quickly scale -them to larger systems with minimal changes. 
- -As with all launchers in SmartSim, the local launcher supports -asynchronous execution meaning once entities have been launched -the main thread of execution is not blocked. Daemon threads -that manage currently running jobs will be created when active -jobs are present within SmartSim. - -.. _psutil: https://github.com/giampaolo/psutil - -The local launcher uses the `psutil`_ library to execute and monitor -user-created jobs. - - -Running Locally ---------------- - -The local launcher supports the base :ref:`RunSettings API ` -which can be used to run executables as well as run executables -with arbitrary launch binaries like `mpiexec`. - -The local launcher is the default launcher for all ``Experiment`` -instances. - -The local launcher does not support batch launching. Ensembles -are always executed in parallel but launched sequentially. - ----------------------------------------------------------------------- - -Slurm -===== - -The Slurm launcher works directly with the Slurm scheduler to launch, query, -monitor and stop applications. During the course of an ``Experiment``, -launched entities can be queried for status, completion, and errors. - -The amount of communication between SmartSim and Slurm can be tuned -for specific guidelines of different sites by setting the -value for ``jm_interval`` in the SmartSim configuration file. - -To use the Slurm launcher, specify at ``Experiment`` initialization: - -.. code-block:: python - - from smartsim import Experiment - - exp = Experiment("NAMD-worklfow", launcher="slurm") - - -Running on Slurm ----------------- - -The Slurm launcher supports three types of ``RunSettings``: - 1. :ref:`SrunSettings ` - 2. :ref:`MpirunSettings ` - 3. :ref:`MpiexecSettings ` - -As well as batch settings for ``sbatch`` through: - 1. :ref:`SbatchSettings ` - - -Both supported ``RunSettings`` types above can be added -to a ``SbatchSettings`` batch workload through ``Ensemble`` -creation. 
- - -Getting Allocations -------------------- - -Slurm supports a number of user facing features that other schedulers -do not. For this reason, an extra module :ref:`smartsim.slurm ` can be -used to obtain allocations to launch on and release them after -``Experiment`` completion. - -.. code-block:: python - - from smartsim.wlm import slurm - alloc = slurm.get_allocation(nodes=1) - -The ID of the allocation is returned as a string to the user so that -they can specify what entities should run on which allocations -obtained by SmartSim. - -Additional arguments that would have been passed to the ``salloc`` -command can be passed through the ``options`` argument in a dictionary. - -Anything passed to the options will be processed as a Slurm -argument and appended to the salloc command with the appropriate -prefix (e.g. `-` or `--`). - -For arguments without a value, pass None as the value: - - `exclusive=None` - -.. code-block:: python - - from smartsim.wlm import slurm - salloc_options = { - "C": "haswell", - "partition": "debug", - "exclusive": None - } - alloc_id = slurm.get_slurm_allocation(nodes=128, - time="10:00:00", - options=salloc_options) - -The above code would generate a ``salloc`` command like: - -.. code-block:: bash - - salloc -N 5 -C haswell --partition debug --time 10:00:00 --exclusive - - - -Releasing Allocations ---------------------- - -The :ref:`smartsim.slurm ` interface -also supports releasing allocations obtained in an experiment. - -The example below releases the allocation in the example above. - -.. 
code-block:: python - - from smartsim.wlm import slurm - salloc_options = { - "C": "haswell", - "partition": "debug", - "exclusive": None - } - alloc_id = slurm.get_slurm_allocation(nodes=128, - time="10:00:00", - options=salloc_options) - - # - - slurm.release_slurm_allocation(alloc_id) - -------------------------------------------------------------------- - -PBSPro -====== - -Like the Slurm launcher, the PBSPro launcher works directly with the PBSPro -scheduler to launch, query, monitor and stop applications. - -The amount of communication between SmartSim and PBSPro can be tuned -for specific guidelines of different sites by setting the -value for ``jm_interval`` in the SmartSim configuration file. - -To use the PBSpro launcher, specify at ``Experiment`` initialization: - -.. code-block:: python - - from smartsim import Experiment - - exp = Experiment("LAMMPS-melt", launcher="pbs") - - - -Running on PBSpro ------------------ - -The PBSpro launcher supports three types of ``RunSettings``: - 1. :ref:`AprunSettings ` - 2. :ref:`MpirunSettings ` - 3. :ref:`MpiexecSettings ` - -As well as batch settings for ``qsub`` through: - 1. :ref:`QsubBatchSettings ` - -Both supported ``RunSettings`` types above can be added -to a ``QsubBatchSettings`` batch workload through ``Ensemble`` -creation. - ---------------------------------------------------------------------- - -LSF -=== - -The LSF Launcher works like the PBSPro launcher and -is compatible with LSF and OpenMPI workloads. - -To use the LSF launcher, specify at ``Experiment`` initialization: - -.. code-block:: python - - from smartsim import Experiment - - exp = Experiment("MOM6-double-gyre", launcher="lsf") - - -Running on LSF --------------- - -The LSF launcher supports three types of ``RunSettings``: - 1. :ref:`JsrunSettings ` - 2. :ref:`MpirunSettings ` - 3. :ref:`MpiexecSettings ` - -As well as batch settings for ``bsub`` through: - 1. 
:ref:`BsubBatchSettings ` - -Both supported ``RunSettings`` types above can be added -to a ``BsubBatchSettings`` batch workload through ``Ensemble`` -creation. diff --git a/doc/ml_features.rst b/doc/ml_features.rst index 6096f005e..4e0919a08 100644 --- a/doc/ml_features.rst +++ b/doc/ml_features.rst @@ -1,3 +1,5 @@ +.. _ml_features_docs: + ########### ML Features ########### @@ -303,7 +305,7 @@ with TensorFlow or PyTorch backends. .. code-block:: python - client.run_model(model_key, inputs=["mnist_imagse"], outputs=["mnist_output"]) + client.run_model(model_key, inputs=["mnist_images"], outputs=["mnist_output"]) output = client.get_tensor("mnist_output") diff --git a/doc/model.rst b/doc/model.rst new file mode 100644 index 000000000..52e1ce1c0 --- /dev/null +++ b/doc/model.rst @@ -0,0 +1,2343 @@ +.. _model_object_doc: + +***** +Model +***** +======== +Overview +======== +SmartSim ``Model`` objects enable users to execute computational tasks in an +``Experiment`` workflow, such as launching compiled applications, +running scripts, or performing general computational operations. A ``Model`` can be launched with +other SmartSim ``Model(s)`` and ``Orchestrator(s)`` to build AI-enabled workflows. +With the SmartSim ``Client`` (:ref:`SmartRedis`), data can be transferred from a ``Model`` +to the ``Orchestrator`` for use in an ML model (TF, TF-lite, PyTorch, or ONNX), online +training process, or additional ``Model`` applications. SmartSim ``Clients`` (SmartRedis) are available in +Python, C, C++, or Fortran. + +To initialize a SmartSim ``Model``, use the ``Experiment.create_model`` factory method. +When creating a ``Model``, a :ref:`RunSettings` object must be provided. A ``RunSettings`` +object specifies the ``Model`` executable (e.g. the full path to a compiled binary) as well as +executable arguments and launch parameters. These specifications include launch commands (e.g. 
`srun`, `aprun`, `mpiexec`, etc), +compute resource requirements, and application command-line arguments. + +Once a ``Model`` instance has been initialized, users have access to +the :ref:`Model API` functions to further configure the ``Model``. +The Model API functions provide users with the following capabilities: + +- :ref:`Attach Files to a SmartSim Model` +- :ref:`Colocate an Orchestrator to a SmartSim Model` +- :ref:`Attach a ML Model to the SmartSim Model` +- :ref:`Attach a TorchScript Function to the SmartSim Model` +- :ref:`Enable SmartSim Model Data Collision Prevention` + +Once the ``Model`` has been configured and launched, a user can leverage an ``Orchestrator`` within a ``Model`` +through **two** strategies: + +- :ref:`Connect to a Standalone Orchestrator` + When a ``Model`` is launched, it does not use or share compute + resources on the same host (computer/server) where a SmartSim ``Orchestrator`` is running. + Instead, it is launched on its own compute resources specified by the ``RunSettings`` object. + The ``Model`` can connect via a SmartRedis ``Client`` to a launched standalone ``Orchestrator``. + +- :ref:`Connect to a Colocated Orchestrator` + When the colocated ``Model`` is started, SmartSim launches an ``Orchestrator`` on the ``Model`` compute + nodes prior to the ``Model`` execution. The ``Model`` can then connect to the colocated ``Orchestrator`` + via a SmartRedis ``Client``. + +.. note:: + For the ``Client`` connection to be successful from within the ``Model`` application, + the SmartSim ``Orchestrator`` must be launched prior to the start of the ``Model``. + +.. note:: + A ``Model`` can be launched without an ``Orchestrator`` if data transfer and ML capabilities are not + required. + +SmartSim manages ``Model`` instances through the :ref:`Experiment API` by providing functions to +launch, monitor, and stop applications. Additionally, a ``Model`` can be launched individually +or as a group via an :ref:`Ensemble`. 
+ +============== +Initialization +============== +Overview +======== +The ``Experiment`` is responsible for initializing all SmartSim entities. +A ``Model`` is created using the ``Experiment.create_model`` factory method, and users can customize the +``Model`` via the factory method parameters. + +The key initializer arguments for ``Model`` creation can be found in the :ref:`Experiment API` +under the ``create_model`` docstring. + +A `name` and :ref:`RunSettings` reference are required to initialize a ``Model``. +Optionally, include a :ref:`BatchSettings` object to specify workload manager batch launching. + +.. note:: + ``BatchSettings`` attached to a ``Model`` are ignored when the ``Model`` is executed as part of an ``Ensemble``. + +The `params` factory method parameter for ``Model`` creation allows a user to define simulation parameters and +values through a dictionary. Using ``Model`` :ref:`file functions`, users can write these parameters to +a file in the ``Model`` working directory. + +When a ``Model`` instance is passed to ``Experiment.generate``, a +directory within the Experiment directory +is created to store input and output files from the ``Model``. + +.. note:: + It is strongly recommended to invoke ``Experiment.generate`` on the ``Model`` + instance before launching the ``Model``. If a path is not specified during + ``Experiment.create_model``, calling ``Experiment.generate`` with the ``Model`` + instance will result in SmartSim generating a ``Model`` directory within the + ``Experiment`` directory. This directory will be used to store the ``Model`` outputs + and attached files. + +.. _std_model_doc: + +Example +======= +In this example, we provide a demonstration of how to initialize and launch a ``Model`` +within an ``Experiment`` workflow. The source code example is available in the dropdown below for +convenient execution and customization. + +.. dropdown:: Example Driver Script Source Code + + .. 
literalinclude:: tutorials/doc_examples/model_doc_examples/model_init.py + +All workflow entities are initialized through the :ref:`Experiment API`. +Consequently, initializing a SmartSim ``Experiment`` is a prerequisite for ``Model`` +initialization. + +To initialize an instance of the ``Experiment`` class, import the SmartSim +``Experiment`` module and invoke the ``Experiment`` constructor +with a `name` and `launcher`: + +.. literalinclude:: tutorials/doc_examples/model_doc_examples/model_init.py + :language: python + :linenos: + :lines: 1-4 + +A ``Model`` requires ``RunSettings`` objects to specify how the ``Model`` should be +executed within the workflow. We use the ``Experiment`` instance `exp` to +call the factory method ``Experiment.create_run_settings`` to initialize a ``RunSettings`` +object. Finally, we specify the executable `"echo"` to run the executable argument `"Hello World"`: + +.. literalinclude:: tutorials/doc_examples/model_doc_examples/model_init.py + :language: python + :linenos: + :lines: 6-7 + +.. seealso:: + For more information on ``RunSettings`` objects, reference the :ref:`RunSettings` documentation. + +We now have a ``RunSettings`` instance named `model_settings` that contains all of the +information required to launch our application. Pass a `name` and the run settings instance +to the ``create_model`` factory method: + +.. literalinclude:: tutorials/doc_examples/model_doc_examples/model_init.py + :language: python + :linenos: + :lines: 9-10 + +To create an isolated output directory for the ``Model``, invoke ``Experiment.generate`` on the +``Model`` `model_instance`: + +.. literalinclude:: tutorials/doc_examples/model_doc_examples/model_init.py + :language: python + :linenos: + :lines: 12-13 + +.. note:: + The ``Experiment.generate`` step is optional; however, this step organizes the ``Experiment`` + entity output files into individual entity folders within the ``Experiment`` folder. 
Continue + in the example for information on ``Model`` output generation or visit the + :ref:`Output and Error Files` section. + +All entities are launched, monitored and stopped by the ``Experiment`` instance. +To start the ``Model``, invoke ``Experiment.start`` on `model_instance`: + +.. literalinclude:: tutorials/doc_examples/model_doc_examples/model_init.py + :language: python + :linenos: + :lines: 15-16 + +When the ``Experiment`` driver script is executed, two files from the `model_instance` will be created +in the generated ``Model`` subdirectory: + +1. `model_instance.out` : this file will hold outputs produced by the `model_instance` workload. +2. `model_instance.err` : this file will hold any errors that occurred during `model_instance` execution. + +.. _colo_model_doc: + +====================== +Colocated Orchestrator +====================== +A SmartSim ``Model`` has the capability to share compute node(s) with a SmartSim ``Orchestrator`` in +a deployment known as a colocated ``Orchestrator``. In this scenario, the ``Orchestrator`` and ``Model`` share +compute resources. To achieve this, users need to initialize a ``Model`` instance using the +``Experiment.create_model`` function and then utilize one of the three functions listed below to +colocate an ``Orchestrator`` with the ``Model``. This instructs SmartSim to launch an ``Orchestrator`` +on the application compute node(s) before the ``Model`` execution. + +There are **three** different Model API functions to colocate a ``Model``: + +- ``Model.colocate_db_tcp``: Colocate an ``Orchestrator`` instance and establish client communication using TCP/IP. +- ``Model.colocate_db_uds``: Colocate an ``Orchestrator`` instance and establish client communication using Unix domain sockets (UDS). +- ``Model.colocate_db``: (deprecated) An alias for `Model.colocate_db_tcp`. + +Each function initializes an unsharded ``Orchestrator`` accessible only to the ``Model`` processes on the same compute node. 
When the ``Model`` +is started, the ``Orchestrator`` will be launched on the same compute resource as the ``Model``. Only the colocated ``Model`` +may communicate with the ``Orchestrator`` via a SmartRedis ``Client`` by using the loopback TCP interface or +Unix Domain sockets. Extra parameters for the ``Orchestrator`` can be passed into the colocate functions above +via `kwargs`. + +.. code-block:: python + + example_kwargs = { + "maxclients": 100000, + "threads_per_queue": 1, + "inter_op_threads": 1, + "intra_op_threads": 1 + } + +For a walkthrough of how to colocate a ``Model``, navigate to the +:ref:`Colocated Orchestrator` for instructions. + +For users aiming to **optimize performance**, SmartSim offers the flexibility to specify +processor IDs to which the colocated ``Orchestrator`` should be pinned. This can be achieved using +the `custom_pinning` argument, which is recognized by both ``Model.colocate_db_uds`` and +``Model.colocate_db_tcp``. In systems where specific processors support ML model and +TorchScript execution, users can employ the `custom_pinning` argument to designate +these processor IDs. This ensures that the specified processors are available +when executing ML models or TorchScripts on the colocated ``Orchestrator``. +Additionally, users may use the `custom_pinning` argument to avoid reserved processors +by specifying an available processor ID or a list of available processor IDs. + +.. _files_doc: + +===== +Files +===== +Overview +======== +Applications often depend on external files (e.g. training datasets, evaluation datasets, etc) +to operate as intended. Users can instruct SmartSim to copy, symlink, or manipulate external files +prior to a ``Model`` launch via the ``Model.attach_generator_files`` function. + +.. note:: + Multiple calls to ``Model.attach_generator_files`` will overwrite previous file configurations + in the ``Model``.
+ +To setup the run directory for the ``Model``, users should pass the list of files to +``Model.attach_generator_files`` using the following arguments: + +* `to_copy` (t.Optional[t.List[str]] = None): Files that are copied into the path of the ``Model``. +* `to_symlink` (t.Optional[t.List[str]] = None): Files that are symlinked into the path of the ``Model``. + +User-formatted files can be attached using the `to_configure` argument. These files will be modified +during ``Model`` generation to replace tagged sections in the user-formatted files with +values from the `params` initializer argument used during ``Model`` creation: + +* `to_configure` (t.Optional[t.List[str]] = None): Designed for text-based ``Model`` input files, + `"to_configure"` is exclusive to the ``Model``. During ``Model`` directory generation, the attached + files are parsed and specified tagged parameters are replaced with the `params` values that were + specified in the ``Experiment.create_model`` factory method of the ``Model``. The default tag is a semicolon + (e.g., THERMO = ;THERMO;). + +In the :ref:`Example` subsection, we provide an example using the value `to_configure` +within ``attach_generator_files``. + +.. _files_example_doc: + +Example +======= +This example demonstrates how to attach a file to a ``Model`` for parameter replacement at the time +of ``Model`` directory generation. This is accomplished using the `params` function parameter in +``Experiment.create_model`` and the `to_configure` function parameter +in ``Model.attach_generator_files``. The source code example is available in the dropdown below for +convenient execution and customization. + +.. dropdown:: Example Driver Script Source Code + + .. literalinclude:: tutorials/doc_examples/model_doc_examples/model_file.py + +In this example, we have a text file named `params_inputs.txt`. Within the text file, is the parameter `THERMO` +that is required by the ``Model`` application at runtime: + +.. 
code-block:: bash + + THERMO = ;THERMO; + +In order to have the tagged parameter `;THERMO;` replaced with a usable value at runtime, two steps are required: + +1. The `THERMO` variable must be included in ``Experiment.create_model`` factory method as + part of the `params` initializer argument. +2. The file containing the tagged parameter `;THERMO;`, `params_inputs.txt`, must be attached to the ``Model`` + via the ``Model.attach_generator_files`` method as part of the `to_configure` function parameter. + +To encapsulate our application within a ``Model``, we must first create an ``Experiment`` instance. +Begin by importing the ``Experiment`` module and initializing an ``Experiment``: + +.. literalinclude:: tutorials/doc_examples/model_doc_examples/model_file.py + :language: python + :linenos: + :lines: 1-4 + +A SmartSim ``Model`` requires a ``RunSettings`` object to +specify the ``Model`` executable (e.g. the full path to a compiled binary) as well as +executable arguments and launch parameters. Create a simple ``RunSettings`` object +and specify the path to the executable script as an executable argument (`exe_args`): + +.. literalinclude:: tutorials/doc_examples/model_doc_examples/model_file.py + :language: python + :linenos: + :lines: 6-7 + +.. seealso:: + To read more on SmartSim ``RunSettings`` objects, reference the :ref:`RunSettings` documentation. + +Next, initialize a ``Model`` object via ``Experiment.create_model``. Pass in the `model_settings` instance +and the `params` value: + +.. literalinclude:: tutorials/doc_examples/model_doc_examples/model_file.py + :language: python + :linenos: + :lines: 9-10 + +We now have a ``Model`` instance named `model_instance`. Attach the text file, `params_inputs.txt`, +to the ``Model`` for use at entity runtime. To do so, use the +``Model.attach_generator_files`` function and specify the `to_configure` +parameter with the path to the text file, `params_inputs.txt`: + +.. 
literalinclude:: tutorials/doc_examples/model_doc_examples/model_file.py + :language: python + :linenos: + :lines: 12-13 + +To create an isolated directory for the ``Model`` outputs and configuration files, invoke ``Experiment.generate`` +on `model_instance` as an input parameter: + +.. literalinclude:: tutorials/doc_examples/model_doc_examples/model_file.py + :language: python + :linenos: + :lines: 15-16 + +The contents of `getting-started/model_name/params_inputs.txt` at runtime are: + +.. code-block:: bash + + THERMO = 1 + +.. _model_output_files: + +====================== +Output and Error Files +====================== +By default, SmartSim stores the standard output and error of the ``Model`` in two files: + +* `.out` +* `.err` + +The files are created in the working directory of the ``Model``, and the filenames directly match the +``Model`` name. The `.out` file logs standard outputs and the +`.err` logs errors for debugging. + +.. note:: + Invoking ``Experiment.generate(model)`` will create a directory `model_name/` and will store + the two files within that directory. You can also specify a path for these files using the + `path` parameter when invoking ``Experiment.create_model``. + +.. _ml_script_model_doc: + +===================== +ML Models and Scripts +===================== +Overview +======== +SmartSim users have the capability to load ML models and TorchScripts into an ``Orchestrator`` +within the ``Experiment`` script for use within a ``Model``. Functions accessible through +a ``Model`` object support loading ML models (TensorFlow, TensorFlow-lite, PyTorch, and ONNX) and +TorchScripts into standalone or colocated ``Orchestrator(s)`` before application runtime. + +Users can follow **two** processes to load an ML model to the ``Orchestrator``: + +- :ref:`From Memory` +- :ref:`From File` + +.. warning:: + Uploading an ML model :ref:`from memory` is solely supported for + standalone ``Orchestrator(s)``.
To upload an ML model to a colocated ``Orchestrator``, users + must save the ML model to disk and upload :ref:`from file`. + +Users can follow **three** processes to load a TorchScript to the ``Orchestrator``: + +- :ref:`From Memory` +- :ref:`From File` +- :ref:`From String` + +.. warning:: + Uploading a TorchScript :ref:`from memory` is solely supported for + standalone ``Orchestrator(s)``. To upload a TorchScript to a colocated ``Orchestrator``, users + upload :ref:`from file` or :ref:`from string`. + +Once an ML model or TorchScript is loaded into the ``Orchestrator``, ``Model`` objects can +leverage ML capabilities by utilizing the SmartSim ``Client`` (:ref:`SmartRedis`) +to execute the stored ML models and TorchScripts. + +.. _ai_model_doc: + +AI Models +========= +When configuring a ``Model``, users can instruct SmartSim to load +Machine Learning (ML) models to the ``Orchestrator``. ML models added +are loaded into the ``Orchestrator`` prior to the execution of the ``Model``. To load an ML model +to the ``Orchestrator``, SmartSim users can provide the ML model **in-memory** or specify the **file path** +when using the ``Model.add_ml_model`` function. SmartSim solely supports loading an ML model from file +for use within standalone ``Orchestrator(s)``. The supported ML frameworks are TensorFlow, +TensorFlow-lite, PyTorch, and ONNX. + +The arguments that customize the storage and execution of an ML model can be found in the +:ref:`Model API` under the ``add_ml_model`` docstring. + +.. _in_mem_ML_model_ex: + +------------------------------------- +Example: Attach an In-Memory ML Model +------------------------------------- +This example demonstrates how to attach an in-memory ML model to a SmartSim ``Model`` +to load into an ``Orchestrator`` at ``Model`` runtime. The source code example is available in the dropdown below for +convenient execution and customization. + +.. dropdown:: Example Driver Script Source Code + + .. 
literalinclude:: tutorials/doc_examples/model_doc_examples/in_mem_ml_model.py + +.. note:: + This example assumes: + + - an ``Orchestrator`` is launched prior to the ``Model`` execution (colocated or standalone) + - an initialized ``Model`` named `smartsim_model` exists within the ``Experiment`` workflow + - a TensorFlow-based ML model was serialized using ``serialize_model`` which returns + the ML model as a byte string with the names of the input and output layers + +**Attach the ML Model to a SmartSim Model** + +In this example, we have a serialized TensorFlow-based ML model that was saved to a byte string stored under `model`. +Additionally, the ``serialize_model`` function returned the names of the input and output layers stored under +`inputs` and `outputs`. Assuming an initialized ``Model`` named `smartsim_model` exists, we add the in-memory TensorFlow model using +the ``Model.add_ml_model`` function and specify the in-memory ML model to the function parameter `model`: + +.. literalinclude:: tutorials/doc_examples/model_doc_examples/in_mem_ml_model.py + :language: python + :linenos: + :lines: 39-40 + +In the above ``smartsim_model.add_ml_model`` code snippet, we pass in the following arguments: + +- `name` ("cnn"): A name to reference the ML model in the ``Orchestrator``. +- `backend` ("TF"): Indicating that the ML model is a TensorFlow model. +- `model` (model): The in-memory representation of the TensorFlow model. +- `device` ("GPU"): Specifying the device for ML model execution. +- `devices_per_node` (2): Use two GPUs per node. +- `first_device` (0): Start with 0 index GPU. +- `inputs` (inputs): The name of the ML model input nodes (TensorFlow only). +- `outputs` (outputs): The name of the ML model output nodes (TensorFlow only). + +.. warning:: + Calling `exp.start(smartsim_model)` prior to instantiation of an ``Orchestrator`` will result in + a failed attempt to load the ML model to a non-existent ``Orchestrator``.
+ +When the ``Model`` is started via ``Experiment.start``, the ML model will be loaded to the +launched ``Orchestrator``. The ML model can then be executed on the ``Orchestrator`` via a SmartSim +``Client`` (:ref:`SmartRedis`) within the application code. + +.. _from_file_ML_model_ex: + +------------------------------------- +Example: Attach an ML Model From File +------------------------------------- +This example demonstrates how to attach an ML model from file to a SmartSim ``Model`` +to load into an ``Orchestrator`` at ``Model`` runtime. +The source code example is available in the dropdown below for +convenient execution and customization. + +.. note:: + SmartSim supports loading ML models from file to standalone ``Orchestrator(s)``. + This feature is **not** supported for colocated ``Orchestrator(s)``. + +.. dropdown:: Example Driver Script Source Code + + .. literalinclude:: tutorials/doc_examples/model_doc_examples/from_file_ml_model.py + +.. note:: + This example assumes: + + - a standalone ``Orchestrator`` is launched prior to the ``Model`` execution + - an initialized ``Model`` named `smartsim_model` exists within the ``Experiment`` workflow + - a TensorFlow-based ML model was serialized using ``freeze_model`` which returns + the path to the serialized model file and the names of the input and output layers + +**Attach the ML Model to a SmartSim Model** + +In this example, we have a serialized TensorFlow-based ML model that was saved to disk and stored under `model`. +Additionally, the ``freeze_model`` function returned the names of the input and output layers stored under +`inputs` and `outputs`. Assuming an initialized ``Model`` named `smartsim_model` exists, we add the TensorFlow model using +the ``Model.add_ml_model`` function and specify the TensorFlow model path to the parameter `model_path`: + +.. 
literalinclude:: tutorials/doc_examples/model_doc_examples/from_file_ml_model.py + :language: python + :linenos: + :lines: 39-40 + +In the above ``smartsim_model.add_ml_model`` code snippet, we pass in the following arguments: + +- `name` ("cnn"): A name to reference the ML model in the ``Orchestrator``. +- `backend` ("TF"): Indicating that the ML model is a TensorFlow model. +- `model_path` (model_file): The path to the ML model script. +- `device` ("GPU"): Specifying the device for ML model execution. +- `devices_per_node` (2): Use two GPUs per node. +- `first_device` (0): Start with 0 index GPU. +- `inputs` (inputs): The name of the ML model input nodes (TensorFlow only). +- `outputs` (outputs): The name of the ML model output nodes (TensorFlow only). + +.. warning:: + Calling `exp.start(smartsim_model)` prior to instantiation of an ``Orchestrator`` will result in + a failed attempt to load the ML model to a non-existent ``Orchestrator``. + +When the ``Model`` is started via ``Experiment.start``, the ML model will be loaded to the +launched standalone ``Orchestrator``. The ML model can then be executed on the ``Orchestrator`` +via a SmartRedis ``Client`` (:ref:`SmartRedis`) within the application code. + +.. _TS_doc: + +TorchScripts +============ +When configuring a ``Model``, users can instruct SmartSim to load TorchScripts +to the ``Orchestrator``. TorchScripts added are loaded into the ``Orchestrator`` prior to +the execution of the ``Model``. To load a TorchScript to the ``Orchestrator``, SmartSim users +can follow one of the processes: + +- :ref:`Define a TorchScript Function In-Memory` + Use the ``Model.add_function`` to instruct SmartSim to load an in-memory TorchScript to the ``Orchestrator``. +- :ref:`Define a TorchScript Function From File` + Provide file path to ``Model.add_script`` to instruct SmartSim to load the TorchScript from file to the ``Orchestrator``. 
+- :ref:`Define a TorchScript Function as String` + Provide function string to ``Model.add_script`` to instruct SmartSim to load a raw string as a TorchScript function to the ``Orchestrator``. + +.. note:: + SmartSim does **not** support loading in-memory TorchScript functions to colocated ``Orchestrator(s)``. + Users should instead load TorchScripts to a colocated ``Orchestrator`` from file or as a raw string. + +Continue or select a process link to learn more on how each function (``Model.add_script`` and ``Model.add_function``) +load TorchScripts to launched ``Orchestrator(s)``. + +.. _in_mem_TF_doc: + +------------------------------- +Attach an In-Memory TorchScript +------------------------------- +Users can define TorchScript functions within the Python driver script +to attach to a ``Model``. This feature is supported by ``Model.add_function`` which provides flexible +device selection, allowing users to choose between which device the TorchScript is executed on, `"GPU"` or `"CPU"`. +In environments with multiple devices, specific device numbers can be specified using the +`devices_per_node` function parameter. + +.. warning:: + ``Model.add_function`` does **not** support loading in-memory TorchScript functions to a colocated ``Orchestrator``. + If you would like to load a TorchScript function to a colocated ``Orchestrator``, define the function + as a :ref:`raw string` or :ref:`load from file`. + +The arguments that customize the execution of an in-memory TorchScript function can be found in the +:ref:`Model API` under the ``add_function`` docstring. + +Example: Load a In-Memory TorchScript Function +---------------------------------------------- +This example walks through the steps of instructing SmartSim to load an in-memory TorchScript function +to a standalone ``Orchestrator``. The source code example is available in the dropdown below for +convenient execution and customization. + +.. dropdown:: Example Driver Script Source Code + + .. 
literalinclude:: tutorials/doc_examples/model_doc_examples/in_mem_script.py + +.. note:: + The example assumes: + + - a standalone ``Orchestrator`` is launched prior to the ``Model`` execution + - an initialized ``Model`` named `smartsim_model` exists within the ``Experiment`` workflow + +**Define an In-Memory TF Function** + +To begin, define an in-memory TorchScript function within the ``Experiment`` driver script. +For the purpose of the example, we add a simple TorchScript function named `timestwo`: + +.. literalinclude:: tutorials/doc_examples/model_doc_examples/in_mem_script.py + :language: python + :linenos: + :lines: 3-4 + +**Attach the In-Memory TorchScript Function to a SmartSim Model** + +We use the ``Model.add_function`` function to instruct SmartSim to load the TorchScript function `timestwo` +onto the launched standalone ``Orchestrator``. Specify the function `timestwo` to the `function` +parameter: + +.. literalinclude:: tutorials/doc_examples/model_doc_examples/in_mem_script.py + :language: python + :linenos: + :lines: 15-16 + +In the above ``smartsim_model.add_function`` code snippet, we input the following arguments: + +- `name` ("example_func"): A name to uniquely identify the TorchScript within the ``Orchestrator``. +- `function` (timestwo): Name of the TorchScript function defined in the Python driver script. +- `device` ("CPU"): Specifying the device for TorchScript execution. +- `devices_per_node` (2): Use two GPUs per node. +- `first_device` (0): Start with 0 index GPU. + +.. warning:: + Calling `exp.start(smartsim_model)` prior to instantiation of an ``Orchestrator`` will result in + a failed attempt to load the TorchScript to a non-existent ``Orchestrator``. + +When the ``Model`` is started via ``Experiment.start``, the TF function will be loaded to the +standalone ``Orchestrator``. The function can then be executed on the ``Orchestrator`` via a SmartRedis +``Client`` (:ref:`SmartRedis`) within the application code. + +.. 
_TS_from_file: + +------------------------------ +Attach a TorchScript From File +------------------------------ +Users can attach TorchScript functions from a file to a ``Model`` and upload them to a +colocated or standalone ``Orchestrator``. This functionality is supported by the ``Model.add_script`` +function's `script_path` parameter. The function supports +flexible device selection, allowing users to choose between `"GPU"` or `"CPU"` via the `device` parameter. +In environments with multiple devices, specific device numbers can be specified using the +`devices_per_node` parameter. + +The arguments that customize the storage and execution of a TorchScript script can be found in the +:ref:`Model API` under the ``add_script`` docstring. + +Example: Load a TorchScript From File +------------------------------------- +This example walks through the steps of instructing SmartSim to load a TorchScript from file +to a launched ``Orchestrator``. The source code example is available in the dropdown below for +convenient execution and customization. + +.. dropdown:: Example Driver Script Source Code + + .. literalinclude:: tutorials/doc_examples/model_doc_examples/from_file_script.py + +.. note:: + This example assumes: + + - a ``Orchestrator`` is launched prior to the ``Model`` execution (Colocated or standalone) + - an initialized ``Model`` named `smartsim_model` exists within the ``Experiment`` workflow + +**Define a TorchScript Script** + +For the example, we create the Python script `torchscript.py`. The file contains a +simple torch function shown below: + +.. code-block:: python + + def negate(x): + return torch.neg(x) + +**Attach the TorchScript Script to a SmartSim Model** + +Assuming an initialized ``Model`` named `smartsim_model` exists, we add the TorchScript script using +``Model.add_script`` by specifying the script path to the `script_path` parameter: + +.. 
literalinclude:: tutorials/doc_examples/model_doc_examples/from_file_script.py + :language: python + :linenos: + :lines: 13-14 + +In the above ``smartsim_model.add_script`` code snippet, we include the following arguments: + +- `name` ("example_script"): Reference name for the script inside of the ``Orchestrator``. +- `script_path` ("path/to/torchscript.py"): Path to the script file. +- `device` ("CPU"): device for script execution. +- `devices_per_node` (2): Use two GPUs per node. +- `first_device` (0): Start with 0 index GPU. + +.. warning:: + Calling `exp.start(smartsim_model)` prior to instantiation of an ``Orchestrator`` will result in + a failed attempt to load the ML model to a non-existent ``Orchestrator``. + +When `smartsim_model` is started via ``Experiment.start``, the TorchScript will be loaded from file to the +``Orchestrator`` that is launched prior to the start of `smartsim_model`. The function can then be executed +on the ``Orchestrator`` via a SmartRedis ``Client`` (:ref:`SmartRedis`) within the application code. + +.. _TS_raw_string: + +--------------------------------- +Define TorchScripts as Raw String +--------------------------------- +Users can upload TorchScript functions from string to colocated or +standalone ``Orchestrator(s)``. This feature is supported by the +``Model.add_script`` function's `script` parameter. The function supports +flexible device selection, allowing users to choose between `"GPU"` or `"CPU"` via the `device` parameter. +In environments with multiple devices, specific device numbers can be specified using the +`devices_per_node` parameter. + +The arguments that customize the storage and execution of a TorchScript script can be found in the +:ref:`Model API` under the ``add_script`` docstring. + +Example: Load a TorchScript From String +--------------------------------------- +This example walks through the steps of instructing SmartSim to load a TorchScript +from string to a ``Orchestrator``. 
The source code example is available in the dropdown below for +convenient execution and customization. + +.. dropdown:: Example Driver Script Source Code + + .. literalinclude:: tutorials/doc_examples/model_doc_examples/string_script.py + +.. note:: + This example assumes: + + - a ``Orchestrator`` is launched prior to the ``Model`` execution (standalone or colocated) + - an initialized ``Model`` named `smartsim_model` exists within the ``Experiment`` workflow + +**Define a String TorchScript** + +Define the TorchScript code as a variable in the ``Experiment`` driver script: + +.. literalinclude:: tutorials/doc_examples/model_doc_examples/string_script.py + :language: python + :linenos: + :lines: 12-13 + +**Attach the TorchScript Function to a SmartSim Model** + +Assuming an initialized ``Model`` named `smartsim_model` exists, we add a TensorFlow model using +the ``Model.add_script`` function and specify the variable `torch_script_str` to the parameter +`script`: + +.. literalinclude:: tutorials/doc_examples/model_doc_examples/string_script.py + :language: python + :linenos: + :lines: 15-16 + +In the above ``smartsim_model.add_script`` code snippet, we offer the following arguments: + +- `name` ("example_script"): key to store script under. +- `script` (torch_script_str): TorchScript code. +- `device` ("CPU"): device for script execution. +- `devices_per_node` (2): Use two GPUs per node. +- `first_device` (0): Start with 0 index GPU. + +.. warning:: + Calling `exp.start(smartsim_model)` prior to instantiation of an ``Orchestrator`` will result in + a failed attempt to load the ML model to a non-existent ``Orchestrator``. + +When the ``Model`` is started via ``Experiment.start``, the TorchScript will be loaded to the +``Orchestrator`` that is launched prior to the start of the ``Model``. + +.. 
_model_key_collision: + +========================= +Data Collision Prevention +========================= +Overview +======== +If an ``Experiment`` consists of multiple ``Model(s)`` that use the same key names to reference +information in the ``Orchestrator``, the names used to reference data, ML models, and scripts will be +identical, and without the use of SmartSim and SmartRedis prefix methods, ``Model(s)`` +will end up inadvertently accessing or overwriting each other’s data. To prevent this +situation, the SmartSim ``Model`` object supports key prefixing, which prepends +the name of the ``Model`` to the keys sent to the ``Orchestrator`` to create unique key names. +With this enabled, collision is avoided and ``Model(s)`` can use the same key names within their applications. + +The key components of SmartSim ``Model`` prefixing functionality include: + +1. **Sending Data to the Orchestrator**: Users can send data to an ``Orchestrator`` + with the ``Model`` name prepended to the data name through SmartSim :ref:`Model functions` and + SmartRedis :ref:`Client functions`. +2. **Retrieving Data from the Orchestrator**: Users can instruct a ``Client`` to prepend a + ``Model`` name to a key during data retrieval, polling, or check for existence on the ``Orchestrator`` + through SmartRedis :ref:`Client functions`. + +For example, assume you have two ``Model(s)`` in an ``Experiment``, named `model_0` and `model_1`. In each +application code you use the function ``Client.put_tensor("tensor_0", data)`` to send a tensor named `"tensor_0"` +to the same ``Orchestrator``. With ``Model`` key prefixing turned on, the `model_0` and `model_1` +applications can access their respective tensor `"tensor_0"` by name without overwriting or accessing +the other ``Model(s)`` `"tensor_0"` tensor. In this scenario, the two tensors placed in the +``Orchestrator`` are `model_0.tensor_0` and `model_1.tensor_0`. 
+ +Enabling and Disabling +====================== +SmartSim provides support for toggling prefixing on a ``Model`` for tensors, ``Datasets``, +lists, ML models, and scripts. Prefixing functions from the SmartSim :ref:`Model API` and SmartRedis :ref:`Client API` rely on +each other to fully support SmartSim key prefixing. For example, to use the ``Client`` prefixing +functions, a user must enable prefixing on the ``Model`` through ``Model.enable_key_prefixing``. +This function enables and activates prefixing for tensors, ``Datasets`` and lists placed in an ``Orchestrator`` +by the ``Model``. This configuration can be toggled within the ``Model`` application through +``Client`` functions, such as disabling tensor prefixing via ``Client.use_tensor_ensemble_prefix(False)``. + +The interaction between the prefix SmartSim `Model Functions` and SmartRedis +`Client Functions` are documentation below. + +.. _model_prefix_func: + +--------------- +Model Functions +--------------- +A ``Model`` object supports two prefixing functions: ``Model.enable_key_prefixing`` and +``Model.register_incoming_entity``. + +To enable prefixing on a ``Model``, users must use the ``Model.enable_key_prefixing`` +function in the ``Experiment`` driver script. The key components of this function include: + +- Activates prefixing for tensors, ``Datasets``, and lists sent to a ``Orchestrator`` from within + the ``Model`` application. +- Enables access to prefixing ``Client`` functions within the ``Model`` application. This excludes + the ``Client.set_data_source`` function, where ``enable_key_prefixing`` is not require for access. + +.. note:: + ML model and script prefixing is not automatically enabled through ``Model.enable_key_prefixing`` + and rather must be enabled within the ``Model`` application using ``Client.use_model_ensemble_prefix``. 
+ +Users can enable a SmartRedis ``Client`` to interact with prefixed data, ML models and TorchScripts +within a ``Model`` application by specifying the producer entity name to ``Client.set_data_source``. +However, for SmartSim to recognize the entity name within the application, the producer +entity must be registered on the consumer entity using ``Ensemble.register_incoming_entity``. +This also applies to scenarios where the ``Model`` attempts to access data placed by self. +For more information on ``Client.set_data_source``, visit the +:ref:`Client functions` section. + +.. _client_prefix_func: + +---------------- +Client Functions +---------------- +A ``Client`` object supports five prefixing functions: ``Client.use_tensor_ensemble_prefix``, +``Client.use_dataset_ensemble_prefix``, ``Client.use_list_ensemble_prefix``, +``Client.use_model_ensemble_prefix`` and ``Client.set_data_source``. + +To enable or disable SmartRedis data structure prefixing for tensors, ``Datasets``, aggregation lists, ML models +and scripts, SmartRedis ``Client`` offers functions per data structure: + +- Tensor: ``Client.use_tensor_ensemble_prefix`` +- ``Dataset``: ``Client.use_dataset_ensemble_prefix`` +- Aggregation lists: ``Client.use_list_ensemble_prefix`` +- ML models/scripts: ``Client.use_model_ensemble_prefix`` + +.. warning:: + To access the ``Client`` prefixing functions, prefixing must be enabled on the + ``Model`` through ``Model.enable_key_prefixing``. This function activates prefixing + for tensors, ``Datasets`` and lists. + +Examples are provided below that show the use of these ``Client`` methods in conjunction +with the SmartSim key prefixing ``Model`` API functions. + +Users can enable the SmartSim ``Client`` to interact with prefixed data, ML models and TorchScripts +using the ``Client.set_data_source`` function. To leverage this capability: + +1. 
Use ``Model.register_incoming_entity`` on the ``Model`` intending to interact with prefixed data in the ``Orchestrator`` + placed by a separate ``Model``. +2. Pass the SmartSim entity (e.g., another ``Model``) to ``Model.register_incoming_entity`` in order to + reference the ``Model`` prefix in the application code. +3. In the ``Model`` application, instruct the ``Client`` to prepend the specified ``Model`` name during key searches + using ``Client.set_data_source("model_name")``. + +Examples are provided below that show the use of these ``Client`` methods in conjunction +with the SmartSim key prefixing ``Model`` API functions. + +.. _put_set_prefix: + +Put/Set Operations +================== +In the following tabs we provide snippets of driver script and application code to demonstrate +activating and deactivating prefixing for tensors, ``Datasets``, lists, ML models and scripts using +SmartRedis put/get semantics. + +.. tabs:: + + .. group-tab:: Tensor + **Activate Tensor Prefixing in the Driver Script** + + To activate prefixing on a ``Model`` in the driver script, a user must use the function + ``Model.enable_key_prefixing``. This functionality ensures that the ``Model`` name + is prepended to each tensor name sent to the ``Orchestrator`` from within the ``Model`` + executable code. The source code example is available in the dropdown below for + convenient execution and customization. + + .. dropdown:: Example Driver Script Source Code + + .. literalinclude:: tutorials/doc_examples/model_doc_examples/prefix_data.py + + In the driver script snippet below, we take an initialized ``Model`` and activate tensor + prefixing through the ``enable_key_prefixing`` function: + + .. literalinclude:: tutorials/doc_examples/model_doc_examples/prefix_data.py + :language: python + :linenos: + :lines: 6-12 + + In the `model` application, two tensors named `tensor_1` and `tensor_2` are sent to a launched ``Orchestrator``. 
+ The contents of the ``Orchestrator`` after ``Model`` completion are: + + .. code-block:: bash + + 1) "model_name.tensor_1" + 2) "model_name.tensor_2" + + You will notice that the ``Model`` name `model_name` has been prepended to each tensor name + and stored in the ``Orchestrator``. + + **Activate Tensor Prefixing in the Application** + + Users can further configure tensor prefixing in the application by using + the ``Client`` function ``use_tensor_ensemble_prefix``. By specifying a boolean + value to the function, users can turn prefixing on and off. + + .. note:: + To have access to ``Client.use_tensor_ensemble_prefix``, prefixing must be enabled + on the ``Model`` in the driver script via ``Model.enable_key_prefixing``. + + In the application snippet below, we demonstrate enabling and disabling tensor prefixing: + + .. code-block:: python + + # Disable key prefixing + client.use_tensor_ensemble_prefix(False) + # Place a tensor in the Orchestrator + client.put_tensor("tensor_1", np.array([1, 2, 3, 4])) + # Enable key prefixing + client.use_tensor_ensemble_prefix(True) + # Place a tensor in the Orchestrator + client.put_tensor("tensor_2", np.array([5, 6, 7, 8])) + + In the application, two tensors named `tensor_1` and `tensor_2` are sent to a launched ``Orchestrator``. + The contents of the ``Orchestrator`` after ``Model`` completion are: + + .. code-block:: bash + + 1) "tensor_1" + 2) "model_name.tensor_2" + + You will notice that the ``Model`` name `model_name` is **not** prefixed to `tensor_1` since + we disabled tensor prefixing before sending the tensor to the ``Orchestrator``. However, + when we enabled tensor prefixing and sent the second tensor, the ``Model`` name was prefixed + to `tensor_2`. + + .. group-tab:: Dataset + **Activate Dataset Prefixing in the Driver Script** + + To activate prefixing on a ``Model`` in the driver script, a user must use the function + ``Model.enable_key_prefixing``. 
This functionality ensures that the ``Model`` name + is prepended to each ``Dataset`` name sent to the ``Orchestrator`` from within the ``Model``. + The source code example is available in the dropdown below for + convenient execution and customization. + + .. dropdown:: Example Driver Script Source Code + + .. literalinclude:: tutorials/doc_examples/model_doc_examples/prefix_data.py + + In the driver script snippet below, we take an initialized ``Model`` and activate ``Dataset`` + prefixing through the ``enable_key_prefixing`` function: + + .. literalinclude:: tutorials/doc_examples/model_doc_examples/prefix_data.py + :language: python + :linenos: + :lines: 6-12 + + In the `model` application, two Datasets named `dataset_1` and `dataset_2` are sent to a launched ``Orchestrator``. + The contents of the ``Orchestrator`` after ``Model`` completion are: + + .. code-block:: bash + + 1) "model_name.{dataset_1}.dataset_tensor_1" + 2) "model_name.{dataset_1}.meta" + 3) "model_name.{dataset_2}.dataset_tensor_2" + 4) "model_name.{dataset_2}.meta" + + You will notice that the ``Model`` name `model_name` has been prefixed to each ``Dataset`` name + and stored in the ``Orchestrator``. + + **Activate Dataset Prefixing in the Application** + + Users can further configure ``Dataset`` prefixing in the application by using + the ``Client`` function ``use_dataset_ensemble_prefix``. By specifying a boolean + value to the function, users can turn prefixing on and off. + + .. note:: + To have access to ``Client.use_dataset_ensemble_prefix``, prefixing must be enabled + on the ``Model`` in the driver script via ``Model.enable_key_prefixing``. + + In the application snippet below, we demonstrate enabling and disabling ``Dataset`` prefixing: + + .. 
code-block:: python
+
+            # Disable key prefixing
+            client.use_dataset_ensemble_prefix(False)
+            # Place a Dataset in the Orchestrator
+            client.put_dataset(dataset_1)
+            # Enable key prefixing
+            client.use_dataset_ensemble_prefix(True)
+            # Place a Dataset in the Orchestrator
+            client.put_dataset(dataset_2)
+
+        In the application, we have two ``Datasets`` named `dataset_1` and `dataset_2`.
+        We then send them to a launched ``Orchestrator``. The contents of the ``Orchestrator`` after ``Model`` completion are:
+
+        .. code-block:: bash
+
+            1) "{dataset_1}.dataset_tensor_1"
+            2) "{dataset_1}.meta"
+            3) "model_name.{dataset_2}.dataset_tensor_2"
+            4) "model_name.{dataset_2}.meta"
+
+        You will notice that the ``Model`` name `model_name` is **not** prefixed to `dataset_1` since
+        we disabled ``Dataset`` prefixing before sending the ``Dataset`` to the ``Orchestrator``. However,
+        when we enabled ``Dataset`` prefixing and sent the second ``Dataset``, the ``Model`` name was prefixed
+        to `dataset_2`.
+
+    .. group-tab:: Aggregation List
+        **Activate Aggregation List Prefixing in the Driver Script**
+
+        To activate prefixing on a ``Model`` in the driver script, a user must use the function
+        ``Model.enable_key_prefixing``. This functionality ensures that the ``Model`` name
+        is prepended to each list name sent to the ``Orchestrator`` from within the ``Model``.
+        The source code example is available in the dropdown below for
+        convenient execution and customization.
+
+        .. dropdown:: Example Driver Script Source Code
+
+            .. literalinclude:: tutorials/doc_examples/model_doc_examples/prefix_data.py
+
+        In the driver script snippet below, we take an initialized ``Model`` and activate list
+        prefixing through the ``enable_key_prefixing`` function:
+
+        .. literalinclude:: tutorials/doc_examples/model_doc_examples/prefix_data.py
+            :language: python
+            :linenos:
+            :lines: 6-12
+
+        In the `model` application, a list named `dataset_list` is sent to a launched ``Orchestrator``. 
+
+        The contents of the ``Orchestrator`` after ``Model`` completion are:
+
+        .. code-block:: bash
+
+            1) "model_name.dataset_list"
+
+        You will notice that the ``Model`` name `model_name` has been prefixed to the list name
+        and stored in the ``Orchestrator``.
+
+        **Activate Aggregation List Prefixing in the Application**
+
+        Users can further configure list prefixing in the application by using
+        the ``Client`` function ``use_list_ensemble_prefix``. By specifying a boolean
+        value to the function, users can turn prefixing on and off.
+
+        .. note::
+            To have access to ``Client.use_list_ensemble_prefix``, prefixing must be enabled
+            on the ``Model`` in the driver script via ``Model.enable_key_prefixing``.
+
+        In the application snippet below, we demonstrate enabling and disabling list prefixing:
+
+        .. code-block:: python
+
+            # Disable key prefixing
+            client.use_list_ensemble_prefix(False)
+            # Place a Dataset in the Orchestrator
+            client.put_dataset(dataset_1)
+            # Place a list in the Orchestrator
+            client.append_to_list("list_1", dataset_1)
+            # Enable key prefixing
+            client.use_list_ensemble_prefix(True)
+            # Place a Dataset in the Orchestrator
+            client.put_dataset(dataset_2)
+            # Append Dataset to list in the Orchestrator
+            client.append_to_list("list_2", dataset_2)
+
+        In the application, two lists named `list_1` and `list_2` are sent to the ``Orchestrator``.
+        The contents of the ``Orchestrator`` after ``Model`` completion are:
+
+        .. code-block:: bash
+
+            1) "list_1"
+            2) "model_name.{dataset_1}.meta"
+            3) "model_name.{dataset_1}.dataset_tensor_1"
+            4) "model_name.list_2"
+            5) "model_name.{dataset_2}.meta"
+            6) "model_name.{dataset_2}.dataset_tensor_2"
+
+        You will notice that the ``Model`` name `model_name` is **not** prefixed to `list_1` since
+        we disabled list prefixing before sending the list to the ``Orchestrator``. 
However, + when we enabled list prefixing and sent the second list, the ``Model`` name was prefixed + to `list_2` as well as the list ``Dataset`` members. + + .. note:: + The ``Datasets`` sent to the ``Orchestrator`` are all prefixed. This is because + ``Model.enable_key_prefixing`` turns on prefixing for tensors, ``Datasets`` and lists. + + .. group-tab:: ML Model + **Activate ML Model Prefixing in the Application** + + Users can configure ML model prefixing in the application by using + the ``Client`` function ``use_model_ensemble_prefix``. By specifying a boolean + value to the function, users can turn prefixing on and off. + + .. note:: + To have access to ``Client.use_model_ensemble_prefix``, prefixing must be enabled + on the ``Model`` in the driver script via ``Model.enable_key_prefixing``. + + In the application snippet below, we demonstrate enabling and disabling ML model prefixing: + + .. code-block:: python + + # Disable ML model prefixing + client.use_model_ensemble_prefix(False) + # Send ML model to the Orchestrator + client.set_model( + "ml_model_1", serialized_model_1, "TF", device="CPU", inputs=inputs, outputs=outputs + ) + # Enable ML model prefixing + client.use_model_ensemble_prefix(True) + # Send prefixed ML model to the Orchestrator + client.set_model( + "ml_model_2", serialized_model_2, "TF", device="CPU", inputs=inputs, outputs=outputs + ) + + In the application, two ML models named `ml_model_1` and `ml_model_2` are sent + to a launched ``Orchestrator``. The contents of the ``Orchestrator`` after ``Model`` completion are: + + .. code-block:: bash + + 1) "ml_model_1" + 2) "model_name.ml_model_2" + + You will notice that the ``Model`` name `model_name` is **not** prefixed to `ml_model_1` since + we disabled ML model prefixing before sending the ML model to the ``Orchestrator``. However, + when we enabled ML model prefixing and sent the second ML model, the ``Model`` name was prefixed + to `ml_model_2`. + + .. 
group-tab:: Script
+        **Activate Script Prefixing in the Application**
+
+        Users can configure script prefixing in the application by using
+        the ``Client`` function ``use_model_ensemble_prefix``. By specifying a boolean
+        value to the function, users can turn prefixing on and off.
+
+        .. note::
+            To have access to ``Client.use_model_ensemble_prefix``, prefixing must be enabled
+            on the ``Model`` in the driver script via ``Model.enable_key_prefixing``.
+
+        In the application snippet below, we demonstrate enabling and disabling script prefixing:
+
+        .. code-block:: python
+
+            # Disable script prefixing
+            client.use_model_ensemble_prefix(False)
+            # Store a script in the Orchestrator
+            client.set_function("script_1", script_1)
+            # Enable script prefixing
+            client.use_model_ensemble_prefix(True)
+            # Store a prefixed script in the Orchestrator
+            client.set_function("script_2", script_2)
+
+        In the application, two scripts named `script_1` and `script_2` are sent
+        to a launched ``Orchestrator``. The contents of the ``Orchestrator`` after ``Model`` completion are:
+
+        .. code-block:: bash
+
+            1) "script_1"
+            2) "model_name.script_2"
+
+        You will notice that the ``Model`` name `model_name` is **not** prefixed to `script_1` since
+        we disabled script prefixing before sending the script to the ``Orchestrator``. However,
+        when we enabled script prefixing and sent the second script, the ``Model`` name was prefixed
+        to `script_2`.
+
+.. _get_prefix:
+
+Get Operations
+==============
+In the following sections, we walk through snippets of application code to demonstrate the retrieval
+of prefixed tensors, ``Datasets``, lists, ML models, and scripts using SmartRedis put/get
+semantics. The examples demonstrate retrieval within the same application where the data
+structures were placed, as well as scenarios where data structures are placed by separate
+applications.
+
+.. tabs::
+
+    .. 
group-tab:: Tensor
+        **Retrieve a Tensor Placed by the Same Application**
+
+        SmartSim supports retrieving prefixed tensors sent to the ``Orchestrator`` from within the
+        same application where the tensor was placed. To achieve this, users must
+        provide the ``Model`` name that stored the tensor to ``Client.set_data_source``. This action
+        instructs the ``Client`` to prepend the ``Model`` name to all key searches. For SmartSim to
+        recognize the ``Model`` name as a data source, users must execute the
+        ``Model.register_incoming_entity`` function on the ``Model`` and pass the self ``Model`` name
+        in the driver script.
+
+        As an example, we placed a prefixed tensor on the ``Orchestrator`` within a ``Model`` named
+        `model_1`. The ``Orchestrator`` contents are:
+
+        .. code-block:: bash
+
+            1) "model_1.tensor_name"
+
+        .. note::
+            In the driver script, after initializing the ``Model`` instance named `model_1`,
+            we execute ``model_1.register_incoming_entity(model_1)``. By passing the ``Model``
+            instance to itself, we instruct SmartSim to recognize the name of `model_1` as a valid data
+            source for subsequent use in ``Client.set_data_source``.
+
+        In the application snippet below, we demonstrate retrieving the tensor:
+
+        .. code-block:: python
+
+            # Set the name to prepend to key searches
+            client.set_data_source("model_1")
+            # Retrieve the prefixed tensor
+            tensor_data = client.get_tensor("tensor_name")
+            # Log the tensor data
+            client.log_data(LLInfo, f"The tensor value is: {tensor_data}")
+
+        In the `model.out` file, the ``Client`` will log the message::
+
+            Default@00-00-00:The tensor value is: [1 2 3 4]
+
+        **Retrieve a Tensor Placed by an External Application**
+
+        SmartSim supports retrieving prefixed tensors sent to the ``Orchestrator`` by separate
+        ``Model(s)``. To achieve this, users need to provide the ``Model`` name that stored the tensor
+        to ``Client.set_data_source``. 
This action instructs the ``Client`` to prepend the ``Model``
+        name to all key searches. For SmartSim to recognize the ``Model`` name as a data source,
+        users must execute the ``Model.register_incoming_entity`` function on the ``Model``
+        responsible for the search and pass the ``Model`` instance that stored the data in the
+        driver script.
+
+        In the example, a ``Model`` named `model_1` has placed a tensor in a standalone
+        ``Orchestrator`` with prefixing enabled on the ``Model``. The contents of the ``Orchestrator``
+        are as follows:
+
+        .. code-block:: bash
+
+            1) "model_1.tensor_name"
+
+        We create a separate ``Model``, named `model_2`, with the executable application code below.
+
+        .. note::
+            In the driver script, after initializing the ``Model`` instance named `model_2`,
+            we execute ``model_2.register_incoming_entity(model_1)``. By passing the producer ``Model``
+            instance to the consumer ``Model``, we instruct SmartSim to recognize the name of `model_1` as a valid data
+            source for subsequent use in ``Client.set_data_source``.
+
+        Here we retrieve the stored tensor named `tensor_name`:
+
+        .. code-block:: python
+
+            # Set the Model source name
+            client.set_data_source("model_1")
+            # Retrieve the prefixed tensor
+            tensor_data = client.get_tensor("tensor_name")
+            # Log the tensor data
+            client.log_data(LLInfo, f"The tensor value is: {tensor_data}")
+
+        In the `model.out` file, the ``Client`` will log the message::
+
+            Default@00-00-00:The tensor value is: [1 2 3 4]
+
+    .. group-tab:: Dataset
+        **Retrieve a Dataset Placed by the Same Application**
+
+        SmartSim supports retrieving prefixed ``Datasets`` sent to the ``Orchestrator`` from within the
+        same application where the ``Dataset`` was placed. To achieve this, users must
+        provide the ``Model`` name that stored the ``Dataset`` to ``Client.set_data_source``. This action
+        instructs the ``Client`` to prepend the ``Model`` name to all key searches. 
For SmartSim to + recognize the ``Model`` name as a data source, users must execute the + ``Model.register_incoming_entity`` function on the ``Model`` and pass the self ``Model`` name. + + As an example, we placed a prefixed ``Dataset`` on the ``Orchestrator`` within a ``Model`` named + `model_1`. The ``Orchestrator`` contents are: + + .. code-block:: bash + + 1) "model_1.{dataset_name}.dataset_tensor" + 2) "model_1.{dataset_name}.meta" + + .. note:: + In the driver script, after initializing the ``Model`` instance named `model_1`, + we execute ``model_1.register_incoming_entity(model_1)``. By passing the ``Model`` + instance to itself, we instruct SmartSim to recognize the name of `model_1` as a valid data + source for subsequent use in ``Client.set_data_source``. + + In the application snippet below, we demonstrate retrieving the ``Dataset``: + + .. code-block:: python + + # Set the name to prepend to key searches + client.set_data_source("model_1") + # Retrieve the prefixed Dataset + dataset_data = client.get_dataset("dataset_name") + # Log the Dataset data + client.log_data(LLInfo, f"The Dataset value is: {dataset_data}") + + In the `model.out` file, the ``Client`` will log the message: + + .. code-block:: bash + + Default@00-00-00:Default@00-00-00:The dataset value is: + + DataSet (dataset_name): + Tensors: + dataset_tensor: + type: 16 bit unsigned integer + dimensions: [4] + elements: 4 + Metadata: + none + + **Retrieve a Dataset Placed by an External Application** + + SmartSim supports retrieving prefixed ``Datasets`` sent to the ``Orchestrator`` by separate + ``Model(s)``. To achieve this, users need to provide the ``Model`` name that stored the ``Dataset`` + to ``Client.set_data_source``. This action instructs the ``Client`` to prepend the ``Model`` + name to all key searches. 
For SmartSim to recognize the ``Model`` name as a data source, + users must execute the ``Model.register_incoming_entity`` function on the ``Model`` + responsible for the search and pass the ``Model`` instance that stored the data. + + In the example, a ``Model`` named `model_1` has placed a ``Dataset`` in a standalone + ``Orchestrator`` with prefixing enabled on the ``Model``. The contents of the ``Orchestrator`` + are as follows: + + .. code-block:: bash + + 1) "model_1.{dataset_name}.dataset_tensor" + 2) "model_1.{dataset_name}.meta" + + We create a separate ``Model``, named `model_2`, with the executable application code below. + + .. note:: + In the driver script, after initializing the ``Model`` instance named `model_2`, + we execute ``model_2.register_incoming_entity(model_1)``. By passing the producer ``Model`` + instance to the consumer ``Model``, we instruct SmartSim to recognize the name of `model_1` as a valid data + source for subsequent use in ``Client.set_data_source``. + + Here we retrieve the stored ``Dataset`` named `dataset_name`: + + .. code-block:: python + + # Set the Model source name + client.set_data_source("model_1") + # Retrieve the prefixed Dataset + dataset_data = client.get_dataset("dataset_name") + # Log the Dataset data + client.log_data(LLInfo, f"The Dataset value is: {dataset_data}") + + In the `model.out` file, the ``Client`` will log the message: + + .. code-block:: bash + + Default@00-00-00:Default@00-00-00:The Dataset value is: + + DataSet (dataset_name): + Tensors: + dataset_tensor: + type: 16 bit unsigned integer + dimensions: [4] + elements: 4 + Metadata: + none + + .. group-tab:: Aggregation List + **Retrieve a Aggregation List Placed by the Same Application** + + SmartSim supports retrieving prefixed lists sent to the ``Orchestrator`` from within the + same application where the list was placed. To achieve this, users must + provide the ``Model`` name that stored the list to ``Client.set_data_source``. 
This action + instructs the ``Client`` to prepend the ``Model`` name to all key searches. For SmartSim to + recognize the ``Model`` name as a data source, users must execute the + ``Model.register_incoming_entity`` function on the ``Model`` and pass the self ``Model`` name. + + As an example, we placed a prefixed list on the ``Orchestrator`` within a ``Model`` named + `model_1`. The ``Orchestrator`` contents are: + + .. code-block:: bash + + 1) "model_1.dataset_list" + + .. note:: + In the driver script, after initializing the ``Model`` instance named `model_1`, + we execute ``model_1.register_incoming_entity(model_1)``. By passing the ``Model`` + instance to itself, we instruct SmartSim to recognize the name of `model_1` as a valid data + source for subsequent use in ``Client.set_data_source``. + + In the application snippet below, we demonstrate checking the length of the list: + + .. code-block:: python + + # Set the name to prepend to key searches + client.set_data_source("model_1") + # Retrieve the prefixed list + list_data = client.get_datasets_from_list("dataset_list") + # Log the list data + client.log_data(LLInfo, f"The length of the list is: {len(list_data)}") + + In the `model.out` file, the ``Client`` will log the message:: + The length of the list is: 1 + + **Retrieve a Aggregation List Placed by an External Application** + + SmartSim supports retrieving prefixed lists sent to the ``Orchestrator`` by separate + ``Model(s)``. To achieve this, users need to provide the ``Model`` name that stored the list + to ``Client.set_data_source``. This action instructs the ``Client`` to prepend the ``Model`` + name to all key searches. For SmartSim to recognize the ``Model`` name as a data source, + users must execute the ``Model.register_incoming_entity`` function on the ``Model`` + responsible for the search and pass the ``Model`` instance that stored the data. 
+ + In the example, a ``Model`` named `model_1` has placed a list in a standalone + ``Orchestrator`` with prefixing enabled on the ``Model``. The contents of the ``Orchestrator`` + are as follows: + + .. code-block:: bash + + 1) "model_1.dataset_list" + + We create a separate ``Model``, named `model_2`, with the executable application code below. + + .. note:: + In the driver script, after initializing the ``Model`` instance named `model_2`, + we execute ``model_2.register_incoming_entity(model_1)``. By passing the producer ``Model`` + instance to the consumer ``Model``, we instruct SmartSim to recognize the name of `model_1` as a valid data + source for subsequent use in ``Client.set_data_source``. + + Here we check the length of the list named `dataset_list`: + + .. code-block:: python + + # Set the Model source name + client.set_data_source("model_1") + # Retrieve the prefixed list + list_data = client.get_datasets_from_list("dataset_list") + # Log the list data + client.log_data(LLInfo, f"The length of the list is: {len(list_data)}") + + In the `model.out` file, the ``Client`` will log the message:: + The length of the list is: 1 + + .. group-tab:: ML Model + **Retrieve an ML Model Placed by the Same Application** + + SmartSim supports retrieving prefixed ML models sent to the ``Orchestrator`` from within the + same application where the ML model was placed. To achieve this, users must + provide the ``Model`` name that stored the ML model to ``Client.set_data_source``. This action + instructs the ``Client`` to prepend the ``Model`` name to all key searches. For SmartSim to + recognize the ``Model`` name as a data source, users must execute the + ``Model.register_incoming_entity`` function on the ``Model`` and pass the self ``Model`` name. + + As an example, we placed a prefixed ML model on the ``Orchestrator`` within a ``Model`` named + `model_1`. The ``Orchestrator`` contents are: + + .. code-block:: bash + + 1) "model_1.mnist_cnn" + + .. 
note:: + In the driver script, after initializing the ``Model`` instance named `model_1`, + we execute ``model_1.register_incoming_entity(model_1)``. By passing the ``Model`` + instance to itself, we instruct SmartSim to recognize the name of `model_1` as a valid data + source for subsequent use in ``Client.set_data_source``. + + In the application snippet below, we demonstrate retrieving the ML model: + + .. code-block:: python + + # Set the name to prepend to key searches + client.set_data_source("model_1") + # Retrieve the prefixed ML model + model_data = client.get_model("mnist_cnn") + + **Retrieve a ML Model Placed by an External Application** + + SmartSim supports retrieving prefixed ML model sent to the ``Orchestrator`` by separate + ``Model(s)``. To achieve this, users need to provide the ``Model`` name that stored the ML model + to ``Client.set_data_source``. This action instructs the ``Client`` to prepend the ``Model`` + name to all key searches. For SmartSim to recognize the ``Model`` name as a data source, + users must execute the ``Model.register_incoming_entity`` function on the ``Model`` + responsible for the search and pass the ``Model`` instance that stored the data. + + In the example, a ``Model`` named `model_1` has placed a ML model in a standalone + ``Orchestrator`` with prefixing enabled on the ``Model``. The contents of the ``Orchestrator`` + are as follows: + + .. code-block:: bash + + 1) "model_1.mnist_cnn" + + We create a separate ``Model``, named `model_2`, with the executable application code below. + + .. note:: + In the driver script, after initializing the ``Model`` instance named `model_2`, + we execute ``model_2.register_incoming_entity(model_1)``. By passing the producer ``Model`` + instance to the consumer ``Model``, we instruct SmartSim to recognize the name of `model_1` as a valid data + source for subsequent use in ``Client.set_data_source``. + + Here we retrieve the stored ML model named `mnist_cnn`: + + .. 
code-block:: python + + # Set the Model source name + client.set_data_source("model_1") + # Retrieve the prefixed model + model_data = client.get_model("mnist_cnn") + + .. group-tab:: Script + **Retrieve a Script Placed by the Same Application** + + SmartSim supports retrieving prefixed scripts sent to the ``Orchestrator`` from within the + same application where the script was placed. To achieve this, users must + provide the ``Model`` name that stored the script to ``Client.set_data_source``. This action + instructs the ``Client`` to prepend the ``Model`` name to all key searches. For SmartSim to + recognize the ``Model`` name as a data source, users must execute the + ``Model.register_incoming_entity`` function on the ``Model`` and pass the self ``Model`` name. + + As an example, we placed a prefixed script on the ``Orchestrator`` within a ``Model`` named + `model_1`. The ``Orchestrator`` contents are: + + .. code-block:: bash + + 1) "model_1.normalizer" + + .. note:: + In the driver script, after initializing the ``Model`` instance named `model_1`, + we execute ``model_1.register_incoming_entity(model_1)``. By passing the ``Model`` + instance to itself, we instruct SmartSim to recognize the name of `model_1` as a valid data + source for subsequent use in ``Client.set_data_source``. + + In the application snippet below, we demonstrate retrieving the script: + + .. code-block:: python + + # Set the name to prepend to key searches + client.set_data_source("model_1") + # Retrieve the prefixed script + script_data = client.get_script("normalizer") + # Log the script data + client.log_data(LLInfo, f"The script data is: {script_data}") + + In the `model.out` file, the ``Client`` will log the message: + + .. 
code-block:: bash + + The script data is: def normalize(X): + """Simple function to normalize a tensor""" + mean = X.mean + std = X.std + + return (X-mean)/std + + **Retrieve a Script Placed by an External Application** + + SmartSim supports retrieving prefixed scripts sent to the ``Orchestrator`` by separate + ``Model(s)``. To achieve this, users need to provide the ``Model`` name that stored the script + to ``Client.set_data_source``. This action instructs the ``Client`` to prepend the ``Model`` + name to all key searches. For SmartSim to recognize the ``Model`` name as a data source, + users must execute the ``Model.register_incoming_entity`` function on the ``Model`` + responsible for the search and pass the ``Model`` instance that stored the data. + + In the example, a ``Model`` named `model_1` has placed a script in a standalone + ``Orchestrator`` with prefixing enabled on the ``Model``. The contents of the ``Orchestrator`` + are as follows: + + .. code-block:: bash + + 1) "model_1.normalizer" + + We create a separate ``Model``, named `model_2`, with the executable application code below. + + .. note:: + In the driver script, after initializing the ``Model`` instance named `model_2`, + we execute ``model_2.register_incoming_entity(model_1)``. By passing the producer ``Model`` + instance to the consumer ``Model``, we instruct SmartSim to recognize the name of `model_1` as a valid data + source for subsequent use in ``Client.set_data_source``. + + Here we retrieve the stored script named `normalizer`: + + .. code-block:: python + + # Set the Model source name + client.set_data_source("model_1") + # Retrieve the prefixed script + script_data = client.get_script("normalizer") + # Log the script data + client.log_data(LLInfo, f"The script data is: {script_data}") + + In the `model.out` file, the ``Client`` will log the message: + + .. 
code-block:: bash + + The script data is: def normalize(X): + """Simple function to normalize a tensor""" + mean = X.mean + std = X.std + + return (X-mean)/std + +.. _run_prefix: + +Run Operations +============== +In the following sections, we walk through snippets of application code to demonstrate executing +prefixed ML models and scripts using SmartRedis run semantics. The examples demonstrate +executing within the same application where the ML Model and Script were placed, as well as scenarios +where ML Model and Script are placed by separate applications. + +.. tabs:: + + .. group-tab:: ML Model + **Access ML Models From within the Application** + + SmartSim supports executing prefixed ML models with prefixed tensors sent to the ``Orchestrator`` from within + the same application that the ML model was placed. To achieve this, users must + provide the ``Model`` name that stored the ML model and input tensors to ``Client.set_data_source``. This action + instructs the ``Client`` to prepend the ``Model`` name to all key names. For SmartSim to + recognize the ``Model`` name as a data source, users must execute the + ``Model.register_incoming_entity`` function on the ``Model`` and pass the self ``Model`` name. + + As an example, we placed a prefixed ML model and tensor on the ``Orchestrator`` within a ``Model`` named + `model_1`. The ``Orchestrator`` contents are: + + .. code-block:: bash + + 1) "model_1.mnist_cnn" + 2) "model_1.mnist_images" + + .. note:: + In the driver script, after initializing the ``Model`` instance named `model_1`, + we execute ``model_1.register_incoming_entity(model_1)``. By passing the ``Model`` + instance to itself, we instruct SmartSim to recognize the name of `model_1` as a valid data + source for subsequent use in ``Client.set_data_source``. + + In the application snippet below, we demonstrate running the ML model: + + .. 
code-block:: python + + # Set the Model source name + client.set_data_source("model_1") + # Run the ML model + client.run_model(name="mnist_cnn", inputs=["mnist_images"], outputs=["Identity"]) + + The ``Orchestrator`` now contains prefixed output tensors: + + .. code-block:: bash + + 1) "model_1.Identity" + 2) "model_1.mnist_cnn" + 3) "model_1.mnist_images" + + .. note:: + The output tensors are prefixed because we executed ``model_1.enable_key_prefixing`` + in the driver script which enables and activates prefixing for tensors, ``Datasets`` + and lists. + + **Access ML Models Loaded From an External Application** + + SmartSim supports executing prefixed ML models with prefixed tensors sent to the ``Orchestrator`` by separate + ``Model(s)``. To achieve this, users need to provide the ``Model`` name that stored the ML model and tensor + to ``Client.set_data_source``. This action instructs the ``Client`` to prepend the ``Model`` + name to all key searches. For SmartSim to recognize the ``Model`` name as a data source, + users must execute the ``Model.register_incoming_entity`` function on the ``Model`` + responsible for the search and pass the ``Model`` instance that stored the data. + + In the example, a ``Model`` named `model_1` has placed a ML model and tensor in a standalone + ``Orchestrator`` with prefixing enabled on the ``Model``. The contents of the ``Orchestrator`` + are as follows: + + .. code-block:: bash + + 1) "model_1.mnist_cnn" + 2) "model_1.mnist_images" + + We create a separate ``Model``, named `model_2`, with the executable application code below. + + .. note:: + In the driver script, after initializing the ``Model`` instance named `model_2`, + we execute ``model_2.register_incoming_entity(model_1)``. By passing the producer ``Model`` + instance to the consumer ``Model``, we instruct SmartSim to recognize the name of `model_1` as a valid data + source for subsequent use in ``Client.set_data_source``. 
+ + In the application snippet below, we demonstrate running the ML model: + + .. code-block:: python + + # Set the Model source name + client.set_data_source("model_1") + # Run the ML model + client.run_model(name="mnist_cnn", inputs=["mnist_images"], outputs=["Identity"]) + + The ``Orchestrator`` now contains prefixed output tensors: + + .. code-block:: bash + + 1) "model_2.Identity" + 2) "model_1.mnist_cnn" + 3) "model_1.mnist_images" + + .. note:: + The output tensors are prefixed because we executed ``model_2.enable_key_prefixing`` + in the driver script which enables and activates prefixing for tensors, ``Datasets`` + and lists. + + .. group-tab:: Script + + **Access Scripts From within the Application** + + SmartSim supports executing prefixed scripts with prefixed tensors sent to the ``Orchestrator`` from within + the same application that the script was placed. To achieve this, users must + provide the ``Model`` name that stored the script and input tensors to ``Client.set_data_source``. This action + instructs the ``Client`` to prepend the ``Model`` name to all key names. For SmartSim to + recognize the ``Model`` name as a data source, users must execute the + ``Model.register_incoming_entity`` function on the ``Model`` and pass the self ``Model`` name. + + As an example, we placed a prefixed script and tensor on the ``Orchestrator`` within a ``Model`` named + `model_1`. The ``Orchestrator`` contents are: + + .. code-block:: bash + + 1) "model_1.normalizer" + 2) "model_1.X_rand" + + To run the script, we provide the ``Model`` name `model_1` that placed the script to + ``Client.set_data_source``, then pass the unprefixed script name `"normalizer"` and + input tensor name `"X_rand"`, as demonstrated below: + + .. code-block:: python + + # Set the Model source name + client.set_data_source("model_1") + # Run the script + client.run_script("normalizer", "normalize", inputs=["X_rand"], outputs=["X_norm"]) + + The ``Orchestrator`` now contains prefixed output tensors: + + .. 
code-block:: bash + + 1) "model_1.normalizer" + 2) "model_1.X_rand" + 3) "model_1.X_norm" + + .. note:: + The output tensors are prefixed because we executed ``model_1.enable_key_prefixing`` + in the driver script which enables and activates prefixing for tensors, ``Datasets`` + and lists. + + **Access Scripts Loaded From an External Application** + + SmartSim supports executing prefixed scripts with prefixed tensors sent to the ``Orchestrator`` by separate + ``Model(s)``. To achieve this, users need to provide the ``Model`` name that stored the script and tensor + to ``Client.set_data_source``. This action instructs the ``Client`` to prepend the ``Model`` + name to all key searches. For SmartSim to recognize the ``Model`` name as a data source, + users must execute the ``Model.register_incoming_entity`` function on the ``Model`` + responsible for the search and pass the ``Model`` instance that stored the data. + + In the example, a ``Model`` named `model_1` has placed a script and tensor in a standalone + ``Orchestrator`` with prefixing enabled on the ``Model``. The contents of the ``Orchestrator`` + are as follows: + + .. code-block:: bash + + 1) "model_1.normalizer" + 2) "model_1.X_rand" + + We create a separate ``Model``, named `model_2`, with the executable application code below. + + .. note:: + In the driver script, after initializing the ``Model`` instance named `model_2`, + we execute ``model_2.register_incoming_entity(model_1)``. By passing the producer ``Model`` + instance to the consumer ``Model``, we instruct SmartSim to recognize the name of `model_1` as a valid data + source for use in ``Client.set_data_source``. + + In the application snippet below, we demonstrate running the script: + + .. code-block:: python + + # Set the Model source name + client.set_data_source("model_1") + # Run the script + client.run_script("normalizer", "normalize", inputs=["X_rand"], outputs=["X_norm"]) + + The ``Orchestrator`` now contains prefixed output tensors: + + .. 
code-block:: bash + + 1) "model_1.normalizer" + 2) "model_1.X_rand" + 3) "model_2.X_norm" + + .. note:: + The output tensors are prefixed because we executed ``model_2.enable_key_prefixing`` + in the driver script which enables and activates prefixing for tensors, ``Datasets`` + and lists. + +.. _copy_rename_del_prefix: + +Copy/Rename/Delete Operations +============================= +In the following sections, we walk through snippets of application code to demonstrate the copy, rename and delete +operations on prefixed tensors, ``Datasets``, lists, ML models, and scripts. The examples +demonstrate these operations within the same script where the data +structures were placed, as well as scenarios where data structures are placed by separate +scripts. + +.. tabs:: + + .. group-tab:: Tensor + **Copy/Rename/Delete Operations on Tensors in The Same Application** + + SmartSim supports copy/rename/delete operations on prefixed tensors sent to the ``Orchestrator`` from within + the same application that the tensor was placed. To achieve this, users must + provide the ``Model`` name that stored the tensor to ``Client.set_data_source``. This action + instructs the ``Client`` to prepend the ``Model`` name to all key names. For SmartSim to + recognize the ``Model`` name as a data source, users must execute the + ``Model.register_incoming_entity`` function on the ``Model`` and pass the self ``Model`` name. + + As an example, we placed a prefixed tensor on the ``Orchestrator`` within a ``Model`` named + `model_1`. The ``Orchestrator`` contents are: + + .. code-block:: bash + + 1) "model_1.tensor" + + .. note:: + In the driver script, after initializing the ``Model`` instance named `model_1`, + we execute ``model_1.register_incoming_entity(model_1)``. By passing the ``Model`` + instance to itself, we instruct SmartSim to recognize the name of `model_1` as a valid data + source for subsequent use in ``Client.set_data_source``. 
+ + To rename the tensor in the ``Orchestrator``, we provide self ``Model`` name + to ``Client.set_data_source`` then execute the function ``rename_tensor``: + + .. code-block:: python + + # Set the Model source name + client.set_data_source("model_1") + # Rename the tensor + client.rename_tensor("tensor", "renamed_tensor") + + Because prefixing is enabled on the ``Model`` via ``enable_key_prefixing`` in the driver script, + SmartSim will keep the prefix on the tensor but replace the tensor name as shown in the ``Orchestrator``: + + .. code-block:: bash + + 1) "model_1.renamed_tensor" + + Next, we copy the prefixed tensor to a new destination: + + .. code-block:: python + + client.copy_tensor("renamed_tensor", "copied_tensor") + + Since tensor prefixing is enabled on the ``Client``, the `copied_tensor` is prefixed: + + .. code-block:: bash + + 1) "model_1.renamed_tensor" + 2) "model_1.copied_tensor" + + Next, delete `renamed_tensor`: + + .. code-block:: python + + client.delete_tensor("renamed_tensor") + + The contents of the ``Orchestrator`` are: + + .. code-block:: bash + + 1) "model_1.copied_tensor" + + **Copy/Rename/Delete Operations on Tensors Placed by an External Application** + + SmartSim supports copy/rename/delete operations on prefixed tensors sent to the ``Orchestrator`` by separate + ``Model(s)``. To achieve this, users need to provide the ``Model`` name that stored the tensor + to ``Client.set_data_source``. This action instructs the ``Client`` to prepend the ``Model`` + name to all key searches. For SmartSim to recognize the ``Model`` name as a data source, + users must execute the ``Model.register_incoming_entity`` function on the ``Model`` + responsible for the search and pass the ``Model`` instance that stored the data. + + In the example, a ``Model`` named `model_1` has placed a tensor in a standalone ``Orchestrator`` with prefixing enabled + on the ``Client``. The ``Orchestrator`` contents are: + + .. 
code-block:: bash + + 1) "model_1.tensor" + + .. note:: + In the driver script, after initializing the ``Model`` instance named `model_2`, + we execute ``model_2.register_incoming_entity(model_1)``. By passing the producer ``Model`` + instance to the consumer ``Model``, we instruct SmartSim to recognize the name of `model_1` as a valid data + source for subsequent use in ``Client.set_data_source``. + + From within a separate ``Model`` named `model_2`, we perform basic copy/rename/delete operations. + To instruct the ``Client`` to prepend a ``Model`` name to all key searches, use the + ``Client.set_data_source`` function. Specify the ``Model`` name `model_1` + that placed the tensor in the ``Orchestrator``: + + .. code-block:: python + + # Set the Model source name + client.set_data_source("model_1") + + To rename the tensor in the ``Orchestrator``, we provide the tensor name: + + .. code-block:: python + + client.rename_tensor("tensor", "renamed_tensor") + + SmartSim will replace the prefix with the current ``Model`` name since prefixing is enabled + on the current ``Model``. The contents of the ``Orchestrator`` are: + + .. code-block:: bash + + 1) "model_2.renamed_tensor" + + .. note:: + In the driver script, we also register `model_2` as an entity on itself via ``model_2.register_incoming_entity(model_2)``. + This way we can use ``Client.set_data_source`` to interact with prefixed data placed by `model_2`. + + Next, we copy the prefixed tensor to a new destination: + + .. code-block:: python + + # Set the Model source name + client.set_data_source("model_2") + # Copy the tensor data + client.copy_tensor("renamed_tensor", "copied_tensor") + + The ``Orchestrator`` contents are: + + .. code-block:: bash + + 1) "model_2.renamed_tensor" + 2) "model_2.copied_tensor" + + Next, delete `copied_tensor` by specifying the name: + + .. code-block:: python + + client.delete_tensor("copied_tensor") + + The contents of the ``Orchestrator`` are: + + .. 
code-block:: bash + + 1) "model_2.renamed_tensor" + + .. group-tab:: Dataset + **Copy/Rename/Delete Operations on a Dataset in The Same Application** + + SmartSim supports copy/rename/delete operations on prefixed ``Datasets`` sent to the ``Orchestrator`` from within + the same application that the ``Dataset`` was placed. To achieve this, users must + provide the ``Model`` name that stored the ``Dataset`` to ``Client.set_data_source``. This action + instructs the ``Client`` to prepend the ``Model`` name to all key names. For SmartSim to + recognize the ``Model`` name as a data source, users must execute the + ``Model.register_incoming_entity`` function on the ``Model`` and pass the self ``Model`` name. + + As an example, we placed a prefixed ``Dataset`` on the ``Orchestrator`` within a ``Model`` named + `model_1`. The ``Orchestrator`` contents are: + + .. code-block:: bash + + 1) "model_1.{dataset}.dataset_tensor" + 2) "model_1.{dataset}.meta" + + .. note:: + In the driver script, after initializing the ``Model`` instance named `model_1`, + we execute ``model_1.register_incoming_entity(model_1)``. By passing the ``Model`` + instance to itself, we instruct SmartSim to recognize the name of `model_1` as a valid data + source for subsequent use in ``Client.set_data_source``. + + To rename the ``Dataset`` in the ``Orchestrator``, we provide the self ``Model`` name + to ``Client.set_data_source`` then execute the function ``rename_dataset``: + + .. code-block:: python + + # Set the Model source name + client.set_data_source("model_1") + # Rename the Dataset + client.rename_dataset("dataset", "renamed_dataset") + + Because prefixing is enabled on the ``Model`` via ``enable_key_prefixing`` in the driver script, + SmartSim will keep the prefix on the ``Dataset`` but replace the ``Dataset`` name as shown in the ``Orchestrator``: + + .. 
code-block:: bash + + 1) "model_1.{renamed_dataset}.dataset_tensor" + 2) "model_1.{renamed_dataset}.meta" + + Next, we copy the prefixed ``Dataset`` to a new destination: + + .. code-block:: python + + client.copy_dataset("renamed_dataset", "copied_dataset") + + Since ``Dataset`` prefixing is enabled on the ``Client``, the `copied_dataset` is prefixed: + + .. code-block:: bash + + 1) "model_1.{renamed_dataset}.dataset_tensor" + 2) "model_1.{renamed_dataset}.meta" + 3) "model_1.{copied_dataset}.dataset_tensor" + 4) "model_1.{copied_dataset}.meta" + + Next, delete `copied_dataset`: + + .. code-block:: python + + client.delete_dataset("copied_dataset") + + The contents of the ``Orchestrator`` are: + + .. code-block:: bash + + 1) "model_1.{renamed_dataset}.dataset_tensor" + 2) "model_1.{renamed_dataset}.meta" + + **Copy/Rename/Delete Operations on Datasets Placed by an External Application** + + SmartSim supports copy/rename/delete operations on prefixed ``Datasets`` sent to the ``Orchestrator`` by separate + ``Model(s)``. To achieve this, users need to provide the ``Model`` name that stored the ``Dataset`` + to ``Client.set_data_source``. This action instructs the ``Client`` to prepend the ``Model`` + name to all key searches. For SmartSim to recognize the ``Model`` name as a data source, + users must execute the ``Model.register_incoming_entity`` function on the ``Model`` + responsible for the search and pass the ``Model`` instance that stored the data. + + In the example, a ``Model`` named `model_1` has placed a ``Dataset`` in a standalone ``Orchestrator`` with prefixing enabled + on the ``Client``. The ``Orchestrator`` contents are: + + .. code-block:: bash + + 1) "model_1.{dataset}.dataset_tensor" + 2) "model_1.{dataset}.meta" + + .. note:: + In the driver script, after initializing the ``Model`` instance named `model_2`, + we execute ``model_2.register_incoming_entity(model_1)``. 
By passing the producer ``Model`` + instance to the consumer ``Model``, we instruct SmartSim to recognize the name of `model_1` as a valid data + source for subsequent use in ``Client.set_data_source``. + + From within a separate ``Model`` named `model_2`, we perform basic copy/rename/delete operations. + To instruct the ``Client`` to prepend a ``Model`` name to all key searches, use the + ``Client.set_data_source`` function. Specify the ``Model`` name `model_1` + that placed the ``Dataset`` in the ``Orchestrator``: + + .. code-block:: python + + client.set_data_source("model_1") + + To rename the ``Dataset`` in the ``Orchestrator``, we provide the ``Dataset`` `name`: + + .. code-block:: python + + client.rename_dataset("dataset", "renamed_dataset") + + SmartSim will replace the prefix with the current ``Model`` name since prefixing is enabled + on the current ``Model`` via ``Model.enable_key_prefixing`` in the driver script. + The contents of the ``Orchestrator`` are: + + .. code-block:: bash + + 1) "model_2.{renamed_dataset}.dataset_tensor" + 2) "model_2.{renamed_dataset}.meta" + + .. note:: + In the driver script, we also register `model_2` as an entity on itself via ``model_2.register_incoming_entity(model_2)``. + This way we can use ``Client.set_data_source`` to interact with prefixed data placed by `model_2`. + + Next, we copy the prefixed ``Dataset`` to a new destination: + + .. code-block:: python + + # Set the Model source name + client.set_data_source("model_2") + # Copy the Dataset data + client.copy_dataset("renamed_dataset", "copied_dataset") + + The ``Orchestrator`` contents are: + + .. code-block:: bash + + 1) "model_2.{renamed_dataset}.dataset_tensor" + 2) "model_2.{renamed_dataset}.meta" + 3) "model_2.{copied_dataset}.dataset_tensor" + 4) "model_2.{copied_dataset}.meta" + + Next, delete `copied_dataset` by specifying the name: + + .. code-block:: python + + client.delete_dataset("copied_dataset") + + The contents of the ``Orchestrator`` are: + + .. 
code-block:: bash + + 1) "model_2.{renamed_dataset}.dataset_tensor" + 2) "model_2.{renamed_dataset}.meta" + + .. group-tab:: Aggregation List + **Copy/Rename/Delete Operations on a Aggregation List in The Same Application** + + SmartSim supports copy/rename/delete operations on prefixed lists sent to the ``Orchestrator`` from within + the same application that the list was placed. To achieve this, users must + provide the ``Model`` name that stored the list to ``Client.set_data_source``. This action + instructs the ``Client`` to prepend the ``Model`` name to all key names. For SmartSim to + recognize the ``Model`` name as a data source, users must execute the + ``Model.register_incoming_entity`` function on the ``Model`` and pass the self ``Model`` name. + + As an example, we placed a prefixed list on the ``Orchestrator`` within a ``Model`` named + `model_1`. The ``Orchestrator`` contents are: + + .. code-block:: bash + + 1) "model_1.list_of_datasets" + + .. note:: + In the driver script, after initializing the ``Model`` instance named `model_1`, + we execute ``model_1.register_incoming_entity(model_1)``. By passing the ``Model`` + instance to itself, we instruct SmartSim to recognize the name of `model_1` as a valid data + source for subsequent use in ``Client.set_data_source``. + + To rename the list in the ``Orchestrator``, we provide self ``Model`` name + to ``Client.set_data_source`` then execute the function ``rename_list``: + + .. code-block:: python + + # Set the Model source name + client.set_data_source("model_1") + # Rename the list + client.rename_list("list_of_datasets", "renamed_list") + + Because prefixing is enabled on the ``Model`` via ``enable_key_prefixing`` in the driver script, + SmartSim will keep the prefix on the list but replace the list name as shown in the ``Orchestrator``: + + .. code-block:: bash + + 1) "model_1.renamed_list" + + Next, we copy the prefixed list to a new destination: + + .. 
code-block:: python + + client.copy_list("renamed_list", "copied_list") + + Since list prefixing is enabled on the ``Client``, the `copied_list` is prefixed: + + .. code-block:: bash + + 1) "model_1.renamed_list" + 2) "model_1.copied_list" + + Next, delete `copied_list`: + + .. code-block:: python + + client.delete_list("copied_list") + + The contents of the ``Orchestrator`` are: + + .. code-block:: bash + + 1) "model_1.renamed_list" + + **Copy/Rename/Delete Operations on Aggregation Lists Placed by an External Application** + + SmartSim supports copy/rename/delete operations on prefixed lists sent to the ``Orchestrator`` by separate + ``Model(s)``. To achieve this, users need to provide the ``Model`` name that stored the list + to ``Client.set_data_source``. This action instructs the ``Client`` to prepend the ``Model`` + name to all key searches. For SmartSim to recognize the ``Model`` name as a data source, + users must execute the ``Model.register_incoming_entity`` function on the ``Model`` + responsible for the search and pass the ``Model`` instance that stored the data. + + In the example, a ``Model`` named `model_1` has placed a list in a standalone ``Orchestrator`` with prefixing enabled + on the ``Client``. The ``Orchestrator`` contents are: + + .. code-block:: bash + + 1) "model_1.list_of_datasets" + + .. note:: + In the driver script, after initializing the ``Model`` instance named `model_2`, + we execute ``model_2.register_incoming_entity(model_1)``. By passing the producer ``Model`` + instance to the consumer ``Model``, we instruct SmartSim to recognize the name of `model_1` as a valid data + source for subsequent use in ``Client.set_data_source``. + + From within a separate ``Model`` named `model_2`, we perform basic copy/rename/delete operations. + To instruct the ``Client`` to prepend a ``Model`` name to all key searches, use the + ``Client.set_data_source`` function. 
Specify the ``Model`` name `model_1`
+      that placed the list in the ``Orchestrator``:
+
+      .. code-block:: python
+
+        client.set_data_source("model_1")
+
+      To rename the list in the ``Orchestrator``, we provide the list name:
+
+      .. code-block:: python
+
+        client.rename_list("list_of_datasets", "renamed_list")
+
+      SmartSim will replace the prefix with the current ``Model`` name since prefixing is enabled
+      on the current ``Model``. The contents of the ``Orchestrator`` are:
+
+      .. code-block:: bash
+
+        1) "model_2.renamed_list"
+
+      .. note::
+        In the driver script, we also register `model_2` as an entity on itself via ``model_2.register_incoming_entity(model_2)``.
+        This way we can use ``Client.set_data_source`` to interact with prefixed data placed by `model_2`.
+
+      Next, we copy the prefixed list to a new destination:
+
+      .. code-block:: python
+
+        # Set the Model source name
+        client.set_data_source("model_2")
+        # Copy the list
+        client.copy_list("renamed_list", "copied_list")
+
+      The ``Orchestrator`` contents are:
+
+      .. code-block:: bash
+
+        1) "model_2.renamed_list"
+        2) "model_2.copied_list"
+
+      Next, delete `copied_list` by specifying the name:
+
+      .. code-block:: python
+
+        client.delete_list("copied_list")
+
+      The contents of the ``Orchestrator`` are:
+
+      .. code-block:: bash
+
+        1) "model_2.renamed_list"
+
+   .. group-tab:: ML Model
+      **Delete ML Models From within the Application**
+
+      SmartSim supports delete operations on prefixed ML models sent to the ``Orchestrator`` from within
+      the same application that the ML model was placed. To achieve this, users must
+      provide the ``Model`` name that stored the ML model to ``Client.set_data_source``. This action
+      instructs the ``Client`` to prepend the ``Model`` name to all key names. For SmartSim to
+      recognize the ``Model`` name as a data source, users must execute the
+      ``Model.register_incoming_entity`` function on the ``Model`` and pass the self ``Model`` name.
+
+      As an example, we placed a prefixed ML model on the ``Orchestrator`` within a ``Model`` named
+      `model_1`. The ``Orchestrator`` contents are:
+
+      .. note::
+        In the driver script, after initializing the ``Model`` instance named `model_1`,
+        we execute ``model_1.register_incoming_entity(model_1)``. By passing the ``Model``
+        instance to itself, we instruct SmartSim to recognize the name of `model_1` as a valid data
+        source for subsequent use in ``Client.set_data_source``.
+
+      .. code-block:: bash
+
+        1) "model_1.ml_model"
+
+      To delete the ML model in the ``Orchestrator``, we provide self ``Model`` name
+      to ``Client.set_data_source`` then execute the function ``delete_model``:
+
+      .. code-block:: python
+
+        # Set the Model source name
+        client.set_data_source("model_1")
+        # Delete the ML model
+        client.delete_model("ml_model")
+
+      **Delete an ML Model Placed by an External Application**
+
+      SmartSim supports delete operations on prefixed ML models sent to the ``Orchestrator`` by separate ``Model(s)``.
+      To do so, users must provide the ``Model`` name that stored the ML model to ``Client.set_data_source``.
+      This will instruct the ``Client`` to prepend the ``Model`` name input to all key searches.
+
+      In the example, a ``Model`` named `model_1` has placed an ML model in a standalone ``Orchestrator`` with prefixing enabled
+      on the ``Client``. The ``Orchestrator`` contents are:
+
+      .. code-block:: bash
+
+        1) "model_1.ml_model"
+
+      From within a separate ``Model`` named `model_2`, we perform a basic delete operation.
+      To instruct the ``Client`` to prepend a ``Model`` name to all key searches, use the
+      ``Client.set_data_source`` function. Specify the ``Model`` name `model_1`
+      that placed the ML model in the ``Orchestrator``:
+
+      .. code-block:: python
+
+        client.set_data_source("model_1")
+
+      To delete the ML model in the ``Orchestrator``, we provide the ML model name:
+
+      .. code-block:: python
+
+        client.delete_model("ml_model")
+
+   ..
group-tab:: Script
+
+      **Delete Scripts From within the Application**
+
+      SmartSim supports delete operations on prefixed scripts sent to the ``Orchestrator`` from within
+      the same application that the script was placed. To achieve this, users must
+      provide the ``Model`` name that stored the script to ``Client.set_data_source``. This action
+      instructs the ``Client`` to prepend the ``Model`` name to all key names. For SmartSim to
+      recognize the ``Model`` name as a data source, users must execute the
+      ``Model.register_incoming_entity`` function on the ``Model`` and pass the self ``Model`` name.
+
+      As an example, we placed a prefixed script on the ``Orchestrator`` within a ``Model`` named
+      `model_1`. The ``Orchestrator`` contents are:
+
+      .. code-block:: bash
+
+        1) "model_1.script"
+
+      .. note::
+        In the driver script, after initializing the ``Model`` instance named `model_1`,
+        we execute ``model_1.register_incoming_entity(model_1)``. By passing the ``Model``
+        instance to itself, we instruct SmartSim to recognize the name of `model_1` as a valid data
+        source for subsequent use in ``Client.set_data_source``.
+
+      To delete the script in the ``Orchestrator``, we provide the script name:
+
+      .. code-block:: python
+
+        # Set the Model source name
+        client.set_data_source("model_1")
+        # Delete the script
+        client.delete_script("script")
+
+      **Delete a Script Placed by an External Application**
+
+      SmartSim supports delete operations on prefixed scripts sent to the ``Orchestrator`` by separate ``Model(s)``.
+      To do so, users must provide the ``Model`` name that stored the script to ``Client.set_data_source``.
+      This will instruct the ``Client`` to prepend the ``Model`` name input to all key searches.
+
+      In the example, a ``Model`` named `model_1` has placed a script in a standalone ``Orchestrator`` with prefixing enabled
+      on the ``Client``. The ``Orchestrator`` contents are:
+
+      ..
code-block:: bash
+
+        1) "model_1.script"
+
+      From within a separate ``Model`` named `model_2`, we perform a basic delete operation.
+      To instruct the ``Client`` to prepend a ``Model`` name to all key searches, use the
+      ``Client.set_data_source`` function. Specify the ``Model`` name `model_1`
+      that placed the script in the ``Orchestrator``:
+
+      .. code-block:: python
+
+        client.set_data_source("model_1")
+
+      To delete the script in the ``Orchestrator``, we provide the script name:
+
+      .. code-block:: python
+
+        client.delete_script("script")
\ No newline at end of file
diff --git a/doc/orchestrator.rst b/doc/orchestrator.rst
index 456d9a814..6ccc7c1e1 100644
--- a/doc/orchestrator.rst
+++ b/doc/orchestrator.rst
@@ -1,208 +1,688 @@
+.. _orch_docs:
+
 ************
 Orchestrator
 ************
+========
+Overview
+========
+The ``Orchestrator`` is an in-memory database with features built for
+AI-enabled workflows including online training, low-latency inference, cross-application data
+exchange, online interactive visualization, online data analysis, computational steering, and more.
+
+An ``Orchestrator`` can be thought of as a general feature store
+capable of storing numerical data (tensors and ``Datasets``), AI models (TF, TF-lite, PyTorch, or ONNX),
+and scripts (TorchScripts). In addition to storing data, the ``Orchestrator`` is capable of
+executing AI models and TorchScripts on the stored data using CPUs or GPUs.
+
+.. figure:: images/smartsim-arch.png
+
+   Sample ``Experiment`` showing a user application leveraging
+   machine learning infrastructure launched by SmartSim and connected
+   to an online analysis and visualization simulation via the ``Orchestrator``.
+
+Users can establish a connection to the ``Orchestrator`` from within ``Model`` executable code, ``Ensemble``
+member executable code, or ``Experiment`` driver scripts by using the
+:ref:`SmartRedis` ``Client`` library.
+ +SmartSim offers **two** types of ``Orchestrator`` deployments: + +- :ref:`Standalone Deployment` + A standalone ``Orchestrator`` is ideal for systems that have heterogeneous node types + (i.e. a mix of CPU-only and GPU-enabled compute nodes) where + ML model and TorchScript evaluation is more efficiently performed off-node. This + deployment is also ideal for workflows relying on data exchange between multiple + applications (e.g. online analysis, visualization, computational steering, or + producer/consumer application couplings). Standalone deployment is also optimal for + high data throughput scenarios where ``Orchestrators`` require large amounts of compute resources. + +- :ref:`Colocated Deployment` + A colocated ``Orchestrator`` is ideal when the data and hardware accelerator are located on the same compute node. + This setup helps reduce latency in ML inference and TorchScript evaluation by eliminating off-node communication. +.. warning:: + Colocated ``Orchestrators`` cannot share data across compute nodes. + Communication is only supported between a ``Model`` and colocated ``Orchestrator`` pair. + +SmartSim allows users to launch :ref:`multiple Orchestrators` of either type during +the course of an ``Experiment``. If a workflow requires a multiple ``Orchestrator`` environment, a +`db_identifier` argument must be specified during ``Orchestrator`` initialization. Users can connect to +``Orchestrators`` in a multiple ``Orchestrator`` workflow by specifying the respective `db_identifier` argument +within a :ref:`ConfigOptions` object that is passed into the SmartRedis ``Client`` constructor. + +.. _standalone_orch_doc: + +===================== +Standalone Deployment +===================== +-------- +Overview +-------- +During standalone ``Orchestrator`` deployment, a SmartSim ``Orchestrator`` (the database) runs on separate +compute node(s) from the SmartSim ``Model`` node(s). 
A standalone ``Orchestrator`` can be deployed on a single +node (single-sharded) or distributed (sharded) over multiple nodes. With a multi-node ``Orchestrator``, users can +scale the number of database nodes for inference and script evaluation, enabling +increased in-memory capacity for data storage in large-scale workflows. Single-node +``Orchestrators`` are effective for small-scale workflows and offer lower latency for ``Client`` API calls +that involve data appending or processing (e.g. ``Client.append_to_list``, ``Client.run_model``, etc). + +When connecting to a standalone ``Orchestrator`` from within a ``Model`` application, the user has +several options to connect a SmartRedis ``Client``: + +- In an ``Experiment`` with a single deployed ``Orchestrator``, users can rely on SmartRedis + to detect the ``Orchestrator`` address through runtime configuration of the SmartSim ``Model`` environment. + A default ``Client`` constructor, with no user-specified parameters, is sufficient to + connect to the ``Orchestrator``. The only exception is for the Python ``Client``, which requires + the `cluster` constructor parameter to differentiate between standalone deployment and colocated + deployment. +- In an ``Experiment`` with multiple ``Orchestrators``, users can connect to a specific ``Orchestrator`` by + first specifying the `db_identifier` in the ``ConfigOptions`` constructor within the executable application. + Subsequently, users should pass the ``ConfigOptions`` instance to the ``Client`` constructor. +- Users can specify or override automatically configured connection options by providing the + ``Orchestrator`` address in the ``ConfigOptions`` object. Subsequently, users should pass the ``ConfigOptions`` + instance to the ``Client`` constructor. + +If connecting to a standalone ``Orchestrator`` from a ``Experiment`` driver script, the user must specify +the address of the ``Orchestrator`` to the ``Client`` constructor. 
SmartSim does not automatically +configure the environment of the ``Experiment`` driver script to connect to an ``Orchestrator``. Users +can access an ``Orchestrators`` address through ``Orchestrator.get_address``. -The ``Orchestrator`` is an in-memory database that is launched prior to all other -entities within an ``Experiment``. The ``Orchestrator`` can be used to store and retrieve -data during the course of an experiment and across multiple entities. In order to -stream data into or receive data from the ``Orchestrator``, one of the SmartSim clients -(SmartRedis) has to be used within a Model. +.. note:: + In SmartSim ``Model`` applications, it is advisable to **avoid** specifying addresses directly to the ``Client`` constructor. + Utilizing the SmartSim environment configuration for SmartRedis ``Client`` connections + allows the SmartSim ``Model`` application code to remain unchanged even as ``Orchestrator`` deployment + options vary. -.. |orchestrator| image:: images/Orchestrator.png - :width: 700 - :alt: Alternative text +The following image illustrates +communication between a standalone ``Orchestrator`` and a +SmartSim ``Model``. In the diagram, the application is running on multiple compute nodes, +separate from the ``Orchestrator`` compute nodes. Communication is established between the +``Model`` application and the sharded ``Orchestrator`` using the :ref:`SmartRedis client`. -|orchestrator| +.. figure:: images/clustered_orchestrator-1.png -Combined with the SmartRedis clients, the ``Orchestrator`` is capable of hosting and executing -AI models written in Python on CPU or GPU. The ``Orchestrator`` supports models written with -TensorFlow, Pytorch, TensorFlow-Lite, or models saved in an ONNX format (e.g. sci-kit learn). + Sample Standalone ``Orchestrator`` Deployment +.. 
note:: + Users do not need to know how the data is stored in a standalone configuration and + can address the cluster with the SmartRedis ``Client`` like a single block of memory + using simple put/get semantics in SmartRedis. + +In scenarios where data needs to be shared amongst ``Experiment`` entities, +such as online analysis, training, and processing, a standalone ``Orchestrator`` +is optimal. The data produced by multiple processes in a ``Model`` is stored in the standalone +``Orchestrator`` and is available for consumption by other ``Model``'s. + +If a workflow requires an application to leverage multiple standalone deployments, +multiple ``Clients`` can be instantiated within an application, +with each ``Client`` connected to a unique ``Orchestrator``. This is accomplished through the use of the +`db-identifier` and :ref:`ConfigOptions` object specified at ``Orchestrator`` initialization time. +For more information on a multiple database ``Experiment``, visit the :ref:`Multiple Orchestrators` section on +this page. + +------- +Example +------- +In the following example, we demonstrate deploying a standalone ``Orchestrator`` on an HPC system. +Once the standalone ``Orchestrator`` is launched from the ``Experiment`` driver script, we walk through +connecting a SmartRedis ``Client`` to the ``Orchestrator`` from within the ``Model`` +application to transmit and poll for data. -Cluster Orchestrator -==================== +The example is comprised of two script files: + +- :ref:`Application Script` + The application script is a Python file that contains instructions to create a SmartRedis + ``Client`` connection to the standalone ``Orchestrator``. To demonstrate the ability of + workflow components to access data from other entities, we retrieve the tensors set by + the driver script using a SmartRedis ``Client`` in the application script. We then instruct + the ``Client`` to send and retrieve data from within the application script. 
The example source + code is available in the dropdown below for convenient execution and customization. + + .. dropdown:: Example Application Script source code + + .. literalinclude:: tutorials/doc_examples/orch_examples/std_app.py -The ``Orchestrator`` supports single node and distributed memory settings. This means -that a single compute host can be used for the database or multiple by specifying -``db_nodes`` to be greater than 1. +- :ref:`Experiment Driver Script` + The ``Experiment`` driver script is responsible for launching and managing SmartSim entities. Within this script, + we use the ``Experiment`` API to create and launch a standalone ``Orchestrator``. To demonstrate the capability of + a ``Model`` application to access ``Orchestrator`` data sent from other sources, we employ the SmartRedis ``Client`` in + the driver script to store a tensor in the ``Orchestrator``, which is later retrieved by the ``Model`` application. + To employ the application script, we initialize a ``Model`` object with the application script as the executable, + launch the ``Orchestrator``, and then launch the ``Model``. -.. |cluster-orc| image:: images/clustered-orc-diagram.png - :width: 700 - :alt: Alternative text + To further demonstrate the ability of workflow components to access data from + other entities, we retrieve the tensors stored by the completed ``Model`` using a SmartRedis ``Client`` in + the driver script. Lastly, we tear down the ``Orchestrator``. The example source code is available in the dropdown below for + convenient execution and customization. -|cluster-orc| + .. dropdown:: Example Experiment Driver Script Source Code + .. literalinclude:: tutorials/doc_examples/orch_examples/std_driver.py -With a clustered ``Orchestrator``, multiple compute hosts memory can be used together -to store data. As well, the CPU or GPU(s) where the ``Orchestrator`` is running can -be used to execute the AI models, and Torchscript code on data stored within it. +.. 
_standalone_orch_app_script: -Users do not need to know how the data is stored in a clustered configuration and -can address the cluster with the SmartRedis clients like a single block of memory -using simple put/get semantics in SmartRedis. SmartRedis will ensure that data -is evenly distributed amongst all nodes in the cluster. +Application Script +================== +To begin writing the application script, import the necessary SmartRedis packages: -The cluster deployment is optimal for high data throughput scenarios such as -online analysis, training and processing. +.. literalinclude:: tutorials/doc_examples/orch_examples/std_app.py + :language: python + :linenos: + :lines: 1-2 +Client Initialization +--------------------- +To establish a connection with the ``Orchestrator``, we need to initialize a new SmartRedis ``Client``. +Because the ``Orchestrator`` launched in the driver script is sharded, we specify the +constructor argument `cluster` as `True`. -Colocated Orchestrator -======================= +.. literalinclude:: tutorials/doc_examples/orch_examples/std_app.py + :language: python + :linenos: + :lines: 4-5 -A colocated Orchestrator is a special type of Orchestrator that is deployed on -the same compute hosts an a ``Model`` instance defined by the user. In this -deployment, the database is *not* connected together in a cluster and each -shard of the database is addressed individually by the processes running -on that compute host. +.. note:: + Note that the C/C++/Fortran SmartRedis ``Clients`` are capable of reading cluster configurations + from the SmartSim ``Model`` environment and the `cluster` constructor argument does not need to be specified + in those ``Client`` languages. -.. 
|colo-orc| image:: images/co-located-orc-diagram.png - :width: 700 - :alt: Alternative text +Since there is only one ``Orchestrator`` launched in the ``Experiment`` +(the standalone ``Orchestrator``), specifying an ``Orchestrator`` `db_identifier` +is **not** required when initializing the SmartRedis ``Client``. +SmartRedis will handle the connection configuration. +.. note:: + To create a SmartRedis ``Client`` connection to the standalone ``Orchestrator``, the ``Orchestrator`` must be launched + from within the driver script prior to the start of the ``Model``. -|colo-orc| +Data Retrieval +-------------- +To confirm a successful connection to the ``Orchestrator``, we retrieve the tensor set from the ``Experiment`` script. +Use the ``Client.get_tensor`` method to retrieve the tensor named `tensor_1` placed by the driver script: -This deployment is designed for highly performant online inference scenarios where -a distributed process (likely MPI processes) are performing inference with -data local to each process. +.. literalinclude:: tutorials/doc_examples/orch_examples/std_app.py + :language: python + :linenos: + :lines: 7-10 -This method is deemed ``locality based inference`` since data is local to each -process and the ``Orchestrator`` is deployed locally on each compute host where -the distributed application is running. +After the ``Model`` is launched by the driver script, the following output will appear in +`getting-started/model/model.out`:: + Default@17-11-48:The multi-sharded db tensor is: [1 2 3 4] -To create a colocated model, first, create a ``Model`` instance and then call -the ``Model.colocate_db_tcp`` or ``Model.colocate_db_uds`` function. +Data Storage +------------ +Next, create a NumPy tensor to send to the standalone ``Orchestrator`` using +``Client.put_tensor(name, data)``: -.. currentmodule:: smartsim.entity.model +.. literalinclude:: tutorials/doc_examples/orch_examples/std_app.py + :language: python + :linenos: + :lines: 12-15 -.. 
automethod:: Model.colocate_db_tcp - :noindex: +We retrieve `"tensor_2"` in the ``Experiment`` driver script. -.. automethod:: Model.colocate_db_uds - :noindex: +.. _standalone_orch_driver_script: -Here is an example of creating a simple model that is colocated with an -``Orchestrator`` deployment using Unix Domain Sockets +Experiment Driver Script +======================== +To run the previous application script, we define a ``Model`` and ``Orchestrator`` within the +``Experiment`` driver script. Configuring and launching workflow entities (``Model`` and ``Orchestrator``) requires the utilization of +``Experiment`` class methods. The ``Experiment`` object is intended to be instantiated +once and utilized throughout the workflow runtime. -.. code-block:: python +In this example, we instantiate an ``Experiment`` object with the name `getting-started` +and the `launcher` set to `auto`. When using `launcher=auto`, SmartSim attempts to find a launcher on the machine. +For example, if this script were run on a Slurm-based system, SmartSim will automatically set the launcher to `slurm`. +We also setup the SmartSim `logger` to output information from the ``Experiment`` at runtime: + +.. literalinclude:: tutorials/doc_examples/orch_examples/std_driver.py + :language: python + :linenos: + :lines: 1-9 + +Orchestrator Initialization +--------------------------- +In the next stage of the ``Experiment``, we create a standalone ``Orchestrator``. + +To create a standalone ``Orchestrator``, utilize the ``Experiment.create_database`` function: + +.. literalinclude:: tutorials/doc_examples/orch_examples/std_driver.py + :language: python + :linenos: + :lines: 11-12 + +Client Initialization +--------------------- +The SmartRedis ``Client`` object contains functions that manipulate, send, and retrieve +data on the ``Orchestrator``. Begin by initializing a SmartRedis ``Client`` object for the standalone ``Orchestrator``. 
+ +SmartRedis ``Clients`` in driver scripts do not have the ability to use a `db-identifier` or +rely on automatic configurations to connect to ``Orchestrators``. Therefore, when creating a SmartRedis ``Client`` +connection from within a driver script, specify the address of the ``Orchestrator`` you would like to connect to. +You can easily retrieve the ``Orchestrator`` address using the ``Orchestrator.get_address`` function: + +.. literalinclude:: tutorials/doc_examples/orch_examples/std_driver.py + :language: python + :linenos: + :lines: 14-15 + +Data Storage +------------ +In the application script, we retrieved a NumPy tensor stored from within the driver script. +To support the application functionality, we create a +NumPy array in the ``Experiment`` driver script to send to the ``Orchestrator``. To +send a tensor to the ``Orchestrator``, use the function ``Client.put_tensor(name, data)``: + +.. literalinclude:: tutorials/doc_examples/orch_examples/std_driver.py + :language: python + :linenos: + :lines: 17-20 + +Model Initialization +-------------------- +In the next stage of the ``Experiment``, we configure and create +a SmartSim ``Model`` and specify the executable path during ``Model`` creation: + +.. literalinclude:: tutorials/doc_examples/orch_examples/std_driver.py + :language: python + :linenos: + :lines: 22-27 + +File Generation +--------------- +To create an isolated output directory for the ``Orchestrator`` and ``Model``, invoke ``Experiment.generate`` on the +``Experiment`` instance `exp` with `standalone_orchestrator` and `model` as input parameters: + +.. literalinclude:: tutorials/doc_examples/orch_examples/std_driver.py + :language: python + :linenos: + :lines: 29-30 + +Invoking ``Experiment.generate(standalone_orchestrator, model)`` will create two directories: +`standalone_orchestrator/` and `model/`. Each of these directories will store +two output files: a `.out` file and a `.err` file. 
- from smartsim import Experiment - exp = Experiment("colo-test", launcher="auto") +.. note:: + It is important to invoke ``Experiment.generate`` with all ``Experiment`` entity instances + before launching. This will ensure that the output files are organized in the main ``experiment-name/`` + folder. In this example, the ``Experiment`` folder is named `getting-started/`. - colo_settings = exp.create_run_settings(exe="./some_mpi_app") +Entity Deployment +----------------- +In the next stage of the ``Experiment``, we launch the ``Orchestrator``, then launch the ``Model``. - colo_model = exp.create_model("colocated_model", colo_settings) - colo_model.colocate_db_uds( - db_cpus=1, # cpus given to the database on each node - debug=False # include debug information (will be slower) - ifname=network_interface # specify network interface(s) to use (i.e. "ib0" or ["ib0", "lo"]) - ) - exp.start(colo_model) +Step 1: Start Orchestrator +'''''''''''''''''''''''''' +In the context of this ``Experiment``, it's essential to create and launch +the ``Orchestrator`` as a preliminary step before any other workflow entities. This is important +because the application requests and sends tensors to a launched ``Orchestrator``. +To launch the ``Orchestrator``, pass the ``Orchestrator`` instance to ``Experiment.start``. -By default, SmartSim will pin the database to the first _N_ CPUs according to ``db_cpus``. By -specifying the optional argument ``custom_pinning``, an alternative pinning can be specified -by sending in a list of CPU ids (e.g [0,2,range(5,8)]). For optimal performance, most users -will want to also modify the RunSettings for the model to pin their application to cores not -occupied by the database. +.. literalinclude:: tutorials/doc_examples/orch_examples/std_driver.py + :language: python + :linenos: + :lines: 32-33 -.. warning:: +The ``Experiment.start`` function launches the ``Orchestrator`` for use within the workflow. 
+In other words, the function deploys the ``Orchestrator`` on the allocated compute resources. + +Step 2: Start Model +''''''''''''''''''' +Next, launch the `model` instance using the ``Experiment.start`` function: + +.. literalinclude:: tutorials/doc_examples/orch_examples/std_driver.py + :language: python + :linenos: + :lines: 35-36 + +In the next subsection, we request tensors placed by the ``Model`` application. +We specify `block=True` to ``exp.start`` to require the ``Model`` to finish before +the ``Experiment`` continues. + +Data Polling +------------ +Next, check if the tensor exists in the standalone ``Orchestrator`` using ``Client.poll_tensor``. +This function queries for data in the ``Orchestrator``. The function requires the tensor name (`name`), +how many milliseconds to wait in between queries (`poll_frequency_ms`), +and the total number of times to query (`num_tries`). Check if the data exists in the ``Orchestrator`` by +polling every 100 milliseconds until 10 attempts have completed: + +.. literalinclude:: tutorials/doc_examples/orch_examples/std_driver.py + :language: python + :linenos: + :lines: 38-41 + +When you execute the driver script, the output will be as follows:: + + 23:45:46 system.host.com SmartSim[87400] INFO The tensor exists: True + +Cleanup +------- +Finally, use the ``Experiment.stop`` function to stop the ``Orchestrator`` instance. Print the +workflow summary with ``Experiment.summary``: + +.. literalinclude:: tutorials/doc_examples/orch_examples/std_driver.py + :language: python + :linenos: + :lines: 43-46 + +When you run the ``Experiment``, the following output will appear:: + + | | Name | Entity-Type | JobID | RunID | Time | Status | Returncode | + |----|----------------|---------------|-------------|---------|---------|-----------|--------------| + | 0 | model | Model | 1658679.3 | 0 | 1.3342 | Completed | 0 | + | 1 | orchestrator_0 | DBNode | 1658679.2+2 | 0 | 42.8742 | Cancelled | 0 | + +.. 
_colocated_orch_doc: + +==================== +Colocated Deployment +==================== +-------- +Overview +-------- +During colocated ``Orchestrator`` deployment, a SmartSim ``Orchestrator`` (the database) runs on +the ``Model``'s compute node(s). Colocated ``Orchestrators`` can only be deployed as isolated instances +on each compute node and cannot be clustered over multiple nodes. The ``Orchestrator`` on each application node is +utilized by SmartRedis ``Clients`` on the same node. With a colocated ``Orchestrator``, all interactions +with the database occur on the same node, thus resulting in lower latency compared to the standard ``Orchestrator``. +A colocated ``Orchestrator`` is ideal when the data and hardware accelerator are located on the +same compute node. + +Communication between a colocated ``Orchestrator`` and ``Model`` is initiated in the application through a +SmartRedis ``Client``. Since a colocated ``Orchestrator`` is launched when the ``Model`` +is started by the ``Experiment``, connecting a SmartRedis ``Client`` to a colocated ``Orchestrator`` is only possible from within +the associated ``Model`` application. + +There are **three** methods for connecting the SmartRedis ``Client`` to the colocated ``Orchestrator``: + +- In an ``Experiment`` with a single deployed ``Orchestrator``, users can rely on SmartRedis + to detect the ``Orchestrator`` address through runtime configuration of the SmartSim ``Model`` environment. + A default ``Client`` constructor, with no user-specified parameters, is sufficient to + connect to the ``Orchestrator``. The only exception is for the Python ``Client``, which requires + the `cluster=False` constructor parameter for the colocated ``Orchestrator``. +- In an ``Experiment`` with multiple ``Orchestrators``, users can connect to a specific ``Orchestrator`` by + first specifying the `db_identifier` in the ``ConfigOptions`` constructor. 
Subsequently, users should pass the + ``ConfigOptions`` instance to the ``Client`` constructor. +- Users can specify or override automatically configured connection options by providing the + ``Orchestrator`` address in the ``ConfigOptions`` object. Subsequently, users should pass the ``ConfigOptions`` + instance to the ``Client`` constructor. + +Below is an image illustrating communication within a colocated ``Model`` spanning multiple compute nodes. +As demonstrated in the diagram, each process of the application creates its own SmartRedis ``Client`` +connection to the ``Orchestrator`` running on the same host. + +.. figure:: images/colocated_orchestrator-1.png + + Sample Colocated ``Orchestrator`` Deployment + +Colocated deployment is ideal for highly performant online inference scenarios where +a distributed application (likely an MPI application) is performing inference with +data local to each process. With colocated deployment, data does not need to travel +off-node to be used to evaluate a ML model, and the results of the ML model evaluation +are stored on-node. + +If a workflow requires an application to both leverage colocated +deployment and standalone deployment, multiple ``Clients`` can be instantiated within an application, +with each ``Client`` connected to a unique deployment. This is accomplished through the use of the +`db-identifier` specified at ``Orchestrator`` initialization time. + +------- +Example +------- +In the following example, we demonstrate deploying a colocated ``Orchestrator`` on an HPC system. +Once the ``Orchestrator`` is launched, we walk through connecting a SmartRedis ``Client`` +from within the application script to transmit and poll for data on the ``Orchestrator``. + +The example is comprised of two script files: + +- :ref:`Application Script` + The application script is a Python script that connects a SmartRedis + ``Client`` to the colocated ``Orchestrator``. 
From within the application script, + the ``Client`` is utilized to both send and retrieve data. The source code example + is available in the dropdown below for convenient execution and customization. + + .. dropdown:: Example Application Script Source Code + + .. literalinclude:: tutorials/doc_examples/orch_examples/colo_app.py - Pinning is not supported on MacOS X. Setting ``custom_pinning`` to anything - other than ``None`` will raise a warning and the input will be ignored. +- :ref:`Experiment Driver Script` + The ``Experiment`` driver script launches and manages + the example entities through the ``Experiment`` API. + In the driver script, we use the ``Experiment`` API + to create and launch a colocated ``Model``. The source code example is available + in the dropdown below for convenient execution and customization. + + .. dropdown:: Example Experiment Driver source code + + .. literalinclude:: tutorials/doc_examples/orch_examples/colo_driver.py + +.. _colocated_orch_app_script: + +Application Script +================== +To begin writing the application script, import the necessary SmartRedis packages: + +.. literalinclude:: tutorials/doc_examples/orch_examples/colo_app.py + :language: python + :linenos: + :lines: 1-2 + +Client Initialization +--------------------- +To establish a connection with the colocated ``Orchestrator``, we need to initialize a +new SmartRedis ``Client`` and specify `cluster=False` since colocated deployments are never +clustered but only single-sharded. + +.. literalinclude:: tutorials/doc_examples/orch_examples/colo_app.py + :language: python + :linenos: + :lines: 4-5 + +.. note:: + Note that the C/C++/Fortran SmartRedis ``Clients`` are capable of reading cluster configurations + from the ``Model`` environment and the `cluster` constructor argument does not need to be specified + in those ``Client`` languages. .. 
note:: + Since there is only one ``Orchestrator`` launched in the ``Experiment`` + (the colocated ``Orchestrator``), specifying a ``Orchestrator`` `db_identifier` + is not required when initializing the ``Client``. SmartRedis will handle the + connection configuration. - Pinning _only_ affects the co-located deployment because both the application and the database - are sharing the same compute node. For the clustered deployment, a shard occupies the entirety - of the node. +.. note:: + To create a ``Client`` connection to the colocated ``Orchestrator``, the colocated ``Model`` must be launched + from within the driver script. You must execute the Python driver script, otherwise, there will + be no ``Orchestrator`` to connect the ``Client`` to. + +Data Storage +------------ +Next, using the SmartRedis ``Client`` instance, we create and store a NumPy tensor through +``Client.put_tensor(name, data)``: + +.. literalinclude:: tutorials/doc_examples/orch_examples/colo_app.py + :language: python + :linenos: + :lines: 7-10 + +We will retrieve `“tensor_1”` in the following section. + +Data Retrieval +-------------- +To confirm a successful connection to the ``Orchestrator``, we retrieve the tensor we stored. +Use the ``Client.get_tensor`` method to retrieve the tensor by specifying the name +`“tensor_1”`: + +.. literalinclude:: tutorials/doc_examples/orch_examples/colo_app.py + :language: python + :linenos: + :lines: 12-15 + +When the ``Experiment`` completes, you can find the following log message in `colo_model.out`:: + + Default@21-48-01:The colocated db tensor is: [1 2 3 4] + +.. _colocated_orch_driver_script: + +Experiment Driver Script +======================== +To run the previous application script, a ``Model`` object must be configured and launched within the +``Experiment`` driver script. Configuring and launching workflow entities (``Model``) +requires the utilization of ``Experiment`` class methods. 
The ``Experiment`` object is intended to +be instantiated once and utilized throughout the workflow runtime. + +In this example, we instantiate an ``Experiment`` object with the name `getting-started` +and the `launcher` set to `auto`. When using `launcher=auto`, SmartSim attempts to find a launcher on the machine. +In this case, since we are running the example on a Slurm-based machine, +SmartSim will automatically set the launcher to `slurm`. We set up the SmartSim `logger` +to output information from the ``Experiment`` at runtime: + +.. literalinclude:: tutorials/doc_examples/orch_examples/colo_driver.py + :language: python + :linenos: + :lines: 1-9 + +Colocated Model Initialization +------------------------------ +In the next stage of the ``Experiment``, we create and launch a colocated ``Model`` that +runs the application script with a ``Orchestrator`` on the same compute node. -Redis -===== +Step 1: Configure +''''''''''''''''' +In this example ``Experiment``, the ``Model`` application is a Python script as defined in section: +:ref:`Application Script`. Before initializing the ``Model`` object, we must use +``Experiment.create_run_settings`` to create a ``RunSettings`` object that defines how to execute +the ``Model``. To launch the Python script in this example workflow, we specify the path to the application +file `application_script.py` as the `exe_args` parameter and the executable `exe_ex` (the Python +executable on this system) as `exe` parameter. The ``Experiment.create_run_settings`` function +will return a ``RunSettings`` object that can then be used to initialize the ``Model`` object. -.. _Redis: https://github.com/redis/redis -.. _RedisAI: https://github.com/RedisAI/RedisAI +.. note:: + Change the `exe_args` argument to the path of the application script + on your file system to run the example. -The ``Orchestrator`` is built on `Redis`_. 
Largely, the job of the ``Orchestrator`` is to -create a Python reference to a Redis deployment so that users can launch, monitor -and stop a Redis deployment on workstations and HPC systems. +Use the ``RunSettings`` helper functions to +configure the the distribution of computational tasks (``RunSettings.set_nodes``). In this +example, we specify to SmartSim that we intend the ``Model`` to run on a single compute node. -Redis was chosen for the Orchestrator because it resides in-memory, can be distributed on-node -as well as across nodes, and provides low latency data access to many clients in parallel. The -Redis ecosystem was a primary driver as the Redis module system provides APIs for languages, -libraries, and techniques used in Data Science. In particular, the ``Orchestrator`` -relies on `RedisAI`_ to provide access to Machine Learning runtimes. +.. literalinclude:: tutorials/doc_examples/orch_examples/colo_driver.py + :language: python + :linenos: + :lines: 11-14 -At its core, Redis is a key-value store. This means that put/get semantics are used to send -messages to and from the database. SmartRedis clients use a specific hashing algorithm, CRC16, to ensure -that data is evenly distributed amongst all database nodes. Notably, a user is not required to -know where (which database node) data or Datasets (see Dataset API) are stored as the -SmartRedis clients will infer their location for the user. +Step 2: Initialize +'''''''''''''''''' +Next, create a ``Model`` instance using the ``Experiment.create_model`` factory method. +Pass the ``model_settings`` object as input to the method and +assign the returned ``Model`` instance to the variable `model`: + +.. literalinclude:: tutorials/doc_examples/orch_examples/colo_driver.py + :language: python + :linenos: + :lines: 16-17 + +Step 3: Colocate +'''''''''''''''' +To colocate an ``Orchestrator`` with a ``Model``, use the ``Model.colocate_db_uds`` function. 
+This function will colocate an ``Orchestrator`` instance with this ``Model`` over +a Unix domain socket connection. +.. literalinclude:: tutorials/doc_examples/orch_examples/colo_driver.py + :language: python + :linenos: + :lines: 19-20 -KeyDB -===== +Step 4: Generate Files +'''''''''''''''''''''' +Next, generate the ``Experiment`` entity directories by passing the ``Model`` instance to +``Experiment.generate``: -.. _KeyDB: https://github.com/EQ-Alpha/KeyDB +.. literalinclude:: tutorials/doc_examples/orch_examples/colo_driver.py + :language: python + :linenos: + :lines: 22-23 -`KeyDB`_ is a multi-threaded fork of Redis that can be swapped in as the database for -the ``Orchestrator`` in SmartSim. KeyDB can be swapped in for Redis by setting the -``REDIS_PATH`` environment variable to point to the ``keydb-server`` binary. +Step 5: Start +''''''''''''' +Next, launch the colocated ``Model`` instance using the ``Experiment.start`` function. -A full example of configuring KeyDB to run in SmartSim is shown below +.. literalinclude:: tutorials/doc_examples/orch_examples/colo_driver.py + :language: python + :linenos: + :lines: 25-26 -.. code-block:: bash +Cleanup +------- +.. note:: + Since the colocated ``Orchestrator`` is automatically torn down by SmartSim once the colocated ``Model`` + has finished, we do not need to `stop` the ``Orchestrator``. + +.. 
literalinclude:: tutorials/doc_examples/orch_examples/colo_driver.py + :language: python + :linenos: + :lines: 28-29 + +When you run the experiment, the following output will appear:: - # build KeyDB - # see https://github.com/EQ-Alpha/KeyDB + | | Name | Entity-Type | JobID | RunID | Time | Status | Returncode | + |----|--------|---------------|-----------|---------|---------|-----------|--------------| + | 0 | model | Model | 1592652.0 | 0 | 10.1039 | Completed | 0 | - # get KeyDB configuration file - wget https://github.com/CrayLabs/SmartSim/blob/d3d252b611c9ce9d9429ba6eeb71c15471a78f08/smartsim/_core/config/keydb.conf +.. _mutli_orch_doc: - export REDIS_PATH=/path/to/keydb-server - export REDIS_CONF=/path/to/keydb.conf +====================== +Multiple Orchestrators +====================== +SmartSim supports automating the deployment of multiple ``Orchestrators`` +from within an ``Experiment``. Communication with the ``Orchestrator`` via a SmartRedis ``Client`` is possible with the +`db_identifier` argument that is required when initializing an ``Orchestrator`` or +colocated ``Model`` during a multiple ``Orchestrator`` ``Experiment``. When initializing a SmartRedis +``Client`` during the ``Experiment``, create a ``ConfigOptions`` object to specify the `db_identifier` +argument used when creating the ``Orchestrator``. Pass the ``ConfigOptions`` object to +the ``Client`` init call. - # run smartsim workload +.. _mutli_orch: +----------------------------- Multiple Orchestrator Example -============================= +----------------------------- SmartSim offers functionality to automate the deployment of multiple databases, supporting workloads that require multiple ``Orchestrators`` for a ``Experiment``. For instance, a workload may consist of a simulation with high inference performance demands (necessitating a co-located deployment), -along with an analysis and -visualization workflow connected to the simulation (requiring a standard orchestrator). 
-In the following example, we simulate a simple version of this use case. +along with an analysis and visualization workflow connected to the simulation +(requiring a standalone ``Orchestrator``). In the following example, we simulate a +simple version of this use case. The example is comprised of two script files: -* The :ref:`Application Script` -* The :ref:`Experiment Driver Script` +* The Application Script +* The ``Experiment`` Driver Script **The Application Script Overview:** In this example, the application script is a python file that contains instructions to complete computational tasks. Applications are not limited to Python and can also be written in C, C++ and Fortran. -This script specifies creating a Python SmartRedis client for each -standard orchestrator and a colocated orchestrator. We use the -clients to request data from both standard databases, then -transfer the data to the colocated database. The application -file is launched by the experiment driver script +This script specifies creating a Python SmartRedis ``Client`` for each +standalone ``Orchestrator`` and a colocated ``Orchestrator``. We use the +``Clients`` to request data from both standalone ``Orchestrators``, then +transfer the data to the colocated ``Orchestrator``. The application +file is launched by the ``Experiment`` driver script through a ``Model`` stage. **The Application Script Contents:** -1. Connecting SmartRedis clients within the application to retrieve tensors - from the standard databases to store in a colocated database. Details in section: - :ref:`Initialize the Clients`. +1. Connecting SmartRedis ``Clients`` within the application to retrieve tensors + from the standalone ``Orchestrators`` to store in a colocated ``Orchestrator``. Details in section: + :ref:`Initialize the Clients`. 
**The Experiment Driver Script Overview:** -The experiment driver script holds the stages of the workflow +The ``Experiment`` driver script holds the stages of the workflow and manages their execution through the ``Experiment`` API. -We initialize an Experiment +We initialize an ``Experiment`` at the beginning of the Python file and use the ``Experiment`` to iteratively create, configure and launch computational kernels on the system through the `slurm` launcher. @@ -211,143 +691,146 @@ runs the application. **The Experiment Driver Script Contents:** -1. Launching two standard Orchestrators with unique identifiers. Details in section: - :ref:`Launch Multiple Orchestrators`. -2. Launching the application script with a co-located database. Details in section: - :ref:`Initialize a Colocated Model`. -3. Connecting SmartRedis clients within the driver script to send tensors to standard Orchestrators +1. Launching two standalone ``Orchestrators`` with unique identifiers. Details in section: + :ref:`Launch Multiple Orchestrators`. +2. Launching the application script with a colocated ``Orchestrator``. Details in section: + :ref:`Initialize a Colocated Model`. +3. Connecting SmartRedis ``Clients`` within the driver script to send tensors to standalone ``Orchestrators`` for retrieval within the application. Details in section: - :ref:`Create Client Connections to Orchestrators`. + :ref:`Create Client Connections to Orchestrators`. -Setup and run instructions can be found :ref:`here` +Setup and run instructions can be found :ref:`here` + +.. _app_script_multi_db: The Application Script ----------------------- -Applications interact with the databases -through a SmartRedis client. +====================== +Applications interact with the ``Orchestrators`` +through a SmartRedis ``Client``. In this section, we write an application script to demonstrate how to connect SmartRedis -clients in the context of multiple -launched databases. 
Using the clients, we retrieve tensors -from two databases launched in the driver script, then store -the tensors in the colocated database. +``Clients`` in the context of multiple +launched ``Orchestrators``. Using the ``Clients``, we retrieve tensors +from two ``Orchestrators`` launched in the driver script, then store +the tensors in the colocated ``Orchestrator``. .. note:: - The Experiment must be started to use the Orchestrators within the + The ``Experiment`` must be started to use the ``Orchestrators`` within the application script. Otherwise, it will fail to connect. - Find the instructions on how to launch :ref:`here` + Find the instructions on how to launch :ref:`here` To begin, import the necessary packages: -.. literalinclude:: ../tutorials/getting_started/multi_db_example/application_script.py +.. literalinclude:: tutorials/getting_started/multi_db_example/application_script.py :language: python :linenos: :lines: 1-3 +.. _init_model_client: + Initialize the Clients -^^^^^^^^^^^^^^^^^^^^^^ -To establish a connection with each database, -we need to initialize a new SmartRedis client for each -``Orchestrator``. +---------------------- +To establish a connection with each ``Orchestrator``, +we need to initialize a new SmartRedis ``Client`` for each. Step 1: Initialize ConfigOptions -"""""""""""""""""""""""""""""""" -Since we are launching multiple databases within the experiment, +'''''''''''''''''''''''''''''''' +Since we are launching multiple ``Orchestrators`` within the ``Experiment``, the SmartRedis ``ConfigOptions`` object is required when initializing -a client in the application. -We use the ``ConfigOptions.create_from_environment()`` +a ``Client`` in the application. +We use the ``ConfigOptions.create_from_environment`` function to create three instances of ``ConfigOptions``, with one instance associated with each launched ``Orchestrator``. 
-Most importantly, to associate each launched Orchestrator to a ConfigOptions object, -the ``create_from_environment()`` function requires specifying the unique database identifier +Most importantly, to associate each launched ``Orchestrator`` to a ``ConfigOptions`` object, +the ``create_from_environment`` function requires specifying the unique ``Orchestrator`` identifier argument named `db_identifier`. -For the single-sharded database: +For the single-sharded ``Orchestrator``: -.. literalinclude:: ../tutorials/getting_started/multi_db_example/application_script.py +.. literalinclude:: tutorials/getting_started/multi_db_example/application_script.py :language: python :linenos: :lines: 5-6 -For the multi-sharded database: +For the multi-sharded ``Orchestrator``: -.. literalinclude:: ../tutorials/getting_started/multi_db_example/application_script.py +.. literalinclude:: tutorials/getting_started/multi_db_example/application_script.py :language: python :linenos: :lines: 10-11 -For the colocated database: +For the colocated ``Orchestrator``: -.. literalinclude:: ../tutorials/getting_started/multi_db_example/application_script.py +.. literalinclude:: tutorials/getting_started/multi_db_example/application_script.py :language: python :linenos: :lines: 15-16 Step 2: Initialize the Client Connections -""""""""""""""""""""""""""""""""""""""""" +''''''''''''''''''''''''''''''''''''''''' Now that we have three ``ConfigOptions`` objects, we have the -tools necessary to initialize three SmartRedis clients and -establish a connection with the three databases. -We use the SmartRedis ``Client`` API to create the client instances by passing in +tools necessary to initialize three SmartRedis ``Clients`` and +establish a connection with the three ``Orchestrators``. +We use the SmartRedis ``Client`` API to create the ``Client`` instances by passing in the ``ConfigOptions`` objects and assigning a `logger_name` argument. -Single-sharded database: +Single-sharded ``Orchestrator``: -.. 
literalinclude:: ../tutorials/getting_started/multi_db_example/application_script.py +.. literalinclude:: tutorials/getting_started/multi_db_example/application_script.py :language: python :linenos: :lines: 7-8 -Multi-sharded database: +Multi-sharded ``Orchestrator``: -.. literalinclude:: ../tutorials/getting_started/multi_db_example/application_script.py +.. literalinclude:: tutorials/getting_started/multi_db_example/application_script.py :language: python :linenos: :lines: 12-13 -Colocated database: +Colocated ``Orchestrator``: -.. literalinclude:: ../tutorials/getting_started/multi_db_example/application_script.py +.. literalinclude:: tutorials/getting_started/multi_db_example/application_script.py :language: python :linenos: :lines: 17-18 Retrieve Data and Store Using SmartRedis Client Objects -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -To confirm a successful connection to each database, we will retrieve the tensors +------------------------------------------------------- +To confirm a successful connection to each ``Orchestrator``, we will retrieve the tensors that we plan to store in the python driver script. After retrieving, we -store both tensors in the colocated database. -The ``Client.get_tensor()`` method allows +store both tensors in the colocated ``Orchestrator``. +The ``Client.get_tensor`` method allows retrieval of a tensor. It requires the `name` of the tensor assigned -when sent to the database via ``Client.put_tensor()``. +when sent to the ``Orchestrator`` via ``Client.put_tensor``. -.. literalinclude:: ../tutorials/getting_started/multi_db_example/application_script.py +.. 
literalinclude:: tutorials/getting_started/multi_db_example/application_script.py :language: python :linenos: :lines: 20-26 -Later, when you run the experiment driver script the following output will appear in ``tutorial_model.out`` +Later, when you run the ``Experiment`` driver script the following output will appear in ``tutorial_model.out`` located in ``getting-started-multidb/tutorial_model/``:: Model: single shard logger@00-00-00:The single sharded db tensor is: [1 2 3 4] Model: multi shard logger@00-00-00:The multi sharded db tensor is: [5 6 7 8] -This output showcases that we have established a connection with multiple Orchestrators. +This output showcases that we have established a connection with multiple ``Orchestrators``. -Next, take the tensors retrieved from the standard deployment databases and -store them in the colocated database using ``Client.put_tensor(name, data)``. +Next, take the tensors retrieved from the standalone deployment ``Orchestrators`` and +store them in the colocated ``Orchestrator`` using ``Client.put_tensor(name, data)``. -.. literalinclude:: ../tutorials/getting_started/multi_db_example/application_script.py +.. literalinclude:: tutorials/getting_started/multi_db_example/application_script.py :language: python :linenos: :lines: 28-30 -Next, check if the tensors exist in the colocated database using ``Client.poll_tensor()``. -This function queries for data in the database. The function requires the tensor name (`name`), +Next, check if the tensors exist in the colocated ``Orchestrator`` using ``Client.poll_tensor``. +This function queries for data in the ``Orchestrator``. The function requires the tensor name (`name`), how many milliseconds to wait in between queries (`poll_frequency_ms`), and the total number of times to query (`num_tries`): -.. literalinclude:: ../tutorials/getting_started/multi_db_example/application_script.py +.. 
literalinclude:: tutorials/getting_started/multi_db_example/application_script.py :language: python :linenos: :lines: 32-37 @@ -358,156 +841,162 @@ The output will be as follows:: Model: colo logger@00-00-00:The colocated db has tensor_2: True The Experiment Driver Script ----------------------------- +============================ To run the previous application, we must define workflow stages within a workload. Defining workflow stages requires the utilization of functions associated -with the ``Experiment`` object. The Experiment object is intended to be instantiated +with the ``Experiment`` object. The ``Experiment`` object is intended to be instantiated once and utilized throughout the workflow runtime. In this example, we instantiate an ``Experiment`` object with the name ``getting-started-multidb``. We setup the SmartSim ``logger`` to output information from the Experiment. -.. literalinclude:: ../tutorials/getting_started/multi_db_example/multidb_driver.py +.. literalinclude:: tutorials/getting_started/multi_db_example/multidb_driver.py :language: python :linenos: :lines: 1-10 +.. _launch_multiple_orch: + Launch Multiple Orchestrators -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +----------------------------- In the context of this ``Experiment``, it's essential to create and launch -the databases as a preliminary step before any other components since -the application script requests tensors from the launched databases. +the ``Orchestrators`` as a preliminary step before any other components since +the application script requests tensors from the launched ``Orchestrators``. -We aim to showcase the multi-database automation capabilities of SmartSim, so we -create two databases in the workflow: a single-sharded database and a -multi-sharded database. +We aim to showcase the multi-Orchestrator automation capabilities of SmartSim, so we +create two ``Orchestrators`` in the workflow: a single-sharded ``Orchestrator`` and a +multi-sharded ``Orchestrator``. 
Step 1: Initialize Orchestrators -"""""""""""""""""""""""""""""""" -To create an database, utilize the ``Experiment.create_database()`` function. +'''''''''''''''''''''''''''''''' +To create an ``Orchestrator``, utilize the ``Experiment.create_database`` function. The function requires specifying a unique -database identifier argument named `db_identifier` to launch multiple databases. -This step is necessary to connect to databases outside of the driver script. +``Orchestrator`` identifier argument named `db_identifier` to launch multiple ``Orchestrators``. +This step is necessary to connect to ``Orchestrators`` outside of the driver script. We will use the `db_identifier` names we specified in the application script. -For the single-sharded database: +For the single-sharded ``Orchestrator``: -.. literalinclude:: ../tutorials/getting_started/multi_db_example/multidb_driver.py +.. literalinclude:: tutorials/getting_started/multi_db_example/multidb_driver.py :language: python :linenos: :lines: 12-14 -For the multi-sharded database: +For the multi-sharded ``Orchestrator``: -.. literalinclude:: ../tutorials/getting_started/multi_db_example/multidb_driver.py +.. literalinclude:: tutorials/getting_started/multi_db_example/multidb_driver.py :language: python :linenos: :lines: 16-18 .. note:: - Calling ``exp.generate()`` will create two subfolders - (one for each Orchestrator created in the previous step) - whose names are based on the db_identifier of that Orchestrator. + Calling ``exp.generate`` will create two subfolders + (one for each ``Orchestrator`` created in the previous step) + whose names are based on the `db_identifier` of that ``Orchestrator``. In this example, the Experiment folder is - named ``getting-started-multidb/``. Within this folder, two Orchestrator subfolders will + named ``getting-started-multidb/``. Within this folder, two ``Orchestrator`` subfolders will be created, namely ``single_shard_db_identifier/`` and ``multi_shard_db_identifier/``. 
-Step 2: Start Databases -""""""""""""""""""""""" -Next, to launch the databases, -pass the database instances to ``Experiment.start()``. +Step 2: Start +''''''''''''' +Next, to launch the ``Orchestrators``, +pass the ``Orchestrator`` instances to ``Experiment.start``. -.. literalinclude:: ../tutorials/getting_started/multi_db_example/multidb_driver.py +.. literalinclude:: tutorials/getting_started/multi_db_example/multidb_driver.py :language: python :linenos: :lines: 20-21 -The ``Experiment.start()`` function launches the ``Orchestrators`` for use within the workflow. In other words, the function -deploys the databases on the allocated compute resources. +The ``Experiment.start`` function launches the ``Orchestrators`` for use within the workflow. In other words, the function +deploys the ``Orchestrators`` on the allocated compute resources. .. note:: By setting `summary=True`, SmartSim will print a summary of the - experiment before it is launched. After printing the experiment summary, - the experiment is paused for 10 seconds giving the user time to - briefly scan the summary contents. If we set `summary=False`, then the experiment + ``Experiment`` before it is launched. After printing the ``Experiment`` summary, + the ``Experiment`` is paused for 10 seconds giving the user time to + briefly scan the summary contents. If we set `summary=False`, then the ``Experiment`` would be launched immediately with no summary. +.. _client_connect_orch: + Create Client Connections to Orchestrators -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +------------------------------------------ The SmartRedis ``Client`` object contains functions that manipulate, send, and receive -data within the database. Each database has a single, dedicated SmartRedis ``Client``. -Begin by initializing a SmartRedis ``Client`` object per launched database. +data within the ``Orchestrator``. Each ``Orchestrator`` has a single, dedicated SmartRedis ``Client``. 
+Begin by initializing a SmartRedis ``Client`` object per launched ``Orchestrator``. To create a designated SmartRedis ``Client``, you need to specify the address of the target -running database. You can easily retrieve this address using the ``Orchestrator.get_address()`` function. +running ``Orchestrator``. You can easily retrieve this address using the ``Orchestrator.get_address`` function. -For the single-sharded database: +For the single-sharded ``Orchestrator``: -.. literalinclude:: ../tutorials/getting_started/multi_db_example/multidb_driver.py +.. literalinclude:: tutorials/getting_started/multi_db_example/multidb_driver.py :language: python :linenos: :lines: 23-24 -For the multi-sharded database: +For the multi-sharded ``Orchestrator``: -.. literalinclude:: ../tutorials/getting_started/multi_db_example/multidb_driver.py +.. literalinclude:: tutorials/getting_started/multi_db_example/multidb_driver.py :language: python :linenos: :lines: 25-26 Store Data Using Clients -^^^^^^^^^^^^^^^^^^^^^^^^ +------------------------ In the application script, we retrieved two NumPy tensors. To support the apps functionality, we will create two -NumPy arrays in the python driver script and send them to the a database. To -accomplish this, we use the ``Client.put_tensor()`` function with the respective -database client instances. +NumPy arrays in the python driver script and send them to an ``Orchestrator``. To +accomplish this, we use the ``Client.put_tensor`` function with the respective +``Orchestrator`` `client` instances. -For the single-sharded database: +For the single-sharded ``Orchestrator``: -.. literalinclude:: ../tutorials/getting_started/multi_db_example/multidb_driver.py +.. literalinclude:: tutorials/getting_started/multi_db_example/multidb_driver.py :language: python :linenos: :lines: 28-31 -For the multi-sharded database: +For the multi-sharded ``Orchestrator``: -.. 
literalinclude:: tutorials/getting_started/multi_db_example/multidb_driver.py :language: python :linenos: :lines: 33-36 -Lets check to make sure the database tensors do not exist in the incorrect databases: +Let's check to make sure the ``Orchestrator`` tensors do not exist in the incorrect ``Orchestrators``: -.. literalinclude:: ../tutorials/getting_started/multi_db_example/multidb_driver.py +.. literalinclude:: tutorials/getting_started/multi_db_example/multidb_driver.py :language: python :linenos: :lines: 38-42 -When you run the experiment, the following output will appear:: +When you run the ``Experiment``, the following output will appear:: 00:00:00 system.host.com SmartSim[#####] INFO The multi shard array key exists in the incorrect database: False 00:00:00 system.host.com SmartSim[#####] INFO The single shard array key exists in the incorrect database: False +.. _init_colocated_model: + Initialize a Colocated Model -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -In the next stage of the experiment, we -launch the application script with a co-located database +---------------------------- +In the next stage of the ``Experiment``, we +launch the application script with a co-located ``Orchestrator`` by configuring and creating a SmartSim colocated ``Model``. Step 1: Configure -""""""""""""""""" -You can specify the run settings of a model. -In this experiment, we invoke the Python interpreter to run -the python script defined in section: :ref:`The Application Script`. -To configure this into a ``Model``, we use the ``Experiment.create_run_settings()`` function. +''''''''''''''''' +You can specify the run settings of a ``Model``. +In this ``Experiment``, we invoke the Python interpreter to run +the python script defined in section: :ref:`The Application Script`. +To configure this into a SmartSim ``Model``, we use the ``Experiment.create_run_settings`` function. The function returns a ``RunSettings`` object. 
When initializing the RunSettings object, we specify the path to the application file, `application_script.py`, for ``exe_args``, and the run command for ``exe``. -.. literalinclude:: ../tutorials/getting_started/multi_db_example/multidb_driver.py +.. literalinclude:: tutorials/getting_started/multi_db_example/multidb_driver.py :language: python :linenos: :lines: 44-45 @@ -517,62 +1006,66 @@ we specify the path to the application file, on your machine to run the example. With the ``RunSettings`` instance, -configure the the distribution of computational tasks (``RunSettings.set_nodes()``) and the number of instances -the script is execute on each node (``RunSettings.set_tasks_per_node()``). In this +configure the distribution of computational tasks (``RunSettings.set_nodes``) and the number of instances +the script is executed on each node (``RunSettings.set_tasks_per_node``). In this example, we specify to SmartSim that we intend to execute the script once on a single node. -.. literalinclude:: ../tutorials/getting_started/multi_db_example/multidb_driver.py +.. literalinclude:: tutorials/getting_started/multi_db_example/multidb_driver.py :language: python :linenos: :lines: 46-48 Step 2: Initialize -"""""""""""""""""" -Next, create a ``Model`` instance using the ``Experiment.create_model()``. +'''''''''''''''''' +Next, create a ``Model`` instance using the ``Experiment.create_model``. Pass the ``model_settings`` object as an argument -to the ``create_model()`` function and assign to the variable ``model``. +to the ``create_model`` function and assign to the variable ``model``. -.. literalinclude:: ../tutorials/getting_started/multi_db_example/multidb_driver.py +.. 
literalinclude:: tutorials/getting_started/multi_db_example/multidb_driver.py :language: python :linenos: :lines: 49-50 Step 2: Colocate -"""""""""""""""" -To colocate the model, use the ``Model.colocate_db_uds()`` function to -Colocate an Orchestrator instance with this Model over +'''''''''''''''' +To colocate the ``Model``, use the ``Model.colocate_db_uds`` function to +colocate an ``Orchestrator`` instance with this ``Model`` over a Unix domain socket connection. -.. literalinclude:: ../tutorials/getting_started/multi_db_example/multidb_driver.py +.. literalinclude:: tutorials/getting_started/multi_db_example/multidb_driver.py :language: python :linenos: :lines: 51-52 This method will initialize settings which add an unsharded -database to this Model instance. Only this Model will be able -to communicate with this colocated database by using the loopback TCP interface. +``Orchestrator`` to this ``Model`` instance. Only this ``Model`` will be able +to communicate with this colocated ``Orchestrator`` by using the loopback TCP interface. Step 3: Start -""""""""""""" -Next, launch the colocated model instance using the ``Experiment.start()`` function. +''''''''''''' +Next, launch the colocated ``Model`` instance using the ``Experiment.start`` function. -.. literalinclude:: ../tutorials/getting_started/multi_db_example/multidb_driver.py +.. literalinclude:: tutorials/getting_started/multi_db_example/multidb_driver.py :language: python :linenos: :lines: 53-54 .. note:: We set `block=True`, - so that ``Experiment.start()`` waits until the last Model has finished + so that ``Experiment.start`` waits until the last ``Model`` has finished before returning: it will act like a job monitor, letting us know if processes run, complete, or fail. Cleanup Experiment -^^^^^^^^^^^^^^^^^^ -Finally, use the ``Experiment.stop()`` function to stop the database instances. Print the -workflow summary with ``Experiment.summary()``. 
+------------------ +Finally, use the ``Experiment.stop`` function to stop the standard ``Orchestrator`` instances. + +.. note:: + Colocated ``Orchestrators`` are stopped when their associated ``Models`` are stopped. -.. literalinclude:: ../tutorials/getting_started/multi_db_example/multidb_driver.py +Print the workflow summary with ``Experiment.summary``. + +.. literalinclude:: tutorials/getting_started/multi_db_example/multidb_driver.py :language: python :linenos: :lines: 56-59 @@ -586,16 +1079,18 @@ When you run the experiment, the following output will appear:: | 1 | single_shard_db_identifier_0 | DBNode | 1556529.3 | 0 | 68.8732 | Cancelled | 0 | | 2 | multi_shard_db_identifier_0 | DBNode | 1556529.4+2 | 0 | 45.5139 | Cancelled | 0 | +.. _run_ex_instruct: + How to Run the Example ----------------------- -Below are the steps to run the experiment. Find the -:ref:`experiment source code` -and :ref:`application source code` +====================== +Below are the steps to run the ``Experiment``. Find the +:ref:`experiment source code` +and :ref:`application source code` below in the respective subsections. .. note:: The example assumes that you have already installed and built - SmartSim and SmartRedis. Please refer to Section :ref:`Basic Installation` + SmartSim and SmartRedis. Please refer to Section :ref:`Basic Installation` for further details. For simplicity, we assume that you are running on a SLURM-based HPC-platform. Refer to the steps below for more details. @@ -609,7 +1104,7 @@ Step 1 : Setup your directory tree application_script.py experiment_script.py - You can find the application and experiment source code in subsections below. + You can find the application and ``Experiment`` source code in subsections below. 
Step 2 : Install and Build SmartSim This example assumes you have installed SmartSim and SmartRedis in your @@ -619,21 +1114,25 @@ Step 2 : Install and Build SmartSim Step 3 : Change the `exe_args` file path When configuring the colocated model in `experiment_script.py`, we pass the file path of `application_script.py` to the `exe_args` argument - on line 33 in :ref:`experiment_script.py`. + on line 33 in :ref:`experiment_script.py`. Edit this argument to the file path of your `application_script.py` -Step 4 : Run the Experiment - Finally, run the experiment with ``python experiment_script.py``. +Step 4 : Run the ``Experiment`` + Finally, run the ``Experiment`` with ``python experiment_script.py``. + +.. _multi_app_source_code: Application Source Code -^^^^^^^^^^^^^^^^^^^^^^^ -.. literalinclude:: ../tutorials/getting_started/multi_db_example/application_script.py +----------------------- +.. literalinclude:: tutorials/getting_started/multi_db_example/application_script.py :language: python :linenos: +.. _multi_exp_source_code: + Experiment Source Code -^^^^^^^^^^^^^^^^^^^^^^ -.. literalinclude:: ../tutorials/getting_started/multi_db_example/multidb_driver.py +---------------------- +.. literalinclude:: tutorials/getting_started/multi_db_example/multidb_driver.py :language: python :linenos: \ No newline at end of file diff --git a/doc/requirements-doc.txt b/doc/requirements-doc.txt index e883a2805..8b6d46bb9 100644 --- a/doc/requirements-doc.txt +++ b/doc/requirements-doc.txt @@ -12,3 +12,5 @@ ipython jinja2==3.1.2 protobuf numpy +sphinx-design +pypandoc diff --git a/doc/run_settings.rst b/doc/run_settings.rst new file mode 100644 index 000000000..449b61ea4 --- /dev/null +++ b/doc/run_settings.rst @@ -0,0 +1,311 @@ +.. _run_settings_doc: + +************ +Run Settings +************ +======== +Overview +======== +``RunSettings`` are used in the SmartSim API to define how ``Model`` and ``Ensemble`` jobs +should be executed. 
+ +In general, ``RunSettings`` define: + +- the executable +- the arguments to pass to the executable +- necessary environment variables at runtime +- the required compute resources + +The base ``RunSettings`` class is utilized for local task launches, +while its derived child classes offer specialized functionality for HPC workload managers (WLMs). +Each SmartSim `launcher` interfaces with a specific ``RunSettings`` subclass tailored to an HPC job scheduler. + +- Navigate to the :ref:`Local` section to configure run settings locally +- Navigate to the :ref:`HPC Systems` section to configure run settings for HPC + +A ``RunSettings`` object is initialized through the ``Experiment.create_run_settings`` function. +This function accepts a `run_command` argument: the command to run the executable. + +If `run_command` is set to `"auto"`, SmartSim will attempt to match a run command on the +system with a ``RunSettings`` class. If found, the class corresponding to +that `run_command` will be created and returned. + +If the `run_command` is passed a recognized run command (e.g. `"srun"`) the ``RunSettings`` +instance will be a child class such as ``SrunSettings``. You may also specify `"mpirun"`, +`"mpiexec"`, `"aprun"`, `"jsrun"` or `"orterun"` to the `run_command` argument. +This will return the associated child class. + +If the run command is not supported by SmartSim, the base ``RunSettings`` class will be created and returned +with the specified `run_command` and `run_args` evaluated literally. + +After creating a ``RunSettings`` instance, users gain access to the attributes and methods +of the associated child class, providing them with the ability to further configure the run +settings for jobs. + +======== +Examples +======== +.. _run_settings_local_ex: + +Local +===== +When running SmartSim on laptops and single node workstations via the `"local"` +`launcher`, job execution is configured with the base ``RunSettings`` object. 
+For local launches, ``RunSettings`` accepts a `run_command` parameter to allow +the use of parallel launch binaries like `"mpirun"`, `"mpiexec"`, and others. + +If no `run_command` is specified and the ``Experiment`` `launcher` is set to `"local"`, +the executable is launched locally. When utilizing the `"local"` launcher and configuring +the `run_command` parameter to `"auto"` in the ``Experiment.create_run_settings`` factory +method, SmartSim defaults to omitting any run command prefix before the executable. + +Once the ``RunSettings`` object is initialized using the ``Experiment.create_run_settings`` factory +method, the :ref:`RunSettings API` can be used to further configure the +``RunSettings`` object prior to execution. + +.. note:: + The local `launcher` is the default `launcher` for all ``Experiment`` instances. + +When the user initializes the ``Experiment`` at the beginning of the Python driver script, +a `launcher` argument may be specified. SmartSim will register or detect the +`launcher` and return the supported class upon a call to ``Experiment.create_run_settings``. +Below we demonstrate creating and configuring the base ``RunSettings`` +object for local launches by specifying the `"local"` launcher during ``Experiment`` creation. +We also demonstrate specifying `run_command="mpirun"` locally. + +**Initialize and Configure a RunSettings Object with No Run Command Specified:** + +.. code-block:: python + + from smartsim import Experiment + + # Initialize the experiment and provide launcher local + exp = Experiment("name-of-experiment", launcher="local") + + + # Initialize a RunSettings object + run_settings = exp.create_run_settings(exe="echo", exe_args="Hello World", run_command=None) + +**Initialize and Configure a RunSettings Object with the `mpirun` Run Command Specified:** + +.. note:: + Please note that to run this example you need to have an MPI implementation + (e.g. OpenMPI or MPICH) installed. + +.. 
code-block:: python + + from smartsim import Experiment + + # Initialize the experiment and provide launcher local + exp = Experiment("name-of-experiment", launcher="local") + + # Initialize a RunSettings object + run_settings = exp.create_run_settings(exe="echo", exe_args="Hello World", run_command="mpirun") + +Users may replace `mpirun` with `mpiexec`. + +.. _run_settings_hpc_ex: + +HPC System +========== +To configure an entity for launch on an HPC system, SmartSim offers ``RunSettings`` child classes. +Each WLM `launcher` supports different ``RunSettings`` child classes. +When the user initializes the ``Experiment`` at the beginning of the Python driver script, +a `launcher` argument may be specified. The specified `launcher` will be used by SmartSim to +return the correct ``RunSettings`` child class that matches with the specified (or auto-detected) +`run_command` upon a call to ``Experiment.create_run_settings``. Below we demonstrate +creating and configuring the base ``RunSettings`` object for HPC launches +by specifying the launcher during ``Experiment`` creation. We show examples +for each job scheduler. + +.. tabs:: + + .. group-tab:: Slurm + + The Slurm `launcher` supports the :ref:`SrunSettings API ` as well as the :ref:`MpirunSettings API `, + :ref:`MpiexecSettings API ` and :ref:`OrterunSettings API ` that each can be used to run executables + with launch binaries like `"srun"`, `"mpirun"`, `"mpiexec"` and `"orterun"`. Below we step through initializing a ``SrunSettings`` and ``MpirunSettings`` + instance on a Slurm based machine using the associated `run_command`. + + **SrunSettings** + + Run a job with the `srun` command on a Slurm based system. Any arguments passed in + the `run_args` dict will be converted into `srun` arguments and prefixed with `"--"`. + Values of `None` can be provided for arguments that do not have values. + + .. 
code-block:: python + + from smartsim import Experiment + + # Initialize the Experiment and provide launcher Slurm + exp = Experiment("name-of-experiment", launcher="slurm") + + # Initialize a SrunSettings object + run_settings = exp.create_run_settings(exe="echo", exe_args="Hello World", run_command="srun") + # Set the number of nodes + run_settings.set_nodes(4) + # Set the number of cpus to use per task + run_settings.set_cpus_per_task(2) + # Set the number of tasks for this job + run_settings.set_tasks(100) + # Set the number of tasks for this job + run_settings.set_tasks_per_node(25) + + **MpirunSettings** + + Run a job with the `mpirun` command (MPI-standard) on a Slurm based system. Any + arguments passed in the `run_args` dict will be converted into `mpirun` arguments + and prefixed with `"--"`. Values of `None` can be provided for arguments that do + not have values. + + .. code-block:: python + + from smartsim import Experiment + + # Initialize the Experiment and provide launcher Slurm + exp = Experiment("name-of-experiment", launcher="slurm") + + # Initialize a MpirunSettings object + run_settings = exp.create_run_settings(exe="echo", exe_args="Hello World", run_command="mpirun") + # Set the number of cpus to use per task + run_settings.set_cpus_per_task(2) + # Set the number of tasks for this job + run_settings.set_tasks(100) + # Set the number of tasks for this job + run_settings.set_tasks_per_node(25) + + Users may replace `mpirun` with `mpiexec` or `orterun`. + + .. group-tab:: PBS Pro + The PBS Pro `launcher` supports the :ref:`AprunSettings API ` as well as the :ref:`MpirunSettings API `, + :ref:`MpiexecSettings API ` and :ref:`OrterunSettings API ` that each can be used to run executables + with launch binaries like `"aprun"`, `"mpirun"`, `"mpiexec"` and `"orterun"`. Below we step through initializing a ``AprunSettings`` and ``MpirunSettings`` + instance on a PBS Pro based machine using the associated `run_command`. 
+ + **AprunSettings** + + Run a job with `aprun` command on a PBS Pro based system. Any arguments passed in + the `run_args` dict will be converted into `aprun` arguments and prefixed with `--`. + Values of `None` can be provided for arguments that do not have values. + + .. code-block:: python + + from smartsim import Experiment + + # Initialize the experiment and provide launcher PBS Pro + exp = Experiment("name-of-experiment", launcher="pbs") + + # Initialize a AprunSettings object + run_settings = exp.create_run_settings(exe="echo", exe_args="Hello World", run_command="aprun") + # Set the number of cpus to use per task + run_settings.set_cpus_per_task(2) + # Set the number of tasks for this job + run_settings.set_tasks(100) + # Set the number of tasks for this job + run_settings.set_tasks_per_node(25) + + **MpirunSettings** + + Run a job with `mpirun` command on a PBS Pro based system. Any arguments passed + in the `run_args` dict will be converted into `mpirun` arguments and prefixed with `--`. + Values of `None` can be provided for arguments that do not have values. + + .. code-block:: python + + from smartsim import Experiment + + # Initialize the experiment and provide launcher PBS Pro + exp = Experiment("name-of-experiment", launcher="pbs") + + # Initialize a MpirunSettings object + run_settings = exp.create_run_settings(exe="echo", exe_args="Hello World", run_command="mpirun") + # Set the number of cpus to use per task + run_settings.set_cpus_per_task(2) + # Set the number of tasks for this job + run_settings.set_tasks(100) + # Set the number of tasks for this job + run_settings.set_tasks_per_node(25) + + Users may replace `mpirun` with `mpiexec` or `orterun`. + + .. group-tab:: PALS + The PALS `launcher` supports the :ref:`MpiexecSettings API ` that can be used to run executables + with the `mpiexec` launch binary. Below we step through initializing a ``MpiexecSettings`` instance on a PALS + based machine using the associated `run_command`. 
+ + **MpiexecSettings** + + Run a job with `mpiexec` command on a PALS based system. Any arguments passed in the `run_args` dict will be converted into `mpiexec` arguments and prefixed with `--`. + Values of `None` can be provided for arguments that do not have values. + + .. code-block:: python + + from smartsim import Experiment + + # Initialize the experiment and provide launcher PALS + exp = Experiment("name-of-experiment", launcher="pals") + + # Initialize a MpiexecSettings object + run_settings = exp.create_run_settings(exe="echo", exe_args="Hello World", run_command="mpiexec") + # Set the number of tasks for this job + run_settings.set_tasks(100) + # Set the number of tasks for this job + run_settings.set_tasks_per_node(25) + + .. group-tab:: LSF + The LSF `launcher` supports the :ref:`JsrunSettings API ` as well as the :ref:`MpirunSettings API `, + :ref:`MpiexecSettings API ` and :ref:`OrterunSettings API ` that each can be used to run executables + with launch binaries like `"jsrun"`, `"mpirun"`, `"mpiexec"` and `"orterun"`. Below we step through initializing a ``JsrunSettings`` and ``MpirunSettings`` + instance on a LSF based machine using the associated `run_command`. + + **JsrunSettings** + + Run a job with `jsrun` command on a LSF based system. Any arguments passed in the + `run_args` dict will be converted into `jsrun` arguments and prefixed with `--`. + Values of `None` can be provided for arguments that do not have values. + + .. 
code-block:: python + + from smartsim import Experiment + + # Initialize the experiment and provide launcher LSF + exp = Experiment("name-of-experiment", launcher="lsf") + + # Initialize a JsrunSettings object + run_settings = exp.create_run_settings(exe="echo", exe_args="Hello World", run_command="jsrun") + # Set the number of cpus to use per task + run_settings.set_cpus_per_task(2) + # Set the number of tasks for this job + run_settings.set_tasks(100) + # Set the number of tasks for this job + run_settings.set_tasks_per_node(25) + + **MpirunSettings** + + Run a job with `mpirun` command on a LSF based system. Any arguments passed in the + `run_args` dict will be converted into `mpirun` arguments and prefixed with `--`. + Values of `None` can be provided for arguments that do not have values. + + .. code-block:: python + + from smartsim import Experiment + + # Initialize the experiment and provide launcher LSF + exp = Experiment("name-of-experiment", launcher="lsf") + + # Initialize a MpirunSettings object + run_settings = exp.create_run_settings(exe="echo", exe_args="Hello World", run_command="mpirun") + # Set the number of cpus to use per task + run_settings.set_cpus_per_task(2) + # Set the number of tasks for this job + run_settings.set_tasks(100) + # Set the number of tasks for this job + run_settings.set_tasks_per_node(25) + + Users may replace `mpirun` with `mpiexec` or `orterun`. + +.. note:: + SmartSim will look for an allocation by accessing the associated WLM job ID environment variable. If an allocation + is present, the entity will be launched on the reserved compute resources. A user may also specify the allocation ID + when initializing a run settings object via the `alloc` argument. If an allocation is specified, the entity receiving + these run parameters will launch on that allocation. 
\ No newline at end of file diff --git a/doc/sr_advanced_topics.rst b/doc/sr_advanced_topics.rst index 30da2c578..763a7fbe7 100644 --- a/doc/sr_advanced_topics.rst +++ b/doc/sr_advanced_topics.rst @@ -1,2 +1,2 @@ - +.. _config_options_explain: .. include:: ../smartredis/doc/advanced_topics.rst \ No newline at end of file diff --git a/doc/ss_logger.rst b/doc/ss_logger.rst new file mode 100644 index 000000000..186e28a89 --- /dev/null +++ b/doc/ss_logger.rst @@ -0,0 +1,221 @@ +****** +Logger +****** + +.. _ss_logger: + +======== +Overview +======== +SmartSim supports logging experiment activity through a logging API accessible via +the SmartSim `log` module. The SmartSim logger, backed by Python logging, enables +real-time logging of experiment activity **to stdout** and/or **to file**, with +multiple verbosity levels for categorizing log messages. + +Users may instruct SmartSim to log certain verbosity level log messages +and omit others through the `SMARTSIM_LOG_LEVEL` environment variable. The `SMARTSIM_LOG_LEVEL` +environment variable may be overridden when logging to file by specifying a log level to +the ``log_to_file`` function. Examples walking through logging :ref:`to stdout` +and :ref:`to file` are provided below. + +SmartSim offers **four** log functions to use within the Python driver script. The +below functions accept string messages: + +- ``logger.error`` +- ``logger.warning`` +- ``logger.info`` +- ``logger.debug`` + +The `SMARTSIM_LOG_LEVEL` environment variable accepts **four** log levels: `quiet`, +`info`, `debug` and `developer`. Setting the log level in the environment (or via the override function) +controls the log messages that are output at runtime. The log levels are listed below from +least verbose to most verbose: + +- level: `quiet` + - The `quiet` log level instructs SmartSim to print ``error`` and ``warning`` messages. +- level: `info` + - The `info` log level instructs SmartSim to print ``info``, ``error`` and ``warning`` messages. 
+- level: `debug` + - The `debug` log level instructs SmartSim to print ``debug``, ``info``, ``error`` and ``warning`` messages. +- level: `developer` + - The `developer` log level instructs SmartSim to print ``debug``, ``info``, ``error`` and ``warning`` messages. + +.. note:: + Levels `developer` and `debug` print the same log messages. The `developer` log level is intended for use + during code development and signifies highly detailed and verbose logging. + +.. note:: + `SMARTSIM_LOG_LEVEL` defaults to log level `info`. For SmartSim log API examples, continue to the :ref:`Examples` section. + +.. _log_ex: + +======== +Examples +======== +.. _log_to_stdout: + +------------- +Log to stdout +------------- +The ``get_logger`` function in SmartSim enables users to initialize a logger instance. +Once initialized, a user may use the instance to log a message using one of the four +logging functions. + +To use the SmartSim logger within a Python script, import the required `get_logger` +function from the `log` module: + +.. code-block:: python + + from smartsim.log import get_logger + +Next, initialize an instance of the logger and provide a logger `name`: + +.. code-block:: python + + logger = get_logger("SmartSim") + +To demonstrate full functionality of the SmartSim logger, we include all log +functions in the Python driver script with log messages: + +.. code-block:: python + + logger.info("This is a message") + logger.debug("This is a debug message") + logger.error("This is an error message") + logger.warning("This is a warning message") + +Execute the script *without* setting the `SMARTSIM_LOG_LEVEL`. Remember that `SMARTSIM_LOG_LEVEL` +defaults to `info`. When we execute the script, the following messages will print to stdout: + +.. 
code-block:: bash + + 11:15:00 system.host.com SmartSim[130033] INFO This is a message + 11:15:00 system.host.com SmartSim[130033] ERROR This is an error message + 11:15:00 system.host.com SmartSim[130033] WARNING This is a warning message + +Notice that the `debug` function message was filtered. This is because by using +a lower verbosity level (`info`), we instruct SmartSim to omit the higher verbosity level messages (`debug` and `developer`). + +Next, set `SMARTSIM_LOG_LEVEL` to `debug`: + +.. code-block:: bash + + export SMARTSIM_LOG_LEVEL=debug + +When we execute the script again, +the following messages will print to stdout: + +.. code-block:: bash + + 11:15:00 system.host.com SmartSim[65385] INFO This is a message + 11:15:00 system.host.com SmartSim[65385] DEBUG This is a debug message + 11:15:00 system.host.com SmartSim[65385] ERROR This is an error message + 11:15:00 system.host.com SmartSim[65385] WARNING This is a warning message + +Notice that all log messages print to stdout. By using a higher verbosity level (`debug`), +we instruct SmartSim to print all log functions at and above the level. + +Next, set `SMARTSIM_LOG_LEVEL` to `quiet` in terminal: + +.. code-block:: bash + + export SMARTSIM_LOG_LEVEL=quiet + +When we run the program once again, the following output is printed +to stdout: + +.. code-block:: bash + + 11:15:00 system.host.com SmartSim[65385] ERROR This is an error message + 11:15:00 system.host.com SmartSim[65385] WARNING This is a warning message + +Notice that the `info` and `debug` log functions were filtered. This is because by using +the least verbose level (`quiet`), we instruct SmartSim to omit messages at higher verbosity levels +(`info`, `debug` and `developer`). + +To finish the example, set `SMARTSIM_LOG_LEVEL` to `info` in terminal: + +.. code-block:: bash + + export SMARTSIM_LOG_LEVEL=info + +When we execute the script, the following messages will print +to stdout: + +.. 
code-block:: bash + + 11:15:00 system.host.com SmartSim[130033] INFO This is a message + 11:15:00 system.host.com SmartSim[130033] ERROR This is an error message + 11:15:00 system.host.com SmartSim[130033] WARNING This is a warning message + +Notice that the same messages were logged to stdout as when we ran the script with the default value `info`. +SmartSim omits messages at higher verbosity levels (`debug` and `developer`). + +.. _log_to_file: + +--------------- +Logging to File +--------------- +The ``log_to_file`` function in SmartSim allows users to log messages +to a specified file by providing a file name or relative file path. If the file name +passed in does not exist, SmartSim will create the file. If the program is re-executed with the same +file name, the file contents will be overwritten. + +To demonstrate, begin by importing the functions `get_logger` and `log_to_file` from the `log` module: + +.. code-block:: python + + from smartsim.log import get_logger, log_to_file + +Initialize a logger for use within the Python driver script: + +.. code-block:: python + + logger = get_logger("SmartSim") + +Invoke the ``log_to_file`` function to instruct SmartSim to create a file named `logger.out` +to write log messages to: + +.. code-block:: python + + log_to_file("logger.out") + +For the example, we add all log functions to the script: + +.. code-block:: python + + logger.info("This is a message") + logger.debug("This is a debug message") + logger.error("This is an error message") + logger.warning("This is a warning message") + +Remember that the default value for the `SMARTSIM_LOG_LEVEL` variable is `info`. +Therefore, we will not set the environment variable and instead rely on the +default. + +When we execute the Python script, a file named `logger.out` is created in our working +directory with the listed contents: + +.. 
code-block:: bash + + 11:15:00 system.host.com SmartSim[10950] INFO This is a message + 11:15:00 system.host.com SmartSim[10950] ERROR This is an error message + 11:15:00 system.host.com SmartSim[10950] WARNING This is a warning message + +Notice that the `debug` function message was filtered. This is because by using +a lower verbosity level (`info`), we instruct SmartSim to omit higher verbosity messages (`debug` and `developer`). + +In the same Python script, add a log level to the ``log_to_file`` function as an input argument: + +.. code-block:: python + + log_to_file("logger.out", "quiet") + +When we execute the Python script once again, SmartSim will override the `SMARTSIM_LOG_LEVEL` +variable to output messages of log level `quiet`. SmartSim will overwrite the contents +of `logger.out` with: + +.. code-block:: bash + + 11:15:00 system.host.com SmartSim[10950] ERROR This is an error message + 11:15:00 system.host.com SmartSim[10950] WARNING This is a warning message \ No newline at end of file diff --git a/doc/tutorials/doc_examples/ensemble_doc_examples/application_consumer_script.py b/doc/tutorials/doc_examples/ensemble_doc_examples/application_consumer_script.py new file mode 100644 index 000000000..57d720163 --- /dev/null +++ b/doc/tutorials/doc_examples/ensemble_doc_examples/application_consumer_script.py @@ -0,0 +1,17 @@ +from smartredis import Client, LLInfo + +# Initialize a Client +client = Client(cluster=False) + +# Set the data source +client.set_data_source("producer_0") +# Check if the tensor exists +tensor_1 = client.poll_tensor("tensor", 100, 100) + +# Set the data source +client.set_data_source("producer_1") +# Check if the tensor exists +tensor_2 = client.poll_tensor("tensor", 100, 100) + +client.log_data(LLInfo, f"producer_0.tensor was found: {tensor_1}") +client.log_data(LLInfo, f"producer_1.tensor was found: {tensor_2}") \ No newline at end of file diff --git a/doc/tutorials/doc_examples/ensemble_doc_examples/application_producer_script.py 
b/doc/tutorials/doc_examples/ensemble_doc_examples/application_producer_script.py new file mode 100644 index 000000000..619a56e05 --- /dev/null +++ b/doc/tutorials/doc_examples/ensemble_doc_examples/application_producer_script.py @@ -0,0 +1,10 @@ +from smartredis import Client +import numpy as np + +# Initialize a Client +client = Client(cluster=False) + +# Create NumPy array +array = np.array([1, 2, 3, 4]) +# Use SmartRedis Client to place tensor in standalone Orchestrator +client.put_tensor("tensor", array) \ No newline at end of file diff --git a/doc/tutorials/doc_examples/ensemble_doc_examples/ensemble_ml_model_file.py b/doc/tutorials/doc_examples/ensemble_doc_examples/ensemble_ml_model_file.py new file mode 100644 index 000000000..a2fa206f5 --- /dev/null +++ b/doc/tutorials/doc_examples/ensemble_doc_examples/ensemble_ml_model_file.py @@ -0,0 +1,40 @@ +from smartsim import Experiment +from tensorflow import keras +from tensorflow.keras.layers import Conv2D, Input + +class Net(keras.Model): + def __init__(self): + super(Net, self).__init__(name="cnn") + self.conv = Conv2D(1, 3, 1) + + def call(self, x): + y = self.conv(x) + return y + +def save_tf_cnn(path, file_name): + """Create a Keras CNN and save to file for example purposes""" + from smartsim.ml.tf import freeze_model + + n = Net() + input_shape = (3, 3, 1) + n.build(input_shape=(None, *input_shape)) + inputs = Input(input_shape) + outputs = n(inputs) + model = keras.Model(inputs=inputs, outputs=outputs, name=n.name) + + return freeze_model(model, path, file_name) + +# Initialize the Experiment and set the launcher to auto +exp = Experiment("getting-started", launcher="auto") + +# Initialize a RunSettings object +ensemble_settings = exp.create_run_settings(exe="path/to/example_simulation_program") + +# Initialize a Model object +ensemble_instance = exp.create_ensemble("ensemble_name", ensemble_settings) + +# Serialize and save TF model to file +model_file, inputs, outputs = 
save_tf_cnn(ensemble_instance.path, "model.pb") + +# Attach ML model file to Ensemble +ensemble_instance.add_ml_model(name="cnn", backend="TF", model_path=model_file, device="GPU", devices_per_node=2, first_device=0, inputs=inputs, outputs=outputs) \ No newline at end of file diff --git a/doc/tutorials/doc_examples/ensemble_doc_examples/ensemble_ml_model_mem.py b/doc/tutorials/doc_examples/ensemble_doc_examples/ensemble_ml_model_mem.py new file mode 100644 index 000000000..98974fdc2 --- /dev/null +++ b/doc/tutorials/doc_examples/ensemble_doc_examples/ensemble_ml_model_mem.py @@ -0,0 +1,40 @@ +from smartsim import Experiment +from tensorflow import keras +from tensorflow.keras.layers import Conv2D, Input + +class Net(keras.Model): + def __init__(self): + super(Net, self).__init__(name="cnn") + self.conv = Conv2D(1, 3, 1) + + def call(self, x): + y = self.conv(x) + return y + +def create_tf_cnn(): + """Create an in-memory Keras CNN for example purposes + + """ + from smartsim.ml.tf import serialize_model + n = Net() + input_shape = (3,3,1) + inputs = Input(input_shape) + outputs = n(inputs) + model = keras.Model(inputs=inputs, outputs=outputs, name=n.name) + + return serialize_model(model) + +# Serialize and save TF model +model, inputs, outputs = create_tf_cnn() + +# Initialize the Experiment and set the launcher to auto +exp = Experiment("getting-started", launcher="auto") + +# Initialize a RunSettings object +ensemble_settings = exp.create_run_settings(exe="path/to/example_simulation_program") + +# Initialize an Ensemble object +ensemble_instance = exp.create_ensemble("ensemble_name", ensemble_settings) + +# Attach the in-memory ML model to the SmartSim Ensemble +ensemble_instance.add_ml_model(name="cnn", backend="TF", model=model, device="GPU", devices_per_node=2, first_device=0, inputs=inputs, outputs=outputs) \ No newline at end of file diff --git a/doc/tutorials/doc_examples/ensemble_doc_examples/ensemble_torchscript_file.py
b/doc/tutorials/doc_examples/ensemble_doc_examples/ensemble_torchscript_file.py new file mode 100644 index 000000000..819ed814f --- /dev/null +++ b/doc/tutorials/doc_examples/ensemble_doc_examples/ensemble_torchscript_file.py @@ -0,0 +1,13 @@ +from smartsim import Experiment + +# Initialize the Experiment and set the launcher to auto +exp = Experiment("getting-started", launcher="auto") + +# Initialize a RunSettings object +ensemble_settings = exp.create_run_settings(exe="path/to/example_simulation_program") + +# Initialize an Ensemble object +ensemble_instance = exp.create_ensemble("ensemble_name", ensemble_settings) + +# Attach TorchScript to Ensemble +ensemble_instance.add_script(name="example_script", script_path="path/to/torchscript.py", device="GPU", devices_per_node=2, first_device=0) \ No newline at end of file diff --git a/doc/tutorials/doc_examples/ensemble_doc_examples/ensemble_torchscript_mem.py b/doc/tutorials/doc_examples/ensemble_doc_examples/ensemble_torchscript_mem.py new file mode 100644 index 000000000..3e68bfd5a --- /dev/null +++ b/doc/tutorials/doc_examples/ensemble_doc_examples/ensemble_torchscript_mem.py @@ -0,0 +1,16 @@ +from smartsim import Experiment + +def timestwo(x): + return 2*x + +# Initialize the Experiment and set the launcher to auto +exp = Experiment("getting-started", launcher="auto") + +# Initialize a RunSettings object +ensemble_settings = exp.create_run_settings(exe="path/to/example_simulation_program") + +# Initialize an Ensemble object +ensemble_instance = exp.create_ensemble("ensemble_name", ensemble_settings) + +# Attach TorchScript to Ensemble +ensemble_instance.add_function(name="example_func", function=timestwo, device="GPU", devices_per_node=2, first_device=0) \ No newline at end of file diff --git a/doc/tutorials/doc_examples/ensemble_doc_examples/ensemble_torchscript_string.py b/doc/tutorials/doc_examples/ensemble_doc_examples/ensemble_torchscript_string.py new file mode 100644 index 000000000..b8f907e9a --- /dev/null +++
b/doc/tutorials/doc_examples/ensemble_doc_examples/ensemble_torchscript_string.py @@ -0,0 +1,16 @@ +from smartsim import Experiment + +# Initialize the Experiment and set the launcher to auto +exp = Experiment("getting-started", launcher="auto") + +# Initialize a RunSettings object +ensemble_settings = exp.create_run_settings(exe="path/to/executable/simulation") + +# Initialize an Ensemble object +ensemble_instance = exp.create_ensemble("ensemble_name", ensemble_settings) + +# TorchScript string +torch_script_str = "def negate(x):\n\treturn torch.neg(x)\n" + +# Attach TorchScript to Ensemble +ensemble_instance.add_script(name="example_script", script=torch_script_str, device="GPU", devices_per_node=2, first_device=0) \ No newline at end of file diff --git a/doc/tutorials/doc_examples/ensemble_doc_examples/experiment_driver.py b/doc/tutorials/doc_examples/ensemble_doc_examples/experiment_driver.py new file mode 100644 index 000000000..1a1db58e4 --- /dev/null +++ b/doc/tutorials/doc_examples/ensemble_doc_examples/experiment_driver.py @@ -0,0 +1,42 @@ +from smartsim import Experiment +from smartsim.log import get_logger + +logger = get_logger("Experiment Log") +# Initialize the Experiment +exp = Experiment("getting-started", launcher="auto") + +# Initialize a standalone Orchestrator +standalone_orch = exp.create_database(db_nodes=1) + +# Initialize a RunSettings object for Ensemble +ensemble_settings = exp.create_run_settings(exe="/path/to/executable_producer_simulation") + +# Initialize Ensemble +producer_ensemble = exp.create_ensemble("producer", run_settings=ensemble_settings, replicas=2) + +# Enable key prefixing for Ensemble members +producer_ensemble.enable_key_prefixing() + +# Initialize a RunSettings object for Model +model_settings = exp.create_run_settings(exe="/path/to/executable_consumer_simulation") +# Initialize Model +consumer_model = exp.create_model("consumer", model_settings) + +# Generate SmartSim entity folder tree +exp.generate(standalone_orch,
producer_ensemble, consumer_model, overwrite=True) + +# Launch Orchestrator +exp.start(standalone_orch, summary=True) + +# Launch Ensemble +exp.start(producer_ensemble, block=True, summary=True) + +# Register Ensemble members on consumer Model +for model in producer_ensemble: + consumer_model.register_incoming_entity(model) + +# Launch consumer Model +exp.start(consumer_model, block=True, summary=True) + +# Clobber Orchestrator +exp.stop(standalone_orch) \ No newline at end of file diff --git a/doc/tutorials/doc_examples/ensemble_doc_examples/file_attach.py b/doc/tutorials/doc_examples/ensemble_doc_examples/file_attach.py new file mode 100644 index 000000000..68f233342 --- /dev/null +++ b/doc/tutorials/doc_examples/ensemble_doc_examples/file_attach.py @@ -0,0 +1,20 @@ +from smartsim import Experiment + +# Initialize the Experiment +exp = Experiment("getting-started", launcher="auto") + +# Initialize a RunSettings object +ensemble_settings = exp.create_run_settings(exe="python", exe_args="/path/to/application.py") + +# Initialize an Ensemble object via replicas strategy +example_ensemble = exp.create_ensemble("ensemble", ensemble_settings, replicas=2, params={"THERMO":1}) + +# Attach the file to the Ensemble instance +example_ensemble.attach_generator_files(to_configure="path/to/params_inputs.txt") + +# Generate the Ensemble directory +exp.generate(example_ensemble) + +# Launch the Ensemble +exp.start(example_ensemble) + diff --git a/doc/tutorials/doc_examples/ensemble_doc_examples/manual_append_ensemble.py b/doc/tutorials/doc_examples/ensemble_doc_examples/manual_append_ensemble.py new file mode 100644 index 000000000..89c9ea27e --- /dev/null +++ b/doc/tutorials/doc_examples/ensemble_doc_examples/manual_append_ensemble.py @@ -0,0 +1,25 @@ +from smartsim import Experiment + +# Initialize the Experiment and set the launcher to auto +exp = Experiment("getting-started", launcher="auto") + +# Initialize BatchSettings +bs = exp.create_batch_settings(nodes=10, + 
time="01:00:00") + +# Initialize Ensemble +ensemble = exp.create_ensemble("ensemble-append", batch_settings=bs) + +# Initialize RunSettings for Model 1 +srun_settings_1 = exp.create_run_settings(exe="python", exe_args="path/to/application_script_1.py") +# Initialize RunSettings for Model 2 +srun_settings_2 = exp.create_run_settings(exe="python", exe_args="path/to/application_script_2.py") +# Initialize Model 1 with RunSettings 1 +model_1 = exp.create_model(name="model_1", run_settings=srun_settings_1) +# Initialize Model 2 with RunSettings 2 +model_2 = exp.create_model(name="model_2", run_settings=srun_settings_2) + +# Add Model member to Ensemble +ensemble.add_model(model_1) +# Add Model member to Ensemble +ensemble.add_model(model_2) \ No newline at end of file diff --git a/doc/tutorials/doc_examples/ensemble_doc_examples/param_expansion_1.py b/doc/tutorials/doc_examples/ensemble_doc_examples/param_expansion_1.py new file mode 100644 index 000000000..6ccbce397 --- /dev/null +++ b/doc/tutorials/doc_examples/ensemble_doc_examples/param_expansion_1.py @@ -0,0 +1,16 @@ +from smartsim import Experiment + +# Initialize the Experiment and set the launcher to auto +exp = Experiment("getting-started", launcher="auto") + +# Initialize a RunSettings +rs = exp.create_run_settings(exe="path/to/example_simulation_program") + +#Create the parameters to expand to the Ensemble members +params = { + "name": ["Ellie", "John"], + "parameter": [2, 11] + } + +# Initialize the Ensemble by specifying RunSettings, the params and "all_perm" +ensemble = exp.create_ensemble("model_member", run_settings=rs, params=params, perm_strategy="all_perm") diff --git a/doc/tutorials/doc_examples/ensemble_doc_examples/param_expansion_2.py b/doc/tutorials/doc_examples/ensemble_doc_examples/param_expansion_2.py new file mode 100644 index 000000000..f6fb30967 --- /dev/null +++ b/doc/tutorials/doc_examples/ensemble_doc_examples/param_expansion_2.py @@ -0,0 +1,21 @@ +from smartsim import Experiment + +# Initialize
the Experiment and set the launcher to auto +exp = Experiment("getting-started", launcher="auto") + +# Initialize a BatchSettings +bs = exp.create_batch_settings(nodes=2, + time="10:00:00") + +# Initialize and configure RunSettings +rs = exp.create_run_settings(exe="python", exe_args="path/to/application_script.py") +rs.set_nodes(1) + +#Create the parameters to expand to the Ensemble members +params = { + "name": ["Ellie", "John"], + "parameter": [2, 11] + } + +# Initialize the Ensemble by specifying RunSettings, BatchSettings, the params and "step" +ensemble = exp.create_ensemble("ensemble", run_settings=rs, batch_settings=bs, params=params, perm_strategy="step") \ No newline at end of file diff --git a/doc/tutorials/doc_examples/ensemble_doc_examples/replicas_1.py b/doc/tutorials/doc_examples/ensemble_doc_examples/replicas_1.py new file mode 100644 index 000000000..0dd5d16f5 --- /dev/null +++ b/doc/tutorials/doc_examples/ensemble_doc_examples/replicas_1.py @@ -0,0 +1,10 @@ +from smartsim import Experiment + +# Initialize the Experiment and set the launcher to auto +exp = Experiment("getting-started", launcher="auto") + +# Initialize a RunSettings object +rs = exp.create_run_settings(exe="python", exe_args="path/to/application_script.py") + +# Initialize the Ensemble by specifying the number of replicas and RunSettings +ensemble = exp.create_ensemble("ensemble-replica", replicas=4, run_settings=rs) \ No newline at end of file diff --git a/doc/tutorials/doc_examples/ensemble_doc_examples/replicas_2.py b/doc/tutorials/doc_examples/ensemble_doc_examples/replicas_2.py new file mode 100644 index 000000000..e2363a5be --- /dev/null +++ b/doc/tutorials/doc_examples/ensemble_doc_examples/replicas_2.py @@ -0,0 +1,15 @@ +from smartsim import Experiment + +# Initialize the Experiment and set the launcher to auto +exp = Experiment("getting-started", launcher="auto") + +# Initialize a BatchSettings object +bs = exp.create_batch_settings(nodes=4, + time="10:00:00") + +# 
Initialize and configure a RunSettings object +rs = exp.create_run_settings(exe="python", exe_args="path/to/application_script.py") +rs.set_nodes(4) + +# Initialize an Ensemble +ensemble = exp.create_ensemble("ensemble-replica", replicas=4, run_settings=rs, batch_settings=bs) \ No newline at end of file diff --git a/doc/tutorials/doc_examples/experiment_doc_examples/exp.py b/doc/tutorials/doc_examples/experiment_doc_examples/exp.py new file mode 100644 index 000000000..7a36262be --- /dev/null +++ b/doc/tutorials/doc_examples/experiment_doc_examples/exp.py @@ -0,0 +1,26 @@ +from smartsim import Experiment +from smartsim.log import get_logger + +# Initialize an Experiment +exp = Experiment("example-experiment", launcher="auto") +# Initialize a SmartSim logger +smartsim_logger = get_logger("logger") + +# Initialize an Orchestrator +standalone_database = exp.create_database(db_nodes=3, port=6379, interface="ib0") + +# Initialize the Model RunSettings +settings = exp.create_run_settings("echo", exe_args="Hello World") +# Initialize the Model +model = exp.create_model("hello_world", settings) + +# Generate the output directory +exp.generate(standalone_database, model, overwrite=True) + +# Launch the Orchestrator then Model instance +exp.start(standalone_database, model) + +# Clobber the Orchestrator +exp.stop(standalone_database) +# Log the summary of the Experiment +smartsim_logger.info(exp.summary()) \ No newline at end of file diff --git a/doc/tutorials/doc_examples/model_doc_examples/from_file_ml_model.py b/doc/tutorials/doc_examples/model_doc_examples/from_file_ml_model.py new file mode 100644 index 000000000..329d08edc --- /dev/null +++ b/doc/tutorials/doc_examples/model_doc_examples/from_file_ml_model.py @@ -0,0 +1,40 @@ +from smartsim import Experiment +from tensorflow import keras +from tensorflow.keras.layers import Conv2D, Input + +class Net(keras.Model): + def __init__(self): + super(Net, self).__init__(name="cnn") + self.conv = Conv2D(1, 3, 1) + + def 
call(self, x): + y = self.conv(x) + return y + +def save_tf_cnn(path, file_name): + """Create a Keras CNN and save to file for example purposes""" + from smartsim.ml.tf import freeze_model + + n = Net() + input_shape = (3, 3, 1) + n.build(input_shape=(None, *input_shape)) + inputs = Input(input_shape) + outputs = n(inputs) + model = keras.Model(inputs=inputs, outputs=outputs, name=n.name) + + return freeze_model(model, path, file_name) + +# Initialize the Experiment and set the launcher to auto +exp = Experiment("getting-started", launcher="auto") + +# Initialize a RunSettings object +model_settings = exp.create_run_settings(exe="path/to/example_simulation_program") + +# Initialize a Model object +model_instance = exp.create_model("model_name", model_settings) + +# Get and save TF model +model_file, inputs, outputs = save_tf_cnn(model_instance.path, "model.pb") + +# Attach the from file ML model to the SmartSim Model +model_instance.add_ml_model(name="cnn", backend="TF", model_path=model_file, device="GPU", devices_per_node=2, first_device=0, inputs=inputs, outputs=outputs) \ No newline at end of file diff --git a/doc/tutorials/doc_examples/model_doc_examples/from_file_script.py b/doc/tutorials/doc_examples/model_doc_examples/from_file_script.py new file mode 100644 index 000000000..ca6dcaea1 --- /dev/null +++ b/doc/tutorials/doc_examples/model_doc_examples/from_file_script.py @@ -0,0 +1,14 @@ + +from smartsim import Experiment + +# Initialize the Experiment and set the launcher to auto +exp = Experiment("getting-started", launcher="auto") + +# Initialize a RunSettings object +model_settings = exp.create_run_settings(exe="path/to/example_simulation_program") + +# Initialize a Model object +model_instance = exp.create_model("model_name", model_settings) + +# Attach TorchScript to Model +model_instance.add_script(name="example_script", script_path="path/to/torchscript.py", device="GPU", devices_per_node=2, first_device=0) \ No newline at end of file diff --git 
a/doc/tutorials/doc_examples/model_doc_examples/in_mem_ml_model.py b/doc/tutorials/doc_examples/model_doc_examples/in_mem_ml_model.py new file mode 100644 index 000000000..a34cceb4a --- /dev/null +++ b/doc/tutorials/doc_examples/model_doc_examples/in_mem_ml_model.py @@ -0,0 +1,40 @@ +from smartsim import Experiment +from tensorflow import keras +from tensorflow.keras.layers import Conv2D, Input + +class Net(keras.Model): + def __init__(self): + super(Net, self).__init__(name="cnn") + self.conv = Conv2D(1, 3, 1) + + def call(self, x): + y = self.conv(x) + return y + +def create_tf_cnn(): + """Create an in-memory Keras CNN for example purposes + + """ + from smartsim.ml.tf import serialize_model + n = Net() + input_shape = (3,3,1) + inputs = Input(input_shape) + outputs = n(inputs) + model = keras.Model(inputs=inputs, outputs=outputs, name=n.name) + + return serialize_model(model) + +# Serialize and save TF model +model, inputs, outputs = create_tf_cnn() + +# Initialize the Experiment and set the launcher to auto +exp = Experiment("getting-started", launcher="auto") + +# Initialize a RunSettings object +model_settings = exp.create_run_settings(exe="path/to/example_simulation_program") + +# Initialize a Model object +model_instance = exp.create_model("model_name", model_settings) + +# Attach the in-memory ML model to the SmartSim Model +model_instance.add_ml_model(name="cnn", backend="TF", model=model, device="GPU", devices_per_node=2, first_device=0, inputs=inputs, outputs=outputs) \ No newline at end of file diff --git a/doc/tutorials/doc_examples/model_doc_examples/in_mem_script.py b/doc/tutorials/doc_examples/model_doc_examples/in_mem_script.py new file mode 100644 index 000000000..634746085 --- /dev/null +++ b/doc/tutorials/doc_examples/model_doc_examples/in_mem_script.py @@ -0,0 +1,16 @@ +from smartsim import Experiment + +def timestwo(x): + return 2*x + +# Initialize the Experiment and set the launcher to auto +exp = Experiment("getting-started", 
launcher="auto") + +# Initialize a RunSettings object +model_settings = exp.create_run_settings(exe="path/to/example_simulation_program") + +# Initialize a Model object +model_instance = exp.create_model("model_name", model_settings) + +# Append TorchScript function to Model +model_instance.add_function(name="example_func", function=timestwo, device="GPU", devices_per_node=2, first_device=0) \ No newline at end of file diff --git a/doc/tutorials/doc_examples/model_doc_examples/model_file.py b/doc/tutorials/doc_examples/model_doc_examples/model_file.py new file mode 100644 index 000000000..8961d50a8 --- /dev/null +++ b/doc/tutorials/doc_examples/model_doc_examples/model_file.py @@ -0,0 +1,19 @@ +from smartsim import Experiment + +# Initialize the Experiment and set the launcher to auto +exp = Experiment("getting-started", launcher="auto") + +# Initialize a RunSettings object +model_settings = exp.create_run_settings(exe="path/to/executable/simulation") + +# Initialize a Model object +model_instance = exp.create_model("model_name", model_settings, params={"THERMO":1}) + +# Attach the file to the Model instance +model_instance.attach_generator_files(to_configure="path/to/params_inputs.txt") + +# Store model_instance outputs within the Experiment directory named getting-started +exp.generate(model_instance) + +# Launch the Model +exp.start(model_instance) \ No newline at end of file diff --git a/doc/tutorials/doc_examples/model_doc_examples/model_init.py b/doc/tutorials/doc_examples/model_doc_examples/model_init.py new file mode 100644 index 000000000..b1bb090f4 --- /dev/null +++ b/doc/tutorials/doc_examples/model_doc_examples/model_init.py @@ -0,0 +1,16 @@ +from smartsim import Experiment + +# Init Experiment and specify to launch locally in this example +exp = Experiment(name="getting-started", launcher="local") + +# Initialize RunSettings +model_settings = exp.create_run_settings(exe="echo", exe_args="Hello World") + +# Initialize Model instance +model_instance = 
exp.create_model(name="example-model", run_settings=model_settings) + +# Generate Model directory +exp.generate(model_instance) + +# Launch Model +exp.start(model_instance) \ No newline at end of file diff --git a/doc/tutorials/doc_examples/model_doc_examples/prefix_data.py b/doc/tutorials/doc_examples/model_doc_examples/prefix_data.py new file mode 100644 index 000000000..da4034d82 --- /dev/null +++ b/doc/tutorials/doc_examples/model_doc_examples/prefix_data.py @@ -0,0 +1,12 @@ +from smartsim import Experiment + +# Initialize the Experiment and set the launcher to auto +exp = Experiment("getting-started", launcher="auto") + +# Create the run settings for the Model +model_settings = exp.create_run_settings(exe="path/to/executable/simulation") + +# Create a Model instance named 'model' +model = exp.create_model("model_name", model_settings) +# Enable tensor, Dataset and list prefixing on the 'model' instance +model.enable_key_prefixing() \ No newline at end of file diff --git a/doc/tutorials/doc_examples/model_doc_examples/string_script.py b/doc/tutorials/doc_examples/model_doc_examples/string_script.py new file mode 100644 index 000000000..52495ab47 --- /dev/null +++ b/doc/tutorials/doc_examples/model_doc_examples/string_script.py @@ -0,0 +1,16 @@ +from smartsim import Experiment + +# Initialize the Experiment and set the launcher to auto +exp = Experiment("getting-started", launcher="auto") + +# Initialize a RunSettings object +model_settings = exp.create_run_settings(exe="path/to/executable/simulation") + +# Initialize a Model object +model_instance = exp.create_model("model_name", model_settings) + +# TorchScript string +torch_script_str = "def negate(x):\n\treturn torch.neg(x)\n" + +# Attach TorchScript to Model +model_instance.add_script(name="example_script", script=torch_script_str, device="GPU", devices_per_node=2, first_device=0) \ No newline at end of file diff --git a/doc/tutorials/doc_examples/orch_examples/colo_app.py 
b/doc/tutorials/doc_examples/orch_examples/colo_app.py new file mode 100644 index 000000000..930789fab --- /dev/null +++ b/doc/tutorials/doc_examples/orch_examples/colo_app.py @@ -0,0 +1,15 @@ +from smartredis import Client, LLInfo +import numpy as np + +# Initialize a Client +colo_client = Client(cluster=False) + +# Create NumPy array +local_array = np.array([1, 2, 3, 4]) +# Store the NumPy tensor +colo_client.put_tensor("tensor_1", local_array) + +# Retrieve tensor from driver script +local_tensor = colo_client.get_tensor("tensor_1") +# Log tensor +colo_client.log_data(LLInfo, f"The colocated db tensor is: {local_tensor}") \ No newline at end of file diff --git a/doc/tutorials/doc_examples/orch_examples/colo_driver.py b/doc/tutorials/doc_examples/orch_examples/colo_driver.py new file mode 100644 index 000000000..fde06e9b7 --- /dev/null +++ b/doc/tutorials/doc_examples/orch_examples/colo_driver.py @@ -0,0 +1,29 @@ +import numpy as np +from smartredis import Client +from smartsim import Experiment +from smartsim.log import get_logger + +# Initialize a logger object +logger = get_logger("Example Experiment Log") +# Initialize the Experiment +exp = Experiment("getting-started", launcher="auto") + +# Initialize a RunSettings object +model_settings = exp.create_run_settings(exe="path/to/executable_simulation") +# Configure RunSettings object +model_settings.set_nodes(1) + +# Initialize a SmartSim Model +model = exp.create_model("colo_model", model_settings) + +# Colocate the Model +model.colocate_db_uds() + +# Generate output files +exp.generate(model) + +# Launch the colocated Model +exp.start(model, block=True, summary=True) + +# Log the Experiment summary +logger.info(exp.summary()) \ No newline at end of file diff --git a/doc/tutorials/doc_examples/orch_examples/std_app.py b/doc/tutorials/doc_examples/orch_examples/std_app.py new file mode 100644 index 000000000..67129fbf4 --- /dev/null +++ b/doc/tutorials/doc_examples/orch_examples/std_app.py @@ -0,0 +1,15 @@ 
+from smartredis import Client, LLInfo +import numpy as np + +# Initialize a SmartRedis Client +application_client = Client(cluster=True) + +# Retrieve the driver script tensor from Orchestrator +driver_script_tensor = application_client.get_tensor("tensor_1") +# Log the tensor +application_client.log_data(LLInfo, f"The multi-sharded db tensor is: {driver_script_tensor}") + +# Create a NumPy array +local_array = np.array([5, 6, 7, 8]) +# Use SmartRedis client to place tensor in multi-sharded db +application_client.put_tensor("tensor_2", local_array) diff --git a/doc/tutorials/doc_examples/orch_examples/std_driver.py b/doc/tutorials/doc_examples/orch_examples/std_driver.py new file mode 100644 index 000000000..cf425125b --- /dev/null +++ b/doc/tutorials/doc_examples/orch_examples/std_driver.py @@ -0,0 +1,46 @@ +import numpy as np +from smartredis import Client +from smartsim import Experiment +from smartsim.log import get_logger + +# Initialize the logger +logger = get_logger("Example Experiment Log") +# Initialize the Experiment +exp = Experiment("getting-started", launcher="auto") + +# Initialize a multi-sharded Orchestrator +standalone_orchestrator = exp.create_database(db_nodes=3) + +# Initialize a SmartRedis client for multi-sharded Orchestrator +driver_client = Client(cluster=True, address=standalone_orchestrator.get_address()[0]) + +# Create NumPy array +local_array = np.array([1, 2, 3, 4]) +# Use the SmartRedis client to place tensor in the standalone Orchestrator +driver_client.put_tensor("tensor_1", local_array) + +# Initialize a RunSettings object +model_settings = exp.create_run_settings(exe="/path/to/executable_simulation") +model_settings.set_nodes(1) + +# Initialize the Model +model = exp.create_model("model", model_settings) + +# Create the output directory +exp.generate(standalone_orchestrator, model) + +# Launch the multi-sharded Orchestrator +exp.start(standalone_orchestrator) + +# Launch the Model +exp.start(model, block=True, summary=True) + +# 
Poll the tensors placed by the Model +app_tensor = driver_client.poll_key("tensor_2", 100, 10) +# Validate that the tensor exists +logger.info(f"The tensor exists: {app_tensor}") + +# Cleanup the Orchestrator +exp.stop(standalone_orchestrator) +# Print the Experiment summary +logger.info(exp.summary()) \ No newline at end of file diff --git a/tutorials/getting_started/consumer.py b/doc/tutorials/getting_started/consumer.py similarity index 100% rename from tutorials/getting_started/consumer.py rename to doc/tutorials/getting_started/consumer.py diff --git a/tutorials/getting_started/getting_started.ipynb b/doc/tutorials/getting_started/getting_started.ipynb similarity index 100% rename from tutorials/getting_started/getting_started.ipynb rename to doc/tutorials/getting_started/getting_started.ipynb diff --git a/tutorials/getting_started/multi_db_example/application_script.py b/doc/tutorials/getting_started/multi_db_example/application_script.py similarity index 100% rename from tutorials/getting_started/multi_db_example/application_script.py rename to doc/tutorials/getting_started/multi_db_example/application_script.py diff --git a/tutorials/getting_started/multi_db_example/multidb_driver.py b/doc/tutorials/getting_started/multi_db_example/multidb_driver.py similarity index 100% rename from tutorials/getting_started/multi_db_example/multidb_driver.py rename to doc/tutorials/getting_started/multi_db_example/multidb_driver.py diff --git a/tutorials/getting_started/output_my_parameter.py b/doc/tutorials/getting_started/output_my_parameter.py similarity index 100% rename from tutorials/getting_started/output_my_parameter.py rename to doc/tutorials/getting_started/output_my_parameter.py diff --git a/tutorials/getting_started/output_my_parameter_new_tag.py b/doc/tutorials/getting_started/output_my_parameter_new_tag.py similarity index 100% rename from tutorials/getting_started/output_my_parameter_new_tag.py rename to 
doc/tutorials/getting_started/output_my_parameter_new_tag.py diff --git a/tutorials/getting_started/producer.py b/doc/tutorials/getting_started/producer.py similarity index 100% rename from tutorials/getting_started/producer.py rename to doc/tutorials/getting_started/producer.py diff --git a/tutorials/ml_inference/Inference-in-SmartSim.ipynb b/doc/tutorials/ml_inference/Inference-in-SmartSim.ipynb similarity index 100% rename from tutorials/ml_inference/Inference-in-SmartSim.ipynb rename to doc/tutorials/ml_inference/Inference-in-SmartSim.ipynb diff --git a/tutorials/ml_inference/colo-db-torch-example.py b/doc/tutorials/ml_inference/colo-db-torch-example.py similarity index 100% rename from tutorials/ml_inference/colo-db-torch-example.py rename to doc/tutorials/ml_inference/colo-db-torch-example.py diff --git a/tutorials/ml_training/surrogate/LICENSE b/doc/tutorials/ml_training/surrogate/LICENSE similarity index 100% rename from tutorials/ml_training/surrogate/LICENSE rename to doc/tutorials/ml_training/surrogate/LICENSE diff --git a/tutorials/ml_training/surrogate/README.md b/doc/tutorials/ml_training/surrogate/README.md similarity index 100% rename from tutorials/ml_training/surrogate/README.md rename to doc/tutorials/ml_training/surrogate/README.md diff --git a/tutorials/ml_training/surrogate/fd_sim.py b/doc/tutorials/ml_training/surrogate/fd_sim.py similarity index 100% rename from tutorials/ml_training/surrogate/fd_sim.py rename to doc/tutorials/ml_training/surrogate/fd_sim.py diff --git a/tutorials/ml_training/surrogate/steady_state.py b/doc/tutorials/ml_training/surrogate/steady_state.py similarity index 100% rename from tutorials/ml_training/surrogate/steady_state.py rename to doc/tutorials/ml_training/surrogate/steady_state.py diff --git a/tutorials/ml_training/surrogate/tf_model.py b/doc/tutorials/ml_training/surrogate/tf_model.py similarity index 100% rename from tutorials/ml_training/surrogate/tf_model.py rename to 
doc/tutorials/ml_training/surrogate/tf_model.py diff --git a/tutorials/ml_training/surrogate/tf_training.py b/doc/tutorials/ml_training/surrogate/tf_training.py similarity index 100% rename from tutorials/ml_training/surrogate/tf_training.py rename to doc/tutorials/ml_training/surrogate/tf_training.py diff --git a/tutorials/ml_training/surrogate/train_surrogate.ipynb b/doc/tutorials/ml_training/surrogate/train_surrogate.ipynb similarity index 100% rename from tutorials/ml_training/surrogate/train_surrogate.ipynb rename to doc/tutorials/ml_training/surrogate/train_surrogate.ipynb diff --git a/tutorials/ml_training/surrogate/vishelpers.py b/doc/tutorials/ml_training/surrogate/vishelpers.py similarity index 100% rename from tutorials/ml_training/surrogate/vishelpers.py rename to doc/tutorials/ml_training/surrogate/vishelpers.py diff --git a/tutorials/online_analysis/lattice/LICENSE b/doc/tutorials/online_analysis/lattice/LICENSE similarity index 100% rename from tutorials/online_analysis/lattice/LICENSE rename to doc/tutorials/online_analysis/lattice/LICENSE diff --git a/tutorials/online_analysis/lattice/README.md b/doc/tutorials/online_analysis/lattice/README.md similarity index 100% rename from tutorials/online_analysis/lattice/README.md rename to doc/tutorials/online_analysis/lattice/README.md diff --git a/tutorials/online_analysis/lattice/driver.py b/doc/tutorials/online_analysis/lattice/driver.py similarity index 100% rename from tutorials/online_analysis/lattice/driver.py rename to doc/tutorials/online_analysis/lattice/driver.py diff --git a/tutorials/online_analysis/lattice/fv_sim.py b/doc/tutorials/online_analysis/lattice/fv_sim.py similarity index 100% rename from tutorials/online_analysis/lattice/fv_sim.py rename to doc/tutorials/online_analysis/lattice/fv_sim.py diff --git a/tutorials/online_analysis/lattice/online_analysis.ipynb b/doc/tutorials/online_analysis/lattice/online_analysis.ipynb similarity index 100% rename from 
tutorials/online_analysis/lattice/online_analysis.ipynb rename to doc/tutorials/online_analysis/lattice/online_analysis.ipynb diff --git a/tutorials/online_analysis/lattice/probe.script b/doc/tutorials/online_analysis/lattice/probe.script similarity index 100% rename from tutorials/online_analysis/lattice/probe.script rename to doc/tutorials/online_analysis/lattice/probe.script diff --git a/tutorials/online_analysis/lattice/vishelpers.py b/doc/tutorials/online_analysis/lattice/vishelpers.py similarity index 100% rename from tutorials/online_analysis/lattice/vishelpers.py rename to doc/tutorials/online_analysis/lattice/vishelpers.py diff --git a/docker/docs/dev/Dockerfile b/docker/docs/dev/Dockerfile index eff99de36..48a9f4027 100644 --- a/docker/docs/dev/Dockerfile +++ b/docker/docs/dev/Dockerfile @@ -58,9 +58,4 @@ RUN git clone https://github.com/CrayLabs/SmartDashboard.git --branch develop -- RUN python -m pip install -r doc/requirements-doc.txt \ && NO_CHECKS=1 SMARTSIM_SUFFIX=dev python -m pip install . -RUN mkdir -p doc/tutorials/ \ - && cd doc/tutorials/ \ - && rm -rf * \ - && ln -s ../../tutorials/* . 
- RUN make docs diff --git a/smartsim/database/orchestrator.py b/smartsim/database/orchestrator.py index 431cb43c5..9cd436e55 100644 --- a/smartsim/database/orchestrator.py +++ b/smartsim/database/orchestrator.py @@ -174,11 +174,11 @@ def __init__( Extra configurations for RedisAI - See https://oss.redislabs.com/redisai/configuration/ + See https://oss.redis.com/redisai/configuration/ :param threads_per_queue: threads per GPU device :type threads_per_queue: int, optional - :param inter_op_threads: threads accross CPU operations + :param inter_op_threads: threads across CPU operations :type inter_op_threads: int, optional :param intra_op_threads: threads per CPU operation :type intra_op_threads: int, optional diff --git a/smartsim/experiment.py b/smartsim/experiment.py index 279128282..175997c96 100644 --- a/smartsim/experiment.py +++ b/smartsim/experiment.py @@ -835,8 +835,8 @@ def summary(self, style: str = "github") -> str: launched and completed in this ``Experiment`` :param style: the style in which the summary table is formatted, - for a full list of styles see: - https://github.com/astanin/python-tabulate#table-format, + for a full list of styles see the table-format section of: + https://github.com/astanin/python-tabulate, defaults to "github" :type style: str, optional :return: tabulate string of ``Experiment`` history