Merged
Changes from 13 commits
4 changes: 3 additions & 1 deletion examples/start_here/chatbot_example.py
@@ -1,5 +1,4 @@
import os
import sys
import gzip
import shutil
import nemo
@@ -32,6 +31,9 @@

# instantiate neural factory
nf = nemo.core.NeuralModuleFactory()
# To use CPU-only do:
# from nemo.core import DeviceType
# nf = nemo.core.NeuralModuleFactory(placement=DeviceType.CPU)

# instantiate neural modules
dl = nemo.tutorials.DialogDataLayer(**config)
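As a quick aside (not part of this diff): instead of editing the commented-out lines by hand, the example scripts could pick the placement automatically. A minimal sketch, assuming the `DeviceType` enum referenced in the comments also exposes a `GPU` member:

```python
import torch
import nemo
from nemo.core import DeviceType

# Fall back to CPU-only execution when no CUDA device is visible.
# CPU-only mode is slow and intended mainly for trying the examples out.
placement = DeviceType.GPU if torch.cuda.is_available() else DeviceType.CPU
nf = nemo.core.NeuralModuleFactory(placement=placement)
```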
5 changes: 3 additions & 2 deletions examples/start_here/simplest_example.py
@@ -1,8 +1,9 @@
# Copyright (c) 2019 NVIDIA Corporation
import nemo

# instantiate Neural Factory with supported backend
nf = nemo.core.NeuralModuleFactory()
# To use CPU-only do:
# from nemo.core import DeviceType
# nf = nemo.core.NeuralModuleFactory(placement=DeviceType.CPU)

# instantiate necessary neural modules
# RealFunctionDataLayer defaults to f=torch.sin, sampling from x=[-4, 4]
57 changes: 41 additions & 16 deletions nemo/nemo/backends/pytorch/actions.py
@@ -1,4 +1,5 @@
# Copyright (c) 2019 NVIDIA Corporation
import importlib
import itertools
import logging
import os
@@ -21,14 +22,10 @@
from ...core.neural_factory import Actions, ModelMode, Optimization
from ...utils.helpers import get_checkpoint_from_dir

try:
import apex
from apex.parallel import DistributedDataParallel as DDP
from apex.parallel.LARC import LARC
from apex import amp
except ImportError:
raise ImportError(
"Please install apex from https://www.github.com/nvidia/apex")
# these imports will happen on an as-needed basis
amp = None
DDP = None
LARC = None

AmpOptimizations = {
Optimization.mxprO0: "O0",
@@ -45,6 +42,28 @@
class PtActions(Actions):
def __init__(self, local_rank=None, tb_writer=None,
optimization_level=Optimization.mxprO0):
need_apex = local_rank is not None or \
optimization_level != Optimization.mxprO0
if need_apex:
try:
apex = importlib.import_module('apex')
if optimization_level != Optimization.mxprO0:
global amp
amp = importlib.import_module('apex.amp')
if local_rank is not None:
global DDP
global LARC
parallel = importlib.import_module('apex.parallel')
DDP = parallel.DistributedDataParallel
LARC = importlib.import_module('apex.parallel.LARC').LARC

except ImportError:
raise ImportError(
"NVIDIA Apex is necessary for distributed training and "
"mixed precision training. It only works on GPUs. "
"Please install Apex from "
"https://www.github.com/nvidia/apex")

super(PtActions, self).__init__(
local_rank=local_rank,
optimization_level=optimization_level)
@@ -340,8 +359,12 @@ def __initialize_amp(
self, optimizer, optim_level, amp_min_loss_scale=1.0
):
if optim_level not in AmpOptimizations:
raise ValueError("__initialize_amp() was called but optim_level "
"was set to float32.")
raise ValueError(f"__initialize_amp() was called with unknown "
"optim_level={optim_level}")
# in this case, nothing to do here
if optim_level == Optimization.mxprO0:
return optimizer

if len(self.modules) < 1:
raise ValueError("There were no modules to initialize")
pt_modules = []
@@ -371,11 +394,12 @@ def __nm_graph_forward_pass(self,
m_id = call_chain[ind][0].unique_instance_id
pmodule = self.module_reference_table[m_id][1]

if isinstance(pmodule, DDP):
if disable_allreduce:
pmodule.disable_allreduce()
else:
pmodule.enable_allreduce()
if self._local_rank is not None:
if isinstance(pmodule, DDP):
if disable_allreduce:
pmodule.disable_allreduce()
else:
pmodule.enable_allreduce()

if mode == ModelMode.train:
# if module.is_trainable():
@@ -1164,7 +1188,8 @@ def train(self,
final_loss += registered_tensors[tensor.unique_name]
if nan:
continue
if self._optim_level in AmpOptimizations:
if self._optim_level in AmpOptimizations \
and self._optim_level != Optimization.mxprO0:
with amp.scale_loss(
final_loss,
curr_optimizer,
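The deferred-import pattern introduced above (module-level placeholders rebound via `importlib` only when Apex is actually needed) can be shown in isolation. A generic sketch using a hypothetical `heavy_dep` package, not NeMo code:

```python
import importlib

# Module-level placeholder, analogous to amp/DDP/LARC above; it stays None
# until the optional dependency is genuinely required.
heavy_dep = None


def enable_optional_feature():
    """Import the optional dependency on first use, with a clear error message."""
    global heavy_dep
    if heavy_dep is None:
        try:
            # 'heavy_dep' is a hypothetical package name used only for illustration.
            heavy_dep = importlib.import_module('heavy_dep')
        except ImportError:
            raise ImportError(
                "The optional feature requires the 'heavy_dep' package; "
                "install it or run without this feature.")
    return heavy_dep
```

Callers that never touch the optional feature never pay the import cost, and a missing dependency only fails at the point where it would actually be used.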
10 changes: 10 additions & 0 deletions nemo/nemo/core/neural_factory.py
@@ -292,6 +292,16 @@ def __init__(
if backend == Backend.PyTorch:
# TODO: Move all framework specific code from this file
import torch
if self._placement != DeviceType.CPU:
if not torch.cuda.is_available():
raise ValueError("You requested to use GPUs but CUDA is "
"not installed. You can try running using"
" CPU-only. To do this, instantiate your"
" factory with placement=DeviceType.CPU"
"\n"
"Note that this is slow and is not "
"well supported.")

torch.backends.cudnn.benchmark = cudnn_benchmark
if random_seed is not None and cudnn_benchmark:
raise ValueError("cudnn_benchmark can not be set to True"
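A rough usage sketch of the new check (assuming the constructor behaviour added in this diff, where the default placement expects a GPU): on a machine without CUDA the default factory now fails fast with a ValueError, while an explicit CPU placement keeps working.

```python
import torch
import nemo
from nemo.core import DeviceType

if not torch.cuda.is_available():
    try:
        nf = nemo.core.NeuralModuleFactory()  # default placement wants a GPU
    except ValueError as err:
        print(f"GPU placement rejected: {err}")
    # Explicit CPU placement still works (slow, but useful for smoke tests).
    nf = nemo.core.NeuralModuleFactory(placement=DeviceType.CPU)
else:
    nf = nemo.core.NeuralModuleFactory()
```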