Commit 3d90e56

Update on "Add gradcheck for forward AD by default and functional API"
RFC: pytorch/rfcs#11

This PR adds the option to check forward grads using gradcheck. The current logic is:
- Forward grad is always checked.
- If the forward evaluation fails because an op is not implemented, the test silently passes.

The goal is to make sure that every formula that is added is properly tested, without having to add a new test for each op.

The final logic, after the next PR that adds the remaining formulas, is going to be:
- Forward grad is always checked.
- A failure caused by a not-implemented op is an actual failure.
- Users should set `check_forward=False` if they explicitly don't want to test forward grads (which should not be the case internally).

[ghstack-poisoned]
2 parents b9b7442 + dc852d0 commit 3d90e56
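
For context, here is a minimal sketch of the gradcheck flow the commit message describes. It only uses the long-standing gradcheck(fn, inputs) call; the opt-out keyword is spelled `check_forward` in the message above and appears only in a comment here, since the exact keyword name in released PyTorch may differ.

import torch
from torch.autograd import gradcheck

# gradcheck requires double-precision inputs.
x = torch.randn(4, dtype=torch.double, requires_grad=True)

# With this stack, the same call also exercises the forward-mode (JVP) formulas.
# Per the commit message, passing check_forward=False would opt out of that check
# (keyword name taken from the message, not verified against a released API).
assert gradcheck(torch.sin, (x,))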

File tree: 6 files changed, +92 −17 lines


torch/csrc/autograd/VariableTypeUtils.h

Lines changed: 19 additions & 2 deletions

@@ -137,6 +137,22 @@ template<typename... Args> inline variable_list flatten_tensor_args(Args&&... ar
 inline Tensor as_view(const Tensor & base, const Tensor & tensor, bool is_bw_differentiable,
     bool is_fw_differentiable, c10::optional<std::function<Tensor(const Tensor&)>> view_func=c10::nullopt,
     CreationMeta creation_meta=CreationMeta::DEFAULT, bool allow_tensor_metadata_change=true) {
+  if (!isForwardADEnabled()) {
+    // Fast codepath for backward only code
+    if (is_bw_differentiable) {
+      if (base.is_view()) {
+        auto diff_view_meta = static_cast<DifferentiableViewMeta*>(torch::autograd::impl::get_autograd_meta(base));
+        auto base_bw_info = diff_view_meta->get_backward_view();
+        return make_variable_differentiable_view(std::move(tensor), base_bw_info.chain(base, tensor, view_func),
+                                                 c10::nullopt, creation_meta, allow_tensor_metadata_change);
+      } else {
+        return make_variable_differentiable_view(std::move(tensor), ViewInfo(base, view_func),
+                                                 c10::nullopt, creation_meta, allow_tensor_metadata_change);
+      }
+    } else {
+      return make_variable_non_differentiable_view(base, std::move(tensor), allow_tensor_metadata_change);
+    }
+  }
   // Create both the forward and backward info that are needed
   c10::optional<ViewInfo> new_bw_info = c10::nullopt;
   c10::optional<ViewInfo> new_fw_info = c10::nullopt;
@@ -167,7 +183,8 @@ inline Tensor as_view(const Tensor & base, const Tensor & tensor, bool is_bw_dif
   }

   if (is_fw_differentiable || is_bw_differentiable) {
-    return make_variable_differentiable_view(std::move(tensor), new_bw_info, new_fw_info, creation_meta, allow_tensor_metadata_change);
+    return make_variable_differentiable_view(std::move(tensor), std::move(new_bw_info), std::move(new_fw_info),
+                                             creation_meta, allow_tensor_metadata_change);
   } else {
     return make_variable_non_differentiable_view(base, std::move(tensor), allow_tensor_metadata_change);
   }
@@ -195,7 +212,7 @@ inline std::vector<Tensor> as_view(const Tensor & base, std::vector<Tensor>& ten
   TORCH_CHECK(creation_meta == CreationMeta::DEFAULT,
       "Non-backward differentiable views must have creation_meta=CreationMeta::DEFAULT");
   }
-  if (is_fw_differentiable) {
+  if (isForwardADEnabled() && is_fw_differentiable) {
     // Check if base is a forward differentiabble view
     auto is_view = torch::autograd::impl::get_autograd_meta(base) && torch::autograd::impl::get_autograd_meta(base)->is_view_;
     if (is_view && static_cast<DifferentiableViewMeta*>(torch::autograd::impl::get_autograd_meta(base))->has_fw_view()) {

torch/csrc/autograd/forward_grad.cpp

Lines changed: 18 additions & 0 deletions

@@ -12,6 +12,10 @@ namespace {

 const static at::Tensor singleton_undefined_tensor;

+// Temporary flag to disable forward mode
+// TODO(alband) remove these when perf issues are solved
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
+static bool is_forward_grad_enabled = false;
 }

 uint64_t ForwardADLevel::get_next_idx() {
@@ -55,4 +59,18 @@ const at::Tensor& ForwardGrad::value(uint64_t level) const {
   return it == content_.end() ? singleton_undefined_tensor : (*it).second;
 }

+const at::Tensor& ForwardGrad::undef_grad() {
+  return singleton_undefined_tensor;
+}
+
+// Temporary functions to disable forward AD
+// TODO(alband) remove these when perf issues are solved
+bool isForwardADEnabled() {
+  return is_forward_grad_enabled;
+}
+
+void setForwardADEnabled(bool value) {
+  is_forward_grad_enabled = value;
+}
+
 }} // namespace torch::autograd

torch/csrc/autograd/forward_grad.h

Lines changed: 7 additions & 0 deletions

@@ -91,11 +91,18 @@ struct TORCH_API ForwardGrad : std::enable_shared_from_this<ForwardGrad> {
     return content_.empty();
   }

+  static const at::Tensor& undef_grad();
+

 private:
   std::unordered_map<uint64_t, at::Tensor> content_;
   mutable std::mutex mutex_;

 };

+// Temporary functions to disable forward AD
+// TODO(alband) remove these when perf issues are solved
+bool TORCH_API isForwardADEnabled();
+void TORCH_API setForwardADEnabled(bool value);
+
 }} // namespace torch::autograd

torch/csrc/autograd/init.cpp

Lines changed: 22 additions & 0 deletions

@@ -235,6 +235,26 @@ static PyObject * autocast_decrement_nesting(PyObject* _unused, PyObject *arg) {
   END_HANDLE_TH_ERRORS
 }

+static PyObject * set_forward_AD_enabled(PyObject* _unused, PyObject *arg) {
+  HANDLE_TH_ERRORS
+  if (!PyBool_Check(arg)) {
+    throw TypeError("enabled must be a bool (got %s)", Py_TYPE(arg)->tp_name);
+  }
+  setForwardADEnabled(arg == Py_True);
+  Py_RETURN_NONE;
+  END_HANDLE_TH_ERRORS
+}
+
+static PyObject * is_forward_AD_enabled(PyObject* _unused, PyObject *arg) {
+  HANDLE_TH_ERRORS
+  if (isForwardADEnabled()) {
+    Py_RETURN_TRUE;
+  } else {
+    Py_RETURN_FALSE;
+  }
+  END_HANDLE_TH_ERRORS
+}
+
 static PyObject * set_grad_enabled(PyObject* _unused, PyObject *arg) {
   HANDLE_TH_ERRORS
   if (!PyBool_Check(arg)) {
@@ -327,6 +347,8 @@ static PyObject * python_unpack_dual(PyObject* _unused, PyObject* args, PyObject
 static PyMethodDef methods[] = { // NOLINT
   {"_set_grad_enabled", set_grad_enabled, METH_O, nullptr},
   {"is_grad_enabled", is_grad_enabled, METH_NOARGS, nullptr},
+  {"_set_forward_AD_enabled", set_forward_AD_enabled, METH_O, nullptr},
+  {"_is_forward_AD_enabled", is_forward_AD_enabled, METH_NOARGS, nullptr},
   {"set_autocast_enabled", set_autocast_enabled, METH_O, nullptr},
   {"is_autocast_enabled", is_autocast_enabled, METH_NOARGS, nullptr},
   {"clear_autocast_cache", clear_autocast_cache, METH_NOARGS, nullptr},

torch/csrc/autograd/variable.cpp

Lines changed: 20 additions & 10 deletions

@@ -31,9 +31,10 @@ DifferentiableViewMeta::DifferentiableViewMeta(at::TensorImpl* self_impl,
     c10::optional<ViewInfo> backward_info,
     c10::optional<ViewInfo> forward_info,
     CreationMeta creation_meta)
-    : AutogradMeta(self_impl), creation_meta(creation_meta),
+    : AutogradMeta(self_impl),
       backward_info_(std::move(backward_info)),
-      forward_info_(std::move(forward_info)) {
+      forward_info_(std::move(forward_info)),
+      creation_meta(creation_meta) {
   is_view_ = true;
   if (backward_info_.has_value()) {
     self_impl->set_version_counter(impl::version_counter(backward_info_.value().base_));
@@ -594,6 +595,10 @@ namespace {
 // This function is will ensure that the fw_grad_ is properly a view of the base for inplace ops on
 // Tensors that do not have forward grad originally.
 void AutogradMeta::set_fw_grad(Variable& new_grad, const Variable& self, uint64_t level, bool is_inplace_op) {
+  if (!fw_grad_) {
+    // Lazy initialization
+    fw_grad_ = std::make_shared<ForwardGrad>();
+  }
   if (fw_grad_->contains(level)) {
     // Setting the forward grad again is only allowed if it is a no-op.
     // We do allow this case to simplify writing codegen for inplace ops.
@@ -652,33 +657,38 @@ void AutogradMeta::set_fw_grad(Variable& new_grad, const Variable& self, uint64_
 }

 const Variable& AutogradMeta::fw_grad(uint64_t level, const Variable& self) const {
-  const auto& val = fw_grad_->value(level);
-  if (!val.defined() && is_view_) {
+  bool has_no_direct_fw_grad = !(fw_grad_ && fw_grad_->value(level).defined());
+  if (has_no_direct_fw_grad && is_view_) {
     // For view that don't have a forward grad, check if their base has one that
     // has been defined by an inplace operation.
     // See [Forward Grad View] for more details.
-    const auto this_view_meta = static_cast<const DifferentiableViewMeta*>(this);
+    auto this_view_meta = static_cast<torch::autograd::DifferentiableViewMeta*>(torch::autograd::impl::get_autograd_meta(self));
     if (this_view_meta->has_fw_view()) {
       auto view_info = this_view_meta->get_forward_view();
       const auto& base = view_info.base_;

       const auto& base_val = base.fw_grad(level);
       if (base_val.defined()) {
+        // Lazy initialization
+        this_view_meta->fw_grad_ = std::make_shared<ForwardGrad>();
+
         Variable new_val;
         if (view_info.has_view_fn()) {
           new_val = view_info.view_fn()(base_val);
         } else {
           new_val = base_val.as_strided(self.sizes(), self.strides(), self.storage_offset());
         }

-        fw_grad_->set_value(new_val, level);
-        return fw_grad_->value(level);
-      } else {
-        return val;
+        this_view_meta->fw_grad_->set_value(new_val, level);
+        return this_view_meta->fw_grad_->value(level);
       }
     }
   }
-  return val;
+  if (fw_grad_) {
+    return fw_grad_->value(level);
+  } else {
+    return ForwardGrad::undef_grad();
+  }
 }

 }} // namespace torch::autograd
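
The lazy fw_grad_ handling above is what backs the user-facing dual-number API. Below is a small illustration, not part of this diff, written against the public helpers in torch.autograd.forward_ad as they exist in released PyTorch builds; on this exact commit the forward formula for sin may still be missing and the temporary flag defaults to off, per the commit message.

import torch
import torch.autograd.forward_ad as fwAD

primal = torch.randn(3, dtype=torch.double)
tangent = torch.ones(3, dtype=torch.double)

with fwAD.dual_level():
    # Attaching a tangent is what populates fw_grad_ (lazily, after this change).
    dual = fwAD.make_dual(primal, tangent)
    out = torch.sin(dual)
    # unpack_dual returns (primal, tangent); the tangent is the JVP: cos(primal) * tangent.
    _, jvp = fwAD.unpack_dual(out)
    assert torch.allclose(jvp, torch.cos(primal) * tangent)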

torch/csrc/autograd/variable.h

Lines changed: 6 additions & 5 deletions

@@ -191,10 +191,12 @@ struct TORCH_API AutogradMeta : public c10::AutogradMetaInterface {
   std::string name_;

   Variable grad_;
-  std::shared_ptr<ForwardGrad> fw_grad_;
   std::shared_ptr<Node> grad_fn_;
   std::weak_ptr<Node> grad_accumulator_;

+  // This field is lazily initialized
+  std::shared_ptr<ForwardGrad> fw_grad_;
+
   std::vector<std::shared_ptr<FunctionPreHook>> hooks_;
   std::shared_ptr<hooks_list> cpp_hooks_list;

@@ -250,7 +252,6 @@ struct TORCH_API AutogradMeta : public c10::AutogradMetaInterface {
     retains_grad_ = false;
     is_view_ = false;
     output_nr_ = gradient_edge.input_nr;
-    fw_grad_ = std::make_shared<ForwardGrad>();

     // set_requires_grad also checks error conditions.
     if (requires_grad) {
@@ -295,9 +296,9 @@ struct TORCH_API ViewInfo {
   ViewInfo chain(const Variable & base, const Variable & tensor,
     c10::optional<std::function<Variable(const Variable&)>> view_func=c10::nullopt);

-  ViewInfo(Variable base, c10::optional<std::function<Variable(const Variable&)>> view_fn) {
-    base_ = std::move(base);
-    view_fn_ = std::move(view_fn);
+  ViewInfo(Variable base, c10::optional<std::function<Variable(const Variable&)>> view_fn) :
+    base_(std::move(base)),
+    view_fn_(std::move(view_fn)) {
     TORCH_CHECK(base_.defined(), "base is undefined");
   }
 };
