From 6d6f1da96005a54cd4bfeeb5075f5f7b6ac910d1 Mon Sep 17 00:00:00 2001
From: Geoffrey Goh <geoffreygoh@fb.com>
Date: Thu, 15 Aug 2019 18:29:20 -0700
Subject: [PATCH] Optimize Gelu operator for caffe2 export

Summary:
TIL the ONNX->Caffe2 conversion is very memory inefficient: it creates an intermediate blob for each intermediate output. As a result, the GeLU module produces a lot of intermediate ops (and blobs) in the exported model, since its forward pass is a chain of math operations.

The fix is to call the Caffe2 Gelu operator directly when running under ONNX export, so all of that computation is captured in a single op.

https://pxl.cl/HzGf

Differential Revision: D16849396

fbshipit-source-id: 4903c614833ae4ad8a84c6eddc2382b2a24872f3
---
 pytext/optimizer/activations.py | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/pytext/optimizer/activations.py b/pytext/optimizer/activations.py
index 3c9799d77..8cce4486d 100644
--- a/pytext/optimizer/activations.py
+++ b/pytext/optimizer/activations.py
@@ -20,11 +20,21 @@ class GeLU(nn.Module):
     """
 
     def forward(self, x):
-        return (
-            0.5
-            * x
-            * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * (x * x * x))))
-        )
+        if torch.onnx.is_in_onnx_export():
+            # ONNX -> Caffe2 conversion will create an intermediate blob for
+            # each intermediate math output, which is very memory inefficient.
+            # We use the Gelu operator directly to reduce the memory footprint
+            # in the exported model.
+            return torch.ops._caffe2.Gelu(x, True)
+        else:
+            return (
+                0.5
+                * x
+                * (
+                    1
+                    + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * (x * x * x)))
+                )
+            )
 
 
 def get_activation(name):