From 6d6f1da96005a54cd4bfeeb5075f5f7b6ac910d1 Mon Sep 17 00:00:00 2001 From: Geoffrey Goh Date: Thu, 15 Aug 2019 18:29:20 -0700 Subject: [PATCH] Optimize Gelu operator for caffe2 export Summary: The ONNX->Caffe2 conversion is very memory inefficient: it creates an intermediate blob for each intermediate output. Because the Gelu activation is computed with a chain of math operations, exporting it creates a lot of intermediate ops and blobs. The fix is to use the Caffe2 Gelu operator during ONNX export, so that all of that computation is captured in a single op. https://pxl.cl/HzGf Differential Revision: D16849396 fbshipit-source-id: 4903c614833ae4ad8a84c6eddc2382b2a24872f3 --- pytext/optimizer/activations.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/pytext/optimizer/activations.py b/pytext/optimizer/activations.py index 3c9799d77..8cce4486d 100644 --- a/pytext/optimizer/activations.py +++ b/pytext/optimizer/activations.py @@ -20,11 +20,21 @@ class GeLU(nn.Module): """ def forward(self, x): - return ( - 0.5 - * x - * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * (x * x * x)))) - ) + if torch.onnx.is_in_onnx_export(): + # ONNX -> Caffe2 conversion will create an intermediate blob for + # each intermediate math output, which is very memory inefficient. + # We use the Gelu operator directly to reduce the memory footprint + # in the exported model. + return torch.ops._caffe2.Gelu(x, True) + else: + return ( + 0.5 + * x + * ( + 1 + + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * (x * x * x))) + ) + ) def get_activation(name):