Remove vocab from cuda

Summary: Some users cannot train models with extremely large embeddings because we try to allocate space for the embedding table on the GPU. This diff adds a flag that users can set explicitly to keep the embedding layer on the CPU even when the model is being trained on GPUs. This is not the default, because the user needs to know that there is a cost associated with moving tensors on and off the GPU. Note that this only applies during training. Differential Revision: D17114398 fbshipit-source-id: 8af99fd20529c6fc80230561142d5c472a32121a
facebookresearch · Sep 6, 2019 · 76792e3 · 76792e3
1 parent aa1f9de
commit 76792e3
Show file tree

Hide file tree

Showing 3 changed files with 18 additions and 2 deletions.
diff --git a/pytext/config/field_config.py b/pytext/config/field_config.py
@@ -40,6 +40,7 @@ class WordFeatConfig(ModuleConfig):
     min_freq: int = 1
     mlp_layer_dims: Optional[List[int]] = []
     padding_idx: Optional[int] = None
+    cpu_only: bool = False
 
 
 class DictFeatConfig(ModuleConfig):

diff --git a/pytext/models/embeddings/word_embedding.py b/pytext/models/embeddings/word_embedding.py
@@ -1,14 +1,14 @@
 #!/usr/bin/env python3
 # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
 
-import collections
 from typing import List, Optional
 
 import torch
 from pytext.config.field_config import WordFeatConfig
 from pytext.data.tensorizers import Tensorizer
 from pytext.fields import FieldMeta
 from pytext.utils.embeddings import PretrainedEmbedding
+from pytext.utils.torch import CPUOnlyParameter
 from torch import nn
 
 from .embedding_base import EmbeddingBase
@@ -92,6 +92,7 @@ def from_config(
             unk_token_idx=unk_token_idx,
             mlp_layer_dims=config.mlp_layer_dims,
             padding_idx=config.padding_idx,
+            cpu_only=config.cpu_only,
         )
 
     def __init__(
@@ -103,6 +104,7 @@ def __init__(
         unk_token_idx: int = 0,
         mlp_layer_dims: List[int] = (),
         padding_idx: Optional[int] = None,
+        cpu_only: bool = False,
     ) -> None:
         output_embedding_dim = mlp_layer_dims[-1] if mlp_layer_dims else embedding_dim
         EmbeddingBase.__init__(self, embedding_dim=output_embedding_dim)
@@ -114,6 +116,8 @@ def __init__(
             _weight=embeddings_weight,
             padding_idx=padding_idx,
         )
+        if cpu_only:
+            self.word_embedding.weight = CPUOnlyParameter(self.word_embedding.weight)
         if embeddings_weight is None and init_range:
             self.word_embedding.weight.data.uniform_(init_range[0], init_range[1])
         # Initialize unk embedding with zeros
@@ -136,7 +140,12 @@ def __getattr__(self, name):
         return super().__getattr__(name)
 
     def forward(self, input):
-        return self.mlp(self.word_embedding(input))
+        input_device = input.device
+        embedding_device = self.word_embedding.weight.device
+        if input_device != embedding_device:
+            input = input.to(embedding_device)
+        # We only want to do the embedding lookup on CPU
+        return self.mlp(self.word_embedding(input).to(input_device))
 
     def freeze(self):
         for param in self.word_embedding.parameters():

diff --git a/pytext/utils/torch.py b/pytext/utils/torch.py
@@ -500,3 +500,9 @@ def package_for_inference(self):
         self.do_normalization = torch.jit.Attribute(self.do_normalization, bool)
         self.feature_avgs = torch.jit.Attribute(self.feature_avgs, List[float])
         self.feature_stddevs = torch.jit.Attribute(self.feature_stddevs, List[float])
+
+
+class CPUOnlyParameter(torch.nn.Parameter):
+    def cuda(self, device=None):
+        # We do nothing because this Parameter should only be on the CPU
+        return self