P-tuning refactor Part 1/N (NVIDIA#6054)
* patch to allow using tokenizers without additional_special_tokens_ids attribute

Signed-off-by: arendu <[email protected]>

* early stop callback for prompt/p tuning

Signed-off-by: arendu <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* update

Signed-off-by: arendu <[email protected]>

* added exp manager config for early stop

Signed-off-by: arendu <[email protected]>

* pushed logic for creating early stopping inside exp manager

Signed-off-by: arendu <[email protected]>

* pushed logic for creating early stopping inside exp manager

Signed-off-by: arendu <[email protected]>

* minor updates and added dataclass check

Signed-off-by: arendu <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* more args

Signed-off-by: arendu <[email protected]>

* more args

Signed-off-by: arendu <[email protected]>

* wrap tpmlp inside prompt encoder

Signed-off-by: arendu <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* updates removed unused imports

Signed-off-by: arendu <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* removes typecheck for tpmlp module

Signed-off-by: arendu <[email protected]>

---------

Signed-off-by: arendu <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2 people authored and titu1994 committed Mar 24, 2023
1 parent a363f7a commit 05f777f
Showing 4 changed files with 48 additions and 167 deletions.
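
The early-stopping commits listed above (an early-stop callback for prompt/p-tuning, built inside exp_manager from config) are not part of the four files shown below. As a rough illustration only, here is a minimal sketch of that flow using PyTorch Lightning's EarlyStopping; the config key names create_early_stopping_callback and early_stopping_callback_params are assumptions, not taken from this diff.

# Sketch only: approximates the behaviour described in the commit messages above.
# The exp_manager config keys are hypothetical; EarlyStopping itself is the
# standard PyTorch Lightning callback.
from omegaconf import OmegaConf
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

exp_manager_cfg = OmegaConf.create(
    {
        "create_early_stopping_callback": True,  # assumed flag name
        "early_stopping_callback_params": {      # assumed params block
            "monitor": "val_loss",
            "mode": "min",
            "min_delta": 0.001,
            "patience": 10,
            "verbose": True,
        },
    }
)

if exp_manager_cfg.create_early_stopping_callback:
    params = OmegaConf.to_container(exp_manager_cfg.early_stopping_callback_params)
    early_stop_cb = EarlyStopping(**params)
    # exp_manager would append this callback to trainer.callbacks before training.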
@@ -25,7 +25,6 @@
from nemo.collections.nlp.models.language_modeling.megatron_base_model import MegatronBaseModel
from nemo.collections.nlp.modules.common import (
PromptEncoder,
PromptEncoderMLP,
PromptEncoderType,
PromptTable,
VirtualPromptPlaceholderToken,
@@ -64,7 +63,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer):
self.cfg = cfg

self.load_frozen_model(cfg, trainer)

self.prompt_encoder = None
self.tokenizer = self.frozen_model.tokenizer

if hasattr(self.frozen_model.cfg, "encoder") and hasattr(self.frozen_model.cfg, "decoder"):
@@ -213,25 +212,15 @@ def init_prompt_encoder(self):
total_virtual_tokens = self.task_templates[new_task]["total_virtual_tokens"]

encoder_type = PromptEncoderType(self.cfg.p_tuning.get("encoder_type", "mlp").lower())
self.prompt_encoder = None
if encoder_type == PromptEncoderType.TPMLP:
self.prompt_encoder = PromptEncoderMLP(
total_virtual_tokens=total_virtual_tokens,
hidden_size=self.cfg.p_tuning.get("encoder_hidden", 2048),
output_size=self.hidden_size,
init_std=self.cfg.p_tuning.get("init_std", 0.023),
)
elif encoder_type == PromptEncoderType.LSTM or encoder_type == PromptEncoderType.MLP:
self.prompt_encoder = PromptEncoder(
encoder_type=PromptEncoderType(encoder_type),
total_virtual_tokens=total_virtual_tokens,
token_dim=self.hidden_size,
hidden_size=self.cfg.p_tuning.get("encoder_hidden", self.hidden_size // 2),
lstm_dropout=self.cfg.p_tuning.get("dropout", 0.0),
num_layers=self.cfg.p_tuning.get("num_layers", 2),
)
else:
raise ValueError("Prompt encoder type not recognized. Please use one of MLP (recommended) or LSTM.")
self.prompt_encoder = PromptEncoder(
encoder_type=PromptEncoderType(encoder_type),
total_virtual_tokens=total_virtual_tokens,
token_dim=self.hidden_size,
hidden_size=self.cfg.p_tuning.get("encoder_hidden", 2048),
lstm_dropout=self.cfg.p_tuning.get("dropout", 0.0),
num_layers=self.cfg.p_tuning.get("num_layers", 2),
init_std=self.cfg.p_tuning.get("init_std", 0.023),
)

def add_ptuned_prompts_to_prompt_table(self):
"""
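
Both prompt-learning models now build a single PromptEncoder for every encoder type, as shown above and in the next file. A sketch of the p_tuning config block that init_prompt_encoder reads follows; the key names and defaults are copied from the cfg.p_tuning.get(...) calls in the new code, while how this block nests inside the full model config is an assumption.

# Sketch of the p_tuning config consumed by init_prompt_encoder.
from omegaconf import OmegaConf

p_tuning_cfg = OmegaConf.create(
    {
        "encoder_type": "tpmlp",  # one of "tpmlp", "mlp", "lstm"
        "encoder_hidden": 2048,   # encoder hidden size
        "dropout": 0.0,           # LSTM dropout (unused by the MLP variants)
        "num_layers": 2,          # LSTM layers / MLP depth
        "init_std": 0.023,        # weight-init std, used by the TPMLP branch
    }
)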
@@ -28,9 +28,7 @@
from nemo.collections.nlp.models.language_modeling.megatron_base_model import MegatronBaseModel
from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
from nemo.collections.nlp.modules.common import (
BIGLSTMPromptEncoder,
PromptEncoder,
PromptEncoderMLP,
PromptEncoderType,
PromptTable,
VirtualPromptPlaceholderToken,
@@ -268,32 +266,15 @@ def init_prompt_encoder(self):
total_virtual_tokens = self.task_templates[new_task]["total_virtual_tokens"]

encoder_type = PromptEncoderType(self.cfg.p_tuning.get("encoder_type", "tpmlp").lower())
if encoder_type == PromptEncoderType.TPMLP:
self.prompt_encoder = PromptEncoderMLP(
total_virtual_tokens=total_virtual_tokens,
hidden_size=self.cfg.p_tuning.get("encoder_hidden", 2048),
output_size=self.hidden_size,
init_std=self.cfg.p_tuning.get("init_std", 0.023),
)
elif encoder_type == PromptEncoderType.BIGLSTM:
self.prompt_encoder = BIGLSTMPromptEncoder(
total_virtual_tokens=total_virtual_tokens,
hidden_size=self.cfg.p_tuning.encoder_hidden,
output_size=self.hidden_size,
lstm_dropout=self.cfg.p_tuning.dropout,
num_layers=self.cfg.p_tuning.num_layers,
)
elif encoder_type == PromptEncoderType.LSTM or encoder_type == PromptEncoderType.MLP:
self.prompt_encoder = PromptEncoder(
encoder_type=encoder_type,
total_virtual_tokens=total_virtual_tokens,
token_dim=self.hidden_size,
hidden_size=self.cfg.p_tuning.get("encoder_hidden", self.hidden_size // 2),
lstm_dropout=self.cfg.p_tuning.get("dropout", 0.0),
num_layers=self.cfg.p_tuning.get("num_layers", 2),
)
else:
raise ValueError('not supported')
self.prompt_encoder = PromptEncoder(
encoder_type=encoder_type,
total_virtual_tokens=total_virtual_tokens,
token_dim=self.hidden_size,
hidden_size=self.cfg.p_tuning.get("encoder_hidden", 2048),
lstm_dropout=self.cfg.p_tuning.get("dropout", 0.0),
num_layers=self.cfg.p_tuning.get("num_layers", 2),
init_std=self.cfg.p_tuning.get("init_std", 0.023),
)

def add_ptuned_prompts_to_prompt_table(self):
"""
7 changes: 1 addition & 6 deletions nemo/collections/nlp/modules/common/__init__.py
@@ -23,12 +23,7 @@
RobertaEncoder,
)
from nemo.collections.nlp.modules.common.lm_utils import get_lm_model, get_pretrained_lm_models_list
from nemo.collections.nlp.modules.common.prompt_encoder import (
BIGLSTMPromptEncoder,
PromptEncoder,
PromptEncoderMLP,
PromptEncoderType,
)
from nemo.collections.nlp.modules.common.prompt_encoder import PromptEncoder, PromptEncoderType
from nemo.collections.nlp.modules.common.prompt_table import (
PromptTable,
VirtualPromptPlaceholderToken,
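
After this change only PromptEncoder and PromptEncoderType are re-exported from nemo.collections.nlp.modules.common; BIGLSTMPromptEncoder and PromptEncoderMLP are gone. A quick check of the new surface, assuming an environment with this commit installed:

# The narrowed public API after this commit.
from nemo.collections.nlp.modules.common import PromptEncoder, PromptEncoderType

assert [e.value for e in PromptEncoderType] == ["tpmlp", "mlp", "lstm"]

# These imports are expected to fail now:
# from nemo.collections.nlp.modules.common import BIGLSTMPromptEncoder  # ImportError
# from nemo.collections.nlp.modules.common import PromptEncoderMLP      # ImportError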
140 changes: 28 additions & 112 deletions nemo/collections/nlp/modules/common/prompt_encoder.py
@@ -36,105 +36,24 @@
ModelType = AttnMaskType = AttnType = LayerType = ApexGuardDefaults()


__all__ = ["PromptEncoder", "BIGLSTMPromptEncoder", "PromptEncoderType", "PromptEncoderMLP"]
__all__ = ["PromptEncoder", "PromptEncoderType"]


class PromptEncoderType(enum.Enum):
BIGLSTM = 'biglstm' # LSTM model that works with large language model
TPMLP = 'tpmlp' # mlp model that support tensor parallel, better work together with a large language model
MLP = 'mlp'
LSTM = 'lstm'
TPMLP = "tpmlp" # mlp model that support tensor parallel, better work together with a large language model
MLP = "mlp"
LSTM = "lstm"


class BIGLSTMPromptEncoder(NeuralModule, Exportable):
"""
The LSTM prompt encoder network that is used to generate the virtual
token embeddings for p-tuning. It is specially used to work with large language model.
To handle large language model, the LSTM only uses hidden_size as its hidden internal dimension, which is independent of LM hidden dimension.
"""

@property
def input_types(self) -> Optional[Dict[str, NeuralType]]:
return {
"taskname_embeddings": NeuralType(('B', 'T', 'C'), ChannelType(), optional=False),
}

@property
def output_types(self) -> Optional[Dict[str, NeuralType]]:
return {"output_embeds": NeuralType(('B', 'T', 'C'), ChannelType())}

def __init__(
self, total_virtual_tokens: int, hidden_size: int, output_size: int, lstm_dropout: float, num_layers: int
):
"""
Initializes the LSTM PromptEncoder module that works with large language model.
Args:
total_virtual_tokens: the total number of vitural tokens
hidden_size: the lstm hidden dimension
output_size: the output dimension
lstm_dropout: lstm dropout rate
num_layers: number of layers used in the LSTM
"""
super().__init__()
self.token_dim = token_dim
self.input_size = token_dim
self.output_size = token_dim
self.hidden_size = hidden_size
self.output_size = output_size
self.total_virtual_tokens = total_virtual_tokens
self.encoder_type = encoder_type

# Set fixed indicies for forward pass
self.register_buffer('indices', torch.LongTensor(list(range(self.total_virtual_tokens))))

# embedding
self.embedding = torch.nn.Embedding(self.total_virtual_tokens, hidden_size)

# LSTM
self.lstm_head = torch.nn.LSTM(
input_size=hidden_size,
hidden_size=self.hidden_size // 2,
num_layers=num_layers,
dropout=lstm_dropout,
bidirectional=True,
batch_first=True,
)
self.mlp_head = nn.Sequential(
nn.Linear(self.hidden_size, self.hidden_size), nn.ReLU(), nn.Linear(self.hidden_size, output_size)
)

@typecheck()
def forward(self, taskname_embeddings) -> torch.Tensor:
input_embeds = self.embedding(self.indices).unsqueeze(0)
batch_size, task_seq_length, _ = taskname_embeddings.shape
input_embeds = input_embeds.expand(batch_size, self.total_virtual_tokens, self.token_dim).clone()
length = min(task_seq_length, self.total_virtual_tokens)
# need to adapt taskname embedding hidden to the same size as hidden_size
taskname_embeddings = torch.matmul(taskname_embeddings, self.mlp_head[2].weight)
# Replace general input with task specific embeddings to specify the correct task
input_embeds[:, 0:length, :] = taskname_embeddings[:, 0:length, :]
output_embeds = self.mlp_head(self.lstm_head(input_embeds)[0])
return output_embeds


class PromptEncoderMLP(NeuralModule, Exportable):
class TPMLP(NeuralModule, Exportable):
"""
The Tensor Parallel MLP prompt encoder network that is used to generate the virtual
token embeddings for p-tuning. It only has two layers.
"""

@property
def input_types(self) -> Optional[Dict[str, NeuralType]]:
return {
"taskname_embeddings": NeuralType(('B', 'T', 'C'), ChannelType(), optional=False),
}

@property
def output_types(self) -> Optional[Dict[str, NeuralType]]:
return {"output_embeds": NeuralType(('B', 'T', 'C'), ChannelType())}

def __init__(self, total_virtual_tokens: int, hidden_size: int, output_size: int, init_std: float):
def __init__(
self, total_virtual_tokens: int, hidden_size: int, output_size: int, init_std: float,
):
"""
Initializes the Tensor Model parallel MLP PromptEncoderMLP module.
Args:
@@ -147,20 +66,15 @@ def __init__(self, total_virtual_tokens: int, hidden_size: int, output_size: int
self.hidden_size = hidden_size
self.output_size = output_size
self.total_virtual_tokens = total_virtual_tokens
self.activation = 'gelu'
self.activation = "gelu"

sequence_parallel = False
gradient_accumulation_fusion = False
# Set fixed indicies for forward pass
self.register_buffer('indices', torch.LongTensor(list(range(self.total_virtual_tokens))))

# embedding
self.embedding = torch.nn.Embedding(self.total_virtual_tokens, output_size)

no_async_tensor_model_parallel_allreduce = (
parallel_state.get_tensor_model_parallel_world_size() == 1 or sequence_parallel
)
self.first = tensor_parallel.ColumnParallelLinear(
output_size,
self.output_size,
self.hidden_size,
gather_output=False,
init_method=init_method_normal(init_std),
@@ -173,7 +87,7 @@ def __init__(self, total_virtual_tokens: int, hidden_size: int, output_size: int
)
self.second = tensor_parallel.RowParallelLinear(
self.hidden_size,
output_size,
self.output_size,
input_is_parallel=True,
init_method=init_method_normal(init_std),
skip_bias_add=True,
@@ -183,14 +97,7 @@ def __init__(self, total_virtual_tokens: int, hidden_size: int, output_size: int
gradient_accumulation_fusion=gradient_accumulation_fusion,
)

@typecheck()
def forward(self, taskname_embeddings) -> torch.Tensor:
input_embeds = self.embedding(self.indices).unsqueeze(0)
batch_size, task_seq_length, _ = taskname_embeddings.shape
input_embeds = input_embeds.expand(batch_size, self.total_virtual_tokens, self.output_size).clone()
length = min(task_seq_length, self.total_virtual_tokens)
# Replace general input with task specific embeddings to specify the correct task
input_embeds[:, 0:length, :] = taskname_embeddings[:, 0:length, :]
def forward(self, input_embeds) -> torch.Tensor:
intermediate_parallel, bias_parallel = self.first(input_embeds)
intermediate_parallel = fused_bias_gelu(intermediate_parallel, bias_parallel)
output_embeds, bias_parallel = self.second(intermediate_parallel)
@@ -207,12 +114,12 @@ class PromptEncoder(NeuralModule, Exportable):
@property
def input_types(self) -> Optional[Dict[str, NeuralType]]:
return {
"taskname_embeddings": NeuralType(('B', 'T', 'C'), ChannelType(), optional=False),
"taskname_embeddings": NeuralType(("B", "T", "C"), ChannelType(), optional=False),
}

@property
def output_types(self) -> Optional[Dict[str, NeuralType]]:
return {"output_embeds": NeuralType(('B', 'T', 'C'), ChannelType())}
return {"output_embeds": NeuralType(("B", "T", "C"), ChannelType())}

def __init__(
self,
@@ -222,6 +129,7 @@ def __init__(
hidden_size,
lstm_dropout: float,
num_layers: int,
init_std: float,
):
"""
Initializes the PromptEncoder module.
@@ -230,17 +138,21 @@ def __init__(
hidden_size: hidden dimension
lstm_dropout: the dropout used for the LSTM
num_layers: number of layers used in the LSTM
init_std: used for TPMLP encoder type to initialize the mlp weights
"""
super().__init__()
self.token_dim = token_dim
self.input_size = token_dim
self.output_size = token_dim
self.hidden_size = hidden_size
self.lstm_hidden_size = self.hidden_size // 2
self.total_virtual_tokens = total_virtual_tokens
self.encoder_type = encoder_type
self.activation = "gelu"
self.init_std = init_std

# Set fixed indicies for forward pass
self.register_buffer('indices', torch.LongTensor(list(range(self.total_virtual_tokens))))
self.register_buffer("indices", torch.LongTensor(list(range(self.total_virtual_tokens))))

# embedding
self.embedding = torch.nn.Embedding(self.total_virtual_tokens, self.token_dim)
@@ -249,17 +161,17 @@ def __init__(
# LSTM
self.lstm_head = torch.nn.LSTM(
input_size=self.input_size,
hidden_size=self.hidden_size,
hidden_size=self.lstm_hidden_size,
num_layers=num_layers,
dropout=lstm_dropout,
bidirectional=True,
batch_first=True,
)

self.mlp_head = nn.Sequential(
nn.Linear(self.hidden_size * 2, self.hidden_size * 2),
nn.Linear(self.lstm_hidden_size * 2, self.lstm_hidden_size * 2),
nn.ReLU(),
nn.Linear(self.hidden_size * 2, self.output_size),
nn.Linear(self.lstm_hidden_size * 2, self.output_size),
)

elif self.encoder_type == PromptEncoderType.MLP:
@@ -275,6 +187,8 @@ def __init__(
layers.append(nn.Linear(self.hidden_size, self.output_size))
self.mlp_head = nn.Sequential(*layers)

elif self.encoder_type == PromptEncoderType.TPMLP:
self.tpmlp = TPMLP(self.total_virtual_tokens, self.hidden_size, self.output_size, self.init_std,)
else:
raise ValueError("Prompt encoder type not recognized. Please use one of MLP (recommended) or LSTM.")

@@ -292,6 +206,8 @@ def forward(self, taskname_embeddings) -> torch.Tensor:
output_embeds = self.mlp_head(self.lstm_head(input_embeds)[0])
elif self.encoder_type == PromptEncoderType.MLP:
output_embeds = self.mlp_head(input_embeds)
elif self.encoder_type == PromptEncoderType.TPMLP:
output_embeds = self.tpmlp(input_embeds)
else:
raise ValueError("Prompt encoder type not recognized. Please use one of MLP (recommended) or LSTM.")

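
For reference, a minimal forward-pass sketch of the refactored PromptEncoder, assuming NeMo at this commit is importable. The MLP branch is used because the TPMLP branch additionally requires Megatron tensor-parallel state to be initialized; all shapes and values are illustrative.

# Minimal sketch: build the unified PromptEncoder and run one forward pass.
import torch

from nemo.collections.nlp.modules.common import PromptEncoder, PromptEncoderType

encoder = PromptEncoder(
    encoder_type=PromptEncoderType.MLP,
    total_virtual_tokens=10,
    token_dim=768,        # must match the frozen model's hidden size
    hidden_size=2048,
    lstm_dropout=0.0,
    num_layers=2,
    init_std=0.023,       # only used by the TPMLP branch
)

taskname_embeddings = torch.randn(4, 3, 768)  # (batch, taskname tokens, hidden)
virtual_token_embeds = encoder(taskname_embeddings=taskname_embeddings)
print(virtual_token_embeds.shape)  # expected: torch.Size([4, 10, 768])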
