Add experimental INT8 quantized training #644

Merged: 45 commits, Aug 16, 2024

Commits (45, all by gau-nernst):
3d42329  initial commit (Aug 9, 2024)
eca170a  add tests (Aug 9, 2024)
dd162a8  add training (Aug 9, 2024)
b286f5d  support py3.9 (Aug 9, 2024)
8a84aca  skip test for torch<2.3 (Aug 9, 2024)
ea47c7d  fix pytest (Aug 9, 2024)
f20486b  fix adamw (Aug 9, 2024)
3415244  add some FSDP ops (Aug 9, 2024)
5d0e658  add more fsdp ops (Aug 10, 2024)
d753476  more ops (Aug 10, 2024)
9c77800  add benchmark script (Aug 10, 2024)
158eb61  some organisation (Aug 10, 2024)
db0290f  add FSDP test (Aug 10, 2024)
1c32b78  clean up (Aug 10, 2024)
ff69121  update FSDP test (Aug 10, 2024)
45342ba  add compile test (things are crashing) (Aug 10, 2024)
f1587a2  fix bias (Aug 10, 2024)
7f9102a  substantial update to tests (Aug 10, 2024)
0428330  fix compile for FSDP (Aug 10, 2024)
001422c  update readme. rename file (Aug 10, 2024)
2eb2787  speed up CI (Aug 10, 2024)
d39caba  fix typo (Aug 10, 2024)
de6aa25  fix typo (Aug 10, 2024)
adbe47d  typos. unset some dynamo flags (Aug 10, 2024)
3fdf776  update readme (Aug 10, 2024)
ea0ee4f  remove requires_grad, since it is unnecessary (Aug 11, 2024)
36d0e1a  remove note (Aug 11, 2024)
2360a97  Merge branch 'pytorch:main' into qt_int8 (Aug 11, 2024)
9e19104  Merge branch 'main' into qt_int8 (Aug 13, 2024)
6bc7621  don't set inductor flags (Aug 13, 2024)
6646c0b  rename (Aug 13, 2024)
00e25cf  update README (Aug 13, 2024)
927a6d1  rename optimizer (Aug 13, 2024)
8377707  Merge branch 'main' into qt_int8 (Aug 14, 2024)
de49e8b  update benchmark script (Aug 14, 2024)
f80ac97  make compile explicit (Aug 14, 2024)
e375c3d  update docs (Aug 14, 2024)
6396a95  Merge branch 'main' into qt_int8 (Aug 16, 2024)
662c61f  use torch.optim.Adam to avoid FSDP optim compile bug (Aug 16, 2024)
cc90298  update docs (Aug 16, 2024)
f1c588b  update doc (Aug 16, 2024)
f444fa6  update docs (Aug 16, 2024)
640ec2d  fix CI test (Aug 16, 2024)
dad6560  skip test (Aug 16, 2024)
4924e8d  fix compiled test (Aug 16, 2024)
148 changes: 148 additions & 0 deletions benchmarks/benchmark_int8_qt.py
@@ -0,0 +1,148 @@
# pre-train a mini Llama2 on TinyStories with INT8 quantized training
# pip install transformers sentencepiece wandb
#
# BF16 baseline: python benchmarks/benchmark_int8_qt.py --seed 2024 --n_steps 10_000
# INT8 QT: python benchmarks/benchmark_int8_qt.py --seed 2024 --n_steps 10_000 --quantize int8_weight_only

import os

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
Member: Just curious, were you running into OOMs without this?

Collaborator Author (gau-nernst): Not for the experiments I ran here, I think. I copied it over from my repo, and I think this flag is generally good.
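
For reference, the same allocator setting can also be applied from the shell instead of inside the script, e.g.:

PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True python benchmarks/benchmark_int8_qt.py --seed 2024 --n_steps 10_000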


import argparse
from pathlib import Path

import numpy as np
import torch
import wandb
from tqdm import tqdm
from transformers import LlamaConfig, LlamaForCausalLM

from torchao.prototype import low_bit_optim
from torchao.prototype.quantized_training import int8_weight_only_quantized_training
from torchao.quantization.quant_api import quantize_


def get_loss(model: LlamaForCausalLM, batch: torch.Tensor):
    return model(batch, labels=batch).loss


def get_tinystories():
    save_path = Path("tinystories.bin")

    if not save_path.exists():
        import sentencepiece as spm
Member: Perhaps create a separate quantized training folder along with the dependencies for hf_hub and sentencepiece.

        from huggingface_hub import hf_hub_download

        tokenizer_path = hf_hub_download("meta-llama/Llama-2-7b", "tokenizer.model")
        tokenizer = spm.SentencePieceProcessor(tokenizer_path)
        assert tokenizer.vocab_size() < (1 << 16)  # make sure we can use uint16

        # do everything in memory. we have enough RAM
        filepath = hf_hub_download(
            "roneneldan/TinyStories",
            "TinyStoriesV2-GPT4-train.txt",
            repo_type="dataset",
        )
        stories = open(filepath).read().split("\n<|endoftext|>\n")

        tokens_list = []
        chunk_size = 10_000
        for i in tqdm(range(0, len(stories), chunk_size), desc="Tokenizing TinyStories"):
            chunk = stories[i : min(i + chunk_size, len(stories))]
            tokens_list.extend(tokenizer.Encode(chunk, add_bos=True, add_eos=True, num_threads=4))

        total_size = sum(len(x) for x in tokens_list)
        mmap_tokens = np.memmap(save_path, dtype=np.uint16, mode="w+", shape=total_size)
        i = 0
        for tokens in tokens_list:
            mmap_tokens[i : i + len(tokens)] = tokens
            i += len(tokens)
        mmap_tokens.flush()

    tokens = np.memmap(save_path, dtype=np.uint16, mode="r")
    return torch.from_numpy(tokens)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # default config is 470M
    parser.add_argument("--d_model", type=int, default=1024)
    parser.add_argument("--depth", type=int, default=24)
    parser.add_argument("--ffn_size", type=int, default=4096)
    parser.add_argument("--head_dim", type=int, default=64)

    parser.add_argument("--quantize")
    parser.add_argument("--activation_checkpointing", action="store_true")

    parser.add_argument("--n_steps", type=int, default=1000)
    parser.add_argument("--batch_size", type=int, default=4)
    parser.add_argument("--seq_len", type=int, default=2048)

    parser.add_argument("--optim", default="AdamW")
    parser.add_argument("--lr", type=float, default=3e-4)
    parser.add_argument("--weight_decay", type=float, default=1e-2)

    parser.add_argument("--project", default="int8_quantized_training")
    parser.add_argument("--run_name")
    parser.add_argument("--seed", type=int)
    args = parser.parse_args()

    if args.seed is not None:
        torch.manual_seed(args.seed)

    config = LlamaConfig(
Member: Did you also try the llama model we have in AO? Curious what was more convenient here?

Collaborator Author (gau-nernst), Aug 12, 2024: I haven't. Will give it a try.

Collaborator Author (gau-nernst): I gave it a try and faced some difficulties, mainly because the model was written for pure inference only. To be specific, I would need to change:

1. Initialize freqs_cis without initializing the KV cache. Currently setup_caches does both:

    def setup_caches(self, max_batch_size, max_seq_length):
        if self.max_seq_length >= max_seq_length and self.max_batch_size >= max_batch_size:
            return
        head_dim = self.config.dim // self.config.n_head
        max_seq_length = find_multiple(max_seq_length, 8)
        self.max_seq_length = max_seq_length
        self.max_batch_size = max_batch_size
        dtype = self.output.weight.dtype
        # For quantized layers, dtype is encoded in scales
        if hasattr(self.output, "scales"):
            dtype = self.output.scales.dtype
        elif hasattr(self.output, "scales_and_zeros"):
            dtype = self.output.scales_and_zeros.dtype
        for b in self.layers:
            b.attention.kv_cache = KVCache(max_batch_size, max_seq_length, self.config.n_local_heads, head_dim, dtype)
        self.freqs_cis = precompute_freqs_cis(self.config.block_size, self.config.dim // self.config.n_head, self.config.rope_base, dtype)
        self.causal_mask = torch.tril(torch.ones(self.max_seq_length, self.max_seq_length, dtype=torch.bool))

2. Don't use an attention mask, just pass is_causal=True directly instead of:

    y = F.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0)

So for now I think I won't use the built-in Llama for this PR. Perhaps we can modify the Llama model definition in a separate PR to support training, then I will change this script to use the built-in Llama.
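
For illustration, a rough sketch of those two changes, reusing the names from the quoted torchao Llama code (not something this PR adds):

    # 1. compute only the RoPE frequencies; skip the KVCache allocation, which training does not need
    self.freqs_cis = precompute_freqs_cis(self.config.block_size, self.config.dim // self.config.n_head, self.config.rope_base, self.output.weight.dtype)

    # 2. rely on built-in causal masking instead of a precomputed mask tensor
    y = F.scaled_dot_product_attention(q, k, v, dropout_p=0.0, is_causal=True)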

        hidden_size=args.d_model,
        intermediate_size=args.ffn_size,
        num_hidden_layers=args.depth,
        num_attention_heads=args.d_model // args.head_dim,
        max_position_embeddings=args.seq_len,
        use_cache=False,
    )
    model = LlamaForCausalLM(config).bfloat16().cuda()
    if args.activation_checkpointing:
        model.gradient_checkpointing_enable()
    if args.quantize == "int8_weight_only":
        quantize_(model, int8_weight_only_quantized_training())
    elif args.quantize is not None:
        raise ValueError(f"Unsupported quantize={args.quantize}")
    print(f"No. of params: {sum(p.numel() for p in model.parameters()):,}")
    print(f"No. of buffers: {sum(p.numel() for p in model.buffers()):,}")

    # turn off these flags (set by quantize_()) to speed up compile time
Member: Huh, interesting, were the flags that bad? cc @HDCharles @jerryzh168 in case we need to change the defaults.

Collaborator Author (gau-nernst): I think they are fine for inference. Maybe ok for training too, but I didn't want to wait 🤣

Contributor: You can turn it off by setting set_inductor_config=False:

    def quantize_(model: torch.nn.Module, apply_tensor_subclass: Callable[[torch.nn.Module], torch.nn.Module], filter_fn: Optional[Callable[[torch.nn.Module, str], bool]]=None, set_inductor_config: bool=True):
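
A minimal sketch of that alternative for this script, assuming the signature quoted above (the script keeps the manual flag toggles below instead):

    # skip the inductor autotuning flags that quantize_() would otherwise enable
    quantize_(model, int8_weight_only_quantized_training(), set_inductor_config=False)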

    torch._inductor.config.coordinate_descent_tuning = False
    torch._inductor.config.coordinate_descent_check_all_directions = False

    optim = getattr(low_bit_optim, args.optim)(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)

    data = get_tinystories().cuda()
    run = wandb.init(dir="/tmp", config=args, project=args.project, name=args.run_name)

    step = 0
    log_interval = 50
    pbar = tqdm(total=args.n_steps, dynamic_ncols=True)
    model.train()

    while step < args.n_steps:
        # randomly select a continuous chunk, then reshape it
        idx = torch.randint(0, data.shape[0] - args.batch_size * args.seq_len, (1,)).item()
        batch = data[idx : idx + args.batch_size * args.seq_len].view(args.batch_size, args.seq_len).long()

        loss = torch.compile(get_loss)(model, batch)
        loss.backward()

        if step % log_interval == 0:
            log_dict = dict(
                loss=loss.item(),
                lr=optim.param_groups[0]["lr"],
                max_memory_allocated=torch.cuda.max_memory_allocated(),
            )
            run.log(log_dict, step=step)
            pbar.set_postfix(loss=log_dict["loss"])

        optim.step()
        optim.zero_grad()

        step += 1
        pbar.update()

    run.finish()
24 changes: 24 additions & 0 deletions test/prototype/test_low_bit_optim.py
@@ -98,6 +98,30 @@ def test_optim_smoke(self, optim_name, dtype, device):
        optim.step()
        optim.zero_grad()

    @parametrize("device", _DEVICES)
    def test_optim_standard_correctness(self, device):
        model1 = nn.Sequential(nn.Linear(32, 1024), nn.ReLU(), nn.Linear(1024, 128)).to(device)
        model2 = copy.deepcopy(model1)

        optim1 = torch.optim.AdamW(model1.parameters())
        optim2 = low_bit_optim.AdamW(model2.parameters())

        for _ in range(2):
            x = torch.randn(4, 32, device=device)

            loss1 = model1(x).sum()
            loss1.backward()
            optim1.step()
            optim1.zero_grad()

            loss2 = model2(x).sum()
            loss2.backward()
            optim2.step()
            optim2.zero_grad()

        for p1, p2 in zip(model1.parameters(), model2.parameters()):
            torch.testing.assert_close(p2, p1, rtol=1e-5, atol=1e-5)

    @pytest.mark.skipif(bnb is None, reason="bitsandbytes is not available")
    @pytest.mark.skipif(not torch.cuda.is_available(), reason="bitsandbytes 8-bit Adam only works for CUDA")
    @pytest.mark.xfail(not TORCH_VERSION_AFTER_2_3, reason="torch.compile() fails for PyTorch < 2.3")