Improve Memory Calculation Script #22

Merged 6 commits on Jan 27, 2024
calc/README.md (32 additions, 25 deletions)
@@ -20,8 +20,7 @@ Currently, scripts are entirely self-contained. This is for the dual purpose of:
```
Example with Fairseq-MoE 15B: python calc_transformer_flops.py -l 12 -hs 768 --moe -e 512
Example with GPT-3 175B: python calc_transformer_flops.py -l 96 -hs 12288
-usage: calc_transformer_flops.py [-h] [--vocab-size VOCAB_SIZE] [--hidden-size HIDDEN_SIZE] [--sequence-length SEQUENCE_LENGTH] [--num-layers NUM_LAYERS] [--moe] [--num-experts NUM_EXPERTS] [--expert-interval EXPERT_INTERVAL]
-                                 [--topk TOPK] [--batch-size BATCH_SIZE] [--tokens TOKENS] [--no-checkpoint-activations]
+usage: calc_transformer_flops.py [-h] [--vocab-size VOCAB_SIZE] [--hidden-size HIDDEN_SIZE] [--sequence-length SEQUENCE_LENGTH] [--num-layers NUM_LAYERS] [--kv-size-ratio KV_SIZE_RATIO] [--moe] [--num-experts NUM_EXPERTS] [--expert-interval EXPERT_INTERVAL] [--topk TOPK] [--swiglu] [--batch-size BATCH_SIZE] [--tokens TOKENS] [--no-checkpoint-activations]

options:
-h, --help show this help message and exit
@@ -33,12 +32,15 @@ options:
Sequence length used for training
--num-layers NUM_LAYERS, -l NUM_LAYERS
Number of transformer layers used in model
+--kv-size-ratio KV_SIZE_RATIO, -kv KV_SIZE_RATIO
+Ratio of kv heads to query heads used in model. 1.0 for MHA
--moe Whether our model is MoE
--num-experts NUM_EXPERTS, -e NUM_EXPERTS
Number of experts for MoE
--expert-interval EXPERT_INTERVAL, -ei EXPERT_INTERVAL
Expert interval for MoE
--topk TOPK, -t TOPK Top k routing for MoE
+--swiglu Use swiglu MLP. If set, ffn-hidden-size is defined as the inner dimension of each of the three MLP weights.
--batch-size BATCH_SIZE, -b BATCH_SIZE
Global batch size in units of samples
--tokens TOKENS Number of tokens you are training over
@@ -83,19 +85,15 @@ options:
`calc_transformer_mem.py` calculates the amount of device memory required to train or infer a model. See [Transformers Math 101](https://blog.eleuther.ai/transformer-math/) for more details on how memory overhead is calculated. Take this estimate with a grain of salt, because every implementation is different and these calculations were written to match the GPT-NeoX library as closely as possible. Even for other training and inference libraries, however, we expect our script to give approximate memory estimates within acceptable error. (Please see [LLM finetuning memory requirements](https://blog.scottlogic.com/2023/11/24/llm-mem.html) for a treatment of how specific memory costs may vary framework-to-framework.) Other good resources that we consulted are [the ZeRO Paper](https://arxiv.org/abs/1910.02054) and [Reducing Activation Recomputation in Large Transformer Models](https://arxiv.org/pdf/2205.05198.pdf).

```
-Example with pythia 6.9B: python transformer_mem.py --num-layers=32 --sequence-length=2048 --num-attention-heads=32 --hidden-size=4096 --batch-size-per-gpu=8 --checkpoint-activations --zero-stage=1 --partition-activations --pipeline-parallel-size=1 --tensor-parallel-size=2 --num-gpus=128 --params=6900000000
-Example with pythia 12B: python transformer_mem.py --num-layers=36 --sequence-length=2048 --num-attention-heads=40 --hidden-size=5120 --batch-size-per-gpu=8 --checkpoint-activations --zero-stage=1 --partition-activations --pipeline-parallel-size=1 --tensor-parallel-size=4 --num-gpus=256 --params=11849420800
-Example with default 20B: python transformer_mem.py --num-layers=44 --sequence-length=2048 --num-attention-heads=64 --hidden-size=6144 --batch-size-per-gpu=1 --checkpoint-activations --zero-stage=1 --partition-activations --pipeline-parallel-size=1 --tensor-parallel-size=1 --num-gpus=1 --params=20000000000
+Example with pythia 6.9B: python calc_transformer_mem.py --num-layers=32 --sequence-length=2048 --num-attention-heads=32 --hidden-size=4096 --batch-size-per-gpu=8 --checkpoint-activations --zero-stage=1 --partition-activations --pipeline-parallel-size=1 --tensor-parallel-size=2 --num-gpus=128
+Example with pythia 12B: python calc_transformer_mem.py --num-layers=36 --sequence-length=2048 --num-attention-heads=40 --hidden-size=5120 --batch-size-per-gpu=8 --checkpoint-activations --zero-stage=1 --partition-activations --pipeline-parallel-size=1 --tensor-parallel-size=4 --num-gpus=256
+Example with default 20B: python calc_transformer_mem.py --num-layers=44 --sequence-length=2048 --num-attention-heads=64 --hidden-size=6144 --batch-size-per-gpu=1 --checkpoint-activations --zero-stage=1 --partition-activations --pipeline-parallel-size=1 --tensor-parallel-size=1 --num-gpus=1

-usage: calc_transformer_mem.py [-h] [--params PARAMS] [--num-gpus NUM_GPUS] [--tensor-parallel-size TENSOR_PARALLEL_SIZE] [--pipeline-parallel-size PIPELINE_PARALLEL_SIZE] [--partition-activations] [--zero-stage {0,1,2,3}]
-                               [--checkpoint-activations] [--batch-size-per-gpu BATCH_SIZE_PER_GPU] [--hidden-size HIDDEN_SIZE] [--num-attention-heads NUM_ATTENTION_HEADS] [--sequence-length SEQUENCE_LENGTH] [--num-layers NUM_LAYERS]
-                               [--fp32-model] [--fp32-grads] [--zero-allgather-bucket-size ZERO_ALLGATHER_BUCKET_SIZE] [--zero3-max-live-params ZERO3_MAX_LIVE_PARAMS] [--misc-mem-gb MISC_MEM_GB] [--num-experts NUM_EXPERTS]
-                               [--ffn-expansion-factor FFN_EXPANSION_FACTOR] [--expert-parallelism EXPERT_PARALLELISM] [--vocab-size VOCAB_SIZE]
+usage: calc_transformer_mem.py [-h] [--num-gpus NUM_GPUS] [--tensor-parallel-size TENSOR_PARALLEL_SIZE] [--pipeline-parallel-size PIPELINE_PARALLEL_SIZE] [--partition-activations] [--zero-stage {0,1,2,3}] [--zero-allgather-bucket-size ZERO_ALLGATHER_BUCKET_SIZE] [--zero3-max-live-params ZERO3_MAX_LIVE_PARAMS] [--checkpoint-activations] [--batch-size-per-gpu BATCH_SIZE_PER_GPU] [--sequence-length SEQUENCE_LENGTH] [--vocab-size VOCAB_SIZE] [--hidden-size HIDDEN_SIZE] [--num-attention-heads NUM_ATTENTION_HEADS]
+                               [--num-layers NUM_LAYERS] [--ffn-expansion-factor FFN_EXPANSION_FACTOR] [--infer] [--kv-size-ratio KV_SIZE_RATIO] [--disable-mixed-precision] [--high-prec-bytes-per-val HIGH_PREC_BYTES_PER_VAL] [--low-prec-bytes-per-val LOW_PREC_BYTES_PER_VAL] [--bytes-per-grad-ele BYTES_PER_GRAD_ELE] [--num-experts NUM_EXPERTS] [--expert-parallelism EXPERT_PARALLELISM] [--misc-mem-gib MISC_MEM_GIB]

options:
-h, --help show this help message and exit
---params PARAMS, -p PARAMS
-Number of Parameters
--num-gpus NUM_GPUS Number of GPUs used for training
--tensor-parallel-size TENSOR_PARALLEL_SIZE, -tp TENSOR_PARALLEL_SIZE
Tensor parallel degree (1 if not used)
@@ -105,34 +103,43 @@ options:
Whether we use ZeRO-R to partition activation memory across tensor-parallel degree
--zero-stage {0,1,2,3}, -z {0,1,2,3}
Stage of the ZeRO optimizer
+--zero-allgather-bucket-size ZERO_ALLGATHER_BUCKET_SIZE, -zbs ZERO_ALLGATHER_BUCKET_SIZE
+Size of allgather buckets used by ZeRO
+--zero3-max-live-params ZERO3_MAX_LIVE_PARAMS, -zmlp ZERO3_MAX_LIVE_PARAMS
+Maximum number of parameters ZeRO3 keeps in GPU memory
--checkpoint-activations, -ca
Whether Megatron-style activation checkpointing is being used
--batch-size-per-gpu BATCH_SIZE_PER_GPU, -b BATCH_SIZE_PER_GPU
Batch size per GPU
+--sequence-length SEQUENCE_LENGTH, -s SEQUENCE_LENGTH
+Sequence length used for training
+--vocab-size VOCAB_SIZE, -v VOCAB_SIZE
+How many tokens are in the embedding layer
--hidden-size HIDDEN_SIZE, -hs HIDDEN_SIZE
Dimension of the model's hidden size
--num-attention-heads NUM_ATTENTION_HEADS, -a NUM_ATTENTION_HEADS
Number of attention heads used in model
---sequence-length SEQUENCE_LENGTH, -s SEQUENCE_LENGTH
-Sequence length used for training
--num-layers NUM_LAYERS, -l NUM_LAYERS
Number of transformer layers used in model
---fp32-model Whether model is stored in fp32
---fp32-grads Whether grads are stored in fp32
---zero-allgather-bucket-size ZERO_ALLGATHER_BUCKET_SIZE, -zbs ZERO_ALLGATHER_BUCKET_SIZE
-Size of allgather buckets used by ZeRO
---zero3-max-live-params ZERO3_MAX_LIVE_PARAMS, -zmlp ZERO3_MAX_LIVE_PARAMS
-Maximum number of parameters ZeRO3 keeps in GPU memory
---misc-mem-gb MISC_MEM_GB
-Miscellaneous memory overhead by DL framework(s), communication libraries, etc
---num-experts NUM_EXPERTS
-Number of experts
--ffn-expansion-factor FFN_EXPANSION_FACTOR, -ff FFN_EXPANSION_FACTOR
How much the MLP hidden size expands
+--infer whether we're doing inference
+--kv-size-ratio KV_SIZE_RATIO, -kv KV_SIZE_RATIO
+Ratio of total query heads to key/value heads. 1.0 for MHA, 1/num_attention_heads for MQA.
+--disable-mixed-precision
+Disables mixed precision training
+--high-prec-bytes-per-val HIGH_PREC_BYTES_PER_VAL
+The high-precision bytes per value (parameter, optimizer state, etc) in mixed precision
+--low-prec-bytes-per-val LOW_PREC_BYTES_PER_VAL
+The low-precision bytes per value (parameter, optimizer state, etc) in mixed precision
+--bytes-per-grad-ele BYTES_PER_GRAD_ELE
+The precision of gradient elements as bytes per value
+--num-experts NUM_EXPERTS
+Number of experts
--expert-parallelism EXPERT_PARALLELISM, -ep EXPERT_PARALLELISM
How many ways are the experts sharded across ranks
---vocab-size VOCAB_SIZE, -v VOCAB_SIZE
-How many ways are the experts sharded across ranks
+--misc-mem-gib MISC_MEM_GIB
+Miscellaneous memory overhead per GPU by DL framework(s), communication libraries, etc
```


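As a rough orientation for the numbers `calc_transformer_mem.py` reports: training memory is dominated by model states (weights, gradients, optimizer states) plus activations and framework overhead, and ZeRO spreads the model states across data-parallel ranks. The sketch below estimates only the model-state portion under mixed-precision Adam, loosely following the Transformers Math 101 post and the ZeRO paper linked in the README; the function and argument names are illustrative, not the script's internals.

```python
# Rough model-state memory estimate for mixed-precision Adam training with ZeRO
# sharding. Illustrative only: activations, tensor/pipeline parallelism, temporary
# buffers, and framework overhead are ignored here but handled by the script.

def model_state_gib(params: int, num_gpus: int = 1, zero_stage: int = 0) -> float:
    """Approximate per-GPU memory (GiB) for weights, gradients, and optimizer states."""
    weight_bytes = 2.0 * params   # low-precision (fp16/bf16) working copy of the weights
    grad_bytes = 2.0 * params     # low-precision gradients
    optim_bytes = 12.0 * params   # fp32 master weights + Adam momentum and variance

    # ZeRO progressively shards more state across the data-parallel ranks.
    if zero_stage >= 1:
        optim_bytes /= num_gpus
    if zero_stage >= 2:
        grad_bytes /= num_gpus
    if zero_stage >= 3:
        weight_bytes /= num_gpus

    return (weight_bytes + grad_bytes + optim_bytes) / 2**30

# e.g. a ~6.9B-parameter model on 128 GPUs with ZeRO stage 1
print(f"{model_state_gib(6_900_000_000, num_gpus=128, zero_stage=1):.1f} GiB of model states per GPU")
```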
calc/calc_transformer_flops.py (5 additions, 0 deletions)
@@ -51,6 +51,9 @@ def config_parser():
type=int,
default=1,
help='Top k routing for MoE')
parser.add_argument("--swiglu",
action="store_true",
help='Use swiglu MLP. If set, ffn-hidden-size is defined as the inner dimension of each of the three MLP weights.')
parser.add_argument("--batch-size", "-b",
type=int,
default=1,
@@ -85,6 +88,8 @@ def calc_params(args):
attention_over_values_flops = iter_factor * 2 * args.num_layers * args.tokens * args.sequence_length * args.hidden_size
linear_projection_flops = iter_factor * 2 * args.num_layers * args.tokens * args.hidden_size * args.hidden_size
ffn_flops = iter_factor * 16 * args.num_layers * args.tokens * args.hidden_size * args.hidden_size
+if args.swiglu:
+    ffn_flops = 3/2 * ffn_flops
# no activation checkpointing for embeddings
embedding_flops = 6 * args.tokens * args.hidden_size * args.vocab_size

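For context on the `--swiglu` branch added above: a SwiGLU MLP applies three weight matrices where the standard transformer MLP applies two, so at the same inner dimension the FFN matmul cost grows by 3/2, which is the multiplier applied to `ffn_flops`. A minimal sketch of that arithmetic (the 4x expansion and variable names here are assumptions for illustration, not taken from the script):

```python
# Why --swiglu multiplies ffn_flops by 3/2: per token and per layer, a standard MLP
# performs two matmuls (hidden -> ffn, ffn -> hidden), while SwiGLU performs three
# (gate, up, and down projections) at the same inner dimension.

hidden_size = 4096
ffn_hidden_size = 4 * hidden_size  # conventional 4x FFN expansion (assumption)

# 2 FLOPs per multiply-accumulate in a matmul
standard_ffn_flops = 2 * (2 * hidden_size * ffn_hidden_size)  # = 16 * hidden_size**2
swiglu_ffn_flops = 3 * (2 * hidden_size * ffn_hidden_size)    # = 24 * hidden_size**2

print(swiglu_ffn_flops / standard_ffn_flops)  # 1.5, the 3/2 factor applied in the diff
```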