From 6c9799a73ddb3bbf3562c9009385a2d6f7482bf4 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Sat, 6 May 2023 22:11:21 +0000
Subject: [PATCH 01/29] Move

---
 cacheflow/{master => core}/block_manager.py   | 0
 cacheflow/{master => core}/policy.py          | 0
 cacheflow/{master => core}/scheduler.py       | 0
 cacheflow/{master => core}/server.py          | 0
 cacheflow/{master => core}/simple_frontend.py | 0
 .../input_metadata.py                         | 0
 .../layers}/activation.py                     | 0
 .../layers}/attention.py                      | 0
 .../layers}/layernorm.py                      | 0
 .../layers}/sample.py                         | 0
 .../memory_analyzer.py                        | 0
 .../{models => model_executor}/model_utils.py | 0
 .../{ => model_executor}/models/__init__.py   | 0
 cacheflow/{ => model_executor}/models/gpt2.py | 0
 .../{ => model_executor}/models/gpt_neox.py   | 0
 .../{ => model_executor}/models/llama.py      | 0
 cacheflow/{ => model_executor}/models/opt.py  | 0
 .../parallel_utils/README.md                  | 0
 .../parallel_utils/__init__.py                | 0
 .../parallel_utils/parallel_state.py          | 0
 .../tensor_parallel/__init__.py               | 0
 .../parallel_utils/tensor_parallel/layers.py  | 0
 .../tensor_parallel/mappings.py               | 0
 .../parallel_utils/tensor_parallel/random.py  | 0
 .../parallel_utils/tensor_parallel/utils.py   | 0
 .../parallel_utils/utils.py                   | 0
 cacheflow/{models => model_executor}/utils.py | 0
 plot/plot_normalized_latency.py               | 212 ------------------
 plot/plot_stats.py                            |  52 -----
 29 files changed, 264 deletions(-)
 rename cacheflow/{master => core}/block_manager.py (100%)
 rename cacheflow/{master => core}/policy.py (100%)
 rename cacheflow/{master => core}/scheduler.py (100%)
 rename cacheflow/{master => core}/server.py (100%)
 rename cacheflow/{master => core}/simple_frontend.py (100%)
 rename cacheflow/{models => model_executor}/input_metadata.py (100%)
 rename cacheflow/{models => model_executor/layers}/activation.py (100%)
 rename cacheflow/{models => model_executor/layers}/attention.py (100%)
 rename cacheflow/{models => model_executor/layers}/layernorm.py (100%)
 rename cacheflow/{models => model_executor/layers}/sample.py (100%)
 rename cacheflow/{models => model_executor}/memory_analyzer.py (100%)
 rename cacheflow/{models => model_executor}/model_utils.py (100%)
 rename cacheflow/{ => model_executor}/models/__init__.py (100%)
 rename cacheflow/{ => model_executor}/models/gpt2.py (100%)
 rename cacheflow/{ => model_executor}/models/gpt_neox.py (100%)
 rename cacheflow/{ => model_executor}/models/llama.py (100%)
 rename cacheflow/{ => model_executor}/models/opt.py (100%)
 rename cacheflow/{ => model_executor}/parallel_utils/README.md (100%)
 rename cacheflow/{ => model_executor}/parallel_utils/__init__.py (100%)
 rename cacheflow/{ => model_executor}/parallel_utils/parallel_state.py (100%)
 rename cacheflow/{ => model_executor}/parallel_utils/tensor_parallel/__init__.py (100%)
 rename cacheflow/{ => model_executor}/parallel_utils/tensor_parallel/layers.py (100%)
 rename cacheflow/{ => model_executor}/parallel_utils/tensor_parallel/mappings.py (100%)
 rename cacheflow/{ => model_executor}/parallel_utils/tensor_parallel/random.py (100%)
 rename cacheflow/{ => model_executor}/parallel_utils/tensor_parallel/utils.py (100%)
 rename cacheflow/{ => model_executor}/parallel_utils/utils.py (100%)
 rename cacheflow/{models => model_executor}/utils.py (100%)
 delete mode 100644 plot/plot_normalized_latency.py
 delete mode 100644 plot/plot_stats.py

diff --git a/cacheflow/master/block_manager.py b/cacheflow/core/block_manager.py
similarity index 100%
rename from cacheflow/master/block_manager.py
rename to cacheflow/core/block_manager.py
diff --git a/cacheflow/master/policy.py b/cacheflow/core/policy.py
similarity index 100%
rename from cacheflow/master/policy.py
rename to cacheflow/core/policy.py
diff --git a/cacheflow/master/scheduler.py b/cacheflow/core/scheduler.py
similarity index 100%
rename from cacheflow/master/scheduler.py
rename to cacheflow/core/scheduler.py
diff --git a/cacheflow/master/server.py b/cacheflow/core/server.py
similarity index 100%
rename from cacheflow/master/server.py
rename to cacheflow/core/server.py
diff --git a/cacheflow/master/simple_frontend.py b/cacheflow/core/simple_frontend.py
similarity index 100%
rename from cacheflow/master/simple_frontend.py
rename to cacheflow/core/simple_frontend.py
diff --git a/cacheflow/models/input_metadata.py b/cacheflow/model_executor/input_metadata.py
similarity index 100%
rename from cacheflow/models/input_metadata.py
rename to cacheflow/model_executor/input_metadata.py
diff --git a/cacheflow/models/activation.py b/cacheflow/model_executor/layers/activation.py
similarity index 100%
rename from cacheflow/models/activation.py
rename to cacheflow/model_executor/layers/activation.py
diff --git a/cacheflow/models/attention.py b/cacheflow/model_executor/layers/attention.py
similarity index 100%
rename from cacheflow/models/attention.py
rename to cacheflow/model_executor/layers/attention.py
diff --git a/cacheflow/models/layernorm.py b/cacheflow/model_executor/layers/layernorm.py
similarity index 100%
rename from cacheflow/models/layernorm.py
rename to cacheflow/model_executor/layers/layernorm.py
diff --git a/cacheflow/models/sample.py b/cacheflow/model_executor/layers/sample.py
similarity index 100%
rename from cacheflow/models/sample.py
rename to cacheflow/model_executor/layers/sample.py
diff --git a/cacheflow/models/memory_analyzer.py b/cacheflow/model_executor/memory_analyzer.py
similarity index 100%
rename from cacheflow/models/memory_analyzer.py
rename to cacheflow/model_executor/memory_analyzer.py
diff --git a/cacheflow/models/model_utils.py b/cacheflow/model_executor/model_utils.py
similarity index 100%
rename from cacheflow/models/model_utils.py
rename to cacheflow/model_executor/model_utils.py
diff --git a/cacheflow/models/__init__.py b/cacheflow/model_executor/models/__init__.py
similarity index 100%
rename from cacheflow/models/__init__.py
rename to cacheflow/model_executor/models/__init__.py
diff --git a/cacheflow/models/gpt2.py b/cacheflow/model_executor/models/gpt2.py
similarity index 100%
rename from cacheflow/models/gpt2.py
rename to cacheflow/model_executor/models/gpt2.py
diff --git a/cacheflow/models/gpt_neox.py b/cacheflow/model_executor/models/gpt_neox.py
similarity index 100%
rename from cacheflow/models/gpt_neox.py
rename to cacheflow/model_executor/models/gpt_neox.py
diff --git a/cacheflow/models/llama.py b/cacheflow/model_executor/models/llama.py
similarity index 100%
rename from cacheflow/models/llama.py
rename to cacheflow/model_executor/models/llama.py
diff --git a/cacheflow/models/opt.py b/cacheflow/model_executor/models/opt.py
similarity index 100%
rename from cacheflow/models/opt.py
rename to cacheflow/model_executor/models/opt.py
diff --git a/cacheflow/parallel_utils/README.md b/cacheflow/model_executor/parallel_utils/README.md
similarity index 100%
rename from cacheflow/parallel_utils/README.md
rename to cacheflow/model_executor/parallel_utils/README.md
diff --git a/cacheflow/parallel_utils/__init__.py b/cacheflow/model_executor/parallel_utils/__init__.py
similarity index 100%
rename from cacheflow/parallel_utils/__init__.py
rename to cacheflow/model_executor/parallel_utils/__init__.py
diff --git a/cacheflow/parallel_utils/parallel_state.py b/cacheflow/model_executor/parallel_utils/parallel_state.py
similarity index 100%
rename from cacheflow/parallel_utils/parallel_state.py
rename to cacheflow/model_executor/parallel_utils/parallel_state.py
diff --git a/cacheflow/parallel_utils/tensor_parallel/__init__.py b/cacheflow/model_executor/parallel_utils/tensor_parallel/__init__.py
similarity index 100%
rename from cacheflow/parallel_utils/tensor_parallel/__init__.py
rename to cacheflow/model_executor/parallel_utils/tensor_parallel/__init__.py
diff --git a/cacheflow/parallel_utils/tensor_parallel/layers.py b/cacheflow/model_executor/parallel_utils/tensor_parallel/layers.py
similarity index 100%
rename from cacheflow/parallel_utils/tensor_parallel/layers.py
rename to cacheflow/model_executor/parallel_utils/tensor_parallel/layers.py
diff --git a/cacheflow/parallel_utils/tensor_parallel/mappings.py b/cacheflow/model_executor/parallel_utils/tensor_parallel/mappings.py
similarity index 100%
rename from cacheflow/parallel_utils/tensor_parallel/mappings.py
rename to cacheflow/model_executor/parallel_utils/tensor_parallel/mappings.py
diff --git a/cacheflow/parallel_utils/tensor_parallel/random.py b/cacheflow/model_executor/parallel_utils/tensor_parallel/random.py
similarity index 100%
rename from cacheflow/parallel_utils/tensor_parallel/random.py
rename to cacheflow/model_executor/parallel_utils/tensor_parallel/random.py
diff --git a/cacheflow/parallel_utils/tensor_parallel/utils.py b/cacheflow/model_executor/parallel_utils/tensor_parallel/utils.py
similarity index 100%
rename from cacheflow/parallel_utils/tensor_parallel/utils.py
rename to cacheflow/model_executor/parallel_utils/tensor_parallel/utils.py
diff --git a/cacheflow/parallel_utils/utils.py b/cacheflow/model_executor/parallel_utils/utils.py
similarity index 100%
rename from cacheflow/parallel_utils/utils.py
rename to cacheflow/model_executor/parallel_utils/utils.py
diff --git a/cacheflow/models/utils.py b/cacheflow/model_executor/utils.py
similarity index 100%
rename from cacheflow/models/utils.py
rename to cacheflow/model_executor/utils.py
diff --git a/plot/plot_normalized_latency.py b/plot/plot_normalized_latency.py
deleted file mode 100644
index 4bccc40fc2b1..000000000000
--- a/plot/plot_normalized_latency.py
+++ /dev/null
@@ -1,212 +0,0 @@
-import argparse
-import os
-import pickle
-from typing import Any, Dict, List, Optional, Tuple
-
-import matplotlib.pyplot as plt
-import numpy as np
-
-
-SYSTEMS = [
-    'orca-constant',
-    'orca-power2',
-    'orca-oracle',
-    'cacheflow',
-]
-
-SYSTEM_TO_LABEL = {
-    'orca-constant': 'Orca (Max)',
-    'orca-power2': 'Orca (Pow2)',
-    'orca-oracle': 'Orca (Oracle)',
-    'cacheflow': 'KVFlow',
-}
-
-SYSTEM_TO_COLOR = {
-    'orca-constant': 'red',
-    'orca-power2': 'orange',
-    'orca-oracle': 'green',
-    'cacheflow': 'blue',
-}
-
-SYSTEM_TO_MARKER = {
-    'orca-constant': 'x',
-    'orca-power2': '^',
-    'orca-oracle': 's',
-    'cacheflow': 'o',
-}
-
-
-def get_results(save_dir: str) -> List[Dict[str, Any]]:
-    with open(os.path.join(save_dir, 'sequences.pkl'), 'rb') as f:
-        results = pickle.load(f)
-    return results
-
-
-def get_request_rate(save_dir: str) -> float:
-    """Get request rate from save_dir name."""
-    # Directory name format:
-    # .../req-rate-{req_rate}/seed-{seed}/duration-{duration}
-    save_dir = os.path.abspath(save_dir)
-    dir_names = save_dir.split('/')
-
-    request_rate = None
-    for dir_name in dir_names:
-        if dir_name.startswith('req-rate-'):
-            if request_rate is not None:
-                raise ValueError(f'Found multiple request rates in {save_dir}')
-            request_rate = float(dir_name.split('-')[-1])
-    if request_rate is None:
-        raise ValueError(f'Cannot find request rate in {save_dir}')
-    return request_rate
-
-
-def get_model(save_dir: str) -> Tuple[str, int]:
-    save_dir = os.path.abspath(save_dir)
-    dir_names = save_dir.split('/')
-
-    model = None
-    for dir_name in dir_names:
-        if '-tp' in dir_name:
-            if model is not None:
-                raise ValueError(f'Found multiple models in {save_dir}')
-            model = dir_name.split('-tp')[0]
-            tp = int(dir_name.split('-tp')[-1])
-    if model is None:
-        raise ValueError(f'Cannot find model in {save_dir}')
-    return model, tp
-
-
-def get_system(save_dir: str) -> str:
-    save_dir = os.path.abspath(save_dir)
-    dir_names = save_dir.split('/')
-
-    for dir_name in dir_names:
-        if dir_name.startswith('orca-'):
-            return dir_name
-        if dir_name == 'cacheflow':
-            return dir_name
-    raise ValueError(f'Cannot find system in {save_dir}')
-
-
-def get_sampling(save_dir: str) -> str:
-    save_dir = os.path.abspath(save_dir)
-    dir_names = save_dir.split('/')
-
-    for dir_name in dir_names:
-        if dir_name.startswith('n'):
-            if dir_name.endswith('-beam'):
-                return dir_name
-            if dir_name[1:].isdigit():
-                return dir_name
-    raise ValueError(f'Cannot find sampling method in {save_dir}')
-
-
-def plot_normalized_latency(
-    exp_dir: str,
-    duration: int,
-    seed: int,
-    warmup: int,
-    xlim: Optional[float],
-    ylim: Optional[float],
-    log_scale: bool,
-    format: str,
-) -> None:
-    # Get leaf directories.
-    save_dirs = []
-    for root, dirs, files in os.walk(exp_dir):
-        if dirs:
-            continue
-        if 'sequences.pkl' not in files:
-            continue
-        if f'seed{seed}' not in root:
-            continue
-        if f'duration-{duration}' not in root:
-            continue
-        save_dirs.append(root)
-
-    # Plot normalized latency.
-    perf_per_system: Dict[str, Tuple[List[float], List[float]]] = {}
-    for save_dir in save_dirs:
-        per_seq_norm_latencies = []
-        results = get_results(save_dir)
-        for seq in results:
-            arrival_time = seq['arrival_time']
-            finish_time = seq['finish_time']
-            output_len = seq['output_len']
-            if arrival_time < warmup:
-                continue
-            latency = finish_time - arrival_time
-            norm_latency = latency / output_len
-            per_seq_norm_latencies.append(norm_latency)
-
-        request_rate = get_request_rate(save_dir)
-        normalized_latency = np.mean(per_seq_norm_latencies)
-        system_name = get_system(save_dir)
-        if system_name not in perf_per_system:
-            perf_per_system[system_name] = ([], [])
-        perf_per_system[system_name][0].append(request_rate)
-        perf_per_system[system_name][1].append(normalized_latency)
-
-        print('#seqs', len(per_seq_norm_latencies))
-        print(f'{save_dir}: {normalized_latency:.3f} s')
-
-
-    # Plot normalized latency.
-    plt.figure(figsize=(6, 4))
-    for system_name in reversed(SYSTEMS):
-        if system_name not in perf_per_system:
-            continue
-        # Sort by request rate.
-        request_rates, normalized_latencies = perf_per_system[system_name]
-        request_rates, normalized_latencies = zip(*sorted(zip(request_rates, normalized_latencies)))
-        label = SYSTEM_TO_LABEL[system_name]
-        color = SYSTEM_TO_COLOR[system_name]
-        marker = SYSTEM_TO_MARKER[system_name]
-        plt.plot(request_rates, normalized_latencies, label=label, color=color, marker=marker)
-
-    # plt.legend()
-    plt.xlabel('Request rate (req/s)', fontsize=12)
-    plt.ylabel('Normalized latency (s/token)', fontsize=12)
-
-    if log_scale:
-        plt.yscale('log')
-    if xlim is not None:
-        plt.xlim(left=0, right=xlim)
-    if ylim is not None:
-        if log_scale:
-            plt.ylim(top=ylim)
-        else:
-            plt.ylim(bottom=0, top=ylim)
-
-    handles, labels = plt.gca().get_legend_handles_labels()
-    handles = reversed(handles)
-    labels = reversed(labels)
-
-    plt.legend(
-        handles, labels,
-        ncol=4, fontsize=12, loc='upper center', bbox_to_anchor=(0.5, 1.15),
-        columnspacing=0.5, handletextpad=0.5, handlelength=1.5, frameon=False, borderpad=0)
-
-    # Save figure.
-    model, tp = get_model(exp_dir)
-    sampling = get_sampling(exp_dir)
-    figname = f'{model}-tp{tp}-{sampling}.{format}'
-    os.makedirs('./figures', exist_ok=True)
-    plt.savefig(os.path.join('figures', figname), bbox_inches='tight')
-    print(f'Saved figure to ./figures/{figname}')
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument('exp_dir', type=str)
-    parser.add_argument('--duration', type=int, required=True)
-    parser.add_argument('--seed', type=int, default=0)
-    parser.add_argument('--warmup', type=int, default=60)
-    parser.add_argument('--xlim', type=float, required=False, default=None)
-    parser.add_argument('--ylim', type=float, required=False, default=None)
-    parser.add_argument('--log', action='store_true')
-    parser.add_argument('--format', choices=['png', 'pdf'], default='png')
-    args = parser.parse_args()
-
-    plot_normalized_latency(
-        args.exp_dir, args.duration, args.seed, args.warmup, args.xlim, args.ylim, args.log, args.format)
diff --git a/plot/plot_stats.py b/plot/plot_stats.py
deleted file mode 100644
index c391571403ec..000000000000
--- a/plot/plot_stats.py
+++ /dev/null
@@ -1,52 +0,0 @@
-import os
-import pickle
-
-import matplotlib.pyplot as plt
-
-STAT_NAMES = [
-    'input_lens',
-    'num_running',
-    'num_waiting',
-    'num_preemption',
-    'gpu_cache_usage',
-    'cpu_cache_usage',
-    'num_swapped',
-    'swap_in_lens',
-    'swap_out_lens',
-]
-
-
-def plot_stats(output_dir: str):
-    # Get stats.
-    with open(os.path.join(output_dir, 'stats.pkl'), 'rb') as f:
-        stats = pickle.load(f)
-    timestamps = stats['timestamps']
-
-    # Draw one figure for each stat.
-    num_stats = len(STAT_NAMES)
-    COLORS = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'orange', 'purple', 'pink', 'brown', 'gray']
-    fig, axs = plt.subplots(num_stats, 1, figsize=(10, 2 * num_stats))
-    for i, stat in enumerate(STAT_NAMES):
-        data = stats[stat]
-        if stat in ['gpu_cache_usage', 'cpu_cache_usage']:
-            data = [x * 100 for x in data]
-            stat = stat + ' (%)'
-        axs[i].plot(timestamps, data, color=COLORS[i % len(COLORS)])
-        axs[i].set_ylabel(stat.replace('_', ' '), fontdict={'fontsize': 12})
-        axs[i].set_ylim(bottom=0)
-
-    plt.xlabel('Time (s)')
-    plt.tight_layout()
-    fig_path = os.path.join(output_dir, 'stats.png')
-    plt.savefig(fig_path)
-    print(f'Saved stats to {fig_path}')
-
-
-if __name__ == '__main__':
-    import argparse
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument('output_dir', type=str, help='Output directory.')
-    args = parser.parse_args()
-
-    plot_stats(args.output_dir)

From bacf49c4e0b3a11c9949693205bffe56f0d1c4a2 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Sat, 6 May 2023 22:28:49 +0000
Subject: [PATCH 02/29] http_frontend -> frontend

---
 cacheflow/{http_frontend => frontend}/fastapi_frontend.py | 0
 cacheflow/{http_frontend => frontend}/gradio_webserver.py | 0
 cacheflow/{http_frontend => frontend}/test_cli_client.py  | 0
 3 files changed, 0 insertions(+), 0 deletions(-)
 rename cacheflow/{http_frontend => frontend}/fastapi_frontend.py (100%)
 rename cacheflow/{http_frontend => frontend}/gradio_webserver.py (100%)
 rename cacheflow/{http_frontend => frontend}/test_cli_client.py (100%)

diff --git a/cacheflow/http_frontend/fastapi_frontend.py b/cacheflow/frontend/fastapi_frontend.py
similarity index 100%
rename from cacheflow/http_frontend/fastapi_frontend.py
rename to cacheflow/frontend/fastapi_frontend.py
diff --git a/cacheflow/http_frontend/gradio_webserver.py b/cacheflow/frontend/gradio_webserver.py
similarity index 100%
rename from cacheflow/http_frontend/gradio_webserver.py
rename to cacheflow/frontend/gradio_webserver.py
diff --git a/cacheflow/http_frontend/test_cli_client.py b/cacheflow/frontend/test_cli_client.py
similarity index 100%
rename from cacheflow/http_frontend/test_cli_client.py
rename to cacheflow/frontend/test_cli_client.py

From a16c1f3d47c3d45bd30e942d6bf683f0b9fcf27d Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Sat, 6 May 2023 23:20:16 +0000
Subject: [PATCH 03/29] Move

---
 cacheflow/{core => frontend}/simple_frontend.py               | 0
 cacheflow/frontend/gradio_webserver.py => gradio_webserver.py | 0
 cacheflow/frontend/test_cli_client.py => test_cli_client.py   | 0
 3 files changed, 0 insertions(+), 0 deletions(-)
 rename cacheflow/{core => frontend}/simple_frontend.py (100%)
 rename cacheflow/frontend/gradio_webserver.py => gradio_webserver.py (100%)
 rename cacheflow/frontend/test_cli_client.py => test_cli_client.py (100%)

diff --git a/cacheflow/core/simple_frontend.py b/cacheflow/frontend/simple_frontend.py
similarity index 100%
rename from cacheflow/core/simple_frontend.py
rename to cacheflow/frontend/simple_frontend.py
diff --git a/cacheflow/frontend/gradio_webserver.py b/gradio_webserver.py
similarity index 100%
rename from cacheflow/frontend/gradio_webserver.py
rename to gradio_webserver.py
diff --git a/cacheflow/frontend/test_cli_client.py b/test_cli_client.py
similarity index 100%
rename from cacheflow/frontend/test_cli_client.py
rename to test_cli_client.py

From 9b868d1fdb575adb8835a3eb7360d00e428c8214 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Sat, 6 May 2023 23:57:35 +0000
Subject: [PATCH 04/29] Move controller

---
 cacheflow/{worker => core}/controller.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename cacheflow/{worker => core}/controller.py (100%)

diff --git a/cacheflow/worker/controller.py b/cacheflow/core/controller.py
similarity index 100%
rename from cacheflow/worker/controller.py
rename to cacheflow/core/controller.py

From b2ff569d399cd602b46d06bc185eee7fd2fd67c3 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Sun, 7 May 2023 19:54:15 +0000
Subject: [PATCH 05/29] Minor

---
 cacheflow/model_executor/models/gpt_neox.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/cacheflow/model_executor/models/gpt_neox.py b/cacheflow/model_executor/models/gpt_neox.py
index fb85e2f72454..2d3c25b7125e 100644
--- a/cacheflow/model_executor/models/gpt_neox.py
+++ b/cacheflow/model_executor/models/gpt_neox.py
@@ -3,6 +3,7 @@
 
 import torch
 from torch import nn
+from transformers import GPTNeoXConfig
 
 from cacheflow.models import InputMetadata
 from cacheflow.models.attention import GPTNeoXCacheFlowAttention
@@ -21,7 +22,7 @@
 
 class GPTNeoXAttention(nn.Module):
 
-    def __init__(self, config):
+    def __init__(self, config: GPTNeoXConfig):
         super().__init__()
         self.total_num_heads = config.num_attention_heads
         self.hidden_size = config.hidden_size
@@ -63,7 +64,7 @@ def forward(
 
 class GPTNeoXMLP(nn.Module):
 
-    def __init__(self, config):
+    def __init__(self, config: GPTNeoXConfig):
         super().__init__()
         self.dense_h_to_4h = ColumnParallelLinear(config.hidden_size,
                                                   config.intermediate_size,
@@ -86,7 +87,7 @@ def forward(self, hidden_states):
 
 class GPTNeoXLayer(nn.Module):
 
-    def __init__(self, config):
+    def __init__(self, config: GPTNeoXConfig):
         super().__init__()
         self.use_parallel_residual = config.use_parallel_residual
         self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
@@ -129,7 +130,7 @@ def forward(
 
 class GPTNeoXModel(nn.Module):
 
-    def __init__(self, config):
+    def __init__(self, config: GPTNeoXConfig):
         super().__init__()
         self.config = config

From 38b946bfbfa1208399a7aa3bb636629b2ee53737 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Sun, 7 May 2023 21:19:08 +0000
Subject: [PATCH 06/29] Fix import errors

---
 cacheflow/core/controller.py          | 2 +-
 cacheflow/core/scheduler.py           | 4 ++--
 cacheflow/frontend/simple_frontend.py | 2 +-
 simple_server.py                      | 3 ++-
 4 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/cacheflow/core/controller.py b/cacheflow/core/controller.py
index e6b148204c4c..462247273ad6 100644
--- a/cacheflow/core/controller.py
+++ b/cacheflow/core/controller.py
@@ -5,7 +5,7 @@
 except ImportError:
     ray = None
 
-from cacheflow.master.scheduler import Scheduler
+from cacheflow.core.scheduler import Scheduler
 from cacheflow.sequence import SequenceGroupInputs
 from cacheflow.worker.worker import Worker
 
diff --git a/cacheflow/core/scheduler.py b/cacheflow/core/scheduler.py
index da461798bb6e..17c5c6d62a30 100644
--- a/cacheflow/core/scheduler.py
+++ b/cacheflow/core/scheduler.py
@@ -4,8 +4,8 @@
 import time
 from typing import Any, Dict, List, Optional, Tuple
 
-from cacheflow.master.block_manager import BlockSpaceManager
-from cacheflow.master.policy import PolicyFactory
+from cacheflow.core.block_manager import BlockSpaceManager
+from cacheflow.core.policy import PolicyFactory
 from cacheflow.sampling_params import SamplingParams
 from cacheflow.sequence import Sequence
 from cacheflow.sequence import SequenceGroup
diff --git a/cacheflow/frontend/simple_frontend.py b/cacheflow/frontend/simple_frontend.py
index f8396269874f..fb239b4a5cbf 100644
--- a/cacheflow/frontend/simple_frontend.py
+++ b/cacheflow/frontend/simple_frontend.py
@@ -1,5 +1,5 @@
 import time
-from typing import List, Optional, Set, Tuple
+from typing import List, Optional, Tuple
 
 from transformers import AutoTokenizer
 
diff --git a/simple_server.py b/simple_server.py
index 4df46dc16226..66e8fbd8a80d 100644
--- a/simple_server.py
+++ b/simple_server.py
@@ -1,11 +1,12 @@
 import argparse
 from typing import List
 
-from cacheflow.master.server import (
+from cacheflow.core.server import (
     add_server_arguments, process_server_arguments,
     init_local_server_and_frontend_with_arguments)
 from cacheflow.sampling_params import SamplingParams
 
+
 def main(args: argparse.Namespace):
     server, frontend = init_local_server_and_frontend_with_arguments(args)
     # Test the following inputs.

From 62c717555378b31b7157b94880583ee248f117d6 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Sun, 7 May 2023 21:36:18 +0000
Subject: [PATCH 07/29] Move controller back to worker

---
 cacheflow/{core => worker}/controller.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename cacheflow/{core => worker}/controller.py (100%)

diff --git a/cacheflow/core/controller.py b/cacheflow/worker/controller.py
similarity index 100%
rename from cacheflow/core/controller.py
rename to cacheflow/worker/controller.py

From 26aafdcf628197d121ec91438c83f831a784c265 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Sun, 7 May 2023 21:59:02 +0000
Subject: [PATCH 08/29] Rename

---
 .../{model_utils.py => model_loader.py}  |  14 +--
 cacheflow/model_executor/weight_utils.py | 101 ++++++++++++++++++
 2 files changed, 106 insertions(+), 9 deletions(-)
 rename cacheflow/model_executor/{model_utils.py => model_loader.py} (86%)
 create mode 100644 cacheflow/model_executor/weight_utils.py

diff --git a/cacheflow/model_executor/model_utils.py b/cacheflow/model_executor/model_loader.py
similarity index 86%
rename from cacheflow/model_executor/model_utils.py
rename to cacheflow/model_executor/model_loader.py
index ca838d5e36d2..5422b9031b84 100644
--- a/cacheflow/model_executor/model_utils.py
+++ b/cacheflow/model_executor/model_loader.py
@@ -5,15 +5,11 @@
 from transformers import AutoConfig
 from transformers import PretrainedConfig
 
-from cacheflow.models.memory_analyzer import CacheFlowMemoryAnalyzer
-from cacheflow.models.memory_analyzer import GPT2MemoryAnalyzer
-from cacheflow.models.memory_analyzer import GPTNeoXMemoryAnalyzer
-from cacheflow.models.memory_analyzer import LlamaMemoryAnalyzer
-from cacheflow.models.memory_analyzer import OPTMemoryAnalyzer
-from cacheflow.models.gpt2 import GPT2LMHeadModel
-from cacheflow.models.gpt_neox import GPTNeoXForCausalLM
-from cacheflow.models.llama import LlamaForCausalLM
-from cacheflow.models.opt import OPTForCausalLM
+from cacheflow.model_executor.memory_analyzer import (
+    CacheFlowMemoryAnalyzer, GPT2MemoryAnalyzer, GPTNeoXMemoryAnalyzer,
+    LlamaMemoryAnalyzer, OPTMemoryAnalyzer)
+from cacheflow.model_executor.models import (
+    GPT2LMHeadModel, GPTNeoXForCausalLM, LlamaForCausalLM, OPTForCausalLM)
 from cacheflow.models.utils import get_torch_dtype
 
diff --git a/cacheflow/model_executor/weight_utils.py b/cacheflow/model_executor/weight_utils.py
new file mode 100644
index 000000000000..80d2f0a5ea81
--- /dev/null
+++ b/cacheflow/model_executor/weight_utils.py
@@ -0,0 +1,101 @@
+import filelock
+import glob
+import json
+import os
+from typing import Optional
+
+from huggingface_hub import snapshot_download
+import numpy as np
+import torch
+from tqdm.auto import tqdm
+
+
+class Disabledtqdm(tqdm):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs, disable=True)
+
+
+def hf_model_weights_iterator(
+    model_name_or_path: str,
+    cache_dir: Optional[str] = None,
+    use_np_cache: bool = False,
+):
+    # Prepare file lock directory to prevent multiple processes from
+    # downloading the same model weights at the same time.
+    lock_dir = cache_dir if cache_dir is not None else "/tmp"
+    lock_file_name = model_name_or_path.replace("/", "-") + ".lock"
+    lock = filelock.FileLock(os.path.join(lock_dir, lock_file_name))
+
+    # Download model weights from huggingface.
+    is_local = os.path.isdir(model_name_or_path)
+    if not is_local:
+        with lock:
+            hf_folder = snapshot_download(model_name_or_path,
+                                          allow_patterns="*.bin",
+                                          cache_dir=cache_dir,
+                                          tqdm_class=Disabledtqdm)
+    else:
+        hf_folder = model_name_or_path
+
+    hf_bin_files = glob.glob(os.path.join(hf_folder, "*.bin"))
+
+    if use_np_cache:
+        # Convert the model weights from torch tensors to numpy arrays for
+        # faster loading.
+        np_folder = os.path.join(hf_folder, 'np')
+        os.makedirs(np_folder, exist_ok=True)
+        weight_names_file = os.path.join(np_folder, 'weight_names.json')
+        with lock:
+            if not os.path.exists(weight_names_file):
+                weight_names = []
+                for bin_file in hf_bin_files:
+                    state = torch.load(bin_file, map_location="cpu")
+                    for name, param in state.items():
+                        param_path = os.path.join(np_folder, name)
+                        with open(param_path, "wb") as f:
+                            np.save(f, param.cpu().detach().numpy())
+                        weight_names.append(name)
+                with open(weight_names_file, 'w') as f:
+                    json.dump(weight_names, f)
+
+        with open(weight_names_file, 'r') as f:
+            weight_names = json.load(f)
+
+        for name in weight_names:
+            param_path = os.path.join(np_folder, name)
+            with open(param_path, "rb") as f:
+                param = np.load(f)
+            yield name, torch.from_numpy(param)
+    else:
+        for bin_file in hf_bin_files:
+            state = torch.load(bin_file, map_location="cpu")
+            for name, param in state.items():
+                yield name, param
+
+
+# TODO(woosuk): Annotate the types.
+def load_tensor_parallel_weights(
+    param: torch.Tensor,
+    loaded_weight,
+    param_name,
+    column_parallel_weight_names,
+    row_parallel_weight_names,
+    tensor_model_parallel_rank: int,
+) -> None:
+    for p in column_parallel_weight_names:
+        if p in param_name:
+            shard_size = param.shape[0]
+            loaded_weight = loaded_weight[
+                shard_size * tensor_model_parallel_rank
+                :shard_size * (tensor_model_parallel_rank + 1)]
+            break
+    for p in row_parallel_weight_names:
+        if p in param_name:
+            shard_size = param.shape[1]
+            loaded_weight = loaded_weight[
+                :,
+                shard_size * tensor_model_parallel_rank
+                :shard_size * (tensor_model_parallel_rank + 1)]
+            break
+    assert param.shape == loaded_weight.shape
+    param.data.copy_(loaded_weight)

From 8419df6b1fc418540f1df51230771ebf136376a5 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Sun, 7 May 2023 22:07:22 +0000
Subject: [PATCH 09/29] mv

---
 cacheflow/model_executor/utils.py | 95 +------------------
 1 file changed, 1 insertion(+), 94 deletions(-)

diff --git a/cacheflow/model_executor/utils.py b/cacheflow/model_executor/utils.py
index ced58c54a0c8..04a02189ae49 100644
--- a/cacheflow/model_executor/utils.py
+++ b/cacheflow/model_executor/utils.py
@@ -1,15 +1,6 @@
-import os
-import glob
-import json
-import filelock
-from typing import Union, Optional
+from typing import Union
 
-import numpy as np
 import torch
-from tqdm.auto import tqdm
-from huggingface_hub import snapshot_download
-from cacheflow.parallel_utils.parallel_state import (
-    get_tensor_model_parallel_rank)
 
 
 _STR_DTYPE_TO_TORCH_DTYPE = {
@@ -32,87 +23,3 @@ def get_torch_dtype(dtype: Union[torch.dtype, str]) -> torch.dtype:
 def get_dtype_size(dtype: Union[torch.dtype, str]) -> int:
     torch_dtype = get_torch_dtype(dtype)
     return torch.tensor([], dtype=torch_dtype).element_size()
-
-
-class Disabledtqdm(tqdm):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs, disable=True)
-
-
-def hf_model_weights_iterator(model_name_or_path: str,
-                              cache_dir: Optional[str] = None,
-                              use_np_cache: bool = False):
-    # Prepare file lock directory to prevent multiple processes from
-    # downloading the same model weights at the same time.
-    lock_dir = cache_dir if cache_dir is not None else "/tmp"
-    lock_file_name = model_name_or_path.replace("/", "-") + ".lock"
-    lock = filelock.FileLock(os.path.join(lock_dir, lock_file_name))
-
-    # Download model weights from huggingface.
-    is_local = os.path.isdir(model_name_or_path)
-    if not is_local:
-        with lock:
-            hf_folder = snapshot_download(model_name_or_path,
-                                          allow_patterns="*.bin",
-                                          cache_dir=cache_dir,
-                                          tqdm_class=Disabledtqdm)
-    else:
-        hf_folder = model_name_or_path
-
-    hf_bin_files = glob.glob(os.path.join(hf_folder, "*.bin"))
-
-    if use_np_cache:
-        # Convert the model weights from torch tensors to numpy arrays for
-        # faster loading.
-        np_folder = os.path.join(hf_folder, 'np')
-        os.makedirs(np_folder, exist_ok=True)
-        weight_names_file = os.path.join(np_folder, 'weight_names.json')
-        with lock:
-            if not os.path.exists(weight_names_file):
-                weight_names = []
-                for bin_file in hf_bin_files:
-                    state = torch.load(bin_file, map_location="cpu")
-                    for name, param in state.items():
-                        param_path = os.path.join(np_folder, name)
-                        with open(param_path, "wb") as f:
-                            np.save(f, param.cpu().detach().numpy())
-                        weight_names.append(name)
-                with open(weight_names_file, 'w') as f:
-                    json.dump(weight_names, f)
-
-        with open(weight_names_file, 'r') as f:
-            weight_names = json.load(f)
-
-        for name in weight_names:
-            param_path = os.path.join(np_folder, name)
-            with open(param_path, "rb") as f:
-                param = np.load(f)
-            yield name, torch.from_numpy(param)
-    else:
-        for bin_file in hf_bin_files:
-            state = torch.load(bin_file, map_location="cpu")
-            for name, param in state.items():
-                yield name, param
-
-
-def load_tensor_parallel_weights(param, loaded_weight, param_name,
-                                 column_parallel_weight_names,
-                                 row_parallel_weight_names):
-    tensor_model_parallel_rank = get_tensor_model_parallel_rank()
-    for p in column_parallel_weight_names:
-        if p in param_name:
-            shard_size = param.shape[0]
-            loaded_weight = loaded_weight[
-                shard_size * tensor_model_parallel_rank
-                :shard_size * (tensor_model_parallel_rank + 1)]
-            break
-    for p in row_parallel_weight_names:
-        if p in param_name:
-            shard_size = param.shape[1]
-            loaded_weight = loaded_weight[
-                :,
-                shard_size * tensor_model_parallel_rank
-                :shard_size * (tensor_model_parallel_rank + 1)]
-            break
-    assert param.shape == loaded_weight.shape
-    param.data.copy_(loaded_weight)

From d9520b4ea449c48b43c1ba8af59af79ce12414eb Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Sun, 7 May 2023 22:07:36 +0000
Subject: [PATCH 10/29] Add __init__.py

---
 cacheflow/model_executor/models/__init__.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/cacheflow/model_executor/models/__init__.py b/cacheflow/model_executor/models/__init__.py
index 511a6822214c..8accaa6c13dd 100644
--- a/cacheflow/model_executor/models/__init__.py
+++ b/cacheflow/model_executor/models/__init__.py
@@ -1,10 +1,12 @@
-from cacheflow.models.input_metadata import InputMetadata
-from cacheflow.models.model_utils import get_memory_analyzer
-from cacheflow.models.model_utils import get_model
+from cacheflow.model_executor.models.gpt_neox import GPTNeoXForCausalLM
+from cacheflow.model_executor.models.gpt2 import GPT2LMHeadModel
+from cacheflow.model_executor.models.llama import LlamaForCausalLM
+from cacheflow.model_executor.models.opt import OPTForCausalLM
 
 
 __all__ = [
-    'InputMetadata',
-    'get_memory_analyzer',
-    'get_model',
+    "GPT2LMHeadModel",
+    "GPTNeoXForCausalLM",
+    "LlamaForCausalLM",
+    "OPTForCausalLM",
 ]

From 61bf05fde36e83949b9bf36d9adba5ccf9dcc46d Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Sun, 7 May 2023 22:08:36 +0000
Subject: [PATCH 11/29] Minor

---
 cacheflow/model_executor/model_loader.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cacheflow/model_executor/model_loader.py b/cacheflow/model_executor/model_loader.py
index 5422b9031b84..5b57e5774d8d 100644
--- a/cacheflow/model_executor/model_loader.py
+++ b/cacheflow/model_executor/model_loader.py
@@ -10,7 +10,7 @@
     LlamaMemoryAnalyzer, OPTMemoryAnalyzer)
 from cacheflow.model_executor.models import (
     GPT2LMHeadModel, GPTNeoXForCausalLM, LlamaForCausalLM, OPTForCausalLM)
-from cacheflow.models.utils import get_torch_dtype
+from cacheflow.model_executor.utils import get_torch_dtype
 
 
 _MODELS = {

From d9520b4ea449c48b43c1ba8af59af79ce12414eb Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Sun, 7 May 2023 22:14:09 +0000
Subject: [PATCH 12/29] Move set_random_seeds

---
 cacheflow/model_executor/utils.py | 16 ++++++++++++++++
 cacheflow/utils.py                | 18 +-----------------
 2 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/cacheflow/model_executor/utils.py b/cacheflow/model_executor/utils.py
index 04a02189ae49..72f76a25c87e 100644
--- a/cacheflow/model_executor/utils.py
+++ b/cacheflow/model_executor/utils.py
@@ -1,7 +1,12 @@
+import random
 from typing import Union
 
+import numpy as np
 import torch
 
+from cacheflow.model_executor.parallel_utils.parallel_state import model_parallel_is_initialized
+from cacheflow.model_executor.parallel_utils.tensor_parallel import model_parallel_cuda_manual_seed
+
 
 _STR_DTYPE_TO_TORCH_DTYPE = {
     'half': torch.half,
@@ -23,3 +28,14 @@ def get_torch_dtype(dtype: Union[torch.dtype, str]) -> torch.dtype:
 def get_dtype_size(dtype: Union[torch.dtype, str]) -> int:
     torch_dtype = get_torch_dtype(dtype)
     return torch.tensor([], dtype=torch_dtype).element_size()
+
+
+def set_random_seed(seed: int) -> None:
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed_all(seed)
+
+    if model_parallel_is_initialized():
+        model_parallel_cuda_manual_seed(seed)
diff --git a/cacheflow/utils.py b/cacheflow/utils.py
index 725a4d19a54f..e586707cd876 100644
--- a/cacheflow/utils.py
+++ b/cacheflow/utils.py
@@ -1,13 +1,8 @@
 import enum
-import random
-import psutil
 
-import numpy as np
+import psutil
 import torch
 
-from cacheflow.parallel_utils.parallel_state import model_parallel_is_initialized
-from cacheflow.parallel_utils.tensor_parallel import model_parallel_cuda_manual_seed
-
 
 class Device(enum.Enum):
     GPU = enum.auto()
@@ -28,17 +23,6 @@ def reset(self) -> None:
         self.counter = 0
 
 
-def set_random_seed(seed: int):
-    random.seed(seed)
-    np.random.seed(seed)
-    torch.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed_all(seed)
-
-    if model_parallel_is_initialized():
-        model_parallel_cuda_manual_seed(seed)
-
-
 def get_gpu_memory(gpu: int = 0) -> int:
     return torch.cuda.get_device_properties(gpu).total_memory

From b6f6d4c42259a1ff9d2221d28ae11b93a4e5369d Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Sun, 7 May 2023 22:15:20 +0000
Subject: [PATCH 13/29] Fix imports

---
 cacheflow/frontend/fastapi_frontend.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/cacheflow/frontend/fastapi_frontend.py b/cacheflow/frontend/fastapi_frontend.py
index 55a5e2ee6e75..e5c116d7b76d 100644
--- a/cacheflow/frontend/fastapi_frontend.py
+++ b/cacheflow/frontend/fastapi_frontend.py
@@ -1,22 +1,22 @@
 import argparse
 import asyncio
+import json
 import time
 from typing import List, Dict, Optional
-import json
 
-import ray
-from transformers import AutoTokenizer
 from fastapi import FastAPI, Request
 from fastapi.responses import StreamingResponse
+import ray
+from transformers import AutoTokenizer
 import uvicorn
 
+from cacheflow.core.server import (Server, add_server_arguments,
+                                   process_server_arguments,
+                                   initialize_cluster)
 from cacheflow.sampling_params import SamplingParams
 from cacheflow.sequence import Sequence, SequenceGroup
-from cacheflow.master.server import (Server, add_server_arguments,
-                                     process_server_arguments,
-                                     initialize_cluster)
-from cacheflow.worker.controller import DeviceID
 from cacheflow.utils import Counter, get_gpu_memory, get_cpu_memory
+from cacheflow.worker.controller import DeviceID
 
 TIMEOUT_TO_PREVENT_DEADLOCK = 1  # seconds
 app = FastAPI()

From a25b37de20096aa3175a5b5e2ffc8d1c96003aa3 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Sun, 7 May 2023 22:46:05 +0000
Subject: [PATCH 14/29] Extract out initialize_dummy_weights

---
 cacheflow/model_executor/model_loader.py    | 3 ++-
 cacheflow/model_executor/models/gpt2.py     | 4 ----
 cacheflow/model_executor/models/gpt_neox.py | 4 ----
 cacheflow/model_executor/models/llama.py    | 4 ----
 cacheflow/model_executor/models/opt.py      | 4 ----
 cacheflow/model_executor/weight_utils.py    | 9 +++++++++
 6 files changed, 11 insertions(+), 17 deletions(-)

diff --git a/cacheflow/model_executor/model_loader.py b/cacheflow/model_executor/model_loader.py
index 5b57e5774d8d..1af2c606c797 100644
--- a/cacheflow/model_executor/model_loader.py
+++ b/cacheflow/model_executor/model_loader.py
@@ -11,6 +11,7 @@
 from cacheflow.model_executor.models import (
     GPT2LMHeadModel, GPTNeoXForCausalLM, LlamaForCausalLM, OPTForCausalLM)
 from cacheflow.model_executor.utils import get_torch_dtype
+from cacheflow.model_executor.weight_utils import initialize_dummy_weights
 
 
 _MODELS = {
@@ -73,7 +74,7 @@ def get_model(
             model = model.cuda()
             # NOTE(woosuk): For precise performance evaluation, we assign
             # random values to the weights.
-            model.initialize_dummy_weights()
+            initialize_dummy_weights(model)
     else:
         # Create a model instance.
         model = model_class(config)
diff --git a/cacheflow/model_executor/models/gpt2.py b/cacheflow/model_executor/models/gpt2.py
index 1b30ced28aa6..6ec2ffcfc473 100644
--- a/cacheflow/model_executor/models/gpt2.py
+++ b/cacheflow/model_executor/models/gpt2.py
@@ -259,7 +259,3 @@ def load_weights(self, model_name_or_path: str,
             load_tensor_parallel_weights(param, loaded_weight, name,
                                          self._column_parallel_weights,
                                          self._row_parallel_weights)
-
-    def initialize_dummy_weights(self) -> None:
-        for param in self.state_dict().values():
-            param.data.uniform_(-1e-3, 1e-3)
diff --git a/cacheflow/model_executor/models/gpt_neox.py b/cacheflow/model_executor/models/gpt_neox.py
index 2d3c25b7125e..15e37f228b13 100644
--- a/cacheflow/model_executor/models/gpt_neox.py
+++ b/cacheflow/model_executor/models/gpt_neox.py
@@ -229,7 +229,3 @@ def load_weights(self, model_name_or_path: str,
             load_tensor_parallel_weights(param, loaded_weight, name,
                                          self._column_parallel_weights,
                                          self._row_parallel_weights)
-
-    def initialize_dummy_weights(self) -> None:
-        for param in self.state_dict().values():
-            param.data.uniform_(-1e-3, 1e-3)
diff --git a/cacheflow/model_executor/models/llama.py b/cacheflow/model_executor/models/llama.py
index 0669742d1c64..e9e3e71b92f2 100644
--- a/cacheflow/model_executor/models/llama.py
+++ b/cacheflow/model_executor/models/llama.py
@@ -264,7 +264,3 @@ def load_weights(self, model_name_or_path: str,
             load_tensor_parallel_weights(param, loaded_weight, name,
                                          self._column_parallel_weights,
                                          self._row_parallel_weights)
-
-    def initialize_dummy_weights(self) -> None:
-        for param in self.state_dict().values():
-            param.data.uniform_(-1e-3, 1e-3)
diff --git a/cacheflow/model_executor/models/opt.py b/cacheflow/model_executor/models/opt.py
index 4f1e729cdcd2..2f318df103f1 100644
--- a/cacheflow/model_executor/models/opt.py
+++ b/cacheflow/model_executor/models/opt.py
@@ -289,7 +289,3 @@ def load_weights(self, model_name_or_path: str,
             load_tensor_parallel_weights(param, loaded_weight, name,
                                          self._column_parallel_weights,
                                          self._row_parallel_weights)
-
-    def initialize_dummy_weights(self) -> None:
-        for param in self.state_dict().values():
-            param.data.uniform_(-1e-3, 1e-3)
diff --git a/cacheflow/model_executor/weight_utils.py b/cacheflow/model_executor/weight_utils.py
index 80d2f0a5ea81..7814de643154 100644
--- a/cacheflow/model_executor/weight_utils.py
+++ b/cacheflow/model_executor/weight_utils.py
@@ -99,3 +99,12 @@ def load_tensor_parallel_weights(
             break
     assert param.shape == loaded_weight.shape
     param.data.copy_(loaded_weight)
+
+
+def initialize_dummy_weights(
+    model: torch.nn.Module,
+    low: float = -1e-3,
+    high: float = 1e-3,
+) -> None:
+    for param in model.state_dict().values():
+        param.data.uniform_(low, high)

From e2ea5cc19cc4836bd43d371622e772a659ace7a7 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Sun, 7 May 2023 22:56:09 +0000
Subject: [PATCH 15/29] Minor

---
 cacheflow/core/server.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/cacheflow/core/server.py b/cacheflow/core/server.py
index b0296513e9b1..01925fd5d222 100644
--- a/cacheflow/core/server.py
+++ b/cacheflow/core/server.py
@@ -8,16 +8,17 @@
 except ImportError:
     ray = None
 
-from cacheflow.master.scheduler import Scheduler
-from cacheflow.master.simple_frontend import SimpleFrontend
-from cacheflow.models import get_memory_analyzer
-from cacheflow.worker.controller import Controller, DeviceID
+from cacheflow.core.scheduler import Scheduler
+from cacheflow.frontend.simple_frontend import SimpleFrontend
+from cacheflow.model_executor import get_memory_analyzer
 from cacheflow.sequence import SequenceGroup
 from cacheflow.sampling_params import SamplingParams
 from cacheflow.utils import get_gpu_memory, get_cpu_memory
+from cacheflow.worker.controller import Controller, DeviceID
 
 
 class Server:
+
     def __init__(
         self,
         model: str,

From de47b95a35b1c44d375c7b97d034a7ebcc1092b3 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Sun, 7 May 2023 23:02:57 +0000
Subject: [PATCH 16/29] Minor

---
 cacheflow/model_executor/layers/attention.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cacheflow/model_executor/layers/attention.py b/cacheflow/model_executor/layers/attention.py
index 179fbd0b32c8..6d7b49edd931 100644
--- a/cacheflow/model_executor/layers/attention.py
+++ b/cacheflow/model_executor/layers/attention.py
@@ -7,7 +7,7 @@
 from cacheflow import attention_ops
 from cacheflow import cache_ops
 from cacheflow import pos_encoding_ops
-from cacheflow.models import InputMetadata
+from cacheflow.model_executor.input_metadata import InputMetadata
 
 
 class GPTCacheFlowAttention(nn.Module):

From 724dc90bf3c985c915cfd4230bc3c9c6ebb85fca Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Sun, 7 May 2023 23:09:41 +0000
Subject: [PATCH 17/29] Fix import errors on parallel utils

---
 cacheflow/model_executor/layers/sample.py   |  4 ++--
 cacheflow/model_executor/models/gpt2.py     | 17 ++++++++--------
 cacheflow/model_executor/models/gpt_neox.py | 17 ++++++++--------
 cacheflow/model_executor/models/llama.py    | 22 ++++++++++-----------
 cacheflow/model_executor/models/opt.py      | 17 ++++++++--------
 5 files changed, 37 insertions(+), 40 deletions(-)

diff --git a/cacheflow/model_executor/layers/sample.py b/cacheflow/model_executor/layers/sample.py
index dc488c814441..76444ba2b45d 100644
--- a/cacheflow/model_executor/layers/sample.py
+++ b/cacheflow/model_executor/layers/sample.py
@@ -3,10 +3,10 @@
 import torch
 import torch.nn as nn
 
-from cacheflow.models import InputMetadata
+from cacheflow.model_executor.input_metadata import InputMetadata
+from cacheflow.model_executor.parallel_utils.tensor_parallel import gather_from_tensor_model_parallel_region
 from cacheflow.sampling_params import SamplingParams
 from cacheflow.sequence import SequenceOutputs
-from cacheflow.parallel_utils.tensor_parallel import gather_from_tensor_model_parallel_region
 
 
 class Sampler(nn.Module):
diff --git a/cacheflow/model_executor/models/gpt2.py b/cacheflow/model_executor/models/gpt2.py
index 6ec2ffcfc473..39ecd08c18ec 100644
--- a/cacheflow/model_executor/models/gpt2.py
+++ b/cacheflow/model_executor/models/gpt2.py
@@ -5,16 +5,15 @@
 from torch import nn
 from transformers import GPT2Config
 
-from cacheflow.models import InputMetadata
-from cacheflow.models.attention import GPTCacheFlowAttention
-from cacheflow.models.sample import Sampler
-from cacheflow.models.utils import (hf_model_weights_iterator,
-                                    load_tensor_parallel_weights)
-from cacheflow.parallel_utils.parallel_state import (
+from cacheflow.model_executor.input_metadata import InputMetadata
+from cacheflow.model_executor.layers.attention import GPTCacheFlowAttention
+from cacheflow.model_executor.layers.sample import Sampler
+from cacheflow.model_executor.weight_utils import (hf_model_weights_iterator,
+                                                   load_tensor_parallel_weights)
+from cacheflow.model_executor.parallel_utils.parallel_state import (
     get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
-from cacheflow.parallel_utils.tensor_parallel import (VocabParallelEmbedding,
-                                                      ColumnParallelLinear,
-                                                      RowParallelLinear)
+from cacheflow.model_executor.parallel_utils.tensor_parallel import (
+    VocabParallelEmbedding, ColumnParallelLinear, RowParallelLinear)
 from cacheflow.sequence import SequenceOutputs
 
 KVCache = Tuple[torch.Tensor, torch.Tensor]
diff --git a/cacheflow/model_executor/models/gpt_neox.py b/cacheflow/model_executor/models/gpt_neox.py
index 15e37f228b13..e08620de7741 100644
--- a/cacheflow/model_executor/models/gpt_neox.py
+++ b/cacheflow/model_executor/models/gpt_neox.py
@@ -5,16 +5,15 @@
 from torch import nn
 from transformers import GPTNeoXConfig
 
-from cacheflow.models import InputMetadata
-from cacheflow.models.attention import GPTNeoXCacheFlowAttention
-from cacheflow.models.sample import Sampler
-from cacheflow.models.utils import (hf_model_weights_iterator,
-                                    load_tensor_parallel_weights)
-from cacheflow.parallel_utils.parallel_state import (
+from cacheflow.model_executor.input_metadata import InputMetadata
+from cacheflow.model_executor.layers.attention import GPTNeoXCacheFlowAttention
+from cacheflow.model_executor.layers.sample import Sampler
+from cacheflow.model_executor.weight_utils import (hf_model_weights_iterator,
+                                                   load_tensor_parallel_weights)
+from cacheflow.model_executor.parallel_utils.parallel_state import (
     get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
-from cacheflow.parallel_utils.tensor_parallel import (VocabParallelEmbedding,
-                                                      ColumnParallelLinear,
-                                                      RowParallelLinear)
+from cacheflow.model_executor.parallel_utils.tensor_parallel import (
+    VocabParallelEmbedding, ColumnParallelLinear, RowParallelLinear)
 from cacheflow.sequence import SequenceOutputs
 
 KVCache = Tuple[torch.Tensor, torch.Tensor]
diff --git a/cacheflow/model_executor/models/llama.py b/cacheflow/model_executor/models/llama.py
index e9e3e71b92f2..3b573d3b7a29 100644
--- a/cacheflow/model_executor/models/llama.py
+++ b/cacheflow/model_executor/models/llama.py
@@ -5,18 +5,18 @@
 from torch import nn
 from transformers import LlamaConfig
 
-from cacheflow.models import InputMetadata
-from cacheflow.models.activation import SiluAndMul
-from cacheflow.models.attention import GPTNeoXCacheFlowAttention
-from cacheflow.models.layernorm import RMSNorm
-from cacheflow.models.sample import Sampler
-from cacheflow.models.utils import (hf_model_weights_iterator,
-                                    load_tensor_parallel_weights)
-from cacheflow.parallel_utils.parallel_state import (
+from cacheflow.sequence import SequenceOutputs
+from cacheflow.model_executor.input_metadata import InputMetadata
+from cacheflow.model_executor.layers.activation import SiluAndMul
+from cacheflow.model_executor.layers.layernorm import RMSNorm
+from cacheflow.model_executor.layers.attention import GPTNeoXCacheFlowAttention
+from cacheflow.model_executor.layers.sample import Sampler
+from cacheflow.model_executor.weight_utils import (hf_model_weights_iterator,
+                                                   load_tensor_parallel_weights)
+from cacheflow.model_executor.parallel_utils.parallel_state import (
     get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
-from cacheflow.parallel_utils.tensor_parallel import (VocabParallelEmbedding,
-                                                      ColumnParallelLinear,
-                                                      RowParallelLinear)
+from cacheflow.model_executor.parallel_utils.tensor_parallel import (
+    VocabParallelEmbedding, ColumnParallelLinear, RowParallelLinear)
 from cacheflow.sequence import SequenceOutputs
 
 KVCache = Tuple[torch.Tensor, torch.Tensor]
diff --git a/cacheflow/model_executor/models/opt.py b/cacheflow/model_executor/models/opt.py
index 2f318df103f1..338f487202f1 100644
--- a/cacheflow/model_executor/models/opt.py
+++ b/cacheflow/model_executor/models/opt.py
@@ -5,16 +5,15 @@
 from torch import nn
 from transformers import OPTConfig
 
-from cacheflow.models import InputMetadata
-from cacheflow.models.attention import GPTCacheFlowAttention
-from cacheflow.models.sample import Sampler
-from cacheflow.models.utils import (hf_model_weights_iterator,
-                                    load_tensor_parallel_weights)
-from cacheflow.parallel_utils.parallel_state import (
+from cacheflow.model_executor.input_metadata import InputMetadata
+from cacheflow.model_executor.layers.attention import GPTCacheFlowAttention
+from cacheflow.model_executor.layers.sample import Sampler
+from cacheflow.model_executor.weight_utils import (hf_model_weights_iterator,
+                                                   load_tensor_parallel_weights)
+from cacheflow.model_executor.parallel_utils.parallel_state import (
     get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
-from cacheflow.parallel_utils.tensor_parallel import (VocabParallelEmbedding,
-                                                      ColumnParallelLinear,
-                                                      RowParallelLinear)
+from cacheflow.model_executor.parallel_utils.tensor_parallel import (
+    VocabParallelEmbedding, ColumnParallelLinear, RowParallelLinear)
 from cacheflow.sequence import SequenceOutputs
 
 KVCache = Tuple[torch.Tensor, torch.Tensor]

From fd2647f3d14ee1a8ce17e084867ef2db4780e522 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Sun, 7 May 2023 23:11:38 +0000
Subject: [PATCH 18/29] Add __init__.py

---
 cacheflow/model_executor/__init__.py | 11 +++++++++++
 1 file changed, 11 insertions(+)
 create mode 100644 cacheflow/model_executor/__init__.py

diff --git a/cacheflow/model_executor/__init__.py b/cacheflow/model_executor/__init__.py
new file mode 100644
index 000000000000..d839756a5599
--- /dev/null
+++ b/cacheflow/model_executor/__init__.py
@@ -0,0 +1,11 @@
+from cacheflow.model_executor.input_metadata import InputMetadata
+from cacheflow.model_executor.model_loader import get_model, get_memory_analyzer
+from cacheflow.model_executor.utils import set_random_seed
+
+
+__all__ = [
+    "InputMetadata",
+    "get_model",
+    "get_memory_analyzer",
+    "set_random_seed",
+]

From 7755a7a267f077d461d2eb04743188e998ec1843 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Sun, 7 May 2023 23:16:33 +0000
Subject: [PATCH 19/29] Fix parallel_utils

---
 cacheflow/model_executor/parallel_utils/__init__.py |  6 +++---
 .../parallel_utils/tensor_parallel/layers.py        |  2 +-
 .../parallel_utils/tensor_parallel/mappings.py      |  2 +-
 .../parallel_utils/tensor_parallel/random.py        |  4 ++--
 .../parallel_utils/tensor_parallel/utils.py         |  4 ++--
 cacheflow/model_executor/parallel_utils/utils.py    |  2 +-
 cacheflow/worker/worker.py                          | 13 +++++--------
 7 files changed, 15 insertions(+), 18 deletions(-)

diff --git a/cacheflow/model_executor/parallel_utils/__init__.py b/cacheflow/model_executor/parallel_utils/__init__.py
index 78a7c4463eac..11b6fa22fc8c 100644
--- a/cacheflow/model_executor/parallel_utils/__init__.py
+++ b/cacheflow/model_executor/parallel_utils/__init__.py
@@ -1,6 +1,6 @@
-import cacheflow.parallel_utils.parallel_state
-import cacheflow.parallel_utils.tensor_parallel
-import cacheflow.parallel_utils.utils
+import cacheflow.model_executor.parallel_utils.parallel_state
+import cacheflow.model_executor.parallel_utils.tensor_parallel
+import cacheflow.model_executor.parallel_utils.utils
 
 # Alias parallel_state as mpu, its legacy name
 mpu = parallel_state
diff --git a/cacheflow/model_executor/parallel_utils/tensor_parallel/layers.py b/cacheflow/model_executor/parallel_utils/tensor_parallel/layers.py
index 2cbe2b8a6d76..fef8ff743947 100644
--- a/cacheflow/model_executor/parallel_utils/tensor_parallel/layers.py
+++ b/cacheflow/model_executor/parallel_utils/tensor_parallel/layers.py
@@ -9,7 +9,7 @@
 import torch.nn.init as init
 from torch.nn.parameter import Parameter
 
-from cacheflow.parallel_utils.parallel_state import (
+from cacheflow.model_executor.parallel_utils.parallel_state import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
     get_all_reduce_launcher,
diff --git a/cacheflow/model_executor/parallel_utils/tensor_parallel/mappings.py b/cacheflow/model_executor/parallel_utils/tensor_parallel/mappings.py
index d9ca3b460d7b..4352514b2380 100644
--- a/cacheflow/model_executor/parallel_utils/tensor_parallel/mappings.py
+++ b/cacheflow/model_executor/parallel_utils/tensor_parallel/mappings.py
@@ -2,7 +2,7 @@
 
 import torch
 
-from cacheflow.parallel_utils.parallel_state import (
+from cacheflow.model_executor.parallel_utils.parallel_state import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
     get_tensor_model_parallel_group,
diff --git a/cacheflow/model_executor/parallel_utils/tensor_parallel/random.py b/cacheflow/model_executor/parallel_utils/tensor_parallel/random.py
index 6f36febdeb64..1374f13b401c 100644
--- a/cacheflow/model_executor/parallel_utils/tensor_parallel/random.py
+++ b/cacheflow/model_executor/parallel_utils/tensor_parallel/random.py
@@ -10,7 +10,7 @@
 from torch.cuda import _lazy_call, device as device_ctx_manager
 from torch.utils.checkpoint import detach_variable
 
-from cacheflow.parallel_utils.parallel_state import (
+from cacheflow.model_executor.parallel_utils.parallel_state import (
     get_data_parallel_rank,
     get_tensor_model_parallel_group,
     get_tensor_model_parallel_rank,
@@ -22,7 +22,7 @@
     gather_split_1d_tensor,
 )
 
-from cacheflow.parallel_utils.utils import safely_set_viewless_tensor_data
+from cacheflow.model_executor.parallel_utils.utils import safely_set_viewless_tensor_data
 
 # Default name for the model parallel rng tracker.
_MODEL_PARALLEL_RNG_TRACKER_NAME = 'model-parallel-rng' diff --git a/cacheflow/model_executor/parallel_utils/tensor_parallel/utils.py b/cacheflow/model_executor/parallel_utils/tensor_parallel/utils.py index dd15faa9a65b..e8e6c81b8ddc 100644 --- a/cacheflow/model_executor/parallel_utils/tensor_parallel/utils.py +++ b/cacheflow/model_executor/parallel_utils/tensor_parallel/utils.py @@ -3,8 +3,8 @@ import torch from typing import List, Sequence -from cacheflow.parallel_utils.utils import divide -from cacheflow.parallel_utils import parallel_state +from cacheflow.model_executor.parallel_utils.utils import divide +from cacheflow.model_executor.parallel_utils import parallel_state def split_tensor_along_last_dim( tensor: torch.Tensor, diff --git a/cacheflow/model_executor/parallel_utils/utils.py b/cacheflow/model_executor/parallel_utils/utils.py index 4b2b76130246..c81e0afacdab 100644 --- a/cacheflow/model_executor/parallel_utils/utils.py +++ b/cacheflow/model_executor/parallel_utils/utils.py @@ -4,7 +4,7 @@ import torch -from cacheflow.parallel_utils import parallel_state +from cacheflow.model_executor.parallel_utils import parallel_state def ensure_divisibility(numerator, denominator): diff --git a/cacheflow/worker/worker.py b/cacheflow/worker/worker.py index 59001b9d8fdc..9f81f207b00e 100644 --- a/cacheflow/worker/worker.py +++ b/cacheflow/worker/worker.py @@ -2,18 +2,15 @@ import torch -from cacheflow.models import get_model -from cacheflow.models import InputMetadata +from cacheflow.model_executor import get_model, InputMetadata, set_random_seed +from cacheflow.model_executor.parallel_utils.parallel_state import ( + initialize_model_parallel, + initialize_all_reduce_launcher, + get_tensor_model_parallel_world_size) from cacheflow.sampling_params import SamplingParams from cacheflow.sequence import SequenceGroupInputs from cacheflow.sequence import SequenceOutputs from cacheflow.worker.cache_engine import CacheEngine -from cacheflow.parallel_utils.parallel_state import ( - initialize_model_parallel, - initialize_all_reduce_launcher, - get_tensor_model_parallel_world_size) -from cacheflow.utils import set_random_seed - class Worker: From a95bd42d3fd4eb1151e3e844650122e3f41a399b Mon Sep 17 00:00:00 2001 From: woWoosuk Kwon Date: Sun, 7 May 2023 23:16:45 +0000 Subject: [PATCH 20/29] Minor --- cacheflow/model_executor/memory_analyzer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cacheflow/model_executor/memory_analyzer.py b/cacheflow/model_executor/memory_analyzer.py index 0adc2e79a624..703257d39a76 100644 --- a/cacheflow/model_executor/memory_analyzer.py +++ b/cacheflow/model_executor/memory_analyzer.py @@ -1,7 +1,7 @@ import torch from transformers import AutoConfig -from cacheflow.models.utils import get_dtype_size +from cacheflow.model_executor.utils import get_dtype_size _GiB = 1 << 30 From 4dc8e9ee4df0bf67363a1f5abded36b4e3d097a4 Mon Sep 17 00:00:00 2001 From: woWoosuk Kwon Date: Sun, 7 May 2023 23:19:49 +0000 Subject: [PATCH 21/29] Fix weight loading --- cacheflow/model_executor/models/gpt2.py | 3 ++- cacheflow/model_executor/models/gpt_neox.py | 3 ++- cacheflow/model_executor/models/llama.py | 3 ++- cacheflow/model_executor/models/opt.py | 3 ++- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/cacheflow/model_executor/models/gpt2.py b/cacheflow/model_executor/models/gpt2.py index 39ecd08c18ec..f5d6df6f58f3 100644 --- a/cacheflow/model_executor/models/gpt2.py +++ b/cacheflow/model_executor/models/gpt2.py @@ -257,4 +257,5 @@ def load_weights(self, 
From 4dc8e9ee4df0bf67363a1f5abded36b4e3d097a4 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Sun, 7 May 2023 23:19:49 +0000
Subject: [PATCH 21/29] Fix weight loading

---
 cacheflow/model_executor/models/gpt2.py     | 3 ++-
 cacheflow/model_executor/models/gpt_neox.py | 3 ++-
 cacheflow/model_executor/models/llama.py    | 3 ++-
 cacheflow/model_executor/models/opt.py      | 3 ++-
 4 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/cacheflow/model_executor/models/gpt2.py b/cacheflow/model_executor/models/gpt2.py
index 39ecd08c18ec..f5d6df6f58f3 100644
--- a/cacheflow/model_executor/models/gpt2.py
+++ b/cacheflow/model_executor/models/gpt2.py
@@ -257,4 +257,5 @@ def load_weights(self, model_name_or_path: str,
                 raise ValueError(f"Unexpected parameter name {name}")
             load_tensor_parallel_weights(param, loaded_weight, name,
                                          self._column_parallel_weights,
-                                         self._row_parallel_weights)
+                                         self._row_parallel_weights,
+                                         tensor_model_parallel_rank)
diff --git a/cacheflow/model_executor/models/gpt_neox.py b/cacheflow/model_executor/models/gpt_neox.py
index e08620de7741..6c4cbbaf8c76 100644
--- a/cacheflow/model_executor/models/gpt_neox.py
+++ b/cacheflow/model_executor/models/gpt_neox.py
@@ -227,4 +227,5 @@ def load_weights(self, model_name_or_path: str,
                 raise ValueError(f"Unexpected weight name: {name}")
             load_tensor_parallel_weights(param, loaded_weight, name,
                                          self._column_parallel_weights,
-                                         self._row_parallel_weights)
+                                         self._row_parallel_weights,
+                                         tensor_model_parallel_rank)
diff --git a/cacheflow/model_executor/models/llama.py b/cacheflow/model_executor/models/llama.py
index 3b573d3b7a29..3babc958c2f9 100644
--- a/cacheflow/model_executor/models/llama.py
+++ b/cacheflow/model_executor/models/llama.py
@@ -263,4 +263,5 @@ def load_weights(self, model_name_or_path: str,
             param = state_dict[name]
             load_tensor_parallel_weights(param, loaded_weight, name,
                                          self._column_parallel_weights,
-                                         self._row_parallel_weights)
+                                         self._row_parallel_weights,
+                                         tensor_model_parallel_rank)
diff --git a/cacheflow/model_executor/models/opt.py b/cacheflow/model_executor/models/opt.py
index 338f487202f1..2ca2802c2877 100644
--- a/cacheflow/model_executor/models/opt.py
+++ b/cacheflow/model_executor/models/opt.py
@@ -287,4 +287,5 @@ def load_weights(self, model_name_or_path: str,
             param = state_dict[name]
             load_tensor_parallel_weights(param, loaded_weight, name,
                                          self._column_parallel_weights,
-                                         self._row_parallel_weights)
+                                         self._row_parallel_weights,
+                                         tensor_model_parallel_rank)
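The extra tensor_model_parallel_rank argument is what lets each GPU slice its own shard out of the full checkpoint tensor. A simplified sketch of the underlying idea (the real function dispatches on the weight-name lists rather than a boolean flag, and shard_weight is a hypothetical name):

    import torch

    def shard_weight(param: torch.Tensor, loaded_weight: torch.Tensor,
                     column_parallel: bool, rank: int) -> None:
        # Column-parallel layers split the output dim (dim 0 of the weight);
        # row-parallel layers split the input dim (dim 1).
        dim = 0 if column_parallel else 1
        shard_size = param.shape[dim]
        shard = loaded_weight.narrow(dim, rank * shard_size, shard_size)
        assert param.shape == shard.shape
        param.data.copy_(shard)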
From e6ffa8076db9158e8f10ec51eb51b8a8e5d40ad2 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Sun, 7 May 2023 23:20:02 +0000
Subject: [PATCH 22/29] Annotate types

---
 cacheflow/model_executor/weight_utils.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/cacheflow/model_executor/weight_utils.py b/cacheflow/model_executor/weight_utils.py
index 7814de643154..7b63948a17c1 100644
--- a/cacheflow/model_executor/weight_utils.py
+++ b/cacheflow/model_executor/weight_utils.py
@@ -2,7 +2,7 @@
 import glob
 import json
 import os
-from typing import Optional
+from typing import Generator, List, Optional
 
 from huggingface_hub import snapshot_download
 import numpy as np
@@ -19,7 +19,7 @@ def hf_model_weights_iterator(
     model_name_or_path: str,
     cache_dir: Optional[str] = None,
     use_np_cache: bool = False,
-):
+) -> Generator[str, torch.Tensor]:
     # Prepare file lock directory to prevent multiple processes from
     # downloading the same model weights at the same time.
     lock_dir = cache_dir if cache_dir is not None else "/tmp"
@@ -73,13 +73,12 @@ def hf_model_weights_iterator(
         yield name, param
 
 
-# TODO(woosuk): Annotate the types.
 def load_tensor_parallel_weights(
     param: torch.Tensor,
-    loaded_weight,
-    param_name,
-    column_parallel_weight_names,
-    row_parallel_weight_names,
+    loaded_weight: torch.Tensor,
+    param_name: str,
+    column_parallel_weight_names: List[str],
+    row_parallel_weight_names: List[str],
     tensor_model_parallel_rank: int,
 ) -> None:
     for p in column_parallel_weight_names:

From da591aaf28154d51440931089489c041cbec60f1 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Sun, 7 May 2023 23:24:28 +0000
Subject: [PATCH 23/29] Fix type

---
 cacheflow/model_executor/weight_utils.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/cacheflow/model_executor/weight_utils.py b/cacheflow/model_executor/weight_utils.py
index 7b63948a17c1..796bfffbb70f 100644
--- a/cacheflow/model_executor/weight_utils.py
+++ b/cacheflow/model_executor/weight_utils.py
@@ -2,7 +2,7 @@
 import glob
 import json
 import os
-from typing import Generator, List, Optional
+from typing import Iterator, List, Optional, Tuple
 
 from huggingface_hub import snapshot_download
 import numpy as np
@@ -11,6 +11,7 @@
 
 
 class Disabledtqdm(tqdm):
+
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs, disable=True)
 
@@ -19,7 +20,7 @@ def hf_model_weights_iterator(
     model_name_or_path: str,
     cache_dir: Optional[str] = None,
     use_np_cache: bool = False,
-) -> Generator[str, torch.Tensor]:
+) -> Iterator[Tuple[str, torch.Tensor]]:
     # Prepare file lock directory to prevent multiple processes from
     # downloading the same model weights at the same time.
     lock_dir = cache_dir if cache_dir is not None else "/tmp"
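The follow-up fix is worth spelling out: Generator[str, torch.Tensor] is not a valid parameterization, since typing.Generator takes exactly three arguments (yield, send, and return types), and the function yields name/tensor pairs, so Iterator[Tuple[str, torch.Tensor]] is the annotation that matches. A toy illustration of the corrected signature:

    from typing import Iterator, Tuple

    import torch

    def toy_weights_iterator() -> Iterator[Tuple[str, torch.Tensor]]:
        # Mirrors the corrected annotation: each item is a (name, tensor) pair.
        yield "embed_tokens.weight", torch.zeros(4, 4)
        yield "lm_head.weight", torch.zeros(4, 4)

    for name, tensor in toy_weights_iterator():
        print(name, tuple(tensor.shape))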
From 0ae70da2a16cc4f8da649fd47ab48ec1f04ab657 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Sun, 7 May 2023 23:29:03 +0000
Subject: [PATCH 24/29] sample -> sampler

---
 cacheflow/model_executor/layers/{sample.py => sampler.py} | 0
 cacheflow/model_executor/models/gpt2.py                   | 2 +-
 cacheflow/model_executor/models/gpt_neox.py               | 2 +-
 cacheflow/model_executor/models/llama.py                  | 2 +-
 cacheflow/model_executor/models/opt.py                    | 2 +-
 5 files changed, 4 insertions(+), 4 deletions(-)
 rename cacheflow/model_executor/layers/{sample.py => sampler.py} (100%)

diff --git a/cacheflow/model_executor/layers/sample.py b/cacheflow/model_executor/layers/sampler.py
similarity index 100%
rename from cacheflow/model_executor/layers/sample.py
rename to cacheflow/model_executor/layers/sampler.py
diff --git a/cacheflow/model_executor/models/gpt2.py b/cacheflow/model_executor/models/gpt2.py
index f5d6df6f58f3..11e977221f35 100644
--- a/cacheflow/model_executor/models/gpt2.py
+++ b/cacheflow/model_executor/models/gpt2.py
@@ -7,7 +7,7 @@
 
 from cacheflow.model_executor.input_metadata import InputMetadata
 from cacheflow.model_executor.layers.attention import GPTCacheFlowAttention
-from cacheflow.model_executor.layers.sample import Sampler
+from cacheflow.model_executor.layers.sampler import Sampler
 from cacheflow.model_executor.weight_utils import (hf_model_weights_iterator,
                                                    load_tensor_parallel_weights)
 from cacheflow.model_executor.parallel_utils.parallel_state import (
diff --git a/cacheflow/model_executor/models/gpt_neox.py b/cacheflow/model_executor/models/gpt_neox.py
index 6c4cbbaf8c76..c50aa68e9b1d 100644
--- a/cacheflow/model_executor/models/gpt_neox.py
+++ b/cacheflow/model_executor/models/gpt_neox.py
@@ -7,7 +7,7 @@
 
 from cacheflow.model_executor.input_metadata import InputMetadata
 from cacheflow.model_executor.layers.attention import GPTNeoXCacheFlowAttention
-from cacheflow.model_executor.layers.sample import Sampler
+from cacheflow.model_executor.layers.sampler import Sampler
 from cacheflow.model_executor.weight_utils import (hf_model_weights_iterator,
                                                    load_tensor_parallel_weights)
 from cacheflow.model_executor.parallel_utils.parallel_state import (
diff --git a/cacheflow/model_executor/models/llama.py b/cacheflow/model_executor/models/llama.py
index 3babc958c2f9..b278d9894e3d 100644
--- a/cacheflow/model_executor/models/llama.py
+++ b/cacheflow/model_executor/models/llama.py
@@ -10,7 +10,7 @@
 from cacheflow.model_executor.layers.activation import SiluAndMul
 from cacheflow.model_executor.layers.layernorm import RMSNorm
 from cacheflow.model_executor.layers.attention import GPTNeoXCacheFlowAttention
-from cacheflow.model_executor.layers.sample import Sampler
+from cacheflow.model_executor.layers.sampler import Sampler
 from cacheflow.model_executor.weight_utils import (hf_model_weights_iterator,
                                                    load_tensor_parallel_weights)
 from cacheflow.model_executor.parallel_utils.parallel_state import (
diff --git a/cacheflow/model_executor/models/opt.py b/cacheflow/model_executor/models/opt.py
index 2ca2802c2877..fea604088adf 100644
--- a/cacheflow/model_executor/models/opt.py
+++ b/cacheflow/model_executor/models/opt.py
@@ -7,7 +7,7 @@
 
 from cacheflow.model_executor.input_metadata import InputMetadata
 from cacheflow.model_executor.layers.attention import GPTCacheFlowAttention
-from cacheflow.model_executor.layers.sample import Sampler
+from cacheflow.model_executor.layers.sampler import Sampler
 from cacheflow.model_executor.weight_utils import (hf_model_weights_iterator,
                                                    load_tensor_parallel_weights)
 from cacheflow.model_executor.parallel_utils.parallel_state import (

From f1d270034eb14dd49a6b5efc43d2b051a4be59b9 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Sun, 7 May 2023 23:29:41 +0000
Subject: [PATCH 25/29] Minor

---
 cacheflow/model_executor/layers/sampler.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/cacheflow/model_executor/layers/sampler.py b/cacheflow/model_executor/layers/sampler.py
index 76444ba2b45d..b0bc83421d70 100644
--- a/cacheflow/model_executor/layers/sampler.py
+++ b/cacheflow/model_executor/layers/sampler.py
@@ -4,7 +4,8 @@
 import torch.nn as nn
 
 from cacheflow.model_executor.input_metadata import InputMetadata
-from cacheflow.model_executor.parallel_utils.tensor_parallel import gather_from_tensor_model_parallel_region
+from cacheflow.model_executor.parallel_utils.tensor_parallel import (
+    gather_from_tensor_model_parallel_region)
 from cacheflow.sampling_params import SamplingParams
 from cacheflow.sequence import SequenceOutputs
 
@@ -27,7 +28,7 @@ def forward(
         # Get the logits for the next tokens.
         logits = torch.matmul(hidden_states, embedding.t())
         logits = gather_from_tensor_model_parallel_region(logits)
-        # Remove paddings in vocab.
+        # Remove paddings in vocab (if any).
         logits = logits[:, :self.vocab_size]
 
         # Apply temperature scaling.
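The hunk above shows the sampler's logit path: project hidden states onto the (possibly padded) embedding, gather across tensor-parallel ranks, then truncate to the true vocabulary. A toy single-GPU rendition of those steps, with made-up shapes:

    import torch

    hidden_states = torch.randn(2, 8)    # two tokens, hidden size 8
    embedding = torch.randn(12, 8)       # vocab padded from 10 to 12 rows
    vocab_size = 10

    logits = torch.matmul(hidden_states, embedding.t())  # [2, 12]
    # (gather_from_tensor_model_parallel_region would run here on multi-GPU)
    logits = logits[:, :vocab_size]      # drop the padding columns
    probs = torch.softmax(logits / 0.8, dim=-1)          # temperature 0.8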
From 38429875ae7ff32bbad9b40ea35522f390126c92 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Mon, 8 May 2023 01:05:35 +0000
Subject: [PATCH 26/29] Do not use fast llama tokenizer

---
 cacheflow/frontend/fastapi_frontend.py |  4 ++--
 cacheflow/frontend/simple_frontend.py  |  3 ++-
 cacheflow/frontend/utils.py            | 19 +++++++++++++++++++
 3 files changed, 23 insertions(+), 3 deletions(-)
 create mode 100644 cacheflow/frontend/utils.py

diff --git a/cacheflow/frontend/fastapi_frontend.py b/cacheflow/frontend/fastapi_frontend.py
index e5c116d7b76d..59e66a4ce5fe 100644
--- a/cacheflow/frontend/fastapi_frontend.py
+++ b/cacheflow/frontend/fastapi_frontend.py
@@ -7,12 +7,12 @@
 from fastapi import FastAPI, Request
 from fastapi.responses import StreamingResponse
 import ray
-from transformers import AutoTokenizer
 import uvicorn
 
 from cacheflow.core.server import (Server, add_server_arguments,
                                    process_server_arguments,
                                    initialize_cluster)
+from cacheflow.frontend.utils import get_tokenizer
 from cacheflow.sampling_params import SamplingParams
 from cacheflow.sequence import Sequence, SequenceGroup
 from cacheflow.utils import Counter, get_gpu_memory, get_cpu_memory
@@ -44,7 +44,7 @@ def __init__(
     ):
         self.block_size = block_size
 
-        self.tokenizer = AutoTokenizer.from_pretrained(model)
+        self.tokenizer = get_tokenizer(model)
         self.seq_group_counter = Counter()
         self.seq_counter = Counter()
         if server_use_ray:
diff --git a/cacheflow/frontend/simple_frontend.py b/cacheflow/frontend/simple_frontend.py
index fb239b4a5cbf..db3e65da390d 100644
--- a/cacheflow/frontend/simple_frontend.py
+++ b/cacheflow/frontend/simple_frontend.py
@@ -3,6 +3,7 @@
 
 from transformers import AutoTokenizer
 
+from cacheflow.frontend.utils import get_tokenizer
 from cacheflow.sampling_params import SamplingParams
 from cacheflow.sequence import Sequence, SequenceGroup
 from cacheflow.utils import Counter
@@ -17,7 +18,7 @@ def __init__(
     ) -> None:
         self.block_size = block_size
 
-        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+        self.tokenizer = get_tokenizer(model_name)
         self.seq_group_counter = Counter()
         self.seq_counter = Counter()
         self.inputs: List[Tuple[SequenceGroup, SamplingParams]] = []
diff --git a/cacheflow/frontend/utils.py b/cacheflow/frontend/utils.py
new file mode 100644
index 000000000000..83b90214ab88
--- /dev/null
+++ b/cacheflow/frontend/utils.py
@@ -0,0 +1,19 @@
+from typing import Union
+
+from transformers import (AutoConfig, AutoTokenizer, PreTrainedTokenizer,
+                          PreTrainedTokenizerFast)
+
+
+def get_tokenizer(
+    model_name: str,
+    *args,
+    **kwargs,
+) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
+    config = AutoConfig.from_pretrained(model_name)
+    if config.model_type == "llama":
+        # LLaMA fast tokenizer has a bug related to protobuf.
+        if "use_fast" in kwargs:
+            kwargs.pop("use_fast")
+        return AutoTokenizer.from_pretrained(
+            model_name, use_fast=False, *args, **kwargs)
+    return AutoTokenizer.from_pretrained(model_name, *args, **kwargs)
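Call sites now go through the wrapper instead of AutoTokenizer directly. A usage sketch (the model name is only illustrative):

    from cacheflow.frontend.utils import get_tokenizer

    # For llama-type configs this transparently falls back to the slow
    # (SentencePiece-based) tokenizer; for everything else it behaves
    # exactly like AutoTokenizer.from_pretrained.
    tokenizer = get_tokenizer("facebook/opt-125m")
    token_ids = tokenizer("Hello, world!").input_ids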
+ if "use_fast" in kwargs: + kwargs.pop("use_fast") + return AutoTokenizer.from_pretrained( + model_name, use_fast=False, *args, **kwargs) + return AutoTokenizer.from_pretrained(model_name, *args, **kwargs) From be3f6c7196dd8809ee27465734de4a7a3f3db80f Mon Sep 17 00:00:00 2001 From: woWoosuk Kwon Date: Tue, 9 May 2023 22:55:32 +0000 Subject: [PATCH 27/29] Fix merge errors --- cacheflow/core/server.py | 21 ++++- cacheflow/logger.py | 51 +++++++++++ cacheflow/model_executor/memory_analyzer.py | 18 ++-- cacheflow/model_executor/model_loader.py | 96 ++++++++++++--------- simple_server.py | 1 - 5 files changed, 134 insertions(+), 53 deletions(-) create mode 100644 cacheflow/logger.py diff --git a/cacheflow/core/server.py b/cacheflow/core/server.py index 01925fd5d222..9eb96efd3506 100644 --- a/cacheflow/core/server.py +++ b/cacheflow/core/server.py @@ -10,13 +10,17 @@ from cacheflow.core.scheduler import Scheduler from cacheflow.frontend.simple_frontend import SimpleFrontend +from cacheflow.logger import init_logger from cacheflow.model_executor import get_memory_analyzer -from cacheflow.sequence import SequenceGroup from cacheflow.sampling_params import SamplingParams +from cacheflow.sequence import SequenceGroup from cacheflow.utils import get_gpu_memory, get_cpu_memory from cacheflow.worker.controller import Controller, DeviceID +logger = init_logger(__name__) + + class Server: def __init__( @@ -43,6 +47,17 @@ def __init__( collect_stats: bool = False, do_memory_analysis: bool = False, ): + logger.info( + "Initializing a server with config: " + f"model={model!r}, " + f"dtype={dtype}, " + f"use_dummy_weights={use_dummy_weights}, " + f"cache_dir={cache_dir}, " + f"use_np_cache={use_np_cache}, " + f"tensor_parallel_size={tensor_parallel_size}, " + f"block_size={block_size}, " + f"seed={seed})" + ) self.num_nodes = num_nodes self.num_devices_per_node = num_devices_per_node self.world_size = pipeline_parallel_size * tensor_parallel_size @@ -62,9 +77,7 @@ def __init__( self.num_gpu_blocks = self.memory_analyzer.get_max_num_gpu_blocks( max_num_batched_tokens=max_num_batched_tokens) self.num_cpu_blocks = self.memory_analyzer.get_max_num_cpu_blocks( - swap_space=swap_space) - print(f'# GPU blocks: {self.num_gpu_blocks}, ' - f'# CPU blocks: {self.num_cpu_blocks}') + swap_space_gib=swap_space) # Create a controller for each pipeline stage. 
diff --git a/cacheflow/logger.py b/cacheflow/logger.py
new file mode 100644
index 000000000000..30d35b34ef55
--- /dev/null
+++ b/cacheflow/logger.py
@@ -0,0 +1,51 @@
+# Adapted from https://github.com/skypilot-org/skypilot/blob/86dc0f6283a335e4aa37b3c10716f90999f48ab6/sky/sky_logging.py
+
+import logging
+import sys
+
+
+_FORMAT = "%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s"
+_DATE_FORMAT = "%m-%d %H:%M:%S"
+
+
+class NewLineFormatter(logging.Formatter):
+    """Adds logging prefix to newlines to align multi-line messages."""
+
+    def __init__(self, fmt, datefmt=None):
+        logging.Formatter.__init__(self, fmt, datefmt)
+
+    def format(self, record):
+        msg = logging.Formatter.format(self, record)
+        if record.message != "":
+            parts = msg.split(record.message)
+            msg = msg.replace("\n", "\r\n" + parts[0])
+        return msg
+
+
+_root_logger = logging.getLogger("cacheflow")
+_default_handler = None
+
+
+def _setup_logger():
+    _root_logger.setLevel(logging.DEBUG)
+    global _default_handler
+    if _default_handler is None:
+        _default_handler = logging.StreamHandler(sys.stdout)
+        _default_handler.flush = sys.stdout.flush  # type: ignore
+        _default_handler.setLevel(logging.INFO)
+        _root_logger.addHandler(_default_handler)
+    fmt = NewLineFormatter(_FORMAT, datefmt=_DATE_FORMAT)
+    _default_handler.setFormatter(fmt)
+    # Setting this will avoid the message
+    # being propagated to the parent logger.
+    _root_logger.propagate = False
+
+
+# The logger is initialized when the module is imported.
+# This is thread-safe as the module is only imported once,
+# guaranteed by the Python GIL.
+_setup_logger()
+
+
+def init_logger(name: str):
+    return logging.getLogger(name)
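What NewLineFormatter buys: each physical line of a multi-line message is re-prefixed with the log header, so output stays aligned. Illustratively, from a module under the cacheflow package (the timestamps and file name below are made up):

    from cacheflow.logger import init_logger

    logger = init_logger(__name__)
    logger.info("first line\nsecond line")
    # INFO 05-09 23:02:24 example.py:4] first line
    # INFO 05-09 23:02:24 example.py:4] second line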
diff --git a/cacheflow/model_executor/memory_analyzer.py b/cacheflow/model_executor/memory_analyzer.py
index 703257d39a76..bc85d6586ae0 100644
--- a/cacheflow/model_executor/memory_analyzer.py
+++ b/cacheflow/model_executor/memory_analyzer.py
@@ -1,8 +1,12 @@
 import torch
 from transformers import AutoConfig
 
+from cacheflow.logger import init_logger
 from cacheflow.model_executor.utils import get_dtype_size
 
+
+logger = init_logger(__name__)
+
 _GiB = 1 << 30
 
 
@@ -23,20 +27,20 @@ def get_cache_block_size(self) -> int:
 
     def get_max_num_cpu_blocks(
         self,
-        swap_space: int,
+        swap_space_gib: int,
     ) -> int:
-        swap_space = swap_space * _GiB
+        swap_space = swap_space_gib * _GiB
        cpu_memory = self.cpu_memory
         if swap_space > 0.8 * cpu_memory:
-            raise ValueError(f'The swap space ({swap_space / _GiB:.2f} GiB) '
+            raise ValueError(f'The swap space ({swap_space_gib:.2f} GiB) '
                              'takes more than 80% of the available memory '
                              f'({cpu_memory / _GiB:.2f} GiB).'
                              'Please check the swap space size.')
         if swap_space > 0.5 * cpu_memory:
-            print(f'WARNING: The swap space ({swap_space / _GiB:.2f} GiB) '
-                  'takes more than 50% of the available memory '
-                  f'({cpu_memory / _GiB:.2f} GiB).'
-                  'This may slow the system performance.')
+            logger.info(f'WARNING: The swap space ({swap_space_gib:.2f} GiB) '
+                        'takes more than 50% of the available memory '
+                        f'({cpu_memory / _GiB:.2f} GiB).'
+                        'This may slow the system performance.')
         max_num_blocks = swap_space // self.get_cache_block_size()
         return max_num_blocks
diff --git a/cacheflow/model_executor/model_loader.py b/cacheflow/model_executor/model_loader.py
index 1af2c606c797..5598309e8a4c 100644
--- a/cacheflow/model_executor/model_loader.py
+++ b/cacheflow/model_executor/model_loader.py
@@ -14,32 +14,51 @@
 from cacheflow.model_executor.weight_utils import initialize_dummy_weights
 
 
-_MODELS = {
-    'gpt2': GPT2LMHeadModel,
-    'llama': LlamaForCausalLM,
-    'opt': OPTForCausalLM,
-    'stablelm': GPTNeoXForCausalLM,
-    'pythia': GPTNeoXForCausalLM,
-    'dolly-v2': GPTNeoXForCausalLM,
+# TODO(woosuk): Lazy-load the model classes.
+_MODEL_REGISTRY = {
+    "GPT2LMHeadModel": GPT2LMHeadModel,
+    "GPTNeoXForCausalLM": GPTNeoXForCausalLM,
+    "LlamaForCausalLM": LlamaForCausalLM,
+    "OPTForCausalLM": OPTForCausalLM,
 }
 
 _MEMORY_ANALYZERS = {
-    'gpt2': GPT2MemoryAnalyzer,
-    'llama': LlamaMemoryAnalyzer,
-    'opt': OPTMemoryAnalyzer,
-    'stablelm': GPTNeoXMemoryAnalyzer,
-    'pythia': GPTNeoXMemoryAnalyzer,
-    'dolly-v2': GPTNeoXMemoryAnalyzer,
+    "GPT2LMHeadModel": GPT2MemoryAnalyzer,
+    "GPTNeoXForCausalLM": GPTNeoXMemoryAnalyzer,
+    "LlamaForCausalLM": LlamaMemoryAnalyzer,
+    "OPTForCausalLM": OPTMemoryAnalyzer,
 }
 
 
+def _get_model_architecture(config: PretrainedConfig) -> nn.Module:
+    architectures = getattr(config, "architectures", [])
+    for arch in architectures:
+        if arch in _MODEL_REGISTRY:
+            return _MODEL_REGISTRY[arch]
+    raise ValueError(
+        f"Model architectures {architectures} are not supported for now. "
+        f"Supported architectures: {list(_MODEL_REGISTRY.keys())}"
+    )
+
+
+def _get_memory_analyzer(config: PretrainedConfig) -> CacheFlowMemoryAnalyzer:
+    architectures = getattr(config, "architectures", [])
+    for arch in architectures:
+        if arch in _MEMORY_ANALYZERS:
+            return _MEMORY_ANALYZERS[arch]
+    raise ValueError(
+        f"Model architectures {architectures} are not supported for now. "
+        f"Supported architectures: {list(_MEMORY_ANALYZERS.keys())}"
+    )
+
+
 def _get_dtype(config: PretrainedConfig, dtype: str) -> torch.dtype:
-    # NOTE: getattr(config, 'torch_dtype', torch.float32) is not correct
+    # NOTE: getattr(config, "torch_dtype", torch.float32) is not correct
     # because config.torch_dtype can be None.
-    config_dtype = getattr(config, 'torch_dtype', None)
+    config_dtype = getattr(config, "torch_dtype", None)
     if config_dtype is None:
         config_dtype = torch.float32
-    if dtype == 'default':
+    if dtype == "default":
         if config_dtype == torch.float32:
             # Following the common practice, we use float16 for float32 models.
             torch_dtype = torch.float16
@@ -51,7 +70,7 @@ def _get_dtype(config: PretrainedConfig, dtype: str) -> torch.dtype:
         # TODO(woosuk): Allow using float16 for bfloat16 models and
         # vice versa. Print a warning message and continue.
         raise ValueError(
-            f'Cannot use {torch_dtype} for {config_dtype} model.')
+            f"Cannot use {torch_dtype} for {config_dtype} model.")
     return torch_dtype
 
 
@@ -65,24 +84,21 @@ def get_model(
     config = AutoConfig.from_pretrained(model_name)
     torch_dtype = _get_dtype(config, dtype)
     torch.set_default_dtype(torch_dtype)
-    for model_class_name, model_class in _MODELS.items():
-        if model_class_name in model_name:
-            if use_dummy_weights:
-                # Create a model instance.
-                # The weights will be initialized as empty tensors.
-                model = model_class(config)
-                model = model.cuda()
-                # NOTE(woosuk): For precise performance evaluation, we assign
-                # random values to the weights.
-                initialize_dummy_weights(model)
-            else:
-                # Create a model instance.
-                model = model_class(config)
-                # Load the weights from the cached or downloaded files.
-                model.load_weights(model_name, cache_dir, use_np_cache)
-                model = model.cuda()
-            return model.eval(), torch_dtype
-    raise ValueError(f'Unsupported model name: {model_name}')
+    model_class = _get_model_architecture(config)
+
+    # Create a model instance.
+    # The weights will be initialized as empty tensors.
+    model = model_class(config)
+    if use_dummy_weights:
+        model = model.cuda()
+        # NOTE(woosuk): For accurate performance evaluation, we assign
+        # random values to the weights.
+        initialize_dummy_weights(model)
+    else:
+        # Load the weights from the cached or downloaded files.
+        model.load_weights(model_name, cache_dir, use_np_cache)
+        model = model.cuda()
+    return model.eval(), torch_dtype
 
 
 def get_memory_analyzer(
@@ -95,9 +111,7 @@ def get_memory_analyzer(
 ) -> CacheFlowMemoryAnalyzer:
     config = AutoConfig.from_pretrained(model_name)
     torch_dtype = _get_dtype(config, dtype)
-    for model_class, memory_analyzer in _MEMORY_ANALYZERS.items():
-        if model_class in model_name:
-            return memory_analyzer(
-                model_name, block_size, torch_dtype, gpu_memory, cpu_memory,
-                tensor_parallel_size)
-    raise ValueError(f'Unsupported model name: {model_name}')
+    memory_analyzer = _get_memory_analyzer(config)
+    return memory_analyzer(
+        model_name, block_size, torch_dtype, gpu_memory, cpu_memory,
+        tensor_parallel_size)
diff --git a/simple_server.py b/simple_server.py
index 66e8fbd8a80d..e0d17d90bf48 100644
--- a/simple_server.py
+++ b/simple_server.py
@@ -1,5 +1,4 @@
 import argparse
-from typing import List
 
 from cacheflow.core.server import (
     add_server_arguments, process_server_arguments,
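The loader now keys on the architectures field of the Hugging Face config rather than substring-matching the model name, so a checkpoint resolves correctly regardless of how it is named. A quick check (the model name is only illustrative):

    from transformers import AutoConfig

    config = AutoConfig.from_pretrained("facebook/opt-125m")
    print(config.architectures)
    # ['OPTForCausalLM'], which hits _MODEL_REGISTRY and _MEMORY_ANALYZERS
    # directly, with a clear error listing the supported architectures
    # when there is no match.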
From 729e14b20d03c3ec58d35a90429f7feb320f9c3c Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Tue, 9 May 2023 22:56:45 +0000
Subject: [PATCH 28/29] Add a tracking issue in comment

---
 cacheflow/frontend/utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cacheflow/frontend/utils.py b/cacheflow/frontend/utils.py
index 83b90214ab88..a503c6593577 100644
--- a/cacheflow/frontend/utils.py
+++ b/cacheflow/frontend/utils.py
@@ -12,6 +12,7 @@ def get_tokenizer(
     config = AutoConfig.from_pretrained(model_name)
     if config.model_type == "llama":
         # LLaMA fast tokenizer has a bug related to protobuf.
+        # See https://github.com/WoosukKwon/cacheflow/issues/80#issue-1698550554
         if "use_fast" in kwargs:
             kwargs.pop("use_fast")
         return AutoTokenizer.from_pretrained(
             model_name, use_fast=False, *args, **kwargs)
From 9fe9fbfb5599a2035d2a30a48fce490e42a0f519 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Tue, 9 May 2023 23:02:24 +0000
Subject: [PATCH 29/29] Minor refactoring

---
 cacheflow/frontend/utils.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/cacheflow/frontend/utils.py b/cacheflow/frontend/utils.py
index a503c6593577..efb50d9aced6 100644
--- a/cacheflow/frontend/utils.py
+++ b/cacheflow/frontend/utils.py
@@ -4,17 +4,19 @@
                           PreTrainedTokenizerFast)
 
 
+_MODEL_TYPES_WITH_SLOW_TOKENIZER = [
+    # LLaMA fast tokenizer has a bug related to protobuf.
+    # See https://github.com/WoosukKwon/cacheflow/issues/80#issue-1698550554
+    "llama",
+]
+
+
 def get_tokenizer(
     model_name: str,
     *args,
     **kwargs,
 ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
     config = AutoConfig.from_pretrained(model_name)
-    if config.model_type == "llama":
-        # LLaMA fast tokenizer has a bug related to protobuf.
-        # See https://github.com/WoosukKwon/cacheflow/issues/80#issue-1698550554
-        if "use_fast" in kwargs:
-            kwargs.pop("use_fast")
-        return AutoTokenizer.from_pretrained(
-            model_name, use_fast=False, *args, **kwargs)
+    if config.model_type in _MODEL_TYPES_WITH_SLOW_TOKENIZER:
+        kwargs["use_fast"] = False
     return AutoTokenizer.from_pretrained(model_name, *args, **kwargs)
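One behavioral nuance of the refactor: a caller-supplied use_fast=True is now silently overridden for the listed model types instead of being popped and re-passed. For example (the model name is only illustrative; any config with model_type "llama" qualifies):

    from cacheflow.frontend.utils import get_tokenizer

    tokenizer = get_tokenizer("huggyllama/llama-7b", use_fast=True)
    # kwargs["use_fast"] = False wins, so the slow tokenizer is returned.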