From 9af370204773236039cf6dddfd79846372bef210 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Wed, 19 Nov 2025 19:44:23 +0000 Subject: [PATCH 1/4] Add lintrunner --- .lintrunner.toml | 100 ++++++++++++++++++++++++++++++++++++ pyproject.toml | 72 ++++++++++++++++++++++++++ requirements-lintrunner.txt | 6 +++ 3 files changed, 178 insertions(+) create mode 100644 .lintrunner.toml create mode 100644 pyproject.toml create mode 100644 requirements-lintrunner.txt diff --git a/.lintrunner.toml b/.lintrunner.toml new file mode 100644 index 0000000000..e0dcf592af --- /dev/null +++ b/.lintrunner.toml @@ -0,0 +1,100 @@ +merge_base_with = 'origin/main' + +[[linter]] +code = 'RUFF' +include_patterns = [ + '**/*.py', + '**/*.pyi', +] +exclude_patterns = [ + 'cmake/**', + 'build/**', + 'nuget/**', +] +command = [ + 'python', + '-m', + 'lintrunner_adapters', + 'run', + 'ruff_linter', + '--config=pyproject.toml', + '@{{PATHSFILE}}' +] +init_command = [ + 'python', + '-m', + 'lintrunner_adapters', + 'run', + 'pip_init', + '--dry-run={{DRYRUN}}', + '--requirement=requirements-lintrunner.txt', +] +is_formatter = true + + +[[linter]] +code = 'RUFF-FORMAT' +include_patterns = [ + '**/*.py', +] +exclude_patterns = [ + 'cmake/**', + 'build/**', + 'nuget/**', +] +command = [ + 'python', + '-m', + 'lintrunner_adapters', + 'run', + 'ruff_format_linter', + '--', + '@{{PATHSFILE}}' +] +init_command = [ + 'python', + '-m', + 'lintrunner_adapters', + 'run', + 'pip_init', + '--dry-run={{DRYRUN}}', + '--requirement=requirements-lintrunner.txt', +] +is_formatter = true + +[[linter]] +code = 'CLANGFORMAT' +include_patterns = [ + '**/*.h', + '**/*.cc', + '**/*.hpp', + '**/*.cpp', + '**/*.cuh', + '**/*.cu', + '**/*.m', + '**/*.mm', +] +exclude_patterns = [ + 'cmake/**', +] +command = [ + 'python', + '-m', + 'lintrunner_adapters', + 'run', + 'clangformat_linter', + '--binary=clang-format', + '--fallback', + '--', + '@{{PATHSFILE}}' +] +init_command = [ + 'python', + '-m', + 'lintrunner_adapters', + 
'run', + 'pip_init', + '--dry-run={{DRYRUN}}', + '--requirement=requirements-lintrunner.txt', +] +is_formatter = true \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000000..cbc8c84f5d --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,72 @@ +[tool.pydocstyle] +convention = "google" + +[tool.pyright] +exclude = [ + "cmake/**", + "**/node_modules/**", + "**/__pycache__/**", + "**/build/**", + "**/build_*/**", + "**/.DS_Store/**", +] +reportMissingImports = false + +[tool.ruff] +# NOTE: Do not create an exclude list. Edit .lintrunner.toml instead +target-version = "py310" +line-length = 120 + +[tool.ruff.lint] +select = [ + "B", # flake8-bugbear + "C4", # flake8-comprehensions + "E", # pycodestyle + "F", # Pyflakes + "FURB", # refurb + "G", # flake8-logging-format + "I", # isort + "ISC", # flake8-implicit-str-concat + "N", # pep8-naming + "NPY", # numpy + "PERF", # Perflint + "PIE", # flake8-pie + "PLC", # pylint conventions + "PLE", # pylint errors + "PLW", # pylint warnings + "PYI", # flake8-pyi + "RUF", # Ruff-specific rules + "SIM", # flake8-simplify + "SLOT", # flake8-slots + "T10", # flake8-debugger + "UP", # pyupgrade + "W", # pycodestyle + "YTT", # flake8-2020 +] +# NOTE: Refrain from growing the ignore list unless for exceptional cases. +# Always include a comment to explain why. 
+ignore = [ + "B028", # FIXME: Add stacklevel to warnings + "C408", # Sometimes it is preferable when we construct kwargs + "E501", # Line length controlled by ruff format + "G004", # FIXME: Enable when the rule can be autofixed + "N803", # Argument casing + "N812", # Allow import torch.nn.functional as F + "N813", # Allow importing camelcase names in lowercase + "N999", # Module names + "NPY002", # np.random.Generator may not always fit our use cases + "PERF203", # "try-except-in-loop" only affects Python <3.11, and the improvement is minor; can have false positives + "PERF401", # List comprehensions are not always readable + "PYI041", # May create confusion + "PYI024", # May create confusion + "SIM102", # We don't prefer always combining if branches + "SIM103", # Do not collapse if-else + "SIM108", # We don't encourage ternary operators + "SIM114", # Don't combine if branches for debuggability + "SIM116", # Don't use dict lookup to replace if-else + "UP038", # Using X | Y in isinstance checks is a little aggressive +] + +[tool.ruff.lint.per-file-ignores] +# NOTE: Refrain from growing the ignore list unless for exceptional cases. +# Prefer inline ignores with `noqa: xxx`. \ No newline at end of file diff --git a/requirements-lintrunner.txt b/requirements-lintrunner.txt new file mode 100644 index 0000000000..ed0fda1ac9 --- /dev/null +++ b/requirements-lintrunner.txt @@ -0,0 +1,6 @@ +# This file is auto updated by dependabot +# When any package below is changed, you shall run "lintrunner init" again. 
+lintrunner==0.12.7 +lintrunner-adapters==0.12.5 +ruff==0.12.12 +clang-format==20.1.8 From 588769f43285dcf92269b43e3f32424aecfd58b7 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Wed, 19 Nov 2025 19:47:02 +0000 Subject: [PATCH 2/4] format --- benchmark/python/benchmark_e2e.py | 20 +- .../python/benchmark_e2e_continuous_test.py | 10 +- benchmark/python/benchmark_multimodal.py | 145 +- benchmark/python/metrics.py | 53 +- build.py | 8 +- cgmanifests/generate_cgmanifest.py | 4 +- examples/chat_app/app.py | 2 +- examples/chat_app/app_modules/overwrites.py | 4 +- examples/chat_app/app_modules/utils.py | 2 +- .../interface/hddr_llm_onnx_interface.py | 13 +- .../interface/multimodal_onnx_interface.py | 44 +- examples/python/awq-quantized-model.py | 29 +- examples/python/engine/continuous-batching.py | 26 +- examples/python/engine/model-qa.py | 2 +- examples/python/guidance-example.py | 67 +- examples/python/model-chat.py | 189 +- examples/python/model-generate.py | 10 +- examples/python/model-qa.py | 17 +- examples/python/model-vision.py | 47 +- examples/python/phi3-qa.py | 115 +- examples/python/phi4-mm.py | 6 +- examples/python/whisper.py | 3 +- examples/slm_engine/build_scripts/build.py | 17 +- .../slm_engine/build_scripts/build_deps.py | 16 +- examples/slm_engine/test/chat_ui.py | 5 +- examples/slm_engine/test/test_slm_server.py | 3 +- examples/slm_engine/test/test_tool_calling.py | 26 +- src/cuda/search_cuda.cu | 2 +- src/objectivec/test/ort_genai_api_test.mm | 190 +- src/python/py/_dll_directory.py | 12 +- src/python/py/models/__init__.py | 2 +- src/python/py/models/builder.py | 7 +- src/python/py/models/builders/__init__.py | 53 +- src/python/py/models/builders/base.py | 1621 ++++++++++++----- src/python/py/models/builders/chatglm.py | 3 +- src/python/py/models/builders/ernie.py | 3 +- src/python/py/models/builders/gemma.py | 4 +- src/python/py/models/builders/gptoss.py | 9 +- src/python/py/models/builders/granite.py | 3 +- src/python/py/models/builders/llama.py | 
3 +- src/python/py/models/builders/mistral.py | 3 +- src/python/py/models/builders/nemotron.py | 3 +- src/python/py/models/builders/olmo.py | 6 +- src/python/py/models/builders/phi.py | 7 +- src/python/py/models/builders/qwen.py | 4 +- src/python/py/models/builders/smollm.py | 3 +- src/python/py/models/gguf_model.py | 8 +- src/python/py/models/quantized_model.py | 256 ++- .../ios_package_uitest_cpp_api.mm | 45 +- .../macos_package_uitest_cpp_api.mm | 45 +- test/python/_test_utils.py | 18 +- test/python/conftest.py | 8 +- test/python/test_onnxruntime_genai.py | 9 +- test/python/test_onnxruntime_genai_api.py | 140 +- test/python/test_onnxruntime_genai_e2e.py | 4 +- test/test_models/create_dummy_model.py | 4 +- tools/ci_build/get_docker_image.py | 4 +- .../github/android/build_aar_package.py | 3 +- .../apple/build_and_assemble_apple_pods.py | 3 +- .../github/apple/build_apple_framework.py | 2 +- .../github/apple/c/assemble_c_pod_package.py | 8 +- .../objectivec/assemble_objc_pod_package.py | 1 + .../github/apple/package_assembly_utils.py | 7 +- .../github/apple/test_apple_packages.py | 11 +- .../nuget/generate_nuspec_for_custom_nuget.py | 30 +- .../nuget/generate_nuspec_for_native_nuget.py | 28 +- .../nuget/generate_nuspec_for_winml_nuget.py | 30 +- .../model_validation/perplexity_metrics.py | 20 +- .../model_validation/validation_tool.py | 30 +- tools/python/util/__init__.py | 6 +- tools/python/util/android.py | 9 +- tools/python/util/dependency_resolver.py | 44 +- tools/python/util/platform_helpers.py | 2 +- tools/python/util/run.py | 1 + 74 files changed, 2339 insertions(+), 1258 deletions(-) diff --git a/benchmark/python/benchmark_e2e.py b/benchmark/python/benchmark_e2e.py index 6c9f8f2340..9b5962cac1 100644 --- a/benchmark/python/benchmark_e2e.py +++ b/benchmark/python/benchmark_e2e.py @@ -13,18 +13,18 @@ # # 2) Run this script with the desired arguments. Run benchmark_e2e.py -h for help. 
-import onnxruntime_genai as og -import time import argparse -from tqdm import tqdm +import json +import os import subprocess import threading -import psutil -import os -import json -from metrics import BenchmarkRecord +import time import numpy as np +import onnxruntime_genai as og +import psutil +from metrics import BenchmarkRecord +from tqdm import tqdm peak_cpu_memory = 0.0 peak_gpu_memory = 0.0 @@ -42,7 +42,7 @@ def monitor_gpu_memory(): global peak_gpu_memory while not stop_monitoring: - result = subprocess.run(['nvidia-smi', '--query-gpu=memory.used', '--format=csv,noheader,nounits'], capture_output=True, text=True) + result = subprocess.run(['nvidia-smi', '--query-gpu=memory.used', '--format=csv,noheader,nounits'], check=False, capture_output=True, text=True) memory_usage = result.stdout.splitlines() @@ -84,7 +84,7 @@ def generate_prompt(model, tokenizer, prompt_length) -> str: # Use prompt length to get pre-defined prompt def get_prompt_by_length(prompt_length): json_path = "prompts.json" - with open(json_path, "r") as file: + with open(json_path) as file: data = json.load(file) return data[f"{prompt_length}"] @@ -228,7 +228,7 @@ def run_benchmark(args, batch_size, prompt_length, generation_length, max_length if hasattr(model, "type"): model_type = model.type else: - with open(os.path.join(args.input_folder, "genai_config.json"), "r") as f: + with open(os.path.join(args.input_folder, "genai_config.json")) as f: genai_config = json.load(f) model_type = genai_config["model"]["type"] diff --git a/benchmark/python/benchmark_e2e_continuous_test.py b/benchmark/python/benchmark_e2e_continuous_test.py index f945790f69..065815b6f9 100644 --- a/benchmark/python/benchmark_e2e_continuous_test.py +++ b/benchmark/python/benchmark_e2e_continuous_test.py @@ -6,18 +6,20 @@ # This is an end-to-end benchmarking script for any ONNX model. 
# -# Prerequisites: +# Prerequisites: # 0) Install onnxruntime-genai and onnxruntime # # 1) Use builder.py to build the desired ONNX model # # 2) Run this script with the desired arguments. Run benchmark_e2e.py -h for help. -import onnxruntime_genai as og -import time import argparse +import time + +import onnxruntime_genai as og from tqdm import tqdm + def main(args): # Get user arguments num_repetitions = args.repetitions @@ -124,4 +126,4 @@ def main(args): parser.add_argument('-r', '--repetitions', type=int, default=10, help='Number of times to repeat the benchmark') parser.add_argument('-w', '--warmup', type=int, default=5, help='Number of warmup runs before benchmarking') args = parser.parse_args() - main(args) \ No newline at end of file + main(args) diff --git a/benchmark/python/benchmark_multimodal.py b/benchmark/python/benchmark_multimodal.py index 11679b6174..5a39695e4b 100644 --- a/benchmark/python/benchmark_multimodal.py +++ b/benchmark/python/benchmark_multimodal.py @@ -6,7 +6,7 @@ # This is an end-to-end benchmarking script for any multi-modal ONNX model pipeline. 
# -# Prerequisites: +# Prerequisites: # 0) Install ONNX Runtime GenAI and ONNX Runtime # # 1) Create or download the desired ONNX model @@ -15,13 +15,14 @@ import argparse import json -import onnxruntime_genai as og import os -import pandas as pd -import psutil import subprocess import threading import time + +import onnxruntime_genai as og +import pandas as pd +import psutil from tqdm import tqdm peak_cpu_memory = 0.0 @@ -35,12 +36,15 @@ except Exception: IS_NVIDIA_SYSTEM = False + # Monitor the GPU memory usage def monitor_gpu_memory(): global peak_gpu_memory while not stop_monitoring: - result = subprocess.run(['nvidia-smi', '--query-gpu=memory.used', '--format=csv,noheader,nounits'], capture_output=True, text=True) + result = subprocess.run( + ["nvidia-smi", "--query-gpu=memory.used", "--format=csv,noheader,nounits"], check=False, capture_output=True, text=True + ) memory_usage = result.stdout.splitlines() @@ -53,6 +57,7 @@ def monitor_gpu_memory(): print("No GPU Memory Info Found") time.sleep(0.1) + # Monitor the CPU memory usage def monitor_cpu_memory(): global peak_cpu_memory @@ -63,6 +68,7 @@ def monitor_cpu_memory(): peak_cpu_memory = max(peak_cpu_memory, current_used_memory) time.sleep(0.1) + def save_results(results, filename): df = pd.DataFrame( results, @@ -85,6 +91,7 @@ def save_results(results, filename): df.to_csv(filename, header=True, index=False) print(f"Results saved in {filename}!") + def run_benchmark_memory(args, model, processor, image, audio, generation_length, max_length): """ This function is to run benchmark and print the memory usage @@ -102,7 +109,7 @@ def run_benchmark_memory(args, model, processor, image, audio, generation_length monitor_thread = threading.Thread(target=monitor_gpu_memory) else: monitor_thread = threading.Thread(target=monitor_cpu_memory) - + monitor_thread.start() metrics = run_benchmark(args, model, processor, image, audio, generation_length, max_length) @@ -114,9 +121,10 @@ def run_benchmark_memory(args, model, 
processor, image, audio, generation_length metrics.append(peak_gpu_memory) else: metrics.append(peak_cpu_memory) - + return metrics + def run_benchmark(args, model, processor, image, audio, generation_length, max_length): # Get user arguments num_repetitions = args.repetitions @@ -127,12 +135,13 @@ def run_benchmark(args, model, processor, image, audio, generation_length, max_l if hasattr(model, "type"): model_type = model.type else: - with open(os.path.join(args.input_folder, "genai_config.json"), "r") as f: + with open(os.path.join(args.input_folder, "genai_config.json")) as f: genai_config = json.load(f) model_type = genai_config["model"]["type"] # Process prompt, image, and audio - if args.verbose: print("Processing inputs...") + if args.verbose: + print("Processing inputs...") user_prompt = "<|user|>\n" assistant_prompt = "<|assistant|>\n" @@ -143,19 +152,19 @@ def run_benchmark(args, model, processor, image, audio, generation_length, max_l if image is not None and audio is not None: # Image + audio + text main_prompt = "What are some of the similarities and differences between the provided inputs?" - prompt = f'{user_prompt}{image_special}{audio_special}{main_prompt}{prompt_suffix}{assistant_prompt}' + prompt = f"{user_prompt}{image_special}{audio_special}{main_prompt}{prompt_suffix}{assistant_prompt}" elif image is not None: # Image + text main_prompt = "What is shown in this image?" - prompt = f'{user_prompt}{image_special}{main_prompt}{prompt_suffix}{assistant_prompt}' + prompt = f"{user_prompt}{image_special}{main_prompt}{prompt_suffix}{assistant_prompt}" elif audio is not None: # Audio + text main_prompt = "What is described in this audio?" - prompt = f'{user_prompt}{audio_special}{main_prompt}{prompt_suffix}{assistant_prompt}' + prompt = f"{user_prompt}{audio_special}{main_prompt}{prompt_suffix}{assistant_prompt}" else: # Text main_prompt = "What is the meaning of life?" 
- prompt = f'{user_prompt}{main_prompt}{prompt_suffix}{assistant_prompt}' + prompt = f"{user_prompt}{main_prompt}{prompt_suffix}{assistant_prompt}" if model_type == "whisper": decoder_prompt_tokens = ["<|startoftranscript|>", "<|en|>", "<|transcribe|>", "<|notimestamps|>"] @@ -163,14 +172,23 @@ def run_benchmark(args, model, processor, image, audio, generation_length, max_l else: prompts = [prompt] inputs = processor(prompts, images=image, audios=audio) - prompt_length = inputs['input_ids'].shape()[1] - if args.verbose: print(f"Prompt used: {prompt}") + prompt_length = inputs["input_ids"].shape()[1] + if args.verbose: + print(f"Prompt used: {prompt}") params = og.GeneratorParams(model) do_sample = args.top_k > 1 or (args.top_p != 1.0 and args.top_p > 0.0) - params.set_search_options(do_sample=do_sample, top_k=args.top_k, top_p=args.top_p, temperature=temperature, max_length=max_length, min_length=max_length) + params.set_search_options( + do_sample=do_sample, + top_k=args.top_k, + top_p=args.top_p, + temperature=temperature, + max_length=max_length, + min_length=max_length, + ) - if args.verbose: print("Processed inputs, running warmup runs...") + if args.verbose: + print("Processed inputs, running warmup runs...") for _ in tqdm(range(args.warmup)): generator = og.Generator(model, params) generator.set_inputs(inputs) @@ -178,7 +196,8 @@ def run_benchmark(args, model, processor, image, audio, generation_length, max_l while not generator.is_done() and i < generation_length: generator.generate_next_token() i += 1 - if args.print_model_output: print(processor.decode(generator.get_sequence(0))) + if args.print_model_output: + print(processor.decode(generator.get_sequence(0))) # Delete the generator to free the captured graph for the next generator, if graph capture is enabled del generator @@ -187,7 +206,8 @@ def run_benchmark(args, model, processor, image, audio, generation_length, max_l token_gen_times = [] sampling_times = [] wall_clock_times = [] - if args.verbose: 
print(f"Done with warmup, running benchmark for {num_repetitions} repetitions...") + if args.verbose: + print(f"Done with warmup, running benchmark for {num_repetitions} repetitions...") for _ in tqdm(range(num_repetitions)): wall_clock_start_time = time.time() @@ -199,7 +219,14 @@ def run_benchmark(args, model, processor, image, audio, generation_length, max_l # Prepare run params = og.GeneratorParams(model) - params.set_search_options(do_sample=do_sample, top_k=args.top_k, top_p=args.top_p, temperature=temperature, max_length=max_length, min_length=max_length) + params.set_search_options( + do_sample=do_sample, + top_k=args.top_k, + top_p=args.top_p, + temperature=temperature, + max_length=max_length, + min_length=max_length, + ) # Measure prompt processing prompt_start_time = time.perf_counter() @@ -220,12 +247,13 @@ def run_benchmark(args, model, processor, image, audio, generation_length, max_l token_gen_start_time = time.perf_counter() generator.generate_next_token() token_gen_end_time = time.perf_counter() - + token_gen_times.append(token_gen_end_time - token_gen_start_time) i += 1 wall_clock_end_time = time.time() wall_clock_times.append(wall_clock_end_time - wall_clock_start_time) - if args.print_model_output: print(processor.decode(generator.get_sequence(0))) + if args.print_model_output: + print(processor.decode(generator.get_sequence(0))) # Delete the generator to free the captured graph for the next generator, if graph capture is enabled del generator @@ -246,7 +274,7 @@ def run_benchmark(args, model, processor, image, audio, generation_length, max_l avg_token_gen_thrpt = 1 / avg_token_gen_latency_s print(f"Average Token Generation Latency (per token): {avg_token_gen_latency_ms} ms") print(f"Average Token Generation Throughput (per token): {avg_token_gen_thrpt} tps") - + # Calculate sampling metrics avg_sampling_latency_s = sum(sampling_times) / len(sampling_times) avg_sampling_latency_ms = avg_sampling_latency_s * 1000 @@ -266,25 +294,29 @@ def 
run_benchmark(args, model, processor, image, audio, generation_length, max_l max_length, avg_processing_latency_ms, avg_prompt_latency_ms, - avg_token_gen_thrpt, - avg_token_gen_latency_ms, - avg_sampling_thrpt, + avg_token_gen_thrpt, + avg_token_gen_latency_ms, + avg_sampling_thrpt, avg_sampling_latency_ms, avg_wall_clock_thrpt, avg_wall_clock_time, ] return metrics + def main(args): all_csv_metrics = [] # Get tokenizer, and model model_path = args.input_folder - if args.verbose: print(f"Loading model... ") - model=og.Model(f'{model_path}') - if args.verbose: print("Model loaded, loading processor...") + if args.verbose: + print("Loading model... ") + model = og.Model(f"{model_path}") + if args.verbose: + print("Model loaded, loading processor...") processor = model.create_multimodal_processor() - if args.verbose: print("Processor loaded, loading image...") + if args.verbose: + print("Processor loaded, loading image...") # Get image image_path = args.image_path @@ -304,7 +336,8 @@ def main(args): else: audio = None - if args.verbose: print("Image loaded, starting benchmark...") + if args.verbose: + print("Image loaded, starting benchmark...") for g, gen_length in enumerate(args.generation_lengths): if args.max_lengths: max_length = args.max_lengths[g] @@ -314,29 +347,49 @@ def main(args): metrics = run_benchmark_memory(args, model, processor, image, audio, gen_length, max_length) all_csv_metrics.append(metrics) # Add metrics to CSV - if args.verbose: print("Adding results to CSV") + if args.verbose: + print("Adding results to CSV") filename = args.output save_results(all_csv_metrics, filename) + def str2intlist(value): - return [int(v) for v in value.split(',')] + return [int(v) for v in value.split(",")] + def str2strlist(value): - return [str(v) for v in value.split(',')] + return [str(v) for v in value.split(",")] + if __name__ == "__main__": parser = argparse.ArgumentParser(description="End-to-end benchmarking for ONNX Runtime GenAI") - 
parser.add_argument('-i', '--input_folder', type=str, required=True, help='Onnx model folder path (must contain genai_config.json and ONNX models)') - parser.add_argument('-au', '--audio_path', type=str, default="", required=False, help='Path to the audio') - parser.add_argument('-im', '--image_path', type=str, default="", required=False, help='Path to the image') - parser.add_argument('-g', '--generation_lengths', type=str2intlist, default=[256], help='Number of tokens to generate after prompt') - parser.add_argument('-m', '--max_lengths', type=str2intlist, default=[7680], help='Max length buffer sizes... User should supply one for every Generation length') - parser.add_argument('-r', '--repetitions', type=int, default=30, help='Number of times to repeat the benchmark') - parser.add_argument('-w', '--warmup', type=int, default=5, help='Number of warmup runs before benchmarking') - parser.add_argument('-k', '--top_k', type=int, default=50, help='Top k tokens to sample from') - parser.add_argument('-p', '--top_p', type=float, default=1.0, help='Top p probability to sample with') - parser.add_argument('-o', '--output', type=str, default='genai_e2e.csv', help='Output CSV file name or path (with .csv extension)') - parser.add_argument('-v', '--verbose', action='store_true', help='Print extra information') - parser.add_argument('-mo', '--print_model_output', action='store_true', help='Print model output') + parser.add_argument( + "-i", + "--input_folder", + type=str, + required=True, + help="Onnx model folder path (must contain genai_config.json and ONNX models)", + ) + parser.add_argument("-au", "--audio_path", type=str, default="", required=False, help="Path to the audio") + parser.add_argument("-im", "--image_path", type=str, default="", required=False, help="Path to the image") + parser.add_argument( + "-g", "--generation_lengths", type=str2intlist, default=[256], help="Number of tokens to generate after prompt" + ) + parser.add_argument( + "-m", + "--max_lengths", 
+ type=str2intlist, + default=[7680], + help="Max length buffer sizes... User should supply one for every Generation length", + ) + parser.add_argument("-r", "--repetitions", type=int, default=30, help="Number of times to repeat the benchmark") + parser.add_argument("-w", "--warmup", type=int, default=5, help="Number of warmup runs before benchmarking") + parser.add_argument("-k", "--top_k", type=int, default=50, help="Top k tokens to sample from") + parser.add_argument("-p", "--top_p", type=float, default=1.0, help="Top p probability to sample with") + parser.add_argument( + "-o", "--output", type=str, default="genai_e2e.csv", help="Output CSV file name or path (with .csv extension)" + ) + parser.add_argument("-v", "--verbose", action="store_true", help="Print extra information") + parser.add_argument("-mo", "--print_model_output", action="store_true", help="Print model output") args = parser.parse_args() main(args) diff --git a/benchmark/python/metrics.py b/benchmark/python/metrics.py index a698d524b8..421ca66b98 100644 --- a/benchmark/python/metrics.py +++ b/benchmark/python/metrics.py @@ -6,7 +6,6 @@ import datetime import json -from typing import Optional import pandas as pd @@ -30,10 +29,10 @@ def to_dict(self): class ModelInfo(BaseObject): def __init__( self, - full_name: Optional[str] = None, - is_huggingface: Optional[bool] = False, - is_text_generation: Optional[bool] = False, - short_name: Optional[str] = None, + full_name: str | None = None, + is_huggingface: bool | None = False, + is_text_generation: bool | None = False, + short_name: str | None = None, ): super().__init__() self.full_name = full_name @@ -46,9 +45,9 @@ def __init__( class BackendOptions(BaseObject): def __init__( self, - enable_profiling: Optional[bool] = False, - execution_provider: Optional[str] = None, - use_io_binding: Optional[bool] = False, + enable_profiling: bool | None = False, + execution_provider: str | None = None, + use_io_binding: bool | None = False, ): 
super().__init__() self.enable_profiling = enable_profiling @@ -59,12 +58,12 @@ def __init__( class Config(BaseObject): def __init__( self, - backend: Optional[str] = "onnxruntime-genai", - batch_size: Optional[int] = 1, - seq_length: Optional[int] = 0, - precision: Optional[str] = "fp32", - warmup_runs: Optional[int] = 1, - measured_runs: Optional[int] = 10, + backend: str | None = "onnxruntime-genai", + batch_size: int | None = 1, + seq_length: int | None = 0, + precision: str | None = "fp32", + warmup_runs: int | None = 1, + measured_runs: int | None = 10, ): super().__init__() self.backend = backend @@ -80,11 +79,11 @@ def __init__( class Metadata(BaseObject): def __init__( self, - device: Optional[str] = None, - package_name: Optional[str] = None, - package_version: Optional[str] = None, - platform: Optional[str] = None, - python_version: Optional[str] = None, + device: str | None = None, + package_name: str | None = None, + package_version: str | None = None, + platform: str | None = None, + python_version: str | None = None, ): super().__init__() self.device = device @@ -97,9 +96,9 @@ def __init__( class Metrics(BaseObject): def __init__( self, - latency_ms_mean: Optional[float] = 0.0, - throughput_qps: Optional[float] = 0.0, - max_memory_usage_GB: Optional[float] = 0.0, + latency_ms_mean: float | None = 0.0, + throughput_qps: float | None = 0.0, + max_memory_usage_GB: float | None = 0.0, ): super().__init__() self.latency_ms_mean = latency_ms_mean @@ -116,10 +115,10 @@ def __init__( device: str, package_name: str, package_version: str, - batch_size: Optional[int] = 1, - warmup_runs: Optional[int] = 1, - measured_runs: Optional[int] = 10, - trigger_date: Optional[str] = None, + batch_size: int | None = 1, + warmup_runs: int | None = 1, + measured_runs: int | None = 10, + trigger_date: str | None = None, ): self.config = Config() self.metrics = Metrics() @@ -161,4 +160,4 @@ def save_as_json(cls, file_name: str, records: list) -> None: return rds = 
[record.to_dict() for record in records] with open(file_name, "w") as f: - json.dump(rds, f, indent=4, default=str) \ No newline at end of file + json.dump(rds, f, indent=4, default=str) diff --git a/build.py b/build.py index f4e0841fbe..52d5e92cb3 100644 --- a/build.py +++ b/build.py @@ -11,14 +11,12 @@ import shutil import sys import textwrap - from pathlib import Path REPO_ROOT = Path(__file__).parent sys.path.append(str(REPO_ROOT / "tools" / "python")) import util # ./tools/python/util noqa: E402 - log = util.get_logger("build.py") @@ -133,7 +131,7 @@ class HelpFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescript parser.add_argument("--use_dml", action="store_true", help="Whether to use DML. Default is to not use DML.") parser.add_argument("--use_guidance", action="store_true", help="Whether to add guidance support. Default is False.") - + # The following options are mutually exclusive (cross compiling options such as android, ios, etc.) platform_group = parser.add_mutually_exclusive_group() platform_group.add_argument("--android", action="store_true", help="Build for Android") @@ -337,7 +335,7 @@ def _validate_ios_args(args: argparse.Namespace): raise ValueError( "iOS build on MacOS canceled due to missing arguments: " + ", ".join( - val for val, cond in zip(arg_names, needed_args) if not cond + val for val, cond in zip(arg_names, needed_args, strict=False) if not cond ) ) @@ -791,7 +789,7 @@ def build_examples(args: argparse.Namespace, env: dict[str, str]): if arguments.build: build(arguments, environment) - + if arguments.package: package(arguments, environment) diff --git a/cgmanifests/generate_cgmanifest.py b/cgmanifests/generate_cgmanifest.py index 58cf1f8249..abd30ebb00 100644 --- a/cgmanifests/generate_cgmanifest.py +++ b/cgmanifests/generate_cgmanifest.py @@ -115,9 +115,7 @@ def normalize_path_separators(path): submodule_lines = proc.stdout.splitlines() for submodule_line in submodule_lines: (absolute_path, url, commit) = 
submodule_line.split(" ") - git_deps[GitDep(commit, url)] = "git submodule at {}".format( - normalize_path_separators(os.path.relpath(absolute_path, REPO_DIR)) - ) + git_deps[GitDep(commit, url)] = f"git submodule at {normalize_path_separators(os.path.relpath(absolute_path, REPO_DIR))}" with open(os.path.join(SCRIPT_DIR, "..", "cmake", "deps.txt")) as f: depfile_reader = csv.reader(f, delimiter=";") diff --git a/examples/chat_app/app.py b/examples/chat_app/app.py index 097286e983..2b0492d13b 100755 --- a/examples/chat_app/app.py +++ b/examples/chat_app/app.py @@ -258,7 +258,7 @@ def launch_chat_app(expose_locally: bool = False, model_name: str = "", model_pa # check if genai_config.json in the model foler if "genai_config.json" not in os.listdir(model_path): raise ValueError(f"Your model_path folder do not include 'genai.json' file, please double check your model_path '{model_path}'") - + if args.model_name: model_name = args.model_name diff --git a/examples/chat_app/app_modules/overwrites.py b/examples/chat_app/app_modules/overwrites.py index 88ff8634cb..8807b89027 100755 --- a/examples/chat_app/app_modules/overwrites.py +++ b/examples/chat_app/app_modules/overwrites.py @@ -1,12 +1,10 @@ from __future__ import annotations -from typing import List, Tuple - from .presets import gr from .utils import convert_asis, convert_mdtext, detect_converted_mark -def postprocess(self, y: List[Tuple[str | None, str | None]]) -> List[Tuple[str | None, str | None]]: +def postprocess(self, y: list[tuple[str | None, str | None]]) -> list[tuple[str | None, str | None]]: """Each message and response should be a string, which may be in Markdown format. 
Returns: diff --git a/examples/chat_app/app_modules/utils.py b/examples/chat_app/app_modules/utils.py index ae852aebbe..1ce8ef0060 100755 --- a/examples/chat_app/app_modules/utils.py +++ b/examples/chat_app/app_modules/utils.py @@ -68,7 +68,7 @@ def convert_mdtext(md_text): non_code_parts = code_block_pattern.split(md_text)[::2] result = [] - for non_code, code in zip(non_code_parts, [*code_blocks, ""]): + for non_code, code in zip(non_code_parts, [*code_blocks, ""], strict=False): if non_code.strip(): formatted_non_code = normalize_markdown(non_code) if inline_code_pattern.search(formatted_non_code): diff --git a/examples/chat_app/interface/hddr_llm_onnx_interface.py b/examples/chat_app/interface/hddr_llm_onnx_interface.py index 8ec8140daf..ba79c17b10 100755 --- a/examples/chat_app/interface/hddr_llm_onnx_interface.py +++ b/examples/chat_app/interface/hddr_llm_onnx_interface.py @@ -2,13 +2,14 @@ import logging import os import sys + import onnxruntime_genai as og from app_modules.utils import convert_to_markdown, is_stop_word_or_prefix, shared_state current_dir = os.path.dirname(os.path.realpath(__file__)) sys.path.append(os.path.join(current_dir, "..", "..", "..")) -class ONNXModel(): +class ONNXModel: """A wrapper for OnnxRuntime-GenAI to run ONNX LLM model.""" def __init__(self, model_path, execution_provider): @@ -38,10 +39,10 @@ def __init__(self, model_path, execution_provider): self.history_template = """<|start_header_id|>user<|end_header_id|> {input}<|eot_id|><|start_header_id|>assistant<|end_header_id|> {response}<|eot_id|>""" - + self.chat_template = """<|start_header_id|>user<|end_header_id|> {input}<|eot_id|><|start_header_id|>assistant<|end_header_id|>""" - + #self.chat_template = llama3_template else: self.enable_history_max = 2 @@ -184,10 +185,10 @@ def predict( print(type(e).__name__, e) return - + def shutdown(self): pass - + def retry(self, chatbot, history, max_length_tokens, max_context_length_tokens, token_printing_step): if len(history) == 
0: yield chatbot, history, "Empty context" @@ -201,4 +202,4 @@ def retry(self, chatbot, history, max_length_tokens, max_context_length_tokens, max_length_tokens, max_context_length_tokens, token_printing_step, - ) \ No newline at end of file + ) diff --git a/examples/chat_app/interface/multimodal_onnx_interface.py b/examples/chat_app/interface/multimodal_onnx_interface.py index 4ae959f9bd..909915a540 100755 --- a/examples/chat_app/interface/multimodal_onnx_interface.py +++ b/examples/chat_app/interface/multimodal_onnx_interface.py @@ -1,13 +1,13 @@ import gc + import onnxruntime_genai as og -from consts import default_prompt, logging from app_modules.utils import convert_to_markdown, shared_state +from consts import default_prompt, logging logging.getLogger("interface") -class MultiModal_ONNXModel(): - +class MultiModal_ONNXModel: """A wrapper for ONNXRuntime GenAI to run ONNX Multimodal model""" def __init__(self, model_path, execution_provider): @@ -32,19 +32,19 @@ def __init__(self, model_path, execution_provider): def generate_prompt_with_history(self, images, history, text=default_prompt, max_length=3072): prompt = "" - for dialog in history[-self.enable_history_max:]: - prompt += f'{self.history_template.format(input=dialog[0], response=dialog[1])}' + for dialog in history[-self.enable_history_max :]: + prompt += f"{self.history_template.format(input=dialog[0], response=dialog[1])}" prompt = self.template_header + prompt image_tags = "" for i in range(len(images)): - image_tags += f"<|image_{i+1}|>\n" + image_tags += f"<|image_{i + 1}|>\n" - prompt += f'{self.chat_template.format(input=text, tags=image_tags)}' + prompt += f"{self.chat_template.format(input=text, tags=image_tags)}" if len(prompt) > max_length: history.clear() - prompt = f'{self.chat_template.format(input=text, tags=image_tags)}' + prompt = f"{self.chat_template.format(input=text, tags=image_tags)}" self.images = og.Images.open(*images) @@ -52,7 +52,6 @@ def generate_prompt_with_history(self, 
images, history, text=default_prompt, max inputs = self.processor(prompt, images=self.images) return inputs - def search(self, inputs, max_length: int = 3072, token_printing_step: int = 1): output = "" params = og.GeneratorParams(self.model) @@ -72,16 +71,12 @@ def search(self, inputs, max_length: int = 3072, token_printing_step: int = 1): return output def predict(self, text, chatbot, history, max_length_tokens, max_context_length_tokens, token_printing_step, *args): - if text == "": yield chatbot, history, "Empty context" return inputs = self.generate_prompt_with_history( - text=text, - history=history, - images=args[0], - max_length=max_context_length_tokens + text=text, history=history, images=args[0], max_length=max_context_length_tokens ) sentence = self.search( @@ -91,13 +86,15 @@ def predict(self, text, chatbot, history, max_length_tokens, max_context_length_ ) sentence = sentence.strip() - a, b = [[y[0], convert_to_markdown(y[1])] for y in history] + [[text, convert_to_markdown(sentence)]], [ - *history, - [ text, sentence], - ] + a, b = ( + [[y[0], convert_to_markdown(y[1])] for y in history] + [[text, convert_to_markdown(sentence)]], + [ + *history, + [text, sentence], + ], + ) yield a, b, "Generating ... 
" - if shared_state.interrupted: shared_state.recover() try: @@ -129,12 +126,5 @@ def retry(self, chatbot, history, max_length_tokens, max_context_length_tokens, inputs = history.pop()[0] yield from self.predict( - inputs, - chatbot, - history, - max_length_tokens, - max_context_length_tokens, - token_printing_step, - args[0] + inputs, chatbot, history, max_length_tokens, max_context_length_tokens, token_printing_step, args[0] ) - diff --git a/examples/python/awq-quantized-model.py b/examples/python/awq-quantized-model.py index 884225faed..fd9991b3c7 100644 --- a/examples/python/awq-quantized-model.py +++ b/examples/python/awq-quantized-model.py @@ -1,11 +1,12 @@ import argparse +import json import os +import onnxruntime_genai as og from awq import AutoAWQForCausalLM -from transformers import AutoTokenizer from onnxruntime_genai.models.builder import create_model -import onnxruntime_genai as og -import json +from transformers import AutoTokenizer + def parse_args(): parser = argparse.ArgumentParser() @@ -48,13 +49,12 @@ def parse_args(): args = parser.parse_args() return args + def quantize_model(args): - quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" } + quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"} # Load model - model = AutoAWQForCausalLM.from_pretrained( - args.model_path, **{"low_cpu_mem_usage": True, "use_cache": False} - ) + model = AutoAWQForCausalLM.from_pretrained(args.model_path, low_cpu_mem_usage=True, use_cache=False) tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True) # Quantize model @@ -66,6 +66,7 @@ def quantize_model(args): print(f'Model is quantized and saved at "{args.quant_path}"') + def run_model(args): # Load model print("Loading model...") @@ -81,8 +82,8 @@ def run_model(args): # Override any default search options in `genai_config.json` search_options = { - 'min_length': 1, - 'max_length': 2048, + "min_length": 1, + 
"max_length": 2048, } while True: @@ -104,7 +105,7 @@ def run_model(args): generator.append_tokens(input_tokens) print() - print("Output: ", end='', flush=True) + print("Output: ", end="", flush=True) try: while True: @@ -113,7 +114,7 @@ def run_model(args): break new_token = generator.get_next_tokens()[0] - print(tokenizer_stream.decode(new_token), end='', flush=True) + print(tokenizer_stream.decode(new_token), end="", flush=True) except KeyboardInterrupt: print(" --control+c pressed, aborting generation--") print() @@ -122,6 +123,7 @@ def run_model(args): # Delete the generator to free the captured graph for the next generator, if graph capture is enabled del generator + def main(): args = parse_args() @@ -144,9 +146,12 @@ def main(): if args.execution_provider == "dml": if og.__id__ != "onnxruntime-genai-directml": - raise ValueError(f"onnxruntime-genai-directml is required to be installed. Please uninstall all ORT GenAI packages with `pip uninstall -y onnxruntime-genai onnxruntime-genai-cuda onnxruntime-genai-directml` and only install the DML version with `pip install onnxruntime-genai-directml`.") + raise ValueError( + "onnxruntime-genai-directml is required to be installed. Please uninstall all ORT GenAI packages with `pip uninstall -y onnxruntime-genai onnxruntime-genai-cuda onnxruntime-genai-directml` and only install the DML version with `pip install onnxruntime-genai-directml`." 
+ ) # Run ONNX model run_model(args) + if __name__ == "__main__": main() diff --git a/examples/python/engine/continuous-batching.py b/examples/python/engine/continuous-batching.py index aaee994918..cdf979aed7 100644 --- a/examples/python/engine/continuous-batching.py +++ b/examples/python/engine/continuous-batching.py @@ -19,9 +19,7 @@ def get_random_prompts(num_questions: int, split="validation") -> list[str]: class ClientRequest: - def __init__( - self, prompt: str, model: og.Model, tokenizer: og.Tokenizer, opaque_data: any - ): + def __init__(self, prompt: str, model: og.Model, tokenizer: og.Tokenizer, opaque_data: any): self.prompt = prompt self.params = og.GeneratorParams(model) self.params.set_search_options( @@ -37,11 +35,7 @@ def __init__( self.request = og.Request(self.params) self.request.add_tokens( - tokenizer.encode( - tokenizer.apply_chat_template( - messages=messages, add_generation_prompt=True - ) - ) + tokenizer.encode(tokenizer.apply_chat_template(messages=messages, add_generation_prompt=True)) ) self.request.set_opaque_data(opaque_data) self.streaming_tokenizer = tokenizer.create_stream() @@ -76,9 +70,7 @@ def __init__( self.engine.add_request(request.request) def fill(self): - for i, prompt in enumerate( - self.prompts[int(len(self.prompts) * self.load_factor) :] - ): + for i, prompt in enumerate(self.prompts[int(len(self.prompts) * self.load_factor) :]): request = ClientRequest(prompt, self.model, self.tokenizer, self) with self.lock: self.requests.append(request) @@ -87,19 +79,13 @@ def fill(self): def drain(self, request: og.Request): with self.lock: - client_request = next( - (r for r in self.requests if r.request == request), None - ) + client_request = next((r for r in self.requests if r.request == request), None) while request.has_unseen_tokens(): token = request.get_unseen_token() - client_request.token_stream += ( - client_request.streaming_tokenizer.decode(token) - ) + client_request.token_stream += 
client_request.streaming_tokenizer.decode(token) if request.is_done(): - assert ( - client_request is not None - ), "Client request not found in the pool" + assert client_request is not None, "Client request not found in the pool" if self.debug: print(f"🫵 : {client_request.prompt}") diff --git a/examples/python/engine/model-qa.py b/examples/python/engine/model-qa.py index 14969ffcf5..cb8e2458b7 100644 --- a/examples/python/engine/model-qa.py +++ b/examples/python/engine/model-qa.py @@ -45,7 +45,7 @@ def run(args: argparse.Namespace): engine.add_request(request) - print(f"🤖 :", end="", flush=True) + print("🤖 :", end="", flush=True) while ready_request := engine.step(): while ready_request.has_unseen_tokens(): diff --git a/examples/python/guidance-example.py b/examples/python/guidance-example.py index 335017a006..3d9a068783 100644 --- a/examples/python/guidance-example.py +++ b/examples/python/guidance-example.py @@ -1,20 +1,18 @@ -import onnxruntime_genai as og import argparse -import time import json +import time +import onnxruntime_genai as og from datasets import load_dataset + def main(args): dataset = load_dataset(path="epfl-dlab/JSONSchemaBench", name="Github_hard", split="test") - schema = json.loads(dataset[0]['json_schema']) + schema = json.loads(dataset[0]["json_schema"]) system_prompt = "You need to generate a JSON object that matches the schema below." 
user_prompt = json.dumps(schema, indent=2) - messages = [ - {"role": "system", "content": system_prompt}, - {"role": "user", "content": user_prompt} - ] + messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}] config = og.Config(args.model_path) if args.execution_provider != "follow_config": @@ -26,35 +24,35 @@ def main(args): tokenizer = og.Tokenizer(model) tokenizer_stream = tokenizer.create_stream() - search_options = {name:getattr(args, name) for name in ['do_sample', 'max_length', 'min_length', 'top_p', 'top_k', 'temperature', 'repetition_penalty'] if name in args} - search_options['batch_size'] = 1 - search_options['temperature'] = 0.0 + search_options = { + name: getattr(args, name) + for name in ["do_sample", "max_length", "min_length", "top_p", "top_k", "temperature", "repetition_penalty"] + if name in args + } + search_options["batch_size"] = 1 + search_options["temperature"] = 0.0 params = og.GeneratorParams(model) params.set_search_options(**search_options) - schema["x-guidance"] = { - "whitespace_flexible": False, - "key_separator": ": ", - "item_separator": ", " - } + schema["x-guidance"] = {"whitespace_flexible": False, "key_separator": ": ", "item_separator": ", "} guidance_type = "lark_grammar" guidance_input = f"""start: %json {json.dumps(schema)}\n""" params.set_search_options(**search_options) - params.set_guidance(guidance_type, guidance_input, args.enable_ff_tokens) # set guidance + params.set_guidance(guidance_type, guidance_input, args.enable_ff_tokens) # set guidance generator = og.Generator(model, params) final_prompt = tokenizer.apply_chat_template(messages=json.dumps(messages), add_generation_prompt=True) final_input = tokenizer.encode(final_prompt) generator.append_tokens(final_input) - + start_len = len(generator.get_sequence(0)) prev_len = start_len t0 = time.time() # for i in range(15): full_seq_str = "" - while not generator.is_done(): + while not generator.is_done(): 
generator.generate_next_token() # NOTE: since get_next_tokens returns only the last token, we'll need to use get_sequence instead @@ -66,7 +64,7 @@ def main(args): seq_str = "" for token in new_tokens: seq_str += tokenizer_stream.decode(token) - print(seq_str, end='', flush=True) + print(seq_str, end="", flush=True) prev_len = len(seq) full_seq_str += seq_str latency = time.time() - t0 @@ -82,9 +80,30 @@ def main(args): if __name__ == "__main__": - parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS, description="End-to-end AI Question/Answer example for gen-ai") - parser.add_argument('-m', '--model_path', type=str, required=True, help='Onnx model folder path (must contain genai_config.json and model.onnx)') - parser.add_argument('-e', '--execution_provider', type=str, required=False, default='follow_config', choices=["cpu", "cuda", "dml", "follow_config"], help="Execution provider to run the ONNX Runtime session with. Defaults to follow_config that uses the execution provider listed in the genai_config.json instead.") - parser.add_argument('--enable_ff_tokens', action='store_true', default=False, help='Enable feed-forward tokens in the model session if supported (default: False)') + parser = argparse.ArgumentParser( + argument_default=argparse.SUPPRESS, description="End-to-end AI Question/Answer example for gen-ai" + ) + parser.add_argument( + "-m", + "--model_path", + type=str, + required=True, + help="Onnx model folder path (must contain genai_config.json and model.onnx)", + ) + parser.add_argument( + "-e", + "--execution_provider", + type=str, + required=False, + default="follow_config", + choices=["cpu", "cuda", "dml", "follow_config"], + help="Execution provider to run the ONNX Runtime session with. 
Defaults to follow_config that uses the execution provider listed in the genai_config.json instead.", + ) + parser.add_argument( + "--enable_ff_tokens", + action="store_true", + default=False, + help="Enable feed-forward tokens in the model session if supported (default: False)", + ) args = parser.parse_args() - main(args) \ No newline at end of file + main(args) diff --git a/examples/python/model-chat.py b/examples/python/model-chat.py index 1b9465cf35..44c9add4ff 100644 --- a/examples/python/model-chat.py +++ b/examples/python/model-chat.py @@ -1,12 +1,14 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. -import onnxruntime_genai as og import argparse -import os import json +import os import time +import onnxruntime_genai as og + + def get_tools_list(input_tools): # input_tools format: '[{"name": "fn1", "description": "fn details", "parameters": {"p1": {"description": "details", "type": "string"}}}, # {"fn2": 2},{"fn3": 3}]' @@ -14,17 +16,19 @@ def get_tools_list(input_tools): try: tools_list = json.loads(input_tools) except json.JSONDecodeError: - raise ValueError("Invalid JSON format for tools list, expected format: '[{\"name\": \"fn1\"},{\"name\": \"fn2\"}]'") + raise ValueError('Invalid JSON format for tools list, expected format: \'[{"name": "fn1"},{"name": "fn2"}]\'') if len(tools_list) == 0: raise ValueError("Tools list cannot be empty") return tools_list + def create_prompt_tool_input(tools_list): tool_input = str(tools_list[0]) for tool in tools_list[1:]: - tool_input += ',' + str(tool) + tool_input += "," + str(tool) return tool_input + def get_json_grammar(input_tools): tools_list = get_tools_list(input_tools) prompt_tool_input = create_prompt_tool_input(tools_list) @@ -33,19 +37,28 @@ def get_json_grammar(input_tools): else: output = '{ "anyOf": [' + json.dumps(tools_list[0]) for tool in tools_list[1:]: - output += ',' + json.dumps(tool) - output += '] }' + output += "," + json.dumps(tool) + output += "] 
}" return prompt_tool_input, output + def get_lark_grammar(input_tools): tools_list = get_tools_list(input_tools) prompt_tool_input = create_prompt_tool_input(tools_list) if len(tools_list) == 1: # output = ("start: TEXT | fun_call\n" "TEXT: /[^{](.|\\n)*/\n" " fun_call: <|tool_call|> %json " + json.dumps(tools_list[0])) - output = ("start: TEXT | fun_call\n" "TEXT: /[^{](.|\\n)*/\n" " fun_call: <|tool_call|> %json " + json.dumps(convert_tool_to_grammar_input(tools_list[0]))) + output = "start: TEXT | fun_call\nTEXT: /[^{](.|\\n)*/\n fun_call: <|tool_call|> %json " + json.dumps( + convert_tool_to_grammar_input(tools_list[0]) + ) return prompt_tool_input, output else: - return prompt_tool_input, "start: TEXT | fun_call \n TEXT: /[^{](.|\n)*/ \n fun_call: <|tool_call|> %json {\"anyOf\": [" + ','.join([json.dumps(tool) for tool in tools_list]) + "]}" + return ( + prompt_tool_input, + 'start: TEXT | fun_call \n TEXT: /[^{](.|\n)*/ \n fun_call: <|tool_call|> %json {"anyOf": [' + + ",".join([json.dumps(tool) for tool in tools_list]) + + "]}", + ) + def convert_tool_to_grammar_input(tool): param_props = {} @@ -53,23 +66,23 @@ def convert_tool_to_grammar_input(tool): for param_name, param_info in tool.get("parameters", {}).items(): param_props[param_name] = { "type": param_info.get("type", "string"), - "description": param_info.get("description", "") + "description": param_info.get("description", ""), } required_params.append(param_name) output_schema = { - "description": tool.get('description', ''), + "description": tool.get("description", ""), "type": "object", "required": ["name", "parameters"], "additionalProperties": False, "properties": { - "name": { "const": tool["name"] }, + "name": {"const": tool["name"]}, "parameters": { "type": "object", "properties": param_props, "required": required_params, - "additionalProperties": False - } - } + "additionalProperties": False, + }, + }, } if len(param_props) == 0: output_schema["required"] = ["name"] @@ -77,7 +90,8 @@ def 
convert_tool_to_grammar_input(tool): def main(args): - if args.verbose: print("Loading model...") + if args.verbose: + print("Loading model...") if args.timings: started_timestamp = 0 first_token_timestamp = 0 @@ -86,21 +100,30 @@ def main(args): if args.execution_provider != "follow_config": config.clear_providers() if args.execution_provider != "cpu": - if args.verbose: print(f"Setting model to {args.execution_provider}") + if args.verbose: + print(f"Setting model to {args.execution_provider}") config.append_provider(args.execution_provider) model = og.Model(config) - if args.verbose: print("Model loaded") - + if args.verbose: + print("Model loaded") + tokenizer = og.Tokenizer(model) tokenizer_stream = tokenizer.create_stream() - if args.verbose: print("Tokenizer created") - if args.verbose: print() + if args.verbose: + print("Tokenizer created") + if args.verbose: + print() - search_options = {name:getattr(args, name) for name in ['do_sample', 'max_length', 'min_length', 'top_p', 'top_k', 'temperature', 'repetition_penalty'] if name in args} - search_options['batch_size'] = 1 + search_options = { + name: getattr(args, name) + for name in ["do_sample", "max_length", "min_length", "top_p", "top_k", "temperature", "repetition_penalty"] + if name in args + } + search_options["batch_size"] = 1 - if args.verbose: print(search_options) + if args.verbose: + print(search_options) system_prompt = args.system_prompt guidance_type = "" @@ -130,7 +153,8 @@ def main(args): print("Guidance input is:", guidance_input) generator = og.Generator(model, params) - if args.verbose: print("Generator created") + if args.verbose: + print("Generator created") if guidance_type == "json_schema" or guidance_type == "lark_grammar": messages = f"""[{{"role": "system", "content": "{system_prompt}", "tools": "{prompt_tool_input}"}}]""" else: @@ -141,9 +165,11 @@ def main(args): tokenizer_input_system_prompt = None jinja_path = os.path.join(args.model_path, "chat_template.jinja") if 
os.path.exists(jinja_path): - with open(jinja_path, "r", encoding="utf-8") as f: + with open(jinja_path, encoding="utf-8") as f: template_str = f.read() - tokenizer_input_system_prompt = tokenizer.apply_chat_template(messages=messages, add_generation_prompt=False, template_str=template_str) + tokenizer_input_system_prompt = tokenizer.apply_chat_template( + messages=messages, add_generation_prompt=False, template_str=template_str + ) else: tokenizer_input_system_prompt = tokenizer.apply_chat_template(messages=messages, add_generation_prompt=False) @@ -164,26 +190,30 @@ def main(args): if text == "quit()": break - if args.timings: started_timestamp = time.time() + if args.timings: + started_timestamp = time.time() messages = f"""[{{"role": "user", "content": "{text}"}}]""" # Apply Chat Template user_prompt = "" if os.path.exists(jinja_path): - user_prompt = tokenizer.apply_chat_template(messages=messages, add_generation_prompt=True, template_str=template_str) + user_prompt = tokenizer.apply_chat_template( + messages=messages, add_generation_prompt=True, template_str=template_str + ) else: user_prompt = tokenizer.apply_chat_template(messages=messages, add_generation_prompt=True) input_tokens = tokenizer.encode(user_prompt) generator.append_tokens(input_tokens) - if args.verbose: print("Running generation loop ...") + if args.verbose: + print("Running generation loop ...") if args.timings: first = True new_tokens = [] print() - print("Output: ", end='', flush=True) + print("Output: ", end="", flush=True) try: while True: @@ -197,8 +227,9 @@ def main(args): break new_token = generator.get_next_tokens()[0] - print(tokenizer_stream.decode(new_token), end='', flush=True) - if args.timings: new_tokens.append(new_token) + print(tokenizer_stream.decode(new_token), end="", flush=True) + if args.timings: + new_tokens.append(new_token) except KeyboardInterrupt: print(" --control+c pressed, aborting generation--") print() @@ -207,7 +238,9 @@ def main(args): if args.timings: 
prompt_time = first_token_timestamp - started_timestamp run_time = time.time() - first_token_timestamp - print(f"Prompt length: {len(input_tokens)}, New tokens: {len(new_tokens)}, Time to first: {(prompt_time):.2f}s, Prompt tokens per second: {len(input_tokens)/prompt_time:.2f} tps, New tokens per second: {len(new_tokens)/run_time:.2f} tps") + print( + f"Prompt length: {len(input_tokens)}, New tokens: {len(new_tokens)}, Time to first: {(prompt_time):.2f}s, Prompt tokens per second: {len(input_tokens) / prompt_time:.2f} tps, New tokens per second: {len(new_tokens) / run_time:.2f} tps" + ) # Rewind the generator to the system prompt, this will erase all the memory of the model. if args.rewind: @@ -215,21 +248,79 @@ def main(args): if __name__ == "__main__": - parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS, description="End-to-end AI Question/Answer example for gen-ai") - parser.add_argument('-m', '--model_path', type=str, required=True, help='Onnx model folder path (must contain genai_config.json and model.onnx)') - parser.add_argument('-e', '--execution_provider', type=str, required=False, default='follow_config', choices=["cpu", "cuda", "dml", "follow_config"], help="Execution provider to run the ONNX Runtime session with. Defaults to follow_config that uses the execution provider listed in the genai_config.json instead.") - parser.add_argument('-i', '--min_length', type=int, help='Min number of tokens to generate including the prompt') - parser.add_argument('-l', '--max_length', type=int, help='Max number of tokens to generate including the prompt') - parser.add_argument('-ds', '--do_sample', action='store_true', help='Do random sampling. When false, greedy or beam search are used to generate the output. 
Defaults to false') - parser.add_argument('-p', '--top_p', type=float, help='Top p probability to sample with') - parser.add_argument('-k', '--top_k', type=int, help='Top k tokens to sample from') - parser.add_argument('-t', '--temperature', type=float, help='Temperature to sample with') - parser.add_argument('-re', '--repetition_penalty', type=float, help='Repetition penalty to sample with') - parser.add_argument('-v', '--verbose', action='store_true', default=False, help='Print verbose output and timing information. Defaults to false') - parser.add_argument('-g', '--timings', action='store_true', default=False, help='Print timing information for each generation step. Defaults to false') - parser.add_argument('-gtype', '--guidance_type', type=str, default="none", choices=["none", "json_schema", "regex", "lark_grammar"], help='Provide guidance type for the model, options are json_schema, regex, or lark_grammar.') - parser.add_argument('-ginfo', '--guidance_info', type=str, default='', help='Provide information of the guidance type used, it could be either tools or regex string. It is required if guidance_type is provided') - parser.add_argument('-s', '--system_prompt', type=str, default='You are a helpful AI assistant.', help='System prompt to use for the prompt.') - parser.add_argument('-r', '--rewind', action='store_true', default=False, help='Rewind to the system prompt after each generation. Defaults to false') + parser = argparse.ArgumentParser( + argument_default=argparse.SUPPRESS, description="End-to-end AI Question/Answer example for gen-ai" + ) + parser.add_argument( + "-m", + "--model_path", + type=str, + required=True, + help="Onnx model folder path (must contain genai_config.json and model.onnx)", + ) + parser.add_argument( + "-e", + "--execution_provider", + type=str, + required=False, + default="follow_config", + choices=["cpu", "cuda", "dml", "follow_config"], + help="Execution provider to run the ONNX Runtime session with. 
Defaults to follow_config that uses the execution provider listed in the genai_config.json instead.", + ) + parser.add_argument("-i", "--min_length", type=int, help="Min number of tokens to generate including the prompt") + parser.add_argument("-l", "--max_length", type=int, help="Max number of tokens to generate including the prompt") + parser.add_argument( + "-ds", + "--do_sample", + action="store_true", + help="Do random sampling. When false, greedy or beam search are used to generate the output. Defaults to false", + ) + parser.add_argument("-p", "--top_p", type=float, help="Top p probability to sample with") + parser.add_argument("-k", "--top_k", type=int, help="Top k tokens to sample from") + parser.add_argument("-t", "--temperature", type=float, help="Temperature to sample with") + parser.add_argument("-re", "--repetition_penalty", type=float, help="Repetition penalty to sample with") + parser.add_argument( + "-v", + "--verbose", + action="store_true", + default=False, + help="Print verbose output and timing information. Defaults to false", + ) + parser.add_argument( + "-g", + "--timings", + action="store_true", + default=False, + help="Print timing information for each generation step. Defaults to false", + ) + parser.add_argument( + "-gtype", + "--guidance_type", + type=str, + default="none", + choices=["none", "json_schema", "regex", "lark_grammar"], + help="Provide guidance type for the model, options are json_schema, regex, or lark_grammar.", + ) + parser.add_argument( + "-ginfo", + "--guidance_info", + type=str, + default="", + help="Provide information of the guidance type used, it could be either tools or regex string. 
It is required if guidance_type is provided", + ) + parser.add_argument( + "-s", + "--system_prompt", + type=str, + default="You are a helpful AI assistant.", + help="System prompt to use for the prompt.", + ) + parser.add_argument( + "-r", + "--rewind", + action="store_true", + default=False, + help="Rewind to the system prompt after each generation. Defaults to false", + ) args = parser.parse_args() main(args) diff --git a/examples/python/model-generate.py b/examples/python/model-generate.py index f64f53b9ce..20c52ad9ba 100644 --- a/examples/python/model-generate.py +++ b/examples/python/model-generate.py @@ -1,7 +1,9 @@ -import onnxruntime_genai as og import argparse -import time import json +import time + +import onnxruntime_genai as og + def main(args): if args.verbose: print("Loading model...") @@ -66,7 +68,7 @@ def main(args): params = og.GeneratorParams(model) - search_options = {name:getattr(args, name) for name in ['do_sample', 'max_length', 'min_length', 'top_p', 'top_k', 'temperature', 'repetition_penalty'] if name in args} + search_options = {name:getattr(args, name) for name in ['do_sample', 'max_length', 'min_length', 'top_p', 'top_k', 'temperature', 'repetition_penalty'] if name in args} if (args.verbose): print(f'Args: {args}') if (args.verbose): print(f'Search options: {search_options}') @@ -76,7 +78,7 @@ def main(args): generator = og.Generator(model, params) if args.verbose: print("Generator created") - + generator.append_tokens(input_tokens) if args.verbose: print("Input tokens added") diff --git a/examples/python/model-qa.py b/examples/python/model-qa.py index 31e6a97107..73d035fb39 100644 --- a/examples/python/model-qa.py +++ b/examples/python/model-qa.py @@ -1,11 +1,12 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
-import onnxruntime_genai as og import argparse -import time import json -import numpy as np +import time + +import onnxruntime_genai as og + def get_tools_list(input_tools): # input_tools format: '[{"name": "fn1", "description": "fn details", "parameters": {"p1": {"description": "details", "type": "string"}}}, @@ -42,7 +43,7 @@ def get_lark_grammar(input_tools): prompt_tool_input = create_prompt_tool_input(tools_list) if len(tools_list) == 1: # output = ("start: TEXT | fun_call\n" "TEXT: /[^{](.|\\n)*/\n" " fun_call: <|tool_call|> %json " + json.dumps(tools_list[0])) - output = ("start: TEXT | fun_call\n" "TEXT: /[^{](.|\\n)*/\n" " fun_call: <|tool_call|> %json " + json.dumps(convert_tool_to_grammar_input(tools_list[0]))) + output = ("start: TEXT | fun_call\nTEXT: /[^{](.|\\n)*/\n fun_call: <|tool_call|> %json " + json.dumps(convert_tool_to_grammar_input(tools_list[0]))) return prompt_tool_input, output else: return prompt_tool_input, "start: TEXT | fun_call \n TEXT: /[^{](.|\n)*/ \n fun_call: <|tool_call|> %json {\"anyOf\": [" + ','.join([json.dumps(tool) for tool in tools_list]) + "]}" @@ -90,7 +91,7 @@ def main(args): model = og.Model(config) if args.verbose: print("Model loaded") - + tokenizer = og.Tokenizer(model) tokenizer_stream = tokenizer.create_stream() if args.verbose: print("Tokenizer created") @@ -146,7 +147,7 @@ def main(args): generator = og.Generator(model, params) if args.verbose: print("Generator created") - + # Create messages with proper JSON encoding # Gemma2 models don't support system role, so we prepend system prompt to user message if model.type == "gemma2": @@ -162,10 +163,10 @@ def main(args): {"role": "system", "content": system_prompt}, {"role": "user", "content": text} ] - + # Convert to JSON string for tokenizer messages = json.dumps(messages_list) - + # Apply Chat Template if model.type == "marian-ssru": prompt = text diff --git a/examples/python/model-vision.py b/examples/python/model-vision.py index 9614d9db37..1fd73a9db2 100644 
--- a/examples/python/model-vision.py +++ b/examples/python/model-vision.py @@ -2,15 +2,17 @@ # Licensed under the MIT License import argparse -import os import glob -import time import json +import os +import time from pathlib import Path import onnxruntime_genai as og + # og.set_log_options(enabled=True, model_input_values=True, model_output_values=True) + def _find_dir_contains_sub_dir(current_dir: Path, target_dir_name): curr_path = Path(current_dir).absolute() target_dir = glob.glob(target_dir_name, root_dir=curr_path) @@ -20,7 +22,7 @@ def _find_dir_contains_sub_dir(current_dir: Path, target_dir_name): if curr_path.parent == curr_path: # Root dir return None - return _find_dir_contains_sub_dir(curr_path / '..', target_dir_name) + return _find_dir_contains_sub_dir(curr_path / "..", target_dir_name) def _complete(text, state): @@ -48,6 +50,7 @@ def run(args: argparse.Namespace): if interactive: try: import readline + readline.set_completer_delims(" \t\n;") readline.parse_and_bind("tab: complete") readline.set_completer(_complete) @@ -56,15 +59,20 @@ def run(args: argparse.Namespace): pass image_paths = [ image_path.strip() - for image_path in input( - "Image Path (comma separated; leave empty if no image): " - ).split(",") + for image_path in input("Image Path (comma separated; leave empty if no image): ").split(",") ] else: if args.image_paths: image_paths = args.image_paths else: - image_paths = [str(_find_dir_contains_sub_dir(Path(__file__).parent, "test") / "test_models" / "images" / "australia.jpg")] + image_paths = [ + str( + _find_dir_contains_sub_dir(Path(__file__).parent, "test") + / "test_models" + / "images" + / "australia.jpg" + ) + ] image_paths = [image_path for image_path in image_paths if image_path] @@ -86,12 +94,12 @@ def run(args: argparse.Namespace): text = args.prompt else: text = "What is shown in this image?" 
- + # Construct the "messages" argument passed to apply_chat_template messages = [] if model.type == "phi3v": # Combine all image tags and text into one user message - content = "".join([f"<|image_{i+1}|>\n" for i in range(len(image_paths))]) + text + content = "".join([f"<|image_{i + 1}|>\n" for i in range(len(image_paths))]) + text messages.append({"role": "user", "content": content}) else: # Gemma3-style multimodal: structured content @@ -138,20 +146,27 @@ def run(args: argparse.Namespace): if __name__ == "__main__": parser = argparse.ArgumentParser() + parser.add_argument("-m", "--model_path", type=str, required=True, help="Path to the folder containing the model") parser.add_argument( - "-m", "--model_path", type=str, required=True, help="Path to the folder containing the model" - ) - parser.add_argument( - "-e", "--execution_provider", type=str, required=False, default='follow_config', choices=["cpu", "cuda", "dml", "follow_config"], help="Execution provider to run the ONNX Runtime session with. Defaults to follow_config that uses the execution provider listed in the genai_config.json instead." + "-e", + "--execution_provider", + type=str, + required=False, + default="follow_config", + choices=["cpu", "cuda", "dml", "follow_config"], + help="Execution provider to run the ONNX Runtime session with. 
Defaults to follow_config that uses the execution provider listed in the genai_config.json instead.", ) parser.add_argument( - "--image_paths", nargs='*', type=str, required=False, help="Path to the images, mainly for CI usage" + "--image_paths", nargs="*", type=str, required=False, help="Path to the images, mainly for CI usage" ) parser.add_argument( - '-pr', '--prompt', required=False, help='Input prompts to generate tokens from, mainly for CI usage' + "-pr", "--prompt", required=False, help="Input prompts to generate tokens from, mainly for CI usage" ) parser.add_argument( - '--non-interactive', action=argparse.BooleanOptionalAction, required=False, help='Non-interactive mode, mainly for CI usage' + "--non-interactive", + action=argparse.BooleanOptionalAction, + required=False, + help="Non-interactive mode, mainly for CI usage", ) args = parser.parse_args() run(args) diff --git a/examples/python/phi3-qa.py b/examples/python/phi3-qa.py index 2fd009ad10..e0296d1905 100644 --- a/examples/python/phi3-qa.py +++ b/examples/python/phi3-qa.py @@ -1,10 +1,13 @@ -import onnxruntime_genai as og import argparse -import time import json +import time + +import onnxruntime_genai as og + def main(args): - if args.verbose: print("Loading model...") + if args.verbose: + print("Loading model...") if args.timings: started_timestamp = 0 first_token_timestamp = 0 @@ -13,22 +16,30 @@ def main(args): if args.execution_provider != "follow_config": config.clear_providers() if args.execution_provider != "cpu": - if args.verbose: print(f"Setting model to {args.execution_provider}") + if args.verbose: + print(f"Setting model to {args.execution_provider}") config.append_provider(args.execution_provider) model = og.Model(config) - if args.verbose: print("Model loaded") - + if args.verbose: + print("Model loaded") + tokenizer = og.Tokenizer(model) tokenizer_stream = tokenizer.create_stream() - if args.verbose: print("Tokenizer created") - if args.verbose: print() - search_options = 
{name:getattr(args, name) for name in ['do_sample', 'max_length', 'min_length', 'top_p', 'top_k', 'temperature', 'repetition_penalty'] if name in args} - + if args.verbose: + print("Tokenizer created") + if args.verbose: + print() + search_options = { + name: getattr(args, name) + for name in ["do_sample", "max_length", "min_length", "top_p", "top_k", "temperature", "repetition_penalty"] + if name in args + } + # Set the max length to something sensible by default, unless it is specified by the user, # since otherwise it will be set to the entire context length - if 'max_length' not in search_options: - search_options['max_length'] = 2048 + if "max_length" not in search_options: + search_options["max_length"] = 2048 # Keep asking for input prompts in a loop while True: @@ -37,10 +48,11 @@ def main(args): print("Error, input cannot be empty") continue - if args.timings: started_timestamp = time.time() + if args.timings: + started_timestamp = time.time() # If there is a chat template, use it - input_message = [{"role": "user", "content": text }] + input_message = [{"role": "user", "content": text}] input_prompt = tokenizer.apply_chat_template(json.dumps(input_message), add_generation_prompt=True) input_tokens = tokenizer.encode(input_prompt) @@ -50,15 +62,17 @@ def main(args): generator = og.Generator(model, params) generator.append_tokens(input_tokens) - if args.verbose: print("Generator created") + if args.verbose: + print("Generator created") - if args.verbose: print("Running generation loop ...") + if args.verbose: + print("Running generation loop ...") if args.timings: first = True new_tokens = [] print() - print("Output: ", end='', flush=True) + print("Output: ", end="", flush=True) try: while True: @@ -72,8 +86,9 @@ def main(args): break new_token = generator.get_next_tokens()[0] - print(tokenizer_stream.decode(new_token), end='', flush=True) - if args.timings: new_tokens.append(new_token) + print(tokenizer_stream.decode(new_token), end="", flush=True) + if 
args.timings: + new_tokens.append(new_token) except KeyboardInterrupt: print(" --control+c pressed, aborting generation--") print() @@ -82,21 +97,57 @@ def main(args): if args.timings: prompt_time = first_token_timestamp - started_timestamp run_time = time.time() - first_token_timestamp - print(f"Prompt length: {len(input_tokens)}, New tokens: {len(new_tokens)}, Time to first: {(prompt_time):.2f}s, Prompt tokens per second: {len(input_tokens)/prompt_time:.2f} tps, New tokens per second: {len(new_tokens)/run_time:.2f} tps") + print( + f"Prompt length: {len(input_tokens)}, New tokens: {len(new_tokens)}, Time to first: {(prompt_time):.2f}s, Prompt tokens per second: {len(input_tokens) / prompt_time:.2f} tps, New tokens per second: {len(new_tokens) / run_time:.2f} tps" + ) if __name__ == "__main__": - parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS, description="End-to-end AI Question/Answer example for gen-ai") - parser.add_argument('-m', '--model_path', type=str, required=True, help='Onnx model folder path (must contain genai_config.json and model.onnx)') - parser.add_argument('-e', '--execution_provider', type=str, required=False, default='follow_config', choices=["cpu", "cuda", "dml", "NvTensorRtRtx", "follow_config"], help="Execution provider to run the ONNX Runtime session with. Defaults to follow_config that uses the execution provider listed in the genai_config.json instead.") - parser.add_argument('-i', '--min_length', type=int, help='Min number of tokens to generate including the prompt') - parser.add_argument('-l', '--max_length', type=int, help='Max number of tokens to generate including the prompt') - parser.add_argument('-ds', '--do_sample', action='store_true', default=False, help='Do random sampling. When false, greedy or beam search are used to generate the output. 
Defaults to false') - parser.add_argument('-p', '--top_p', type=float, help='Top p probability to sample with') - parser.add_argument('-k', '--top_k', type=int, help='Top k tokens to sample from') - parser.add_argument('-t', '--temperature', type=float, help='Temperature to sample with') - parser.add_argument('-r', '--repetition_penalty', type=float, help='Repetition penalty to sample with') - parser.add_argument('-v', '--verbose', action='store_true', default=False, help='Print verbose output and timing information. Defaults to false') - parser.add_argument('-g', '--timings', action='store_true', default=False, help='Print timing information for each generation step. Defaults to false') + parser = argparse.ArgumentParser( + argument_default=argparse.SUPPRESS, description="End-to-end AI Question/Answer example for gen-ai" + ) + parser.add_argument( + "-m", + "--model_path", + type=str, + required=True, + help="Onnx model folder path (must contain genai_config.json and model.onnx)", + ) + parser.add_argument( + "-e", + "--execution_provider", + type=str, + required=False, + default="follow_config", + choices=["cpu", "cuda", "dml", "NvTensorRtRtx", "follow_config"], + help="Execution provider to run the ONNX Runtime session with. Defaults to follow_config that uses the execution provider listed in the genai_config.json instead.", + ) + parser.add_argument("-i", "--min_length", type=int, help="Min number of tokens to generate including the prompt") + parser.add_argument("-l", "--max_length", type=int, help="Max number of tokens to generate including the prompt") + parser.add_argument( + "-ds", + "--do_sample", + action="store_true", + default=False, + help="Do random sampling. When false, greedy or beam search are used to generate the output. 
Defaults to false", + ) + parser.add_argument("-p", "--top_p", type=float, help="Top p probability to sample with") + parser.add_argument("-k", "--top_k", type=int, help="Top k tokens to sample from") + parser.add_argument("-t", "--temperature", type=float, help="Temperature to sample with") + parser.add_argument("-r", "--repetition_penalty", type=float, help="Repetition penalty to sample with") + parser.add_argument( + "-v", + "--verbose", + action="store_true", + default=False, + help="Print verbose output and timing information. Defaults to false", + ) + parser.add_argument( + "-g", + "--timings", + action="store_true", + default=False, + help="Print timing information for each generation step. Defaults to false", + ) args = parser.parse_args() main(args) diff --git a/examples/python/phi4-mm.py b/examples/python/phi4-mm.py index defc1e6472..d4ceda0b75 100644 --- a/examples/python/phi4-mm.py +++ b/examples/python/phi4-mm.py @@ -2,8 +2,8 @@ # Licensed under the MIT License import argparse -import os import glob +import os import time from pathlib import Path @@ -117,7 +117,7 @@ def run(args: argparse.Namespace): else: text = "Does the audio summarize what is shown in the image? If not, what is different?" 
prompt += f"{text}<|end|>\n<|assistant|>\n" - + print("Processing inputs...") inputs = processor(prompt, images=images, audios=audios) print("Processor complete.") @@ -173,4 +173,4 @@ def run(args: argparse.Namespace): '--non-interactive', action=argparse.BooleanOptionalAction, required=False, help='Non-interactive mode, mainly for CI usage' ) args = parser.parse_args() - run(args) \ No newline at end of file + run(args) diff --git a/examples/python/whisper.py b/examples/python/whisper.py index 67e44ad34f..600cd97863 100644 --- a/examples/python/whisper.py +++ b/examples/python/whisper.py @@ -7,6 +7,7 @@ import readline import onnxruntime_genai as og + # og.set_log_options(enabled=True, model_input_values=True, model_output_values=True) def _complete(text, state): @@ -75,7 +76,7 @@ def run(args: argparse.Namespace): tokens = generator.get_sequence(i) transcription = processor.decode(tokens) - print(f"Transcription:") + print("Transcription:") print( f" {Format.underline}batch {i // args.num_beams}, beam {i % args.num_beams}{Format.end}: {transcription}" ) diff --git a/examples/slm_engine/build_scripts/build.py b/examples/slm_engine/build_scripts/build.py index 75f3f03c7f..844b1ce5ea 100755 --- a/examples/slm_engine/build_scripts/build.py +++ b/examples/slm_engine/build_scripts/build.py @@ -1,10 +1,11 @@ #!/usr/bin/env python3 -import os -import sys import argparse +import os +import pathlib import platform import subprocess -import pathlib +import sys + from build_deps import get_machine_type BLUE = "\033[34m" @@ -63,14 +64,14 @@ def main(): print(f"Using CMake generator: {cmake_generator}") - artifacts_dir = os.path.abspath(f"slm_deps/artifacts/") + artifacts_dir = os.path.abspath("slm_deps/artifacts/") cmake_options = [ "cmake", "-G", cmake_generator, TOPLEVEL_DIR, f"-DARTIFACTS_DIR={artifacts_dir}", - f"-DCMAKE_BUILD_TYPE={args.build_type}" + f"-DCMAKE_BUILD_TYPE={args.build_type}", ] # We keep the build directory prefix as same as that's returned by the @@ 
-88,12 +89,12 @@ def main(): build_dir = f"builds/{dir_prefix}-{get_machine_type(args)}" # Launch build - print(f"BUILD Dir:", build_dir) + print("BUILD Dir:", build_dir) os.makedirs(build_dir, exist_ok=True) print(f"{BLUE}CMAKE Options: {cmake_options}{CLEAR}") - print(f"Building ...") + print("Building ...") os.chdir(build_dir) result = subprocess.call(cmake_options) if result != 0: @@ -113,7 +114,7 @@ def main(): raise Exception(f"{RED}Build error!{CLEAR}") # Now run the installation - print(f"Installing...") + print("Installing...") result = subprocess.call(["cmake", "--install", "."]) if result != 0: raise Exception(f"{RED}Installation error!{CLEAR}") diff --git a/examples/slm_engine/build_scripts/build_deps.py b/examples/slm_engine/build_scripts/build_deps.py index 309d786c9a..1f7fd7ca91 100755 --- a/examples/slm_engine/build_scripts/build_deps.py +++ b/examples/slm_engine/build_scripts/build_deps.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 +import argparse import glob import os -import argparse import platform import shutil import subprocess @@ -290,14 +290,14 @@ def build_ort(args, build_dir, artifacts_dir): # lib directory as well if platform.system() == "Windows": copy_files_keeping_symlinks( - glob.glob(f"bin/*.dll"), - f"lib", + glob.glob("bin/*.dll"), + "lib", ) # Copy the include/onnxruntime/* to include directory copy_files_keeping_symlinks( - glob.glob(f"include/onnxruntime/*"), - f"include", + glob.glob("include/onnxruntime/*"), + "include", ) print(f"{MAGENTA}Copying ORT artifacts to 3P Artifacts: \n{artifacts_dir}{CLEAR}") @@ -324,7 +324,7 @@ def build_ort_genai(args, artifacts_dir, ort_home): # Go to the toplevel directory. 
To determine the top level directory, we need to # find the directory of this python file and then go from there - top_level_dir = f"../../../" + top_level_dir = "../../../" os.chdir(top_level_dir) if subprocess.call(["git", "submodule", "update", "--init", "--recursive"]) != 0: @@ -378,7 +378,7 @@ def build_ort_genai(args, artifacts_dir, ort_home): # Remove --use_guidance from cmd_args if "--use_guidance" in cmd_args: cmd_args.remove("--use_guidance") - + print(f"{MAGENTA}Running build.py with fallback args: {cmd_args}{CLEAR}") result = subprocess.call([python_executable, "build.py"] + cmd_args) if result != 0: @@ -598,7 +598,7 @@ def main(): os.makedirs(artifacts_dir, exist_ok=True) - common_artifacts_dir = os.path.abspath(f"slm_deps/artifacts/common") + common_artifacts_dir = os.path.abspath("slm_deps/artifacts/common") os.makedirs(common_artifacts_dir, exist_ok=True) time_build_start = time.time() diff --git a/examples/slm_engine/test/chat_ui.py b/examples/slm_engine/test/chat_ui.py index 4e36d69161..dc505d0001 100644 --- a/examples/slm_engine/test/chat_ui.py +++ b/examples/slm_engine/test/chat_ui.py @@ -1,7 +1,8 @@ +import json + import gradio as gr -import requests import pandas as pd -import json +import requests SLM_ENDPOINT = "http://localhost:8080/completions" SYSTEM_PROMPT = ( diff --git a/examples/slm_engine/test/test_slm_server.py b/examples/slm_engine/test/test_slm_server.py index 53de3fe526..900cdfd553 100755 --- a/examples/slm_engine/test/test_slm_server.py +++ b/examples/slm_engine/test/test_slm_server.py @@ -2,6 +2,7 @@ import argparse import json + import requests BLUE = "\033[34m" @@ -37,7 +38,7 @@ def launch_server(server_binary: str, model_path: str): f"{MAGENTA}Engine State: {json_response['response']['engine_state']}{CLEAR}" ) started = True - except Exception as ex: + except Exception: # Initially the server may not be ready to accept requests # We want to ignore and retry pass # Ignore all exceptions diff --git 
a/examples/slm_engine/test/test_tool_calling.py b/examples/slm_engine/test/test_tool_calling.py index f4a38d4584..717580abaa 100755 --- a/examples/slm_engine/test/test_tool_calling.py +++ b/examples/slm_engine/test/test_tool_calling.py @@ -4,20 +4,22 @@ This script tests the booking_flight_tickets and booking_hotels tools """ -import requests import json +import requests + + def test_tool_calling(): """Test tool calling functionality with flight and hotel booking""" - + url = "http://localhost:8080/completions" headers = {"Content-Type": "application/json"} - + # Test case 1: Flight and Hotel booking print("=" * 70) print("Test 1: Flight and Hotel booking from Beijing to Paris") print("=" * 70) - + payload1 = { "messages": [ { @@ -76,7 +78,7 @@ def test_tool_calling(): "top_p": 1.0, "do_sample": False } - + try: response1 = requests.post(url, headers=headers, json=payload1, timeout=30) print(f"Status Code: {response1.status_code}") @@ -88,11 +90,11 @@ def test_tool_calling(): print(f"Error: {response1.text}") except requests.exceptions.RequestException as e: print(f"Request failed: {e}") - + print("\n" + "=" * 70) print("Test 2: Flight booking only (JFK to LHR)") print("=" * 70) - + # Test case 2: Flight only payload2 = { "messages": [ @@ -134,7 +136,7 @@ def test_tool_calling(): "top_p": 0.9, "do_sample": True } - + try: response2 = requests.post(url, headers=headers, json=payload2, timeout=30) print(f"Status Code: {response2.status_code}") @@ -146,11 +148,11 @@ def test_tool_calling(): print(f"Error: {response2.text}") except requests.exceptions.RequestException as e: print(f"Request failed: {e}") - + print("\n" + "=" * 70) print("Test 3: Hotel booking only (Tokyo)") print("=" * 70) - + # Test case 3: Hotel only payload3 = { "messages": [ @@ -188,7 +190,7 @@ def test_tool_calling(): "top_p": 0.95, "do_sample": True } - + try: response3 = requests.post(url, headers=headers, json=payload3, timeout=30) print(f"Status Code: {response3.status_code}") @@ -200,7 +202,7 
@@ def test_tool_calling(): print(f"Error: {response3.text}") except requests.exceptions.RequestException as e: print(f"Request failed: {e}") - + print("\n" + "=" * 70) print("All tool calling tests completed!") print("=" * 70) diff --git a/src/cuda/search_cuda.cu b/src/cuda/search_cuda.cu index 8474ae53ed..e72689291b 100644 --- a/src/cuda/search_cuda.cu +++ b/src/cuda/search_cuda.cu @@ -108,7 +108,7 @@ __global__ void CheckForEOSAndPad(int32_t* next_tokens, int next_tokens_count, b } } -void Launch_CheckForEOSAndPad(int32_t* next_tokens, int next_tokens_count, bool* eos_seen, const int *eos_token_ids, int eos_token_count, int pad_token_id, bool* done_cpu, cudaStream_t stream) { +void Launch_CheckForEOSAndPad(int32_t* next_tokens, int next_tokens_count, bool* eos_seen, const int* eos_token_ids, int eos_token_count, int pad_token_id, bool* done_cpu, cudaStream_t stream) { CheckForEOSAndPad<<<1, 1, 0, stream>>>(next_tokens, next_tokens_count, eos_seen, eos_token_ids, eos_token_count, pad_token_id, done_cpu); } diff --git a/src/objectivec/test/ort_genai_api_test.mm b/src/objectivec/test/ort_genai_api_test.mm index 9168377735..f1e05d9a9d 100644 --- a/src/objectivec/test/ort_genai_api_test.mm +++ b/src/objectivec/test/ort_genai_api_test.mm @@ -17,120 +17,120 @@ @interface ORTGenAIAPITest : XCTestCase @implementation ORTGenAIAPITest - (void)setUp { - [super setUp]; - self.continueAfterFailure = NO; + [super setUp]; + self.continueAfterFailure = NO; } + (void)tearDown { - [OGAGenerator shutdown]; + [OGAGenerator shutdown]; } + (NSString*)getModelPath { - NSBundle* bundle = [NSBundle bundleForClass:[ORTGenAIAPITest class]]; - NSString* path = [[bundle resourcePath] stringByAppendingString:@"/tiny-random-gpt2-fp32"]; - return path; + NSBundle* bundle = [NSBundle bundleForClass:[ORTGenAIAPITest class]]; + NSString* path = [[bundle resourcePath] stringByAppendingString:@"/tiny-random-gpt2-fp32"]; + return path; } - (void)testTensor_And_AddExtraInput { - // Create a [3 4] 
shaped tensor - std::array data{0, 1, 2, 3, + // Create a [3 4] shaped tensor + std::array data{0, 1, 2, 3, 10, 11, 12, 13, 20, 21, 22, 23}; - NSArray* shape = @[@3, @4]; + NSArray* shape = @[ @3, @4 ]; - NSError *error = nil; - BOOL ret = NO; - OGAModel* model = [[OGAModel alloc] initWithPath:[ORTGenAIAPITest getModelPath] error:&error]; - ORTAssertNullableResultSuccessful(model, error); + NSError* error = nil; + BOOL ret = NO; + OGAModel* model = [[OGAModel alloc] initWithPath:[ORTGenAIAPITest getModelPath] error:&error]; + ORTAssertNullableResultSuccessful(model, error); - OGAGeneratorParams *params = [[OGAGeneratorParams alloc] initWithModel:model error:&error]; - ORTAssertNullableResultSuccessful(params, error); + OGAGeneratorParams* params = [[OGAGeneratorParams alloc] initWithModel:model error:&error]; + ORTAssertNullableResultSuccessful(params, error); - OGATensor* tensor = [[OGATensor alloc] initWithDataPointer:data.data() shape:shape type:OGAElementTypeFloat32 error:&error]; - ORTAssertNullableResultSuccessful(tensor, error); + OGATensor* tensor = [[OGATensor alloc] initWithDataPointer:data.data() shape:shape type:OGAElementTypeFloat32 error:&error]; + ORTAssertNullableResultSuccessful(tensor, error); - OGAGenerator* generator = [[OGAGenerator alloc] initWithModel:model - params:params - error:&error]; + OGAGenerator* generator = [[OGAGenerator alloc] initWithModel:model + params:params + error:&error]; - ret = [generator setModelInput:@"test_input" tensor:tensor error:&error]; - ORTAssertBoolResultSuccessful(ret, error); + ret = [generator setModelInput:@"test_input" tensor:tensor error:&error]; + ORTAssertBoolResultSuccessful(ret, error); } - (void)testGetOutput { - std::vector input_ids_shape{2, 4}; - NSArray* input_ids = @[@0, @0, @0, @52, @0, @0, @195, @731]; - const auto batch_size = input_ids_shape[0]; - int max_length = 10; - - NSError *error = nil; - BOOL ret = NO; - OGAModel* model = [[OGAModel alloc] initWithPath:[ORTGenAIAPITest getModelPath] 
error:&error]; - ORTAssertNullableResultSuccessful(model, error); - - OGAGeneratorParams *params = [[OGAGeneratorParams alloc] initWithModel:model error:&error]; - ORTAssertNullableResultSuccessful(params, error); - - [params setSearchOption:@"max_length" doubleValue:max_length error:&error]; - XCTAssertNil(error); - - [params setSearchOption:@"batch_size" doubleValue:batch_size error:&error]; - XCTAssertNil(error); - - OGAGenerator* generator = [[OGAGenerator alloc] initWithModel:model - params:params - error:&error]; - ORTAssertNullableResultSuccessful(generator, error); - [generator appendTokens:input_ids error:&error]; - XCTAssertNil(error); - - // check prompt - // full logits has shape [2, 4, 1000]. Sample 1 for every 200 tokens and the expected sampled logits has shape [2, 4, 5] - std::vector expected_sampled_logits_prompt{0.29694548f, 0.00955007f, 0.0430819f, 0.10063869f, 0.0437237f, - 0.27329233f, 0.00841076f, -0.1060291f, 0.11328877f, 0.13369876f, - 0.30323744f, 0.0545997f, 0.03894716f, 0.11702324f, 0.0410665f, - -0.12675379f, -0.04443946f, 0.14492269f, 0.03021223f, -0.03212897f, - 0.29694548f, 0.00955007f, 0.0430819f, 0.10063869f, 0.0437237f, - 0.27329233f, 0.00841076f, -0.1060291f, 0.11328877f, 0.13369876f, - -0.04699047f, 0.17915794f, 0.20838135f, 0.10888482f, -0.00277808f, - 0.2938929f, -0.10538938f, -0.00226692f, 0.12050669f, -0.10622668f}; - - OGATensor* prompt_logits_ptr = [generator getOutput:@"logits" error:&error]; - ORTAssertNullableResultSuccessful(prompt_logits_ptr, error); - auto prompt_logits = static_cast([prompt_logits_ptr getDataPointerWithError:&error]); - XCTAssertNil(error); - XCTAssertNotEqual(prompt_logits, nullptr); - const int num_prompt_outputs_to_check = 40; - const int sample_size = 200; - const float tolerance = 0.001f; - // Verify outputs match expected outputs - for (int i = 0; i < num_prompt_outputs_to_check; i++) { - XCTAssertEqualWithAccuracy(expected_sampled_logits_prompt[i], prompt_logits[i * sample_size], tolerance); - 
} - - ret = [generator generateNextTokenWithError:&error]; - ORTAssertBoolResultSuccessful(ret, error); - ret = [generator generateNextTokenWithError:&error]; - ORTAssertBoolResultSuccessful(ret, error); - - // check for the 1st token generation - // full logits has shape [2, 1, 1000]. Sample 1 for every 200 tokens and the expected sampled logits has shape [2, 1, 5] - std::vector expected_sampled_logits_token_gen{0.03742531f, -0.05752287f, 0.14159015f, 0.04210977f, -0.1484456f, - 0.3041716f, -0.08701379f, -0.03778192f, 0.07471392f, -0.02049096f}; - - OGATensor* token_gen_logits_ptr = [generator getOutput:@"logits" error:&error]; - ORTAssertNullableResultSuccessful(token_gen_logits_ptr, error); - - auto token_gen_logits = static_cast([token_gen_logits_ptr getDataPointerWithError:&error]); - XCTAssertNil(error); - XCTAssertNotEqual(token_gen_logits, nullptr); - int num_token_gen_outputs_to_check = 10; - - for (int i = 0; i < num_token_gen_outputs_to_check; i++) { - XCTAssertEqualWithAccuracy(expected_sampled_logits_token_gen[i], token_gen_logits[i * sample_size], tolerance); - } - [generator generateNextTokenWithError:&error]; - ORTAssertBoolResultSuccessful(ret, error); + std::vector input_ids_shape{2, 4}; + NSArray* input_ids = @[ @0, @0, @0, @52, @0, @0, @195, @731 ]; + const auto batch_size = input_ids_shape[0]; + int max_length = 10; + + NSError* error = nil; + BOOL ret = NO; + OGAModel* model = [[OGAModel alloc] initWithPath:[ORTGenAIAPITest getModelPath] error:&error]; + ORTAssertNullableResultSuccessful(model, error); + + OGAGeneratorParams* params = [[OGAGeneratorParams alloc] initWithModel:model error:&error]; + ORTAssertNullableResultSuccessful(params, error); + + [params setSearchOption:@"max_length" doubleValue:max_length error:&error]; + XCTAssertNil(error); + + [params setSearchOption:@"batch_size" doubleValue:batch_size error:&error]; + XCTAssertNil(error); + + OGAGenerator* generator = [[OGAGenerator alloc] initWithModel:model + params:params + 
error:&error]; + ORTAssertNullableResultSuccessful(generator, error); + [generator appendTokens:input_ids error:&error]; + XCTAssertNil(error); + + // check prompt + // full logits has shape [2, 4, 1000]. Sample 1 for every 200 tokens and the expected sampled logits has shape [2, 4, 5] + std::vector expected_sampled_logits_prompt{0.29694548f, 0.00955007f, 0.0430819f, 0.10063869f, 0.0437237f, + 0.27329233f, 0.00841076f, -0.1060291f, 0.11328877f, 0.13369876f, + 0.30323744f, 0.0545997f, 0.03894716f, 0.11702324f, 0.0410665f, + -0.12675379f, -0.04443946f, 0.14492269f, 0.03021223f, -0.03212897f, + 0.29694548f, 0.00955007f, 0.0430819f, 0.10063869f, 0.0437237f, + 0.27329233f, 0.00841076f, -0.1060291f, 0.11328877f, 0.13369876f, + -0.04699047f, 0.17915794f, 0.20838135f, 0.10888482f, -0.00277808f, + 0.2938929f, -0.10538938f, -0.00226692f, 0.12050669f, -0.10622668f}; + + OGATensor* prompt_logits_ptr = [generator getOutput:@"logits" error:&error]; + ORTAssertNullableResultSuccessful(prompt_logits_ptr, error); + auto prompt_logits = static_cast([prompt_logits_ptr getDataPointerWithError:&error]); + XCTAssertNil(error); + XCTAssertNotEqual(prompt_logits, nullptr); + const int num_prompt_outputs_to_check = 40; + const int sample_size = 200; + const float tolerance = 0.001f; + // Verify outputs match expected outputs + for (int i = 0; i < num_prompt_outputs_to_check; i++) { + XCTAssertEqualWithAccuracy(expected_sampled_logits_prompt[i], prompt_logits[i * sample_size], tolerance); + } + + ret = [generator generateNextTokenWithError:&error]; + ORTAssertBoolResultSuccessful(ret, error); + ret = [generator generateNextTokenWithError:&error]; + ORTAssertBoolResultSuccessful(ret, error); + + // check for the 1st token generation + // full logits has shape [2, 1, 1000]. 
Sample 1 for every 200 tokens and the expected sampled logits has shape [2, 1, 5] + std::vector expected_sampled_logits_token_gen{0.03742531f, -0.05752287f, 0.14159015f, 0.04210977f, -0.1484456f, + 0.3041716f, -0.08701379f, -0.03778192f, 0.07471392f, -0.02049096f}; + + OGATensor* token_gen_logits_ptr = [generator getOutput:@"logits" error:&error]; + ORTAssertNullableResultSuccessful(token_gen_logits_ptr, error); + + auto token_gen_logits = static_cast([token_gen_logits_ptr getDataPointerWithError:&error]); + XCTAssertNil(error); + XCTAssertNotEqual(token_gen_logits, nullptr); + int num_token_gen_outputs_to_check = 10; + + for (int i = 0; i < num_token_gen_outputs_to_check; i++) { + XCTAssertEqualWithAccuracy(expected_sampled_logits_token_gen[i], token_gen_logits[i * sample_size], tolerance); + } + [generator generateNextTokenWithError:&error]; + ORTAssertBoolResultSuccessful(ret, error); } @end diff --git a/src/python/py/_dll_directory.py b/src/python/py/_dll_directory.py index 98fad0a0cf..7b6aee39ea 100644 --- a/src/python/py/_dll_directory.py +++ b/src/python/py/_dll_directory.py @@ -4,6 +4,7 @@ import os import sys + def _is_windows(): return sys.platform.startswith("win") @@ -18,13 +19,14 @@ def _is_macos(): def add_onnxruntime_dependency(package_id: str): """Add the onnxruntime shared library dependency. - + On Windows, this function adds the onnxruntime DLL directory to the DLL search path. On Linux, this function loads the onnxruntime shared library and its dependencies so that they can be found by the dynamic linker. """ if _is_windows(): import importlib.util + ort_package = importlib.util.find_spec("onnxruntime") if not ort_package: raise ImportError("Could not find the onnxruntime package.") @@ -38,6 +40,7 @@ def add_onnxruntime_dependency(package_id: str): # Check to see if DirectML.dll exists before trying to load it. 
if os.path.exists(dml_path): import ctypes + _ = ctypes.CDLL(dml_path) # Workaround for onnxruntime.dll loading @@ -46,17 +49,18 @@ def add_onnxruntime_dependency(package_id: str): # Check to see if onnxruntime.dll exists before trying to load it. if os.path.exists(ort_path): import ctypes + _ = ctypes.CDLL(ort_path) elif _is_linux() or _is_macos(): - import importlib.util import ctypes import glob + import importlib.util ort_package = importlib.util.find_spec("onnxruntime") if not ort_package: raise ImportError("Could not find the onnxruntime package.") - + # Load the onnxruntime shared library here since we can find the path in python with ease. # This avoids needing to know the exact path of the shared library from native code. ort_package_path = ort_package.submodule_search_locations[0] @@ -75,7 +79,7 @@ def add_onnxruntime_dependency(package_id: str): def add_cuda_dependency(): """Add the CUDA DLL directory to the DLL search path. - + This function is a no-op on non-Windows platforms. 
""" if _is_windows(): diff --git a/src/python/py/models/__init__.py b/src/python/py/models/__init__.py index 363c2e0be7..33aeb8ce10 100644 --- a/src/python/py/models/__init__.py +++ b/src/python/py/models/__init__.py @@ -5,4 +5,4 @@ import os.path import sys -sys.path.append(os.path.dirname(__file__)) \ No newline at end of file +sys.path.append(os.path.dirname(__file__)) diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py index 2d802df43d..049367fb8e 100644 --- a/src/python/py/models/builder.py +++ b/src/python/py/models/builder.py @@ -14,10 +14,6 @@ import onnx_ir as ir import torch -from transformers import ( - AutoConfig, -) - from builders import ( ChatGLMModel, ErnieModel, @@ -43,6 +39,9 @@ QwenModel, SmolLM3Model, ) +from transformers import ( + AutoConfig, +) def check_extra_options(kv_pairs, execution_provider): diff --git a/src/python/py/models/builders/__init__.py b/src/python/py/models/builders/__init__.py index 5e0a0ccfbb..f6e8fd09d0 100644 --- a/src/python/py/models/builders/__init__.py +++ b/src/python/py/models/builders/__init__.py @@ -4,27 +4,50 @@ # license information. 
# -------------------------------------------------------------------------- from .base import Model +from .chatglm import ChatGLMModel +from .ernie import ErnieModel +from .gemma import Gemma2Model, Gemma3Model, GemmaModel +from .gptoss import GPTOSSModel +from .granite import GraniteModel from .llama import LlamaModel from .mistral import MistralModel -from .qwen import QwenModel, Qwen3Model -from .phi import ( - PhiModel, Phi3MiniModel, Phi3MiniLongRoPEModel, Phi3SmallModel, - Phi3SmallLongRoPEModel, Phi3VModel, Phi3MoELongRoPEModel, Phi4MMModel -) -from .gemma import GemmaModel, Gemma2Model, Gemma3Model from .nemotron import NemotronModel -from .chatglm import ChatGLMModel from .olmo import OLMoModel -from .granite import GraniteModel -from .ernie import ErnieModel +from .phi import ( + Phi3MiniLongRoPEModel, + Phi3MiniModel, + Phi3MoELongRoPEModel, + Phi3SmallLongRoPEModel, + Phi3SmallModel, + Phi3VModel, + Phi4MMModel, + PhiModel, +) +from .qwen import Qwen3Model, QwenModel from .smollm import SmolLM3Model -from .gptoss import GPTOSSModel __all__ = [ + "ChatGLMModel", + "ErnieModel", + "GPTOSSModel", + "Gemma2Model", + "Gemma3Model", + "GemmaModel", + "GraniteModel", + "LlamaModel", + "MistralModel", "Model", - "LlamaModel", "MistralModel", "QwenModel", "Qwen3Model", "PhiModel", - "Phi3MiniModel", "Phi3MiniLongRoPEModel", "Phi3SmallModel", - "Phi3SmallLongRoPEModel", "Phi3VModel", "Phi3MoELongRoPEModel", "Phi4MMModel", - "GemmaModel", "Gemma2Model", "Gemma3Model", "NemotronModel", "ChatGLMModel", - "OLMoModel", "GraniteModel", "ErnieModel", "SmolLM3Model", "GPTOSSModel" + "NemotronModel", + "OLMoModel", + "Phi3MiniLongRoPEModel", + "Phi3MiniModel", + "Phi3MoELongRoPEModel", + "Phi3SmallLongRoPEModel", + "Phi3SmallModel", + "Phi3VModel", + "Phi4MMModel", + "PhiModel", + "Qwen3Model", + "QwenModel", + "SmolLM3Model" ] diff --git a/src/python/py/models/builders/base.py b/src/python/py/models/builders/base.py index d83326a53c..be2daa2d8e 100644 --- 
a/src/python/py/models/builders/base.py +++ b/src/python/py/models/builders/base.py @@ -10,12 +10,12 @@ import ast import json import os -from typing import Sequence +from collections.abc import Sequence import numpy as np import onnx_ir as ir import torch -from onnx_ir.tensor_adapters import to_torch_dtype, TorchTensor +from onnx_ir.tensor_adapters import TorchTensor, to_torch_dtype from onnxruntime.quantization.matmul_nbits_quantizer import ( MatMulNBitsQuantizer, QuantFormat, @@ -29,6 +29,7 @@ GenerationConfig, ) + def parse_hf_token(hf_token): """ Returns the authentication token needed for Hugging Face. @@ -45,19 +46,50 @@ def parse_hf_token(hf_token): # Return user-provided token as string return hf_token + class Model: def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): self.context_length = config.seq_length if hasattr(config, "seq_length") else config.max_position_embeddings - self.original_context_length = config.original_max_position_embeddings if hasattr(config, "original_max_position_embeddings") else config.rope_scaling["original_max_position_embeddings"] if hasattr(config, "rope_scaling") and hasattr(config.rope_scaling, "original_max_position_embeddings") else self.context_length - self.window_size = config.sliding_window if hasattr(config, "sliding_window") else -1 # default is -1 in GroupQueryAttention kernel - self.intermediate_size = config.ffn_hidden_size if hasattr(config, "ffn_hidden_size") else config.intermediate_size + self.original_context_length = ( + config.original_max_position_embeddings + if hasattr(config, "original_max_position_embeddings") + else config.rope_scaling["original_max_position_embeddings"] + if hasattr(config, "rope_scaling") and hasattr(config.rope_scaling, "original_max_position_embeddings") + else self.context_length + ) + self.window_size = ( + config.sliding_window if hasattr(config, "sliding_window") else -1 + ) # default is -1 in GroupQueryAttention kernel + self.intermediate_size = ( 
+ config.ffn_hidden_size if hasattr(config, "ffn_hidden_size") else config.intermediate_size + ) self.hidden_size = config.hidden_size - self.num_kv_heads = config.num_key_value_heads if hasattr(config, "num_key_value_heads") else config.multi_query_group_num if hasattr(config, "multi_query_group_num") else config.num_attention_heads + self.num_kv_heads = ( + config.num_key_value_heads + if hasattr(config, "num_key_value_heads") + else config.multi_query_group_num + if hasattr(config, "multi_query_group_num") + else config.num_attention_heads + ) self.num_attn_heads = config.num_attention_heads - self.head_size = config.head_dim if hasattr(config, "head_dim") and config.head_dim is not None else config.hidden_size // config.num_attention_heads - self.num_layers = int(extra_options["num_hidden_layers"]) if "num_hidden_layers" in extra_options else config.num_hidden_layers if hasattr(config, "num_hidden_layers") else config.num_layers + self.head_size = ( + config.head_dim + if hasattr(config, "head_dim") and config.head_dim is not None + else config.hidden_size // config.num_attention_heads + ) + self.num_layers = ( + int(extra_options["num_hidden_layers"]) + if "num_hidden_layers" in extra_options + else config.num_hidden_layers + if hasattr(config, "num_hidden_layers") + else config.num_layers + ) self.vocab_size = config.vocab_size - self.activation = config.hidden_activation if hasattr(config, "hidden_activation") and config.hidden_activation is not None else config.hidden_act + self.activation = ( + config.hidden_activation + if hasattr(config, "hidden_activation") and config.hidden_activation is not None + else config.hidden_act + ) self.model_name_or_path = config._name_or_path self.model_type = config.architectures[0] @@ -88,32 +120,48 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): self.ep_attrs = { "cpu": {}, "cuda": { - "enable_cuda_graph": "1" if extra_options.get("enable_cuda_graph", False) else "0", # "1" if the model 
is able to enable cuda graph, "0" otherwise - "enable_skip_layer_norm_strict_mode": "1" + "enable_cuda_graph": "1" + if extra_options.get("enable_cuda_graph", False) + else "0", # "1" if the model is able to enable cuda graph, "0" otherwise + "enable_skip_layer_norm_strict_mode": "1", }, "dml": {}, # TODO: Enable graph capture for webgpu once supported both in onnxruntime-genai and onnxruntime. "webgpu": {}, - "trt-rtx": {"enable_cuda_graph": "1"} + "trt-rtx": {"enable_cuda_graph": "1"}, } # Map input names to their types and shapes self.input_names = ["input_ids", "attention_mask", "position_ids"] self.input_types = { - "input_ids": ir.DataType.INT64, # For standard models - "attention_mask": ir.DataType.INT64, # For standard models - "position_ids": ir.DataType.INT64, # For standard models - "inputs_embeds": self.io_dtype, # For standard models where you want to remove the embedding layer from the model (note that `inputs_embeds` is written this way to match Hugging Face format) - "past_key_values.key": self.io_dtype, # For standard models (note that `past_key_values.key` is written this way to match Hugging Face format) - "past_key_values.value": self.io_dtype, # For standard models (note that `past_key_values.value` is written this way to match Hugging Face format) + "input_ids": ir.DataType.INT64, # For standard models + "attention_mask": ir.DataType.INT64, # For standard models + "position_ids": ir.DataType.INT64, # For standard models + "inputs_embeds": self.io_dtype, # For standard models where you want to remove the embedding layer from the model (note that `inputs_embeds` is written this way to match Hugging Face format) + "past_key_values.key": self.io_dtype, # For standard models (note that `past_key_values.key` is written this way to match Hugging Face format) + "past_key_values.value": self.io_dtype, # For standard models (note that `past_key_values.value` is written this way to match Hugging Face format) } self.input_shapes = { - "input_ids": 
["batch_size", "sequence_length"], # For standard models - "attention_mask": ["batch_size", "total_sequence_length"], # For standard models - "position_ids": ["batch_size", "sequence_length"], # For standard models - "inputs_embeds": ["batch_size", "sequence_length", self.hidden_size], # For standard models where you want to remove the embedding layer from the model (note that `inputs_embeds` is written this way to match Hugging Face format) - "past_key_values.key": ["batch_size", self.num_kv_heads, "past_sequence_length", self.head_size], # For standard models (note that `past_key_values.key` is written this way to match Hugging Face format) - "past_key_values.value": ["batch_size", self.num_kv_heads, "past_sequence_length", self.head_size], # For standard models (note that `past_key_values.value` is written this way to match Hugging Face format) + "input_ids": ["batch_size", "sequence_length"], # For standard models + "attention_mask": ["batch_size", "total_sequence_length"], # For standard models + "position_ids": ["batch_size", "sequence_length"], # For standard models + "inputs_embeds": [ + "batch_size", + "sequence_length", + self.hidden_size, + ], # For standard models where you want to remove the embedding layer from the model (note that `inputs_embeds` is written this way to match Hugging Face format) + "past_key_values.key": [ + "batch_size", + self.num_kv_heads, + "past_sequence_length", + self.head_size, + ], # For standard models (note that `past_key_values.key` is written this way to match Hugging Face format) + "past_key_values.value": [ + "batch_size", + self.num_kv_heads, + "past_sequence_length", + self.head_size, + ], # For standard models (note that `past_key_values.value` is written this way to match Hugging Face format) } self.exclude_embeds = extra_options.get("exclude_embeds", False) if self.exclude_embeds: @@ -122,16 +170,30 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): # Map output names to their types 
and shapes self.output_names = ["logits"] self.output_types = { - "hidden_states": self.io_dtype, # For standard models where you want to remove the language modeling head from the model (note that `hidden_states` is written this way to match Hugging Face format) - "logits": self.io_dtype, # For standard models - "present.key": self.io_dtype, # For standard models (note that `present.key` is written this way to match Hugging Face format) - "present.value": self.io_dtype, # For standard models (note that `present.value` is written this way to match Hugging Face format) + "hidden_states": self.io_dtype, # For standard models where you want to remove the language modeling head from the model (note that `hidden_states` is written this way to match Hugging Face format) + "logits": self.io_dtype, # For standard models + "present.key": self.io_dtype, # For standard models (note that `present.key` is written this way to match Hugging Face format) + "present.value": self.io_dtype, # For standard models (note that `present.value` is written this way to match Hugging Face format) } self.output_shapes = { - "hidden_states": ["batch_size", "sequence_length", self.hidden_size], # For standard models where you want to remove the language modeling head from the model (note that `hidden_states` is written this way to match Hugging Face format) - "logits": ["batch_size", "sequence_length", self.vocab_size], # For standard models - "present.key": ["batch_size", self.num_kv_heads, "total_sequence_length", self.head_size], # For standard models (note that `present.key` is written this way to match Hugging Face format) - "present.value": ["batch_size", self.num_kv_heads, "total_sequence_length", self.head_size], # For standard models (note that `present.value` is written this way to match Hugging Face format) + "hidden_states": [ + "batch_size", + "sequence_length", + self.hidden_size, + ], # For standard models where you want to remove the language modeling head from the model (note 
that `hidden_states` is written this way to match Hugging Face format) + "logits": ["batch_size", "sequence_length", self.vocab_size], # For standard models + "present.key": [ + "batch_size", + self.num_kv_heads, + "total_sequence_length", + self.head_size, + ], # For standard models (note that `present.key` is written this way to match Hugging Face format) + "present.value": [ + "batch_size", + self.num_kv_heads, + "total_sequence_length", + self.head_size, + ], # For standard models (note that `present.value` is written this way to match Hugging Face format) } self.make_outputs_init() @@ -141,104 +203,118 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): # Mask-specific variables # TODO: Reconcile differences between `seqlens_k` and `key_total_seq_lens` in the GroupQueryAttention and SparseAttention implementations. Ideally the same subgraph can be shared for both. self.mask_attrs = { - "mask_name": "", # Name of node that outputs 4D causal attention mask (used as add_qk in MultiHeadAttention) - "seqlens_k": "", # Sum of each row in attention mask - 1 (used as input to GroupQueryAttention) - "total_seq_len": "", # Size of total sequence length in attention mask (used as input to GroupQueryAttention and SparseAttention) - "block_row_indices": "", # Row indices of CSR format of block mask (used as input to SparseAttention) - "block_col_indices": "", # Col indices of CSR format of block mask (used as input to SparseAttention) - "key_total_seq_lens": "", # Sum of each row in attention mask (used as input to SparseAttention) + "mask_name": "", # Name of node that outputs 4D causal attention mask (used as add_qk in MultiHeadAttention) + "seqlens_k": "", # Sum of each row in attention mask - 1 (used as input to GroupQueryAttention) + "total_seq_len": "", # Size of total sequence length in attention mask (used as input to GroupQueryAttention and SparseAttention) + "block_row_indices": "", # Row indices of CSR format of block mask (used as 
input to SparseAttention) + "block_col_indices": "", # Col indices of CSR format of block mask (used as input to SparseAttention) + "key_total_seq_lens": "", # Sum of each row in attention mask (used as input to SparseAttention) } # Embedding-specific variables self.embed_attrs = { - "scale": 1, # Scale value to multiply output of Embedding layer by + "scale": 1, # Scale value to multiply output of Embedding layer by } # LayerNorm-specific variables epsilon = config.rms_norm_eps if hasattr(config, "rms_norm_eps") else 1e-06 self.layernorm_attrs = { - "simple": True, # Use SimplifiedLayerNorm/SkipSimplifiedLayerNorm vs. LayerNorm/SkipLayerNorm - "first_layernorm": True, # 1st LayerNorm = LayerNorm, then SkipLayerNorm for all subsequent LayerNorms - "last_layernorm": False, # Last LayerNorm = SkipLayerNorm with only output 0 (no output 3) - "root_input": "", # Root input from parent node for LayerNorm and SkipLayerNorm - "skip_input": "", # Skip input from parent node for SkipLayerNorm - "output_0": "", # Output 0 for LayerNorm and SkipLayerNorm - "output_3": "", # Output 3 for SkipLayerNorm - "add_offset": 0, # Offset value for LayerNorm weight - "epsilon": epsilon, # Epsilon value to avoid `sqrt(0)` in LayerNorm - "cast": { # Casting LayerNorm-specific variables - "use_fp32": False, # Use float32 precision to compute LayerNorm - "root_input": False, # Cast root_input - "skip_input": False, # Cast skip_input - "output_0": False, # Cast output_0 - "output_3": False, # Cast output_3 - } + "simple": True, # Use SimplifiedLayerNorm/SkipSimplifiedLayerNorm vs. 
LayerNorm/SkipLayerNorm + "first_layernorm": True, # 1st LayerNorm = LayerNorm, then SkipLayerNorm for all subsequent LayerNorms + "last_layernorm": False, # Last LayerNorm = SkipLayerNorm with only output 0 (no output 3) + "root_input": "", # Root input from parent node for LayerNorm and SkipLayerNorm + "skip_input": "", # Skip input from parent node for SkipLayerNorm + "output_0": "", # Output 0 for LayerNorm and SkipLayerNorm + "output_3": "", # Output 3 for SkipLayerNorm + "add_offset": 0, # Offset value for LayerNorm weight + "epsilon": epsilon, # Epsilon value to avoid `sqrt(0)` in LayerNorm + "cast": { # Casting LayerNorm-specific variables + "use_fp32": False, # Use float32 precision to compute LayerNorm + "root_input": False, # Cast root_input + "skip_input": False, # Cast skip_input + "output_0": False, # Cast output_0 + "output_3": False, # Cast output_3 + }, } # MatMul-specific variables is_lora = hasattr(config, "peft_type") and config.peft_type == "LORA" self.matmul_attrs = { - "use_lora": is_lora, # Use LoRA/QLoRA format + "use_lora": is_lora, # Use LoRA/QLoRA format } # RotaryEmbedding-specific variables position_scale = config.rope_position_scale if hasattr(config, "rope_position_scale") else 1 partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 rotemb_dim = int(self.head_size * partial_rotary_factor) if partial_rotary_factor != 1.0 else 0 - rope_theta = config.rope_theta if hasattr(config, "rope_theta") else config.rope_embedding_base if hasattr(config, "rope_embedding_base") else 10000 + rope_theta = ( + config.rope_theta + if hasattr(config, "rope_theta") + else config.rope_embedding_base + if hasattr(config, "rope_embedding_base") + else 10000 + ) self.rope_attrs = { - "create_caches": True, # Create cos/sin caches for rotary embeddings - "save_caches": True, # Auto-save cos/sin caches for rotary embeddings after creation - "cache_length": self.context_length, # Cache length to use when 
creating cos/sin caches for rotary embeddings - "theta": rope_theta, # Base value if calculating cos/sin caches from scratch + "create_caches": True, # Create cos/sin caches for rotary embeddings + "save_caches": True, # Auto-save cos/sin caches for rotary embeddings after creation + "cache_length": self.context_length, # Cache length to use when creating cos/sin caches for rotary embeddings + "theta": rope_theta, # Base value if calculating cos/sin caches from scratch "partial_rotary_factor": partial_rotary_factor, # Factor for partial rotary embeddings - "interleaved": 0, # Interleave the rotary embeddings (e.g. [0, 0, 0, 1, 1, 1] to [0, 1, 0, 1, 0, 1], RotaryEmbedding kernel expects a default value of 0) - "rotary_embedding_dim": rotemb_dim, # For partial rotary embeddings (RotaryEmbedding kernel expects a default value of 0) - "rescale_factors": 1, # Rescale factors when calculating `inv_freq` in rotary embeddings - "t_dtype": torch.int64, # Torch dtype when calculating `t` in rotary embeddings - "position_scale": position_scale, # Scale value when calculating `t` in rotary embeddings - "mscale": 1, # Magnitude scaling factor when scaling `emb.cos()/emb.sin()` in rotary embeddings - "mscale_policy": "", # Magnitude scaling policy when scaling `emb.cos()/emb.sin()` in rotary embeddings + "interleaved": 0, # Interleave the rotary embeddings (e.g. 
[0, 0, 0, 1, 1, 1] to [0, 1, 0, 1, 0, 1], RotaryEmbedding kernel expects a default value of 0) + "rotary_embedding_dim": rotemb_dim, # For partial rotary embeddings (RotaryEmbedding kernel expects a default value of 0) + "rescale_factors": 1, # Rescale factors when calculating `inv_freq` in rotary embeddings + "t_dtype": torch.int64, # Torch dtype when calculating `t` in rotary embeddings + "position_scale": position_scale, # Scale value when calculating `t` in rotary embeddings + "mscale": 1, # Magnitude scaling factor when scaling `emb.cos()/emb.sin()` in rotary embeddings + "mscale_policy": "", # Magnitude scaling policy when scaling `emb.cos()/emb.sin()` in rotary embeddings } if hasattr(config, "rope_scaling") and config.rope_scaling is not None: self.make_rope_init(config) # Attention-specific variables (MHA, GQA, GQA + Rot.Emb., etc.) - attn_softcap = config.attn_logit_softcapping if hasattr(config, "attn_logit_softcapping") and config.attn_logit_softcapping is not None else 0.0 # default is 0.0 in GroupQueryAttention kernel + attn_softcap = ( + config.attn_logit_softcapping + if hasattr(config, "attn_logit_softcapping") and config.attn_logit_softcapping is not None + else 0.0 + ) # default is 0.0 in GroupQueryAttention kernel # Block-sparse attention-specific variables sparse_block_size = config.blocksparse_block_size if hasattr(config, "blocksparse_block_size") else 0 - kernel_block_size = config.blocksparse_triton_kernel_block_size if hasattr(config, "blocksparse_triton_kernel_block_size") else 0 + kernel_block_size = ( + config.blocksparse_triton_kernel_block_size + if hasattr(config, "blocksparse_triton_kernel_block_size") + else 0 + ) local_blocks = config.blocksparse_num_local_blocks if hasattr(config, "blocksparse_num_local_blocks") else 0 vert_block_stride = config.blocksparse_vert_stride if hasattr(config, "blocksparse_vert_stride") else 0 homo_head = config.blocksparse_homo_head_pattern if hasattr(config, "blocksparse_homo_head_pattern") else 
False self.attention_attrs = { - "q_path": "", # Q path to attention - "k_path": "", # K path to attention - "v_path": "", # V path to attention - "op_type": "MultiHeadAttention", # Attention op to use - "scale": 1 / np.sqrt(self.head_size), # Scale value after calculating Q x K' in attention - "softcap": attn_softcap, # Softcap value to prevent values from exploding in attention - "use_rope_in_attn": False, # Use rotary embeddings within attention (instead of a separate RotaryEmbedding op) - "use_packed_matmul": False, # Use packed MatMul (instead of 3 separate MatMuls for Q/K/V) - "block_sparse": { # Block-sparse attention-specific variables - "sparse_block_size": sparse_block_size, # Sparse block size for SparseAttention op - "kernel_block_size": kernel_block_size, # Kernel block size for sparse attention - "local_blocks": local_blocks, # Number of local blocks for sparse attention - "vert_stride": vert_block_stride, # Vertical stride to use for sparse attention - "homo_head": homo_head, # Use homo head pattern for sparse attention + "q_path": "", # Q path to attention + "k_path": "", # K path to attention + "v_path": "", # V path to attention + "op_type": "MultiHeadAttention", # Attention op to use + "scale": 1 / np.sqrt(self.head_size), # Scale value after calculating Q x K' in attention + "softcap": attn_softcap, # Softcap value to prevent values from exploding in attention + "use_rope_in_attn": False, # Use rotary embeddings within attention (instead of a separate RotaryEmbedding op) + "use_packed_matmul": False, # Use packed MatMul (instead of 3 separate MatMuls for Q/K/V) + "block_sparse": { # Block-sparse attention-specific variables + "sparse_block_size": sparse_block_size, # Sparse block size for SparseAttention op + "kernel_block_size": kernel_block_size, # Kernel block size for sparse attention + "local_blocks": local_blocks, # Number of local blocks for sparse attention + "vert_stride": vert_block_stride, # Vertical stride to use for sparse attention 
+ "homo_head": homo_head, # Use homo head pattern for sparse attention }, - "q_norm": False, # LayerNorm after MatMul in Q path - "k_norm": False, # LayerNorm after MatMul in K path - "sinks": False, # Sink values for softmax in attention + "q_norm": False, # LayerNorm after MatMul in Q path + "k_norm": False, # LayerNorm after MatMul in K path + "sinks": False, # Sink values for softmax in attention } self.make_attention_init() # MLP-specific variables self.mlp_attrs = { - "use_proj": True, # Use projection style for MLP (GateProj/UpProj/DownProj) - "use_fc": False, # Use fully-connected style for MLP (FC1/FC2) - "output_0": "", # Output 0 for MLP layer + "use_proj": True, # Use projection style for MLP (GateProj/UpProj/DownProj) + "use_fc": False, # Use fully-connected style for MLP (FC1/FC2) + "output_0": "", # Output 0 for MLP layer } # MoE-specific variables @@ -248,24 +324,28 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): expert_weight_bits = 8 if extra_options.get("use_8bits_moe", False) else 4 swiglu_limit = config.swiglu_limit if hasattr(config, "swiglu_limit") else None self.moe_attrs = { - "op_type": moe_op_type, # MoE op to use - "num_experts": num_experts, # Number of experts in MoE layer - "top_k": top_k_experts, # Number of experts to select in MoE layer - "activation_alpha": 1.0, # Alpha parameter used in activation function - "activation_beta": 0.0, # Beta parameter used in activation function - "activation_type": self.activation, # Activation function for MoE layer - "expert_weight_bits": expert_weight_bits, # Number of bits used in quantized MoE weights (only INT4 or INT8 are supported). 
- "normalize_routing_weights": False, # Normalize routing weights in MoE layer - "swiglu_fusion": 0, # Fusion level for SwiGLU activation function - "swiglu_limit": swiglu_limit, # Value used to clamp results into a certain range in SwiGLU activation function - "use_sparse_mixer": False, # Use SparseMixer in MoE layer (used in Phi-3.5 MoE) + "op_type": moe_op_type, # MoE op to use + "num_experts": num_experts, # Number of experts in MoE layer + "top_k": top_k_experts, # Number of experts to select in MoE layer + "activation_alpha": 1.0, # Alpha parameter used in activation function + "activation_beta": 0.0, # Beta parameter used in activation function + "activation_type": self.activation, # Activation function for MoE layer + "expert_weight_bits": expert_weight_bits, # Number of bits used in quantized MoE weights (only INT4 or INT8 are supported). + "normalize_routing_weights": False, # Normalize routing weights in MoE layer + "swiglu_fusion": 0, # Fusion level for SwiGLU activation function + "swiglu_limit": swiglu_limit, # Value used to clamp results into a certain range in SwiGLU activation function + "use_sparse_mixer": False, # Use SparseMixer in MoE layer (used in Phi-3.5 MoE) } # LM head-specific variables - lm_head_softcap = config.final_logit_softcapping if hasattr(config, "final_logit_softcapping") and config.final_logit_softcapping is not None else 0.0 # default is 0.0 in GroupQueryAttention kernel + lm_head_softcap = ( + config.final_logit_softcapping + if hasattr(config, "final_logit_softcapping") and config.final_logit_softcapping is not None + else 0.0 + ) # default is 0.0 in GroupQueryAttention kernel self.lm_head_attrs = { - "scale": 1, # Scale value to multiply output of LM head by - "mask": None, # LM head mask for tokens in the vocabulary + "scale": 1, # Scale value to multiply output of LM head by + "mask": None, # LM head mask for tokens in the vocabulary "softcap": lm_head_softcap, # Softcap value to prevent values from exploding in LM head } 
if hasattr(config, "dummy_token_indices"): @@ -279,10 +359,12 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): self.int4_block_size = extra_options.get("int4_block_size", 32) self.quant_attrs = { "int4": { - "accuracy_level": int(extra_options.get("int4_accuracy_level", 4 if self.ep in ["cpu", "webgpu"] else 0)), + "accuracy_level": int( + extra_options.get("int4_accuracy_level", 4 if self.ep in ["cpu", "webgpu"] else 0) + ), "block_size": int(self.int4_block_size), "is_symmetric": extra_options.get("int4_is_symmetric", True), - "op_types_to_quantize": extra_options.get("int4_op_types_to_quantize", ("MatMul", )), + "op_types_to_quantize": extra_options.get("int4_op_types_to_quantize", ("MatMul",)), "nodes_to_exclude": extra_options.get("int4_nodes_to_exclude", []), "algo_config": int4_algo_config, }, @@ -291,9 +373,15 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): if self.quant_type is not None: # Create quantized attributes from quantization config self.quant_attrs["config"] = config.quantization_config - self.quant_attrs["use_g_idx"] = config.quantization_config["desc_act"] if "desc_act" in config.quantization_config else False + self.quant_attrs["use_g_idx"] = ( + config.quantization_config["desc_act"] if "desc_act" in config.quantization_config else False + ) - self.int4_tied_embeddings = config.tie_word_embeddings if hasattr(config, "tie_word_embeddings") and config.tie_word_embeddings is not None else False + self.int4_tied_embeddings = ( + config.tie_word_embeddings + if hasattr(config, "tie_word_embeddings") and config.tie_word_embeddings is not None + else False + ) self.int4_tied_embeddings = extra_options.get("int4_tied_embeddings", self.int4_tied_embeddings) self.int8_lm_head = extra_options.get("int4_algo_config", "default") in {"k_quant_mixed", "k_quant_last"} if not self.int8_lm_head: @@ -325,26 +413,34 @@ def make_rope_init(self, config): short_mscale = 
config.rope_scaling["short_mscale"] if "short_mscale" in config.rope_scaling else 0 long_mscale = config.rope_scaling["long_mscale"] if "long_mscale" in config.rope_scaling else 0 - short_mscale = short_mscale if short_mscale > 0 else self.make_mscale(self.context_length / self.original_context_length) - long_mscale = long_mscale if long_mscale > 0 else self.make_mscale(self.context_length / self.original_context_length) + short_mscale = ( + short_mscale + if short_mscale > 0 + else self.make_mscale(self.context_length / self.original_context_length) + ) + long_mscale = ( + long_mscale if long_mscale > 0 else self.make_mscale(self.context_length / self.original_context_length) + ) self.rope_attrs["multi_cache"] = { - "short_factor": short_factor, # Short factor when calculating `inv_freq` in rotary embeddings - "long_factor": long_factor, # Long factor when calculating `inv_freq` in rotary embeddings - "short_mscale": short_mscale, # Magnitude scaling for short factor when scaling `emb.cos()/emb.sin()` in rotary embeddings - "long_mscale": long_mscale, # Magnitude scaling for long factor when scaling `emb.cos()/emb.sin()` in rotary embeddings + "short_factor": short_factor, # Short factor when calculating `inv_freq` in rotary embeddings + "long_factor": long_factor, # Long factor when calculating `inv_freq` in rotary embeddings + "short_mscale": short_mscale, # Magnitude scaling for short factor when scaling `emb.cos()/emb.sin()` in rotary embeddings + "long_mscale": long_mscale, # Magnitude scaling for long factor when scaling `emb.cos()/emb.sin()` in rotary embeddings } elif "low_freq_factor" in config.rope_scaling: # For models that rescale `inv_freq` using `low_freq_factor` and `high_freq_factor` (e.g. 
LLaMA-3.1) factor = config.rope_scaling["factor"] if "factor" in config.rope_scaling else 0 low_freq_factor = config.rope_scaling["low_freq_factor"] if "low_freq_factor" in config.rope_scaling else 0 - high_freq_factor = config.rope_scaling["high_freq_factor"] if "high_freq_factor" in config.rope_scaling else 0 - + high_freq_factor = ( + config.rope_scaling["high_freq_factor"] if "high_freq_factor" in config.rope_scaling else 0 + ) + self.rope_attrs["rescale_inv_freq"] = { - "factor": factor, # Scale factor when calculating `new_freq` in rotary embeddings - "low_freq_factor": low_freq_factor, # Low freq factor when calculating `low_freq_wavelen` in rotary embeddings - "high_freq_factor": high_freq_factor, # High freq factor when calculating `high_freq_wavelen` in rotary embeddings + "factor": factor, # Scale factor when calculating `new_freq` in rotary embeddings + "low_freq_factor": low_freq_factor, # Low freq factor when calculating `low_freq_wavelen` in rotary embeddings + "high_freq_factor": high_freq_factor, # High freq factor when calculating `high_freq_wavelen` in rotary embeddings } elif "beta_fast" in config.rope_scaling: @@ -395,10 +491,14 @@ def make_attention_init(self): def make_genai_config(self, model_name_or_path, extra_kwargs, out_dir): # Create config with attributes from config.json and generation_config.json (if latter file exists) - config = AutoConfig.from_pretrained(model_name_or_path, token=self.hf_token, trust_remote_code=self.hf_remote, **extra_kwargs) + config = AutoConfig.from_pretrained( + model_name_or_path, token=self.hf_token, trust_remote_code=self.hf_remote, **extra_kwargs + ) try: # Override search attributes in config based on values in generation_config.json - gen_config = GenerationConfig.from_pretrained(model_name_or_path, token=self.hf_token, trust_remote_code=self.hf_remote, **extra_kwargs) + gen_config = GenerationConfig.from_pretrained( + model_name_or_path, token=self.hf_token, trust_remote_code=self.hf_remote, 
**extra_kwargs + ) defaults = { "bos_token_id": None, "do_sample": False, @@ -415,31 +515,41 @@ def make_genai_config(self, model_name_or_path, extra_kwargs, out_dir): except: pass - inputs = dict(zip(self.input_names, self.input_names)) - inputs.update({ - "past_key_names": "past_key_values.%d.key", - "past_value_names": "past_key_values.%d.value", - }) - outputs = dict(zip(self.output_names, self.output_names)) - outputs.update({ - "present_key_names": "present.%d.key", - "present_value_names": "present.%d.value", - }) + inputs = dict(zip(self.input_names, self.input_names, strict=False)) + inputs.update( + { + "past_key_names": "past_key_values.%d.key", + "past_value_names": "past_key_values.%d.value", + } + ) + outputs = dict(zip(self.output_names, self.output_names, strict=False)) + outputs.update( + { + "present_key_names": "present.%d.key", + "present_value_names": "present.%d.value", + } + ) if "hidden_states" in outputs: # Remove 'hidden_states' from 'outputs' entry in config since ORT GenAI doesn't use it del outputs["hidden_states"] bos_token_id = config.bos_token_id if hasattr(config, "bos_token_id") and config.bos_token_id is not None else 1 eos_token_id = config.eos_token_id - pad_token_id = config.pad_token_id if hasattr(config, "pad_token_id") and config.pad_token_id is not None else config.eos_token_id[0] if isinstance(config.eos_token_id, list) else config.eos_token_id + pad_token_id = ( + config.pad_token_id + if hasattr(config, "pad_token_id") and config.pad_token_id is not None + else config.eos_token_id[0] + if isinstance(config.eos_token_id, list) + else config.eos_token_id + ) genai_config = { "model": { "bos_token_id": bos_token_id, "context_length": self.context_length, "decoder": { - "session_options" : { + "session_options": { "log_id": "onnxruntime-genai", - "provider_options" : [], + "provider_options": [], }, "filename": self.filename, "head_size": self.head_size, @@ -452,7 +562,9 @@ def make_genai_config(self, model_name_or_path, 
extra_kwargs, out_dir): }, "eos_token_id": eos_token_id, "pad_token_id": pad_token_id, - "type": self.model_type[ : self.model_type.find("For") if "For" in self.model_type else len(self.model_type)].lower(), + "type": self.model_type[ + : self.model_type.find("For") if "For" in self.model_type else len(self.model_type) + ].lower(), "vocab_size": self.vocab_size, }, "search": { @@ -465,7 +577,9 @@ def make_genai_config(self, model_name_or_path, extra_kwargs, out_dir): "no_repeat_ngram_size": config.no_repeat_ngram_size if hasattr(config, "no_repeat_ngram_size") else 0, "num_beams": config.num_beams if hasattr(config, "num_beams") else 1, "num_return_sequences": config.num_return_sequences if hasattr(config, "num_return_sequences") else 1, - "past_present_share_buffer": False if "config_only" in self.extra_options else self.past_present_share_buffer, + "past_present_share_buffer": False + if "config_only" in self.extra_options + else self.past_present_share_buffer, "repetition_penalty": config.repetition_penalty if hasattr(config, "repetition_penalty") else 1.0, "temperature": config.temperature if hasattr(config, "temperature") else 1.0, "top_k": config.top_k if hasattr(config, "top_k") else 50, @@ -475,22 +589,24 @@ def make_genai_config(self, model_name_or_path, extra_kwargs, out_dir): if self.ep == "trt-rtx" and self.window_size is not None and self.window_size > 0: # Compute layer indices that use sliding window attention - layer_idxs = [layer_id for layer_id in range(self.num_layers) if hasattr(self, "is_local") and self.is_local(layer_id)] - + layer_idxs = [ + layer_id for layer_id in range(self.num_layers) if hasattr(self, "is_local") and self.is_local(layer_id) + ] + genai_config["model"]["decoder"]["sliding_window"] = { "window_size": self.window_size, "slide_key_value_cache": False, "slide_inputs": False, - "layers": layer_idxs + "layers": layer_idxs, } if self.ep != "cpu": ep_name = self.ep.replace("trt-rtx", "NvTensorRtRtx") - ep_options = { ep_name : 
self.ep_attrs[self.ep] } + ep_options = {ep_name: self.ep_attrs[self.ep]} genai_config["model"]["decoder"]["session_options"]["provider_options"].append(ep_options) print(f"Saving GenAI config in {out_dir}") - with open(os.path.join(out_dir,"genai_config.json"), "w") as f: + with open(os.path.join(out_dir, "genai_config.json"), "w") as f: json.dump(genai_config, f, indent=4) def make_key_value_cache_shape(self, layer_id, shape): @@ -503,7 +619,9 @@ def make_key_value_cache_shape(self, layer_id, shape): return shape def save_processing(self, model_name_or_path, extra_kwargs, out_dir): - tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, token=self.hf_token, trust_remote_code=self.hf_remote, **extra_kwargs) + tokenizer = AutoTokenizer.from_pretrained( + model_name_or_path, token=self.hf_token, trust_remote_code=self.hf_remote, **extra_kwargs + ) print(f"Saving processing files in {out_dir} for GenAI") tokenizer.save_pretrained(out_dir) @@ -524,7 +642,9 @@ def make_int4_algo_config(self, quant_method: str): layers_to_exclude = [ i for i in range(self.num_layers) - if i < self.num_layers / 8 or i >= 7 * self.num_layers / 8 or (i - (round)(self.num_layers / 8)) % 3 == 2 + if i < self.num_layers / 8 + or i >= 7 * self.num_layers / 8 + or (i - (round)(self.num_layers / 8)) % 3 == 2 ] for i in layers_to_exclude: customized_weight_config["/model/layers." 
+ str(i) + "/attn/qkv_proj/MatMul"] = {"bits": 8} @@ -553,7 +673,9 @@ def to_int4(self) -> ir.Model: def save_model(self, out_dir): print(f"Saving ONNX model in {out_dir}") - already_quantized_in_qdq_format = self.quant_type is not None and self.quant_attrs["use_qdq"] # Skip quantizing `MatMul` in `DequantizeLinear --> Transpose --> MatMul` path + already_quantized_in_qdq_format = ( + self.quant_type is not None and self.quant_attrs["use_qdq"] + ) # Skip quantizing `MatMul` in `DequantizeLinear --> Transpose --> MatMul` path if self.onnx_dtype in {ir.DataType.INT4, ir.DataType.UINT4} and not already_quantized_in_qdq_format: model = self.to_int4() else: @@ -596,7 +718,9 @@ def callback(tensor: ir.TensorProtocol, metadata: dict): if not os.listdir(self.cache_dir): os.rmdir(self.cache_dir) - def make_initializer(self, tensor: torch.Tensor | np.ndarray | ir.TensorProtocol, /, name: str, to: ir.DataType | None = None): + def make_initializer( + self, tensor: torch.Tensor | np.ndarray | ir.TensorProtocol, /, name: str, to: ir.DataType | None = None + ): if to is not None: # Cast the tensor lazily if `to` is provided def tensor_func(): @@ -604,9 +728,7 @@ def tensor_func(): tensor = tensor.to(to_torch_dtype(to)) return TorchTensor(tensor, name=name) - ir_tensor = ir.LazyTensor( - tensor_func, dtype=to, shape=ir.Shape(tensor.shape), name=name - ) + ir_tensor = ir.LazyTensor(tensor_func, dtype=to, shape=ir.Shape(tensor.shape), name=name) elif isinstance(tensor, torch.nn.parameter.Parameter): ir_tensor = TorchTensor(tensor, name=name) else: @@ -641,7 +763,9 @@ def make_node(self, op_type, inputs: Sequence[str], outputs: Sequence[str], *, n self.model.graph.append(node) self.node_names.add(name) - def make_value(self, name, dtype: ir.DataType | int| None = None, shape: Sequence[int | str] | ir.Shape | None = None) -> ir.Value: + def make_value( + self, name, dtype: ir.DataType | int | None = None, shape: Sequence[int | str] | ir.Shape | None = None + ) -> ir.Value: """Obtain 
or create an IR value by value name. If the value does not exist a new one is created. @@ -685,7 +809,9 @@ def make_inputs_and_outputs(self): value_name = f"past_key_values.{i}.value" value_shape = self.make_key_value_cache_shape(i, self.input_shapes["past_key_values.value"]) - inputs.append(self.make_value(value_name, dtype=self.input_types["past_key_values.value"], shape=value_shape)) + inputs.append( + self.make_value(value_name, dtype=self.input_types["past_key_values.value"], shape=value_shape) + ) # Add KV cache to outputs key_name = f"present.{i}.key" @@ -891,7 +1017,7 @@ def make_matmul_float(self, matmul, name, root_input, **kwargs): last_dim = matmul.weight.shape[0] output = "logits" if kwargs.get("logits", False) else f"{name}/output_0" self.make_node("MatMul", inputs=[root_input, weight], outputs=[output], name=name) - self.make_value(output, self.io_dtype, shape=['batch_size', 'sequence_length', last_dim]) + self.make_value(output, self.io_dtype, shape=["batch_size", "sequence_length", last_dim]) return name @@ -924,11 +1050,18 @@ def make_matmul_int4(self, matmul, basename, root_input, **kwargs): output = "logits" if kwargs.get("logits", False) else f"{name}/output_0" self.make_node( - "MatMulNBits", inputs=inputs, outputs=[output], name=name, domain="com.microsoft", + "MatMulNBits", + inputs=inputs, + outputs=[output], + name=name, + domain="com.microsoft", accuracy_level=self.quant_attrs["int4"]["accuracy_level"], - bits=matmul.bits, block_size=matmul.group_size, K=matmul.in_features, N=matmul.out_features, + bits=matmul.bits, + block_size=matmul.group_size, + K=matmul.in_features, + N=matmul.out_features, ) - self.make_value(output, self.io_dtype, shape=['batch_size', 'sequence_length', matmul.out_features]) + self.make_value(output, self.io_dtype, shape=["batch_size", "sequence_length", matmul.out_features]) return name @@ -959,16 +1092,25 @@ def make_dequantize_linear(self, dequantize_name, quantized_op): if hasattr(quantized_op, "qzeros") and 
quantized_op.qzeros is not None: zeros = dequantize_name[1:].replace("/", ".") + ".qzeros" self.make_initializer( - ir.PackedTensor( - quantized_op.qzeros, self.onnx_dtype, shape=scales_target_shape - ), + ir.PackedTensor(quantized_op.qzeros, self.onnx_dtype, shape=scales_target_shape), zeros, ) dequantize_inputs.append(zeros) dequantize_output = f"{dequantize_name}/output_0" - self.make_node("DequantizeLinear", inputs=dequantize_inputs, outputs=[dequantize_output], name=dequantize_name, block_size=quantized_op.group_size, axis=-1) - self.make_value(dequantize_output, self.io_dtype, shape=[*scales_pt.shape[:-1], scales_pt.shape[-1] * quantized_op.group_size]) + self.make_node( + "DequantizeLinear", + inputs=dequantize_inputs, + outputs=[dequantize_output], + name=dequantize_name, + block_size=quantized_op.group_size, + axis=-1, + ) + self.make_value( + dequantize_output, + self.io_dtype, + shape=[*scales_pt.shape[:-1], scales_pt.shape[-1] * quantized_op.group_size], + ) return dequantize_output @@ -995,8 +1137,10 @@ def make_matmul_int4_qdq(self, matmul, matmul_name, root_input, **kwargs): self.make_transpose(transpose_name, dequantize_output, self.io_dtype, transposed_shape, [1, 0]) matmul_output = "logits" if kwargs.get("logits", False) else f"{matmul_name}/output_0" - self.make_node("MatMul", inputs=[root_input, f"{transpose_name}/output_0"], outputs=[matmul_output], name=matmul_name) - self.make_value(matmul_output, self.io_dtype, shape=['batch_size', 'sequence_length', matmul.out_features]) + self.make_node( + "MatMul", inputs=[root_input, f"{transpose_name}/output_0"], outputs=[matmul_output], name=matmul_name + ) + self.make_value(matmul_output, self.io_dtype, shape=["batch_size", "sequence_length", matmul.out_features]) return matmul_name @@ -1059,7 +1203,10 @@ def make_packed_matmul_float(self, q_matmul, k_matmul, v_matmul, basename, root_ # Create dummy PackedMatMul class class PackedMatMul: def __init__(self): - self.weight = torch.cat([q_matmul.weight, 
k_matmul.weight, v_matmul.weight], dim=0).reshape(N_q + N_kv + N_kv, H) + self.weight = torch.cat([q_matmul.weight, k_matmul.weight, v_matmul.weight], dim=0).reshape( + N_q + N_kv + N_kv, H + ) + matmul = PackedMatMul() new_name = self.make_matmul(matmul, basename, root_input, **kwargs) @@ -1088,6 +1235,7 @@ def __init__(self): self.out_features = q_matmul.out_features + k_matmul.out_features + v_matmul.out_features self.bits = q_matmul.bits self.group_size = q_matmul.group_size + matmul = PackedMatMul() new_name = self.make_matmul_int4(matmul, basename, root_input, **kwargs) @@ -1098,7 +1246,7 @@ def make_add_bias(self, add, name, root_input, **kwargs): self.make_initializer(add, bias, to=self.io_dtype) add_bias_inputs = [root_input, bias] - shape = ['batch_size', 'sequence_length', add.shape[0]] + shape = ["batch_size", "sequence_length", add.shape[0]] if kwargs.get("logits", False): output = "logits" @@ -1120,30 +1268,46 @@ def make_embedding(self, embedding): weight_reshape_name = f"{basename}/Reshape" bits = 8 if self.int8_lm_head else 4 - weight_reshape_inputs = [f"lm_head.MatMul.weight_Q{bits}G{self.int4_block_size}", f"/model/constants/INT64/[{self.vocab_size}, {self.hidden_size}]"] + weight_reshape_inputs = [ + f"lm_head.MatMul.weight_Q{bits}G{self.int4_block_size}", + f"/model/constants/INT64/[{self.vocab_size}, {self.hidden_size}]", + ] weight_reshape_output = f"{weight_reshape_name}/output_0" # quantized weight dtype is uint8, see here # https://github.com/microsoft/onnxruntime/blob/0c9356cb986fd4cd2c5d510909d31186010ba226/onnxruntime/python/tools/quantization/neural_compressor/weight_only.py#L73 - self.make_reshape(weight_reshape_name, weight_reshape_inputs, dtype=ir.DataType.UINT8, shape=['vocab_size', 'hidden_size']) + self.make_reshape( + weight_reshape_name, weight_reshape_inputs, dtype=ir.DataType.UINT8, shape=["vocab_size", "hidden_size"] + ) - self.make_node('GatherBlockQuantized', inputs=[weight_reshape_output, 'input_ids', 
'lm_head.MatMul.weight_scale', 'lm_head.MatMul.weight_zp'], outputs=[gather_output], name=gather_name, domain="com.microsoft", bits=bits, block_size=int(self.int4_block_size)) + self.make_node( + "GatherBlockQuantized", + inputs=[weight_reshape_output, "input_ids", "lm_head.MatMul.weight_scale", "lm_head.MatMul.weight_zp"], + outputs=[gather_output], + name=gather_name, + domain="com.microsoft", + bits=bits, + block_size=int(self.int4_block_size), + ) else: weight = "model.embed_tokens.weight" self.make_initializer(embedding, weight, to=self.io_dtype) gather_name = f"{basename}/Gather" gather_output = f"{gather_name}/output_0" - self.make_node('Gather', inputs=[weight, 'input_ids'], outputs=[gather_output], name=gather_name) + self.make_node("Gather", inputs=[weight, "input_ids"], outputs=[gather_output], name=gather_name) - self.make_value(gather_output, self.io_dtype, shape=['batch_size', 'sequence_length', self.hidden_size]) + self.make_value(gather_output, self.io_dtype, shape=["batch_size", "sequence_length", self.hidden_size]) if self.embed_attrs["scale"] != 1: # Scale the embeddings mul_name = f"{basename}/Mul" - mul_inputs = [gather_output, f"/model/constants/{self.to_str_dtype(self.io_dtype)}/{self.embed_attrs['scale']}"] + mul_inputs = [ + gather_output, + f"/model/constants/{self.to_str_dtype(self.io_dtype)}/{self.embed_attrs['scale']}", + ] mul_output = f"{mul_name}/output_0" - self.make_node('Mul', inputs=mul_inputs, outputs=[mul_output], name=mul_name) - self.make_value(mul_output, self.io_dtype, shape=['batch_size', 'sequence_length', self.hidden_size]) + self.make_node("Mul", inputs=mul_inputs, outputs=[mul_output], name=mul_name) + self.make_value(mul_output, self.io_dtype, shape=["batch_size", "sequence_length", self.hidden_size]) layernorm_attrs_value = mul_output else: @@ -1152,7 +1316,12 @@ def make_embedding(self, embedding): if self.layernorm_attrs["cast"]["use_fp32"] and self.io_dtype != ir.DataType.FLOAT: # Insert output Cast node cast_name 
= f"{basename}/Cast" - self.make_cast(cast_name, layernorm_attrs_value, ir.DataType.FLOAT, shape=['batch_size', 'sequence_length', self.hidden_size]) + self.make_cast( + cast_name, + layernorm_attrs_value, + ir.DataType.FLOAT, + shape=["batch_size", "sequence_length", self.hidden_size], + ) layernorm_attrs_value = f"{cast_name}/output_0" self.layernorm_attrs["root_input"] = layernorm_attrs_value @@ -1176,11 +1345,7 @@ def make_layernorm_op(self, layer_id, layernorm, skip, simple, location): # Create weight and bias tensors weight = f"model.layers.{layer_id}.{location}_layernorm.weight" - self.make_initializer( - layernorm.weight + self.layernorm_attrs["add_offset"], - weight, - to=new_io_dtype - ) + self.make_initializer(layernorm.weight + self.layernorm_attrs["add_offset"], weight, to=new_io_dtype) bias = f"model.layers.{layer_id}.{location}_layernorm.bias" if not simple: self.make_initializer(layernorm.bias, bias, to=new_io_dtype) @@ -1208,10 +1373,12 @@ def make_layernorm_op(self, layer_id, layernorm, skip, simple, location): inputs, outputs = self.make_layernorm_casts(name, inputs, outputs, old_io_dtype, new_io_dtype) # Make op and its shape - self.make_node(op_type, inputs=inputs, outputs=outputs, name=name, domain=("com.microsoft" if skip else None), **kwargs) - self.make_value(outputs[0], new_io_dtype, shape=['batch_size', 'sequence_length', self.hidden_size]) + self.make_node( + op_type, inputs=inputs, outputs=outputs, name=name, domain=("com.microsoft" if skip else None), **kwargs + ) + self.make_value(outputs[0], new_io_dtype, shape=["batch_size", "sequence_length", self.hidden_size]) if skip and not self.layernorm_attrs["last_layernorm"]: - self.make_value(outputs[3], new_io_dtype, shape=['batch_size', 'sequence_length', self.hidden_size]) + self.make_value(outputs[3], new_io_dtype, shape=["batch_size", "sequence_length", self.hidden_size]) # Update LayerNorm attributes self.layernorm_attrs["output_0"] = output_0 @@ -1232,16 +1399,12 @@ def 
_make_layernorm_op(self, layer_id, layernorm, skip, simple, location): # Create weight and bias tensors weight = f"model.layers.{layer_id}.{location}_layernorm.weight" - self.make_initializer( - layernorm.weight + self.layernorm_attrs["add_offset"], - weight, - to=new_io_dtype - ) + self.make_initializer(layernorm.weight + self.layernorm_attrs["add_offset"], weight, to=new_io_dtype) bias = f"model.layers.{layer_id}.{location}_layernorm.bias" if not simple: self.make_initializer(layernorm.bias, bias, to=new_io_dtype) - # Create input names for op + # Create input names for op inputs = [root_input, skip_input, weight] if skip else [root_input, weight] if not simple: inputs.append(bias) @@ -1266,16 +1429,42 @@ def _make_layernorm_op(self, layer_id, layernorm, skip, simple, location): skip_input = inputs[1] if skip else None if op_type == "SimplifiedLayerNormalization": - self._make_simplified_layer_norm(name, root_input, weight, outputs[0], new_io_dtype, shape=['batch_size', 'sequence_length', self.hidden_size]) + self._make_simplified_layer_norm( + name, + root_input, + weight, + outputs[0], + new_io_dtype, + shape=["batch_size", "sequence_length", self.hidden_size], + ) elif op_type == "SkipSimplifiedLayerNormalization": - self._make_skip_simplified_layer_norm(name, root_input, skip_input, weight, outputs[0], output_3, new_io_dtype, shape=['batch_size', 'sequence_length', self.hidden_size]) + self._make_skip_simplified_layer_norm( + name, + root_input, + skip_input, + weight, + outputs[0], + output_3, + new_io_dtype, + shape=["batch_size", "sequence_length", self.hidden_size], + ) elif op_type == "SkipLayerNormalization": - self._make_skip_layer_norm(name, root_input, skip_input, weight, bias, outputs[0], output_3, new_io_dtype, shape=['batch_size', 'sequence_length', self.hidden_size]) + self._make_skip_layer_norm( + name, + root_input, + skip_input, + weight, + bias, + outputs[0], + output_3, + new_io_dtype, + shape=["batch_size", "sequence_length", 
self.hidden_size], + ) else: raise ValueError(f"Invalid op_type: {op_type}") if skip and not self.layernorm_attrs["last_layernorm"]: - self.make_value(outputs[3], new_io_dtype, shape=['batch_size', 'sequence_length', self.hidden_size]) + self.make_value(outputs[3], new_io_dtype, shape=["batch_size", "sequence_length", self.hidden_size]) # Update LayerNorm attributes self.layernorm_attrs["output_0"] = output_0 @@ -1303,7 +1492,9 @@ def make_layernorm_casts(self, name, inputs, outputs, old_dtype, new_dtype): # Cast root_input root_input_cast_name = f"{name}/root_input/Cast" root_input_cast_output = f"{root_input_cast_name}/output_0" - self.make_node("Cast", inputs=[root_input], outputs=[root_input_cast_output], name=root_input_cast_name, to=new_dtype) + self.make_node( + "Cast", inputs=[root_input], outputs=[root_input_cast_output], name=root_input_cast_name, to=new_dtype + ) self.make_value(root_input_cast_output, new_dtype, shape=root_input_shape) inputs[0] = root_input_cast_output @@ -1312,7 +1503,9 @@ def make_layernorm_casts(self, name, inputs, outputs, old_dtype, new_dtype): assert skip_input is not None skip_input_cast_name = f"{name}/skip_input/Cast" skip_input_cast_output = f"{skip_input_cast_name}/output_0" - self.make_node("Cast", inputs=[skip_input], outputs=[skip_input_cast_output], name=skip_input_cast_name, to=new_dtype) + self.make_node( + "Cast", inputs=[skip_input], outputs=[skip_input_cast_output], name=skip_input_cast_name, to=new_dtype + ) self.make_value(skip_input_cast_output, new_dtype, shape=self.values[skip_input].shape) inputs[1] = skip_input_cast_output @@ -1320,7 +1513,9 @@ def make_layernorm_casts(self, name, inputs, outputs, old_dtype, new_dtype): # Cast output_0 output_0_cast_name = f"{name}/output_0/Cast" output_0_cast_output = f"{output_0_cast_name}/output_0" - self.make_node("Cast", inputs=[output_0_cast_output], outputs=[output_0], name=output_0_cast_name, to=old_dtype) + self.make_node( + "Cast", inputs=[output_0_cast_output], 
outputs=[output_0], name=output_0_cast_name, to=old_dtype + ) self.make_value(output_0, old_dtype, shape=root_input_shape) outputs[0] = output_0_cast_output @@ -1329,7 +1524,9 @@ def make_layernorm_casts(self, name, inputs, outputs, old_dtype, new_dtype): assert output_3 is not None output_3_cast_name = f"{name}/output_3/Cast" output_3_cast_output = f"{output_3_cast_name}/output_3" - self.make_node("Cast", inputs=[output_3_cast_output], outputs=[output_3], name=output_3_cast_name, to=old_dtype) + self.make_node( + "Cast", inputs=[output_3_cast_output], outputs=[output_3], name=output_3_cast_name, to=old_dtype + ) self.make_value(output_3, old_dtype, shape=root_input_shape) outputs[3] = output_3_cast_output @@ -1359,7 +1556,7 @@ def make_inv_freq_rescaled(self, inv_freq): elif "ntk_alpha" in self.rope_attrs["rescale_inv_freq"]: return self.make_inv_freq_rescaled_with_ntk(inv_freq) else: - raise NotImplementedError(f"The method to rescale inv_freq could not be identified.") + raise NotImplementedError("The method to rescale inv_freq could not be identified.") def make_inv_freq_rescaled_with_freq_factors(self, inv_freq): scale_factor = self.rope_attrs["rescale_inv_freq"]["factor"] @@ -1400,9 +1597,7 @@ def make_inv_freq_rescaled_with_ntk(self, inv_freq): interpolation = 1.0 / (self.rope_attrs["rescale_inv_freq"]["factor"] * inv_freq) extrapolation = 1.0 / inv_freq - ramp = ( - torch.arange(d_half, dtype=torch.float32, device=inv_freq.device) - low - ) / (high - low) + ramp = (torch.arange(d_half, dtype=torch.float32, device=inv_freq.device) - low) / (high - low) mask = 1 - ramp.clamp(0, 1) inv_freq = interpolation * (1 - mask) + extrapolation * mask @@ -1410,12 +1605,17 @@ def make_inv_freq_rescaled_with_ntk(self, inv_freq): def make_rotary_embedding_caches_from_scratch(self): dim = int(self.rope_attrs["partial_rotary_factor"] * self.head_size) - inv_freq = 1.0 / (self.rope_attrs["rescale_factors"] * (self.rope_attrs["theta"] ** (torch.arange(0, dim, 2, 
dtype=torch.int64).float() / dim))) + inv_freq = 1.0 / ( + self.rope_attrs["rescale_factors"] + * (self.rope_attrs["theta"] ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim)) + ) if "rescale_inv_freq" in self.rope_attrs: inv_freq = self.make_inv_freq_rescaled(inv_freq) position_scale = self.rope_attrs["position_scale"] if self.context_length == self.original_context_length else 1 - t = (torch.arange(self.rope_attrs["cache_length"], dtype=self.rope_attrs["t_dtype"]) * position_scale).type_as(inv_freq) + t = (torch.arange(self.rope_attrs["cache_length"], dtype=self.rope_attrs["t_dtype"]) * position_scale).type_as( + inv_freq + ) freqs = torch.outer(t, inv_freq) emb = torch.cat((freqs, freqs), dim=-1) @@ -1474,16 +1674,25 @@ def make_padded_cache(self, small_cache, large_cache, pad_value=0.0): # Create padded tensor filled with pad_value padded_cache = torch.full(target_shape, pad_value, dtype=small_cache.dtype) # Copy original data to the beginning - padded_cache[:small_cache.shape[0], :] = small_cache + padded_cache[: small_cache.shape[0], :] = small_cache return padded_cache - def _make_split_if_nodes_for_trt_rtx(self, basename, greater_name, - cos_cache_name, sin_cache_name, - cos_cache_large, sin_cache_large, - cos_cache_small, sin_cache_small, - cos_cache_large_name, sin_cache_large_name, - cos_cache_small_name, sin_cache_small_name, - small_cache_shape): + def _make_split_if_nodes_for_trt_rtx( + self, + basename, + greater_name, + cos_cache_name, + sin_cache_name, + cos_cache_large, + sin_cache_large, + cos_cache_small, + sin_cache_small, + cos_cache_large_name, + sin_cache_large_name, + cos_cache_small_name, + sin_cache_small_name, + small_cache_shape, + ): """Create split If nodes for TRT-RTX to workaround trt-rtx multi-output bug. 
This is a TEMPORARY workaround for TRT-RTX bug where If nodes with @@ -1498,19 +1707,38 @@ def _make_split_if_nodes_for_trt_rtx(self, basename, greater_name, cos_if_name = f"{basename}/cos/If" cos_large_for_split = ir.node( - "Constant", [], outputs=[ - ir.Value(name=f"{cos_cache_large_name}_split", type=ir.TensorType(self.io_dtype), shape=ir.Shape(cos_cache_large.shape)) + "Constant", + [], + outputs=[ + ir.Value( + name=f"{cos_cache_large_name}_split", + type=ir.TensorType(self.io_dtype), + shape=ir.Shape(cos_cache_large.shape), + ) ], - name=f"/large/cos_cache/Constant_split_cos", attributes=dict(value=ir.tensor(cos_cache_large))) + name="/large/cos_cache/Constant_split_cos", + attributes=dict(value=ir.tensor(cos_cache_large)), + ) cos_small_for_split = ir.node( - "Constant", [], outputs=[ - ir.Value(name=f"{cos_cache_small_name}_split", type=ir.TensorType(self.io_dtype), shape=ir.Shape(small_cache_shape)) + "Constant", + [], + outputs=[ + ir.Value( + name=f"{cos_cache_small_name}_split", + type=ir.TensorType(self.io_dtype), + shape=ir.Shape(small_cache_shape), + ) ], - name=f"/small/cos_cache/Constant_split_cos", attributes=dict(value=ir.tensor(cos_cache_small))) + name="/small/cos_cache/Constant_split_cos", + attributes=dict(value=ir.tensor(cos_cache_small)), + ) self.make_node( - "If", inputs=[f"{greater_name}/output_0"], outputs=[cos_cache_name], name=cos_if_name, + "If", + inputs=[f"{greater_name}/output_0"], + outputs=[cos_cache_name], + name=cos_if_name, then_branch=ir.Graph( inputs=[], outputs=[cos_large_for_split.outputs[0]], @@ -1530,19 +1758,38 @@ def _make_split_if_nodes_for_trt_rtx(self, basename, greater_name, # Create unique constant nodes for sin to avoid tensor sharing sin_large_for_split = ir.node( - "Constant", [], outputs=[ - ir.Value(name=f"{sin_cache_large_name}_split", type=ir.TensorType(self.io_dtype), shape=ir.Shape(sin_cache_large.shape)) + "Constant", + [], + outputs=[ + ir.Value( + name=f"{sin_cache_large_name}_split", + 
type=ir.TensorType(self.io_dtype), + shape=ir.Shape(sin_cache_large.shape), + ) ], - name=f"/large/sin_cache/Constant_split_sin", attributes=dict(value=ir.tensor(sin_cache_large))) + name="/large/sin_cache/Constant_split_sin", + attributes=dict(value=ir.tensor(sin_cache_large)), + ) sin_small_for_split = ir.node( - "Constant", [], outputs=[ - ir.Value(name=f"{sin_cache_small_name}_split", type=ir.TensorType(self.io_dtype), shape=ir.Shape(small_cache_shape)) + "Constant", + [], + outputs=[ + ir.Value( + name=f"{sin_cache_small_name}_split", + type=ir.TensorType(self.io_dtype), + shape=ir.Shape(small_cache_shape), + ) ], - name=f"/small/sin_cache/Constant_split_sin", attributes=dict(value=ir.tensor(sin_cache_small))) + name="/small/sin_cache/Constant_split_sin", + attributes=dict(value=ir.tensor(sin_cache_small)), + ) self.make_node( - "If", inputs=[f"{greater_name}/output_0"], outputs=[sin_cache_name], name=sin_if_name, + "If", + inputs=[f"{greater_name}/output_0"], + outputs=[sin_cache_name], + name=sin_if_name, then_branch=ir.Graph( inputs=[], outputs=[sin_large_for_split.outputs[0]], @@ -1568,11 +1815,18 @@ def make_rotary_embedding(self, name, root_input, **kwargs): inputs = [root_input, kwargs.pop("position_ids"), cos_cache_name, sin_cache_name] output = f"{name}/output_0" self.make_node( - "RotaryEmbedding", inputs=inputs, outputs=[output], name=name, domain="com.microsoft", - interleaved=self.rope_attrs["interleaved"], num_heads=(0 if self.rope_attrs["partial_rotary_factor"] == 1.0 else num_heads), # default is 0 in RotaryEmbedding kernel + "RotaryEmbedding", + inputs=inputs, + outputs=[output], + name=name, + domain="com.microsoft", + interleaved=self.rope_attrs["interleaved"], + num_heads=( + 0 if self.rope_attrs["partial_rotary_factor"] == 1.0 else num_heads + ), # default is 0 in RotaryEmbedding kernel rotary_embedding_dim=self.rope_attrs["rotary_embedding_dim"], ) - self.make_value(output, self.io_dtype, shape=['batch_size', 'sequence_length', 
self.head_size * num_heads]) + self.make_value(output, self.io_dtype, shape=["batch_size", "sequence_length", self.head_size * num_heads]) def make_rotary_embedding_multi_cache(self, **kwargs): cos_cache_name = kwargs.get("cos_cache_name", "cos_cache") @@ -1586,7 +1840,9 @@ def make_rotary_embedding_multi_cache(self, **kwargs): # Create caches for when sequence_length > self.original_context_length cos_cache_large_name, sin_cache_large_name = "cos_cache_large", "sin_cache_large" self.rope_attrs["save_caches"] = False - cos_cache_large, sin_cache_large = self.make_rotary_embedding_caches(cos_cache_name=cos_cache_large_name, sin_cache_name=sin_cache_large_name) + cos_cache_large, sin_cache_large = self.make_rotary_embedding_caches( + cos_cache_name=cos_cache_large_name, sin_cache_name=sin_cache_large_name + ) # Set cache attributes for when sequence_length <= self.original_context_length self.rope_attrs["rescale_factors"] = self.rope_attrs["multi_cache"]["short_factor"] @@ -1597,7 +1853,9 @@ def make_rotary_embedding_multi_cache(self, **kwargs): # Create caches for when sequence_length <= self.original_context_length cos_cache_small_name, sin_cache_small_name = "cos_cache_small", "sin_cache_small" self.rope_attrs["save_caches"] = False - cos_cache_small, sin_cache_small = self.make_rotary_embedding_caches(cos_cache_name=cos_cache_small_name, sin_cache_name=sin_cache_small_name) + cos_cache_small, sin_cache_small = self.make_rotary_embedding_caches( + cos_cache_name=cos_cache_small_name, sin_cache_name=sin_cache_small_name + ) # Determine which EPs don't support the If operator self.eps_without_if_support = ["dml"] @@ -1646,7 +1904,7 @@ def make_rotary_embedding_multi_cache(self, **kwargs): sin_cache_large_name=sin_cache_large_name, cos_cache_small_name=cos_cache_small_name, sin_cache_small_name=sin_cache_small_name, - small_cache_shape=cos_cache_large.shape + small_cache_shape=cos_cache_large.shape, ) return @@ -1670,29 +1928,56 @@ def 
make_rotary_embedding_multi_cache(self, **kwargs): if_name = f"{basename}/If" cos_cache_large_node = ir.node( - "Constant", [], outputs=[ - ir.Value(name=cos_cache_large_name, type=ir.TensorType(self.io_dtype), shape=ir.Shape(cos_cache_large.shape)) + "Constant", + [], + outputs=[ + ir.Value( + name=cos_cache_large_name, type=ir.TensorType(self.io_dtype), shape=ir.Shape(cos_cache_large.shape) + ) ], - name="/large/cos_cache/Constant", attributes=dict(value=ir.tensor(cos_cache_large))) + name="/large/cos_cache/Constant", + attributes=dict(value=ir.tensor(cos_cache_large)), + ) sin_cache_large_node = ir.node( - "Constant", [], outputs=[ - ir.Value(name=sin_cache_large_name, type=ir.TensorType(self.io_dtype), shape=ir.Shape(sin_cache_large.shape)) + "Constant", + [], + outputs=[ + ir.Value( + name=sin_cache_large_name, type=ir.TensorType(self.io_dtype), shape=ir.Shape(sin_cache_large.shape) + ) ], - name="/large/sin_cache/Constant", attributes=dict(value=ir.tensor(sin_cache_large))) + name="/large/sin_cache/Constant", + attributes=dict(value=ir.tensor(sin_cache_large)), + ) cos_cache_small_node = ir.node( - "Constant", [], outputs=[ - ir.Value(name=cos_cache_small_name, type=ir.TensorType(self.io_dtype), shape=ir.Shape(cos_cache_small.shape)) + "Constant", + [], + outputs=[ + ir.Value( + name=cos_cache_small_name, type=ir.TensorType(self.io_dtype), shape=ir.Shape(cos_cache_small.shape) + ) ], - name="/small/cos_cache/Constant", attributes=dict(value=ir.tensor(cos_cache_small))) + name="/small/cos_cache/Constant", + attributes=dict(value=ir.tensor(cos_cache_small)), + ) sin_cache_small_node = ir.node( - "Constant", [], outputs=[ - ir.Value(name=sin_cache_small_name, type=ir.TensorType(self.io_dtype), shape=ir.Shape(sin_cache_small.shape)) + "Constant", + [], + outputs=[ + ir.Value( + name=sin_cache_small_name, type=ir.TensorType(self.io_dtype), shape=ir.Shape(sin_cache_small.shape) + ) ], - name="/small/sin_cache/Constant", 
attributes=dict(value=ir.tensor(sin_cache_small))) + name="/small/sin_cache/Constant", + attributes=dict(value=ir.tensor(sin_cache_small)), + ) # Create single If node with multiple outputs self.make_node( - "If", inputs=[f"{greater_name}/output_0"], outputs=[cos_cache_name, sin_cache_name], name=if_name, + "If", + inputs=[f"{greater_name}/output_0"], + outputs=[cos_cache_name, sin_cache_name], + name=if_name, then_branch=ir.Graph( inputs=[], outputs=[ @@ -1722,7 +2007,9 @@ def make_rotary_embedding_multi_cache(self, **kwargs): self.make_value(sin_cache_name, self.io_dtype, shape=["max_sequence_length", "head_dim / 2"]) # This expansion of contrib-op can be updated / deprecated in future. - def _make_skip_simplified_layer_norm(self, basename, root_input, skip_input, weight_name, output_0, output_3, io_dtype, shape): + def _make_skip_simplified_layer_norm( + self, basename, root_input, skip_input, weight_name, output_0, output_3, io_dtype, shape + ): # root_input skip_input # | | # +------------------+ @@ -1733,13 +2020,17 @@ def _make_skip_simplified_layer_norm(self, basename, root_input, skip_input, wei make_add_name = f"{basename}/Add" output_3 = f"{basename}/Add/output_0" if output_3 is None else output_3 self.make_node("Add", inputs=[root_input, skip_input], outputs=[output_3], name=make_add_name) - self.make_value(output_3, io_dtype, shape=['batch_size', 'sequence_length', self.hidden_size]) + self.make_value(output_3, io_dtype, shape=["batch_size", "sequence_length", self.hidden_size]) make_simplified_layer_norm_name = f"{basename}/skip_simplified_layer_norm" - self._make_simplified_layer_norm(make_simplified_layer_norm_name, output_3, weight_name, output_0, io_dtype, shape=shape) + self._make_simplified_layer_norm( + make_simplified_layer_norm_name, output_3, weight_name, output_0, io_dtype, shape=shape + ) # This expansion contrib-op can be updated / deprecated in the future. 
- def _make_skip_layer_norm(self, basename, root_input, skip_input, weight_name, bias_name, output_0, output_3, io_dtype, shape): + def _make_skip_layer_norm( + self, basename, root_input, skip_input, weight_name, bias_name, output_0, output_3, io_dtype, shape + ): # root_input skip_input # | | # +------------------+ @@ -1750,7 +2041,7 @@ def _make_skip_layer_norm(self, basename, root_input, skip_input, weight_name, b output_3 = f"{basename}/Add/output_0" if output_3 is None else output_3 make_add_name = f"{basename}/Add" self.make_node("Add", inputs=[root_input, skip_input], outputs=[output_3], name=make_add_name) - self.make_value(output_3, io_dtype, shape=['batch_size', 'sequence_length', self.hidden_size]) + self.make_value(output_3, io_dtype, shape=["batch_size", "sequence_length", self.hidden_size]) make_layer_norm_name = f"{basename}/LayerNormalization" inputs = [output_3, weight_name, bias_name] @@ -1763,7 +2054,6 @@ def _make_skip_layer_norm(self, basename, root_input, skip_input, weight_name, b # This expansion contrib-op can be updated / deprecated in the future. 
def _make_simplified_layer_norm(self, basename, root_input, weight_name, output_0, io_dtype, shape): - # Cast (float32) - most calc happens in higher precision # | # +-------+-------+ @@ -1792,15 +2082,22 @@ def _make_simplified_layer_norm(self, basename, root_input, weight_name, output_ make_pow_name = f"{basename}/Pow" make_pow_inputs = [f"{make_cast_name}/output_0", "/model/constants/FLOAT/2"] - self.make_node("Pow", inputs=make_pow_inputs, outputs=[f"{make_pow_name}/output_0"], name=make_pow_name, domain="") + self.make_node( + "Pow", inputs=make_pow_inputs, outputs=[f"{make_pow_name}/output_0"], name=make_pow_name, domain="" + ) self.make_value(f"{make_pow_name}/output_0", ir.DataType.FLOAT, shape=shape) make_reducemean_name = f"{basename}/ReduceMean" make_reducemean_inputs = [f"{make_pow_name}/output_0", "/model/constants/INT64/[-1]"] - self.make_reduce_mean(make_reducemean_name, make_reducemean_inputs, ir.DataType.FLOAT, keepdims=True, shape=shape) + self.make_reduce_mean( + make_reducemean_name, make_reducemean_inputs, ir.DataType.FLOAT, keepdims=True, shape=shape + ) make_add_name = f"{basename}/Add" - make_add_inputs = [f"{make_reducemean_name}/output_0", f"/model/constants/FLOAT/{self.layernorm_attrs['epsilon']}"] + make_add_inputs = [ + f"{make_reducemean_name}/output_0", + f"/model/constants/FLOAT/{self.layernorm_attrs['epsilon']}", + ] self.make_add(make_add_name, make_add_inputs, ir.DataType.FLOAT, shape=shape) make_sqrt_name = f"{basename}/Sqrt" @@ -1824,7 +2121,6 @@ def _make_simplified_layer_norm(self, basename, root_input, weight_name, output_ self.make_node("Mul", inputs=make_mul_1_inputs, outputs=[output_0], name=make_mul_1_name) self.make_value(output_0, dtype=io_dtype, shape=shape) - def make_qk_norm(self, layer_id, attention): # Make subgraph to compute SimplifiedLayerNorm after Q and K MatMuls in attention: # @@ -1846,61 +2142,107 @@ def make_qk_norm(self, layer_id, attention): q_reshape_1_name = 
f"/model/layers.{layer_id}/attn/q_norm/Reshape_1" q_reshape_1_inputs = [self.attention_attrs["q_path"], f"/model/constants/INT64/[0, -1, {self.head_size}]"] q_reshape_1_output = f"{q_reshape_1_name}/output_0" - self.make_reshape(q_reshape_1_name, q_reshape_1_inputs, dtype=self.io_dtype, shape=['batch_size', 'sequence_length * num_attention_heads', self.head_size]) + self.make_reshape( + q_reshape_1_name, + q_reshape_1_inputs, + dtype=self.io_dtype, + shape=["batch_size", "sequence_length * num_attention_heads", self.head_size], + ) # Make Q LayerNorm q_layernorm_name = f"/model/layers.{layer_id}/attn/q_norm/SimplifiedLayerNormalization" q_weight_name = f"model.layers.{layer_id}.attn.q_norm.layernorm.weight" q_layernorm_output = f"{q_layernorm_name}/output_0" self.make_initializer( - attention.q_norm.weight + self.layernorm_attrs["add_offset"], - q_weight_name, - to=new_io_dtype + attention.q_norm.weight + self.layernorm_attrs["add_offset"], q_weight_name, to=new_io_dtype ) # Create Cast nodes for inputs and outputs if old_dtype != new_dtype q_layernorm_inputs = [q_reshape_1_output, q_weight_name] q_layernorm_outputs = [q_layernorm_output] if cast: - q_layernorm_inputs, q_layernorm_outputs = self.make_layernorm_casts(q_layernorm_name, q_layernorm_inputs, q_layernorm_outputs, old_io_dtype, new_io_dtype) + q_layernorm_inputs, q_layernorm_outputs = self.make_layernorm_casts( + q_layernorm_name, q_layernorm_inputs, q_layernorm_outputs, old_io_dtype, new_io_dtype + ) - self.make_node("SimplifiedLayerNormalization", inputs=q_layernorm_inputs, outputs=q_layernorm_outputs, name=q_layernorm_name, **layernorm_kwargs) - self.make_value(q_layernorm_outputs[0], dtype=new_io_dtype, shape=['batch_size', 'sequence_length * num_attention_heads', self.head_size]) + self.make_node( + "SimplifiedLayerNormalization", + inputs=q_layernorm_inputs, + outputs=q_layernorm_outputs, + name=q_layernorm_name, + **layernorm_kwargs, + ) + self.make_value( + q_layernorm_outputs[0], + 
dtype=new_io_dtype, + shape=["batch_size", "sequence_length * num_attention_heads", self.head_size], + ) # Reshape Q path after LayerNorm from Bx(SxN)xH to BxSxD q_reshape_2_name = f"/model/layers.{layer_id}/attn/q_norm/Reshape_2" - q_reshape_2_inputs = [q_layernorm_output, f"/model/constants/INT64/[0, -1, {self.num_attn_heads * self.head_size}]"] - self.make_reshape(q_reshape_2_name, q_reshape_2_inputs, dtype=self.io_dtype, shape=['batch_size', 'sequence_length', self.num_attn_heads * self.head_size]) + q_reshape_2_inputs = [ + q_layernorm_output, + f"/model/constants/INT64/[0, -1, {self.num_attn_heads * self.head_size}]", + ] + self.make_reshape( + q_reshape_2_name, + q_reshape_2_inputs, + dtype=self.io_dtype, + shape=["batch_size", "sequence_length", self.num_attn_heads * self.head_size], + ) # Reshape K MatMul from BxSxD to Bx(SxN)xH before LayerNorm k_reshape_1_name = f"/model/layers.{layer_id}/attn/k_norm/Reshape_1" k_reshape_1_inputs = [self.attention_attrs["k_path"], f"/model/constants/INT64/[0, -1, {self.head_size}]"] k_reshape_1_output = f"{k_reshape_1_name}/output_0" - self.make_reshape(k_reshape_1_name, k_reshape_1_inputs, dtype=self.io_dtype, shape=['batch_size', 'sequence_length * num_key_value_heads', self.head_size]) + self.make_reshape( + k_reshape_1_name, + k_reshape_1_inputs, + dtype=self.io_dtype, + shape=["batch_size", "sequence_length * num_key_value_heads", self.head_size], + ) # Make K LayerNorm k_layernorm_name = f"/model/layers.{layer_id}/attn/k_norm/SimplifiedLayerNormalization" k_weight_name = f"model.layers.{layer_id}.attn.k_norm.layernorm.weight" k_layernorm_output = f"{k_layernorm_name}/output_0" self.make_initializer( - attention.k_norm.weight + self.layernorm_attrs["add_offset"], - k_weight_name, - to=new_io_dtype + attention.k_norm.weight + self.layernorm_attrs["add_offset"], k_weight_name, to=new_io_dtype ) # Create Cast nodes for inputs and outputs if old_dtype != new_dtype k_layernorm_inputs = [k_reshape_1_output, k_weight_name] 
k_layernorm_outputs = [k_layernorm_output] if cast: - k_layernorm_inputs, k_layernorm_outputs = self.make_layernorm_casts(k_layernorm_name, k_layernorm_inputs, k_layernorm_outputs, old_io_dtype, new_io_dtype) + k_layernorm_inputs, k_layernorm_outputs = self.make_layernorm_casts( + k_layernorm_name, k_layernorm_inputs, k_layernorm_outputs, old_io_dtype, new_io_dtype + ) - self.make_node("SimplifiedLayerNormalization", inputs=k_layernorm_inputs, outputs=k_layernorm_outputs, name=k_layernorm_name, **layernorm_kwargs) - self.make_value(k_layernorm_outputs[0], dtype=new_io_dtype, shape=['batch_size', 'sequence_length * num_key_value_heads', self.head_size]) + self.make_node( + "SimplifiedLayerNormalization", + inputs=k_layernorm_inputs, + outputs=k_layernorm_outputs, + name=k_layernorm_name, + **layernorm_kwargs, + ) + self.make_value( + k_layernorm_outputs[0], + dtype=new_io_dtype, + shape=["batch_size", "sequence_length * num_key_value_heads", self.head_size], + ) # Reshape K path after LayerNorm from Bx(SxN)xH to BxSxD k_reshape_2_name = f"/model/layers.{layer_id}/attn/k_norm/Reshape_2" - k_reshape_2_inputs = [k_layernorm_output, f"/model/constants/INT64/[0, -1, {self.num_kv_heads * self.head_size}]"] - self.make_reshape(k_reshape_2_name, k_reshape_2_inputs, dtype=self.io_dtype, shape=['batch_size', 'sequence_length', self.num_kv_heads * self.head_size]) + k_reshape_2_inputs = [ + k_layernorm_output, + f"/model/constants/INT64/[0, -1, {self.num_kv_heads * self.head_size}]", + ] + self.make_reshape( + k_reshape_2_name, + k_reshape_2_inputs, + dtype=self.io_dtype, + shape=["batch_size", "sequence_length", self.num_kv_heads * self.head_size], + ) # Update q_path and k_path now self.attention_attrs["q_path"] = f"{q_reshape_2_name}/output_0" @@ -1990,10 +2332,21 @@ def make_repeat_kv(self, layer_id, root_input, past_kv, present_kv, **kwargs): # present_kv +------> Gather --> Unsqueeze -----+ reshape_1_name = f"{basename}/Reshape_1" reshape_1_inputs = [root_input, 
f"/model/constants/INT64/[0, 0, {self.num_kv_heads}, -1]"] - self.make_reshape(reshape_1_name, reshape_1_inputs, dtype=self.io_dtype, shape=['batch_size', 'sequence_length', self.num_kv_heads, self.head_size]) + self.make_reshape( + reshape_1_name, + reshape_1_inputs, + dtype=self.io_dtype, + shape=["batch_size", "sequence_length", self.num_kv_heads, self.head_size], + ) transpose_1_name = f"{basename}/Transpose_1" transpose_1_input = f"{reshape_1_name}/output_0" - self.make_transpose(transpose_1_name, transpose_1_input, dtype=self.io_dtype, shape=['batch_size', self.num_kv_heads, 'sequence_length', self.head_size], perm=[0,2,1,3]) + self.make_transpose( + transpose_1_name, + transpose_1_input, + dtype=self.io_dtype, + shape=["batch_size", self.num_kv_heads, "sequence_length", self.head_size], + perm=[0, 2, 1, 3], + ) concat_1_name = f"{basename}/Concat_1" concat_1_inputs = [past_kv, f"{transpose_1_name}/output_0"] self.make_node("Concat", inputs=concat_1_inputs, outputs=[present_kv], name=concat_1_name, axis=2) @@ -2025,14 +2378,28 @@ def make_repeat_kv(self, layer_id, root_input, past_kv, present_kv, **kwargs): unsqueeze_4_inputs = [f"{gather_4_name}/output_0", "/model/constants/INT64/[0]"] self.make_unsqueeze(unsqueeze_4_name, unsqueeze_4_inputs, dtype=ir.DataType.INT64, shape=[1]) concat_2_name = f"{basename}/Concat_2" - concat_2_inputs = [f"{unsqueeze_1_name}/output_0", f"{unsqueeze_2_name}/output_0", f"/model/constants/INT64/[{self.num_attn_heads // self.num_kv_heads}]", f"{unsqueeze_3_name}/output_0", f"{unsqueeze_4_name}/output_0"] + concat_2_inputs = [ + f"{unsqueeze_1_name}/output_0", + f"{unsqueeze_2_name}/output_0", + f"/model/constants/INT64/[{self.num_attn_heads // self.num_kv_heads}]", + f"{unsqueeze_3_name}/output_0", + f"{unsqueeze_4_name}/output_0", + ] self.make_concat(concat_2_name, concat_2_inputs, dtype=ir.DataType.INT64, shape=[5], axis=0) mul_1_name = f"{basename}/Mul_1" - mul_1_inputs = [f"{unsqueeze_2_name}/output_0", 
f"/model/constants/INT64/{self.num_attn_heads // self.num_kv_heads}"] + mul_1_inputs = [ + f"{unsqueeze_2_name}/output_0", + f"/model/constants/INT64/{self.num_attn_heads // self.num_kv_heads}", + ] self.make_mul(mul_1_name, mul_1_inputs, dtype=ir.DataType.INT64, shape=None) concat_3_name = f"{basename}/Concat_3" - concat_3_inputs = [f"{unsqueeze_1_name}/output_0", f"{mul_1_name}/output_0", f"{unsqueeze_3_name}/output_0", f"{unsqueeze_4_name}/output_0"] + concat_3_inputs = [ + f"{unsqueeze_1_name}/output_0", + f"{mul_1_name}/output_0", + f"{unsqueeze_3_name}/output_0", + f"{unsqueeze_4_name}/output_0", + ] self.make_concat(concat_3_name, concat_3_inputs, dtype=ir.DataType.INT64, shape=[4], axis=0) # Make the subgraph that follows the initial subgraph @@ -2049,7 +2416,13 @@ def make_repeat_kv(self, layer_id, root_input, past_kv, present_kv, **kwargs): self.make_shape(shape_2_name, f"{reshape_2_name}/output_0", shape=[1]) constant_shape_name = f"{basename}/ConstantOfShape" constant_shape_value = ir.tensor([1], dtype=ir.DataType.INT64) - self.make_constant_of_shape(constant_shape_name, f"{shape_2_name}/output_0", value=constant_shape_value, dtype=ir.DataType.INT64, shape=[5]) + self.make_constant_of_shape( + constant_shape_name, + f"{shape_2_name}/output_0", + value=constant_shape_value, + dtype=ir.DataType.INT64, + shape=[5], + ) mul_2_name = f"{basename}/Mul" mul_2_inputs = [f"{constant_shape_name}/output_0", "/model/constants/INT64/-1"] self.make_mul(mul_2_name, mul_2_inputs, dtype=ir.DataType.INT64, shape=[5]) @@ -2067,19 +2440,54 @@ def make_repeat_kv(self, layer_id, root_input, past_kv, present_kv, **kwargs): # Unsqueeze --> Expand --> Reshape --> Transpose --> Reshape unsqueeze_5_name = f"{basename}/Unsqueeze_5" unsqueeze_5_inputs = [present_kv, "/model/constants/INT64/[2]"] - self.make_unsqueeze(unsqueeze_5_name, unsqueeze_5_inputs, dtype=self.io_dtype, shape=['batch_size', self.num_kv_heads, 1, 'sequence_length', self.head_size]) + self.make_unsqueeze( + 
unsqueeze_5_name, + unsqueeze_5_inputs, + dtype=self.io_dtype, + shape=["batch_size", self.num_kv_heads, 1, "sequence_length", self.head_size], + ) expand_name = f"{basename}/Expand" expand_inputs = [f"{unsqueeze_5_name}/output_0", f"{where_name}/output_0"] - self.make_expand(expand_name, expand_inputs, dtype=self.io_dtype, shape=['batch_size', self.num_kv_heads, self.num_attn_heads // self.num_kv_heads, 'sequence_length', self.head_size]) + self.make_expand( + expand_name, + expand_inputs, + dtype=self.io_dtype, + shape=[ + "batch_size", + self.num_kv_heads, + self.num_attn_heads // self.num_kv_heads, + "sequence_length", + self.head_size, + ], + ) reshape_3_name = f"{basename}/Reshape_3" reshape_3_inputs = [f"{expand_name}/output_0", f"{concat_3_name}/output_0"] - self.make_reshape(reshape_3_name, reshape_3_inputs, dtype=self.io_dtype, shape=['batch_size', self.num_attn_heads, 'sequence_length', self.head_size]) + self.make_reshape( + reshape_3_name, + reshape_3_inputs, + dtype=self.io_dtype, + shape=["batch_size", self.num_attn_heads, "sequence_length", self.head_size], + ) transpose_2_name = f"{basename}/Transpose_2" transpose_2_input = f"{reshape_3_name}/output_0" - self.make_transpose(transpose_2_name, transpose_2_input, dtype=self.io_dtype, shape=['batch_size', 'sequence_length', self.num_attn_heads, self.head_size], perm=[0,2,1,3]) + self.make_transpose( + transpose_2_name, + transpose_2_input, + dtype=self.io_dtype, + shape=["batch_size", "sequence_length", self.num_attn_heads, self.head_size], + perm=[0, 2, 1, 3], + ) reshape_4_name = f"{basename}/Reshape_4" - reshape_4_inputs = [f"{transpose_2_name}/output_0", f"/model/constants/INT64/[0, 0, {self.num_attn_heads * self.head_size}]"] - self.make_reshape(reshape_4_name, reshape_4_inputs, dtype=self.io_dtype, shape=['batch_size', 'sequence_length', self.num_attn_heads * self.head_size]) + reshape_4_inputs = [ + f"{transpose_2_name}/output_0", + f"/model/constants/INT64/[0, 0, {self.num_attn_heads * 
self.head_size}]", + ] + self.make_reshape( + reshape_4_name, + reshape_4_inputs, + dtype=self.io_dtype, + shape=["batch_size", "sequence_length", self.num_attn_heads * self.head_size], + ) input_to_attention = f"{reshape_4_name}/output_0" return input_to_attention @@ -2090,33 +2498,63 @@ def make_attention_op(self, name, **kwargs): if op_type == "MultiHeadAttention": self.make_multi_head_attention(name, add_qk=f"{self.mask_attrs['mask_name']}/output_0", **kwargs) elif op_type == "GroupQueryAttention": - self.make_group_query_attention(name, seqlens_k=f"{self.mask_attrs['seqlens_k']}/output_0", total_seq_len=f"{self.mask_attrs['total_seq_len']}/output_0", **kwargs) + self.make_group_query_attention( + name, + seqlens_k=f"{self.mask_attrs['seqlens_k']}/output_0", + total_seq_len=f"{self.mask_attrs['total_seq_len']}/output_0", + **kwargs, + ) elif op_type == "SparseAttention": - self.make_sparse_attention(name, block_row_indices=self.mask_attrs['block_row_indices'], block_col_indices=self.mask_attrs['block_col_indices'], key_total_seq_lens=f"{self.mask_attrs['key_total_seq_lens']}/output_0", total_seq_len=f"{self.mask_attrs['total_seq_len']}/output_0", **kwargs) + self.make_sparse_attention( + name, + block_row_indices=self.mask_attrs["block_row_indices"], + block_col_indices=self.mask_attrs["block_col_indices"], + key_total_seq_lens=f"{self.mask_attrs['key_total_seq_lens']}/output_0", + total_seq_len=f"{self.mask_attrs['total_seq_len']}/output_0", + **kwargs, + ) else: raise NotImplementedError(f"The {op_type} op is not currently supported.") def make_multi_head_attention(self, name, **kwargs): inputs = [ - kwargs["q_path"], kwargs["k_path"], kwargs["v_path"], kwargs.get("bias", ""), - kwargs.get("attn_mask", ""), kwargs.get("add_qk", ""), - kwargs.get("past_k", ""), kwargs.get("past_v", ""), + kwargs["q_path"], + kwargs["k_path"], + kwargs["v_path"], + kwargs.get("bias", ""), + kwargs.get("attn_mask", ""), + kwargs.get("add_qk", ""), + kwargs.get("past_k", ""), + 
kwargs.get("past_v", ""), ] output = f"{name}/output_0" outputs = [output, kwargs.get("present_k", ""), kwargs.get("present_v", "")] self.make_node( - "MultiHeadAttention", inputs=inputs, outputs=outputs, name=name, domain="com.microsoft", - num_heads=self.num_attn_heads, scale=self.attention_attrs["scale"], + "MultiHeadAttention", + inputs=inputs, + outputs=outputs, + name=name, + domain="com.microsoft", + num_heads=self.num_attn_heads, + scale=self.attention_attrs["scale"], + ) + self.make_value( + output, self.io_dtype, shape=["batch_size", "sequence_length", self.head_size * self.num_attn_heads] ) - self.make_value(output, self.io_dtype, shape=['batch_size', 'sequence_length', self.head_size * self.num_attn_heads]) def make_group_query_attention(self, name, **kwargs): inputs = [ - kwargs["q_path"], kwargs["k_path"], kwargs["v_path"], - kwargs.get("past_k", ""), kwargs.get("past_v", ""), - kwargs.get("seqlens_k", ""), kwargs.get("total_seq_len", ""), - kwargs.get("cos_cache", ""), kwargs.get("sin_cache", ""), - "", "", # position_ids, attention_bias + kwargs["q_path"], + kwargs["k_path"], + kwargs["v_path"], + kwargs.get("past_k", ""), + kwargs.get("past_v", ""), + kwargs.get("seqlens_k", ""), + kwargs.get("total_seq_len", ""), + kwargs.get("cos_cache", ""), + kwargs.get("sin_cache", ""), + "", + "", # position_ids, attention_bias ] sinks = kwargs.get("sinks", "") # TODO: add to inputs list directly once ORT 1.23 is out (one-time exception) if sinks: @@ -2125,26 +2563,51 @@ def make_group_query_attention(self, name, **kwargs): output = f"{name}/output_0" outputs = [output, kwargs.get("present_k", ""), kwargs.get("present_v", "")] self.make_node( - "GroupQueryAttention", inputs=inputs, outputs=outputs, name=name, domain="com.microsoft", - num_heads=self.num_attn_heads, kv_num_heads=self.num_kv_heads, scale=self.attention_attrs["scale"], local_window_size=self.window_size, - softcap=self.attention_attrs["softcap"], 
do_rotary=self.attention_attrs["use_rope_in_attn"], rotary_interleaved=self.rope_attrs["interleaved"], + "GroupQueryAttention", + inputs=inputs, + outputs=outputs, + name=name, + domain="com.microsoft", + num_heads=self.num_attn_heads, + kv_num_heads=self.num_kv_heads, + scale=self.attention_attrs["scale"], + local_window_size=self.window_size, + softcap=self.attention_attrs["softcap"], + do_rotary=self.attention_attrs["use_rope_in_attn"], + rotary_interleaved=self.rope_attrs["interleaved"], + ) + self.make_value( + output, self.io_dtype, shape=["batch_size", "sequence_length", self.head_size * self.num_attn_heads] ) - self.make_value(output, self.io_dtype, shape=['batch_size', 'sequence_length', self.head_size * self.num_attn_heads]) def make_sparse_attention(self, name, **kwargs): inputs = [ - kwargs["q_path"], kwargs["k_path"], kwargs["v_path"], - kwargs.get("past_k"), kwargs.get("past_v"), - kwargs.get("block_row_indices"), kwargs.get("block_col_indices"), - kwargs.get("total_seq_len"), kwargs.get("key_total_seq_lens"), - kwargs.get("cos_cache", ""), kwargs.get("sin_cache", ""), + kwargs["q_path"], + kwargs["k_path"], + kwargs["v_path"], + kwargs.get("past_k"), + kwargs.get("past_v"), + kwargs.get("block_row_indices"), + kwargs.get("block_col_indices"), + kwargs.get("total_seq_len"), + kwargs.get("key_total_seq_lens"), + kwargs.get("cos_cache", ""), + kwargs.get("sin_cache", ""), ] output = f"{name}/output_0" outputs = [output, kwargs.get("present_k", ""), kwargs.get("present_v", "")] self.make_node( - "SparseAttention", inputs=inputs, outputs=outputs, name=name, domain="com.microsoft", - num_heads=self.num_attn_heads, kv_num_heads=self.num_kv_heads, scale=self.attention_attrs["scale"], sparse_block_size=self.attention_attrs["block_sparse"]["sparse_block_size"], - do_rotary=self.attention_attrs["use_rope_in_attn"], rotary_interleaved=self.rope_attrs["interleaved"], + "SparseAttention", + inputs=inputs, + outputs=outputs, + name=name, + domain="com.microsoft", + 
num_heads=self.num_attn_heads, + kv_num_heads=self.num_kv_heads, + scale=self.attention_attrs["scale"], + sparse_block_size=self.attention_attrs["block_sparse"]["sparse_block_size"], + do_rotary=self.attention_attrs["use_rope_in_attn"], + rotary_interleaved=self.rope_attrs["interleaved"], ) def make_attention(self, layer_id, attention, root_input, **kwargs): @@ -2184,18 +2647,24 @@ def make_attention(self, layer_id, attention, root_input, **kwargs): # Unpack attention weights if needed self.make_attention_unpacked(layer_id, attention, root_input, **kwargs) - + # Get dtype used for MatMul ops q_dtype = getattr(attention.q_proj, "weight", getattr(attention.q_proj, "bits", None)) k_dtype = getattr(attention.k_proj, "weight", getattr(attention.k_proj, "bits", None)) v_dtype = getattr(attention.v_proj, "weight", getattr(attention.v_proj, "bits", None)) - qkv_dtype_equal = getattr(q_dtype, "dtype", q_dtype) == getattr(k_dtype, "dtype", k_dtype) == getattr(v_dtype, "dtype", v_dtype) + qkv_dtype_equal = ( + getattr(q_dtype, "dtype", q_dtype) + == getattr(k_dtype, "dtype", k_dtype) + == getattr(v_dtype, "dtype", v_dtype) + ) # Make MatMul nodes if self.attention_attrs["use_packed_matmul"] and qkv_dtype_equal: # Combine 3 MatMuls into 1 packed MatMul qkv_matmul_basename = f"/model/layers.{layer_id}/attn/qkv_proj/MatMul" - qkv_matmul_name = self.make_packed_matmul(attention.q_proj, attention.k_proj, attention.v_proj, qkv_matmul_basename, root_input) + qkv_matmul_name = self.make_packed_matmul( + attention.q_proj, attention.k_proj, attention.v_proj, qkv_matmul_basename, root_input + ) self.attention_attrs["q_path"] = f"{qkv_matmul_name}/output_0" else: q_matmul_basename = f"/model/layers.{layer_id}/attn/q_proj/MatMul" @@ -2217,7 +2686,13 @@ def make_attention(self, layer_id, attention, root_input, **kwargs): if self.attention_attrs["use_packed_matmul"] and qkv_dtype_equal and any_bias_exists: # Combine 3 Adds into 1 packed Add qkv_add_name = 
f"/model/layers.{layer_id}/attn/qkv_proj/Add" - self.make_packed_add(attention.q_proj.bias, attention.k_proj.bias, attention.v_proj.bias, qkv_add_name, root_input=self.attention_attrs["q_path"]) + self.make_packed_add( + attention.q_proj.bias, + attention.k_proj.bias, + attention.v_proj.bias, + qkv_add_name, + root_input=self.attention_attrs["q_path"], + ) self.attention_attrs["q_path"] = f"{qkv_add_name}/output_0" else: if q_bias_exists: @@ -2243,10 +2718,18 @@ def make_attention(self, layer_id, attention, root_input, **kwargs): cos_cache_name, sin_cache_name = self.make_rotary_embedding_caches() else: q_rotary_name = f"/model/layers.{layer_id}/attn/q_rotary/RotaryEmbedding" - self.make_rotary_embedding(q_rotary_name, root_input=self.attention_attrs["q_path"], position_ids=kwargs.get("position_ids", "position_ids")) + self.make_rotary_embedding( + q_rotary_name, + root_input=self.attention_attrs["q_path"], + position_ids=kwargs.get("position_ids", "position_ids"), + ) self.attention_attrs["q_path"] = f"{q_rotary_name}/output_0" k_rotary_name = f"/model/layers.{layer_id}/attn/k_rotary/RotaryEmbedding" - self.make_rotary_embedding(k_rotary_name, root_input=self.attention_attrs["k_path"], position_ids=kwargs.get("position_ids", "position_ids")) + self.make_rotary_embedding( + k_rotary_name, + root_input=self.attention_attrs["k_path"], + position_ids=kwargs.get("position_ids", "position_ids"), + ) self.attention_attrs["k_path"] = f"{k_rotary_name}/output_0" # Make repeat KV nodes (Note: `repeat_kv` needs to be kept since GroupQueryAttention isn't supported for FP32 CUDA) @@ -2255,8 +2738,12 @@ def make_attention(self, layer_id, attention, root_input, **kwargs): present_k = f"present.{layer_id}.key" present_v = f"present.{layer_id}.value" if self.num_attn_heads != self.num_kv_heads and self.attention_attrs["op_type"] == "MultiHeadAttention": - self.attention_attrs["k_path"] = self.make_repeat_kv(layer_id, root_input=self.attention_attrs["k_path"], past_kv=past_k, 
present_kv=present_k) - self.attention_attrs["v_path"] = self.make_repeat_kv(layer_id, root_input=self.attention_attrs["v_path"], past_kv=past_v, present_kv=present_v) + self.attention_attrs["k_path"] = self.make_repeat_kv( + layer_id, root_input=self.attention_attrs["k_path"], past_kv=past_k, present_kv=present_k + ) + self.attention_attrs["v_path"] = self.make_repeat_kv( + layer_id, root_input=self.attention_attrs["v_path"], past_kv=past_v, present_kv=present_v + ) past_k, past_v, present_k, present_v = "", "", "", "" # Make sinks input @@ -2268,13 +2755,22 @@ def make_attention(self, layer_id, attention, root_input, **kwargs): # Make attention node (e.g. MultiHeadAttention, GroupQueryAttention, etc.) attn_name = f"/model/layers.{layer_id}/attn/{self.attention_attrs['op_type']}" self.make_attention_op( - attn_name, q_path=self.attention_attrs["q_path"], k_path=self.attention_attrs["k_path"], v_path=self.attention_attrs["v_path"], - past_k=past_k, past_v=past_v, present_k=present_k, present_v=present_v, - cos_cache=cos_cache_name, sin_cache=sin_cache_name, sinks=sinks_name, **kwargs, + attn_name, + q_path=self.attention_attrs["q_path"], + k_path=self.attention_attrs["k_path"], + v_path=self.attention_attrs["v_path"], + past_k=past_k, + past_v=past_v, + present_k=present_k, + present_v=present_v, + cos_cache=cos_cache_name, + sin_cache=sin_cache_name, + sinks=sinks_name, + **kwargs, ) # Make MatMul node (output projection weight node) - o_proj = 'o_proj' if hasattr(attention, 'o_proj') else 'dense' + o_proj = "o_proj" if hasattr(attention, "o_proj") else "dense" o_matmul_basename = f"/model/layers.{layer_id}/attn/o_proj/MatMul" o_weight = getattr(attention, o_proj) o_matmul_name = self.make_matmul(o_weight, o_matmul_basename, f"{attn_name}/output_0") @@ -2313,31 +2809,47 @@ def make_attention_unpacked_lora(self, layer_id, attention, qkv_linear, root_inp # Create Q/K/V base layers q_proj = torch.nn.Linear(in_features=q_size, out_features=q_size) - q_proj.weight = 
torch.nn.Parameter(qkv_linear.weight[: q_size, :], requires_grad=False) - q_proj.bias = None if qkv_linear.bias is None else torch.nn.Parameter(qkv_linear.bias[: q_size], requires_grad=False) + q_proj.weight = torch.nn.Parameter(qkv_linear.weight[:q_size, :], requires_grad=False) + q_proj.bias = ( + None if qkv_linear.bias is None else torch.nn.Parameter(qkv_linear.bias[:q_size], requires_grad=False) + ) k_proj = torch.nn.Linear(in_features=q_size, out_features=kv_size) k_proj.weight = torch.nn.Parameter(qkv_linear.weight[q_size : q_size + kv_size, :], requires_grad=False) - k_proj.bias = None if qkv_linear.bias is None else torch.nn.Parameter(qkv_linear.bias[q_size : q_size + kv_size], requires_grad=False) + k_proj.bias = ( + None + if qkv_linear.bias is None + else torch.nn.Parameter(qkv_linear.bias[q_size : q_size + kv_size], requires_grad=False) + ) v_proj = torch.nn.Linear(in_features=q_size, out_features=kv_size) v_proj.weight = torch.nn.Parameter(qkv_linear.weight[q_size + kv_size :, :], requires_grad=False) - v_proj.bias = None if qkv_linear.bias is None else torch.nn.Parameter(qkv_linear.bias[q_size + kv_size :], requires_grad=False) + v_proj.bias = ( + None + if qkv_linear.bias is None + else torch.nn.Parameter(qkv_linear.bias[q_size + kv_size :], requires_grad=False) + ) # Create Q/K/V lora_B layers lora_B = qkv_linear.lora_B.default q_lora_B = torch.nn.Linear(in_features=q_size, out_features=q_size) - q_lora_B.weight = torch.nn.Parameter(lora_B.weight[: q_size, :], requires_grad=False) - q_lora_B.bias = None if lora_B.bias is None else torch.nn.Parameter(lora_B.bias[: q_size], requires_grad=False) + q_lora_B.weight = torch.nn.Parameter(lora_B.weight[:q_size, :], requires_grad=False) + q_lora_B.bias = None if lora_B.bias is None else torch.nn.Parameter(lora_B.bias[:q_size], requires_grad=False) k_lora_B = torch.nn.Linear(in_features=q_size, out_features=kv_size) k_lora_B.weight = torch.nn.Parameter(lora_B.weight[q_size : q_size + kv_size, :], 
requires_grad=False) - k_lora_B.bias = None if lora_B.bias is None else torch.nn.Parameter(lora_B.bias[q_size : q_size + kv_size], requires_grad=False) + k_lora_B.bias = ( + None + if lora_B.bias is None + else torch.nn.Parameter(lora_B.bias[q_size : q_size + kv_size], requires_grad=False) + ) v_lora_B = torch.nn.Linear(in_features=q_size, out_features=kv_size) v_lora_B.weight = torch.nn.Parameter(lora_B.weight[q_size + kv_size :, :], requires_grad=False) - v_lora_B.bias = None if lora_B.bias is None else torch.nn.Parameter(lora_B.bias[q_size + kv_size :], requires_grad=False) + v_lora_B.bias = ( + None if lora_B.bias is None else torch.nn.Parameter(lora_B.bias[q_size + kv_size :], requires_grad=False) + ) # Create Q/K/V LoRA layers attention.q_proj = LoraLayer(q_proj) @@ -2360,16 +2872,28 @@ def make_attention_unpacked_regular(self, layer_id, attention, qkv_linear, root_ kv_size = self.num_kv_heads * self.head_size attention.q_proj = torch.nn.Linear(in_features=q_size, out_features=q_size) - attention.q_proj.weight = torch.nn.Parameter(qkv_linear.weight[: q_size, :], requires_grad=False) - attention.q_proj.bias = None if qkv_linear.bias is None else torch.nn.Parameter(qkv_linear.bias[: q_size], requires_grad=False) + attention.q_proj.weight = torch.nn.Parameter(qkv_linear.weight[:q_size, :], requires_grad=False) + attention.q_proj.bias = ( + None if qkv_linear.bias is None else torch.nn.Parameter(qkv_linear.bias[:q_size], requires_grad=False) + ) attention.k_proj = torch.nn.Linear(in_features=q_size, out_features=kv_size) - attention.k_proj.weight = torch.nn.Parameter(qkv_linear.weight[q_size : q_size + kv_size, :], requires_grad=False) - attention.k_proj.bias = None if qkv_linear.bias is None else torch.nn.Parameter(qkv_linear.bias[q_size : q_size + kv_size], requires_grad=False) + attention.k_proj.weight = torch.nn.Parameter( + qkv_linear.weight[q_size : q_size + kv_size, :], requires_grad=False + ) + attention.k_proj.bias = ( + None + if qkv_linear.bias is None 
+ else torch.nn.Parameter(qkv_linear.bias[q_size : q_size + kv_size], requires_grad=False) + ) attention.v_proj = torch.nn.Linear(in_features=q_size, out_features=kv_size) attention.v_proj.weight = torch.nn.Parameter(qkv_linear.weight[q_size + kv_size :, :], requires_grad=False) - attention.v_proj.bias = None if qkv_linear.bias is None else torch.nn.Parameter(qkv_linear.bias[q_size + kv_size :], requires_grad=False) + attention.v_proj.bias = ( + None + if qkv_linear.bias is None + else torch.nn.Parameter(qkv_linear.bias[q_size + kv_size :], requires_grad=False) + ) def make_mlp(self, layer_id, mlp, root_input): # Unpack MLP weights if needed @@ -2380,7 +2904,7 @@ def make_mlp(self, layer_id, mlp, root_input): elif self.mlp_attrs["use_fc"]: self.make_mlp_fc(layer_id, mlp, root_input) else: - raise NotImplementedError(f"The MLP layer type is not set.") + raise NotImplementedError("The MLP layer type is not set.") def make_mlp_unpacked(self, layer_id, mlp, root_input): gate_up_linear = getattr(mlp, "gate_up_proj", None) or getattr(mlp, "dense_h_to_4h", None) @@ -2403,23 +2927,39 @@ def make_mlp_unpacked_lora(self, layer_id, mlp, gate_up_linear, root_input): # Create GateProj/UpProj base layers gate_proj = torch.nn.Linear(in_features=self.hidden_size, out_features=self.intermediate_size) - gate_proj.weight = torch.nn.Parameter(gate_up_linear.weight[ : self.intermediate_size, :], requires_grad=False) - gate_proj.bias = None if gate_up_linear.bias is None else torch.nn.Parameter(gate_up_linear.bias[: self.intermediate_size], requires_grad=False) + gate_proj.weight = torch.nn.Parameter(gate_up_linear.weight[: self.intermediate_size, :], requires_grad=False) + gate_proj.bias = ( + None + if gate_up_linear.bias is None + else torch.nn.Parameter(gate_up_linear.bias[: self.intermediate_size], requires_grad=False) + ) up_proj = torch.nn.Linear(in_features=self.hidden_size, out_features=self.intermediate_size) up_proj.weight = 
torch.nn.Parameter(gate_up_linear.weight[self.intermediate_size :, :], requires_grad=False) - up_proj.bias = None if gate_up_linear.bias is None else torch.nn.Parameter(gate_up_linear.bias[self.intermediate_size :], requires_grad=False) + up_proj.bias = ( + None + if gate_up_linear.bias is None + else torch.nn.Parameter(gate_up_linear.bias[self.intermediate_size :], requires_grad=False) + ) # Create GateProj/UpProj lora_B layers lora_B = gate_up_linear.lora_B.default gate_proj_lora_B = torch.nn.Linear(in_features=self.hidden_size, out_features=self.intermediate_size) - gate_proj_lora_B.weight = torch.nn.Parameter(lora_B.weight[ : self.intermediate_size, :], requires_grad=False) - gate_proj_lora_B.bias = None if lora_B.bias is None else torch.nn.Parameter(lora_B.bias[: self.intermediate_size], requires_grad=False) + gate_proj_lora_B.weight = torch.nn.Parameter(lora_B.weight[: self.intermediate_size, :], requires_grad=False) + gate_proj_lora_B.bias = ( + None + if lora_B.bias is None + else torch.nn.Parameter(lora_B.bias[: self.intermediate_size], requires_grad=False) + ) up_proj_lora_B = torch.nn.Linear(in_features=self.hidden_size, out_features=self.intermediate_size) up_proj_lora_B.weight = torch.nn.Parameter(lora_B.weight[self.intermediate_size :, :], requires_grad=False) - up_proj_lora_B.bias = None if lora_B.bias is None else torch.nn.Parameter(lora_B.bias[self.intermediate_size :], requires_grad=False) + up_proj_lora_B.bias = ( + None + if lora_B.bias is None + else torch.nn.Parameter(lora_B.bias[self.intermediate_size :], requires_grad=False) + ) # Create GateProj/UpProj LoRA layers mlp.gate_proj = LoraLayer(gate_proj) @@ -2434,12 +2974,22 @@ def make_mlp_unpacked_lora(self, layer_id, mlp, gate_up_linear, root_input): def make_mlp_unpacked_regular(self, layer_id, mlp, gate_up_linear, root_input): mlp.gate_proj = torch.nn.Linear(in_features=self.hidden_size, out_features=self.intermediate_size) - mlp.gate_proj.weight = 
torch.nn.Parameter(gate_up_linear.weight[: self.intermediate_size, :], requires_grad=False) - mlp.gate_proj.bias = None if gate_up_linear.bias is None else torch.nn.Parameter(gate_up_linear.bias[: self.intermediate_size], requires_grad=False) + mlp.gate_proj.weight = torch.nn.Parameter( + gate_up_linear.weight[: self.intermediate_size, :], requires_grad=False + ) + mlp.gate_proj.bias = ( + None + if gate_up_linear.bias is None + else torch.nn.Parameter(gate_up_linear.bias[: self.intermediate_size], requires_grad=False) + ) mlp.up_proj = torch.nn.Linear(in_features=self.hidden_size, out_features=self.intermediate_size) mlp.up_proj.weight = torch.nn.Parameter(gate_up_linear.weight[self.intermediate_size :, :]) - mlp.up_proj.bias = None if gate_up_linear.bias is None else torch.nn.Parameter(gate_up_linear.bias[self.intermediate_size :], requires_grad=False) + mlp.up_proj.bias = ( + None + if gate_up_linear.bias is None + else torch.nn.Parameter(gate_up_linear.bias[self.intermediate_size :], requires_grad=False) + ) def make_mlp_proj(self, layer_id, mlp, root_input): # Make nodes for the MLP subgraph @@ -2490,7 +3040,9 @@ def make_mlp_proj(self, layer_id, mlp, root_input): # Make Mul node after activation mul_name = f"/model/layers.{layer_id}/mlp/Mul" mul_inputs = [f"{act_fn_name}/output_0", f"{up_name}/output_0"] - self.make_mul(mul_name, mul_inputs, dtype=self.io_dtype, shape=["batch_size", "sequence_length", self.intermediate_size]) + self.make_mul( + mul_name, mul_inputs, dtype=self.io_dtype, shape=["batch_size", "sequence_length", self.intermediate_size] + ) # Make output MatMul node down_matmul_basename = f"/model/layers.{layer_id}/mlp/down_proj/MatMul" @@ -2559,16 +3111,26 @@ def make_moe_op(self, name, **kwargs): def make_base_moe_op(self, name, **kwargs): inputs = [ - kwargs["root_input"], kwargs["router_probs"], - kwargs["weight1"], kwargs.get("bias1", ""), - kwargs["weight2"], kwargs.get("bias2", ""), - kwargs.get("weight3", ""), kwargs.get("bias3", ""), + 
kwargs["root_input"], + kwargs["router_probs"], + kwargs["weight1"], + kwargs.get("bias1", ""), + kwargs["weight2"], + kwargs.get("bias2", ""), + kwargs.get("weight3", ""), + kwargs.get("bias3", ""), ] output = f"{name}/output_0" - extra_kwargs = {"swiglu_limit": self.moe_attrs["swiglu_limit"]} if self.moe_attrs["swiglu_limit"] is not None else {} + extra_kwargs = ( + {"swiglu_limit": self.moe_attrs["swiglu_limit"]} if self.moe_attrs["swiglu_limit"] is not None else {} + ) self.make_node( - "MoE", inputs=inputs, outputs=[output], name=name, domain="com.microsoft", + "MoE", + inputs=inputs, + outputs=[output], + name=name, + domain="com.microsoft", activation_alpha=self.moe_attrs["activation_alpha"], activation_beta=self.moe_attrs["activation_beta"], activation_type=self.moe_attrs["activation_type"], @@ -2578,20 +3140,33 @@ def make_base_moe_op(self, name, **kwargs): use_sparse_mixer=self.moe_attrs["use_sparse_mixer"], **extra_kwargs, ) - self.make_value(output, self.io_dtype, shape=['batch_size', 'sequence_length', self.hidden_size]) + self.make_value(output, self.io_dtype, shape=["batch_size", "sequence_length", self.hidden_size]) def make_qmoe_op(self, name, **kwargs): inputs = [ - kwargs["root_input"], kwargs["router_probs"], - kwargs["weight1"], kwargs["scales1"], kwargs.get("bias1", ""), - kwargs["weight2"], kwargs["scales2"], kwargs.get("bias2", ""), - kwargs.get("weight3", ""), kwargs.get("scales3", ""), kwargs.get("bias3", ""), + kwargs["root_input"], + kwargs["router_probs"], + kwargs["weight1"], + kwargs["scales1"], + kwargs.get("bias1", ""), + kwargs["weight2"], + kwargs["scales2"], + kwargs.get("bias2", ""), + kwargs.get("weight3", ""), + kwargs.get("scales3", ""), + kwargs.get("bias3", ""), ] output = f"{name}/output_0" - extra_kwargs = {"swiglu_limit": self.moe_attrs["swiglu_limit"]} if self.moe_attrs["swiglu_limit"] is not None else {} + extra_kwargs = ( + {"swiglu_limit": self.moe_attrs["swiglu_limit"]} if self.moe_attrs["swiglu_limit"] is not None 
else {} + ) self.make_node( - "QMoE", inputs=inputs, outputs=[output], name=name, domain="com.microsoft", + "QMoE", + inputs=inputs, + outputs=[output], + name=name, + domain="com.microsoft", activation_alpha=self.moe_attrs["activation_alpha"], activation_beta=self.moe_attrs["activation_beta"], activation_type=self.moe_attrs["activation_type"], @@ -2603,7 +3178,7 @@ def make_qmoe_op(self, name, **kwargs): block_size=self.moe_attrs["block_size"], **extra_kwargs, ) - self.make_value(output, self.io_dtype, shape=['batch_size', 'sequence_length', self.hidden_size]) + self.make_value(output, self.io_dtype, shape=["batch_size", "sequence_length", self.hidden_size]) def make_qmoe_weights(self, weights): dtype = torch.quint4x2 if self.moe_attrs["expert_weight_bits"] == 4 else torch.int8 @@ -2629,19 +3204,25 @@ def make_qmoe_weights(self, weights): try: import tensorrt_llm - _, qweight, scales = ( - torch.ops.trtllm._symmetric_quantize_last_axis_of_batched_matrix(weights.detach().cpu().contiguous(), dtype) + _, qweight, scales = torch.ops.trtllm._symmetric_quantize_last_axis_of_batched_matrix( + weights.detach().cpu().contiguous(), dtype ) unsuccessful = False except ImportError: - print("WARNING: TensorRT-LLM is needed to use torch.ops.trtllm._symmetric_quantize_last_axis_of_batched_matrix().") + print( + "WARNING: TensorRT-LLM is needed to use torch.ops.trtllm._symmetric_quantize_last_axis_of_batched_matrix()." + ) except RuntimeError as r: - print("WARNING: TensorRT-LLM failed to run torch.ops.trtllm._symmetric_quantize_last_axis_of_batched_matrix() successfully.") + print( + "WARNING: TensorRT-LLM failed to run torch.ops.trtllm._symmetric_quantize_last_axis_of_batched_matrix() successfully." + ) err = str(r) - print(err[ : err.find('\n1')]) # omit internal traceback inside TensorRT-LLM + print(err[: err.find("\n1")]) # omit internal traceback inside TensorRT-LLM finally: if unsuccessful: - raise RuntimeError("Failed to quantize MoE weights with TensorRT-LLM. 
Please ensure TensorRT-LLM installs and runs successfully in your environment.") + raise RuntimeError( + "Failed to quantize MoE weights with TensorRT-LLM. Please ensure TensorRT-LLM installs and runs successfully in your environment." + ) return qweight, scales.to(torch.float16) @@ -2675,7 +3256,9 @@ def _symmetric_blockwise_quantize(self, weights, block_size): # Avoid division by zero - set minimum scale min_scale = 1e-8 - scales = torch.where(scales < min_scale, torch.tensor(min_scale, dtype=scales.dtype, device=scales.device), scales) + scales = torch.where( + scales < min_scale, torch.tensor(min_scale, dtype=scales.dtype, device=scales.device), scales + ) # Expand scales for broadcasting: [..., num_blocks, 1] scales_expanded = scales.unsqueeze(-1) @@ -2749,13 +3332,35 @@ def make_block_sparse_moe(self, layer_id, bsm, root_input): shape_name = f"{gate_ops_base}/Shape" self.make_shape(shape_name, f"{gate_name}/output_0", shape=[3]) gather_name = f"{gate_ops_base}/Gather" - self.make_gather(gather_name, [f"{shape_name}/output_0", "/model/constants/INT64/2"], dtype=ir.DataType.INT64, shape=[], axis=0) + self.make_gather( + gather_name, + [f"{shape_name}/output_0", "/model/constants/INT64/2"], + dtype=ir.DataType.INT64, + shape=[], + axis=0, + ) unsqueeze_name = f"{gate_ops_base}/Unsqueeze" - self.make_unsqueeze(unsqueeze_name, [f"{gather_name}/output_0", "/model/constants/INT64/[0]"], dtype=ir.DataType.INT64, shape=[1]) + self.make_unsqueeze( + unsqueeze_name, + [f"{gather_name}/output_0", "/model/constants/INT64/[0]"], + dtype=ir.DataType.INT64, + shape=[1], + ) concat_name = f"{gate_ops_base}/Concat" - self.make_concat(concat_name, ["/model/constants/INT64/[-1]", f"{unsqueeze_name}/output_0"], dtype=ir.DataType.INT64, shape=[2], axis=0) + self.make_concat( + concat_name, + ["/model/constants/INT64/[-1]", f"{unsqueeze_name}/output_0"], + dtype=ir.DataType.INT64, + shape=[2], + axis=0, + ) gate_reshape_name = f"{gate_ops_base}/Reshape" - 
self.make_reshape(gate_reshape_name, [f"{gate_name}/output_0", f"{concat_name}/output_0"], dtype=self.io_dtype, shape=['num_rows', self.moe_attrs["num_experts"]]) + self.make_reshape( + gate_reshape_name, + [f"{gate_name}/output_0", f"{concat_name}/output_0"], + dtype=self.io_dtype, + shape=["num_rows", self.moe_attrs["num_experts"]], + ) w1_list = [] w2_list = [] @@ -2804,10 +3409,15 @@ def make_moe_initializer(w_list, moe_expert_name, dtype): make_moe_initializer(w3_scale_list, moe_expert_scales_3_name, self.io_dtype) self.make_moe_op( - moe_name, root_input=root_input, router_probs=f"{gate_reshape_name}/output_0", - weight1=moe_expert_weight_1_name, scales1=moe_expert_scales_1_name, - weight2=moe_expert_weight_2_name, scales2=moe_expert_scales_2_name, - weight3=moe_expert_weight_3_name, scales3=moe_expert_scales_3_name, + moe_name, + root_input=root_input, + router_probs=f"{gate_reshape_name}/output_0", + weight1=moe_expert_weight_1_name, + scales1=moe_expert_scales_1_name, + weight2=moe_expert_weight_2_name, + scales2=moe_expert_scales_2_name, + weight3=moe_expert_weight_3_name, + scales3=moe_expert_scales_3_name, ) # Assign output 0 of previous MoE as root input to next SkipLayerNorm @@ -2824,11 +3434,18 @@ def make_activation_with_mul(self, layer_id, root_input, activation, domain): act_name = f"/model/layers.{layer_id}/mlp/act_fn/{activation}" act_output = f"{act_name}/output_0" self.make_node(activation, inputs=[root_input], outputs=[act_output], name=act_name, domain=domain) - self.make_value(act_output, dtype=self.io_dtype, shape=["batch_size", "sequence_length", self.intermediate_size]) + self.make_value( + act_output, dtype=self.io_dtype, shape=["batch_size", "sequence_length", self.intermediate_size] + ) mul_act_name = f"/model/layers.{layer_id}/mlp/act_fn/Mul" mul_act_inputs = [root_input, act_output] - self.make_mul(mul_act_name, mul_act_inputs, dtype=self.io_dtype, shape=["batch_size", "sequence_length", self.intermediate_size]) + self.make_mul( + 
mul_act_name, + mul_act_inputs, + dtype=self.io_dtype, + shape=["batch_size", "sequence_length", self.intermediate_size], + ) return mul_act_name @@ -2848,7 +3465,7 @@ def make_gelu(self, layer_id, root_input, activation): else: self.make_node(activation, inputs=[root_input], outputs=[output], name=gelu_name, domain="com.microsoft") - self.make_value(output, self.io_dtype, shape=['batch_size', 'sequence_length', self.intermediate_size]) + self.make_value(output, self.io_dtype, shape=["batch_size", "sequence_length", self.intermediate_size]) return gelu_name @@ -2856,7 +3473,7 @@ def make_relu(self, layer_id, root_input, activation): relu_name = f"/model/layers.{layer_id}/mlp/act_fn/{activation}" output = f"{relu_name}/output_0" self.make_node(activation, inputs=[root_input], outputs=[output], name=relu_name, domain="") - self.make_value(output, self.io_dtype, shape=['batch_size', 'sequence_length', self.intermediate_size]) + self.make_value(output, self.io_dtype, shape=["batch_size", "sequence_length", self.intermediate_size]) return relu_name def make_relu_squared(self, layer_id, root_input, activation): @@ -2865,7 +3482,9 @@ def make_relu_squared(self, layer_id, root_input, activation): pow_name = f"{basename}/pow" pow_inputs = [f"{relu_name}/output_0", "/model/constants/INT32/[2]"] self.make_node("Pow", inputs=pow_inputs, outputs=[f"{pow_name}/output_0"], name=pow_name, domain="") - self.make_value(f"{pow_name}/output_0", self.io_dtype, shape=['batch_size', 'sequence_length', self.intermediate_size]) + self.make_value( + f"{pow_name}/output_0", self.io_dtype, shape=["batch_size", "sequence_length", self.intermediate_size] + ) return pow_name def make_activation(self, layer_id, root_input): @@ -2904,15 +3523,20 @@ def make_lm_head(self, lm_head): if bias_exists: add_name = "/lm_head/Add" - self.make_add_bias(lm_head.bias, add_name, root_input=f"{lm_name}/output_0", logits=not any(exists_checks[1:])) + self.make_add_bias( + lm_head.bias, add_name, 
root_input=f"{lm_name}/output_0", logits=not any(exists_checks[1:]) + ) lm_name = add_name if scale_exists: mul_name = "/lm_head/Mul" - mul_inputs = [f"{lm_name}/output_0", f"/model/constants/{self.to_str_dtype(self.io_dtype)}/{self.lm_head_attrs['scale']}"] + mul_inputs = [ + f"{lm_name}/output_0", + f"/model/constants/{self.to_str_dtype(self.io_dtype)}/{self.lm_head_attrs['scale']}", + ] mul_output = "logits" if not any(exists_checks[2:]) else f"{mul_name}/output_0" - self.make_node('Mul', inputs=mul_inputs, outputs=[mul_output], name=mul_name) - self.make_value(mul_output, self.io_dtype, shape=['batch_size', 'sequence_length', self.vocab_size]) + self.make_node("Mul", inputs=mul_inputs, outputs=[mul_output], name=mul_name) + self.make_value(mul_output, self.io_dtype, shape=["batch_size", "sequence_length", self.vocab_size]) lm_name = mul_name if mask_exists: @@ -2921,41 +3545,78 @@ def make_lm_head(self, lm_head): self.make_initializer(self.lm_head_attrs["mask"], logits_mask_name) where_name = "/lm_head/Where" - where_inputs = [logits_mask_name, f"/model/constants/{self.to_str_dtype(self.io_dtype)}/{torch.finfo(to_torch_dtype(self.io_dtype)).min}", f"{lm_name}/output_0"] + where_inputs = [ + logits_mask_name, + f"/model/constants/{self.to_str_dtype(self.io_dtype)}/{torch.finfo(to_torch_dtype(self.io_dtype)).min}", + f"{lm_name}/output_0", + ] where_output = "logits" if not any(exists_checks[3:]) else f"{where_name}/output_0" - self.make_node('Where', inputs=where_inputs, outputs=[where_output], name=where_name) - self.make_value(where_output, self.io_dtype, shape=['batch_size', 'sequence_length', self.vocab_size]) + self.make_node("Where", inputs=where_inputs, outputs=[where_output], name=where_name) + self.make_value(where_output, self.io_dtype, shape=["batch_size", "sequence_length", self.vocab_size]) lm_name = where_name if softcap_exists: # Add final logit softcapping (Div --> Tanh --> Mul) div_name = "/lm_head/softcap/Div" - div_inputs = 
[f"{lm_name}/output_0", f"/model/constants/{self.to_str_dtype(self.io_dtype)}/{self.lm_head_attrs['softcap']}"] - self.make_div(div_name, div_inputs, dtype=self.io_dtype, shape=['batch_size', 'sequence_length', self.vocab_size]) + div_inputs = [ + f"{lm_name}/output_0", + f"/model/constants/{self.to_str_dtype(self.io_dtype)}/{self.lm_head_attrs['softcap']}", + ] + self.make_div( + div_name, div_inputs, dtype=self.io_dtype, shape=["batch_size", "sequence_length", self.vocab_size] + ) tanh_name = "/lm_head/softcap/Tanh" - self.make_tanh(tanh_name, f"{div_name}/output_0", dtype=self.io_dtype, shape=['batch_size', 'sequence_length', self.vocab_size]) + self.make_tanh( + tanh_name, + f"{div_name}/output_0", + dtype=self.io_dtype, + shape=["batch_size", "sequence_length", self.vocab_size], + ) mul_name = "/lm_head/softcap/Mul" - mul_inputs = [f"{tanh_name}/output_0", f"/model/constants/{self.to_str_dtype(self.io_dtype)}/{self.lm_head_attrs['softcap']}"] + mul_inputs = [ + f"{tanh_name}/output_0", + f"/model/constants/{self.to_str_dtype(self.io_dtype)}/{self.lm_head_attrs['softcap']}", + ] mul_output = "logits" if not any(exists_checks[4:]) else f"{mul_name}/output_0" - self.make_node('Mul', inputs=mul_inputs, outputs=[mul_output], name=mul_name) - self.make_value(mul_output, self.io_dtype, shape=['batch_size', 'sequence_length', self.vocab_size]) + self.make_node("Mul", inputs=mul_inputs, outputs=[mul_output], name=mul_name) + self.make_value(mul_output, self.io_dtype, shape=["batch_size", "sequence_length", self.vocab_size]) lm_name = mul_name if cast_exists: # Add final cast from io_dtype to logits_dtype cast_name = "/lm_head/Cast" cast_output = "logits" - self.make_node('Cast', inputs=[f"{lm_name}/output_0"], outputs=[cast_output], name=cast_name, to=self.output_types['logits']) - self.make_value(cast_output, self.output_types['logits'], shape=['batch_size', 'sequence_length', self.vocab_size]) + self.make_node( + "Cast", + inputs=[f"{lm_name}/output_0"], + 
outputs=[cast_output], + name=cast_name, + to=self.output_types["logits"], + ) + self.make_value( + cast_output, self.output_types["logits"], shape=["batch_size", "sequence_length", self.vocab_size] + ) def make_layer(self, layer_id, layer): # Each LLM decoder layer is typically defined as: # input_layernorm --> attention --> output_layernorm --> MLP - self.make_layernorm(layer_id, layer.input_layernorm, skip=not self.layernorm_attrs["first_layernorm"], simple=self.layernorm_attrs["simple"], location="input") + self.make_layernorm( + layer_id, + layer.input_layernorm, + skip=not self.layernorm_attrs["first_layernorm"], + simple=self.layernorm_attrs["simple"], + location="input", + ) self.make_attention(layer_id, layer.self_attn, root_input=self.layernorm_attrs["output_0"]) - self.make_layernorm(layer_id, layer.post_attention_layernorm, skip=True, simple=self.layernorm_attrs["simple"], location="post_attention") + self.make_layernorm( + layer_id, + layer.post_attention_layernorm, + skip=True, + simple=self.layernorm_attrs["simple"], + location="post_attention", + ) self.make_mlp(layer_id, layer.mlp, root_input=self.layernorm_attrs["output_0"]) self.layernorm_attrs["first_layernorm"] = False @@ -2977,7 +3638,16 @@ def make_model(self, input_path): from gguf_model import GGUFModel except ImportError: from onnxruntime_genai.models.gguf_model import GGUFModel - model = GGUFModel.from_pretrained(self.model_type, input_path, self.head_size, self.hidden_size, self.intermediate_size, self.num_attn_heads, self.num_kv_heads, self.vocab_size) + model = GGUFModel.from_pretrained( + self.model_type, + input_path, + self.head_size, + self.hidden_size, + self.intermediate_size, + self.num_attn_heads, + self.num_kv_heads, + self.vocab_size, + ) self.layernorm_attrs["add_offset"] = 0 # add offset already done for GGUF models elif self.quant_type is not None: @@ -2988,21 +3658,40 @@ def make_model(self, input_path): from onnxruntime_genai.models.quantized_model import QuantModel 
q_size = self.num_attn_heads * self.head_size kv_size = self.num_kv_heads * self.head_size - model = QuantModel.from_pretrained(self.quant_type, input_path=input_path, quant_attrs=self.quant_attrs, q_size=q_size, kv_size=kv_size, intermediate_size=self.intermediate_size, num_layers=self.num_layers) + model = QuantModel.from_pretrained( + self.quant_type, + input_path=input_path, + quant_attrs=self.quant_attrs, + q_size=q_size, + kv_size=kv_size, + intermediate_size=self.intermediate_size, + num_layers=self.num_layers, + ) else: # Load PyTorch model extra_kwargs = {"num_hidden_layers": self.num_layers} if "num_hidden_layers" in self.extra_options else {} - model = AutoModelForCausalLM.from_pretrained(self.model_name_or_path, cache_dir=self.cache_dir, token=self.hf_token, trust_remote_code=self.hf_remote, **extra_kwargs) + model = AutoModelForCausalLM.from_pretrained( + self.model_name_or_path, + cache_dir=self.cache_dir, + token=self.hf_token, + trust_remote_code=self.hf_remote, + **extra_kwargs, + ) if "adapter_path" in self.extra_options: from peft import PeftModel - model = PeftModel.from_pretrained(model, self.extra_options["adapter_path"], cache_dir=self.cache_dir, token=self.hf_token) + + model = PeftModel.from_pretrained( + model, self.extra_options["adapter_path"], cache_dir=self.cache_dir, token=self.hf_token + ) # Loop through model and map each module to ONNX/ORT ops self.layer_id = 0 for module in model.modules(): - if (isinstance(module, torch.nn.Embedding) and module.weight.shape[0] == self.vocab_size) or (hasattr(model, "embedding") and module == model.embedding): + if (isinstance(module, torch.nn.Embedding) and module.weight.shape[0] == self.vocab_size) or ( + hasattr(model, "embedding") and module == model.embedding + ): # Checks (Hugging Face logic) or (GGUF logic) if not self.exclude_embeds: # Embedding layer @@ -3013,7 +3702,9 @@ def make_model(self, input_path): self.layernorm_attrs["root_input"] = "inputs_embeds" 
self.layernorm_attrs["skip_input"] = "inputs_embeds" - elif (module.__class__.__name__.endswith("DecoderLayer") or module.__class__.__name__.endswith("GLMBlock")) and self.layer_id < self.num_layers: + elif ( + module.__class__.__name__.endswith("DecoderLayer") or module.__class__.__name__.endswith("GLMBlock") + ) and self.layer_id < self.num_layers: # Each decoder layer of model print(f"Reading decoder layer {self.layer_id}") self.make_layer(self.layer_id, module) @@ -3022,9 +3713,13 @@ def make_model(self, input_path): elif self.layer_id == self.num_layers and self.has_final_norm(module, model): # SkipLayerNorm after last decoder layer (MatMul --> SkipLayerNorm) print("Reading final norm") - self.make_layernorm(self.layer_id, module, skip=True, simple=self.layernorm_attrs["simple"], location="final_norm") + self.make_layernorm( + self.layer_id, module, skip=True, simple=self.layernorm_attrs["simple"], location="final_norm" + ) - elif (isinstance(module, torch.nn.Linear) and module.out_features == self.vocab_size) or (hasattr(model, "lm_head") and module == model.lm_head): + elif (isinstance(module, torch.nn.Linear) and module.out_features == self.vocab_size) or ( + hasattr(model, "lm_head") and module == model.lm_head + ): # Checks (Hugging Face logic) or (GGUF logic) if not self.exclude_lm_head: # Language modeling head (SkipLayerNorm --> logits) @@ -3049,9 +3744,23 @@ def has_final_norm(self, module, orig_model): # hf_transformer_final_layernorm: for ChatGLM-3 # hf_language_model_norm: for Gemma-3 multimodal (4B, 12B, 27B) hf_norm = hasattr(model, "model") and hasattr(model.model, "norm") and module == model.model.norm - hf_final_layernorm = hasattr(model, "model") and hasattr(model.model, "final_layernorm") and module == model.model.final_layernorm - hf_transformer_final_layernorm = hasattr(model, "transformer") and hasattr(model.transformer, "encoder") and hasattr(model.transformer.encoder, "final_layernorm") and module == 
model.transformer.encoder.final_layernorm - hf_language_model_norm = hasattr(model, "model") and hasattr(model.model, "language_model") and hasattr(model.model.language_model, "norm") and module == model.model.language_model.norm + hf_final_layernorm = ( + hasattr(model, "model") + and hasattr(model.model, "final_layernorm") + and module == model.model.final_layernorm + ) + hf_transformer_final_layernorm = ( + hasattr(model, "transformer") + and hasattr(model.transformer, "encoder") + and hasattr(model.transformer.encoder, "final_layernorm") + and module == model.transformer.encoder.final_layernorm + ) + hf_language_model_norm = ( + hasattr(model, "model") + and hasattr(model.model, "language_model") + and hasattr(model.model.language_model, "norm") + and module == model.model.language_model.norm + ) # GGUF names (all models loaded with GGUFModel.from_pretrained) gguf_final_norm = hasattr(model, "final_norm") and module == model.final_norm @@ -3064,7 +3773,11 @@ def make_preprocessing_nodes(self): self.make_attention_mask_reformatting() def make_attention_mask_reformatting(self): - if self.extra_options.get("enable_cuda_graph", False) or self.extra_options.get("enable_webgpu_graph", False) or self.ep == "dml": + if ( + self.extra_options.get("enable_cuda_graph", False) + or self.extra_options.get("enable_webgpu_graph", False) + or self.ep == "dml" + ): # ORT does not allow nodes to be placed on mulitple execution providers # with graph capture enabled. 
We've only verified it works with GQA and with # past_present_share_buffer enabled(so the total_seq_len in GQA is hardcoded @@ -3162,12 +3875,16 @@ def make_attention_mask_reformatting_for_mha(self): end_add_name = f"{basename}/Add" end_add_inputs = [f"{end_where_name}/output_0", f"{end_expand_name}/output_0"] end_add_shape = ["batch_size", 1, "source_sequence_length", "target_sequence_length"] - self.make_add(end_add_name, end_add_inputs, dtype=self.io_dtype, shape=end_add_shape) # Shape of mask is now (B, 1, S, T) + self.make_add( + end_add_name, end_add_inputs, dtype=self.io_dtype, shape=end_add_shape + ) # Shape of mask is now (B, 1, S, T) tile_name = f"{basename}/Tile" tile_inputs = [f"{end_add_name}/output_0", f"/model/constants/INT64/[1, {self.num_attn_heads}, 1, 1]"] tile_shape = ["batch_size", self.num_attn_heads, "source_sequence_length", "target_sequence_length"] - self.make_tile(tile_name, tile_inputs, dtype=self.io_dtype, shape=tile_shape) # Shape of mask is now (B, N, S, T) + self.make_tile( + tile_name, tile_inputs, dtype=self.io_dtype, shape=tile_shape + ) # Shape of mask is now (B, N, S, T) self.mask_attrs["mask_name"] = tile_name @@ -3216,14 +3933,28 @@ def make_input_ids_subgraph(self, basename, past_key_gather_name): self.make_concat(concat_2_name, concat_inputs, dtype=ir.DataType.INT64, shape=[2], axis=0) constant_shape_name = f"{basename}/ConstantOfShape_2" constant_shape_torch_dtype = to_torch_dtype(self.io_dtype) - constant_shape_value = ir.tensor(torch.tensor([torch.finfo(constant_shape_torch_dtype).min], dtype=constant_shape_torch_dtype), name="make_input_ids_subgraph_shape") - self.make_constant_of_shape(constant_shape_name, f"{concat_2_name}/output_0", value=constant_shape_value, dtype=self.io_dtype, shape=['unk', 'unk']) + constant_shape_value = ir.tensor( + torch.tensor([torch.finfo(constant_shape_torch_dtype).min], dtype=constant_shape_torch_dtype), + name="make_input_ids_subgraph_shape", + ) + self.make_constant_of_shape( + 
constant_shape_name, + f"{concat_2_name}/output_0", + value=constant_shape_value, + dtype=self.io_dtype, + shape=["unk", "unk"], + ) # Top path shape_4_name = f"{basename}/Shape_4" self.make_shape(shape_4_name, f"{constant_shape_name}/output_0", shape=[2]) slice_1_name = f"{basename}/Slice_1" - slice_1_inputs = [f"{shape_4_name}/output_0", "/model/constants/INT64/[-1]", f"/model/constants/INT64/[{torch.iinfo(torch.int64).max}]", "/model/constants/INT64/[0]"] + slice_1_inputs = [ + f"{shape_4_name}/output_0", + "/model/constants/INT64/[-1]", + f"/model/constants/INT64/[{torch.iinfo(torch.int64).max}]", + "/model/constants/INT64/[0]", + ] self.make_slice(slice_1_name, slice_1_inputs, dtype=ir.DataType.INT64, shape=[1]) squeeze_1_name = f"{basename}/Squeeze_1" squeeze_1_inputs = [f"{slice_1_name}/output_0", "/model/constants/INT64/[0]"] @@ -3239,7 +3970,12 @@ def make_input_ids_subgraph(self, basename, past_key_gather_name): shape_5_name = f"{basename}/Shape_5" self.make_shape(shape_5_name, f"{constant_shape_name}/output_0", shape=[2]) slice_2_name = f"{basename}/Slice_2" - slice_2_inputs = [f"{shape_5_name}/output_0", "/model/constants/INT64/[-1]", f"/model/constants/INT64/[{torch.iinfo(torch.int64).max}]", "/model/constants/INT64/[0]"] + slice_2_inputs = [ + f"{shape_5_name}/output_0", + "/model/constants/INT64/[-1]", + f"/model/constants/INT64/[{torch.iinfo(torch.int64).max}]", + "/model/constants/INT64/[0]", + ] self.make_slice(slice_2_name, slice_2_inputs, dtype=ir.DataType.INT64, shape=[1]) squeeze_2_name = f"{basename}/Squeeze_2" squeeze_2_inputs = [f"{slice_2_name}/output_0", "/model/constants/INT64/[0]"] @@ -3259,7 +3995,11 @@ def make_input_ids_subgraph(self, basename, past_key_gather_name): less_inputs = [f"{range_name}/output_0", f"{reshape_name}/output_0"] self.make_less(less_name, less_inputs) where_2_name = f"{basename}/Where_2" - where_2_inputs = [f"{less_name}/output_0", f"/model/constants/{self.to_str_dtype(self.io_dtype)}/0", 
f"{constant_shape_name}/output_0"] + where_2_inputs = [ + f"{less_name}/output_0", + f"/model/constants/{self.to_str_dtype(self.io_dtype)}/0", + f"{constant_shape_name}/output_0", + ] self.make_where(where_2_name, where_2_inputs, dtype=self.io_dtype, shape=None) unsqueeze_8_name = f"{basename}/Unsqueeze_8" unsqueeze_8_inputs = [f"{where_2_name}/output_0", "/model/constants/INT64/[0]"] @@ -3268,7 +4008,13 @@ def make_input_ids_subgraph(self, basename, past_key_gather_name): unsqueeze_9_inputs = [f"{unsqueeze_8_name}/output_0", "/model/constants/INT64/[1]"] self.make_unsqueeze(unsqueeze_9_name, unsqueeze_9_inputs, dtype=self.io_dtype, shape=None) - expand_name = self.make_common_mask_reformat_subgraph(basename, root_input="input_ids" if not self.exclude_embeds else "inputs_embeds", unsqueeze_for_concat=unsqueeze_3_name, unsqueeze_for_expand=unsqueeze_9_name, input_ids_subgraph=True) + expand_name = self.make_common_mask_reformat_subgraph( + basename, + root_input="input_ids" if not self.exclude_embeds else "inputs_embeds", + unsqueeze_for_concat=unsqueeze_3_name, + unsqueeze_for_expand=unsqueeze_9_name, + input_ids_subgraph=True, + ) return unsqueeze_6_name, expand_name def make_attention_mask_subgraph(self, basename, unsqueeze_for_concat): @@ -3278,15 +4024,24 @@ def make_attention_mask_subgraph(self, basename, unsqueeze_for_concat): unsqueeze_3_name = f"{basename}/Unsqueeze_3" unsqueeze_3_inputs = ["attention_mask", "/model/constants/INT64/[1]"] - attention_mask_shape.insert(1, 1) # ['batch_size', 'total_sequence_length'] --> ['batch_size', 1, 'total_sequence_length'] + attention_mask_shape.insert( + 1, 1 + ) # ['batch_size', 'total_sequence_length'] --> ['batch_size', 1, 'total_sequence_length'] self.make_unsqueeze(unsqueeze_3_name, unsqueeze_3_inputs, dtype=ir.DataType.INT64, shape=attention_mask_shape) unsqueeze_4_name = f"{basename}/Unsqueeze_4" unsqueeze_4_inputs = [f"{unsqueeze_3_name}/output_0", "/model/constants/INT64/[2]"] - attention_mask_shape.insert(1, 
1) # ['batch_size', 1, 'total_sequence_length'] --> ['batch_size', 1, 1, 'total_sequence_length'] + attention_mask_shape.insert( + 1, 1 + ) # ['batch_size', 1, 'total_sequence_length'] --> ['batch_size', 1, 1, 'total_sequence_length'] self.make_unsqueeze(unsqueeze_4_name, unsqueeze_4_inputs, dtype=ir.DataType.INT64, shape=attention_mask_shape) # Make the main subgraph - expand_name = self.make_common_mask_reformat_subgraph(basename, root_input="attention_mask", unsqueeze_for_concat=unsqueeze_for_concat, unsqueeze_for_expand=unsqueeze_4_name) + expand_name = self.make_common_mask_reformat_subgraph( + basename, + root_input="attention_mask", + unsqueeze_for_concat=unsqueeze_for_concat, + unsqueeze_for_expand=unsqueeze_4_name, + ) # Make the additional subgraph after Expand: # +-----------------+ @@ -3300,12 +4055,18 @@ def make_attention_mask_subgraph(self, basename, unsqueeze_for_concat): cast_2_name = f"{basename}/Cast_2" self.make_cast(cast_2_name, f"{sub_name}/output_0", dtype=ir.DataType.BOOL, shape=["unk", "unk", "unk", "unk"]) where_2_name = f"{basename}/Where_2" - where_2_inputs = [f"{cast_2_name}/output_0", f"/model/constants/{self.to_str_dtype(self.io_dtype)}/{torch.finfo(to_torch_dtype(self.io_dtype)).min}", f"{sub_name}/output_0"] + where_2_inputs = [ + f"{cast_2_name}/output_0", + f"/model/constants/{self.to_str_dtype(self.io_dtype)}/{torch.finfo(to_torch_dtype(self.io_dtype)).min}", + f"{sub_name}/output_0", + ] self.make_where(where_2_name, where_2_inputs, dtype=self.io_dtype, shape=["unk", "unk", "unk", "unk"]) return where_2_name - def make_common_mask_reformat_subgraph(self, basename, root_input, unsqueeze_for_concat, unsqueeze_for_expand, input_ids_subgraph=False): + def make_common_mask_reformat_subgraph( + self, basename, root_input, unsqueeze_for_concat, unsqueeze_for_expand, input_ids_subgraph=False + ): # root_input # / \ # Shape Shape @@ -3361,14 +4122,26 @@ def make_common_mask_reformat_subgraph(self, basename, root_input, unsqueeze_for 
concat_name = f"{basename}/Concat" if not input_ids_subgraph else f"{basename}/Concat_1" concat_first_two_inputs = [f"{unsqueeze_1_name}/output_0", "/model/constants/INT64/[1]"] - concat_last_two_inputs = [f"{unsqueeze_for_concat}/output_0", f"{unsqueeze_2_name}/output_0"] if not input_ids_subgraph else [f"{unsqueeze_2_name}/output_0", f"{unsqueeze_for_concat}/output_0"] + concat_last_two_inputs = ( + [f"{unsqueeze_for_concat}/output_0", f"{unsqueeze_2_name}/output_0"] + if not input_ids_subgraph + else [f"{unsqueeze_2_name}/output_0", f"{unsqueeze_for_concat}/output_0"] + ) concat_inputs = concat_first_two_inputs + concat_last_two_inputs self.make_concat(concat_name, concat_inputs, dtype=ir.DataType.INT64, shape=[4], axis=0) shape_3_name = f"{basename}/Shape_3" self.make_shape(shape_3_name, f"{concat_name}/output_0", shape=[1]) - constant_shape_name = f"{basename}/ConstantOfShape" if not input_ids_subgraph else f"{basename}/ConstantOfShape_1" + constant_shape_name = ( + f"{basename}/ConstantOfShape" if not input_ids_subgraph else f"{basename}/ConstantOfShape_1" + ) constant_shape_value = ir.tensor([1], dtype=ir.DataType.INT64) - self.make_constant_of_shape(constant_shape_name, f"{shape_3_name}/output_0", value=constant_shape_value, dtype=ir.DataType.INT64, shape=["unk"]) + self.make_constant_of_shape( + constant_shape_name, + f"{shape_3_name}/output_0", + value=constant_shape_value, + dtype=ir.DataType.INT64, + shape=["unk"], + ) mul_name = f"{basename}/Mul" mul_inputs = [f"{constant_shape_name}/output_0", "/model/constants/INT64/-1"] self.make_mul(mul_name, mul_inputs, dtype=ir.DataType.INT64, shape=["unk"]) @@ -3409,7 +4182,9 @@ def make_attention_mask_graph_capture_reformatting_for_gqa(self, attn_mask_basen # Calculate ReduceSum from attention_mask cast_1_name = f"{attn_mask_basename}/Cast" - self.make_cast(cast_1_name, "attention_mask", dtype=ir.DataType.INT32, shape=["batch_size", "total_sequence_length"]) + self.make_cast( + cast_1_name, "attention_mask", 
dtype=ir.DataType.INT32, shape=["batch_size", "total_sequence_length"] + ) reduce_sum_name = f"{attn_mask_basename}/ReduceSum" reduce_sum_inputs = [f"{cast_1_name}/output_0", "/model/constants/INT64/[1]"] self.make_reduce_sum(reduce_sum_name, reduce_sum_inputs, dtype=ir.DataType.INT32, shape=["batch_size", 1]) @@ -3515,4 +4290,4 @@ def make_attention_mask_reformatting_for_sparse_attn(self): def make_position_ids_reformatting(self): # For most cases, position_ids are already properly formatted as 2D tensors # with int64 values matching input_ids shape, so we can use them directly - return "position_ids" \ No newline at end of file + return "position_ids" diff --git a/src/python/py/models/builders/chatglm.py b/src/python/py/models/builders/chatglm.py index 2441cd6e5f..f8959c8d01 100644 --- a/src/python/py/models/builders/chatglm.py +++ b/src/python/py/models/builders/chatglm.py @@ -5,6 +5,7 @@ # -------------------------------------------------------------------------- from .base import Model + class ChatGLMModel(Model): def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): super().__init__(config, io_dtype, onnx_dtype, ep, cache_dir, extra_options) @@ -21,4 +22,4 @@ def make_mlp(self, layer_id, mlp, root_input): def make_layer(self, layer_id, layer): layer.self_attn = layer.self_attn if hasattr(layer, 'self_attn') else layer.self_attention - super().make_layer(layer_id, layer) \ No newline at end of file + super().make_layer(layer_id, layer) diff --git a/src/python/py/models/builders/ernie.py b/src/python/py/models/builders/ernie.py index 37a2de2648..24b47462e8 100644 --- a/src/python/py/models/builders/ernie.py +++ b/src/python/py/models/builders/ernie.py @@ -5,6 +5,7 @@ # -------------------------------------------------------------------------- from .mistral import MistralModel + class ErnieModel(MistralModel): def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): super().__init__(config, io_dtype, onnx_dtype, 
ep, cache_dir, extra_options) @@ -16,4 +17,4 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): # The original RoPE logic in ernie is: position_ids / compression_ratio, # which is equivalent to scaling the frequencies (inv_freq) by 1 / compression_ratio. if hasattr(config, "compression_ratio") and config.compression_ratio != 1.0: - self.rotemb_attrs["rescale_factors"] = 1.0 / config.compression_ratio \ No newline at end of file + self.rotemb_attrs["rescale_factors"] = 1.0 / config.compression_ratio diff --git a/src/python/py/models/builders/gemma.py b/src/python/py/models/builders/gemma.py index d319e86da2..12084ebca9 100644 --- a/src/python/py/models/builders/gemma.py +++ b/src/python/py/models/builders/gemma.py @@ -3,9 +3,11 @@ # Licensed under the MIT License. See License.txt in the project root for # license information. # -------------------------------------------------------------------------- -from .mistral import MistralModel import numpy as np +from .mistral import MistralModel + + class GemmaModel(MistralModel): def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): super().__init__(config, io_dtype, onnx_dtype, ep, cache_dir, extra_options) diff --git a/src/python/py/models/builders/gptoss.py b/src/python/py/models/builders/gptoss.py index b7c87c96f8..38f1ca0e46 100644 --- a/src/python/py/models/builders/gptoss.py +++ b/src/python/py/models/builders/gptoss.py @@ -3,11 +3,12 @@ # Licensed under the MIT License. See License.txt in the project root for # license information. 
# -------------------------------------------------------------------------- -from .base import Model - import onnx_ir as ir import torch +from .base import Model + + class GPTOSSModel(Model): def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): super().__init__(config, io_dtype, onnx_dtype, ep, cache_dir, extra_options) @@ -56,7 +57,7 @@ def make_attention(self, layer_id, attention, root_input, **kwargs): def make_moe(self, layer_id, mlp, root_input): if self.ep in {"cpu", "cuda"}: - self.make_moe_fused(layer_id, mlp, root_input) + self.make_moe_fused(layer_id, mlp, root_input) else: self.make_moe_decomposed(layer_id, mlp, root_input) @@ -364,7 +365,7 @@ def make_moe_fused(self, layer_id, mlp, root_input): pack_size = 8 // self.moe_attrs["expert_weight_bits"] self.make_initializer(gate_up_proj_qweight_tensor.view(self.moe_attrs["num_experts"], -1, self.hidden_size // pack_size), gate_up_proj_weight) self.make_initializer(down_proj_qweight_tensor.view(self.moe_attrs["num_experts"], self.hidden_size, self.intermediate_size // pack_size), down_proj_weight) - + # scales tensors have different shapes depending on quantization method self.make_initializer(gate_up_proj_scales_tensor, gate_up_proj_scales, to=self.io_dtype) self.make_initializer(down_proj_scales_tensor, down_proj_scales, to=self.io_dtype) diff --git a/src/python/py/models/builders/granite.py b/src/python/py/models/builders/granite.py index 6364044689..39dd0c93a0 100644 --- a/src/python/py/models/builders/granite.py +++ b/src/python/py/models/builders/granite.py @@ -5,6 +5,7 @@ # -------------------------------------------------------------------------- from .mistral import MistralModel + class GraniteModel(MistralModel): def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): super().__init__(config, io_dtype, onnx_dtype, ep, cache_dir, extra_options) @@ -37,4 +38,4 @@ def make_layer(self, layer_id, layer): self.layernorm_attrs["first_layernorm"] = False 
if layer_id == self.num_layers - 1: # Norm after last decoder layer of model (last layer --> norm) - self.layernorm_attrs["last_layernorm"] = True \ No newline at end of file + self.layernorm_attrs["last_layernorm"] = True diff --git a/src/python/py/models/builders/llama.py b/src/python/py/models/builders/llama.py index 93adc8a3a9..f4055b0da1 100644 --- a/src/python/py/models/builders/llama.py +++ b/src/python/py/models/builders/llama.py @@ -5,6 +5,7 @@ # -------------------------------------------------------------------------- from .base import Model + class LlamaModel(Model): def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): - super().__init__(config, io_dtype, onnx_dtype, ep, cache_dir, extra_options) \ No newline at end of file + super().__init__(config, io_dtype, onnx_dtype, ep, cache_dir, extra_options) diff --git a/src/python/py/models/builders/mistral.py b/src/python/py/models/builders/mistral.py index 1893058064..eecbafeb32 100644 --- a/src/python/py/models/builders/mistral.py +++ b/src/python/py/models/builders/mistral.py @@ -5,6 +5,7 @@ # -------------------------------------------------------------------------- from .base import Model + class MistralModel(Model): def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): - super().__init__(config, io_dtype, onnx_dtype, ep, cache_dir, extra_options) \ No newline at end of file + super().__init__(config, io_dtype, onnx_dtype, ep, cache_dir, extra_options) diff --git a/src/python/py/models/builders/nemotron.py b/src/python/py/models/builders/nemotron.py index 27f73dbb5e..25e7d37563 100644 --- a/src/python/py/models/builders/nemotron.py +++ b/src/python/py/models/builders/nemotron.py @@ -5,6 +5,7 @@ # -------------------------------------------------------------------------- from .llama import LlamaModel + class NemotronModel(LlamaModel): def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): super().__init__(config, io_dtype, 
onnx_dtype, ep, cache_dir, extra_options) @@ -32,4 +33,4 @@ def make_mlp_proj(self, layer_id, mlp, root_input): down_name = self.make_matmul(mlp.down_proj, down_basename, f"{act_fn_name}/output_0") # Assign output 0 of previous MatMul as skip input to next SkipLayerNorm - self.layernorm_attrs["skip_input"] = f"{down_name}/output_0" \ No newline at end of file + self.layernorm_attrs["skip_input"] = f"{down_name}/output_0" diff --git a/src/python/py/models/builders/olmo.py b/src/python/py/models/builders/olmo.py index cc6da19ee1..6ff3005e6e 100644 --- a/src/python/py/models/builders/olmo.py +++ b/src/python/py/models/builders/olmo.py @@ -3,9 +3,11 @@ # Licensed under the MIT License. See License.txt in the project root for # license information. # -------------------------------------------------------------------------- -from .base import Model import torch +from .base import Model + + class OLMoModel(Model): def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): super().__init__(config, io_dtype, onnx_dtype, ep, cache_dir, extra_options) @@ -13,4 +15,4 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): def make_layernorm(self, layer_id, layernorm, skip, simple, location): layernorm.weight = torch.ones(self.hidden_size) layernorm.bias = torch.zeros(self.hidden_size) - super().make_layernorm(layer_id, layernorm, skip, simple, location) \ No newline at end of file + super().make_layernorm(layer_id, layernorm, skip, simple, location) diff --git a/src/python/py/models/builders/phi.py b/src/python/py/models/builders/phi.py index ae13ad2745..c8be0fe12a 100644 --- a/src/python/py/models/builders/phi.py +++ b/src/python/py/models/builders/phi.py @@ -3,11 +3,12 @@ # Licensed under the MIT License. See License.txt in the project root for # license information. 
# -------------------------------------------------------------------------- +import onnx_ir as ir +import torch + from .base import Model from .mistral import MistralModel -import onnx_ir as ir -import torch class PhiModel(Model): def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): @@ -377,4 +378,4 @@ def make_layer(self, layer_id, layer): layer.mlp.down_proj.lora_B.default = layer.mlp.down_proj.lora_B.vision layer.mlp.down_proj.scaling["default"] = layer.mlp.down_proj.scaling["vision"] - super().make_layer(layer_id, layer) \ No newline at end of file + super().make_layer(layer_id, layer) diff --git a/src/python/py/models/builders/qwen.py b/src/python/py/models/builders/qwen.py index 574fd1ba25..3c5133e30f 100644 --- a/src/python/py/models/builders/qwen.py +++ b/src/python/py/models/builders/qwen.py @@ -5,10 +5,12 @@ # -------------------------------------------------------------------------- from .mistral import MistralModel + class QwenModel(MistralModel): def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): super().__init__(config, io_dtype, onnx_dtype, ep, cache_dir, extra_options) + class Qwen3Model(QwenModel): def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): super().__init__(config, io_dtype, onnx_dtype, ep, cache_dir, extra_options) @@ -16,4 +18,4 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): def make_attention_init(self): self.attention_attrs["q_norm"] = True self.attention_attrs["k_norm"] = True - super().make_attention_init() \ No newline at end of file + super().make_attention_init() diff --git a/src/python/py/models/builders/smollm.py b/src/python/py/models/builders/smollm.py index aa8228af7b..a94fc1fe57 100644 --- a/src/python/py/models/builders/smollm.py +++ b/src/python/py/models/builders/smollm.py @@ -5,6 +5,7 @@ # -------------------------------------------------------------------------- from .llama import LlamaModel + class 
SmolLM3Model(LlamaModel): def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): super().__init__(config, io_dtype, onnx_dtype, ep, cache_dir, extra_options) @@ -31,4 +32,4 @@ def make_attention(self, layer_id, attention, root_input, **kwargs): # Restore original values self.attention_attrs["use_rope_in_attn"] = original_use_rope - self.window_size = original_window_size \ No newline at end of file + self.window_size = original_window_size diff --git a/src/python/py/models/gguf_model.py b/src/python/py/models/gguf_model.py index 4852d76d78..6e5a741396 100644 --- a/src/python/py/models/gguf_model.py +++ b/src/python/py/models/gguf_model.py @@ -11,11 +11,11 @@ no matter where the weights actually come from. """ +import re from functools import reduce -from gguf.gguf_reader import GGUFReader -import re import torch +from gguf.gguf_reader import GGUFReader class GGUFTensorModule: @@ -215,14 +215,14 @@ def __init__(self, input_path, head_size, hidden_size, intermediate_size, num_at module.post_feedforward_layernorm.bias = data else: raise NotImplementedError(f"{name} in your GGUF model is not recognized") - + # Set LM head weights + biases if not already set if self.lm_head.weight is None: # Embedding and LM head share same weights + biases (lm_head.weight == embedding.weight and lm_head.bias == embedding.bias) self.lm_head.weight = self.embedding.weight if self.lm_head.bias is not None: self.lm_head.bias = self.embedding.bias - + # Sort list of layers by layer id self.layers = list(self.layers.values()) self.layers.sort(key=lambda m: m.layer_id) diff --git a/src/python/py/models/quantized_model.py b/src/python/py/models/quantized_model.py index eb9f5585f8..5fb424e4f1 100644 --- a/src/python/py/models/quantized_model.py +++ b/src/python/py/models/quantized_model.py @@ -13,12 +13,12 @@ ONNX Runtime's format no matter where the quantized weights actually come from. 
""" -from safetensors.torch import load_file -import torch - import os import re +import torch +from safetensors.torch import load_file + class QuantizedTensorModule: def __init__(self): @@ -60,6 +60,7 @@ def __init__(self): self.weight = None self.bias = None + class QuantizedAttention: def __init__(self): self.q_proj = QuantizedTensorModule() @@ -70,6 +71,7 @@ def __init__(self): self.k_norm = TensorModule() self.q_norm = TensorModule() + class QuantizedMLP: def __init__(self): self.gate_proj = QuantizedTensorModule() @@ -110,7 +112,6 @@ def __init__(self, quant_type, input_path, quant_attrs, q_size, kv_size, interme # Map weights to modules for name, tensor in weights.items(): - # Per-layer quantization support local_bits = self.get_layer_bits(name) # codeql[py/init-calls-subclass] local_group_size = self.get_layer_group_size(name) # codeql[py/init-calls-subclass] @@ -228,27 +229,43 @@ def __init__(self, quant_type, input_path, quant_attrs, q_size, kv_size, interme elif bool(re.match(r"^model.layers\.\d+\.self_attn.v_proj\.bias$", name)): # model.layers.layer_id.self_attn.v_proj.bias tensor_map["self_attn.v_proj.bias"] = tensor - elif bool(re.match(r"^model.layers\.\d+\.(self_attn.o_proj|self_attention.dense)\.q?weight$", name)): + elif bool( + re.match(r"^model.layers\.\d+\.(self_attn.o_proj|self_attention.dense)\.q?weight$", name) + ): # model.layers.layer_id.self_attn.o_proj.qweight # model.layers.layer_id.self_attention.dense.qweight tensor_map["self_attn.o_proj.qweight"] = tensor - elif bool(re.match(r"^model.layers\.\d+\.(self_attn.o_proj|self_attention.dense)\.(scales|weight_scale)$", name)): + elif bool( + re.match( + r"^model.layers\.\d+\.(self_attn.o_proj|self_attention.dense)\.(scales|weight_scale)$", + name, + ) + ): # model.layers.layer_id.self_attn.o_proj.scales # model.layers.layer_id.self_attention.dense.scales # model.layers.layer_id.self_attn.o_proj.weight_scale # model.layers.layer_id.self_attention.dense.weight_scale 
tensor_map["self_attn.o_proj.scales"] = tensor - elif bool(re.match(r"^model.layers\.\d+\.(self_attn.o_proj|self_attention.dense)\.(qzeros|weight_zero_point)$", name)): + elif bool( + re.match( + r"^model.layers\.\d+\.(self_attn.o_proj|self_attention.dense)\.(qzeros|weight_zero_point)$", + name, + ) + ): # model.layers.layer_id.self_attn.o_proj.qzeros # model.layers.layer_id.self_attention.dense.qzeros # model.layers.layer_id.self_attn.o_proj.weight_zero_point # model.layers.layer_id.self_attention.dense.weight_zero_point tensor_map["self_attn.o_proj.qzeros"] = tensor - elif bool(re.match(r"^model.layers\.\d+\.(self_attn.o_proj|self_attention.dense)\.g_idx$", name)): + elif bool( + re.match(r"^model.layers\.\d+\.(self_attn.o_proj|self_attention.dense)\.g_idx$", name) + ): # model.layers.layer_id.self_attn.o_proj.g_idx # model.layers.layer_id.self_attention.dense.g_idx tensor_map["self_attn.o_proj.g_idx"] = tensor - elif bool(re.match(r"^model.layers\.\d+\.(self_attn.o_proj|self_attention.dense)\.bias$", name)): + elif bool( + re.match(r"^model.layers\.\d+\.(self_attn.o_proj|self_attention.dense)\.bias$", name) + ): # model.layers.layer_id.self_attn.o_proj.bias # model.layers.layer_id.self_attention.dense.bias tensor_map["self_attn.o_proj.bias"] = tensor @@ -312,13 +329,19 @@ def __init__(self, quant_type, input_path, quant_attrs, q_size, kv_size, interme # model.layers.layer_id.mlp.down_proj.weight # model.layers.layer_id.mlp.dense_4h_to_h.weight tensor_map["mlp.down_proj.qweight"] = tensor - elif bool(re.match(r"^model.layers\.\d+\.mlp.(down_proj|dense_4h_to_h)\.(scales|weight_scale)$", name)): + elif bool( + re.match(r"^model.layers\.\d+\.mlp.(down_proj|dense_4h_to_h)\.(scales|weight_scale)$", name) + ): # model.layers.layer_id.mlp.down_proj.scales # model.layers.layer_id.mlp.dense_4h_to_h.scales # model.layers.layer_id.mlp.down_proj.weight_scale # model.layers.layer_id.mlp.dense_4h_to_h.weight_scale tensor_map["mlp.down_proj.scales"] = tensor - elif 
bool(re.match(r"^model.layers\.\d+\.mlp.(down_proj|dense_4h_to_h)\.(qzeros|weight_zero_point)$", name)): + elif bool( + re.match( + r"^model.layers\.\d+\.mlp.(down_proj|dense_4h_to_h)\.(qzeros|weight_zero_point)$", name + ) + ): # model.layers.layer_id.mlp.down_proj.qzeros # model.layers.layer_id.mlp.dense_4h_to_h.qzeros # model.layers.layer_id.mlp.down_proj.weight_zero_point @@ -333,69 +356,120 @@ def __init__(self, quant_type, input_path, quant_attrs, q_size, kv_size, interme # model.layers.layer_id.mlp.dense_4h_to_h.bias tensor_map["mlp.down_proj.bias"] = tensor # Match against fused layers - elif bool(re.match(r"^model.layers\.\d+\.(self_attn.qkv_proj|self_attention.query_key_value)\.q?weight$", name)): + elif bool( + re.match( + r"^model.layers\.\d+\.(self_attn.qkv_proj|self_attention.query_key_value)\.q?weight$", + name, + ) + ): # model.layers.layer_id.self_attn.qkv_proj.qweight # model.layers.layer_id.self_attention.query_key_value.qweight # model.layers.layer_id.self_attn.qkv_proj.weight # model.layers.layer_id.self_attention.query_key_value.weight q_dim = q_size // (32 // local_bits) if quant_type in {"awq", "quark"} else q_size kv_dim = kv_size // (32 // local_bits) if quant_type in {"awq", "quark"} else kv_size - tensor_map["self_attn.q_proj.qweight"] = tensor[:, : q_dim] + tensor_map["self_attn.q_proj.qweight"] = tensor[:, :q_dim] tensor_map["self_attn.k_proj.qweight"] = tensor[:, q_dim : q_dim + kv_dim] tensor_map["self_attn.v_proj.qweight"] = tensor[:, q_dim + kv_dim :] - elif bool(re.match(r"^model.layers\.\d+\.(self_attn.qkv_proj|self_attention.query_key_value)\.(scales|weight_scale)$", name)): + elif bool( + re.match( + r"^model.layers\.\d+\.(self_attn.qkv_proj|self_attention.query_key_value)\.(scales|weight_scale)$", + name, + ) + ): # model.layers.layer_id.self_attn.qkv_proj.scales # model.layers.layer_id.self_attention.query_key_value.scales # model.layers.layer_id.self_attn.qkv_proj.weight_scale # 
model.layers.layer_id.self_attention.query_key_value.weight_scale - tensor_map["self_attn.q_proj.scales"] = tensor[:, : q_size] + tensor_map["self_attn.q_proj.scales"] = tensor[:, :q_size] tensor_map["self_attn.k_proj.scales"] = tensor[:, q_size : q_size + kv_size] tensor_map["self_attn.v_proj.scales"] = tensor[:, q_size + kv_size :] - elif bool(re.match(r"^model.layers\.\d+\.(self_attn.qkv_proj|self_attention.query_key_value)\.(qzeros|weight_zero_point)$", name)): + elif bool( + re.match( + r"^model.layers\.\d+\.(self_attn.qkv_proj|self_attention.query_key_value)\.(qzeros|weight_zero_point)$", + name, + ) + ): # model.layers.layer_id.self_attn.qkv_proj.qzeros # model.layers.layer_id.self_attention.query_key_value.qzeros # model.layers.layer_id.self_attn.qkv_proj.weight_zero_point # model.layers.layer_id.self_attention.query_key_value.weight_zero_point - q_dim = q_size // (32 // local_bits) if quant_type in {"awq", "gptq", "olive", "quark"} else q_size - kv_dim = kv_size // (32 // local_bits) if quant_type in {"awq", "gptq", "olive", "quark"} else kv_size - tensor_map["self_attn.q_proj.qzeros"] = tensor[:, : q_dim] + q_dim = ( + q_size // (32 // local_bits) + if quant_type in {"awq", "gptq", "olive", "quark"} + else q_size + ) + kv_dim = ( + kv_size // (32 // local_bits) + if quant_type in {"awq", "gptq", "olive", "quark"} + else kv_size + ) + tensor_map["self_attn.q_proj.qzeros"] = tensor[:, :q_dim] tensor_map["self_attn.k_proj.qzeros"] = tensor[:, q_dim : q_dim + kv_dim] tensor_map["self_attn.v_proj.qzeros"] = tensor[:, q_dim + kv_dim :] - elif bool(re.match(r"^model.layers\.\d+\.(self_attn.qkv_proj|self_attention.query_key_value)\.g_idx$", name)): + elif bool( + re.match( + r"^model.layers\.\d+\.(self_attn.qkv_proj|self_attention.query_key_value)\.g_idx$", name + ) + ): # model.layers.layer_id.self_attn.qkv_proj.g_idx # model.layers.layer_id.self_attention.query_key_value.g_idx tensor_map["self_attn.q_proj.g_idx"] = tensor tensor_map["self_attn.k_proj.g_idx"] = 
tensor tensor_map["self_attn.v_proj.g_idx"] = tensor - elif bool(re.match(r"^model.layers\.\d+\.(self_attn.qkv_proj|self_attention.query_key_value)\.bias$", name)): + elif bool( + re.match( + r"^model.layers\.\d+\.(self_attn.qkv_proj|self_attention.query_key_value)\.bias$", name + ) + ): # model.layers.layer_id.self_attn.qkv_proj.bias # model.layers.layer_id.self_attention.query_key_value.bias - tensor_map["self_attn.q_proj.bias"] = tensor[: q_size] + tensor_map["self_attn.q_proj.bias"] = tensor[:q_size] tensor_map["self_attn.k_proj.bias"] = tensor[q_size : q_size + kv_size] - tensor_map["self_attn.v_proj.bias"] = tensor[q_size + kv_size : ] - elif bool(re.match(r"^model.layers\.\d+\.mlp.(gate_up_proj|dense_h_to_4h|gate_proj)\.q?weight$", name)): + tensor_map["self_attn.v_proj.bias"] = tensor[q_size + kv_size :] + elif bool( + re.match(r"^model.layers\.\d+\.mlp.(gate_up_proj|dense_h_to_4h|gate_proj)\.q?weight$", name) + ): # model.layers.layer_id.mlp.gate_up_proj.qweight # model.layers.layer_id.mlp.dense_h_to_4h.qweight # model.layers.layer_id.mlp.gate_up_proj.weight # model.layers.layer_id.mlp.dense_h_to_4h.weight - intermediate_dim = intermediate_size // (32 // local_bits) if quant_type in {"awq", "quark"} else intermediate_size - tensor_map["mlp.gate_proj.qweight"] = tensor[:, : intermediate_dim] - tensor_map["mlp.up_proj.qweight"] = tensor[:, intermediate_dim :] - elif bool(re.match(r"^model.layers\.\d+\.mlp.(gate_up_proj|dense_h_to_4h|gate_proj)\.(scales|weight_scale)$", name)): + intermediate_dim = ( + intermediate_size // (32 // local_bits) + if quant_type in {"awq", "quark"} + else intermediate_size + ) + tensor_map["mlp.gate_proj.qweight"] = tensor[:, :intermediate_dim] + tensor_map["mlp.up_proj.qweight"] = tensor[:, intermediate_dim:] + elif bool( + re.match( + r"^model.layers\.\d+\.mlp.(gate_up_proj|dense_h_to_4h|gate_proj)\.(scales|weight_scale)$", + name, + ) + ): # model.layers.layer_id.mlp.gate_up_proj.scales # 
model.layers.layer_id.mlp.dense_h_to_4h.scales # model.layers.layer_id.mlp.gate_up_proj.weight_scale # model.layers.layer_id.mlp.dense_h_to_4h.weight_scale - tensor_map["mlp.gate_proj.scales"] = tensor[:, : intermediate_size] - tensor_map["mlp.up_proj.scales"] = tensor[:, intermediate_size :] - elif bool(re.match(r"^model.layers\.\d+\.mlp.(gate_up_proj|dense_h_to_4h|gate_proj)\.(qzeros|weight_zero_point)$", name)): + tensor_map["mlp.gate_proj.scales"] = tensor[:, :intermediate_size] + tensor_map["mlp.up_proj.scales"] = tensor[:, intermediate_size:] + elif bool( + re.match( + r"^model.layers\.\d+\.mlp.(gate_up_proj|dense_h_to_4h|gate_proj)\.(qzeros|weight_zero_point)$", + name, + ) + ): # model.layers.layer_id.mlp.gate_up_proj.qzeros # model.layers.layer_id.mlp.dense_h_to_4h.qzeros # model.layers.layer_id.mlp.gate_up_proj.weight_zero_point # model.layers.layer_id.mlp.dense_h_to_4h.weight_zero_point - intermediate_dim = intermediate_size // (32 // local_bits) if quant_type in {"awq", "gptq", "quark", "olive"} else intermediate_size - tensor_map["mlp.gate_proj.qzeros"] = tensor[:, : intermediate_dim] - tensor_map["mlp.up_proj.qzeros"] = tensor[:, intermediate_dim :] + intermediate_dim = ( + intermediate_size // (32 // local_bits) + if quant_type in {"awq", "gptq", "quark", "olive"} + else intermediate_size + ) + tensor_map["mlp.gate_proj.qzeros"] = tensor[:, :intermediate_dim] + tensor_map["mlp.up_proj.qzeros"] = tensor[:, intermediate_dim:] elif bool(re.match(r"^model.layers\.\d+\.mlp.(gate_up_proj|dense_h_to_4h)\.g_idx$", name)): # model.layers.layer_id.mlp.gate_up_proj.g_idx # model.layers.layer_id.mlp.dense_h_to_4h.g_idx @@ -404,8 +478,8 @@ def __init__(self, quant_type, input_path, quant_attrs, q_size, kv_size, interme elif bool(re.match(r"^model.layers\.\d+\.mlp.(gate_up_proj|dense_h_to_4h)\.bias$", name)): # model.layers.layer_id.mlp.gate_up_proj.bias # model.layers.layer_id.mlp.dense_h_to_4h.bias - tensor_map["mlp.gate_proj.bias"] = tensor[: intermediate_size] 
- tensor_map["mlp.up_proj.bias"] = tensor[intermediate_size: ] + tensor_map["mlp.gate_proj.bias"] = tensor[:intermediate_size] + tensor_map["mlp.up_proj.bias"] = tensor[intermediate_size:] else: raise NotImplementedError(f"{name} in your quantized model is not recognized.") @@ -416,7 +490,9 @@ def __init__(self, quant_type, input_path, quant_attrs, q_size, kv_size, interme if isinstance(submodule, QuantizedTensorModule): for q_attr, q_value in [("bits", local_bits), ("_group_size", local_group_size)]: if getattr(submodule, q_attr) is not None and getattr(submodule, q_attr) != q_value: - raise ValueError(f"Quantization {q_attr} mismatch for {name}: expected {getattr(submodule, q_attr)}, got {q_value}.") + raise ValueError( + f"Quantization {q_attr} mismatch for {name}: expected {getattr(submodule, q_attr)}, got {q_value}." + ) setattr(submodule, q_attr, q_value) setattr(submodule, tensor_name.split(".")[-1], tensor_value) @@ -441,7 +517,7 @@ def _load_quant_config(self, quant_attrs): def get_layer_bits(self, layer_name): # 'bits' is globally defined for all layers return self.global_bits - + def get_layer_group_size(self, layer_name): # 'group_size' is globally defined for all layers return self.global_group_size @@ -467,7 +543,13 @@ def set_properties(self): self.lm_head.out_features = self.lm_head.scales.shape[1] self.lm_head.in_features = self.lm_head.qweight.shape[0] # Set g_idx if not already set - self.lm_head.g_idx = self.lm_head.g_idx if self.lm_head.g_idx is not None else torch.tensor([i // self.lm_head.group_size for i in range(self.lm_head.in_features)], dtype=torch.int32) + self.lm_head.g_idx = ( + self.lm_head.g_idx + if self.lm_head.g_idx is not None + else torch.tensor( + [i // self.lm_head.group_size for i in range(self.lm_head.in_features)], dtype=torch.int32 + ) + ) elif self.quant_type == "gptq": self.lm_head.out_features = self.lm_head.qweight.shape[1] self.lm_head.in_features = self.lm_head.g_idx.shape[0] @@ -497,13 +579,62 @@ def 
set_properties(self): module.mlp.down_proj.in_features = module.mlp.down_proj.qweight.shape[0] # Set g_idx if not already set - module.self_attn.q_proj.g_idx = module.self_attn.q_proj.g_idx if module.self_attn.q_proj.g_idx is not None else torch.tensor([i // module.self_attn.q_proj.group_size for i in range(module.self_attn.q_proj.in_features)], dtype=torch.int32) - module.self_attn.k_proj.g_idx = module.self_attn.k_proj.g_idx if module.self_attn.k_proj.g_idx is not None else torch.tensor([i // module.self_attn.k_proj.group_size for i in range(module.self_attn.k_proj.in_features)], dtype=torch.int32) - module.self_attn.v_proj.g_idx = module.self_attn.v_proj.g_idx if module.self_attn.v_proj.g_idx is not None else torch.tensor([i // module.self_attn.v_proj.group_size for i in range(module.self_attn.v_proj.in_features)], dtype=torch.int32) - module.self_attn.o_proj.g_idx = module.self_attn.o_proj.g_idx if module.self_attn.o_proj.g_idx is not None else torch.tensor([i // module.self_attn.o_proj.group_size for i in range(module.self_attn.o_proj.in_features)], dtype=torch.int32) - module.mlp.gate_proj.g_idx = module.mlp.gate_proj.g_idx if module.mlp.gate_proj.g_idx is not None else torch.tensor([i // module.mlp.gate_proj.group_size for i in range(module.mlp.gate_proj.in_features)], dtype=torch.int32) - module.mlp.up_proj.g_idx = module.mlp.up_proj.g_idx if module.mlp.up_proj.g_idx is not None else torch.tensor([i // module.mlp.up_proj.group_size for i in range(module.mlp.up_proj.in_features)], dtype=torch.int32) - module.mlp.down_proj.g_idx = module.mlp.down_proj.g_idx if module.mlp.down_proj.g_idx is not None else torch.tensor([i // module.mlp.down_proj.group_size for i in range(module.mlp.down_proj.in_features)], dtype=torch.int32) + module.self_attn.q_proj.g_idx = ( + module.self_attn.q_proj.g_idx + if module.self_attn.q_proj.g_idx is not None + else torch.tensor( + [i // module.self_attn.q_proj.group_size for i in range(module.self_attn.q_proj.in_features)], + 
dtype=torch.int32, + ) + ) + module.self_attn.k_proj.g_idx = ( + module.self_attn.k_proj.g_idx + if module.self_attn.k_proj.g_idx is not None + else torch.tensor( + [i // module.self_attn.k_proj.group_size for i in range(module.self_attn.k_proj.in_features)], + dtype=torch.int32, + ) + ) + module.self_attn.v_proj.g_idx = ( + module.self_attn.v_proj.g_idx + if module.self_attn.v_proj.g_idx is not None + else torch.tensor( + [i // module.self_attn.v_proj.group_size for i in range(module.self_attn.v_proj.in_features)], + dtype=torch.int32, + ) + ) + module.self_attn.o_proj.g_idx = ( + module.self_attn.o_proj.g_idx + if module.self_attn.o_proj.g_idx is not None + else torch.tensor( + [i // module.self_attn.o_proj.group_size for i in range(module.self_attn.o_proj.in_features)], + dtype=torch.int32, + ) + ) + module.mlp.gate_proj.g_idx = ( + module.mlp.gate_proj.g_idx + if module.mlp.gate_proj.g_idx is not None + else torch.tensor( + [i // module.mlp.gate_proj.group_size for i in range(module.mlp.gate_proj.in_features)], + dtype=torch.int32, + ) + ) + module.mlp.up_proj.g_idx = ( + module.mlp.up_proj.g_idx + if module.mlp.up_proj.g_idx is not None + else torch.tensor( + [i // module.mlp.up_proj.group_size for i in range(module.mlp.up_proj.in_features)], + dtype=torch.int32, + ) + ) + module.mlp.down_proj.g_idx = ( + module.mlp.down_proj.g_idx + if module.mlp.down_proj.g_idx is not None + else torch.tensor( + [i // module.mlp.down_proj.group_size for i in range(module.mlp.down_proj.in_features)], + dtype=torch.int32, + ) + ) elif self.quant_type == "gptq": # Set in_features and out_features @@ -525,19 +656,31 @@ def set_properties(self): elif self.quant_type == "olive": # Set in_features and out_features module.self_attn.q_proj.out_features = module.self_attn.q_proj.qweight.shape[1] - module.self_attn.q_proj.in_features = module.self_attn.q_proj.qweight.shape[0] * 32 // module.self_attn.q_proj.bits + module.self_attn.q_proj.in_features = ( + 
module.self_attn.q_proj.qweight.shape[0] * 32 // module.self_attn.q_proj.bits + ) module.self_attn.k_proj.out_features = module.self_attn.k_proj.qweight.shape[1] - module.self_attn.k_proj.in_features = module.self_attn.k_proj.qweight.shape[0] * 32 // module.self_attn.k_proj.bits + module.self_attn.k_proj.in_features = ( + module.self_attn.k_proj.qweight.shape[0] * 32 // module.self_attn.k_proj.bits + ) module.self_attn.v_proj.out_features = module.self_attn.v_proj.qweight.shape[1] - module.self_attn.v_proj.in_features = module.self_attn.v_proj.qweight.shape[0] * 32 // module.self_attn.v_proj.bits + module.self_attn.v_proj.in_features = ( + module.self_attn.v_proj.qweight.shape[0] * 32 // module.self_attn.v_proj.bits + ) module.self_attn.o_proj.out_features = module.self_attn.o_proj.qweight.shape[1] - module.self_attn.o_proj.in_features = module.self_attn.o_proj.qweight.shape[0] * 32 // module.self_attn.o_proj.bits + module.self_attn.o_proj.in_features = ( + module.self_attn.o_proj.qweight.shape[0] * 32 // module.self_attn.o_proj.bits + ) module.mlp.gate_proj.out_features = module.mlp.gate_proj.qweight.shape[1] - module.mlp.gate_proj.in_features = module.mlp.gate_proj.qweight.shape[0] * 32 // module.mlp.gate_proj.bits + module.mlp.gate_proj.in_features = ( + module.mlp.gate_proj.qweight.shape[0] * 32 // module.mlp.gate_proj.bits + ) module.mlp.up_proj.out_features = module.mlp.up_proj.qweight.shape[1] module.mlp.up_proj.in_features = module.mlp.up_proj.qweight.shape[0] * 32 // module.mlp.up_proj.bits module.mlp.down_proj.out_features = module.mlp.down_proj.qweight.shape[1] - module.mlp.down_proj.in_features = module.mlp.down_proj.qweight.shape[0] * 32 // module.mlp.down_proj.bits + module.mlp.down_proj.in_features = ( + module.mlp.down_proj.qweight.shape[0] * 32 // module.mlp.down_proj.bits + ) else: raise NotImplementedError(f"The {self.quant_type} quantization method is not recognized.") @@ -597,7 +740,7 @@ def unpack_on_row_for_2_4_8_bits(self, tensor, bits, 
transpose): wf = torch.arange(0, 32, bits, device=pack_tensor.device).unsqueeze(0).unsqueeze(0) out = torch.bitwise_right_shift(torch.unsqueeze(pack_tensor, 2), wf) out = out.reshape(pack_tensor.shape[0], -1) - out = torch.bitwise_and(out, (2 ** bits) - 1) + out = torch.bitwise_and(out, (2**bits) - 1) return out.T if transpose else out def unpack_on_row(self, tensor, bits, transpose): @@ -858,7 +1001,7 @@ def __init__(self, module): self.unpack_qzeros(temp_module) temp_module.qzeros += 1 - temp_module.qzeros = torch.bitwise_and(temp_module.qzeros, (2 ** temp_module.bits) - 1) + temp_module.qzeros = torch.bitwise_and(temp_module.qzeros, (2**temp_module.bits) - 1) self.pack_qzeros(temp_module) module.qzeros = temp_module.qzeros @@ -879,6 +1022,7 @@ def get_layer_bits(self, layer_name): def get_layer_group_size(self, layer_name): return self.get_overrides(layer_name).get("group_size", self.global_group_size) + class QuarkModel(QuantizedModel): def __init__(self, quant_type, input_path, quant_attrs, q_size, kv_size, intermediate_size, num_layers): super().__init__(quant_type, input_path, quant_attrs, q_size, kv_size, intermediate_size, num_layers) @@ -942,7 +1086,7 @@ def get_layer_bits(self, layer_name): raise ValueError(f"Unexpected dtype: {local_dtype}.") return dtype_bits_maps[local_dtype] return self.global_bits - + def get_layer_group_size(self, layer_name): name = layer_name.split(".")[0] if name in self._quant_attrs["config"]["layer_quant_config"]: @@ -992,6 +1136,7 @@ def reverse_reorder_tensor(self, tensor, bits): int_tensor = tensor[:, reverse_order_tensor] return int_tensor + class OliveModel(GPTQModel): def _load_quant_config(self, quant_attrs): super()._load_quant_config(quant_attrs) @@ -1005,6 +1150,7 @@ def get_layer_group_size(self, layer_name): name = ".".join(layer_name.split(".")[:-1]) return self.overrides.get(name, {}).get("group_size", self.global_group_size) + class QuantModel: @staticmethod def from_pretrained(quant_type, **kwargs): diff --git 
a/test/platform/apple/apple_package_test/ios_package_testUITests/ios_package_uitest_cpp_api.mm b/test/platform/apple/apple_package_test/ios_package_testUITests/ios_package_uitest_cpp_api.mm index 391456dc05..9ff3bee78c 100644 --- a/test/platform/apple/apple_package_test/ios_package_testUITests/ios_package_uitest_cpp_api.mm +++ b/test/platform/apple/apple_package_test/ios_package_testUITests/ios_package_uitest_cpp_api.mm @@ -11,7 +11,6 @@ #include #include - @interface ios_package_test_cpp_api : XCTestCase @end @@ -29,42 +28,42 @@ - (void)setUp { - (void)tearDown { // Put teardown code here. This method is called after the invocation of each test method in the class. - OgaShutdown(); + OgaShutdown(); } - (NSString*)getFilePath { - NSBundle* bundle = [NSBundle bundleForClass:[self class]]; - NSString* path = [bundle resourcePath]; - return path; + NSBundle* bundle = [NSBundle bundleForClass:[self class]]; + NSString* path = [bundle resourcePath]; + return path; } - (void)testCppAPI_Basic { - auto model = OgaModel::Create([self getFilePath].UTF8String); + auto model = OgaModel::Create([self getFilePath].UTF8String); - auto tokenizer = OgaTokenizer::Create(*model); + auto tokenizer = OgaTokenizer::Create(*model); - const char* prompt = "<|system|>You are a helpful AI assistant.<|end|><|user|>Can you introduce yourself?<|end|><|assistant|>"; + const char* prompt = "<|system|>You are a helpful AI assistant.<|end|><|user|>Can you introduce yourself?<|end|><|assistant|>"; - auto sequences = OgaSequences::Create(); - tokenizer->Encode(prompt, *sequences); + auto sequences = OgaSequences::Create(); + tokenizer->Encode(prompt, *sequences); - auto params = OgaGeneratorParams::Create(*model); - params->SetSearchOption("max_length", 100); - params->SetSearchOption("batch_size", 1); + auto params = OgaGeneratorParams::Create(*model); + params->SetSearchOption("max_length", 100); + params->SetSearchOption("batch_size", 1); - auto generator = OgaGenerator::Create(*model, *params); 
- generator->AppendTokenSequences(*sequences); + auto generator = OgaGenerator::Create(*model, *params); + generator->AppendTokenSequences(*sequences); - while (true) { - generator->GenerateNextToken(); - if (generator->IsDone()) { - break; - } + while (true) { + generator->GenerateNextToken(); + if (generator->IsDone()) { + break; } + } - const auto output_sequence_length = generator->GetSequenceCount(0); - const auto* output_sequence_data = generator->GetSequenceData(0); - auto out_string = tokenizer->Decode(output_sequence_data, output_sequence_length); + const auto output_sequence_length = generator->GetSequenceCount(0); + const auto* output_sequence_data = generator->GetSequenceData(0); + auto out_string = tokenizer->Decode(output_sequence_data, output_sequence_length); } @end diff --git a/test/platform/apple/apple_package_test/macos_package_testUITests/macos_package_uitest_cpp_api.mm b/test/platform/apple/apple_package_test/macos_package_testUITests/macos_package_uitest_cpp_api.mm index 2802b764e3..a4e5fdfc63 100644 --- a/test/platform/apple/apple_package_test/macos_package_testUITests/macos_package_uitest_cpp_api.mm +++ b/test/platform/apple/apple_package_test/macos_package_testUITests/macos_package_uitest_cpp_api.mm @@ -11,7 +11,6 @@ #include #include - @interface macos_package_testUITests : XCTestCase @end @@ -29,42 +28,42 @@ - (void)setUp { - (void)tearDown { // Put teardown code here. This method is called after the invocation of each test method in the class. 
- OgaShutdown(); + OgaShutdown(); } - (NSString*)getFilePath { - NSBundle* bundle = [NSBundle bundleForClass:[self class]]; - NSString* path = [bundle resourcePath]; - return path; + NSBundle* bundle = [NSBundle bundleForClass:[self class]]; + NSString* path = [bundle resourcePath]; + return path; } - (void)testCppAPI_Basic { - auto model = OgaModel::Create([self getFilePath].UTF8String); + auto model = OgaModel::Create([self getFilePath].UTF8String); - auto tokenizer = OgaTokenizer::Create(*model); + auto tokenizer = OgaTokenizer::Create(*model); - const char* prompt = "<|system|>You are a helpful AI assistant.<|end|><|user|>Can you introduce yourself?<|end|><|assistant|>"; + const char* prompt = "<|system|>You are a helpful AI assistant.<|end|><|user|>Can you introduce yourself?<|end|><|assistant|>"; - auto sequences = OgaSequences::Create(); - tokenizer->Encode(prompt, *sequences); + auto sequences = OgaSequences::Create(); + tokenizer->Encode(prompt, *sequences); - auto params = OgaGeneratorParams::Create(*model); - params->SetSearchOption("max_length", 100); - params->SetSearchOption("batch_size", 1); + auto params = OgaGeneratorParams::Create(*model); + params->SetSearchOption("max_length", 100); + params->SetSearchOption("batch_size", 1); - auto generator = OgaGenerator::Create(*model, *params); - generator->AppendTokenSequences(*sequences); + auto generator = OgaGenerator::Create(*model, *params); + generator->AppendTokenSequences(*sequences); - while (true) { - generator->GenerateNextToken(); - if (generator->IsDone()) { - break; - } + while (true) { + generator->GenerateNextToken(); + if (generator->IsDone()) { + break; } + } - const auto output_sequence_length = generator->GetSequenceCount(0); - const auto* output_sequence_data = generator->GetSequenceData(0); - auto out_string = tokenizer->Decode(output_sequence_data, output_sequence_length); + const auto output_sequence_length = generator->GetSequenceCount(0); + const auto* output_sequence_data = 
generator->GetSequenceData(0); + auto out_string = tokenizer->Decode(output_sequence_data, output_sequence_length); } @end diff --git a/test/python/_test_utils.py b/test/python/_test_utils.py index 78b17a848a..7621ba2b70 100644 --- a/test/python/_test_utils.py +++ b/test/python/_test_utils.py @@ -5,7 +5,6 @@ import os import subprocess import sys -from typing import Dict, List, Optional, Union def is_windows(): @@ -13,13 +12,13 @@ def is_windows(): def run_subprocess( - args: List[str], - cwd: Optional[Union[str, bytes, os.PathLike]] = None, + args: list[str], + cwd: str | bytes | os.PathLike | None = None, capture: bool = False, - dll_path: Optional[Union[str, bytes, os.PathLike]] = None, + dll_path: str | bytes | os.PathLike | None = None, shell: bool = False, - env: Dict[str, str] = {}, - log: Optional[logging.Logger] = None, + env: dict[str, str] = {}, + log: logging.Logger | None = None, ): if log: log.info(f"Running subprocess in '{cwd or os.getcwd()}'\n{args}") @@ -46,9 +45,7 @@ def run_subprocess( ) if log: - log.debug( - "Subprocess completed. Return code=" + str(completed_process.returncode) - ) + log.debug("Subprocess completed. Return code=" + str(completed_process.returncode)) return completed_process @@ -138,7 +135,7 @@ def download_models(download_path, precision, device, log): ci_paths, hf_paths = get_model_paths() output_paths = [] - + log.debug(f"Downloading {len(ci_paths)} PyTorch models and {len(hf_paths)} Hugging Face models") # python -m onnxruntime_genai.models.builder -i -o -p -e @@ -153,6 +150,7 @@ def download_models(download_path, precision, device, log): for model_name, hf_name in hf_paths.items(): try: from huggingface_hub import model_info + model_info(hf_name) except ImportError: log.warning("huggingface_hub is not installed. 
Skipping downloading hugging face models.") diff --git a/test/python/conftest.py b/test/python/conftest.py index 0836e4c590..fac85ef11e 100644 --- a/test/python/conftest.py +++ b/test/python/conftest.py @@ -3,10 +3,8 @@ import functools import os -import sys import pytest -from _test_utils import run_subprocess def pytest_addoption(parser): @@ -34,6 +32,7 @@ def phi2_for(request): "int4", ) + @pytest.fixture def phi3_for(request): return functools.partial( @@ -43,6 +42,7 @@ def phi3_for(request): "int4", ) + @pytest.fixture def gemma_for(request): return functools.partial( @@ -75,9 +75,7 @@ def qwen_for(request): @pytest.fixture def path_for_model(request): - return functools.partial( - get_path_for_model, request.config.getoption("--test_models") - ) + return functools.partial(get_path_for_model, request.config.getoption("--test_models")) @pytest.fixture diff --git a/test/python/test_onnxruntime_genai.py b/test/python/test_onnxruntime_genai.py index 77e89afefb..b813e72305 100644 --- a/test/python/test_onnxruntime_genai.py +++ b/test/python/test_onnxruntime_genai.py @@ -7,7 +7,6 @@ import pathlib import sys import sysconfig -from typing import Union, List import onnxruntime_genai as og from _test_utils import download_models, run_subprocess @@ -19,9 +18,9 @@ def run_onnxruntime_genai_api_tests( - cwd: Union[str, bytes, os.PathLike], + cwd: str | bytes | os.PathLike, log: logging.Logger, - test_models: Union[str, bytes, os.PathLike], + test_models: str | bytes | os.PathLike, ): log.debug("Running: ONNX Runtime GenAI API Tests") @@ -38,9 +37,9 @@ def run_onnxruntime_genai_api_tests( def run_onnxruntime_genai_e2e_tests( - cwd: Union[str, bytes, os.PathLike], + cwd: str | bytes | os.PathLike, log: logging.Logger, - output_paths: List[Union[str, bytes, os.PathLike]], + output_paths: list[str | bytes | os.PathLike], ): log.debug("Running: ONNX Runtime GenAI E2E Tests") diff --git a/test/python/test_onnxruntime_genai_api.py b/test/python/test_onnxruntime_genai_api.py 
index b5f4a2d2b9..2c6cb0aa3e 100644 --- a/test/python/test_onnxruntime_genai_api.py +++ b/test/python/test_onnxruntime_genai_api.py @@ -4,14 +4,13 @@ from __future__ import annotations import os -import sys -import sysconfig -from pathlib import Path import shutil +import sysconfig import tempfile -import onnxruntime +from pathlib import Path import numpy as np +import onnxruntime import onnxruntime_genai as og import pytest @@ -37,9 +36,7 @@ def test_config(test_data_path): - model_path = os.fspath( - Path(test_data_path) / "hf-internal-testing" / "tiny-random-gpt2-fp32" - ) + model_path = os.fspath(Path(test_data_path) / "hf-internal-testing" / "tiny-random-gpt2-fp32") config = og.Config(model_path) config.clear_providers() config.append_provider("cuda") @@ -48,6 +45,7 @@ def test_config(test_data_path): config.set_provider_option("quantum", "break_universe", "true") config.append_provider("slide rule") + def test_log_callback(test_data_path): callback_invoked = False @@ -72,6 +70,7 @@ def _log_callback(log: str): og.set_log_callback(None) og.set_log_options(enabled=False) + def test_log_filename(test_data_path): callback_invoked = False @@ -81,7 +80,7 @@ def _log_callback(log: str): og.set_log_callback(_log_callback) - with tempfile.NamedTemporaryFile(mode='w+', suffix='.txt', delete=False) as log_file: + with tempfile.NamedTemporaryFile(mode="w+", suffix=".txt", delete=False) as log_file: og.set_log_options(enabled=True, generate_next_token=True, filename=log_file.name) model_path = os.fspath(Path(test_data_path) / "hf-internal-testing" / "tiny-random-gpt2-fp32") @@ -95,19 +94,18 @@ def _log_callback(log: str): assert os.path.exists(log_file.name), f"Log file {log_file.name} was not created" assert os.path.getsize(log_file.name) > 0, f"Log file {log_file.name} is empty" - assert not callback_invoked, "Log callback was invoked. It should not have been since it was overridden by the log file." + assert not callback_invoked, ( + "Log callback was invoked. 
It should not have been since it was overridden by the log file." + ) og.set_log_options(enabled=False, filename="") og.set_log_callback(None) + def test_NamedTensors(): named_tensors = og.NamedTensors() - named_tensors["input_ids"] = np.array( - [[0, 0, 0, 52], [0, 0, 195, 731]], dtype=np.int32 - ) - named_tensors["attention_mask"] = np.array( - [[1, 1, 1, 1], [1, 1, 1, 1]], dtype=np.int32 - ) + named_tensors["input_ids"] = np.array([[0, 0, 0, 52], [0, 0, 195, 731]], dtype=np.int32) + named_tensors["attention_mask"] = np.array([[1, 1, 1, 1], [1, 1, 1, 1]], dtype=np.int32) named_tensors["test1"] = og.Tensor(np.random.rand(2, 2).astype(np.float32)) named_tensors["test2"] = og.Tensor(np.random.rand(2, 2).astype(np.float32)) @@ -146,9 +144,7 @@ def test_greedy_search(test_data_path, relative_model_path): search_params = og.GeneratorParams(model) batch_size = 2 search_params = og.GeneratorParams(model) - search_params.set_search_options( - do_sample=False, max_length=10, batch_size=batch_size - ) + search_params.set_search_options(do_sample=False, max_length=10, batch_size=batch_size) generator = og.Generator(model, search_params) generator.append_tokens(np.array([[0, 0, 0, 52], [0, 0, 195, 731]], dtype=np.int32)) @@ -193,9 +189,7 @@ def test_rewind_cuda(test_data_path, relative_model_path): # Batch size 1 (continuous decoding) case batch_size = 1 search_params = og.GeneratorParams(model) - search_params.set_search_options( - do_sample=False, max_length=10, batch_size=batch_size - ) + search_params.set_search_options(do_sample=False, max_length=10, batch_size=batch_size) generator = og.Generator(model, search_params) generator.append_tokens(np.array([[0, 0, 195, 731]], dtype=np.int32)) @@ -219,14 +213,10 @@ def test_rewind_cuda(test_data_path, relative_model_path): # Batch size > 1 case batch_size = 3 search_params = og.GeneratorParams(model) - search_params.set_search_options( - do_sample=False, max_length=10, batch_size=batch_size - ) + 
search_params.set_search_options(do_sample=False, max_length=10, batch_size=batch_size) generator = og.Generator(model, search_params) - generator.append_tokens( - np.array([[0, 0, 0, 52], [0, 0, 195, 731], [64, 65, 66, 67]], dtype=np.int32) - ) + generator.append_tokens(np.array([[0, 0, 0, 52], [0, 0, 195, 731], [64, 65, 66, 67]], dtype=np.int32)) while True: generator.generate_next_token() if generator.is_done(): @@ -265,12 +255,10 @@ def test_rewind(test_data_path, relative_model_path): [0, 0, 195, 731, 731, 114, 114, 114, 114, 114], dtype=np.int32, ) - + batch_size = 1 search_params = og.GeneratorParams(model) - search_params.set_search_options( - do_sample=False, max_length=10, batch_size=batch_size - ) + search_params.set_search_options(do_sample=False, max_length=10, batch_size=batch_size) generator = og.Generator(model, search_params) generator.append_tokens(np.array([[0, 0, 195, 731]], dtype=np.int32)) @@ -294,6 +282,7 @@ def test_rewind(test_data_path, relative_model_path): # Test Model Loading with No Chat Template + @pytest.mark.skipif( sysconfig.get_platform().endswith("arm64"), reason="Model is not available on arm64.", @@ -335,7 +324,9 @@ def test_phi3_chat_template(device, phi3_for): model = og.Model(model_path) tokenizer = og.Tokenizer(model) - messages = f"""[{{"role": "system", "content": "This is a test."}}, {{"role": "user", "content": "Hi, how are you?"}}]""" + messages = ( + """[{"role": "system", "content": "This is a test."}, {"role": "user", "content": "Hi, how are you?"}]""" + ) try: tokenizer.apply_chat_template(messages=messages, add_generation_prompt=True) @@ -355,15 +346,15 @@ def test_phi2_chat_template(device, phi2_for): model = og.Model(model_path) tokenizer = og.Tokenizer(model) - messages = f"""[{{"role": "system", "content": "This is a test."}}, {{"role": "user", "content": "Hi, how are you?"}}]""" + messages = ( + """[{"role": "system", "content": "This is a test."}, {"role": "user", "content": "Hi, how are you?"}]""" + ) # 
Note: this should work, even though phi-2 has no official chat template, as we override it and pass one in template = """{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}""" template_string = f"""{template}""" try: - tokenizer.apply_chat_template( - template_str=template_string, messages=messages, add_generation_prompt=True - ) + tokenizer.apply_chat_template(template_str=template_string, messages=messages, add_generation_prompt=True) except Exception as e: assert False, f"Error while trying to override chat template: {e}" @@ -392,6 +383,7 @@ def test_tokenizer_stream(device, phi2_for): assert decoded_string == prompt + @pytest.mark.skipif( sysconfig.get_platform().endswith("arm64"), reason="Model is not available on arm64.", @@ -422,6 +414,7 @@ def test_batching(device, phi2_for): for i in range(len(prompts)): print(tokenizer.decode(generator.get_sequence(0))) + @pytest.mark.skipif( sysconfig.get_platform().endswith("arm64"), reason="Model is not available on arm64.", @@ -452,6 +445,7 @@ def test_e2e(device, phi2_for): for i in range(len(prompts)): print(tokenizer.decode(generator.get_sequence(0))) + @pytest.mark.skipif( sysconfig.get_platform().endswith("arm64"), reason="Model is not available on arm64.", @@ -462,7 +456,7 @@ def test_load_model_from_memory(device, wrapper_bytes_function, phi2_for): model_path = phi2_for(device) config = og.Config(model_path) model_data = None - with open(os.path.join(model_path, "model.onnx"), 'rb') as model_file: + with open(os.path.join(model_path, "model.onnx"), "rb") as model_file: model_data = wrapper_bytes_function(model_file.read()) 
config.add_model_data("model.onnx", model_data) @@ -486,6 +480,7 @@ def test_load_model_from_memory(device, wrapper_bytes_function, phi2_for): for i in range(len(prompts)): print(tokenizer.decode(generator.get_sequence(0))) + @pytest.mark.parametrize( "relative_model_path", ( @@ -527,9 +522,7 @@ def test_get_output(test_data_path, relative_model_path): search_params = og.GeneratorParams(model) input_ids = np.array([[0, 0, 0, 52], [0, 0, 195, 731]], dtype=np.int32) - search_params.set_search_options( - do_sample=False, max_length=10, batch_size=input_ids.shape[0] - ) + search_params.set_search_options(do_sample=False, max_length=10, batch_size=input_ids.shape[0]) generator = og.Generator(model, search_params) generator.append_tokens(input_ids) @@ -566,9 +559,7 @@ def test_get_output(test_data_path, relative_model_path): ] ) logits = generator.get_output("logits") - assert np.allclose( - logits[:, :, ::200], expected_sampled_logits_token_gen, atol=1e-3 - ) + assert np.allclose(logits[:, :, ::200], expected_sampled_logits_token_gen, atol=1e-3) @pytest.mark.skipif( @@ -581,9 +572,7 @@ def test_hidden_states(qwen_for, device): search_params = og.GeneratorParams(model) input_ids = np.array([[0, 0, 0, 52], [0, 0, 195, 731]], dtype=np.int32) - search_params.set_search_options( - do_sample=False, max_length=10, batch_size=input_ids.shape[0] - ) + search_params.set_search_options(do_sample=False, max_length=10, batch_size=input_ids.shape[0]) generator = og.Generator(model, search_params) generator.append_tokens(input_ids) @@ -595,9 +584,7 @@ def test_hidden_states(qwen_for, device): assert hidden_states.shape == (2, 1, 896) -@pytest.mark.skipif( - not og.is_cuda_available(), reason="Pipeline model uses a mix of CPU and CUDA EP." 
-) +@pytest.mark.skipif(not og.is_cuda_available(), reason="Pipeline model uses a mix of CPU and CUDA EP.") @pytest.mark.parametrize("relative_model_path", [Path("pipeline-model")]) def test_pipeline_model(test_data_path, phi2_for, relative_model_path): def _extract_subgraph( @@ -630,19 +617,10 @@ def _split(onnx_model_path: os.PathLike, output_dir: os.PathLike): (["input_ids"], ["/model/embed_tokens/Gather/output_0"]), ( ["/model/embed_tokens/Gather/output_0", "attention_mask"] - + [ - f"past_key_values.{i}.{kv}" - for kv in ["key", "value"] - for i in range(num_layers) - ], - ["hidden_states"] - + [ - f"present.{i}.{kv}" - for kv in ["key", "value"] - for i in range(num_layers) - ], + + [f"past_key_values.{i}.{kv}" for kv in ["key", "value"] for i in range(num_layers)], + ["hidden_states"] + [f"present.{i}.{kv}" for kv in ["key", "value"] for i in range(num_layers)], ), - ([f"hidden_states"], ["logits"]), + (["hidden_states"], ["logits"]), ] for i, split_name in enumerate(["embeds", "transformer", "lm_head"]): @@ -690,8 +668,8 @@ def _split(onnx_model_path: os.PathLike, output_dir: os.PathLike): if not equal: print("test_pipeline_model:", flush=True) - print(f"expected = {repr(expected_output[i])}", flush=True) - print(f"actual = {repr(actual_output)}", flush=True) + print(f"expected = {expected_output[i]!r}", flush=True) + print(f"actual = {actual_output!r}", flush=True) assert equal @@ -712,9 +690,7 @@ def test_vision_preprocessing(test_data_path, relative_model_path, relative_imag @pytest.mark.parametrize("relative_model_path", [Path("vision-preprocessing")]) @pytest.mark.parametrize("relative_image_path", [Path("images") / "sheet.png"]) -def test_vision_preprocessing_load_image_from_bytes( - test_data_path, relative_model_path, relative_image_path -): +def test_vision_preprocessing_load_image_from_bytes(test_data_path, relative_model_path, relative_image_path): model_path = os.fspath(Path(test_data_path) / relative_model_path) model = og.Model(model_path) @@ 
-735,23 +711,20 @@ def test_vision_preprocessing_load_image_from_bytes( "relative_image_paths", [[Path("images") / "australia.jpg", Path("images") / "sheet.png"]], ) -def test_vision_preprocessing_multiple_images( - test_data_path, relative_model_path, relative_image_paths -): +def test_vision_preprocessing_multiple_images(test_data_path, relative_model_path, relative_image_paths): model_path = os.fspath(Path(test_data_path) / relative_model_path) model = og.Model(model_path) processor = model.create_multimodal_processor() image_paths = [ - os.fspath(Path(test_data_path) / relative_image_path) - for relative_image_path in relative_image_paths + os.fspath(Path(test_data_path) / relative_image_path) for relative_image_path in relative_image_paths ] images = og.Images.open(*image_paths) prompt = "<|user|>\n" for i in range(len(relative_image_paths)): - prompt += f"<|image_{i+1}|>\n" + prompt += f"<|image_{i + 1}|>\n" prompt += " What is shown in this two images?\n<|end|>\n<|assistant|>\n" _ = processor(prompt, images=images) @@ -796,9 +769,7 @@ def _prepare_adapter_model(test_data_path): model.graph.input.extend([adapter_a, adapter_b]) for adapter_name in ["adapter_a", "adapter_b"]: - adapter_weight = np.zeros( - [vocab_size], dtype=(np.float32 if device == "cpu" else np.float16) - ) + adapter_weight = np.zeros([vocab_size], dtype=(np.float32 if device == "cpu" else np.float16)) adapter_weight_tensor = onnx.helper.make_tensor( adapter_name, onnx.TensorProto.FLOAT if device == "cpu" else onnx.TensorProto.FLOAT16, @@ -807,9 +778,7 @@ def _prepare_adapter_model(test_data_path): ) model.graph.initializer.append(adapter_weight_tensor) - add_node = onnx.helper.make_node( - "Add", ["adapter_a", "adapter_b"], ["adapter_output"], name="adapter_add" - ) + add_node = onnx.helper.make_node("Add", ["adapter_a", "adapter_b"], ["adapter_output"], name="adapter_add") add_to_logits_node = onnx.helper.make_node( "Add", ["adapter_output", "logits_0"], ["logits"], name="add_to_logits" ) 
@@ -833,12 +802,8 @@ def _prepare_adapter_model(test_data_path): onnx_dtype = 1 if device == "cpu" else 10 adapters = { - "adapter_a": onnxruntime.OrtValue.ortvalue_from_numpy_with_onnx_type( - a, onnx_dtype - ), - "adapter_b": onnxruntime.OrtValue.ortvalue_from_numpy_with_onnx_type( - b, onnx_dtype - ), + "adapter_a": onnxruntime.OrtValue.ortvalue_from_numpy_with_onnx_type(a, onnx_dtype), + "adapter_b": onnxruntime.OrtValue.ortvalue_from_numpy_with_onnx_type(b, onnx_dtype), } if multiple_adapters: adapters = [{key: value} for key, value in adapters.items()] @@ -853,9 +818,7 @@ def _export_adapter(adapter, adapter_file_name): adapter_paths = [] if multiple_adapters: for i, adapter in enumerate(adapters): - adapter_file_name = str( - Path(adapter_model_path) / f"adapter_{i}.onnx_adapter" - ) + adapter_file_name = str(Path(adapter_model_path) / f"adapter_{i}.onnx_adapter") _export_adapter(adapter, adapter_file_name) adapter_paths.append(adapter_file_name) else: @@ -1011,8 +974,7 @@ def test_audio_preprocessing_multiple_audios(test_data_path, relative_model_path processor = model.create_multimodal_processor() audio_paths = [ - os.fspath(Path(test_data_path) / relative_audio_path) - for relative_audio_path in relative_audio_paths + os.fspath(Path(test_data_path) / relative_audio_path) for relative_audio_path in relative_audio_paths ] audios = og.Audios.open(*audio_paths) diff --git a/test/python/test_onnxruntime_genai_e2e.py b/test/python/test_onnxruntime_genai_e2e.py index 4a8c1c4c0a..96c77b9e82 100644 --- a/test/python/test_onnxruntime_genai_e2e.py +++ b/test/python/test_onnxruntime_genai_e2e.py @@ -4,8 +4,8 @@ import argparse import json -import os import logging +import os import sys import onnxruntime_genai as og @@ -37,7 +37,7 @@ def run_model(model_path: str | bytes | os.PathLike): generator.generate_next_token() if generator.is_done(): break - + for i in range(3): assert generator.get_sequence(i) is not None diff --git a/test/test_models/create_dummy_model.py 
b/test/test_models/create_dummy_model.py index 4da008b9b5..453a5ff833 100644 --- a/test/test_models/create_dummy_model.py +++ b/test/test_models/create_dummy_model.py @@ -54,9 +54,11 @@ """ import argparse + import numpy as np import onnx -from onnx import helper, numpy_helper, TensorProto +from onnx import TensorProto, helper, numpy_helper + def get_args(): parser = argparse.ArgumentParser() diff --git a/tools/ci_build/get_docker_image.py b/tools/ci_build/get_docker_image.py index cf35cb3e97..8ed7dbb969 100644 --- a/tools/ci_build/get_docker_image.py +++ b/tools/ci_build/get_docker_image.py @@ -60,9 +60,7 @@ def main(): args = parse_args() log.debug( - "Dockerfile: {}, context: {}, docker build args: '{}'".format( - args.dockerfile, args.context, args.docker_build_args - ) + f"Dockerfile: {args.dockerfile}, context: {args.context}, docker build args: '{args.docker_build_args}'" ) use_container_registry = args.container_registry is not None diff --git a/tools/ci_build/github/android/build_aar_package.py b/tools/ci_build/github/android/build_aar_package.py index 8b3b339acb..375a9f6f34 100644 --- a/tools/ci_build/github/android/build_aar_package.py +++ b/tools/ci_build/github/android/build_aar_package.py @@ -7,7 +7,6 @@ import os import subprocess import sys - from pathlib import Path REPO_ROOT = Path(__file__).parents[4] @@ -97,7 +96,7 @@ def _build_aar(args): jnilibs_dir = intermediates_dir / "jnilibs" / build_config base_build_command = [sys.executable, str(BUILD_PY), f"--config={build_config}"] if args.ort_home: - base_build_command += [f"--ort_home={str(args.ort_home)}"] + base_build_command += [f"--ort_home={args.ort_home!s}"] base_build_command += build_settings["build_params"] header_files_path = None diff --git a/tools/ci_build/github/apple/build_and_assemble_apple_pods.py b/tools/ci_build/github/apple/build_and_assemble_apple_pods.py index 35bbb6ba4c..6c09a6a852 100755 --- a/tools/ci_build/github/apple/build_and_assemble_apple_pods.py +++ 
b/tools/ci_build/github/apple/build_and_assemble_apple_pods.py @@ -11,9 +11,10 @@ import tempfile from c.assemble_c_pod_package import assemble_c_pod_package -from objectivec.assemble_objc_pod_package import assemble_objc_pod_package from package_assembly_utils import PackageVariant, get_ort_genai_version +from objectivec.assemble_objc_pod_package import assemble_objc_pod_package + SCRIPT_PATH = pathlib.Path(__file__).resolve() SCRIPT_DIR = SCRIPT_PATH.parent REPO_DIR = SCRIPT_PATH.parents[4] diff --git a/tools/ci_build/github/apple/build_apple_framework.py b/tools/ci_build/github/apple/build_apple_framework.py index a298f5f450..da136e5eff 100644 --- a/tools/ci_build/github/apple/build_apple_framework.py +++ b/tools/ci_build/github/apple/build_apple_framework.py @@ -175,7 +175,7 @@ def _build_package(args): ) if args.ort_home: - base_build_command += ['--ort_home', args.ort_home] + base_build_command += ["--ort_home", args.ort_home] if args.include_ops_by_config is not None: base_build_command += ["--include_ops_by_config=" + str(args.include_ops_by_config.resolve())] diff --git a/tools/ci_build/github/apple/c/assemble_c_pod_package.py b/tools/ci_build/github/apple/c/assemble_c_pod_package.py index 878333c38a..a38cfe4ac1 100644 --- a/tools/ci_build/github/apple/c/assemble_c_pod_package.py +++ b/tools/ci_build/github/apple/c/assemble_c_pod_package.py @@ -40,7 +40,7 @@ def assemble_c_pod_package( public_headers_dir: pathlib.Path, framework_dir: pathlib.Path, package_variant: PackageVariant, - ort_version: str + ort_version: str, ): """ Assembles the files for the C/C++ pod package in a staging directory. @@ -136,9 +136,7 @@ def parse_args(): "--variant", choices=PackageVariant.release_variant_names(), required=True, help="Pod package variant." ) - parser.add_argument( - "--ort-version", required=True, help="The ORT version to depend on." 
- ) + parser.add_argument("--ort-version", required=True, help="The ORT version to depend on.") return parser.parse_args() @@ -153,7 +151,7 @@ def main(): public_headers_dir=args.public_headers_dir, framework_dir=args.framework_dir, package_variant=PackageVariant[args.variant], - ort_version=args.ort_version + ort_version=args.ort_version, ) return 0 diff --git a/tools/ci_build/github/apple/objectivec/assemble_objc_pod_package.py b/tools/ci_build/github/apple/objectivec/assemble_objc_pod_package.py index e11774b705..572bc4901f 100755 --- a/tools/ci_build/github/apple/objectivec/assemble_objc_pod_package.py +++ b/tools/ci_build/github/apple/objectivec/assemble_objc_pod_package.py @@ -51,6 +51,7 @@ ], } + def get_pod_files(package_variant: PackageVariant): """ Gets the source and header files for the given package variant. diff --git a/tools/ci_build/github/apple/package_assembly_utils.py b/tools/ci_build/github/apple/package_assembly_utils.py index cca1c6eb1b..2b702a0203 100644 --- a/tools/ci_build/github/apple/package_assembly_utils.py +++ b/tools/ci_build/github/apple/package_assembly_utils.py @@ -7,7 +7,6 @@ import pathlib import re import shutil -from typing import Dict, List _script_dir = pathlib.Path(__file__).parent.resolve(strict=True) repo_root = _script_dir.parents[3] @@ -25,7 +24,7 @@ def release_variant_names(cls): def gen_file_from_template( - template_file: pathlib.Path, output_file: pathlib.Path, variable_substitutions: Dict[str, str], strict: bool = True + template_file: pathlib.Path, output_file: pathlib.Path, variable_substitutions: dict[str, str], strict: bool = True ): """ Generates a file from a template file. 
@@ -64,7 +63,7 @@ def replace_template_variable(match): output.write(content) -def filter_files(subpath: str, all_file_patterns: List[str], excluded_file_patterns: List[str]): +def filter_files(subpath: str, all_file_patterns: list[str], excluded_file_patterns: list[str]): """ Filters file paths based on inclusion and exclusion patterns @@ -90,7 +89,7 @@ def filter_files(subpath: str, all_file_patterns: List[str], excluded_file_patte return list(set(all_files) - set(exclude_files)) -def copy_repo_relative_to_dir(subpath: str, patterns: List[str], dest_dir: pathlib.Path): +def copy_repo_relative_to_dir(subpath: str, patterns: list[str], dest_dir: pathlib.Path): """ Copies file paths relative to the repo root to a directory. The given paths or path patterns are relative to the repo root, and the diff --git a/tools/ci_build/github/apple/test_apple_packages.py b/tools/ci_build/github/apple/test_apple_packages.py index a16ad89b0c..c09f48f9d8 100644 --- a/tools/ci_build/github/apple/test_apple_packages.py +++ b/tools/ci_build/github/apple/test_apple_packages.py @@ -12,9 +12,8 @@ import sys import tempfile -from huggingface_hub import snapshot_download - from c.assemble_c_pod_package import assemble_c_pod_package +from huggingface_hub import snapshot_download from package_assembly_utils import PackageVariant, gen_file_from_template, get_ort_genai_version SCRIPT_PATH = pathlib.Path(__file__).resolve(strict=True) @@ -84,7 +83,7 @@ def _test_apple_packages(args): public_headers_dir=public_headers_dir, framework_dir=framework_dir, package_variant=PackageVariant[args.variant], - ort_version=args.ort_version + ort_version=args.ort_version, ) # move podspec out to target_proj_path first @@ -128,7 +127,7 @@ def _test_apple_packages(args): snapshot_download( repo_id="microsoft/Phi-3-mini-4k-instruct-onnx", allow_patterns="cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/*", - local_dir=model_dir + local_dir=model_dir, ) # run the tests @@ -283,9 +282,7 @@ def parse_args(): 
help="Run tests for mac catalyst variants. Specify this argument when build targets contains catalyst archs. ", ) - parser.add_argument( - "--ort_version", required=True, help="The ORT version to depend on." - ) + parser.add_argument("--ort_version", required=True, help="The ORT version to depend on.") return parser.parse_args() diff --git a/tools/nuget/generate_nuspec_for_custom_nuget.py b/tools/nuget/generate_nuspec_for_custom_nuget.py index 4a7cd32c61..df218f3652 100644 --- a/tools/nuget/generate_nuspec_for_custom_nuget.py +++ b/tools/nuget/generate_nuspec_for_custom_nuget.py @@ -27,9 +27,7 @@ def generate_files(lines, args): continue file_name = os.path.basename(file) - files_list.append( - f'' - ) + files_list.append(f'') for file in glob.glob(os.path.join(platform_dir, "include", "*")): if not os.path.isfile(file): @@ -38,24 +36,18 @@ def generate_files(lines, args): if file_name in processed_includes: continue processed_includes.add(file_name) - files_list.append( - f'' - ) + files_list.append(f'') - files_list.append(f'') - files_list.append( - f'' - ) - files_list.append( - f'' - ) + files_list.append(rf'') + files_list.append(f'') + files_list.append(rf'') for dotnet in ["netstandard2.0", "net8.0", "native"]: files_list.append( - f'' + f'' ) files_list.append( - f'' + f'' ) files_list.append("") @@ -69,12 +61,8 @@ def parse_arguments(): parser.add_argument("--package_name", required=True, help="Name of the custom package.") parser.add_argument("--ort_package_name", required=True, help="Corresponding ORT custom package name.") - parser.add_argument( - "--package_version", required=True, help="ORT GenAI package version. Eg: 1.0.0" - ) - parser.add_argument( - "--ort_package_version", required=True, help="Corresponding ORT package version." - ) + parser.add_argument("--package_version", required=True, help="ORT GenAI package version. 
Eg: 1.0.0") + parser.add_argument("--ort_package_version", required=True, help="Corresponding ORT package version.") parser.add_argument("--nuspec_path", required=True, help="Nuspec output file path.") parser.add_argument("--root_dir", required=True, help="ORT GenAI repository root directory.") parser.add_argument( diff --git a/tools/nuget/generate_nuspec_for_native_nuget.py b/tools/nuget/generate_nuspec_for_native_nuget.py index 37465c7de2..124c6bd7ce 100644 --- a/tools/nuget/generate_nuspec_for_native_nuget.py +++ b/tools/nuget/generate_nuspec_for_native_nuget.py @@ -109,15 +109,15 @@ def generate_dependencies(xml_text, package_version, ort_package_name, ort_packa def generate_files(lines, args): lines.append('') - lines.append(f'') - lines.append(f'') - lines.append(f'') + lines.append(rf'') + lines.append(f'') + lines.append(rf'') def add_native_artifact_if_exists(xml_lines, runtime, artifact): p = Path(f"{args.sources_path}/{args.native_build_path}/{runtime}/{args.build_config}/{artifact}") if p.exists(): xml_lines.append( - f'' + f'' ) runtimes = ["win-x64", "win-arm64", "linux-x64", "osx-x64", "osx-arm64", "ios", "android"] @@ -140,22 +140,22 @@ def add_native_artifact_if_exists(xml_lines, runtime, artifact): # targets for dotnet in ["netstandard2.0", "net8.0", "native"]: - lines.append(f'') - lines.append(f'') + lines.append(f'') + lines.append(f'') # mobile targets - lines.append(f'') - lines.append(f'') + lines.append(f'') + lines.append(f'') - lines.append(f'') - lines.append(f'') + lines.append(f'') + lines.append(f'') - lines.append(f'') - lines.append(f'') + lines.append('') + lines.append('') # include - lines.append(f'') - lines.append(f'') + lines.append(f'') + lines.append(f'') lines.append('') diff --git a/tools/nuget/generate_nuspec_for_winml_nuget.py b/tools/nuget/generate_nuspec_for_winml_nuget.py index eea039802e..c0428c0a78 100644 --- a/tools/nuget/generate_nuspec_for_winml_nuget.py +++ b/tools/nuget/generate_nuspec_for_winml_nuget.py @@ 
-25,9 +25,7 @@ def generate_files(lines, args): continue file_name = os.path.basename(file) - files_list.append( - f'' - ) + files_list.append(f'') for file in glob.glob(os.path.join(platform_dir, "include", "*")): if not os.path.isfile(file): @@ -36,24 +34,18 @@ def generate_files(lines, args): if file_name in processed_includes: continue processed_includes.add(file_name) - files_list.append( - f'' - ) + files_list.append(f'') - files_list.append(f'') - files_list.append( - f'' - ) - files_list.append( - f'' - ) + files_list.append(rf'') + files_list.append(f'') + files_list.append(rf'') for dotnet in ["netstandard2.0", "net8.0", "native"]: files_list.append( - f'' + f'' ) files_list.append( - f'' + f'' ) files_list.append("") @@ -67,12 +59,8 @@ def parse_arguments(): parser.add_argument("--package_name", required=True, help="Name of the custom package.") parser.add_argument("--ort_package_name", required=True, help="Corresponding ORT custom package name.") - parser.add_argument( - "--package_version", required=True, help="ORT GenAI package version. Eg: 1.0.0" - ) - parser.add_argument( - "--ort_package_version", required=True, help="Corresponding ORT package version." - ) + parser.add_argument("--package_version", required=True, help="ORT GenAI package version. 
Eg: 1.0.0") + parser.add_argument("--ort_package_version", required=True, help="Corresponding ORT package version.") parser.add_argument("--nuspec_path", required=True, help="Nuspec output file path.") parser.add_argument("--root_dir", required=True, help="ORT GenAI repository root directory.") parser.add_argument( diff --git a/tools/python/model_validation/perplexity_metrics.py b/tools/python/model_validation/perplexity_metrics.py index e73515a84c..e583e09a5c 100644 --- a/tools/python/model_validation/perplexity_metrics.py +++ b/tools/python/model_validation/perplexity_metrics.py @@ -1,8 +1,10 @@ import json -from datasets import load_dataset + import numpy as np import onnxruntime_genai as og import torch +from datasets import load_dataset + def get_wikitext2(): test = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test') @@ -18,7 +20,7 @@ def perplexity_eval(model_dir): total_log_probs = 0 total_token_count = 0 - # Concatenated text + # Concatenated text dataset = get_wikitext2() # Encode the entire dataset as one batch @@ -28,11 +30,11 @@ def perplexity_eval(model_dir): # Need to retreive the Model's maximum via the ORT GenAI configuration ## Explore the biggest max length vs the context length in genai config and calculate the lower of the two - with open(model_dir+'/genai_config.json', 'r') as file: + with open(model_dir+'/genai_config.json') as file: config = json.load(file) max_length = config["model"]["context_length"]-1 # This is the default for qwen - stride = 8192 + stride = 8192 # Just get the perplexity for one position seq_len = input_ids.size(1) @@ -41,7 +43,7 @@ def perplexity_eval(model_dir): # Hugging face looping logic for begin_loc in range(0, seq_len, stride): end_loc = min(begin_loc + max_length, seq_len) - trg_len = end_loc - prev_end_loc + trg_len = end_loc - prev_end_loc input_ids_chunk = input_ids[:, begin_loc:end_loc] target_ids = input_ids_chunk.clone() print(f"input_ids_chunk shape: {input_ids_chunk.shape}") @@ -54,7 +56,7 @@ def 
perplexity_eval(model_dir): generator = og.Generator(model, params) - # Get Logits + # Get Logits with torch.no_grad(): generator.compute_logits() logits = generator.get_output("logits") @@ -67,9 +69,9 @@ def perplexity_eval(model_dir): target_ids_flat = target_ids.flatten() target_log_probs = log_probs[0, np.arange(3), target_ids_flat] - + target_log_probs_sliced = target_log_probs[:, -trg_len:] - + print(f"target_log_probs shape: {target_log_probs_sliced.shape}") total_log_probs += np.sum(target_log_probs_sliced) @@ -87,4 +89,4 @@ def perplexity_eval(model_dir): perplexity = np.exp(-avg_log_prob) print(f"The perplexity of {model_dir} is {perplexity}") - return perplexity \ No newline at end of file + return perplexity diff --git a/tools/python/model_validation/validation_tool.py b/tools/python/model_validation/validation_tool.py index 0cbf74357b..4152a9cf60 100644 --- a/tools/python/model_validation/validation_tool.py +++ b/tools/python/model_validation/validation_tool.py @@ -1,11 +1,13 @@ -import onnxruntime_genai as og import argparse -from onnxruntime_genai.models.builder import create_model import json import os + +import onnxruntime_genai as og import pandas as pd +from onnxruntime_genai.models.builder import create_model from perplexity_metrics import perplexity_eval + def create_table(output): df = pd.DataFrame(output, columns=['Model Name', 'Validation Completed', 'Exceptions / Failures']) return df @@ -17,10 +19,10 @@ def validate_model(args, model_dict, model_dir): if args["verbose"]: print("Model loaded") tokenizer = og.Tokenizer(model) - tokenizer_stream = tokenizer.create_stream() + tokenizer_stream = tokenizer.create_stream() if args["verbose"]: print("Tokenizer created") - if args["verbose"]: print() + if args["verbose"]: print() chat_template = model_dict["chat_template"] @@ -29,7 +31,7 @@ def validate_model(args, model_dict, model_dir): for text in args["inputs"]: complete_text = '' - + prompt = f'{chat_template.format(input=text)}' input_tokens 
= tokenizer.encode(prompt) @@ -54,13 +56,13 @@ def validate_model(args, model_dict, model_dir): generator.generate_next_token() new_token = generator.get_next_tokens()[0] - + value_to_save = tokenizer_stream.decode(new_token) complete_text += value_to_save print(tokenizer_stream.decode(new_token), end='', flush=True) - + except KeyboardInterrupt: print(" --control+c pressed, aborting generation--") generation_successful = False @@ -81,7 +83,7 @@ def validate_model(args, model_dict, model_dir): parser.add_argument('-j', '--json', type=str, required=True, help='Path to the JSON file containing the arguments') args = parser.parse_args() - with open(args.json, 'r') as file: + with open(args.json) as file: args = json.load(file) os.makedirs(args["output_directory"], exist_ok=True) @@ -108,24 +110,24 @@ def validate_model(args, model_dict, model_dir): output.append([model_dict["name"], validation_complete, e]) exception = True continue - try: + try: validation_complete = validate_model(args, model_dict, output_path) except Exception as e: print(f'Failure after validation model {e}') exception = True - output.append([model_dict["name"], validation_complete, e]) + output.append([model_dict["name"], validation_complete, e]) try: perplexity_eval(output_path) except Exception as e: print(f'Failure after perplexity calculation model {e}') exception = True - output.append([model_dict["name"], validation_complete, e]) + output.append([model_dict["name"], validation_complete, e]) + - if not exception: - output.append([model_dict["name"], validation_complete, e]) - + output.append([model_dict["name"], validation_complete, e]) + df = create_table(output) df.to_csv("validation_summary.csv") diff --git a/tools/python/util/__init__.py b/tools/python/util/__init__.py index 7ff7fb454f..c12376cc72 100644 --- a/tools/python/util/__init__.py +++ b/tools/python/util/__init__.py @@ -1,8 +1,8 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
+from .android import * +from .dependency_resolver import copy_dependencies, download_dependencies from .logger import get_logger +from .platform_helpers import is_aix, is_linux, is_mac, is_windows, is_windows_arm from .run import run -from .android import * -from .platform_helpers import is_linux, is_mac, is_windows, is_aix, is_windows_arm -from .dependency_resolver import download_dependencies, copy_dependencies diff --git a/tools/python/util/android.py b/tools/python/util/android.py index f1f29328fd..403959549a 100644 --- a/tools/python/util/android.py +++ b/tools/python/util/android.py @@ -4,13 +4,10 @@ import collections import contextlib import datetime -import os -import shutil import signal import subprocess import time import typing - from pathlib import Path from .logger import get_logger @@ -20,7 +17,7 @@ # specify __all__ as we import using a wildcard in util/__init__.py so the function names have an 'android' prefix # to make it clear where they came from # e.g. usage is `util.android.start_emulator(...)` instead of `util.start_emulator(...)` -__all__ = ["get_sdk_tool_paths", "create_virtual_device", "start_emulator", "stop_emulator"] +__all__ = ["create_virtual_device", "get_sdk_tool_paths", "start_emulator", "stop_emulator"] _log = get_logger("util.android") @@ -111,7 +108,7 @@ def _stop_process_with_pid(pid: int): def start_emulator( - sdk_tool_paths: SdkToolPaths, avd_name: str, extra_args: typing.Optional[typing.Sequence[str]] = None + sdk_tool_paths: SdkToolPaths, avd_name: str, extra_args: typing.Sequence[str] | None = None ) -> subprocess.Popen: with contextlib.ExitStack() as emulator_stack, contextlib.ExitStack() as waiter_stack: emulator_args = [ @@ -214,7 +211,7 @@ def start_emulator( return emulator_process -def stop_emulator(emulator_proc_or_pid: typing.Union[subprocess.Popen, int]): +def stop_emulator(emulator_proc_or_pid: subprocess.Popen | int): if isinstance(emulator_proc_or_pid, subprocess.Popen): _stop_process(emulator_proc_or_pid) 
elif isinstance(emulator_proc_or_pid, int): diff --git a/tools/python/util/dependency_resolver.py b/tools/python/util/dependency_resolver.py index cbc64d964e..4223c2710d 100644 --- a/tools/python/util/dependency_resolver.py +++ b/tools/python/util/dependency_resolver.py @@ -17,9 +17,7 @@ _log = get_logger("util.dependency_resolver") -def _download_ort( - use_cuda: bool, use_rocm: bool, use_dml: bool, destination_dir: PathLike -): +def _download_ort(use_cuda: bool, use_rocm: bool, use_dml: bool, destination_dir: PathLike): def _lib_path(): plat = "linux" if is_linux() else "win" if is_windows() else "osx" mach = None @@ -28,9 +26,7 @@ def _lib_path(): elif platform.machine().lower() == "aarch64" or platform.machine().lower() == "arm64": mach = "arm64" else: - raise NotImplementedError( - f"Unsupported machine architecture: {platform.machine()}" - ) + raise NotImplementedError(f"Unsupported machine architecture: {platform.machine()}") return destination_dir / "ort" / "runtimes" / (plat + "-" + mach) / "native" @@ -60,9 +56,7 @@ def _lib_path(): feed_name = "ORT-Nightly" version_fetch_url = f"https://feeds.dev.azure.com/{organization}/PublicPackages/_apis/packaging/Feeds/{feed_name}/packages?packageNameQuery={package_name}&api-version=6.0-preview.1" - version = requests.get(version_fetch_url).json()["value"][0]["versions"][0][ - "normalizedVersion" - ] + version = requests.get(version_fetch_url).json()["value"][0]["versions"][0]["normalizedVersion"] feed_project = "2692857e-05ef-43b4-ba9c-ccf1c22c437c" feed_id = "7982ae20-ed19-4a35-a362-a96ac99897b7" package_url = f"https://pkgs.dev.azure.com/{organization}/{feed_project}/_apis/packaging/feeds/{feed_id}/nuget/packages/{package_name}/versions/{version}/content?api-version=6.0-preview.1" @@ -85,17 +79,13 @@ def _lib_path(): elif platform.machine().lower() == "aarch64" or platform.machine().lower() == "arm64": mach = "arm64" else: - raise NotImplementedError( - f"Unsupported machine architecture: {platform.machine()}" - 
) + raise NotImplementedError(f"Unsupported machine architecture: {platform.machine()}") return destination_dir / "dml" / "bin" / (mach + "-win") / "DirectML.dll" dml_version = "1.15.2" dml_package_name = "Microsoft.AI.DirectML" - dml_package_url = ( - f"https://www.nuget.org/api/v2/package/{dml_package_name}/{dml_version}" - ) + dml_package_url = f"https://www.nuget.org/api/v2/package/{dml_package_name}/{dml_version}" package_path = destination_dir / f"{dml_package_name}.zip" if package_path.exists(): _log.info(f"Package {dml_package_name} already downloaded") @@ -119,25 +109,13 @@ def _lib_path(): elif platform.machine().lower() == "aarch64" or platform.machine().lower() == "arm64": mach = "arm64" else: - raise NotImplementedError( - f"Unsupported machine architecture: {platform.machine()}" - ) - - return ( - destination_dir - / "d3d12" - / "build" - / "native" - / "bin" - / mach - / "D3D12Core.dll" - ) + raise NotImplementedError(f"Unsupported machine architecture: {platform.machine()}") + + return destination_dir / "d3d12" / "build" / "native" / "bin" / mach / "D3D12Core.dll" d3d12_version = "1.614.1" d3d12_package_name = "Microsoft.Direct3D.D3D12" - d3d12_package_url = ( - f"https://www.nuget.org/api/v2/package/{d3d12_package_name}/{d3d12_version}" - ) + d3d12_package_url = f"https://www.nuget.org/api/v2/package/{d3d12_package_name}/{d3d12_version}" package_path = destination_dir / f"{d3d12_package_name}.zip" if package_path.exists(): _log.info(f"Package {d3d12_package_name} already downloaded") @@ -153,9 +131,7 @@ def _lib_path(): return _lib_path() -def download_dependencies( - use_cuda: bool, use_rocm: bool, use_dml: bool, destination_dir: PathLike -): +def download_dependencies(use_cuda: bool, use_rocm: bool, use_dml: bool, destination_dir: PathLike): dependencies_dir = destination_dir / "dependencies" if not dependencies_dir.exists(): dependencies_dir.mkdir(parents=True) diff --git a/tools/python/util/platform_helpers.py 
b/tools/python/util/platform_helpers.py index 7a6a2b0bc3..0bee1796fd 100644 --- a/tools/python/util/platform_helpers.py +++ b/tools/python/util/platform_helpers.py @@ -22,4 +22,4 @@ def is_aix(): def is_windows_arm(): - return is_windows() and "arm" in platform.machine().lower() \ No newline at end of file + return is_windows() and "arm" in platform.machine().lower() diff --git a/tools/python/util/run.py b/tools/python/util/run.py index 87c6b8cf37..869db3eab5 100644 --- a/tools/python/util/run.py +++ b/tools/python/util/run.py @@ -7,6 +7,7 @@ import subprocess from os import PathLike from pathlib import Path + from .logger import get_logger _log = get_logger("util.run") From a9960eb0c4f86b788e94588a4f883f201a55c4ee Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Wed, 19 Nov 2025 20:07:16 +0000 Subject: [PATCH 3/4] add .clang-tidy --- .clang-tidy | 60 +++ benchmark/python/benchmark_e2e.py | 237 +++++++---- .../python/benchmark_e2e_continuous_test.py | 37 +- benchmark/python/benchmark_multimodal.py | 5 +- build.py | 134 +++--- cgmanifests/generate_cgmanifest.py | 4 +- examples/chat_app/app.py | 36 +- .../interface/hddr_llm_onnx_interface.py | 41 +- examples/python/engine/model-qa.py | 6 +- examples/python/model-generate.py | 152 +++++-- examples/python/model-qa.py | 182 ++++++--- examples/python/phi4-mm.py | 60 ++- examples/python/whisper.py | 28 +- .../slm_engine/build_scripts/build_deps.py | 60 +-- examples/slm_engine/test/chat_ui.py | 4 +- examples/slm_engine/test/test_slm_server.py | 5 +- examples/slm_engine/test/test_tool_calling.py | 109 ++--- src/python/py/models/builders/__init__.py | 2 +- src/python/py/models/builders/chatglm.py | 8 +- src/python/py/models/builders/gemma.py | 59 ++- src/python/py/models/builders/gptoss.py | 380 +++++++++++++++--- src/python/py/models/builders/granite.py | 40 +- src/python/py/models/builders/phi.py | 195 +++++++-- src/python/py/models/gguf_model.py | 81 +++- test/python/test_onnxruntime_genai.py | 11 +- 
test/python/test_onnxruntime_genai_api.py | 8 +- test/python/test_onnxruntime_genai_e2e.py | 6 +- test/test_models/create_dummy_model.py | 18 +- tools/ci_build/get_docker_image.py | 18 +- .../github/android/build_aar_package.py | 21 +- .../apple/build_and_assemble_apple_pods.py | 18 +- .../github/apple/package_assembly_utils.py | 4 +- .../nuget/generate_nuspec_for_native_nuget.py | 88 ++-- .../model_validation/perplexity_metrics.py | 9 +- .../model_validation/validation_tool.py | 55 +-- tools/python/util/android.py | 4 +- 36 files changed, 1498 insertions(+), 687 deletions(-) create mode 100644 .clang-tidy diff --git a/.clang-tidy b/.clang-tidy new file mode 100644 index 0000000000..5b3bc48031 --- /dev/null +++ b/.clang-tidy @@ -0,0 +1,60 @@ +--- + +# NOTE: +# The check is a multiline string here. Comment must not be moved into the string. +# Be sure to keep the disabled rules alphabetically sorted. +# +# Checks that are turned off: +# +# -cppcoreguidelines-macro-usage: There are a lot of false-positives like Function-like macro 'Foo' used; consider a 'constexpr' template function +# -cppcoreguidelines-pro-type-reinterpret-cast: Originally turned off. +# -google-readability-todo: Not enforced. +# -google-runtime-references: https://github.com/microsoft/onnxruntime/blob/main/docs/Coding_Conventions_and_Standards.md#c-code-style. +# -modernize-concat-nested-namespaces: We don't use it. +# -modernize-use-trailing-return-type: Stylistic preference we do not enforce. +# -readability-identifier-length: A lot of numerical code rely on short names to improve readability. 
+# -readability-uppercase-literal-suffix: We accept lowercase suffixes + +Checks: > + -*, + cppcoreguidelines-*, + google-*, + readability-*, + modernize-*, + bugprone-*, + performance-*, + misc-*, + -cppcoreguidelines-macro-usage, + -cppcoreguidelines-pro-type-reinterpret-cast, + -google-readability-todo, + -google-runtime-references, + -modernize-concat-nested-namespaces, + -modernize-use-trailing-return-type, + -readability-identifier-length, + -readability-uppercase-literal-suffix, +WarningsAsErrors: "" +# HeaderFilterRegex: '.*onnxruntime\/core\/.*' +AnalyzeTemporaryDtors: false +FormatStyle: none +CheckOptions: + - key: google-readability-braces-around-statements.ShortStatementLines + value: "1" + - key: google-readability-function-size.StatementThreshold + value: "800" + - key: google-readability-namespace-comments.ShortNamespaceLines + value: "10" + - key: google-readability-namespace-comments.SpacesBeforeComments + value: "2" + - key: modernize-loop-convert.MaxCopySize + value: "16" + - key: modernize-loop-convert.MinConfidence + value: reasonable + - key: modernize-loop-convert.NamingStyle + value: CamelCase + - key: modernize-pass-by-value.IncludeStyle + value: google + - key: modernize-replace-auto-ptr.IncludeStyle + value: google + - key: modernize-use-nullptr.NullMacros + value: "NULL" +--- diff --git a/benchmark/python/benchmark_e2e.py b/benchmark/python/benchmark_e2e.py index 9b5962cac1..850a61a9f6 100644 --- a/benchmark/python/benchmark_e2e.py +++ b/benchmark/python/benchmark_e2e.py @@ -37,12 +37,18 @@ except Exception: IS_NVIDIA_SYSTEM = False + # Monitor the GPU memory usage def monitor_gpu_memory(): global peak_gpu_memory while not stop_monitoring: - result = subprocess.run(['nvidia-smi', '--query-gpu=memory.used', '--format=csv,noheader,nounits'], check=False, capture_output=True, text=True) + result = subprocess.run( + ["nvidia-smi", "--query-gpu=memory.used", "--format=csv,noheader,nounits"], + check=False, + capture_output=True, + text=True, 
+ ) memory_usage = result.stdout.splitlines() @@ -66,21 +72,23 @@ def monitor_cpu_memory(): peak_cpu_memory = max(peak_cpu_memory, current_used_memory) time.sleep(0.1) + # Use input model to generate prompt def generate_prompt(model, tokenizer, prompt_length) -> str: text = "a" - prompt = f'{args.chat_template.format(input=text)}' + prompt = f"{args.chat_template.format(input=text)}" tokens = tokenizer.encode(prompt) - params=og.GeneratorParams(model) + params = og.GeneratorParams(model) max_length_to_use = prompt_length + len(tokens) params.set_search_options(max_length=max_length_to_use, min_length=prompt_length) - generator=og.Generator(model, params) + generator = og.Generator(model, params) generator.append_tokens(tokens) while not generator.is_done(): generator.generate_next_token() return tokenizer.decode(generator.get_sequence(0)) + # Use prompt length to get pre-defined prompt def get_prompt_by_length(prompt_length): json_path = "prompts.json" @@ -88,6 +96,7 @@ def get_prompt_by_length(prompt_length): data = json.load(file) return data[f"{prompt_length}"] + def get_target_pip_package_version(target_pip_package_name_list): # get package name and version import importlib.metadata @@ -107,24 +116,25 @@ def get_target_pip_package_version(target_pip_package_name_list): pkg_version = installed_packages_list[0].split("==")[1] return pkg_name, pkg_version + def save_results(args, results, filename, print_memory_usage=False): import pandas as pd - columns=[ - "Batch Size", - "Prompt Length", - "Tokens Generated", - "Max Length", - "Tokenization Throughput (tps)", - "Tokenization Latency (ms)", - "Prompt Processing Throughput (tps)", - "Prompt Processing Latency (ms)", - "Token Generation Throughput (tps)", - "Token Generation Latency (ms)", - "Sampling Throughput (tps)", - "Sampling Latency (ms)", - "Wall Clock Throughput (tps)", - "Wall Clock Time (s)", + columns = [ + "Batch Size", + "Prompt Length", + "Tokens Generated", + "Max Length", + "Tokenization 
Throughput (tps)", + "Tokenization Latency (ms)", + "Prompt Processing Throughput (tps)", + "Prompt Processing Latency (ms)", + "Token Generation Throughput (tps)", + "Token Generation Latency (ms)", + "Sampling Throughput (tps)", + "Sampling Latency (ms)", + "Wall Clock Throughput (tps)", + "Wall Clock Time (s)", ] if print_memory_usage: @@ -139,11 +149,20 @@ def save_results(args, results, filename, print_memory_usage=False): ) # df = df.transpose() # This line swaps the rows and columns - genai_package_name, genai_package_version = get_target_pip_package_version(["onnxruntime-genai", "onnxruntime-genai-cuda", "onnxruntime-genai-directml"]) + genai_package_name, genai_package_version = get_target_pip_package_version( + ["onnxruntime-genai", "onnxruntime-genai-cuda", "onnxruntime-genai-directml"] + ) records = [] for _, row in df.iterrows(): - record = BenchmarkRecord(args.model_name, args.precision, "onnxruntime-genai", args.execution_provider, genai_package_name, genai_package_version ) + record = BenchmarkRecord( + args.model_name, + args.precision, + "onnxruntime-genai", + args.execution_provider, + genai_package_name, + genai_package_version, + ) record.config.batch_size = row["Batch Size"] record.config.customized["prompt_length"] = row["Prompt Length"] record.config.customized["tokens_generated"] = row["Tokens Generated"] @@ -171,6 +190,7 @@ def save_results(args, results, filename, print_memory_usage=False): BenchmarkRecord.save_as_json(filename.replace(".csv", ".json"), records) print(f"Results saved in {filename}!") + def run_benchmark_memory(args, batch_size, prompt_length, generation_length, max_length): """ This function is to run benchmark and print the memory usage @@ -203,24 +223,28 @@ def run_benchmark_memory(args, batch_size, prompt_length, generation_length, max return metrics -def run_benchmark(args, batch_size, prompt_length, generation_length, max_length): +def run_benchmark(args, batch_size, prompt_length, generation_length, max_length): # 
Get user arguments num_repetitions = args.repetitions temperature = 1.0 # Get tokenizer, and model - if args.verbose: print("Getting config") - config = og.Config(f'{args.input_folder}') + if args.verbose: + print("Getting config") + config = og.Config(f"{args.input_folder}") config.overlay(f'{{"search": {{"batch_size": {batch_size}}}}}') if args.execution_provider != "follow_config": config.clear_providers() if args.execution_provider != "cpu": - if args.verbose: print(f"Setting model to {args.execution_provider}") + if args.verbose: + print(f"Setting model to {args.execution_provider}") config.append_provider(args.execution_provider) - if args.verbose: print("Loading model... ") + if args.verbose: + print("Loading model... ") model = og.Model(config) - if args.verbose: print("Model loaded") + if args.verbose: + print("Model loaded") tokenizer = og.Tokenizer(model) # Get model type @@ -234,24 +258,28 @@ def run_benchmark(args, batch_size, prompt_length, generation_length, max_length # Set chat template if args.chat_template: - if args.chat_template.count('{') != 1 or args.chat_template.count('}') != 1: - raise ValueError("Chat template must have exactly one pair of curly braces with input word in it, e.g. '<|user|>\n{input} <|end|>\n<|assistant|>'") + if args.chat_template.count("{") != 1 or args.chat_template.count("}") != 1: + raise ValueError( + "Chat template must have exactly one pair of curly braces with input word in it, e.g. 
'<|user|>\n{input} <|end|>\n<|assistant|>'" + ) else: if model_type.startswith("phi2") or model_type.startswith("phi3"): - args.chat_template = '<|user|>\n{input} <|end|>\n<|assistant|>' + args.chat_template = "<|user|>\n{input} <|end|>\n<|assistant|>" elif model_type.startswith("phi4"): - args.chat_template = '<|im_start|>user<|im_sep|>\n{input}<|im_end|>\n<|im_start|>assistant<|im_sep|>' + args.chat_template = "<|im_start|>user<|im_sep|>\n{input}<|im_end|>\n<|im_start|>assistant<|im_sep|>" elif model_type.startswith("llama"): - args.chat_template = '<|start_header_id|>user<|end_header_id|>\n{input}<|eot_id|><|start_header_id|>assistant<|end_header_id|>' + args.chat_template = "<|start_header_id|>user<|end_header_id|>\n{input}<|eot_id|><|start_header_id|>assistant<|end_header_id|>" elif model_type.startswith("llama2"): - args.chat_template = '{input}' + args.chat_template = "{input}" elif model_type.startswith("qwen2"): - args.chat_template = '<|im_start|>user\n{input}<|im_end|>\n<|im_start|>assistant\n' + args.chat_template = "<|im_start|>user\n{input}<|im_end|>\n<|im_start|>assistant\n" elif model_type.startswith("gemma"): # Gemma and Gemma2 models use this format - args.chat_template = 'user\n{input}\nmodel\n' + args.chat_template = "user\n{input}\nmodel\n" else: - raise ValueError(f"Chat Template for model type {model_type} is not known. Please provide chat template using --chat_template") + raise ValueError( + f"Chat Template for model type {model_type} is not known. 
Please provide chat template using --chat_template" + ) # Generate prompt if args.use_random_tokens: @@ -259,30 +287,40 @@ def run_benchmark(args, batch_size, prompt_length, generation_length, max_length _random_tokens = np.random.randint(100, size=(batch_size, prompt_length)) tokens = _random_tokens text = [tokenizer.decode(tokens[0])] * batch_size - prompt = f'{args.chat_template.format(input=text)}' - prompt_length = batch_size*prompt_length + prompt = f"{args.chat_template.format(input=text)}" + prompt_length = batch_size * prompt_length elif args.use_prompt_set: text = [get_prompt_by_length(prompt_length)] * batch_size - prompt = f'{args.chat_template.format(input=text)}' + prompt = f"{args.chat_template.format(input=text)}" tokens = tokenizer.encode(prompt) else: text = [generate_prompt(model, tokenizer, prompt_length)] * batch_size - prompt = f'{args.chat_template.format(input=text)}' + prompt = f"{args.chat_template.format(input=text)}" tokens = tokenizer.encode(prompt) prompt_length = len(tokens) max_length = prompt_length + generation_length params = og.GeneratorParams(model) do_sample = args.top_k > 1 or (args.top_p != 1.0 and args.top_p > 0.0) - params.set_search_options(do_sample=do_sample, top_k=args.top_k, top_p=args.top_p, temperature=temperature, max_length=max_length, min_length=max_length, batch_size=batch_size) + params.set_search_options( + do_sample=do_sample, + top_k=args.top_k, + top_p=args.top_p, + temperature=temperature, + max_length=max_length, + min_length=max_length, + batch_size=batch_size, + ) - if args.verbose: print("Running warmup runs...") + if args.verbose: + print("Running warmup runs...") for _ in tqdm(range(args.warmup)): generator = og.Generator(model, params) generator.append_tokens(tokens) while not generator.is_done(): generator.generate_next_token() - if args.print_model_output: print(tokenizer.decode(generator.get_sequence(0))) + if args.print_model_output: + print(tokenizer.decode(generator.get_sequence(0))) # Delete 
the generator to free the captured graph for the next generator, if graph capture is enabled del generator @@ -291,7 +329,8 @@ def run_benchmark(args, batch_size, prompt_length, generation_length, max_length token_gen_times = [] sampling_times = [] wall_clock_times = [] - if args.verbose: print(f"Running benchmark for batch size = {batch_size}, prompt length = {prompt_length}") + if args.verbose: + print(f"Running benchmark for batch size = {batch_size}, prompt length = {prompt_length}") for _ in tqdm(range(num_repetitions)): wall_clock_start_time = time.time() @@ -306,7 +345,15 @@ def run_benchmark(args, batch_size, prompt_length, generation_length, max_length # Prepare run params = og.GeneratorParams(model) - params.set_search_options(do_sample=do_sample, top_k=args.top_k, top_p=args.top_p, temperature=temperature, max_length=max_length, min_length=max_length, batch_size=batch_size) + params.set_search_options( + do_sample=do_sample, + top_k=args.top_k, + top_p=args.top_p, + temperature=temperature, + max_length=max_length, + min_length=max_length, + batch_size=batch_size, + ) generator = og.Generator(model, params) @@ -335,7 +382,8 @@ def run_benchmark(args, batch_size, prompt_length, generation_length, max_length wall_clock_end_time = time.time() wall_clock_times.append(wall_clock_end_time - wall_clock_start_time) - if args.print_model_output: print(tokenizer.decode(generator.get_sequence(0))) + if args.print_model_output: + print(tokenizer.decode(generator.get_sequence(0))) # Delete the generator to free the captured graph for the next generator, if graph capture is enabled del generator @@ -412,14 +460,17 @@ def main(args): max_length = args.max_lengths[0] if len(args.max_lengths) == 1 else args.max_lengths[m] else: max_length = prompt_length + gen_length - print(f"\nArgs: batch_size = {batch_size}, prompt_length = {prompt_length}, tokens = {gen_length}, max_length = {max_length}") + print( + f"\nArgs: batch_size = {batch_size}, prompt_length = 
{prompt_length}, tokens = {gen_length}, max_length = {max_length}" + ) if args.print_memory_usage: metrics = run_benchmark_memory(args, batch_size, prompt_length, gen_length, max_length) else: metrics = run_benchmark(args, batch_size, prompt_length, gen_length, max_length) all_csv_metrics.append(metrics) # Add metrics to CSV - if args.verbose: print("Adding results to CSV") + if args.verbose: + print("Adding results to CSV") filename = args.output if args.print_memory_usage: @@ -427,36 +478,80 @@ def main(args): else: save_results(args, all_csv_metrics, filename) + def str2intlist(value): - return [int(v) for v in value.split(',')] + return [int(v) for v in value.split(",")] + def str2strlist(value): - return [str(v) for v in value.split(',')] + return [str(v) for v in value.split(",")] + if __name__ == "__main__": parser = argparse.ArgumentParser(description="End-to-end benchmarking for gen-ai") - parser.add_argument('-i', '--input_folder', type=str, required=True, help='Onnx model folder path (must contain genai_config.json and model.onnx)') - parser.add_argument('-b', '--batch_sizes', type=str2intlist, default=[1], help='Number of sequences to generate in parallel') - parser.add_argument('-l', '--prompt_lengths', type=str2intlist, default=[16], help='Number of tokens for prompt') - parser.add_argument('-g', '--generation_lengths', type=str2intlist, default=[256], help='Number of tokens to generate after prompt') - parser.add_argument('-m', '--max_lengths', type=str2intlist, default=[], help='Max length is either a combination of prompt and generation length or one value broadcasting for all.') - parser.add_argument('-r', '--repetitions', type=int, default=10, help='Number of times to repeat the benchmark') - parser.add_argument('-w', '--warmup', type=int, default=5, help='Number of warmup runs before benchmarking') - parser.add_argument('-k', '--top_k', type=int, default=50, help='Top k tokens to sample from') - parser.add_argument('-p', '--top_p', type=float, 
default=1.0, help='Top p probability to sample with') - parser.add_argument('-o', '--output', type=str, default='genai_e2e', help='Output CSV file name or path (with .csv extension)') - parser.add_argument('-v', '--verbose', action='store_true', help='Print extra information') - parser.add_argument('-mo', '--print_model_output', action='store_true', help='Print model output') - parser.add_argument('-pm', '--print_memory_usage', default=False, help='Print memory footprint') - parser.add_argument('-mn', '--model_name', type=str, default='model_name', help='Model name defined by users') - parser.add_argument('-pr', '--precision', type=str, default='fp16', help='Model precision for metrics info') - parser.add_argument('--use_random_tokens', action='store_true', help='Use random tokens instead of generating a prompt') - parser.add_argument('--use_prompt_set', action='store_true', help='Use pre-generated prompt set instead of generating a prompt') - parser.add_argument('--chat_template', type=str, default='', help='Chat template to use for the prompt. User input will be injected into {input}') - parser.add_argument('-e', '--execution_provider', type=str, required=False, default='follow_config', choices=["cpu", "cuda", "dml", "follow_config"], help="Execution provider to run the ONNX Runtime session with. 
Defaults to follow_config that uses the execution provider listed in the genai_config.json instead.") + parser.add_argument( + "-i", + "--input_folder", + type=str, + required=True, + help="Onnx model folder path (must contain genai_config.json and model.onnx)", + ) + parser.add_argument( + "-b", "--batch_sizes", type=str2intlist, default=[1], help="Number of sequences to generate in parallel" + ) + parser.add_argument("-l", "--prompt_lengths", type=str2intlist, default=[16], help="Number of tokens for prompt") + parser.add_argument( + "-g", "--generation_lengths", type=str2intlist, default=[256], help="Number of tokens to generate after prompt" + ) + parser.add_argument( + "-m", + "--max_lengths", + type=str2intlist, + default=[], + help="Max length is either a combination of prompt and generation length or one value broadcasting for all.", + ) + parser.add_argument("-r", "--repetitions", type=int, default=10, help="Number of times to repeat the benchmark") + parser.add_argument("-w", "--warmup", type=int, default=5, help="Number of warmup runs before benchmarking") + parser.add_argument("-k", "--top_k", type=int, default=50, help="Top k tokens to sample from") + parser.add_argument("-p", "--top_p", type=float, default=1.0, help="Top p probability to sample with") + parser.add_argument( + "-o", "--output", type=str, default="genai_e2e", help="Output CSV file name or path (with .csv extension)" + ) + parser.add_argument("-v", "--verbose", action="store_true", help="Print extra information") + parser.add_argument("-mo", "--print_model_output", action="store_true", help="Print model output") + parser.add_argument("-pm", "--print_memory_usage", default=False, help="Print memory footprint") + parser.add_argument("-mn", "--model_name", type=str, default="model_name", help="Model name defined by users") + parser.add_argument("-pr", "--precision", type=str, default="fp16", help="Model precision for metrics info") + parser.add_argument( + "--use_random_tokens", 
action="store_true", help="Use random tokens instead of generating a prompt" + ) + parser.add_argument( + "--use_prompt_set", action="store_true", help="Use pre-generated prompt set instead of generating a prompt" + ) + parser.add_argument( + "--chat_template", + type=str, + default="", + help="Chat template to use for the prompt. User input will be injected into {input}", + ) + parser.add_argument( + "-e", + "--execution_provider", + type=str, + required=False, + default="follow_config", + choices=["cpu", "cuda", "dml", "follow_config"], + help="Execution provider to run the ONNX Runtime session with. Defaults to follow_config that uses the execution provider listed in the genai_config.json instead.", + ) args = parser.parse_args() # check max_lengths - is_max_lengths_valid = not args.max_lengths or len(args.max_lengths) == 1 or len(args.max_lengths) == len(args.prompt_lengths) * len(args.generation_lengths) - assert is_max_lengths_valid, "len(args.max_lengths) is either a combination of args.prompt_lengths and args.generation_lengths or 1 that broadcasts for all" + is_max_lengths_valid = ( + not args.max_lengths + or len(args.max_lengths) == 1 + or len(args.max_lengths) == len(args.prompt_lengths) * len(args.generation_lengths) + ) + assert is_max_lengths_valid, ( + "len(args.max_lengths) is either a combination of args.prompt_lengths and args.generation_lengths or 1 that broadcasts for all" + ) main(args) diff --git a/benchmark/python/benchmark_e2e_continuous_test.py b/benchmark/python/benchmark_e2e_continuous_test.py index 065815b6f9..d3f0834c1c 100644 --- a/benchmark/python/benchmark_e2e_continuous_test.py +++ b/benchmark/python/benchmark_e2e_continuous_test.py @@ -26,11 +26,11 @@ def main(args): temperature = 1.0 # Get tokenizer, and model - model=og.Model(f'{args.input_folder}') + model = og.Model(f"{args.input_folder}") tokenizer = og.Tokenizer(model) # Generate prompt - sys_prompt = "<|system|>You are a world class AI programming assistant who excels in 
software development.\r\nWhen asked your name, you must respond with \"GitHub Copilot\".\r\nFollow the user's requirements carefully & to the letter.\r\nThe user is a proficient software developer working in Visual Studio 2022.\r\nWhile the user may have experience in software development, you should not elude to their background, i.e. prefer general greetings like \"Hello! How can I assist you today?\" This approach respects the user's expertise without immediately categorizing their profession.\r\nFor questions not related to software development, give a reminder that you are an AI programming assistant.\r\nFollow Microsoft content policies and avoid content that violates copyrights.\r\nRespond in the following locale: en-US\r\n\r\nRespond in Markdown, for multi-line code, use language-specific markdown code fences.\r\nEnsure your response is short, impersonal, expertly written and easy to understand.\r\nBefore responding take a deep breath and then work on the user's problem step-by-step.\r\nFocus on being clear, helpful, and thorough without assuming extensive prior knowledge.\r\n\r\nGenerated code should adhere to the existing coding style in the provided context.\r\nWhen generating code prefer languages provided in context. If the coding language is unclear fallback to generating code in C#.\r\nGenerate code that can be copy & pasted without modification, i.e. preserve surrounding user code, avoid placeholder comments like \"existing code here...\" etc. 
\r\nAfter generating mutated code consider mentioning what specifically was changed and your reasoning if it would help the user.\r\n\r\nThe active document or selection is the source code the user is looking at right now and is what they care about.<|end|><|user|>What is 1+1?<|end|><|assistant|>" + sys_prompt = '<|system|>You are a world class AI programming assistant who excels in software development.\r\nWhen asked your name, you must respond with "GitHub Copilot".\r\nFollow the user\'s requirements carefully & to the letter.\r\nThe user is a proficient software developer working in Visual Studio 2022.\r\nWhile the user may have experience in software development, you should not elude to their background, i.e. prefer general greetings like "Hello! How can I assist you today?" This approach respects the user\'s expertise without immediately categorizing their profession.\r\nFor questions not related to software development, give a reminder that you are an AI programming assistant.\r\nFollow Microsoft content policies and avoid content that violates copyrights.\r\nRespond in the following locale: en-US\r\n\r\nRespond in Markdown, for multi-line code, use language-specific markdown code fences.\r\nEnsure your response is short, impersonal, expertly written and easy to understand.\r\nBefore responding take a deep breath and then work on the user\'s problem step-by-step.\r\nFocus on being clear, helpful, and thorough without assuming extensive prior knowledge.\r\n\r\nGenerated code should adhere to the existing coding style in the provided context.\r\nWhen generating code prefer languages provided in context. If the coding language is unclear fallback to generating code in C#.\r\nGenerate code that can be copy & pasted without modification, i.e. preserve surrounding user code, avoid placeholder comments like "existing code here..." etc. 
\r\nAfter generating mutated code consider mentioning what specifically was changed and your reasoning if it would help the user.\r\n\r\nThe active document or selection is the source code the user is looking at right now and is what they care about.<|end|><|user|>What is 1+1?<|end|><|assistant|>' user_prompt = "<|user|>What are the first 7 numbers in the fibonacci sequence?<|end|>" sys_tokens = tokenizer.encode(sys_prompt) user_tokens = tokenizer.encode(user_prompt) @@ -41,7 +41,8 @@ def main(args): params = og.GeneratorParams(model) params.set_search_options(do_sample=False, temperature=temperature) - if args.max_length > 0: params.set_search_options(max_length=args.max_length) + if args.max_length > 0: + params.set_search_options(max_length=args.max_length) print("Warming up...") for _ in tqdm(range(args.warmup)): @@ -60,7 +61,8 @@ def main(args): # Prepare run params = og.GeneratorParams(model) params.set_search_options(do_sample=False, temperature=temperature) - if args.max_length > 0: params.set_search_options(max_length=args.max_length) + if args.max_length > 0: + params.set_search_options(max_length=args.max_length) generator = og.Generator(model, params) @@ -85,7 +87,8 @@ def main(args): # Prepare run params = og.GeneratorParams(model) params.set_search_options(do_sample=False, temperature=temperature) - if args.max_length > 0: params.set_search_options(max_length=args.max_length) + if args.max_length > 0: + params.set_search_options(max_length=args.max_length) generator = og.Generator(model, params) @@ -102,7 +105,8 @@ def main(args): print(f"Prompt Length: {sys_length} tokens") print(f"User Prompt Length: {user_length} tokens") print(f"System + User Prompt Length: {sys_user_length} tokens") - if args.max_length > 0: print(f"Max Generation Length: {args.max_length} tokens") + if args.max_length > 0: + print(f"Max Generation Length: {args.max_length} tokens") print(f"Repetitions: {num_repetitions}") print(f"Warmup Runs: {args.warmup}") print() @@ -119,11 
+123,24 @@ def main(args): avg_sys_user_latency_ms = avg_sys_user_latency_s * 1000 print(f"Average (System + User) Prompt Processing Latency: {avg_sys_user_latency_ms} ms") + if __name__ == "__main__": parser = argparse.ArgumentParser(description="End-to-end benchmarking for gen-ai") - parser.add_argument('-i', '--input_folder', type=str, required=True, help='Onnx model folder path (must contain genai_config.json and model.onnx)') - parser.add_argument('-m', '--max_length', type=int, default=-1, help='Max length is either a combination of prompt and generation length or one value broadcasting for all.') - parser.add_argument('-r', '--repetitions', type=int, default=10, help='Number of times to repeat the benchmark') - parser.add_argument('-w', '--warmup', type=int, default=5, help='Number of warmup runs before benchmarking') + parser.add_argument( + "-i", + "--input_folder", + type=str, + required=True, + help="Onnx model folder path (must contain genai_config.json and model.onnx)", + ) + parser.add_argument( + "-m", + "--max_length", + type=int, + default=-1, + help="Max length is either a combination of prompt and generation length or one value broadcasting for all.", + ) + parser.add_argument("-r", "--repetitions", type=int, default=10, help="Number of times to repeat the benchmark") + parser.add_argument("-w", "--warmup", type=int, default=5, help="Number of warmup runs before benchmarking") args = parser.parse_args() main(args) diff --git a/benchmark/python/benchmark_multimodal.py b/benchmark/python/benchmark_multimodal.py index 5a39695e4b..717e8276ed 100644 --- a/benchmark/python/benchmark_multimodal.py +++ b/benchmark/python/benchmark_multimodal.py @@ -43,7 +43,10 @@ def monitor_gpu_memory(): while not stop_monitoring: result = subprocess.run( - ["nvidia-smi", "--query-gpu=memory.used", "--format=csv,noheader,nounits"], check=False, capture_output=True, text=True + ["nvidia-smi", "--query-gpu=memory.used", "--format=csv,noheader,nounits"], + check=False, + 
capture_output=True, + text=True, ) memory_usage = result.stdout.splitlines() diff --git a/build.py b/build.py index 52d5e92cb3..c3b6e915ca 100644 --- a/build.py +++ b/build.py @@ -60,7 +60,7 @@ class HelpFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescript type=Path, # We set the default programmatically as it needs to take into account whether we're cross-compiling help="Path to the build directory. Defaults to 'build/'. " - "The build configuration will be a subdirectory of the build directory. e.g. build/Linux/Debug", + "The build configuration will be a subdirectory of the build directory. e.g. build/Linux/Debug", ) parser.add_argument( @@ -68,13 +68,14 @@ class HelpFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescript default="RelWithDebInfo", type=str, choices=["Debug", "MinSizeRel", "Release", "RelWithDebInfo"], - help="Configuration to build.") + help="Configuration to build.", + ) # Build phases. parser.add_argument("--update", action="store_true", help="Update makefiles.") parser.add_argument("--build", action="store_true", help="Build.") parser.add_argument("--test", action="store_true", help="Run tests.") - parser.add_argument("--package", action="store_true", help="Package the build.") # Does not override other phases. + parser.add_argument("--package", action="store_true", help="Package the build.") # Does not override other phases. parser.add_argument( "--clean", action="store_true", help="Run 'cmake --build --target clean' for the selected config." 
) @@ -85,7 +86,11 @@ class HelpFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescript # Default to not building the language bindings parser.add_argument("--build_csharp", action="store_true", help="Build the C# API.") parser.add_argument("--build_java", action="store_true", help="Build Java bindings.") - parser.add_argument("--publish_java_maven_local", action="store_true", help="Publish Java bindings to local Maven repository after tests.") + parser.add_argument( + "--publish_java_maven_local", + action="store_true", + help="Publish Java bindings to local Maven repository after tests.", + ) parser.add_argument("--parallel", action="store_true", help="Enable parallel build.") @@ -121,16 +126,20 @@ class HelpFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescript "--cuda_home", type=Path, help="Path to CUDA home. Read from CUDA_HOME or CUDA_PATH environment variable if not specified." - "Used when --use_cuda is specified.", + "Used when --use_cuda is specified.", ) - parser.add_argument("--use_trt_rtx", action="store_true", help="Whether to use TensorRT-RTX. Default is to not use TensorRT-RTX.") + parser.add_argument( + "--use_trt_rtx", action="store_true", help="Whether to use TensorRT-RTX. Default is to not use TensorRT-RTX." + ) parser.add_argument("--use_rocm", action="store_true", help="Whether to use ROCm. Default is to not use rocm.") parser.add_argument("--use_dml", action="store_true", help="Whether to use DML. Default is to not use DML.") - parser.add_argument("--use_guidance", action="store_true", help="Whether to add guidance support. Default is False.") + parser.add_argument( + "--use_guidance", action="store_true", help="Whether to add guidance support. Default is False." + ) # The following options are mutually exclusive (cross compiling options such as android, ios, etc.) 
platform_group = parser.add_mutually_exclusive_group() @@ -149,8 +158,9 @@ class HelpFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescript choices=["armeabi-v7a", "arm64-v8a", "x86", "x86_64"], help="Specify the target Android Application Binary Interface (ABI)", ) - parser.add_argument("--android_api", type=int, default=27, - help="Android API Level. Default is 27 (Android 8.1, released in 2017).") + parser.add_argument( + "--android_api", type=int, default=27, help="Android API Level. Default is 27 (Android 8.1, released in 2017)." + ) parser.add_argument( "--android_home", type=Path, default=_path_from_env_var("ANDROID_HOME"), help="Path to the Android SDK." ) @@ -160,9 +170,12 @@ class HelpFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescript default=_path_from_env_var("ANDROID_NDK_HOME"), help="Path to the Android NDK. Typically `/ndk/`.", ) - parser.add_argument("--android_run_emulator", action="store_true", - help="Create/start an Android emulator to run the test application. " - "Requires --android, --build_java and --android_abi=x86_64.") + parser.add_argument( + "--android_run_emulator", + action="store_true", + help="Create/start an Android emulator to run the test application. 
" + "Requires --android, --build_java and --android_abi=x86_64.", + ) # iOS build options parser.add_argument( @@ -173,14 +186,12 @@ class HelpFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescript parser.add_argument( "--osx_arch", type=str, - help="Specify the Target specific architectures for iOS " - "This is only supported on MacOS host", + help="Specify the Target specific architectures for iOS This is only supported on MacOS host", ) parser.add_argument( "--apple_deploy_target", type=str, - help="Specify the minimum version of the target platform " - "This is only supported on MacOS host", + help="Specify the minimum version of the target platform This is only supported on MacOS host", ) parser.add_argument( @@ -334,9 +345,7 @@ def _validate_ios_args(args: argparse.Namespace): if not have_required_args: raise ValueError( "iOS build on MacOS canceled due to missing arguments: " - + ", ".join( - val for val, cond in zip(arg_names, needed_args, strict=False) if not cond - ) + + ", ".join(val for val, cond in zip(arg_names, needed_args, strict=False) if not cond) ) @@ -390,7 +399,9 @@ def _get_csharp_properties(args: argparse.Namespace, ort_lib_dir: Path): configuration = f"/p:Configuration={args.config}" platform = "/p:Platform=Any CPU" # need an extra config on windows as the actual build output is in the original build dir / config / config - native_lib_path = f"/p:NativeBuildOutputDir={str(args.build_dir / args.config) if util.is_windows() else str(args.build_dir)}" + native_lib_path = ( + f"/p:NativeBuildOutputDir={str(args.build_dir / args.config) if util.is_windows() else str(args.build_dir)}" + ) ort_lib_path = f"/p:OrtLibDir={ort_lib_dir}" props = [configuration, platform, native_lib_path, ort_lib_path] @@ -435,15 +446,15 @@ def _run_android_tests(args: argparse.Namespace): gradle_executable = str(REPO_ROOT / "src" / "java" / ("gradlew.bat" if util.is_windows() else "gradlew")) android_test_path = args.build_dir / "src" / "java" / 
"androidtest" import subprocess + exception = None try: - util.run([gradle_executable, "--no-daemon", - f"-DminSdkVer={android_api}", - "clean", - "connectedDebugAndroidTest"], - cwd=android_test_path, - capture_stdout=True, - capture_stderr=True,) + util.run( + [gradle_executable, "--no-daemon", f"-DminSdkVer={android_api}", "clean", "connectedDebugAndroidTest"], + cwd=android_test_path, + capture_stdout=True, + capture_stderr=True, + ) except subprocess.CalledProcessError as e: exception = e print(e) @@ -478,7 +489,7 @@ def _get_windows_build_args(args: argparse.Namespace): ] if args.use_cuda: win_args += [ - "-DCMAKE_CUDA_FLAGS_INIT=/DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 -Xcompiler=\" /MP /guard:cf /Qspectre \" -allow-unsupported-compiler", + '-DCMAKE_CUDA_FLAGS_INIT=/DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 -Xcompiler=" /MP /guard:cf /Qspectre " -allow-unsupported-compiler', ] return win_args @@ -575,16 +586,31 @@ def update(args: argparse.Namespace, env: dict[str, str]): ] if args.ios: + def _get_opencv_toolchain_file(): if args.apple_sysroot == "iphoneos": return ( - REPO_ROOT / "cmake" / "external" / "opencv" / "platforms" / "iOS" / "cmake" / - "Toolchains" / "Toolchain-iPhoneOS_Xcode.cmake" + REPO_ROOT + / "cmake" + / "external" + / "opencv" + / "platforms" + / "iOS" + / "cmake" + / "Toolchains" + / "Toolchain-iPhoneOS_Xcode.cmake" ) else: return ( - REPO_ROOT / "cmake" / "external" / "opencv" / "platforms" / "iOS" / "cmake" / - "Toolchains" / "Toolchain-iPhoneSimulator_Xcode.cmake" + REPO_ROOT + / "cmake" + / "external" + / "opencv" + / "platforms" + / "iOS" + / "cmake" + / "Toolchains" + / "Toolchain-iPhoneSimulator_Xcode.cmake" ) command += [ @@ -624,8 +650,7 @@ def _get_opencv_toolchain_file(): if args.arm64 or args.arm64ec: if args.test: log.warning( - "Cannot test on host build machine for cross-compiled " - "ARM64 builds. 
Will skip test running after build." + "Cannot test on host build machine for cross-compiled ARM64 builds. Will skip test running after build." ) args.test = False @@ -665,7 +690,11 @@ def build(args: argparse.Namespace, env: dict[str, str]): dotnet = str(_resolve_executable_path("dotnet")) # Build the library - csharp_build_command = [dotnet, "build", ".",] + csharp_build_command = [ + dotnet, + "build", + ".", + ] csharp_build_command += _get_csharp_properties(args, ort_lib_dir=lib_dir) util.run(csharp_build_command, cwd=REPO_ROOT / "src" / "csharp") util.run(csharp_build_command, cwd=REPO_ROOT / "test" / "csharp") @@ -675,7 +704,15 @@ def package(args: argparse.Namespace, env: dict[str, str]): """ Package the build output with CMake targets. """ - make_command = [str(args.cmake_path), "--build", str(args.build_dir), "--config", args.config, "--target", "package"] + make_command = [ + str(args.cmake_path), + "--build", + str(args.build_dir), + "--config", + args.config, + "--target", + "package", + ] if args.parallel: make_command.append("--parallel") util.run(make_command, env=env) @@ -749,15 +786,22 @@ def build_examples(args: argparse.Namespace, env: dict[str, str]): # On Windows, the library files are in a subdirectory named after the configuration (e.g. Debug, Release, etc.) 
lib_dir = lib_dir / args.config - cmake_command = [ - str(args.cmake_path), - "-S", str(examples_dir), - "-B", str(build_dir), - "-G", args.cmake_generator, - ] + samples_to_build + [ - "-DORT_GENAI_INCLUDE_DIR=" + str(include_dir), - "-DORT_GENAI_LIB_DIR=" + str(lib_dir), - ] + cmake_command = ( + [ + str(args.cmake_path), + "-S", + str(examples_dir), + "-B", + str(build_dir), + "-G", + args.cmake_generator, + ] + + samples_to_build + + [ + "-DORT_GENAI_INCLUDE_DIR=" + str(include_dir), + "-DORT_GENAI_LIB_DIR=" + str(lib_dir), + ] + ) if args.cmake_generator.startswith("Visual Studio"): if args.arm64: diff --git a/cgmanifests/generate_cgmanifest.py b/cgmanifests/generate_cgmanifest.py index abd30ebb00..2f2c9ad20d 100644 --- a/cgmanifests/generate_cgmanifest.py +++ b/cgmanifests/generate_cgmanifest.py @@ -115,7 +115,9 @@ def normalize_path_separators(path): submodule_lines = proc.stdout.splitlines() for submodule_line in submodule_lines: (absolute_path, url, commit) = submodule_line.split(" ") - git_deps[GitDep(commit, url)] = f"git submodule at {normalize_path_separators(os.path.relpath(absolute_path, REPO_DIR))}" + git_deps[GitDep(commit, url)] = ( + f"git submodule at {normalize_path_separators(os.path.relpath(absolute_path, REPO_DIR))}" + ) with open(os.path.join(SCRIPT_DIR, "..", "cmake", "deps.txt")) as f: depfile_reader = csv.reader(f, delimiter=";") diff --git a/examples/chat_app/app.py b/examples/chat_app/app.py index 2b0492d13b..cff38054e2 100755 --- a/examples/chat_app/app.py +++ b/examples/chat_app/app.py @@ -31,12 +31,14 @@ def change_model_listener(new_model_name): if "vision" in new_model_name: print("Configuring for multi-modal model") interface = MultiModal_ONNXModel( - model_path=d["model_dir"], execution_provider=d["provider"], + model_path=d["model_dir"], + execution_provider=d["provider"], ) else: print("Configuring for language-only model") interface = ONNXModel( - model_path=d["model_dir"], execution_provider=d["provider"], + 
model_path=d["model_dir"], + execution_provider=d["provider"], ) # interface.initialize() @@ -92,7 +94,10 @@ def launch_chat_app(expose_locally: bool = False, model_name: str = "", model_pa for ep_name in os.listdir(optimized_directory): sub_optimized_directory = os.path.join(optimized_directory, ep_name) for model_name in os.listdir(sub_optimized_directory): - available_models[model_name] = {"model_dir": os.path.join(sub_optimized_directory, model_name), "provider": get_ep_name(ep_name)} + available_models[model_name] = { + "model_dir": os.path.join(sub_optimized_directory, model_name), + "provider": get_ep_name(ep_name), + } if model_path: available_models[model_name] = {"model_dir": model_path, "provider": get_ep_name(model_path)} @@ -147,13 +152,7 @@ def launch_chat_app(expose_locally: bool = False, model_name: str = "", model_pa label="Max History Token Length", ) token_printing_step = gr.Slider( - minimum=1, - maximum=50, - value=4, - step=1, - interactive=True, - label="Token Printing Step", - visible=False + minimum=1, maximum=50, value=4, step=1, interactive=True, label="Token Printing Step", visible=False ) images = gr.File(file_count="multiple", file_types=["image"], label="Upload image(s)", visible=False) images.change( @@ -186,14 +185,7 @@ def launch_chat_app(expose_locally: bool = False, model_name: str = "", model_pa } retry_args = { "fn": interface_retry, - "inputs": [ - chatbot, - history, - max_length_tokens, - max_context_length_tokens, - token_printing_step, - images - ], + "inputs": [chatbot, history, max_length_tokens, max_context_length_tokens, token_printing_step, images], "outputs": [chatbot, history, status_display], "show_progress": True, } @@ -245,7 +237,9 @@ def launch_chat_app(expose_locally: bool = False, model_name: str = "", model_pa if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--expose_locally", action="store_true") - parser.add_argument("--model_path", "-m", type=str, required=False, help="The 
location where your model is located.") + parser.add_argument( + "--model_path", "-m", type=str, required=False, help="The location where your model is located." + ) parser.add_argument("--model_name", "-n", type=str, required=False, help="The name of your model") args = parser.parse_args() model_path = args.model_path @@ -257,7 +251,9 @@ def launch_chat_app(expose_locally: bool = False, model_name: str = "", model_pa model_name = os.path.basename(model_path) # check if genai_config.json in the model foler if "genai_config.json" not in os.listdir(model_path): - raise ValueError(f"Your model_path folder do not include 'genai.json' file, please double check your model_path '{model_path}'") + raise ValueError( + f"Your model_path folder do not include 'genai.json' file, please double check your model_path '{model_path}'" + ) if args.model_name: model_name = args.model_name diff --git a/examples/chat_app/interface/hddr_llm_onnx_interface.py b/examples/chat_app/interface/hddr_llm_onnx_interface.py index ba79c17b10..8c7941a0fd 100755 --- a/examples/chat_app/interface/hddr_llm_onnx_interface.py +++ b/examples/chat_app/interface/hddr_llm_onnx_interface.py @@ -9,6 +9,7 @@ current_dir = os.path.dirname(os.path.realpath(__file__)) sys.path.append(os.path.join(current_dir, "..", "..", "..")) + class ONNXModel: """A wrapper for OnnxRuntime-GenAI to run ONNX LLM model.""" @@ -34,7 +35,7 @@ def __init__(self, model_path, execution_provider): self.chat_template = "<|user|>{input}<|end|><|assistant|>" elif "Llama-3" in self.model_path: self.enable_history_max = 2 - self.template_header = """<|start_header_id|>system<|end_header_id|> + self.template_header = """<|start_header_id|>system<|end_header_id|> You are a helpful AI assistant.<|eot_id|>""" self.history_template = """<|start_header_id|>user<|end_header_id|> {input}<|eot_id|><|start_header_id|>assistant<|end_header_id|> @@ -43,7 +44,7 @@ def __init__(self, model_path, execution_provider): self.chat_template = 
"""<|start_header_id|>user<|end_header_id|> {input}<|eot_id|><|start_header_id|>assistant<|end_header_id|>""" - #self.chat_template = llama3_template + # self.chat_template = llama3_template else: self.enable_history_max = 2 self.template_header = "" @@ -53,12 +54,12 @@ def __init__(self, model_path, execution_provider): def generate_prompt_with_history(self, text, history, max_length=2048): prompt = "" - for dialog in history[-self.enable_history_max:]: - prompt += f'{self.history_template.format(input=dialog[0], response=dialog[1])}' + for dialog in history[-self.enable_history_max :]: + prompt += f"{self.history_template.format(input=dialog[0], response=dialog[1])}" prompt = self.template_header + prompt - prompt += f'{self.chat_template.format(input=text)}' + prompt += f"{self.chat_template.format(input=text)}" input_ids = self.tokenizer.encode(prompt) @@ -68,7 +69,7 @@ def generate_prompt_with_history(self, text, history, max_length=2048): history.clear() if "Llama-3" in self.model_path: prompt = self.template_header - prompt += f'{self.chat_template.format(input=text)}' + prompt += f"{self.chat_template.format(input=text)}" return self.tokenizer.encode(prompt) def search( @@ -80,7 +81,7 @@ def search( output_tokens = [] params = og.GeneratorParams(self.model) - search_options = {"max_length" : max_length} + search_options = {"max_length": max_length} params.set_search_options(**search_options) generator = og.Generator(self.model, params) @@ -96,23 +97,12 @@ def search( if idx % token_printing_step == 0: yield self.tokenizer.decode(output_tokens) - def predict( - self, - text, - chatbot, - history, - max_length_tokens, - max_context_length_tokens, - token_printing_step, - *args - ): + def predict(self, text, chatbot, history, max_length_tokens, max_context_length_tokens, token_printing_step, *args): if text == "": yield chatbot, history, "Empty context." 
return - inputs = self.generate_prompt_with_history( - text, history, max_length=max_context_length_tokens - ) + inputs = self.generate_prompt_with_history(text, history, max_length=max_context_length_tokens) if inputs is None: yield chatbot, history, "Input too long." @@ -162,10 +152,13 @@ def predict( sentence = sentence[: sentence.index(ai_token)].strip() break sentence = sentence.strip() - a, b = [[y[0], convert_to_markdown(y[1])] for y in history] + [[text, convert_to_markdown(sentence)]], [ - *history, - [text, sentence], - ] + a, b = ( + [[y[0], convert_to_markdown(y[1])] for y in history] + [[text, convert_to_markdown(sentence)]], + [ + *history, + [text, sentence], + ], + ) yield a, b, "Generating..." if shared_state.interrupted: diff --git a/examples/python/engine/model-qa.py b/examples/python/engine/model-qa.py index cb8e2458b7..fc4105a6b8 100644 --- a/examples/python/engine/model-qa.py +++ b/examples/python/engine/model-qa.py @@ -35,11 +35,7 @@ def run(args: argparse.Namespace): request = og.Request(params) request.add_tokens( - tokenizer.encode( - tokenizer.apply_chat_template( - messages=messages, add_generation_prompt=True - ) - ), + tokenizer.encode(tokenizer.apply_chat_template(messages=messages, add_generation_prompt=True)), ) streaming_tokenizer = tokenizer.create_stream() diff --git a/examples/python/model-generate.py b/examples/python/model-generate.py index 20c52ad9ba..649d393c4a 100644 --- a/examples/python/model-generate.py +++ b/examples/python/model-generate.py @@ -6,15 +6,18 @@ def main(args): - if args.verbose: print("Loading model...") + if args.verbose: + print("Loading model...") - if hasattr(args, 'prompts'): + if hasattr(args, "prompts"): prompts = args.prompts else: if args.non_interactive: - prompts = ["The first 4 digits of pi are", - "The square root of 2 is", - "The first 6 numbers of the Fibonacci sequence are",] + prompts = [ + "The first 4 digits of pi are", + "The square root of 2 is", + "The first 6 numbers of the 
Fibonacci sequence are", + ] else: text = input("Input: ") prompts = [text] @@ -24,10 +27,7 @@ def main(args): config = og.Config(args.model_path) # Configure search options - search_config = { - "batch_size": batch_size, - "num_beams": args.num_beams - } + search_config = {"batch_size": batch_size, "num_beams": args.num_beams} # Configure execution provider if specified if args.execution_provider != "follow_config": @@ -53,36 +53,51 @@ def main(args): model = og.Model(config) - if args.verbose: print("Model loaded") + if args.verbose: + print("Model loaded") tokenizer = og.Tokenizer(model) - if args.verbose: print("Tokenizer created") + if args.verbose: + print("Tokenizer created") if args.chat_template: - if args.chat_template.count('{') != 1 or args.chat_template.count('}') != 1: - print("Error, chat template must have exactly one pair of curly braces, e.g. '<|user|>\n{input} <|end|>\n<|assistant|>'") + if args.chat_template.count("{") != 1 or args.chat_template.count("}") != 1: + print( + "Error, chat template must have exactly one pair of curly braces, e.g. 
'<|user|>\n{input} <|end|>\n<|assistant|>'" + ) exit(1) - prompts[:] = [f'{args.chat_template.format(input=text)}' for text in prompts] + prompts[:] = [f"{args.chat_template.format(input=text)}" for text in prompts] input_tokens = tokenizer.encode_batch(prompts) - if args.verbose: print(f'Prompt(s) encoded: {prompts}') + if args.verbose: + print(f"Prompt(s) encoded: {prompts}") params = og.GeneratorParams(model) - search_options = {name:getattr(args, name) for name in ['do_sample', 'max_length', 'min_length', 'top_p', 'top_k', 'temperature', 'repetition_penalty'] if name in args} + search_options = { + name: getattr(args, name) + for name in ["do_sample", "max_length", "min_length", "top_p", "top_k", "temperature", "repetition_penalty"] + if name in args + } - if (args.verbose): print(f'Args: {args}') - if (args.verbose): print(f'Search options: {search_options}') + if args.verbose: + print(f"Args: {args}") + if args.verbose: + print(f"Search options: {search_options}") params.set_search_options(**search_options) - if args.verbose: print("GeneratorParams created") + if args.verbose: + print("GeneratorParams created") generator = og.Generator(model, params) - if args.verbose: print("Generator created") + if args.verbose: + print("Generator created") generator.append_tokens(input_tokens) - if args.verbose: print("Input tokens added") + if args.verbose: + print("Input tokens added") - if args.verbose: print("Generating tokens ...\n") + if args.verbose: + print("Generating tokens ...\n") start_time = time.time() while True: generator.generate_next_token() @@ -91,34 +106,89 @@ def main(args): run_time = time.time() - start_time for i in range(len(prompts)): - print(f'Prompt #{i}: {prompts[i]}') + print(f"Prompt #{i}: {prompts[i]}") print() print(tokenizer.decode(generator.get_sequence(i))) print() print() total_tokens = sum(len(generator.get_sequence(i)) for i in range(len(prompts))) - print(f"Tokens: {total_tokens} Time: {run_time:.2f} Tokens per second: 
{total_tokens/run_time:.2f}") + print(f"Tokens: {total_tokens} Time: {run_time:.2f} Tokens per second: {total_tokens / run_time:.2f}") print() + if __name__ == "__main__": - parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS, description="End-to-end token generation loop example for gen-ai") - parser.add_argument('-m', '--model_path', type=str, required=True, help='Onnx model folder path (must contain genai_config.json and model.onnx)') - parser.add_argument('-e', '--execution_provider', type=str, required=False, default='follow_config', choices=["cpu", "cuda", "dml", "NvTensorRtRtx", "follow_config"], help="Execution provider to run the ONNX Runtime session with. Defaults to follow_config that uses the execution provider listed in the genai_config.json instead.") - parser.add_argument('-pr', '--prompts', nargs='*', required=False, help='Input prompts to generate tokens from. Provide this parameter multiple times to batch multiple prompts') - parser.add_argument('-i', '--min_length', type=int, default=25, help='Min number of tokens to generate including the prompt') - parser.add_argument('-l', '--max_length', type=int, default=50, help='Max number of tokens to generate including the prompt') - parser.add_argument('-ds', '--do_sample', action='store_true', help='Do random sampling. When false, greedy or beam search are used to generate the output. Defaults to false') - parser.add_argument('--top_p', type=float, help='Top p probability to sample with') - parser.add_argument('-k', '--top_k', type=int, help='Top k tokens to sample from') - parser.add_argument('-t', '--temperature', type=float, help='Temperature to sample with') - parser.add_argument('-r', '--repetition_penalty', type=float, help='Repetition penalty to sample with') - parser.add_argument('-v', '--verbose', action='store_true', default=False, help='Print verbose output and timing information. 
Defaults to false') - parser.add_argument('-b', '--batch_size_for_cuda_graph', type=int, default=1, help='Max batch size for CUDA graph') - parser.add_argument('-c', '--chat_template', type=str, default='', help='Chat template to use for the prompt. User input will be injected into {input}. If not set, the prompt is used as is.') - parser.add_argument('--chunk_size', type=int, default=0, help='Chunk size for prefill chunking during context processing (default: 0 = disabled, >0 = enabled)') - parser.add_argument('-n', '--num_beams', type=int, default=3, help='Number of beams for beam search (default: 3)') - parser.add_argument('--non-interactive', action=argparse.BooleanOptionalAction, required=False, default=False, help='Non-interactive mode, mainly for CI usage') + parser = argparse.ArgumentParser( + argument_default=argparse.SUPPRESS, description="End-to-end token generation loop example for gen-ai" + ) + parser.add_argument( + "-m", + "--model_path", + type=str, + required=True, + help="Onnx model folder path (must contain genai_config.json and model.onnx)", + ) + parser.add_argument( + "-e", + "--execution_provider", + type=str, + required=False, + default="follow_config", + choices=["cpu", "cuda", "dml", "NvTensorRtRtx", "follow_config"], + help="Execution provider to run the ONNX Runtime session with. Defaults to follow_config that uses the execution provider listed in the genai_config.json instead.", + ) + parser.add_argument( + "-pr", + "--prompts", + nargs="*", + required=False, + help="Input prompts to generate tokens from. Provide this parameter multiple times to batch multiple prompts", + ) + parser.add_argument( + "-i", "--min_length", type=int, default=25, help="Min number of tokens to generate including the prompt" + ) + parser.add_argument( + "-l", "--max_length", type=int, default=50, help="Max number of tokens to generate including the prompt" + ) + parser.add_argument( + "-ds", + "--do_sample", + action="store_true", + help="Do random sampling. 
When false, greedy or beam search are used to generate the output. Defaults to false", + ) + parser.add_argument("--top_p", type=float, help="Top p probability to sample with") + parser.add_argument("-k", "--top_k", type=int, help="Top k tokens to sample from") + parser.add_argument("-t", "--temperature", type=float, help="Temperature to sample with") + parser.add_argument("-r", "--repetition_penalty", type=float, help="Repetition penalty to sample with") + parser.add_argument( + "-v", + "--verbose", + action="store_true", + default=False, + help="Print verbose output and timing information. Defaults to false", + ) + parser.add_argument("-b", "--batch_size_for_cuda_graph", type=int, default=1, help="Max batch size for CUDA graph") + parser.add_argument( + "-c", + "--chat_template", + type=str, + default="", + help="Chat template to use for the prompt. User input will be injected into {input}. If not set, the prompt is used as is.", + ) + parser.add_argument( + "--chunk_size", + type=int, + default=0, + help="Chunk size for prefill chunking during context processing (default: 0 = disabled, >0 = enabled)", + ) + parser.add_argument("-n", "--num_beams", type=int, default=3, help="Number of beams for beam search (default: 3)") + parser.add_argument( + "--non-interactive", + action=argparse.BooleanOptionalAction, + required=False, + default=False, + help="Non-interactive mode, mainly for CI usage", + ) args = parser.parse_args() main(args) diff --git a/examples/python/model-qa.py b/examples/python/model-qa.py index 73d035fb39..cfd31cd946 100644 --- a/examples/python/model-qa.py +++ b/examples/python/model-qa.py @@ -1,4 +1,4 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. +# Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
import argparse @@ -15,17 +15,19 @@ def get_tools_list(input_tools): try: tools_list = json.loads(input_tools) except json.JSONDecodeError: - raise ValueError("Invalid JSON format for tools list, expected format: '[{\"name\": \"fn1\"},{\"name\": \"fn2\"}]'") + raise ValueError('Invalid JSON format for tools list, expected format: \'[{"name": "fn1"},{"name": "fn2"}]\'') if len(tools_list) == 0: raise ValueError("Tools list cannot be empty") return tools_list + def create_prompt_tool_input(tools_list): tool_input = str(tools_list[0]) for tool in tools_list[1:]: - tool_input += ',' + str(tool) + tool_input += "," + str(tool) return tool_input + def get_json_grammar(input_tools): tools_list = get_tools_list(input_tools) prompt_tool_input = create_prompt_tool_input(tools_list) @@ -34,19 +36,28 @@ def get_json_grammar(input_tools): else: output = '{ "anyOf": [' + json.dumps(tools_list[0]) for tool in tools_list[1:]: - output += ',' + json.dumps(tool) - output += '] }' + output += "," + json.dumps(tool) + output += "] }" return prompt_tool_input, output + def get_lark_grammar(input_tools): tools_list = get_tools_list(input_tools) prompt_tool_input = create_prompt_tool_input(tools_list) if len(tools_list) == 1: # output = ("start: TEXT | fun_call\n" "TEXT: /[^{](.|\\n)*/\n" " fun_call: <|tool_call|> %json " + json.dumps(tools_list[0])) - output = ("start: TEXT | fun_call\nTEXT: /[^{](.|\\n)*/\n fun_call: <|tool_call|> %json " + json.dumps(convert_tool_to_grammar_input(tools_list[0]))) + output = "start: TEXT | fun_call\nTEXT: /[^{](.|\\n)*/\n fun_call: <|tool_call|> %json " + json.dumps( + convert_tool_to_grammar_input(tools_list[0]) + ) return prompt_tool_input, output else: - return prompt_tool_input, "start: TEXT | fun_call \n TEXT: /[^{](.|\n)*/ \n fun_call: <|tool_call|> %json {\"anyOf\": [" + ','.join([json.dumps(tool) for tool in tools_list]) + "]}" + return ( + prompt_tool_input, + 'start: TEXT | fun_call \n TEXT: /[^{](.|\n)*/ \n fun_call: <|tool_call|> %json 
{"anyOf": [' + + ",".join([json.dumps(tool) for tool in tools_list]) + + "]}", + ) + def convert_tool_to_grammar_input(tool): param_props = {} @@ -54,30 +65,32 @@ def convert_tool_to_grammar_input(tool): for param_name, param_info in tool.get("parameters", {}).items(): param_props[param_name] = { "type": param_info.get("type", "string"), - "description": param_info.get("description", "") + "description": param_info.get("description", ""), } required_params.append(param_name) output_schema = { - "description": tool.get('description', ''), + "description": tool.get("description", ""), "type": "object", "required": ["name", "parameters"], "additionalProperties": False, "properties": { - "name": { "const": tool["name"] }, + "name": {"const": tool["name"]}, "parameters": { "type": "object", "properties": param_props, "required": required_params, - "additionalProperties": False - } - } + "additionalProperties": False, + }, + }, } if len(param_props) == 0: output_schema["required"] = ["name"] return output_schema + def main(args): - if args.verbose: print("Loading model...") + if args.verbose: + print("Loading model...") if args.timings: started_timestamp = 0 first_token_timestamp = 0 @@ -86,21 +99,30 @@ def main(args): if args.execution_provider != "follow_config": config.clear_providers() if args.execution_provider != "cpu": - if args.verbose: print(f"Setting model to {args.execution_provider}") + if args.verbose: + print(f"Setting model to {args.execution_provider}") config.append_provider(args.execution_provider) model = og.Model(config) - if args.verbose: print("Model loaded") + if args.verbose: + print("Model loaded") tokenizer = og.Tokenizer(model) tokenizer_stream = tokenizer.create_stream() - if args.verbose: print("Tokenizer created") - if args.verbose: print() + if args.verbose: + print("Tokenizer created") + if args.verbose: + print() - search_options = {name:getattr(args, name) for name in ['do_sample', 'max_length', 'min_length', 'top_p', 'top_k', 
'temperature', 'repetition_penalty'] if name in args} - search_options['batch_size'] = 1 + search_options = { + name: getattr(args, name) + for name in ["do_sample", "max_length", "min_length", "top_p", "top_k", "temperature", "repetition_penalty"] + if name in args + } + search_options["batch_size"] = 1 - if args.verbose: print(search_options) + if args.verbose: + print(search_options) system_prompt = args.system_prompt guidance_type = "" @@ -134,7 +156,8 @@ def main(args): if text == "quit()": break - if args.timings: started_timestamp = time.time() + if args.timings: + started_timestamp = time.time() params = og.GeneratorParams(model) params.set_search_options(**search_options) @@ -146,7 +169,8 @@ def main(args): print("Guidance input is:", guidance_input) generator = og.Generator(model, params) - if args.verbose: print("Generator created") + if args.verbose: + print("Generator created") # Create messages with proper JSON encoding # Gemma2 models don't support system role, so we prepend system prompt to user message @@ -156,13 +180,10 @@ def main(args): elif guidance_type == "json_schema" or guidance_type == "lark_grammar": messages_list = [ {"role": "system", "content": system_prompt, "tools": prompt_tool_input}, - {"role": "user", "content": text} + {"role": "user", "content": text}, ] else: - messages_list = [ - {"role": "system", "content": system_prompt}, - {"role": "user", "content": text} - ] + messages_list = [{"role": "system", "content": system_prompt}, {"role": "user", "content": text}] # Convert to JSON string for tokenizer messages = json.dumps(messages_list) @@ -176,13 +197,14 @@ def main(args): input_tokens = tokenizer.encode(prompt) generator.append_tokens(input_tokens) - if args.verbose: print("Running generation loop ...") + if args.verbose: + print("Running generation loop ...") if args.timings: first = True new_tokens = [] print() - print("Output: ", end='', flush=True) + print("Output: ", end="", flush=True) try: while True: @@ -196,8 +218,9 
@@ def main(args): break new_token = generator.get_next_tokens()[0] - print(tokenizer_stream.decode(new_token), end='', flush=True) - if args.timings: new_tokens.append(new_token) + print(tokenizer_stream.decode(new_token), end="", flush=True) + if args.timings: + new_tokens.append(new_token) except KeyboardInterrupt: print(" --control+c pressed, aborting generation--") print() @@ -210,27 +233,88 @@ def main(args): if args.timings: prompt_time = first_token_timestamp - started_timestamp run_time = time.time() - first_token_timestamp - print(f"Prompt length: {len(input_tokens)}, New tokens: {len(new_tokens)}, Time to first: {(prompt_time):.2f}s, Prompt tokens per second: {len(input_tokens)/prompt_time:.2f} tps, New tokens per second: {len(new_tokens)/run_time:.2f} tps") + print( + f"Prompt length: {len(input_tokens)}, New tokens: {len(new_tokens)}, Time to first: {(prompt_time):.2f}s, Prompt tokens per second: {len(input_tokens) / prompt_time:.2f} tps, New tokens per second: {len(new_tokens) / run_time:.2f} tps" + ) # If Input prompt is provided it will just run the model for the input prompt and exit if args.input_prompt: break + if __name__ == "__main__": - parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS, description="End-to-end AI Question/Answer example for gen-ai") - parser.add_argument('-m', '--model_path', type=str, required=True, help='Onnx model folder path (must contain genai_config.json and model.onnx)') - parser.add_argument('-e', '--execution_provider', type=str, required=False, default='follow_config', choices=["cpu", "cuda", "dml", "NvTensorRtRtx", "follow_config"], help="Execution provider to run the ONNX Runtime session with. 
Defaults to follow_config that uses the execution provider listed in the genai_config.json instead.") - parser.add_argument('-i', '--min_length', type=int, help='Min number of tokens to generate including the prompt') - parser.add_argument('-l', '--max_length', type=int, help='Max number of tokens to generate including the prompt') - parser.add_argument('-ds', '--do_sample', action='store_true', help='Do random sampling. When false, greedy or beam search are used to generate the output. Defaults to false') - parser.add_argument('-p', '--top_p', type=float, help='Top p probability to sample with') - parser.add_argument('-k', '--top_k', type=int, help='Top k tokens to sample from') - parser.add_argument('-t', '--temperature', type=float, help='Temperature to sample with') - parser.add_argument('-re', '--repetition_penalty', type=float, help='Repetition penalty to sample with') - parser.add_argument('-v', '--verbose', action='store_true', default=False, help='Print verbose output and timing information. Defaults to false') - parser.add_argument('-g', '--timings', action='store_true', default=False, help='Print timing information for each generation step. Defaults to false') - parser.add_argument('-gtype', '--guidance_type', type=str, default="none", choices=["none", "json_schema", "regex", "lark_grammar"], help='Provide guidance type for the model, options are json_schema, regex, or lark_grammar.') - parser.add_argument('-ginfo', '--guidance_info', type=str, default='', help='Provide information of the guidance type used, it could be either tools or regex string. 
It is required if guidance_type is provided') - parser.add_argument('-s', '--system_prompt', type=str, default='You are a helpful AI assistant.', help='System prompt to use for the prompt.') - parser.add_argument('-inp', '--input_prompt', type=str, default='', help='Input Prompt, if provided it will just run the prompt and exit') + parser = argparse.ArgumentParser( + argument_default=argparse.SUPPRESS, description="End-to-end AI Question/Answer example for gen-ai" + ) + parser.add_argument( + "-m", + "--model_path", + type=str, + required=True, + help="Onnx model folder path (must contain genai_config.json and model.onnx)", + ) + parser.add_argument( + "-e", + "--execution_provider", + type=str, + required=False, + default="follow_config", + choices=["cpu", "cuda", "dml", "NvTensorRtRtx", "follow_config"], + help="Execution provider to run the ONNX Runtime session with. Defaults to follow_config that uses the execution provider listed in the genai_config.json instead.", + ) + parser.add_argument("-i", "--min_length", type=int, help="Min number of tokens to generate including the prompt") + parser.add_argument("-l", "--max_length", type=int, help="Max number of tokens to generate including the prompt") + parser.add_argument( + "-ds", + "--do_sample", + action="store_true", + help="Do random sampling. When false, greedy or beam search are used to generate the output. Defaults to false", + ) + parser.add_argument("-p", "--top_p", type=float, help="Top p probability to sample with") + parser.add_argument("-k", "--top_k", type=int, help="Top k tokens to sample from") + parser.add_argument("-t", "--temperature", type=float, help="Temperature to sample with") + parser.add_argument("-re", "--repetition_penalty", type=float, help="Repetition penalty to sample with") + parser.add_argument( + "-v", + "--verbose", + action="store_true", + default=False, + help="Print verbose output and timing information. 
Defaults to false", + ) + parser.add_argument( + "-g", + "--timings", + action="store_true", + default=False, + help="Print timing information for each generation step. Defaults to false", + ) + parser.add_argument( + "-gtype", + "--guidance_type", + type=str, + default="none", + choices=["none", "json_schema", "regex", "lark_grammar"], + help="Provide guidance type for the model, options are json_schema, regex, or lark_grammar.", + ) + parser.add_argument( + "-ginfo", + "--guidance_info", + type=str, + default="", + help="Provide information of the guidance type used, it could be either tools or regex string. It is required if guidance_type is provided", + ) + parser.add_argument( + "-s", + "--system_prompt", + type=str, + default="You are a helpful AI assistant.", + help="System prompt to use for the prompt.", + ) + parser.add_argument( + "-inp", + "--input_prompt", + type=str, + default="", + help="Input Prompt, if provided it will just run the prompt and exit", + ) args = parser.parse_args() main(args) diff --git a/examples/python/phi4-mm.py b/examples/python/phi4-mm.py index d4ceda0b75..80a1413942 100644 --- a/examples/python/phi4-mm.py +++ b/examples/python/phi4-mm.py @@ -11,6 +11,7 @@ # og.set_log_options(enabled=True, model_input_values=True, model_output_values=True) + def _find_dir_contains_sub_dir(current_dir: Path, target_dir_name): curr_path = Path(current_dir).absolute() target_dir = glob.glob(target_dir_name, root_dir=curr_path) @@ -20,7 +21,7 @@ def _find_dir_contains_sub_dir(current_dir: Path, target_dir_name): if curr_path.parent == curr_path: # Root dir return None - return _find_dir_contains_sub_dir(curr_path / '..', target_dir_name) + return _find_dir_contains_sub_dir(curr_path / "..", target_dir_name) def _complete(text, state): @@ -33,6 +34,7 @@ def get_paths(modality, user_provided_paths, default_paths, interactive): if interactive: try: import readline + readline.set_completer_delims(" \t\n;") readline.parse_and_bind("tab: complete") 
readline.set_completer(_complete) @@ -41,9 +43,9 @@ def get_paths(modality, user_provided_paths, default_paths, interactive): pass paths = [ path.strip() - for path in input( - f"{modality.capitalize()} Path (comma separated; leave empty if no {modality}): " - ).split(",") + for path in input(f"{modality.capitalize()} Path (comma separated; leave empty if no {modality}): ").split( + "," + ) ] else: paths = user_provided_paths if user_provided_paths else default_paths @@ -72,14 +74,28 @@ def run(args: argparse.Namespace): image_paths = get_paths( modality="image", user_provided_paths=args.image_paths, - default_paths=[str(_find_dir_contains_sub_dir(Path(__file__).parent, "test") / "test_models" / "images" / "australia.jpg")], - interactive=interactive + default_paths=[ + str( + _find_dir_contains_sub_dir(Path(__file__).parent, "test") + / "test_models" + / "images" + / "australia.jpg" + ) + ], + interactive=interactive, ) audio_paths = get_paths( modality="audio", user_provided_paths=args.audio_paths, - default_paths=[str(_find_dir_contains_sub_dir(Path(__file__).parent, "test") / "test_models" / "audios" / "1272-141231-0002.mp3")], - interactive=interactive + default_paths=[ + str( + _find_dir_contains_sub_dir(Path(__file__).parent, "test") + / "test_models" + / "audios" + / "1272-141231-0002.mp3" + ) + ], + interactive=interactive, ) images = None @@ -94,7 +110,7 @@ def run(args: argparse.Namespace): if not os.path.exists(image_path): raise FileNotFoundError(f"Image file not found: {image_path}") print(f"Using image: {image_path}") - prompt += f"<|image_{i+1}|>\n" + prompt += f"<|image_{i + 1}|>\n" images = og.Images.open(*image_paths) # Get audios @@ -105,10 +121,9 @@ def run(args: argparse.Namespace): if not os.path.exists(audio_path): raise FileNotFoundError(f"Audio file not found: {audio_path}") print(f"Using audio: {audio_path}") - prompt += f"<|audio_{i+1}|>\n" + prompt += f"<|audio_{i + 1}|>\n" audios = og.Audios.open(*audio_paths) - if interactive: text = 
input("Prompt: ") else: @@ -154,23 +169,30 @@ def run(args: argparse.Namespace): if __name__ == "__main__": parser = argparse.ArgumentParser() + parser.add_argument("-m", "--model_path", type=str, required=True, help="Path to the folder containing the model") parser.add_argument( - "-m", "--model_path", type=str, required=True, help="Path to the folder containing the model" - ) - parser.add_argument( - "-e", "--execution_provider", type=str, required=False, default='follow_config', choices=["cpu", "cuda", "dml", "follow_config"], help="Execution provider to run the ONNX Runtime session with. Defaults to follow_config that uses the execution provider listed in the genai_config.json instead." + "-e", + "--execution_provider", + type=str, + required=False, + default="follow_config", + choices=["cpu", "cuda", "dml", "follow_config"], + help="Execution provider to run the ONNX Runtime session with. Defaults to follow_config that uses the execution provider listed in the genai_config.json instead.", ) parser.add_argument( - "--image_paths", nargs='*', type=str, required=False, help="Path to the images, mainly for CI usage" + "--image_paths", nargs="*", type=str, required=False, help="Path to the images, mainly for CI usage" ) parser.add_argument( - "--audio_paths", nargs='*', type=str, required=False, help="Path to the audios, mainly for CI usage" + "--audio_paths", nargs="*", type=str, required=False, help="Path to the audios, mainly for CI usage" ) parser.add_argument( - '-pr', '--prompt', required=False, help='Input prompts to generate tokens from, mainly for CI usage' + "-pr", "--prompt", required=False, help="Input prompts to generate tokens from, mainly for CI usage" ) parser.add_argument( - '--non-interactive', action=argparse.BooleanOptionalAction, required=False, help='Non-interactive mode, mainly for CI usage' + "--non-interactive", + action=argparse.BooleanOptionalAction, + required=False, + help="Non-interactive mode, mainly for CI usage", ) args = 
parser.parse_args() run(args) diff --git a/examples/python/whisper.py b/examples/python/whisper.py index 600cd97863..f460337996 100644 --- a/examples/python/whisper.py +++ b/examples/python/whisper.py @@ -10,6 +10,7 @@ # og.set_log_options(enabled=True, model_input_values=True, model_output_values=True) + def _complete(text, state): return (glob.glob(text + "*") + [None])[state] @@ -101,24 +102,27 @@ def run(args: argparse.Namespace): if __name__ == "__main__": parser = argparse.ArgumentParser() + parser.add_argument("-m", "--model_path", type=str, required=True, help="Path to the model") parser.add_argument( - "-m", "--model_path", type=str, required=True, help="Path to the model" - ) - parser.add_argument( - '-e', '--execution_provider', type=str, required=False, default='follow_config', choices=["cpu", "cuda", "follow_config"], - help="Execution provider to run the ONNX Runtime session with. Defaults to follow_config that uses the execution provider listed in the genai_config.json instead." - ) - parser.add_argument( - "-b", "--num_beams", type=int, default=4, help="Number of beams" - ) - parser.add_argument( - "-a", "--audio", type=str, default="", help="Path to audio file for CI testing purposes" + "-e", + "--execution_provider", + type=str, + required=False, + default="follow_config", + choices=["cpu", "cuda", "follow_config"], + help="Execution provider to run the ONNX Runtime session with. 
Defaults to follow_config that uses the execution provider listed in the genai_config.json instead.", ) + parser.add_argument("-b", "--num_beams", type=int, default=4, help="Number of beams") + parser.add_argument("-a", "--audio", type=str, default="", help="Path to audio file for CI testing purposes") parser.add_argument( "-o", "--output", type=str, default="", help="Expected transcribed output for CI testing purposes" ) parser.add_argument( - "-ni", "--non_interactive", default=False, action="store_true", help="Non-interactive mode for CI testing purposes" + "-ni", + "--non_interactive", + default=False, + action="store_true", + help="Non-interactive mode for CI testing purposes", ) args = parser.parse_args() run(args) diff --git a/examples/slm_engine/build_scripts/build_deps.py b/examples/slm_engine/build_scripts/build_deps.py index 1f7fd7ca91..9ea2a6ff42 100755 --- a/examples/slm_engine/build_scripts/build_deps.py +++ b/examples/slm_engine/build_scripts/build_deps.py @@ -64,18 +64,14 @@ def copy_files_without_hidden(src, dest): dest: Path to the destination directory. 
""" try: - os.makedirs( - dest, exist_ok=True - ) # Create destination directory if it doesn't exist + os.makedirs(dest, exist_ok=True) # Create destination directory if it doesn't exist for root, dirs, files in os.walk(src): for file in files: if not file.startswith("."): # Exclude hidden files src_file = os.path.join(root, file) dest_file = os.path.join(dest, os.path.relpath(src_file, src)) - os.makedirs( - os.path.dirname(dest_file), exist_ok=True - ) # Create necessary directories + os.makedirs(os.path.dirname(dest_file), exist_ok=True) # Create necessary directories shutil.copy2(src_file, dest_file) except OSError as e: @@ -97,9 +93,7 @@ def copy_files_keeping_symlinks(src_files, dest): if not os.path.exists(linkname): os.symlink(linkto, linkname) elif os.path.isdir(file): - shutil.copytree( - file, f"{dest}/{os.path.basename(file)}", dirs_exist_ok=True - ) + shutil.copytree(file, f"{dest}/{os.path.basename(file)}", dirs_exist_ok=True) else: shutil.copy2(file, dest) @@ -183,12 +177,7 @@ def build_ort(args, build_dir, artifacts_dir): if not os.path.exists("onnxruntime"): # Clone the ORT Repo print("Cloning ONNX Runtime") - if ( - subprocess.call( - ["git", "clone", "https://github.com/microsoft/onnxruntime.git"] - ) - != 0 - ): + if subprocess.call(["git", "clone", "https://github.com/microsoft/onnxruntime.git"]) != 0: raise Exception("Failed to clone ONNX Runtime") # Now get the dependencies @@ -229,9 +218,7 @@ def build_ort(args, build_dir, artifacts_dir): ] ) if args.qnn_sdk_path: - cmd_args.extend( - ["--use_qnn", "static_lib", "--qnn_home", args.qnn_sdk_path] - ) + cmd_args.extend(["--use_qnn", "static_lib", "--qnn_home", args.qnn_sdk_path]) cmd_args.extend(["--cmake_extra_defines", "onnxruntime_BUILD_UNIT_TESTS=OFF"]) @@ -313,7 +300,6 @@ def build_ort(args, build_dir, artifacts_dir): def build_ort_genai(args, artifacts_dir, ort_home): - time_build_start = time.time() # Navigate to the directory where this Python file is located @@ -347,9 +333,7 @@ def 
build_ort_genai(args, artifacts_dir, ort_home): # Function calling will work in both guidance and fallback modes ] if ort_home is None: - raise Exception( - f"{RED}ORT Home is None. Please build ORT from source first{CLEAR}" - ) + raise Exception(f"{RED}ORT Home is None. Please build ORT from source first{CLEAR}") print(f"{MAGENTA}ORT Home: {ort_home}{CLEAR}") cmd_args.extend(["--ort_home", ort_home]) @@ -416,12 +400,8 @@ def build_ort_genai(args, artifacts_dir, ort_home): os.makedirs(f"{artifacts_dir}/include", exist_ok=True) os.makedirs(f"{artifacts_dir}/lib", exist_ok=True) - copy_files_keeping_symlinks( - glob.glob(f"{build_dir_name}/install/lib/*"), f"{artifacts_dir}/lib" - ) - copy_files_keeping_symlinks( - glob.glob(f"{build_dir_name}/install/bin/*"), f"{artifacts_dir}/lib" - ) + copy_files_keeping_symlinks(glob.glob(f"{build_dir_name}/install/lib/*"), f"{artifacts_dir}/lib") + copy_files_keeping_symlinks(glob.glob(f"{build_dir_name}/install/bin/*"), f"{artifacts_dir}/lib") copy_files_keeping_symlinks( glob.glob(f"{build_dir_name}/install/include/*"), @@ -496,9 +476,7 @@ def build_header_only(args, build_dir, artifacts_dir): result = subprocess.call(["git", "checkout", lib["version"]]) if result != 0: - print( - f"{RED}Failed to checkout version: {lib['version']} {lib['name']}{CLEAR}" - ) + print(f"{RED}Failed to checkout version: {lib['version']} {lib['name']}{CLEAR}") return if not os.path.exists(dest_root_dir): @@ -510,9 +488,7 @@ def build_header_only(args, build_dir, artifacts_dir): shutil.copy2(file, dest_root_dir) elif "directory" in lib: os.chdir("..") - copy_files_without_hidden( - f"{lib['name']}/{lib['directory']}", dest_root_dir - ) + copy_files_without_hidden(f"{lib['name']}/{lib['directory']}", dest_root_dir) else: # Copy the entire directory os.chdir("..") @@ -525,16 +501,12 @@ def build_header_only(args, build_dir, artifacts_dir): def main(): - parser = argparse.ArgumentParser( - description="Build script for dependency libraries" - ) + 
parser = argparse.ArgumentParser(description="Build script for dependency libraries") # Adding arguments parser.add_argument("--android_sdk_path", type=str, help="Path to ANDROID SDK") parser.add_argument("--android_ndk_path", type=str, help="Path to ANDROID NDK") - parser.add_argument( - "--api_level", type=str, help="Android API Level", default="27" - ) # e.g., 29 + parser.add_argument("--api_level", type=str, help="Android API Level", default="27") # e.g., 29 parser.add_argument( "--qnn_sdk_path", type=str, @@ -572,9 +544,7 @@ def main(): args.android = True # If the user didn't specify build_ort_from_source assert if not args.build_ort_from_source: - raise Exception( - "For Android build ONNX Runtime use: --build_ort_from_source" - ) + raise Exception("For Android build ONNX Runtime use: --build_ort_from_source") else: args.android = False @@ -592,9 +562,7 @@ def main(): dep_src_dir = os.path.abspath("../../../build/slm_deps") os.makedirs(dep_src_dir, exist_ok=True) - artifacts_dir = os.path.abspath( - f"slm_deps/artifacts/{get_platform_dirname(args)}-{get_machine_type(args)}" - ) + artifacts_dir = os.path.abspath(f"slm_deps/artifacts/{get_platform_dirname(args)}-{get_machine_type(args)}") os.makedirs(artifacts_dir, exist_ok=True) diff --git a/examples/slm_engine/test/chat_ui.py b/examples/slm_engine/test/chat_ui.py index dc505d0001..6f63503279 100644 --- a/examples/slm_engine/test/chat_ui.py +++ b/examples/slm_engine/test/chat_ui.py @@ -60,9 +60,7 @@ def reset_chat(): with gr.Blocks() as demo: - kpi_grid = gr.Dataframe( - headers=["KPI", "Value"], datatype=["str", "str"], render=False - ) + kpi_grid = gr.Dataframe(headers=["KPI", "Value"], datatype=["str", "str"], render=False) gr.Markdown("

Chat with ONNX SLM Engine

") with gr.Row(): with gr.Column(): diff --git a/examples/slm_engine/test/test_slm_server.py b/examples/slm_engine/test/test_slm_server.py index 900cdfd553..56e0b6770b 100755 --- a/examples/slm_engine/test/test_slm_server.py +++ b/examples/slm_engine/test/test_slm_server.py @@ -34,9 +34,7 @@ def launch_server(server_binary: str, model_path: str): response = requests.get(url) json_response = json.loads(response.text) if json_response["response"]["status"] == "success": - print( - f"{MAGENTA}Engine State: {json_response['response']['engine_state']}{CLEAR}" - ) + print(f"{MAGENTA}Engine State: {json_response['response']['engine_state']}{CLEAR}") started = True except Exception: # Initially the server may not be ready to accept requests @@ -56,7 +54,6 @@ def launch_server(server_binary: str, model_path: str): # This function tests the OpenAI API Interface def run_test(url: str): - # Test the API print("Testing the API with a test message") test_message = """ diff --git a/examples/slm_engine/test/test_tool_calling.py b/examples/slm_engine/test/test_tool_calling.py index 717580abaa..ea8afc165b 100755 --- a/examples/slm_engine/test/test_tool_calling.py +++ b/examples/slm_engine/test/test_tool_calling.py @@ -22,61 +22,40 @@ def test_tool_calling(): payload1 = { "messages": [ - { - "role": "system", - "content": "You are a helpful assistant with these tools." 
- }, + {"role": "system", "content": "You are a helpful assistant with these tools."}, { "role": "user", - "content": "book flight ticket from Beijing to Paris(using airport code) in 2025-12-04 to 2025-12-10 , then book hotel from 2025-12-04 to 2025-12-10 in Paris" - } + "content": "book flight ticket from Beijing to Paris(using airport code) in 2025-12-04 to 2025-12-10 , then book hotel from 2025-12-04 to 2025-12-10 in Paris", + }, ], "tools": [ { "name": "booking_flight_tickets", "description": "booking flights", "parameters": { - "origin_airport_code": { - "description": "The name of Departure airport code", - "type": "string" - }, + "origin_airport_code": {"description": "The name of Departure airport code", "type": "string"}, "destination_airport_code": { "description": "The name of Destination airport code", - "type": "string" - }, - "departure_date": { - "description": "The date of outbound flight", - "type": "string" + "type": "string", }, - "return_date": { - "description": "The date of return flight", - "type": "string" - } - } + "departure_date": {"description": "The date of outbound flight", "type": "string"}, + "return_date": {"description": "The date of return flight", "type": "string"}, + }, }, { "name": "booking_hotels", "description": "booking hotel", "parameters": { - "destination": { - "description": "The name of the city", - "type": "string" - }, - "check_in_date": { - "description": "The date of check in", - "type": "string" - }, - "checkout_date": { - "description": "The date of check out", - "type": "string" - } - } - } + "destination": {"description": "The name of the city", "type": "string"}, + "check_in_date": {"description": "The date of check in", "type": "string"}, + "checkout_date": {"description": "The date of check out", "type": "string"}, + }, + }, ], "temperature": 0.00001, "max_tokens": 4096, "top_p": 1.0, - "do_sample": False + "do_sample": False, } try: @@ -98,43 +77,31 @@ def test_tool_calling(): # Test case 2: Flight only 
payload2 = { "messages": [ - { - "role": "system", - "content": "You are a helpful travel assistant." - }, + {"role": "system", "content": "You are a helpful travel assistant."}, { "role": "user", - "content": "I need to book a flight from JFK to LHR on 2025-08-15, returning on 2025-08-22" - } + "content": "I need to book a flight from JFK to LHR on 2025-08-15, returning on 2025-08-22", + }, ], "tools": [ { "name": "booking_flight_tickets", "description": "booking flights", "parameters": { - "origin_airport_code": { - "description": "The name of Departure airport code", - "type": "string" - }, + "origin_airport_code": {"description": "The name of Departure airport code", "type": "string"}, "destination_airport_code": { "description": "The name of Destination airport code", - "type": "string" + "type": "string", }, - "departure_date": { - "description": "The date of outbound flight", - "type": "string" - }, - "return_date": { - "description": "The date of return flight", - "type": "string" - } - } + "departure_date": {"description": "The date of outbound flight", "type": "string"}, + "return_date": {"description": "The date of return flight", "type": "string"}, + }, } ], "temperature": 0.1, "max_tokens": 2048, "top_p": 0.9, - "do_sample": True + "do_sample": True, } try: @@ -156,39 +123,24 @@ def test_tool_calling(): # Test case 3: Hotel only payload3 = { "messages": [ - { - "role": "system", - "content": "You are a helpful hotel booking assistant." 
- }, - { - "role": "user", - "content": "I need to book a hotel in Tokyo from 2025-09-01 to 2025-09-05" - } + {"role": "system", "content": "You are a helpful hotel booking assistant."}, + {"role": "user", "content": "I need to book a hotel in Tokyo from 2025-09-01 to 2025-09-05"}, ], "tools": [ { "name": "booking_hotels", "description": "booking hotel", "parameters": { - "destination": { - "description": "The name of the city", - "type": "string" - }, - "check_in_date": { - "description": "The date of check in", - "type": "string" - }, - "checkout_date": { - "description": "The date of check out", - "type": "string" - } - } + "destination": {"description": "The name of the city", "type": "string"}, + "check_in_date": {"description": "The date of check in", "type": "string"}, + "checkout_date": {"description": "The date of check out", "type": "string"}, + }, } ], "temperature": 0.2, "max_tokens": 1024, "top_p": 0.95, - "do_sample": True + "do_sample": True, } try: @@ -207,6 +159,7 @@ def test_tool_calling(): print("All tool calling tests completed!") print("=" * 70) + if __name__ == "__main__": print("Starting SLM Server Tool Calling Tests...") test_tool_calling() diff --git a/src/python/py/models/builders/__init__.py b/src/python/py/models/builders/__init__.py index f6e8fd09d0..b48d1a30e2 100644 --- a/src/python/py/models/builders/__init__.py +++ b/src/python/py/models/builders/__init__.py @@ -49,5 +49,5 @@ "PhiModel", "Qwen3Model", "QwenModel", - "SmolLM3Model" + "SmolLM3Model", ] diff --git a/src/python/py/models/builders/chatglm.py b/src/python/py/models/builders/chatglm.py index f8959c8d01..07a013c1d6 100644 --- a/src/python/py/models/builders/chatglm.py +++ b/src/python/py/models/builders/chatglm.py @@ -9,17 +9,19 @@ class ChatGLMModel(Model): def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): super().__init__(config, io_dtype, onnx_dtype, ep, cache_dir, extra_options) - self.rope_attrs["partial_rotary_factor"] = 0.5 # Line 755 of 
modeling_chatglm.py check self.rotary_pos_emb declaration + self.rope_attrs["partial_rotary_factor"] = ( + 0.5 # Line 755 of modeling_chatglm.py check self.rotary_pos_emb declaration + ) self.rope_attrs["num_heads"] = self.num_attn_heads self.rope_attrs["rotary_embedding_dim"] = int(self.head_size * self.rope_attrs["partial_rotary_factor"]) self.rope_attrs["interleaved"] = 1 def make_mlp(self, layer_id, mlp, root_input): - if not hasattr(mlp, 'down_proj'): + if not hasattr(mlp, "down_proj"): # Attribute does not exist for original PyTorch model only mlp.down_proj = mlp.dense_4h_to_h super().make_mlp(layer_id, mlp, root_input) def make_layer(self, layer_id, layer): - layer.self_attn = layer.self_attn if hasattr(layer, 'self_attn') else layer.self_attention + layer.self_attn = layer.self_attn if hasattr(layer, "self_attn") else layer.self_attention super().make_layer(layer_id, layer) diff --git a/src/python/py/models/builders/gemma.py b/src/python/py/models/builders/gemma.py index 12084ebca9..d230d81b49 100644 --- a/src/python/py/models/builders/gemma.py +++ b/src/python/py/models/builders/gemma.py @@ -14,6 +14,7 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): self.embed_attrs["scale"] = np.round(np.sqrt(self.hidden_size), decimals=2) self.layernorm_attrs["add_offset"] = 1 + class Gemma2Model(GemmaModel): def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): super().__init__(config, io_dtype, onnx_dtype, ep, cache_dir, extra_options) @@ -22,7 +23,7 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): self.layernorm_attrs["cast"]["skip_input"] = False self.layernorm_attrs["cast"]["output_0"] = True self.layernorm_attrs["cast"]["output_3"] = False - self.attention_attrs["scale"] = config.query_pre_attn_scalar ** -0.5 + self.attention_attrs["scale"] = config.query_pre_attn_scalar**-0.5 def is_local(self, layer_id): return layer_id % 2 == 1 @@ -41,7 +42,13 @@ def make_layer(self, 
layer_id, layer): # 1. Only cast root_input if the first layer of LayerNorms are being created original_cast_root_input = self.layernorm_attrs["cast"]["root_input"] self.layernorm_attrs["cast"]["root_input"] = self.layernorm_attrs["first_layernorm"] - self.make_layernorm(layer_id, layer.input_layernorm, skip=not self.layernorm_attrs["first_layernorm"], simple=self.layernorm_attrs["simple"], location="input") + self.make_layernorm( + layer_id, + layer.input_layernorm, + skip=not self.layernorm_attrs["first_layernorm"], + simple=self.layernorm_attrs["simple"], + location="input", + ) self.layernorm_attrs["cast"]["root_input"] = original_cast_root_input self.make_attention(layer_id, layer.self_attn, root_input=self.layernorm_attrs["output_0"]) @@ -54,7 +61,13 @@ def make_layer(self, layer_id, layer): original_cast_output_0 = self.layernorm_attrs["cast"]["output_0"] self.layernorm_attrs["root_input"] = self.layernorm_attrs["skip_input"] self.layernorm_attrs["cast"]["output_0"] = False - self.make_layernorm(layer_id, layer.post_attention_layernorm, skip=False, simple=self.layernorm_attrs["simple"], location="post_attention") + self.make_layernorm( + layer_id, + layer.post_attention_layernorm, + skip=False, + simple=self.layernorm_attrs["simple"], + location="post_attention", + ) self.layernorm_attrs["root_input"] = original_root_input self.layernorm_attrs["skip_input"] = self.layernorm_attrs["output_0"] self.layernorm_attrs["cast"]["output_0"] = original_cast_output_0 @@ -63,7 +76,13 @@ def make_layer(self, layer_id, layer): # 1. 
Only cast root_input if the first layer of LayerNorms are being created original_cast_root_input = self.layernorm_attrs["cast"]["root_input"] self.layernorm_attrs["cast"]["root_input"] = self.layernorm_attrs["first_layernorm"] - self.make_layernorm(layer_id, layer.pre_feedforward_layernorm, skip=True, simple=self.layernorm_attrs["simple"], location="pre_feedforward") + self.make_layernorm( + layer_id, + layer.pre_feedforward_layernorm, + skip=True, + simple=self.layernorm_attrs["simple"], + location="pre_feedforward", + ) self.layernorm_attrs["cast"]["root_input"] = original_cast_root_input self.make_mlp(layer_id, layer.mlp, root_input=self.layernorm_attrs["output_0"]) @@ -76,7 +95,13 @@ def make_layer(self, layer_id, layer): original_cast_output_0 = self.layernorm_attrs["cast"]["output_0"] self.layernorm_attrs["root_input"] = self.layernorm_attrs["skip_input"] self.layernorm_attrs["cast"]["output_0"] = False - self.make_layernorm(layer_id, layer.post_feedforward_layernorm, skip=False, simple=self.layernorm_attrs["simple"], location="post_feedforward") + self.make_layernorm( + layer_id, + layer.post_feedforward_layernorm, + skip=False, + simple=self.layernorm_attrs["simple"], + location="post_feedforward", + ) self.layernorm_attrs["root_input"] = original_root_input self.layernorm_attrs["skip_input"] = self.layernorm_attrs["output_0"] self.layernorm_attrs["cast"]["output_0"] = original_cast_output_0 @@ -88,20 +113,23 @@ def make_layer(self, layer_id, layer): def make_attention(self, layer_id, attention, root_input, **kwargs): original_window_size = self.window_size - self.window_size = original_window_size if self.is_local(layer_id) else -1 # default is -1 in GroupQueryAttention kernel + self.window_size = ( + original_window_size if self.is_local(layer_id) else -1 + ) # default is -1 in GroupQueryAttention kernel super().make_attention(layer_id, attention, root_input, **kwargs) self.window_size = original_window_size + class Gemma3Model(Gemma2Model): def 
__init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): super().__init__(config, io_dtype, onnx_dtype, ep, cache_dir, extra_options) self.rope_local_theta = config.rope_local_base_freq self.make_rotary_embedding_multi_cache() + def is_local(self, layer_id): return bool((layer_id + 1) % 6) - def make_attention_init(self): self.attention_attrs["q_norm"] = True self.attention_attrs["k_norm"] = True @@ -109,17 +137,24 @@ def make_attention_init(self): def make_rotary_embedding_multi_cache(self): self.cos_cache_global_name, self.sin_cache_global_name = "cos_cache_global", "sin_cache_global" - super().make_rotary_embedding_caches(cos_cache_name=self.cos_cache_global_name, sin_cache_name=self.sin_cache_global_name) + super().make_rotary_embedding_caches( + cos_cache_name=self.cos_cache_global_name, sin_cache_name=self.sin_cache_global_name + ) # Create the new cos/sin caches for local attention layers with its own theta value self.rope_attrs["create_caches"] = True self.rope_attrs["theta"] = self.rope_local_theta self.cos_cache_local_name, self.sin_cache_local_name = "cos_cache_local", "sin_cache_local" - super().make_rotary_embedding_caches(cos_cache_name=self.cos_cache_local_name, sin_cache_name=self.sin_cache_local_name) + super().make_rotary_embedding_caches( + cos_cache_name=self.cos_cache_local_name, sin_cache_name=self.sin_cache_local_name + ) def make_rotary_embedding_caches(self, **kwargs): - cos_cache_name = kwargs.get("cos_cache_name", self.cos_cache_global_name if self.window_size == -1 else self.cos_cache_local_name) - sin_cache_name = kwargs.get("sin_cache_name", self.sin_cache_global_name if self.window_size == -1 else self.sin_cache_local_name) + cos_cache_name = kwargs.get( + "cos_cache_name", self.cos_cache_global_name if self.window_size == -1 else self.cos_cache_local_name + ) + sin_cache_name = kwargs.get( + "sin_cache_name", self.sin_cache_global_name if self.window_size == -1 else self.sin_cache_local_name + ) return 
super().make_rotary_embedding_caches(cos_cache_name=cos_cache_name, sin_cache_name=sin_cache_name) - diff --git a/src/python/py/models/builders/gptoss.py b/src/python/py/models/builders/gptoss.py index 38f1ca0e46..71cf69b825 100644 --- a/src/python/py/models/builders/gptoss.py +++ b/src/python/py/models/builders/gptoss.py @@ -24,9 +24,21 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): def make_layer(self, layer_id, layer): # Each LLM decoder layer is typically defined as: # input_layernorm --> attention --> output_layernorm --> MoE - self.make_layernorm(layer_id, layer.input_layernorm, skip=not self.layernorm_attrs["first_layernorm"], simple=self.layernorm_attrs["simple"], location="input") + self.make_layernorm( + layer_id, + layer.input_layernorm, + skip=not self.layernorm_attrs["first_layernorm"], + simple=self.layernorm_attrs["simple"], + location="input", + ) self.make_attention(layer_id, layer.self_attn, root_input=self.layernorm_attrs["output_0"]) - self.make_layernorm(layer_id, layer.post_attention_layernorm, skip=True, simple=self.layernorm_attrs["simple"], location="post_attention") + self.make_layernorm( + layer_id, + layer.post_attention_layernorm, + skip=True, + simple=self.layernorm_attrs["simple"], + location="post_attention", + ) self.make_moe(layer_id, layer.mlp, root_input=self.layernorm_attrs["output_0"]) self.layernorm_attrs["first_layernorm"] = False @@ -51,7 +63,9 @@ def make_rotary_embedding_caches_from_scratch(self): def make_attention(self, layer_id, attention, root_input, **kwargs): original_window_size = self.window_size - self.window_size = original_window_size if self.is_local(layer_id) else -1 # default is -1 in GroupQueryAttention kernel + self.window_size = ( + original_window_size if self.is_local(layer_id) else -1 + ) # default is -1 in GroupQueryAttention kernel super().make_attention(layer_id, attention, root_input, **kwargs) self.window_size = original_window_size @@ -143,13 +157,31 @@ def 
make_moe_decomposed(self, layer_id, mlp, root_input): # Make root_input expansion nodes (root_input --> Unsqueeze --> Expand --> Unsqueeze) expand_root_input_unsqueeze_1_name = f"{basename}/expand_root_input/Unsqueeze_1" expand_root_input_unsqueeze_1_inputs = [root_input, "/model/constants/INT64/[2]"] - self.make_unsqueeze(expand_root_input_unsqueeze_1_name, expand_root_input_unsqueeze_1_inputs, dtype=self.io_dtype, shape=['batch_size', 'sequence_length', 1, self.hidden_size]) + self.make_unsqueeze( + expand_root_input_unsqueeze_1_name, + expand_root_input_unsqueeze_1_inputs, + dtype=self.io_dtype, + shape=["batch_size", "sequence_length", 1, self.hidden_size], + ) expand_name = f"{basename}/expand_root_input/Expand" - expand_inputs = [f"{expand_root_input_unsqueeze_1_name}/output_0", f"/model/constants/INT64/[1, 1, {self.moe_attrs['top_k']}, 1]"] - self.make_expand(expand_name, expand_inputs, dtype=self.io_dtype, shape=['batch_size', 'sequence_length', self.moe_attrs["top_k"], self.hidden_size]) + expand_inputs = [ + f"{expand_root_input_unsqueeze_1_name}/output_0", + f"/model/constants/INT64/[1, 1, {self.moe_attrs['top_k']}, 1]", + ] + self.make_expand( + expand_name, + expand_inputs, + dtype=self.io_dtype, + shape=["batch_size", "sequence_length", self.moe_attrs["top_k"], self.hidden_size], + ) expand_root_input_unsqueeze_2_name = f"{basename}/expand_root_input/Unsqueeze_2" expand_root_input_unsqueeze_2_inputs = [f"{expand_name}/output_0", "/model/constants/INT64/[-1]"] - self.make_unsqueeze(expand_root_input_unsqueeze_2_name, expand_root_input_unsqueeze_2_inputs, dtype=self.io_dtype, shape=['batch_size', 'sequence_length', self.moe_attrs["top_k"], self.hidden_size, 1]) + self.make_unsqueeze( + expand_root_input_unsqueeze_2_name, + expand_root_input_unsqueeze_2_inputs, + dtype=self.io_dtype, + shape=["batch_size", "sequence_length", self.moe_attrs["top_k"], self.hidden_size, 1], + ) # Make router nodes # +--> Gather --> (MLP1 weight) @@ -169,16 +201,35 @@ def 
make_moe_decomposed(self, layer_id, mlp, root_input): if use_cast: topk_fp32_name = f"{basename}/topk_fp32/Cast" - self.make_cast(topk_fp32_name, f"{router_add_name}/output_0", ir.DataType.FLOAT, shape=['batch_size', 'sequence_length', self.moe_attrs["num_experts"]]) + self.make_cast( + topk_fp32_name, + f"{router_add_name}/output_0", + ir.DataType.FLOAT, + shape=["batch_size", "sequence_length", self.moe_attrs["num_experts"]], + ) topk_name = f"{basename}/TopK" - topk_inputs = [f"{topk_fp32_name if use_cast else router_add_name}/output_0", f"/model/constants/INT64/[{self.moe_attrs['top_k']}]"] + topk_inputs = [ + f"{topk_fp32_name if use_cast else router_add_name}/output_0", + f"/model/constants/INT64/[{self.moe_attrs['top_k']}]", + ] topk_outputs = [f"{topk_name}/output_0", f"{topk_name}/output_1"] - self.make_node("TopK", inputs=topk_inputs, outputs=topk_outputs, name=topk_name, axis=-1, largest=True, sorted=True) - self.make_value(topk_outputs[0], ir.DataType.FLOAT, shape=['batch_size', 'sequence_length', self.moe_attrs["top_k"]]) - self.make_value(topk_outputs[1], ir.DataType.INT64, shape=['batch_size', 'sequence_length', self.moe_attrs["top_k"]]) + self.make_node( + "TopK", inputs=topk_inputs, outputs=topk_outputs, name=topk_name, axis=-1, largest=True, sorted=True + ) + self.make_value( + topk_outputs[0], ir.DataType.FLOAT, shape=["batch_size", "sequence_length", self.moe_attrs["top_k"]] + ) + self.make_value( + topk_outputs[1], ir.DataType.INT64, shape=["batch_size", "sequence_length", self.moe_attrs["top_k"]] + ) if use_cast: topk_io_name = f"{basename}/topk_io/Cast" - self.make_cast(topk_io_name, topk_outputs[0], self.io_dtype, shape=['batch_size', 'sequence_length', self.moe_attrs["top_k"]]) + self.make_cast( + topk_io_name, + topk_outputs[0], + self.io_dtype, + shape=["batch_size", "sequence_length", self.moe_attrs["top_k"]], + ) # Save initializers to use with Gather nodes gate_up_proj_weight = f"model.layers.{layer_id}.moe.experts.gate_up_proj.weight" 
@@ -193,43 +244,120 @@ def make_moe_decomposed(self, layer_id, mlp, root_input): # Make Gather nodes + Unsqueeze nodes for biases mlp1_weight_gather_name = f"{basename}/mlp1/weight/Gather" mlp1_weight_gather_inputs = [gate_up_proj_weight, f"{topk_name}/output_1"] - self.make_gather(mlp1_weight_gather_name, mlp1_weight_gather_inputs, dtype=self.io_dtype, shape=['batch_size', 'sequence_length', self.moe_attrs["top_k"], 2 * self.intermediate_size, self.hidden_size], axis=0) + self.make_gather( + mlp1_weight_gather_name, + mlp1_weight_gather_inputs, + dtype=self.io_dtype, + shape=[ + "batch_size", + "sequence_length", + self.moe_attrs["top_k"], + 2 * self.intermediate_size, + self.hidden_size, + ], + axis=0, + ) mlp1_bias_gather_name = f"{basename}/mlp1/bias/Gather" mlp1_bias_gather_inputs = [gate_up_proj_bias, f"{topk_name}/output_1"] - self.make_gather(mlp1_bias_gather_name, mlp1_bias_gather_inputs, dtype=self.io_dtype, shape=['batch_size', 'sequence_length', self.moe_attrs["top_k"], 2 * self.intermediate_size], axis=0) + self.make_gather( + mlp1_bias_gather_name, + mlp1_bias_gather_inputs, + dtype=self.io_dtype, + shape=["batch_size", "sequence_length", self.moe_attrs["top_k"], 2 * self.intermediate_size], + axis=0, + ) mlp1_bias_unsqueeze_name = f"{basename}/mlp1/bias/Unsqueeze" mlp1_bias_unsqueeze_inputs = [f"{mlp1_bias_gather_name}/output_0", "/model/constants/INT64/[-1]"] - self.make_unsqueeze(mlp1_bias_unsqueeze_name, mlp1_bias_unsqueeze_inputs, dtype=self.io_dtype, shape=['batch_size', 'sequence_length', self.moe_attrs["top_k"], 2 * self.intermediate_size, 1]) + self.make_unsqueeze( + mlp1_bias_unsqueeze_name, + mlp1_bias_unsqueeze_inputs, + dtype=self.io_dtype, + shape=["batch_size", "sequence_length", self.moe_attrs["top_k"], 2 * self.intermediate_size, 1], + ) mlp2_weight_gather_name = f"{basename}/mlp2/weight/Gather" mlp2_weight_gather_inputs = [down_proj_weight, f"{topk_name}/output_1"] - self.make_gather(mlp2_weight_gather_name, 
mlp2_weight_gather_inputs, dtype=self.io_dtype, shape=['batch_size', 'sequence_length', self.moe_attrs["top_k"], self.hidden_size, self.intermediate_size], axis=0) + self.make_gather( + mlp2_weight_gather_name, + mlp2_weight_gather_inputs, + dtype=self.io_dtype, + shape=["batch_size", "sequence_length", self.moe_attrs["top_k"], self.hidden_size, self.intermediate_size], + axis=0, + ) mlp2_bias_gather_name = f"{basename}/mlp2/bias/Gather" mlp2_bias_gather_inputs = [down_proj_bias, f"{topk_name}/output_1"] - self.make_gather(mlp2_bias_gather_name, mlp2_bias_gather_inputs, dtype=self.io_dtype, shape=['batch_size', 'sequence_length', self.moe_attrs["top_k"], self.hidden_size], axis=0) + self.make_gather( + mlp2_bias_gather_name, + mlp2_bias_gather_inputs, + dtype=self.io_dtype, + shape=["batch_size", "sequence_length", self.moe_attrs["top_k"], self.hidden_size], + axis=0, + ) mlp2_bias_unsqueeze_name = f"{basename}/mlp2/bias/Unsqueeze" mlp2_bias_unsqueeze_inputs = [f"{mlp2_bias_gather_name}/output_0", "/model/constants/INT64/[-1]"] - self.make_unsqueeze(mlp2_bias_unsqueeze_name, mlp2_bias_unsqueeze_inputs, dtype=self.io_dtype, shape=['batch_size', 'sequence_length', self.moe_attrs["top_k"], self.hidden_size, 1]) + self.make_unsqueeze( + mlp2_bias_unsqueeze_name, + mlp2_bias_unsqueeze_inputs, + dtype=self.io_dtype, + shape=["batch_size", "sequence_length", self.moe_attrs["top_k"], self.hidden_size, 1], + ) # Make expert_weights path (Softmax --> Unsqueeze --> Unsqueeze --> Cast) softmax_name = f"{basename}/expert_weights/Softmax" - self.make_softmax(softmax_name, f"{topk_io_name if use_cast else topk_name}/output_0", self.io_dtype, ['batch_size', 'sequence_length', 'num_experts_per_token']) + self.make_softmax( + softmax_name, + f"{topk_io_name if use_cast else topk_name}/output_0", + self.io_dtype, + ["batch_size", "sequence_length", "num_experts_per_token"], + ) expert_weights_unsqueeze_1_name = f"{basename}/expert_weights/Unsqueeze_1" 
expert_weights_unsqueeze_1_inputs = [f"{softmax_name}/output_0", "/model/constants/INT64/[-1]"] - self.make_unsqueeze(expert_weights_unsqueeze_1_name, expert_weights_unsqueeze_1_inputs, dtype=self.io_dtype, shape=['batch_size', 'sequence_length', 'num_experts_per_token', 1]) + self.make_unsqueeze( + expert_weights_unsqueeze_1_name, + expert_weights_unsqueeze_1_inputs, + dtype=self.io_dtype, + shape=["batch_size", "sequence_length", "num_experts_per_token", 1], + ) expert_weights_unsqueeze_2_name = f"{basename}/expert_weights/Unsqueeze_2" - expert_weights_unsqueeze_2_inputs = [f"{expert_weights_unsqueeze_1_name}/output_0", "/model/constants/INT64/[-1]"] - self.make_unsqueeze(expert_weights_unsqueeze_2_name, expert_weights_unsqueeze_2_inputs, dtype=self.io_dtype, shape=['batch_size', 'sequence_length', 'num_experts_per_token', 1, 1]) + expert_weights_unsqueeze_2_inputs = [ + f"{expert_weights_unsqueeze_1_name}/output_0", + "/model/constants/INT64/[-1]", + ] + self.make_unsqueeze( + expert_weights_unsqueeze_2_name, + expert_weights_unsqueeze_2_inputs, + dtype=self.io_dtype, + shape=["batch_size", "sequence_length", "num_experts_per_token", 1, 1], + ) if use_cast: expert_weights_cast_name = f"{basename}/expert_weights/Cast" - self.make_cast(expert_weights_cast_name, f"{expert_weights_unsqueeze_2_name}/output_0", ir.DataType.FLOAT, shape=['batch_size', 'sequence_length', 'num_experts_per_token', 1, 1]) + self.make_cast( + expert_weights_cast_name, + f"{expert_weights_unsqueeze_2_name}/output_0", + ir.DataType.FLOAT, + shape=["batch_size", "sequence_length", "num_experts_per_token", 1, 1], + ) # Make Gate/Up proj nodes (MatMul --> Add) gate_up_proj_weight_name = f"{basename}/gate_up_proj/MatMul" gate_up_proj_weight_output = f"{gate_up_proj_weight_name}/output_0" - self.make_node("MatMul", inputs=[f"{mlp1_weight_gather_name}/output_0", f"{expand_root_input_unsqueeze_2_name}/output_0"], outputs=[gate_up_proj_weight_output], name=gate_up_proj_weight_name) - 
self.make_value(gate_up_proj_weight_output, self.io_dtype, shape=['batch_size', 'sequence_length', self.moe_attrs["top_k"], 2 * self.intermediate_size, 1]) + self.make_node( + "MatMul", + inputs=[f"{mlp1_weight_gather_name}/output_0", f"{expand_root_input_unsqueeze_2_name}/output_0"], + outputs=[gate_up_proj_weight_output], + name=gate_up_proj_weight_name, + ) + self.make_value( + gate_up_proj_weight_output, + self.io_dtype, + shape=["batch_size", "sequence_length", self.moe_attrs["top_k"], 2 * self.intermediate_size, 1], + ) gate_up_proj_bias_name = f"{basename}/gate_up_proj/Add" - self.make_add(gate_up_proj_bias_name, [gate_up_proj_weight_output, f"{mlp1_bias_unsqueeze_name}/output_0"], dtype=self.io_dtype, shape=['batch_size', 'sequence_length', self.moe_attrs["top_k"], 2 * self.intermediate_size, 1]) + self.make_add( + gate_up_proj_bias_name, + [gate_up_proj_weight_output, f"{mlp1_bias_unsqueeze_name}/output_0"], + dtype=self.io_dtype, + shape=["batch_size", "sequence_length", self.moe_attrs["top_k"], 2 * self.intermediate_size, 1], + ) # Make activation nodes # @@ -241,43 +369,130 @@ def make_moe_decomposed(self, layer_id, mlp, root_input): # | | # +---> Slice --> Clamp --> Add ------------------+ glu_slice_name = f"{basename}/act_fn/Slice_1" - glu_slice_inputs = [f"{gate_up_proj_bias_name}/output_0", "/model/constants/INT64/[0]", f"/model/constants/INT64/[{torch.iinfo(torch.int64).max}]", "/model/constants/INT64/[3]", "/model/constants/INT64/[2]"] - self.make_slice(glu_slice_name, glu_slice_inputs, dtype=self.io_dtype, shape=['batch_size', 'sequence_length', self.moe_attrs["top_k"], self.intermediate_size, 1]) + glu_slice_inputs = [ + f"{gate_up_proj_bias_name}/output_0", + "/model/constants/INT64/[0]", + f"/model/constants/INT64/[{torch.iinfo(torch.int64).max}]", + "/model/constants/INT64/[3]", + "/model/constants/INT64/[2]", + ] + self.make_slice( + glu_slice_name, + glu_slice_inputs, + dtype=self.io_dtype, + shape=["batch_size", "sequence_length", 
self.moe_attrs["top_k"], self.intermediate_size, 1], + ) glu_clip_name = f"{basename}/act_fn/Clip_1" - glu_clip_inputs = [f"{glu_slice_name}/output_0", "", f"/model/constants/{self.to_str_dtype(self.io_dtype)}/{self.moe_attrs['swiglu_limit']}"] - self.make_clip(glu_clip_name, glu_clip_inputs, dtype=self.io_dtype, shape=['batch_size', 'sequence_length', self.moe_attrs["top_k"], self.intermediate_size, 1]) + glu_clip_inputs = [ + f"{glu_slice_name}/output_0", + "", + f"/model/constants/{self.to_str_dtype(self.io_dtype)}/{self.moe_attrs['swiglu_limit']}", + ] + self.make_clip( + glu_clip_name, + glu_clip_inputs, + dtype=self.io_dtype, + shape=["batch_size", "sequence_length", self.moe_attrs["top_k"], self.intermediate_size, 1], + ) linear_slice_name = f"{basename}/act_fn/Slice_2" - linear_slice_inputs = [f"{gate_up_proj_bias_name}/output_0", "/model/constants/INT64/[1]", f"/model/constants/INT64/[{torch.iinfo(torch.int64).max}]", "/model/constants/INT64/[3]", "/model/constants/INT64/[2]"] - self.make_slice(linear_slice_name, linear_slice_inputs, dtype=self.io_dtype, shape=['batch_size', 'sequence_length', self.moe_attrs["top_k"], self.intermediate_size, 1]) + linear_slice_inputs = [ + f"{gate_up_proj_bias_name}/output_0", + "/model/constants/INT64/[1]", + f"/model/constants/INT64/[{torch.iinfo(torch.int64).max}]", + "/model/constants/INT64/[3]", + "/model/constants/INT64/[2]", + ] + self.make_slice( + linear_slice_name, + linear_slice_inputs, + dtype=self.io_dtype, + shape=["batch_size", "sequence_length", self.moe_attrs["top_k"], self.intermediate_size, 1], + ) linear_clip_name = f"{basename}/act_fn/Clip_2" - linear_clip_inputs = [f"{linear_slice_name}/output_0", f"/model/constants/{self.to_str_dtype(self.io_dtype)}/{-self.moe_attrs['swiglu_limit']}", f"/model/constants/{self.to_str_dtype(self.io_dtype)}/{self.moe_attrs['swiglu_limit']}"] - self.make_clip(linear_clip_name, linear_clip_inputs, dtype=self.io_dtype, shape=['batch_size', 'sequence_length', 
self.moe_attrs["top_k"], self.intermediate_size, 1]) + linear_clip_inputs = [ + f"{linear_slice_name}/output_0", + f"/model/constants/{self.to_str_dtype(self.io_dtype)}/{-self.moe_attrs['swiglu_limit']}", + f"/model/constants/{self.to_str_dtype(self.io_dtype)}/{self.moe_attrs['swiglu_limit']}", + ] + self.make_clip( + linear_clip_name, + linear_clip_inputs, + dtype=self.io_dtype, + shape=["batch_size", "sequence_length", self.moe_attrs["top_k"], self.intermediate_size, 1], + ) # Make Mul node after activation act_fn_mul_1_name = f"{basename}/act_fn/Mul_1" - act_fn_mul_1_inputs = [f"{glu_clip_name}/output_0", f"/model/constants/{self.to_str_dtype(self.io_dtype)}/1.703125"] - self.make_mul(act_fn_mul_1_name, act_fn_mul_1_inputs, dtype=self.io_dtype, shape=["batch_size", "sequence_length", self.moe_attrs["top_k"], self.intermediate_size, 1]) + act_fn_mul_1_inputs = [ + f"{glu_clip_name}/output_0", + f"/model/constants/{self.to_str_dtype(self.io_dtype)}/1.703125", + ] + self.make_mul( + act_fn_mul_1_name, + act_fn_mul_1_inputs, + dtype=self.io_dtype, + shape=["batch_size", "sequence_length", self.moe_attrs["top_k"], self.intermediate_size, 1], + ) sigmoid_name = f"{basename}/act_fn/Sigmoid" - self.make_sigmoid(sigmoid_name, f"{act_fn_mul_1_name}/output_0", dtype=self.io_dtype, shape=["batch_size", "sequence_length", self.moe_attrs["top_k"], self.intermediate_size, 1]) + self.make_sigmoid( + sigmoid_name, + f"{act_fn_mul_1_name}/output_0", + dtype=self.io_dtype, + shape=["batch_size", "sequence_length", self.moe_attrs["top_k"], self.intermediate_size, 1], + ) act_fn_mul_2_name = f"{basename}/act_fn/Mul_2" act_fn_mul_2_inputs = [f"{glu_clip_name}/output_0", f"{sigmoid_name}/output_0"] - self.make_mul(act_fn_mul_2_name, act_fn_mul_2_inputs, dtype=self.io_dtype, shape=["batch_size", "sequence_length", self.moe_attrs["top_k"], self.intermediate_size, 1]) + self.make_mul( + act_fn_mul_2_name, + act_fn_mul_2_inputs, + dtype=self.io_dtype, + shape=["batch_size", 
"sequence_length", self.moe_attrs["top_k"], self.intermediate_size, 1], + ) act_fn_add_name = f"{basename}/act_fn/Add" - self.make_add(act_fn_add_name, [f"{linear_clip_name}/output_0", f"/model/constants/{self.to_str_dtype(self.io_dtype)}/1"], dtype=self.io_dtype, shape=['batch_size', 'sequence_length', self.moe_attrs["top_k"], self.intermediate_size, 1]) + self.make_add( + act_fn_add_name, + [f"{linear_clip_name}/output_0", f"/model/constants/{self.to_str_dtype(self.io_dtype)}/1"], + dtype=self.io_dtype, + shape=["batch_size", "sequence_length", self.moe_attrs["top_k"], self.intermediate_size, 1], + ) act_fn_mul_3_name = f"{basename}/act_fn/Mul_3" act_fn_mul_3_inputs = [f"{act_fn_mul_2_name}/output_0", f"{act_fn_add_name}/output_0"] - self.make_mul(act_fn_mul_3_name, act_fn_mul_3_inputs, dtype=self.io_dtype, shape=["batch_size", "sequence_length", self.moe_attrs["top_k"], self.intermediate_size, 1]) + self.make_mul( + act_fn_mul_3_name, + act_fn_mul_3_inputs, + dtype=self.io_dtype, + shape=["batch_size", "sequence_length", self.moe_attrs["top_k"], self.intermediate_size, 1], + ) # Make Down proj nodes (MatMul --> Add --> Cast) down_proj_weight_name = f"{basename}/down_proj/MatMul" down_proj_weight_output = f"{down_proj_weight_name}/output_0" - self.make_node("MatMul", inputs=[f"{mlp2_weight_gather_name}/output_0", f"{act_fn_mul_3_name}/output_0"], outputs=[down_proj_weight_output], name=down_proj_weight_name) - self.make_value(down_proj_weight_output, self.io_dtype, shape=['batch_size', 'sequence_length', self.moe_attrs["top_k"], self.intermediate_size, 1]) + self.make_node( + "MatMul", + inputs=[f"{mlp2_weight_gather_name}/output_0", f"{act_fn_mul_3_name}/output_0"], + outputs=[down_proj_weight_output], + name=down_proj_weight_name, + ) + self.make_value( + down_proj_weight_output, + self.io_dtype, + shape=["batch_size", "sequence_length", self.moe_attrs["top_k"], self.intermediate_size, 1], + ) down_proj_bias_name = f"{basename}/down_proj/Add" - 
self.make_add(down_proj_bias_name, [down_proj_weight_output, f"{mlp2_bias_unsqueeze_name}/output_0"], dtype=self.io_dtype, shape=['batch_size', 'sequence_length', self.moe_attrs["top_k"], self.intermediate_size, 1]) + self.make_add( + down_proj_bias_name, + [down_proj_weight_output, f"{mlp2_bias_unsqueeze_name}/output_0"], + dtype=self.io_dtype, + shape=["batch_size", "sequence_length", self.moe_attrs["top_k"], self.intermediate_size, 1], + ) if use_cast: down_proj_cast_name = f"{basename}/down_proj/Cast" - self.make_cast(down_proj_cast_name, f"{down_proj_bias_name}/output_0", ir.DataType.FLOAT, shape=['batch_size', 'sequence_length', self.moe_attrs["top_k"], self.intermediate_size, 1]) + self.make_cast( + down_proj_cast_name, + f"{down_proj_bias_name}/output_0", + ir.DataType.FLOAT, + shape=["batch_size", "sequence_length", self.moe_attrs["top_k"], self.intermediate_size, 1], + ) # Make weighted sum nodes # @@ -287,14 +502,33 @@ def make_moe_decomposed(self, layer_id, mlp, root_input): # / # Cast (from expert weights) --> weighted_sum_mul_name = f"{basename}/weighted_sum/Mul" - weighted_sum_mul_inputs = [f"{down_proj_cast_name if use_cast else down_proj_bias_name}/output_0", f"{expert_weights_cast_name if use_cast else expert_weights_unsqueeze_2_name}/output_0"] - self.make_mul(weighted_sum_mul_name, weighted_sum_mul_inputs, dtype=ir.DataType.FLOAT, shape=['batch_size', 'sequence_length', self.moe_attrs["top_k"], self.intermediate_size, 1]) + weighted_sum_mul_inputs = [ + f"{down_proj_cast_name if use_cast else down_proj_bias_name}/output_0", + f"{expert_weights_cast_name if use_cast else expert_weights_unsqueeze_2_name}/output_0", + ] + self.make_mul( + weighted_sum_mul_name, + weighted_sum_mul_inputs, + dtype=ir.DataType.FLOAT, + shape=["batch_size", "sequence_length", self.moe_attrs["top_k"], self.intermediate_size, 1], + ) reduce_sum_name = f"{basename}/weighted_sum/ReduceSum" reduce_sum_inputs = [f"{weighted_sum_mul_name}/output_0", 
"/model/constants/INT64/[2]"] - self.make_reduce_sum(reduce_sum_name, reduce_sum_inputs, dtype=ir.DataType.FLOAT, shape=['batch_size', 'sequence_length', self.intermediate_size, 1], keepdims=False) + self.make_reduce_sum( + reduce_sum_name, + reduce_sum_inputs, + dtype=ir.DataType.FLOAT, + shape=["batch_size", "sequence_length", self.intermediate_size, 1], + keepdims=False, + ) weighted_sum_squeeze_name = f"{basename}/weighted_sum/Squeeze" weighted_sum_squeeze_inputs = [f"{reduce_sum_name}/output_0", "/model/constants/INT64/[-1]"] - self.make_squeeze(weighted_sum_squeeze_name, weighted_sum_squeeze_inputs, dtype=ir.DataType.FLOAT, shape=['batch_size', 'sequence_length', self.intermediate_size]) + self.make_squeeze( + weighted_sum_squeeze_name, + weighted_sum_squeeze_inputs, + dtype=ir.DataType.FLOAT, + shape=["batch_size", "sequence_length", self.intermediate_size], + ) # Assign output 0 of previous MoE as root input to next SkipLayerNorm self.layernorm_attrs["skip_input"] = f"{weighted_sum_squeeze_name}/output_0" @@ -325,8 +559,16 @@ def make_moe_fused(self, layer_id, mlp, root_input): router_add_name = f"{basename}/router/Add" self.make_add_bias(mlp.router.bias, router_add_name, root_input=f"{router_matmul_name}/output_0") router_reshape_name = f"{basename}/router/Reshape" - router_reshape_inputs = [f"{router_add_name}/output_0", f"/model/constants/INT64/{[-1, self.moe_attrs['num_experts']]}"] - self.make_reshape(router_reshape_name, router_reshape_inputs, dtype=self.io_dtype, shape=['batch_size * sequence_length', self.moe_attrs['num_experts']]) + router_reshape_inputs = [ + f"{router_add_name}/output_0", + f"/model/constants/INT64/{[-1, self.moe_attrs['num_experts']]}", + ] + self.make_reshape( + router_reshape_name, + router_reshape_inputs, + dtype=self.io_dtype, + shape=["batch_size * sequence_length", self.moe_attrs["num_experts"]], + ) gate_up_proj_weight = f"model.layers.{layer_id}.moe.experts.gate_up_proj.{moe_weight_type}" gate_up_proj_scales = 
f"model.layers.{layer_id}.moe.experts.gate_up_proj.scales" @@ -341,8 +583,16 @@ def make_moe_fused(self, layer_id, mlp, root_input): if op_type == "MoE": # Save non-quantized MoE weights as initializers - self.make_initializer(gate_up_proj_transposed.view(self.moe_attrs["num_experts"], -1, self.hidden_size), gate_up_proj_weight, to=self.io_dtype) - self.make_initializer(down_proj_transposed.view(self.moe_attrs["num_experts"], self.hidden_size, self.intermediate_size), down_proj_weight, to=self.io_dtype) + self.make_initializer( + gate_up_proj_transposed.view(self.moe_attrs["num_experts"], -1, self.hidden_size), + gate_up_proj_weight, + to=self.io_dtype, + ) + self.make_initializer( + down_proj_transposed.view(self.moe_attrs["num_experts"], self.hidden_size, self.intermediate_size), + down_proj_weight, + to=self.io_dtype, + ) else: # Create and save quantized MoE weights as initializers gate_up_proj_qweight_list, gate_up_proj_scales_list = [], [] @@ -363,8 +613,16 @@ def make_moe_fused(self, layer_id, mlp, root_input): # qweight tensors always use the same shape regardless of quantization method pack_size = 8 // self.moe_attrs["expert_weight_bits"] - self.make_initializer(gate_up_proj_qweight_tensor.view(self.moe_attrs["num_experts"], -1, self.hidden_size // pack_size), gate_up_proj_weight) - self.make_initializer(down_proj_qweight_tensor.view(self.moe_attrs["num_experts"], self.hidden_size, self.intermediate_size // pack_size), down_proj_weight) + self.make_initializer( + gate_up_proj_qweight_tensor.view(self.moe_attrs["num_experts"], -1, self.hidden_size // pack_size), + gate_up_proj_weight, + ) + self.make_initializer( + down_proj_qweight_tensor.view( + self.moe_attrs["num_experts"], self.hidden_size, self.intermediate_size // pack_size + ), + down_proj_weight, + ) # scales tensors have different shapes depending on quantization method self.make_initializer(gate_up_proj_scales_tensor, gate_up_proj_scales, to=self.io_dtype) @@ -376,9 +634,15 @@ def 
make_moe_fused(self, layer_id, mlp, root_input): moe_name = f"{basename}/{op_type}" self.make_moe_op( - moe_name, root_input=root_input, router_probs=f"{router_reshape_name}/output_0", - weight1=gate_up_proj_weight, scales1=gate_up_proj_scales, bias1=gate_up_proj_bias, - weight2=down_proj_weight, scales2=down_proj_scales, bias2=down_proj_bias, + moe_name, + root_input=root_input, + router_probs=f"{router_reshape_name}/output_0", + weight1=gate_up_proj_weight, + scales1=gate_up_proj_scales, + bias1=gate_up_proj_bias, + weight2=down_proj_weight, + scales2=down_proj_scales, + bias2=down_proj_bias, ) # Assign output 0 of previous MoE as root input to next SkipLayerNorm diff --git a/src/python/py/models/builders/granite.py b/src/python/py/models/builders/granite.py index 39dd0c93a0..3b2cb16b9d 100644 --- a/src/python/py/models/builders/granite.py +++ b/src/python/py/models/builders/granite.py @@ -17,21 +17,49 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): def make_layer(self, layer_id, layer): # Each Granite decoder layer is defined as: # input_layernorm --> attention --> Mul --> output_layernorm --> MLP --> Mul - self.make_layernorm(layer_id, layer.input_layernorm, skip=not self.layernorm_attrs["first_layernorm"], simple=self.layernorm_attrs["simple"], location="input") + self.make_layernorm( + layer_id, + layer.input_layernorm, + skip=not self.layernorm_attrs["first_layernorm"], + simple=self.layernorm_attrs["simple"], + location="input", + ) self.make_attention(layer_id, layer.self_attn, root_input=self.layernorm_attrs["output_0"]) residual_mul_1_name = f"/model/layers.{layer_id}/residual_mul/Mul_1" - residual_mul_1_inputs = [self.layernorm_attrs["skip_input"], f"/model/constants/{self.to_str_dtype(self.io_dtype)}/{self.residual_scale}"] - self.make_mul(residual_mul_1_name, residual_mul_1_inputs, dtype=self.io_dtype, shape=['batch_size', 'sequence_length', self.hidden_size]) + residual_mul_1_inputs = [ + 
self.layernorm_attrs["skip_input"], + f"/model/constants/{self.to_str_dtype(self.io_dtype)}/{self.residual_scale}", + ] + self.make_mul( + residual_mul_1_name, + residual_mul_1_inputs, + dtype=self.io_dtype, + shape=["batch_size", "sequence_length", self.hidden_size], + ) # Assign output 0 of previous output node as skip input to next SkipLayerNorm self.layernorm_attrs["skip_input"] = f"{residual_mul_1_name}/output_0" - self.make_layernorm(layer_id, layer.post_attention_layernorm, skip=True, simple=self.layernorm_attrs["simple"], location="post_attention") + self.make_layernorm( + layer_id, + layer.post_attention_layernorm, + skip=True, + simple=self.layernorm_attrs["simple"], + location="post_attention", + ) self.make_mlp(layer_id, layer.mlp, root_input=self.layernorm_attrs["output_0"]) residual_mul_2_name = f"/model/layers.{layer_id}/residual_mul/Mul_2" - residual_mul_2_inputs = [self.layernorm_attrs["skip_input"], f"/model/constants/{self.to_str_dtype(self.io_dtype)}/{self.residual_scale}"] - self.make_mul(residual_mul_2_name, residual_mul_2_inputs, dtype=self.io_dtype, shape=['batch_size', 'sequence_length', self.hidden_size]) + residual_mul_2_inputs = [ + self.layernorm_attrs["skip_input"], + f"/model/constants/{self.to_str_dtype(self.io_dtype)}/{self.residual_scale}", + ] + self.make_mul( + residual_mul_2_name, + residual_mul_2_inputs, + dtype=self.io_dtype, + shape=["batch_size", "sequence_length", self.hidden_size], + ) # Assign output 0 of previous output node as skip input to next SkipLayerNorm self.layernorm_attrs["skip_input"] = f"{residual_mul_2_name}/output_0" diff --git a/src/python/py/models/builders/phi.py b/src/python/py/models/builders/phi.py index c8be0fe12a..9aa309441d 100644 --- a/src/python/py/models/builders/phi.py +++ b/src/python/py/models/builders/phi.py @@ -19,13 +19,24 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): def make_layer(self, layer_id, layer): # Each Phi decoder layer is defined as: # 
input_layernorm --> attention --> MLP --> residual_add - self.make_layernorm(layer_id, layer.input_layernorm, skip=not self.layernorm_attrs["first_layernorm"], simple=self.layernorm_attrs["simple"], location="input") + self.make_layernorm( + layer_id, + layer.input_layernorm, + skip=not self.layernorm_attrs["first_layernorm"], + simple=self.layernorm_attrs["simple"], + location="input", + ) self.make_attention(layer_id, layer.self_attn, root_input=self.layernorm_attrs["output_0"]) self.make_mlp(layer_id, layer.mlp, root_input=self.layernorm_attrs["output_0"]) residual_add_name = f"/model/layers.{layer_id}/residual_add/Add" - residual_add_inputs = [self.layernorm_attrs['skip_input'], self.mlp_attrs["output_0"]] - self.make_add(residual_add_name, residual_add_inputs, dtype=self.io_dtype, shape=['batch_size', 'sequence_length', self.hidden_size]) + residual_add_inputs = [self.layernorm_attrs["skip_input"], self.mlp_attrs["output_0"]] + self.make_add( + residual_add_name, + residual_add_inputs, + dtype=self.io_dtype, + shape=["batch_size", "sequence_length", self.hidden_size], + ) self.layernorm_attrs["first_layernorm"] = False if layer_id == self.num_layers - 1: @@ -35,6 +46,7 @@ def make_layer(self, layer_id, layer): # Assign output 0 of residual Add as skip input to next SkipLayerNorm self.layernorm_attrs["skip_input"] = f"{residual_add_name}/output_0" + class Phi3MiniModel(MistralModel): def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): super().__init__(config, io_dtype, onnx_dtype, ep, cache_dir, extra_options) @@ -48,7 +60,9 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): # Set position_ids_name based on whether position_ids is available as an input if "position_ids" in self.input_names: position_ids_result = self.make_position_ids_reformatting() - self.position_ids_name = f"{position_ids_result}/output_0" if position_ids_result != "position_ids" else "position_ids" + self.position_ids_name = ( + 
f"{position_ids_result}/output_0" if position_ids_result != "position_ids" else "position_ids" + ) else: # When position_ids is not an input (use_rope_in_attn is True), # position_ids won't be used since rotary embeddings are handled in GQA @@ -92,14 +106,19 @@ def make_position_ids_reformatting(self): input_tensor = "position_ids" if is_webgpu: cast_input_name = f"{basename}/Cast_input" - self.make_cast(cast_input_name, input_tensor, dtype=ir.DataType.INT32, shape=["batch_size", "sequence_length"]) + self.make_cast( + cast_input_name, input_tensor, dtype=ir.DataType.INT32, shape=["batch_size", "sequence_length"] + ) input_tensor = f"{cast_input_name}/output_0" reduce_max_name = f"{basename}/ReduceMax" reduce_max_inputs = [input_tensor] self.make_reduce_max(reduce_max_name, reduce_max_inputs, dtype=compute_dtype, shape=[1]) greater_or_equal_name = f"{basename}/GreaterOrEqual" - greater_or_equal_inputs = [f"{reduce_max_name}/output_0", f"/model/constants/{compute_str_dtype}/{self.original_context_length}"] + greater_or_equal_inputs = [ + f"{reduce_max_name}/output_0", + f"/model/constants/{compute_str_dtype}/{self.original_context_length}", + ] self.make_greater_or_equal(greater_or_equal_name, greater_or_equal_inputs, shape=[]) cast_name = f"{basename}/Cast" self.make_cast(cast_name, f"{greater_or_equal_name}/output_0", dtype=compute_dtype, shape=None) @@ -114,7 +133,12 @@ def make_position_ids_reformatting(self): result_name = add_1_name if is_webgpu: cast_output_name = f"{basename}/Cast_output" - self.make_cast(cast_output_name, f"{add_1_name}/output_0", dtype=ir.DataType.INT64, shape=["batch_size", "sequence_length"]) + self.make_cast( + cast_output_name, + f"{add_1_name}/output_0", + dtype=ir.DataType.INT64, + shape=["batch_size", "sequence_length"], + ) result_name = cast_output_name return result_name @@ -125,6 +149,7 @@ def make_attention(self, layer_id, attention, root_input, **kwargs): else: super().make_attention(layer_id, attention, root_input, **kwargs) 
+ class Phi3SmallModel(Model): def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): super().__init__(config, io_dtype, onnx_dtype, ep, cache_dir, extra_options) @@ -158,7 +183,7 @@ def calculate_block_mask(self): q_pos = torch.arange(N_BLOCK)[:, None] k_pos = torch.arange(N_BLOCK)[None] mask_vert_strided = (torch.arange(N_BLOCK) + 1) % vert_stride == 0 - block_mask_dense = ((q_pos >= k_pos) & ((q_pos - k_pos < local_blocks) | mask_vert_strided)) + block_mask_dense = (q_pos >= k_pos) & ((q_pos - k_pos < local_blocks) | mask_vert_strided) N_BLOCK_Q = self.calculate_cdiv(q_len, BLOCK) block_mask_dense_output = block_mask_dense[-N_BLOCK_Q:].to_sparse_csr() @@ -171,9 +196,11 @@ def calculate_block_mask(self): q_pos = torch.arange(N_BLOCK)[None, :, None] k_pos = torch.arange(N_BLOCK)[None, None] head_sliding_step = max(1, int(vert_stride / n_heads)) # if vert_stride <= n_heads, rotating the heads - mask_vert_strided = [(torch.arange(N_BLOCK) + h * head_sliding_step + 1) % vert_stride == 0 for h in range(n_heads)] + mask_vert_strided = [ + (torch.arange(N_BLOCK) + h * head_sliding_step + 1) % vert_stride == 0 for h in range(n_heads) + ] mask_vert_strided = torch.vstack(mask_vert_strided).unsqueeze(1) - block_mask_dense = ((q_pos >= k_pos) & ((q_pos - k_pos < local_blocks) | mask_vert_strided)) + block_mask_dense = (q_pos >= k_pos) & ((q_pos - k_pos < local_blocks) | mask_vert_strided) N_BLOCK_Q = self.calculate_cdiv(q_len, BLOCK) block_mask_dense_output = block_mask_dense[:, -N_BLOCK_Q:] @@ -214,20 +241,42 @@ def make_attention(self, layer_id, attention, root_input, **kwargs): q_size = self.num_attn_heads * self.head_size kv_size = self.num_kv_heads * self.head_size - qkv_weight = attention.query_key_value.weight.T.view(self.hidden_size, self.num_kv_heads, (self.num_attn_heads // self.num_kv_heads) + 2, self.head_size) - qkv_bias = attention.query_key_value.bias.view(self.num_kv_heads, (self.num_attn_heads // self.num_kv_heads) + 2, 
self.head_size) + qkv_weight = attention.query_key_value.weight.T.view( + self.hidden_size, self.num_kv_heads, (self.num_attn_heads // self.num_kv_heads) + 2, self.head_size + ) + qkv_bias = attention.query_key_value.bias.view( + self.num_kv_heads, (self.num_attn_heads // self.num_kv_heads) + 2, self.head_size + ) attention.q_proj = torch.nn.Linear(in_features=q_size, out_features=q_size) - attention.q_proj.weight = torch.nn.Parameter(qkv_weight[:, :, :-2].reshape(q_size, q_size).T, requires_grad=False) - attention.q_proj.bias = None if attention.query_key_value.bias is None else torch.nn.Parameter(qkv_bias[:, :-2].flatten(), requires_grad=False) + attention.q_proj.weight = torch.nn.Parameter( + qkv_weight[:, :, :-2].reshape(q_size, q_size).T, requires_grad=False + ) + attention.q_proj.bias = ( + None + if attention.query_key_value.bias is None + else torch.nn.Parameter(qkv_bias[:, :-2].flatten(), requires_grad=False) + ) attention.k_proj = torch.nn.Linear(in_features=q_size, out_features=kv_size) - attention.k_proj.weight = torch.nn.Parameter(qkv_weight[:, :, [-2]].reshape(q_size, kv_size).T, requires_grad=False) - attention.k_proj.bias = None if attention.query_key_value.bias is None else torch.nn.Parameter(qkv_bias[:, [-2]].flatten(), requires_grad=False) + attention.k_proj.weight = torch.nn.Parameter( + qkv_weight[:, :, [-2]].reshape(q_size, kv_size).T, requires_grad=False + ) + attention.k_proj.bias = ( + None + if attention.query_key_value.bias is None + else torch.nn.Parameter(qkv_bias[:, [-2]].flatten(), requires_grad=False) + ) attention.v_proj = torch.nn.Linear(in_features=q_size, out_features=kv_size) - attention.v_proj.weight = torch.nn.Parameter(qkv_weight[:, :, [-1]].reshape(q_size, kv_size).T, requires_grad=False) - attention.v_proj.bias = None if attention.query_key_value.bias is None else torch.nn.Parameter(qkv_bias[:, [-1]].flatten(), requires_grad=False) + attention.v_proj.weight = torch.nn.Parameter( + qkv_weight[:, :, [-1]].reshape(q_size, 
kv_size).T, requires_grad=False + ) + attention.v_proj.bias = ( + None + if attention.query_key_value.bias is None + else torch.nn.Parameter(qkv_bias[:, [-1]].flatten(), requires_grad=False) + ) del qkv_weight del qkv_bias @@ -276,43 +325,105 @@ def make_mlp_proj(self, layer_id, mlp, root_input): # Left path slice_1_name = f"/model/layers.{layer_id}/mlp/gelu/Slice" - slice_1_inputs = [f"{up_add_name}/output_0", "/model/constants/INT64/[0]", f"/model/constants/INT64/[{torch.iinfo(torch.int64).max}]", "/model/constants/INT64/[-1]", "/model/constants/INT64/[2]"] - self.make_slice(slice_1_name, slice_1_inputs, dtype=self.io_dtype, shape=["batch_size", "sequence_length", self.intermediate_size]) + slice_1_inputs = [ + f"{up_add_name}/output_0", + "/model/constants/INT64/[0]", + f"/model/constants/INT64/[{torch.iinfo(torch.int64).max}]", + "/model/constants/INT64/[-1]", + "/model/constants/INT64/[2]", + ] + self.make_slice( + slice_1_name, + slice_1_inputs, + dtype=self.io_dtype, + shape=["batch_size", "sequence_length", self.intermediate_size], + ) cast_1_name = f"/model/layers.{layer_id}/mlp/gelu/Cast" - self.make_cast(cast_1_name, f"{slice_1_name}/output_0", dtype=ir.DataType.FLOAT, shape=["batch_size", "sequence_length", self.intermediate_size]) + self.make_cast( + cast_1_name, + f"{slice_1_name}/output_0", + dtype=ir.DataType.FLOAT, + shape=["batch_size", "sequence_length", self.intermediate_size], + ) isinf_1_name = f"/model/layers.{layer_id}/mlp/gelu/IsInf" - self.make_isinf(isinf_1_name, f"{cast_1_name}/output_0", shape=["batch_size", "sequence_length", self.intermediate_size]) + self.make_isinf( + isinf_1_name, f"{cast_1_name}/output_0", shape=["batch_size", "sequence_length", self.intermediate_size] + ) clip_1_name = f"/model/layers.{layer_id}/mlp/gelu/Clip" - clip_1_inputs = [f"{slice_1_name}/output_0", "", f"/model/constants/{self.to_str_dtype(self.io_dtype)}/{self.clamp_limit}"] - self.make_clip(clip_1_name, clip_1_inputs, self.io_dtype, shape=["batch_size", 
"sequence_length", self.intermediate_size]) + clip_1_inputs = [ + f"{slice_1_name}/output_0", + "", + f"/model/constants/{self.to_str_dtype(self.io_dtype)}/{self.clamp_limit}", + ] + self.make_clip( + clip_1_name, clip_1_inputs, self.io_dtype, shape=["batch_size", "sequence_length", self.intermediate_size] + ) where_1_name = f"/model/layers.{layer_id}/mlp/gelu/Where" where_1_inputs = [f"{isinf_1_name}/output_0", f"{slice_1_name}/output_0", f"{clip_1_name}/output_0"] - self.make_where(where_1_name, where_1_inputs, dtype=self.io_dtype, shape=["batch_size", "sequence_length", self.intermediate_size]) + self.make_where( + where_1_name, + where_1_inputs, + dtype=self.io_dtype, + shape=["batch_size", "sequence_length", self.intermediate_size], + ) # Make activation act_fn_name = self.make_activation(layer_id, root_input=f"{where_1_name}/output_0") # Right path slice_2_name = f"/model/layers.{layer_id}/mlp/linear/Slice" - slice_2_inputs = [f"{up_add_name}/output_0", "/model/constants/INT64/[1]", f"/model/constants/INT64/[{torch.iinfo(torch.int64).max}]", "/model/constants/INT64/[-1]", "/model/constants/INT64/[2]"] - self.make_slice(slice_2_name, slice_2_inputs, dtype=self.io_dtype, shape=["batch_size", "sequence_length", self.intermediate_size]) + slice_2_inputs = [ + f"{up_add_name}/output_0", + "/model/constants/INT64/[1]", + f"/model/constants/INT64/[{torch.iinfo(torch.int64).max}]", + "/model/constants/INT64/[-1]", + "/model/constants/INT64/[2]", + ] + self.make_slice( + slice_2_name, + slice_2_inputs, + dtype=self.io_dtype, + shape=["batch_size", "sequence_length", self.intermediate_size], + ) cast_2_name = f"/model/layers.{layer_id}/mlp/linear/Cast" - self.make_cast(cast_2_name, f"{slice_2_name}/output_0", dtype=ir.DataType.FLOAT, shape=["batch_size", "sequence_length", self.intermediate_size]) + self.make_cast( + cast_2_name, + f"{slice_2_name}/output_0", + dtype=ir.DataType.FLOAT, + shape=["batch_size", "sequence_length", self.intermediate_size], + ) isinf_2_name 
= f"/model/layers.{layer_id}/mlp/linear/IsInf" - self.make_isinf(isinf_2_name, f"{cast_2_name}/output_0", shape=["batch_size", "sequence_length", self.intermediate_size]) + self.make_isinf( + isinf_2_name, f"{cast_2_name}/output_0", shape=["batch_size", "sequence_length", self.intermediate_size] + ) clip_2_name = f"/model/layers.{layer_id}/mlp/linear/Clip" - clip_2_inputs = [f"{slice_2_name}/output_0", f"/model/constants/{self.to_str_dtype(self.io_dtype)}/-{self.clamp_limit}", f"/model/constants/{self.to_str_dtype(self.io_dtype)}/{self.clamp_limit}"] - self.make_clip(clip_2_name, clip_2_inputs, self.io_dtype, shape=["batch_size", "sequence_length", self.intermediate_size]) + clip_2_inputs = [ + f"{slice_2_name}/output_0", + f"/model/constants/{self.to_str_dtype(self.io_dtype)}/-{self.clamp_limit}", + f"/model/constants/{self.to_str_dtype(self.io_dtype)}/{self.clamp_limit}", + ] + self.make_clip( + clip_2_name, clip_2_inputs, self.io_dtype, shape=["batch_size", "sequence_length", self.intermediate_size] + ) where_2_name = f"/model/layers.{layer_id}/mlp/linear/Where" where_2_inputs = [f"{isinf_2_name}/output_0", f"{slice_2_name}/output_0", f"{clip_2_name}/output_0"] - self.make_where(where_2_name, where_2_inputs, dtype=self.io_dtype, shape=["batch_size", "sequence_length", self.intermediate_size]) + self.make_where( + where_2_name, + where_2_inputs, + dtype=self.io_dtype, + shape=["batch_size", "sequence_length", self.intermediate_size], + ) add_name = f"/model/layers.{layer_id}/mlp/linear/Add" add_inputs = [f"{where_2_name}/output_0", f"/model/constants/{self.to_str_dtype(self.io_dtype)}/1"] - self.make_add(add_name, add_inputs, dtype=self.io_dtype, shape=["batch_size", "sequence_length", self.intermediate_size]) + self.make_add( + add_name, add_inputs, dtype=self.io_dtype, shape=["batch_size", "sequence_length", self.intermediate_size] + ) # Make Mul node after activation mul_name = f"/model/layers.{layer_id}/mlp/Mul" mul_inputs = [f"{act_fn_name}/output_0", 
f"{add_name}/output_0"] - self.make_mul(mul_name, mul_inputs, dtype=self.io_dtype, shape=["batch_size", "sequence_length", self.intermediate_size]) + self.make_mul( + mul_name, mul_inputs, dtype=self.io_dtype, shape=["batch_size", "sequence_length", self.intermediate_size] + ) # Make output MatMul and Add nodes down_matmul_name = f"/model/layers.{layer_id}/mlp/down_proj/MatMul" @@ -346,9 +457,21 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): def make_layer(self, layer_id, layer): # Each LLM decoder layer is typically defined as: # input_layernorm --> attention --> output_layernorm --> MoE - self.make_layernorm(layer_id, layer.input_layernorm, skip=not self.layernorm_attrs["first_layernorm"], simple=self.layernorm_attrs["simple"], location="input") + self.make_layernorm( + layer_id, + layer.input_layernorm, + skip=not self.layernorm_attrs["first_layernorm"], + simple=self.layernorm_attrs["simple"], + location="input", + ) self.make_attention(layer_id, layer.self_attn, root_input=self.layernorm_attrs["output_0"]) - self.make_layernorm(layer_id, layer.post_attention_layernorm, skip=True, simple=self.layernorm_attrs["simple"], location="post_attention") + self.make_layernorm( + layer_id, + layer.post_attention_layernorm, + skip=True, + simple=self.layernorm_attrs["simple"], + location="post_attention", + ) self.make_block_sparse_moe(layer_id, layer.block_sparse_moe, root_input=self.layernorm_attrs["output_0"]) self.layernorm_attrs["first_layernorm"] = False diff --git a/src/python/py/models/gguf_model.py b/src/python/py/models/gguf_model.py index 6e5a741396..b2890ac367 100644 --- a/src/python/py/models/gguf_model.py +++ b/src/python/py/models/gguf_model.py @@ -74,7 +74,7 @@ def __init__(self, input_path, head_size, hidden_size, intermediate_size, num_at if name == "token_embd.weight": # Remove tensor data's padding via `reduce` when GGUF model's vocab size is larger than the config's vocab size embedding_shape = [vocab_size, 
hidden_size] - self.embedding.weight = data[ : reduce(lambda x, y: x*y, embedding_shape)].reshape(embedding_shape) + self.embedding.weight = data[: reduce(lambda x, y: x * y, embedding_shape)].reshape(embedding_shape) elif name == "output_norm.weight": self.final_norm.weight = data elif name == "output_norm.bias": @@ -165,7 +165,7 @@ def __init__(self, input_path, head_size, hidden_size, intermediate_size, num_at qkv_shape = [q_size + kv_size + kv_size, hidden_size] qkv = data.reshape(qkv_shape) - module.self_attn.q_proj.weight = qkv[: q_size, :] + module.self_attn.q_proj.weight = qkv[:q_size, :] module.self_attn.k_proj.weight = qkv[q_size : q_size + kv_size, :] module.self_attn.v_proj.weight = qkv[q_size + kv_size :, :] elif bool(re.match(r"^blk\.\d+\.attn_qkv\.bias$", name)): @@ -173,17 +173,17 @@ def __init__(self, input_path, head_size, hidden_size, intermediate_size, num_at q_size = num_attn_heads * head_size kv_size = num_kv_heads * head_size - module.self_attn.q_proj.bias = data[: q_size] + module.self_attn.q_proj.bias = data[:q_size] module.self_attn.k_proj.bias = data[q_size : q_size + kv_size] module.self_attn.v_proj.bias = data[q_size + kv_size :] elif bool(re.match(r"^blk\.\d+\.ffn_up\.weight$", name)) and data.shape[0] != intermediate_size: # blk.layer_id.ffn_up.weight (gate_up_proj.weight) - module.mlp.gate_proj.weight = data[: intermediate_size, :] - module.mlp.up_proj.weight = data[intermediate_size :, :] + module.mlp.gate_proj.weight = data[:intermediate_size, :] + module.mlp.up_proj.weight = data[intermediate_size:, :] elif bool(re.match(r"^blk\.\d+\.ffn_up\.bias$", name)) and data.shape[0] != intermediate_size: # blk.layer_id.ffn_up.bias (gate_up_proj.bias) - module.mlp.gate_proj.bias = data[: intermediate_size] - module.mlp.up_proj.bias = data[intermediate_size :] + module.mlp.gate_proj.bias = data[:intermediate_size] + module.mlp.up_proj.bias = data[intermediate_size:] # Match against non-standard attribute names elif 
bool(re.match(r"^blk\.\d+\.post_attention_norm\.weight$", name)): # Note: This meaning of this name differs in Hugging Face vs GGUF. @@ -240,10 +240,20 @@ def undo_permute(self, head_size, hidden_size, num_attn_heads, num_kv_heads): """ for module in self.layers: q_shape = [head_size * num_attn_heads, hidden_size] - module.self_attn.q_proj.weight = module.self_attn.q_proj.weight.flatten().reshape(num_attn_heads, q_shape[0] // num_attn_heads // 2, 2, *q_shape[1:]).swapaxes(1, 2).reshape(q_shape) + module.self_attn.q_proj.weight = ( + module.self_attn.q_proj.weight.flatten() + .reshape(num_attn_heads, q_shape[0] // num_attn_heads // 2, 2, *q_shape[1:]) + .swapaxes(1, 2) + .reshape(q_shape) + ) k_shape = [head_size * num_kv_heads, hidden_size] - module.self_attn.k_proj.weight = module.self_attn.k_proj.weight.flatten().reshape(num_kv_heads, k_shape[0] // num_kv_heads // 2, 2, *k_shape[1:]).swapaxes(1, 2).reshape(k_shape) + module.self_attn.k_proj.weight = ( + module.self_attn.k_proj.weight.flatten() + .reshape(num_kv_heads, k_shape[0] // num_kv_heads // 2, 2, *k_shape[1:]) + .swapaxes(1, 2) + .reshape(k_shape) + ) def swap_mlp_types(self): """ @@ -273,43 +283,70 @@ def swap_norm_types(self): - post_ffw_norm --> post_feedforward_layernorm """ for module in self.layers: - module.post_attention_layernorm, module.pre_feedforward_layernorm = module.pre_feedforward_layernorm, module.post_attention_layernorm + module.post_attention_layernorm, module.pre_feedforward_layernorm = ( + module.pre_feedforward_layernorm, + module.post_attention_layernorm, + ) @staticmethod - def from_pretrained(model_type, input_path, head_size, hidden_size, intermediate_size, num_attn_heads, num_kv_heads, vocab_size): + def from_pretrained( + model_type, input_path, head_size, hidden_size, intermediate_size, num_attn_heads, num_kv_heads, vocab_size + ): """ Create GGUF models with the same attribute structures as Hugging Face's PyTorch models. 
Also performs any pre-processing and post-processing to the GGUF models to ensure the weights are the same as the PyTorch models. """ if model_type == "ChatGLMModel": - model = GGUFModel(input_path, head_size, hidden_size, intermediate_size, num_attn_heads, num_kv_heads, vocab_size) + model = GGUFModel( + input_path, head_size, hidden_size, intermediate_size, num_attn_heads, num_kv_heads, vocab_size + ) elif model_type == "GemmaForCausalLM": - model = GGUFModel(input_path, head_size, hidden_size, intermediate_size, num_attn_heads, num_kv_heads, vocab_size) + model = GGUFModel( + input_path, head_size, hidden_size, intermediate_size, num_attn_heads, num_kv_heads, vocab_size + ) elif model_type == "Gemma2ForCausalLM": - model = GGUFModel(input_path, head_size, hidden_size, intermediate_size, num_attn_heads, num_kv_heads, vocab_size) + model = GGUFModel( + input_path, head_size, hidden_size, intermediate_size, num_attn_heads, num_kv_heads, vocab_size + ) model.swap_norm_types() elif model_type == "GraniteForCausalLM": - model = GGUFModel(input_path, head_size, hidden_size, intermediate_size, num_attn_heads, num_kv_heads, vocab_size) + model = GGUFModel( + input_path, head_size, hidden_size, intermediate_size, num_attn_heads, num_kv_heads, vocab_size + ) model.undo_permute(head_size, hidden_size, num_attn_heads, num_kv_heads) elif model_type == "LlamaForCausalLM": - model = GGUFModel(input_path, head_size, hidden_size, intermediate_size, num_attn_heads, num_kv_heads, vocab_size) + model = GGUFModel( + input_path, head_size, hidden_size, intermediate_size, num_attn_heads, num_kv_heads, vocab_size + ) model.undo_permute(head_size, hidden_size, num_attn_heads, num_kv_heads) elif model_type == "MistralForCausalLM": - model = GGUFModel(input_path, head_size, hidden_size, intermediate_size, num_attn_heads, num_kv_heads, vocab_size) + model = GGUFModel( + input_path, head_size, hidden_size, intermediate_size, num_attn_heads, num_kv_heads, vocab_size + ) 
model.undo_permute(head_size, hidden_size, num_attn_heads, num_kv_heads) elif model_type == "NemotronForCausalLM": - model = GGUFModel(input_path, head_size, hidden_size, intermediate_size, num_attn_heads, num_kv_heads, vocab_size) + model = GGUFModel( + input_path, head_size, hidden_size, intermediate_size, num_attn_heads, num_kv_heads, vocab_size + ) elif model_type == "OlmoForCausalLM": - model = GGUFModel(input_path, head_size, hidden_size, intermediate_size, num_attn_heads, num_kv_heads, vocab_size) + model = GGUFModel( + input_path, head_size, hidden_size, intermediate_size, num_attn_heads, num_kv_heads, vocab_size + ) model.undo_permute(head_size, hidden_size, num_attn_heads, num_kv_heads) elif model_type == "PhiForCausalLM": - model = GGUFModel(input_path, head_size, hidden_size, intermediate_size, num_attn_heads, num_kv_heads, vocab_size) + model = GGUFModel( + input_path, head_size, hidden_size, intermediate_size, num_attn_heads, num_kv_heads, vocab_size + ) model.swap_mlp_types() elif model_type == "Phi3ForCausalLM": - model = GGUFModel(input_path, head_size, hidden_size, intermediate_size, num_attn_heads, num_kv_heads, vocab_size) + model = GGUFModel( + input_path, head_size, hidden_size, intermediate_size, num_attn_heads, num_kv_heads, vocab_size + ) elif model_type == "Qwen2ForCausalLM": - model = GGUFModel(input_path, head_size, hidden_size, intermediate_size, num_attn_heads, num_kv_heads, vocab_size) + model = GGUFModel( + input_path, head_size, hidden_size, intermediate_size, num_attn_heads, num_kv_heads, vocab_size + ) else: raise NotImplementedError(f"The {model_type} model is not currently supported.") diff --git a/test/python/test_onnxruntime_genai.py b/test/python/test_onnxruntime_genai.py index b813e72305..7286139fda 100644 --- a/test/python/test_onnxruntime_genai.py +++ b/test/python/test_onnxruntime_genai.py @@ -11,9 +11,7 @@ import onnxruntime_genai as og from _test_utils import download_models, run_subprocess -logging.basicConfig( - 
format="%(asctime)s %(name)s [%(levelname)s] - %(message)s", level=logging.DEBUG -) +logging.basicConfig(format="%(asctime)s %(name)s [%(levelname)s] - %(message)s", level=logging.DEBUG) log = logging.getLogger("onnxruntime-genai-tests") @@ -62,8 +60,7 @@ def parse_arguments(): parser.add_argument( "--test_models", help="Path to the test_models directory", - default=pathlib.Path(__file__).parent.parent.resolve().absolute() - / "test_models", + default=pathlib.Path(__file__).parent.parent.resolve().absolute() / "test_models", ) parser.add_argument( "--e2e", @@ -80,9 +77,7 @@ def main(): # Get INT4 ONNX models output_paths = [] - if not ( - sysconfig.get_platform().endswith("arm64") or sys.version_info.minor < 8 - ): + if not (sysconfig.get_platform().endswith("arm64") or sys.version_info.minor < 8): output_paths += download_models(os.path.abspath(args.test_models), "int4", "cpu", log) if og.is_cuda_available(): output_paths += download_models(os.path.abspath(args.test_models), "int4", "cuda", log) diff --git a/test/python/test_onnxruntime_genai_api.py b/test/python/test_onnxruntime_genai_api.py index 2c6cb0aa3e..cd6dd43c2d 100644 --- a/test/python/test_onnxruntime_genai_api.py +++ b/test/python/test_onnxruntime_genai_api.py @@ -324,9 +324,7 @@ def test_phi3_chat_template(device, phi3_for): model = og.Model(model_path) tokenizer = og.Tokenizer(model) - messages = ( - """[{"role": "system", "content": "This is a test."}, {"role": "user", "content": "Hi, how are you?"}]""" - ) + messages = """[{"role": "system", "content": "This is a test."}, {"role": "user", "content": "Hi, how are you?"}]""" try: tokenizer.apply_chat_template(messages=messages, add_generation_prompt=True) @@ -346,9 +344,7 @@ def test_phi2_chat_template(device, phi2_for): model = og.Model(model_path) tokenizer = og.Tokenizer(model) - messages = ( - """[{"role": "system", "content": "This is a test."}, {"role": "user", "content": "Hi, how are you?"}]""" - ) + messages = """[{"role": "system", 
"content": "This is a test."}, {"role": "user", "content": "Hi, how are you?"}]""" # Note: this should work, even though phi-2 has no official chat template, as we override it and pass one in template = """{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}""" diff --git a/test/python/test_onnxruntime_genai_e2e.py b/test/python/test_onnxruntime_genai_e2e.py index 96c77b9e82..643882ae5c 100644 --- a/test/python/test_onnxruntime_genai_e2e.py +++ b/test/python/test_onnxruntime_genai_e2e.py @@ -11,9 +11,7 @@ import onnxruntime_genai as og from _test_utils import get_ci_data_path, run_subprocess -logging.basicConfig( - format="%(asctime)s %(name)s [%(levelname)s] - %(message)s", level=logging.DEBUG -) +logging.basicConfig(format="%(asctime)s %(name)s [%(levelname)s] - %(message)s", level=logging.DEBUG) log = logging.getLogger("onnxruntime-genai-tests") @@ -56,7 +54,7 @@ def run_whisper(): "The cut on his chest is still dripping blood. The ache of his overstrained eyes. 
Even the soaring arena around him with thousands of spectators, retrievalidies not worth thinking about.", ) - for (precision, execution_provider) in [("fp16", "cuda"), ("fp32", "cuda"), ("fp32", "cpu")]: + for precision, execution_provider in [("fp16", "cuda"), ("fp32", "cuda"), ("fp32", "cpu")]: if execution_provider == "cuda" and not og.is_cuda_available(): continue diff --git a/test/test_models/create_dummy_model.py b/test/test_models/create_dummy_model.py index 453a5ff833..43dfe91e45 100644 --- a/test/test_models/create_dummy_model.py +++ b/test/test_models/create_dummy_model.py @@ -66,15 +66,15 @@ def get_args(): "-i", "--inputs", metavar="(NAME; DTYPE; SHAPE)", - nargs='+', - help="Inputs of the form '(input_name; input_dtype; input_shape)' for model" + nargs="+", + help="Inputs of the form '(input_name; input_dtype; input_shape)' for model", ) parser.add_argument( "-o", "--outputs", metavar="(NAME; DTYPE; SHAPE)", - nargs='+', - help="Outputs of the form '(output_name; output_dtype; output_shape)' for model" + nargs="+", + help="Outputs of the form '(output_name; output_dtype; output_shape)' for model", ) parser.add_argument( "-f", @@ -86,6 +86,7 @@ def get_args(): args = parser.parse_args() return args + def parse_args(input_or_output): list_of_inputs_or_outputs = [] for input_str in input_or_output: @@ -94,6 +95,7 @@ def parse_args(input_or_output): list_of_inputs_or_outputs.append(input_or_output_to_add) return list_of_inputs_or_outputs + def get_input_or_output_value_infos(input_or_outputs): value_infos = [] for input_or_output in input_or_outputs: @@ -103,6 +105,7 @@ def get_input_or_output_value_infos(input_or_outputs): value_infos.append(value_info) return value_infos + def get_dummy_tensor_shape(shape): np_shape = () for dim in shape: @@ -114,6 +117,7 @@ def get_dummy_tensor_shape(shape): raise NotImplementedError(f"Unknown dim type: {type(dim)}") return np_shape + def get_output_initializers(outputs): initializers = [] for output in outputs: @@ 
-125,6 +129,7 @@ def get_output_initializers(outputs): initializers.append(tensor) return initializers + def main(): args = get_args() args.inputs = parse_args(args.inputs) @@ -132,7 +137,7 @@ def main(): # Create dummy model model = helper.make_model( - opset_imports=[helper.make_operatorsetid('', 14)], + opset_imports=[helper.make_operatorsetid("", 14)], ir_version=7, producer_name="onnxruntime-genai", producer_version="0.0.0", @@ -143,13 +148,14 @@ def main(): initializer=get_output_initializers(args.outputs), value_info=[], nodes=[], - ) + ), ) onnx.save_model( model, args.filename, ) + if __name__ == "__main__": # Map TensorProto dtypes to NumPy dtypes to_numpy_dtype = { diff --git a/tools/ci_build/get_docker_image.py b/tools/ci_build/get_docker_image.py index 8ed7dbb969..203f13ca2b 100644 --- a/tools/ci_build/get_docker_image.py +++ b/tools/ci_build/get_docker_image.py @@ -22,15 +22,14 @@ def parse_args(): parser = argparse.ArgumentParser( description="Build a docker image and push it to a remote Azure Container Registry." - "The content in the remote registry can be used as a cache when we need to build the thing again." - "The user must be logged in to the container registry." + "The content in the remote registry can be used as a cache when we need to build the thing again." + "The user must be logged in to the container registry." ) parser.add_argument("--dockerfile", default="Dockerfile", help="Path to the Dockerfile.") parser.add_argument("--context", default=".", help="Path to the build context.") parser.add_argument( - "--docker-build-args", default="", - help="Arguments that will be passed to the 'docker build' command." + "--docker-build-args", default="", help="Arguments that will be passed to the 'docker build' command." 
) parser.add_argument( @@ -39,9 +38,7 @@ def parse_args(): ) parser.add_argument("--repository", required=True, help="The image repository name.") - parser.add_argument("--use_imagecache", - action="store_true", - help="use cached image in pipeline cache") + parser.add_argument("--use_imagecache", action="store_true", help="use cached image in pipeline cache") parser.add_argument("--docker-path", default="docker", help="Path to docker.") @@ -59,9 +56,7 @@ def parse_args(): def main(): args = parse_args() - log.debug( - f"Dockerfile: {args.dockerfile}, context: {args.context}, docker build args: '{args.docker_build_args}'" - ) + log.debug(f"Dockerfile: {args.dockerfile}, context: {args.context}, docker build args: '{args.docker_build_args}'") use_container_registry = args.container_registry is not None @@ -98,8 +93,7 @@ def main(): "patch", "-p1", "-i", - str((Path( - SCRIPT_DIR) / "github" / "linux" / "docker" / "manylinux" / "manylinux.patch").resolve()), + str((Path(SCRIPT_DIR) / "github" / "linux" / "docker" / "manylinux" / "manylinux.patch").resolve()), cwd=dest, ) diff --git a/tools/ci_build/github/android/build_aar_package.py b/tools/ci_build/github/android/build_aar_package.py index 375a9f6f34..f84027c997 100644 --- a/tools/ci_build/github/android/build_aar_package.py +++ b/tools/ci_build/github/android/build_aar_package.py @@ -161,32 +161,31 @@ def parse_args(): ) parser.add_argument( - "--android_home", type=Path, default=_path_from_env_var("ANDROID_HOME"), - help="Path to the Android SDK." + "--android_home", type=Path, default=_path_from_env_var("ANDROID_HOME"), help="Path to the Android SDK." ) parser.add_argument( - "--android_ndk_path", type=Path, default=_path_from_env_var("ANDROID_NDK_HOME"), - help="Path to the Android NDK." + "--android_ndk_path", type=Path, default=_path_from_env_var("ANDROID_NDK_HOME"), help="Path to the Android NDK." 
) parser.add_argument( - "--build_dir", type=Path, default=(REPO_ROOT / "build" / "android_aar"), + "--build_dir", + type=Path, + default=(REPO_ROOT / "build" / "android_aar"), help="Provide the root directory for build output", ) parser.add_argument( - "--config", type=str, default="Release", + "--config", + type=str, + default="Release", choices=["Debug", "MinSizeRel", "Release", "RelWithDebInfo"], help="Configuration to build.", ) - parser.add_argument("--ort_home", type=Path, default=None, - help="Path to an unzipped onnxruntime AAR.") + parser.add_argument("--ort_home", type=Path, default=None, help="Path to an unzipped onnxruntime AAR.") - parser.add_argument( - "build_settings_file", type=Path, help="Provide the file contains settings for building AAR" - ) + parser.add_argument("build_settings_file", type=Path, help="Provide the file contains settings for building AAR") return parser.parse_args() diff --git a/tools/ci_build/github/apple/build_and_assemble_apple_pods.py b/tools/ci_build/github/apple/build_and_assemble_apple_pods.py index 6c09a6a852..138f421aae 100755 --- a/tools/ci_build/github/apple/build_and_assemble_apple_pods.py +++ b/tools/ci_build/github/apple/build_and_assemble_apple_pods.py @@ -88,13 +88,9 @@ def parse_args(): help="Skip macos platform tests. Specify this argument when build targets only contain ios archs. ", ) - parser.add_argument( - "--ort-version", required=True, help="The ORT version to depend on." - ) + parser.add_argument("--ort-version", required=True, help="The ORT version to depend on.") - parser.add_argument( - "--ort-home", required=False, help="The ORT home for building dependency." 
- ) + parser.add_argument("--ort-home", required=False, help="The ORT home for building dependency.") args = parser.parse_args() @@ -130,7 +126,7 @@ def main(): ] if args.ort_home: - build_apple_framework_args.append('--ort_home') + build_apple_framework_args.append("--ort_home") build_apple_framework_args.append(args.ort_home) if args.include_ops_by_config is not None: @@ -152,8 +148,8 @@ def main(): str(build_dir / "framework_out"), "--variant", package_variant.name, - '--ort_version', - args.ort_version + "--ort_version", + args.ort_version, ] if args.skip_macos_test: test_apple_packages_args.append("--skip_macos_test") @@ -175,7 +171,7 @@ def main(): framework_dir=build_dir / "framework_out" / "onnxruntime-genai.xcframework", public_headers_dir=build_dir / "framework_out" / "Headers", package_variant=package_variant, - ort_version=args.ort_version + ort_version=args.ort_version, ) if args.test: @@ -193,7 +189,7 @@ def main(): staging_dir=objc_pod_staging_dir, pod_version=args.pod_version, framework_info_file=framework_info_file, - package_variant=package_variant + package_variant=package_variant, ) if args.test: diff --git a/tools/ci_build/github/apple/package_assembly_utils.py b/tools/ci_build/github/apple/package_assembly_utils.py index 2b702a0203..d10c4c3c53 100644 --- a/tools/ci_build/github/apple/package_assembly_utils.py +++ b/tools/ci_build/github/apple/package_assembly_utils.py @@ -74,7 +74,7 @@ def filter_files(subpath: str, all_file_patterns: list[str], excluded_file_patte """ # get all files matching the patterns in all_file_patterns if subpath: - src_root = repo_root / subpath + src_root = repo_root / subpath else: src_root = repo_root @@ -99,7 +99,7 @@ def copy_repo_relative_to_dir(subpath: str, patterns: list[str], dest_dir: pathl :param dest_dir The destination directory. 
""" if subpath: - src_root = repo_root / subpath + src_root = repo_root / subpath else: src_root = repo_root paths = [path for pattern in patterns for path in src_root.glob(pattern)] diff --git a/tools/nuget/generate_nuspec_for_native_nuget.py b/tools/nuget/generate_nuspec_for_native_nuget.py index 124c6bd7ce..1f27935fe8 100644 --- a/tools/nuget/generate_nuspec_for_native_nuget.py +++ b/tools/nuget/generate_nuspec_for_native_nuget.py @@ -9,6 +9,7 @@ def get_env_var(key): return os.environ.get(key) + def generate_nuspec(args): lines = [''] lines.append("") @@ -17,6 +18,7 @@ def generate_nuspec(args): lines.append("") return lines + def generate_metadata(line_list, args): tags = "ONNX;ONNX Runtime;ONNX Runtime Gen AI;Machine Learning" @@ -37,6 +39,7 @@ def generate_metadata(line_list, args): line_list += metadata_list + def generate_id(line_list, package_name): line_list.append("" + package_name + "") @@ -73,8 +76,10 @@ def generate_icon(line_list, icon_file): def generate_license(line_list): line_list.append('LICENSE') + def generate_readme(line_list): - line_list.append('PACKAGE.md') + line_list.append("PACKAGE.md") + def generate_project_url(line_list, project_url): line_list.append("" + project_url + "") @@ -83,6 +88,7 @@ def generate_project_url(line_list, project_url): def generate_repo_url(line_list, repo_url, commit_id): line_list.append('') + def generate_release_notes(line_list): line_list.append("") line_list.append("Release Def:") @@ -95,9 +101,17 @@ def generate_release_notes(line_list): line_list.append("") + def generate_dependencies(xml_text, package_version, ort_package_name, ort_package_version): xml_text.append("") - target_frameworks = ["NETSTANDARD" , "NETCOREAPP", "NETFRAMEWORK", "net9.0-android31.0", "net9.0-ios15.4", "net9.0-maccatalyst14.0"] + target_frameworks = [ + "NETSTANDARD", + "NETCOREAPP", + "NETFRAMEWORK", + "net9.0-android31.0", + "net9.0-ios15.4", + "net9.0-maccatalyst14.0", + ] for framework in target_frameworks: 
xml_text.append(f'') xml_text.append(f'') @@ -106,8 +120,9 @@ def generate_dependencies(xml_text, package_version, ort_package_name, ort_packa xml_text.append("") + def generate_files(lines, args): - lines.append('') + lines.append("") lines.append(rf'') lines.append(f'') @@ -116,47 +131,59 @@ def generate_files(lines, args): def add_native_artifact_if_exists(xml_lines, runtime, artifact): p = Path(f"{args.sources_path}/{args.native_build_path}/{runtime}/{args.build_config}/{artifact}") if p.exists(): - xml_lines.append( - f'' - ) + xml_lines.append(f'') runtimes = ["win-x64", "win-arm64", "linux-x64", "osx-x64", "osx-arm64", "ios", "android"] for runtime in runtimes: - if runtime.startswith("win"): - add_native_artifact_if_exists(lines, runtime, "onnxruntime-genai.lib") - add_native_artifact_if_exists(lines, runtime, "onnxruntime-genai.dll") - add_native_artifact_if_exists(lines, runtime, "onnxruntime-genai-cuda.lib") - add_native_artifact_if_exists(lines, runtime, "onnxruntime-genai-cuda.dll") - add_native_artifact_if_exists(lines, runtime, "d3d12core.dll") - elif runtime.startswith("linux"): - add_native_artifact_if_exists(lines, runtime, "libonnxruntime-genai.so") - add_native_artifact_if_exists(lines, runtime, "libonnxruntime-genai-cuda.so") - elif runtime.startswith("osx"): - add_native_artifact_if_exists(lines, runtime, "libonnxruntime-genai.dylib") - elif runtime.startswith("ios"): - add_native_artifact_if_exists(lines, runtime, "onnxruntime-genai.xcframework.zip") - elif runtime.startswith("android"): - add_native_artifact_if_exists(lines, runtime, "onnxruntime-genai.aar") + if runtime.startswith("win"): + add_native_artifact_if_exists(lines, runtime, "onnxruntime-genai.lib") + add_native_artifact_if_exists(lines, runtime, "onnxruntime-genai.dll") + add_native_artifact_if_exists(lines, runtime, "onnxruntime-genai-cuda.lib") + add_native_artifact_if_exists(lines, runtime, "onnxruntime-genai-cuda.dll") + add_native_artifact_if_exists(lines, runtime, 
"d3d12core.dll") + elif runtime.startswith("linux"): + add_native_artifact_if_exists(lines, runtime, "libonnxruntime-genai.so") + add_native_artifact_if_exists(lines, runtime, "libonnxruntime-genai-cuda.so") + elif runtime.startswith("osx"): + add_native_artifact_if_exists(lines, runtime, "libonnxruntime-genai.dylib") + elif runtime.startswith("ios"): + add_native_artifact_if_exists(lines, runtime, "onnxruntime-genai.xcframework.zip") + elif runtime.startswith("android"): + add_native_artifact_if_exists(lines, runtime, "onnxruntime-genai.aar") # targets for dotnet in ["netstandard2.0", "net8.0", "native"]: - lines.append(f'') - lines.append(f'') + lines.append( + f'' + ) + lines.append( + f'' + ) # mobile targets - lines.append(f'') - lines.append(f'') + lines.append( + f'' + ) + lines.append( + f'' + ) - lines.append(f'') - lines.append(f'') + lines.append( + f'' + ) + lines.append( + f'' + ) lines.append('') - lines.append('') + lines.append( + '' + ) # include lines.append(f'') lines.append(f'') - lines.append('') + lines.append("") def parse_arguments(): @@ -175,6 +202,7 @@ def parse_arguments(): parser.add_argument("--nuspec_output_path", required=True, type=str, help="nuget spec output path.") return parser.parse_args() + def main(): args = parse_arguments() print(args) @@ -191,5 +219,5 @@ def main(): f.write("\n") -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/tools/python/model_validation/perplexity_metrics.py b/tools/python/model_validation/perplexity_metrics.py index e583e09a5c..ed05018d47 100644 --- a/tools/python/model_validation/perplexity_metrics.py +++ b/tools/python/model_validation/perplexity_metrics.py @@ -7,14 +7,15 @@ def get_wikitext2(): - test = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test') + test = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") # Concatenate the text with "\n\n" separator, result = "\n\n".join(text for text in test["text"]) return result + def perplexity_eval(model_dir): 
# Load the model and tokenizer - model = og.Model(f'{model_dir}') + model = og.Model(f"{model_dir}") tokenizer = og.Tokenizer(model) total_log_probs = 0 @@ -30,10 +31,10 @@ def perplexity_eval(model_dir): # Need to retreive the Model's maximum via the ORT GenAI configuration ## Explore the biggest max length vs the context length in genai config and calculate the lower of the two - with open(model_dir+'/genai_config.json') as file: + with open(model_dir + "/genai_config.json") as file: config = json.load(file) - max_length = config["model"]["context_length"]-1 # This is the default for qwen + max_length = config["model"]["context_length"] - 1 # This is the default for qwen stride = 8192 # Just get the perplexity for one position seq_len = input_ids.size(1) diff --git a/tools/python/model_validation/validation_tool.py b/tools/python/model_validation/validation_tool.py index 4152a9cf60..d94693d535 100644 --- a/tools/python/model_validation/validation_tool.py +++ b/tools/python/model_validation/validation_tool.py @@ -9,30 +9,34 @@ def create_table(output): - df = pd.DataFrame(output, columns=['Model Name', 'Validation Completed', 'Exceptions / Failures']) + df = pd.DataFrame(output, columns=["Model Name", "Validation Completed", "Exceptions / Failures"]) return df + def validate_model(args, model_dict, model_dir): - if args["verbose"]: print("Loading model...") + if args["verbose"]: + print("Loading model...") - model = og.Model(f'{model_dir}') + model = og.Model(f"{model_dir}") - if args["verbose"]: print("Model loaded") + if args["verbose"]: + print("Model loaded") tokenizer = og.Tokenizer(model) tokenizer_stream = tokenizer.create_stream() - if args["verbose"]: print("Tokenizer created") - if args["verbose"]: print() + if args["verbose"]: + print("Tokenizer created") + if args["verbose"]: + print() chat_template = model_dict["chat_template"] search_options = args["search_options"] for text in args["inputs"]: + complete_text = "" - complete_text = '' - - prompt = 
f'{chat_template.format(input=text)}' + prompt = f"{chat_template.format(input=text)}" input_tokens = tokenizer.encode(prompt) @@ -41,12 +45,14 @@ def validate_model(args, model_dict, model_dir): params.input_ids = input_tokens generator = og.Generator(model, params) - if args["verbose"]: print("Generator created") + if args["verbose"]: + print("Generator created") - if args["verbose"]: print("Running generation loop ...") + if args["verbose"]: + print("Running generation loop ...") print() - print("Output: ", end='', flush=True) + print("Output: ", end="", flush=True) generation_successful = True @@ -61,7 +67,7 @@ def validate_model(args, model_dict, model_dir): complete_text += value_to_save - print(tokenizer_stream.decode(new_token), end='', flush=True) + print(tokenizer_stream.decode(new_token), end="", flush=True) except KeyboardInterrupt: print(" --control+c pressed, aborting generation--") @@ -70,7 +76,7 @@ def validate_model(args, model_dict, model_dir): print(f"An error occurred: {e}") generation_successful = False - with open(f'{model_dir}/output.txt', 'a', encoding='utf-8') as file: + with open(f"{model_dir}/output.txt", "a", encoding="utf-8") as file: file.write(complete_text) # Delete the generator to free the captured graph for the next generator, if graph capture is enabled @@ -78,9 +84,12 @@ def validate_model(args, model_dict, model_dir): return generation_successful + if __name__ == "__main__": - parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS, description="End-to-end AI Question/Answer example for gen-ai") - parser.add_argument('-j', '--json', type=str, required=True, help='Path to the JSON file containing the arguments') + parser = argparse.ArgumentParser( + argument_default=argparse.SUPPRESS, description="End-to-end AI Question/Answer example for gen-ai" + ) + parser.add_argument("-j", "--json", type=str, required=True, help="Path to the JSON file containing the arguments") args = parser.parse_args() with open(args.json) as 
file: @@ -96,35 +105,33 @@ def validate_model(args, model_dict, model_dir): exception = False for model_dict in args["models"]: - print(f"We are validating {model_dict['name']}") adjusted_model = model_dict["name"].replace("/", "_") - output_path = args["output_directory"] + f'/{adjusted_model}' - cache_path = args["cache_directory"] + f'/{adjusted_model}' + output_path = args["output_directory"] + f"/{adjusted_model}" + cache_path = args["cache_directory"] + f"/{adjusted_model}" try: - create_model(model_dict["name"], '', output_path, args["precision"], args["execution_provider"], cache_path) + create_model(model_dict["name"], "", output_path, args["precision"], args["execution_provider"], cache_path) except Exception as e: - print(f'Failure after create model {e}') + print(f"Failure after create model {e}") output.append([model_dict["name"], validation_complete, e]) exception = True continue try: validation_complete = validate_model(args, model_dict, output_path) except Exception as e: - print(f'Failure after validation model {e}') + print(f"Failure after validation model {e}") exception = True output.append([model_dict["name"], validation_complete, e]) try: perplexity_eval(output_path) except Exception as e: - print(f'Failure after perplexity calculation model {e}') + print(f"Failure after perplexity calculation model {e}") exception = True output.append([model_dict["name"], validation_complete, e]) - if not exception: output.append([model_dict["name"], validation_complete, e]) diff --git a/tools/python/util/android.py b/tools/python/util/android.py index 403959549a..9a8aa5d5a2 100644 --- a/tools/python/util/android.py +++ b/tools/python/util/android.py @@ -38,10 +38,10 @@ def filename(name, windows_extension): adb=str((sdk_root / "platform-tools" / filename("adb", "exe")).resolve(strict=True)), sdkmanager=str( (sdk_root / "cmdline-tools" / "latest" / "bin" / filename("sdkmanager", "bat")).resolve(strict=True) - ), + ), avdmanager=str( (sdk_root / 
"cmdline-tools" / "latest" / "bin" / filename("avdmanager", "bat")).resolve(strict=True) - ) + ), ) From 1bbeaa537701f39a77d66094f7f6c170f9b97ec6 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Wed, 19 Nov 2025 20:15:43 +0000 Subject: [PATCH 4/4] Update readme --- README.md | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/README.md b/README.md index 0eeb9efed8..0c2f6b1ebc 100644 --- a/README.md +++ b/README.md @@ -156,6 +156,29 @@ This project has adopted the [Microsoft Open Source Code of Conduct](https://ope For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. +### Linting + +This project enables [lintrunner](https://github.com/suo/lintrunner) for linting. You can install the dependencies and initialize with + +```sh +pip install -r requirements-lintrunner.txt +lintrunner init +``` + +This will install lintrunner on your system and download all the necessary dependencies to run linters locally. + +To format local changes: + +```bash +lintrunner -a +``` + +To format all files: + +```bash +lintrunner -a --all-files +``` + ## Trademarks This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft