From 2eddd1817ece65c7a8d2f3807adb6b165a212387 Mon Sep 17 00:00:00 2001 From: Chenjie Luo Date: Wed, 3 Dec 2025 11:31:02 -0800 Subject: [PATCH] Re-branding TensorRT-Model-Optimizer as Nvidia Model-Optimizer Signed-off-by: Chenjie Luo --- ATTRIBUTIONS-Python.md | 4 ++-- README.md | 4 ++-- ...ing_Expert_Parallelism_in_TensorRT-LLM_part3.md | 2 +- ..._DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md | 2 +- ...pSeek_R1_Throughput_on_NVIDIA_Blackwell_GPUs.md | 2 +- docs/source/developer-guide/perf-benchmarking.md | 4 ++-- docs/source/developer-guide/perf-overview.md | 2 +- docs/source/features/auto_deploy/support_matrix.md | 2 +- docs/source/features/quantization.md | 8 ++++---- .../source/legacy/performance/perf-benchmarking.md | 2 +- docs/source/torch/auto_deploy/support_matrix.md | 2 +- docs/source/torch/features/quantization.md | 6 +++--- examples/auto_deploy/README.md | 8 ++++---- examples/disaggregated/README.md | 2 +- .../_tensorrt_engine/llm_medusa_decoding.py | 4 ++-- .../llm-api/_tensorrt_engine/quickstart_example.py | 2 +- examples/llm-api/llm_inference.py | 2 +- examples/llm-api/quickstart_example.py | 2 +- examples/medusa/README.md | 2 +- examples/models/core/deepseek_v3/README.md | 6 +++--- examples/models/core/exaone/README.md | 10 +++++----- examples/models/core/llama/README.md | 2 +- examples/models/core/llama4/README.md | 6 +++--- examples/models/core/qwen/README.md | 14 +++++++------- examples/quantization/README.md | 2 +- .../examples/models/core/mllama/poetry.lock | 2 +- security_scanning/poetry.lock | 2 +- 27 files changed, 53 insertions(+), 53 deletions(-) diff --git a/ATTRIBUTIONS-Python.md b/ATTRIBUTIONS-Python.md index f7360a7e932..4e350512a2b 100644 --- a/ATTRIBUTIONS-Python.md +++ b/ATTRIBUTIONS-Python.md @@ -25486,7 +25486,7 @@ limitations under the License. ``` ### URLs - - `Homepage`: https://github.com/NVIDIA/TensorRT-Model-Optimizer + - `Homepage`: https://github.com/NVIDIA/Model-Optimizer ## nvidia-modelopt-core (0.33.1) @@ -25513,7 +25513,7 @@ limitations under the License. 
``` ### URLs - - `Homepage`: https://github.com/NVIDIA/TensorRT-Model-Optimizer + - `Homepage`: https://github.com/NVIDIA/Model-Optimizer ## nvidia-nccl-cu12 (2.27.3) diff --git a/README.md b/README.md index f09c61783d5..208767b0377 100644 --- a/README.md +++ b/README.md @@ -164,7 +164,7 @@ state-of-the-art optimizations to perform inference efficiently on NVIDIA GPUs.< [➡️ link](https://www.bentoml.com/blog/tuning-tensor-rt-llm-for-optimal-serving-with-bentoml) -* [2024/08/20] 🏎️SDXL with #TensorRT Model Optimizer ⏱️⚡ 🏁 cache diffusion 🏁 quantization aware training 🏁 QLoRA 🏁 #Python 3.12 +* [2024/08/20] 🏎️SDXL with #Model Optimizer ⏱️⚡ 🏁 cache diffusion 🏁 quantization aware training 🏁 QLoRA 🏁 #Python 3.12 [➡️ link](https://developer.nvidia.com/blog/nvidia-tensorrt-model-optimizer-v0-15-boosts-inference-performance-and-expands-model-support/) * [2024/08/13] 🐍 DIY Code Completion with #Mamba ⚡ #TensorRT #LLM for speed 🤖 NIM for ease ☁️ deploy anywhere @@ -209,7 +209,7 @@ Technical Deep Dive for serious coders ✅+99% compression ✅1 set of weights * [2024/05/21] ✨@modal_labs has the codes for serverless @AIatMeta Llama 3 on #TensorRT #LLM ✨👀 📚 Marvelous Modal Manual: Serverless TensorRT LLM (LLaMA 3 8B) | Modal Docs [➡️ link](https://modal.com/docs/examples/trtllm_llama) -* [2024/05/08] NVIDIA TensorRT Model Optimizer -- the newest member of the #TensorRT ecosystem is a library of post-training and training-in-the-loop model optimization techniques ✅quantization ✅sparsity ✅QAT [➡️ blog](https://developer.nvidia.com/blog/accelerate-generative-ai-inference-performance-with-nvidia-tensorrt-model-optimizer-now-publicly-available/) +* [2024/05/08] NVIDIA Model Optimizer -- the newest member of the #TensorRT ecosystem is a library of post-training and training-in-the-loop model optimization techniques ✅quantization ✅sparsity ✅QAT [➡️ blog](https://developer.nvidia.com/blog/accelerate-generative-ai-inference-performance-with-nvidia-tensorrt-model-optimizer-now-publicly-available/) * [2024/05/07] 🦙🦙🦙 24,000 tokens per second 🛫Meta Llama 3 takes off with #TensorRT #LLM 📚[➡️ link](https://blogs.nvidia.com/blog/meta-llama3-inference-acceleration/) diff --git a/docs/source/blogs/tech_blog/blog14_Scaling_Expert_Parallelism_in_TensorRT-LLM_part3.md b/docs/source/blogs/tech_blog/blog14_Scaling_Expert_Parallelism_in_TensorRT-LLM_part3.md index 4b80603e29d..800c406bd22 100644 --- a/docs/source/blogs/tech_blog/blog14_Scaling_Expert_Parallelism_in_TensorRT-LLM_part3.md +++ b/docs/source/blogs/tech_blog/blog14_Scaling_Expert_Parallelism_in_TensorRT-LLM_part3.md @@ -46,7 +46,7 @@ In this third blog of our scaling Expert Parallelism (EP) series, we push the pe The wo GEMM is the final linear layer within the multi-head attention block that produces the final outputs. While DeepSeek R1's MLA modifies the initial projections for keys and values, the wo GEMM operator remains a critical and standard component for finalizing the attention computation. In the term, "wo" is the abbreviation for the weight matrix for the output. -We've evaluated that quantizing the wo GEMM to FP4 still satisfies the accuracy requirements, maintaining a similar MTP accept rate (AR) while improving end-to-end performance. 
The [NVIDIA TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer) team has published checkpoints that additionally quantize the wo module in attention layers to FP4 on HuggingFace: +We've evaluated that quantizing the wo GEMM to FP4 still satisfies the accuracy requirements, maintaining a similar MTP accept rate (AR) while improving end-to-end performance. The [NVIDIA Model Optimizer](https://github.com/NVIDIA/Model-Optimizer) team has published checkpoints that additionally quantize the wo module in attention layers to FP4 on HuggingFace: * https://huggingface.co/nvidia/DeepSeek-R1-FP4-v2 * https://huggingface.co/nvidia/DeepSeek-R1-0528-FP4-v2 diff --git a/docs/source/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md b/docs/source/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md index cd55d049d4a..b5e3e6558a6 100644 --- a/docs/source/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md +++ b/docs/source/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md @@ -67,7 +67,7 @@ We have explored a mixed precision recipe, which provides a better tradeoff betw *TensorRT LLM already supports [FP8 Attention](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/models/core/deepseek_v3#fp8-kv-cache-and-mla) while for this latency scenario low-precision attention computation doesn't help with performance so we choose to use bf16 precision for the Attention Modules. -** nvfp4 model checkpoint is generated by the [NVIDIA TensorRT Model Optimizer toolkit](https://github.com/NVIDIA/TensorRT-Model-Optimizer). +** nvfp4 model checkpoint is generated by the [NVIDIA Model Optimizer toolkit](https://github.com/NVIDIA/Model-Optimizer). *** RouterGEMM uses bf16 inputs/weights with fp32 outputs for numerical stability diff --git a/docs/source/blogs/tech_blog/blog3_Optimizing_DeepSeek_R1_Throughput_on_NVIDIA_Blackwell_GPUs.md b/docs/source/blogs/tech_blog/blog3_Optimizing_DeepSeek_R1_Throughput_on_NVIDIA_Blackwell_GPUs.md index 2da07411a83..d2483af3f3f 100644 --- a/docs/source/blogs/tech_blog/blog3_Optimizing_DeepSeek_R1_Throughput_on_NVIDIA_Blackwell_GPUs.md +++ b/docs/source/blogs/tech_blog/blog3_Optimizing_DeepSeek_R1_Throughput_on_NVIDIA_Blackwell_GPUs.md @@ -29,7 +29,7 @@ The mixed precision recipe for DeepSeek R1 throughput scenario is almost the sam * FP8 KV cache and FP8 attention, rather than BF16 precision. * FP4 Allgather for better communication bandwidth utilization. -The checkpoint used in this blog is hosted in [nvidia/DeepSeek-R1-FP4](https://huggingface.co/nvidia/DeepSeek-R1-FP4), generated by [NVIDIA Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer). The accuracy score of common dataset on this FP4 checkpoint and TensorRT LLM implementations are: +The checkpoint used in this blog is hosted in [nvidia/DeepSeek-R1-FP4](https://huggingface.co/nvidia/DeepSeek-R1-FP4), generated by [NVIDIA Model Optimizer](https://github.com/NVIDIA/Model-Optimizer). 
The accuracy scores of common datasets on this FP4 checkpoint with the TensorRT LLM implementation are:

| Precision | GPQA Diamond | MATH-500
| :-- | :-- | :-- |
diff --git a/docs/source/developer-guide/perf-benchmarking.md b/docs/source/developer-guide/perf-benchmarking.md
index 4e4e3ca4217..57ef00d8f6c 100644
--- a/docs/source/developer-guide/perf-benchmarking.md
+++ b/docs/source/developer-guide/perf-benchmarking.md
@@ -423,10 +423,10 @@ checkpoint. For the Llama-3.1 models, TensorRT LLM provides the following checkp
- [`nvidia/Llama-3.1-70B-Instruct-FP8`](https://huggingface.co/nvidia/Llama-3.1-70B-Instruct-FP8)
- [`nvidia/Llama-3.1-405B-Instruct-FP8`](https://huggingface.co/nvidia/Llama-3.1-405B-Instruct-FP8)

-To understand more about how to quantize your own checkpoints, refer to ModelOpt [documentation](https://nvidia.github.io/TensorRT-Model-Optimizer/deployment/1_tensorrt_llm.html).
+To understand more about how to quantize your own checkpoints, refer to ModelOpt [documentation](https://nvidia.github.io/Model-Optimizer/deployment/1_tensorrt_llm.html).

`trtllm-bench` utilizes the `hf_quant_config.json` file present in the pre-quantized checkpoints above. The configuration
-file is present in checkpoints quantized with [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer)
+file is present in checkpoints quantized with [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer)
and describes the compute and KV cache quantization that checkpoint was compiled with. For example, from the checkpoints
above:
diff --git a/docs/source/developer-guide/perf-overview.md b/docs/source/developer-guide/perf-overview.md
index 0a144a58d4c..aefa91fd43c 100644
--- a/docs/source/developer-guide/perf-overview.md
+++ b/docs/source/developer-guide/perf-overview.md
@@ -21,7 +21,7 @@ and shows the throughput scenario under maximum load. The reported metric is `To

The performance numbers below were collected using the steps described in this document.

-Testing was performed on models with weights quantized using [ModelOpt](https://nvidia.github.io/TensorRT-Model-Optimizer/#) and published by NVIDIA on the [Model Optimizer HuggingFace Collection](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4).
+Testing was performed on models with weights quantized using [ModelOpt](https://nvidia.github.io/Model-Optimizer/#) and published by NVIDIA on the [Model Optimizer HuggingFace Collection](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4).

*(NEW for v1.0) RTX 6000 Pro Blackwell Server Edition Benchmarks:*
diff --git a/docs/source/features/auto_deploy/support_matrix.md b/docs/source/features/auto_deploy/support_matrix.md
index 26c07b308b8..fec6d841af4 100644
--- a/docs/source/features/auto_deploy/support_matrix.md
+++ b/docs/source/features/auto_deploy/support_matrix.md
@@ -120,7 +120,7 @@ Optimize attention operations with different attention kernel implementations:

### Precision Support

-AutoDeploy supports models with various precision formats, including quantized checkpoints generated by [`TensorRT-Model-Optimizer`](https://github.com/NVIDIA/TensorRT-Model-Optimizer).
+AutoDeploy supports models with various precision formats, including quantized checkpoints generated by [`Model-Optimizer`](https://github.com/NVIDIA/Model-Optimizer).
**Supported precision types include:** diff --git a/docs/source/features/quantization.md b/docs/source/features/quantization.md index 8a0e160529f..e057a91b39c 100644 --- a/docs/source/features/quantization.md +++ b/docs/source/features/quantization.md @@ -23,7 +23,7 @@ The default PyTorch backend supports FP4 and FP8 quantization on the latest Blac ### Running Pre-quantized Models -TensorRT LLM can directly run [pre-quantized models](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4) generated with the [NVIDIA TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer). +TensorRT LLM can directly run [pre-quantized models](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4) generated with the [NVIDIA Model Optimizer](https://github.com/NVIDIA/Model-Optimizer). ```python from tensorrt_llm import LLM @@ -54,8 +54,8 @@ If a pre-quantized model is not available on the [Hugging Face Hub](https://hugg Follow this step-by-step guide to quantize a model: ```bash -git clone https://github.com/NVIDIA/TensorRT-Model-Optimizer.git -cd TensorRT-Model-Optimizer/examples/llm_ptq +git clone https://github.com/NVIDIA/Model-Optimizer.git +cd Model-Optimizer/examples/llm_ptq scripts/huggingface_example.sh --model --quant fp8 --export_fmt hf ``` @@ -108,4 +108,4 @@ FP8 block wise scaling GEMM kernels for sm100 are using MXFP8 recipe (E4M3 act/w ## Quick Links - [Pre-quantized Models by ModelOpt](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4) -- [ModelOpt Support Matrix](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/0_support_matrix.html) +- [ModelOpt Support Matrix](https://nvidia.github.io/Model-Optimizer/guides/0_support_matrix.html) diff --git a/docs/source/legacy/performance/perf-benchmarking.md b/docs/source/legacy/performance/perf-benchmarking.md index 55caef07bab..5efd6625f00 100644 --- a/docs/source/legacy/performance/perf-benchmarking.md +++ b/docs/source/legacy/performance/perf-benchmarking.md @@ -662,7 +662,7 @@ checkpoint. For the Llama-3.1 models, TensorRT-LLM provides the following checkp - [`nvidia/Llama-3.1-405B-Instruct-FP8`](https://huggingface.co/nvidia/Llama-3.1-405B-Instruct-FP8) `trtllm-bench` utilizes the `hf_quant_config.json` file present in the pre-quantized checkpoints above. The configuration -file is present in checkpoints quantized with [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer) +file is present in checkpoints quantized with [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer) and describes the compute and KV cache quantization that checkpoint was compiled with. For example, from the checkpoints above: diff --git a/docs/source/torch/auto_deploy/support_matrix.md b/docs/source/torch/auto_deploy/support_matrix.md index c8780cbca14..f0158253dda 100644 --- a/docs/source/torch/auto_deploy/support_matrix.md +++ b/docs/source/torch/auto_deploy/support_matrix.md @@ -118,7 +118,7 @@ Optimize attention operations with different attention kernel implementations: ### Precision Support -AutoDeploy supports models with various precision formats, including quantized checkpoints generated by [`TensorRT-Model-Optimizer`](https://github.com/NVIDIA/TensorRT-Model-Optimizer). +AutoDeploy supports models with various precision formats, including quantized checkpoints generated by [`Model-Optimizer`](https://github.com/NVIDIA/Model-Optimizer). 
**Supported precision types include:**

diff --git a/docs/source/torch/features/quantization.md b/docs/source/torch/features/quantization.md
index a2b6c48be21..47cc745165b 100644
--- a/docs/source/torch/features/quantization.md
+++ b/docs/source/torch/features/quantization.md
@@ -1,7 +1,7 @@
# Quantization

The PyTorch backend supports FP8 and NVFP4 quantization. You can pass quantized models in HF model hub,
-which are generated by [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer).
+which are generated by [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer).

```python
from tensorrt_llm._torch import LLM
@@ -12,7 +12,7 @@ llm.generate("Hello, my name is")
Or you can try the following commands to get a quantized model by yourself:

```bash
-git clone https://github.com/NVIDIA/TensorRT-Model-Optimizer.git
-cd TensorRT-Model-Optimizer/examples/llm_ptq
+git clone https://github.com/NVIDIA/Model-Optimizer.git
+cd Model-Optimizer/examples/llm_ptq
scripts/huggingface_example.sh --model --quant fp8 --export_fmt hf
```
diff --git a/examples/auto_deploy/README.md b/examples/auto_deploy/README.md
index c89c1a552c9..5343d88999d 100644
--- a/examples/auto_deploy/README.md
+++ b/examples/auto_deploy/README.md
@@ -90,16 +90,16 @@ python lm_eval_ad.py \
--model autodeploy --model_args model=meta-llama/Meta-Llama-3.1-8B-Instruct,world_size=2 --tasks mmlu
```

-### Mixed-precision Quantization using TensorRT Model Optimizer
+### Mixed-precision Quantization using Model Optimizer

-TensorRT Model Optimizer [AutoQuantize](https://nvidia.github.io/TensorRT-Model-Optimizer/reference/generated/modelopt.torch.quantization.model_quant.html#modelopt.torch.quantization.model_quant.auto_quantize) algorithm is a PTQ algorithm from ModelOpt which quantizes a model by searching for the best quantization format per-layer while meeting the performance constraint specified by the user. This way, `AutoQuantize` enables to trade-off model accuracy for performance.
+The Model Optimizer [AutoQuantize](https://nvidia.github.io/Model-Optimizer/reference/generated/modelopt.torch.quantization.model_quant.html#modelopt.torch.quantization.model_quant.auto_quantize) algorithm is a PTQ algorithm that quantizes a model by searching for the best quantization format per layer while meeting a user-specified performance constraint. This lets `AutoQuantize` trade off model accuracy for performance.

Currently `AutoQuantize` supports only `effective_bits` as the performance constraint (for both weight-only quantization and weight & activation quantization). See
-[AutoQuantize documentation](https://nvidia.github.io/TensorRT-Model-Optimizer/reference/generated/modelopt.torch.quantization.model_quant.html#modelopt.torch.quantization.model_quant.auto_quantize) for more details.
+[AutoQuantize documentation](https://nvidia.github.io/Model-Optimizer/reference/generated/modelopt.torch.quantization.model_quant.html#modelopt.torch.quantization.model_quant.auto_quantize) for more details.

#### 1. Quantize a model with ModelOpt

-Refer to [NVIDIA TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/main/examples/llm_autodeploy/README.md) for generating quantized model checkpoint.
+Refer to [NVIDIA Model Optimizer](https://github.com/NVIDIA/Model-Optimizer/blob/main/examples/llm_autodeploy/README.md) for generating a quantized model checkpoint.

#### 2.
Deploy the quantized model with AutoDeploy diff --git a/examples/disaggregated/README.md b/examples/disaggregated/README.md index 511bce36195..8b99f8845f6 100644 --- a/examples/disaggregated/README.md +++ b/examples/disaggregated/README.md @@ -212,7 +212,7 @@ In disaggregated serving, the context workers and generation workers have differ ### Prerequisites To enable mixed precision serving, you will need: -1. A quantized checkpoint created with [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer) +1. A quantized checkpoint created with [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer) 2. The original unquantized checkpoint (Can also be quantized) 3. Both checkpoints must use the same KV cache dtype to ensure compatibility during transfer diff --git a/examples/llm-api/_tensorrt_engine/llm_medusa_decoding.py b/examples/llm-api/_tensorrt_engine/llm_medusa_decoding.py index b6d7f90c0f5..f45411b2336 100644 --- a/examples/llm-api/_tensorrt_engine/llm_medusa_decoding.py +++ b/examples/llm-api/_tensorrt_engine/llm_medusa_decoding.py @@ -29,7 +29,7 @@ def run_medusa_decoding(use_modelopt_ckpt=False, model_dir=None): llm_kwargs = {} if use_modelopt_ckpt: - # This is a Llama-3.1-8B combined with Medusa heads provided by TensorRT Model Optimizer. + # This is a Llama-3.1-8B combined with Medusa heads provided by Model Optimizer. # Both the base model (except lm_head) and Medusa heads have been quantized in FP8. model = model_dir or "nvidia/Llama-3.1-8B-Medusa-FP8" @@ -85,7 +85,7 @@ def run_medusa_decoding(use_modelopt_ckpt=False, model_dir=None): parser.add_argument( '--use_modelopt_ckpt', action='store_true', - help="Use FP8-quantized checkpoint from TensorRT Model Optimizer.") + help="Use FP8-quantized checkpoint from Model Optimizer.") # TODO: remove this arg after ModelOpt ckpt is public on HF parser.add_argument('--model_dir', type=Path, default=None) args = parser.parse_args() diff --git a/examples/llm-api/_tensorrt_engine/quickstart_example.py b/examples/llm-api/_tensorrt_engine/quickstart_example.py index a6ba9ec5598..d02f55c46b3 100644 --- a/examples/llm-api/_tensorrt_engine/quickstart_example.py +++ b/examples/llm-api/_tensorrt_engine/quickstart_example.py @@ -9,7 +9,7 @@ def main(): build_config.max_num_tokens = 1024 # Model could accept HF model name, a path to local HF model, - # or TensorRT Model Optimizer's quantized checkpoints like nvidia/Llama-3.1-8B-Instruct-FP8 on HF. + # or Model Optimizer's quantized checkpoints like nvidia/Llama-3.1-8B-Instruct-FP8 on HF. llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", build_config=build_config) diff --git a/examples/llm-api/llm_inference.py b/examples/llm-api/llm_inference.py index 5146504d25d..6c806f07685 100644 --- a/examples/llm-api/llm_inference.py +++ b/examples/llm-api/llm_inference.py @@ -7,7 +7,7 @@ def main(): # Model could accept HF model name, a path to local HF model, - # or TensorRT Model Optimizer's quantized checkpoints like nvidia/Llama-3.1-8B-Instruct-FP8 on HF. + # or Model Optimizer's quantized checkpoints like nvidia/Llama-3.1-8B-Instruct-FP8 on HF. llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0") # Sample prompts. 
diff --git a/examples/llm-api/quickstart_example.py b/examples/llm-api/quickstart_example.py index 400a241c0e9..2d6f14012bd 100644 --- a/examples/llm-api/quickstart_example.py +++ b/examples/llm-api/quickstart_example.py @@ -4,7 +4,7 @@ def main(): # Model could accept HF model name, a path to local HF model, - # or TensorRT Model Optimizer's quantized checkpoints like nvidia/Llama-3.1-8B-Instruct-FP8 on HF. + # or Model Optimizer's quantized checkpoints like nvidia/Llama-3.1-8B-Instruct-FP8 on HF. llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0") # Sample prompts. diff --git a/examples/medusa/README.md b/examples/medusa/README.md index eb442554ec4..7820335cd0f 100644 --- a/examples/medusa/README.md +++ b/examples/medusa/README.md @@ -19,7 +19,7 @@ For more info about Medusa visit [speculative decoding documentation](https://nv The TensorRT LLM Medusa example code is located in [`examples/medusa`](./). There is one [`convert_checkpoint.py`](./convert_checkpoint.py) file to convert and build the [TensorRT](https://developer.nvidia.com/tensorrt) engine(s) needed to run models with Medusa decoding support. In this example, we demonstrate the usage of two models: 1. The Vucuna 7B model from Hugging Face [`FasterDecoding/medusa-vicuna-7b-v1.3`](https://huggingface.co/FasterDecoding/medusa-vicuna-7b-v1.3) with its Medusa heads [`medusa-vicuna-7b-v1.3`](https://huggingface.co/FasterDecoding/medusa-vicuna-7b-v1.3). -2. The quantized checkpoint [`nvidia/Llama-3.1-8B-Medusa-FP8`](https://huggingface.co/nvidia/Llama-3.1-8B-Medusa-FP8) on Hugging Face by [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer) (ModelOpt). This model is based on [Llama-3.1 8B](https://huggingface.co/meta-llama/Llama-3.1-8B) and enhanced with Medusa heads, with both the base model (except lm_head) and Medusa heads already quantized in FP8. +2. The quantized checkpoint [`nvidia/Llama-3.1-8B-Medusa-FP8`](https://huggingface.co/nvidia/Llama-3.1-8B-Medusa-FP8) on Hugging Face by [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer) (ModelOpt). This model is based on [Llama-3.1 8B](https://huggingface.co/meta-llama/Llama-3.1-8B) and enhanced with Medusa heads, with both the base model (except lm_head) and Medusa heads already quantized in FP8. ### Build TensorRT engine(s) Get the weights by downloading base model [`vicuna-7b-v1.3`](https://huggingface.co/lmsys/vicuna-7b-v1.3) and Medusa Heads [`medusa-vicuna-7b-v1.3`](https://huggingface.co/FasterDecoding/medusa-vicuna-7b-v1.3) from HF. diff --git a/examples/models/core/deepseek_v3/README.md b/examples/models/core/deepseek_v3/README.md index 3e824425630..934db2e4939 100644 --- a/examples/models/core/deepseek_v3/README.md +++ b/examples/models/core/deepseek_v3/README.md @@ -773,7 +773,7 @@ You can enable FP8 MLA through either of these methods: **Option 1: Checkpoint config** -TensorRT LLM automatically detects the `hf_quant_config.json` file in the model directory, which configures both GEMM and KV cache quantization. For example, see the FP4 DeepSeek-R1 checkpoint [configuration](https://huggingface.co/nvidia/DeepSeek-R1-FP4/blob/main/hf_quant_config.json) provided by [ModelOpt](https://github.com/NVIDIA/TensorRT-Model-Optimizer). +TensorRT LLM automatically detects the `hf_quant_config.json` file in the model directory, which configures both GEMM and KV cache quantization. 
For example, see the FP4 DeepSeek-R1 checkpoint [configuration](https://huggingface.co/nvidia/DeepSeek-R1-FP4/blob/main/hf_quant_config.json) provided by [ModelOpt](https://github.com/NVIDIA/Model-Optimizer).

To enable FP8 MLA, modify the `kv_cache_quant_algo` property. The following shows the config for DeepSeek's block-wise FP8 GEMM quantization + FP8 MLA:
@@ -808,14 +808,14 @@ Or you can follow the steps to generate one by yourselves.

#### Activation calibration

-[ModelOpt](https://github.com/NVIDIA/TensorRT-Model-Optimizer) is used for calibrating activations of MoE layers. We provide a calibrated file at [HF model hub](https://huggingface.co/Barrrrry/DeepSeek-R1-W4AFP8/blob/main/act_scales.safetensors) or you can run the following commands to generate by yourselves.
+[ModelOpt](https://github.com/NVIDIA/Model-Optimizer) is used for calibrating activations of MoE layers. We provide a calibrated file at [HF model hub](https://huggingface.co/Barrrrry/DeepSeek-R1-W4AFP8/blob/main/act_scales.safetensors) or you can run the following commands to generate it yourself.

```bash
# Make sure for enough GPU resources (8xH200s) to run the following commands
PATH_OF_DEEPSEEK_R1=/llm-models/DeepSeek-R1/DeepSeek-R1

# Install ModelOpt from source
-git clone https://github.com/NVIDIA/TensorRT-Model-Optimizer/ && cd modelopt
+git clone https://github.com/NVIDIA/Model-Optimizer/ && cd Model-Optimizer
pip install "nvidia-modelopt[all]" -U --extra-index-url https://pypi.nvidia.com

# Clone DeepSeek-V3 (base model of R1) Github repository for FP8 inference,
diff --git a/examples/models/core/exaone/README.md b/examples/models/core/exaone/README.md
index 549b83843aa..9ea4a9e71d1 100644
--- a/examples/models/core/exaone/README.md
+++ b/examples/models/core/exaone/README.md
@@ -85,17 +85,17 @@ The output will be like:

#### PyTorch flow Quantization

-For PyTorch flow, TRT-LLM supports quantized format generated by [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer).
+For PyTorch flow, TRT-LLM supports quantized formats generated by [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer).
You can either do pre-quantized models in HF model hub, or can generate quantized model by yourself and then run models with below command:

```bash
-git clone https://github.com/NVIDIA/TensorRT-Model-Optimizer.git
-cd TensorRT-Model-Optimizer/examples/llm_ptq
+git clone https://github.com/NVIDIA/Model-Optimizer.git
+cd Model-Optimizer/examples/llm_ptq
scripts/huggingface_example.sh --model hf_models/$MODEL_NAME --quant fp8 --export_fmt hf
```

-For more information, please refer to official [docs](https://github.com/NVIDIA/TensorRT-Model-Optimizer) or [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer).
+For more information, please refer to the official [Model Optimizer documentation](https://github.com/NVIDIA/Model-Optimizer).

Troubleshooting
@@ -107,7 +107,7 @@ Hint: Move the offending context manager(s) to outside the compiled region.
Hint: This graph break may have been caused by an earlier graph break. Resolving the earlier graph break may resolve this one.
```

-This error may indicate an incompatibility between `torch.compile()` and the `HybridCache` module of the transformers library. As a result, [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer) (ModelOpt) cannot perform PTQ with HybridCache.
+This error may indicate an incompatibility between `torch.compile()` and the `HybridCache` module of the transformers library. As a result, [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer) (ModelOpt) cannot perform PTQ with HybridCache.
Temporarily switching to `DynamicCache` when creating PTQ models could help address the issue. This can be done by updating the `cache_implementation` field in the `generation_config.json` file located in the model checkpoint directory, for example:

```json
diff --git a/examples/models/core/llama/README.md b/examples/models/core/llama/README.md
index 464fe8bdf34..df26ac1ad69 100644
--- a/examples/models/core/llama/README.md
+++ b/examples/models/core/llama/README.md
@@ -1559,7 +1559,7 @@ Explanation:

### Launch trtllm-serve OpenAI-compatible API server

-TensorRT LLM supports nvidia TensorRT Model Optimizer quantized FP8 checkpoint
+TensorRT LLM supports NVIDIA Model Optimizer quantized FP8 checkpoints
``` bash
trtllm-serve nvidia/Llama-3.3-70B-Instruct-FP8 \
--tp_size 8 \
diff --git a/examples/models/core/llama4/README.md b/examples/models/core/llama4/README.md
index 93e37788640..a6c02070e90 100644
--- a/examples/models/core/llama4/README.md
+++ b/examples/models/core/llama4/README.md
@@ -42,7 +42,7 @@ Explanation:

#### 2. Launch trtllm-serve OpenAI-compatible API server

-TensorRT LLM supports nvidia TensorRT Model Optimizer quantized FP8 checkpoint
+TensorRT LLM supports NVIDIA Model Optimizer quantized FP8 checkpoints
``` bash
trtllm-serve nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8 \
--max_batch_size 512 \
@@ -94,7 +94,7 @@ Explanation:

#### 2. Launch trtllm-serve OpenAI-compatible API server

-TensorRT LLM supports nvidia TensorRT Model Optimizer quantized FP8 checkpoint.
+TensorRT LLM supports NVIDIA Model Optimizer quantized FP8 checkpoints.
``` bash
trtllm-serve nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8 \
--max_batch_size 8 \
@@ -140,7 +140,7 @@ Explanation:

#### 2. Launch trtllm-serve OpenAI-compatible API server

-TensorRT LLM supports nvidia TensorRT Model Optimizer quantized FP8 checkpoint.
+TensorRT LLM supports NVIDIA Model Optimizer quantized FP8 checkpoints.
``` bash
trtllm-serve nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8 \
--tp_size 8 \
diff --git a/examples/models/core/qwen/README.md b/examples/models/core/qwen/README.md
index 52a5ecb481d..1d3d97b2679 100644
--- a/examples/models/core/qwen/README.md
+++ b/examples/models/core/qwen/README.md
@@ -663,19 +663,19 @@ trtllm-eval --model=Qwen3-30B-A3B/ --tokenizer=Qwen3-30B-A3B/ --backend=pytorch

To quantize the Qwen3 model for use with the PyTorch backend, we'll use NVIDIA's Model Optimizer (ModelOpt) tool. Follow these steps:

```bash
-# Clone the TensorRT Model Optimizer (ModelOpt)
-git clone https://github.com/NVIDIA/TensorRT-Model-Optimizer.git
-pushd TensorRT-Model-Optimizer
+# Clone the Model Optimizer (ModelOpt)
+git clone https://github.com/NVIDIA/Model-Optimizer.git
+pushd Model-Optimizer

# install the ModelOpt
pip install -e .

# Quantize the Qwen3-235B-A22B model by nvfp4
-# By default, the checkpoint would be stored in `TensorRT-Model-Optimizer/examples/llm_ptq/saved_models_Qwen3-235B-A22B_nvfp4_hf/`.
+# By default, the checkpoint would be stored in `Model-Optimizer/examples/llm_ptq/saved_models_Qwen3-235B-A22B_nvfp4_hf/`.
./examples/llm_ptq/scripts/huggingface_example.sh --model Qwen3-235B-A22B/ --quant nvfp4 --export_fmt hf # Quantize the Qwen3-32B model by fp8_pc_pt -# By default, the checkpoint would be stored in `TensorRT-Model-Optimizer/examples/llm_ptq/saved_models_Qwen3-32B_fp8_pc_pt_hf/`. +# By default, the checkpoint would be stored in `Model-Optimizer/examples/llm_ptq/saved_models_Qwen3-32B_fp8_pc_pt_hf/`. ./examples/llm_ptq/scripts/huggingface_example.sh --model Qwen3-32B/ --quant fp8_pc_pt --export_fmt hf popd ``` @@ -687,7 +687,7 @@ To run the benchmark, we suggest using the `trtllm-bench` tool. Please refer to ```bash #!/bin/bash -folder_model=TensorRT-Model-Optimizer/examples/llm_ptq/saved_models_Qwen3-235B-A22B_nvfp4_hf/ +folder_model=Model-Optimizer/examples/llm_ptq/saved_models_Qwen3-235B-A22B_nvfp4_hf/ path_config=extra-llm-api-config.yml num_gpus=8 ep_size=8 @@ -727,7 +727,7 @@ trtllm-bench --model ${folder_model} --model_path ${folder_model} throughput \ We suggest benchmarking with a real dataset. It will prevent from having improperly distributed tokens in the MoE. Here, we use the `aa_prompt_isl_1k_osl_2k_qwen3_10000samples.txt` dataset. It has 10000 samples with an average input length of 1024 and an average output length of 2048. If you don't have a dataset (this or an other) and you want to run the benchmark, you can use the following command to generate a random dataset: ```bash -folder_model=TensorRT-Model-Optimizer/examples/llm_ptq/saved_models_Qwen3-235B-A22B_nvfp4_hf/ +folder_model=Model-Optimizer/examples/llm_ptq/saved_models_Qwen3-235B-A22B_nvfp4_hf/ min_input_len=1024 min_output_len=2048 concurrency=128 diff --git a/examples/quantization/README.md b/examples/quantization/README.md index e74736b61b8..b3b2e35b20f 100644 --- a/examples/quantization/README.md +++ b/examples/quantization/README.md @@ -11,7 +11,7 @@ The detailed LLM quantization recipe is distributed to the README.md of the corr ## Installation -The NVIDIA TensorRT Model Optimizer quantization toolkit is installed automatically as a dependency of TensorRT-LLM. +The NVIDIA Model Optimizer quantization toolkit is installed automatically as a dependency of TensorRT-LLM. ```bash # Install the additional requirements diff --git a/security_scanning/examples/models/core/mllama/poetry.lock b/security_scanning/examples/models/core/mllama/poetry.lock index 11e0ed3ccb7..c58e7c12b52 100644 --- a/security_scanning/examples/models/core/mllama/poetry.lock +++ b/security_scanning/examples/models/core/mllama/poetry.lock @@ -708,7 +708,7 @@ files = [ [[package]] name = "nvidia-modelopt" version = "0.21.1" -description = "Nvidia TensorRT Model Optimizer: a unified model optimization and deployment toolkit." +description = "Nvidia Model Optimizer: a unified model optimization and deployment toolkit." optional = false python-versions = "<3.13,>=3.8" files = [ diff --git a/security_scanning/poetry.lock b/security_scanning/poetry.lock index 18ed93657e1..e5959abf84c 100644 --- a/security_scanning/poetry.lock +++ b/security_scanning/poetry.lock @@ -2793,7 +2793,7 @@ files = [ [[package]] name = "nvidia-modelopt" version = "0.37.0" -description = "Nvidia TensorRT Model Optimizer: a unified model optimization and deployment toolkit." +description = "Nvidia Model Optimizer: a unified model optimization and deployment toolkit." optional = false python-versions = "<3.13,>=3.10" files = [