NVIDIA
diff --git a/‎.gitignore‎
Lines changed: 3 additions & 0 deletions b/‎.gitignore‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎.gitmodules‎
Lines changed: 3 additions & 0 deletions b/‎.gitmodules‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎.pre-commit-config.yaml‎
Lines changed: 1 addition & 1 deletion b/‎.pre-commit-config.yaml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎3rdparty/DeepGEMM‎ b/‎3rdparty/DeepGEMM‎
diff --git a/‎README.md‎
Lines changed: 4 additions & 1 deletion b/‎README.md‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎cpp/CMakeLists.txt‎
Lines changed: 7 additions & 2 deletions b/‎cpp/CMakeLists.txt‎
Lines changed: 7 additions & 2 deletions
diff --git a/‎cpp/tensorrt_llm/CMakeLists.txt‎
Lines changed: 4 additions & 0 deletions b/‎cpp/tensorrt_llm/CMakeLists.txt‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎cpp/tensorrt_llm/deep_gemm/CMakeLists.txt‎
Lines changed: 126 additions & 0 deletions b/‎cpp/tensorrt_llm/deep_gemm/CMakeLists.txt‎
Lines changed: 126 additions & 0 deletions
diff --git a/‎cpp/tensorrt_llm/deep_gemm/deep_gemm_cpp_tllm.version‎
Lines changed: 4 additions & 0 deletions b/‎cpp/tensorrt_llm/deep_gemm/deep_gemm_cpp_tllm.version‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎docs/source/blogs/tech_blog/blog_7_NGram_performance_Analysis_And_Auto_Enablement.md‎ renamed to ‎docs/source/blogs/tech_blog/blog7_NGram_performance_Analysis_And_Auto_Enablement.md‎ b/‎docs/source/blogs/tech_blog/blog_7_NGram_performance_Analysis_And_Auto_Enablement.md‎ renamed to ‎docs/source/blogs/tech_blog/blog7_NGram_performance_Analysis_And_Auto_Enablement.md‎
@@ -43,6 +43,9 @@ tensorrt_llm/bindings/**/*.pyi
 tensorrt_llm/deep_ep/
 tensorrt_llm/deep_ep_cpp_tllm.*.so
 tensorrt_llm/deep_ep_cpp_tllm.pyi
+tensorrt_llm/deep_gemm/
+tensorrt_llm/deep_gemm_cpp_tllm.*.so
+tensorrt_llm/deep_gemm_cpp_tllm.pyi
 *docs/cpp_docs*
 *docs/source/_cpp_gen*
 docs/source/**/*.rst
 
@@ -26,3 +26,6 @@
 [submodule "3rdparty/cppzmq"]
 	path = 3rdparty/cppzmq
 	url = https://github.com/zeromq/cppzmq.git
+[submodule "3rdparty/DeepGEMM"]
+	path = 3rdparty/DeepGEMM
+	url = https://github.com/deepseek-ai/DeepGEMM.git
@@ -27,7 +27,7 @@ repos:
         args: [--allow-multiple-documents]
         exclude: ".*/gitlab/.*.yml"
     -   id: trailing-whitespace
-        exclude: '\.patch$'
+        exclude: '\.(patch|md)$'
     -   id: check-toml
     -   id: mixed-line-ending
         args: [--fix=lf]
 
@@ -18,12 +18,15 @@ TensorRT-LLM
 <div align="left">
 
 ## Tech Blogs
+* [08/06] Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM
+✨ [➡️ link](./docs/source/blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.md)
+
 
 * [08/01] Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)
 ✨ [➡️ link](./docs/source/blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.md)
 
 * [07/26] N-Gram Speculative Decoding in TensorRT‑LLM
-✨ [➡️ link](./docs/source/blogs/tech_blog/blog_7_NGram_performance_Analysis_And_Auto_Enablement.md)
+✨ [➡️ link](./docs/source/blogs/tech_blog/blog7_NGram_performance_Analysis_And_Auto_Enablement.md)
 
 * [06/19] Disaggregated Serving in TensorRT-LLM
 ✨ [➡️ link](./docs/source/blogs/tech_blog/blog5_Disaggregated_Serving_in_TensorRT-LLM.md)
 
@@ -31,6 +31,7 @@ option(BUILD_PYT "Build in PyTorch TorchScript class mode" ON)
 option(BUILD_TESTS "Build Google tests" ON)
 option(BUILD_BENCHMARKS "Build benchmarks" ON)
 option(BUILD_DEEP_EP "Build the Deep EP module" ON)
+option(BUILD_DEEP_GEMM "Build the DeepGEMM module" ON)
 option(BUILD_MICRO_BENCHMARKS "Build C++ micro benchmarks" OFF)
 option(NVTX_DISABLE "Disable all NVTX features" ON)
 option(WARNING_IS_ERROR "Treat all warnings as errors" OFF)
@@ -199,7 +200,9 @@ set(TRT_LIB TensorRT::NvInfer)
 get_filename_component(TRT_LLM_ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR} PATH)
 
 set(3RDPARTY_DIR ${TRT_LLM_ROOT_DIR}/3rdparty)
-if(BINDING_TYPE STREQUAL "pybind" OR BUILD_DEEP_EP)
+if(BINDING_TYPE STREQUAL "pybind"
+   OR BUILD_DEEP_EP
+   OR BUILD_DEEP_GEMM)
   add_subdirectory(${3RDPARTY_DIR}/pybind11
                    ${CMAKE_CURRENT_BINARY_DIR}/pybind11)
 endif()
@@ -218,7 +221,9 @@ include_directories(
   ${3RDPARTY_DIR}/cutlass/tools/util/include
   ${3RDPARTY_DIR}/NVTX/include
   ${3RDPARTY_DIR}/json/include)
-if(BINDING_TYPE STREQUAL "pybind" OR BUILD_DEEP_EP)
+if(BINDING_TYPE STREQUAL "pybind"
+   OR BUILD_DEEP_EP
+   OR BUILD_DEEP_GEMM)
   include_directories(${3RDPARTY_DIR}/pybind11/include)
 endif()
 if(BINDING_TYPE STREQUAL "nanobind")
 
@@ -314,4 +314,8 @@ if(BUILD_DEEP_EP)
   add_subdirectory(deep_ep)
 endif()
 
+if(BUILD_DEEP_GEMM)
+  add_subdirectory(deep_gemm)
+endif()
+
 add_subdirectory(plugins)
@@ -0,0 +1,126 @@
+# Declared ahead of the platform guard so the target exists on every platform.
+add_custom_target(deep_gemm)
+
+# Windows: stop here; nothing else in this file is configured.
+if(WIN32)
+  return()
+endif()
+
+# Prepare files
+# =============
+
+# Absolute location of the DeepGEMM submodule checkout.
+get_filename_component(
+  DEEP_GEMM_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../../3rdparty/DeepGEMM
+  ABSOLUTE)
+
+# The submodule directory itself must be present ...
+if(NOT EXISTS ${DEEP_GEMM_SOURCE_DIR})
+  message(FATAL_ERROR
+    "DeepGEMM submodule not found at ${DEEP_GEMM_SOURCE_DIR}. Please run: git submodule update --init --recursive")
+endif()
+
+# ... and so must the cutlass headers nested inside it.
+if(NOT EXISTS ${DEEP_GEMM_SOURCE_DIR}/third-party/cutlass/include)
+  message(FATAL_ERROR
+    "DeepGEMM submodules not initialized. Please run: git submodule update --init --recursive")
+endif()
+
+# Stage the Python package in the build tree. In Python sources every
+# "deep_gemm_cpp" becomes "tensorrt_llm.deep_gemm_cpp_tllm"; other files are copied.
+set(DEEP_GEMM_PYTHON_DEST ${CMAKE_CURRENT_BINARY_DIR}/python/deep_gemm)
+# Recreate the destination from scratch on every configure run.
+file(REMOVE_RECURSE ${DEEP_GEMM_PYTHON_DEST})
+file(MAKE_DIRECTORY ${DEEP_GEMM_PYTHON_DEST})
+configure_file(${DEEP_GEMM_SOURCE_DIR}/LICENSE ${DEEP_GEMM_PYTHON_DEST}/LICENSE
+               COPYONLY)
+
+# Enumerate every file of the upstream package. CONFIGURE_DEPENDS re-evaluates
+# the glob at build time, so files added or removed upstream force a reconfigure.
+file(GLOB_RECURSE DEEP_GEMM_ALL_FILES CONFIGURE_DEPENDS
+     ${DEEP_GEMM_SOURCE_DIR}/deep_gemm/*)
+foreach(SOURCE_FILE ${DEEP_GEMM_ALL_FILES})
+  # Mirror the file's directory layout under the destination.
+  file(RELATIVE_PATH REL_PATH ${DEEP_GEMM_SOURCE_DIR}/deep_gemm ${SOURCE_FILE})
+  get_filename_component(REL_DIR ${REL_PATH} DIRECTORY)
+  file(MAKE_DIRECTORY ${DEEP_GEMM_PYTHON_DEST}/${REL_DIR})
+
+  # LAST_EXT rather than EXT: EXT starts at the first dot, so a name such as
+  # "a.b.py" would yield ".b.py" and skip the Python rewrite below.
+  get_filename_component(FILE_EXT ${SOURCE_FILE} LAST_EXT)
+  if("${FILE_EXT}" STREQUAL ".py")
+    # Redirect references to the extension module to its TensorRT-LLM name.
+    file(READ ${SOURCE_FILE} _content)
+    string(REPLACE "deep_gemm_cpp" "tensorrt_llm.deep_gemm_cpp_tllm" _content
+                   "${_content}")
+
+    # Add adaptation header
+    string(
+      PREPEND
+      _content
+      "# Adapted from https://github.com/deepseek-ai/DeepGEMM/blob/main/deep_gemm/${REL_PATH}\n"
+    )
+
+    file(WRITE ${DEEP_GEMM_PYTHON_DEST}/${REL_PATH} "${_content}")
+  else()
+    # Copy non-Python files as-is
+    file(COPY ${SOURCE_FILE} DESTINATION ${DEEP_GEMM_PYTHON_DEST}/${REL_DIR})
+  endif()
+
+  # Reconfigure when the contents of an already-listed file change.
+  set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${SOURCE_FILE})
+endforeach()
+
+# Stage the cute and cutlass header trees under the package's include directory
+set(DEEP_GEMM_INCLUDE_DEST ${DEEP_GEMM_PYTHON_DEST}/include)
+file(MAKE_DIRECTORY ${DEEP_GEMM_INCLUDE_DEST})
+file(
+  COPY ${DEEP_GEMM_SOURCE_DIR}/third-party/cutlass/include/cute
+       ${DEEP_GEMM_SOURCE_DIR}/third-party/cutlass/include/cutlass
+  DESTINATION ${DEEP_GEMM_INCLUDE_DEST})
+
+# Find torch_python
+find_library(TORCH_PYTHON_LIB torch_python REQUIRED
+             HINTS ${TORCH_INSTALL_PREFIX}/lib)
+
+# Build deep_gemm_cpp_tllm extension (matching deep_gemm's setup.py)
+set(DEEP_GEMM_SOURCES ${DEEP_GEMM_SOURCE_DIR}/csrc/python_api.cpp)
+# Linker version script that limits the symbols exported by the module.
+set(DEEP_GEMM_VERSION_SCRIPT
+    ${CMAKE_CURRENT_SOURCE_DIR}/deep_gemm_cpp_tllm.version)
+
+pybind11_add_module(deep_gemm_cpp_tllm ${DEEP_GEMM_SOURCES})
+set_target_properties(
+  deep_gemm_cpp_tllm
+  PROPERTIES CXX_STANDARD_REQUIRED ON
+             CXX_STANDARD 17
+             CXX_SCAN_FOR_MODULES OFF
+             CUDA_SEPARABLE_COMPILATION ON
+             LINK_DEPENDS ${DEEP_GEMM_VERSION_SCRIPT}
+             INSTALL_RPATH "${TORCH_INSTALL_PREFIX}/lib"
+             BUILD_WITH_INSTALL_RPATH TRUE)
+
+target_compile_options(deep_gemm_cpp_tllm PRIVATE ${TORCH_CXX_FLAGS} -std=c++17
+                                                  -O3 -fPIC -Wno-psabi)
+
+# LINK_DEPENDS only forces a relink when the script changes; the script must
+# also be handed to the linker for its global/local export list to take effect.
+target_link_options(
+  deep_gemm_cpp_tllm PRIVATE
+  "LINKER:--version-script=${DEEP_GEMM_VERSION_SCRIPT}")
+
+# Extension name definition
+target_compile_definitions(deep_gemm_cpp_tllm
+                           PRIVATE TORCH_EXTENSION_NAME=deep_gemm_cpp_tllm)
+
+# Include directories matching deep_gemm setup.py
+target_include_directories(
+  deep_gemm_cpp_tllm
+  PRIVATE ${CUDA_INCLUDE_DIRS} ${DEEP_GEMM_SOURCE_DIR}/deep_gemm/include
+          ${DEEP_GEMM_SOURCE_DIR}/third-party/cutlass/include
+          ${DEEP_GEMM_SOURCE_DIR}/third-party/fmt/include)
+
+# Link libraries (matching deep_gemm setup.py: cuda, cudart + torch)
+target_link_libraries(
+  deep_gemm_cpp_tllm PRIVATE ${TORCH_LIBRARIES} ${TORCH_PYTHON_LIB}
+                             CUDA::cuda_driver CUDA::cudart)
+
+# Link directories
+# NOTE(review): CUDA_TOOLKIT_ROOT_DIR is not set in this file; confirm that the
+# parent scope defines it, otherwise these resolve to /lib64 and /lib64/stubs.
+target_link_directories(
+  deep_gemm_cpp_tllm PRIVATE ${CUDA_TOOLKIT_ROOT_DIR}/lib64
+  ${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs)
+
+# Set targets
+# ===========
+# Building the umbrella target builds the extension module.
+add_dependencies(deep_gemm deep_gemm_cpp_tllm)
@@ -0,0 +1,4 @@
+{
+  global: PyInit_deep_gemm_cpp_tllm;
+  local: *;
+};
-Original file line number
+Diff line change
@@ @@ -0,0 +1,4 @@ @@
 +{
 +  global: PyInit_deep_gemm_cpp_tllm;
 +  local: *;
 +};