Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -667,7 +667,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()

if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f" "${CUDA_ARCHS}")
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
endif()
Expand Down
4 changes: 2 additions & 2 deletions csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
Original file line number Diff line number Diff line change
Expand Up @@ -254,15 +254,15 @@ void cutlass_moe_mm(
bool per_act_token, bool per_out_ch) {
int32_t version_num = get_sm_version_num();
#if defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100
if (version_num >= 100) {
if (version_num >= 100 && version_num < 110) {
cutlass_moe_mm_sm100(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
expert_offsets, problem_sizes, a_strides, b_strides,
c_strides, per_act_token, per_out_ch);
return;
}
#endif
#if defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90
if (version_num >= 90) {
if (version_num >= 90 && version_num < 100) {
cutlass_moe_mm_sm90(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
expert_offsets, problem_sizes, a_strides, b_strides,
c_strides, per_act_token, per_out_ch);
Expand Down
20 changes: 20 additions & 0 deletions vllm/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2747,6 +2747,8 @@ def __post_init__(self):
self.measure()

def measure(self):
from vllm.platforms import current_platform

# we measure the torch peak memory usage via allocated_bytes,
# rather than `torch.cuda.memory_reserved()` .
# After `torch.cuda.reset_peak_memory_stats()`,
Expand All @@ -2756,6 +2758,24 @@ def measure(self):
"allocated_bytes.all.peak", 0)

self.free_memory, self.total_memory = torch.cuda.mem_get_info()
shared_sysmem_device_mem_sms = (
(8, 7), (11, 0), (12, 1)) # Orin, Thor, Spark
if current_platform.is_cuda() and \
current_platform.get_device_capability() in \
shared_sysmem_device_mem_sms:
# On UMA (Orin, Thor and Spark) platform,
# where both CPU and GPU rely on system memory,
# the cudaMemGetInfo function shows the amount of free system memory
# rather than what’s actually available.
# In the case,
# torch.cuda.mem_get_info() only reports "free" memory,
# which can be lower than what is actually
# available due to not including cache memory.
# There’s also a comprehensive reference page
# that explains how you can compute the proper value yourself.
# https://docs.nvidia.com/cuda/cuda-for-tegra-appnote/#estimating-total-allocatable-device-memory-on-an-integrated-gpu-device
self.free_memory = psutil.virtual_memory().available
Comment thread
ProExpertProg marked this conversation as resolved.

self.cuda_memory = self.total_memory - self.free_memory

# torch.cuda.memory_reserved() is how many bytes
Expand Down