diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
new file mode 100644
index 000000000..a1b8c903b
--- /dev/null
+++ b/.github/workflows/python-package.yml
@@ -0,0 +1,201 @@
+name: Python package
+
+on:
+  push:
+    branches: [ "*" ]
+  pull_request:
+    branches: [ master ]
+  release:
+    types: [ published ]
+
+jobs:
+
+  ##
+  # This job matrix builds the non-CUDA versions of the libraries for all supported platforms.
+  ##
+  build-shared-libs:
+    strategy:
+      matrix:
+        os: [ubuntu-latest, macos-latest, windows-latest]
+        arch: [x86_64, aarch64]
+        exclude:
+          - os: windows-latest # This probably requires arm64 Windows agents
+            arch: aarch64
+    runs-on: ${{ matrix.os }} # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents
+    steps:
+      # Check out code
+      - uses: actions/checkout@v3
+      # On Linux we use CMake within Docker
+      - name: Setup cmake
+        uses: jwlawson/actions-setup-cmake@v1.13
+        with:
+          cmake-version: '3.26.x'
+      - name: Add msbuild to PATH
+        uses: microsoft/setup-msbuild@v1.1
+        if: ${{ startsWith(matrix.os, 'windows') }}
+      # Compile C++ code
+      - name: Build C++
+        shell: bash
+        run: |
+          set -ex
+          build_os=${{ matrix.os }}
+          build_arch=${{ matrix.arch }}
+          ( git clone https://github.com/NVlabs/cub ./dependencies/cub; cd dependencies/cub; git checkout 1.11.0 )
+          if [ ${build_os:0:6} == ubuntu -a ${build_arch} == aarch64 ]; then
+            # Allow cross-compile on aarch64
+            sudo apt-get install -y gcc-aarch64-linux-gnu binutils-aarch64-linux-gnu
+          fi
+          if [ ${build_os:0:5} == macos -a ${build_arch} == aarch64 ]; then
+            cmake -DCMAKE_OSX_ARCHITECTURES=arm64 -DENABLE_CUDA=OFF -DENABLE_MPS=ON .
+          else
+            cmake -DENABLE_CUDA=OFF .
+          fi
+          if [ ${build_os:0:7} == windows ]; then
+            pwsh -Command "msbuild bitsandbytes.vcxproj /property:Configuration=Release"
+          else
+            make
+          fi
+          mkdir -p output/${{ matrix.os }}/${{ matrix.arch }}
+          ( shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} output/${{ matrix.os }}/${{ matrix.arch }}/ )
+      - name: Upload build artifact
+        uses: actions/upload-artifact@v3
+        with:
+          name: shared_library
+          path: output/*
+          retention-days: 7
+  ##
+  # This job matrix builds the CUDA versions of the libraries for platforms that support CUDA (Linux x64/aarch64 + Windows x64)
+  ##
+  build-shared-libs-cuda:
+    strategy:
+      matrix:
+        os: [ubuntu-latest, windows-latest]
+        arch: [x86_64, aarch64]
+        cuda_version: ['12.1.0']
+        exclude:
+          - os: windows-latest # This probably requires arm64 Windows agents
+            arch: aarch64
+    runs-on: ${{ matrix.os }} # One day, we could run them on native agents.
Azure supports this now but it's planned only for Q3 2023 for hosted agents + steps: + # Check out code + - uses: actions/checkout@v3 + # Linux: We use Docker to build cross platform Cuda (aarch64 is built in emulation) + - name: Set up Docker multiarch + if: startsWith(matrix.os, 'ubuntu') + uses: docker/setup-qemu-action@v2 + # On Linux we use CMake within Docker + - name: Setup cmake + if: ${{ !startsWith(matrix.os, 'linux') }} + uses: jwlawson/actions-setup-cmake@v1.13 + with: + cmake-version: '3.26.x' + # Windows: We install Cuda on the agent (slow) + - uses: Jimver/cuda-toolkit@v0.2.10 + if: startsWith(matrix.os, 'windows') + id: cuda-toolkit + with: + cuda: ${{ matrix.cuda_version }} + method: 'local' + #sub-packages: '["nvcc","cudart","nvrtc_dev","cublas_dev","cusparse_dev","visual_studio_integration"]' + - name: Add msbuild to PATH + uses: microsoft/setup-msbuild@v1.1 + if: ${{ startsWith(matrix.os, 'windows') }} + # Compile C++ code + - name: Build C++ + shell: bash + run: | + set -ex + build_os=${{ matrix.os }} + build_arch=${{ matrix.arch }} + ( git clone https://github.com/NVlabs/cub ./dependencies/cub; cd dependencies/cub; git checkout 1.11.0 ) + if [ ${build_os:0:6} == ubuntu ]; then + image=nvidia/cuda:${{ matrix.cuda_version }}-devel-ubuntu22.04 + echo "Using image $image" + docker run --platform linux/$build_arch -i -w /src -v $PWD:/src $image sh -c \ + "apt-get update \ + && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \ + && cmake -DENABLE_CUDA=ON . \ + && make" + else + cmake -DENABLE_CUDA=ON . + pwsh -Command "msbuild bitsandbytes.vcxproj /property:Configuration=Release" + fi + mkdir -p output/${{ matrix.os }}/${{ matrix.arch }} + ( shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} output/${{ matrix.os }}/${{ matrix.arch }}/ ) + - name: Upload build artifact + uses: actions/upload-artifact@v3 + with: + name: shared_library + path: output/* + retention-days: 7 + build-wheels: + needs: + - build-shared-libs + - build-shared-libs-cuda + strategy: + matrix: + os: [ubuntu-latest, macos-latest, windows-latest] + python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] + arch: [x86_64, aarch64] + exclude: + - os: windows-latest # This probably requres arm64 Windows agents + arch: aarch64 + runs-on: ${{ matrix.os }} + steps: + # Check out code + - uses: actions/checkout@v3 + # Download shared libraries + - name: Download build artifact + uses: actions/download-artifact@v3 + with: + name: shared_library + path: output/ + - name: Copy correct platform shared library + shell: bash + run: | + cp output/${{ matrix.os }}/${{ matrix.arch }}/* bitsandbytes/ + # Compile C++ code + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + # + - name: Install Python dependencies + shell: bash + run: | + pip install -r requirements.txt + # TODO: How to run CUDA tests on GitHub actions? + #- name: Run unit tests + # if: ${{ matrix.arch == 'x86_64' }} # Tests are too slow to run in emulation. Wait for real aarch64 agents + # run: | + # PYTHONPATH=. 
pytest --log-cli-level=DEBUG tests + - name: Build wheel + shell: bash + run: | + python setup.py bdist_wheel + - name: Upload build artifact + uses: actions/upload-artifact@v3 + with: + name: bdist_wheel + path: dist/bitsandbytes-*.whl + retention-days: 7 + publish: + needs: build-wheels + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Build dist + run: | + python setup.py sdist + - name: Download build artifact + uses: actions/download-artifact@v3 + with: + name: bdist_wheel + path: dist/ + - run: | + ls -lR dist/ + - name: Publish to PyPi + if: startsWith(github.ref, 'refs/tags') + uses: pypa/gh-action-pypi-publish@release/v1 + with: + password: ${{ secrets.pypi }} diff --git a/.gitignore b/.gitignore index 2f929968b..202dcb13d 100644 --- a/.gitignore +++ b/.gitignore @@ -2,9 +2,26 @@ __pycache__/ *.py[cod] *$py.class - -# C extensions *.so +*.dll +*.dylib +*.o +*.obj +*.air +*.metallib + +# CMake generated files +CMakeCache.txt +CMakeScripts/ +cmake_install.cmake +Makefile +CMakeFiles/ +*.sln +*.vcxproj* +*.xcodeproj/ +bitsandbytes.dir/ +Debug/ +Release/ # Distribution / packaging .Python @@ -133,4 +150,5 @@ dmypy.json dependencies cuda_build +output/ .vscode/* diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 000000000..d6e269d15 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,121 @@ +cmake_minimum_required(VERSION 3.22.1) + +option(ENABLE_CUDA "Build for CUDA (Nvidia)" OFF) +option(ENABLE_MPS "Build for Metal Performance Shaders (Apple)" OFF) + +if(ENABLE_CUDA) + if(APPLE) + message(FATAL_ERROR "CUDA is not supported on macOS" ) + endif() + option(NO_CUBLASLT "Don't use CUBLAST" OFF) + if(NO_CUBLASLT) + set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72) + else() + set(CMAKE_CUDA_ARCHITECTURES 75 80 86 89 90) + endif() +endif() + +if(ENABLE_CUDA) + message("Building CUDA support for ${CMAKE_CUDA_ARCHITECTURES}") + # Find CUDA tools if we are compiling with CUDA + find_package(CUDAToolkit REQUIRED) + if(NO_CUBLASLT) + set(LIBSUFFIX "cuda${CUDAToolkit_VERSION_MAJOR}${CUDAToolkit_VERSION_MINOR}_nocublaslt") + else() + set(LIBSUFFIX "cuda${CUDAToolkit_VERSION_MAJOR}${CUDAToolkit_VERSION_MINOR}") + endif() + + project(bitsandbytes LANGUAGES CXX CUDA) + add_compile_definitions(BUILD_CUDA) + set(CMAKE_CUDA_STANDARD 14) + set(CMAKE_CUDA_STANDARD_REQUIRED ON) + set(GPU_SOURCES csrc/ops.cu csrc/kernels.cu) +elseif(ENABLE_MPS) + if(NOT APPLE) + message(FATAL_ERROR "MPS is only supported on macOS" ) + endif() + message("Building MPS support") + set(LIBSUFFIX "mps") + project(bitsandbytes LANGUAGES CXX OBJCXX) + add_compile_definitions(BUILD_MPS) + set(METAL_SOURCES csrc/mps_kernels.metal) + file(MAKE_DIRECTORY "build") + add_custom_command(OUTPUT "bitsandbytes/bitsandbytes.metallib" + COMMAND xcrun metal -c -o "build/bitsandbytes.air" ${METAL_SOURCES} + COMMAND xcrun metallib "build/bitsandbytes.air" -o "bitsandbytes/bitsandbytes.metallib" + DEPENDS "${METAL_SOURCES}" + COMMENT "Compiling Metal kernels" + VERBATIM) + add_custom_target(metallib DEPENDS "bitsandbytes/bitsandbytes.metallib") + set(GPU_SOURCES csrc/mps_ops.mm) +else() + message("Building with CPU only") + set(LIBSUFFIX "cpu") + + project(bitsandbytes LANGUAGES CXX) + set(GPU_SOURCES) +endif() + +if(APPLE) + set(CMAKE_OSX_DEPLOYMENT_TARGET 13.1) +endif() +set(CMAKE_CXX_STANDARD 14) +set(CXX_STANDARD_REQUIRED C++14) + +if(WIN32) + # Mute warnings + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -diag-suppress=177") + + # Enable fast math on VC++ + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /fp:fast") + 
+ # Export all symbols + set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) +endif() + +# Weird MSVC hacks +if(MSVC) + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /NODEFAULTLIB:msvcprtd /NODEFAULTLIB:MSVCRTD /NODEFAULTLIB:LIBCMT") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2") +endif() + +# Add csrc files +add_library(bitsandbytes SHARED + ${GPU_SOURCES} + csrc/common.cpp + csrc/cpu_ops.cpp + csrc/pythonInterface.cpp) + +target_include_directories(bitsandbytes PUBLIC + ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} + ${CMAKE_CURRENT_SOURCE_DIR}/csrc + ${CMAKE_CURRENT_SOURCE_DIR}/include) + +if(ENABLE_CUDA) + target_include_directories(bitsandbytes PUBLIC ${CUDA_TOOLKIT_ROOT_DIR}/include) + + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --use_fast_math") + + set_target_properties( + bitsandbytes + PROPERTIES + CUDA_SEPARABLE_COMPILATION ON) + + target_link_libraries(bitsandbytes CUDA::cudart CUDA::cublas CUDA::cublasLt CUDA::cusparse) +endif() +if(ENABLE_MPS) + add_dependencies(bitsandbytes metallib) + target_link_libraries(bitsandbytes objc "-framework Foundation" "-framework Metal" "-framework MetalPerformanceShaders" "-framework MetalPerformanceShadersGraph") +endif() + +set_target_properties(bitsandbytes PROPERTIES OUTPUT_NAME "bitsandbytes_${LIBSUFFIX}") +# Set the output name of the CUDA library +if(MSVC) +set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_RELEASE bitsandbytes) +set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_DEBUG bitsandbytes) +set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELEASE bitsandbytes) +set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_DEBUG bitsandbytes) +endif() + +set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY bitsandbytes) diff --git a/Makefile b/Makefile.previous similarity index 77% rename from Makefile rename to Makefile.previous index 5f997a122..e45b940c8 100644 --- a/Makefile +++ b/Makefile.previous @@ -14,6 +14,33 @@ CUDA_VERSION:= endif endif +NATIVE_ARCH:=$(shell (arch | sed -e s/arm64/aarch64/)) +NATIVE_OS:=$(shell uname) +SHLIB_EXTENSION:=.so +ifeq ($(TARGET_ARCH),) + TARGET_ARCH:=$(NATIVE_ARCH) +endif +ifeq ($(TARGET_ARCH),aarch64) + EXTRA_FLAGS:= +else + EXTRA_FLAGS:=-msse3 -mavx2 +endif +ifeq ($(NATIVE_OS),Darwin) + TARGET_OS:=apple-darwin + SHLIB_EXTENSION:=.dylib +else + ifeq ($(OS),Windows_NT) + TARGET_OS:=windows + GPP:=cl.exe + SHLIB_EXTENSION:=.dll + else + TARGET_OS:=linux-gnu + endif +endif +ifneq ($(TARGET_ARCH),$(NATIVE_ARCH)) + EXTRA_FLAGS:=$(EXTRA_FLAGS) --target=$(TARGET_ARCH)-apple-darwin +endif + NVCC := $(CUDA_HOME)/bin/nvcc @@ -24,7 +51,7 @@ CSRC := $(ROOT_DIR)/csrc BUILD_DIR:= $(ROOT_DIR)/build FILES_CUDA := $(CSRC)/ops.cu $(CSRC)/kernels.cu -FILES_CPP := $(CSRC)/common.cpp $(CSRC)/cpu_ops.cpp $(CSRC)/pythonInterface.c +FILES_CPP := $(CSRC)/common.cpp $(CSRC)/cpu_ops.cpp $(CSRC)/pythonInterface.cpp INCLUDE := -I $(CUDA_HOME)/include -I $(ROOT_DIR)/csrc -I $(CONDA_PREFIX)/include -I $(ROOT_DIR)/include LIB := -L $(CUDA_HOME)/lib64 -lcudart -lcublas -lcublasLt -lcusparse -L $(CONDA_PREFIX)/lib @@ -64,12 +91,12 @@ all: $(BUILD_DIR) env cuda110_nomatmul_kepler: $(BUILD_DIR) env $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) $(CC_KEPLER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) $(CC_KEPLER) -Xcompiler '-fPIC' -dlink 
$(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB) + $(GPP) $(EXTRA_FLAGS) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt$(SHLIB_EXTENSION) $(LIB) cuda11x_nomatmul_kepler: $(BUILD_DIR) env $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_KEPLER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_KEPLER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB) + $(GPP) $(EXTRA_FLAGS) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt$(SHLIB_EXTENSION) $(LIB) cuda110_nomatmul: $(BUILD_DIR) env @@ -90,17 +117,17 @@ cuda118_nomatmul: $(BUILD_DIR) env cuda12x_nomatmul: $(BUILD_DIR) env $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB) + $(GPP) $(EXTRA_FLAGS) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt$(SHLIB_EXTENSION) $(LIB) cuda110: $(BUILD_DIR) env $(NVCC) $(CC_cublasLt110) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) $(NVCC) $(CC_cublasLt110) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) + $(GPP) $(EXTRA_FLAGS) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)$(SHLIB_EXTENSION) $(LIB) cuda11x: $(BUILD_DIR) env $(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) $(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) + $(GPP) $(EXTRA_FLAGS) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o 
$(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)$(SHLIB_EXTENSION) $(LIB) cuda118: $(BUILD_DIR) env $(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) @@ -110,10 +137,13 @@ cuda118: $(BUILD_DIR) env cuda12x: $(BUILD_DIR) env $(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) $(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) + $(GPP) $(EXTRA_FLAGS) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)$(SHLIB_EXTENSION) $(LIB) cpuonly: $(BUILD_DIR) env - $(GPP) -std=c++14 -shared -fPIC -I $(ROOT_DIR)/csrc -I $(ROOT_DIR)/include $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cpu.so + $(GPP) $(EXTRA_FLAGS) -std=c++14 -shared -fPIC -I $(ROOT_DIR)/csrc -I $(ROOT_DIR)/include $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cpu$(SHLIB_EXTENSION) + +version: + $(GPP) --version env: @echo "ENVIRONMENT" diff --git a/bitsandbytes/__main__.py b/bitsandbytes/__main__.py index 523d02301..7aea163fe 100644 --- a/bitsandbytes/__main__.py +++ b/bitsandbytes/__main__.py @@ -103,7 +103,9 @@ def print_debug_info() -> None: print_header("OTHER") print(f"COMPILED_WITH_CUDA = {COMPILED_WITH_CUDA}") -print(f"COMPUTE_CAPABILITIES_PER_GPU = {get_compute_capabilities()}") +if COMPILED_WITH_CUDA: + cuda = get_cuda_lib_handle() + print(f"COMPUTE_CAPABILITIES_PER_GPU = {get_compute_capabilities(cuda)}") print_header("") print_header("DEBUG INFO END") print_header("") diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py index 19f224391..5914225cc 100644 --- a/bitsandbytes/autograd/_functions.py +++ b/bitsandbytes/autograd/_functions.py @@ -7,7 +7,8 @@ import torch -import bitsandbytes.functional as F +from .. import functional as F +from ..utils import is_cuda_device # math.prod not compatible with python < 3.8 @@ -224,6 +225,8 @@ def backward(ctx, grad_output): def supports_igemmlt(device: torch.device) -> bool: """check if this device supports the optimized int8 kernel""" + if not is_cuda_device(device): + return False if torch.cuda.get_device_capability(device=device) < (7, 5): return False device_name = torch.cuda.get_device_name(device=device) diff --git a/bitsandbytes/cextension.py b/bitsandbytes/cextension.py index d52a6d607..ebccf2e0d 100644 --- a/bitsandbytes/cextension.py +++ b/bitsandbytes/cextension.py @@ -1,41 +1,20 @@ import ctypes as ct -import os import torch from pathlib import Path from warnings import warn -from bitsandbytes.cuda_setup.main import CUDASetup - - -setup = CUDASetup.get_instance() -if setup.initialized != True: - setup.run_cuda_setup() - -lib = setup.lib -try: - if lib is None and torch.cuda.is_available(): - CUDASetup.get_instance().generate_instructions() - CUDASetup.get_instance().print_log_stack() - raise RuntimeError(''' - CUDA Setup failed despite GPU being available. 
Please run the following command to get more information: - - python -m bitsandbytes - - Inspect the output of the command and see if you can locate CUDA libraries. You might need to add them - to your LD_LIBRARY_PATH. If you suspect a bug, please take the information from python -m bitsandbytes - and open an issue at: https://github.com/TimDettmers/bitsandbytes/issues''') - lib.cadam32bit_grad_fp32 # runs on an error if the library could not be found -> COMPILED_WITH_CUDA=False - lib.get_context.restype = ct.c_void_p - lib.get_cusparse.restype = ct.c_void_p - lib.cget_managed_ptr.restype = ct.c_void_p - COMPILED_WITH_CUDA = True -except AttributeError as ex: - warn("The installed version of bitsandbytes was compiled without GPU support. " - "8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.") +if torch.backends.mps.is_built(): + package_dir = Path(__file__).parent + binary_path = package_dir / "libbitsandbytes_mps.dylib" + lib = ct.cdll.LoadLibrary(binary_path) COMPILED_WITH_CUDA = False - print(str(ex)) +elif torch.cuda.is_available(): + from bitsandbytes.cuda_setup.main import CUDASetup + setup = CUDASetup.get_instance() + if setup.initialized != True: + setup.run_cuda_setup() # print the setup details after checking for errors so we do not print twice #if 'BITSANDBYTES_NOWELCOME' not in os.environ or str(os.environ['BITSANDBYTES_NOWELCOME']) == '0': diff --git a/bitsandbytes/cuda_setup/main.py b/bitsandbytes/cuda_setup/main.py index 34c035425..1e4e8c611 100644 --- a/bitsandbytes/cuda_setup/main.py +++ b/bitsandbytes/cuda_setup/main.py @@ -99,6 +99,7 @@ def initialize(self): self.lib = None self.initialized = False self.error = False + self.cuda_setup_log = [] def manual_override(self): if torch.cuda.is_available(): @@ -116,7 +117,6 @@ def manual_override(self): def run_cuda_setup(self): self.initialized = True - self.cuda_setup_log = [] binary_name, cudart_path, cc, cuda_version_string = evaluate_cuda_setup() self.cudart_path = cudart_path diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index e17e70c4b..ee1d70f7e 100644 --- a/bitsandbytes/functional.py +++ b/bitsandbytes/functional.py @@ -136,12 +136,14 @@ def get_instance(cls): return cls._instance def get_context(self, device): - if device.index not in self.context: - prev_device = torch.cuda.current_device() - torch.cuda.set_device(device) - self.context[device.index] = ct.c_void_p(lib.get_context()) - torch.cuda.set_device(prev_device) - return self.context[device.index] + if device.type == "cuda": + if device.index not in self.context: + prev_device = torch.cuda.current_device() + torch.cuda.set_device(device) + self.context[device.index] = ct.c_void_p(lib.get_context()) + torch.cuda.set_device(prev_device) + return self.context[device.index] + return None class Cusparse_Context: @@ -413,12 +415,16 @@ def get_ptr(A: Tensor) -> ct.c_void_p: def pre_call(device): + if device.type != "cuda": + return prev_device = torch.cuda.current_device() torch.cuda.set_device(device) return prev_device def post_call(prev_device): + if not prev_device or prev_device.type != "cuda": + return torch.cuda.set_device(prev_device) @@ -812,7 +818,7 @@ def dequantize_blockwise( if out is None: out = torch.empty(A.shape, dtype=quant_state.dtype, device=A.device) - if A.device.type != 'cpu': + if A.device.type == 'cuda': device = pre_call(A.device) code = quant_state.code.to(A.device) if quant_state.blocksize not in [2048, 4096, 1024, 512, 256, 128, 64]: @@ -826,7 +832,9 @@ def dequantize_blockwise( 
lib.cdequantize_blockwise_bf16(get_ptr(quant_state.code), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(quant_state.blocksize), ct.c_int(A.numel())) else: raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}") - post_call(A.device) + post_call(device) + elif A.device.type == 'mps': + raise NotImplementedError("MPS is not implemented") else: code = quant_state.code.cpu() lib.cdequantize_blockwise_cpu_fp32(get_ptr(code), get_ptr(A), get_ptr(quant_state.absmax), get_ptr(out), ct.c_longlong(quant_state.blocksize), ct.c_longlong(A.numel())) @@ -1495,7 +1503,7 @@ def histogram_scatter_add_2d( lib.chistogram_scatter_add_2d(get_ptr(histogram), get_ptr(index1), get_ptr(index2), get_ptr(source), maxdim1, n) def check_matmul(A, B, out, transposed_A, transposed_B, expected_type=torch.int8): - if not torch.cuda.is_initialized(): torch.cuda.init() + if A.device.type == "cuda" and not torch.cuda.is_initialized(): torch.cuda.init() if A.dtype != expected_type or B.dtype != expected_type: raise TypeError( f"Expected torch.int8 input tensors A and B, but got {A.dtype} and {B.dtype}" @@ -2101,7 +2109,7 @@ def double_quant( ): device = A.device assert A.dtype == torch.half - assert device.type == "cuda" + #assert device.type == "cuda" prev_device = pre_call(A.device) cols = A.shape[-1] diff --git a/bitsandbytes/utils.py b/bitsandbytes/utils.py index df2885306..9202d5c95 100644 --- a/bitsandbytes/utils.py +++ b/bitsandbytes/utils.py @@ -120,6 +120,19 @@ def execute_and_return_decoded_std_streams(command_string): std_out, std_err = execute_and_return_decoded_std_streams(command_string) return std_out, std_err +__cuda_devices = None +def get_cuda_devices(): + global __cuda_devices + if __cuda_devices is None: + cuda_devices = [] + if torch.cuda.is_available(): + devices = [d for d in range(torch.cuda.device_count())] + cuda_devices = [torch.cuda.get_device_name(d) for d in devices] + __cuda_devices = cuda_devices + return __cuda_devices + +def is_cuda_device(device): + return device in get_cuda_devices() def replace_linear(model, linear_replacement, skip_modules=["lm_head"], copy_weights=False, post_processing_function=None): diff --git a/csrc/common.cpp b/csrc/common.cpp index 52f029917..0a9601689 100644 --- a/csrc/common.cpp +++ b/csrc/common.cpp @@ -1,39 +1,35 @@ #include #include -void *quantize_block(void *arguments) { +void quantize_block(const quantize_block_args& args) { // 1. find absmax in block // 2. divide input value by absmax to normalize into [-1.0, 1.0] // 3. do binary search to find the closest value // 4. check minimal distance // 5. store index - struct quantize_block_args *args = (quantize_block_args *) arguments; - // 1. find absmax in block float absmax_block = -FLT_MAX; - for (long long i = args->block_idx; i < args->block_end; i++) - absmax_block = fmax(absmax_block, fabs(args->A[i])); + for (long long i = args.block_idx; i < args.block_end; i++) + absmax_block = fmax(absmax_block, fabs(args.A[i])); - args->absmax[args->block_idx / args->blocksize] = absmax_block; + args.absmax[args.block_idx / args.blocksize] = absmax_block; - for (long long i = args->block_idx; i < args->block_end; i++) { + for (long long i = args.block_idx; i < args.block_end; i++) { // 2. divide input value by absmax to normalize into [-1.0, 1.0] // 3. 
do binary search to find the closest value - float normed_value = args->A[i] / absmax_block; - long long idx = args->bin_searcher->scalar(normed_value); + float normed_value = args.A[i] / absmax_block; + long long idx = args.bin_searcher->scalar(normed_value); // 4. check minimal distance // The binary search returns always the value to the left, which might not be the closest value if (idx < 255) { - float dist_left = fabs(normed_value - (args->code[idx])); - float dist_right = fabs(normed_value - (args->code[idx + 1])); + float dist_left = fabs(normed_value - (args.code[idx])); + float dist_right = fabs(normed_value - (args.code[idx + 1])); if (dist_right < dist_left) { idx += 1; } } // 5. store index - args->out[i] = (unsigned char) idx; + args.out[i] = (unsigned char) idx; } - - return NULL; } diff --git a/csrc/common.h b/csrc/common.h index c99034e78..e513f2875 100644 --- a/csrc/common.h +++ b/csrc/common.h @@ -20,6 +20,6 @@ struct quantize_block_args { }; -void *quantize_block(void *arguments); +void quantize_block(const quantize_block_args& args); #endif diff --git a/csrc/cpu_ops.cpp b/csrc/cpu_ops.cpp index e28e7b2c2..478c1f4ff 100644 --- a/csrc/cpu_ops.cpp +++ b/csrc/cpu_ops.cpp @@ -1,6 +1,6 @@ #include -#include #include +#include using namespace BinSearch; @@ -31,12 +31,8 @@ void quantize_cpu(float *code, float *A, float *absmax, unsigned char *out, long for(long long offset = 0; offset < num_blocks; offset+=thread_wave_size) { long long valid_chunks = num_blocks - offset >= thread_wave_size ? thread_wave_size : num_blocks - offset; - pthread_t *threads = (pthread_t *) malloc(sizeof(pthread_t) * valid_chunks); - - struct quantize_block_args **args = (quantize_block_args **) malloc(valid_chunks * sizeof(quantize_block_args *)); - - for(long long i = 0; i < valid_chunks; i++) - args[i] = (quantize_block_args *) malloc(sizeof(quantize_block_args)); + std::vector threads(valid_chunks); + std::vector args(valid_chunks); int chunks_processed = 0; for(long long block_idx = offset*blocksize; block_idx < n; block_idx += blocksize) @@ -44,30 +40,24 @@ void quantize_cpu(float *code, float *A, float *absmax, unsigned char *out, long long long valid_items = n - block_idx >= blocksize ? 
blocksize : n - block_idx; long long block_end = block_idx + valid_items; - struct quantize_block_args *arg = args[chunks_processed]; - arg->bin_searcher = &bin_searcher; - arg->code = code; - arg->A = A; - arg->absmax = absmax; - arg->out = out; - arg->block_end = block_end; - arg->block_idx = block_idx; - arg->threadidx = block_idx / blocksize; - arg->blocksize = blocksize; - - pthread_create(&threads[chunks_processed], NULL, &quantize_block, (void *) arg); + struct quantize_block_args& arg = args[chunks_processed]; + arg.bin_searcher = &bin_searcher; + arg.code = code; + arg.A = A; + arg.absmax = absmax; + arg.out = out; + arg.block_end = block_end; + arg.block_idx = block_idx; + arg.threadidx = block_idx / blocksize; + arg.blocksize = blocksize; + + threads[chunks_processed] = std::thread([arg] { quantize_block(arg); }); chunks_processed += 1; if(chunks_processed == valid_chunks){ break; } } for (int i = 0; i < valid_chunks; i++) - int err = pthread_join(threads[i], NULL); - - free(threads); - for (int i = 0; i < valid_chunks; i++) - free(args[i]); - free(args); - + threads[i].join(); } } diff --git a/csrc/kernels.cu b/csrc/kernels.cu index 1ab8aa242..c2e2d7da7 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -3816,12 +3816,12 @@ template __global__ void kgemm_4bit_inference_naive(int M, int N template __global__ void kExtractOutliers(char *A, int *idx, char *out, int idx_size, int rowsA, int colsA, int tiledRowsA, int tiledColsA); template __global__ void kExtractOutliers(char *A, int *idx, char *out, int idx_size, int rowsA, int colsA, int tiledRowsA, int tiledColsA); -template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB); -template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB); -template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB); -template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB); -template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB); -template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB); +template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB); +template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB); +template __global__ void 
kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB); +template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB); +template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB); +template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB); template __global__ void kTransformRowToFormat<256, 8, 32, 32*8, 0, COL32>(char *__restrict__ const A, char *out, int rows, int cols, int tiledCols, int outRows, int outCols); template __global__ void kTransformRowToFormat<256, 8, 32, 32*8, 1, COL32>(char *__restrict__ const A, char *out, int rows, int cols, int tiledCols, int outRows, int outCols); diff --git a/csrc/mps_kernels.metal b/csrc/mps_kernels.metal new file mode 100644 index 000000000..a5c8e35b2 --- /dev/null +++ b/csrc/mps_kernels.metal @@ -0,0 +1,117 @@ +#include +using namespace metal; + +#define HLF_MAX 65504 +#define TH 1024 +#define NUM 4 +#define NUM_BLOCK 4096 + +template +static unsigned char quantize_scalar( + float rand, + device float* code, + float x) +{ + int pivot = 127; + int upper_pivot = 255; + int lower_pivot = 0; + + float lower = -1.0f; + float upper = 1.0f; + + float val = code[pivot]; + // i>>=1 = {32, 16, 8, 4, 2, 1} + for(int i = 64; i > 0; i>>=1) + { + if(x > val) + { + lower_pivot = pivot; + lower = val; + pivot+=i; + } + else + { + upper_pivot = pivot; + upper = val; + pivot-=i; + } + val = code[pivot]; + } + + if(upper_pivot == 255) + upper = code[upper_pivot]; + if(lower_pivot == 0) + lower = code[lower_pivot]; + + if(!STOCHASTIC) + { + if(x > val) + { + float midpoint = (upper+val)*0.5f; + if(x > midpoint) + { + return upper_pivot; + } + else + return pivot; + } + else + { + float midpoint = (lower+val)*0.5f; + if(x < midpoint) + return lower_pivot; + else + return pivot; + } + } + else + { + if(x > val) + { + float dist_to_upper = fabs(upper-x); + float dist_full = upper-val; + if(rand >= dist_to_upper/dist_full) return upper_pivot; + else return pivot; + } + else + { + float dist_to_lower = fabs(lower-x); + float dist_full = val-lower; + if(rand >= dist_to_lower/dist_full) return lower_pivot; + else return pivot; + } + } +} + +kernel void quantize(device float* code [[buffer(0)]], + device float* A [[buffer(1)]], + device uchar* out [[buffer(2)]], + constant uint& n [[buffer(3)]], + uint id [[thread_position_in_grid]]) { + const uint n_full = (NUM_BLOCK * (n / NUM_BLOCK)) + (n % NUM_BLOCK == 0 ? 0 : NUM_BLOCK); + uint valid_items = (id / NUM_BLOCK + 1 == (n + NUM_BLOCK - 1) / NUM_BLOCK) ? n - (id / NUM_BLOCK * NUM_BLOCK) : NUM_BLOCK; + const uint base_idx = (id / NUM_BLOCK * NUM_BLOCK); + + float vals[NUM]; + uchar qvals[NUM]; + + for (uint i = base_idx; i < n_full; i += ((n + NUM_BLOCK - 1) / NUM_BLOCK) * NUM_BLOCK) { + valid_items = n - i > NUM_BLOCK ? 
NUM_BLOCK : n - i; + + threadgroup_barrier(mem_flags::mem_threadgroup); + + for (uint j = 0; j < valid_items; j++) { + vals[j] = A[i + j]; + } + + for (uint j = 0; j < valid_items; j++) { + qvals[j] = quantize_scalar(0.0f, code, vals[j]); + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + for (uint j = 0; j < valid_items; j++) { + out[i + j] = qvals[j]; + } + } +} diff --git a/csrc/mps_ops.h b/csrc/mps_ops.h new file mode 100644 index 000000000..e69de29bb diff --git a/csrc/mps_ops.mm b/csrc/mps_ops.mm new file mode 100644 index 000000000..cb549e83d --- /dev/null +++ b/csrc/mps_ops.mm @@ -0,0 +1,67 @@ +#import + +#define HLF_MAX 65504 +#define TH 1024 +#define NUM 4 +#define NUM_BLOCK 4096 + +static inline MPSGraph* get_graph() +{ + static MPSGraph* cur = nil; + if(!cur) { + cur = [[MPSGraph alloc] init]; + } + return cur; +} + +static inline id get_device() +{ + NSError *error = nil; + static id device = nil; + if(!device) { + device = MTLCreateSystemDefaultDevice(); + } + if(!device) { + NSLog(@"Failed to get MPS device"); + abort(); + } + return device; +} + +static inline id get_library() +{ + NSError *error = nil; + static id library = nil; + if(!library) { + library = [get_device() newLibraryWithURL:[NSURL fileURLWithPath:@"bitsandbytes.metallib"] error:&error]; + } + if(!library) { + NSLog(@"Failed to load bitsandbytes.metallib"); + abort(); + } + return library; +} + +/*MPSGraphTensor* dequantize_mps(MPSGraphTensor* code, MPSGraphTensor* A, int n) +{ + id out = [get_graph() dequantizeTensor:(MPSGraphTensor*)A scaleTensor:(MPSGraphTensor*)code zeroPoint:0.0 dataType:MPSDataTypeInt8 axis:0 name:@"out"]; + return out; +}*/ + + +// MPSGraph function for quantize +extern "C" MPSGraphTensor* quantize_mps(MPSGraph* graph, MPSGraphTensor* code, MPSGraphTensor* A, int n) +{ + id device = get_device(); + id library = get_library(); + static id kernel = nil; + if(!kernel) { + kernel = [library newFunctionWithName:@"quantize"]; + if(!kernel) { + NSLog(@"Failed to load bitsandbytes.metallib"); + abort(); + } + } + NSLog(@"Not implemented"); + return nil; +} diff --git a/csrc/ops.cuh b/csrc/ops.cuh index f37b3b3af..cc7b59505 100644 --- a/csrc/ops.cuh +++ b/csrc/ops.cuh @@ -9,7 +9,9 @@ #include #include +#ifndef _MSC_VER #include +#endif #include #include diff --git a/csrc/pythonInterface.c b/csrc/pythonInterface.cpp similarity index 99% rename from csrc/pythonInterface.c rename to csrc/pythonInterface.cpp index 865e4b6d5..49498c19b 100644 --- a/csrc/pythonInterface.c +++ b/csrc/pythonInterface.cpp @@ -6,6 +6,9 @@ #if BUILD_CUDA #include #endif +#if BUILD_MPS +#include +#endif #include // We cannot call templated code from C, so we wrap the template in a C compatible call here if necessary. 
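For context, an illustrative sketch that is not part of this change: the extern "C" wrappers in pythonInterface.cpp exist because the Python layer loads the shared library with ctypes, which can only resolve plain C symbols. The library path and argtypes below are assumptions; only the cquantize_blockwise_cpu_fp32 wrapper name and parameter order come from this file.

    import ctypes as ct
    from pathlib import Path

    # Illustrative only: load the CPU build of the shared library.
    lib = ct.cdll.LoadLibrary(str(Path("bitsandbytes") / "libbitsandbytes_cpu.so"))

    # C-ABI wrapper defined in pythonInterface.cpp; the templated C++ code
    # behind it is not directly callable through ctypes.
    lib.cquantize_blockwise_cpu_fp32.argtypes = [
        ct.c_void_p,    # code
        ct.c_void_p,    # A
        ct.c_void_p,    # absmax
        ct.c_void_p,    # out
        ct.c_longlong,  # blocksize
        ct.c_longlong,  # n
    ]
    lib.cquantize_blockwise_cpu_fp32.restype = None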
@@ -412,6 +415,7 @@ extern "C" { gemm_4bit_inference_naive_fp32(m, n, k, A, B, absmax, datatype, out, lda, ldb, ldc, blocksize); } #endif + void cquantize_blockwise_cpu_fp32(float *code, float *A, float *absmax, unsigned char *out, long long blocksize, long long n){ quantize_cpu(code, A, absmax, out, blocksize, n); } void cdequantize_blockwise_cpu_fp32(float *code, unsigned char *A, float *absmax, float *out, long long blocksize, long long n){ dequantize_cpu(code, A, absmax, out, blocksize, n); } } diff --git a/include/Algo-Direct2.h b/include/Algo-Direct2.h index d5fa58d12..7f52fce14 100644 --- a/include/Algo-Direct2.h +++ b/include/Algo-Direct2.h @@ -52,6 +52,7 @@ struct AlgoVecBase::val private: typedef AlgoScalarBase base_t; +#ifdef USE_SSE2 FORCE_INLINE //NO_INLINE void resolve(const FVec& vz, const IVec& bidx, uint32 *pr) const @@ -135,6 +136,7 @@ struct AlgoVecBase::val pr[0] = u.ui32[0]; pr[1] = u.ui32[2]; } +#endif // USE_SSE2 #ifdef USE_AVX diff --git a/include/Portable.h b/include/Portable.h index 1710b0502..78599944e 100644 --- a/include/Portable.h +++ b/include/Portable.h @@ -4,10 +4,40 @@ #include #include +#if defined(__aarch64__) +#ifdef __CUDACC__ +#undef USE_NEON // Doesn't work with nvcc, undefined symbols +#else +#include +#undef USE_NEON // Not yet implemented +#endif +#undef USE_AVX // x86_64 only +#undef USE_AVX2 // x86_64 only +#undef USE_SSE2 // x86_64 only +#undef USE_SSE41 // x86_64 only +#undef USE_SSE42 // x86_64 only +#undef USE_FMA // x86_64 only +#ifdef USE_NEON +typedef float32x4_t __m128; +typedef int32x4_t __m128i; +typedef float64x2_t __m128d; +#else +typedef struct {float a; float b; float c; float d;} __m128; +typedef struct {int a; int b; int c; int d;} __m128i; +typedef struct {double a; double b;} __m128d; +#endif +#else +#undef USE_NEON // ARM64 only #ifdef __FMA__ #define USE_FMA #endif +#if !defined(__SSE2__) && !defined(_MSC_VER) +#error Compiler must support SSE2 +#endif +#define USE_SSE2 +#if defined(__aarch64__) +#else #ifdef __AVX2__ #define USE_AVX2 #endif @@ -24,7 +54,8 @@ #ifdef __SSE4_2__ #define USE_SSE42 #endif - +#endif +#endif #ifndef _MSC_VER #include @@ -50,7 +81,7 @@ typedef unsigned __int64 uint64; namespace Details { -#define myassert(cond, msg) if (!cond){ std::ostringstream os; os << "\nassertion failed: " << #cond << ", " << msg << "\n"; throw std::invalid_argument(os.str()); } +#define myassert(cond, msg) if (!(cond)){ std::ostringstream os; os << "\nassertion failed: " << #cond << ", " << msg << "\n"; throw std::invalid_argument(os.str()); } // log2 is not defined in VS2008 #if defined(_MSC_VER) diff --git a/include/SIMD.h b/include/SIMD.h index a2ac1a9ae..18a38dbfd 100644 --- a/include/SIMD.h +++ b/include/SIMD.h @@ -2,6 +2,46 @@ #include "Portable.h" +#ifdef USE_SSE2 +#include +#if defined(USE_AVX) || defined(USE_AVX2) +#include +#else +#ifdef USE_SSE41 +#include +#endif +#endif +#endif + +namespace BinSearch { +namespace Details { + +template +struct FTOITraits{}; + +template +struct FVec; + +template +struct IVec; + +template +struct FVec1; + +template <> struct InstrFloatTraits +{ + typedef __m128 vec_t; +}; + +template <> struct InstrFloatTraits +{ + typedef __m128d vec_t; +}; + +} +} + +#if !defined(__aarch64__) #ifdef USE_SSE42 #ifndef _MSC_VER #include @@ -26,29 +66,11 @@ FORCE_INLINE int popcnt32(int x32) } // namespace #endif -#if defined(USE_AVX) || defined(USE_AVX2) -#include -#else -#include -#ifdef USE_SSE41 -#include -#endif -#endif - #include "Type.h" namespace BinSearch { namespace Details { -template 
-struct FVec; - -template -struct IVec; - -template -struct FVec1; - template <> struct InstrIntTraits { typedef __m128i vec_t; @@ -64,8 +86,8 @@ template <> struct InstrFloatTraits typedef __m128d vec_t; }; -template -struct FTOITraits +template <> +struct FTOITraits { typedef IVec vec_t; }; @@ -285,9 +307,11 @@ FORCE_INLINE FVec operator- (const FVec& a, const FVec< FORCE_INLINE FVec operator* (const FVec& a, const FVec& b) { return _mm_mul_ps( a, b ); } FORCE_INLINE FVec operator/ (const FVec& a, const FVec& b) { return _mm_div_ps( a, b ); } FORCE_INLINE IVec ftoi (const FVec& a) { return _mm_cvttps_epi32(a); } +#ifndef __clang__ // Conflicts with builtin operator FORCE_INLINE IVec operator<= (const FVec& a, const FVec& b) { return _mm_castps_si128( _mm_cmple_ps( a, b ) ); } FORCE_INLINE IVec operator>= (const FVec& a, const FVec& b) { return _mm_castps_si128( _mm_cmpge_ps( a, b ) ); } FORCE_INLINE IVec operator< (const FVec& a, const FVec& b) { return _mm_castps_si128(_mm_cmplt_ps(a, b)); } +#endif #ifdef USE_FMA FORCE_INLINE FVec mulSub(const FVec& a, const FVec& b, const FVec& c) { return _mm_fmsub_ps(a, b, c); } #endif @@ -339,9 +363,11 @@ FORCE_INLINE FVec operator- (const FVec& a, const FVec FORCE_INLINE FVec operator* (const FVec& a, const FVec& b) { return _mm_mul_pd( a, b ); } FORCE_INLINE FVec operator/ (const FVec& a, const FVec& b) { return _mm_div_pd( a, b ); } FORCE_INLINE IVec ftoi (const FVec& a) { return _mm_cvttpd_epi32(a); } +#ifndef __clang__ // Conflicts with builtin operator FORCE_INLINE IVec operator<= (const FVec& a, const FVec& b) { return _mm_castpd_si128( _mm_cmple_pd( a, b ) ); } FORCE_INLINE IVec operator< (const FVec& a, const FVec& b) { return _mm_castpd_si128(_mm_cmplt_pd(a, b)); } FORCE_INLINE IVec operator>= (const FVec& a, const FVec& b) { return _mm_castpd_si128( _mm_cmpge_pd( a, b ) ); } +#endif #ifdef USE_FMA FORCE_INLINE FVec mulSub(const FVec& a, const FVec& b, const FVec& c ) { return _mm_fmsub_pd(a, b, c); } #endif @@ -560,3 +586,4 @@ FORCE_INLINE FVec mulSub(const FVec& a, const FVec 0: dim2 = dim2 - (dim2 % 16) dim3 = dim3 - (dim3 % 16) @@ -289,6 +291,7 @@ def test_matmul(dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose): names = ["dim1_{}_dim2_{}_dim3_{}_dim4_{}_func_{}_dtype_{}_requires_grad_{}_transpose_{}_decomp_{}_has_fp16_weights_{}_has_bias_{}".format(*vals) for vals in str_values] +@skip_if_no_cuda() @pytest.mark.parametrize( "dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, decomp, has_fp16_weights, has_bias", values, @@ -307,7 +310,6 @@ def test_matmullt( has_fp16_weights, has_bias ): - if not torch.cuda.is_available(): pytest.skip('No GPU found.') dimA = (dim2, dim3) if not transpose[0] else (dim3, dim2) dimB = (dim3, dim4) if not transpose[1] else (dim4, dim3) outlier_dim = torch.randint(0, dimA[1], size=(dimA[1] // 8,), device="cuda") diff --git a/tests/test_cuda_setup_evaluator.py b/tests/test_cuda_setup_evaluator.py index e875bcd2b..138c5e88a 100644 --- a/tests/test_cuda_setup_evaluator.py +++ b/tests/test_cuda_setup_evaluator.py @@ -17,11 +17,3 @@ def test_manual_override(): import bitsandbytes as bnb loaded_lib = bnb.cuda_setup.main.CUDASetup.get_instance().binary_name assert loaded_lib == 'libbitsandbytes_cuda122.so' - - - - - - - - diff --git a/tests/test_functional.py b/tests/test_functional.py index f825c14df..7fa0bf64c 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -12,6 +12,8 @@ from bitsandbytes import functional as F from scipy.stats import norm +from testutil import 
skip_if_no_cuda + torch.set_printoptions( precision=5, sci_mode=False, linewidth=120, edgeitems=20, threshold=10000 ) @@ -91,6 +93,7 @@ def teardown(): pass +@skip_if_no_cuda() @pytest.mark.parametrize( "dtype", [torch.float32, torch.float16], ids=["float", "half"] ) @@ -111,6 +114,7 @@ def test_estimate_quantiles(dtype): assert (diff > 5e-02).sum().item() == 0 +@skip_if_no_cuda() def test_quantile_quantization(): for i in range(100): A1 = torch.randn(1024, 1024, device="cuda") @@ -130,11 +134,12 @@ def test_quantile_quantization(): -def test_dynamic_quantization(): +@skip_if_no_cuda() +def test_dynamic_quantization(device): diffs = [] reldiffs = [] for i in range(100): - A1 = torch.randn(1024, 1024, device="cuda") + A1 = torch.randn(1024, 1024, device=device) C, S = F.quantize(A1) A2 = F.dequantize(C, S) diff = torch.abs(A1 - A2) @@ -146,7 +151,7 @@ def test_dynamic_quantization(): print(sum(reldiffs)/len(reldiffs)) for i in range(100): - A1 = torch.rand(1024, 1024, device="cuda") + A1 = torch.rand(1024, 1024, device=device) C, S = F.quantize(A1) A2 = F.dequantize(C, S) diff = torch.abs(A1 - A2).mean().item() @@ -154,7 +159,7 @@ def test_dynamic_quantization(): assert diff < 0.004 - +@skip_if_no_cuda() @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16], ids=["fp32", "fp16", "bf16"]) @pytest.mark.parametrize("nested", [False, True], ids=["False", "True"]) @pytest.mark.parametrize("blocksize", [4096, 2048, 1024, 512, 256, 128, 64]) @@ -204,6 +209,7 @@ def test_dynamic_blockwise_quantization(dtype, nested, blocksize, signed): +@skip_if_no_cuda() @pytest.mark.parametrize( "gtype", [torch.float32, torch.float16], ids=["float", "half"] ) @@ -306,6 +312,7 @@ def mean(xx): ] +@skip_if_no_cuda() @pytest.mark.parametrize( "dim1, dim2, quant_methods, batched", values, ids=names ) @@ -364,10 +371,11 @@ def test_stable_embedding(): ] +@skip_if_no_cuda() @pytest.mark.parametrize( "hidden_dim, batch_dim, transpose, seq_dim", values, ids=names ) -def test_igemm(hidden_dim, batch_dim, transpose, seq_dim): +def test_igemm(device, hidden_dim, batch_dim, transpose, seq_dim): hidden_dim = hidden_dim - (hidden_dim % 32) batch_dim = batch_dim - (batch_dim % 16) seq_dim = seq_dim - (seq_dim % 16) @@ -382,8 +390,8 @@ def test_igemm(hidden_dim, batch_dim, transpose, seq_dim): if transpose[1] else (hidden_dim, 32 * random.randint(1, 4)) ) - A = torch.randint(-128, 127, size=shapeA, device="cuda").to(torch.int8) - B = torch.randint(-128, 127, size=shapeB, device="cuda").to(torch.int8) + A = torch.randint(-128, 127, size=shapeA, device=device).to(torch.int8) + B = torch.randint(-128, 127, size=shapeB, device=device).to(torch.int8) if not transpose[0] and not transpose[1]: out2 = torch.matmul(A.float(), B.float()) out = F.igemm(A, B) @@ -406,8 +414,8 @@ def test_igemm(hidden_dim, batch_dim, transpose, seq_dim): if transpose[1] else (hidden_dim, 32 * random.randint(1, 4)) ) - A = torch.randint(-128, 127, size=shapeA, device="cuda").to(torch.int8) - B = torch.randint(-128, 127, size=shapeB, device="cuda").to(torch.int8) + A = torch.randint(-128, 127, size=shapeA, device=device).to(torch.int8) + B = torch.randint(-128, 127, size=shapeB, device=device).to(torch.int8) if not transpose[0] and not transpose[1]: out2 = torch.matmul(A.float(), B.float()) out = F.igemm(A, B) @@ -428,6 +436,7 @@ def test_igemm(hidden_dim, batch_dim, transpose, seq_dim): ] +@skip_if_no_cuda() @pytest.mark.parametrize("seq_dim, hidden_dim, batch_dim", values, ids=names) def test_dim3_igemm(seq_dim, hidden_dim, 
batch_dim): seq_dim = seq_dim - (seq_dim % 32) @@ -461,6 +470,7 @@ def test_dim3_igemm(seq_dim, hidden_dim, batch_dim): ] +@skip_if_no_cuda() @pytest.mark.parametrize( "seq_dim, hidden_dim, batch_dim, transpose", values, ids=names ) @@ -546,6 +556,7 @@ def min_max(x): ] +@skip_if_no_cuda() @pytest.mark.parametrize("dim1, dim2, dim3, dim4, transpose", values, ids=names) def test_ibmm(dim1, dim2, dim3, dim4, transpose): dim2 = dim2 - (dim2 % 16) @@ -582,6 +593,7 @@ def test_ibmm(dim1, dim2, dim3, dim4, transpose): names = ["dim1_{}_dim2_{}_dim3_{}".format(*vals) for vals in values] +@skip_if_no_cuda() @pytest.mark.parametrize("dim1, dim2, dim3", values, ids=names) def test_vector_quant(dim1, dim2, dim3): dim2 = dim2 - (dim2 % 16) @@ -611,6 +623,7 @@ def test_vector_quant(dim1, dim2, dim3): names = ["dim1_{}_dim2_{}_dim3_{}_dims_{}_dtype_{}_orderA_{}_orderOut_{}_transpose_{}".format(*vals)for vals in values] +@skip_if_no_cuda() @pytest.mark.parametrize("dim1, dim2, dim3, dims, dtype, orderA, orderOut, transpose",values,ids=names) def test_nvidia_transform(dim1, dim2, dim3, dims, dtype, orderA, orderOut, transpose): if dims == 3 and out_order != "col32": @@ -695,6 +708,7 @@ def test_nvidia_transform(dim1, dim2, dim3, dims, dtype, orderA, orderOut, trans ] +@skip_if_no_cuda() @pytest.mark.parametrize("dim1, dim2, dim3, dim4, dims, ldb", values, ids=names) def test_igemmlt_int(dim1, dim2, dim3, dim4, dims, ldb): for i in range(k): @@ -743,6 +757,7 @@ def test_igemmlt_int(dim1, dim2, dim3, dim4, dims, ldb): ] +@skip_if_no_cuda() @pytest.mark.parametrize("dim1, dim2, dim3, dim4, dims", values, ids=names) def test_igemmlt_half(dim1, dim2, dim3, dim4, dims): formatB = F.get_special_format_str() @@ -800,6 +815,7 @@ def test_igemmlt_half(dim1, dim2, dim3, dim4, dims): ] +@skip_if_no_cuda() @pytest.mark.parametrize("batch, seq, model, hidden", values, ids=names) def test_bench_8bit_training(batch, seq, model, hidden): formatB = F.get_special_format_str() @@ -967,6 +983,7 @@ def test_bench_8bit_training(batch, seq, model, hidden): names = ["dim1_{}_dim4_{}_dims_{}_formatB_{}_has_bias_{}".format(*vals) for vals in values] +@skip_if_no_cuda() @pytest.mark.parametrize("dim1, dim4, dims, formatB, has_bias", values, ids=names) def test_dequant_mm(dim1, dim4, dims, formatB, has_bias): inner = torch.randint(1, 128, size=(1,)).item() @@ -1017,6 +1034,7 @@ def test_dequant_mm(dim1, dim4, dims, formatB, has_bias): names = ["dim1_{}_dim2_{}_dims_{}".format(*vals) for vals in values] +@skip_if_no_cuda() @pytest.mark.parametrize("dim1, dim2, dims", values, ids=names) def test_colrow_absmax(dim1, dim2, dims): for i in range(k): @@ -1073,6 +1091,7 @@ def test_colrow_absmax(dim1, dim2, dims): names = ["dim1_{}_dim2_{}".format(*vals) for vals in values] +@skip_if_no_cuda() @pytest.mark.parametrize("dim1, dim2", values, ids=names) def test_double_quant(dim1, dim2): for i in range(k): @@ -1120,6 +1139,7 @@ def test_double_quant(dim1, dim2): names = ["dim1_{}_dim4_{}_inner_{}".format(*vals) for vals in values] +@skip_if_no_cuda() @pytest.mark.parametrize("dim1, dim4, inner", values, ids=names) def test_integrated_igemmlt(dim1, dim4, inner): for i in range(k): @@ -1164,6 +1184,7 @@ def test_integrated_igemmlt(dim1, dim4, inner): names = ["dim1_{}_dim4_{}_inner_{}".format(*vals) for vals in values] +@skip_if_no_cuda() @pytest.mark.parametrize("dim1, dim4, inner", values, ids=names) @pytest.mark.skip("Row scale has some bugs for ampere") def test_igemmlt_row_scale(dim1, dim4, inner): @@ -1239,6 +1260,7 @@ def 
test_igemmlt_row_scale(dim1, dim4, inner): names = ["dim1_{}_dim4_{}_inner_{}".format(*vals) for vals in values] +@skip_if_no_cuda() @pytest.mark.parametrize("dim1, dim4, inner", values, ids=names) @pytest.mark.skip("Row scale has some bugs for ampere") def test_row_scale_bench(dim1, dim4, inner): @@ -1309,6 +1331,7 @@ def test_row_scale_bench(dim1, dim4, inner): ] +@skip_if_no_cuda() @pytest.mark.parametrize( "dim1, dim2, dim3, dims, dtype, orderA, orderOut, transpose", values, @@ -1358,6 +1381,7 @@ def test_transform(dim1, dim2, dim3, dims, dtype, orderA, orderOut, transpose): ] +@skip_if_no_cuda() def test_overflow(): formatB = F.get_special_format_str() print(formatB) @@ -1382,6 +1406,7 @@ def test_overflow(): names = ["dim1_{}_dim2_{}".format(*vals) for vals in values] +@skip_if_no_cuda() @pytest.mark.parametrize("dim1, dim2", values, ids=names) def test_coo_double_quant(dim1, dim2): threshold = 3.00 @@ -1419,6 +1444,7 @@ def test_coo_double_quant(dim1, dim2): names = ["dim1_{}_dim2_{}_transposed_B_{}".format(*vals) for vals in values] +@skip_if_no_cuda() @pytest.mark.parametrize("dim1, dim2, transposed_B", values, ids=names) def test_spmm_coo(dim1, dim2, transposed_B): threshold = 1.5 @@ -1450,6 +1476,7 @@ def test_spmm_coo(dim1, dim2, transposed_B): assert_all_approx_close(out1, out2, rtol=0.01, atol=3.0e-2, count=30) +@skip_if_no_cuda() def test_spmm_bench(): batch = 2 model = 1024 * 1 @@ -1500,6 +1527,7 @@ def test_spmm_bench(): names = ["dim1_{}_dim2_{}".format(*vals) for vals in values] +@skip_if_no_cuda() @pytest.mark.parametrize("dim1, dim2", values, ids=names) def test_integrated_sparse_decomp(dim1, dim2): threshold = 3.0 @@ -1536,6 +1564,7 @@ def test_integrated_sparse_decomp(dim1, dim2): assert err2 < err1 +@skip_if_no_cuda() def test_matmuls(): a = torch.randn(256, 512).half().cuda() b = torch.randn(256, 512).half().cuda() @@ -1566,6 +1595,7 @@ def test_matmuls(): ] +@skip_if_no_cuda() @pytest.mark.parametrize("dim1, dim2, dtype, out_func", values, ids=names) def test_spmm_coo_very_sparse(dim1, dim2, dtype, out_func): out_func = getattr(torch, out_func) @@ -1628,6 +1658,7 @@ def test_spmm_coo_very_sparse(dim1, dim2, dtype, out_func): # print(time.time() - t0) +@skip_if_no_cuda() def test_coo2csr(): threshold = 1 A = torch.randn(128, 128).half().cuda() @@ -1648,6 +1679,7 @@ def test_coo2csr(): torch.testing.assert_close(A2[idx], csrA.values) +@skip_if_no_cuda() def test_coo2csc(): threshold = 1 A = torch.randn(128, 128).half().cuda() @@ -1682,6 +1714,7 @@ def test_coo2csc(): names = ["dim1_{}_dim2_{}_dtype_{}".format(*vals) for vals in values] +@skip_if_no_cuda() @pytest.mark.parametrize("dim1, dim2, dtype", values, ids=names) def test_spmm_coo_dequant(dim1, dim2, dtype): threshold = 6.0 @@ -1798,6 +1831,8 @@ def test_spmm_coo_dequant(dim1, dim2, dtype): #values.append((batch_size, seqdim, 5140, 4*5140)) #values.append((batch_size, seqdim, 12288, 4*12288)) names = ["batch_{}_seq_{}_model_{}_hidden_{}".format(*vals) for vals in values] + +@skip_if_no_cuda() @pytest.mark.parametrize("batch, seq, model, hidden", values, ids=names) def test_bench_matmul(batch, seq, model, hidden): iters = 1000 @@ -1953,6 +1988,7 @@ def test_bench_matmul(batch, seq, model, hidden): #torch.cuda.synchronize() #print( f"bnb linear8bitlt with threshold (training): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s") +@skip_if_no_cuda() def test_zeropoint(): def quant_zp(x): dtype = x.dtype @@ -2039,6 +2075,7 @@ def quant_zp(x): print(err1, err2, err3, err4, 
err5, err6) +@skip_if_no_cuda() def test_extract_outliers(): for i in range(k): shapeA = (4096, 4096 * 4) @@ -2090,6 +2127,7 @@ def test_blockwise_cpu_large(): +@skip_if_no_cuda() def test_fp8_quant(): for e_bits in range(1, 7): p_bits = 7-e_bits @@ -2138,8 +2176,8 @@ def test_fp8_quant(): #print(3, sum(relerr)/len(relerr)) +@skip_if_no_cuda() def test_few_bit_quant(): - #print('') for bits in range(2, 9): #print('='*30, bits, '='*30) @@ -2198,6 +2236,7 @@ def test_few_bit_quant(): #assert False +@skip_if_no_cuda() def test_kbit_quantile_estimation(): for i in range(100): data = torch.randn(1024, 1024, device='cuda') @@ -2223,6 +2262,7 @@ def test_kbit_quantile_estimation(): assert err < 0.035 +@skip_if_no_cuda() def test_bench_dequantization(): a = torch.rand(1024, 1024, device='cuda').half() code =F.create_fp8_map(True, 3, 0, 4).cuda() diff --git a/tests/test_linear8bitlt.py b/tests/test_linear8bitlt.py index 37f7af9cb..f8cd719de 100644 --- a/tests/test_linear8bitlt.py +++ b/tests/test_linear8bitlt.py @@ -11,6 +11,7 @@ from bitsandbytes.autograd import get_inverse_transform_indices, undo_layout from bitsandbytes.nn.modules import Linear8bitLt +from testutil import skip_if_no_cuda # contributed by Alex Borzunov, see: # https://github.com/bigscience-workshop/petals/blob/main/tests/test_linear8bitlt.py @@ -32,8 +33,7 @@ def test_layout_exact_match(): assert restored_x.is_contiguous() assert torch.all(torch.eq(restored_x, x)) - -@pytest.mark.skipif(not torch.cuda.is_available(), reason="this test requires a GPU") +@skip_if_no_cuda() def test_linear_no_igemmlt(): linear = torch.nn.Linear(1024, 3072) x = torch.randn(3, 1024, dtype=torch.half) diff --git a/tests/test_modules.py b/tests/test_modules.py index 7d2d03498..27100bfeb 100644 --- a/tests/test_modules.py +++ b/tests/test_modules.py @@ -1,11 +1,14 @@ +import math from itertools import product import pytest import torch from torch import nn +import einops import bitsandbytes as bnb +from testutil import skip_if_no_cuda class MockArgs: def __init__(self, initial_data): @@ -315,6 +318,7 @@ def forward(self, x): names = [f"threshold_{vals}" for vals in values] +@skip_if_no_cuda() @pytest.mark.parametrize("threshold", values, ids=names) def test_linear8bitlt_inference(threshold): l1 = bnb.nn.Linear8bitLt(32, 64, threshold=threshold).cuda().half() @@ -329,6 +333,7 @@ def test_linear8bitlt_inference(threshold): assert l1.state.CxB is not None +@skip_if_no_cuda() def test_linear8bitlt_accumulated_gradient(): l1 = torch.nn.Sequential(*[bnb.nn.Linear8bitLt(32, 32).cuda().half() for i in range(2)]) l2 = torch.nn.Sequential(*[torch.nn.Linear(32, 32).cuda().half() for i in range(2)]) @@ -343,7 +348,7 @@ def test_linear8bitlt_accumulated_gradient(): acc_steps = 10 for i in range(10): - b1 = torch.randn(16, 8, 32, device="cuda").half() + b1 = torch.randn(16, 8, 32, device=device).half() o1 = l1(b1) o2 = l2(b1) loss1 = o1.mean() @@ -374,7 +379,7 @@ def test_linear8bitlt_accumulated_gradient(): torch.testing.assert_close(l1[0].weight.grad, l2[0].weight.grad, atol=1e-3, rtol=1e-3) torch.testing.assert_close(l1[1].weight.grad, l2[1].weight.grad, atol=1e-3, rtol=1e-3) - +@skip_if_no_cuda() @pytest.mark.parametrize("threshold", [0.0, 2.0]) @pytest.mark.parametrize("memory_efficient_backward", [False]) def test_linear8bitlt_no_fp16_weights(threshold, memory_efficient_backward): @@ -486,6 +491,7 @@ def test_linear8bitlt_no_fp16_weights(threshold, memory_efficient_backward): assert (idx == 0).sum().item() <= b1.numel() * 0.005 +@skip_if_no_cuda() 
@pytest.mark.parametrize("module", [lambda nin, nout, bias=True: bnb.nn.Linear8bitLt(nin, nout, bias=bias, has_fp16_weights=False), bnb.nn.LinearFP4], ids=['Int8Lt', 'FP4']) def test_linear_kbit_fp32_bias(module): # casts model to fp16 -> int8 automatically @@ -581,6 +587,7 @@ def test_kbit_backprop(module): print('rel out', sum(relerrs1)/len(relerrs1)) print('rel grad', sum(relerrs2)/len(relerrs2)) +@skip_if_no_cuda() def test_fp8linear(): b = 10 diff --git a/tests/test_optim.py b/tests/test_optim.py index 49d4f442a..7365e2dc6 100644 --- a/tests/test_optim.py +++ b/tests/test_optim.py @@ -13,6 +13,7 @@ import bitsandbytes as bnb import bitsandbytes.functional as F +from testutil import skip_if_no_cuda # import apex @@ -109,6 +110,7 @@ def rm_path(path): optimizer_names = ["adam", "momentum", "rmsprop", 'paged_adamw', 'paged_adam', 'lion', 'paged_lion'] values = list(product(dim1, dim2, gtype, optimizer_names)) names = ["dim1_{}_dim2_{}_gtype_{}_optim_{}".format(*vals) for vals in values] +@skip_if_no_cuda() @pytest.mark.parametrize("dim1, dim2, gtype, optim_name", values, ids=names) def test_optimizer32bit(dim1, dim2, gtype, optim_name): if gtype == torch.bfloat16 and optim_name in ['momentum', 'rmsprop']: pytest.skip() @@ -186,6 +188,7 @@ def test_optimizer32bit(dim1, dim2, gtype, optim_name): names = ["dim1_{}_dim2_{}_gtype_{}".format(*vals) for vals in values] +@skip_if_no_cuda() @pytest.mark.parametrize("dim1, dim2, gtype", values, ids=names) def test_global_config(dim1, dim2, gtype): if dim1 == 1 and dim2 == 1: @@ -251,6 +254,7 @@ def test_global_config(dim1, dim2, gtype): ] +@skip_if_no_cuda() @pytest.mark.parametrize("dim1, dim2, gtype, optim_name", values, ids=names) def test_optimizer8bit(dim1, dim2, gtype, optim_name): if gtype == torch.bfloat16 and optim_name not in ['adam8bit_blockwise', 'lion8bit_blockwise']: pytest.skip() @@ -388,6 +392,7 @@ def test_optimizer8bit(dim1, dim2, gtype, optim_name): ] +@skip_if_no_cuda() @pytest.mark.parametrize("dim1, dim2, gtype, optim_bits", values, ids=names) def test_adam_percentile_clipping(dim1, dim2, gtype, optim_bits): if dim1 == 1 and dim2 == 1: @@ -491,6 +496,7 @@ def test_adam_percentile_clipping(dim1, dim2, gtype, optim_bits): ] +@skip_if_no_cuda() @pytest.mark.parametrize("dim1, dim2, gtype, optim_name", values, ids=names) def test_benchmark_blockwise(dim1, dim2, gtype, optim_name): if dim1 == 1 and dim2 == 1: diff --git a/tests/testutil.py b/tests/testutil.py new file mode 100644 index 000000000..a805f60bf --- /dev/null +++ b/tests/testutil.py @@ -0,0 +1,21 @@ +import pytest +import torch +from bitsandbytes.utils import get_cuda_devices + +def get_gpu_devices(): + """ + Returns a list of all GPU devices supported by Torch in the current environment (i.e. devices that Torch was built with + support for and are present in the current environment). + """ + ret = [] + if torch.backends.mps.is_built() and torch.backends.mps.is_available(): + ret.append("mps") + if torch.cuda.is_available(): + ret += get_cuda_devices() + return ret + +def skip_if_no_gpu(): + return pytest.mark.skipif(not get_gpu_devices() or not torch.cuda.is_available(), reason="No GPU device found by Torch") + +def skip_if_no_cuda(): + return pytest.mark.skipif(not torch.cuda.is_available(), reason="No CUDA-compatible device found by Torch")
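Note, outside the diff: several tests above now take a `device` argument (for example test_dynamic_quantization and test_igemm), but the fixture supplying it is not shown in these hunks. A minimal sketch of what such a conftest.py fixture could look like, assuming it simply enumerates the backends Torch reports as available:

    # tests/conftest.py -- hypothetical sketch, not part of this change.
    import pytest
    import torch

    def _available_devices():
        # Always include CPU; add GPU backends only when Torch can use them.
        devices = ["cpu"]
        if torch.cuda.is_available():
            devices.append("cuda")
        if torch.backends.mps.is_built() and torch.backends.mps.is_available():
            devices.append("mps")
        return devices

    @pytest.fixture(params=_available_devices())
    def device(request):
        # Each test that accepts `device` runs once per available backend.
        return request.param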