diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
new file mode 100644
index 000000000..a1b8c903b
--- /dev/null
+++ b/.github/workflows/python-package.yml
@@ -0,0 +1,201 @@
+name: Python package
+
+on:
+  push:
+    branches: [ "*" ]
+  pull_request:
+    branches: [ master ]
+  release:
+    types: [ published ]
+
+jobs:
+
+  ##
+  # This job matrix builds the non-CUDA versions of the libraries for all supported platforms.
+  ##
+  build-shared-libs:
+    strategy:
+      matrix:
+        os: [ubuntu-latest, macos-latest, windows-latest]
+        arch: [x86_64, aarch64]
+        exclude:
+          - os: windows-latest # This probably requires arm64 Windows agents
+            arch: aarch64
+    runs-on: ${{ matrix.os }} # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents
+    steps:
+      # Check out code
+      - uses: actions/checkout@v3
+      # On Linux we use CMake within Docker
+      - name: Setup cmake
+        uses: jwlawson/actions-setup-cmake@v1.13
+        with:
+          cmake-version: '3.26.x'
+      - name: Add msbuild to PATH
+        uses: microsoft/setup-msbuild@v1.1
+        if: ${{ startsWith(matrix.os, 'windows') }}
+      # Compile C++ code
+      - name: Build C++
+        shell: bash
+        run: |
+          set -ex
+          build_os=${{ matrix.os }}
+          build_arch=${{ matrix.arch }}
+          ( git clone https://github.com/NVlabs/cub ./dependencies/cub; cd dependencies/cub; git checkout 1.11.0 )
+          if [ ${build_os:0:6} == ubuntu -a ${build_arch} == aarch64 ]; then
+            # Allow cross-compile on aarch64
+            sudo apt-get install -y gcc-aarch64-linux-gnu binutils-aarch64-linux-gnu
+          fi
+          if [ ${build_os:0:5} == macos -a ${build_arch} == aarch64 ]; then
+            cmake -DCMAKE_OSX_ARCHITECTURES=arm64 -DENABLE_CUDA=OFF -DENABLE_MPS=ON .
+          else
+            cmake -DENABLE_CUDA=OFF .
+          fi
+          if [ ${build_os:0:7} == windows ]; then
+            pwsh -Command "msbuild bitsandbytes.vcxproj /property:Configuration=Release"
+          else
+            make
+          fi
+          mkdir -p output/${{ matrix.os }}/${{ matrix.arch }}
+          ( shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} output/${{ matrix.os }}/${{ matrix.arch }}/ )
+      - name: Upload build artifact
+        uses: actions/upload-artifact@v3
+        with:
+          name: shared_library
+          path: output/*
+          retention-days: 7
+  ##
+  # This job matrix builds the CUDA versions of the libraries for platforms that support CUDA (Linux x64/aarch64 + Windows x64)
+  ##
+  build-shared-libs-cuda:
+    strategy:
+      matrix:
+        os: [ubuntu-latest, windows-latest]
+        arch: [x86_64, aarch64]
+        cuda_version: ['12.1.0']
+        exclude:
+          - os: windows-latest # This probably requires arm64 Windows agents
+            arch: aarch64
+    runs-on: ${{ matrix.os }} # One day, we could run them on native agents.
Azure supports this now but it's planned only for Q3 2023 for hosted agents + steps: + # Check out code + - uses: actions/checkout@v3 + # Linux: We use Docker to build cross platform Cuda (aarch64 is built in emulation) + - name: Set up Docker multiarch + if: startsWith(matrix.os, 'ubuntu') + uses: docker/setup-qemu-action@v2 + # On Linux we use CMake within Docker + - name: Setup cmake + if: ${{ !startsWith(matrix.os, 'linux') }} + uses: jwlawson/actions-setup-cmake@v1.13 + with: + cmake-version: '3.26.x' + # Windows: We install Cuda on the agent (slow) + - uses: Jimver/cuda-toolkit@v0.2.10 + if: startsWith(matrix.os, 'windows') + id: cuda-toolkit + with: + cuda: ${{ matrix.cuda_version }} + method: 'local' + #sub-packages: '["nvcc","cudart","nvrtc_dev","cublas_dev","cusparse_dev","visual_studio_integration"]' + - name: Add msbuild to PATH + uses: microsoft/setup-msbuild@v1.1 + if: ${{ startsWith(matrix.os, 'windows') }} + # Compile C++ code + - name: Build C++ + shell: bash + run: | + set -ex + build_os=${{ matrix.os }} + build_arch=${{ matrix.arch }} + ( git clone https://github.com/NVlabs/cub ./dependencies/cub; cd dependencies/cub; git checkout 1.11.0 ) + if [ ${build_os:0:6} == ubuntu ]; then + image=nvidia/cuda:${{ matrix.cuda_version }}-devel-ubuntu22.04 + echo "Using image $image" + docker run --platform linux/$build_arch -i -w /src -v $PWD:/src $image sh -c \ + "apt-get update \ + && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \ + && cmake -DENABLE_CUDA=ON . \ + && make" + else + cmake -DENABLE_CUDA=ON . + pwsh -Command "msbuild bitsandbytes.vcxproj /property:Configuration=Release" + fi + mkdir -p output/${{ matrix.os }}/${{ matrix.arch }} + ( shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} output/${{ matrix.os }}/${{ matrix.arch }}/ ) + - name: Upload build artifact + uses: actions/upload-artifact@v3 + with: + name: shared_library + path: output/* + retention-days: 7 + build-wheels: + needs: + - build-shared-libs + - build-shared-libs-cuda + strategy: + matrix: + os: [ubuntu-latest, macos-latest, windows-latest] + python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] + arch: [x86_64, aarch64] + exclude: + - os: windows-latest # This probably requres arm64 Windows agents + arch: aarch64 + runs-on: ${{ matrix.os }} + steps: + # Check out code + - uses: actions/checkout@v3 + # Download shared libraries + - name: Download build artifact + uses: actions/download-artifact@v3 + with: + name: shared_library + path: output/ + - name: Copy correct platform shared library + shell: bash + run: | + cp output/${{ matrix.os }}/${{ matrix.arch }}/* bitsandbytes/ + # Compile C++ code + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + # + - name: Install Python dependencies + shell: bash + run: | + pip install -r requirements.txt + # TODO: How to run CUDA tests on GitHub actions? + #- name: Run unit tests + # if: ${{ matrix.arch == 'x86_64' }} # Tests are too slow to run in emulation. Wait for real aarch64 agents + # run: | + # PYTHONPATH=. 
pytest --log-cli-level=DEBUG tests + - name: Build wheel + shell: bash + run: | + python setup.py bdist_wheel + - name: Upload build artifact + uses: actions/upload-artifact@v3 + with: + name: bdist_wheel + path: dist/bitsandbytes-*.whl + retention-days: 7 + publish: + needs: build-wheels + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Build dist + run: | + python setup.py sdist + - name: Download build artifact + uses: actions/download-artifact@v3 + with: + name: bdist_wheel + path: dist/ + - run: | + ls -lR dist/ + - name: Publish to PyPi + if: startsWith(github.ref, 'refs/tags') + uses: pypa/gh-action-pypi-publish@release/v1 + with: + password: ${{ secrets.pypi }} diff --git a/.gitignore b/.gitignore index 2f929968b..202dcb13d 100644 --- a/.gitignore +++ b/.gitignore @@ -2,9 +2,26 @@ __pycache__/ *.py[cod] *$py.class - -# C extensions *.so +*.dll +*.dylib +*.o +*.obj +*.air +*.metallib + +# CMake generated files +CMakeCache.txt +CMakeScripts/ +cmake_install.cmake +Makefile +CMakeFiles/ +*.sln +*.vcxproj* +*.xcodeproj/ +bitsandbytes.dir/ +Debug/ +Release/ # Distribution / packaging .Python @@ -133,4 +150,5 @@ dmypy.json dependencies cuda_build +output/ .vscode/* diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 000000000..d6e269d15 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,121 @@ +cmake_minimum_required(VERSION 3.22.1) + +option(ENABLE_CUDA "Build for CUDA (Nvidia)" OFF) +option(ENABLE_MPS "Build for Metal Performance Shaders (Apple)" OFF) + +if(ENABLE_CUDA) + if(APPLE) + message(FATAL_ERROR "CUDA is not supported on macOS" ) + endif() + option(NO_CUBLASLT "Don't use CUBLAST" OFF) + if(NO_CUBLASLT) + set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72) + else() + set(CMAKE_CUDA_ARCHITECTURES 75 80 86 89 90) + endif() +endif() + +if(ENABLE_CUDA) + message("Building CUDA support for ${CMAKE_CUDA_ARCHITECTURES}") + # Find CUDA tools if we are compiling with CUDA + find_package(CUDAToolkit REQUIRED) + if(NO_CUBLASLT) + set(LIBSUFFIX "cuda${CUDAToolkit_VERSION_MAJOR}${CUDAToolkit_VERSION_MINOR}_nocublaslt") + else() + set(LIBSUFFIX "cuda${CUDAToolkit_VERSION_MAJOR}${CUDAToolkit_VERSION_MINOR}") + endif() + + project(bitsandbytes LANGUAGES CXX CUDA) + add_compile_definitions(BUILD_CUDA) + set(CMAKE_CUDA_STANDARD 14) + set(CMAKE_CUDA_STANDARD_REQUIRED ON) + set(GPU_SOURCES csrc/ops.cu csrc/kernels.cu) +elseif(ENABLE_MPS) + if(NOT APPLE) + message(FATAL_ERROR "MPS is only supported on macOS" ) + endif() + message("Building MPS support") + set(LIBSUFFIX "mps") + project(bitsandbytes LANGUAGES CXX OBJCXX) + add_compile_definitions(BUILD_MPS) + set(METAL_SOURCES csrc/mps_kernels.metal) + file(MAKE_DIRECTORY "build") + add_custom_command(OUTPUT "bitsandbytes/bitsandbytes.metallib" + COMMAND xcrun metal -c -o "build/bitsandbytes.air" ${METAL_SOURCES} + COMMAND xcrun metallib "build/bitsandbytes.air" -o "bitsandbytes/bitsandbytes.metallib" + DEPENDS "${METAL_SOURCES}" + COMMENT "Compiling Metal kernels" + VERBATIM) + add_custom_target(metallib DEPENDS "bitsandbytes/bitsandbytes.metallib") + set(GPU_SOURCES csrc/mps_ops.mm) +else() + message("Building with CPU only") + set(LIBSUFFIX "cpu") + + project(bitsandbytes LANGUAGES CXX) + set(GPU_SOURCES) +endif() + +if(APPLE) + set(CMAKE_OSX_DEPLOYMENT_TARGET 13.1) +endif() +set(CMAKE_CXX_STANDARD 14) +set(CXX_STANDARD_REQUIRED C++14) + +if(WIN32) + # Mute warnings + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -diag-suppress=177") + + # Enable fast math on VC++ + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /fp:fast") + 
+ # Export all symbols + set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) +endif() + +# Weird MSVC hacks +if(MSVC) + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /NODEFAULTLIB:msvcprtd /NODEFAULTLIB:MSVCRTD /NODEFAULTLIB:LIBCMT") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2") +endif() + +# Add csrc files +add_library(bitsandbytes SHARED + ${GPU_SOURCES} + csrc/common.cpp + csrc/cpu_ops.cpp + csrc/pythonInterface.cpp) + +target_include_directories(bitsandbytes PUBLIC + ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} + ${CMAKE_CURRENT_SOURCE_DIR}/csrc + ${CMAKE_CURRENT_SOURCE_DIR}/include) + +if(ENABLE_CUDA) + target_include_directories(bitsandbytes PUBLIC ${CUDA_TOOLKIT_ROOT_DIR}/include) + + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --use_fast_math") + + set_target_properties( + bitsandbytes + PROPERTIES + CUDA_SEPARABLE_COMPILATION ON) + + target_link_libraries(bitsandbytes CUDA::cudart CUDA::cublas CUDA::cublasLt CUDA::cusparse) +endif() +if(ENABLE_MPS) + add_dependencies(bitsandbytes metallib) + target_link_libraries(bitsandbytes objc "-framework Foundation" "-framework Metal" "-framework MetalPerformanceShaders" "-framework MetalPerformanceShadersGraph") +endif() + +set_target_properties(bitsandbytes PROPERTIES OUTPUT_NAME "bitsandbytes_${LIBSUFFIX}") +# Set the output name of the CUDA library +if(MSVC) +set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_RELEASE bitsandbytes) +set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_DEBUG bitsandbytes) +set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELEASE bitsandbytes) +set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_DEBUG bitsandbytes) +endif() + +set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY bitsandbytes) diff --git a/Makefile b/Makefile.previous similarity index 77% rename from Makefile rename to Makefile.previous index 5f997a122..e45b940c8 100644 --- a/Makefile +++ b/Makefile.previous @@ -14,6 +14,33 @@ CUDA_VERSION:= endif endif +NATIVE_ARCH:=$(shell (arch | sed -e s/arm64/aarch64/)) +NATIVE_OS:=$(shell uname) +SHLIB_EXTENSION:=.so +ifeq ($(TARGET_ARCH),) + TARGET_ARCH:=$(NATIVE_ARCH) +endif +ifeq ($(TARGET_ARCH),aarch64) + EXTRA_FLAGS:= +else + EXTRA_FLAGS:=-msse3 -mavx2 +endif +ifeq ($(NATIVE_OS),Darwin) + TARGET_OS:=apple-darwin + SHLIB_EXTENSION:=.dylib +else + ifeq ($(OS),Windows_NT) + TARGET_OS:=windows + GPP:=cl.exe + SHLIB_EXTENSION:=.dll + else + TARGET_OS:=linux-gnu + endif +endif +ifneq ($(TARGET_ARCH),$(NATIVE_ARCH)) + EXTRA_FLAGS:=$(EXTRA_FLAGS) --target=$(TARGET_ARCH)-apple-darwin +endif + NVCC := $(CUDA_HOME)/bin/nvcc @@ -24,7 +51,7 @@ CSRC := $(ROOT_DIR)/csrc BUILD_DIR:= $(ROOT_DIR)/build FILES_CUDA := $(CSRC)/ops.cu $(CSRC)/kernels.cu -FILES_CPP := $(CSRC)/common.cpp $(CSRC)/cpu_ops.cpp $(CSRC)/pythonInterface.c +FILES_CPP := $(CSRC)/common.cpp $(CSRC)/cpu_ops.cpp $(CSRC)/pythonInterface.cpp INCLUDE := -I $(CUDA_HOME)/include -I $(ROOT_DIR)/csrc -I $(CONDA_PREFIX)/include -I $(ROOT_DIR)/include LIB := -L $(CUDA_HOME)/lib64 -lcudart -lcublas -lcublasLt -lcusparse -L $(CONDA_PREFIX)/lib @@ -64,12 +91,12 @@ all: $(BUILD_DIR) env cuda110_nomatmul_kepler: $(BUILD_DIR) env $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) $(CC_KEPLER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) $(CC_KEPLER) -Xcompiler '-fPIC' -dlink 
$(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB) + $(GPP) $(EXTRA_FLAGS) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt$(SHLIB_EXTENSION) $(LIB) cuda11x_nomatmul_kepler: $(BUILD_DIR) env $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_KEPLER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_KEPLER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB) + $(GPP) $(EXTRA_FLAGS) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt$(SHLIB_EXTENSION) $(LIB) cuda110_nomatmul: $(BUILD_DIR) env @@ -90,17 +117,17 @@ cuda118_nomatmul: $(BUILD_DIR) env cuda12x_nomatmul: $(BUILD_DIR) env $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB) + $(GPP) $(EXTRA_FLAGS) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt$(SHLIB_EXTENSION) $(LIB) cuda110: $(BUILD_DIR) env $(NVCC) $(CC_cublasLt110) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) $(NVCC) $(CC_cublasLt110) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) + $(GPP) $(EXTRA_FLAGS) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)$(SHLIB_EXTENSION) $(LIB) cuda11x: $(BUILD_DIR) env $(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) $(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) + $(GPP) $(EXTRA_FLAGS) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o 
$(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)$(SHLIB_EXTENSION) $(LIB) cuda118: $(BUILD_DIR) env $(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) @@ -110,10 +137,13 @@ cuda118: $(BUILD_DIR) env cuda12x: $(BUILD_DIR) env $(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) $(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) + $(GPP) $(EXTRA_FLAGS) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)$(SHLIB_EXTENSION) $(LIB) cpuonly: $(BUILD_DIR) env - $(GPP) -std=c++14 -shared -fPIC -I $(ROOT_DIR)/csrc -I $(ROOT_DIR)/include $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cpu.so + $(GPP) $(EXTRA_FLAGS) -std=c++14 -shared -fPIC -I $(ROOT_DIR)/csrc -I $(ROOT_DIR)/include $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cpu$(SHLIB_EXTENSION) + +version: + $(GPP) --version env: @echo "ENVIRONMENT" diff --git a/bitsandbytes/__main__.py b/bitsandbytes/__main__.py index 523d02301..7aea163fe 100644 --- a/bitsandbytes/__main__.py +++ b/bitsandbytes/__main__.py @@ -103,7 +103,9 @@ def print_debug_info() -> None: print_header("OTHER") print(f"COMPILED_WITH_CUDA = {COMPILED_WITH_CUDA}") -print(f"COMPUTE_CAPABILITIES_PER_GPU = {get_compute_capabilities()}") +if COMPILED_WITH_CUDA: + cuda = get_cuda_lib_handle() + print(f"COMPUTE_CAPABILITIES_PER_GPU = {get_compute_capabilities(cuda)}") print_header("") print_header("DEBUG INFO END") print_header("") diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py index 19f224391..5914225cc 100644 --- a/bitsandbytes/autograd/_functions.py +++ b/bitsandbytes/autograd/_functions.py @@ -7,7 +7,8 @@ import torch -import bitsandbytes.functional as F +from .. import functional as F +from ..utils import is_cuda_device # math.prod not compatible with python < 3.8 @@ -224,6 +225,8 @@ def backward(ctx, grad_output): def supports_igemmlt(device: torch.device) -> bool: """check if this device supports the optimized int8 kernel""" + if not is_cuda_device(device): + return False if torch.cuda.get_device_capability(device=device) < (7, 5): return False device_name = torch.cuda.get_device_name(device=device) diff --git a/bitsandbytes/cextension.py b/bitsandbytes/cextension.py index d52a6d607..ebccf2e0d 100644 --- a/bitsandbytes/cextension.py +++ b/bitsandbytes/cextension.py @@ -1,41 +1,20 @@ import ctypes as ct -import os import torch from pathlib import Path from warnings import warn -from bitsandbytes.cuda_setup.main import CUDASetup - - -setup = CUDASetup.get_instance() -if setup.initialized != True: - setup.run_cuda_setup() - -lib = setup.lib -try: - if lib is None and torch.cuda.is_available(): - CUDASetup.get_instance().generate_instructions() - CUDASetup.get_instance().print_log_stack() - raise RuntimeError(''' - CUDA Setup failed despite GPU being available. 
Please run the following command to get more information: - - python -m bitsandbytes - - Inspect the output of the command and see if you can locate CUDA libraries. You might need to add them - to your LD_LIBRARY_PATH. If you suspect a bug, please take the information from python -m bitsandbytes - and open an issue at: https://github.com/TimDettmers/bitsandbytes/issues''') - lib.cadam32bit_grad_fp32 # runs on an error if the library could not be found -> COMPILED_WITH_CUDA=False - lib.get_context.restype = ct.c_void_p - lib.get_cusparse.restype = ct.c_void_p - lib.cget_managed_ptr.restype = ct.c_void_p - COMPILED_WITH_CUDA = True -except AttributeError as ex: - warn("The installed version of bitsandbytes was compiled without GPU support. " - "8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.") +if torch.backends.mps.is_built(): + package_dir = Path(__file__).parent + binary_path = package_dir / "libbitsandbytes_mps.dylib" + lib = ct.cdll.LoadLibrary(binary_path) COMPILED_WITH_CUDA = False - print(str(ex)) +elif torch.cuda.is_available(): + from bitsandbytes.cuda_setup.main import CUDASetup + setup = CUDASetup.get_instance() + if setup.initialized != True: + setup.run_cuda_setup() # print the setup details after checking for errors so we do not print twice #if 'BITSANDBYTES_NOWELCOME' not in os.environ or str(os.environ['BITSANDBYTES_NOWELCOME']) == '0': diff --git a/bitsandbytes/cuda_setup/main.py b/bitsandbytes/cuda_setup/main.py index 34c035425..1e4e8c611 100644 --- a/bitsandbytes/cuda_setup/main.py +++ b/bitsandbytes/cuda_setup/main.py @@ -99,6 +99,7 @@ def initialize(self): self.lib = None self.initialized = False self.error = False + self.cuda_setup_log = [] def manual_override(self): if torch.cuda.is_available(): @@ -116,7 +117,6 @@ def manual_override(self): def run_cuda_setup(self): self.initialized = True - self.cuda_setup_log = [] binary_name, cudart_path, cc, cuda_version_string = evaluate_cuda_setup() self.cudart_path = cudart_path diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index e17e70c4b..ee1d70f7e 100644 --- a/bitsandbytes/functional.py +++ b/bitsandbytes/functional.py @@ -136,12 +136,14 @@ def get_instance(cls): return cls._instance def get_context(self, device): - if device.index not in self.context: - prev_device = torch.cuda.current_device() - torch.cuda.set_device(device) - self.context[device.index] = ct.c_void_p(lib.get_context()) - torch.cuda.set_device(prev_device) - return self.context[device.index] + if device.type == "cuda": + if device.index not in self.context: + prev_device = torch.cuda.current_device() + torch.cuda.set_device(device) + self.context[device.index] = ct.c_void_p(lib.get_context()) + torch.cuda.set_device(prev_device) + return self.context[device.index] + return None class Cusparse_Context: @@ -413,12 +415,16 @@ def get_ptr(A: Tensor) -> ct.c_void_p: def pre_call(device): + if device.type != "cuda": + return prev_device = torch.cuda.current_device() torch.cuda.set_device(device) return prev_device def post_call(prev_device): + if not prev_device or prev_device.type != "cuda": + return torch.cuda.set_device(prev_device) @@ -812,7 +818,7 @@ def dequantize_blockwise( if out is None: out = torch.empty(A.shape, dtype=quant_state.dtype, device=A.device) - if A.device.type != 'cpu': + if A.device.type == 'cuda': device = pre_call(A.device) code = quant_state.code.to(A.device) if quant_state.blocksize not in [2048, 4096, 1024, 512, 256, 128, 64]: @@ -826,7 +832,9 @@ def dequantize_blockwise( 
lib.cdequantize_blockwise_bf16(get_ptr(quant_state.code), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(quant_state.blocksize), ct.c_int(A.numel())) else: raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}") - post_call(A.device) + post_call(device) + elif A.device.type == 'mps': + raise NotImplementedError("MPS is not implemented") else: code = quant_state.code.cpu() lib.cdequantize_blockwise_cpu_fp32(get_ptr(code), get_ptr(A), get_ptr(quant_state.absmax), get_ptr(out), ct.c_longlong(quant_state.blocksize), ct.c_longlong(A.numel())) @@ -1495,7 +1503,7 @@ def histogram_scatter_add_2d( lib.chistogram_scatter_add_2d(get_ptr(histogram), get_ptr(index1), get_ptr(index2), get_ptr(source), maxdim1, n) def check_matmul(A, B, out, transposed_A, transposed_B, expected_type=torch.int8): - if not torch.cuda.is_initialized(): torch.cuda.init() + if A.device.type == "cuda" and not torch.cuda.is_initialized(): torch.cuda.init() if A.dtype != expected_type or B.dtype != expected_type: raise TypeError( f"Expected torch.int8 input tensors A and B, but got {A.dtype} and {B.dtype}" @@ -2101,7 +2109,7 @@ def double_quant( ): device = A.device assert A.dtype == torch.half - assert device.type == "cuda" + #assert device.type == "cuda" prev_device = pre_call(A.device) cols = A.shape[-1] diff --git a/bitsandbytes/utils.py b/bitsandbytes/utils.py index df2885306..9202d5c95 100644 --- a/bitsandbytes/utils.py +++ b/bitsandbytes/utils.py @@ -120,6 +120,19 @@ def execute_and_return_decoded_std_streams(command_string): std_out, std_err = execute_and_return_decoded_std_streams(command_string) return std_out, std_err +__cuda_devices = None +def get_cuda_devices(): + global __cuda_devices + if __cuda_devices is None: + cuda_devices = [] + if torch.cuda.is_available(): + devices = [d for d in range(torch.cuda.device_count())] + cuda_devices = [torch.cuda.get_device_name(d) for d in devices] + __cuda_devices = cuda_devices + return __cuda_devices + +def is_cuda_device(device): + return device in get_cuda_devices() def replace_linear(model, linear_replacement, skip_modules=["lm_head"], copy_weights=False, post_processing_function=None): diff --git a/csrc/common.cpp b/csrc/common.cpp index 52f029917..0a9601689 100644 --- a/csrc/common.cpp +++ b/csrc/common.cpp @@ -1,39 +1,35 @@ #include #include -void *quantize_block(void *arguments) { +void quantize_block(const quantize_block_args& args) { // 1. find absmax in block // 2. divide input value by absmax to normalize into [-1.0, 1.0] // 3. do binary search to find the closest value // 4. check minimal distance // 5. store index - struct quantize_block_args *args = (quantize_block_args *) arguments; - // 1. find absmax in block float absmax_block = -FLT_MAX; - for (long long i = args->block_idx; i < args->block_end; i++) - absmax_block = fmax(absmax_block, fabs(args->A[i])); + for (long long i = args.block_idx; i < args.block_end; i++) + absmax_block = fmax(absmax_block, fabs(args.A[i])); - args->absmax[args->block_idx / args->blocksize] = absmax_block; + args.absmax[args.block_idx / args.blocksize] = absmax_block; - for (long long i = args->block_idx; i < args->block_end; i++) { + for (long long i = args.block_idx; i < args.block_end; i++) { // 2. divide input value by absmax to normalize into [-1.0, 1.0] // 3. 
do binary search to find the closest value - float normed_value = args->A[i] / absmax_block; - long long idx = args->bin_searcher->scalar(normed_value); + float normed_value = args.A[i] / absmax_block; + long long idx = args.bin_searcher->scalar(normed_value); // 4. check minimal distance // The binary search returns always the value to the left, which might not be the closest value if (idx < 255) { - float dist_left = fabs(normed_value - (args->code[idx])); - float dist_right = fabs(normed_value - (args->code[idx + 1])); + float dist_left = fabs(normed_value - (args.code[idx])); + float dist_right = fabs(normed_value - (args.code[idx + 1])); if (dist_right < dist_left) { idx += 1; } } // 5. store index - args->out[i] = (unsigned char) idx; + args.out[i] = (unsigned char) idx; } - - return NULL; } diff --git a/csrc/common.h b/csrc/common.h index c99034e78..e513f2875 100644 --- a/csrc/common.h +++ b/csrc/common.h @@ -20,6 +20,6 @@ struct quantize_block_args { }; -void *quantize_block(void *arguments); +void quantize_block(const quantize_block_args& args); #endif diff --git a/csrc/cpu_ops.cpp b/csrc/cpu_ops.cpp index e28e7b2c2..478c1f4ff 100644 --- a/csrc/cpu_ops.cpp +++ b/csrc/cpu_ops.cpp @@ -1,6 +1,6 @@ #include -#include #include +#include using namespace BinSearch; @@ -31,12 +31,8 @@ void quantize_cpu(float *code, float *A, float *absmax, unsigned char *out, long for(long long offset = 0; offset < num_blocks; offset+=thread_wave_size) { long long valid_chunks = num_blocks - offset >= thread_wave_size ? thread_wave_size : num_blocks - offset; - pthread_t *threads = (pthread_t *) malloc(sizeof(pthread_t) * valid_chunks); - - struct quantize_block_args **args = (quantize_block_args **) malloc(valid_chunks * sizeof(quantize_block_args *)); - - for(long long i = 0; i < valid_chunks; i++) - args[i] = (quantize_block_args *) malloc(sizeof(quantize_block_args)); + std::vector threads(valid_chunks); + std::vector args(valid_chunks); int chunks_processed = 0; for(long long block_idx = offset*blocksize; block_idx < n; block_idx += blocksize) @@ -44,30 +40,24 @@ void quantize_cpu(float *code, float *A, float *absmax, unsigned char *out, long long long valid_items = n - block_idx >= blocksize ? 
blocksize : n - block_idx; long long block_end = block_idx + valid_items; - struct quantize_block_args *arg = args[chunks_processed]; - arg->bin_searcher = &bin_searcher; - arg->code = code; - arg->A = A; - arg->absmax = absmax; - arg->out = out; - arg->block_end = block_end; - arg->block_idx = block_idx; - arg->threadidx = block_idx / blocksize; - arg->blocksize = blocksize; - - pthread_create(&threads[chunks_processed], NULL, &quantize_block, (void *) arg); + struct quantize_block_args& arg = args[chunks_processed]; + arg.bin_searcher = &bin_searcher; + arg.code = code; + arg.A = A; + arg.absmax = absmax; + arg.out = out; + arg.block_end = block_end; + arg.block_idx = block_idx; + arg.threadidx = block_idx / blocksize; + arg.blocksize = blocksize; + + threads[chunks_processed] = std::thread([arg] { quantize_block(arg); }); chunks_processed += 1; if(chunks_processed == valid_chunks){ break; } } for (int i = 0; i < valid_chunks; i++) - int err = pthread_join(threads[i], NULL); - - free(threads); - for (int i = 0; i < valid_chunks; i++) - free(args[i]); - free(args); - + threads[i].join(); } } diff --git a/csrc/kernels.cu b/csrc/kernels.cu index 1ab8aa242..c2e2d7da7 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -3816,12 +3816,12 @@ template __global__ void kgemm_4bit_inference_naive(int M, int N template __global__ void kExtractOutliers(char *A, int *idx, char *out, int idx_size, int rowsA, int colsA, int tiledRowsA, int tiledColsA); template __global__ void kExtractOutliers(char *A, int *idx, char *out, int idx_size, int rowsA, int colsA, int tiledRowsA, int tiledColsA); -template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB); -template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB); -template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB); -template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB); -template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB); -template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB); +template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB); +template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB); +template __global__ void 
kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB); +template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB); +template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB); +template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB); template __global__ void kTransformRowToFormat<256, 8, 32, 32*8, 0, COL32>(char *__restrict__ const A, char *out, int rows, int cols, int tiledCols, int outRows, int outCols); template __global__ void kTransformRowToFormat<256, 8, 32, 32*8, 1, COL32>(char *__restrict__ const A, char *out, int rows, int cols, int tiledCols, int outRows, int outCols); diff --git a/csrc/mps_kernels.metal b/csrc/mps_kernels.metal new file mode 100644 index 000000000..a5c8e35b2 --- /dev/null +++ b/csrc/mps_kernels.metal @@ -0,0 +1,117 @@ +#include +using namespace metal; + +#define HLF_MAX 65504 +#define TH 1024 +#define NUM 4 +#define NUM_BLOCK 4096 + +template +static unsigned char quantize_scalar( + float rand, + device float* code, + float x) +{ + int pivot = 127; + int upper_pivot = 255; + int lower_pivot = 0; + + float lower = -1.0f; + float upper = 1.0f; + + float val = code[pivot]; + // i>>=1 = {32, 16, 8, 4, 2, 1} + for(int i = 64; i > 0; i>>=1) + { + if(x > val) + { + lower_pivot = pivot; + lower = val; + pivot+=i; + } + else + { + upper_pivot = pivot; + upper = val; + pivot-=i; + } + val = code[pivot]; + } + + if(upper_pivot == 255) + upper = code[upper_pivot]; + if(lower_pivot == 0) + lower = code[lower_pivot]; + + if(!STOCHASTIC) + { + if(x > val) + { + float midpoint = (upper+val)*0.5f; + if(x > midpoint) + { + return upper_pivot; + } + else + return pivot; + } + else + { + float midpoint = (lower+val)*0.5f; + if(x < midpoint) + return lower_pivot; + else + return pivot; + } + } + else + { + if(x > val) + { + float dist_to_upper = fabs(upper-x); + float dist_full = upper-val; + if(rand >= dist_to_upper/dist_full) return upper_pivot; + else return pivot; + } + else + { + float dist_to_lower = fabs(lower-x); + float dist_full = val-lower; + if(rand >= dist_to_lower/dist_full) return lower_pivot; + else return pivot; + } + } +} + +kernel void quantize(device float* code [[buffer(0)]], + device float* A [[buffer(1)]], + device uchar* out [[buffer(2)]], + constant uint& n [[buffer(3)]], + uint id [[thread_position_in_grid]]) { + const uint n_full = (NUM_BLOCK * (n / NUM_BLOCK)) + (n % NUM_BLOCK == 0 ? 0 : NUM_BLOCK); + uint valid_items = (id / NUM_BLOCK + 1 == (n + NUM_BLOCK - 1) / NUM_BLOCK) ? n - (id / NUM_BLOCK * NUM_BLOCK) : NUM_BLOCK; + const uint base_idx = (id / NUM_BLOCK * NUM_BLOCK); + + float vals[NUM]; + uchar qvals[NUM]; + + for (uint i = base_idx; i < n_full; i += ((n + NUM_BLOCK - 1) / NUM_BLOCK) * NUM_BLOCK) { + valid_items = n - i > NUM_BLOCK ? 
NUM_BLOCK : n - i; + + threadgroup_barrier(mem_flags::mem_threadgroup); + + for (uint j = 0; j < valid_items; j++) { + vals[j] = A[i + j]; + } + + for (uint j = 0; j < valid_items; j++) { + qvals[j] = quantize_scalar(0.0f, code, vals[j]); + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + for (uint j = 0; j < valid_items; j++) { + out[i + j] = qvals[j]; + } + } +} diff --git a/csrc/mps_ops.h b/csrc/mps_ops.h new file mode 100644 index 000000000..e69de29bb diff --git a/csrc/mps_ops.mm b/csrc/mps_ops.mm new file mode 100644 index 000000000..cb549e83d --- /dev/null +++ b/csrc/mps_ops.mm @@ -0,0 +1,67 @@ +#import + +#define HLF_MAX 65504 +#define TH 1024 +#define NUM 4 +#define NUM_BLOCK 4096 + +static inline MPSGraph* get_graph() +{ + static MPSGraph* cur = nil; + if(!cur) { + cur = [[MPSGraph alloc] init]; + } + return cur; +} + +static inline id get_device() +{ + NSError *error = nil; + static id device = nil; + if(!device) { + device = MTLCreateSystemDefaultDevice(); + } + if(!device) { + NSLog(@"Failed to get MPS device"); + abort(); + } + return device; +} + +static inline id get_library() +{ + NSError *error = nil; + static id library = nil; + if(!library) { + library = [get_device() newLibraryWithURL:[NSURL fileURLWithPath:@"bitsandbytes.metallib"] error:&error]; + } + if(!library) { + NSLog(@"Failed to load bitsandbytes.metallib"); + abort(); + } + return library; +} + +/*MPSGraphTensor* dequantize_mps(MPSGraphTensor* code, MPSGraphTensor* A, int n) +{ + id out = [get_graph() dequantizeTensor:(MPSGraphTensor*)A scaleTensor:(MPSGraphTensor*)code zeroPoint:0.0 dataType:MPSDataTypeInt8 axis:0 name:@"out"]; + return out; +}*/ + + +// MPSGraph function for quantize +extern "C" MPSGraphTensor* quantize_mps(MPSGraph* graph, MPSGraphTensor* code, MPSGraphTensor* A, int n) +{ + id device = get_device(); + id library = get_library(); + static id kernel = nil; + if(!kernel) { + kernel = [library newFunctionWithName:@"quantize"]; + if(!kernel) { + NSLog(@"Failed to load bitsandbytes.metallib"); + abort(); + } + } + NSLog(@"Not implemented"); + return nil; +} diff --git a/csrc/ops.cuh b/csrc/ops.cuh index f37b3b3af..cc7b59505 100644 --- a/csrc/ops.cuh +++ b/csrc/ops.cuh @@ -9,7 +9,9 @@ #include #include +#ifndef _MSC_VER #include +#endif #include #include diff --git a/csrc/pythonInterface.c b/csrc/pythonInterface.cpp similarity index 99% rename from csrc/pythonInterface.c rename to csrc/pythonInterface.cpp index 865e4b6d5..49498c19b 100644 --- a/csrc/pythonInterface.c +++ b/csrc/pythonInterface.cpp @@ -6,6 +6,9 @@ #if BUILD_CUDA #include #endif +#if BUILD_MPS +#include +#endif #include // We cannot call templated code from C, so we wrap the template in a C compatible call here if necessary. 
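For context, an illustrative sketch that is not part of this change: the extern "C" wrappers in pythonInterface.cpp exist because the Python layer loads the shared library with ctypes, which can only resolve plain C symbols. The library path and argtypes below are assumptions; only the cquantize_blockwise_cpu_fp32 wrapper name and parameter order come from this file.

    import ctypes as ct
    from pathlib import Path

    # Illustrative only: load the CPU build of the shared library.
    lib = ct.cdll.LoadLibrary(str(Path("bitsandbytes") / "libbitsandbytes_cpu.so"))

    # C-ABI wrapper defined in pythonInterface.cpp; the templated C++ code
    # behind it is not directly callable through ctypes.
    lib.cquantize_blockwise_cpu_fp32.argtypes = [
        ct.c_void_p,    # code
        ct.c_void_p,    # A
        ct.c_void_p,    # absmax
        ct.c_void_p,    # out
        ct.c_longlong,  # blocksize
        ct.c_longlong,  # n
    ]
    lib.cquantize_blockwise_cpu_fp32.restype = None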
@@ -412,6 +415,7 @@ extern "C" { gemm_4bit_inference_naive_fp32(m, n, k, A, B, absmax, datatype, out, lda, ldb, ldc, blocksize); } #endif + void cquantize_blockwise_cpu_fp32(float *code, float *A, float *absmax, unsigned char *out, long long blocksize, long long n){ quantize_cpu(code, A, absmax, out, blocksize, n); } void cdequantize_blockwise_cpu_fp32(float *code, unsigned char *A, float *absmax, float *out, long long blocksize, long long n){ dequantize_cpu(code, A, absmax, out, blocksize, n); } } diff --git a/include/Algo-Direct2.h b/include/Algo-Direct2.h index d5fa58d12..7f52fce14 100644 --- a/include/Algo-Direct2.h +++ b/include/Algo-Direct2.h @@ -52,6 +52,7 @@ struct AlgoVecBase::val private: typedef AlgoScalarBase base_t; +#ifdef USE_SSE2 FORCE_INLINE //NO_INLINE void resolve(const FVec& vz, const IVec& bidx, uint32 *pr) const @@ -135,6 +136,7 @@ struct AlgoVecBase::val pr[0] = u.ui32[0]; pr[1] = u.ui32[2]; } +#endif // USE_SSE2 #ifdef USE_AVX diff --git a/include/Portable.h b/include/Portable.h index 1710b0502..78599944e 100644 --- a/include/Portable.h +++ b/include/Portable.h @@ -4,10 +4,40 @@ #include #include +#if defined(__aarch64__) +#ifdef __CUDACC__ +#undef USE_NEON // Doesn't work with nvcc, undefined symbols +#else +#include +#undef USE_NEON // Not yet implemented +#endif +#undef USE_AVX // x86_64 only +#undef USE_AVX2 // x86_64 only +#undef USE_SSE2 // x86_64 only +#undef USE_SSE41 // x86_64 only +#undef USE_SSE42 // x86_64 only +#undef USE_FMA // x86_64 only +#ifdef USE_NEON +typedef float32x4_t __m128; +typedef int32x4_t __m128i; +typedef float64x2_t __m128d; +#else +typedef struct {float a; float b; float c; float d;} __m128; +typedef struct {int a; int b; int c; int d;} __m128i; +typedef struct {double a; double b;} __m128d; +#endif +#else +#undef USE_NEON // ARM64 only #ifdef __FMA__ #define USE_FMA #endif +#if !defined(__SSE2__) && !defined(_MSC_VER) +#error Compiler must support SSE2 +#endif +#define USE_SSE2 +#if defined(__aarch64__) +#else #ifdef __AVX2__ #define USE_AVX2 #endif @@ -24,7 +54,8 @@ #ifdef __SSE4_2__ #define USE_SSE42 #endif - +#endif +#endif #ifndef _MSC_VER #include @@ -50,7 +81,7 @@ typedef unsigned __int64 uint64; namespace Details { -#define myassert(cond, msg) if (!cond){ std::ostringstream os; os << "\nassertion failed: " << #cond << ", " << msg << "\n"; throw std::invalid_argument(os.str()); } +#define myassert(cond, msg) if (!(cond)){ std::ostringstream os; os << "\nassertion failed: " << #cond << ", " << msg << "\n"; throw std::invalid_argument(os.str()); } // log2 is not defined in VS2008 #if defined(_MSC_VER) diff --git a/include/SIMD.h b/include/SIMD.h index a2ac1a9ae..18a38dbfd 100644 --- a/include/SIMD.h +++ b/include/SIMD.h @@ -2,6 +2,46 @@ #include "Portable.h" +#ifdef USE_SSE2 +#include +#if defined(USE_AVX) || defined(USE_AVX2) +#include +#else +#ifdef USE_SSE41 +#include +#endif +#endif +#endif + +namespace BinSearch { +namespace Details { + +template +struct FTOITraits{}; + +template +struct FVec; + +template +struct IVec; + +template +struct FVec1; + +template <> struct InstrFloatTraits +{ + typedef __m128 vec_t; +}; + +template <> struct InstrFloatTraits +{ + typedef __m128d vec_t; +}; + +} +} + +#if !defined(__aarch64__) #ifdef USE_SSE42 #ifndef _MSC_VER #include @@ -26,29 +66,11 @@ FORCE_INLINE int popcnt32(int x32) } // namespace #endif -#if defined(USE_AVX) || defined(USE_AVX2) -#include -#else -#include -#ifdef USE_SSE41 -#include -#endif -#endif - #include "Type.h" namespace BinSearch { namespace Details { -template 
-struct FVec; - -template -struct IVec; - -template -struct FVec1; - template <> struct InstrIntTraits { typedef __m128i vec_t; @@ -64,8 +86,8 @@ template <> struct InstrFloatTraits typedef __m128d vec_t; }; -template -struct FTOITraits +template <> +struct FTOITraits { typedef IVec vec_t; }; @@ -285,9 +307,11 @@ FORCE_INLINE FVec operator- (const FVec& a, const FVec< FORCE_INLINE FVec operator* (const FVec& a, const FVec& b) { return _mm_mul_ps( a, b ); } FORCE_INLINE FVec operator/ (const FVec& a, const FVec& b) { return _mm_div_ps( a, b ); } FORCE_INLINE IVec ftoi (const FVec& a) { return _mm_cvttps_epi32(a); } +#ifndef __clang__ // Conflicts with builtin operator FORCE_INLINE IVec operator<= (const FVec& a, const FVec& b) { return _mm_castps_si128( _mm_cmple_ps( a, b ) ); } FORCE_INLINE IVec operator>= (const FVec& a, const FVec& b) { return _mm_castps_si128( _mm_cmpge_ps( a, b ) ); } FORCE_INLINE IVec operator< (const FVec& a, const FVec& b) { return _mm_castps_si128(_mm_cmplt_ps(a, b)); } +#endif #ifdef USE_FMA FORCE_INLINE FVec mulSub(const FVec& a, const FVec& b, const FVec& c) { return _mm_fmsub_ps(a, b, c); } #endif @@ -339,9 +363,11 @@ FORCE_INLINE FVec operator- (const FVec& a, const FVec FORCE_INLINE FVec operator* (const FVec& a, const FVec& b) { return _mm_mul_pd( a, b ); } FORCE_INLINE FVec operator/ (const FVec& a, const FVec& b) { return _mm_div_pd( a, b ); } FORCE_INLINE IVec ftoi (const FVec& a) { return _mm_cvttpd_epi32(a); } +#ifndef __clang__ // Conflicts with builtin operator FORCE_INLINE IVec operator<= (const FVec& a, const FVec& b) { return _mm_castpd_si128( _mm_cmple_pd( a, b ) ); } FORCE_INLINE IVec operator< (const FVec& a, const FVec& b) { return _mm_castpd_si128(_mm_cmplt_pd(a, b)); } FORCE_INLINE IVec operator>= (const FVec& a, const FVec& b) { return _mm_castpd_si128( _mm_cmpge_pd( a, b ) ); } +#endif #ifdef USE_FMA FORCE_INLINE FVec mulSub(const FVec& a, const FVec& b, const FVec& c ) { return _mm_fmsub_pd(a, b, c); } #endif @@ -560,3 +586,4 @@ FORCE_INLINE FVec mulSub(const FVec& a, const FVec 0: dim2 = dim2 - (dim2 % 16) dim3 = dim3 - (dim3 % 16) @@ -289,6 +291,7 @@ def test_matmul(dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose): names = ["dim1_{}_dim2_{}_dim3_{}_dim4_{}_func_{}_dtype_{}_requires_grad_{}_transpose_{}_decomp_{}_has_fp16_weights_{}_has_bias_{}".format(*vals) for vals in str_values] +@skip_if_no_cuda() @pytest.mark.parametrize( "dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, decomp, has_fp16_weights, has_bias", values, @@ -307,7 +310,6 @@ def test_matmullt( has_fp16_weights, has_bias ): - if not torch.cuda.is_available(): pytest.skip('No GPU found.') dimA = (dim2, dim3) if not transpose[0] else (dim3, dim2) dimB = (dim3, dim4) if not transpose[1] else (dim4, dim3) outlier_dim = torch.randint(0, dimA[1], size=(dimA[1] // 8,), device="cuda") diff --git a/tests/test_cuda_setup_evaluator.py b/tests/test_cuda_setup_evaluator.py index e875bcd2b..138c5e88a 100644 --- a/tests/test_cuda_setup_evaluator.py +++ b/tests/test_cuda_setup_evaluator.py @@ -17,11 +17,3 @@ def test_manual_override(): import bitsandbytes as bnb loaded_lib = bnb.cuda_setup.main.CUDASetup.get_instance().binary_name assert loaded_lib == 'libbitsandbytes_cuda122.so' - - - - - - - - diff --git a/tests/test_functional.py b/tests/test_functional.py index f825c14df..7fa0bf64c 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -12,6 +12,8 @@ from bitsandbytes import functional as F from scipy.stats import norm +from testutil import 
skip_if_no_cuda + torch.set_printoptions( precision=5, sci_mode=False, linewidth=120, edgeitems=20, threshold=10000 ) @@ -91,6 +93,7 @@ def teardown(): pass +@skip_if_no_cuda() @pytest.mark.parametrize( "dtype", [torch.float32, torch.float16], ids=["float", "half"] ) @@ -111,6 +114,7 @@ def test_estimate_quantiles(dtype): assert (diff > 5e-02).sum().item() == 0 +@skip_if_no_cuda() def test_quantile_quantization(): for i in range(100): A1 = torch.randn(1024, 1024, device="cuda") @@ -130,11 +134,12 @@ def test_quantile_quantization(): -def test_dynamic_quantization(): +@skip_if_no_cuda() +def test_dynamic_quantization(device): diffs = [] reldiffs = [] for i in range(100): - A1 = torch.randn(1024, 1024, device="cuda") + A1 = torch.randn(1024, 1024, device=device) C, S = F.quantize(A1) A2 = F.dequantize(C, S) diff = torch.abs(A1 - A2) @@ -146,7 +151,7 @@ def test_dynamic_quantization(): print(sum(reldiffs)/len(reldiffs)) for i in range(100): - A1 = torch.rand(1024, 1024, device="cuda") + A1 = torch.rand(1024, 1024, device=device) C, S = F.quantize(A1) A2 = F.dequantize(C, S) diff = torch.abs(A1 - A2).mean().item() @@ -154,7 +159,7 @@ def test_dynamic_quantization(): assert diff < 0.004 - +@skip_if_no_cuda() @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16], ids=["fp32", "fp16", "bf16"]) @pytest.mark.parametrize("nested", [False, True], ids=["False", "True"]) @pytest.mark.parametrize("blocksize", [4096, 2048, 1024, 512, 256, 128, 64]) @@ -204,6 +209,7 @@ def test_dynamic_blockwise_quantization(dtype, nested, blocksize, signed): +@skip_if_no_cuda() @pytest.mark.parametrize( "gtype", [torch.float32, torch.float16], ids=["float", "half"] ) @@ -306,6 +312,7 @@ def mean(xx): ] +@skip_if_no_cuda() @pytest.mark.parametrize( "dim1, dim2, quant_methods, batched", values, ids=names ) @@ -364,10 +371,11 @@ def test_stable_embedding(): ] +@skip_if_no_cuda() @pytest.mark.parametrize( "hidden_dim, batch_dim, transpose, seq_dim", values, ids=names ) -def test_igemm(hidden_dim, batch_dim, transpose, seq_dim): +def test_igemm(device, hidden_dim, batch_dim, transpose, seq_dim): hidden_dim = hidden_dim - (hidden_dim % 32) batch_dim = batch_dim - (batch_dim % 16) seq_dim = seq_dim - (seq_dim % 16) @@ -382,8 +390,8 @@ def test_igemm(hidden_dim, batch_dim, transpose, seq_dim): if transpose[1] else (hidden_dim, 32 * random.randint(1, 4)) ) - A = torch.randint(-128, 127, size=shapeA, device="cuda").to(torch.int8) - B = torch.randint(-128, 127, size=shapeB, device="cuda").to(torch.int8) + A = torch.randint(-128, 127, size=shapeA, device=device).to(torch.int8) + B = torch.randint(-128, 127, size=shapeB, device=device).to(torch.int8) if not transpose[0] and not transpose[1]: out2 = torch.matmul(A.float(), B.float()) out = F.igemm(A, B) @@ -406,8 +414,8 @@ def test_igemm(hidden_dim, batch_dim, transpose, seq_dim): if transpose[1] else (hidden_dim, 32 * random.randint(1, 4)) ) - A = torch.randint(-128, 127, size=shapeA, device="cuda").to(torch.int8) - B = torch.randint(-128, 127, size=shapeB, device="cuda").to(torch.int8) + A = torch.randint(-128, 127, size=shapeA, device=device).to(torch.int8) + B = torch.randint(-128, 127, size=shapeB, device=device).to(torch.int8) if not transpose[0] and not transpose[1]: out2 = torch.matmul(A.float(), B.float()) out = F.igemm(A, B) @@ -428,6 +436,7 @@ def test_igemm(hidden_dim, batch_dim, transpose, seq_dim): ] +@skip_if_no_cuda() @pytest.mark.parametrize("seq_dim, hidden_dim, batch_dim", values, ids=names) def test_dim3_igemm(seq_dim, hidden_dim, 
batch_dim): seq_dim = seq_dim - (seq_dim % 32) @@ -461,6 +470,7 @@ def test_dim3_igemm(seq_dim, hidden_dim, batch_dim): ] +@skip_if_no_cuda() @pytest.mark.parametrize( "seq_dim, hidden_dim, batch_dim, transpose", values, ids=names ) @@ -546,6 +556,7 @@ def min_max(x): ] +@skip_if_no_cuda() @pytest.mark.parametrize("dim1, dim2, dim3, dim4, transpose", values, ids=names) def test_ibmm(dim1, dim2, dim3, dim4, transpose): dim2 = dim2 - (dim2 % 16) @@ -582,6 +593,7 @@ def test_ibmm(dim1, dim2, dim3, dim4, transpose): names = ["dim1_{}_dim2_{}_dim3_{}".format(*vals) for vals in values] +@skip_if_no_cuda() @pytest.mark.parametrize("dim1, dim2, dim3", values, ids=names) def test_vector_quant(dim1, dim2, dim3): dim2 = dim2 - (dim2 % 16) @@ -611,6 +623,7 @@ def test_vector_quant(dim1, dim2, dim3): names = ["dim1_{}_dim2_{}_dim3_{}_dims_{}_dtype_{}_orderA_{}_orderOut_{}_transpose_{}".format(*vals)for vals in values] +@skip_if_no_cuda() @pytest.mark.parametrize("dim1, dim2, dim3, dims, dtype, orderA, orderOut, transpose",values,ids=names) def test_nvidia_transform(dim1, dim2, dim3, dims, dtype, orderA, orderOut, transpose): if dims == 3 and out_order != "col32": @@ -695,6 +708,7 @@ def test_nvidia_transform(dim1, dim2, dim3, dims, dtype, orderA, orderOut, trans ] +@skip_if_no_cuda() @pytest.mark.parametrize("dim1, dim2, dim3, dim4, dims, ldb", values, ids=names) def test_igemmlt_int(dim1, dim2, dim3, dim4, dims, ldb): for i in range(k): @@ -743,6 +757,7 @@ def test_igemmlt_int(dim1, dim2, dim3, dim4, dims, ldb): ] +@skip_if_no_cuda() @pytest.mark.parametrize("dim1, dim2, dim3, dim4, dims", values, ids=names) def test_igemmlt_half(dim1, dim2, dim3, dim4, dims): formatB = F.get_special_format_str() @@ -800,6 +815,7 @@ def test_igemmlt_half(dim1, dim2, dim3, dim4, dims): ] +@skip_if_no_cuda() @pytest.mark.parametrize("batch, seq, model, hidden", values, ids=names) def test_bench_8bit_training(batch, seq, model, hidden): formatB = F.get_special_format_str() @@ -967,6 +983,7 @@ def test_bench_8bit_training(batch, seq, model, hidden): names = ["dim1_{}_dim4_{}_dims_{}_formatB_{}_has_bias_{}".format(*vals) for vals in values] +@skip_if_no_cuda() @pytest.mark.parametrize("dim1, dim4, dims, formatB, has_bias", values, ids=names) def test_dequant_mm(dim1, dim4, dims, formatB, has_bias): inner = torch.randint(1, 128, size=(1,)).item() @@ -1017,6 +1034,7 @@ def test_dequant_mm(dim1, dim4, dims, formatB, has_bias): names = ["dim1_{}_dim2_{}_dims_{}".format(*vals) for vals in values] +@skip_if_no_cuda() @pytest.mark.parametrize("dim1, dim2, dims", values, ids=names) def test_colrow_absmax(dim1, dim2, dims): for i in range(k): @@ -1073,6 +1091,7 @@ def test_colrow_absmax(dim1, dim2, dims): names = ["dim1_{}_dim2_{}".format(*vals) for vals in values] +@skip_if_no_cuda() @pytest.mark.parametrize("dim1, dim2", values, ids=names) def test_double_quant(dim1, dim2): for i in range(k): @@ -1120,6 +1139,7 @@ def test_double_quant(dim1, dim2): names = ["dim1_{}_dim4_{}_inner_{}".format(*vals) for vals in values] +@skip_if_no_cuda() @pytest.mark.parametrize("dim1, dim4, inner", values, ids=names) def test_integrated_igemmlt(dim1, dim4, inner): for i in range(k): @@ -1164,6 +1184,7 @@ def test_integrated_igemmlt(dim1, dim4, inner): names = ["dim1_{}_dim4_{}_inner_{}".format(*vals) for vals in values] +@skip_if_no_cuda() @pytest.mark.parametrize("dim1, dim4, inner", values, ids=names) @pytest.mark.skip("Row scale has some bugs for ampere") def test_igemmlt_row_scale(dim1, dim4, inner): @@ -1239,6 +1260,7 @@ def 
test_igemmlt_row_scale(dim1, dim4, inner): names = ["dim1_{}_dim4_{}_inner_{}".format(*vals) for vals in values] +@skip_if_no_cuda() @pytest.mark.parametrize("dim1, dim4, inner", values, ids=names) @pytest.mark.skip("Row scale has some bugs for ampere") def test_row_scale_bench(dim1, dim4, inner): @@ -1309,6 +1331,7 @@ def test_row_scale_bench(dim1, dim4, inner): ] +@skip_if_no_cuda() @pytest.mark.parametrize( "dim1, dim2, dim3, dims, dtype, orderA, orderOut, transpose", values, @@ -1358,6 +1381,7 @@ def test_transform(dim1, dim2, dim3, dims, dtype, orderA, orderOut, transpose): ] +@skip_if_no_cuda() def test_overflow(): formatB = F.get_special_format_str() print(formatB) @@ -1382,6 +1406,7 @@ def test_overflow(): names = ["dim1_{}_dim2_{}".format(*vals) for vals in values] +@skip_if_no_cuda() @pytest.mark.parametrize("dim1, dim2", values, ids=names) def test_coo_double_quant(dim1, dim2): threshold = 3.00 @@ -1419,6 +1444,7 @@ def test_coo_double_quant(dim1, dim2): names = ["dim1_{}_dim2_{}_transposed_B_{}".format(*vals) for vals in values] +@skip_if_no_cuda() @pytest.mark.parametrize("dim1, dim2, transposed_B", values, ids=names) def test_spmm_coo(dim1, dim2, transposed_B): threshold = 1.5 @@ -1450,6 +1476,7 @@ def test_spmm_coo(dim1, dim2, transposed_B): assert_all_approx_close(out1, out2, rtol=0.01, atol=3.0e-2, count=30) +@skip_if_no_cuda() def test_spmm_bench(): batch = 2 model = 1024 * 1 @@ -1500,6 +1527,7 @@ def test_spmm_bench(): names = ["dim1_{}_dim2_{}".format(*vals) for vals in values] +@skip_if_no_cuda() @pytest.mark.parametrize("dim1, dim2", values, ids=names) def test_integrated_sparse_decomp(dim1, dim2): threshold = 3.0 @@ -1536,6 +1564,7 @@ def test_integrated_sparse_decomp(dim1, dim2): assert err2 < err1 +@skip_if_no_cuda() def test_matmuls(): a = torch.randn(256, 512).half().cuda() b = torch.randn(256, 512).half().cuda() @@ -1566,6 +1595,7 @@ def test_matmuls(): ] +@skip_if_no_cuda() @pytest.mark.parametrize("dim1, dim2, dtype, out_func", values, ids=names) def test_spmm_coo_very_sparse(dim1, dim2, dtype, out_func): out_func = getattr(torch, out_func) @@ -1628,6 +1658,7 @@ def test_spmm_coo_very_sparse(dim1, dim2, dtype, out_func): # print(time.time() - t0) +@skip_if_no_cuda() def test_coo2csr(): threshold = 1 A = torch.randn(128, 128).half().cuda() @@ -1648,6 +1679,7 @@ def test_coo2csr(): torch.testing.assert_close(A2[idx], csrA.values) +@skip_if_no_cuda() def test_coo2csc(): threshold = 1 A = torch.randn(128, 128).half().cuda() @@ -1682,6 +1714,7 @@ def test_coo2csc(): names = ["dim1_{}_dim2_{}_dtype_{}".format(*vals) for vals in values] +@skip_if_no_cuda() @pytest.mark.parametrize("dim1, dim2, dtype", values, ids=names) def test_spmm_coo_dequant(dim1, dim2, dtype): threshold = 6.0 @@ -1798,6 +1831,8 @@ def test_spmm_coo_dequant(dim1, dim2, dtype): #values.append((batch_size, seqdim, 5140, 4*5140)) #values.append((batch_size, seqdim, 12288, 4*12288)) names = ["batch_{}_seq_{}_model_{}_hidden_{}".format(*vals) for vals in values] + +@skip_if_no_cuda() @pytest.mark.parametrize("batch, seq, model, hidden", values, ids=names) def test_bench_matmul(batch, seq, model, hidden): iters = 1000 @@ -1953,6 +1988,7 @@ def test_bench_matmul(batch, seq, model, hidden): #torch.cuda.synchronize() #print( f"bnb linear8bitlt with threshold (training): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s") +@skip_if_no_cuda() def test_zeropoint(): def quant_zp(x): dtype = x.dtype @@ -2039,6 +2075,7 @@ def quant_zp(x): print(err1, err2, err3, err4, 
err5, err6) +@skip_if_no_cuda() def test_extract_outliers(): for i in range(k): shapeA = (4096, 4096 * 4) @@ -2090,6 +2127,7 @@ def test_blockwise_cpu_large(): +@skip_if_no_cuda() def test_fp8_quant(): for e_bits in range(1, 7): p_bits = 7-e_bits @@ -2138,8 +2176,8 @@ def test_fp8_quant(): #print(3, sum(relerr)/len(relerr)) +@skip_if_no_cuda() def test_few_bit_quant(): - #print('') for bits in range(2, 9): #print('='*30, bits, '='*30) @@ -2198,6 +2236,7 @@ def test_few_bit_quant(): #assert False +@skip_if_no_cuda() def test_kbit_quantile_estimation(): for i in range(100): data = torch.randn(1024, 1024, device='cuda') @@ -2223,6 +2262,7 @@ def test_kbit_quantile_estimation(): assert err < 0.035 +@skip_if_no_cuda() def test_bench_dequantization(): a = torch.rand(1024, 1024, device='cuda').half() code =F.create_fp8_map(True, 3, 0, 4).cuda() diff --git a/tests/test_linear8bitlt.py b/tests/test_linear8bitlt.py index 37f7af9cb..f8cd719de 100644 --- a/tests/test_linear8bitlt.py +++ b/tests/test_linear8bitlt.py @@ -11,6 +11,7 @@ from bitsandbytes.autograd import get_inverse_transform_indices, undo_layout from bitsandbytes.nn.modules import Linear8bitLt +from testutil import skip_if_no_cuda # contributed by Alex Borzunov, see: # https://github.com/bigscience-workshop/petals/blob/main/tests/test_linear8bitlt.py @@ -32,8 +33,7 @@ def test_layout_exact_match(): assert restored_x.is_contiguous() assert torch.all(torch.eq(restored_x, x)) - -@pytest.mark.skipif(not torch.cuda.is_available(), reason="this test requires a GPU") +@skip_if_no_cuda() def test_linear_no_igemmlt(): linear = torch.nn.Linear(1024, 3072) x = torch.randn(3, 1024, dtype=torch.half) diff --git a/tests/test_modules.py b/tests/test_modules.py index 7d2d03498..27100bfeb 100644 --- a/tests/test_modules.py +++ b/tests/test_modules.py @@ -1,11 +1,14 @@ +import math from itertools import product import pytest import torch from torch import nn +import einops import bitsandbytes as bnb +from testutil import skip_if_no_cuda class MockArgs: def __init__(self, initial_data): @@ -315,6 +318,7 @@ def forward(self, x): names = [f"threshold_{vals}" for vals in values] +@skip_if_no_cuda() @pytest.mark.parametrize("threshold", values, ids=names) def test_linear8bitlt_inference(threshold): l1 = bnb.nn.Linear8bitLt(32, 64, threshold=threshold).cuda().half() @@ -329,6 +333,7 @@ def test_linear8bitlt_inference(threshold): assert l1.state.CxB is not None +@skip_if_no_cuda() def test_linear8bitlt_accumulated_gradient(): l1 = torch.nn.Sequential(*[bnb.nn.Linear8bitLt(32, 32).cuda().half() for i in range(2)]) l2 = torch.nn.Sequential(*[torch.nn.Linear(32, 32).cuda().half() for i in range(2)]) @@ -343,7 +348,7 @@ def test_linear8bitlt_accumulated_gradient(): acc_steps = 10 for i in range(10): - b1 = torch.randn(16, 8, 32, device="cuda").half() + b1 = torch.randn(16, 8, 32, device=device).half() o1 = l1(b1) o2 = l2(b1) loss1 = o1.mean() @@ -374,7 +379,7 @@ def test_linear8bitlt_accumulated_gradient(): torch.testing.assert_close(l1[0].weight.grad, l2[0].weight.grad, atol=1e-3, rtol=1e-3) torch.testing.assert_close(l1[1].weight.grad, l2[1].weight.grad, atol=1e-3, rtol=1e-3) - +@skip_if_no_cuda() @pytest.mark.parametrize("threshold", [0.0, 2.0]) @pytest.mark.parametrize("memory_efficient_backward", [False]) def test_linear8bitlt_no_fp16_weights(threshold, memory_efficient_backward): @@ -486,6 +491,7 @@ def test_linear8bitlt_no_fp16_weights(threshold, memory_efficient_backward): assert (idx == 0).sum().item() <= b1.numel() * 0.005 +@skip_if_no_cuda() 
@pytest.mark.parametrize("module", [lambda nin, nout, bias=True: bnb.nn.Linear8bitLt(nin, nout, bias=bias, has_fp16_weights=False), bnb.nn.LinearFP4], ids=['Int8Lt', 'FP4']) def test_linear_kbit_fp32_bias(module): # casts model to fp16 -> int8 automatically @@ -581,6 +587,7 @@ def test_kbit_backprop(module): print('rel out', sum(relerrs1)/len(relerrs1)) print('rel grad', sum(relerrs2)/len(relerrs2)) +@skip_if_no_cuda() def test_fp8linear(): b = 10 diff --git a/tests/test_optim.py b/tests/test_optim.py index 49d4f442a..7365e2dc6 100644 --- a/tests/test_optim.py +++ b/tests/test_optim.py @@ -13,6 +13,7 @@ import bitsandbytes as bnb import bitsandbytes.functional as F +from testutil import skip_if_no_cuda # import apex @@ -109,6 +110,7 @@ def rm_path(path): optimizer_names = ["adam", "momentum", "rmsprop", 'paged_adamw', 'paged_adam', 'lion', 'paged_lion'] values = list(product(dim1, dim2, gtype, optimizer_names)) names = ["dim1_{}_dim2_{}_gtype_{}_optim_{}".format(*vals) for vals in values] +@skip_if_no_cuda() @pytest.mark.parametrize("dim1, dim2, gtype, optim_name", values, ids=names) def test_optimizer32bit(dim1, dim2, gtype, optim_name): if gtype == torch.bfloat16 and optim_name in ['momentum', 'rmsprop']: pytest.skip() @@ -186,6 +188,7 @@ def test_optimizer32bit(dim1, dim2, gtype, optim_name): names = ["dim1_{}_dim2_{}_gtype_{}".format(*vals) for vals in values] +@skip_if_no_cuda() @pytest.mark.parametrize("dim1, dim2, gtype", values, ids=names) def test_global_config(dim1, dim2, gtype): if dim1 == 1 and dim2 == 1: @@ -251,6 +254,7 @@ def test_global_config(dim1, dim2, gtype): ] +@skip_if_no_cuda() @pytest.mark.parametrize("dim1, dim2, gtype, optim_name", values, ids=names) def test_optimizer8bit(dim1, dim2, gtype, optim_name): if gtype == torch.bfloat16 and optim_name not in ['adam8bit_blockwise', 'lion8bit_blockwise']: pytest.skip() @@ -388,6 +392,7 @@ def test_optimizer8bit(dim1, dim2, gtype, optim_name): ] +@skip_if_no_cuda() @pytest.mark.parametrize("dim1, dim2, gtype, optim_bits", values, ids=names) def test_adam_percentile_clipping(dim1, dim2, gtype, optim_bits): if dim1 == 1 and dim2 == 1: @@ -491,6 +496,7 @@ def test_adam_percentile_clipping(dim1, dim2, gtype, optim_bits): ] +@skip_if_no_cuda() @pytest.mark.parametrize("dim1, dim2, gtype, optim_name", values, ids=names) def test_benchmark_blockwise(dim1, dim2, gtype, optim_name): if dim1 == 1 and dim2 == 1: diff --git a/tests/testutil.py b/tests/testutil.py new file mode 100644 index 000000000..a805f60bf --- /dev/null +++ b/tests/testutil.py @@ -0,0 +1,21 @@ +import pytest +import torch +from bitsandbytes.utils import get_cuda_devices + +def get_gpu_devices(): + """ + Returns a list of all GPU devices supported by Torch in the current environment (i.e. devices that Torch was built with + support for and are present in the current environment). + """ + ret = [] + if torch.backends.mps.is_built() and torch.backends.mps.is_available(): + ret.append("mps") + if torch.cuda.is_available(): + ret += get_cuda_devices() + return ret + +def skip_if_no_gpu(): + return pytest.mark.skipif(not get_gpu_devices() or not torch.cuda.is_available(), reason="No GPU device found by Torch") + +def skip_if_no_cuda(): + return pytest.mark.skipif(not torch.cuda.is_available(), reason="No CUDA-compatible device found by Torch")
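Note, outside the diff: several tests above now take a `device` argument (for example test_dynamic_quantization and test_igemm), but the fixture supplying it is not shown in these hunks. A minimal sketch of what such a conftest.py fixture could look like, assuming it simply enumerates the backends Torch reports as available:

    # tests/conftest.py -- hypothetical sketch, not part of this change.
    import pytest
    import torch

    def _available_devices():
        # Always include CPU; add GPU backends only when Torch can use them.
        devices = ["cpu"]
        if torch.cuda.is_available():
            devices.append("cuda")
        if torch.backends.mps.is_built() and torch.backends.mps.is_available():
            devices.append("mps")
        return devices

    @pytest.fixture(params=_available_devices())
    def device(request):
        # Each test that accepts `device` runs once per available backend.
        return request.param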