Skip to content

Commit

Permalink
fix: installing gpu bugs on wsl (#255)
Browse files Browse the repository at this point in the history
* fix: install gpu bugs on wsl

* fix: script bug

* fix: change cp to ln

---------

Co-authored-by: liuyu <>
Co-authored-by: aby913 <[email protected]>
  • Loading branch information
eball and aby913 authored Jul 26, 2024
1 parent b32e1b1 commit a35dfd6
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 20 deletions.
53 changes: 33 additions & 20 deletions build/installer/install_cmd.sh
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ function retry_cmd(){
"$@"
ret=$?

if [ $ret -eq 0 ]; then
if [[ $ret -eq 0 ]]; then
break
fi

Expand Down Expand Up @@ -96,7 +96,7 @@ function ensure_success() {

local r=""

if [ $ret -eq 0 ]; then
if [[ $ret -eq 0 ]]; then
r=y
fi

Expand Down Expand Up @@ -291,7 +291,7 @@ precheck_os() {

# Ubuntu 24: upgrade AppArmor if the installed version is not already 4.0.1
ubuntuversion=$(is_ubuntu)
if [ ${ubuntuversion} -eq 2 ]; then
if [[ ${ubuntuversion} -eq 2 ]]; then
aapv=$(apparmor_parser --version)
if [[ ! ${aapv} =~ "4.0.1" ]]; then
local aapv_tar="${BASE_DIR}/components/apparmor_4.0.1-0ubuntu1_${ARCH}.deb"
Expand All @@ -308,11 +308,16 @@ precheck_os() {
fi
fi

# opy pre-installation dependency files
# copy pre-installation dependency files
if [ -d /opt/deps ]; then
ensure_success $sh_c "mv /opt/deps/* ${BASE_DIR}"
fi

if [[ $(is_wsl) -eq 1 ]]; then
$sh_c "chattr -i /etc/hosts"
$sh_c "chattr -i /etc/resolv.conf"
fi

}

is_debian() {
Expand Down Expand Up @@ -517,7 +522,17 @@ run_install() {
log_info 'installing k8s and kubesphere'

if [ -d "$BASE_DIR/pkg" ]; then
ensure_success $sh_c "cp -a ${BASE_DIR}/pkg ./"
ensure_success $sh_c "ln -s ${BASE_DIR}/pkg ./"
fi

if [[ $(is_wsl) -eq 1 ]]; then
if [ -f /usr/lib/wsl/lib/nvidia-smi ]; then
local device=$(/usr/lib/wsl/lib/nvidia-smi -L|grep 'NVIDIA'|grep UUID)
if [ x"$device" != x"" ]; then
LOCAL_GPU_ENABLE="1"
LOCAL_GPU_SHARE="1"
fi
fi
fi

# env 'KUBE_TYPE' specifies the Kubernetes distribution (k8s or k3s); default is k3s
Expand Down Expand Up @@ -569,7 +584,7 @@ run_install() {
install_gpu
fi

if [ $SHOULD_RETRY -eq 1 ]; then
if [[ $SHOULD_RETRY -eq 1 ]]; then
run_cmd=retry_cmd
else
run_cmd=ensure_success
Expand Down Expand Up @@ -621,13 +636,6 @@ run_install() {

# check_orion_gpu
# fi
if [[ $(is_wsl) -eq 1 ]]; then
if [ -f /usr/bin/nvidia-container-runtime ]; then
LOCAL_GPU_ENABLE="1"
LOCAL_GPU_SHARE="1"
fi
fi

GPU_TYPE="none"
if [ "x${LOCAL_GPU_ENABLE}" == "x1" ]; then
GPU_TYPE="nvidia"
Expand Down Expand Up @@ -2116,18 +2124,23 @@ install_gpu(){
sleep 30
if [[ $(is_wsl) -eq 1 ]]; then
local real_driver=$($sh_c "find /usr/lib/wsl/drivers/ -name libcuda.so.1.1")
local real_driver=$($sh_c "find /usr/lib/wsl/drivers/ -name libcuda.so.1.1|head -1")
echo "found cuda driver in $real_driver"
if [[ x"$real_driver" != x"" ]]; then
ensure_success $sh_c "ln -s /usr/lib/wsl/lib/libcuda* /usr/lib/x86_64-linux-gnu/"
$sh_c "ln -s /usr/lib/wsl/lib/libcuda* /usr/lib/x86_64-linux-gnu/"
ensure_success $sh_c "rm -f /usr/lib/x86_64-linux-gnu/libcuda.so"
ensure_success $sh_c "rm -f /usr/lib/x86_64-linux-gnu/libcuda.so.1"
ensure_success $sh_c "rm -f /usr/lib/x86_64-linux-gnu/libcuda.so.1.1"
ensure_success $sh_c "cp -f $real_driver /usr/lib/wsl/lib/libcuda.so"
ensure_success $sh_c "cp -f $real_driver /usr/lib/wsl/lib/libcuda.so.1"
ensure_success $sh_c "cp -f $real_driver /usr/lib/wsl/lib/libcuda.so.1.1"
ensure_success $sh_c "ln -s $real_driver /usr/lib/x86_64-linux-gnu/libcuda.so.1"
ensure_success $sh_c "ln -s $real_driver /usr/lib/x86_64-linux-gnu/libcuda.so.1.1"
ensure_success $sh_c "ln -s /usr/lib/x86_64-linux-gnu/libcuda.so.1 /usr/lib/x86_64-linux-gnu/libcuda.so"
fi
fi
ensure_success $sh_c "${KUBECTL} create -f deploy/nvidia-device-plugin.yml"
ensure_success $sh_c "${KUBECTL} create -f ${BASE_DIR}/deploy/nvidia-device-plugin.yml"
log_info 'Waiting for Nvidia GPU Driver applied ...\n'
Expand All @@ -2136,10 +2149,10 @@ install_gpu(){
if [ "x${LOCAL_GPU_SHARE}" == "x1" ]; then
log_info 'Installing Nvshare GPU Plugin ...\n'
ensure_success $sh_c "${KUBECTL} apply -f deploy/nvshare-system.yaml"
ensure_success $sh_c "${KUBECTL} apply -f deploy/nvshare-system-quotas.yaml"
ensure_success $sh_c "${KUBECTL} apply -f deploy/device-plugin.yaml"
ensure_success $sh_c "${KUBECTL} apply -f deploy/scheduler.yaml"
ensure_success $sh_c "${KUBECTL} apply -f ${BASE_DIR}/deploy/nvshare-system.yaml"
ensure_success $sh_c "${KUBECTL} apply -f ${BASE_DIR}/deploy/nvshare-system-quotas.yaml"
ensure_success $sh_c "${KUBECTL} apply -f ${BASE_DIR}/deploy/device-plugin.yaml"
ensure_success $sh_c "${KUBECTL} apply -f ${BASE_DIR}/deploy/scheduler.yaml"
fi
}
Expand Down
1 change: 1 addition & 0 deletions build/manifest/images
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,4 @@ beclab/velero:v1.11.3
beclab/velero-plugin-for-terminus:v1.0.1
beclab/velero-plugin-for-terminus:v1.0.2
rancher/coredns-coredns:1.8.3
beclab/l4-bfl-proxy:v0.2.6

0 comments on commit a35dfd6

Please sign in to comment.