diff --git a/.appveyor.yml b/.appveyor.yml
index ba85fac817ab..2d279b0f33e3 100644
--- a/.appveyor.yml
+++ b/.appveyor.yml
@@ -36,6 +36,8 @@ install:
 build: false
 
 test_script:
+  - conda config --remove channels defaults
+  - conda config --add channels nodefaults
   - conda config --add channels conda-forge
   - conda config --set channel_priority strict
   - conda init powershell
diff --git a/.ci/setup.sh b/.ci/setup.sh
index 87d7294288e0..89fbed442ed1 100755
--- a/.ci/setup.sh
+++ b/.ci/setup.sh
@@ -23,7 +23,7 @@ if [[ $OS_NAME == "macos" ]]; then
         -o miniforge.sh \
         https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-x86_64.sh
 else # Linux
-    if [[ $IN_UBUNTU_LATEST_CONTAINER == "true" ]]; then
+    if [[ $IN_UBUNTU_BASE_CONTAINER == "true" ]]; then
         # fixes error "unable to initialize frontend: Dialog"
         # https://github.com/moby/moby/issues/27988#issuecomment-462809153
         echo 'debconf debconf/frontend select Noninteractive' | sudo debconf-set-selections
@@ -42,13 +42,14 @@ else # Linux
             iputils-ping \
             jq \
             libcurl4 \
-            libicu66 \
-            libssl1.1 \
+            libicu-dev \
+            libssl-dev \
             libunwind8 \
             locales \
+            locales-all \
             netcat \
             unzip \
-            zip
+            zip || exit -1
         if [[ $COMPILER == "clang" ]]; then
             sudo apt-get install --no-install-recommends -y \
                 clang \
@@ -56,12 +57,15 @@ else # Linux
         fi
         export LANG="en_US.UTF-8"
+        sudo update-locale LANG=${LANG}
         export LC_ALL="${LANG}"
-        sudo locale-gen ${LANG}
-        sudo update-locale
+    fi
+    if [[ $TASK == "r-package" ]] && [[ $COMPILER == "clang" ]]; then
+        sudo apt-get install --no-install-recommends -y \
+            libomp-dev
     fi
     if [[ $TASK == "mpi" ]]; then
-        if [[ $IN_UBUNTU_LATEST_CONTAINER == "true" ]]; then
+        if [[ $IN_UBUNTU_BASE_CONTAINER == "true" ]]; then
             sudo apt-get update
             sudo apt-get install --no-install-recommends -y \
                 libopenmpi-dev \
@@ -74,13 +78,12 @@ else # Linux
         fi
     fi
     if [[ $TASK == "gpu" ]]; then
-        if [[ $IN_UBUNTU_LATEST_CONTAINER == "true" ]]; then
-            sudo add-apt-repository ppa:mhier/libboost-latest -y
+        if [[ $IN_UBUNTU_BASE_CONTAINER == "true" ]]; then
             sudo apt-get update
             sudo apt-get install --no-install-recommends -y \
                 libboost1.74-dev \
-                ocl-icd-opencl-dev \
-                pocl-opencl-icd
+                libboost-filesystem1.74-dev \
+                ocl-icd-opencl-dev
         else # in manylinux image
             sudo yum update -y
             sudo yum install -y \
@@ -90,6 +93,19 @@ else # Linux
                 || exit -1
         fi
     fi
+    if [[ $TASK == "gpu" || $TASK == "bdist" ]]; then
+        if [[ $IN_UBUNTU_BASE_CONTAINER == "true" ]]; then
+            sudo apt-get update
+            sudo apt-get install --no-install-recommends -y \
+                pocl-opencl-icd
+        elif [[ $(uname -m) == "x86_64" ]]; then
+            sudo yum update -y
+            sudo yum install -y \
+                ocl-icd-devel \
+                opencl-headers \
+                || exit -1
+        fi
+    fi
     if [[ $TASK == "cuda" || $TASK == "cuda_exp" ]]; then
         echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections
         apt-get update
diff --git a/.ci/test.sh b/.ci/test.sh
index f73fbe915cc6..f18198b5924f 100755
--- a/.ci/test.sh
+++ b/.ci/test.sh
@@ -8,6 +8,11 @@ elif [[ $OS_NAME == "linux" ]] && [[ $COMPILER == "clang" ]]; then
     export CC=clang
 fi
 
+if [[ $IN_UBUNTU_BASE_CONTAINER == "true" ]]; then
+    export LANG="en_US.UTF-8"
+    export LC_ALL="en_US.UTF-8"
+fi
+
 if [[ "${TASK}" == "r-package" ]] || [[ "${TASK}" == "r-rchk" ]]; then
     bash ${BUILD_DIRECTORY}/.ci/test_r_package.sh || exit -1
     exit 0
@@ -161,10 +166,12 @@ elif [[ $TASK == "bdist" ]]; then
     else
         PLATFORM="manylinux2014_$ARCH"
     fi
-    cd $BUILD_DIRECTORY/python-package && python setup.py bdist_wheel --plat-name=$PLATFORM --python-tag py3 || exit -1
+    cd $BUILD_DIRECTORY/python-package && python setup.py bdist_wheel --integrated-opencl --plat-name=$PLATFORM --python-tag py3 || exit -1
     if [[ $PRODUCES_ARTIFACTS == "true" ]]; then
         cp dist/lightgbm-$LGB_VER-py3-none-$PLATFORM.whl $BUILD_ARTIFACTSTAGINGDIRECTORY
     fi
+    # Make sure we can do both CPU and GPU; see tests/python_package_test/test_dual.py
+    export LIGHTGBM_TEST_DUAL_CPU_GPU=1
 fi
 pip install --user $BUILD_DIRECTORY/python-package/dist/*.whl || exit -1
 pytest $BUILD_DIRECTORY/tests || exit -1
@@ -275,4 +282,15 @@ matplotlib.use\(\"Agg\"\)\
     cd $BUILD_DIRECTORY/examples/python-guide/notebooks
     sed -i'.bak' 's/INTERACTIVE = False/assert False, \\"Interactive mode disabled\\"/' interactive_plot_example.ipynb
     jupyter nbconvert --ExecutePreprocessor.timeout=180 --to notebook --execute --inplace *.ipynb || exit -1 # run all notebooks
+
+    # importing the library should succeed even if all optional dependencies are not present
+    conda uninstall --force --yes \
+        dask \
+        distributed \
+        joblib \
+        matplotlib \
+        psutil \
+        python-graphviz \
+        scikit-learn || exit -1
+    python -c "import lightgbm" || exit -1
 fi
diff --git a/.ci/test_r_package.sh b/.ci/test_r_package.sh
index c15b5c59df7b..7a12d3fdc1d4 100755
--- a/.ci/test_r_package.sh
+++ b/.ci/test_r_package.sh
@@ -21,9 +21,9 @@ if [[ "${R_MAJOR_VERSION}" == "3" ]]; then
     export R_LINUX_VERSION="3.6.3-1bionic"
     export R_APT_REPO="bionic-cran35/"
 elif [[ "${R_MAJOR_VERSION}" == "4" ]]; then
-    export R_MAC_VERSION=4.2.1
+    export R_MAC_VERSION=4.2.2
     export R_MAC_PKG_URL=${CRAN_MIRROR}/bin/macosx/base/R-${R_MAC_VERSION}.pkg
-    export R_LINUX_VERSION="4.2.1-1.2004.0"
+    export R_LINUX_VERSION="4.2.2-1.2004.0"
     export R_APT_REPO="focal-cran40/"
 else
     echo "Unrecognized R version: ${R_VERSION}"
@@ -76,7 +76,7 @@ if [[ $OS_NAME == "macos" ]]; then
     brew install --cask basictex || exit -1
     export PATH="/Library/TeX/texbin:$PATH"
     sudo tlmgr --verify-repo=none update --self || exit -1
-    sudo tlmgr --verify-repo=none install inconsolata helvetic || exit -1
+    sudo tlmgr --verify-repo=none install inconsolata helvetic rsfs || exit -1
 
     curl -sL ${R_MAC_PKG_URL} -o R.pkg || exit -1
     sudo installer \
@@ -163,11 +163,12 @@ elif [[ $R_BUILD_TYPE == "cran" ]]; then
         || (cat ${RCHK_LOG_FILE} && exit -1)
     cat ${RCHK_LOG_FILE}
 
-    # the exception below is from R itself and not LightGBM:
+    # the exceptions below are from R itself and not LightGBM:
     # https://github.com/kalibera/rchk/issues/22#issuecomment-656036156
     exit $(
         cat ${RCHK_LOG_FILE} \
         | grep -v "in function strptime_internal" \
+        | grep -v "in function RunGenCollect" \
         | grep --count -E '\[PB\]|ERROR'
     )
 fi
diff --git a/.ci/test_r_package_windows.ps1 b/.ci/test_r_package_windows.ps1
index 2005ad5adeeb..e4d20de50b90 100644
--- a/.ci/test_r_package_windows.ps1
+++ b/.ci/test_r_package_windows.ps1
@@ -80,7 +80,7 @@ if ($env:R_MAJOR_VERSION -eq "3") {
   $env:RTOOLS_BIN = "$RTOOLS_INSTALL_PATH\usr\bin"
   $env:RTOOLS_MINGW_BIN = "$RTOOLS_INSTALL_PATH\x86_64-w64-mingw32.static.posix\bin"
   $env:RTOOLS_EXE_FILE = "rtools42-5253-5107.exe"
-  $env:R_WINDOWS_VERSION = "4.2.1"
+  $env:R_WINDOWS_VERSION = "4.2.2"
 } else {
   Write-Output "[ERROR] Unrecognized R version: $env:R_VERSION"
   Check-Output $false
diff --git a/.ci/test_windows.ps1 b/.ci/test_windows.ps1
index 6c0ffb8249f0..79b35faaff10 100644
--- a/.ci/test_windows.ps1
+++ b/.ci/test_windows.ps1
@@ -25,16 +25,6 @@ if ($env:TASK -eq "cpp-tests") {
   Exit 0
 }
 
-# setup for Python
-conda init powershell
-conda activate
-conda config --set always_yes yes --set changeps1 no
-conda update -q -y conda
-conda create -q -y -n $env:CONDA_ENV "python=$env:PYTHON_VERSION[build=*cpython]" ; Check-Output $?
-if ($env:TASK -ne "bdist") {
-  conda activate $env:CONDA_ENV
-}
-
 if ($env:TASK -eq "swig") {
   $env:JAVA_HOME = $env:JAVA_HOME_8_X64 # there is pre-installed Eclipse Temurin 8 somewhere
   $ProgressPreference = "SilentlyContinue" # progress bar bug extremely slows down download speed
@@ -50,8 +40,27 @@ if ($env:TASK -eq "swig") {
   Exit 0
 }
 
-# re-including python=version[build=*cpython] to ensure that conda doesn't fall back to pypy
-conda install -q -y -n $env:CONDA_ENV cloudpickle joblib matplotlib numpy pandas psutil pytest "python=$env:PYTHON_VERSION[build=*cpython]" python-graphviz scikit-learn scipy ; Check-Output $?
+# setup for Python
+conda init powershell
+conda activate
+conda config --set always_yes yes --set changeps1 no
+conda update -q -y conda
+conda create -q -y -n $env:CONDA_ENV `
+  cloudpickle `
+  joblib `
+  matplotlib `
+  numpy `
+  pandas `
+  psutil `
+  pytest `
+  "python=$env:PYTHON_VERSION[build=*cpython]" `
+  python-graphviz `
+  scikit-learn `
+  scipy ; Check-Output $?
+
+if ($env:TASK -ne "bdist") {
+  conda activate $env:CONDA_ENV
+}
 
 if ($env:TASK -eq "regular") {
   mkdir $env:BUILD_SOURCESDIRECTORY/build; cd $env:BUILD_SOURCESDIRECTORY/build
diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml
index a8d69ddcaa33..946f548784a6 100644
--- a/.github/workflows/cuda.yml
+++ b/.github/workflows/cuda.yml
@@ -9,6 +9,11 @@ on:
       - master
       - release/*
 
+# automatically cancel in-progress builds if another commit is pushed
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
 env:
   github_actions: 'true'
   os_name: linux
@@ -107,10 +112,12 @@ jobs:
           docker_img="${docker_img}-ubuntu$(lsb_release -rs)"
         fi
         docker run --env-file docker.env -v "$GITHUB_WORKSPACE":"$ROOT_DOCKER_FOLDER" --rm --gpus all "$docker_img" /bin/bash $ROOT_DOCKER_FOLDER/docker-script.sh
-  all-successful:
-    # https://github.community/t/is-it-possible-to-require-all-github-actions-tasks-to-pass-without-enumerating-them/117957/4?u=graingert
+  all-cuda-jobs-successful:
+    if: always()
     runs-on: ubuntu-latest
     needs: [test]
     steps:
     - name: Note that all tests succeeded
-      run: echo "🎉"
+      uses: re-actors/alls-green@v1.2.2
+      with:
+        jobs: ${{ toJSON(needs) }}
diff --git a/.github/workflows/no-response.yml b/.github/workflows/no-response.yml
index a731941c21eb..30d767dd5444 100644
--- a/.github/workflows/no-response.yml
+++ b/.github/workflows/no-response.yml
@@ -9,7 +9,7 @@ on:
 
 jobs:
   noResponse:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
     steps:
     - uses: lee-dohm/no-response@v0.5.0
       with:
diff --git a/.github/workflows/optional_checks.yml b/.github/workflows/optional_checks.yml
index bcb380e3006c..6ee6e9e46296 100644
--- a/.github/workflows/optional_checks.yml
+++ b/.github/workflows/optional_checks.yml
@@ -7,7 +7,7 @@ on:
       - release/*
 
 jobs:
-  all-successful:
+  all-optional-checks-successful:
     timeout-minutes: 120
     runs-on: ubuntu-latest
     steps:
diff --git a/.github/workflows/python_package.yml b/.github/workflows/python_package.yml
index 877a5d623bac..f046aecc1d9d 100644
--- a/.github/workflows/python_package.yml
+++ b/.github/workflows/python_package.yml
@@ -9,6 +9,11 @@ on:
       - master
       - release/*
 
+# automatically cancel in-progress builds if another commit is pushed
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
 env:
   CONDA_ENV: test-env
   GITHUB_ACTIONS: 'true'
@@ -71,10 +76,12 @@ jobs:
           export PATH=${CONDA}/bin:${PATH}
           $GITHUB_WORKSPACE/.ci/setup.sh || exit -1
$GITHUB_WORKSPACE/.ci/test.sh || exit -1 - all-successful: - # https://github.community/t/is-it-possible-to-require-all-github-actions-tasks-to-pass-without-enumerating-them/117957/4?u=graingert + all-python-package-jobs-successful: + if: always() runs-on: ubuntu-latest needs: [test] steps: - name: Note that all tests succeeded - run: echo "🎉" + uses: re-actors/alls-green@v1.2.2 + with: + jobs: ${{ toJSON(needs) }} diff --git a/.github/workflows/r_configure.yml b/.github/workflows/r_configure.yml index e399f5316410..fb1e014016ac 100644 --- a/.github/workflows/r_configure.yml +++ b/.github/workflows/r_configure.yml @@ -9,7 +9,7 @@ jobs: name: r-configure timeout-minutes: 60 runs-on: ubuntu-latest - container: "ubuntu:20.04" + container: "ubuntu:22.04" steps: - name: Install essential software before checkout run: | diff --git a/.github/workflows/r_package.yml b/.github/workflows/r_package.yml index dbceb9c69f6b..72d24eb9a39a 100644 --- a/.github/workflows/r_package.yml +++ b/.github/workflows/r_package.yml @@ -9,6 +9,11 @@ on: - master - release/* +# automatically cancel in-progress builds if another commit is pushed +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + env: # hack to get around this: # https://stat.ethz.ch/pipermail/r-package-devel/2020q3/005930.html @@ -33,22 +38,22 @@ jobs: ################ # CMake builds # ################ - - os: ubuntu-latest + - os: ubuntu-22.04 task: r-package compiler: gcc r_version: 3.6 build_type: cmake - - os: ubuntu-latest + - os: ubuntu-22.04 task: r-package compiler: gcc r_version: 4.2 build_type: cmake - - os: ubuntu-latest + - os: ubuntu-22.04 task: r-package compiler: clang r_version: 3.6 build_type: cmake - - os: ubuntu-latest + - os: ubuntu-22.04 task: r-package compiler: clang r_version: 4.2 @@ -114,7 +119,7 @@ jobs: toolchain: MSYS r_version: 4.2 build_type: cran - - os: ubuntu-latest + - os: ubuntu-22.04 task: r-package compiler: gcc r_version: 4.2 @@ -127,7 +132,7 @@ jobs: ################ # Other checks # ################ - - os: ubuntu-latest + - os: ubuntu-22.04 task: r-rchk compiler: gcc r_version: 4.2 @@ -151,7 +156,7 @@ jobs: CTAN_MIRROR: https://ctan.math.illinois.edu/systems/win32/miktex TINYTEX_INSTALLER: TinyTeX - name: Setup and run tests on Linux and macOS - if: matrix.os == 'macOS-latest' || matrix.os == 'ubuntu-latest' + if: matrix.os == 'macOS-latest' || matrix.os == 'ubuntu-22.04' shell: bash run: | export TASK="${{ matrix.task }}" @@ -159,7 +164,7 @@ jobs: export GITHUB_ACTIONS="true" if [[ "${{ matrix.os }}" == "macOS-latest" ]]; then export OS_NAME="macos" - elif [[ "${{ matrix.os }}" == "ubuntu-latest" ]]; then + elif [[ "${{ matrix.os }}" == "ubuntu-22.04" ]]; then export OS_NAME="linux" fi export BUILD_DIRECTORY="$GITHUB_WORKSPACE" @@ -246,10 +251,12 @@ jobs: echo "NOTEs, WARNINGs, or ERRORs have been found by R CMD check" exit -1 fi - all-successful: - # https://github.community/t/is-it-possible-to-require-all-github-actions-tasks-to-pass-without-enumerating-them/117957/4?u=graingert + all-r-package-jobs-successful: + if: always() runs-on: ubuntu-latest needs: [test, test-r-sanitizers, test-r-debian-clang] steps: - name: Note that all tests succeeded - run: echo "🎉" + uses: re-actors/alls-green@v1.2.2 + with: + jobs: ${{ toJSON(needs) }} diff --git a/.github/workflows/static_analysis.yml b/.github/workflows/static_analysis.yml index 424ae49b4e5f..415cbb66086a 100644 --- a/.github/workflows/static_analysis.yml +++ b/.github/workflows/static_analysis.yml @@ -11,6 +11,11 @@ 
on: - master - release/* +# automatically cancel in-progress builds if another commit is pushed +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + env: COMPILER: 'gcc' CONDA_ENV: test-env @@ -21,7 +26,7 @@ env: jobs: test: name: ${{ matrix.task }} - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 timeout-minutes: 60 strategy: fail-fast: false @@ -80,10 +85,12 @@ jobs: echo "" exit -1 fi - all-successful: - # https://github.community/t/is-it-possible-to-require-all-github-actions-tasks-to-pass-without-enumerating-them/117957/4?u=graingert + all-static-analysis-jobs-successful: + if: always() runs-on: ubuntu-latest needs: [test, r-check-docs] steps: - name: Note that all tests succeeded - run: echo "🎉" + uses: re-actors/alls-green@v1.2.2 + with: + jobs: ${{ toJSON(needs) }} diff --git a/.vsts-ci.yml b/.vsts-ci.yml index 1ff35fc564c2..cf750a16ed98 100644 --- a/.vsts-ci.yml +++ b/.vsts-ci.yml @@ -21,7 +21,7 @@ resources: - container: linux-artifact-builder image: lightgbm/vsts-agent:manylinux_2_28_x86_64 - container: ubuntu-latest - image: 'ubuntu:20.04' + image: 'ubuntu:22.04' options: "--name ci-container -v /usr/bin/docker:/tmp/docker:ro" - container: rbase image: wch1/r-debug @@ -85,7 +85,7 @@ jobs: variables: COMPILER: clang DEBIAN_FRONTEND: 'noninteractive' - IN_UBUNTU_LATEST_CONTAINER: 'true' + IN_UBUNTU_BASE_CONTAINER: 'true' OS_NAME: 'linux' SETUP_CONDA: 'true' pool: sh-ubuntu @@ -153,7 +153,7 @@ jobs: OS_NAME: 'linux' PRODUCES_ARTIFACTS: 'true' pool: - vmImage: ubuntu-latest + vmImage: ubuntu-22.04 timeoutInMinutes: 180 strategy: matrix: @@ -189,11 +189,12 @@ jobs: EOF cat > docker-script.sh </dev/null 2>&1; then : +as_nop=: +if test ${ZSH_VERSION+y} && (emulate sh) >/dev/null 2>&1 +then : emulate sh NULLCMD=: # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which # is contrary to our usage. Disable this feature. alias -g '${1+"$@"}'='"$@"' setopt NO_GLOB_SUBST -else +else $as_nop case `(set -o) 2>/dev/null` in #( *posix*) : set -o posix ;; #( @@ -31,46 +34,46 @@ esac fi + +# Reset variables that may have inherited troublesome values from +# the environment. + +# IFS needs to be set, to space, tab, and newline, in precisely that order. +# (If _AS_PATH_WALK were called with IFS unset, it would have the +# side effect of setting IFS to empty, thus disabling word splitting.) +# Quoting is to prevent editors from complaining about space-tab. as_nl=' ' export as_nl -# Printing a long string crashes Solaris 7 /usr/bin/printf. -as_echo='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\' -as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo -as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo$as_echo -# Prefer a ksh shell builtin over an external printf program on Solaris, -# but without wasting forks for bash or zsh. 
-if test -z "$BASH_VERSION$ZSH_VERSION" \ - && (test "X`print -r -- $as_echo`" = "X$as_echo") 2>/dev/null; then - as_echo='print -r --' - as_echo_n='print -rn --' -elif (test "X`printf %s $as_echo`" = "X$as_echo") 2>/dev/null; then - as_echo='printf %s\n' - as_echo_n='printf %s' -else - if test "X`(/usr/ucb/echo -n -n $as_echo) 2>/dev/null`" = "X-n $as_echo"; then - as_echo_body='eval /usr/ucb/echo -n "$1$as_nl"' - as_echo_n='/usr/ucb/echo -n' - else - as_echo_body='eval expr "X$1" : "X\\(.*\\)"' - as_echo_n_body='eval - arg=$1; - case $arg in #( - *"$as_nl"*) - expr "X$arg" : "X\\(.*\\)$as_nl"; - arg=`expr "X$arg" : ".*$as_nl\\(.*\\)"`;; - esac; - expr "X$arg" : "X\\(.*\\)" | tr -d "$as_nl" - ' - export as_echo_n_body - as_echo_n='sh -c $as_echo_n_body as_echo' - fi - export as_echo_body - as_echo='sh -c $as_echo_body as_echo' -fi +IFS=" "" $as_nl" + +PS1='$ ' +PS2='> ' +PS4='+ ' + +# Ensure predictable behavior from utilities with locale-dependent output. +LC_ALL=C +export LC_ALL +LANGUAGE=C +export LANGUAGE + +# We cannot yet rely on "unset" to work, but we need these variables +# to be unset--not just set to an empty or harmless value--now, to +# avoid bugs in old shells (e.g. pre-3.0 UWIN ksh). This construct +# also avoids known problems related to "unset" and subshell syntax +# in other old shells (e.g. bash 2.01 and pdksh 5.2.14). +for as_var in BASH_ENV ENV MAIL MAILPATH CDPATH +do eval test \${$as_var+y} \ + && ( (unset $as_var) || exit 1) >/dev/null 2>&1 && unset $as_var || : +done + +# Ensure that fds 0, 1, and 2 are open. +if (exec 3>&0) 2>/dev/null; then :; else exec 0&1) 2>/dev/null; then :; else exec 1>/dev/null; fi +if (exec 3>&2) ; then :; else exec 2>/dev/null; fi # The user is always right. -if test "${PATH_SEPARATOR+set}" != set; then +if ${PATH_SEPARATOR+false} :; then PATH_SEPARATOR=: (PATH='/bin;/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 && { (PATH='/bin:/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 || @@ -79,13 +82,6 @@ if test "${PATH_SEPARATOR+set}" != set; then fi -# IFS -# We need space, tab and new line, in precisely that order. Quoting is -# there to prevent editors from complaining about space-tab. -# (If _AS_PATH_WALK were called with IFS unset, it would disable word -# splitting by setting IFS to empty value.) -IFS=" "" $as_nl" - # Find who we are. Look in the path if we contain no directory separator. as_myself= case $0 in #(( @@ -94,8 +90,12 @@ case $0 in #(( for as_dir in $PATH do IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break + case $as_dir in #((( + '') as_dir=./ ;; + */) ;; + *) as_dir=$as_dir/ ;; + esac + test -r "$as_dir$0" && as_myself=$as_dir$0 && break done IFS=$as_save_IFS @@ -107,30 +107,10 @@ if test "x$as_myself" = x; then as_myself=$0 fi if test ! -f "$as_myself"; then - $as_echo "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2 + printf "%s\n" "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2 exit 1 fi -# Unset variables that we do not need and which cause bugs (e.g. in -# pre-3.0 UWIN ksh). But do not cause bugs in bash 2.01; the "|| exit 1" -# suppresses any "Segmentation fault" message there. '((' could -# trigger a bug in pdksh 5.2.14. -for as_var in BASH_ENV ENV MAIL MAILPATH -do eval test x\${$as_var+set} = xset \ - && ( (unset $as_var) || exit 1) >/dev/null 2>&1 && unset $as_var || : -done -PS1='$ ' -PS2='> ' -PS4='+ ' - -# NLS nuisances. -LC_ALL=C -export LC_ALL -LANGUAGE=C -export LANGUAGE - -# CDPATH. 
-(unset CDPATH) >/dev/null 2>&1 && unset CDPATH # Use a proper internal environment variable to ensure we don't fall # into an infinite loop, continuously re-executing ourselves. @@ -152,20 +132,22 @@ esac exec $CONFIG_SHELL $as_opts "$as_myself" ${1+"$@"} # Admittedly, this is quite paranoid, since all the known shells bail # out after a failed `exec'. -$as_echo "$0: could not re-execute with $CONFIG_SHELL" >&2 -as_fn_exit 255 +printf "%s\n" "$0: could not re-execute with $CONFIG_SHELL" >&2 +exit 255 fi # We don't want this to propagate to other subprocesses. { _as_can_reexec=; unset _as_can_reexec;} if test "x$CONFIG_SHELL" = x; then - as_bourne_compatible="if test -n \"\${ZSH_VERSION+set}\" && (emulate sh) >/dev/null 2>&1; then : + as_bourne_compatible="as_nop=: +if test \${ZSH_VERSION+y} && (emulate sh) >/dev/null 2>&1 +then : emulate sh NULLCMD=: # Pre-4.2 versions of Zsh do word splitting on \${1+\"\$@\"}, which # is contrary to our usage. Disable this feature. alias -g '\${1+\"\$@\"}'='\"\$@\"' setopt NO_GLOB_SUBST -else +else \$as_nop case \`(set -o) 2>/dev/null\` in #( *posix*) : set -o posix ;; #( @@ -185,41 +167,52 @@ as_fn_success || { exitcode=1; echo as_fn_success failed.; } as_fn_failure && { exitcode=1; echo as_fn_failure succeeded.; } as_fn_ret_success || { exitcode=1; echo as_fn_ret_success failed.; } as_fn_ret_failure && { exitcode=1; echo as_fn_ret_failure succeeded.; } -if ( set x; as_fn_ret_success y && test x = \"\$1\" ); then : +if ( set x; as_fn_ret_success y && test x = \"\$1\" ) +then : -else +else \$as_nop exitcode=1; echo positional parameters were not saved. fi test x\$exitcode = x0 || exit 1 +blah=\$(echo \$(echo blah)) +test x\"\$blah\" = xblah || exit 1 test -x / || exit 1" as_suggested=" as_lineno_1=";as_suggested=$as_suggested$LINENO;as_suggested=$as_suggested" as_lineno_1a=\$LINENO as_lineno_2=";as_suggested=$as_suggested$LINENO;as_suggested=$as_suggested" as_lineno_2a=\$LINENO eval 'test \"x\$as_lineno_1'\$as_run'\" != \"x\$as_lineno_2'\$as_run'\" && test \"x\`expr \$as_lineno_1'\$as_run' + 1\`\" = \"x\$as_lineno_2'\$as_run'\"' || exit 1" - if (eval "$as_required") 2>/dev/null; then : + if (eval "$as_required") 2>/dev/null +then : as_have_required=yes -else +else $as_nop as_have_required=no fi - if test x$as_have_required = xyes && (eval "$as_suggested") 2>/dev/null; then : + if test x$as_have_required = xyes && (eval "$as_suggested") 2>/dev/null +then : -else +else $as_nop as_save_IFS=$IFS; IFS=$PATH_SEPARATOR as_found=false for as_dir in /bin$PATH_SEPARATOR/usr/bin$PATH_SEPARATOR$PATH do IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. + case $as_dir in #((( + '') as_dir=./ ;; + */) ;; + *) as_dir=$as_dir/ ;; + esac as_found=: case $as_dir in #( /*) for as_base in sh bash ksh sh5; do # Try only shells that exist, to save several forks. 
- as_shell=$as_dir/$as_base + as_shell=$as_dir$as_base if { test -f "$as_shell" || test -f "$as_shell.exe"; } && - { $as_echo "$as_bourne_compatible""$as_required" | as_run=a "$as_shell"; } 2>/dev/null; then : + as_run=a "$as_shell" -c "$as_bourne_compatible""$as_required" 2>/dev/null +then : CONFIG_SHELL=$as_shell as_have_required=yes - if { $as_echo "$as_bourne_compatible""$as_suggested" | as_run=a "$as_shell"; } 2>/dev/null; then : + if as_run=a "$as_shell" -c "$as_bourne_compatible""$as_suggested" 2>/dev/null +then : break 2 fi fi @@ -227,14 +220,21 @@ fi esac as_found=false done -$as_found || { if { test -f "$SHELL" || test -f "$SHELL.exe"; } && - { $as_echo "$as_bourne_compatible""$as_required" | as_run=a "$SHELL"; } 2>/dev/null; then : - CONFIG_SHELL=$SHELL as_have_required=yes -fi; } IFS=$as_save_IFS +if $as_found +then : +else $as_nop + if { test -f "$SHELL" || test -f "$SHELL.exe"; } && + as_run=a "$SHELL" -c "$as_bourne_compatible""$as_required" 2>/dev/null +then : + CONFIG_SHELL=$SHELL as_have_required=yes +fi +fi - if test "x$CONFIG_SHELL" != x; then : + + if test "x$CONFIG_SHELL" != x +then : export CONFIG_SHELL # We cannot yet assume a decent shell, so we have to provide a # neutralization value for shells without unset; and this also @@ -252,18 +252,19 @@ esac exec $CONFIG_SHELL $as_opts "$as_myself" ${1+"$@"} # Admittedly, this is quite paranoid, since all the known shells bail # out after a failed `exec'. -$as_echo "$0: could not re-execute with $CONFIG_SHELL" >&2 +printf "%s\n" "$0: could not re-execute with $CONFIG_SHELL" >&2 exit 255 fi - if test x$as_have_required = xno; then : - $as_echo "$0: This script requires a shell more modern than all" - $as_echo "$0: the shells that I found on your system." - if test x${ZSH_VERSION+set} = xset ; then - $as_echo "$0: In particular, zsh $ZSH_VERSION has bugs and should" - $as_echo "$0: be upgraded to zsh 4.3.4 or later." + if test x$as_have_required = xno +then : + printf "%s\n" "$0: This script requires a shell more modern than all" + printf "%s\n" "$0: the shells that I found on your system." + if test ${ZSH_VERSION+y} ; then + printf "%s\n" "$0: In particular, zsh $ZSH_VERSION has bugs and should" + printf "%s\n" "$0: be upgraded to zsh 4.3.4 or later." else - $as_echo "$0: Please tell bug-autoconf@gnu.org about your system, + printf "%s\n" "$0: Please tell bug-autoconf@gnu.org about your system, $0: including any error possibly output before this $0: message. Then install a modern shell, or manually run $0: the script under such a shell if you do have one." @@ -290,6 +291,7 @@ as_fn_unset () } as_unset=as_fn_unset + # as_fn_set_status STATUS # ----------------------- # Set $? to STATUS, without forking. @@ -307,6 +309,14 @@ as_fn_exit () as_fn_set_status $1 exit $1 } # as_fn_exit +# as_fn_nop +# --------- +# Do nothing but, unlike ":", preserve the value of $?. +as_fn_nop () +{ + return $? +} +as_nop=as_fn_nop # as_fn_mkdir_p # ------------- @@ -321,7 +331,7 @@ as_fn_mkdir_p () as_dirs= while :; do case $as_dir in #( - *\'*) as_qdir=`$as_echo "$as_dir" | sed "s/'/'\\\\\\\\''/g"`;; #'( + *\'*) as_qdir=`printf "%s\n" "$as_dir" | sed "s/'/'\\\\\\\\''/g"`;; #'( *) as_qdir=$as_dir;; esac as_dirs="'$as_qdir' $as_dirs" @@ -330,7 +340,7 @@ $as_expr X"$as_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ X"$as_dir" : 'X\(//\)[^/]' \| \ X"$as_dir" : 'X\(//\)$' \| \ X"$as_dir" : 'X\(/\)' \| . 
2>/dev/null || -$as_echo X"$as_dir" | +printf "%s\n" X"$as_dir" | sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ s//\1/ q @@ -369,12 +379,13 @@ as_fn_executable_p () # advantage of any shell optimizations that allow amortized linear growth over # repeated appends, instead of the typical quadratic growth present in naive # implementations. -if (eval "as_var=1; as_var+=2; test x\$as_var = x12") 2>/dev/null; then : +if (eval "as_var=1; as_var+=2; test x\$as_var = x12") 2>/dev/null +then : eval 'as_fn_append () { eval $1+=\$2 }' -else +else $as_nop as_fn_append () { eval $1=\$$1\$2 @@ -386,18 +397,27 @@ fi # as_fn_append # Perform arithmetic evaluation on the ARGs, and store the result in the # global $as_val. Take advantage of shells that can avoid forks. The arguments # must be portable across $(()) and expr. -if (eval "test \$(( 1 + 1 )) = 2") 2>/dev/null; then : +if (eval "test \$(( 1 + 1 )) = 2") 2>/dev/null +then : eval 'as_fn_arith () { as_val=$(( $* )) }' -else +else $as_nop as_fn_arith () { as_val=`expr "$@" || test $? -eq 1` } fi # as_fn_arith +# as_fn_nop +# --------- +# Do nothing but, unlike ":", preserve the value of $?. +as_fn_nop () +{ + return $? +} +as_nop=as_fn_nop # as_fn_error STATUS ERROR [LINENO LOG_FD] # ---------------------------------------- @@ -409,9 +429,9 @@ as_fn_error () as_status=$1; test $as_status -eq 0 && as_status=1 if test "$4"; then as_lineno=${as_lineno-"$3"} as_lineno_stack=as_lineno_stack=$as_lineno_stack - $as_echo "$as_me:${as_lineno-$LINENO}: error: $2" >&$4 + printf "%s\n" "$as_me:${as_lineno-$LINENO}: error: $2" >&$4 fi - $as_echo "$as_me: error: $2" >&2 + printf "%s\n" "$as_me: error: $2" >&2 as_fn_exit $as_status } # as_fn_error @@ -438,7 +458,7 @@ as_me=`$as_basename -- "$0" || $as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \ X"$0" : 'X\(//\)$' \| \ X"$0" : 'X\(/\)' \| . 2>/dev/null || -$as_echo X/"$0" | +printf "%s\n" X/"$0" | sed '/^.*\/\([^/][^/]*\)\/*$/{ s//\1/ q @@ -482,7 +502,7 @@ as_cr_alnum=$as_cr_Letters$as_cr_digits s/-\n.*// ' >$as_me.lineno && chmod +x "$as_me.lineno" || - { $as_echo "$as_me: error: cannot create $as_me.lineno; rerun with a POSIX shell" >&2; as_fn_exit 1; } + { printf "%s\n" "$as_me: error: cannot create $as_me.lineno; rerun with a POSIX shell" >&2; as_fn_exit 1; } # If we had to re-execute with $CONFIG_SHELL, we're ensured to have # already done that, so ensure we don't try to do so again and fall @@ -496,6 +516,10 @@ as_cr_alnum=$as_cr_Letters$as_cr_digits exit } + +# Determine whether it's possible to make 'echo' print without a newline. +# These variables are no longer used directly by Autoconf, but are AC_SUBSTed +# for compatibility with existing Makefiles. ECHO_C= ECHO_N= ECHO_T= case `echo -n x` in #((((( -n*) @@ -509,6 +533,13 @@ case `echo -n x` in #((((( ECHO_N='-n';; esac +# For backward compatibility with old third-party macros, we provide +# the shell variables $as_echo and $as_echo_n. New code should use +# AS_ECHO(["message"]) and AS_ECHO_N(["message"]), respectively. +as_echo='printf %s\n' +as_echo_n='printf %s' + + rm -f conf$$ conf$$.exe conf$$.file if test -d conf$$.dir; then rm -f conf$$.dir/conf$$.file @@ -700,8 +731,6 @@ do *) ac_optarg=yes ;; esac - # Accept the important Cygnus configure options, so we can diagnose typos. - case $ac_dashdash$ac_option in --) ac_dashdash=yes ;; @@ -742,9 +771,9 @@ do ac_useropt=`expr "x$ac_option" : 'x-*disable-\(.*\)'` # Reject names that are not valid shell variable names. expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null && - as_fn_error $? 
"invalid feature name: $ac_useropt" + as_fn_error $? "invalid feature name: \`$ac_useropt'" ac_useropt_orig=$ac_useropt - ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'` + ac_useropt=`printf "%s\n" "$ac_useropt" | sed 's/[-+.]/_/g'` case $ac_user_opts in *" "enable_$ac_useropt" @@ -768,9 +797,9 @@ do ac_useropt=`expr "x$ac_option" : 'x-*enable-\([^=]*\)'` # Reject names that are not valid shell variable names. expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null && - as_fn_error $? "invalid feature name: $ac_useropt" + as_fn_error $? "invalid feature name: \`$ac_useropt'" ac_useropt_orig=$ac_useropt - ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'` + ac_useropt=`printf "%s\n" "$ac_useropt" | sed 's/[-+.]/_/g'` case $ac_user_opts in *" "enable_$ac_useropt" @@ -981,9 +1010,9 @@ do ac_useropt=`expr "x$ac_option" : 'x-*with-\([^=]*\)'` # Reject names that are not valid shell variable names. expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null && - as_fn_error $? "invalid package name: $ac_useropt" + as_fn_error $? "invalid package name: \`$ac_useropt'" ac_useropt_orig=$ac_useropt - ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'` + ac_useropt=`printf "%s\n" "$ac_useropt" | sed 's/[-+.]/_/g'` case $ac_user_opts in *" "with_$ac_useropt" @@ -997,9 +1026,9 @@ do ac_useropt=`expr "x$ac_option" : 'x-*without-\(.*\)'` # Reject names that are not valid shell variable names. expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null && - as_fn_error $? "invalid package name: $ac_useropt" + as_fn_error $? "invalid package name: \`$ac_useropt'" ac_useropt_orig=$ac_useropt - ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'` + ac_useropt=`printf "%s\n" "$ac_useropt" | sed 's/[-+.]/_/g'` case $ac_user_opts in *" "with_$ac_useropt" @@ -1043,9 +1072,9 @@ Try \`$0 --help' for more information" *) # FIXME: should be removed in autoconf 3.0. - $as_echo "$as_me: WARNING: you should use --build, --host, --target" >&2 + printf "%s\n" "$as_me: WARNING: you should use --build, --host, --target" >&2 expr "x$ac_option" : ".*[^-._$as_cr_alnum]" >/dev/null && - $as_echo "$as_me: WARNING: invalid host type: $ac_option" >&2 + printf "%s\n" "$as_me: WARNING: invalid host type: $ac_option" >&2 : "${build_alias=$ac_option} ${host_alias=$ac_option} ${target_alias=$ac_option}" ;; @@ -1061,7 +1090,7 @@ if test -n "$ac_unrecognized_opts"; then case $enable_option_checking in no) ;; fatal) as_fn_error $? "unrecognized options: $ac_unrecognized_opts" ;; - *) $as_echo "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2 ;; + *) printf "%s\n" "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2 ;; esac fi @@ -1125,7 +1154,7 @@ $as_expr X"$as_myself" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ X"$as_myself" : 'X\(//\)[^/]' \| \ X"$as_myself" : 'X\(//\)$' \| \ X"$as_myself" : 'X\(/\)' \| . 2>/dev/null || -$as_echo X"$as_myself" | +printf "%s\n" X"$as_myself" | sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ s//\1/ q @@ -1264,9 +1293,9 @@ if test "$ac_init_help" = "recursive"; then case "$ac_dir" in .) ac_dir_suffix= ac_top_builddir_sub=. ac_top_build_prefix= ;; *) - ac_dir_suffix=/`$as_echo "$ac_dir" | sed 's|^\.[\\/]||'` + ac_dir_suffix=/`printf "%s\n" "$ac_dir" | sed 's|^\.[\\/]||'` # A ".." for each directory in $ac_dir_suffix. - ac_top_builddir_sub=`$as_echo "$ac_dir_suffix" | sed 's|/[^\\/]*|/..|g;s|/||'` + ac_top_builddir_sub=`printf "%s\n" "$ac_dir_suffix" | sed 's|/[^\\/]*|/..|g;s|/||'` case $ac_top_builddir_sub in "") ac_top_builddir_sub=. 
ac_top_build_prefix= ;; *) ac_top_build_prefix=$ac_top_builddir_sub/ ;; @@ -1294,7 +1323,8 @@ esac ac_abs_srcdir=$ac_abs_top_srcdir$ac_dir_suffix cd "$ac_dir" || { ac_status=$?; continue; } - # Check for guested configure. + # Check for configure.gnu first; this name is used for a wrapper for + # Metaconfig's "Configure" on case-insensitive file systems. if test -f "$ac_srcdir/configure.gnu"; then echo && $SHELL "$ac_srcdir/configure.gnu" --help=recursive @@ -1302,7 +1332,7 @@ ac_abs_srcdir=$ac_abs_top_srcdir$ac_dir_suffix echo && $SHELL "$ac_srcdir/configure" --help=recursive else - $as_echo "$as_me: WARNING: no configuration information is in $ac_dir" >&2 + printf "%s\n" "$as_me: WARNING: no configuration information is in $ac_dir" >&2 fi || ac_status=$? cd "$ac_pwd" || { ac_status=$?; break; } done @@ -1312,9 +1342,9 @@ test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF lightgbm configure 3.3.3.99 -generated by GNU Autoconf 2.69 +generated by GNU Autoconf 2.71 -Copyright (C) 2012 Free Software Foundation, Inc. +Copyright (C) 2021 Free Software Foundation, Inc. This configure script is free software; the Free Software Foundation gives unlimited permission to copy, distribute and modify it. _ACEOF @@ -1324,14 +1354,34 @@ fi ## ------------------------ ## ## Autoconf initialization. ## ## ------------------------ ## +ac_configure_args_raw= +for ac_arg +do + case $ac_arg in + *\'*) + ac_arg=`printf "%s\n" "$ac_arg" | sed "s/'/'\\\\\\\\''/g"` ;; + esac + as_fn_append ac_configure_args_raw " '$ac_arg'" +done + +case $ac_configure_args_raw in + *$as_nl*) + ac_safe_unquote= ;; + *) + ac_unsafe_z='|&;<>()$`\\"*?[ '' ' # This string ends in space, tab. + ac_unsafe_a="$ac_unsafe_z#~" + ac_safe_unquote="s/ '\\([^$ac_unsafe_a][^$ac_unsafe_z]*\\)'/ \\1/g" + ac_configure_args_raw=` printf "%s\n" "$ac_configure_args_raw" | sed "$ac_safe_unquote"`;; +esac + cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. It was created by lightgbm $as_me 3.3.3.99, which was -generated by GNU Autoconf 2.69. Invocation command line was +generated by GNU Autoconf 2.71. Invocation command line was - $ $0 $@ + $ $0$ac_configure_args_raw _ACEOF exec 5>>config.log @@ -1364,8 +1414,12 @@ as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH do IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - $as_echo "PATH: $as_dir" + case $as_dir in #((( + '') as_dir=./ ;; + */) ;; + *) as_dir=$as_dir/ ;; + esac + printf "%s\n" "PATH: $as_dir" done IFS=$as_save_IFS @@ -1400,7 +1454,7 @@ do | -silent | --silent | --silen | --sile | --sil) continue ;; *\'*) - ac_arg=`$as_echo "$ac_arg" | sed "s/'/'\\\\\\\\''/g"` ;; + ac_arg=`printf "%s\n" "$ac_arg" | sed "s/'/'\\\\\\\\''/g"` ;; esac case $ac_pass in 1) as_fn_append ac_configure_args0 " '$ac_arg'" ;; @@ -1435,11 +1489,13 @@ done # WARNING: Use '\'' to represent an apostrophe within the trap. # WARNING: Do not start the trap code with a newline, due to a FreeBSD 4.0 bug. trap 'exit_status=$? + # Sanitize IFS. + IFS=" "" $as_nl" # Save into config.log some information that might help in debugging. { echo - $as_echo "## ---------------- ## + printf "%s\n" "## ---------------- ## ## Cache variables. ## ## ---------------- ##" echo @@ -1450,8 +1506,8 @@ trap 'exit_status=$? 
case $ac_val in #( *${as_nl}*) case $ac_var in #( - *_cv_*) { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: cache variable $ac_var contains a newline" >&5 -$as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;; + *_cv_*) { printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: cache variable $ac_var contains a newline" >&5 +printf "%s\n" "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;; esac case $ac_var in #( _ | IFS | as_nl) ;; #( @@ -1475,7 +1531,7 @@ $as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;; ) echo - $as_echo "## ----------------- ## + printf "%s\n" "## ----------------- ## ## Output variables. ## ## ----------------- ##" echo @@ -1483,14 +1539,14 @@ $as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;; do eval ac_val=\$$ac_var case $ac_val in - *\'\''*) ac_val=`$as_echo "$ac_val" | sed "s/'\''/'\''\\\\\\\\'\'''\''/g"`;; + *\'\''*) ac_val=`printf "%s\n" "$ac_val" | sed "s/'\''/'\''\\\\\\\\'\'''\''/g"`;; esac - $as_echo "$ac_var='\''$ac_val'\''" + printf "%s\n" "$ac_var='\''$ac_val'\''" done | sort echo if test -n "$ac_subst_files"; then - $as_echo "## ------------------- ## + printf "%s\n" "## ------------------- ## ## File substitutions. ## ## ------------------- ##" echo @@ -1498,15 +1554,15 @@ $as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;; do eval ac_val=\$$ac_var case $ac_val in - *\'\''*) ac_val=`$as_echo "$ac_val" | sed "s/'\''/'\''\\\\\\\\'\'''\''/g"`;; + *\'\''*) ac_val=`printf "%s\n" "$ac_val" | sed "s/'\''/'\''\\\\\\\\'\'''\''/g"`;; esac - $as_echo "$ac_var='\''$ac_val'\''" + printf "%s\n" "$ac_var='\''$ac_val'\''" done | sort echo fi if test -s confdefs.h; then - $as_echo "## ----------- ## + printf "%s\n" "## ----------- ## ## confdefs.h. ## ## ----------- ##" echo @@ -1514,8 +1570,8 @@ $as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;; echo fi test "$ac_signal" != 0 && - $as_echo "$as_me: caught signal $ac_signal" - $as_echo "$as_me: exit $exit_status" + printf "%s\n" "$as_me: caught signal $ac_signal" + printf "%s\n" "$as_me: exit $exit_status" } >&5 rm -f core *.core core.conftest.* && rm -f -r conftest* confdefs* conf$$* $ac_clean_files && @@ -1529,63 +1585,48 @@ ac_signal=0 # confdefs.h avoids OS command line length limits that DEFS can exceed. rm -f -r conftest* confdefs.h -$as_echo "/* confdefs.h */" > confdefs.h +printf "%s\n" "/* confdefs.h */" > confdefs.h # Predefined preprocessor variables. -cat >>confdefs.h <<_ACEOF -#define PACKAGE_NAME "$PACKAGE_NAME" -_ACEOF +printf "%s\n" "#define PACKAGE_NAME \"$PACKAGE_NAME\"" >>confdefs.h -cat >>confdefs.h <<_ACEOF -#define PACKAGE_TARNAME "$PACKAGE_TARNAME" -_ACEOF +printf "%s\n" "#define PACKAGE_TARNAME \"$PACKAGE_TARNAME\"" >>confdefs.h -cat >>confdefs.h <<_ACEOF -#define PACKAGE_VERSION "$PACKAGE_VERSION" -_ACEOF +printf "%s\n" "#define PACKAGE_VERSION \"$PACKAGE_VERSION\"" >>confdefs.h -cat >>confdefs.h <<_ACEOF -#define PACKAGE_STRING "$PACKAGE_STRING" -_ACEOF +printf "%s\n" "#define PACKAGE_STRING \"$PACKAGE_STRING\"" >>confdefs.h -cat >>confdefs.h <<_ACEOF -#define PACKAGE_BUGREPORT "$PACKAGE_BUGREPORT" -_ACEOF +printf "%s\n" "#define PACKAGE_BUGREPORT \"$PACKAGE_BUGREPORT\"" >>confdefs.h -cat >>confdefs.h <<_ACEOF -#define PACKAGE_URL "$PACKAGE_URL" -_ACEOF +printf "%s\n" "#define PACKAGE_URL \"$PACKAGE_URL\"" >>confdefs.h # Let the site file select an alternate cache file if it wants to. 
# Prefer an explicitly selected file to automatically selected ones. -ac_site_file1=NONE -ac_site_file2=NONE if test -n "$CONFIG_SITE"; then - # We do not want a PATH search for config.site. - case $CONFIG_SITE in #(( - -*) ac_site_file1=./$CONFIG_SITE;; - */*) ac_site_file1=$CONFIG_SITE;; - *) ac_site_file1=./$CONFIG_SITE;; - esac + ac_site_files="$CONFIG_SITE" elif test "x$prefix" != xNONE; then - ac_site_file1=$prefix/share/config.site - ac_site_file2=$prefix/etc/config.site + ac_site_files="$prefix/share/config.site $prefix/etc/config.site" else - ac_site_file1=$ac_default_prefix/share/config.site - ac_site_file2=$ac_default_prefix/etc/config.site + ac_site_files="$ac_default_prefix/share/config.site $ac_default_prefix/etc/config.site" fi -for ac_site_file in "$ac_site_file1" "$ac_site_file2" + +for ac_site_file in $ac_site_files do - test "x$ac_site_file" = xNONE && continue - if test /dev/null != "$ac_site_file" && test -r "$ac_site_file"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: loading site script $ac_site_file" >&5 -$as_echo "$as_me: loading site script $ac_site_file" >&6;} + case $ac_site_file in #( + */*) : + ;; #( + *) : + ac_site_file=./$ac_site_file ;; +esac + if test -f "$ac_site_file" && test -r "$ac_site_file"; then + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: loading site script $ac_site_file" >&5 +printf "%s\n" "$as_me: loading site script $ac_site_file" >&6;} sed 's/^/| /' "$ac_site_file" >&5 . "$ac_site_file" \ - || { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 -$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} + || { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +printf "%s\n" "$as_me: error: in \`$ac_pwd':" >&2;} as_fn_error $? "failed to load site script $ac_site_file See \`config.log' for more details" "$LINENO" 5; } fi @@ -1595,16 +1636,16 @@ if test -r "$cache_file"; then # Some versions of bash will fail to source /dev/null (special files # actually), so we avoid doing that. DJGPP emulates it as a regular file. if test /dev/null != "$cache_file" && test -f "$cache_file"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: loading cache $cache_file" >&5 -$as_echo "$as_me: loading cache $cache_file" >&6;} + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: loading cache $cache_file" >&5 +printf "%s\n" "$as_me: loading cache $cache_file" >&6;} case $cache_file in [\\/]* | ?:[\\/]* ) . "$cache_file";; *) . 
"./$cache_file";; esac fi else - { $as_echo "$as_me:${as_lineno-$LINENO}: creating cache $cache_file" >&5 -$as_echo "$as_me: creating cache $cache_file" >&6;} + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: creating cache $cache_file" >&5 +printf "%s\n" "$as_me: creating cache $cache_file" >&6;} >$cache_file fi @@ -1618,12 +1659,12 @@ for ac_var in $ac_precious_vars; do eval ac_new_val=\$ac_env_${ac_var}_value case $ac_old_set,$ac_new_set in set,) - { $as_echo "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&5 -$as_echo "$as_me: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&2;} + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&5 +printf "%s\n" "$as_me: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&2;} ac_cache_corrupted=: ;; ,set) - { $as_echo "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' was not set in the previous run" >&5 -$as_echo "$as_me: error: \`$ac_var' was not set in the previous run" >&2;} + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' was not set in the previous run" >&5 +printf "%s\n" "$as_me: error: \`$ac_var' was not set in the previous run" >&2;} ac_cache_corrupted=: ;; ,);; *) @@ -1632,24 +1673,24 @@ $as_echo "$as_me: error: \`$ac_var' was not set in the previous run" >&2;} ac_old_val_w=`echo x $ac_old_val` ac_new_val_w=`echo x $ac_new_val` if test "$ac_old_val_w" != "$ac_new_val_w"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' has changed since the previous run:" >&5 -$as_echo "$as_me: error: \`$ac_var' has changed since the previous run:" >&2;} + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' has changed since the previous run:" >&5 +printf "%s\n" "$as_me: error: \`$ac_var' has changed since the previous run:" >&2;} ac_cache_corrupted=: else - { $as_echo "$as_me:${as_lineno-$LINENO}: warning: ignoring whitespace changes in \`$ac_var' since the previous run:" >&5 -$as_echo "$as_me: warning: ignoring whitespace changes in \`$ac_var' since the previous run:" >&2;} + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: warning: ignoring whitespace changes in \`$ac_var' since the previous run:" >&5 +printf "%s\n" "$as_me: warning: ignoring whitespace changes in \`$ac_var' since the previous run:" >&2;} eval $ac_var=\$ac_old_val fi - { $as_echo "$as_me:${as_lineno-$LINENO}: former value: \`$ac_old_val'" >&5 -$as_echo "$as_me: former value: \`$ac_old_val'" >&2;} - { $as_echo "$as_me:${as_lineno-$LINENO}: current value: \`$ac_new_val'" >&5 -$as_echo "$as_me: current value: \`$ac_new_val'" >&2;} + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: former value: \`$ac_old_val'" >&5 +printf "%s\n" "$as_me: former value: \`$ac_old_val'" >&2;} + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: current value: \`$ac_new_val'" >&5 +printf "%s\n" "$as_me: current value: \`$ac_new_val'" >&2;} fi;; esac # Pass precious variables to config.status. 
if test "$ac_new_set" = set; then case $ac_new_val in - *\'*) ac_arg=$ac_var=`$as_echo "$ac_new_val" | sed "s/'/'\\\\\\\\''/g"` ;; + *\'*) ac_arg=$ac_var=`printf "%s\n" "$ac_new_val" | sed "s/'/'\\\\\\\\''/g"` ;; *) ac_arg=$ac_var=$ac_new_val ;; esac case " $ac_configure_args " in @@ -1659,11 +1700,12 @@ $as_echo "$as_me: current value: \`$ac_new_val'" >&2;} fi done if $ac_cache_corrupted; then - { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 -$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} - { $as_echo "$as_me:${as_lineno-$LINENO}: error: changes in the environment can compromise the build" >&5 -$as_echo "$as_me: error: changes in the environment can compromise the build" >&2;} - as_fn_error $? "run \`make distclean' and/or \`rm $cache_file' and start over" "$LINENO" 5 + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +printf "%s\n" "$as_me: error: in \`$ac_pwd':" >&2;} + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: error: changes in the environment can compromise the build" >&5 +printf "%s\n" "$as_me: error: changes in the environment can compromise the build" >&2;} + as_fn_error $? "run \`${MAKE-make} distclean' and/or \`rm $cache_file' + and start over" "$LINENO" 5 fi ## -------------------- ## ## Main body of script. ## @@ -1681,10 +1723,10 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu # find compiler and flags # ########################### -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking location of R" >&5 -$as_echo_n "checking location of R... " >&6; } -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: ${R_HOME}" >&5 -$as_echo "${R_HOME}" >&6; } +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking location of R" >&5 +printf %s "checking location of R... " >&6; } +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: ${R_HOME}" >&5 +printf "%s\n" "${R_HOME}" >&6; } # set up CPP flags # find the compiler and compiler flags used by R. @@ -1719,8 +1761,8 @@ LGB_CPPFLAGS="${LGB_CPPFLAGS} -DEIGEN_MPL2_ONLY -DEIGEN_DONT_PARALLELIZE" # MM_PREFETCH # ############### -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether MM_PREFETCH works" >&5 -$as_echo_n "checking whether MM_PREFETCH works... " >&6; } +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether MM_PREFETCH works" >&5 +printf %s "checking whether MM_PREFETCH works... " >&6; } ac_mmprefetch=no cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ @@ -1729,7 +1771,7 @@ cat confdefs.h - <<_ACEOF >conftest.$ac_ext #include int -main () +main (void) { int a = 0; @@ -1744,8 +1786,8 @@ main () _ACEOF ${CXX} ${CPPFLAGS} ${CXXFLAGS} -o conftest conftest.cpp 2>/dev/null && ./conftest && ac_mmprefetch=yes -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: ${ac_mmprefetch}" >&5 -$as_echo "${ac_mmprefetch}" >&6; } +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: ${ac_mmprefetch}" >&5 +printf "%s\n" "${ac_mmprefetch}" >&6; } if test "${ac_mmprefetch}" = yes; then LGB_CPPFLAGS+=" -DMM_PREFETCH=1" fi @@ -1754,8 +1796,8 @@ fi # MM_ALLOC # ############ -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether MM_MALLOC works" >&5 -$as_echo_n "checking whether MM_MALLOC works... " >&6; } +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether MM_MALLOC works" >&5 +printf %s "checking whether MM_MALLOC works... " >&6; } ac_mm_malloc=no cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. 
*/ @@ -1764,7 +1806,7 @@ cat confdefs.h - <<_ACEOF >conftest.$ac_ext #include int -main () +main (void) { char *a = (char*)_mm_malloc(8, 16); @@ -1779,8 +1821,8 @@ main () _ACEOF ${CXX} ${CPPFLAGS} ${CXXFLAGS} -o conftest conftest.cpp 2>/dev/null && ./conftest && ac_mm_malloc=yes -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: ${ac_mm_malloc}" >&5 -$as_echo "${ac_mm_malloc}" >&6; } +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: ${ac_mm_malloc}" >&5 +printf "%s\n" "${ac_mm_malloc}" >&6; } if test "${ac_mm_malloc}" = yes; then LGB_CPPFLAGS+=" -DMM_MALLOC=1" fi @@ -1810,11 +1852,11 @@ then HOMEBREW_LIBOMP_PREFIX="" if command -v brew &> /dev/null; then ac_brew_openmp=no - { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether OpenMP was installed via Homebrew" >&5 -$as_echo_n "checking whether OpenMP was installed via Homebrew... " >&6; } + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether OpenMP was installed via Homebrew" >&5 +printf %s "checking whether OpenMP was installed via Homebrew... " >&6; } brew --prefix libomp &>/dev/null && ac_brew_openmp=yes - { $as_echo "$as_me:${as_lineno-$LINENO}: result: ${ac_brew_openmp}" >&5 -$as_echo "${ac_brew_openmp}" >&6; } + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: ${ac_brew_openmp}" >&5 +printf "%s\n" "${ac_brew_openmp}" >&6; } if test "${ac_brew_openmp}" = yes; then HOMEBREW_LIBOMP_PREFIX=`brew --prefix libomp` OPENMP_CXXFLAGS="${OPENMP_CXXFLAGS} -I${HOMEBREW_LIBOMP_PREFIX}/include" @@ -1822,8 +1864,8 @@ $as_echo "${ac_brew_openmp}" >&6; } fi fi ac_pkg_openmp=no - { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether OpenMP will work in a package" >&5 -$as_echo_n "checking whether OpenMP will work in a package... " >&6; } + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether OpenMP will work in a package" >&5 +printf %s "checking whether OpenMP will work in a package... " >&6; } cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ @@ -1831,7 +1873,7 @@ $as_echo_n "checking whether OpenMP will work in a package... 
" >&6; } #include int -main () +main (void) { return (omp_get_max_threads() <= 1); @@ -1855,8 +1897,8 @@ _ACEOF ${CXX} ${CPPFLAGS} ${CXXFLAGS} ${LDFLAGS} ${OPENMP_CXXFLAGS} ${OPENMP_LIB} -o conftest conftest.cpp 2>/dev/null && ./conftest && ac_pkg_openmp=yes fi - { $as_echo "$as_me:${as_lineno-$LINENO}: result: ${ac_pkg_openmp}" >&5 -$as_echo "${ac_pkg_openmp}" >&6; } + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: ${ac_pkg_openmp}" >&5 +printf "%s\n" "${ac_pkg_openmp}" >&6; } if test "${ac_pkg_openmp}" = no; then OPENMP_CXXFLAGS='' OPENMP_LIB='' @@ -1904,8 +1946,8 @@ _ACEOF case $ac_val in #( *${as_nl}*) case $ac_var in #( - *_cv_*) { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: cache variable $ac_var contains a newline" >&5 -$as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;; + *_cv_*) { printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: cache variable $ac_var contains a newline" >&5 +printf "%s\n" "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;; esac case $ac_var in #( _ | IFS | as_nl) ;; #( @@ -1935,15 +1977,15 @@ $as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;; /^ac_cv_env_/b end t clear :clear - s/^\([^=]*\)=\(.*[{}].*\)$/test "${\1+set}" = set || &/ + s/^\([^=]*\)=\(.*[{}].*\)$/test ${\1+y} || &/ t end s/^\([^=]*\)=\(.*\)$/\1=${\1=\2}/ :end' >>confcache if diff "$cache_file" confcache >/dev/null 2>&1; then :; else if test -w "$cache_file"; then if test "x$cache_file" != "x/dev/null"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: updating cache $cache_file" >&5 -$as_echo "$as_me: updating cache $cache_file" >&6;} + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: updating cache $cache_file" >&5 +printf "%s\n" "$as_me: updating cache $cache_file" >&6;} if test ! -f "$cache_file" || test -h "$cache_file"; then cat confcache >"$cache_file" else @@ -1957,8 +1999,8 @@ $as_echo "$as_me: updating cache $cache_file" >&6;} fi fi else - { $as_echo "$as_me:${as_lineno-$LINENO}: not updating unwritable cache $cache_file" >&5 -$as_echo "$as_me: not updating unwritable cache $cache_file" >&6;} + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: not updating unwritable cache $cache_file" >&5 +printf "%s\n" "$as_me: not updating unwritable cache $cache_file" >&6;} fi fi rm -f confcache @@ -2011,7 +2053,7 @@ U= for ac_i in : $LIBOBJS; do test "x$ac_i" = x: && continue # 1. Remove the extension, and $U if already installed. ac_script='s/\$U\././;s/\.o$//;s/\.obj$//' - ac_i=`$as_echo "$ac_i" | sed "$ac_script"` + ac_i=`printf "%s\n" "$ac_i" | sed "$ac_script"` # 2. Prepend LIBOBJDIR. When used with automake>=1.10 LIBOBJDIR # will be set to the directory where LIBOBJS objects are built. as_fn_append ac_libobjs " \${LIBOBJDIR}$ac_i\$U.$ac_objext" @@ -2027,8 +2069,8 @@ LTLIBOBJS=$ac_ltlibobjs ac_write_fail=0 ac_clean_files_save=$ac_clean_files ac_clean_files="$ac_clean_files $CONFIG_STATUS" -{ $as_echo "$as_me:${as_lineno-$LINENO}: creating $CONFIG_STATUS" >&5 -$as_echo "$as_me: creating $CONFIG_STATUS" >&6;} +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: creating $CONFIG_STATUS" >&5 +printf "%s\n" "$as_me: creating $CONFIG_STATUS" >&6;} as_write_fail=0 cat >$CONFIG_STATUS <<_ASEOF || as_write_fail=1 #! 
$SHELL @@ -2051,14 +2093,16 @@ cat >>$CONFIG_STATUS <<\_ASEOF || as_write_fail=1 # Be more Bourne compatible DUALCASE=1; export DUALCASE # for MKS sh -if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then : +as_nop=: +if test ${ZSH_VERSION+y} && (emulate sh) >/dev/null 2>&1 +then : emulate sh NULLCMD=: # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which # is contrary to our usage. Disable this feature. alias -g '${1+"$@"}'='"$@"' setopt NO_GLOB_SUBST -else +else $as_nop case `(set -o) 2>/dev/null` in #( *posix*) : set -o posix ;; #( @@ -2068,46 +2112,46 @@ esac fi + +# Reset variables that may have inherited troublesome values from +# the environment. + +# IFS needs to be set, to space, tab, and newline, in precisely that order. +# (If _AS_PATH_WALK were called with IFS unset, it would have the +# side effect of setting IFS to empty, thus disabling word splitting.) +# Quoting is to prevent editors from complaining about space-tab. as_nl=' ' export as_nl -# Printing a long string crashes Solaris 7 /usr/bin/printf. -as_echo='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\' -as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo -as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo$as_echo -# Prefer a ksh shell builtin over an external printf program on Solaris, -# but without wasting forks for bash or zsh. -if test -z "$BASH_VERSION$ZSH_VERSION" \ - && (test "X`print -r -- $as_echo`" = "X$as_echo") 2>/dev/null; then - as_echo='print -r --' - as_echo_n='print -rn --' -elif (test "X`printf %s $as_echo`" = "X$as_echo") 2>/dev/null; then - as_echo='printf %s\n' - as_echo_n='printf %s' -else - if test "X`(/usr/ucb/echo -n -n $as_echo) 2>/dev/null`" = "X-n $as_echo"; then - as_echo_body='eval /usr/ucb/echo -n "$1$as_nl"' - as_echo_n='/usr/ucb/echo -n' - else - as_echo_body='eval expr "X$1" : "X\\(.*\\)"' - as_echo_n_body='eval - arg=$1; - case $arg in #( - *"$as_nl"*) - expr "X$arg" : "X\\(.*\\)$as_nl"; - arg=`expr "X$arg" : ".*$as_nl\\(.*\\)"`;; - esac; - expr "X$arg" : "X\\(.*\\)" | tr -d "$as_nl" - ' - export as_echo_n_body - as_echo_n='sh -c $as_echo_n_body as_echo' - fi - export as_echo_body - as_echo='sh -c $as_echo_body as_echo' -fi +IFS=" "" $as_nl" + +PS1='$ ' +PS2='> ' +PS4='+ ' + +# Ensure predictable behavior from utilities with locale-dependent output. +LC_ALL=C +export LC_ALL +LANGUAGE=C +export LANGUAGE + +# We cannot yet rely on "unset" to work, but we need these variables +# to be unset--not just set to an empty or harmless value--now, to +# avoid bugs in old shells (e.g. pre-3.0 UWIN ksh). This construct +# also avoids known problems related to "unset" and subshell syntax +# in other old shells (e.g. bash 2.01 and pdksh 5.2.14). +for as_var in BASH_ENV ENV MAIL MAILPATH CDPATH +do eval test \${$as_var+y} \ + && ( (unset $as_var) || exit 1) >/dev/null 2>&1 && unset $as_var || : +done + +# Ensure that fds 0, 1, and 2 are open. +if (exec 3>&0) 2>/dev/null; then :; else exec 0&1) 2>/dev/null; then :; else exec 1>/dev/null; fi +if (exec 3>&2) ; then :; else exec 2>/dev/null; fi # The user is always right. -if test "${PATH_SEPARATOR+set}" != set; then +if ${PATH_SEPARATOR+false} :; then PATH_SEPARATOR=: (PATH='/bin;/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 && { (PATH='/bin:/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 || @@ -2116,13 +2160,6 @@ if test "${PATH_SEPARATOR+set}" != set; then fi -# IFS -# We need space, tab and new line, in precisely that order. 
Quoting is -# there to prevent editors from complaining about space-tab. -# (If _AS_PATH_WALK were called with IFS unset, it would disable word -# splitting by setting IFS to empty value.) -IFS=" "" $as_nl" - # Find who we are. Look in the path if we contain no directory separator. as_myself= case $0 in #(( @@ -2131,8 +2168,12 @@ case $0 in #(( for as_dir in $PATH do IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break + case $as_dir in #((( + '') as_dir=./ ;; + */) ;; + *) as_dir=$as_dir/ ;; + esac + test -r "$as_dir$0" && as_myself=$as_dir$0 && break done IFS=$as_save_IFS @@ -2144,30 +2185,10 @@ if test "x$as_myself" = x; then as_myself=$0 fi if test ! -f "$as_myself"; then - $as_echo "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2 + printf "%s\n" "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2 exit 1 fi -# Unset variables that we do not need and which cause bugs (e.g. in -# pre-3.0 UWIN ksh). But do not cause bugs in bash 2.01; the "|| exit 1" -# suppresses any "Segmentation fault" message there. '((' could -# trigger a bug in pdksh 5.2.14. -for as_var in BASH_ENV ENV MAIL MAILPATH -do eval test x\${$as_var+set} = xset \ - && ( (unset $as_var) || exit 1) >/dev/null 2>&1 && unset $as_var || : -done -PS1='$ ' -PS2='> ' -PS4='+ ' - -# NLS nuisances. -LC_ALL=C -export LC_ALL -LANGUAGE=C -export LANGUAGE - -# CDPATH. -(unset CDPATH) >/dev/null 2>&1 && unset CDPATH # as_fn_error STATUS ERROR [LINENO LOG_FD] @@ -2180,13 +2201,14 @@ as_fn_error () as_status=$1; test $as_status -eq 0 && as_status=1 if test "$4"; then as_lineno=${as_lineno-"$3"} as_lineno_stack=as_lineno_stack=$as_lineno_stack - $as_echo "$as_me:${as_lineno-$LINENO}: error: $2" >&$4 + printf "%s\n" "$as_me:${as_lineno-$LINENO}: error: $2" >&$4 fi - $as_echo "$as_me: error: $2" >&2 + printf "%s\n" "$as_me: error: $2" >&2 as_fn_exit $as_status } # as_fn_error + # as_fn_set_status STATUS # ----------------------- # Set $? to STATUS, without forking. @@ -2213,18 +2235,20 @@ as_fn_unset () { eval $1=; unset $1;} } as_unset=as_fn_unset + # as_fn_append VAR VALUE # ---------------------- # Append the text in VALUE to the end of the definition contained in VAR. Take # advantage of any shell optimizations that allow amortized linear growth over # repeated appends, instead of the typical quadratic growth present in naive # implementations. -if (eval "as_var=1; as_var+=2; test x\$as_var = x12") 2>/dev/null; then : +if (eval "as_var=1; as_var+=2; test x\$as_var = x12") 2>/dev/null +then : eval 'as_fn_append () { eval $1+=\$2 }' -else +else $as_nop as_fn_append () { eval $1=\$$1\$2 @@ -2236,12 +2260,13 @@ fi # as_fn_append # Perform arithmetic evaluation on the ARGs, and store the result in the # global $as_val. Take advantage of shells that can avoid forks. The arguments # must be portable across $(()) and expr. -if (eval "test \$(( 1 + 1 )) = 2") 2>/dev/null; then : +if (eval "test \$(( 1 + 1 )) = 2") 2>/dev/null +then : eval 'as_fn_arith () { as_val=$(( $* )) }' -else +else $as_nop as_fn_arith () { as_val=`expr "$@" || test $? -eq 1` @@ -2272,7 +2297,7 @@ as_me=`$as_basename -- "$0" || $as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \ X"$0" : 'X\(//\)$' \| \ X"$0" : 'X\(/\)' \| . 
2>/dev/null || -$as_echo X/"$0" | +printf "%s\n" X/"$0" | sed '/^.*\/\([^/][^/]*\)\/*$/{ s//\1/ q @@ -2294,6 +2319,10 @@ as_cr_Letters=$as_cr_letters$as_cr_LETTERS as_cr_digits='0123456789' as_cr_alnum=$as_cr_Letters$as_cr_digits + +# Determine whether it's possible to make 'echo' print without a newline. +# These variables are no longer used directly by Autoconf, but are AC_SUBSTed +# for compatibility with existing Makefiles. ECHO_C= ECHO_N= ECHO_T= case `echo -n x` in #((((( -n*) @@ -2307,6 +2336,12 @@ case `echo -n x` in #((((( ECHO_N='-n';; esac +# For backward compatibility with old third-party macros, we provide +# the shell variables $as_echo and $as_echo_n. New code should use +# AS_ECHO(["message"]) and AS_ECHO_N(["message"]), respectively. +as_echo='printf %s\n' +as_echo_n='printf %s' + rm -f conf$$ conf$$.exe conf$$.file if test -d conf$$.dir; then rm -f conf$$.dir/conf$$.file @@ -2348,7 +2383,7 @@ as_fn_mkdir_p () as_dirs= while :; do case $as_dir in #( - *\'*) as_qdir=`$as_echo "$as_dir" | sed "s/'/'\\\\\\\\''/g"`;; #'( + *\'*) as_qdir=`printf "%s\n" "$as_dir" | sed "s/'/'\\\\\\\\''/g"`;; #'( *) as_qdir=$as_dir;; esac as_dirs="'$as_qdir' $as_dirs" @@ -2357,7 +2392,7 @@ $as_expr X"$as_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ X"$as_dir" : 'X\(//\)[^/]' \| \ X"$as_dir" : 'X\(//\)$' \| \ X"$as_dir" : 'X\(/\)' \| . 2>/dev/null || -$as_echo X"$as_dir" | +printf "%s\n" X"$as_dir" | sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ s//\1/ q @@ -2420,7 +2455,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # values after options handling. ac_log=" This file was extended by lightgbm $as_me 3.3.3.99, which was -generated by GNU Autoconf 2.69. Invocation command line was +generated by GNU Autoconf 2.71. Invocation command line was CONFIG_FILES = $CONFIG_FILES CONFIG_HEADERS = $CONFIG_HEADERS @@ -2469,14 +2504,16 @@ $config_files Report bugs to the package provider." _ACEOF +ac_cs_config=`printf "%s\n" "$ac_configure_args" | sed "$ac_safe_unquote"` +ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\''/g"` cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 -ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" +ac_cs_config='$ac_cs_config_escaped' ac_cs_version="\\ lightgbm config.status 3.3.3.99 -configured by $0, generated by GNU Autoconf 2.69, +configured by $0, generated by GNU Autoconf 2.71, with options \\"\$ac_cs_config\\" -Copyright (C) 2012 Free Software Foundation, Inc. +Copyright (C) 2021 Free Software Foundation, Inc. This config.status script is free software; the Free Software Foundation gives unlimited permission to copy, distribute and modify it." @@ -2513,21 +2550,21 @@ do -recheck | --recheck | --rechec | --reche | --rech | --rec | --re | --r) ac_cs_recheck=: ;; --version | --versio | --versi | --vers | --ver | --ve | --v | -V ) - $as_echo "$ac_cs_version"; exit ;; + printf "%s\n" "$ac_cs_version"; exit ;; --config | --confi | --conf | --con | --co | --c ) - $as_echo "$ac_cs_config"; exit ;; + printf "%s\n" "$ac_cs_config"; exit ;; --debug | --debu | --deb | --de | --d | -d ) debug=: ;; --file | --fil | --fi | --f ) $ac_shift case $ac_optarg in - *\'*) ac_optarg=`$as_echo "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"` ;; + *\'*) ac_optarg=`printf "%s\n" "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"` ;; '') as_fn_error $? 
"missing file argument" ;; esac as_fn_append CONFIG_FILES " '$ac_optarg'" ac_need_defaults=false;; --he | --h | --help | --hel | -h ) - $as_echo "$ac_cs_usage"; exit ;; + printf "%s\n" "$ac_cs_usage"; exit ;; -q | -quiet | --quiet | --quie | --qui | --qu | --q \ | -silent | --silent | --silen | --sile | --sil | --si | --s) ac_cs_silent=: ;; @@ -2555,7 +2592,7 @@ cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 if \$ac_cs_recheck; then set X $SHELL '$0' $ac_configure_args \$ac_configure_extra_args --no-create --no-recursion shift - \$as_echo "running CONFIG_SHELL=$SHELL \$*" >&6 + \printf "%s\n" "running CONFIG_SHELL=$SHELL \$*" >&6 CONFIG_SHELL='$SHELL' export CONFIG_SHELL exec "\$@" @@ -2569,7 +2606,7 @@ exec 5>>config.log sed 'h;s/./-/g;s/^.../## /;s/...$/ ##/;p;x;p;x' <<_ASBOX ## Running $as_me. ## _ASBOX - $as_echo "$ac_log" + printf "%s\n" "$ac_log" } >&5 _ACEOF @@ -2594,7 +2631,7 @@ done # We use the long form for the default assignment because of an extremely # bizarre bug on SunOS 4.1.3. if $ac_need_defaults; then - test "${CONFIG_FILES+set}" = set || CONFIG_FILES=$config_files + test ${CONFIG_FILES+y} || CONFIG_FILES=$config_files fi # Have a temporary directory for convenience. Make it in the build tree @@ -2822,7 +2859,7 @@ do esac || as_fn_error 1 "cannot find input file: \`$ac_f'" "$LINENO" 5;; esac - case $ac_f in *\'*) ac_f=`$as_echo "$ac_f" | sed "s/'/'\\\\\\\\''/g"`;; esac + case $ac_f in *\'*) ac_f=`printf "%s\n" "$ac_f" | sed "s/'/'\\\\\\\\''/g"`;; esac as_fn_append ac_file_inputs " '$ac_f'" done @@ -2830,17 +2867,17 @@ do # use $as_me), people would be surprised to read: # /* config.h. Generated by config.status. */ configure_input='Generated from '` - $as_echo "$*" | sed 's|^[^:]*/||;s|:[^:]*/|, |g' + printf "%s\n" "$*" | sed 's|^[^:]*/||;s|:[^:]*/|, |g' `' by configure.' if test x"$ac_file" != x-; then configure_input="$ac_file. $configure_input" - { $as_echo "$as_me:${as_lineno-$LINENO}: creating $ac_file" >&5 -$as_echo "$as_me: creating $ac_file" >&6;} + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: creating $ac_file" >&5 +printf "%s\n" "$as_me: creating $ac_file" >&6;} fi # Neutralize special characters interpreted by sed in replacement strings. case $configure_input in #( *\&* | *\|* | *\\* ) - ac_sed_conf_input=`$as_echo "$configure_input" | + ac_sed_conf_input=`printf "%s\n" "$configure_input" | sed 's/[\\\\&|]/\\\\&/g'`;; #( *) ac_sed_conf_input=$configure_input;; esac @@ -2857,7 +2894,7 @@ $as_expr X"$ac_file" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ X"$ac_file" : 'X\(//\)[^/]' \| \ X"$ac_file" : 'X\(//\)$' \| \ X"$ac_file" : 'X\(/\)' \| . 2>/dev/null || -$as_echo X"$ac_file" | +printf "%s\n" X"$ac_file" | sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ s//\1/ q @@ -2881,9 +2918,9 @@ $as_echo X"$ac_file" | case "$ac_dir" in .) ac_dir_suffix= ac_top_builddir_sub=. ac_top_build_prefix= ;; *) - ac_dir_suffix=/`$as_echo "$ac_dir" | sed 's|^\.[\\/]||'` + ac_dir_suffix=/`printf "%s\n" "$ac_dir" | sed 's|^\.[\\/]||'` # A ".." for each directory in $ac_dir_suffix. - ac_top_builddir_sub=`$as_echo "$ac_dir_suffix" | sed 's|/[^\\/]*|/..|g;s|/||'` + ac_top_builddir_sub=`printf "%s\n" "$ac_dir_suffix" | sed 's|/[^\\/]*|/..|g;s|/||'` case $ac_top_builddir_sub in "") ac_top_builddir_sub=. 
ac_top_build_prefix= ;; *) ac_top_build_prefix=$ac_top_builddir_sub/ ;; @@ -2936,8 +2973,8 @@ ac_sed_dataroot=' case `eval "sed -n \"\$ac_sed_dataroot\" $ac_file_inputs"` in *datarootdir*) ac_datarootdir_seen=yes;; *@datadir@*|*@docdir@*|*@infodir@*|*@localedir@*|*@mandir@*) - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&5 -$as_echo "$as_me: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&2;} + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&5 +printf "%s\n" "$as_me: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&2;} _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_datarootdir_hack=' @@ -2979,9 +3016,9 @@ test -z "$ac_datarootdir_hack$ac_datarootdir_seen" && { ac_out=`sed -n '/\${datarootdir}/p' "$ac_tmp/out"`; test -n "$ac_out"; } && { ac_out=`sed -n '/^[ ]*datarootdir[ ]*:*=/p' \ "$ac_tmp/out"`; test -z "$ac_out"; } && - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $ac_file contains a reference to the variable \`datarootdir' + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: $ac_file contains a reference to the variable \`datarootdir' which seems to be undefined. Please make sure it is defined" >&5 -$as_echo "$as_me: WARNING: $ac_file contains a reference to the variable \`datarootdir' +printf "%s\n" "$as_me: WARNING: $ac_file contains a reference to the variable \`datarootdir' which seems to be undefined. Please make sure it is defined" >&2;} rm -f "$ac_tmp/stdin" @@ -3028,7 +3065,8 @@ if test "$no_create" != yes; then $ac_cs_success || as_fn_exit 1 fi if test -n "$ac_unrecognized_opts" && test "$enable_option_checking" != no; then - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: unrecognized options: $ac_unrecognized_opts" >&5 -$as_echo "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2;} + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: unrecognized options: $ac_unrecognized_opts" >&5 +printf "%s\n" "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2;} fi + diff --git a/R-package/cran-comments.md b/R-package/cran-comments.md index 70f89a89159f..e52f2f15dd01 100644 --- a/R-package/cran-comments.md +++ b/R-package/cran-comments.md @@ -1,9 +1,31 @@ # CRAN Submission History +## v3.3.4 - Submission 1 - (December 15, 2022) + +### CRAN response + +Accepted to CRAN + +### Maintainer Notes + +Submitted with the following comment: + +> This submission contains {lightgbm} 3.3.3. + +> Per CRAN's policies, I am submitting it on behalf of the project's maintainer (Yu Shi), with his permission. + +> This submission includes patches to address the following warnings observed on the fedora and debian CRAN checks. +> +> Compiled code should not call entry points which might terminate R nor write to stdout/stderr instead of to the console, nor use Fortran I/O nor system RNGs nor [v]sprintf. + +> Thank you very much for your time and consideration. 
+ ## v3.3.3 - Submission 1 - (October 10, 2022) ### CRAN response +Accepted to CRAN + ### Maintainer Notes Submitted with the following comment: diff --git a/R-package/recreate-configure.sh b/R-package/recreate-configure.sh index 2df5ffa64f6f..df3586fd2af2 100755 --- a/R-package/recreate-configure.sh +++ b/R-package/recreate-configure.sh @@ -1,7 +1,7 @@ #!/bin/bash # recreates 'configure' from 'configure.ac' -# this script should run on Ubuntu 20.04 +# this script should run on Ubuntu 22.04 AUTOCONF_VERSION=$(cat R-package/AUTOCONF_UBUNTU_VERSION) # R packages cannot have versions like 3.0.0rc1, but diff --git a/R-package/src/Makevars.in b/R-package/src/Makevars.in index 4d0e1d40a9e2..b95897af9844 100644 --- a/R-package/src/Makevars.in +++ b/R-package/src/Makevars.in @@ -26,6 +26,7 @@ OBJECTS = \ boosting/gbdt_model_text.o \ boosting/gbdt_prediction.o \ boosting/prediction_early_stop.o \ + boosting/sample_strategy.o \ io/bin.o \ io/config.o \ io/config_auto.o \ diff --git a/R-package/src/Makevars.win.in b/R-package/src/Makevars.win.in index 10e3ce949396..befd2df65052 100644 --- a/R-package/src/Makevars.win.in +++ b/R-package/src/Makevars.win.in @@ -27,6 +27,7 @@ OBJECTS = \ boosting/gbdt_model_text.o \ boosting/gbdt_prediction.o \ boosting/prediction_early_stop.o \ + boosting/sample_strategy.o \ io/bin.o \ io/config.o \ io/config_auto.o \ diff --git a/README.md b/README.md index 5c575518e031..0ade6706b7ba 100644 --- a/README.md +++ b/README.md @@ -121,6 +121,8 @@ MLflow (experiment tracking, model monitoring framework): https://github.com/mlf lightgbm-transform (feature transformation binding): https://github.com/microsoft/lightgbm-transform +`postgresml` (LightGBM training and prediction in SQL, via a Postgres extension): https://github.com/postgresml/postgresml + Support ------- diff --git a/cmake/IntegratedOpenCL.cmake b/cmake/IntegratedOpenCL.cmake index 5473311196a7..bbdc6a4f6703 100644 --- a/cmake/IntegratedOpenCL.cmake +++ b/cmake/IntegratedOpenCL.cmake @@ -26,28 +26,45 @@ FetchContent_Declare(OpenCL-ICD-Loader GIT_REPOSITORY ${OPENCL_LOADER_REPOSITORY FetchContent_GetProperties(OpenCL-ICD-Loader) if(NOT OpenCL-ICD-Loader_POPULATED) FetchContent_Populate(OpenCL-ICD-Loader) - set(USE_DYNAMIC_VCXX_RUNTIME ON) + if(WIN32) + set(USE_DYNAMIC_VCXX_RUNTIME ON) + endif() add_subdirectory(${opencl-icd-loader_SOURCE_DIR} ${opencl-icd-loader_BINARY_DIR} EXCLUDE_FROM_ALL) message(STATUS "Populated OpenCL ICD Loader") endif() list(APPEND INTEGRATED_OPENCL_INCLUDES ${OPENCL_ICD_LOADER_HEADERS_DIR}) -list( - APPEND - INTEGRATED_OPENCL_LIBRARIES - ${opencl-icd-loader_BINARY_DIR}/Release/OpenCL.lib - cfgmgr32.lib - runtimeobject.lib -) list(APPEND INTEGRATED_OPENCL_DEFINITIONS CL_TARGET_OPENCL_VERSION=120) +if(WIN32) + list( + APPEND + INTEGRATED_OPENCL_LIBRARIES + ${opencl-icd-loader_BINARY_DIR}/Release/OpenCL.lib + cfgmgr32.lib + runtimeobject.lib + ) +else() + list( + APPEND + INTEGRATED_OPENCL_LIBRARIES + ${opencl-icd-loader_BINARY_DIR}/libOpenCL.a + ) + set_property(TARGET OpenCL PROPERTY POSITION_INDEPENDENT_CODE ON) +endif() # Build Independent Boost libraries include(ExternalProject) include(ProcessorCount) ProcessorCount(J) set(BOOST_BASE "${PROJECT_BINARY_DIR}/Boost") -set(BOOST_BOOTSTRAP "${BOOST_BASE}/source/bootstrap.bat") -set(BOOST_BUILD "${BOOST_BASE}/source/b2.exe") -set(BOOST_FLAGS "") +if(WIN32) + set(BOOST_BOOTSTRAP "${BOOST_BASE}/source/bootstrap.bat") + set(BOOST_BUILD "${BOOST_BASE}/source/b2.exe") + set(BOOST_FLAGS "") +else() + set(BOOST_BOOTSTRAP 
"${BOOST_BASE}/source/bootstrap.sh") + set(BOOST_BUILD "${BOOST_BASE}/source/b2") + set(BOOST_FLAGS "-fPIC") +endif() list( APPEND BOOST_SUBMODULES @@ -137,35 +154,37 @@ ExternalProject_Add( set(BOOST_INCLUDE "${BOOST_BASE}/source" CACHE PATH "") set(BOOST_LIBRARY "${BOOST_BASE}/source/stage/lib" CACHE PATH "") list(APPEND INTEGRATED_OPENCL_INCLUDES ${BOOST_INCLUDE}) -if(MSVC) - if(${MSVC_VERSION} GREATER 1929) - message(FATAL_ERROR "Unrecognized MSVC version number: ${MSVC_VERSION}") - elseif(${MSVC_VERSION} GREATER 1919) - set(MSVC_TOOLCHAIN_ID "142") - elseif(${MSVC_VERSION} GREATER 1909) - set(MSVC_TOOLCHAIN_ID "141") - elseif(${MSVC_VERSION} GREATER 1899) - set(MSVC_TOOLCHAIN_ID "140") +if(WIN32) + if(MSVC) + if(${MSVC_VERSION} GREATER 1929) + message(FATAL_ERROR "Unrecognized MSVC version number: ${MSVC_VERSION}") + elseif(${MSVC_VERSION} GREATER 1919) + set(MSVC_TOOLCHAIN_ID "142") + elseif(${MSVC_VERSION} GREATER 1909) + set(MSVC_TOOLCHAIN_ID "141") + elseif(${MSVC_VERSION} GREATER 1899) + set(MSVC_TOOLCHAIN_ID "140") + else() + message(FATAL_ERROR "Unrecognized MSVC version number: ${MSVC_VERSION}") + endif() + list( + APPEND + INTEGRATED_OPENCL_LIBRARIES + ${BOOST_LIBRARY}/libboost_filesystem-vc${MSVC_TOOLCHAIN_ID}-mt-x64-${BOOST_VERSION_UNDERSCORE}.lib + ${BOOST_LIBRARY}/libboost_system-vc${MSVC_TOOLCHAIN_ID}-mt-x64-${BOOST_VERSION_UNDERSCORE}.lib + ${BOOST_LIBRARY}/libboost_chrono-vc${MSVC_TOOLCHAIN_ID}-mt-x64-${BOOST_VERSION_UNDERSCORE}.lib + ) else() - message(FATAL_ERROR "Unrecognized MSVC version number: ${MSVC_VERSION}") + message(FATAL_ERROR "Integrated OpenCL build is not yet available for MinGW") endif() +else() list( APPEND INTEGRATED_OPENCL_LIBRARIES - ${BOOST_LIBRARY}/libboost_filesystem-vc${MSVC_TOOLCHAIN_ID}-mt-x64-${BOOST_VERSION_UNDERSCORE}.lib - ) - list( - APPEND - INTEGRATED_OPENCL_LIBRARIES - ${BOOST_LIBRARY}/libboost_system-vc${MSVC_TOOLCHAIN_ID}-mt-x64-${BOOST_VERSION_UNDERSCORE}.lib - ) - list( - APPEND - INTEGRATED_OPENCL_LIBRARIES - ${BOOST_LIBRARY}/libboost_chrono-vc${MSVC_TOOLCHAIN_ID}-mt-x64-${BOOST_VERSION_UNDERSCORE}.lib + ${BOOST_LIBRARY}/libboost_filesystem.a + ${BOOST_LIBRARY}/libboost_system.a + ${BOOST_LIBRARY}/libboost_chrono.a ) -else() - message(FATAL_ERROR "Integrated OpenCL build is not yet available for MinGW") endif() set(BUILD_SHARED_LIBS ON CACHE BOOL "" FORCE) diff --git a/docker/README.md b/docker/README.md index ab9bd75ef919..7e9e3276dd33 100644 --- a/docker/README.md +++ b/docker/README.md @@ -2,6 +2,8 @@ This directory contains `Dockerfile`s to make it easy to build and run LightGBM via [Docker](https://www.docker.com/). +These builds of LightGBM all train on the CPU. For GPU-enabled builds, see [the gpu/ directory](./gpu). + ## Installing Docker Follow the general installation instructions [on the Docker site](https://docs.docker.com/install/): @@ -12,113 +14,183 @@ Follow the general installation instructions [on the Docker site](https://docs.d ## Using CLI Version of LightGBM via Docker -Build a Docker image with LightGBM CLI: +Build an image with the LightGBM CLI. -``` +```shell mkdir lightgbm-docker cd lightgbm-docker wget https://raw.githubusercontent.com/Microsoft/LightGBM/master/docker/dockerfile-cli -docker build -t lightgbm-cli -f dockerfile-cli . +docker build \ + -t lightgbm-cli \ + -f dockerfile-cli \ + . ``` -where `lightgbm-cli` is the desired Docker image name. +Once that completes, the built image can be used to run the CLI in a container. +To try it out, run the following. 
-Run the CLI from the container: +```shell +# configure the CLI +cat << EOF > train.conf +task = train +objective = binary +data = binary.train +num_trees = 10 +output_model = LightGBM-CLI-model.txt +EOF -``` -docker run --rm -it \ ---volume $HOME/lgbm.conf:/lgbm.conf \ ---volume $HOME/model.txt:/model.txt \ ---volume $HOME/tmp:/out \ -lightgbm-cli \ -config=lgbm.conf -``` - -In the above example, three volumes are [mounted](https://docs.docker.com/engine/reference/commandline/run/#mount-volume--v---read-only) -from the host machine to the Docker container: +# get training data +curl -O https://raw.githubusercontent.com/Microsoft/LightGBM/master/examples/binary_classification/binary.train -* `lgbm.conf` - task config, for example - -``` -app=multiclass -num_class=3 -task=convert_model -input_model=model.txt -convert_model=/out/predict.cpp -convert_model_language=cpp +# train, and save model to a text file +docker run \ + --rm \ + --volume "${PWD}":/opt/training \ + --workdir /opt/training \ + lightgbm-cli \ + config=train.conf ``` -* `model.txt` - an input file for the task, could be training data or, in this case, a pre-trained model. -* `out` - a directory to store the output of the task, notice that `convert_model` in the task config is using it. +After this runs, a LightGBM model can be found at `LightGBM-CLI-model.txt`. -`config=lgbm.conf` is a command-line argument passed to the `lightgbm` executable, more arguments can be passed if required. +For more details on how to configure and use the LightGBM CLI, see https://lightgbm.readthedocs.io/en/latest/Quick-Start.html. ## Running the Python-package Сontainer -Build the container, for Python users: +Build an image with the LightGBM Python package installed. -``` +```shell mkdir lightgbm-docker cd lightgbm-docker wget https://raw.githubusercontent.com/Microsoft/LightGBM/master/docker/dockerfile-python -docker build -t lightgbm -f dockerfile-python . +docker build \ + -t lightgbm-python \ + -f dockerfile-python \ + . ``` -After build finished, run the container: +Once that completes, the built image can be used to run LightGBM's Python package in a container. +Run the following to produce a model using the Python package. + +```shell +# get training data +curl -O https://raw.githubusercontent.com/Microsoft/LightGBM/master/examples/binary_classification/binary.train + +# create training script +cat << EOF > train.py +import lightgbm as lgb +import numpy as np +params = { + "objective": "binary", + "num_trees": 10 +} +bst = lgb.train( + train_set=lgb.Dataset("binary.train"), + params=params +) +bst.save_model("LightGBM-python-model.txt") +EOF + +# run training in a container +docker run \ + --rm \ + --volume "${PWD}":/opt/training \ + --workdir /opt/training \ + lightgbm-python \ + python train.py ``` -docker run --rm -it lightgbm + +After this runs, a LightGBM model can be found at `LightGBM-python-model.txt`. + +Or run an interactive Python session in a container. + +```shell +docker run \ + --rm \ + --volume "${PWD}":/opt/training \ + --workdir /opt/training \ + -it lightgbm-python \ + python ``` ## Running the R-package Сontainer -Build the container based on the [`verse` Rocker image](https://www.rocker-project.org/images/), for R users: +Build an image with the LightGBM R package installed. -``` +```shell mkdir lightgbm-docker cd lightgbm-docker wget https://raw.githubusercontent.com/Microsoft/LightGBM/master/docker/dockerfile-r -docker build -t lightgbm-r -f dockerfile-r . -``` - -This will default to the latest version of R. 
If you want to try with an older `rocker` container to run a particular version of R, pass in a build arg with [a valid tag](https://hub.docker.com/r/rocker/verse/tags). -For example, to test with R 3.5: - -``` docker build \ - -t lightgbm-r-35 \ + -t lightgbm-r \ -f dockerfile-r \ - --build-arg R_VERSION=3.5 \ . ``` -After the build is finished you have two options to run the container: +Once that completes, the built image can be used to run LightGBM's R package in a container. +Run the following to produce a model using the R package. -1. Start [RStudio](https://www.rstudio.com/products/rstudio/), an interactive development environment, so that you can develop your analysis using LightGBM or simply try out the R package. You can open RStudio in your web browser. -2. Start a regular R session. +```shell +# get training data +curl -O https://raw.githubusercontent.com/Microsoft/LightGBM/master/examples/binary_classification/binary.train -In both cases you can simply call +# create training script +cat << EOF > train.R +library(lightgbm) +params <- list( + objective = "binary" + , num_trees = 10L +) -``` -library("lightgbm") +bst <- lgb.train( + data = lgb.Dataset("binary.train"), + params = params +) +lgb.save(bst, "LightGBM-R-model.txt") +EOF + +# run training in a container +docker run \ + --rm \ + --volume "${PWD}":/opt/training \ + --workdir /opt/training \ + lightgbm-r \ + Rscript train.R ``` -to load the installed LightGBM R package. +After this runs, a LightGBM model can be found at `LightGBM-R-model.txt`. -**RStudio** +Run the following to get an interactive R session in a container. +```shell +docker run \ + --rm \ + -it lightgbm-r \ + R ``` -docker run --rm -it -e PASSWORD=lightgbm -p 8787:8787 lightgbm-r + +To use [RStudio](https://www.rstudio.com/products/rstudio/), an interactive development environment, run the following. + +```shell +docker run \ + --rm \ + --env PASSWORD="lightgbm" \ + -p 8787:8787 \ + lightgbm-r ``` -Open the browser at http://localhost:8787 and log in. -See the [`rocker/rstudio`](https://hub.docker.com/r/rocker/rstudio) image documentation for further configuration options. +Then navigate to `localhost:8787` in your local web browser, and log in with username `rstudio` and password `lightgbm`. -**Regular R** +To target a different R version, pass any [valid rocker/verse tag](https://hub.docker.com/r/rocker/verse/tags) to `docker build`. -If you just want a vanilla R process, change the executable of the container: +For example, to test LightGBM with R 3.5: -``` -docker run --rm -it lightgbm-r R +```shell +docker build \ + -t lightgbm-r-35 \ + -f dockerfile-r \ + --build-arg R_VERSION=3.5 \ + . ``` diff --git a/docker/dockerfile-cli b/docker/dockerfile-cli index e033f1c51de5..dc1972ab7df4 100644 --- a/docker/dockerfile-cli +++ b/docker/dockerfile-cli @@ -1,22 +1,32 @@ -FROM ubuntu:16.04 +FROM ubuntu:20.04 -RUN apt-get update && \ +ENV \ + DEBIAN_FRONTEND=noninteractive \ + LANG=C.UTF-8 \ + LC_ALL=C.UTF-8 + +RUN apt-get update -y && \ apt-get install -y --no-install-recommends \ ca-certificates \ cmake \ build-essential \ gcc \ g++ \ - git && \ + git \ + libomp-dev && \ rm -rf /var/lib/apt/lists/* -RUN git clone --recursive --branch stable --depth 1 https://github.com/Microsoft/LightGBM && \ +RUN git clone \ + --recursive \ + --branch stable \ + --depth 1 \ + https://github.com/Microsoft/LightGBM && \ mkdir LightGBM/build && \ cd LightGBM/build && \ cmake .. && \ make -j4 && \ make install && \ - cd ../.. 
&& \ + cd "${HOME}" && \ rm -rf LightGBM ENTRYPOINT ["lightgbm"] diff --git a/docker/dockerfile-python b/docker/dockerfile-python index 3e473f5e9686..6c5ca6501ac3 100644 --- a/docker/dockerfile-python +++ b/docker/dockerfile-python @@ -1,7 +1,12 @@ -FROM ubuntu:16.04 +FROM ubuntu:20.04 ARG CONDA_DIR=/opt/miniforge -ENV PATH $CONDA_DIR/bin:$PATH + +ENV \ + DEBIAN_FRONTEND=noninteractive \ + LANG=C.UTF-8 \ + LC_ALL=C.UTF-8 \ + PATH=$CONDA_DIR/bin:$PATH RUN apt-get update && \ apt-get install -y --no-install-recommends \ @@ -11,7 +16,8 @@ RUN apt-get update && \ gcc \ g++ \ curl \ - git && \ + git \ + libomp-dev && \ # python environment curl -sL https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh -o miniforge.sh && \ /bin/bash miniforge.sh -f -b -p $CONDA_DIR && \ diff --git a/docker/dockerfile-r b/docker/dockerfile-r index 1f173cdcdafc..65c20165f83e 100644 --- a/docker/dockerfile-r +++ b/docker/dockerfile-r @@ -1,12 +1,16 @@ ARG R_VERSION=latest FROM rocker/verse:${R_VERSION} -WORKDIR /lgbm - RUN apt-get update && \ apt-get install -y --no-install-recommends \ build-essential \ - cmake && \ - git clone --recursive --branch stable --depth 1 https://github.com/Microsoft/LightGBM && \ - cd LightGBM && \ - Rscript build_r.R + libomp-dev && \ + git clone \ + --recursive \ + --branch stable \ + --depth 1 https://github.com/Microsoft/LightGBM && \ + cd ./LightGBM && \ + sh build-cran-package.sh --no-build-vignettes && \ + R CMD INSTALL ./lightgbm_*.tar.gz && \ + cd .. && \ + rm -rf ./LightGBM diff --git a/docs/Development-Guide.rst b/docs/Development-Guide.rst index c8b30173da79..6c4819e45209 100644 --- a/docs/Development-Guide.rst +++ b/docs/Development-Guide.rst @@ -19,7 +19,7 @@ Important Classes +-------------------------+----------------------------------------------------------------------------------------+ | ``Bin`` | Data structure used for storing feature discrete values (converted from float values) | +-------------------------+----------------------------------------------------------------------------------------+ -| ``Boosting`` | Boosting interface (GBDT, DART, GOSS, etc.) | +| ``Boosting`` | Boosting interface (GBDT, DART, etc.) | +-------------------------+----------------------------------------------------------------------------------------+ | ``Config`` | Stores parameters and configurations | +-------------------------+----------------------------------------------------------------------------------------+ diff --git a/docs/Parameters.rst b/docs/Parameters.rst index c4196cca7a65..4ac77d407ba6 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -127,7 +127,7 @@ Core Parameters - label should be ``int`` type, and larger number represents the higher relevance (e.g. 
0:bad, 1:fair, 2:good, 3:perfect) -- ``boosting`` :raw-html:`🔗︎`, default = ``gbdt``, type = enum, options: ``gbdt``, ``rf``, ``dart``, ``goss``, aliases: ``boosting_type``, ``boost`` +- ``boosting`` :raw-html:`🔗︎`, default = ``gbdt``, type = enum, options: ``gbdt``, ``rf``, ``dart``, aliases: ``boosting_type``, ``boost`` - ``gbdt``, traditional Gradient Boosting Decision Tree, aliases: ``gbrt`` @@ -135,10 +135,16 @@ Core Parameters - ``dart``, `Dropouts meet Multiple Additive Regression Trees `__ - - ``goss``, Gradient-based One-Side Sampling - - **Note**: internally, LightGBM uses ``gbdt`` mode for the first ``1 / learning_rate`` iterations +- ``data_sample_strategy`` :raw-html:`🔗︎`, default = ``bagging``, type = enum, options: ``bagging``, ``goss`` + + - ``bagging``, Randomly Bagging Sampling + + - **Note**: ``bagging`` is only effective when ``bagging_freq > 0`` and ``bagging_fraction < 1.0`` + + - ``goss``, Gradient-based One-Side Sampling + - ``data`` :raw-html:`🔗︎`, default = ``""``, type = string, aliases: ``train``, ``train_data``, ``train_data_file``, ``data_filename`` - path of training data, LightGBM will train from this data @@ -268,7 +274,7 @@ Learning Control Parameters - ``num_threads`` is relatively small, e.g. ``<= 16`` - - you want to use small ``bagging_fraction`` or ``goss`` boosting to speed up + - you want to use small ``bagging_fraction`` or ``goss`` sample strategy to speed up - **Note**: setting this to ``true`` will double the memory cost for Dataset object. If you have not enough memory, you can try setting ``force_col_wise=true`` diff --git a/helpers/check_dynamic_dependencies.py b/helpers/check_dynamic_dependencies.py index bd22672466ed..5372356a6ce3 100644 --- a/helpers/check_dynamic_dependencies.py +++ b/helpers/check_dynamic_dependencies.py @@ -25,7 +25,7 @@ def check_dependencies(objdump_string: str) -> None: objdump_string : str The dynamic symbol table entries of the file (result of `objdump -T` command). 
""" - GLIBC_version = re.compile(r'0{16}[ \t]+GLIBC_(\d{1,2})[.](\d{1,3})[.]?\d{,3}[ \t]+') + GLIBC_version = re.compile(r'0{16}[ \(\t]+GLIBC_(\d{1,2})[.](\d{1,3})[.]?\d{,3}[ \)\t]+') versions = GLIBC_version.findall(objdump_string) assert len(versions) > 1 for major, minor in versions: @@ -33,7 +33,7 @@ def check_dependencies(objdump_string: str) -> None: assert int(major) <= 2, error_msg assert int(minor) <= 28, error_msg - GLIBCXX_version = re.compile(r'0{16}[ \t]+GLIBCXX_(\d{1,2})[.](\d{1,2})[.]?(\d{,3})[ \t]+') + GLIBCXX_version = re.compile(r'0{16}[ \(\t]+GLIBCXX_(\d{1,2})[.](\d{1,2})[.]?(\d{,3})[ \)\t]+') versions = GLIBCXX_version.findall(objdump_string) assert len(versions) > 1 for major, minor, patch in versions: @@ -42,7 +42,7 @@ def check_dependencies(objdump_string: str) -> None: assert int(minor) == 4, error_msg assert patch == '' or int(patch) <= 22, error_msg - GOMP_version = re.compile(r'0{16}[ \t]+G?OMP_(\d{1,2})[.](\d{1,2})[.]?\d{,3}[ \t]+') + GOMP_version = re.compile(r'0{16}[ \(\t]+G?OMP_(\d{1,2})[.](\d{1,2})[.]?\d{,3}[ \)\t]+') versions = GOMP_version.findall(objdump_string) assert len(versions) > 1 for major, minor in versions: diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 2de8c984f70b..4456d19b4da3 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -153,14 +153,21 @@ struct Config { // [doc-only] // type = enum // alias = boosting_type, boost - // options = gbdt, rf, dart, goss + // options = gbdt, rf, dart // desc = ``gbdt``, traditional Gradient Boosting Decision Tree, aliases: ``gbrt`` // desc = ``rf``, Random Forest, aliases: ``random_forest`` // desc = ``dart``, `Dropouts meet Multiple Additive Regression Trees `__ - // desc = ``goss``, Gradient-based One-Side Sampling // descl2 = **Note**: internally, LightGBM uses ``gbdt`` mode for the first ``1 / learning_rate`` iterations std::string boosting = "gbdt"; + // [doc-only] + // type = enum + // options = bagging, goss + // desc = ``bagging``, Randomly Bagging Sampling + // descl2 = **Note**: ``bagging`` is only effective when ``bagging_freq > 0`` and ``bagging_fraction < 1.0`` + // desc = ``goss``, Gradient-based One-Side Sampling + std::string data_sample_strategy = "bagging"; + // alias = train, train_data, train_data_file, data_filename // desc = path of training data, LightGBM will train from this data // desc = **Note**: can be used only in CLI version @@ -263,7 +270,7 @@ struct Config { // desc = enabling this is recommended when: // descl2 = the number of data points is large, and the total number of bins is relatively small // descl2 = ``num_threads`` is relatively small, e.g. ``<= 16`` - // descl2 = you want to use small ``bagging_fraction`` or ``goss`` boosting to speed up + // descl2 = you want to use small ``bagging_fraction`` or ``goss`` sample strategy to speed up // desc = **Note**: setting this to ``true`` will double the memory cost for Dataset object. If you have not enough memory, you can try setting ``force_col_wise=true`` // desc = **Note**: when both ``force_col_wise`` and ``force_row_wise`` are ``false``, LightGBM will firstly try them both, and then use the faster one. 
To remove the overhead of testing set the faster one to ``true`` manually // desc = **Note**: this parameter cannot be used at the same time with ``force_col_wise``, choose only one of them diff --git a/include/LightGBM/cuda/cuda_metric.hpp b/include/LightGBM/cuda/cuda_metric.hpp new file mode 100644 index 000000000000..caeff267e8ef --- /dev/null +++ b/include/LightGBM/cuda/cuda_metric.hpp @@ -0,0 +1,41 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. + */ + +#ifndef LIGHTGBM_CUDA_CUDA_METRIC_HPP_ +#define LIGHTGBM_CUDA_CUDA_METRIC_HPP_ + +#ifdef USE_CUDA_EXP + +#include + +namespace LightGBM { + +template +class CUDAMetricInterface: public HOST_METRIC { + public: + explicit CUDAMetricInterface(const Config& config): HOST_METRIC(config) { + cuda_labels_ = nullptr; + cuda_weights_ = nullptr; + } + + void Init(const Metadata& metadata, data_size_t num_data) override { + HOST_METRIC::Init(metadata, num_data); + cuda_labels_ = metadata.cuda_metadata()->cuda_label(); + cuda_weights_ = metadata.cuda_metadata()->cuda_weights(); + } + + bool IsCUDAMetric() const { return true; } + + protected: + const label_t* cuda_labels_; + const label_t* cuda_weights_; +}; + +} // namespace LightGBM + +#endif // USE_CUDA_EXP + +#endif // LIGHTGBM_CUDA_CUDA_METRIC_HPP_ diff --git a/include/LightGBM/cuda/cuda_objective_function.hpp b/include/LightGBM/cuda/cuda_objective_function.hpp index fbcad87a57b6..1010895e9d7a 100644 --- a/include/LightGBM/cuda/cuda_objective_function.hpp +++ b/include/LightGBM/cuda/cuda_objective_function.hpp @@ -31,14 +31,8 @@ class CUDAObjectiveInterface: public HOST_OBJECTIVE { cuda_weights_ = metadata.cuda_metadata()->cuda_weights(); } - virtual void ConvertOutputCUDA(const data_size_t num_data, const double* input, double* output) const { - LaunchConvertOutputCUDAKernel(num_data, input, output); - } - - std::function GetCUDAConvertOutputFunc() const override { - return [this] (data_size_t num_data, const double* input, double* output) { - ConvertOutputCUDA(num_data, input, output); - }; + virtual const double* ConvertOutputCUDA(const data_size_t num_data, const double* input, double* output) const { + return LaunchConvertOutputCUDAKernel(num_data, input, output); } double BoostFromScore(int class_id) const override { @@ -67,7 +61,7 @@ class CUDAObjectiveInterface: public HOST_OBJECTIVE { return HOST_OBJECTIVE::BoostFromScore(class_id); } - virtual void LaunchConvertOutputCUDAKernel(const data_size_t /*num_data*/, const double* /*input*/, double* /*output*/) const {} + virtual const double* LaunchConvertOutputCUDAKernel(const data_size_t /*num_data*/, const double* input, double* /*output*/) const { return input; } virtual void LaunchRenewTreeOutputCUDAKernel( const double* /*score*/, const data_size_t* /*data_indices_in_leaf*/, const data_size_t* /*num_data_in_leaf*/, diff --git a/include/LightGBM/cuda/cuda_tree.hpp b/include/LightGBM/cuda/cuda_tree.hpp index 9d89dc3b7465..d557798270e0 100644 --- a/include/LightGBM/cuda/cuda_tree.hpp +++ b/include/LightGBM/cuda/cuda_tree.hpp @@ -77,6 +77,8 @@ class CUDATree : public Tree { const data_size_t* used_data_indices, data_size_t num_data, double* score) const override; + inline void AsConstantTree(double val) override; + const int* cuda_leaf_parent() const { return cuda_leaf_parent_; } const int* cuda_left_child() const { return cuda_left_child_; } diff --git a/include/LightGBM/cuda/cuda_utils.h 
b/include/LightGBM/cuda/cuda_utils.h index 797e9f1b44d5..d5b94bc89e4a 100644 --- a/include/LightGBM/cuda/cuda_utils.h +++ b/include/LightGBM/cuda/cuda_utils.h @@ -10,10 +10,10 @@ #include #include #include +#include #endif // USE_CUDA || USE_CUDA_EXP #ifdef USE_CUDA_EXP -#include #include #endif // USE_CUDA_EXP @@ -119,8 +119,12 @@ class CUDAVector { } void Resize(size_t size) { + if (size == size_) { + return; + } if (size == 0) { Clear(); + return; } T* new_data = nullptr; AllocateCUDAMemory(&new_data, size, __FILE__, __LINE__); diff --git a/include/LightGBM/objective_function.h b/include/LightGBM/objective_function.h index ff648b1f5fc3..376a6f1a071d 100644 --- a/include/LightGBM/objective_function.h +++ b/include/LightGBM/objective_function.h @@ -99,11 +99,14 @@ class ObjectiveFunction { #ifdef USE_CUDA_EXP /*! - * \brief Get output convert function for CUDA version + * \brief Convert output for CUDA version */ - virtual std::function GetCUDAConvertOutputFunc() const { - return [] (data_size_t, const double*, double*) {}; + virtual const double* ConvertOutputCUDA(data_size_t /*num_data*/, const double* input, double* /*output*/) const { + return input; } + + virtual bool NeedConvertOutputCUDA () const { return false; } + #endif // USE_CUDA_EXP }; diff --git a/include/LightGBM/sample_strategy.h b/include/LightGBM/sample_strategy.h new file mode 100644 index 000000000000..765632f7ecbf --- /dev/null +++ b/include/LightGBM/sample_strategy.h @@ -0,0 +1,83 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. + */ + +#ifndef LIGHTGBM_SAMPLE_STRATEGY_H_ +#define LIGHTGBM_SAMPLE_STRATEGY_H_ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace LightGBM { + +class SampleStrategy { + public: + SampleStrategy() : balanced_bagging_(false), bagging_runner_(0, bagging_rand_block_), need_resize_gradients_(false) {} + + virtual ~SampleStrategy() {} + + static SampleStrategy* CreateSampleStrategy(const Config* config, const Dataset* train_data, const ObjectiveFunction* objective_function, int num_tree_per_iteration); + + virtual void Bagging(int iter, TreeLearner* tree_learner, score_t* gradients, score_t* hessians) = 0; + + virtual void ResetSampleConfig(const Config* config, bool is_change_dataset) = 0; + + bool is_use_subset() const { return is_use_subset_; } + + data_size_t bag_data_cnt() const { return bag_data_cnt_; } + + std::vector>& bag_data_indices() { return bag_data_indices_; } + + #ifdef USE_CUDA_EXP + CUDAVector& cuda_bag_data_indices() { return cuda_bag_data_indices_; } + #endif // USE_CUDA_EXP + + void UpdateObjectiveFunction(const ObjectiveFunction* objective_function) { + objective_function_ = objective_function; + } + + void UpdateTrainingData(const Dataset* train_data) { + train_data_ = train_data; + num_data_ = train_data->num_data(); + } + + virtual bool IsHessianChange() const = 0; + + bool NeedResizeGradients() const { return need_resize_gradients_; } + + protected: + const Config* config_; + const Dataset* train_data_; + const ObjectiveFunction* objective_function_; + std::vector> bag_data_indices_; + data_size_t bag_data_cnt_; + data_size_t num_data_; + int num_tree_per_iteration_; + std::unique_ptr tmp_subset_; + bool is_use_subset_; + bool balanced_bagging_; + const int bagging_rand_block_ = 1024; + std::vector bagging_rands_; + ParallelPartitionRunner bagging_runner_; + /*! 
\brief whether need to resize the gradient vectors */ + bool need_resize_gradients_; + + #ifdef USE_CUDA_EXP + /*! \brief Buffer for bag_data_indices_ on GPU, used only with cuda_exp */ + CUDAVector cuda_bag_data_indices_; + #endif // USE_CUDA_EXP +}; + +} // namespace LightGBM + +#endif // LIGHTGBM_SAMPLE_STRATEGY_H_ diff --git a/include/LightGBM/tree.h b/include/LightGBM/tree.h index 6ff0370e2ea6..3e403b16e89b 100644 --- a/include/LightGBM/tree.h +++ b/include/LightGBM/tree.h @@ -228,7 +228,7 @@ class Tree { shrinkage_ = 1.0f; } - inline void AsConstantTree(double val) { + virtual inline void AsConstantTree(double val) { num_leaves_ = 1; shrinkage_ = 1.0f; leaf_value_[0] = val; diff --git a/python-package/README.rst b/python-package/README.rst index 048d4dc0346e..4494eb43a172 100644 --- a/python-package/README.rst +++ b/python-package/README.rst @@ -22,7 +22,7 @@ Install from `PyPI `_ You may need to install `wheel `_ via ``pip install wheel`` first. -Compiled library that is included in the wheel file supports both **GPU** and **CPU** versions out of the box. This feature is experimental and available only for **Windows** currently. To use **GPU** version you only need to install OpenCL Runtime libraries. For NVIDIA and AMD GPU they are included in the ordinary drivers for your graphics card, so no action is required. If you would like your AMD or Intel CPU to act like a GPU (for testing and debugging) you can install `AMD APP SDK `_. +Compiled library that is included in the wheel file supports both **GPU** and **CPU** versions out of the box. This feature is experimental and available only for **Windows** and **Linux** currently. To use **GPU** version you only need to install OpenCL Runtime libraries. For NVIDIA and AMD GPU they are included in the ordinary drivers for your graphics card, so no action is required. If you would like your AMD or Intel CPU to act like a GPU (for testing and debugging) you can install `AMD APP SDK `_ on **Windows** and `PoCL `_ on **Linux**. Many modern Linux distributions provide packages for PoCL, look for ``pocl-opencl-icd`` on Debian-based distributions and ``pocl`` on RedHat-based distributions. For **Windows** users, `VC runtime `_ is needed if **Visual Studio** (2015 or newer) is not installed. 
diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index be640af22ec2..10e4710bcd4d 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -304,7 +304,7 @@ def _c_array(ctype: type, values: List[Any]) -> ctypes.Array: return (ctype * len(values))(*values) -def json_default_with_numpy(obj: Any) -> Any: +def _json_default_with_numpy(obj: Any) -> Any: """Convert numpy classes to JSON serializable objects.""" if isinstance(obj, (np.integer, np.floating, np.bool_)): return obj.item() @@ -314,7 +314,7 @@ def json_default_with_numpy(obj: Any) -> Any: return obj -def param_dict_to_str(data: Optional[Dict[str, Any]]) -> str: +def _param_dict_to_str(data: Optional[Dict[str, Any]]) -> str: """Convert Python dictionary to string, which is passed to C API.""" if data is None or not data: return "" @@ -465,43 +465,47 @@ def _choose_param_value(main_param_name: str, params: Dict[str, Any], default_va return params -MAX_INT32 = (1 << 31) - 1 +_MAX_INT32 = (1 << 31) - 1 """Macro definition of data type in C API of LightGBM""" -C_API_DTYPE_FLOAT32 = 0 -C_API_DTYPE_FLOAT64 = 1 -C_API_DTYPE_INT32 = 2 -C_API_DTYPE_INT64 = 3 +_C_API_DTYPE_FLOAT32 = 0 +_C_API_DTYPE_FLOAT64 = 1 +_C_API_DTYPE_INT32 = 2 +_C_API_DTYPE_INT64 = 3 """Matrix is row major in Python""" -C_API_IS_ROW_MAJOR = 1 +_C_API_IS_ROW_MAJOR = 1 """Macro definition of prediction type in C API of LightGBM""" -C_API_PREDICT_NORMAL = 0 -C_API_PREDICT_RAW_SCORE = 1 -C_API_PREDICT_LEAF_INDEX = 2 -C_API_PREDICT_CONTRIB = 3 +_C_API_PREDICT_NORMAL = 0 +_C_API_PREDICT_RAW_SCORE = 1 +_C_API_PREDICT_LEAF_INDEX = 2 +_C_API_PREDICT_CONTRIB = 3 """Macro definition of sparse matrix type""" -C_API_MATRIX_TYPE_CSR = 0 -C_API_MATRIX_TYPE_CSC = 1 +_C_API_MATRIX_TYPE_CSR = 0 +_C_API_MATRIX_TYPE_CSC = 1 """Macro definition of feature importance type""" -C_API_FEATURE_IMPORTANCE_SPLIT = 0 -C_API_FEATURE_IMPORTANCE_GAIN = 1 +_C_API_FEATURE_IMPORTANCE_SPLIT = 0 +_C_API_FEATURE_IMPORTANCE_GAIN = 1 """Data type of data field""" -FIELD_TYPE_MAPPER = {"label": C_API_DTYPE_FLOAT32, - "weight": C_API_DTYPE_FLOAT32, - "init_score": C_API_DTYPE_FLOAT64, - "group": C_API_DTYPE_INT32} +_FIELD_TYPE_MAPPER = { + "label": _C_API_DTYPE_FLOAT32, + "weight": _C_API_DTYPE_FLOAT32, + "init_score": _C_API_DTYPE_FLOAT64, + "group": _C_API_DTYPE_INT32 +} """String name to int feature importance type mapper""" -FEATURE_IMPORTANCE_TYPE_MAPPER = {"split": C_API_FEATURE_IMPORTANCE_SPLIT, - "gain": C_API_FEATURE_IMPORTANCE_GAIN} +_FEATURE_IMPORTANCE_TYPE_MAPPER = { + "split": _C_API_FEATURE_IMPORTANCE_SPLIT, + "gain": _C_API_FEATURE_IMPORTANCE_GAIN +} -def convert_from_sliced_object(data): +def _convert_from_sliced_object(data): """Fix the memory of multi-dimensional sliced object.""" if isinstance(data, np.ndarray) and isinstance(data.base, np.ndarray): if not data.flags.c_contiguous: @@ -511,19 +515,19 @@ def convert_from_sliced_object(data): return data -def c_float_array(data): +def _c_float_array(data): """Get pointer of float numpy array / list.""" if _is_1d_list(data): data = np.array(data, copy=False) if _is_numpy_1d_array(data): - data = convert_from_sliced_object(data) + data = _convert_from_sliced_object(data) assert data.flags.c_contiguous if data.dtype == np.float32: ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_float)) - type_data = C_API_DTYPE_FLOAT32 + type_data = _C_API_DTYPE_FLOAT32 elif data.dtype == np.float64: ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_double)) - type_data = C_API_DTYPE_FLOAT64 + 
type_data = _C_API_DTYPE_FLOAT64 else: raise TypeError(f"Expected np.float32 or np.float64, met type({data.dtype})") else: @@ -531,19 +535,19 @@ def c_float_array(data): return (ptr_data, type_data, data) # return `data` to avoid the temporary copy is freed -def c_int_array(data): +def _c_int_array(data): """Get pointer of int numpy array / list.""" if _is_1d_list(data): data = np.array(data, copy=False) if _is_numpy_1d_array(data): - data = convert_from_sliced_object(data) + data = _convert_from_sliced_object(data) assert data.flags.c_contiguous if data.dtype == np.int32: ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)) - type_data = C_API_DTYPE_INT32 + type_data = _C_API_DTYPE_INT32 elif data.dtype == np.int64: ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_int64)) - type_data = C_API_DTYPE_INT64 + type_data = _C_API_DTYPE_INT64 else: raise TypeError(f"Expected np.int32 or np.int64, met type({data.dtype})") else: @@ -602,7 +606,16 @@ def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorica df_dtypes = [dtype.type for dtype in data.dtypes] df_dtypes.append(np.float32) # so that the target dtype considers floats target_dtype = np.find_common_type(df_dtypes, []) - data = data.astype(target_dtype, copy=False).values + try: + # most common case (no nullable dtypes) + data = data.to_numpy(dtype=target_dtype, copy=False) + except TypeError: + # 1.0 <= pd version < 1.1 and nullable dtypes, least common case + # raises error because array is casted to type(pd.NA) and there's no na_value argument + data = data.astype(target_dtype, copy=False).values + except ValueError: + # data has nullable dtypes, but we can specify na_value argument and copy will be made + data = data.to_numpy(dtype=target_dtype, na_value=np.nan) else: if feature_name == 'auto': feature_name = None @@ -612,7 +625,7 @@ def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorica def _dump_pandas_categorical(pandas_categorical, file_name=None): - categorical_json = json.dumps(pandas_categorical, default=json_default_with_numpy) + categorical_json = json.dumps(pandas_categorical, default=_json_default_with_numpy) pandas_str = f'\npandas_categorical:{categorical_json}\n' if file_name is not None: with open(file_name, 'a') as f: @@ -773,7 +786,7 @@ def __init__( raise TypeError('Need model_file or booster_handle to create a predictor') pred_parameter = {} if pred_parameter is None else pred_parameter - self.pred_parameter = param_dict_to_str(pred_parameter) + self.pred_parameter = _param_dict_to_str(pred_parameter) def __del__(self) -> None: try: @@ -842,13 +855,13 @@ def predict( ) ) data = _data_from_pandas(data, None, None, self.pandas_categorical)[0] - predict_type = C_API_PREDICT_NORMAL + predict_type = _C_API_PREDICT_NORMAL if raw_score: - predict_type = C_API_PREDICT_RAW_SCORE + predict_type = _C_API_PREDICT_RAW_SCORE if pred_leaf: - predict_type = C_API_PREDICT_LEAF_INDEX + predict_type = _C_API_PREDICT_LEAF_INDEX if pred_contrib: - predict_type = C_API_PREDICT_CONTRIB + predict_type = _C_API_PREDICT_CONTRIB int_data_has_header = 1 if data_has_header else 0 if isinstance(data, (str, Path)): @@ -897,9 +910,9 @@ def predict( def __get_num_preds(self, start_iteration, num_iteration, nrow, predict_type): """Get size of prediction result.""" - if nrow > MAX_INT32: + if nrow > _MAX_INT32: raise LightGBMError('LightGBM cannot perform prediction for data ' - f'with number of rows greater than MAX_INT32 ({MAX_INT32}).\n' + f'with number of rows greater than MAX_INT32 
({_MAX_INT32}).\n' 'You can split your data into chunks ' 'and then concatenate predictions for them') n_preds = ctypes.c_int64(0) @@ -922,7 +935,7 @@ def inner_predict(mat, start_iteration, num_iteration, predict_type, preds=None) data = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False) else: # change non-float data to float data, need to copy data = np.array(mat.reshape(mat.size), dtype=np.float32) - ptr_data, type_ptr_data, _ = c_float_array(data) + ptr_data, type_ptr_data, _ = _c_float_array(data) n_preds = self.__get_num_preds(start_iteration, num_iteration, mat.shape[0], predict_type) if preds is None: preds = np.empty(n_preds, dtype=np.float64) @@ -935,7 +948,7 @@ def inner_predict(mat, start_iteration, num_iteration, predict_type, preds=None) ctypes.c_int(type_ptr_data), ctypes.c_int32(mat.shape[0]), ctypes.c_int32(mat.shape[1]), - ctypes.c_int(C_API_IS_ROW_MAJOR), + ctypes.c_int(_C_API_IS_ROW_MAJOR), ctypes.c_int(predict_type), ctypes.c_int(start_iteration), ctypes.c_int(num_iteration), @@ -947,8 +960,8 @@ def inner_predict(mat, start_iteration, num_iteration, predict_type, preds=None) return preds, mat.shape[0] nrow = mat.shape[0] - if nrow > MAX_INT32: - sections = np.arange(start=MAX_INT32, stop=nrow, step=MAX_INT32) + if nrow > _MAX_INT32: + sections = np.arange(start=_MAX_INT32, stop=nrow, step=_MAX_INT32) # __get_num_preds() cannot work with nrow > MAX_INT32, so calculate overall number of predictions piecemeal n_preds = [self.__get_num_preds(start_iteration, num_iteration, i, predict_type) for i in np.diff([0] + list(sections) + [nrow])] n_preds_sections = np.array([0] + n_preds, dtype=np.intp).cumsum() @@ -966,15 +979,15 @@ def __create_sparse_native(self, cs, out_shape, out_ptr_indptr, out_ptr_indices, # create numpy array from output arrays data_indices_len = out_shape[0] indptr_len = out_shape[1] - if indptr_type == C_API_DTYPE_INT32: + if indptr_type == _C_API_DTYPE_INT32: out_indptr = _cint32_array_to_numpy(out_ptr_indptr, indptr_len) - elif indptr_type == C_API_DTYPE_INT64: + elif indptr_type == _C_API_DTYPE_INT64: out_indptr = _cint64_array_to_numpy(out_ptr_indptr, indptr_len) else: raise TypeError("Expected int32 or int64 type for indptr") - if data_type == C_API_DTYPE_FLOAT32: + if data_type == _C_API_DTYPE_FLOAT32: out_data = _cfloat32_array_to_numpy(out_ptr_data, data_indices_len) - elif data_type == C_API_DTYPE_FLOAT64: + elif data_type == _C_API_DTYPE_FLOAT64: out_data = _cfloat64_array_to_numpy(out_ptr_data, data_indices_len) else: raise TypeError("Expected float32 or float64 type for data") @@ -1018,10 +1031,10 @@ def inner_predict(csr, start_iteration, num_iteration, predict_type, preds=None) raise ValueError("Wrong length of pre-allocated predict array") out_num_preds = ctypes.c_int64(0) - ptr_indptr, type_ptr_indptr, __ = c_int_array(csr.indptr) - ptr_data, type_ptr_data, _ = c_float_array(csr.data) + ptr_indptr, type_ptr_indptr, __ = _c_int_array(csr.indptr) + ptr_data, type_ptr_data, _ = _c_float_array(csr.data) - assert csr.shape[1] <= MAX_INT32 + assert csr.shape[1] <= _MAX_INT32 csr_indices = csr.indices.astype(np.int32, copy=False) _safe_call(_LIB.LGBM_BoosterPredictForCSR( @@ -1045,16 +1058,16 @@ def inner_predict(csr, start_iteration, num_iteration, predict_type, preds=None) return preds, nrow def inner_predict_sparse(csr, start_iteration, num_iteration, predict_type): - ptr_indptr, type_ptr_indptr, __ = c_int_array(csr.indptr) - ptr_data, type_ptr_data, _ = c_float_array(csr.data) + ptr_indptr, type_ptr_indptr, __ = 
_c_int_array(csr.indptr) + ptr_data, type_ptr_data, _ = _c_float_array(csr.data) csr_indices = csr.indices.astype(np.int32, copy=False) - matrix_type = C_API_MATRIX_TYPE_CSR - if type_ptr_indptr == C_API_DTYPE_INT32: + matrix_type = _C_API_MATRIX_TYPE_CSR + if type_ptr_indptr == _C_API_DTYPE_INT32: out_ptr_indptr = ctypes.POINTER(ctypes.c_int32)() else: out_ptr_indptr = ctypes.POINTER(ctypes.c_int64)() out_ptr_indices = ctypes.POINTER(ctypes.c_int32)() - if type_ptr_data == C_API_DTYPE_FLOAT32: + if type_ptr_data == _C_API_DTYPE_FLOAT32: out_ptr_data = ctypes.POINTER(ctypes.c_float)() else: out_ptr_data = ctypes.POINTER(ctypes.c_double)() @@ -1083,11 +1096,11 @@ def inner_predict_sparse(csr, start_iteration, num_iteration, predict_type): nrow = len(csr.indptr) - 1 return matrices, nrow - if predict_type == C_API_PREDICT_CONTRIB: + if predict_type == _C_API_PREDICT_CONTRIB: return inner_predict_sparse(csr, start_iteration, num_iteration, predict_type) nrow = len(csr.indptr) - 1 - if nrow > MAX_INT32: - sections = [0] + list(np.arange(start=MAX_INT32, stop=nrow, step=MAX_INT32)) + [nrow] + if nrow > _MAX_INT32: + sections = [0] + list(np.arange(start=_MAX_INT32, stop=nrow, step=_MAX_INT32)) + [nrow] # __get_num_preds() cannot work with nrow > MAX_INT32, so calculate overall number of predictions piecemeal n_preds = [self.__get_num_preds(start_iteration, num_iteration, i, predict_type) for i in np.diff(sections)] n_preds_sections = np.array([0] + n_preds, dtype=np.intp).cumsum() @@ -1103,16 +1116,16 @@ def inner_predict_sparse(csr, start_iteration, num_iteration, predict_type): def __pred_for_csc(self, csc, start_iteration, num_iteration, predict_type): """Predict for a CSC data.""" def inner_predict_sparse(csc, start_iteration, num_iteration, predict_type): - ptr_indptr, type_ptr_indptr, __ = c_int_array(csc.indptr) - ptr_data, type_ptr_data, _ = c_float_array(csc.data) + ptr_indptr, type_ptr_indptr, __ = _c_int_array(csc.indptr) + ptr_data, type_ptr_data, _ = _c_float_array(csc.data) csc_indices = csc.indices.astype(np.int32, copy=False) - matrix_type = C_API_MATRIX_TYPE_CSC - if type_ptr_indptr == C_API_DTYPE_INT32: + matrix_type = _C_API_MATRIX_TYPE_CSC + if type_ptr_indptr == _C_API_DTYPE_INT32: out_ptr_indptr = ctypes.POINTER(ctypes.c_int32)() else: out_ptr_indptr = ctypes.POINTER(ctypes.c_int64)() out_ptr_indices = ctypes.POINTER(ctypes.c_int32)() - if type_ptr_data == C_API_DTYPE_FLOAT32: + if type_ptr_data == _C_API_DTYPE_FLOAT32: out_ptr_data = ctypes.POINTER(ctypes.c_float)() else: out_ptr_data = ctypes.POINTER(ctypes.c_double)() @@ -1142,18 +1155,18 @@ def inner_predict_sparse(csc, start_iteration, num_iteration, predict_type): return matrices, nrow nrow = csc.shape[0] - if nrow > MAX_INT32: + if nrow > _MAX_INT32: return self.__pred_for_csr(csc.tocsr(), start_iteration, num_iteration, predict_type) - if predict_type == C_API_PREDICT_CONTRIB: + if predict_type == _C_API_PREDICT_CONTRIB: return inner_predict_sparse(csc, start_iteration, num_iteration, predict_type) n_preds = self.__get_num_preds(start_iteration, num_iteration, nrow, predict_type) preds = np.empty(n_preds, dtype=np.float64) out_num_preds = ctypes.c_int64(0) - ptr_indptr, type_ptr_indptr, __ = c_int_array(csc.indptr) - ptr_data, type_ptr_data, _ = c_float_array(csc.data) + ptr_indptr, type_ptr_indptr, __ = _c_int_array(csc.indptr) + ptr_data, type_ptr_data, _ = _c_float_array(csc.data) - assert csc.shape[0] <= MAX_INT32 + assert csc.shape[0] <= _MAX_INT32 csc_indices = csc.indices.astype(np.int32, copy=False) 
_safe_call(_LIB.LGBM_BoosterPredictForCSC( @@ -1290,10 +1303,10 @@ def _create_sample_indices(self, total_nrow: int) -> np.ndarray: indices : numpy array Indices for sampled data. """ - param_str = param_dict_to_str(self.get_params()) + param_str = _param_dict_to_str(self.get_params()) sample_cnt = _get_sample_count(total_nrow, param_str) indices = np.empty(sample_cnt, dtype=np.int32) - ptr_data, _, _ = c_int_array(indices) + ptr_data, _, _ = _c_int_array(indices) actual_sample_cnt = ctypes.c_int32(0) _safe_call(_LIB.LGBM_SampleIndices( @@ -1373,14 +1386,14 @@ def _init_from_sample( # each int* points to start of indices for each column indices_col_ptr = (ctypes.POINTER(ctypes.c_int32) * ncol)() for i in range(ncol): - sample_col_ptr[i] = c_float_array(sample_data[i])[0] - indices_col_ptr[i] = c_int_array(sample_indices[i])[0] + sample_col_ptr[i] = _c_float_array(sample_data[i])[0] + indices_col_ptr[i] = _c_int_array(sample_indices[i])[0] num_per_col = np.array([len(d) for d in sample_indices], dtype=np.int32) - num_per_col_ptr, _, _ = c_int_array(num_per_col) + num_per_col_ptr, _, _ = _c_int_array(num_per_col) self.handle = ctypes.c_void_p() - params_str = param_dict_to_str(self.get_params()) + params_str = _param_dict_to_str(self.get_params()) _safe_call(_LIB.LGBM_DatasetCreateFromSampledColumn( ctypes.cast(sample_col_ptr, ctypes.POINTER(ctypes.POINTER(ctypes.c_double))), ctypes.cast(indices_col_ptr, ctypes.POINTER(ctypes.POINTER(ctypes.c_int32))), @@ -1409,7 +1422,7 @@ def _push_rows(self, data: np.ndarray) -> 'Dataset': """ nrow, ncol = data.shape data = data.reshape(data.size) - data_ptr, data_type, _ = c_float_array(data) + data_ptr, data_type, _ = _c_float_array(data) _safe_call(_LIB.LGBM_DatasetPushRows( self.handle, @@ -1554,7 +1567,7 @@ def _lazy_init( params.pop(cat_alias, None) params['categorical_column'] = sorted(categorical_indices) - params_str = param_dict_to_str(params) + params_str = _param_dict_to_str(params) self.params = params # process for reference dataset ref_dataset = None @@ -1674,7 +1687,7 @@ def __init_from_seqs( if ref_dataset is not None: self._init_from_ref_dataset(total_nrow, ref_dataset) else: - param_str = param_dict_to_str(self.get_params()) + param_str = _param_dict_to_str(self.get_params()) sample_cnt = _get_sample_count(total_nrow, param_str) sample_data, col_indices = self.__sample(seqs, total_nrow) @@ -1704,13 +1717,13 @@ def __init_from_np2d( else: # change non-float data to float data, need to copy data = np.array(mat.reshape(mat.size), dtype=np.float32) - ptr_data, type_ptr_data, _ = c_float_array(data) + ptr_data, type_ptr_data, _ = _c_float_array(data) _safe_call(_LIB.LGBM_DatasetCreateFromMat( ptr_data, ctypes.c_int(type_ptr_data), ctypes.c_int32(mat.shape[0]), ctypes.c_int32(mat.shape[1]), - ctypes.c_int(C_API_IS_ROW_MAJOR), + ctypes.c_int(_C_API_IS_ROW_MAJOR), _c_str(params_str), ref_dataset, ctypes.byref(self.handle))) @@ -1747,7 +1760,7 @@ def __init_from_list_np2d( else: # change non-float data to float data, need to copy mats[i] = np.array(mat.reshape(mat.size), dtype=np.float32) - chunk_ptr_data, chunk_type_ptr_data, holder = c_float_array(mats[i]) + chunk_ptr_data, chunk_type_ptr_data, holder = _c_float_array(mats[i]) if type_ptr_data is not None and chunk_type_ptr_data != type_ptr_data: raise ValueError('Input chunks must have same type') ptr_data[i] = chunk_ptr_data @@ -1761,7 +1774,7 @@ def __init_from_list_np2d( ctypes.c_int(type_ptr_data), nrow.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)), ctypes.c_int32(ncol), - 
ctypes.c_int(C_API_IS_ROW_MAJOR), + ctypes.c_int(_C_API_IS_ROW_MAJOR), _c_str(params_str), ref_dataset, ctypes.byref(self.handle))) @@ -1778,10 +1791,10 @@ def __init_from_csr( raise ValueError(f'Length mismatch: {len(csr.indices)} vs {len(csr.data)}') self.handle = ctypes.c_void_p() - ptr_indptr, type_ptr_indptr, __ = c_int_array(csr.indptr) - ptr_data, type_ptr_data, _ = c_float_array(csr.data) + ptr_indptr, type_ptr_indptr, __ = _c_int_array(csr.indptr) + ptr_data, type_ptr_data, _ = _c_float_array(csr.data) - assert csr.shape[1] <= MAX_INT32 + assert csr.shape[1] <= _MAX_INT32 csr_indices = csr.indices.astype(np.int32, copy=False) _safe_call(_LIB.LGBM_DatasetCreateFromCSR( @@ -1809,10 +1822,10 @@ def __init_from_csc( raise ValueError(f'Length mismatch: {len(csc.indices)} vs {len(csc.data)}') self.handle = ctypes.c_void_p() - ptr_indptr, type_ptr_indptr, __ = c_int_array(csc.indptr) - ptr_data, type_ptr_data, _ = c_float_array(csc.data) + ptr_indptr, type_ptr_indptr, __ = _c_int_array(csc.indptr) + ptr_data, type_ptr_data, _ = _c_float_array(csc.data) - assert csc.shape[0] <= MAX_INT32 + assert csc.shape[0] <= _MAX_INT32 csc_indices = csc.indices.astype(np.int32, copy=False) _safe_call(_LIB.LGBM_DatasetCreateFromCSC( @@ -1902,7 +1915,7 @@ def construct(self) -> "Dataset": _, self.group = np.unique(np.repeat(range(len(group_info)), repeats=group_info)[self.used_indices], return_counts=True) self.handle = ctypes.c_void_p() - params_str = param_dict_to_str(self.params) + params_str = _param_dict_to_str(self.params) _safe_call(_LIB.LGBM_DatasetGetSubset( self.reference.construct().handle, used_indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)), @@ -2040,8 +2053,8 @@ def update(): update() elif params is not None: ret = _LIB.LGBM_DatasetUpdateParamChecking( - _c_str(param_dict_to_str(self.params)), - _c_str(param_dict_to_str(params))) + _c_str(_param_dict_to_str(self.params)), + _c_str(_param_dict_to_str(params))) if ret != 0: # could be updated if data is not freed if self.data is not None: @@ -2085,7 +2098,7 @@ def set_field( _c_str(field_name), None, ctypes.c_int(0), - ctypes.c_int(FIELD_TYPE_MAPPER[field_name]))) + ctypes.c_int(_FIELD_TYPE_MAPPER[field_name]))) return self if field_name == 'init_score': dtype = np.float64 @@ -2104,12 +2117,12 @@ def set_field( data = _list_to_1d_numpy(data, dtype, name=field_name) if data.dtype == np.float32 or data.dtype == np.float64: - ptr_data, type_data, _ = c_float_array(data) + ptr_data, type_data, _ = _c_float_array(data) elif data.dtype == np.int32: - ptr_data, type_data, _ = c_int_array(data) + ptr_data, type_data, _ = _c_int_array(data) else: raise TypeError(f"Expected np.float32/64 or np.int32, met type({data.dtype})") - if type_data != FIELD_TYPE_MAPPER[field_name]: + if type_data != _FIELD_TYPE_MAPPER[field_name]: raise TypeError("Input type error for set_field") _safe_call(_LIB.LGBM_DatasetSetField( self.handle, @@ -2144,15 +2157,15 @@ def get_field(self, field_name: str) -> Optional[np.ndarray]: ctypes.byref(tmp_out_len), ctypes.byref(ret), ctypes.byref(out_type))) - if out_type.value != FIELD_TYPE_MAPPER[field_name]: + if out_type.value != _FIELD_TYPE_MAPPER[field_name]: raise TypeError("Return type error for get_field") if tmp_out_len.value == 0: return None - if out_type.value == C_API_DTYPE_INT32: + if out_type.value == _C_API_DTYPE_INT32: arr = _cint32_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_int32)), tmp_out_len.value) - elif out_type.value == C_API_DTYPE_FLOAT32: + elif out_type.value == _C_API_DTYPE_FLOAT32: arr 
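
> Editor's note (not part of this diff): a small sketch of the user-facing calls that reach `__init_from_csr` and `set_field` above; the data is synthetic and the column count is well under the int32 limit asserted in the wrapper.

```python
import numpy as np
import scipy.sparse as sp
import lightgbm as lgb

X = sp.random(10_000, 100, density=0.01, format="csr", random_state=1)
y = np.random.rand(10_000)
ds = lgb.Dataset(X, label=y, params={"verbose": -1}).construct()
# set_field routes through LGBM_DatasetSetField with the dtype checks shown above.
ds.set_field("weight", np.ones(10_000, dtype=np.float32))
```
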
= _cfloat32_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_float)), tmp_out_len.value) - elif out_type.value == C_API_DTYPE_FLOAT64: + elif out_type.value == _C_API_DTYPE_FLOAT64: arr = _cfloat64_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_double)), tmp_out_len.value) else: raise TypeError("Unknown type") @@ -2291,7 +2304,17 @@ def set_label(self, label: Optional[_LGBM_LabelType]) -> "Dataset": if len(label.columns) > 1: raise ValueError('DataFrame for label cannot have multiple columns') _check_for_bad_pandas_dtypes(label.dtypes) - label_array = np.ravel(label.values.astype(np.float32, copy=False)) + try: + # most common case (no nullable dtypes) + label = label.to_numpy(dtype=np.float32, copy=False) + except TypeError: + # 1.0 <= pd version < 1.1 and nullable dtypes, least common case + # raises error because array is casted to type(pd.NA) and there's no na_value argument + label = label.astype(np.float32, copy=False).values + except ValueError: + # data has nullable dtypes, but we can specify na_value argument and copy will be made + label = label.to_numpy(dtype=np.float32, na_value=np.nan) + label_array = np.ravel(label) else: label_array = _list_to_1d_numpy(label, name='label') self.set_field('label', label_array) @@ -2775,7 +2798,7 @@ def __init__( train_set.construct() # copy the parameters from train_set params.update(train_set.get_params()) - params_str = param_dict_to_str(params) + params_str = _param_dict_to_str(params) self.handle = ctypes.c_void_p() _safe_call(_LIB.LGBM_BoosterCreate( train_set.handle, @@ -3148,7 +3171,7 @@ def reset_parameter(self, params: Dict[str, Any]) -> "Booster": self : Booster Booster with new parameters. """ - params_str = param_dict_to_str(params) + params_str = _param_dict_to_str(params) if params_str: _safe_call(_LIB.LGBM_BoosterResetParameter( self.handle, @@ -3518,7 +3541,7 @@ def save_model( """ if num_iteration is None: num_iteration = self.best_iteration - importance_type_int = FEATURE_IMPORTANCE_TYPE_MAPPER[importance_type] + importance_type_int = _FEATURE_IMPORTANCE_TYPE_MAPPER[importance_type] _safe_call(_LIB.LGBM_BoosterSaveModel( self.handle, ctypes.c_int(start_iteration), @@ -3612,7 +3635,7 @@ def model_to_string( """ if num_iteration is None: num_iteration = self.best_iteration - importance_type_int = FEATURE_IMPORTANCE_TYPE_MAPPER[importance_type] + importance_type_int = _FEATURE_IMPORTANCE_TYPE_MAPPER[importance_type] buffer_len = 1 << 20 tmp_out_len = ctypes.c_int64(0) string_buffer = ctypes.create_string_buffer(buffer_len) @@ -3680,7 +3703,7 @@ def dump_model( """ if num_iteration is None: num_iteration = self.best_iteration - importance_type_int = FEATURE_IMPORTANCE_TYPE_MAPPER[importance_type] + importance_type_int = _FEATURE_IMPORTANCE_TYPE_MAPPER[importance_type] buffer_len = 1 << 20 tmp_out_len = ctypes.c_int64(0) string_buffer = ctypes.create_string_buffer(buffer_len) @@ -3708,7 +3731,7 @@ def dump_model( ptr_string_buffer)) ret = json.loads(string_buffer.value.decode('utf-8'), object_hook=object_hook) ret['pandas_categorical'] = json.loads(json.dumps(self.pandas_categorical, - default=json_default_with_numpy)) + default=_json_default_with_numpy)) return ret def predict( @@ -3884,7 +3907,7 @@ def refit( new_booster.handle, predictor.handle)) leaf_preds = leaf_preds.reshape(-1) - ptr_data, _, _ = c_int_array(leaf_preds) + ptr_data, _, _ = _c_int_array(leaf_preds) _safe_call(_LIB.LGBM_BoosterRefit( new_booster.handle, ptr_data, @@ -4002,14 +4025,14 @@ def feature_importance( """ if iteration is None: 
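
> Editor's note (not part of this diff): the new `set_label` branches above make single-column DataFrame labels with pandas nullable dtypes convertible to float32. A minimal sketch with synthetic data; the column name is arbitrary.

```python
import numpy as np
import pandas as pd
import lightgbm as lgb

X = np.random.rand(100, 5)
# Label stored in a single-column DataFrame with a pandas nullable dtype.
y = pd.DataFrame({"target": pd.array(np.random.randint(0, 2, size=100), dtype="Int64")})
ds = lgb.Dataset(X, label=y).construct()  # label converted via the try/except branches above
```
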
iteration = self.best_iteration - importance_type_int = FEATURE_IMPORTANCE_TYPE_MAPPER[importance_type] + importance_type_int = _FEATURE_IMPORTANCE_TYPE_MAPPER[importance_type] result = np.empty(self.num_feature(), dtype=np.float64) _safe_call(_LIB.LGBM_BoosterFeatureImportance( self.handle, ctypes.c_int(iteration), ctypes.c_int(importance_type_int), result.ctypes.data_as(ctypes.POINTER(ctypes.c_double)))) - if importance_type_int == C_API_FEATURE_IMPORTANCE_SPLIT: + if importance_type_int == _C_API_FEATURE_IMPORTANCE_SPLIT: return result.astype(np.int32) else: return result diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py index 65c044ffc883..adbc5f62593e 100644 --- a/python-package/lightgbm/compat.py +++ b/python-package/lightgbm/compat.py @@ -122,6 +122,7 @@ class _LGBMRegressorBase: # type: ignore pass + _LGBMBaseCrossValidator = None _LGBMLabelEncoder = None LGBMNotFittedError = ValueError _LGBMStratifiedKFold = None diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py index c71ce6799c32..aaf8c35fa0fa 100644 --- a/python-package/lightgbm/dask.py +++ b/python-package/lightgbm/dask.py @@ -1042,6 +1042,8 @@ def _lgb_dask_fit( eval_at: Optional[Iterable[int]] = None, **kwargs: Any ) -> "_DaskLGBMModel": + if not DASK_INSTALLED: + raise LightGBMError('dask is required for lightgbm.dask') if not all((DASK_INSTALLED, PANDAS_INSTALLED, SKLEARN_INSTALLED)): raise LightGBMError('dask, pandas and scikit-learn are required for lightgbm.dask') diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index 05219eaf3d5e..fe75a8f5c827 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -382,7 +382,6 @@ def __init__( boosting_type : str, optional (default='gbdt') 'gbdt', traditional Gradient Boosting Decision Tree. 'dart', Dropouts meet Multiple Additive Regression Trees. - 'goss', Gradient-based One-Side Sampling. 'rf', Random Forest. num_leaves : int, optional (default=31) Maximum tree leaves for base learners. diff --git a/src/boosting/bagging.hpp b/src/boosting/bagging.hpp new file mode 100644 index 000000000000..65a937435105 --- /dev/null +++ b/src/boosting/bagging.hpp @@ -0,0 +1,209 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. 
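
> Editor's note (not part of this diff): as shown in the `feature_importance` hunk above, 'split' importances are cast to int32 while 'gain' importances stay float64. A short self-contained check on synthetic data.

```python
import numpy as np
import lightgbm as lgb

X = np.random.rand(500, 10)
y = np.random.rand(500)
booster = lgb.train({"objective": "regression", "verbose": -1},
                    lgb.Dataset(X, label=y), num_boost_round=10)
print(booster.feature_importance(importance_type="split").dtype)  # int32
print(booster.feature_importance(importance_type="gain").dtype)   # float64
```
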
+ */ + +#ifndef LIGHTGBM_BOOSTING_BAGGING_HPP_ +#define LIGHTGBM_BOOSTING_BAGGING_HPP_ + +#include + +namespace LightGBM { + +class BaggingSampleStrategy : public SampleStrategy { + public: + BaggingSampleStrategy(const Config* config, const Dataset* train_data, const ObjectiveFunction* objective_function, int num_tree_per_iteration) + : need_re_bagging_(false) { + config_ = config; + train_data_ = train_data; + num_data_ = train_data->num_data(); + objective_function_ = objective_function; + num_tree_per_iteration_ = num_tree_per_iteration; + } + + ~BaggingSampleStrategy() {} + + void Bagging(int iter, TreeLearner* tree_learner, score_t* /*gradients*/, score_t* /*hessians*/) override { + Common::FunctionTimer fun_timer("GBDT::Bagging", global_timer); + // if need bagging + if ((bag_data_cnt_ < num_data_ && iter % config_->bagging_freq == 0) || + need_re_bagging_) { + need_re_bagging_ = false; + auto left_cnt = bagging_runner_.Run( + num_data_, + [=](int, data_size_t cur_start, data_size_t cur_cnt, data_size_t* left, + data_size_t*) { + data_size_t cur_left_count = 0; + if (balanced_bagging_) { + cur_left_count = + BalancedBaggingHelper(cur_start, cur_cnt, left); + } else { + cur_left_count = BaggingHelper(cur_start, cur_cnt, left); + } + return cur_left_count; + }, + bag_data_indices_.data()); + bag_data_cnt_ = left_cnt; + Log::Debug("Re-bagging, using %d data to train", bag_data_cnt_); + // set bagging data to tree learner + if (!is_use_subset_) { + #ifdef USE_CUDA_EXP + if (config_->device_type == std::string("cuda_exp")) { + CopyFromHostToCUDADevice(cuda_bag_data_indices_.RawData(), bag_data_indices_.data(), static_cast(num_data_), __FILE__, __LINE__); + tree_learner->SetBaggingData(nullptr, cuda_bag_data_indices_.RawData(), bag_data_cnt_); + } else { + #endif // USE_CUDA_EXP + tree_learner->SetBaggingData(nullptr, bag_data_indices_.data(), bag_data_cnt_); + #ifdef USE_CUDA_EXP + } + #endif // USE_CUDA_EXP + } else { + // get subset + tmp_subset_->ReSize(bag_data_cnt_); + tmp_subset_->CopySubrow(train_data_, bag_data_indices_.data(), + bag_data_cnt_, false); + #ifdef USE_CUDA_EXP + if (config_->device_type == std::string("cuda_exp")) { + CopyFromHostToCUDADevice(cuda_bag_data_indices_.RawData(), bag_data_indices_.data(), static_cast(num_data_), __FILE__, __LINE__); + tree_learner->SetBaggingData(tmp_subset_.get(), cuda_bag_data_indices_.RawData(), + bag_data_cnt_); + } else { + #endif // USE_CUDA_EXP + tree_learner->SetBaggingData(tmp_subset_.get(), bag_data_indices_.data(), + bag_data_cnt_); + #ifdef USE_CUDA_EXP + } + #endif // USE_CUDA_EXP + } + } + } + + void ResetSampleConfig(const Config* config, bool is_change_dataset) override { + need_resize_gradients_ = false; + // if need bagging, create buffer + data_size_t num_pos_data = 0; + if (objective_function_ != nullptr) { + num_pos_data = objective_function_->NumPositiveData(); + } + bool balance_bagging_cond = (config->pos_bagging_fraction < 1.0 || config->neg_bagging_fraction < 1.0) && (num_pos_data > 0); + if ((config->bagging_fraction < 1.0 || balance_bagging_cond) && config->bagging_freq > 0) { + need_re_bagging_ = false; + if (!is_change_dataset && + config_ != nullptr && config_->bagging_fraction == config->bagging_fraction && config_->bagging_freq == config->bagging_freq + && config_->pos_bagging_fraction == config->pos_bagging_fraction && config_->neg_bagging_fraction == config->neg_bagging_fraction) { + config_ = config; + return; + } + config_ = config; + if (balance_bagging_cond) { + balanced_bagging_ = true; + 
bag_data_cnt_ = static_cast(num_pos_data * config_->pos_bagging_fraction) + + static_cast((num_data_ - num_pos_data) * config_->neg_bagging_fraction); + } else { + bag_data_cnt_ = static_cast(config_->bagging_fraction * num_data_); + } + bag_data_indices_.resize(num_data_); + #ifdef USE_CUDA_EXP + if (config_->device_type == std::string("cuda_exp")) { + cuda_bag_data_indices_.Resize(num_data_); + } + #endif // USE_CUDA_EXP + bagging_runner_.ReSize(num_data_); + bagging_rands_.clear(); + for (int i = 0; + i < (num_data_ + bagging_rand_block_ - 1) / bagging_rand_block_; ++i) { + bagging_rands_.emplace_back(config_->bagging_seed + i); + } + + double average_bag_rate = + (static_cast(bag_data_cnt_) / num_data_) / config_->bagging_freq; + is_use_subset_ = false; + if (config_->device_type != std::string("cuda_exp")) { + const int group_threshold_usesubset = 100; + const double average_bag_rate_threshold = 0.5; + if (average_bag_rate <= average_bag_rate_threshold + && (train_data_->num_feature_groups() < group_threshold_usesubset)) { + if (tmp_subset_ == nullptr || is_change_dataset) { + tmp_subset_.reset(new Dataset(bag_data_cnt_)); + tmp_subset_->CopyFeatureMapperFrom(train_data_); + } + is_use_subset_ = true; + Log::Debug("Use subset for bagging"); + } + } + + need_re_bagging_ = true; + + if (is_use_subset_ && bag_data_cnt_ < num_data_) { + // resize gradient vectors to copy the customized gradients for using subset data + need_resize_gradients_ = true; + } + } else { + bag_data_cnt_ = num_data_; + bag_data_indices_.clear(); + #ifdef USE_CUDA_EXP + cuda_bag_data_indices_.Clear(); + #endif // USE_CUDA_EXP + bagging_runner_.ReSize(0); + is_use_subset_ = false; + } + } + + bool IsHessianChange() const override { + return false; + } + + private: + data_size_t BaggingHelper(data_size_t start, data_size_t cnt, data_size_t* buffer) { + if (cnt <= 0) { + return 0; + } + data_size_t cur_left_cnt = 0; + data_size_t cur_right_pos = cnt; + // random bagging, minimal unit is one record + for (data_size_t i = 0; i < cnt; ++i) { + auto cur_idx = start + i; + if (bagging_rands_[cur_idx / bagging_rand_block_].NextFloat() < config_->bagging_fraction) { + buffer[cur_left_cnt++] = cur_idx; + } else { + buffer[--cur_right_pos] = cur_idx; + } + } + return cur_left_cnt; + } + + data_size_t BalancedBaggingHelper(data_size_t start, data_size_t cnt, data_size_t* buffer) { + if (cnt <= 0) { + return 0; + } + auto label_ptr = train_data_->metadata().label(); + data_size_t cur_left_cnt = 0; + data_size_t cur_right_pos = cnt; + // random bagging, minimal unit is one record + for (data_size_t i = 0; i < cnt; ++i) { + auto cur_idx = start + i; + bool is_pos = label_ptr[start + i] > 0; + bool is_in_bag = false; + if (is_pos) { + is_in_bag = bagging_rands_[cur_idx / bagging_rand_block_].NextFloat() < + config_->pos_bagging_fraction; + } else { + is_in_bag = bagging_rands_[cur_idx / bagging_rand_block_].NextFloat() < + config_->neg_bagging_fraction; + } + if (is_in_bag) { + buffer[cur_left_cnt++] = cur_idx; + } else { + buffer[--cur_right_pos] = cur_idx; + } + } + return cur_left_cnt; + } + + /*! 
\brief whether need restart bagging in continued training */ + bool need_re_bagging_; +}; + +} // namespace LightGBM + +#endif // LIGHTGBM_BOOSTING_BAGGING_HPP_ diff --git a/src/boosting/boosting.cpp b/src/boosting/boosting.cpp index 91fa318a0f18..98f2554b1388 100644 --- a/src/boosting/boosting.cpp +++ b/src/boosting/boosting.cpp @@ -6,7 +6,6 @@ #include "dart.hpp" #include "gbdt.h" -#include "goss.hpp" #include "rf.hpp" namespace LightGBM { @@ -39,7 +38,7 @@ Boosting* Boosting::CreateBoosting(const std::string& type, const char* filename } else if (type == std::string("dart")) { return new DART(); } else if (type == std::string("goss")) { - return new GOSS(); + return new GBDT(); } else if (type == std::string("rf")) { return new RF(); } else { @@ -53,7 +52,7 @@ Boosting* Boosting::CreateBoosting(const std::string& type, const char* filename } else if (type == std::string("dart")) { ret.reset(new DART()); } else if (type == std::string("goss")) { - ret.reset(new GOSS()); + ret.reset(new GBDT()); } else if (type == std::string("rf")) { return new RF(); } else { diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index c2a38417091a..72e16ee7e707 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -10,9 +10,11 @@ #include #include #include +#include #include #include +#include #include namespace LightGBM { @@ -34,13 +36,11 @@ GBDT::GBDT() num_class_(1), num_iteration_for_pred_(0), shrinkage_rate_(0.1f), - num_init_iteration_(0), - need_re_bagging_(false), - balanced_bagging_(false), - bagging_runner_(0, bagging_rand_block_) { + num_init_iteration_(0) { average_output_ = false; tree_learner_ = nullptr; linear_tree_ = false; + data_sample_strategy_.reset(nullptr); gradients_pointer_ = nullptr; hessians_pointer_ = nullptr; boosting_on_gpu_ = false; @@ -96,9 +96,12 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective } } + data_sample_strategy_.reset(SampleStrategy::CreateSampleStrategy(config_.get(), train_data_, objective_function_, num_tree_per_iteration_)); is_constant_hessian_ = GetIsConstHessian(objective_function); - boosting_on_gpu_ = objective_function_ != nullptr && objective_function_->IsCUDAObjective(); + boosting_on_gpu_ = objective_function_ != nullptr && objective_function_->IsCUDAObjective() && + !data_sample_strategy_->IsHessianChange(); // for sample strategy with Hessian change, fall back to boosting on CPU + tree_learner_ = std::unique_ptr(TreeLearner::CreateTreeLearner(config_->tree_learner, config_->device_type, config_.get(), boosting_on_gpu_)); @@ -124,34 +127,6 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective #endif // USE_CUDA_EXP num_data_ = train_data_->num_data(); - // create buffer for gradients and Hessians - if (objective_function_ != nullptr) { - const size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; - #ifdef USE_CUDA_EXP - if (config_->device_type == std::string("cuda_exp") && boosting_on_gpu_) { - if (gradients_pointer_ != nullptr) { - CHECK_NOTNULL(hessians_pointer_); - DeallocateCUDAMemory(&gradients_pointer_, __FILE__, __LINE__); - DeallocateCUDAMemory(&hessians_pointer_, __FILE__, __LINE__); - } - AllocateCUDAMemory(&gradients_pointer_, total_size, __FILE__, __LINE__); - AllocateCUDAMemory(&hessians_pointer_, total_size, __FILE__, __LINE__); - } else { - #endif // USE_CUDA_EXP - gradients_.resize(total_size); - hessians_.resize(total_size); - gradients_pointer_ = gradients_.data(); - hessians_pointer_ = hessians_.data(); - #ifdef USE_CUDA_EXP - } - 
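
> Editor's note (not part of this diff): the new `BaggingSampleStrategy` above carries over the existing bagging behaviour; these are the user-facing parameters it reads, shown in a hedged sketch with synthetic data.

```python
import numpy as np
import lightgbm as lgb

params = {
    "objective": "binary",
    "bagging_fraction": 0.8,   # row subsample rate
    "bagging_freq": 5,         # re-bag every 5 iterations
    "bagging_seed": 3,
    # pos_bagging_fraction / neg_bagging_fraction switch on the balanced-bagging branch
    "verbose": -1,
}
X = np.random.rand(2000, 20)
y = np.random.randint(0, 2, size=2000)
booster = lgb.train(params, lgb.Dataset(X, label=y), num_boost_round=20)
```
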
#endif // USE_CUDA_EXP - } else if (config_->boosting == std::string("goss")) { - const size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; - gradients_.resize(total_size); - hessians_.resize(total_size); - gradients_pointer_ = gradients_.data(); - hessians_pointer_ = hessians_.data(); - } // get max feature index max_feature_idx_ = train_data_->num_total_features() - 1; @@ -164,8 +139,12 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective // get parser config file content parser_config_str_ = train_data_->parser_config_str(); + // check that forced splits does not use feature indices larger than dataset size + CheckForcedSplitFeatures(); + // if need bagging, create buffer - ResetBaggingConfig(config_.get(), true); + data_sample_strategy_->ResetSampleConfig(config_.get(), true); + ResetGradientBuffers(); class_need_train_ = std::vector(num_tree_per_iteration_, true); if (objective_function_ != nullptr && objective_function_->SkipEmptyClass()) { @@ -180,6 +159,26 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective } } +void GBDT::CheckForcedSplitFeatures() { + std::queue forced_split_nodes; + forced_split_nodes.push(forced_splits_json_); + while (!forced_split_nodes.empty()) { + Json node = forced_split_nodes.front(); + forced_split_nodes.pop(); + const int feature_index = node["feature"].int_value(); + if (feature_index > max_feature_idx_) { + Log::Fatal("Forced splits file includes feature index %d, but maximum feature index in dataset is %d", + feature_index, max_feature_idx_); + } + if (node.object_items().count("left") > 0) { + forced_split_nodes.push(node["left"]); + } + if (node.object_items().count("right") > 0) { + forced_split_nodes.push(node["right"]); + } + } +} + void GBDT::AddValidDataset(const Dataset* valid_data, const std::vector& valid_metrics) { if (!train_data_->CheckAlign(*valid_data)) { @@ -227,108 +226,6 @@ void GBDT::Boosting() { GetGradients(GetTrainingScore(&num_score), gradients_pointer_, hessians_pointer_); } -data_size_t GBDT::BaggingHelper(data_size_t start, data_size_t cnt, data_size_t* buffer) { - if (cnt <= 0) { - return 0; - } - data_size_t cur_left_cnt = 0; - data_size_t cur_right_pos = cnt; - // random bagging, minimal unit is one record - for (data_size_t i = 0; i < cnt; ++i) { - auto cur_idx = start + i; - if (bagging_rands_[cur_idx / bagging_rand_block_].NextFloat() < config_->bagging_fraction) { - buffer[cur_left_cnt++] = cur_idx; - } else { - buffer[--cur_right_pos] = cur_idx; - } - } - return cur_left_cnt; -} - -data_size_t GBDT::BalancedBaggingHelper(data_size_t start, data_size_t cnt, - data_size_t* buffer) { - if (cnt <= 0) { - return 0; - } - auto label_ptr = train_data_->metadata().label(); - data_size_t cur_left_cnt = 0; - data_size_t cur_right_pos = cnt; - // random bagging, minimal unit is one record - for (data_size_t i = 0; i < cnt; ++i) { - auto cur_idx = start + i; - bool is_pos = label_ptr[start + i] > 0; - bool is_in_bag = false; - if (is_pos) { - is_in_bag = bagging_rands_[cur_idx / bagging_rand_block_].NextFloat() < - config_->pos_bagging_fraction; - } else { - is_in_bag = bagging_rands_[cur_idx / bagging_rand_block_].NextFloat() < - config_->neg_bagging_fraction; - } - if (is_in_bag) { - buffer[cur_left_cnt++] = cur_idx; - } else { - buffer[--cur_right_pos] = cur_idx; - } - } - return cur_left_cnt; -} - -void GBDT::Bagging(int iter) { - Common::FunctionTimer fun_timer("GBDT::Bagging", global_timer); - // if need bagging - if ((bag_data_cnt_ < 
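
> Editor's note (not part of this diff): `CheckForcedSplitFeatures` above walks the forced-splits JSON and fails fast if a `feature` index exceeds the dataset's maximum feature index. A hedged sketch of a forced-splits file that passes the check; file name and split values are illustrative.

```python
import json
import numpy as np
import lightgbm as lgb

forced = {"feature": 0, "threshold": 0.5,
          "left": {"feature": 2, "threshold": 0.25}}
with open("forced_splits.json", "w") as f:
    json.dump(forced, f)

X = np.random.rand(1000, 5)
y = np.random.rand(1000)
params = {"objective": "regression",
          "forcedsplits_filename": "forced_splits.json",
          "verbose": -1}
booster = lgb.train(params, lgb.Dataset(X, label=y), num_boost_round=10)
# A feature index >= 5 in the JSON would now trigger the fatal error added above.
```
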
num_data_ && iter % config_->bagging_freq == 0) || - need_re_bagging_) { - need_re_bagging_ = false; - auto left_cnt = bagging_runner_.Run( - num_data_, - [=](int, data_size_t cur_start, data_size_t cur_cnt, data_size_t* left, - data_size_t*) { - data_size_t cur_left_count = 0; - if (balanced_bagging_) { - cur_left_count = - BalancedBaggingHelper(cur_start, cur_cnt, left); - } else { - cur_left_count = BaggingHelper(cur_start, cur_cnt, left); - } - return cur_left_count; - }, - bag_data_indices_.data()); - bag_data_cnt_ = left_cnt; - Log::Debug("Re-bagging, using %d data to train", bag_data_cnt_); - // set bagging data to tree learner - if (!is_use_subset_) { - #ifdef USE_CUDA_EXP - if (config_->device_type == std::string("cuda_exp")) { - CopyFromHostToCUDADevice(cuda_bag_data_indices_.RawData(), bag_data_indices_.data(), static_cast(num_data_), __FILE__, __LINE__); - tree_learner_->SetBaggingData(nullptr, cuda_bag_data_indices_.RawData(), bag_data_cnt_); - } else { - #endif // USE_CUDA_EXP - tree_learner_->SetBaggingData(nullptr, bag_data_indices_.data(), bag_data_cnt_); - #ifdef USE_CUDA_EXP - } - #endif // USE_CUDA_EXP - } else { - // get subset - tmp_subset_->ReSize(bag_data_cnt_); - tmp_subset_->CopySubrow(train_data_, bag_data_indices_.data(), - bag_data_cnt_, false); - #ifdef USE_CUDA_EXP - if (config_->device_type == std::string("cuda_exp")) { - CopyFromHostToCUDADevice(cuda_bag_data_indices_.RawData(), bag_data_indices_.data(), static_cast(num_data_), __FILE__, __LINE__); - tree_learner_->SetBaggingData(tmp_subset_.get(), cuda_bag_data_indices_.RawData(), - bag_data_cnt_); - } else { - #endif // USE_CUDA_EXP - tree_learner_->SetBaggingData(tmp_subset_.get(), bag_data_indices_.data(), - bag_data_cnt_); - #ifdef USE_CUDA_EXP - } - #endif // USE_CUDA_EXP - } - } -} - void GBDT::Train(int snapshot_freq, const std::string& model_output_path) { Common::FunctionTimer fun_timer("GBDT::Train", global_timer); bool is_finished = false; @@ -448,7 +345,7 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { } else { // use customized objective function CHECK(objective_function_ == nullptr); - if (config_->boosting == std::string("goss")) { + if (data_sample_strategy_->IsHessianChange()) { // need to copy customized gradients when using GOSS int64_t total_size = static_cast(num_data_) * num_tree_per_iteration_; #pragma omp parallel for schedule(static) @@ -464,15 +361,13 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { } // bagging logic - Bagging(iter_); + data_sample_strategy_->Bagging(iter_, tree_learner_.get(), gradients_.data(), hessians_.data()); + const bool is_use_subset = data_sample_strategy_->is_use_subset(); + const data_size_t bag_data_cnt = data_sample_strategy_->bag_data_cnt(); + const std::vector>& bag_data_indices = data_sample_strategy_->bag_data_indices(); - if (gradients != nullptr && is_use_subset_ && bag_data_cnt_ < num_data_ && !boosting_on_gpu_ && config_->boosting != std::string("goss")) { - // allocate gradients_ and hessians_ for copy gradients for using data subset - int64_t total_size = static_cast(num_data_) * num_tree_per_iteration_; - gradients_.resize(total_size); - hessians_.resize(total_size); - gradients_pointer_ = gradients_.data(); - hessians_pointer_ = hessians_.data(); + if (objective_function_ == nullptr && is_use_subset && bag_data_cnt < num_data_ && !boosting_on_gpu_ && !data_sample_strategy_->IsHessianChange()) { + ResetGradientBuffers(); } bool should_continue = false; @@ -483,10 +378,10 @@ 
bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { auto grad = gradients + offset; auto hess = hessians + offset; // need to copy gradients for bagging subset. - if (is_use_subset_ && bag_data_cnt_ < num_data_ && !boosting_on_gpu_) { - for (int i = 0; i < bag_data_cnt_; ++i) { - gradients_pointer_[offset + i] = grad[bag_data_indices_[i]]; - hessians_pointer_[offset + i] = hess[bag_data_indices_[i]]; + if (is_use_subset && bag_data_cnt < num_data_ && !boosting_on_gpu_) { + for (int i = 0; i < bag_data_cnt; ++i) { + gradients_pointer_[offset + i] = grad[bag_data_indices[i]]; + hessians_pointer_[offset + i] = hess[bag_data_indices[i]]; } grad = gradients_pointer_ + offset; hess = hessians_pointer_ + offset; @@ -500,7 +395,7 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { auto score_ptr = train_score_updater_->score() + offset; auto residual_getter = [score_ptr](const label_t* label, int i) {return static_cast(label[i]) - score_ptr[i]; }; tree_learner_->RenewTreeOutput(new_tree.get(), objective_function_, residual_getter, - num_data_, bag_data_indices_.data(), bag_data_cnt_, train_score_updater_->score()); + num_data_, bag_data_indices.data(), bag_data_cnt, train_score_updater_->score()); // shrinkage by learning rate new_tree->Shrinkage(shrinkage_rate_); // update score @@ -580,17 +475,18 @@ bool GBDT::EvalAndCheckEarlyStopping() { void GBDT::UpdateScore(const Tree* tree, const int cur_tree_id) { Common::FunctionTimer fun_timer("GBDT::UpdateScore", global_timer); // update training score - if (!is_use_subset_) { + if (!data_sample_strategy_->is_use_subset()) { train_score_updater_->AddScore(tree_learner_.get(), tree, cur_tree_id); + const data_size_t bag_data_cnt = data_sample_strategy_->bag_data_cnt(); // we need to predict out-of-bag scores of data for boosting - if (num_data_ - bag_data_cnt_ > 0) { + if (num_data_ - bag_data_cnt > 0) { #ifdef USE_CUDA_EXP if (config_->device_type == std::string("cuda_exp")) { - train_score_updater_->AddScore(tree, cuda_bag_data_indices_.RawData() + bag_data_cnt_, num_data_ - bag_data_cnt_, cur_tree_id); + train_score_updater_->AddScore(tree, data_sample_strategy_->cuda_bag_data_indices().RawData() + bag_data_cnt, num_data_ - bag_data_cnt, cur_tree_id); } else { #endif // USE_CUDA_EXP - train_score_updater_->AddScore(tree, bag_data_indices_.data() + bag_data_cnt_, num_data_ - bag_data_cnt_, cur_tree_id); + train_score_updater_->AddScore(tree, data_sample_strategy_->bag_data_indices().data() + bag_data_cnt, num_data_ - bag_data_cnt, cur_tree_id); #ifdef USE_CUDA_EXP } #endif // USE_CUDA_EXP @@ -818,6 +714,7 @@ void GBDT::ResetTrainingData(const Dataset* train_data, const ObjectiveFunction* } objective_function_ = objective_function; + data_sample_strategy_->UpdateObjectiveFunction(objective_function); if (objective_function_ != nullptr) { CHECK_EQ(num_tree_per_iteration_, objective_function_->NumModelPerIteration()); if (objective_function_->IsRenewTreeOutput() && !config_->monotone_constraints.empty()) { @@ -833,11 +730,15 @@ void GBDT::ResetTrainingData(const Dataset* train_data, const ObjectiveFunction* } training_metrics_.shrink_to_fit(); - boosting_on_gpu_ = objective_function_ != nullptr && objective_function_->IsCUDAObjective(); + #ifdef USE_CUDA_EXP + boosting_on_gpu_ = objective_function_ != nullptr && objective_function_->IsCUDAObjective() && + !data_sample_strategy_->IsHessianChange(); // for sample strategy with Hessian change, fall back to boosting on CPU 
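
> Editor's note (not part of this diff): the `TrainOneIter` copy logic above deals with gradients supplied by a customized objective when GOSS or a bagging subset is active. A minimal sketch of such an objective via the scikit-learn wrapper; `l2_objective` is an illustrative name and the subsampling settings just exercise the subset path.

```python
import numpy as np
from lightgbm import LGBMRegressor

def l2_objective(y_true, y_pred):
    grad = y_pred - y_true
    hess = np.ones_like(y_pred)
    return grad, hess

X = np.random.rand(1000, 10)
y = np.random.rand(1000)
model = LGBMRegressor(objective=l2_objective, subsample=0.8, subsample_freq=5,
                      n_estimators=20).fit(X, y)
```
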
tree_learner_->ResetBoostingOnGPU(boosting_on_gpu_); + #endif // USE_CUDA_EXP if (train_data != train_data_) { train_data_ = train_data; + data_sample_strategy_->UpdateTrainingData(train_data); // not same training data, need reset score and others // create score tracker #ifdef USE_CUDA_EXP @@ -860,34 +761,7 @@ void GBDT::ResetTrainingData(const Dataset* train_data, const ObjectiveFunction* num_data_ = train_data_->num_data(); - // create buffer for gradients and hessians - if (objective_function_ != nullptr) { - const size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; - #ifdef USE_CUDA_EXP - if (config_->device_type == std::string("cuda_exp") && boosting_on_gpu_) { - if (gradients_pointer_ != nullptr) { - CHECK_NOTNULL(hessians_pointer_); - DeallocateCUDAMemory(&gradients_pointer_, __FILE__, __LINE__); - DeallocateCUDAMemory(&hessians_pointer_, __FILE__, __LINE__); - } - AllocateCUDAMemory(&gradients_pointer_, total_size, __FILE__, __LINE__); - AllocateCUDAMemory(&hessians_pointer_, total_size, __FILE__, __LINE__); - } else { - #endif // USE_CUDA_EXP - gradients_.resize(total_size); - hessians_.resize(total_size); - gradients_pointer_ = gradients_.data(); - hessians_pointer_ = hessians_.data(); - #ifdef USE_CUDA_EXP - } - #endif // USE_CUDA_EXP - } else if (config_->boosting == std::string("goss")) { - const size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; - gradients_.resize(total_size); - hessians_.resize(total_size); - gradients_pointer_ = gradients_.data(); - hessians_pointer_ = hessians_.data(); - } + ResetGradientBuffers(); max_feature_idx_ = train_data_->num_total_features() - 1; label_idx_ = train_data_->label_idx(); @@ -896,7 +770,7 @@ void GBDT::ResetTrainingData(const Dataset* train_data, const ObjectiveFunction* parser_config_str_ = train_data_->parser_config_str(); tree_learner_->ResetTrainingData(train_data, is_constant_hessian_); - ResetBaggingConfig(config_.get(), true); + data_sample_strategy_->ResetSampleConfig(config_.get(), true); } else { tree_learner_->ResetIsConstantHessian(is_constant_hessian_); } @@ -919,11 +793,16 @@ void GBDT::ResetConfig(const Config* config) { tree_learner_->ResetConfig(new_config.get()); } - boosting_on_gpu_ = objective_function_ != nullptr && objective_function_->IsCUDAObjective(); + boosting_on_gpu_ = objective_function_ != nullptr && objective_function_->IsCUDAObjective() && + !data_sample_strategy_->IsHessianChange(); // for sample strategy with Hessian change, fall back to boosting on CPU tree_learner_->ResetBoostingOnGPU(boosting_on_gpu_); if (train_data_ != nullptr) { - ResetBaggingConfig(new_config.get(), false); + data_sample_strategy_->ResetSampleConfig(new_config.get(), false); + if (data_sample_strategy_->NeedResizeGradients()) { + // resize gradient vectors to copy the customized gradients for goss or bagging with subset + ResetGradientBuffers(); + } } if (config_.get() != nullptr && config_->forcedsplits_filename != new_config->forcedsplits_filename) { // load forced_splits file @@ -943,96 +822,37 @@ void GBDT::ResetConfig(const Config* config) { config_.reset(new_config.release()); } -void GBDT::ResetBaggingConfig(const Config* config, bool is_change_dataset) { - // if need bagging, create buffer - data_size_t num_pos_data = 0; +void GBDT::ResetGradientBuffers() { + const size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; + const bool is_use_subset = data_sample_strategy_->is_use_subset(); + const data_size_t bag_data_cnt = data_sample_strategy_->bag_data_cnt(); if 
(objective_function_ != nullptr) { - num_pos_data = objective_function_->NumPositiveData(); - } - bool balance_bagging_cond = (config->pos_bagging_fraction < 1.0 || config->neg_bagging_fraction < 1.0) && (num_pos_data > 0); - if ((config->bagging_fraction < 1.0 || balance_bagging_cond) && config->bagging_freq > 0) { - need_re_bagging_ = false; - if (!is_change_dataset && - config_.get() != nullptr && config_->bagging_fraction == config->bagging_fraction && config_->bagging_freq == config->bagging_freq - && config_->pos_bagging_fraction == config->pos_bagging_fraction && config_->neg_bagging_fraction == config->neg_bagging_fraction) { - return; - } - if (balance_bagging_cond) { - balanced_bagging_ = true; - bag_data_cnt_ = static_cast(num_pos_data * config->pos_bagging_fraction) - + static_cast((num_data_ - num_pos_data) * config->neg_bagging_fraction); - } else { - bag_data_cnt_ = static_cast(config->bagging_fraction * num_data_); - } - bag_data_indices_.resize(num_data_); #ifdef USE_CUDA_EXP - if (config->device_type == std::string("cuda_exp")) { - cuda_bag_data_indices_.Resize(num_data_); - } - #endif // USE_CUDA_EXP - bagging_runner_.ReSize(num_data_); - bagging_rands_.clear(); - for (int i = 0; - i < (num_data_ + bagging_rand_block_ - 1) / bagging_rand_block_; ++i) { - bagging_rands_.emplace_back(config_->bagging_seed + i); - } - - double average_bag_rate = - (static_cast(bag_data_cnt_) / num_data_) / config->bagging_freq; - is_use_subset_ = false; - if (config_->device_type != std::string("cuda_exp")) { - const int group_threshold_usesubset = 100; - if (average_bag_rate <= 0.5 - && (train_data_->num_feature_groups() < group_threshold_usesubset)) { - if (tmp_subset_ == nullptr || is_change_dataset) { - tmp_subset_.reset(new Dataset(bag_data_cnt_)); - tmp_subset_->CopyFeatureMapperFrom(train_data_); - } - is_use_subset_ = true; - Log::Debug("Use subset for bagging"); + if (config_->device_type == std::string("cuda_exp") && boosting_on_gpu_) { + if (cuda_gradients_.Size() < total_size) { + cuda_gradients_.Resize(total_size); + cuda_hessians_.Resize(total_size); } - } - - need_re_bagging_ = true; - - if (is_use_subset_ && bag_data_cnt_ < num_data_) { - // resize gradient vectors to copy the customized gradients for goss or bagging with subset - if (objective_function_ != nullptr) { - const size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; - #ifdef USE_CUDA_EXP - if (config_->device_type == std::string("cuda_exp") && boosting_on_gpu_) { - if (gradients_pointer_ != nullptr) { - CHECK_NOTNULL(hessians_pointer_); - DeallocateCUDAMemory(&gradients_pointer_, __FILE__, __LINE__); - DeallocateCUDAMemory(&hessians_pointer_, __FILE__, __LINE__); - } - AllocateCUDAMemory(&gradients_pointer_, total_size, __FILE__, __LINE__); - AllocateCUDAMemory(&hessians_pointer_, total_size, __FILE__, __LINE__); - } else { - #endif // USE_CUDA_EXP - gradients_.resize(total_size); - hessians_.resize(total_size); - gradients_pointer_ = gradients_.data(); - hessians_pointer_ = hessians_.data(); - #ifdef USE_CUDA_EXP - } - #endif // USE_CUDA_EXP - } else if (config_->boosting == std::string("goss")) { - const size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; + gradients_pointer_ = cuda_gradients_.RawData(); + hessians_pointer_ = cuda_hessians_.RawData(); + } else { + #endif // USE_CUDA_EXP + if (gradients_.size() < total_size) { gradients_.resize(total_size); hessians_.resize(total_size); - gradients_pointer_ = gradients_.data(); - hessians_pointer_ = hessians_.data(); } - } - } 
else { - bag_data_cnt_ = num_data_; - bag_data_indices_.clear(); + gradients_pointer_ = gradients_.data(); + hessians_pointer_ = hessians_.data(); #ifdef USE_CUDA_EXP - cuda_bag_data_indices_.Clear(); + } #endif // USE_CUDA_EXP - bagging_runner_.ReSize(0); - is_use_subset_ = false; + } else if (data_sample_strategy_->IsHessianChange() || (is_use_subset && bag_data_cnt < num_data_ && !boosting_on_gpu_)) { + if (gradients_.size() < total_size) { + gradients_.resize(total_size); + hessians_.resize(total_size); + } + gradients_pointer_ = gradients_.data(); + hessians_pointer_ = hessians_.data(); } } diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h index 5cc3cc7541b0..d71245980b36 100644 --- a/src/boosting/gbdt.h +++ b/src/boosting/gbdt.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -57,6 +58,11 @@ class GBDT : public GBDTBase { const ObjectiveFunction* objective_function, const std::vector& training_metrics) override; + /*! + * \brief Traverse the tree of forced splits and check that all indices are less than the number of features. + */ + void CheckForcedSplitFeatures(); + /*! * \brief Merge model from other boosting object. Will insert to the front of current boosting object * \param other @@ -453,7 +459,7 @@ class GBDT : public GBDTBase { protected: virtual bool GetIsConstHessian(const ObjectiveFunction* objective_function) { - if (objective_function != nullptr) { + if (objective_function != nullptr && !data_sample_strategy_->IsHessianChange()) { return objective_function->IsConstantHessian(); } else { return false; @@ -469,18 +475,6 @@ class GBDT : public GBDTBase { */ void ResetBaggingConfig(const Config* config, bool is_change_dataset); - /*! - * \brief Implement bagging logic - * \param iter Current interation - */ - virtual void Bagging(int iter); - - virtual data_size_t BaggingHelper(data_size_t start, data_size_t cnt, - data_size_t* buffer); - - data_size_t BalancedBaggingHelper(data_size_t start, data_size_t cnt, - data_size_t* buffer); - /*! * \brief calculate the objective function */ @@ -508,6 +502,11 @@ class GBDT : public GBDTBase { double BoostFromAverage(int class_id, bool update_scorer); + /*! + * \brief Reset gradient buffers, must be called after sample strategy is reset + */ + void ResetGradientBuffers(); + /*! \brief current iteration */ int iter_; /*! \brief Pointer to training data */ @@ -561,18 +560,16 @@ class GBDT : public GBDTBase { /*! \brief Whether boosting is done on GPU, used for cuda_exp */ bool boosting_on_gpu_; #ifdef USE_CUDA_EXP + /*! \brief Gradient vector on GPU */ + CUDAVector cuda_gradients_; + /*! \brief Hessian vector on GPU */ + CUDAVector cuda_hessians_; /*! \brief Buffer for scores when boosting is on GPU but evaluation is not, used only with cuda_exp */ mutable std::vector host_score_; /*! \brief Buffer for scores when boosting is not on GPU but evaluation is, used only with cuda_exp */ mutable CUDAVector cuda_score_; - /*! \brief Buffer for bag_data_indices_ on GPU, used only with cuda_exp */ - CUDAVector cuda_bag_data_indices_; #endif // USE_CUDA_EXP - /*! \brief Store the indices of in-bag data */ - std::vector> bag_data_indices_; - /*! \brief Number of in-bag data */ - data_size_t bag_data_cnt_; /*! \brief Number of training data */ data_size_t num_data_; /*! \brief Number of trees per iterations */ @@ -592,8 +589,6 @@ class GBDT : public GBDTBase { /*! 
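
> Editor's note (not part of this diff): rough back-of-envelope arithmetic for the buffer size `ResetGradientBuffers` allocates above, assuming a default build where `score_t` is a 4-byte float; the row and class counts are made up.

```python
num_data = 1_000_000
num_tree_per_iteration = 5                       # e.g. a 5-class multiclass objective
total_size = num_data * num_tree_per_iteration   # 5_000_000 entries
approx_bytes = total_size * 4                    # score_t assumed 4 bytes in default builds
print(total_size, approx_bytes)                  # 5000000 20000000 (~19 MiB per buffer)
```
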
\brief Feature names */ std::vector feature_names_; std::vector feature_infos_; - std::unique_ptr tmp_subset_; - bool is_use_subset_; std::vector class_need_train_; bool is_constant_hessian_; std::unique_ptr loaded_objective_; @@ -602,11 +597,9 @@ class GBDT : public GBDTBase { bool balanced_bagging_; std::string loaded_parameter_; std::vector monotone_constraints_; - const int bagging_rand_block_ = 1024; - std::vector bagging_rands_; - ParallelPartitionRunner bagging_runner_; Json forced_splits_json_; bool linear_tree_; + std::unique_ptr data_sample_strategy_; }; } // namespace LightGBM diff --git a/src/boosting/goss.hpp b/src/boosting/goss.hpp index 09c63d9728f3..34b099e051bb 100644 --- a/src/boosting/goss.hpp +++ b/src/boosting/goss.hpp @@ -1,79 +1,87 @@ /*! - * Copyright (c) 2017 Microsoft Corporation. All rights reserved. + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ -#ifndef LIGHTGBM_BOOSTING_GOSS_H_ -#define LIGHTGBM_BOOSTING_GOSS_H_ -#include +#ifndef LIGHTGBM_BOOSTING_GOSS_HPP_ +#define LIGHTGBM_BOOSTING_GOSS_HPP_ + #include -#include +#include -#include #include -#include -#include -#include -#include +#include #include -#include "gbdt.h" -#include "score_updater.hpp" - namespace LightGBM { -class GOSS: public GBDT { +class GOSSStrategy : public SampleStrategy { public: - /*! - * \brief Constructor - */ - GOSS() : GBDT() { - } - - ~GOSS() { - } - - void Init(const Config* config, const Dataset* train_data, const ObjectiveFunction* objective_function, - const std::vector& training_metrics) override { - GBDT::Init(config, train_data, objective_function, training_metrics); - ResetGoss(); - if (objective_function_ == nullptr) { - // use customized objective function - size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; - gradients_.resize(total_size, 0.0f); - hessians_.resize(total_size, 0.0f); - } - } - - void ResetTrainingData(const Dataset* train_data, const ObjectiveFunction* objective_function, - const std::vector& training_metrics) override { - GBDT::ResetTrainingData(train_data, objective_function, training_metrics); - ResetGoss(); + GOSSStrategy(const Config* config, const Dataset* train_data, int num_tree_per_iteration) { + config_ = config; + train_data_ = train_data; + num_tree_per_iteration_ = num_tree_per_iteration; + num_data_ = train_data->num_data(); } - void ResetConfig(const Config* config) override { - GBDT::ResetConfig(config); - ResetGoss(); + ~GOSSStrategy() { } - bool TrainOneIter(const score_t* gradients, const score_t* hessians) override { - if (gradients != nullptr) { - // use customized objective function - CHECK(hessians != nullptr && objective_function_ == nullptr); - int64_t total_size = static_cast(num_data_) * num_tree_per_iteration_; - #pragma omp parallel for schedule(static) - for (int64_t i = 0; i < total_size; ++i) { - gradients_[i] = gradients[i]; - hessians_[i] = hessians[i]; + void Bagging(int iter, TreeLearner* tree_learner, score_t* gradients, score_t* hessians) override { + bag_data_cnt_ = num_data_; + // not subsample for first iterations + if (iter < static_cast(1.0f / config_->learning_rate)) { return; } + auto left_cnt = bagging_runner_.Run( + num_data_, + [=](int, data_size_t cur_start, data_size_t cur_cnt, data_size_t* left, + data_size_t*) { + data_size_t cur_left_count = 0; + cur_left_count = Helper(cur_start, cur_cnt, left, gradients, hessians); + return cur_left_count; + }, + 
bag_data_indices_.data()); + bag_data_cnt_ = left_cnt; + // set bagging data to tree learner + if (!is_use_subset_) { + #ifdef USE_CUDA_EXP + if (config_->device_type == std::string("cuda_exp")) { + CopyFromHostToCUDADevice(cuda_bag_data_indices_.RawData(), bag_data_indices_.data(), static_cast(num_data_), __FILE__, __LINE__); + tree_learner->SetBaggingData(nullptr, cuda_bag_data_indices_.RawData(), bag_data_cnt_); + } else { + #endif // USE_CUDA_EXP + tree_learner->SetBaggingData(nullptr, bag_data_indices_.data(), bag_data_cnt_); + #ifdef USE_CUDA_EXP } - return GBDT::TrainOneIter(gradients_.data(), hessians_.data()); + #endif // USE_CUDA_EXP } else { - CHECK(hessians == nullptr); - return GBDT::TrainOneIter(nullptr, nullptr); + // get subset + tmp_subset_->ReSize(bag_data_cnt_); + tmp_subset_->CopySubrow(train_data_, bag_data_indices_.data(), + bag_data_cnt_, false); + #ifdef USE_CUDA_EXP + if (config_->device_type == std::string("cuda_exp")) { + CopyFromHostToCUDADevice(cuda_bag_data_indices_.RawData(), bag_data_indices_.data(), static_cast(num_data_), __FILE__, __LINE__); + tree_learner->SetBaggingData(tmp_subset_.get(), cuda_bag_data_indices_.RawData(), + bag_data_cnt_); + } else { + #endif // USE_CUDA_EXP + tree_learner->SetBaggingData(tmp_subset_.get(), bag_data_indices_.data(), + bag_data_cnt_); + #ifdef USE_CUDA_EXP + } + #endif // USE_CUDA_EXP } } - void ResetGoss() { + void ResetSampleConfig(const Config* config, bool /*is_change_dataset*/) override { + // Cannot use bagging in GOSS + config_ = config; + need_resize_gradients_ = false; + if (objective_function_ == nullptr) { + // resize gradient vectors to copy the customized gradients for goss + need_resize_gradients_ = true; + } + CHECK_LE(config_->top_rate + config_->other_rate, 1.0f); CHECK(config_->top_rate > 0.0f && config_->other_rate > 0.0f); if (config_->bagging_freq > 0 && config_->bagging_fraction != 1.0f) { @@ -100,7 +108,12 @@ class GOSS: public GBDT { bag_data_cnt_ = num_data_; } - data_size_t BaggingHelper(data_size_t start, data_size_t cnt, data_size_t* buffer) override { + bool IsHessianChange() const override { + return true; + } + + private: + data_size_t Helper(data_size_t start, data_size_t cnt, data_size_t* buffer, score_t* gradients, score_t* hessians) { if (cnt <= 0) { return 0; } @@ -108,7 +121,7 @@ class GOSS: public GBDT { for (data_size_t i = 0; i < cnt; ++i) { for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { size_t idx = static_cast(cur_tree_id) * num_data_ + start + i; - tmp_gradients[i] += std::fabs(gradients_[idx] * hessians_[idx]); + tmp_gradients[i] += std::fabs(gradients[idx] * hessians[idx]); } } data_size_t top_k = static_cast(cnt * config_->top_rate); @@ -126,7 +139,7 @@ class GOSS: public GBDT { score_t grad = 0.0f; for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { size_t idx = static_cast(cur_tree_id) * num_data_ + cur_idx; - grad += std::fabs(gradients_[idx] * hessians_[idx]); + grad += std::fabs(gradients[idx] * hessians[idx]); } if (grad >= threshold) { buffer[cur_left_cnt++] = cur_idx; @@ -140,8 +153,8 @@ class GOSS: public GBDT { buffer[cur_left_cnt++] = cur_idx; for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { size_t idx = static_cast(cur_tree_id) * num_data_ + cur_idx; - gradients_[idx] *= multiply; - hessians_[idx] *= multiply; + gradients[idx] *= multiply; + hessians[idx] *= multiply; } } else { buffer[--cur_right_pos] = cur_idx; @@ -150,58 +163,8 @@ class GOSS: public GBDT { } return 
cur_left_cnt; } - - void Bagging(int iter) override { - bag_data_cnt_ = num_data_; - // not subsample for first iterations - if (iter < static_cast(1.0f / config_->learning_rate)) { return; } - auto left_cnt = bagging_runner_.Run( - num_data_, - [=](int, data_size_t cur_start, data_size_t cur_cnt, data_size_t* left, - data_size_t*) { - data_size_t cur_left_count = 0; - cur_left_count = BaggingHelper(cur_start, cur_cnt, left); - return cur_left_count; - }, - bag_data_indices_.data()); - bag_data_cnt_ = left_cnt; - // set bagging data to tree learner - if (!is_use_subset_) { - #ifdef USE_CUDA_EXP - if (config_->device_type == std::string("cuda_exp")) { - CopyFromHostToCUDADevice(cuda_bag_data_indices_.RawData(), bag_data_indices_.data(), static_cast(num_data_), __FILE__, __LINE__); - tree_learner_->SetBaggingData(nullptr, cuda_bag_data_indices_.RawData(), bag_data_cnt_); - } else { - #endif // USE_CUDA_EXP - tree_learner_->SetBaggingData(nullptr, bag_data_indices_.data(), bag_data_cnt_); - #ifdef USE_CUDA_EXP - } - #endif // USE_CUDA_EXP - } else { - // get subset - tmp_subset_->ReSize(bag_data_cnt_); - tmp_subset_->CopySubrow(train_data_, bag_data_indices_.data(), - bag_data_cnt_, false); - #ifdef USE_CUDA_EXP - if (config_->device_type == std::string("cuda_exp")) { - CopyFromHostToCUDADevice(cuda_bag_data_indices_.RawData(), bag_data_indices_.data(), static_cast(num_data_), __FILE__, __LINE__); - tree_learner_->SetBaggingData(tmp_subset_.get(), cuda_bag_data_indices_.RawData(), - bag_data_cnt_); - } else { - #endif // USE_CUDA_EXP - tree_learner_->SetBaggingData(tmp_subset_.get(), bag_data_indices_.data(), - bag_data_cnt_); - #ifdef USE_CUDA_EXP - } - #endif // USE_CUDA_EXP - } - } - - protected: - bool GetIsConstHessian(const ObjectiveFunction*) override { - return false; - } }; } // namespace LightGBM -#endif // LIGHTGBM_BOOSTING_GOSS_H_ + +#endif // LIGHTGBM_BOOSTING_GOSS_HPP_ diff --git a/src/boosting/rf.hpp b/src/boosting/rf.hpp index ac1008f88516..9a87e982483e 100644 --- a/src/boosting/rf.hpp +++ b/src/boosting/rf.hpp @@ -32,8 +32,12 @@ class RF : public GBDT { void Init(const Config* config, const Dataset* train_data, const ObjectiveFunction* objective_function, const std::vector& training_metrics) override { - CHECK(config->bagging_freq > 0 && config->bagging_fraction < 1.0f && config->bagging_fraction > 0.0f); - CHECK(config->feature_fraction <= 1.0f && config->feature_fraction > 0.0f); + if (config->data_sample_strategy == std::string("bagging")) { + CHECK((config->bagging_freq > 0 && config->bagging_fraction < 1.0f && config->bagging_fraction > 0.0f) || + (config->feature_fraction < 1.0f && config->feature_fraction > 0.0f)); + } else { + CHECK_EQ(config->data_sample_strategy, std::string("goss")); + } GBDT::Init(config, train_data, objective_function, training_metrics); if (num_init_iteration_ > 0) { @@ -48,15 +52,19 @@ class RF : public GBDT { shrinkage_rate_ = 1.0f; // only boosting one time Boosting(); - if (is_use_subset_ && bag_data_cnt_ < num_data_) { + if (data_sample_strategy_->is_use_subset() && data_sample_strategy_->bag_data_cnt() < num_data_) { tmp_grad_.resize(num_data_); tmp_hess_.resize(num_data_); } } void ResetConfig(const Config* config) override { - CHECK(config->bagging_freq > 0 && config->bagging_fraction < 1.0f && config->bagging_fraction > 0.0f); - CHECK(config->feature_fraction <= 1.0f && config->feature_fraction > 0.0f); + if (config->data_sample_strategy == std::string("bagging")) { + CHECK((config->bagging_freq > 0 && config->bagging_fraction < 1.0f && 
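
> Editor's note (not part of this diff): a simplified NumPy restatement of the GOSS selection implemented in `GOSSStrategy::Helper` above — keep the top `top_rate` fraction of rows by |gradient × hessian|, sample `other_rate` of the rest, and amplify the sampled rows by `(n - top_k) / other_k`. This is an illustrative sketch, not the library's internal code.

```python
import numpy as np

def goss_select(grad, hess, top_rate=0.2, other_rate=0.1, seed=0):
    """Simplified GOSS row selection on 1-D gradient/hessian arrays."""
    rng = np.random.default_rng(seed)
    n = grad.shape[0]
    importance = np.abs(grad * hess)
    top_k = max(1, int(n * top_rate))
    other_k = int(n * other_rate)
    order = np.argsort(-importance)
    top_idx, rest_idx = order[:top_k], order[top_k:]
    sampled_idx = rng.choice(rest_idx, size=other_k, replace=False)
    multiply = (n - top_k) / other_k   # keeps the sampled gradients roughly unbiased
    grad, hess = grad.copy(), hess.copy()
    grad[sampled_idx] *= multiply
    hess[sampled_idx] *= multiply
    return np.concatenate([top_idx, sampled_idx]), grad, hess
```
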
config->bagging_fraction > 0.0f) || + (config->feature_fraction < 1.0f && config->feature_fraction > 0.0f)); + } else { + CHECK_EQ(config->data_sample_strategy, std::string("goss")); + } GBDT::ResetConfig(config); // not shrinkage rate for the RF shrinkage_rate_ = 1.0f; @@ -73,7 +81,7 @@ class RF : public GBDT { CHECK_EQ(num_tree_per_iteration_, num_class_); // only boosting one time Boosting(); - if (is_use_subset_ && bag_data_cnt_ < num_data_) { + if (data_sample_strategy_->is_use_subset() && data_sample_strategy_->bag_data_cnt() < num_data_) { tmp_grad_.resize(num_data_); tmp_hess_.resize(num_data_); } @@ -102,7 +110,11 @@ class RF : public GBDT { bool TrainOneIter(const score_t* gradients, const score_t* hessians) override { // bagging logic - Bagging(iter_); + data_sample_strategy_ ->Bagging(iter_, tree_learner_.get(), gradients_.data(), hessians_.data()); + const bool is_use_subset = data_sample_strategy_->is_use_subset(); + const data_size_t bag_data_cnt = data_sample_strategy_->bag_data_cnt(); + const std::vector>& bag_data_indices = data_sample_strategy_->bag_data_indices(); + CHECK_EQ(gradients, nullptr); CHECK_EQ(hessians, nullptr); @@ -115,11 +127,10 @@ class RF : public GBDT { auto grad = gradients + offset; auto hess = hessians + offset; - // need to copy gradients for bagging subset. - if (is_use_subset_ && bag_data_cnt_ < num_data_ && !boosting_on_gpu_) { - for (int i = 0; i < bag_data_cnt_; ++i) { - tmp_grad_[i] = grad[bag_data_indices_[i]]; - tmp_hess_[i] = hess[bag_data_indices_[i]]; + if (is_use_subset && bag_data_cnt < num_data_ && !boosting_on_gpu_) { + for (int i = 0; i < bag_data_cnt; ++i) { + tmp_grad_[i] = grad[bag_data_indices[i]]; + tmp_hess_[i] = hess[bag_data_indices[i]]; } grad = tmp_grad_.data(); hess = tmp_hess_.data(); @@ -132,7 +143,7 @@ class RF : public GBDT { double pred = init_scores_[cur_tree_id]; auto residual_getter = [pred](const label_t* label, int i) {return static_cast(label[i]) - pred; }; tree_learner_->RenewTreeOutput(new_tree.get(), objective_function_, residual_getter, - num_data_, bag_data_indices_.data(), bag_data_cnt_, train_score_updater_->score()); + num_data_, bag_data_indices.data(), bag_data_cnt, train_score_updater_->score()); if (std::fabs(init_scores_[cur_tree_id]) > kEpsilon) { new_tree->AddBias(init_scores_[cur_tree_id]); } diff --git a/src/boosting/sample_strategy.cpp b/src/boosting/sample_strategy.cpp new file mode 100644 index 000000000000..71c4ac1755de --- /dev/null +++ b/src/boosting/sample_strategy.cpp @@ -0,0 +1,24 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. 
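
> Editor's note (not part of this diff): as I read the relaxed `CHECK` in rf.hpp above, random-forest mode with the bagging strategy now accepts either row subsampling or feature subsampling (or both). A hedged parameter sketch on synthetic data.

```python
import numpy as np
import lightgbm as lgb

params = {
    "objective": "binary",
    "boosting": "rf",
    "bagging_freq": 1,
    "bagging_fraction": 0.7,
    "feature_fraction": 0.8,
    "verbose": -1,
}
X = np.random.rand(2000, 20)
y = np.random.randint(0, 2, size=2000)
booster = lgb.train(params, lgb.Dataset(X, label=y), num_boost_round=50)
```
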
+ */ + +#include +#include "goss.hpp" +#include "bagging.hpp" + +namespace LightGBM { + +SampleStrategy* SampleStrategy::CreateSampleStrategy( + const Config* config, + const Dataset* train_data, + const ObjectiveFunction* objective_function, + int num_tree_per_iteration) { + if (config->data_sample_strategy == std::string("goss")) { + return new GOSSStrategy(config, train_data, num_tree_per_iteration); + } else { + return new BaggingSampleStrategy(config, train_data, objective_function, num_tree_per_iteration); + } +} + +} // namespace LightGBM diff --git a/src/cuda/cuda_algorithms.cu b/src/cuda/cuda_algorithms.cu index b19321eb8935..5a6b3eb74ef0 100644 --- a/src/cuda/cuda_algorithms.cu +++ b/src/cuda/cuda_algorithms.cu @@ -127,6 +127,7 @@ void ShuffleReduceSumGlobal(const VAL_T* values, size_t n, REDUCE_T* block_buffe } template void ShuffleReduceSumGlobal(const label_t* values, size_t n, double* block_buffer); +template void ShuffleReduceSumGlobal(const double* values, size_t n, double* block_buffer); template __global__ void ShuffleReduceMinGlobalKernel(const VAL_T* values, const data_size_t num_value, REDUCE_T* block_buffer) { diff --git a/src/io/config.cpp b/src/io/config.cpp index 72006eb50cdb..8827414c2e99 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -99,6 +99,20 @@ void GetBoostingType(const std::unordered_map& params, } } +void GetDataSampleStrategy(const std::unordered_map& params, std::string* strategy) { + std::string value; + if (Config::GetString(params, "data_sample_strategy", &value)) { + std::transform(value.begin(), value.end(), value.begin(), Common::tolower); + if (value == std::string("goss")) { + *strategy = "goss"; + } else if (value == std::string("bagging")) { + *strategy = "bagging"; + } else { + Log::Fatal("Unknown sample strategy %s", value.c_str()); + } + } +} + void ParseMetrics(const std::string& value, std::vector* out_metric) { std::unordered_set metric_sets; out_metric->clear(); @@ -242,6 +256,7 @@ void Config::Set(const std::unordered_map& params) { GetTaskType(params, &task); GetBoostingType(params, &boosting); + GetDataSampleStrategy(params, &data_sample_strategy); GetObjectiveType(params, &objective); GetMetricType(params, objective, &metric); GetDeviceType(params, &device_type); @@ -423,6 +438,12 @@ void Config::CheckParamConflict() { "Will set min_data_in_leaf to 1."); min_data_in_leaf = 1; } + if (boosting == std::string("goss")) { + boosting = std::string("gbdt"); + data_sample_strategy = std::string("goss"); + Log::Warning("Found boosting=goss. For backwards compatibility reasons, LightGBM interprets this as boosting=gbdt, data_sample_strategy=goss." 
+ "To suppress this warning, set data_sample_strategy=goss instead."); + } } std::string Config::ToString() const { diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index a86abd3a2c1d..b1dbcc378a27 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -186,6 +186,7 @@ const std::unordered_set& Config::parameter_set() { "task", "objective", "boosting", + "data_sample_strategy", "data", "valid", "num_iterations", @@ -762,6 +763,7 @@ const std::unordered_map>& Config::paramet {"task", {"task_type"}}, {"objective", {"objective_type", "app", "application", "loss"}}, {"boosting", {"boosting_type", "boost"}}, + {"data_sample_strategy", {}}, {"data", {"train", "train_data", "train_data_file", "data_filename"}}, {"valid", {"test", "valid_data", "valid_data_file", "test_data", "test_data_file", "valid_filenames"}}, {"num_iterations", {"num_iteration", "n_iter", "num_tree", "num_trees", "num_round", "num_rounds", "nrounds", "num_boost_round", "n_estimators", "max_iter"}}, @@ -899,6 +901,7 @@ const std::unordered_map& Config::ParameterTypes() { {"config", "string"}, {"objective", "string"}, {"boosting", "string"}, + {"data_sample_strategy", "string"}, {"data", "string"}, {"valid", "vector"}, {"num_iterations", "int"}, diff --git a/src/io/cuda/cuda_tree.cpp b/src/io/cuda/cuda_tree.cpp index b7ecee6e6167..196563340ae5 100644 --- a/src/io/cuda/cuda_tree.cpp +++ b/src/io/cuda/cuda_tree.cpp @@ -330,6 +330,10 @@ void CUDATree::SyncLeafOutputFromCUDAToHost() { CopyFromCUDADeviceToHost(leaf_value_.data(), cuda_leaf_value_, leaf_value_.size(), __FILE__, __LINE__); } +void CUDATree::AsConstantTree(double val) { + Tree::AsConstantTree(val); + CopyFromHostToCUDADevice(cuda_leaf_value_, &val, 1, __FILE__, __LINE__); +} } // namespace LightGBM diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 2842551cf2ee..a237e79b2680 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -1495,7 +1495,7 @@ void Dataset::AddFeaturesFrom(Dataset* other) { other->max_bin_by_feature_, other->num_total_features_, -1); num_total_features_ += other->num_total_features_; for (size_t i = 0; i < (other->numeric_feature_map_).size(); ++i) { - int feat_ind = numeric_feature_map_[i]; + int feat_ind = other->numeric_feature_map_[i]; if (feat_ind > -1) { numeric_feature_map_.push_back(feat_ind + num_numeric_features_); } else { diff --git a/src/metric/binary_metric.hpp b/src/metric/binary_metric.hpp index f70a4ef4ac14..037f54ba091a 100644 --- a/src/metric/binary_metric.hpp +++ b/src/metric/binary_metric.hpp @@ -96,7 +96,7 @@ class BinaryMetric: public Metric { return std::vector(1, loss); } - private: + protected: /*! \brief Number of data */ data_size_t num_data_; /*! \brief Pointer of label */ diff --git a/src/metric/cuda/cuda_binary_metric.cpp b/src/metric/cuda/cuda_binary_metric.cpp new file mode 100644 index 000000000000..d526fddeecb2 --- /dev/null +++ b/src/metric/cuda/cuda_binary_metric.cpp @@ -0,0 +1,31 @@ +/*! + * Copyright (c) 2022 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. 
+ */ + +#ifdef USE_CUDA_EXP + +#include "cuda_binary_metric.hpp" + +namespace LightGBM { + +CUDABinaryLoglossMetric::CUDABinaryLoglossMetric(const Config& config): + CUDABinaryMetricInterface(config) {} + +template +std::vector CUDABinaryMetricInterface::Eval(const double* score, const ObjectiveFunction* objective) const { + const double* score_convert = score; + if (objective != nullptr && objective->NeedConvertOutputCUDA()) { + this->score_convert_buffer_.Resize(static_cast(this->num_data_) * static_cast(this->num_class_)); + score_convert = objective->ConvertOutputCUDA(this->num_data_, score, this->score_convert_buffer_.RawData()); + } + double sum_loss = 0.0, sum_weight = 0.0; + this->LaunchEvalKernel(score_convert, &sum_loss, &sum_weight); + const double eval_score = sum_loss / sum_weight; + return std::vector{eval_score}; +} + +} // namespace LightGBM + +#endif // USE_CUDA_EXP diff --git a/src/metric/cuda/cuda_binary_metric.hpp b/src/metric/cuda/cuda_binary_metric.hpp new file mode 100644 index 000000000000..ae50dac381dd --- /dev/null +++ b/src/metric/cuda/cuda_binary_metric.hpp @@ -0,0 +1,57 @@ +/*! + * Copyright (c) 2022 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. + */ + +#ifndef LIGHTGBM_METRIC_CUDA_CUDA_BINARY_METRIC_HPP_ +#define LIGHTGBM_METRIC_CUDA_CUDA_BINARY_METRIC_HPP_ + +#ifdef USE_CUDA_EXP + +#include +#include + +#include + +#include "cuda_regression_metric.hpp" +#include "../binary_metric.hpp" + +namespace LightGBM { + +template +class CUDABinaryMetricInterface: public CUDAPointwiseMetricInterface { + public: + explicit CUDABinaryMetricInterface(const Config& config): CUDAPointwiseMetricInterface(config) {} + + virtual ~CUDABinaryMetricInterface() {} + + std::vector Eval(const double* score, const ObjectiveFunction* objective) const override; +}; + +class CUDABinaryLoglossMetric: public CUDABinaryMetricInterface { + public: + explicit CUDABinaryLoglossMetric(const Config& config); + + virtual ~CUDABinaryLoglossMetric() {} + + __device__ static double MetricOnPointCUDA(label_t label, double score) { + // score should have been converted to probability + if (label <= 0) { + if (1.0f - score > kEpsilon) { + return -log(1.0f - score); + } + } else { + if (score > kEpsilon) { + return -log(score); + } + } + return -log(kEpsilon); + } +}; + +} // namespace LightGBM + +#endif // USE_CUDA_EXP + +#endif // LIGHTGBM_METRIC_CUDA_CUDA_BINARY_METRIC_HPP_ diff --git a/src/metric/cuda/cuda_pointwise_metric.cpp b/src/metric/cuda/cuda_pointwise_metric.cpp new file mode 100644 index 000000000000..aacd85e50e87 --- /dev/null +++ b/src/metric/cuda/cuda_pointwise_metric.cpp @@ -0,0 +1,38 @@ +/*! + * Copyright (c) 2022 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. 
+ */ + +#ifdef USE_CUDA_EXP + +#include "cuda_binary_metric.hpp" +#include "cuda_pointwise_metric.hpp" +#include "cuda_regression_metric.hpp" + +namespace LightGBM { + +template +void CUDAPointwiseMetricInterface::Init(const Metadata& metadata, data_size_t num_data) { + CUDAMetricInterface::Init(metadata, num_data); + const int max_num_reduce_blocks = (this->num_data_ + NUM_DATA_PER_EVAL_THREAD - 1) / NUM_DATA_PER_EVAL_THREAD; + if (this->cuda_weights_ == nullptr) { + reduce_block_buffer_.Resize(max_num_reduce_blocks); + } else { + reduce_block_buffer_.Resize(max_num_reduce_blocks * 2); + } + const int max_num_reduce_blocks_inner = (max_num_reduce_blocks + NUM_DATA_PER_EVAL_THREAD - 1) / NUM_DATA_PER_EVAL_THREAD; + if (this->cuda_weights_ == nullptr) { + reduce_block_buffer_inner_.Resize(max_num_reduce_blocks_inner); + } else { + reduce_block_buffer_inner_.Resize(max_num_reduce_blocks_inner * 2); + } +} + +template void CUDAPointwiseMetricInterface::Init(const Metadata& metadata, data_size_t num_data); +template void CUDAPointwiseMetricInterface::Init(const Metadata& metadata, data_size_t num_data); +template void CUDAPointwiseMetricInterface::Init(const Metadata& metadata, data_size_t num_data); + +} // namespace LightGBM + +#endif // USE_CUDA_EXP diff --git a/src/metric/cuda/cuda_pointwise_metric.cu b/src/metric/cuda/cuda_pointwise_metric.cu new file mode 100644 index 000000000000..4650eb2593cf --- /dev/null +++ b/src/metric/cuda/cuda_pointwise_metric.cu @@ -0,0 +1,69 @@ +/*! + * Copyright (c) 2022 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. + */ + +#ifdef USE_CUDA_EXP + +#include + +#include "cuda_binary_metric.hpp" +#include "cuda_pointwise_metric.hpp" +#include "cuda_regression_metric.hpp" + +namespace LightGBM { + +template +__global__ void EvalKernel(const data_size_t num_data, const label_t* labels, const label_t* weights, + const double* scores, double* reduce_block_buffer) { + __shared__ double shared_mem_buffer[32]; + const data_size_t index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + double point_metric = 0.0; + if (index < num_data) { + point_metric = USE_WEIGHTS ? 
+ CUDA_METRIC::MetricOnPointCUDA(labels[index], scores[index]) * weights[index] : + CUDA_METRIC::MetricOnPointCUDA(labels[index], scores[index]); + } + const double block_sum_point_metric = ShuffleReduceSum(point_metric, shared_mem_buffer, NUM_DATA_PER_EVAL_THREAD); + if (threadIdx.x == 0) { + reduce_block_buffer[blockIdx.x] = block_sum_point_metric; + } + if (USE_WEIGHTS) { + double weight = 0.0; + if (index < num_data) { + weight = static_cast(weights[index]); + const double block_sum_weight = ShuffleReduceSum(weight, shared_mem_buffer, NUM_DATA_PER_EVAL_THREAD); + if (threadIdx.x == 0) { + reduce_block_buffer[blockIdx.x + gridDim.x] = block_sum_weight; + } + } + } +} + +template +void CUDAPointwiseMetricInterface::LaunchEvalKernel(const double* score, double* sum_loss, double* sum_weight) const { + const int num_blocks = (this->num_data_ + NUM_DATA_PER_EVAL_THREAD - 1) / NUM_DATA_PER_EVAL_THREAD; + if (this->cuda_weights_ != nullptr) { + EvalKernel<<>>( + this->num_data_, this->cuda_labels_, this->cuda_weights_, score, reduce_block_buffer_.RawData()); + } else { + EvalKernel<<>>( + this->num_data_, this->cuda_labels_, this->cuda_weights_, score, reduce_block_buffer_.RawData()); + } + ShuffleReduceSumGlobal(reduce_block_buffer_.RawData(), num_blocks, reduce_block_buffer_inner_.RawData()); + CopyFromCUDADeviceToHost(sum_loss, reduce_block_buffer_inner_.RawData(), 1, __FILE__, __LINE__); + *sum_weight = static_cast(this->num_data_); + if (this->cuda_weights_ != nullptr) { + ShuffleReduceSumGlobal(reduce_block_buffer_.RawData() + num_blocks, num_blocks, reduce_block_buffer_inner_.RawData()); + CopyFromCUDADeviceToHost(sum_weight, reduce_block_buffer_inner_.RawData(), 1, __FILE__, __LINE__); + } +} + +template void CUDAPointwiseMetricInterface::LaunchEvalKernel(const double* score, double* sum_loss, double* sum_weight) const; +template void CUDAPointwiseMetricInterface::LaunchEvalKernel(const double* score, double* sum_loss, double* sum_weight) const; +template void CUDAPointwiseMetricInterface::LaunchEvalKernel(const double* score, double* sum_loss, double* sum_weight) const; + +} // namespace LightGBM + +#endif // USE_CUDA_EXP diff --git a/src/metric/cuda/cuda_pointwise_metric.hpp b/src/metric/cuda/cuda_pointwise_metric.hpp new file mode 100644 index 000000000000..4d635da5739e --- /dev/null +++ b/src/metric/cuda/cuda_pointwise_metric.hpp @@ -0,0 +1,43 @@ +/*! + * Copyright (c) 2022 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. 
+ */ + +#ifndef LIGHTGBM_METRIC_CUDA_CUDA_POINTWISE_METRIC_HPP_ +#define LIGHTGBM_METRIC_CUDA_CUDA_POINTWISE_METRIC_HPP_ + +#ifdef USE_CUDA_EXP + +#include +#include + +#include + +#define NUM_DATA_PER_EVAL_THREAD (1024) + +namespace LightGBM { + +template +class CUDAPointwiseMetricInterface: public CUDAMetricInterface { + public: + explicit CUDAPointwiseMetricInterface(const Config& config): CUDAMetricInterface(config), num_class_(config.num_class) {} + + virtual ~CUDAPointwiseMetricInterface() {} + + void Init(const Metadata& metadata, data_size_t num_data) override; + + protected: + void LaunchEvalKernel(const double* score_convert, double* sum_loss, double* sum_weight) const; + + mutable CUDAVector score_convert_buffer_; + CUDAVector reduce_block_buffer_; + CUDAVector reduce_block_buffer_inner_; + const int num_class_; +}; + +} // namespace LightGBM + +#endif // USE_CUDA_EXP + +#endif // LIGHTGBM_METRIC_CUDA_CUDA_POINTWISE_METRIC_HPP_ diff --git a/src/metric/cuda/cuda_regression_metric.cpp b/src/metric/cuda/cuda_regression_metric.cpp new file mode 100644 index 000000000000..15b219160a30 --- /dev/null +++ b/src/metric/cuda/cuda_regression_metric.cpp @@ -0,0 +1,34 @@ +/*! + * Copyright (c) 2022 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. + */ + +#ifdef USE_CUDA_EXP + +#include + +#include "cuda_regression_metric.hpp" + +namespace LightGBM { + +template +std::vector CUDARegressionMetricInterface::Eval(const double* score, const ObjectiveFunction* objective) const { + const double* score_convert = score; + if (objective != nullptr && objective->NeedConvertOutputCUDA()) { + this->score_convert_buffer_.Resize(static_cast(this->num_data_) * static_cast(this->num_class_)); + score_convert = objective->ConvertOutputCUDA(this->num_data_, score, this->score_convert_buffer_.RawData()); + } + double sum_loss = 0.0, sum_weight = 0.0; + this->LaunchEvalKernel(score_convert, &sum_loss, &sum_weight); + const double eval_score = this->AverageLoss(sum_loss, sum_weight); + return std::vector{eval_score}; +} + +CUDARMSEMetric::CUDARMSEMetric(const Config& config): CUDARegressionMetricInterface(config) {} + +CUDAL2Metric::CUDAL2Metric(const Config& config): CUDARegressionMetricInterface(config) {} + +} // namespace LightGBM + +#endif // USE_CUDA_EXP diff --git a/src/metric/cuda/cuda_regression_metric.hpp b/src/metric/cuda/cuda_regression_metric.hpp new file mode 100644 index 000000000000..342e49542eb4 --- /dev/null +++ b/src/metric/cuda/cuda_regression_metric.hpp @@ -0,0 +1,59 @@ +/*! + * Copyright (c) 2022 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. 
+ */ + +#ifndef LIGHTGBM_METRIC_CUDA_CUDA_REGRESSION_METRIC_HPP_ +#define LIGHTGBM_METRIC_CUDA_CUDA_REGRESSION_METRIC_HPP_ + +#ifdef USE_CUDA_EXP + +#include +#include + +#include + +#include "cuda_pointwise_metric.hpp" +#include "../regression_metric.hpp" + +namespace LightGBM { + +template +class CUDARegressionMetricInterface: public CUDAPointwiseMetricInterface { + public: + explicit CUDARegressionMetricInterface(const Config& config): + CUDAPointwiseMetricInterface(config) {} + + virtual ~CUDARegressionMetricInterface() {} + + std::vector Eval(const double* score, const ObjectiveFunction* objective) const override; +}; + +class CUDARMSEMetric: public CUDARegressionMetricInterface { + public: + explicit CUDARMSEMetric(const Config& config); + + virtual ~CUDARMSEMetric() {} + + __device__ inline static double MetricOnPointCUDA(label_t label, double score) { + return (score - label) * (score - label); + } +}; + +class CUDAL2Metric : public CUDARegressionMetricInterface { + public: + explicit CUDAL2Metric(const Config& config); + + virtual ~CUDAL2Metric() {} + + __device__ inline static double MetricOnPointCUDA(label_t label, double score) { + return (score - label) * (score - label); + } +}; + +} // namespace LightGBM + +#endif // USE_CUDA_EXP + +#endif // LIGHTGBM_METRIC_CUDA_CUDA_REGRESSION_METRIC_HPP_ diff --git a/src/metric/metric.cpp b/src/metric/metric.cpp index dacdb0961c87..a393f1b2021a 100644 --- a/src/metric/metric.cpp +++ b/src/metric/metric.cpp @@ -11,17 +11,18 @@ #include "regression_metric.hpp" #include "xentropy_metric.hpp" +#include "cuda/cuda_binary_metric.hpp" +#include "cuda/cuda_regression_metric.hpp" + namespace LightGBM { Metric* Metric::CreateMetric(const std::string& type, const Config& config) { #ifdef USE_CUDA_EXP - if (config.device_type == std::string("cuda_exp")) { + if (config.device_type == std::string("cuda_exp") && config.boosting == std::string("gbdt")) { if (type == std::string("l2")) { - Log::Warning("Metric l2 is not implemented in cuda_exp version. Fall back to evaluation on CPU."); - return new L2Metric(config); + return new CUDAL2Metric(config); } else if (type == std::string("rmse")) { - Log::Warning("Metric rmse is not implemented in cuda_exp version. Fall back to evaluation on CPU."); - return new RMSEMetric(config); + return new CUDARMSEMetric(config); } else if (type == std::string("l1")) { Log::Warning("Metric l1 is not implemented in cuda_exp version. Fall back to evaluation on CPU."); return new L1Metric(config); @@ -38,8 +39,7 @@ Metric* Metric::CreateMetric(const std::string& type, const Config& config) { Log::Warning("Metric poisson is not implemented in cuda_exp version. Fall back to evaluation on CPU."); return new PoissonMetric(config); } else if (type == std::string("binary_logloss")) { - Log::Warning("Metric binary_logloss is not implemented in cuda_exp version. Fall back to evaluation on CPU."); - return new BinaryLoglossMetric(config); + return new CUDABinaryLoglossMetric(config); } else if (type == std::string("binary_error")) { Log::Warning("Metric binary_error is not implemented in cuda_exp version. Fall back to evaluation on CPU."); return new BinaryErrorMetric(config); diff --git a/src/metric/regression_metric.hpp b/src/metric/regression_metric.hpp index 379c36c46aca..3c4124aad4b9 100644 --- a/src/metric/regression_metric.hpp +++ b/src/metric/regression_metric.hpp @@ -101,7 +101,7 @@ class RegressionMetric: public Metric { inline static void CheckLabel(label_t) { } - private: + protected: /*! 
\brief Number of data */ data_size_t num_data_; /*! \brief Pointer of label */ diff --git a/src/objective/cuda/cuda_binary_objective.cu b/src/objective/cuda/cuda_binary_objective.cu index 45fa46cf333d..9726f3eda66d 100644 --- a/src/objective/cuda/cuda_binary_objective.cu +++ b/src/objective/cuda/cuda_binary_objective.cu @@ -182,9 +182,10 @@ __global__ void ConvertOutputCUDAKernel_BinaryLogloss(const double sigmoid, cons } } -void CUDABinaryLogloss::LaunchConvertOutputCUDAKernel(const data_size_t num_data, const double* input, double* output) const { +const double* CUDABinaryLogloss::LaunchConvertOutputCUDAKernel(const data_size_t num_data, const double* input, double* output) const { const int num_blocks = (num_data + GET_GRADIENTS_BLOCK_SIZE_BINARY - 1) / GET_GRADIENTS_BLOCK_SIZE_BINARY; ConvertOutputCUDAKernel_BinaryLogloss<<>>(sigmoid_, num_data, input, output); + return output; } __global__ void ResetOVACUDALabelKernel( diff --git a/src/objective/cuda/cuda_binary_objective.hpp b/src/objective/cuda/cuda_binary_objective.hpp index 0de5be0b6331..77f58d8318f1 100644 --- a/src/objective/cuda/cuda_binary_objective.hpp +++ b/src/objective/cuda/cuda_binary_objective.hpp @@ -33,12 +33,14 @@ class CUDABinaryLogloss : public CUDAObjectiveInterface { void Init(const Metadata& metadata, data_size_t num_data) override; + bool NeedConvertOutputCUDA() const override { return true; } + private: void LaunchGetGradientsKernel(const double* scores, score_t* gradients, score_t* hessians) const override; double LaunchCalcInitScoreKernel(const int class_id) const override; - void LaunchConvertOutputCUDAKernel(const data_size_t num_data, const double* input, double* output) const override; + const double* LaunchConvertOutputCUDAKernel(const data_size_t num_data, const double* input, double* output) const override; void LaunchResetOVACUDALabelKernel() const; diff --git a/src/objective/cuda/cuda_multiclass_objective.cpp b/src/objective/cuda/cuda_multiclass_objective.cpp index 0f70b0d66897..2ea3de870e99 100644 --- a/src/objective/cuda/cuda_multiclass_objective.cpp +++ b/src/objective/cuda/cuda_multiclass_objective.cpp @@ -49,10 +49,11 @@ void CUDAMulticlassOVA::GetGradients(const double* score, score_t* gradients, sc } } -void CUDAMulticlassOVA::ConvertOutputCUDA(const data_size_t num_data, const double* input, double* output) const { +const double* CUDAMulticlassOVA::ConvertOutputCUDA(const data_size_t num_data, const double* input, double* output) const { for (int i = 0; i < num_class_; ++i) { cuda_binary_loss_[i]->ConvertOutputCUDA(num_data, input + i * num_data, output + i * num_data); } + return output; } diff --git a/src/objective/cuda/cuda_multiclass_objective.cu b/src/objective/cuda/cuda_multiclass_objective.cu index 480ce32a9a57..797c7cec7bf0 100644 --- a/src/objective/cuda/cuda_multiclass_objective.cu +++ b/src/objective/cuda/cuda_multiclass_objective.cu @@ -95,11 +95,12 @@ __global__ void ConvertOutputCUDAKernel_MulticlassSoftmax( } } -void CUDAMulticlassSoftmax::LaunchConvertOutputCUDAKernel( +const double* CUDAMulticlassSoftmax::LaunchConvertOutputCUDAKernel( const data_size_t num_data, const double* input, double* output) const { const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE_MULTICLASS - 1) / GET_GRADIENTS_BLOCK_SIZE_MULTICLASS; ConvertOutputCUDAKernel_MulticlassSoftmax<<>>( num_class_, num_data, input, cuda_softmax_buffer_.RawData(), output); + return output; } } // namespace LightGBM diff --git a/src/objective/cuda/cuda_multiclass_objective.hpp 
b/src/objective/cuda/cuda_multiclass_objective.hpp index 37c3087c9e2f..e6e326306e31 100644 --- a/src/objective/cuda/cuda_multiclass_objective.hpp +++ b/src/objective/cuda/cuda_multiclass_objective.hpp @@ -34,7 +34,7 @@ class CUDAMulticlassSoftmax: public CUDAObjectiveInterface { private: void LaunchGetGradientsKernel(const double* scores, score_t* gradients, score_t* hessians) const; - void LaunchConvertOutputCUDAKernel(const data_size_t num_data, const double* input, double* output) const; + const double* LaunchConvertOutputCUDAKernel(const data_size_t num_data, const double* input, double* output) const; // CUDA memory, held by this object CUDAVector cuda_softmax_buffer_; @@ -51,7 +51,7 @@ class CUDAMulticlassOVA: public CUDAObjectiveInterface { void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override; - void ConvertOutputCUDA(const data_size_t num_data, const double* input, double* output) const override; + const double* ConvertOutputCUDA(const data_size_t num_data, const double* input, double* output) const override; double BoostFromScore(int class_id) const override { return cuda_binary_loss_[class_id]->BoostFromScore(0); diff --git a/src/objective/cuda/cuda_regression_objective.cu b/src/objective/cuda/cuda_regression_objective.cu index 8bb257673f7d..99feec132508 100644 --- a/src/objective/cuda/cuda_regression_objective.cu +++ b/src/objective/cuda/cuda_regression_objective.cu @@ -68,9 +68,14 @@ __global__ void ConvertOutputCUDAKernel_Regression(const bool sqrt, const data_s } } -void CUDARegressionL2loss::LaunchConvertOutputCUDAKernel(const data_size_t num_data, const double* input, double* output) const { +const double* CUDARegressionL2loss::LaunchConvertOutputCUDAKernel(const data_size_t num_data, const double* input, double* output) const { const int num_blocks = (num_data + GET_GRADIENTS_BLOCK_SIZE_REGRESSION - 1) / GET_GRADIENTS_BLOCK_SIZE_REGRESSION; - ConvertOutputCUDAKernel_Regression<<>>(sqrt_, num_data, input, output); + if (sqrt_) { + ConvertOutputCUDAKernel_Regression<<>>(sqrt_, num_data, input, output); + return output; + } else { + return input; + } } template @@ -339,9 +344,10 @@ __global__ void ConvertOutputCUDAKernel_Regression_Poisson(const data_size_t num } } -void CUDARegressionPoissonLoss::LaunchConvertOutputCUDAKernel(const data_size_t num_data, const double* input, double* output) const { +const double* CUDARegressionPoissonLoss::LaunchConvertOutputCUDAKernel(const data_size_t num_data, const double* input, double* output) const { const int num_blocks = (num_data + GET_GRADIENTS_BLOCK_SIZE_REGRESSION - 1) / GET_GRADIENTS_BLOCK_SIZE_REGRESSION; ConvertOutputCUDAKernel_Regression_Poisson<<>>(num_data, input, output); + return output; } diff --git a/src/objective/cuda/cuda_regression_objective.hpp b/src/objective/cuda/cuda_regression_objective.hpp index 2e5b9e8506e9..593fcf1cfcb6 100644 --- a/src/objective/cuda/cuda_regression_objective.hpp +++ b/src/objective/cuda/cuda_regression_objective.hpp @@ -49,7 +49,9 @@ class CUDARegressionL2loss : public CUDARegressionObjectiveInterface(config.poisson_max_delta_step); if (sqrt_) { Log::Warning("Cannot use sqrt transform in %s Regression, will auto disable it", GetName()); diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index bd047557dffb..f3fc65d3e4c1 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -376,6 +376,29 @@ def test_add_features_from_different_sources(): assert d1.feature_name 
== res_feature_names +def test_add_features_does_not_fail_if_initial_dataset_has_zero_informative_features(capsys): + + arr_a = np.zeros((100, 1), dtype=np.float32) + arr_b = np.random.normal(size=(100, 5)) + + dataset_a = lgb.Dataset(arr_a).construct() + expected_msg = ( + '[LightGBM] [Warning] There are no meaningful features which satisfy ' + 'the provided configuration. Decreasing Dataset parameters min_data_in_bin ' + 'or min_data_in_leaf and re-constructing Dataset might resolve this warning.\n' + ) + log_lines = capsys.readouterr().out + assert expected_msg in log_lines + + dataset_b = lgb.Dataset(arr_b).construct() + + original_handle = dataset_a.handle.value + dataset_a.add_features_from(dataset_b) + assert dataset_a.num_feature() == 6 + assert dataset_a.num_data() == 100 + assert dataset_a.handle.value == original_handle + + def test_cegb_affects_behavior(tmp_path): X = np.random.random((100, 5)) X[:, [1, 3]] = 0 diff --git a/tests/python_package_test/test_dual.py b/tests/python_package_test/test_dual.py index cd31a7d9b3b9..75c54c83eb94 100644 --- a/tests/python_package_test/test_dual.py +++ b/tests/python_package_test/test_dual.py @@ -2,6 +2,7 @@ """Tests for dual GPU+CPU support.""" import os +import platform import pytest from sklearn.metrics import log_loss @@ -26,9 +27,11 @@ def test_cpu_and_gpu_work(): params_gpu = params_cpu.copy() params_gpu["device"] = "gpu" - params_gpu["gpu_use_dp"] = True + # Double-precision floats are only supported on x86_64 with PoCL + params_gpu["gpu_use_dp"] = (platform.machine() == "x86_64") gpu_bst = lgb.train(params_gpu, data, num_boost_round=10) gpu_score = log_loss(y, gpu_bst.predict(X)) - assert cpu_score == pytest.approx(gpu_score) + rel = 1e-6 if params_gpu["gpu_use_dp"] else 1e-4 + assert cpu_score == pytest.approx(gpu_score, rel=rel) assert gpu_score < 0.242 diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 7d50a1ff390d..93e0b4f648ba 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -21,7 +21,7 @@ import lightgbm as lgb from lightgbm.compat import PANDAS_INSTALLED, pd_DataFrame -from .utils import (SERIALIZERS, dummy_obj, load_boston, load_breast_cancer, load_digits, load_iris, logistic_sigmoid, +from .utils import (SERIALIZERS, dummy_obj, load_breast_cancer, load_digits, load_iris, logistic_sigmoid, make_synthetic_regression, mse_obj, pickle_and_unpickle_object, sklearn_multiclass_custom_objective, softmax) @@ -114,7 +114,8 @@ def test_rf(): @pytest.mark.parametrize('objective', ['regression', 'regression_l1', 'huber', 'fair', 'poisson']) def test_regression(objective): - X, y = load_boston(return_X_y=True) + X, y = make_synthetic_regression() + y = np.abs(y) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) params = { 'objective': objective, @@ -133,13 +134,13 @@ def test_regression(objective): ) ret = mean_squared_error(y_test, gbm.predict(X_test)) if objective == 'huber': - assert ret < 35 + assert ret < 430 elif objective == 'fair': - assert ret < 17 + assert ret < 296 elif objective == 'poisson': - assert ret < 8 + assert ret < 193 else: - assert ret < 7 + assert ret < 338 assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret) @@ -924,7 +925,7 @@ def test_early_stopping_min_delta(first_only, single_metric, greater_is_better): def test_continue_train(): - X, y = load_boston(return_X_y=True) + X, y = make_synthetic_regression() X_train, X_test, y_train, y_test = 
train_test_split(X, y, test_size=0.1, random_state=42) params = { 'objective': 'regression', @@ -948,7 +949,7 @@ def test_continue_train(): init_model='model.txt' ) ret = mean_absolute_error(y_test, gbm.predict(X_test)) - assert ret < 2.0 + assert ret < 13.6 assert evals_result['valid_0']['l1'][-1] == pytest.approx(ret) np.testing.assert_allclose(evals_result['valid_0']['l1'], evals_result['valid_0']['custom_mae']) @@ -968,7 +969,7 @@ def test_continue_train_reused_dataset(): def test_continue_train_dart(): - X, y = load_boston(return_X_y=True) + X, y = make_synthetic_regression() X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) params = { 'boosting_type': 'dart', @@ -989,7 +990,7 @@ def test_continue_train_dart(): init_model=init_gbm ) ret = mean_absolute_error(y_test, gbm.predict(X_test)) - assert ret < 2.0 + assert ret < 13.6 assert evals_result['valid_0']['l1'][-1] == pytest.approx(ret) @@ -1920,10 +1921,12 @@ def test_refit_dataset_params(): np.testing.assert_allclose(stored_weights, refit_weight) -def test_mape_rf(): - X, y = load_boston(return_X_y=True) +@pytest.mark.parametrize('boosting_type', ['rf', 'dart']) +def test_mape_for_specific_boosting_types(boosting_type): + X, y = make_synthetic_regression() + y = abs(y) params = { - 'boosting_type': 'rf', + 'boosting_type': boosting_type, 'objective': 'mape', 'verbose': -1, 'bagging_freq': 1, @@ -1935,25 +1938,9 @@ def test_mape_rf(): gbm = lgb.train(params, lgb_train, num_boost_round=20) pred = gbm.predict(X) pred_mean = pred.mean() - assert pred_mean > 20 - - -def test_mape_dart(): - X, y = load_boston(return_X_y=True) - params = { - 'boosting_type': 'dart', - 'objective': 'mape', - 'verbose': -1, - 'bagging_freq': 1, - 'bagging_fraction': 0.8, - 'feature_fraction': 0.8, - 'boost_from_average': False - } - lgb_train = lgb.Dataset(X, y) - gbm = lgb.train(params, lgb_train, num_boost_round=40) - pred = gbm.predict(X) - pred_mean = pred.mean() - assert pred_mean > 18 + # the following checks that dart and rf with mape can predict outside the 0-1 range + # https://github.com/microsoft/LightGBM/issues/1579 + assert pred_mean > 8 def check_constant_features(y_true, expected_pred, more_params): @@ -2667,19 +2654,22 @@ def test_model_size(): @pytest.mark.skipif(getenv('TASK', '') == 'cuda_exp', reason='Skip due to differences in implementation details of CUDA Experimental version') def test_get_split_value_histogram(): - X, y = load_boston(return_X_y=True) + X, y = make_synthetic_regression() + X = np.repeat(X, 3, axis=0) + y = np.repeat(y, 3, axis=0) + X[:, 2] = np.random.default_rng(0).integers(0, 20, size=X.shape[0]) lgb_train = lgb.Dataset(X, y, categorical_feature=[2]) gbm = lgb.train({'verbose': -1}, lgb_train, num_boost_round=20) # test XGBoost-style return value params = {'feature': 0, 'xgboost_style': True} - assert gbm.get_split_value_histogram(**params).shape == (9, 2) - assert gbm.get_split_value_histogram(bins=999, **params).shape == (9, 2) + assert gbm.get_split_value_histogram(**params).shape == (12, 2) + assert gbm.get_split_value_histogram(bins=999, **params).shape == (12, 2) assert gbm.get_split_value_histogram(bins=-1, **params).shape == (1, 2) assert gbm.get_split_value_histogram(bins=0, **params).shape == (1, 2) assert gbm.get_split_value_histogram(bins=1, **params).shape == (1, 2) assert gbm.get_split_value_histogram(bins=2, **params).shape == (2, 2) - assert gbm.get_split_value_histogram(bins=6, **params).shape == (5, 2) - assert gbm.get_split_value_histogram(bins=7, 
**params).shape == (6, 2) + assert gbm.get_split_value_histogram(bins=6, **params).shape == (6, 2) + assert gbm.get_split_value_histogram(bins=7, **params).shape == (7, 2) if lgb.compat.PANDAS_INSTALLED: np.testing.assert_allclose( gbm.get_split_value_histogram(0, xgboost_style=True).values, @@ -2700,8 +2690,8 @@ def test_get_split_value_histogram(): ) # test numpy-style return value hist, bins = gbm.get_split_value_histogram(0) - assert len(hist) == 23 - assert len(bins) == 24 + assert len(hist) == 20 + assert len(bins) == 21 hist, bins = gbm.get_split_value_histogram(0, bins=999) assert len(hist) == 999 assert len(bins) == 1000 @@ -2790,7 +2780,7 @@ def metrics_combination_cv_regression(metric_list, assumed_iteration, ) assert assumed_iteration == len(ret[list(ret.keys())[0]]) - X, y = load_boston(return_X_y=True) + X, y = make_synthetic_regression() X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) X_test1, X_test2, y_test1, y_test2 = train_test_split(X_test, y_test, test_size=0.5, random_state=73) lgb_train = lgb.Dataset(X_train, y_train) @@ -2798,16 +2788,16 @@ def metrics_combination_cv_regression(metric_list, assumed_iteration, lgb_valid2 = lgb.Dataset(X_test2, y_test2, reference=lgb_train) iter_valid1_l1 = 3 - iter_valid1_l2 = 14 - iter_valid2_l1 = 2 + iter_valid1_l2 = 3 + iter_valid2_l1 = 3 iter_valid2_l2 = 15 - assert len(set([iter_valid1_l1, iter_valid1_l2, iter_valid2_l1, iter_valid2_l2])) == 4 + assert len(set([iter_valid1_l1, iter_valid1_l2, iter_valid2_l1, iter_valid2_l2])) == 2 iter_min_l1 = min([iter_valid1_l1, iter_valid2_l1]) iter_min_l2 = min([iter_valid1_l2, iter_valid2_l2]) iter_min_valid1 = min([iter_valid1_l1, iter_valid1_l2]) - iter_cv_l1 = 4 - iter_cv_l2 = 12 + iter_cv_l1 = 15 + iter_cv_l2 = 13 assert len(set([iter_cv_l1, iter_cv_l2])) == 2 iter_cv_min = min([iter_cv_l1, iter_cv_l2]) @@ -2897,6 +2887,25 @@ def test_node_level_subcol(): assert ret != ret2 +def test_forced_split_feature_indices(tmp_path): + X, y = make_synthetic_regression() + forced_split = { + "feature": 0, + "threshold": 0.5, + "left": {"feature": X.shape[1], "threshold": 0.5}, + } + tmp_split_file = tmp_path / "forced_split.json" + with open(tmp_split_file, "w") as f: + f.write(json.dumps(forced_split)) + lgb_train = lgb.Dataset(X, y) + params = { + "objective": "regression", + "forcedsplits_filename": tmp_split_file + } + with pytest.raises(lgb.basic.LightGBMError, match="Forced splits file includes feature index"): + bst = lgb.train(params, lgb_train) + + def test_forced_bins(): x = np.empty((100, 2)) x[:, 0] = np.arange(0, 1, 0.01) @@ -3153,7 +3162,7 @@ def _imptcs_to_numpy(X, impcts_dict): def test_interaction_constraints(): - X, y = load_boston(return_X_y=True) + X, y = make_synthetic_regression(n_samples=200) num_features = X.shape[1] train_data = lgb.Dataset(X, label=y) # check that constraint containing all features is equivalent to no constraint @@ -3166,9 +3175,7 @@ def test_interaction_constraints(): pred2 = est.predict(X) np.testing.assert_allclose(pred1, pred2) # check that constraint partitioning the features reduces train accuracy - est = lgb.train(dict(params, interaction_constraints=[list(range(num_features // 2)), - list(range(num_features // 2, num_features))]), - train_data, num_boost_round=10) + est = lgb.train(dict(params, interaction_constraints=[[0, 2], [1, 3]]), train_data, num_boost_round=10) pred3 = est.predict(X) assert mean_squared_error(y, pred1) < mean_squared_error(y, pred3) # check that constraints consisting of single 
features reduce accuracy further @@ -3568,7 +3575,7 @@ def hook(obj): @pytest.mark.skipif(getenv('TASK', '') == 'cuda_exp', reason='Forced splits are not yet supported by CUDA Experimental version') def test_force_split_with_feature_fraction(tmp_path): - X, y = load_boston(return_X_y=True) + X, y = make_synthetic_regression() X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) lgb_train = lgb.Dataset(X_train, y_train) @@ -3595,7 +3602,7 @@ def test_force_split_with_feature_fraction(tmp_path): gbm = lgb.train(params, lgb_train) ret = mean_absolute_error(y_test, gbm.predict(X_test)) - assert ret < 2.0 + assert ret < 15.7 tree_info = gbm.dump_model()["tree_info"] assert len(tree_info) > 1 @@ -3604,6 +3611,142 @@ def test_force_split_with_feature_fraction(tmp_path): assert tree_structure['split_feature'] == 0 +def test_goss_boosting_and_strategy_equivalent(): + X, y = make_synthetic_regression(n_samples=10_000, n_features=10, n_informative=5, random_state=42) + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) + lgb_train = lgb.Dataset(X_train, y_train) + lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) + base_params = { + 'metric': 'l2', + 'verbose': -1, + 'bagging_seed': 0, + 'learning_rate': 0.05, + 'num_threads': 1, + 'force_row_wise': True, + 'gpu_use_dp': True, + } + params1 = {**base_params, 'boosting': 'goss'} + evals_result1 = {} + lgb.train(params1, lgb_train, + num_boost_round=10, + valid_sets=lgb_eval, + callbacks=[lgb.record_evaluation(evals_result1)]) + params2 = {**base_params, 'data_sample_strategy': 'goss'} + evals_result2 = {} + lgb.train(params2, lgb_train, + num_boost_round=10, + valid_sets=lgb_eval, + callbacks=[lgb.record_evaluation(evals_result2)]) + assert evals_result1['valid_0']['l2'] == evals_result2['valid_0']['l2'] + + +def test_sample_strategy_with_boosting(): + X, y = make_synthetic_regression(n_samples=10_000, n_features=10, n_informative=5, random_state=42) + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) + lgb_train = lgb.Dataset(X_train, y_train) + lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) + + base_params = { + 'metric': 'l2', + 'verbose': -1, + 'num_threads': 1, + 'force_row_wise': True, + 'gpu_use_dp': True, + } + + params1 = {**base_params, 'boosting': 'dart', 'data_sample_strategy': 'goss'} + evals_result = {} + gbm = lgb.train(params1, lgb_train, + num_boost_round=10, + valid_sets=lgb_eval, + callbacks=[lgb.record_evaluation(evals_result)]) + eval_res1 = evals_result['valid_0']['l2'][-1] + test_res1 = mean_squared_error(y_test, gbm.predict(X_test)) + assert test_res1 == pytest.approx(3149.393862, abs=1.0) + assert eval_res1 == pytest.approx(test_res1) + + params2 = {**base_params, 'boosting': 'gbdt', 'data_sample_strategy': 'goss'} + evals_result = {} + gbm = lgb.train(params2, lgb_train, + num_boost_round=10, + valid_sets=lgb_eval, + callbacks=[lgb.record_evaluation(evals_result)]) + eval_res2 = evals_result['valid_0']['l2'][-1] + test_res2 = mean_squared_error(y_test, gbm.predict(X_test)) + assert test_res2 == pytest.approx(2547.715968, abs=1.0) + assert eval_res2 == pytest.approx(test_res2) + + params3 = {**base_params, 'boosting': 'goss', 'data_sample_strategy': 'goss'} + evals_result = {} + gbm = lgb.train(params3, lgb_train, + num_boost_round=10, + valid_sets=lgb_eval, + callbacks=[lgb.record_evaluation(evals_result)]) + eval_res3 = evals_result['valid_0']['l2'][-1] + test_res3 = 
mean_squared_error(y_test, gbm.predict(X_test)) + assert test_res3 == pytest.approx(2547.715968, abs=1.0) + assert eval_res3 == pytest.approx(test_res3) + + params4 = {**base_params, 'boosting': 'rf', 'data_sample_strategy': 'goss'} + evals_result = {} + gbm = lgb.train(params4, lgb_train, + num_boost_round=10, + valid_sets=lgb_eval, + callbacks=[lgb.record_evaluation(evals_result)]) + eval_res4 = evals_result['valid_0']['l2'][-1] + test_res4 = mean_squared_error(y_test, gbm.predict(X_test)) + assert test_res4 == pytest.approx(2095.538735, abs=1.0) + assert eval_res4 == pytest.approx(test_res4) + + assert test_res1 != test_res2 + assert eval_res1 != eval_res2 + assert test_res2 == test_res3 + assert eval_res2 == eval_res3 + assert eval_res1 != eval_res4 + assert test_res1 != test_res4 + assert eval_res2 != eval_res4 + assert test_res2 != test_res4 + + params5 = {**base_params, 'boosting': 'dart', 'data_sample_strategy': 'bagging', 'bagging_freq': 1, 'bagging_fraction': 0.5} + evals_result = {} + gbm = lgb.train(params5, lgb_train, + num_boost_round=10, + valid_sets=lgb_eval, + callbacks=[lgb.record_evaluation(evals_result)]) + eval_res5 = evals_result['valid_0']['l2'][-1] + test_res5 = mean_squared_error(y_test, gbm.predict(X_test)) + assert test_res5 == pytest.approx(3134.866931, abs=1.0) + assert eval_res5 == pytest.approx(test_res5) + + params6 = {**base_params, 'boosting': 'gbdt', 'data_sample_strategy': 'bagging', 'bagging_freq': 1, 'bagging_fraction': 0.5} + evals_result = {} + gbm = lgb.train(params6, lgb_train, + num_boost_round=10, + valid_sets=lgb_eval, + callbacks=[lgb.record_evaluation(evals_result)]) + eval_res6 = evals_result['valid_0']['l2'][-1] + test_res6 = mean_squared_error(y_test, gbm.predict(X_test)) + assert test_res6 == pytest.approx(2539.792378, abs=1.0) + assert eval_res6 == pytest.approx(test_res6) + assert test_res5 != test_res6 + assert eval_res5 != eval_res6 + + params7 = {**base_params, 'boosting': 'rf', 'data_sample_strategy': 'bagging', 'bagging_freq': 1, 'bagging_fraction': 0.5} + evals_result = {} + gbm = lgb.train(params7, lgb_train, + num_boost_round=10, + valid_sets=lgb_eval, + callbacks=[lgb.record_evaluation(evals_result)]) + eval_res7 = evals_result['valid_0']['l2'][-1] + test_res7 = mean_squared_error(y_test, gbm.predict(X_test)) + assert test_res7 == pytest.approx(1518.704481, abs=1.0) + assert eval_res7 == pytest.approx(test_res7) + assert test_res5 != test_res7 + assert eval_res5 != eval_res7 + assert test_res6 != test_res7 + assert eval_res6 != eval_res7 + + def test_record_evaluation_with_train(): X, y = make_synthetic_regression() ds = lgb.Dataset(X, y) diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index c09be27f1adb..5873bf9112c3 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -21,8 +21,8 @@ import lightgbm as lgb from lightgbm.compat import PANDAS_INSTALLED, pd_DataFrame -from .utils import (load_boston, load_breast_cancer, load_digits, load_iris, load_linnerud, make_ranking, - make_synthetic_regression, sklearn_multiclass_custom_objective, softmax) +from .utils import (load_breast_cancer, load_digits, load_iris, load_linnerud, make_ranking, make_synthetic_regression, + sklearn_multiclass_custom_objective, softmax) decreasing_generator = itertools.count(0, -1) task_to_model_factory = { @@ -112,12 +112,12 @@ def test_binary(): def test_regression(): - X, y = load_boston(return_X_y=True) + X, y = make_synthetic_regression() X_train, 
X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) gbm = lgb.LGBMRegressor(n_estimators=50, verbose=-1) gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], callbacks=[lgb.early_stopping(5)]) ret = mean_squared_error(y_test, gbm.predict(X_test)) - assert ret < 7 + assert ret < 174 assert gbm.evals_result_['valid_0']['l2'][gbm.best_iteration_ - 1] == pytest.approx(ret) @@ -226,12 +226,12 @@ def test_objective_aliases(custom_objective): def test_regression_with_custom_objective(): - X, y = load_boston(return_X_y=True) + X, y = make_synthetic_regression() X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) gbm = lgb.LGBMRegressor(n_estimators=50, verbose=-1, objective=objective_ls) gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], callbacks=[lgb.early_stopping(5)]) ret = mean_squared_error(y_test, gbm.predict(X_test)) - assert ret < 7.0 + assert ret < 174 assert gbm.evals_result_['valid_0']['l2'][gbm.best_iteration_ - 1] == pytest.approx(ret) @@ -249,13 +249,12 @@ def test_binary_classification_with_custom_objective(): def test_dart(): - X, y = load_boston(return_X_y=True) + X, y = make_synthetic_regression() X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) gbm = lgb.LGBMRegressor(boosting_type='dart', n_estimators=50) gbm.fit(X_train, y_train) score = gbm.score(X_test, y_test) - assert score >= 0.8 - assert score <= 1. + assert 0.8 <= score <= 1.0 def test_stacking_classifier(): @@ -280,7 +279,9 @@ def test_stacking_classifier(): def test_stacking_regressor(): - X, y = load_boston(return_X_y=True) + X, y = make_synthetic_regression(n_samples=200) + n_features = X.shape[1] + n_input_models = 2 X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) regressors = [('gbm1', lgb.LGBMRegressor(n_estimators=3)), ('gbm2', lgb.LGBMRegressor(n_estimators=3))] @@ -291,11 +292,11 @@ def test_stacking_regressor(): score = reg.score(X_test, y_test) assert score >= 0.2 assert score <= 1. 
- assert reg.n_features_in_ == 13 # number of input features - assert len(reg.named_estimators_['gbm1'].feature_importances_) == 13 + assert reg.n_features_in_ == n_features # number of input features + assert len(reg.named_estimators_['gbm1'].feature_importances_) == n_features assert reg.named_estimators_['gbm1'].n_features_in_ == reg.named_estimators_['gbm2'].n_features_in_ - assert reg.final_estimator_.n_features_in_ == 15 # number of concatenated features - assert len(reg.final_estimator_.feature_importances_) == 15 + assert reg.final_estimator_.n_features_in_ == n_features + n_input_models # number of concatenated features + assert len(reg.final_estimator_.feature_importances_) == n_features + n_input_models def test_grid_search(): @@ -765,7 +766,8 @@ def test_evaluate_train_set(): def test_metrics(): - X, y = load_boston(return_X_y=True) + X, y = make_synthetic_regression() + y = abs(y) params = {'n_estimators': 2, 'verbose': -1} params_fit = {'X': X, 'y': y, 'eval_set': (X, y)} @@ -1102,7 +1104,7 @@ def fit_and_check(eval_set_names, metric_names, assumed_iteration, first_metric_ else: assert gbm.n_estimators == gbm.best_iteration_ - X, y = load_boston(return_X_y=True) + X, y = make_synthetic_regression(n_samples=300) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) X_test1, X_test2, y_test1, y_test2 = train_test_split(X_test, y_test, test_size=0.5, random_state=72) params = {'n_estimators': 30, @@ -1114,11 +1116,11 @@ def fit_and_check(eval_set_names, metric_names, assumed_iteration, first_metric_ params_fit = {'X': X_train, 'y': y_train} - iter_valid1_l1 = 3 - iter_valid1_l2 = 18 - iter_valid2_l1 = 11 - iter_valid2_l2 = 7 - assert len(set([iter_valid1_l1, iter_valid1_l2, iter_valid2_l1, iter_valid2_l2])) == 4 + iter_valid1_l1 = 4 + iter_valid1_l2 = 4 + iter_valid2_l1 = 2 + iter_valid2_l2 = 2 + assert len(set([iter_valid1_l1, iter_valid1_l2, iter_valid2_l1, iter_valid2_l2])) == 2 iter_min_l1 = min([iter_valid1_l1, iter_valid2_l1]) iter_min_l2 = min([iter_valid1_l2, iter_valid2_l2]) iter_min = min([iter_min_l1, iter_min_l2]) diff --git a/tests/python_package_test/utils.py b/tests/python_package_test/utils.py index e2c0a29effc1..29183713d714 100644 --- a/tests/python_package_test/utils.py +++ b/tests/python_package_test/utils.py @@ -13,11 +13,6 @@ SERIALIZERS = ["pickle", "joblib", "cloudpickle"] -@lru_cache(maxsize=None) -def load_boston(**kwargs): - return sklearn.datasets.load_boston(**kwargs) - - @lru_cache(maxsize=None) def load_breast_cancer(**kwargs): return sklearn.datasets.load_breast_cancer(**kwargs) @@ -119,8 +114,9 @@ def make_ranking(n_samples=100, n_features=20, n_informative=5, gmax=2, @lru_cache(maxsize=None) -def make_synthetic_regression(n_samples=100): - return sklearn.datasets.make_regression(n_samples, n_features=4, n_informative=2, random_state=42) +def make_synthetic_regression(n_samples=100, n_features=4, n_informative=2, random_state=42): + return sklearn.datasets.make_regression(n_samples=n_samples, n_features=n_features, + n_informative=n_informative, random_state=random_state) def dummy_obj(preds, train_data): diff --git a/windows/LightGBM.vcxproj b/windows/LightGBM.vcxproj index 653b86c00ac5..876ddda4cf64 100644 --- a/windows/LightGBM.vcxproj +++ b/windows/LightGBM.vcxproj @@ -253,6 +253,7 @@ + @@ -311,6 +312,7 @@ + diff --git a/windows/LightGBM.vcxproj.filters b/windows/LightGBM.vcxproj.filters index 0f48c7564580..56b4e29287d5 100644 --- a/windows/LightGBM.vcxproj.filters +++ b/windows/LightGBM.vcxproj.filters @@ 
-129,6 +129,9 @@ include\LightGBM + + include\LightGBM + include\LightGBM @@ -311,6 +314,9 @@ src\boosting + + src\boosting + src\io
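
The changes above expose GOSS through the new `data_sample_strategy` parameter and, for backwards compatibility, rewrite `boosting=goss` into `boosting=gbdt` plus `data_sample_strategy=goss` (with a warning). A minimal sketch of how the two spellings are expected to line up, assuming a LightGBM build that contains this change; the synthetic dataset and parameter values are illustrative and loosely follow the new `test_goss_boosting_and_strategy_equivalent` test:

```python
import lightgbm as lgb
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=10_000, n_features=10, n_informative=5, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
train_set = lgb.Dataset(X_train, y_train)
valid_set = lgb.Dataset(X_test, y_test, reference=train_set)

base_params = {
    'metric': 'l2',
    'verbose': -1,
    'learning_rate': 0.05,
    'num_threads': 1,        # single-threaded ...
    'force_row_wise': True,  # ... row-wise histograms keep the two runs deterministic
}

# new spelling: plain GBDT boosting with GOSS as the data sampling strategy
res_new = {}
lgb.train({**base_params, 'boosting': 'gbdt', 'data_sample_strategy': 'goss'},
          train_set, num_boost_round=10, valid_sets=[valid_set],
          callbacks=[lgb.record_evaluation(res_new)])

# legacy spelling: boosting=goss, internally rewritten to the pair above
res_legacy = {}
lgb.train({**base_params, 'boosting': 'goss'},
          train_set, num_boost_round=10, valid_sets=[valid_set],
          callbacks=[lgb.record_evaluation(res_legacy)])

assert res_new['valid_0']['l2'] == res_legacy['valid_0']['l2']
```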
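
The `Bagging` override deleted from `goss.hpp` (its logic now lives in the sampling-strategy classes) only starts GOSS subsampling after a warm-up of roughly `1 / learning_rate` full-data iterations. A tiny sketch of that threshold; the helper name is made up for illustration:

```python
def goss_warmup_iterations(learning_rate: float) -> int:
    """Iterations trained on all rows before GOSS sampling kicks in,
    mirroring the `iter < 1.0f / learning_rate` check in the removed Bagging override."""
    return int(1.0 / learning_rate)


assert goss_warmup_iterations(0.1) == 10   # first 10 iterations use the full data
assert goss_warmup_iterations(0.05) == 20  # a smaller learning rate means a longer warm-up
```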
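
The new CUDA pointwise-metric classes compute a per-row loss on the device, reduce it block by block, and report `sum_loss / sum_weight`, where `sum_weight` falls back to the number of rows when the dataset is unweighted. A NumPy sketch of the quantity those kernels produce, using the clamped binary log-loss from `CUDABinaryLoglossMetric::MetricOnPointCUDA`; the epsilon constant and function names here are assumptions for illustration:

```python
import numpy as np

K_EPSILON = 1e-15  # stand-in for LightGBM's kEpsilon clamp


def binary_logloss_point(label, prob):
    # per-row binary log-loss; scores are assumed to already be converted to probabilities
    p = prob if label > 0 else 1.0 - prob
    return -np.log(p) if p > K_EPSILON else -np.log(K_EPSILON)


def pointwise_metric(labels, probs, weights=None):
    losses = np.array([binary_logloss_point(l, p) for l, p in zip(labels, probs)])
    if weights is None:
        return losses.sum() / len(losses)            # unweighted: sum_weight == num_data
    weights = np.asarray(weights, dtype=float)
    return (losses * weights).sum() / weights.sum()  # weighted average of per-row losses


labels = np.array([1, 0, 1, 0])
probs = np.array([0.9, 0.2, 0.6, 0.4])
print(pointwise_metric(labels, probs))
print(pointwise_metric(labels, probs, weights=[1.0, 2.0, 1.0, 2.0]))
```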
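
On the test side, the deprecated Boston housing loader is removed and the regression tests share the `make_synthetic_regression` helper, whose signature now forwards `n_samples`, `n_features`, `n_informative`, and `random_state` to `sklearn.datasets.make_regression`; the error thresholds are loosened accordingly, since the synthetic targets have a much larger scale than the Boston prices. A short sketch of the updated helper and a typical call, mirroring the `utils.py` change:

```python
from functools import lru_cache

import sklearn.datasets


@lru_cache(maxsize=None)
def make_synthetic_regression(n_samples=100, n_features=4, n_informative=2, random_state=42):
    # cached so repeated test calls with the same arguments reuse one dataset
    return sklearn.datasets.make_regression(n_samples=n_samples, n_features=n_features,
                                            n_informative=n_informative, random_state=random_state)


# e.g. the new sampling-strategy tests request a larger problem than the default
X, y = make_synthetic_regression(n_samples=10_000, n_features=10, n_informative=5, random_state=42)
```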