diff --git a/.appveyor.yml b/.appveyor.yml index 5b64b63e10b6..e02c3e9d8151 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -1,9 +1,9 @@ -version: 3.3.1.99.{build} +version: 3.3.2.99.{build} image: Visual Studio 2015 platform: x64 configuration: # a trick to construct a build matrix with multiple Python versions - - 3.7 + - '3.7' # only build pull requests and # commits to 'master' @@ -27,10 +27,7 @@ install: - set PYTHON_VERSION=%CONFIGURATION% - set CONDA_ENV="test-env" - ps: | - switch ($env:PYTHON_VERSION) { - "3.7" {$env:MINICONDA = "C:\Miniconda37-x64"} - default {$env:MINICONDA = "C:\Miniconda37-x64"} - } + $env:MINICONDA = "C:\Miniconda3-x64" $env:PATH = "$env:MINICONDA;$env:MINICONDA\Scripts;$env:PATH" $env:BUILD_SOURCESDIRECTORY = "$env:APPVEYOR_BUILD_FOLDER" $env:LGB_VER = (Get-Content $env:APPVEYOR_BUILD_FOLDER\VERSION.txt).trim() @@ -38,5 +35,7 @@ install: build: false test_script: + - conda config --add channels conda-forge + - conda config --set channel_priority strict - conda init powershell - powershell.exe -ExecutionPolicy Bypass -File %APPVEYOR_BUILD_FOLDER%\.ci\test_windows.ps1 diff --git a/.ci/lint_r_code.R b/.ci/lint_r_code.R index d477a1a70b9c..54f94bb9a51d 100755 --- a/.ci/lint_r_code.R +++ b/.ci/lint_r_code.R @@ -29,28 +29,40 @@ interactive_text <- paste0( ) LINTERS_TO_USE <- list( - "absolute_path" = lintr::absolute_path_linter - , "assignment" = lintr::assignment_linter - , "closed_curly" = lintr::closed_curly_linter - , "commas" = lintr::commas_linter - , "equals_na" = lintr::equals_na_linter - , "function_left" = lintr::function_left_parentheses_linter - , "implicit_integers" = lintr::implicit_integer_linter - , "infix_spaces" = lintr::infix_spaces_linter + "absolute_path" = lintr::absolute_path_linter() + , "any_duplicated" = lintr::any_duplicated_linter() + , "any_is_na" = lintr::any_is_na_linter() + , "assignment" = lintr::assignment_linter() + , "braces" = lintr::brace_linter() + , "class_equals" = lintr::class_equals_linter() + , "commas" = lintr::commas_linter() + , "duplicate_argument" = lintr::duplicate_argument_linter() + , "equals_na" = lintr::equals_na_linter() + , "function_left" = lintr::function_left_parentheses_linter() + , "implicit_integers" = lintr::implicit_integer_linter() + , "infix_spaces" = lintr::infix_spaces_linter() + , "inner_combine" = lintr::inner_combine_linter() + , "literal_coercion" = lintr::literal_coercion_linter() , "long_lines" = lintr::line_length_linter(length = 120L) - , "no_tabs" = lintr::no_tab_linter - , "non_portable_path" = lintr::nonportable_path_linter - , "open_curly" = lintr::open_curly_linter - , "paren_brace_linter" = lintr::paren_brace_linter - , "semicolon" = lintr::semicolon_terminator_linter - , "seq" = lintr::seq_linter - , "single_quotes" = lintr::single_quotes_linter - , "spaces_inside" = lintr::spaces_inside_linter - , "spaces_left_parens" = lintr::spaces_left_parentheses_linter + , "missing_argument" = lintr::missing_argument_linter() + , "no_tabs" = lintr::no_tab_linter() + , "non_portable_path" = lintr::nonportable_path_linter() + , "numeric_leading_zero" = lintr::numeric_leading_zero_linter() + , "outer_negation" = lintr::outer_negation_linter() + , "package_hooks" = lintr::package_hooks_linter() + , "paste" = lintr::paste_linter() + , "regex_subset" = lintr::regex_subset_linter() + , "semicolon" = lintr::semicolon_linter() + , "seq" = lintr::seq_linter() + , "single_quotes" = lintr::single_quotes_linter() + , "spaces_inside" = lintr::spaces_inside_linter() + , "spaces_left_parens" = 
lintr::spaces_left_parentheses_linter() + , "sprintf" = lintr::sprintf_linter() + , "string_boundary" = lintr::string_boundary_linter() , "todo_comments" = lintr::todo_comment_linter(c("todo", "fixme", "to-do")) - , "trailing_blank" = lintr::trailing_blank_lines_linter - , "trailing_white" = lintr::trailing_whitespace_linter - , "true_false" = lintr::T_and_F_symbol_linter + , "trailing_blank" = lintr::trailing_blank_lines_linter() + , "trailing_white" = lintr::trailing_whitespace_linter() + , "true_false" = lintr::T_and_F_symbol_linter() , "undesirable_function" = lintr::undesirable_function_linter( fun = c( "cat" = "CRAN forbids the use of cat() in packages except in special cases. Use message() or warning()." @@ -58,8 +70,8 @@ LINTERS_TO_USE <- list( "cbind is an unsafe way to build up a data frame. merge() or direct " , "column assignment is preferred." ) - , "dyn.load" = "Directly loading/unloading .dll/.so files in package code should not be necessary." - , "dyn.unload" = "Directly loading/unloading .dll/.so files in package code should not be necessary." + , "dyn.load" = "Directly loading or unloading .dll or .so files in package code should not be necessary." + , "dyn.unload" = "Directly loading or unloading .dll or .so files in package code should not be necessary." , "help" = interactive_text , "ifelse" = "The use of ifelse() is dangerous because it will silently allow mixing types." , "install.packages" = interactive_text @@ -83,7 +95,9 @@ LINTERS_TO_USE <- list( , "??" = interactive_text ) ) - , "unneeded_concatenation" = lintr::unneeded_concatenation_linter + , "unneeded_concatenation" = lintr::unneeded_concatenation_linter() + , "unreachable_code" = lintr::unreachable_code_linter() + , "vector_logic" = lintr::vector_logic_linter() ) noquote(paste0(length(FILES_TO_LINT), " R files need linting")) diff --git a/.ci/run_rhub_solaris_checks.R b/.ci/run_rhub_solaris_checks.R deleted file mode 100644 index 98f1e251eb3a..000000000000 --- a/.ci/run_rhub_solaris_checks.R +++ /dev/null @@ -1,92 +0,0 @@ -args <- commandArgs( - trailingOnly = TRUE -) -package_tarball <- args[[1L]] -log_file <- args[[2L]] -dir.create(dirname(log_file), recursive = TRUE, showWarnings = FALSE) - -email <- c( - 150L, 147L, 145L, 146L, 158L, 145L, 140L, 151L, 137L, 156L, 146L, 159L, 140L, 137L, 141L, 146L, - 143L, 141L, 149L, 157L, 106L, 163L, 153L, 154L, 151L, 139L, 147L, 150L, 88L, 141L, 153L, 151L -) -token <- c( - 91L, 98L, 91L, 142L, 142L, 99L, 96L, 91L, 98L, 94L, 99L, 92L, 94L, 144L, 90L, 139L, - 139L, 143L, 139L, 91L, 99L, 142L, 97L, 93L, 144L, 99L, 139L, 143L, 97L, 99L, 97L, 94L -) - -if (Sys.info()["sysname"] == "Windows") { - null_file <- "NUL" -} else { - null_file <- "/dev/null" -} - -sink(file = null_file) -rhub::validate_email( - email = intToUtf8(email - 42L) - , token = intToUtf8(token - 42L) -) -sink() - -checks_succeeded <- TRUE -platforms <- c( - "solaris-x86-patched" - , "solaris-x86-patched-ods" -) -sink(file = null_file) -for (platform in platforms) { - res_object <- rhub::check( - path = package_tarball - , email = intToUtf8(email - 42L) - , check_args = "--as-cran" - , platform = platform - , env_vars = c( - "R_COMPILE_AND_INSTALL_PACKAGES" = "always" - , "_R_CHECK_SYSTEM_CLOCK_" = 0L - , "_R_CHECK_CRAN_INCOMING_REMOTE_" = 0L - , "_R_CHECK_PKG_SIZES_THRESHOLD_" = 60L - , "_R_CHECK_TOPLEVEL_FILES_" = 0L - ) - , show_status = TRUE - ) - statuses <- res_object[[".__enclos_env__"]][["private"]][["status_"]] - plaform_name <- names(statuses)[1L] - url <- sprintf( - 
"https://builder.r-hub.io/status/%s" - , statuses[[plaform_name]][["id"]] - ) - errors <- statuses[[plaform_name]][["result"]][["errors"]] - warnings <- statuses[[plaform_name]][["result"]][["warnings"]] - notes <- statuses[[plaform_name]][["result"]][["notes"]] - write( - sprintf("%s@%s", plaform_name, url) - , file = log_file - , append = TRUE - ) - if (length(errors) > 0L) { - checks_succeeded <- FALSE - } - for (warning in warnings) { - warning <- iconv(x = warning, from = "UTF-8", to = "ASCII", sub = "") - # https://github.com/r-hub/rhub/issues/113 - if (!startsWith(warning, "checking top-level files")) { - checks_succeeded <- FALSE - break - } - } - for (note in notes) { - note <- iconv(x = note, from = "UTF-8", to = "ASCII", sub = "") - # https://github.com/r-hub/rhub/issues/415 - if (!(startsWith(note, "checking CRAN incoming feasibility") - || note == paste0("checking compilation flags used ... NOTE\n" - , "Compilation used the following non-portable flag(s):\n -march=pentiumpro"))) { - checks_succeeded <- FALSE - break - } - } - if (!checks_succeeded) { - break - } -} -sink() - -quit(save = "no", status = as.integer(!checks_succeeded)) diff --git a/.ci/setup.sh b/.ci/setup.sh index e038be53417f..c1e39b67d1a0 100755 --- a/.ci/setup.sh +++ b/.ci/setup.sh @@ -17,8 +17,10 @@ if [[ $OS_NAME == "macos" ]]; then if [[ $TASK == "swig" ]]; then brew install swig fi - brew install graphviz - curl -sL -o conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh + curl \ + -sL \ + -o miniforge.sh \ + https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-x86_64.sh else # Linux if [[ $IN_UBUNTU_LATEST_CONTAINER == "true" ]]; then # fixes error "unable to initialize frontend: Dialog" @@ -42,9 +44,6 @@ else # Linux libicu66 \ libssl1.1 \ libunwind8 \ - libxau6 \ - libxext6 \ - libxrender1 \ locales \ netcat \ unzip \ @@ -72,25 +71,16 @@ else # Linux sudo apt-get install --no-install-recommends -y \ libboost1.74-dev \ ocl-icd-opencl-dev - cd $BUILD_DIRECTORY # to avoid permission errors - curl -sL -o AMD-APP-SDKInstaller.tar.bz2 https://github.com/microsoft/LightGBM/releases/download/v2.0.12/AMD-APP-SDKInstaller-v3.0.130.136-GA-linux64.tar.bz2 - tar -xjf AMD-APP-SDKInstaller.tar.bz2 - mkdir -p $OPENCL_VENDOR_PATH - mkdir -p $AMDAPPSDK_PATH - sh AMD-APP-SDK*.sh --tar -xf -C $AMDAPPSDK_PATH - mv $AMDAPPSDK_PATH/lib/x86_64/sdk/* $AMDAPPSDK_PATH/lib/x86_64/ - echo libamdocl64.so > $OPENCL_VENDOR_PATH/amdocl64.icd + if [[ $IN_UBUNTU_LATEST_CONTAINER == "true" ]]; then + sudo apt-get install --no-install-recommends -y \ + pocl-opencl-icd + fi fi - ARCH=$(uname -m) - if [[ $TASK == "cuda" ]]; then + if [[ $TASK == "cuda" || $TASK == "cuda_exp" ]]; then echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections apt-get update apt-get install --no-install-recommends -y \ curl \ - graphviz \ - libxau6 \ - libxext6 \ - libxrender1 \ lsb-release \ software-properties-common if [[ $COMPILER == "clang" ]]; then @@ -98,40 +88,24 @@ else # Linux clang \ libomp-dev fi - curl \ - -s \ - -L \ - --insecure \ - https://apt.kitware.com/keys/kitware-archive-latest.asc \ - | apt-key add - + curl -sL https://apt.kitware.com/keys/kitware-archive-latest.asc | apt-key add - apt-add-repository "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main" -y - apt-get --allow-unauthenticated upgrade -y - apt-get --allow-unauthenticated update -y + apt-get update apt-get install --no-install-recommends -y \ cmake - else - if [[ $ARCH != "x86_64" ]]; then - 
yum update -y - yum install -y \ - graphviz - else - sudo apt-get update - sudo apt-get install --no-install-recommends -y \ - graphviz - fi fi if [[ $SETUP_CONDA != "false" ]]; then - if [[ $ARCH == "x86_64" ]]; then - curl -sL -o conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh - else - curl -sL -o conda.sh https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-${ARCH}.sh - fi + ARCH=$(uname -m) + curl \ + -sL \ + -o miniforge.sh \ + https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-${ARCH}.sh fi fi -if [[ "${TASK}" != "r-package" ]]; then +if [[ "${TASK}" != "r-package" ]] && [[ "${TASK}" != "r-rchk" ]]; then if [[ $SETUP_CONDA != "false" ]]; then - sh conda.sh -b -p $CONDA + sh miniforge.sh -b -p $CONDA fi conda config --set always_yes yes --set changeps1 no conda update -q -y conda diff --git a/.ci/test.sh b/.ci/test.sh index e3d385ab8b71..e549416d55da 100755 --- a/.ci/test.sh +++ b/.ci/test.sh @@ -36,8 +36,15 @@ cd $BUILD_DIRECTORY if [[ $TASK == "check-docs" ]] || [[ $TASK == "check-links" ]]; then cd $BUILD_DIRECTORY/docs - conda install -q -y -n $CONDA_ENV -c conda-forge doxygen rstcheck - pip install --user -r requirements.txt + conda env update \ + -n $CONDA_ENV \ + --file ./env.yml || exit -1 + conda install \ + -q \ + -y \ + -n $CONDA_ENV \ + doxygen \ + rstcheck || exit -1 # check reStructuredText formatting cd $BUILD_DIRECTORY/python-package rstcheck --report warning $(find . -type f -name "*.rst") || exit -1 @@ -62,16 +69,13 @@ fi if [[ $TASK == "lint" ]]; then conda install -q -y -n $CONDA_ENV \ + cmakelint \ + cpplint \ + isort \ + mypy \ pycodestyle \ pydocstyle \ - r-stringi # stringi needs to be installed separate from r-lintr to avoid issues like 'unable to load shared object stringi.so' - # r-xfun below has to be upgraded because lintr requires > 0.19 for that package - conda install -q -y -n $CONDA_ENV \ - -c conda-forge \ - libxml2 \ - "r-xfun>=0.19" \ - "r-lintr>=2.0" - pip install --user cmakelint cpplint isort mypy + "r-lintr>=3.0" echo "Linting Python code" pycodestyle --ignore=E501,W503 --exclude=./.nuget,./external_libs . || exit -1 pydocstyle --convention=numpy --add-ignore=D105 --match-dir="^(?!^external_libs|test|example).*" --match="(?!^test_|setup).*\.py" . || exit -1 @@ -114,8 +118,22 @@ if [[ $TASK == "swig" ]]; then exit 0 fi -conda install -q -y -n $CONDA_ENV cloudpickle "dask=2021.9.1" "distributed=2021.9.1" joblib matplotlib numpy pandas psutil pytest scikit-learn scipy -pip install graphviz # python-graphviz from Anaconda is not allowed to be installed with Python 3.9 +conda install -q -y -n $CONDA_ENV \ + cloudpickle \ + dask \ + distributed \ + joblib \ + matplotlib \ + numpy \ + pandas \ + psutil \ + pytest \ + scikit-learn \ + scipy || exit -1 + +# python-graphviz has to be installed separately to prevent conda from downgrading to pypy +conda install -q -y -n $CONDA_ENV \ + python-graphviz || exit -1 if [[ $OS_NAME == "macos" ]] && [[ $COMPILER == "clang" ]]; then # fix "OMP: Error #15: Initializing libiomp5.dylib, but found libomp.dylib already initialized." 
(OpenMP library conflict due to conda's MKL) @@ -161,32 +179,52 @@ if [[ $TASK == "gpu" ]]; then grep -q 'std::string device_type = "gpu"' $BUILD_DIRECTORY/include/LightGBM/config.h || exit -1 # make sure that changes were really done if [[ $METHOD == "pip" ]]; then cd $BUILD_DIRECTORY/python-package && python setup.py sdist || exit -1 - pip install --user $BUILD_DIRECTORY/python-package/dist/lightgbm-$LGB_VER.tar.gz -v --install-option=--gpu --install-option="--opencl-include-dir=$AMDAPPSDK_PATH/include/" || exit -1 + pip install --user $BUILD_DIRECTORY/python-package/dist/lightgbm-$LGB_VER.tar.gz -v --install-option=--gpu || exit -1 pytest $BUILD_DIRECTORY/tests/python_package_test || exit -1 exit 0 elif [[ $METHOD == "wheel" ]]; then - cd $BUILD_DIRECTORY/python-package && python setup.py bdist_wheel --gpu --opencl-include-dir="$AMDAPPSDK_PATH/include/" || exit -1 + cd $BUILD_DIRECTORY/python-package && python setup.py bdist_wheel --gpu || exit -1 pip install --user $BUILD_DIRECTORY/python-package/dist/lightgbm-$LGB_VER*.whl -v || exit -1 pytest $BUILD_DIRECTORY/tests || exit -1 exit 0 elif [[ $METHOD == "source" ]]; then - cmake -DUSE_GPU=ON -DOpenCL_INCLUDE_DIR=$AMDAPPSDK_PATH/include/ .. + cmake -DUSE_GPU=ON .. + fi +elif [[ $TASK == "cuda" || $TASK == "cuda_exp" ]]; then + if [[ $TASK == "cuda" ]]; then + sed -i'.bak' 's/std::string device_type = "cpu";/std::string device_type = "cuda";/' $BUILD_DIRECTORY/include/LightGBM/config.h + grep -q 'std::string device_type = "cuda"' $BUILD_DIRECTORY/include/LightGBM/config.h || exit -1 # make sure that changes were really done + else + sed -i'.bak' 's/std::string device_type = "cpu";/std::string device_type = "cuda_exp";/' $BUILD_DIRECTORY/include/LightGBM/config.h + grep -q 'std::string device_type = "cuda_exp"' $BUILD_DIRECTORY/include/LightGBM/config.h || exit -1 # make sure that changes were really done + # by default ``gpu_use_dp=false`` for efficiency. 
change to ``true`` here for exact results in ci tests + sed -i'.bak' 's/gpu_use_dp = false;/gpu_use_dp = true;/' $BUILD_DIRECTORY/include/LightGBM/config.h + grep -q 'gpu_use_dp = true' $BUILD_DIRECTORY/include/LightGBM/config.h || exit -1 # make sure that changes were really done fi -elif [[ $TASK == "cuda" ]]; then - sed -i'.bak' 's/std::string device_type = "cpu";/std::string device_type = "cuda";/' $BUILD_DIRECTORY/include/LightGBM/config.h - grep -q 'std::string device_type = "cuda"' $BUILD_DIRECTORY/include/LightGBM/config.h || exit -1 # make sure that changes were really done if [[ $METHOD == "pip" ]]; then cd $BUILD_DIRECTORY/python-package && python setup.py sdist || exit -1 - pip install --user $BUILD_DIRECTORY/python-package/dist/lightgbm-$LGB_VER.tar.gz -v --install-option=--cuda || exit -1 + if [[ $TASK == "cuda" ]]; then + pip install --user $BUILD_DIRECTORY/python-package/dist/lightgbm-$LGB_VER.tar.gz -v --install-option=--cuda || exit -1 + else + pip install --user $BUILD_DIRECTORY/python-package/dist/lightgbm-$LGB_VER.tar.gz -v --install-option=--cuda-exp || exit -1 + fi pytest $BUILD_DIRECTORY/tests/python_package_test || exit -1 exit 0 elif [[ $METHOD == "wheel" ]]; then - cd $BUILD_DIRECTORY/python-package && python setup.py bdist_wheel --cuda || exit -1 + if [[ $TASK == "cuda" ]]; then + cd $BUILD_DIRECTORY/python-package && python setup.py bdist_wheel --cuda || exit -1 + else + cd $BUILD_DIRECTORY/python-package && python setup.py bdist_wheel --cuda-exp || exit -1 + fi pip install --user $BUILD_DIRECTORY/python-package/dist/lightgbm-$LGB_VER*.whl -v || exit -1 pytest $BUILD_DIRECTORY/tests || exit -1 exit 0 elif [[ $METHOD == "source" ]]; then - cmake -DUSE_CUDA=ON .. + if [[ $TASK == "cuda" ]]; then + cmake -DUSE_CUDA=ON .. + else + cmake -DUSE_CUDA_EXP=ON .. 
+ fi fi elif [[ $TASK == "mpi" ]]; then if [[ $METHOD == "pip" ]]; then @@ -229,7 +267,11 @@ import matplotlib\ matplotlib.use\(\"Agg\"\)\ ' plot_example.py # prevent interactive window mode sed -i'.bak' 's/graph.render(view=True)/graph.render(view=False)/' plot_example.py - conda install -q -y -n $CONDA_ENV h5py ipywidgets notebook # requirements for examples + # requirements for examples + conda install -q -y -n $CONDA_ENV \ + h5py \ + ipywidgets \ + notebook for f in *.py **/*.py; do python $f || exit -1; done # run all examples cd $BUILD_DIRECTORY/examples/python-guide/notebooks sed -i'.bak' 's/INTERACTIVE = False/assert False, \\"Interactive mode disabled\\"/' interactive_plot_example.ipynb diff --git a/.ci/test_r_package.sh b/.ci/test_r_package.sh index e7d36e59ceeb..ad0de4f5a9ad 100755 --- a/.ci/test_r_package.sh +++ b/.ci/test_r_package.sh @@ -1,7 +1,7 @@ #!/bin/bash # set up R environment -CRAN_MIRROR="https://cloud.r-project.org/" +CRAN_MIRROR="https://cran.rstudio.com" R_LIB_PATH=~/Rlib mkdir -p $R_LIB_PATH export R_LIBS=$R_LIB_PATH @@ -17,11 +17,13 @@ fi R_MAJOR_VERSION=( ${R_VERSION//./ } ) if [[ "${R_MAJOR_VERSION}" == "3" ]]; then export R_MAC_VERSION=3.6.3 + export R_MAC_PKG_URL=${CRAN_MIRROR}/bin/macosx/R-${R_MAC_VERSION}.pkg export R_LINUX_VERSION="3.6.3-1bionic" export R_APT_REPO="bionic-cran35/" elif [[ "${R_MAJOR_VERSION}" == "4" ]]; then - export R_MAC_VERSION=4.1.1 - export R_LINUX_VERSION="4.1.1-1.2004.0" + export R_MAC_VERSION=4.1.3 + export R_MAC_PKG_URL=${CRAN_MIRROR}/bin/macosx/base/R-${R_MAC_VERSION}.pkg + export R_LINUX_VERSION="4.1.3-1.2004.0" export R_APT_REPO="focal-cran40/" else echo "Unrecognized R version: ${R_VERSION}" @@ -38,7 +40,7 @@ if [[ $OS_NAME == "linux" ]]; then --keyserver keyserver.ubuntu.com \ --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 sudo add-apt-repository \ - "deb https://cloud.r-project.org/bin/linux/ubuntu ${R_APT_REPO}" + "deb ${CRAN_MIRROR}/bin/linux/ubuntu ${R_APT_REPO}" sudo apt-get update sudo apt-get install \ --no-install-recommends \ @@ -66,20 +68,33 @@ fi if [[ $OS_NAME == "macos" ]]; then brew update-reset && brew update if [[ $R_BUILD_TYPE == "cran" ]]; then - brew install automake + brew install automake || exit -1 fi brew install \ checkbashisms \ - qpdf - brew install --cask basictex + qpdf || exit -1 + brew install --cask basictex || exit -1 export PATH="/Library/TeX/texbin:$PATH" - sudo tlmgr --verify-repo=none update --self - sudo tlmgr --verify-repo=none install inconsolata helvetic + sudo tlmgr --verify-repo=none update --self || exit -1 + sudo tlmgr --verify-repo=none install inconsolata helvetic || exit -1 - curl -sL https://cran.r-project.org/bin/macosx/R-${R_MAC_VERSION}.pkg -o R.pkg + curl -sL ${R_MAC_PKG_URL} -o R.pkg || exit -1 sudo installer \ -pkg $(pwd)/R.pkg \ - -target / + -target / || exit -1 + + # Older R versions (<= 4.1.2) on newer macOS (>= 11.0.0) cannot create the necessary symlinks. + # See https://github.com/r-lib/actions/issues/412. 
+ if [[ $(sw_vers -productVersion | head -c2) -ge "11" ]]; then + sudo ln \ + -sf \ + /Library/Frameworks/R.framework/Resources/bin/R \ + /usr/local/bin/R + sudo ln \ + -sf \ + /Library/Frameworks/R.framework/Resources/bin/Rscript \ + /usr/local/bin/Rscript + fi # Fix "duplicate libomp versions" issue on Mac # by replacing the R libomp.dylib with a symlink to the one installed with brew @@ -92,13 +107,13 @@ if [[ $OS_NAME == "macos" ]]; then fi fi -# Manually install Depends and Imports libraries + 'knitr', 'rmarkdown', 'testthat' +# Manually install Depends and Imports libraries + 'knitr', 'RhpcBLASctl', 'rmarkdown', 'testthat' # to avoid a CI-time dependency on devtools (for devtools::install_deps()) # NOTE: testthat is not required when running rchk if [[ "${TASK}" == "r-rchk" ]]; then - packages="c('data.table', 'jsonlite', 'knitr', 'Matrix', 'R6', 'rmarkdown')" + packages="c('data.table', 'jsonlite', 'knitr', 'Matrix', 'R6', 'RhpcBLASctl', 'rmarkdown')" else - packages="c('data.table', 'jsonlite', 'knitr', 'Matrix', 'R6', 'rmarkdown', 'testthat')" + packages="c('data.table', 'jsonlite', 'knitr', 'Matrix', 'R6', 'RhpcBLASctl', 'rmarkdown', 'testthat')" fi compile_from_source="both" if [[ $OS_NAME == "macos" ]]; then diff --git a/.ci/test_r_package_solaris.sh b/.ci/test_r_package_solaris.sh deleted file mode 100755 index 18ed6cb2f7ad..000000000000 --- a/.ci/test_r_package_solaris.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash - -apt-get install --no-install-recommends -y \ - libcurl4-openssl-dev \ - libxml2-dev \ - libssl-dev - -# installation of dependencies needs to happen before building the package, -# since `R CMD build` needs to install the package to build vignettes -Rscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'rmarkdown', 'rhub', 'testthat'), dependencies = c('Depends', 'Imports', 'LinkingTo'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" || exit -1 - -sh build-cran-package.sh || exit -1 - -log_file="rhub_logs.txt" -Rscript ./.ci/run_rhub_solaris_checks.R lightgbm_*.tar.gz $log_file || exit -1 diff --git a/.ci/test_r_package_valgrind.sh b/.ci/test_r_package_valgrind.sh index e7a6cb027d2d..528a9af48d63 100755 --- a/.ci/test_r_package_valgrind.sh +++ b/.ci/test_r_package_valgrind.sh @@ -1,6 +1,6 @@ #!/bin/bash -RDscriptvalgrind -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'rmarkdown', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" || exit -1 +RDscriptvalgrind -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'RhpcBLASctl', 'rmarkdown', 'testthat'), repos = 'https://cran.rstudio.com', Ncpus = parallel::detectCores())" || exit -1 sh build-cran-package.sh \ --r-executable=RDvalgrind \ || exit -1 diff --git a/.ci/test_r_package_windows.ps1 b/.ci/test_r_package_windows.ps1 index 62562b471345..81573957f800 100644 --- a/.ci/test_r_package_windows.ps1 +++ b/.ci/test_r_package_windows.ps1 @@ -78,7 +78,7 @@ if ($env:R_MAJOR_VERSION -eq "3") { $env:RTOOLS_BIN = "$RTOOLS_INSTALL_PATH\usr\bin" $env:RTOOLS_MINGW_BIN = "$RTOOLS_INSTALL_PATH\mingw64\bin" $env:RTOOLS_EXE_FILE = "rtools40v2-x86_64.exe" - $env:R_WINDOWS_VERSION = "4.1.1" + $env:R_WINDOWS_VERSION = "4.1.3" } else { Write-Output "[ERROR] Unrecognized R version: $env:R_VERSION" Check-Output $false @@ -87,7 +87,7 @@ if ($env:R_MAJOR_VERSION -eq "3") { $env:R_LIB_PATH = "$env:BUILD_SOURCESDIRECTORY/RLibrary" -replace '[\\]', '/' $env:R_LIBS = "$env:R_LIB_PATH" $env:PATH = 
"$env:RTOOLS_BIN;" + "$env:RTOOLS_MINGW_BIN;" + "$env:R_LIB_PATH/R/bin/x64;" + "$env:R_LIB_PATH/miktex/texmfs/install/miktex/bin/x64;" + $env:PATH -$env:CRAN_MIRROR = "https://cloud.r-project.org/" +$env:CRAN_MIRROR = "https://cran.rstudio.com" $env:CTAN_MIRROR = "https://ctan.math.illinois.edu/systems/win32/miktex" $env:CTAN_PACKAGE_ARCHIVE = "$env:CTAN_MIRROR/tm/packages/" $env:MIKTEX_EXCEPTION_PATH = "$env:TEMP\miktex" @@ -109,7 +109,7 @@ tzutil /s "GMT Standard Time" # download R and RTools Write-Output "Downloading R and Rtools" -Download-File-With-Retries -url "https://cran.r-project.org/bin/windows/base/old/$env:R_WINDOWS_VERSION/R-$env:R_WINDOWS_VERSION-win.exe" -destfile "R-win.exe" +Download-File-With-Retries -url "$env:CRAN_MIRROR/bin/windows/base/old/$env:R_WINDOWS_VERSION/R-$env:R_WINDOWS_VERSION-win.exe" -destfile "R-win.exe" Download-File-With-Retries -url "https://github.com/microsoft/LightGBM/releases/download/v2.0.12/$env:RTOOLS_EXE_FILE" -destfile "Rtools.exe" # Install R @@ -122,7 +122,7 @@ Start-Process -FilePath Rtools.exe -NoNewWindow -Wait -ArgumentList "/VERYSILENT Write-Output "Done installing Rtools" Write-Output "Installing dependencies" -$packages = "c('data.table', 'jsonlite', 'knitr', 'Matrix', 'processx', 'R6', 'rmarkdown', 'testthat'), dependencies = c('Imports', 'Depends', 'LinkingTo')" +$packages = "c('data.table', 'jsonlite', 'knitr', 'Matrix', 'processx', 'R6', 'RhpcBLASctl', 'rmarkdown', 'testthat'), dependencies = c('Imports', 'Depends', 'LinkingTo')" Run-R-Code-Redirect-Stderr "options(install.packages.check.source = 'no'); install.packages($packages, repos = '$env:CRAN_MIRROR', type = 'binary', lib = '$env:R_LIB_PATH', Ncpus = parallel::detectCores())" ; Check-Output $? # MiKTeX and pandoc can be skipped on non-MinGW builds, since we don't diff --git a/.ci/test_windows.ps1 b/.ci/test_windows.ps1 index 51488145dc17..fec559d86a5e 100644 --- a/.ci/test_windows.ps1 +++ b/.ci/test_windows.ps1 @@ -50,7 +50,9 @@ if ($env:TASK -eq "swig") { Exit 0 } -conda install -q -y -n $env:CONDA_ENV joblib matplotlib numpy pandas psutil pytest python-graphviz scikit-learn scipy ; Check-Output $? +conda install -q -y -n $env:CONDA_ENV cloudpickle joblib numpy pandas psutil pytest scikit-learn scipy ; Check-Output $? +# matplotlib and python-graphviz have to be installed separately to prevent conda from downgrading to pypy +conda install -q -y -n $env:CONDA_ENV matplotlib python-graphviz ; Check-Output $? 
if ($env:TASK -eq "regular") { mkdir $env:BUILD_SOURCESDIRECTORY/build; cd $env:BUILD_SOURCESDIRECTORY/build diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 1242a25a5519..ce6da9f6e7fb 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -10,27 +10,27 @@ * @guolinke @StrikerRUS @jameslamb @shiyu1994 # other catch-alls that will get matched if specific rules below are not matched -*.R @Laurae2 @jameslamb -*.py @StrikerRUS @chivee @wxchan @henry0312 @shiyu1994 @hzy46 @tongwu-msft -*.cpp @guolinke @chivee @btrotta @shiyu1994 @hzy46 @tongwu-msft -*.h @guolinke @chivee @btrotta @shiyu1994 @hzy46 @tongwu-msft +*.R @jameslamb @jmoralez +*.py @StrikerRUS @jmoralez @jameslamb @shiyu1994 +*.cpp @guolinke @shiyu1994 +*.h @guolinke @shiyu1994 # main C++ code -include/ @guolinke @chivee @btrotta @shiyu1994 @hzy46 @tongwu-msft -src/ @guolinke @chivee @btrotta @shiyu1994 @hzy46 @tongwu-msft -CMakeLists.txt @guolinke @chivee @Laurae2 @jameslamb @wxchan @henry0312 @StrikerRUS @huanzhang12 @btrotta @shiyu1994 @hzy46 @tongwu-msft -tests/c_api_test/ @guolinke @chivee @btrotta @shiyu1994 @hzy46 @tongwu-msft -tests/cpp_tests/ @guolinke @chivee @btrotta @shiyu1994 @hzy46 @tongwu-msft -tests/data/ @guolinke @chivee @btrotta @shiyu1994 @hzy46 @tongwu-msft -windows/ @guolinke @chivee @btrotta @StrikerRUS @shiyu1994 @hzy46 @tongwu-msft +include/ @guolinke @shiyu1994 +src/ @guolinke @shiyu1994 +CMakeLists.txt @guolinke @jameslamb @StrikerRUS @shiyu1994 +tests/c_api_test/ @guolinke @shiyu1994 +tests/cpp_tests/ @guolinke @shiyu1994 +tests/data/ @guolinke @shiyu1994 +windows/ @guolinke @StrikerRUS @shiyu1994 # R code -build_r.R @jameslamb @StrikerRUS -build-cran-package.sh @jameslamb @StrikerRUS -R-package/ @Laurae2 @jameslamb +build_r.R @jameslamb @StrikerRUS @jmoralez +build-cran-package.sh @jameslamb @StrikerRUS @jmoralez +R-package/ @jameslamb @jmoralez # Python code -python-package/ @StrikerRUS @chivee @wxchan @henry0312 @shiyu1994 @jameslamb @hzy46 @tongwu-msft +python-package/ @StrikerRUS @shiyu1994 @jameslamb @jmoralez # Dask integration python-package/lightgbm/dask.py @jameslamb @jmoralez @@ -42,21 +42,21 @@ helpers/ @StrikerRUS @guolinke # CI administrative stuff .ci/ @StrikerRUS @jameslamb docs/ @StrikerRUS @jameslamb -examples/ @StrikerRUS @jameslamb @guolinke +examples/ @StrikerRUS @jameslamb @guolinke @jmoralez *.yml @StrikerRUS @jameslamb .vsts-ci.yml @StrikerRUS @jameslamb # docker setup docker/ @StrikerRUS @jameslamb -docker/dockerfile-cli @guolinke @chivee @shiyu1994 -docker/gpu/ @huanzhang12 -docker/dockerfile-python @StrikerRUS @chivee @wxchan @henry0312 @shiyu1994 -docker/dockerfile-r @Laurae2 @jameslamb +docker/dockerfile-cli @guolinke @shiyu1994 @StrikerRUS @jameslamb +docker/gpu/ @StrikerRUS @jameslamb +docker/dockerfile-python @StrikerRUS @shiyu1994 @jameslamb @jmoralez +docker/dockerfile-r @jameslamb @jmoralez # GPU code -docs/GPU-*.rst @huanzhang12 -src/treelearner/gpu_tree_learner.cpp @huanzhang12 @guolinke @chivee @shiyu1994 -src/treelearner/tree_learner.cpp @huanzhang12 @guolinke @chivee @shiyu1994 +docs/GPU-*.rst @shiyu1994 @guolinke +src/treelearner/gpu_tree_learner.cpp @guolinke @shiyu1994 +src/treelearner/tree_learner.cpp @guolinke @shiyu1994 # JAVA code -swig/ @guolinke @chivee @shiyu1994 +swig/ @guolinke @shiyu1994 diff --git a/.github/no-response.yml b/.github/no-response.yml deleted file mode 100644 index dbfcf698738f..000000000000 --- a/.github/no-response.yml +++ /dev/null @@ -1,11 +0,0 @@ -# Configuration for probot-no-response - 
https://github.com/probot/no-response - -# Number of days of inactivity before an Issue is closed for lack of response -daysUntilClose: 30 -# Label requiring a response -responseRequiredLabel: awaiting response -# Comment to post when closing an Issue for lack of response. Set to `false` to disable -closeComment: > - This issue has been automatically closed because it has been awaiting a response for too long. - When you have time to to work with the maintainers to resolve this issue, please post a new comment and it will be re-opened. - If the issue has been locked for editing by the time you return to it, please open a new issue and reference this one. Thank you for taking the time to improve LightGBM! diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml index 822ea2d44b82..b06e01d0d4ff 100644 --- a/.github/workflows/cuda.yml +++ b/.github/workflows/cuda.yml @@ -16,7 +16,7 @@ env: jobs: test: - name: cuda ${{ matrix.cuda_version }} ${{ matrix.method }} (linux, ${{ matrix.compiler }}, Python ${{ matrix.python_version }}) + name: ${{ matrix.tree_learner }} ${{ matrix.cuda_version }} ${{ matrix.method }} (linux, ${{ matrix.compiler }}, Python ${{ matrix.python_version }}) runs-on: [self-hosted, linux] timeout-minutes: 60 strategy: @@ -25,16 +25,29 @@ include: - method: source compiler: gcc - python_version: 3.7 - cuda_version: "11.4.2" + python_version: "3.8" + cuda_version: "11.6.2" + tree_learner: cuda - method: pip compiler: clang - python_version: 3.8 + python_version: "3.9" cuda_version: "10.0" + tree_learner: cuda - method: wheel compiler: gcc - python_version: 3.9 + python_version: "3.10" cuda_version: "9.0" + tree_learner: cuda + - method: source + compiler: gcc + python_version: "3.8" + cuda_version: "11.6.2" + tree_learner: cuda_exp + - method: pip + compiler: clang + python_version: "3.9" + cuda_version: "10.0" + tree_learner: cuda_exp steps: - name: Setup or update software on host machine run: | @@ -81,7 +94,7 @@ LGB_VER=$(head -n 1 VERSION.txt) EOF cat > docker-script.sh < + This issue has been automatically closed because it has been awaiting a response for too long. + When you have time to work with the maintainers to resolve this issue, please post a new comment and it will be re-opened. + If the issue has been locked for editing by the time you return to it, please open a new issue and reference this one. + Thank you for taking the time to improve LightGBM! 
+ daysUntilClose: 30 + responseRequiredLabel: awaiting response + token: ${{ github.token }} diff --git a/.github/workflows/optional_checks.yml b/.github/workflows/optional_checks.yml index 4cdb51327bba..81a927d2607a 100644 --- a/.github/workflows/optional_checks.yml +++ b/.github/workflows/optional_checks.yml @@ -19,7 +19,6 @@ jobs: run: | workflows=( "R valgrind tests;r-valgrind" - "Solaris CRAN check;r-solaris" ) for i in "${workflows[@]}"; do workflow_name=${i%;*} diff --git a/.github/workflows/python_package.yml b/.github/workflows/python_package.yml index 8c0e45b8fd3e..835578e87d1e 100644 --- a/.github/workflows/python_package.yml +++ b/.github/workflows/python_package.yml @@ -23,28 +23,28 @@ jobs: include: - os: macOS-latest task: regular - python_version: 3.8 + python_version: '3.8' - os: macOS-latest task: sdist - python_version: 3.9 + python_version: '3.9' - os: macOS-latest task: bdist - python_version: 3.8 + python_version: '3.7' - os: macOS-latest task: if-else - python_version: 3.8 + python_version: '3.8' - os: macOS-latest task: mpi method: source - python_version: 3.9 + python_version: '3.9' - os: macOS-latest task: mpi method: pip - python_version: 3.9 + python_version: '3.10' - os: macOS-latest task: mpi method: wheel - python_version: 3.7 + python_version: '3.7' steps: - name: Checkout repository uses: actions/checkout@v2.4.0 @@ -66,7 +66,7 @@ jobs: fi export BUILD_DIRECTORY="$GITHUB_WORKSPACE" export LGB_VER=$(head -n 1 VERSION.txt) - export CONDA=${HOME}/miniconda + export CONDA=${HOME}/miniforge export PATH=${CONDA}/bin:${PATH} $GITHUB_WORKSPACE/.ci/setup.sh || exit -1 $GITHUB_WORKSPACE/.ci/test.sh || exit -1 diff --git a/.github/workflows/r_configure.yml b/.github/workflows/r_configure.yml index d450abe866f4..f989c272d2c5 100644 --- a/.github/workflows/r_configure.yml +++ b/.github/workflows/r_configure.yml @@ -17,6 +17,9 @@ jobs: apt-get install --no-install-recommends -y \ ca-certificates \ git + - name: Trust git cloning LightGBM + run: | + git config --global --add safe.directory "${GITHUB_WORKSPACE}" - name: Checkout repository uses: actions/checkout@v2.4.0 with: diff --git a/.github/workflows/r_package.yml b/.github/workflows/r_package.yml index 4f2243735cda..84b82996a795 100644 --- a/.github/workflows/r_package.yml +++ b/.github/workflows/r_package.yml @@ -40,7 +40,7 @@ jobs: - os: ubuntu-latest task: r-package compiler: gcc - r_version: 4.0 + r_version: 4.1 build_type: cmake - os: ubuntu-latest task: r-package @@ -50,7 +50,7 @@ jobs: - os: ubuntu-latest task: r-package compiler: clang - r_version: 4.0 + r_version: 4.1 build_type: cmake - os: macOS-latest task: r-package @@ -60,7 +60,7 @@ jobs: - os: macOS-latest task: r-package compiler: gcc - r_version: 4.0 + r_version: 4.1 build_type: cmake - os: macOS-latest task: r-package @@ -70,7 +70,7 @@ jobs: - os: macOS-latest task: r-package compiler: clang - r_version: 4.0 + r_version: 4.1 build_type: cmake - os: windows-latest task: r-package @@ -82,53 +82,40 @@ jobs: task: r-package compiler: MINGW toolchain: MSYS - r_version: 4.0 - build_type: cmake - # Visual Studio 2017 - - os: windows-2016 - task: r-package - compiler: MSVC - toolchain: MSVC - r_version: 3.6 + r_version: 4.1 build_type: cmake # Visual Studio 2019 - os: windows-2019 task: r-package compiler: MSVC toolchain: MSVC - r_version: 4.0 + r_version: 3.6 build_type: cmake # Visual Studio 2022 - os: windows-2022 task: r-package compiler: MSVC toolchain: MSVC - r_version: 4.0 + r_version: 4.1 build_type: cmake ############### # CRAN builds # 
############### - - os: windows-latest - task: r-package - compiler: MINGW - toolchain: MINGW - r_version: 3.6 - build_type: cran - os: windows-latest task: r-package compiler: MINGW toolchain: MSYS - r_version: 4.0 + r_version: 4.1 build_type: cran - os: ubuntu-latest task: r-package compiler: gcc - r_version: 4.0 + r_version: 4.1 build_type: cran - os: macOS-latest task: r-package compiler: clang - r_version: 4.0 + r_version: 4.1 build_type: cran ################ # Other checks # @@ -136,7 +123,7 @@ jobs: - os: ubuntu-latest task: r-rchk compiler: gcc - r_version: 4.0 + r_version: 4.1 build_type: cran steps: - name: Prevent conversion of line endings on Windows @@ -193,6 +180,9 @@ jobs: - r_customization: csan compiler: clang steps: + - name: Trust git cloning LightGBM + run: | + git config --global --add safe.directory "${GITHUB_WORKSPACE}" - name: Checkout repository uses: actions/checkout@v2.4.0 with: @@ -201,7 +191,7 @@ jobs: - name: Install packages shell: bash run: | - RDscript${{ matrix.r_customization }} -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'rmarkdown', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" + RDscript${{ matrix.r_customization }} -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'RhpcBLASctl', 'rmarkdown', 'testthat'), repos = 'https://cran.rstudio.com', Ncpus = parallel::detectCores())" sh build-cran-package.sh --r-executable=RD${{ matrix.r_customization }} RD${{ matrix.r_customization }} CMD INSTALL lightgbm_*.tar.gz || exit -1 - name: Run tests with sanitizers @@ -209,7 +199,7 @@ jobs: run: | cd R-package/tests exit_code=0 - RDscript${{ matrix.r_customization }} testthat.R >> tests.log 2>&1 || exit_code=1 + RDscript${{ matrix.r_customization }} testthat.R >> tests.log 2>&1 || exit_code=-1 cat ./tests.log exit ${exit_code} test-r-debian-clang: @@ -223,6 +213,9 @@ jobs: run: | apt-get update --allow-releaseinfo-change apt-get install --no-install-recommends -y git + - name: Trust git cloning LightGBM + run: | + git config --global --add safe.directory "${GITHUB_WORKSPACE}" - name: Checkout repository uses: actions/checkout@v2.4.0 with: @@ -232,7 +225,7 @@ jobs: shell: bash run: | export PATH=/opt/R-devel/bin/:${PATH} - Rscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'rmarkdown', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" + Rscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'RhpcBLASctl', 'rmarkdown', 'testthat'), repos = 'https://cran.rstudio.com', Ncpus = parallel::detectCores())" sh build-cran-package.sh R CMD check --as-cran --run-donttest lightgbm_*.tar.gz || exit -1 if grep -q -E "NOTE|WARNING|ERROR" lightgbm.Rcheck/00check.log; then diff --git a/.github/workflows/r_solaris.yml b/.github/workflows/r_solaris.yml deleted file mode 100644 index c717d57e5b22..000000000000 --- a/.github/workflows/r_solaris.yml +++ /dev/null @@ -1,62 +0,0 @@ -name: Solaris CRAN check - -on: - repository_dispatch: - types: [gha_run_r_solaris] - -jobs: - test: - name: solaris-cran - timeout-minutes: 120 - runs-on: ubuntu-latest - container: wch1/r-debug - env: - SECRETS_WORKFLOW: ${{ secrets.WORKFLOW }} - steps: - - name: Install essential software before checkout - shell: bash - run: | - apt-get update - apt-get install --no-install-recommends -y \ - curl \ - jq - - name: Checkout repository - uses: actions/checkout@v2.4.0 - with: - fetch-depth: 5 - submodules: true - repository: 
microsoft/LightGBM - ref: "refs/pull/${{ github.event.client_payload.pr_number }}/merge" - - name: Send init status - if: ${{ always() }} - run: | - $GITHUB_WORKSPACE/.ci/set_commit_status.sh "${{ github.workflow }}" "pending" "${{ github.event.client_payload.pr_sha }}" - $GITHUB_WORKSPACE/.ci/append_comment.sh \ - "${{ github.event.client_payload.comment_number }}" \ - "Workflow **${{ github.workflow }}** has been triggered! 🚀\r\n${GITHUB_SERVER_URL}/microsoft/LightGBM/actions/runs/${GITHUB_RUN_ID}" - - name: Run tests on Solaris - shell: bash - run: ./.ci/test_r_package_solaris.sh - - name: Send final status - if: ${{ always() }} - run: | - $GITHUB_WORKSPACE/.ci/set_commit_status.sh "${{ github.workflow }}" "${{ job.status }}" "${{ github.event.client_payload.pr_sha }}" - body="" - while IFS= read -r line; do - platform=${line%@*} - url=${line#*@} - body="${body}**${platform}**: ${url}\r\n" - done < "$GITHUB_WORKSPACE/rhub_logs.txt" || true - body="${body}Reports also have been sent to LightGBM public e-mail: https://yopmail.com?lightgbm_rhub_checks\r\n" - body="${body}Status: ${{ job.status }}." - $GITHUB_WORKSPACE/.ci/append_comment.sh \ - "${{ github.event.client_payload.comment_number }}" \ - "$body" - - name: Rerun workflow-indicator - if: ${{ always() }} - run: | - bash $GITHUB_WORKSPACE/.ci/rerun_workflow.sh \ - "optional_checks.yml" \ - "${{ github.event.client_payload.pr_number }}" \ - "${{ github.event.client_payload.pr_branch }}" \ - || true diff --git a/.github/workflows/r_valgrind.yml b/.github/workflows/r_valgrind.yml index 56b9dac031fe..e333b06651f4 100644 --- a/.github/workflows/r_valgrind.yml +++ b/.github/workflows/r_valgrind.yml @@ -20,6 +20,9 @@ jobs: apt-get install --no-install-recommends -y \ curl \ jq + - name: Trust git cloning LightGBM + run: | + git config --global --add safe.directory "${GITHUB_WORKSPACE}" - name: Checkout repository uses: actions/checkout@v2.4.0 with: diff --git a/.github/workflows/static_analysis.yml b/.github/workflows/static_analysis.yml index 123a93e2462b..16a0784385dc 100644 --- a/.github/workflows/static_analysis.yml +++ b/.github/workflows/static_analysis.yml @@ -15,7 +15,7 @@ env: CONDA_ENV: test-env GITHUB_ACTIONS: 'true' OS_NAME: 'linux' - PYTHON_VERSION: 3.9 + PYTHON_VERSION: '3.10' jobs: test: @@ -39,7 +39,7 @@ jobs: run: | export TASK="${{ matrix.task }}" export BUILD_DIRECTORY="$GITHUB_WORKSPACE" - export CONDA=${HOME}/miniconda + export CONDA=${HOME}/miniforge export PATH=${CONDA}/bin:$HOME/.local/bin:${PATH} $GITHUB_WORKSPACE/.ci/setup.sh || exit -1 $GITHUB_WORKSPACE/.ci/test.sh || exit -1 @@ -49,6 +49,9 @@ jobs: runs-on: ubuntu-latest container: rocker/verse steps: + - name: Trust git cloning LightGBM + run: | + git config --global --add safe.directory "${GITHUB_WORKSPACE}" - name: Checkout repository uses: actions/checkout@v2.4.0 with: @@ -57,7 +60,7 @@ jobs: - name: Install packages shell: bash run: | - Rscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'rmarkdown', 'roxygen2', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" + Rscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'RhpcBLASctl', 'rmarkdown', 'roxygen2', 'testthat'), repos = 'https://cran.rstudio.com', Ncpus = parallel::detectCores())" sh build-cran-package.sh || exit -1 R CMD INSTALL --with-keep.source lightgbm_*.tar.gz || exit -1 - name: Test documentation diff --git a/.github/workflows/triggering_comments.yml b/.github/workflows/triggering_comments.yml index 
c361eabb9089..226db21f5f06 100644 --- a/.github/workflows/triggering_comments.yml +++ b/.github/workflows/triggering_comments.yml @@ -25,14 +25,6 @@ jobs: "${{ github.event.comment.id }}" \ "gha_run_r_valgrind" - - name: Trigger R Solaris CRAN checks - if: github.event.comment.body == '/gha run r-solaris' - run: | - $GITHUB_WORKSPACE/.ci/trigger_dispatch_run.sh \ - "${{ github.event.issue.pull_request.url }}" \ - "${{ github.event.comment.id }}" \ - "gha_run_r_solaris" - - name: Trigger update R configure if: github.event.comment.body == '/gha run r-configure' run: | diff --git a/.gitignore b/.gitignore index 96e0700f4f49..bb65ca426bba 100644 --- a/.gitignore +++ b/.gitignore @@ -346,6 +346,7 @@ instance/ # Sphinx documentation docs/_build/ docs/pythonapi/ +*.flag # Doxygen documentation docs/doxyoutput/ diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 4f85742c7f74..7d63476a0f79 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -1,10 +1,12 @@ version: 2 +build: + os: "ubuntu-20.04" + tools: + python: "miniconda3-4.7" +conda: + environment: docs/env.yml formats: - pdf -python: - version: 3 - install: - - requirements: docs/requirements.txt sphinx: builder: html configuration: docs/conf.py diff --git a/.vsts-ci.yml b/.vsts-ci.yml index 605e7f4512b8..139f72061236 100644 --- a/.vsts-ci.yml +++ b/.vsts-ci.yml @@ -9,14 +9,18 @@ pr: - master variables: AZURE: 'true' - PYTHON_VERSION: 3.9 + PYTHON_VERSION: '3.10' CONDA_ENV: test-env + runCodesignValidationInjection: false + skipComponentGovernanceDetection: true + DOTNET_CLI_TELEMETRY_OPTOUT: true + DOTNET_SKIP_FIRST_TIME_EXPERIENCE: true resources: containers: - container: ubuntu1404 image: lightgbm/vsts-agent:ubuntu-14.04 - container: ubuntu-latest - image: 'ubuntu:latest' + image: 'ubuntu:20.04' options: "--name ci-container -v /usr/bin/docker:/tmp/docker:ro" - container: rbase image: wch1/r-debug @@ -35,17 +39,19 @@ jobs: matrix: regular: TASK: regular + PYTHON_VERSION: '3.9' sdist: TASK: sdist - PYTHON_VERSION: 3.7 + PYTHON_VERSION: '3.7' bdist: TASK: bdist + PYTHON_VERSION: '3.8' inference: TASK: if-else mpi_source: TASK: mpi METHOD: source - PYTHON_VERSION: 3.8 + PYTHON_VERSION: '3.8' gpu_source: TASK: gpu METHOD: source @@ -56,12 +62,10 @@ jobs: echo "##vso[task.setvariable variable=BUILD_DIRECTORY]$BUILD_SOURCESDIRECTORY" echo "##vso[task.setvariable variable=LGB_VER]$(head -n 1 VERSION.txt)" echo "##vso[task.prependpath]$CONDA/bin" - AMDAPPSDK_PATH=$BUILD_SOURCESDIRECTORY/AMDAPPSDK - echo "##vso[task.setvariable variable=AMDAPPSDK_PATH]$AMDAPPSDK_PATH" - LD_LIBRARY_PATH=$AMDAPPSDK_PATH/lib/x86_64:$LD_LIBRARY_PATH - echo "##vso[task.setvariable variable=LD_LIBRARY_PATH]$LD_LIBRARY_PATH" - echo "##vso[task.setvariable variable=OPENCL_VENDOR_PATH]$AMDAPPSDK_PATH/etc/OpenCL/vendors" displayName: 'Set variables' + - script: | + echo '$(Build.SourceVersion)' > '$(Build.ArtifactStagingDirectory)/commit.txt' + displayName: 'Add commit hash to artifacts archive' - bash: $(Build.SourcesDirectory)/.ci/setup.sh displayName: Setup - bash: $(Build.SourcesDirectory)/.ci/test.sh @@ -91,7 +95,7 @@ jobs: TASK: sdist bdist: TASK: bdist - PYTHON_VERSION: 3.8 + PYTHON_VERSION: '3.8' inference: TASK: if-else mpi_source: @@ -100,21 +104,23 @@ jobs: mpi_pip: TASK: mpi METHOD: pip - PYTHON_VERSION: 3.8 + PYTHON_VERSION: '3.9' mpi_wheel: TASK: mpi METHOD: wheel - PYTHON_VERSION: 3.7 + PYTHON_VERSION: '3.7' gpu_source: TASK: gpu METHOD: source + PYTHON_VERSION: '3.9' gpu_pip: TASK: gpu METHOD: pip + PYTHON_VERSION: '3.8' gpu_wheel: TASK: gpu 
METHOD: wheel - PYTHON_VERSION: 3.7 + PYTHON_VERSION: '3.7' cpp_tests: TASK: cpp-tests METHOD: with-sanitizers @@ -122,14 +128,9 @@ jobs: - script: | echo "##vso[task.setvariable variable=BUILD_DIRECTORY]$BUILD_SOURCESDIRECTORY" echo "##vso[task.setvariable variable=LGB_VER]$(head -n 1 VERSION.txt)" - CONDA=$HOME/miniconda + CONDA=$HOME/miniforge echo "##vso[task.setvariable variable=CONDA]$CONDA" echo "##vso[task.prependpath]$CONDA/bin" - AMDAPPSDK_PATH=$BUILD_SOURCESDIRECTORY/AMDAPPSDK - echo "##vso[task.setvariable variable=AMDAPPSDK_PATH]$AMDAPPSDK_PATH" - LD_LIBRARY_PATH=$AMDAPPSDK_PATH/lib/x86_64:$LD_LIBRARY_PATH - echo "##vso[task.setvariable variable=LD_LIBRARY_PATH]$LD_LIBRARY_PATH" - echo "##vso[task.setvariable variable=OPENCL_VENDOR_PATH]$AMDAPPSDK_PATH/etc/OpenCL/vendors" displayName: 'Set variables' # https://github.com/microsoft/azure-pipelines-agent/issues/2043#issuecomment-687983301 - script: | @@ -183,7 +184,7 @@ jobs: BUILD_ARTIFACTSTAGINGDIRECTORY=$BUILD_ARTIFACTSTAGINGDIRECTORY EOF cat > docker-script.sh <= 1.0), Matrix (>= 1.1-0), methods, + parallel, utils SystemRequirements: C++11 -RoxygenNote: 7.1.2 +RoxygenNote: 7.2.0 diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index 02e886bbcbac..d8ed636c0208 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -31,13 +31,16 @@ export(lgb.plot.interpretation) export(lgb.restore_handle) export(lgb.save) export(lgb.train) -export(lgb.unloader) export(lightgbm) export(readRDS.lgb.Booster) export(saveRDS.lgb.Booster) export(set_field) export(slice) import(methods) +importClassesFrom(Matrix,dgCMatrix) +importClassesFrom(Matrix,dgRMatrix) +importClassesFrom(Matrix,dsparseMatrix) +importClassesFrom(Matrix,dsparseVector) importFrom(Matrix,Matrix) importFrom(R6,R6Class) importFrom(data.table,":=") @@ -52,6 +55,8 @@ importFrom(graphics,barplot) importFrom(graphics,par) importFrom(jsonlite,fromJSON) importFrom(methods,is) +importFrom(methods,new) +importFrom(parallel,detectCores) importFrom(stats,quantile) importFrom(utils,modifyList) importFrom(utils,read.delim) diff --git a/R-package/R/aliases.R b/R-package/R/aliases.R index 77fe74ef2af0..0aa886ab90c2 100644 --- a/R-package/R/aliases.R +++ b/R-package/R/aliases.R @@ -33,11 +33,18 @@ )]) } +# [description] Non-exported environment, used for caching details that only need to be +# computed once per R session. +.lgb_session_cache_env <- new.env() + # [description] List of respected parameter aliases. Wrapped in a function to take advantage of # lazy evaluation (so it doesn't matter what order R sources files during installation). # [return] A named list, where each key is a main LightGBM parameter and each value is a character # vector of corresponding aliases. 
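The caching used for this lookup is plain R memoization through a non-exported environment: the first call computes the value and assign()s it into the environment, and every later call short-circuits through exists()/get(). A minimal self-contained sketch of the same pattern, with all names illustrative rather than part of the package:

# memoize an expensive computation once per R session in a private environment;
# `.cache_env` and `cached_lookup()` are illustrative names, not LightGBM APIs
.cache_env <- new.env()

cached_lookup <- function() {
  if (exists("LOOKUP", where = .cache_env)) {
    return(get("LOOKUP", envir = .cache_env))
  }
  result <- Sys.time()  # stand-in for the expensive computation
  assign(x = "LOOKUP", value = result, envir = .cache_env)
  return(result)
}

stopifnot(identical(cached_lookup(), cached_lookup()))  # second call hits the cache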
.PARAMETER_ALIASES <- function() { + if (exists("PARAMETER_ALIASES", where = .lgb_session_cache_env)) { + return(get("PARAMETER_ALIASES", envir = .lgb_session_cache_env)) + } params_to_aliases <- jsonlite::fromJSON( .Call( LGBM_DumpParamAliases_R @@ -47,6 +54,12 @@ aliases_with_main_name <- c(main_name, unlist(params_to_aliases[[main_name]])) params_to_aliases[[main_name]] <- aliases_with_main_name } + # store in cache so the next call to `.PARAMETER_ALIASES()` doesn't need to recompute this + assign( + x = "PARAMETER_ALIASES" + , value = params_to_aliases + , envir = .lgb_session_cache_env + ) return(params_to_aliases) } diff --git a/R-package/R/callback.R b/R-package/R/callback.R index 3830ce620149..50f36be4a2be 100644 --- a/R-package/R/callback.R +++ b/R-package/R/callback.R @@ -71,7 +71,7 @@ merge.eval.string <- function(env) { } -cb.print.evaluation <- function(period) { +cb_print_evaluation <- function(period) { # Create callback callback <- function(env) { @@ -103,13 +103,13 @@ cb.print.evaluation <- function(period) { # Store attributes attr(callback, "call") <- match.call() - attr(callback, "name") <- "cb.print.evaluation" + attr(callback, "name") <- "cb_print_evaluation" return(callback) } -cb.record.evaluation <- function() { +cb_record_evaluation <- function() { # Create callback callback <- function(env) { @@ -178,13 +178,13 @@ cb.record.evaluation <- function() { # Store attributes attr(callback, "call") <- match.call() - attr(callback, "name") <- "cb.record.evaluation" + attr(callback, "name") <- "cb_record_evaluation" return(callback) } -cb.early.stop <- function(stopping_rounds, first_metric_only, verbose) { +cb_early_stop <- function(stopping_rounds, first_metric_only, verbose) { factor_to_bigger_better <- NULL best_iter <- NULL @@ -316,7 +316,7 @@ cb.early.stop <- function(stopping_rounds, first_metric_only, verbose) { } attr(callback, "call") <- match.call() - attr(callback, "name") <- "cb.early.stop" + attr(callback, "name") <- "cb_early_stop" return(callback) @@ -335,13 +335,13 @@ add.cb <- function(cb_list, cb) { # Set names of elements names(cb_list) <- callback.names(cb_list = cb_list) - if ("cb.early.stop" %in% names(cb_list)) { + if ("cb_early_stop" %in% names(cb_list)) { # Concatenate existing elements - cb_list <- c(cb_list, cb_list["cb.early.stop"]) + cb_list <- c(cb_list, cb_list["cb_early_stop"]) # Remove only the first one - cb_list["cb.early.stop"] <- NULL + cb_list["cb_early_stop"] <- NULL } diff --git a/R-package/R/lgb.Booster.R b/R-package/R/lgb.Booster.R index dec33e969f0d..0f6562f592d3 100644 --- a/R-package/R/lgb.Booster.R +++ b/R-package/R/lgb.Booster.R @@ -26,108 +26,90 @@ Booster <- R6::R6Class( modelfile = NULL, model_str = NULL) { - # Create parameters and handle handle <- NULL - # Attempts to create a handle for the dataset - try({ - - # Check if training dataset is not null - if (!is.null(train_set)) { - # Check if training dataset is lgb.Dataset or not - if (!lgb.is.Dataset(train_set)) { - stop("lgb.Booster: Can only use lgb.Dataset as training data") - } - train_set_handle <- train_set$.__enclos_env__$private$get_handle() - params <- utils::modifyList(params, train_set$get_params()) - params_str <- lgb.params2str(params = params) - # Store booster handle - handle <- .Call( - LGBM_BoosterCreate_R - , train_set_handle - , params_str - ) - - # Create private booster information - private$train_set <- train_set - private$train_set_version <- train_set$.__enclos_env__$private$version - private$num_dataset <- 1L - private$init_predictor <- 
train_set$.__enclos_env__$private$predictor - - # Check if predictor is existing - if (!is.null(private$init_predictor)) { - - # Merge booster - .Call( - LGBM_BoosterMerge_R - , handle - , private$init_predictor$.__enclos_env__$private$handle - ) - - } - - # Check current iteration - private$is_predicted_cur_iter <- c(private$is_predicted_cur_iter, FALSE) + if (!is.null(train_set)) { - } else if (!is.null(modelfile)) { + if (!lgb.is.Dataset(train_set)) { + stop("lgb.Booster: Can only use lgb.Dataset as training data") + } + train_set_handle <- train_set$.__enclos_env__$private$get_handle() + params <- utils::modifyList(params, train_set$get_params()) + params_str <- lgb.params2str(params = params) + # Store booster handle + handle <- .Call( + LGBM_BoosterCreate_R + , train_set_handle + , params_str + ) - # Do we have a model file as character? - if (!is.character(modelfile)) { - stop("lgb.Booster: Can only use a string as model file path") - } + # Create private booster information + private$train_set <- train_set + private$train_set_version <- train_set$.__enclos_env__$private$version + private$num_dataset <- 1L + private$init_predictor <- train_set$.__enclos_env__$private$predictor - modelfile <- path.expand(modelfile) + if (!is.null(private$init_predictor)) { - # Create booster from model - handle <- .Call( - LGBM_BoosterCreateFromModelfile_R - , modelfile + # Merge booster + .Call( + LGBM_BoosterMerge_R + , handle + , private$init_predictor$.__enclos_env__$private$handle ) - } else if (!is.null(model_str)) { + } - # Do we have a model_str as character/raw? - if (!is.raw(model_str) && !is.character(model_str)) { - stop("lgb.Booster: Can only use a character/raw vector as model_str") - } + # Check current iteration + private$is_predicted_cur_iter <- c(private$is_predicted_cur_iter, FALSE) - # Create booster from model - handle <- .Call( - LGBM_BoosterLoadModelFromString_R - , model_str - ) + } else if (!is.null(modelfile)) { - } else { + # Do we have a model file as character? + if (!is.character(modelfile)) { + stop("lgb.Booster: Can only use a string as model file path") + } - # Booster non existent - stop( - "lgb.Booster: Need at least either training dataset, " - , "model file, or model_str to create booster instance" - ) + modelfile <- path.expand(modelfile) - } + # Create booster from model + handle <- .Call( + LGBM_BoosterCreateFromModelfile_R + , modelfile + ) - }) + } else if (!is.null(model_str)) { - # Check whether the handle was created properly if it was not stopped earlier by a stop call - if (isTRUE(lgb.is.null.handle(x = handle))) { + # Do we have a model_str as character/raw? 
+ if (!is.raw(model_str) && !is.character(model_str)) { + stop("lgb.Booster: Can only use a character/raw vector as model_str") + } - stop("lgb.Booster: cannot create Booster handle") + # Create booster from model + handle <- .Call( + LGBM_BoosterLoadModelFromString_R + , model_str + ) } else { - # Create class - class(handle) <- "lgb.Booster.handle" - private$handle <- handle - private$num_class <- 1L - .Call( - LGBM_BoosterGetNumClasses_R - , private$handle - , private$num_class + # Booster does not exist + stop( + "lgb.Booster: Need at least either training dataset, " + , "model file, or model_str to create booster instance" ) } + class(handle) <- "lgb.Booster.handle" + private$handle <- handle + private$num_class <- 1L + .Call( + LGBM_BoosterGetNumClasses_R + , private$handle + , private$num_class + ) + self$params <- params return(invisible(NULL)) @@ -249,7 +231,8 @@ Booster <- R6::R6Class( private$set_objective_to_none <- TRUE } # Perform objective calculation - gpair <- fobj(private$inner_predict(1L), private$train_set) + preds <- private$inner_predict(1L) + gpair <- fobj(preds, private$train_set) # Check for gradient and hessian as list if (is.null(gpair$grad) || is.null(gpair$hess)) { @@ -257,13 +240,24 @@ return a list with attributes (hess, grad)") } + # Check grad and hess have the right shape + n_grad <- length(gpair$grad) + n_hess <- length(gpair$hess) + n_preds <- length(preds) + if (n_grad != n_preds) { + stop(sprintf("Expected custom objective function to return grad with length %d, got %d.", n_preds, n_grad)) + } + if (n_hess != n_preds) { + stop(sprintf("Expected custom objective function to return hess with length %d, got %d.", n_preds, n_hess)) + } + # Return custom boosting gradient/hessian .Call( LGBM_BoosterUpdateOneIterCustom_R , private$handle , gpair$grad , gpair$hess - , length(gpair$grad) + , n_preds ) } @@ -492,7 +486,6 @@ Booster <- R6::R6Class( predleaf = FALSE, predcontrib = FALSE, header = FALSE, - reshape = FALSE, params = list()) { self$restore_handle() @@ -505,6 +498,34 @@ Booster <- R6::R6Class( start_iteration <- 0L } + # possibly override keyword arguments with parameters + # + # NOTE: this length() check minimizes the latency introduced by these checks, + # for the common case where params is empty + # + # NOTE: doing this here instead of in Predictor$predict() to keep + # Predictor$predict() as fast as possible + if (length(params) > 0L) { + params <- lgb.check.wrapper_param( + main_param_name = "predict_raw_score" + , params = params + , alternative_kwarg_value = rawscore + ) + params <- lgb.check.wrapper_param( + main_param_name = "predict_leaf_index" + , params = params + , alternative_kwarg_value = predleaf + ) + params <- lgb.check.wrapper_param( + main_param_name = "predict_contrib" + , params = params + , alternative_kwarg_value = predcontrib + ) + rawscore <- params[["predict_raw_score"]] + predleaf <- params[["predict_leaf_index"]] + predcontrib <- params[["predict_contrib"]] + } + # Predict on new data predictor <- Predictor$new( modelfile = private$handle @@ -519,7 +540,6 @@ , predleaf = predleaf , predcontrib = predcontrib , header = header - , reshape = reshape ) ) @@ -711,8 +731,9 @@ Booster <- R6::R6Class( res <- feval(private$inner_predict(data_idx), data) if (is.null(res$name) || is.null(res$value) || is.null(res$higher_better)) { - stop("lgb.Booster.eval: custom eval function should return a - list with attribute (name, value, higher_better)"); + stop( + "lgb.Booster.eval: custom eval function should return a list with attribute (name, value, higher_better)" + ) } # Append names and evaluation @@ -731,8 +752,28 @@ Booster <- R6::R6Class( #' @title Predict method for LightGBM model #' @description Predicted values based on class \code{lgb.Booster} #' @param object Object of class \code{lgb.Booster} -#' @param data a \code{matrix} object, a \code{dgCMatrix} object or -#' a character representing a path to a text file (CSV, TSV, or LibSVM) +#' @param newdata a \code{matrix} object, a \code{dgCMatrix} object or +#' a character representing a path to a text file (CSV, TSV, or LibSVM) +#' @param type Type of prediction to output. Allowed types are:\itemize{ +#' \item \code{"response"}: will output the predicted score according to the objective function being +#' optimized (depending on the link function that the objective uses), after applying any necessary +#' transformations - for example, for \code{objective="binary"}, it will output class probabilities. +#' \item \code{"class"}: for classification objectives, will output the class with the highest predicted +#' probability. For other objectives, will output the same as "response". +#' \item \code{"raw"}: will output the non-transformed numbers (sum of predictions from boosting iterations' +#' results) from which the "response" number is produced for a given objective function - for example, +#' for \code{objective="binary"}, this corresponds to log-odds. For many objectives such as +#' "regression", since no transformation is applied, the output will be the same as for "response". +#' \item \code{"leaf"}: will output the index of the terminal node / leaf at which each observation falls +#' in each tree in the model, returned as integers, with one column per tree. +#' \item \code{"contrib"}: will return the per-feature contributions for each prediction, including an +#' intercept (each feature will produce one column). If there are multiple classes, each class will +#' have separate feature contributions (thus the number of columns is features+1 multiplied by the +#' number of classes). +#' } +#' +#' Note that, if using custom objectives, types "class" and "response" will not be available and will +#' default to "raw" instead. #' @param start_iteration int or None, optional (default=None) #' Start index of the iteration to predict. #' If None or <= 0, starts from the first iteration. @@ -741,26 +782,20 @@ #' If None, if the best iteration exists and start_iteration is None or <= 0, the #' best iteration is used; otherwise, all iterations from start_iteration are used. #' If <= 0, all iterations from start_iteration are used (no limits). -#' @param rawscore whether the prediction should be returned in the for of original untransformed -#' sum of predictions from boosting iterations' results. E.g., setting \code{rawscore=TRUE} -#' for logistic regression would result in predictions for log-odds instead of probabilities. -#' @param predleaf whether predict leaf index instead. -#' @param predcontrib return per-feature contributions for each record. #' @param header only used for prediction for text file. True if text file has header -#' @param reshape whether to reshape the vector of predictions to a matrix form when there are several -#' prediction outputs per case. #' @param params a list of additional named parameters. 
See #' \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#predict-parameters}{ #' the "Predict Parameters" section of the documentation} for a list of parameters and -#' valid values. +#' valid values. Where these conflict with the values of keyword arguments to this function, +#' the values in \code{params} take precedence. #' @param ... ignored -#' @return For regression or binary classification, it returns a vector of length \code{nrows(data)}. -#' For multiclass classification, either a \code{num_class * nrows(data)} vector or -#' a \code{(nrows(data), num_class)} dimension matrix is returned, depending on -#' the \code{reshape} value. +#' @return For prediction types that are meant to always return one output per observation (e.g. when predicting +#' \code{type="response"} on a binary classification or regression objective), will return a vector with one +#' element per row in \code{newdata}. #' -#' When \code{predleaf = TRUE}, the output is a matrix object with the -#' number of columns corresponding to the number of trees. +#' For prediction types that are meant to return more than one output per observation (e.g. when predicting +#' \code{type="response"} on a multi-class objective, or when predicting \code{type="leaf"}, regardless of +#' objective), will return a matrix with one row per observation in \code{newdata} and one column per output. #' #' @examples #' \donttest{ @@ -797,14 +832,11 @@ Booster <- R6::R6Class( #' @importFrom utils modifyList #' @export predict.lgb.Booster <- function(object, - data, + newdata, + type = "response", start_iteration = NULL, num_iteration = NULL, - rawscore = FALSE, - predleaf = FALSE, - predcontrib = FALSE, header = FALSE, - reshape = FALSE, params = list(), ...) { @@ -814,26 +846,65 @@ predict.lgb.Booster <- function(object, additional_params <- list(...) if (length(additional_params) > 0L) { + additional_params_names <- names(additional_params) + if ("reshape" %in% additional_params_names) { + stop("'reshape' argument is no longer supported.") + } + + old_args_for_type <- list( + "rawscore" = "raw" + , "predleaf" = "leaf" + , "predcontrib" = "contrib" + ) + for (arg in names(old_args_for_type)) { + if (arg %in% additional_params_names) { + stop(sprintf("Argument '%s' is no longer supported. Use type='%s' instead." + , arg + , old_args_for_type[[arg]])) + } + } + warning(paste0( "predict.lgb.Booster: Found the following passed through '...': " - , paste(names(additional_params), collapse = ", ") + , toString(names(additional_params)) , ". These are ignored. Use argument 'params' instead." 
)) } - return( - object$predict( - data = data - , start_iteration = start_iteration - , num_iteration = num_iteration - , rawscore = rawscore - , predleaf = predleaf - , predcontrib = predcontrib - , header = header - , reshape = reshape - , params = params - ) + if (!is.null(object$params$objective) && object$params$objective == "none" && type %in% c("class", "response")) { + warning("Prediction types 'class' and 'response' are not supported for custom objectives.") + type <- "raw" + } + + rawscore <- FALSE + predleaf <- FALSE + predcontrib <- FALSE + if (type == "raw") { + rawscore <- TRUE + } else if (type == "leaf") { + predleaf <- TRUE + } else if (type == "contrib") { + predcontrib <- TRUE + } + + pred <- object$predict( + data = newdata + , start_iteration = start_iteration + , num_iteration = num_iteration + , rawscore = rawscore + , predleaf = predleaf + , predcontrib = predcontrib + , header = header + , params = params ) + if (type == "class") { + if (object$params$objective == "binary") { + pred <- as.integer(pred >= 0.5) + } else if (object$params$objective %in% c("multiclass", "multiclassova")) { + pred <- max.col(pred) - 1L + } + } + return(pred) } #' @name print.lgb.Booster @@ -864,12 +935,13 @@ print.lgb.Booster <- function(x, ...) { if (obj == "none") { obj <- "custom" } - if (x$.__enclos_env__$private$num_class == 1L) { + num_class <- x$.__enclos_env__$private$num_class + if (num_class == 1L) { cat(sprintf("Objective: %s\n", obj)) } else { cat(sprintf("Objective: %s (%d classes)\n" , obj - , x$.__enclos_env__$private$num_class)) + , num_class)) } } else { cat("(Booster handle is invalid)\n") @@ -1123,7 +1195,7 @@ lgb.get.eval.result <- function(booster, data_name, eval_name, iters = NULL, is_ "lgb.get.eval.result: data_name " , shQuote(data_name) , " not found. Only the following datasets exist in record evals: [" - , paste(data_names, collapse = ", ") + , toString(data_names) , "]" )) } @@ -1137,7 +1209,7 @@ lgb.get.eval.result <- function(booster, data_name, eval_name, iters = NULL, is_ , " not found. 
Only the following eval_names exist for dataset " , shQuote(data_name) , ": [" - , paste(eval_names, collapse = ", ") + , toString(eval_names) , "]" )) stop("lgb.get.eval.result: wrong eval name") diff --git a/R-package/R/lgb.Dataset.R b/R-package/R/lgb.Dataset.R index 8b6a670f35f5..8520d88bfdd1 100644 --- a/R-package/R/lgb.Dataset.R +++ b/R-package/R/lgb.Dataset.R @@ -109,7 +109,7 @@ Dataset <- R6::R6Class( params = list()) { # the Dataset's existing parameters should be overwritten by any passed in to this call - params <- modifyList(self$get_params(), params) + params <- modifyList(private$params, params) # Create new dataset ret <- Dataset$new( @@ -169,12 +169,13 @@ Dataset <- R6::R6Class( } else { # Check if more categorical features were output over the feature space - if (max(private$categorical_feature) > length(private$colnames)) { + data_is_not_filename <- !is.character(private$raw_data) + if (data_is_not_filename && max(private$categorical_feature) > ncol(private$raw_data)) { stop( "lgb.self.get.handle: supplied a too large value in categorical_feature: " , max(private$categorical_feature) , " but only " - , length(private$colnames) + , ncol(private$raw_data) , " features" ) } @@ -236,7 +237,7 @@ Dataset <- R6::R6Class( if (length(private$raw_data@p) > 2147483647L) { stop("Cannot support large CSC matrix") } - # Are we using a dgCMatrix (sparsed matrix column compressed) + # Are we using a dgCMatrix (sparse matrix column compressed) handle <- .Call( LGBM_DatasetCreateFromCSC_R , private$raw_data@p @@ -288,6 +289,13 @@ Dataset <- R6::R6Class( self$set_colnames(colnames = private$colnames) } + # Ensure that private$colnames matches the feature names on the C++ side. This line is necessary + # in cases like constructing from a file or from a matrix with no column names. 
+ private$colnames <- .Call( + LGBM_DatasetGetFeatureNames_R + , private$handle + ) + # Load init score if requested if (!is.null(private$predictor) && is.null(private$used_indices)) { @@ -295,7 +303,6 @@ Dataset <- R6::R6Class( init_score <- private$predictor$predict( data = private$raw_data , rawscore = TRUE - , reshape = TRUE ) # Not needed to transpose, for is col_marjor @@ -376,6 +383,28 @@ Dataset <- R6::R6Class( }, + # Get number of bins for feature + get_feature_num_bin = function(feature) { + if (lgb.is.null.handle(x = private$handle)) { + stop("Cannot get number of bins in feature before constructing Dataset.") + } + if (is.character(feature)) { + feature_name <- feature + feature <- which(private$colnames == feature_name) + if (length(feature) == 0L) { + stop(sprintf("feature '%s' not found", feature_name)) + } + } + num_bin <- integer(1L) + .Call( + LGBM_DatasetGetFeatureNumBin_R + , private$handle + , feature - 1L + , num_bin + ) + return(num_bin) + }, + # Get column names get_colnames = function() { @@ -442,7 +471,7 @@ Dataset <- R6::R6Class( if (!is.character(field_name) || length(field_name) != 1L || !field_name %in% .INFO_KEYS()) { stop( "Dataset$get_field(): field_name must one of the following: " - , paste0(sQuote(.INFO_KEYS()), collapse = ", ") + , toString(sQuote(.INFO_KEYS())) ) } @@ -462,15 +491,14 @@ Dataset <- R6::R6Class( , info_len ) - # Check if info is not empty if (info_len > 0L) { # Get back fields ret <- NULL ret <- if (field_name == "group") { - integer(info_len) # Integer + integer(info_len) } else { - numeric(info_len) # Numeric + numeric(info_len) } .Call( @@ -495,15 +523,15 @@ Dataset <- R6::R6Class( if (!is.character(field_name) || length(field_name) != 1L || !field_name %in% .INFO_KEYS()) { stop( "Dataset$set_field(): field_name must one of the following: " - , paste0(sQuote(.INFO_KEYS()), collapse = ", ") + , toString(sQuote(.INFO_KEYS())) ) } # Check for type of information data <- if (field_name == "group") { - as.integer(data) # Integer + as.integer(data) } else { - as.numeric(data) # Numeric + as.numeric(data) } # Store information privately @@ -531,14 +559,12 @@ Dataset <- R6::R6Class( }, - # Slice dataset slice = function(idxset) { - # Perform slicing return( Dataset$new( data = NULL - , params = self$get_params() + , params = private$params , reference = self , colnames = private$colnames , categorical_feature = private$categorical_feature @@ -557,15 +583,17 @@ Dataset <- R6::R6Class( if (length(params) == 0L) { return(invisible(self)) } + new_params <- utils::modifyList(private$params, params) if (lgb.is.null.handle(x = private$handle)) { - private$params <- utils::modifyList(private$params, params) + private$params <- new_params } else { tryCatch({ .Call( LGBM_DatasetUpdateParamChecking_R , lgb.params2str(params = private$params) - , lgb.params2str(params = params) + , lgb.params2str(params = new_params) ) + private$params <- new_params }, error = function(e) { # If updating failed but raw data is not available, raise an error because # achieving what the user asked for is not possible @@ -575,7 +603,7 @@ Dataset <- R6::R6Class( # If updating failed but raw data is available, modify the params # on the R side and re-set ("deconstruct") the Dataset - private$params <- utils::modifyList(private$params, params) + private$params <- new_params self$finalize() }) } @@ -583,6 +611,11 @@ Dataset <- R6::R6Class( }, + # [description] Get only Dataset-specific parameters. 
This is primarily used by + # Booster to update its parameters based on the characteristics of + # a Dataset. It should not be used by other methods in this class, + # since "verbose" is not a Dataset parameter and needs to be passed + # through to avoid globally re-setting verbosity. get_params = function() { dataset_params <- unname(unlist(.DATASET_PARAMETERS())) ret <- list() @@ -617,7 +650,6 @@ Dataset <- R6::R6Class( }, - # Set reference set_reference = function(reference) { # setting reference to this same Dataset object doesn't require any changes @@ -677,7 +709,6 @@ Dataset <- R6::R6Class( info = NULL, version = 0L, - # Get handle get_handle = function() { # Get handle and construct if needed @@ -688,7 +719,6 @@ Dataset <- R6::R6Class( }, - # Set predictor set_predictor = function(predictor) { if (identical(private$predictor, predictor)) { diff --git a/R-package/R/lgb.Predictor.R b/R-package/R/lgb.Predictor.R index 3ca8ea98348e..0e1e80276e19 100644 --- a/R-package/R/lgb.Predictor.R +++ b/R-package/R/lgb.Predictor.R @@ -1,4 +1,5 @@ -#' @importFrom methods is +#' @importFrom methods is new +#' @importClassesFrom Matrix dsparseMatrix dsparseVector dgCMatrix dgRMatrix #' @importFrom R6 R6Class #' @importFrom utils read.delim Predictor <- R6::R6Class( @@ -84,8 +85,7 @@ Predictor <- R6::R6Class( rawscore = FALSE, predleaf = FALSE, predcontrib = FALSE, - header = FALSE, - reshape = FALSE) { + header = FALSE) { # Check if number of iterations is existing - if not, then set it to -1 (use all) if (is.null(num_iteration)) { @@ -127,6 +127,111 @@ Predictor <- R6::R6Class( num_row <- nrow(preds) preds <- as.vector(t(preds)) + } else if (predcontrib && inherits(data, c("dsparseMatrix", "dsparseVector"))) { + + ncols <- .Call(LGBM_BoosterGetNumFeature_R, private$handle) + ncols_out <- integer(1L) + .Call(LGBM_BoosterGetNumClasses_R, private$handle, ncols_out) + ncols_out <- (ncols + 1L) * max(ncols_out, 1L) + if (is.na(ncols_out)) { + ncols_out <- as.numeric(ncols + 1L) * as.numeric(max(ncols_out, 1L)) + } + if (!inherits(data, "dsparseVector") && ncols_out > .Machine$integer.max) { + stop("Resulting matrix of feature contributions is too large for R to handle.") + } + + if (inherits(data, "dsparseVector")) { + + if (length(data) > ncols) { + stop(sprintf("Model was fitted to data with %d columns, input data has %.0f columns." + , ncols + , length(data))) + } + res <- .Call( + LGBM_BoosterPredictSparseOutput_R + , private$handle + , c(0L, as.integer(length(data@x))) + , data@i - 1L + , data@x + , TRUE + , 1L + , ncols + , start_iteration + , num_iteration + , private$params + ) + out <- methods::new("dsparseVector") + out@i <- res$indices + 1L + out@x <- res$data + out@length <- ncols_out + return(out) + + } else if (inherits(data, "dgRMatrix")) { + + if (ncol(data) > ncols) { + stop(sprintf("Model was fitted to data with %d columns, input data has %.0f columns." + , ncols + , ncol(data))) + } + res <- .Call( + LGBM_BoosterPredictSparseOutput_R + , private$handle + , data@p + , data@j + , data@x + , TRUE + , nrow(data) + , ncols + , start_iteration + , num_iteration + , private$params + ) + out <- methods::new("dgRMatrix") + out@p <- res$indptr + out@j <- res$indices + out@x <- res$data + out@Dim <- as.integer(c(nrow(data), ncols_out)) + + } else if (inherits(data, "dgCMatrix")) { + + if (ncol(data) != ncols) { + stop(sprintf("Model was fitted to data with %d columns, input data has %.0f columns." 
+ , ncols + , ncol(data))) + } + res <- .Call( + LGBM_BoosterPredictSparseOutput_R + , private$handle + , data@p + , data@i + , data@x + , FALSE + , nrow(data) + , ncols + , start_iteration + , num_iteration + , private$params + ) + out <- methods::new("dgCMatrix") + out@p <- res$indptr + out@i <- res$indices + out@x <- res$data + out@Dim <- as.integer(c(nrow(data), length(res$indptr) - 1L)) + + } else { + + stop(sprintf("Predictions on sparse inputs are only allowed for '%s', '%s', '%s' - got: %s" + , "dsparseVector" + , "dgRMatrix" + , "dgCMatrix" + , toString(class(data)))) + } + + if (NROW(row.names(data))) { + out@Dimnames[[1L]] <- row.names(data) + } + return(out) + } else { # Not a file, we need to predict from R object @@ -215,23 +320,21 @@ Predictor <- R6::R6Class( # Get number of cases per row npred_per_case <- length(preds) / num_row - # Data reshaping - - if (predleaf | predcontrib) { - - # Predict leaves only, reshaping is mandatory - preds <- matrix(preds, ncol = npred_per_case, byrow = TRUE) - - } else if (reshape && npred_per_case > 1L) { - - # Predict with data reshaping + if (npred_per_case > 1L || predleaf || predcontrib) { preds <- matrix(preds, ncol = npred_per_case, byrow = TRUE) + } + # Keep row names if possible + if (NROW(row.names(data)) && NROW(data) == NROW(preds)) { + if (is.null(dim(preds))) { + names(preds) <- row.names(data) + } else { + row.names(preds) <- row.names(data) + } } return(preds) - } ), diff --git a/R-package/R/lgb.convert_with_rules.R b/R-package/R/lgb.convert_with_rules.R index 720be3651e27..f282fa3152fc 100644 --- a/R-package/R/lgb.convert_with_rules.R +++ b/R-package/R/lgb.convert_with_rules.R @@ -4,7 +4,9 @@ return( vapply( X = df - , FUN = function(x) {paste0(class(x), collapse = ",")} + , FUN = function(x) { + paste0(class(x), collapse = ",") + } , FUN.VALUE = character(1L) ) ) @@ -18,14 +20,13 @@ column_classes <- .get_column_classes(df = df) unconverted_columns <- column_classes[!(column_classes %in% c("numeric", "integer"))] if (length(unconverted_columns) > 0L) { - col_detail_string <- paste0( + col_detail_string <- toString( paste0( names(unconverted_columns) , " (" , unconverted_columns , ")" ) - , collapse = ", " ) msg <- paste0( function_name @@ -40,8 +41,12 @@ return(invisible(NULL)) } -.LGB_CONVERT_DEFAULT_FOR_LOGICAL_NA <- function() {return(-1L)} -.LGB_CONVERT_DEFAULT_FOR_NON_LOGICAL_NA <- function() {return(0L)} +.LGB_CONVERT_DEFAULT_FOR_LOGICAL_NA <- function() { + return(-1L) +} +.LGB_CONVERT_DEFAULT_FOR_NON_LOGICAL_NA <- function() { + return(0L) +} #' @name lgb.convert_with_rules diff --git a/R-package/R/lgb.cv.R b/R-package/R/lgb.cv.R index 0690936f5624..cf88100db399 100644 --- a/R-package/R/lgb.cv.R +++ b/R-package/R/lgb.cv.R @@ -12,8 +12,8 @@ CVBooster <- R6::R6Class( return(invisible(NULL)) }, reset_parameter = function(new_params) { - for (x in boosters) { - x$reset_parameter(params = new_params) + for (x in self$boosters) { + x[["booster"]]$reset_parameter(params = new_params) } return(invisible(self)) } @@ -43,6 +43,9 @@ CVBooster <- R6::R6Class( #' @param callbacks List of callback functions that are applied at each iteration. #' @param reset_data Boolean, setting it to TRUE (not the default value) will transform the booster model #' into a predictor model which frees up memory and the original datasets +#' @param eval_train_metric \code{boolean}, whether to add the cross validation results on the +#' training data. This parameter defaults to \code{FALSE}. Setting it to \code{TRUE} +#' will increase run time. 
#' @inheritSection lgb_shared_params Early Stopping #' @return a trained model \code{lgb.CVBooster}. #' @@ -87,6 +90,7 @@ lgb.cv <- function(params = list() , callbacks = list() , reset_data = FALSE , serializable = TRUE + , eval_train_metric = FALSE ) { if (nrounds <= 0L) { @@ -101,22 +105,30 @@ lgb.cv <- function(params = list() data <- lgb.Dataset(data = data, label = label) } - # Setup temporary variables - params$verbose <- verbose - params <- lgb.check.obj(params = params, obj = obj) - params <- lgb.check.eval(params = params, eval = eval) - fobj <- NULL - eval_functions <- list(NULL) - # set some parameters, resolving the way they were passed in with other parameters # in `params`. # this ensures that the model stored with Booster$save() correctly represents # what was passed in + params <- lgb.check.wrapper_param( + main_param_name = "verbosity" + , params = params + , alternative_kwarg_value = verbose + ) params <- lgb.check.wrapper_param( main_param_name = "num_iterations" , params = params , alternative_kwarg_value = nrounds ) + params <- lgb.check.wrapper_param( + main_param_name = "metric" + , params = params + , alternative_kwarg_value = NULL + ) + params <- lgb.check.wrapper_param( + main_param_name = "objective" + , params = params + , alternative_kwarg_value = obj + ) params <- lgb.check.wrapper_param( main_param_name = "early_stopping_round" , params = params @@ -124,16 +136,19 @@ lgb.cv <- function(params = list() ) early_stopping_rounds <- params[["early_stopping_round"]] - # Check for objective (function or not) + # extract any function objects passed for objective or metric + fobj <- NULL if (is.function(params$objective)) { fobj <- params$objective - params$objective <- "NONE" + params$objective <- "none" } # If eval is a single function, store it as a 1-element list # (for backwards compatibility). If it is a list of functions, store # all of them. This makes it possible to pass any mix of strings like "auc" # and custom functions to eval + params <- lgb.check.eval(params = params, eval = eval) + eval_functions <- list(NULL) if (is.function(eval)) { eval_functions <- list(eval) } @@ -230,13 +245,13 @@ lgb.cv <- function(params = list() } # Add printing log callback - if (verbose > 0L && eval_freq > 0L) { - callbacks <- add.cb(cb_list = callbacks, cb = cb.print.evaluation(period = eval_freq)) + if (params[["verbosity"]] > 0L && eval_freq > 0L) { + callbacks <- add.cb(cb_list = callbacks, cb = cb_print_evaluation(period = eval_freq)) } # Add evaluation log callback if (record) { - callbacks <- add.cb(cb_list = callbacks, cb = cb.record.evaluation()) + callbacks <- add.cb(cb_list = callbacks, cb = cb_record_evaluation()) } # Did user pass parameters that indicate they want to use early stopping? 
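The `lgb.cv()` hunks above route `verbose`, `nrounds`, `metric`, and `obj` through `lgb.check.wrapper_param()` (so spellings under `params` take precedence over the keyword arguments) and add the new `eval_train_metric` flag. A minimal usage sketch of these pieces, assuming the bundled `agaricus.train` data used elsewhere in this package's examples:

```r
library(lightgbm)
data(agaricus.train, package = "lightgbm")
dtrain <- lgb.Dataset(agaricus.train$data, label = agaricus.train$label)

cv_results <- lgb.cv(
    # "objective" given here under params takes precedence over the obj keyword argument
    params = list(objective = "binary", metric = "auc")
    , data = dtrain
    , nrounds = 5L
    , nfold = 3L
    # new in this changeset: also evaluate the metric on each fold's training data
    , eval_train_metric = TRUE
)

# record_evals now carries a "train" entry alongside the usual "valid" entry
names(cv_results$record_evals)
```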
@@ -257,10 +272,10 @@ lgb.cv <- function(params = list() warning("Early stopping is not available in 'dart' mode.") using_early_stopping <- FALSE - # Remove the cb.early.stop() function if it was passed in to callbacks + # Remove the cb_early_stop() function if it was passed in to callbacks callbacks <- Filter( f = function(cb_func) { - !identical(attr(cb_func, "name"), "cb.early.stop") + !identical(attr(cb_func, "name"), "cb_early_stop") } , x = callbacks ) @@ -270,10 +285,10 @@ lgb.cv <- function(params = list() if (using_early_stopping) { callbacks <- add.cb( cb_list = callbacks - , cb = cb.early.stop( + , cb = cb_early_stop( stopping_rounds = early_stopping_rounds , first_metric_only = isTRUE(params[["first_metric_only"]]) - , verbose = verbose + , verbose = params[["verbosity"]] > 0L ) ) } @@ -332,6 +347,9 @@ lgb.cv <- function(params = list() } booster <- Booster$new(params = params, train_set = dtrain) + if (isTRUE(eval_train_metric)) { + booster$add_valid(data = dtrain, name = "train") + } booster$add_valid(data = dtest, name = "valid") return( list(booster = booster) @@ -416,6 +434,10 @@ lgb.cv <- function(params = list() ) cv_booster$best_score <- cv_booster$record_evals[["valid"]][[first_metric]][[.EVAL_KEY()]][[cv_booster$best_iter]] } + # Propagate the best_iter attribute from the cv_booster to the individual boosters + for (bst in cv_booster$boosters) { + bst$booster$best_iter <- cv_booster$best_iter + } if (reset_data) { lapply(cv_booster$boosters, function(fd) { @@ -509,14 +531,14 @@ generate.cv.folds <- function(nfold, nrows, stratified, label, group, params) { #' @importFrom stats quantile lgb.stratified.folds <- function(y, k) { - ## Group the numeric data based on their magnitudes - ## and sample within those groups. - ## When the number of samples is low, we may have - ## issues further slicing the numeric data into - ## groups. The number of groups will depend on the - ## ratio of the number of folds to the sample size. - ## At most, we will use quantiles. If the sample - ## is too small, we just do regular unstratified CV + # Group the numeric data based on their magnitudes + # and sample within those groups. + # When the number of samples is low, we may have + # issues further slicing the numeric data into + # groups. The number of groups will depend on the + # ratio of the number of folds to the sample size. + # At most, we will use quantiles. If the sample + # is too small, we just do regular unstratified CV if (is.numeric(y)) { cuts <- length(y) %/% k @@ -536,29 +558,28 @@ lgb.stratified.folds <- function(y, k) { if (k < length(y)) { - ## Reset levels so that the possible levels and - ## the levels in the vector are the same + # Reset levels so that the possible levels and + # the levels in the vector are the same y <- as.factor(as.character(y)) numInClass <- table(y) foldVector <- vector(mode = "integer", length(y)) - ## For each class, balance the fold allocation as far - ## as possible, then resample the remainder. - ## The final assignment of folds is also randomized. - + # For each class, balance the fold allocation as far + # as possible, then resample the remainder. + # The final assignment of folds is also randomized. for (i in seq_along(numInClass)) { - ## Create a vector of integers from 1:k as many times as possible without - ## going over the number of samples in the class. Note that if the number - ## of samples in a class is less than k, nothing is producd here. 
+ # Create a vector of integers from 1:k as many times as possible without + # going over the number of samples in the class. Note that if the number + # of samples in a class is less than k, nothing is produced here. seqVector <- rep(seq_len(k), numInClass[i] %/% k) - ## Add enough random integers to get length(seqVector) == numInClass[i] + # Add enough random integers to get length(seqVector) == numInClass[i] if (numInClass[i] %% k > 0L) { seqVector <- c(seqVector, sample.int(k, numInClass[i] %% k)) } - ## Shuffle the integers for fold assignment and assign to this classes's data + # Shuffle the integers for fold assignment and assign to this class's data foldVector[y == dimnames(numInClass)$y[i]] <- sample(seqVector) } diff --git a/R-package/R/lgb.restore_handle.R b/R-package/R/lgb.restore_handle.R index be3036a52986..3d707635174f 100644 --- a/R-package/R/lgb.restore_handle.R +++ b/R-package/R/lgb.restore_handle.R @@ -15,9 +15,8 @@ #' model <- lightgbm( #' agaricus.train$data #' , agaricus.train$label -#' , params = list(objective = "binary", nthreads = 1L) +#' , params = list(objective = "binary") #' , nrounds = 5L -#' , save_name = NULL #' , verbose = 0) #' fname <- tempfile(fileext="rds") #' saveRDS(model, fname) diff --git a/R-package/R/lgb.train.R b/R-package/R/lgb.train.R index b3e19c7185d2..8cf3a95eaf2e 100644 --- a/R-package/R/lgb.train.R +++ b/R-package/R/lgb.train.R @@ -1,6 +1,8 @@ #' @name lgb.train #' @title Main training logic for LightGBM -#' @description Logic to train with LightGBM +#' @description Low-level R interface to train a LightGBM model. Unlike \code{\link{lightgbm}}, +#' this function is focused on performance (e.g. speed, memory efficiency). It is also +#' less likely to have breaking API changes in new releases than \code{\link{lightgbm}}. #' @inheritParams lgb_shared_params #' @param valids a list of \code{lgb.Dataset} objects, used for validation #' @param record Boolean, TRUE will record iteration message to \code{booster$record_evals} @@ -73,22 +75,30 @@ lgb.train <- function(params = list(), } } - # Setup temporary variables - params$verbose <- verbose - params <- lgb.check.obj(params = params, obj = obj) - params <- lgb.check.eval(params = params, eval = eval) - fobj <- NULL - eval_functions <- list(NULL) - # set some parameters, resolving the way they were passed in with other parameters # in `params`. # this ensures that the model stored with Booster$save() correctly represents # what was passed in + params <- lgb.check.wrapper_param( + main_param_name = "verbosity" + , params = params + , alternative_kwarg_value = verbose + ) params <- lgb.check.wrapper_param( main_param_name = "num_iterations" , params = params , alternative_kwarg_value = nrounds ) + params <- lgb.check.wrapper_param( + main_param_name = "metric" + , params = params + , alternative_kwarg_value = NULL + ) + params <- lgb.check.wrapper_param( + main_param_name = "objective" + , params = params + , alternative_kwarg_value = obj + ) params <- lgb.check.wrapper_param( main_param_name = "early_stopping_round" , params = params @@ -96,16 +106,19 @@ ) early_stopping_rounds <- params[["early_stopping_round"]] - # Check for objective (function or not) + # extract any function objects passed for objective or metric + fobj <- NULL if (is.function(params$objective)) { fobj <- params$objective - params$objective <- "NONE" + params$objective <- "none" } # If eval is a single function, store it as a 1-element list # (for backwards compatibility). 
If it is a list of functions, store # all of them. This makes it possible to pass any mix of strings like "auc" # and custom functions to eval + params <- lgb.check.eval(params = params, eval = eval) + eval_functions <- list(NULL) if (is.function(eval)) { eval_functions <- list(eval) } @@ -199,13 +212,13 @@ lgb.train <- function(params = list(), } # Add printing log callback - if (verbose > 0L && eval_freq > 0L) { - callbacks <- add.cb(cb_list = callbacks, cb = cb.print.evaluation(period = eval_freq)) + if (params[["verbosity"]] > 0L && eval_freq > 0L) { + callbacks <- add.cb(cb_list = callbacks, cb = cb_print_evaluation(period = eval_freq)) } # Add evaluation log callback if (record && length(valids) > 0L) { - callbacks <- add.cb(cb_list = callbacks, cb = cb.record.evaluation()) + callbacks <- add.cb(cb_list = callbacks, cb = cb_record_evaluation()) } # Did user pass parameters that indicate they want to use early stopping? @@ -226,10 +239,10 @@ lgb.train <- function(params = list(), warning("Early stopping is not available in 'dart' mode.") using_early_stopping <- FALSE - # Remove the cb.early.stop() function if it was passed in to callbacks + # Remove the cb_early_stop() function if it was passed in to callbacks callbacks <- Filter( f = function(cb_func) { - !identical(attr(cb_func, "name"), "cb.early.stop") + !identical(attr(cb_func, "name"), "cb_early_stop") } , x = callbacks ) @@ -239,10 +252,10 @@ lgb.train <- function(params = list(), if (using_early_stopping) { callbacks <- add.cb( cb_list = callbacks - , cb = cb.early.stop( + , cb = cb_early_stop( stopping_rounds = early_stopping_rounds , first_metric_only = isTRUE(params[["first_metric_only"]]) - , verbose = verbose + , verbose = params[["verbosity"]] > 0L ) ) } diff --git a/R-package/R/lgb.unloader.R b/R-package/R/lgb.unloader.R deleted file mode 100644 index 443a0d1899e0..000000000000 --- a/R-package/R/lgb.unloader.R +++ /dev/null @@ -1,76 +0,0 @@ -#' @name lgb.unloader -#' @title Remove lightgbm and its objects from an environment -#' @description Attempts to unload LightGBM packages so you can remove objects cleanly without -#' having to restart R. This is useful for instance if an object becomes stuck for no -#' apparent reason and you do not want to restart R to fix the lost object. -#' @param restore Whether to reload \code{LightGBM} immediately after detaching from R. -#' Defaults to \code{TRUE} which means automatically reload \code{LightGBM} once -#' unloading is performed. -#' @param wipe Whether to wipe all \code{lgb.Dataset} and \code{lgb.Booster} from the global -#' environment. Defaults to \code{FALSE} which means to not remove them. -#' @param envir The environment to perform wiping on if \code{wipe == TRUE}. Defaults to -#' \code{.GlobalEnv} which is the global environment. -#' -#' @return NULL invisibly. 
-#' -#' @examples -#' \donttest{ -#' data(agaricus.train, package = "lightgbm") -#' train <- agaricus.train -#' dtrain <- lgb.Dataset(train$data, label = train$label) -#' data(agaricus.test, package = "lightgbm") -#' test <- agaricus.test -#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) -#' params <- list( -#' objective = "regression" -#' , metric = "l2" -#' , min_data = 1L -#' , learning_rate = 1.0 -#' ) -#' valids <- list(test = dtest) -#' model <- lgb.train( -#' params = params -#' , data = dtrain -#' , nrounds = 5L -#' , valids = valids -#' ) -#' -#' lgb.unloader(restore = FALSE, wipe = FALSE, envir = .GlobalEnv) -#' rm(model, dtrain, dtest) # Not needed if wipe = TRUE -#' gc() # Not needed if wipe = TRUE -#' -#' library(lightgbm) -#' # Do whatever you want again with LightGBM without object clashing -#' } -#' @export -lgb.unloader <- function(restore = TRUE, wipe = FALSE, envir = .GlobalEnv) { - - # Unload package - try(detach("package:lightgbm", unload = TRUE), silent = TRUE) - - # Should we wipe variables? (lgb.Booster, lgb.Dataset) - if (wipe) { - boosters <- Filter( - f = function(x) { - inherits(get(x, envir = envir), "lgb.Booster") - } - , x = ls(envir = envir) - ) - datasets <- Filter( - f = function(x) { - inherits(get(x, envir = envir), "lgb.Dataset") - } - , x = ls(envir = envir) - ) - rm(list = c(boosters, datasets), envir = envir) - gc(verbose = FALSE) - } - - # Load package back? - if (restore) { - library(lightgbm) - } - - return(invisible(NULL)) - -} diff --git a/R-package/R/lightgbm.R b/R-package/R/lightgbm.R index b40c8cc21c04..f2b085473107 100644 --- a/R-package/R/lightgbm.R +++ b/R-package/R/lightgbm.R @@ -43,7 +43,7 @@ #' These should follow the requirements from the descriptions above. #' } #' } -#' @param eval_freq evaluation output frequency, only effect when verbose > 0 +#' @param eval_freq evaluation output frequency, only effective when verbose > 0 and \code{valids} has been provided #' @param init_model path of model file of \code{lgb.Booster} object, will continue training from this model #' @param nrounds number of training rounds #' @param obj objective function, can be character or custom objective function. Examples include @@ -51,9 +51,10 @@ #' \code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass} #' @param params a list of parameters. See \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html}{ #' the "Parameters" section of the documentation} for a list of parameters and valid values. -#' @param verbose verbosity for output, if <= 0, also will disable the print of evaluation during training +#' @param verbose verbosity for output, if <= 0 and \code{valids} has been provided, also will disable the +#' printing of evaluation during training #' @param serializable whether to make the resulting objects serializable through functions such as -#' \code{save} or \code{saveRDS} (see section "Model serialization"). +#' \code{save} or \code{saveRDS} (see section "Model serialization"). #' @section Early Stopping: #' #' "early stopping" refers to stopping the training process if the model's performance on a given @@ -88,12 +89,37 @@ NULL #' @name lightgbm #' @title Train a LightGBM model -#' @description Simple interface for training a LightGBM model. +#' @description High-level R interface to train a LightGBM model. Unlike \code{\link{lgb.train}}, this function +#' is focused on compatibility with other statistics and machine learning interfaces in R. 
+#' This focus on compatibility means that this interface may experience more frequent breaking API changes +#' than \code{\link{lgb.train}}. +#' For efficiency-sensitive applications, or for applications where breaking API changes across releases +#' is very expensive, use \code{\link{lgb.train}}. #' @inheritParams lgb_shared_params #' @param label Vector of labels, used if \code{data} is not an \code{\link{lgb.Dataset}} -#' @param weight vector of response values. If not NULL, will set to dataset -#' @param save_name File name to use when writing the trained model to disk. Should end in ".model". -#' If passing `NULL`, will not save the trained model to disk. +#' @param weights Sample / observation weights for rows in the input data. If \code{NULL}, will assume that all +#' observations / rows have the same importance / weight. +#' @param objective Optimization objective (e.g. `"regression"`, `"binary"`, etc.). +#' For a list of accepted objectives, see +#' \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#objective}{ +#' the "objective" item of the "Parameters" section of the documentation}. +#' @param init_score initial score is the base prediction lightgbm will boost from +#' @param num_threads Number of parallel threads to use. For best speed, this should be set to the number of +#' physical cores in the CPU - in a typical x86-64 machine, this corresponds to half the +#' number of maximum threads. +#' +#' Be aware that using too many threads can result in speed degradation in smaller datasets +#' (see the parameters documentation for more details). +#' +#' If passing zero, will use the default number of threads configured for OpenMP +#' (typically controlled through an environment variable \code{OMP_NUM_THREADS}). +#' +#' If passing \code{NULL} (the default), will try to use the number of physical cores in the +#' system, but be aware that getting the number of cores detected correctly requires package +#' \code{RhpcBLASctl} to be installed. +#' +#' This parameter gets overridden by \code{num_threads} and its aliases under \code{params} +#' if passed there. #' @param ... Additional arguments passed to \code{\link{lgb.train}}. For example #' \itemize{ #' \item{\code{valids}: a list of \code{lgb.Dataset} objects, used for validation} @@ -114,16 +140,18 @@ NULL #' @export lightgbm <- function(data, label = NULL, - weight = NULL, + weights = NULL, params = list(), nrounds = 100L, verbose = 1L, eval_freq = 1L, early_stopping_rounds = NULL, - save_name = "lightgbm.model", init_model = NULL, callbacks = list(), serializable = TRUE, + objective = "regression", + init_score = NULL, + num_threads = NULL, ...) 
{ # validate inputs early to avoid unnecessary computation @@ -131,19 +159,34 @@ lightgbm <- function(data, stop("nrounds should be greater than zero") } + if (is.null(num_threads)) { + num_threads <- lgb.get.default.num.threads() + } + params <- lgb.check.wrapper_param( + main_param_name = "num_threads" + , params = params + , alternative_kwarg_value = num_threads + ) + params <- lgb.check.wrapper_param( + main_param_name = "verbosity" + , params = params + , alternative_kwarg_value = verbose + ) + # Set data to a temporary variable dtrain <- data # Check whether data is lgb.Dataset, if not then create lgb.Dataset manually if (!lgb.is.Dataset(x = dtrain)) { - dtrain <- lgb.Dataset(data = data, label = label, weight = weight) + dtrain <- lgb.Dataset(data = data, label = label, weight = weights, init_score = init_score) } train_args <- list( "params" = params , "data" = dtrain , "nrounds" = nrounds - , "verbose" = verbose + , "obj" = objective + , "verbose" = params[["verbosity"]] , "eval_freq" = eval_freq , "early_stopping_rounds" = early_stopping_rounds , "init_model" = init_model @@ -156,22 +199,12 @@ lightgbm <- function(data, train_args[["valids"]] <- list() } - # Set validation as oneself - if (verbose > 0L) { - train_args[["valids"]][["train"]] <- dtrain - } - # Train a model using the regular way bst <- do.call( what = lgb.train , args = train_args ) - # Store model under a specific name - if (!is.null(save_name)) { - bst$save_model(filename = save_name) - } - return(bst) } diff --git a/R-package/R/utils.R b/R-package/R/utils.R index 1a4605f837cc..53c88475b589 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -1,13 +1,13 @@ lgb.is.Booster <- function(x) { - return(all(c("R6", "lgb.Booster") %in% class(x))) + return(all(c("R6", "lgb.Booster") %in% class(x))) # nolint: class_equals } lgb.is.Dataset <- function(x) { - return(all(c("R6", "lgb.Dataset") %in% class(x))) + return(all(c("R6", "lgb.Dataset") %in% class(x))) # nolint: class_equals } lgb.is.Predictor <- function(x) { - return(all(c("R6", "lgb.Predictor") %in% class(x))) + return(all(c("R6", "lgb.Predictor") %in% class(x))) # nolint: class_equals } lgb.is.null.handle <- function(x) { @@ -25,20 +25,19 @@ lgb.params2str <- function(params) { stop("params must be a list") } - # Split parameter names names(params) <- gsub("\\.", "_", names(params)) - + param_names <- names(params) ret <- list() # Perform key value join - for (key in names(params)) { + for (i in seq_along(params)) { # If a parameter has multiple values, join those values together with commas. 
# trimws() is necessary because format() will pad to make strings the same width val <- paste0( trimws( format( - x = params[[key]] + x = unname(params[[i]]) , scientific = FALSE ) ) @@ -47,7 +46,7 @@ lgb.params2str <- function(params) { if (nchar(val) <= 0L) next # Skip join # Join key value - pair <- paste0(c(key, val), collapse = "=") + pair <- paste0(c(param_names[[i]], val), collapse = "=") ret <- c(ret, pair) } @@ -70,7 +69,13 @@ lgb.check_interaction_constraints <- function(interaction_constraints, column_na if (!methods::is(interaction_constraints, "list")) { stop("interaction_constraints must be a list") } - if (!all(sapply(interaction_constraints, function(x) {is.character(x) || is.numeric(x)}))) { + constraint_is_character_or_numeric <- sapply( + X = interaction_constraints + , FUN = function(x) { + return(is.character(x) || is.numeric(x)) + } + ) + if (!all(constraint_is_character_or_numeric)) { stop("every element in interaction_constraints must be a character vector or numeric vector") } @@ -118,71 +123,6 @@ lgb.check_interaction_constraints <- function(interaction_constraints, column_na } -lgb.check.obj <- function(params, obj) { - - # List known objectives in a vector - OBJECTIVES <- c( - "regression" - , "regression_l1" - , "regression_l2" - , "mean_squared_error" - , "mse" - , "l2_root" - , "root_mean_squared_error" - , "rmse" - , "mean_absolute_error" - , "mae" - , "quantile" - , "huber" - , "fair" - , "poisson" - , "binary" - , "lambdarank" - , "multiclass" - , "softmax" - , "multiclassova" - , "multiclass_ova" - , "ova" - , "ovr" - , "xentropy" - , "cross_entropy" - , "xentlambda" - , "cross_entropy_lambda" - , "mean_absolute_percentage_error" - , "mape" - , "gamma" - , "tweedie" - , "rank_xendcg" - , "xendcg" - , "xe_ndcg" - , "xe_ndcg_mart" - , "xendcg_mart" - ) - - # Check whether the objective is empty or not, and take it from params if needed - if (!is.null(obj)) { - params$objective <- obj - } - - # Check whether the objective is a character - if (is.character(params$objective)) { - - # If the objective is a character, check if it is a known objective - if (!(params$objective %in% OBJECTIVES)) { - - stop("lgb.check.obj: objective name error should be one of (", paste0(OBJECTIVES, collapse = ", "), ")") - - } - - } else if (!is.function(params$objective)) { - - stop("lgb.check.obj: objective should be a character or a function") - - } - - return(params) - -} # [description] # Take any character values from eval and store them in params$metric. @@ -268,7 +208,7 @@ lgb.check.wrapper_param <- function(main_param_name, params, alternative_kwarg_v return(params) } - # if the main parameter wasn't proovided, prefer the first alias + # if the main parameter wasn't provided, prefer the first alias if (length(aliases_provided) > 0L) { first_param <- aliases_provided[1L] params[[main_param_name]] <- params[[first_param]] @@ -283,3 +223,26 @@ lgb.check.wrapper_param <- function(main_param_name, params, alternative_kwarg_v params[[main_param_name]] <- alternative_kwarg_value return(params) } + +#' @importFrom parallel detectCores +lgb.get.default.num.threads <- function() { + if (requireNamespace("RhpcBLASctl", quietly = TRUE)) { # nolint: undesirable_function + return(RhpcBLASctl::get_num_cores()) + } else { + msg <- "Optional package 'RhpcBLASctl' not found." 
+ cores <- 0L + if (Sys.info()["sysname"] != "Linux") { + cores <- parallel::detectCores(logical = FALSE) + if (is.na(cores) || cores < 0L) { + cores <- 0L + } + } + if (cores == 0L) { + msg <- paste(msg, "Will use default number of OpenMP threads.", sep = " ") + } else { + msg <- paste(msg, "Detection of CPU cores might not be accurate.", sep = " ") + } + warning(msg) + return(cores) + } +} diff --git a/R-package/README.md b/R-package/README.md index d610fe0410a8..2280ba385e44 100644 --- a/R-package/README.md +++ b/R-package/README.md @@ -265,11 +265,13 @@ The example below shows how to generate code coverage for the R package on a mac ```shell # Install -sh build-cran-package.sh +sh build-cran-package.sh \ + --no-build-vignettes # Get coverage Rscript -e " \ - coverage <- covr::package_coverage('./lightgbm_r', type = 'tests', quiet = FALSE); + library(covr); + coverage <- covr::package_coverage('./lightgbm_r', type = 'tests', quiet = FALSE); print(coverage); covr::report(coverage, file = file.path(getwd(), 'coverage.html'), browse = TRUE); " @@ -333,6 +335,7 @@ At build time, `configure` will be run and used to create a file `Makevars`, usi ```shell docker run \ + --rm \ -v $(pwd):/opt/LightGBM \ -w /opt/LightGBM \ -t ubuntu:20.04 \ @@ -365,36 +368,6 @@ sh build-cran-package.sh R CMD check --as-cran lightgbm_*.tar.gz ``` -#### Solaris - -All packages uploaded to CRAN must pass `R CMD check` on Solaris 10. To test LightGBM on this operating system, you can use the free service [R Hub](https://builder.r-hub.io/), a free service generously provided by the R Consortium. - -```shell -sh build-cran-package.sh -``` - -```r -package_tarball <- paste0("lightgbm_", readLines("VERSION.txt")[1], ".tar.gz") -rhub::check( - path = package_tarball - , email = "your_email_here" - , check_args = "--as-cran" - , platform = c( - "solaris-x86-patched" - , "solaris-x86-patched-ods" - ) - , env_vars = c( - "R_COMPILE_AND_INSTALL_PACKAGES" = "always" - ) -) -``` - -Alternatively, GitHub Actions can run code above for you. On a pull request, create a comment with this phrase: - -> /gha run r-solaris - -**NOTE:** Please do this only once you see that other R tests on a pull request are passing. R Hub is a free resource with limited capacity, and we want to be respectful community members. - #### ASAN and UBSAN All packages uploaded to CRAN must pass builds using `gcc` and `clang`, instrumented with two sanitizers: the Address Sanitizer (ASAN) and the Undefined Behavior Sanitizer (UBSAN). @@ -425,7 +398,7 @@ docker run \ # install dependencies RDscript${R_CUSTOMIZATION} \ - -e "install.packages(c('R6', 'data.table', 'jsonlite', 'Matrix', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" + -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'RhpcBLASctl', 'rmarkdown', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" # install lightgbm sh build-cran-package.sh --r-executable=RD${R_CUSTOMIZATION} @@ -450,12 +423,13 @@ You can replicate these checks locally using Docker. 
Note that instrumented versions of R are required; the `wch1/r-debug` image used in the example below provides them. ```shell docker run \ --rm \ -v $(pwd):/opt/LightGBM \ -w /opt/LightGBM \ -it \ wch1/r-debug -RDscriptvalgrind -e "install.packages(c('R6', 'data.table', 'jsonlite', 'Matrix', 'testthat'), repos = 'https://cran.rstudio.com', Ncpus = parallel::detectCores())" +RDscriptvalgrind -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'RhpcBLASctl', 'rmarkdown', 'testthat'), repos = 'https://cran.rstudio.com', Ncpus = parallel::detectCores())" sh build-cran-package.sh \ --r-executable=RDvalgrind diff --git a/R-package/configure b/R-package/configure index 3a109c345562..b879e9c4c049 100755 --- a/R-package/configure +++ b/R-package/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for lightgbm 3.3.1.99. +# Generated by GNU Autoconf 2.69 for lightgbm 3.3.2.99. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -576,8 +576,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='lightgbm' PACKAGE_TARNAME='lightgbm' -PACKAGE_VERSION='3.3.1.99' -PACKAGE_STRING='lightgbm 3.3.1.99' +PACKAGE_VERSION='3.3.2.99' +PACKAGE_STRING='lightgbm 3.3.2.99' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1182,7 +1182,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures lightgbm 3.3.1.99 to adapt to many kinds of systems. +\`configure' configures lightgbm 3.3.2.99 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1244,7 +1244,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of lightgbm 3.3.1.99:";; + short | recursive ) echo "Configuration of lightgbm 3.3.2.99:";; esac cat <<\_ACEOF @@ -1311,7 +1311,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -lightgbm configure 3.3.1.99 +lightgbm configure 3.3.2.99 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -1328,7 +1328,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by lightgbm $as_me 3.3.1.99, which was +It was created by lightgbm $as_me 3.3.2.99, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2398,7 +2398,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by lightgbm $as_me 3.3.1.99, which was +This file was extended by lightgbm $as_me 3.3.2.99, which was generated by GNU Autoconf 2.69. 
Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -2451,7 +2451,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -lightgbm config.status 3.3.1.99 +lightgbm config.status 3.3.2.99 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/R-package/configure.win b/R-package/configure.win index d7f75f972771..b4b05ce55750 100755 --- a/R-package/configure.win +++ b/R-package/configure.win @@ -70,6 +70,30 @@ then LGB_CPPFLAGS="${LGB_CPPFLAGS} -DMM_MALLOC=1" fi +############# +# INET_PTON # +############# + +ac_inet_pton="no" + +cat > conftest.cpp <<EOL +#include <ws2tcpip.h> +int main() { + void* p = inet_pton; + return 0; +} +EOL + +${CXX} ${CXXFLAGS} ${CPPFLAGS} -o conftest conftest.cpp 2>/dev/null && ./conftest && ac_inet_pton="yes" +rm -f ./conftest +rm -f ./conftest.cpp +echo "checking whether INET_PTON works...${ac_inet_pton}" + +if test "${ac_inet_pton}" = "yes"; +then + LGB_CPPFLAGS="${LGB_CPPFLAGS} -DWIN_HAS_INET_PTON=1" +fi + # Generate Makevars.win from Makevars.win.in sed -e \ "s/@LGB_CPPFLAGS@/$LGB_CPPFLAGS/" \ diff --git a/R-package/cran-comments.md b/R-package/cran-comments.md index 1d354cfa3777..8b04d748149c 100644 --- a/R-package/cran-comments.md +++ b/R-package/cran-comments.md @@ -1,5 +1,120 @@ # CRAN Submission History +## v3.3.2 - Submission 1 - (TBD) + +### CRAN response + +### Maintainer Notes + +In this submission, we uploaded a patch that CRAN staff provided us via e-mail. The full text of the e-mail from CRAN: + +```text +Dear maintainers, + +This concerns the CRAN packages + +Cairo cepreader gpboost httpuv ipaddress lightgbm proj4 prophet +RcppCWB RcppParallel RDieHarder re2 redux rgeolocate RGtk2 tth +udunits2 unrtf + +maintained by one of you: + +Andreas Blaette <andreas.blaette@uni-due.de>: RcppCWB +David Hall <david.hall.physics@gmail.com>: ipaddress +Dirk Eddelbuettel <edd@debian.org>: RDieHarder +Fabio Sigrist <fabiosigrist@gmail.com>: gpboost +Friedrich Leisch <Friedrich.Leisch@R-project.org>: tth +Girish Palya <girishji@gmail.com>: re2 +James Hiebert <hiebert@uvic.ca>: udunits2 +Jari Oksanen <jhoksane@gmail.com>: cepreader +Kevin Ushey <kevin@rstudio.com>: RcppParallel +ORPHANED: RGtk2 +Os Keyes <ironholds@gmail.com>: rgeolocate +Rich FitzJohn <rich.fitzjohn@gmail.com>: redux +Sean Taylor <sjtz@pm.me>: prophet +Simon Urbanek <simon.urbanek@r-project.org>: proj4 +Simon Urbanek <Simon.Urbanek@r-project.org>: Cairo +Winston Chang <winston@rstudio.com>: httpuv +Yu Shi <yushi2@microsoft.com>: lightgbm + +your packages need to be updated for R-devel/R 4.2 to work on Windows, +following the recent switch to UCRT and Rtools42. + +Sorry for the group message, please feel free to respond individually +regarding your package or ask specifically about what needs to be fixed. 
+ +I've created patches for you, so please review them and fix your packages: + +https://nam06.safelinks.protection.outlook.com/?url=https%3A%2F%2Fsvn.r-project.org%2FR-dev-web%2Ftrunk%2FWindowsBuilds%2Fwinutf8%2Fucrt3%2Fr_packages%2Fpatches%2FCRAN%2F&data=04%7C01%7Cyushi2%40microsoft.com%7C8e6c353d1a8842c81eeb08d9bef5d835%7C72f988bf86f141af91ab2d7cd011db47%7C1%7C0%7C637750786169848244%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000&sdata=rFGf7Y4Dvo6g1kzV%2BeAJDLGm1TUtzQsLsavElTw6H1U%3D&reserved=0 + +You can apply them as follows + +tar xfz package_1.0.0.tar.gz + +wget +https://nam06.safelinks.protection.outlook.com/?url=https%3A%2F%2Fsvn.r-project.org%2FR-dev-web%2Ftrunk%2FWindowsBuilds%2Fwinutf8%2Fucrt3%2Fr_packages%2Fpatches%2FCRAN%2Fpackage.diff&data=04%7C01%7Cyushi2%40microsoft.com%7C8e6c353d1a8842c81eeb08d9bef5d835%7C72f988bf86f141af91ab2d7cd011db47%7C1%7C0%7C637750786169848244%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000&sdata=iyTjhoqvzj3IbQ8HGCZeh1IQl34FAGpIdVyZWkzNvO0%3D&reserved=0 + +patch --binary < package.diff + +These patches are currently automatically applied by R-devel on Windows +at installation time, which makes most of your packages pass their +checks (as OK or NOTE), but please check your results carefully and +carefully review the patches. Usually these changes were because of +newer GCC or newer MinGW in the toolchain, but some for other reasons, +and some of them will definitely have to be improved so that the package +keeps building also for older versions of R using Rtools40. We have only +been testing the patches with UCRT (and Rtools42) on Windows. + +For more information, please see + +https://nam06.safelinks.protection.outlook.com/?url=https%3A%2F%2Fdeveloper.r-project.org%2FBlog%2Fpublic%2F2021%2F12%2F07%2Fupcoming-changes-in-r-4.2-on-windows%2F&data=04%7C01%7Cyushi2%40microsoft.com%7C8e6c353d1a8842c81eeb08d9bef5d835%7C72f988bf86f141af91ab2d7cd011db47%7C1%7C0%7C637750786169848244%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000&sdata=SY77zgtbDbHvTxTgPLOoe%2Fw5OZDhXvJoxpVOoEaKoYo%3D&reserved=0 +https://nam06.safelinks.protection.outlook.com/?url=https%3A%2F%2Fdeveloper.r-project.org%2FWindowsBuilds%2Fwinutf8%2Fucrt3%2Fhowto.html&data=04%7C01%7Cyushi2%40microsoft.com%7C8e6c353d1a8842c81eeb08d9bef5d835%7C72f988bf86f141af91ab2d7cd011db47%7C1%7C0%7C637750786169848244%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000&sdata=dlVJ4nhQlmDPd56bHoVsWZuRfrUUorvOWxoUTmVDM%2Bg%3D&reserved=0 + +Once you add your patches/fix the issues, your package will probably +show a warning during R CMD check (as patching would be attempted to be +applied again). That's ok, at that point please let me know and I will +remove my patch from the repository of automatically applied patches. + +If you end up just applying the patch as is, there is probably no need +testing on your end, but you can do so using Winbuilder, r-hub, github +actions (e.g. https://nam06.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgithub.com%2Fkalibera%2Fucrt3&data=04%7C01%7Cyushi2%40microsoft.com%7C8e6c353d1a8842c81eeb08d9bef5d835%7C72f988bf86f141af91ab2d7cd011db47%7C1%7C0%7C637750786169848244%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000&sdata=msqoPzqDStlAUn%2Bb6gGevwFPD%2FaNL5dTxiNud2Sqzy8%3D&reserved=0). 
+
+If you wanted to test locally on your Windows machine and do not have a
+UCRT version of R-devel yet, please uninstall your old version of
+R-devel, delete the old library used with that, install a new UCRT
+version of R-devel, and install Rtools42. You can keep Rtools40
+installed if you need it with R 4.1 or earlier.
+
+Currently, the new R-devel can be downloaded from
+https://www.r-project.org/nosvn/winutf8/ucrt3/web/rdevel.html
+
+And Rtools42 from
+https://www.r-project.org/nosvn/winutf8/ucrt3/web/rtools.html
+
+If you end up testing locally, you can use R_INSTALL_TIME_PATCHES
+environment variable to disable the automated patching, see the "howto"
+document above. That way you could also see what the original issue was
+causing.
+
+If you wanted to find libraries to link for yourself, e.g. in a newer
+version of your package, please look for "Using findLinkingOrder with
+Rtools42 (tiff package example)" in the "howto" document above. I
+created the patches for you manually before we finished this script, so
+you may be able to create a shorter version using it, but - it's
+probably not worth the effort.
+
+If you wanted to try in a virtual machine, but did not have a license,
+you can use also an automated setup of a free trial VM from
+https://developer.r-project.org/Blog/public/2021/03/18/virtual-windows-machine-for-checking-r-packages
+
+(but that needs a very good and un-metered network connection to install)
+
+Please let us know if you have any questions.
+ +Thanks, +Tomas & Uwe +``` + ## v3.3.1 - Submission 1 - (October 27, 2021) ### CRAN response diff --git a/R-package/demo/boost_from_prediction.R b/R-package/demo/boost_from_prediction.R index b6b3f1ceba7b..5ee04f2d756e 100644 --- a/R-package/demo/boost_from_prediction.R +++ b/R-package/demo/boost_from_prediction.R @@ -22,8 +22,8 @@ param <- list( bst <- lgb.train(param, dtrain, 1L, valids = valids) # Note: we need the margin value instead of transformed prediction in set_init_score -ptrain <- predict(bst, agaricus.train$data, rawscore = TRUE) -ptest <- predict(bst, agaricus.test$data, rawscore = TRUE) +ptrain <- predict(bst, agaricus.train$data, type = "raw") +ptest <- predict(bst, agaricus.test$data, type = "raw") # set the init_score property of dtrain and dtest # base margin is the base prediction we will boost from diff --git a/R-package/demo/leaf_stability.R b/R-package/demo/leaf_stability.R index af1c533ac5b1..0733f31c3f87 100644 --- a/R-package/demo/leaf_stability.R +++ b/R-package/demo/leaf_stability.R @@ -111,7 +111,7 @@ new_data <- data.frame( X = rowMeans(predict( model , agaricus.test$data - , predleaf = TRUE + , type = "leaf" )) , Y = pmin( pmax( @@ -162,7 +162,7 @@ new_data2 <- data.frame( X = rowMeans(predict( model2 , agaricus.test$data - , predleaf = TRUE + , type = "leaf" )) , Y = pmin( pmax( @@ -218,7 +218,7 @@ new_data3 <- data.frame( X = rowMeans(predict( model3 , agaricus.test$data - , predleaf = TRUE + , type = "leaf" )) , Y = pmin( pmax( diff --git a/R-package/demo/multiclass.R b/R-package/demo/multiclass.R index 0f52fecc0b26..35441ccec983 100644 --- a/R-package/demo/multiclass.R +++ b/R-package/demo/multiclass.R @@ -56,21 +56,15 @@ model <- lgb.train( # We can predict on test data, identical my_preds <- predict(model, test[, 1L:4L]) -# A (30x3) matrix with the predictions, use parameter reshape +# A (30x3) matrix with the predictions # class1 class2 class3 # obs1 obs1 obs1 # obs2 obs2 obs2 # .... .... .... 
-my_preds <- predict(model, test[, 1L:4L], reshape = TRUE)
+my_preds <- predict(model, test[, 1L:4L])
 
 # We can also get the predicted scores before the Sigmoid/Softmax application
-my_preds <- predict(model, test[, 1L:4L], rawscore = TRUE)
-
-# Raw score predictions as matrix instead of vector
-my_preds <- predict(model, test[, 1L:4L], rawscore = TRUE, reshape = TRUE)
+my_preds <- predict(model, test[, 1L:4L], type = "raw")
 
 # We can also get the leaf index
-my_preds <- predict(model, test[, 1L:4L], predleaf = TRUE)
-
-# Predict leaf index as matrix instead of vector
-my_preds <- predict(model, test[, 1L:4L], predleaf = TRUE, reshape = TRUE)
+my_preds <- predict(model, test[, 1L:4L], type = "leaf")
diff --git a/R-package/demo/multiclass_custom_objective.R b/R-package/demo/multiclass_custom_objective.R
index a1e8edc958aa..09bdd322179c 100644
--- a/R-package/demo/multiclass_custom_objective.R
+++ b/R-package/demo/multiclass_custom_objective.R
@@ -36,7 +36,7 @@ model_builtin <- lgb.train(
     , obj = "multiclass"
 )
 
-preds_builtin <- predict(model_builtin, test[, 1L:4L], rawscore = TRUE, reshape = TRUE)
+preds_builtin <- predict(model_builtin, test[, 1L:4L], type = "raw")
 probs_builtin <- exp(preds_builtin) / rowSums(exp(preds_builtin))
 
 # Method 2 of training with custom objective function
@@ -109,7 +109,7 @@ model_custom <- lgb.train(
     , eval = custom_multiclass_metric
 )
 
-preds_custom <- predict(model_custom, test[, 1L:4L], rawscore = TRUE, reshape = TRUE)
+preds_custom <- predict(model_custom, test[, 1L:4L], type = "raw")
 probs_custom <- exp(preds_custom) / rowSums(exp(preds_custom))
 
 # compare predictions
diff --git a/R-package/demo/weight_param.R b/R-package/demo/weight_param.R
index 9702de41ece9..8fd8ae17f087 100644
--- a/R-package/demo/weight_param.R
+++ b/R-package/demo/weight_param.R
@@ -6,13 +6,13 @@ library(lightgbm)
 
 # We will train a model with the following scenarios:
-# - Run 1: sum of weights equal to 0.06513 without adjusted regularization (not learning)
-# - Run 2: sum of weights equal to 0.06513 with adjusted regularization (learning)
-# - Run 3: sum of weights equal to 6513 (x 1e5) with adjusted regularization (learning)
+# - Run 1: sum of weights equal to 6513 (x 1e-5) without adjusted regularization (not learning)
+# - Run 2: sum of weights equal to 6513 (x 1e-5) with adjusted regularization (learning)
+# - Run 3: sum of weights equal to 6513 with adjusted regularization (learning)
 
 # Setup small weights
-weights1 <- rep(1.0 / 100000.0, 6513L)
-weights2 <- rep(1.0 / 100000.0, 1611L)
+weights1 <- rep(1e-5, 6513L)
+weights2 <- rep(1e-5, 1611L)
 
 # Load data and create datasets
 data(agaricus.train, package = "lightgbm")
@@ -23,7 +23,7 @@ test <- agaricus.test
 dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label, weight = weights2)
 valids <- list(test = dtest)
 
-# Run 1: sum of weights equal to 0.06513 without adjusted regularization (not learning)
+# Run 1: sum of weights equal to 6513 (x 1e-5) without adjusted regularization (not learning)
 # It cannot learn because regularization is too large!
 # min_sum_hessian alone is bigger than the sum of weights, thus you will never learn anything
 params <- list(
@@ -47,7 +47,7 @@ model <- lgb.train(
 
 weight_loss <- as.numeric(model$record_evals$test$l2$eval)
 plot(weight_loss) # Shows how poor the learning was: a straight line!
-# Run 2: sum of weights equal to 0.06513 with adjusted regularization (learning)
+# Run 2: sum of weights equal to 6513 (x 1e-5) with adjusted regularization (learning)
 # Adjusted regularization just consists of multiplying results by 1e4 (x10000)
 # Notice how it learns, there is no issue as we adjusted regularization ourselves
 params <- list(
@@ -71,17 +71,8 @@ model <- lgb.train(
 
 small_weight_loss <- as.numeric(model$record_evals$test$l2$eval)
 plot(small_weight_loss) # It learns!
 
-# Run 3: sum of weights equal to 6513 (x 1e5) with adjusted regularization (learning)
-# To make it better, we are first cleaning the environment and reloading LightGBM
-lgb.unloader(wipe = TRUE)
-
-# And now, we are doing as usual
-library(lightgbm)
-data(agaricus.train, package = "lightgbm")
-train <- agaricus.train
+# Run 3: sum of weights equal to 6513 with adjusted regularization (learning)
 dtrain <- lgb.Dataset(train$data, label = train$label)
-data(agaricus.test, package = "lightgbm")
-test <- agaricus.test
 dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
 valids <- list(test = dtest)
diff --git a/R-package/inst/make-r-def.R b/R-package/inst/make-r-def.R
index 94f548a51334..726041e64aad 100644
--- a/R-package/inst/make-r-def.R
+++ b/R-package/inst/make-r-def.R
@@ -24,7 +24,7 @@ message(sprintf("Creating '%s' from '%s'", OUT_DEF_FILE, IN_DLL_FILE))
 .pipe_shell_command_to_stdout <- function(command, args, out_file) {
     has_processx <- suppressMessages({
         suppressWarnings({
-            require("processx")  # nolint
+            require("processx")  # nolint: undesirable_function
        })
    })
    if (has_processx) {
@@ -71,7 +71,7 @@ invisible(file.remove(OBJDUMP_FILE))
 # see https://www.cs.colorado.edu/~main/cs1300/doc/mingwfaq.html
 start_index <- which(
    grepl(
-        pattern = "[Ordinal/Name Pointer] Table"
+        pattern = "[Ordinal/Name Pointer] Table"  # nolint: non_portable_path
        , x = objdump_results
        , fixed = TRUE
    )
diff --git a/R-package/man/lgb.cv.Rd b/R-package/man/lgb.cv.Rd
index f240a241b7ac..0717bff7d58f 100644
--- a/R-package/man/lgb.cv.Rd
+++ b/R-package/man/lgb.cv.Rd
@@ -25,7 +25,8 @@ lgb.cv(
   early_stopping_rounds = NULL,
   callbacks = list(),
   reset_data = FALSE,
-  serializable = TRUE
+  serializable = TRUE,
+  eval_train_metric = FALSE
 )
 }
 \arguments{
@@ -82,11 +83,12 @@ may allow you to pass other types of data like \code{matrix} and then separately
  }
 }}
 
-\item{verbose}{verbosity for output, if <= 0, also will disable the print of evaluation during training}
+\item{verbose}{verbosity for output, if <= 0 and \code{valids} has been provided, also will disable the
+printing of evaluation during training}
 
 \item{record}{Boolean, TRUE will record iteration message to \code{booster$record_evals}}
 
-\item{eval_freq}{evaluation output frequency, only effect when verbose > 0}
+\item{eval_freq}{evaluation output frequency, only effective when verbose > 0 and \code{valids} has been provided}
 
 \item{showsd}{\code{boolean}, whether to show standard deviation of cross validation.
This parameter defaults to \code{TRUE}. Setting it to \code{FALSE} can lead to a
@@ -120,6 +122,10 @@ into a predictor model which frees up memory and the original datasets}
 
 \item{serializable}{whether to make the resulting objects serializable through functions such as
\code{save} or \code{saveRDS} (see section "Model serialization").}
+
+\item{eval_train_metric}{\code{boolean}, whether to add the cross validation results on the
+training data. This parameter defaults to \code{FALSE}.
Setting it to \code{TRUE} +will increase run time.} } \value{ a trained model \code{lgb.CVBooster}. diff --git a/R-package/man/lgb.restore_handle.Rd b/R-package/man/lgb.restore_handle.Rd index 199614241502..31a0fcf9c545 100644 --- a/R-package/man/lgb.restore_handle.Rd +++ b/R-package/man/lgb.restore_handle.Rd @@ -25,9 +25,8 @@ data("agaricus.train") model <- lightgbm( agaricus.train$data , agaricus.train$label - , params = list(objective = "binary", nthreads = 1L) + , params = list(objective = "binary") , nrounds = 5L - , save_name = NULL , verbose = 0) fname <- tempfile(fileext="rds") saveRDS(model, fname) diff --git a/R-package/man/lgb.train.Rd b/R-package/man/lgb.train.Rd index ba01601edb88..6090646b31f7 100644 --- a/R-package/man/lgb.train.Rd +++ b/R-package/man/lgb.train.Rd @@ -73,11 +73,12 @@ may allow you to pass other types of data like \code{matrix} and then separately } }} -\item{verbose}{verbosity for output, if <= 0, also will disable the print of evaluation during training} +\item{verbose}{verbosity for output, if <= 0 and \code{valids} has been provided, also will disable the +printing of evaluation during training} \item{record}{Boolean, TRUE will record iteration message to \code{booster$record_evals}} -\item{eval_freq}{evaluation output frequency, only effect when verbose > 0} +\item{eval_freq}{evaluation output frequency, only effective when verbose > 0 and \code{valids} has been provided} \item{init_model}{path of model file of \code{lgb.Booster} object, will continue training from this model} @@ -106,7 +107,9 @@ original datasets} a trained booster model \code{lgb.Booster}. } \description{ -Logic to train with LightGBM +Low-level R interface to train a LightGBM model. Unlike \code{\link{lightgbm}}, + this function is focused on performance (e.g. speed, memory efficiency). It is also + less likely to have breaking API changes in new releases than \code{\link{lightgbm}}. } \section{Early Stopping}{ diff --git a/R-package/man/lgb.unloader.Rd b/R-package/man/lgb.unloader.Rd deleted file mode 100644 index 5a07d3eb1e17..000000000000 --- a/R-package/man/lgb.unloader.Rd +++ /dev/null @@ -1,57 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/lgb.unloader.R -\name{lgb.unloader} -\alias{lgb.unloader} -\title{Remove lightgbm and its objects from an environment} -\usage{ -lgb.unloader(restore = TRUE, wipe = FALSE, envir = .GlobalEnv) -} -\arguments{ -\item{restore}{Whether to reload \code{LightGBM} immediately after detaching from R. -Defaults to \code{TRUE} which means automatically reload \code{LightGBM} once -unloading is performed.} - -\item{wipe}{Whether to wipe all \code{lgb.Dataset} and \code{lgb.Booster} from the global -environment. Defaults to \code{FALSE} which means to not remove them.} - -\item{envir}{The environment to perform wiping on if \code{wipe == TRUE}. Defaults to -\code{.GlobalEnv} which is the global environment.} -} -\value{ -NULL invisibly. -} -\description{ -Attempts to unload LightGBM packages so you can remove objects cleanly without - having to restart R. This is useful for instance if an object becomes stuck for no - apparent reason and you do not want to restart R to fix the lost object. 
-} -\examples{ -\donttest{ -data(agaricus.train, package = "lightgbm") -train <- agaricus.train -dtrain <- lgb.Dataset(train$data, label = train$label) -data(agaricus.test, package = "lightgbm") -test <- agaricus.test -dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) -params <- list( - objective = "regression" - , metric = "l2" - , min_data = 1L - , learning_rate = 1.0 -) -valids <- list(test = dtest) -model <- lgb.train( - params = params - , data = dtrain - , nrounds = 5L - , valids = valids -) - -lgb.unloader(restore = FALSE, wipe = FALSE, envir = .GlobalEnv) -rm(model, dtrain, dtest) # Not needed if wipe = TRUE -gc() # Not needed if wipe = TRUE - -library(lightgbm) -# Do whatever you want again with LightGBM without object clashing -} -} diff --git a/R-package/man/lgb_shared_params.Rd b/R-package/man/lgb_shared_params.Rd index b95c258e5f75..ce3de8d7a342 100644 --- a/R-package/man/lgb_shared_params.Rd +++ b/R-package/man/lgb_shared_params.Rd @@ -50,7 +50,7 @@ set to the iteration number of the best iteration.} } }} -\item{eval_freq}{evaluation output frequency, only effect when verbose > 0} +\item{eval_freq}{evaluation output frequency, only effective when verbose > 0 and \code{valids} has been provided} \item{init_model}{path of model file of \code{lgb.Booster} object, will continue training from this model} @@ -63,7 +63,8 @@ set to the iteration number of the best iteration.} \item{params}{a list of parameters. See \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html}{ the "Parameters" section of the documentation} for a list of parameters and valid values.} -\item{verbose}{verbosity for output, if <= 0, also will disable the print of evaluation during training} +\item{verbose}{verbosity for output, if <= 0 and \code{valids} has been provided, also will disable the +printing of evaluation during training} \item{serializable}{whether to make the resulting objects serializable through functions such as \code{save} or \code{saveRDS} (see section "Model serialization").} diff --git a/R-package/man/lightgbm.Rd b/R-package/man/lightgbm.Rd index 1e6be676f62e..4e834d040e33 100644 --- a/R-package/man/lightgbm.Rd +++ b/R-package/man/lightgbm.Rd @@ -7,16 +7,18 @@ lightgbm( data, label = NULL, - weight = NULL, + weights = NULL, params = list(), nrounds = 100L, verbose = 1L, eval_freq = 1L, early_stopping_rounds = NULL, - save_name = "lightgbm.model", init_model = NULL, callbacks = list(), serializable = TRUE, + objective = "regression", + init_score = NULL, + num_threads = NULL, ... ) } @@ -27,16 +29,18 @@ may allow you to pass other types of data like \code{matrix} and then separately \item{label}{Vector of labels, used if \code{data} is not an \code{\link{lgb.Dataset}}} -\item{weight}{vector of response values. If not NULL, will set to dataset} +\item{weights}{Sample / observation weights for rows in the input data. If \code{NULL}, will assume that all +observations / rows have the same importance / weight.} \item{params}{a list of parameters. 
See \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html}{
the "Parameters" section of the documentation} for a list of parameters and valid values.}

\item{nrounds}{number of training rounds}

-\item{verbose}{verbosity for output, if <= 0, also will disable the print of evaluation during training}
+\item{verbose}{verbosity for output, if <= 0 and \code{valids} has been provided, also will disable the
+printing of evaluation during training}

-\item{eval_freq}{evaluation output frequency, only effect when verbose > 0}
+\item{eval_freq}{evaluation output frequency, only effective when verbose > 0 and \code{valids} has been provided}

\item{early_stopping_rounds}{int. Activates early stopping.
When this parameter is non-null,
training will stop if the evaluation of any metric on any validation set
@@ -44,9 +48,6 @@ fails to improve for \code{early_stopping_rounds} consecutive boosting rounds.
If training stops early, the returned model will have attribute \code{best_iter}
set to the iteration number of the best iteration.}

-\item{save_name}{File name to use when writing the trained model to disk. Should end in ".model".
-If passing `NULL`, will not save the trained model to disk.}
-
\item{init_model}{path of model file of \code{lgb.Booster} object, will continue training from this model}

\item{callbacks}{List of callback functions that are applied at each iteration.}
@@ -54,6 +55,30 @@ If passing `NULL`, will not save the trained model to disk.}
\item{serializable}{whether to make the resulting objects serializable through functions such as
\code{save} or \code{saveRDS} (see section "Model serialization").}

+\item{objective}{Optimization objective (e.g. `"regression"`, `"binary"`, etc.).
+For a list of accepted objectives, see
+\href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#objective}{
+the "objective" item of the "Parameters" section of the documentation}.}
+
+\item{init_score}{initial score is the base prediction lightgbm will boost from}
+
+\item{num_threads}{Number of parallel threads to use. For best speed, this should be set to the number of
+  physical cores in the CPU - in a typical x86-64 machine, this corresponds to half the
+  number of maximum threads.
+
+  Be aware that using too many threads can result in speed degradation in smaller datasets
+  (see the parameters documentation for more details).
+
+  If passing zero, will use the default number of threads configured for OpenMP
+  (typically controlled through an environment variable \code{OMP_NUM_THREADS}).
+
+  If passing \code{NULL} (the default), will try to use the number of physical cores in the
+  system, but be aware that getting the number of cores detected correctly requires package
+  \code{RhpcBLASctl} to be installed.
+
+  This parameter gets overridden by \code{num_threads} and its aliases under \code{params}
+  if passed there.}
+
\item{...}{Additional arguments passed to \code{\link{lgb.train}}. For example
    \itemize{
        \item{\code{valids}: a list of \code{lgb.Dataset} objects, used for validation}
@@ -74,7 +99,12 @@ If passing `NULL`, will not save the trained model to disk.}
a trained \code{lgb.Booster}
}
\description{
-Simple interface for training a LightGBM model.
+High-level R interface to train a LightGBM model. Unlike \code{\link{lgb.train}}, this function
+  is focused on compatibility with other statistics and machine learning interfaces in R.
+  This focus on compatibility means that this interface may experience more frequent breaking API changes
+  than \code{\link{lgb.train}}.
+  For efficiency-sensitive applications, or for applications where breaking API changes across releases
+  is very expensive, use \code{\link{lgb.train}}.
}
\section{Early Stopping}{

diff --git a/R-package/man/predict.lgb.Booster.Rd b/R-package/man/predict.lgb.Booster.Rd
index 8948a4b17d01..7d9734d9181f 100644
--- a/R-package/man/predict.lgb.Booster.Rd
+++ b/R-package/man/predict.lgb.Booster.Rd
@@ -6,14 +6,11 @@
 \usage{
 \method{predict}{lgb.Booster}(
   object,
-  data,
+  newdata,
+  type = "response",
   start_iteration = NULL,
   num_iteration = NULL,
-  rawscore = FALSE,
-  predleaf = FALSE,
-  predcontrib = FALSE,
   header = FALSE,
-  reshape = FALSE,
   params = list(),
   ...
 )
@@ -21,9 +18,30 @@
 \arguments{
 \item{object}{Object of class \code{lgb.Booster}}
 
-\item{data}{a \code{matrix} object, a \code{dgCMatrix} object or
+\item{newdata}{a \code{matrix} object, a \code{dgCMatrix} object or
 a character representing a path to a text file (CSV, TSV, or LibSVM)}
 
+\item{type}{Type of prediction to output. Allowed types are:\itemize{
+ \item \code{"response"}: will output the predicted score according to the objective function being
+ optimized (depending on the link function that the objective uses), after applying any necessary
+ transformations - for example, for \code{objective="binary"}, it will output class probabilities.
+ \item \code{"class"}: for classification objectives, will output the class with the highest predicted
+ probability. For other objectives, will output the same as "response".
+ \item \code{"raw"}: will output the non-transformed numbers (sum of predictions from boosting iterations'
+ results) from which the "response" number is produced for a given objective function - for example,
+ for \code{objective="binary"}, this corresponds to log-odds. For many objectives such as
+ "regression", since no transformation is applied, the output will be the same as for "response".
+ \item \code{"leaf"}: will output the index of the terminal node / leaf at which each observation falls
+ in each tree in the model, outputted as integers, with one column per tree.
+ \item \code{"contrib"}: will return the per-feature contributions for each prediction, including an
+ intercept (each feature will produce one column). If there are multiple classes, each class will
+ have separate feature contributions (thus the number of columns is features+1 multiplied by the
+ number of classes).
+ }
+
+ Note that, if using custom objectives, types "class" and "response" will not be available and will
+ default to using "raw" instead.}
+
 \item{start_iteration}{int or None, optional (default=None)
 Start index of the iteration to predict.
 If None or <= 0, starts from the first iteration.}
 
@@ -34,34 +52,24 @@
 \item{num_iteration}{int or None, optional (default=None)
 Limit number of iterations in the prediction.
 If None, if the best iteration exists and start_iteration is None or <= 0, the
 best iteration is used; otherwise, all iterations from start_iteration are used.
 If <= 0, all iterations from start_iteration are used (no limits).}
 
-\item{rawscore}{whether the prediction should be returned in the for of original untransformed
-sum of predictions from boosting iterations' results. E.g., setting \code{rawscore=TRUE}
-for logistic regression would result in predictions for log-odds instead of probabilities.}
-
-\item{predleaf}{whether predict leaf index instead.}
-
-\item{predcontrib}{return per-feature contributions for each record.}
-
 \item{header}{only used for prediction for text file.
True if text file has header} -\item{reshape}{whether to reshape the vector of predictions to a matrix form when there are several -prediction outputs per case.} - \item{params}{a list of additional named parameters. See \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#predict-parameters}{ the "Predict Parameters" section of the documentation} for a list of parameters and -valid values.} +valid values. Where these conflict with the values of keyword arguments to this function, +the values in \code{params} take precedence.} \item{...}{ignored} } \value{ -For regression or binary classification, it returns a vector of length \code{nrows(data)}. - For multiclass classification, either a \code{num_class * nrows(data)} vector or - a \code{(nrows(data), num_class)} dimension matrix is returned, depending on - the \code{reshape} value. +For prediction types that are meant to always return one output per observation (e.g. when predicting + \code{type="response"} on a binary classification or regression objective), will return a vector with one + element per row in \code{newdata}. - When \code{predleaf = TRUE}, the output is a matrix object with the - number of columns corresponding to the number of trees. + For prediction types that are meant to return more than one output per observation (e.g. when predicting + \code{type="response"} on a multi-class objective, or when predicting \code{type="leaf"}, regardless of + objective), will return a matrix with one row per observation in \code{newdata} and one column per output. } \description{ Predicted values based on class \code{lgb.Booster} diff --git a/R-package/pkgdown/_pkgdown.yml b/R-package/pkgdown/_pkgdown.yml index 9e105d2c6bb5..233a31f0ead9 100644 --- a/R-package/pkgdown/_pkgdown.yml +++ b/R-package/pkgdown/_pkgdown.yml @@ -95,7 +95,3 @@ reference: - '`lgb.interprete`' - '`lgb.plot.importance`' - '`lgb.plot.interpretation`' - - title: Miscellaneous - desc: Ungroupable functions to troubleshoot LightGBM - contents: - - '`lgb.unloader`' diff --git a/R-package/src/Makevars.in b/R-package/src/Makevars.in index 2490ba0757df..f7f6dca97bb0 100644 --- a/R-package/src/Makevars.in +++ b/R-package/src/Makevars.in @@ -37,10 +37,13 @@ OBJECTS = \ io/parser.o \ io/train_share_states.o \ io/tree.o \ + io/dense_bin.o \ + io/sparse_bin.o \ + io/multi_val_dense_bin.o \ + io/multi_val_sparse_bin.o \ metric/dcg_calculator.o \ metric/metric.o \ objective/objective_function.o \ - network/ifaddrs_patch.o \ network/linker_topo.o \ network/linkers_mpi.o \ network/linkers_socket.o \ diff --git a/R-package/src/Makevars.win.in b/R-package/src/Makevars.win.in index 0fb2de926905..354a3fda210f 100644 --- a/R-package/src/Makevars.win.in +++ b/R-package/src/Makevars.win.in @@ -38,10 +38,13 @@ OBJECTS = \ io/parser.o \ io/train_share_states.o \ io/tree.o \ + io/dense_bin.o \ + io/sparse_bin.o \ + io/multi_val_dense_bin.o \ + io/multi_val_sparse_bin.o \ metric/dcg_calculator.o \ metric/metric.o \ objective/objective_function.o \ - network/ifaddrs_patch.o \ network/linker_topo.o \ network/linkers_mpi.o \ network/linkers_socket.o \ diff --git a/R-package/src/install.libs.R b/R-package/src/install.libs.R index b9019755b4fa..bc6ed37a7f17 100644 --- a/R-package/src/install.libs.R +++ b/R-package/src/install.libs.R @@ -23,6 +23,11 @@ if (!(R_int_UUID == "0310d4b8-ccb1-4bb8-ba94-d36a55f60262" warning("Warning: unmatched R_INTERNALS_UUID, may not run normally.") } +# Get some paths +source_dir <- file.path(R_PACKAGE_SOURCE, "src", fsep = "/") +build_dir <- 
file.path(source_dir, "build", fsep = "/")
+inst_dir <- file.path(R_PACKAGE_SOURCE, "inst", fsep = "/")
+
 # system() will not raise an R exception if the process called
 # fails. Wrapping it here to get that behavior.
 #
@@ -32,7 +37,7 @@ if (!(R_int_UUID == "0310d4b8-ccb1-4bb8-ba94-d36a55f60262"
 on_windows <- .Platform$OS.type == "windows"
 has_processx <- suppressMessages({
     suppressWarnings({
-        require("processx")  # nolint
+        require("processx")  # nolint: undesirable_function
    })
 })
 if (has_processx && on_windows) {
@@ -96,7 +101,7 @@ if (!(R_int_UUID == "0310d4b8-ccb1-4bb8-ba94-d36a55f60262"
 
 # Move in CMakeLists.txt
 write_succeeded <- file.copy(
-    "../inst/bin/CMakeLists.txt"
+    file.path(inst_dir, "bin", "CMakeLists.txt")
    , "CMakeLists.txt"
    , overwrite = TRUE
 )
@@ -104,10 +109,6 @@
 if (!write_succeeded) {
    stop("Copying CMakeLists.txt failed")
 }
 
-# Get some paths
-source_dir <- file.path(R_PACKAGE_SOURCE, "src", fsep = "/")
-build_dir <- file.path(source_dir, "build", fsep = "/")
-
 # Prepare building package
 dir.create(
    build_dir
@@ -122,7 +123,7 @@
 use_visual_studio <- !(use_mingw || use_msys2)
 # to create R.def from R.dll
 if (WINDOWS && use_visual_studio) {
    write_succeeded <- file.copy(
-        "../../inst/make-r-def.R"
+        file.path(inst_dir, "make-r-def.R")
        , file.path(build_dir, "make-r-def.R")
        , overwrite = TRUE
    )
diff --git a/R-package/src/lightgbm_R.cpp b/R-package/src/lightgbm_R.cpp
index 781fa0024d9c..560622788422 100644
--- a/R-package/src/lightgbm_R.cpp
+++ b/R-package/src/lightgbm_R.cpp
@@ -65,6 +65,14 @@ SEXP wrapped_R_raw(void *len) {
  return Rf_allocVector(RAWSXP, *(reinterpret_cast<R_xlen_t*>(len)));
}

+SEXP wrapped_R_int(void *len) {
+  return Rf_allocVector(INTSXP, *(reinterpret_cast<R_xlen_t*>(len)));
+}
+
+SEXP wrapped_R_real(void *len) {
+  return Rf_allocVector(REALSXP, *(reinterpret_cast<R_xlen_t*>(len)));
+}
+
 SEXP wrapped_Rf_mkChar(void *txt) {
  return Rf_mkChar(reinterpret_cast<char*>(txt));
}
@@ -84,6 +92,14 @@ SEXP safe_R_raw(R_xlen_t len, SEXP *cont_token) {
  return R_UnwindProtect(wrapped_R_raw, reinterpret_cast<void*>(&len), throw_R_memerr, cont_token, *cont_token);
}

+SEXP safe_R_int(R_xlen_t len, SEXP *cont_token) {
+  return R_UnwindProtect(wrapped_R_int, reinterpret_cast<void*>(&len), throw_R_memerr, cont_token, *cont_token);
+}
+
+SEXP safe_R_real(R_xlen_t len, SEXP *cont_token) {
+  return R_UnwindProtect(wrapped_R_real, reinterpret_cast<void*>(&len), throw_R_memerr, cont_token, *cont_token);
+}
+
 SEXP safe_R_mkChar(char *txt, SEXP *cont_token) {
  return R_UnwindProtect(wrapped_Rf_mkChar, reinterpret_cast<void*>(txt), throw_R_memerr, cont_token, *cont_token);
}
@@ -428,6 +444,17 @@ SEXP LGBM_DatasetGetNumFeature_R(SEXP handle,
  R_API_END();
}

+SEXP LGBM_DatasetGetFeatureNumBin_R(SEXP handle, SEXP feature_idx, SEXP out) {
+  R_API_BEGIN();
+  _AssertDatasetHandleNotNull(handle);
+  int feature = Rf_asInteger(feature_idx);
+  int nbins;
+  CHECK_CALL(LGBM_DatasetGetFeatureNumBin(R_ExternalPtrAddr(handle), feature, &nbins));
+  INTEGER(out)[0] = nbins;
+  return R_NilValue;
+  R_API_END();
+}
+
 // --- start Booster interfaces

 void _BoosterFinalizer(SEXP handle) {
@@ -840,6 +867,76 @@ SEXP LGBM_BoosterPredictForMat_R(SEXP handle,
  R_API_END();
}

+struct SparseOutputPointers {
+  void* indptr;
+  int32_t* indices;
+  void* data;
+  int indptr_type;
+  int data_type;
+  SparseOutputPointers(void* indptr, int32_t* indices, void* data)
+    : indptr(indptr), indices(indices), data(data) {}
+};
+
+void delete_SparseOutputPointers(SparseOutputPointers *ptr) {
+  LGBM_BoosterFreePredictSparse(ptr->indptr, ptr->indices, ptr->data, C_API_DTYPE_INT32,
C_API_DTYPE_FLOAT64);
+  delete ptr;
+}
+
+SEXP LGBM_BoosterPredictSparseOutput_R(SEXP handle,
+    SEXP indptr,
+    SEXP indices,
+    SEXP data,
+    SEXP is_csr,
+    SEXP nrows,
+    SEXP ncols,
+    SEXP start_iteration,
+    SEXP num_iteration,
+    SEXP parameter) {
+  SEXP cont_token = PROTECT(R_MakeUnwindCont());
+  R_API_BEGIN();
+  _AssertBoosterHandleNotNull(handle);
+  const char* out_names[] = {"indptr", "indices", "data", ""};
+  SEXP out = PROTECT(Rf_mkNamed(VECSXP, out_names));
+  const char* parameter_ptr = CHAR(PROTECT(Rf_asChar(parameter)));
+
+  int64_t out_len[2];
+  void *out_indptr;
+  int32_t *out_indices;
+  void *out_data;
+
+  CHECK_CALL(LGBM_BoosterPredictSparseOutput(R_ExternalPtrAddr(handle),
+    INTEGER(indptr), C_API_DTYPE_INT32, INTEGER(indices),
+    REAL(data), C_API_DTYPE_FLOAT64,
+    Rf_xlength(indptr), Rf_xlength(data),
+    Rf_asLogical(is_csr)? Rf_asInteger(ncols) : Rf_asInteger(nrows),
+    C_API_PREDICT_CONTRIB, Rf_asInteger(start_iteration), Rf_asInteger(num_iteration),
+    parameter_ptr,
+    Rf_asLogical(is_csr)? C_API_MATRIX_TYPE_CSR : C_API_MATRIX_TYPE_CSC,
+    out_len, &out_indptr, &out_indices, &out_data));
+
+  std::unique_ptr<SparseOutputPointers, decltype(&delete_SparseOutputPointers)> pointers_struct = {
+    new SparseOutputPointers(
+      out_indptr,
+      out_indices,
+      out_data),
+    &delete_SparseOutputPointers
+  };
+
+  SEXP out_indptr_R = safe_R_int(out_len[1], &cont_token);
+  SET_VECTOR_ELT(out, 0, out_indptr_R);
+  SEXP out_indices_R = safe_R_int(out_len[0], &cont_token);
+  SET_VECTOR_ELT(out, 1, out_indices_R);
+  SEXP out_data_R = safe_R_real(out_len[0], &cont_token);
+  SET_VECTOR_ELT(out, 2, out_data_R);
+  std::memcpy(INTEGER(out_indptr_R), out_indptr, out_len[1]*sizeof(int));
+  std::memcpy(INTEGER(out_indices_R), out_indices, out_len[0]*sizeof(int));
+  std::memcpy(REAL(out_data_R), out_data, out_len[0]*sizeof(double));
+
+  UNPROTECT(3);
+  return out;
+  R_API_END();
+}
+
 SEXP LGBM_BoosterSaveModel_R(SEXP handle,
  SEXP num_iteration,
  SEXP feature_importance_type,
@@ -939,6 +1036,7 @@ static const R_CallMethodDef CallEntries[] = {
  {"LGBM_DatasetUpdateParamChecking_R", (DL_FUNC) &LGBM_DatasetUpdateParamChecking_R, 2},
  {"LGBM_DatasetGetNumData_R"         , (DL_FUNC) &LGBM_DatasetGetNumData_R         , 2},
  {"LGBM_DatasetGetNumFeature_R"      , (DL_FUNC) &LGBM_DatasetGetNumFeature_R      , 2},
+  {"LGBM_DatasetGetFeatureNumBin_R"   , (DL_FUNC) &LGBM_DatasetGetFeatureNumBin_R   , 3},
  {"LGBM_BoosterCreate_R"             , (DL_FUNC) &LGBM_BoosterCreate_R             , 2},
  {"LGBM_BoosterFree_R"               , (DL_FUNC) &LGBM_BoosterFree_R               , 1},
  {"LGBM_BoosterCreateFromModelfile_R", (DL_FUNC) &LGBM_BoosterCreateFromModelfile_R, 1},
@@ -963,6 +1061,7 @@ static const R_CallMethodDef CallEntries[] = {
  {"LGBM_BoosterCalcNumPredict_R"     , (DL_FUNC) &LGBM_BoosterCalcNumPredict_R     , 8},
  {"LGBM_BoosterPredictForCSC_R"      , (DL_FUNC) &LGBM_BoosterPredictForCSC_R      , 14},
  {"LGBM_BoosterPredictForMat_R"      , (DL_FUNC) &LGBM_BoosterPredictForMat_R      , 11},
+  {"LGBM_BoosterPredictSparseOutput_R", (DL_FUNC) &LGBM_BoosterPredictSparseOutput_R, 10},
  {"LGBM_BoosterSaveModel_R"          , (DL_FUNC) &LGBM_BoosterSaveModel_R          , 4},
  {"LGBM_BoosterSaveModelToString_R"  , (DL_FUNC) &LGBM_BoosterSaveModelToString_R  , 3},
  {"LGBM_BoosterDumpModel_R"          , (DL_FUNC) &LGBM_BoosterDumpModel_R          , 3},
diff --git a/R-package/src/lightgbm_R.h b/R-package/src/lightgbm_R.h
index 562ebec2e7da..0f2a0949b61c 100644
--- a/R-package/src/lightgbm_R.h
+++ b/R-package/src/lightgbm_R.h
@@ -213,6 +213,19 @@ LIGHTGBM_C_EXPORT SEXP LGBM_DatasetGetNumFeature_R(
  SEXP out
);

+/*!
+* \brief get number of bins for feature +* \param handle the handle to the Dataset +* \param feature the index of the feature +* \param out The output of number of bins +* \return R NULL value +*/ +LIGHTGBM_C_EXPORT SEXP LGBM_DatasetGetFeatureNumBin_R( + SEXP handle, + SEXP feature, + SEXP out +); + // --- start Booster interfaces /*! @@ -561,6 +574,35 @@ LIGHTGBM_C_EXPORT SEXP LGBM_BoosterPredictForMat_R( SEXP out_result ); +/*! +* \brief make feature contribution prediction for a new Dataset +* \param handle Booster handle +* \param indptr array with the index pointer of the data in CSR or CSC format +* \param indices array with the non-zero indices of the data in CSR or CSC format +* \param data array with the non-zero values of the data in CSR or CSC format +* \param is_csr whether the input data is in CSR format or not (pass FALSE for CSC) +* \param nrows number of rows in the data +* \param ncols number of columns in the data +* \param start_iteration Start index of the iteration to predict +* \param num_iteration number of iteration for prediction, <= 0 means no limit +* \param parameter additional parameters +* \return An R list with entries "indptr", "indices", "data", constituting the +* feature contributions in sparse format, in the same storage order as +* the input data. +*/ +LIGHTGBM_C_EXPORT SEXP LGBM_BoosterPredictSparseOutput_R( + SEXP handle, + SEXP indptr, + SEXP indices, + SEXP data, + SEXP is_csr, + SEXP nrows, + SEXP ncols, + SEXP start_iteration, + SEXP num_iteration, + SEXP parameter +); + /*! * \brief save model into file * \param handle Booster handle diff --git a/R-package/tests/testthat/test_Predictor.R b/R-package/tests/testthat/test_Predictor.R index 625f2e860d4b..cd38bebc0bcc 100644 --- a/R-package/tests/testthat/test_Predictor.R +++ b/R-package/tests/testthat/test_Predictor.R @@ -1,8 +1,10 @@ +library(Matrix) + VERBOSITY <- as.integer( Sys.getenv("LIGHTGBM_TEST_VERBOSITY", "-1") ) -context("Predictor") +TOLERANCE <- 1e-6 test_that("Predictor$finalize() should not fail", { X <- as.matrix(as.integer(iris[, "Species"]), ncol = 1L) @@ -81,8 +83,8 @@ test_that("start_iteration works correctly", { , early_stopping_rounds = 2L ) expect_true(lgb.is.Booster(bst)) - pred1 <- predict(bst, data = test$data, rawscore = TRUE) - pred_contrib1 <- predict(bst, test$data, predcontrib = TRUE) + pred1 <- predict(bst, newdata = test$data, type = "raw") + pred_contrib1 <- predict(bst, test$data, type = "contrib") pred2 <- rep(0.0, length(pred1)) pred_contrib2 <- rep(0.0, length(pred2)) step <- 11L @@ -96,7 +98,7 @@ test_that("start_iteration works correctly", { inc_pred <- predict(bst, test$data , start_iteration = start_iter , num_iteration = n_iter - , rawscore = TRUE + , type = "raw" ) inc_pred_contrib <- bst$predict(test$data , start_iteration = start_iter @@ -109,7 +111,463 @@ test_that("start_iteration works correctly", { expect_equal(pred2, pred1) expect_equal(pred_contrib2, pred_contrib1) - pred_leaf1 <- predict(bst, test$data, predleaf = TRUE) - pred_leaf2 <- predict(bst, test$data, start_iteration = 0L, num_iteration = end_iter + 1L, predleaf = TRUE) + pred_leaf1 <- predict(bst, test$data, type = "leaf") + pred_leaf2 <- predict(bst, test$data, start_iteration = 0L, num_iteration = end_iter + 1L, type = "leaf") expect_equal(pred_leaf1, pred_leaf2) }) + +test_that("Feature contributions from sparse inputs produce sparse outputs", { + data(mtcars) + X <- as.matrix(mtcars[, -1L]) + y <- as.numeric(mtcars[, 1L]) + dtrain <- lgb.Dataset(X, label = y, params = list(max_bins = 
5L)) + bst <- lgb.train( + data = dtrain + , obj = "regression" + , nrounds = 5L + , verbose = VERBOSITY + , params = list(min_data_in_leaf = 5L) + ) + + pred_dense <- predict(bst, X, type = "contrib") + + Xcsc <- as(X, "CsparseMatrix") + pred_csc <- predict(bst, Xcsc, type = "contrib") + expect_s4_class(pred_csc, "dgCMatrix") + expect_equal(unname(pred_dense), unname(as.matrix(pred_csc))) + + Xcsr <- as(X, "RsparseMatrix") + pred_csr <- predict(bst, Xcsr, type = "contrib") + expect_s4_class(pred_csr, "dgRMatrix") + expect_equal(as(pred_csr, "CsparseMatrix"), pred_csc) + + Xspv <- as(X[1L, , drop = FALSE], "sparseVector") + pred_spv <- predict(bst, Xspv, type = "contrib") + expect_s4_class(pred_spv, "dsparseVector") + expect_equal(Matrix::t(as(pred_spv, "CsparseMatrix")), unname(pred_csc[1L, , drop = FALSE])) +}) + +test_that("Sparse feature contribution predictions do not take inputs with wrong number of columns", { + data(mtcars) + X <- as.matrix(mtcars[, -1L]) + y <- as.numeric(mtcars[, 1L]) + dtrain <- lgb.Dataset(X, label = y, params = list(max_bins = 5L)) + bst <- lgb.train( + data = dtrain + , obj = "regression" + , nrounds = 5L + , verbose = VERBOSITY + , params = list(min_data_in_leaf = 5L) + ) + + X_wrong <- X[, c(1L:10L, 1L:10L)] + X_wrong <- as(X_wrong, "CsparseMatrix") + expect_error(predict(bst, X_wrong, type = "contrib"), regexp = "input data has 20 columns") + + X_wrong <- as(X_wrong, "RsparseMatrix") + expect_error(predict(bst, X_wrong, type = "contrib"), regexp = "input data has 20 columns") + + X_wrong <- as(X_wrong, "CsparseMatrix") + X_wrong <- X_wrong[, 1L:3L] + expect_error(predict(bst, X_wrong, type = "contrib"), regexp = "input data has 3 columns") +}) + +test_that("Feature contribution predictions do not take non-general CSR or CSC inputs", { + set.seed(123L) + y <- runif(25L) + Dmat <- matrix(runif(625L), nrow = 25L, ncol = 25L) + Dmat <- crossprod(Dmat) + Dmat <- as(Dmat, "symmetricMatrix") + SmatC <- as(Dmat, "sparseMatrix") + SmatR <- as(SmatC, "RsparseMatrix") + + dtrain <- lgb.Dataset(as.matrix(Dmat), label = y, params = list(max_bins = 5L)) + bst <- lgb.train( + data = dtrain + , obj = "regression" + , nrounds = 5L + , verbose = VERBOSITY + , params = list(min_data_in_leaf = 5L) + ) + + expect_error(predict(bst, SmatC, type = "contrib")) + expect_error(predict(bst, SmatR, type = "contrib")) +}) + +test_that("predict() params should override keyword argument for raw-score predictions", { + data(agaricus.train, package = "lightgbm") + X <- agaricus.train$data + y <- agaricus.train$label + bst <- lgb.train( + data = lgb.Dataset( + data = X + , label = y + , params = list( + data_seed = 708L + , min_data_in_bin = 5L + ) + ) + , params = list( + objective = "binary" + , min_data_in_leaf = 1L + , seed = 708L + ) + , nrounds = 10L + , verbose = VERBOSITY + ) + + # check that the predictions from predict.lgb.Booster() really look like raw score predictions + preds_prob <- predict(bst, X) + preds_raw_s3_keyword <- predict(bst, X, type = "raw") + preds_prob_from_raw <- 1.0 / (1.0 + exp(-preds_raw_s3_keyword)) + expect_equal(preds_prob, preds_prob_from_raw, tolerance = TOLERANCE) + accuracy <- sum(as.integer(preds_prob_from_raw > 0.5) == y) / length(y) + expect_equal(accuracy, 1.0) + + # should get the same results from Booster$predict() method + preds_raw_r6_keyword <- bst$predict(X, rawscore = TRUE) + expect_equal(preds_raw_s3_keyword, preds_raw_r6_keyword) + + # using a parameter alias of predict_raw_score should result in raw scores being returned + aliases <- 
.PARAMETER_ALIASES()[["predict_raw_score"]] + expect_true(length(aliases) > 1L) + for (rawscore_alias in aliases) { + params <- as.list( + stats::setNames( + object = TRUE + , nm = rawscore_alias + ) + ) + preds_raw_s3_param <- predict(bst, X, params = params) + preds_raw_r6_param <- bst$predict(X, params = params) + expect_equal(preds_raw_s3_keyword, preds_raw_s3_param) + expect_equal(preds_raw_s3_keyword, preds_raw_r6_param) + } +}) + +test_that("predict() params should override keyword argument for leaf-index predictions", { + data(mtcars) + X <- as.matrix(mtcars[, which(names(mtcars) != "mpg")]) + y <- as.numeric(mtcars[, "mpg"]) + bst <- lgb.train( + data = lgb.Dataset( + data = X + , label = y + , params = list( + min_data_in_bin = 1L + , data_seed = 708L + ) + ) + , params = list( + objective = "regression" + , min_data_in_leaf = 1L + , seed = 708L + ) + , nrounds = 10L + , verbose = VERBOSITY + ) + + # check that predictions really look like leaf index predictions + preds_leaf_s3_keyword <- predict(bst, X, type = "leaf") + expect_true(is.matrix(preds_leaf_s3_keyword)) + expect_equal(dim(preds_leaf_s3_keyword), c(nrow(X), bst$current_iter())) + expect_true(min(preds_leaf_s3_keyword) >= 0L) + trees_dt <- lgb.model.dt.tree(bst) + max_leaf_by_tree_from_dt <- trees_dt[, .(idx = max(leaf_index, na.rm = TRUE)), by = tree_index]$idx + max_leaf_by_tree_from_preds <- apply(preds_leaf_s3_keyword, 2L, max, na.rm = TRUE) + expect_equal(max_leaf_by_tree_from_dt, max_leaf_by_tree_from_preds) + + # should get the same results from Booster$predict() method + preds_leaf_r6_keyword <- bst$predict(X, predleaf = TRUE) + expect_equal(preds_leaf_s3_keyword, preds_leaf_r6_keyword) + + # using a parameter alias of predict_leaf_index should result in leaf indices being returned + aliases <- .PARAMETER_ALIASES()[["predict_leaf_index"]] + expect_true(length(aliases) > 1L) + for (predleaf_alias in aliases) { + params <- as.list( + stats::setNames( + object = TRUE + , nm = predleaf_alias + ) + ) + preds_leaf_s3_param <- predict(bst, X, params = params) + preds_leaf_r6_param <- bst$predict(X, params = params) + expect_equal(preds_leaf_s3_keyword, preds_leaf_s3_param) + expect_equal(preds_leaf_s3_keyword, preds_leaf_r6_param) + } +}) + +test_that("predict() params should override keyword argument for feature contributions", { + data(mtcars) + X <- as.matrix(mtcars[, which(names(mtcars) != "mpg")]) + y <- as.numeric(mtcars[, "mpg"]) + bst <- lgb.train( + data = lgb.Dataset( + data = X + , label = y + , params = list( + min_data_in_bin = 1L + , data_seed = 708L + ) + ) + , params = list( + objective = "regression" + , min_data_in_leaf = 1L + , seed = 708L + ) + , nrounds = 10L + , verbose = VERBOSITY + ) + + # check that predictions really look like feature contributions + preds_contrib_s3_keyword <- predict(bst, X, type = "contrib") + num_features <- ncol(X) + shap_base_value <- unname(preds_contrib_s3_keyword[, ncol(preds_contrib_s3_keyword)]) + expect_true(is.matrix(preds_contrib_s3_keyword)) + expect_equal(dim(preds_contrib_s3_keyword), c(nrow(X), num_features + 1L)) + expect_equal(length(unique(shap_base_value)), 1L) + expect_equal(mean(y), shap_base_value[1L]) + expect_equal(predict(bst, X), rowSums(preds_contrib_s3_keyword)) + + # should get the same results from Booster$predict() method + preds_contrib_r6_keyword <- bst$predict(X, predcontrib = TRUE) + expect_equal(preds_contrib_s3_keyword, preds_contrib_r6_keyword) + + # using a parameter alias of predict_contrib should result in feature contributions 
being returned + aliases <- .PARAMETER_ALIASES()[["predict_contrib"]] + expect_true(length(aliases) > 1L) + for (predcontrib_alias in aliases) { + params <- as.list( + stats::setNames( + object = TRUE + , nm = predcontrib_alias + ) + ) + preds_contrib_s3_param <- predict(bst, X, params = params) + preds_contrib_r6_param <- bst$predict(X, params = params) + expect_equal(preds_contrib_s3_keyword, preds_contrib_s3_param) + expect_equal(preds_contrib_s3_keyword, preds_contrib_r6_param) + } +}) + +.expect_has_row_names <- function(pred, X) { + if (is.vector(pred)) { + rnames <- names(pred) + } else { + rnames <- row.names(pred) + } + expect_false(is.null(rnames)) + expect_true(is.vector(rnames)) + expect_true(length(rnames) > 0L) + expect_equal(row.names(X), rnames) +} + +.expect_doesnt_have_row_names <- function(pred) { + if (is.vector(pred)) { + expect_null(names(pred)) + } else { + expect_null(row.names(pred)) + } +} + +.check_all_row_name_expectations <- function(bst, X) { + + # dense matrix with row names + pred <- predict(bst, X) + .expect_has_row_names(pred, X) + pred <- predict(bst, X, type = "raw") + .expect_has_row_names(pred, X) + pred <- predict(bst, X, type = "leaf") + .expect_has_row_names(pred, X) + pred <- predict(bst, X, type = "contrib") + .expect_has_row_names(pred, X) + + # dense matrix without row names + Xcopy <- X + row.names(Xcopy) <- NULL + pred <- predict(bst, Xcopy) + .expect_doesnt_have_row_names(pred) + + # sparse matrix with row names + Xcsc <- as(X, "CsparseMatrix") + pred <- predict(bst, Xcsc) + .expect_has_row_names(pred, Xcsc) + pred <- predict(bst, Xcsc, type = "raw") + .expect_has_row_names(pred, Xcsc) + pred <- predict(bst, Xcsc, type = "leaf") + .expect_has_row_names(pred, Xcsc) + pred <- predict(bst, Xcsc, type = "contrib") + .expect_has_row_names(pred, Xcsc) + pred <- predict(bst, as(Xcsc, "RsparseMatrix"), type = "contrib") + .expect_has_row_names(pred, Xcsc) + + # sparse matrix without row names + Xcopy <- Xcsc + row.names(Xcopy) <- NULL + pred <- predict(bst, Xcopy) + .expect_doesnt_have_row_names(pred) +} + +test_that("predict() keeps row names from data (regression)", { + data("mtcars") + X <- as.matrix(mtcars[, -1L]) + y <- as.numeric(mtcars[, 1L]) + dtrain <- lgb.Dataset( + X + , label = y + , params = list( + max_bins = 5L + , min_data_in_bin = 1L + ) + ) + bst <- lgb.train( + data = dtrain + , obj = "regression" + , nrounds = 5L + , verbose = VERBOSITY + , params = list(min_data_in_leaf = 1L) + ) + .check_all_row_name_expectations(bst, X) +}) + +test_that("predict() keeps row names from data (binary classification)", { + data(agaricus.train, package = "lightgbm") + X <- as.matrix(agaricus.train$data) + y <- agaricus.train$label + row.names(X) <- paste0("rname", seq(1L, nrow(X))) + dtrain <- lgb.Dataset(X, label = y, params = list(max_bins = 5L)) + bst <- lgb.train( + data = dtrain + , obj = "binary" + , nrounds = 5L + , verbose = VERBOSITY + ) + .check_all_row_name_expectations(bst, X) +}) + +test_that("predict() keeps row names from data (multi-class classification)", { + data(iris) + y <- as.numeric(iris$Species) - 1.0 + X <- as.matrix(iris[, names(iris) != "Species"]) + row.names(X) <- paste0("rname", seq(1L, nrow(X))) + dtrain <- lgb.Dataset(X, label = y, params = list(max_bins = 5L)) + bst <- lgb.train( + data = dtrain + , obj = "multiclass" + , params = list(num_class = 3L) + , nrounds = 5L + , verbose = VERBOSITY + ) + .check_all_row_name_expectations(bst, X) +}) + +test_that("predictions for regression and binary classification are 
returned as vectors", { + data(mtcars) + X <- as.matrix(mtcars[, -1L]) + y <- as.numeric(mtcars[, 1L]) + dtrain <- lgb.Dataset( + X + , label = y + , params = list( + max_bins = 5L + , min_data_in_bin = 1L + ) + ) + model <- lgb.train( + data = dtrain + , obj = "regression" + , nrounds = 5L + , verbose = VERBOSITY + , params = list(min_data_in_leaf = 1L) + ) + pred <- predict(model, X) + expect_true(is.vector(pred)) + expect_equal(length(pred), nrow(X)) + pred <- predict(model, X, type = "raw") + expect_true(is.vector(pred)) + expect_equal(length(pred), nrow(X)) + + data(agaricus.train, package = "lightgbm") + X <- agaricus.train$data + y <- agaricus.train$label + dtrain <- lgb.Dataset(X, label = y) + model <- lgb.train( + data = dtrain + , obj = "binary" + , nrounds = 5L + , verbose = VERBOSITY + ) + pred <- predict(model, X) + expect_true(is.vector(pred)) + expect_equal(length(pred), nrow(X)) + pred <- predict(model, X, type = "raw") + expect_true(is.vector(pred)) + expect_equal(length(pred), nrow(X)) +}) + +test_that("predictions for multiclass classification are returned as matrix", { + data(iris) + X <- as.matrix(iris[, -5L]) + y <- as.numeric(iris$Species) - 1.0 + dtrain <- lgb.Dataset(X, label = y) + model <- lgb.train( + data = dtrain + , obj = "multiclass" + , nrounds = 5L + , verbose = VERBOSITY + , params = list(num_class = 3L) + ) + pred <- predict(model, X) + expect_true(is.matrix(pred)) + expect_equal(nrow(pred), nrow(X)) + expect_equal(ncol(pred), 3L) + pred <- predict(model, X, type = "raw") + expect_true(is.matrix(pred)) + expect_equal(nrow(pred), nrow(X)) + expect_equal(ncol(pred), 3L) +}) + +test_that("predict type='class' returns predicted class for classification objectives", { + data(agaricus.train, package = "lightgbm") + X <- as.matrix(agaricus.train$data) + y <- agaricus.train$label + dtrain <- lgb.Dataset(X, label = y, params = list(max_bins = 5L)) + bst <- lgb.train( + data = dtrain + , obj = "binary" + , nrounds = 5L + , verbose = VERBOSITY + ) + pred <- predict(bst, X, type = "class") + expect_true(all(pred %in% c(0L, 1L))) + + data(iris) + X <- as.matrix(iris[, -5L]) + y <- as.numeric(iris$Species) - 1.0 + dtrain <- lgb.Dataset(X, label = y) + model <- lgb.train( + data = dtrain + , obj = "multiclass" + , nrounds = 5L + , verbose = VERBOSITY + , params = list(num_class = 3L) + ) + pred <- predict(model, X, type = "class") + expect_true(all(pred %in% c(0L, 1L, 2L))) +}) + +test_that("predict type='class' returns values in the target's range for regression objectives", { + data(agaricus.train, package = "lightgbm") + X <- as.matrix(agaricus.train$data) + y <- agaricus.train$label + dtrain <- lgb.Dataset(X, label = y, params = list(max_bins = 5L)) + bst <- lgb.train( + data = dtrain + , obj = "regression" + , nrounds = 5L + , verbose = VERBOSITY + ) + pred <- predict(bst, X, type = "class") + expect_true(!any(pred %in% c(0.0, 1.0))) +}) diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index f9b209547a6a..52f53935bea0 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -1,10 +1,12 @@ -context("lightgbm()") +VERBOSITY <- as.integer( + Sys.getenv("LIGHTGBM_TEST_VERBOSITY", "-1") +) ON_WINDOWS <- .Platform$OS.type == "windows" -UTF8_LOCALE <- all(grepl( - pattern = "UTF-8$" - , x = Sys.getlocale(category = "LC_CTYPE") +UTF8_LOCALE <- all(endsWith( + Sys.getlocale(category = "LC_CTYPE") + , "UTF-8" )) data(agaricus.train, package = "lightgbm") @@ -76,9 +78,15 @@ test_that("train 
and predict binary classification", { num_leaves = 5L , objective = "binary" , metric = "binary_error" + , verbose = VERBOSITY ) , nrounds = nrounds - , save_name = tempfile(fileext = ".model") + , valids = list( + "train" = lgb.Dataset( + data = train$data + , label = train$label + ) + ) ) expect_false(is.null(bst$record_evals)) record_results <- lgb.get.eval.result(bst, "train", "binary_error") @@ -97,10 +105,11 @@ test_that("train and predict binary classification", { test_that("train and predict softmax", { set.seed(708L) + X_mat <- as.matrix(iris[, -5L]) lb <- as.numeric(iris$Species) - 1L bst <- lightgbm( - data = as.matrix(iris[, -5L]) + data = X_mat , label = lb , params = list( num_leaves = 4L @@ -110,9 +119,15 @@ test_that("train and predict softmax", { , objective = "multiclass" , metric = "multi_error" , num_class = 3L + , verbose = VERBOSITY ) , nrounds = 20L - , save_name = tempfile(fileext = ".model") + , valids = list( + "train" = lgb.Dataset( + data = X_mat + , label = lb + ) + ) ) expect_false(is.null(bst$record_evals)) @@ -134,9 +149,15 @@ test_that("use of multiple eval metrics works", { , learning_rate = 1.0 , objective = "binary" , metric = metrics + , verbose = VERBOSITY ) , nrounds = 10L - , save_name = tempfile(fileext = ".model") + , valids = list( + "train" = lgb.Dataset( + data = train$data + , label = train$label + ) + ) ) expect_false(is.null(bst$record_evals)) expect_named( @@ -157,9 +178,9 @@ test_that("lgb.Booster.upper_bound() and lgb.Booster.lower_bound() work as expec num_leaves = 5L , objective = "binary" , metric = "binary_error" + , verbose = VERBOSITY ) , nrounds = nrounds - , save_name = tempfile(fileext = ".model") ) expect_true(abs(bst$lower_bound() - -1.590853) < TOLERANCE) expect_true(abs(bst$upper_bound() - 1.871015) < TOLERANCE) @@ -175,9 +196,9 @@ test_that("lgb.Booster.upper_bound() and lgb.Booster.lower_bound() work as expec num_leaves = 5L , objective = "regression" , metric = "l2" + , verbose = VERBOSITY ) , nrounds = nrounds - , save_name = tempfile(fileext = ".model") ) expect_true(abs(bst$lower_bound() - 0.1513859) < TOLERANCE) expect_true(abs(bst$upper_bound() - 0.9080349) < TOLERANCE) @@ -192,7 +213,6 @@ test_that("lightgbm() rejects negative or 0 value passed to nrounds", { data = dtrain , params = params , nrounds = nround_value - , save_name = tempfile(fileext = ".model") ) }, "nrounds should be greater than zero") } @@ -210,8 +230,8 @@ test_that("lightgbm() accepts nrounds as either a top-level argument or paramete objective = "regression" , metric = "l2" , num_leaves = 5L + , verbose = VERBOSITY ) - , save_name = tempfile(fileext = ".model") ) set.seed(708L) @@ -223,8 +243,8 @@ test_that("lightgbm() accepts nrounds as either a top-level argument or paramete , metric = "l2" , num_leaves = 5L , nrounds = nrounds + , verbose = VERBOSITY ) - , save_name = tempfile(fileext = ".model") ) set.seed(708L) @@ -237,13 +257,13 @@ test_that("lightgbm() accepts nrounds as either a top-level argument or paramete , metric = "l2" , num_leaves = 5L , nrounds = nrounds + , verbose = VERBOSITY ) - , save_name = tempfile(fileext = ".model") ) top_level_l2 <- top_level_bst$eval_train()[[1L]][["value"]] - params_l2 <- param_bst$eval_train()[[1L]][["value"]] - both_l2 <- both_customized$eval_train()[[1L]][["value"]] + params_l2 <- param_bst$eval_train()[[1L]][["value"]] + both_l2 <- both_customized$eval_train()[[1L]][["value"]] # check type just to be sure the subsetting didn't return a NULL expect_true(is.numeric(top_level_l2)) @@ -281,13 +301,17 @@ 
test_that("lightgbm() performs evaluation on validation sets if they are provide "binary_error" , "auc" ) + , verbose = VERBOSITY ) , nrounds = nrounds , valids = list( "valid1" = dvalid1 , "valid2" = dvalid2 + , "train" = lgb.Dataset( + data = train$data + , label = train$label + ) ) - , save_name = tempfile(fileext = ".model") ) expect_named( @@ -305,26 +329,6 @@ test_that("lightgbm() performs evaluation on validation sets if they are provide expect_true(abs(bst$record_evals[["valid2"]][["binary_error"]][["eval"]][[1L]] - 0.02226317) < TOLERANCE) }) -test_that("lightgbm() does not write model to disk if save_name=NULL", { - files_before <- list.files(getwd()) - - model <- lightgbm( - data = train$data - , label = train$label - , nrounds = 5L - , params = list(objective = "binary") - , verbose = 0L - , save_name = NULL - ) - - files_after <- list.files(getwd()) - - expect_equal(files_before, files_after) -}) - - -context("training continuation") - test_that("training continuation works", { dtrain <- lgb.Dataset( train$data @@ -337,6 +341,7 @@ test_that("training continuation works", { , metric = "binary_logloss" , num_leaves = 5L , learning_rate = 1.0 + , verbose = VERBOSITY ) # train for 10 consecutive iterations @@ -355,8 +360,6 @@ test_that("training continuation works", { expect_lt(abs(err_bst - err_bst2), 0.01) }) -context("lgb.cv()") - test_that("cv works", { dtrain <- lgb.Dataset(train$data, label = train$label) params <- list( @@ -364,6 +367,7 @@ test_that("cv works", { , metric = "l2,l1" , min_data = 1L , learning_rate = 1.0 + , verbose = VERBOSITY ) bst <- lgb.cv( params @@ -375,6 +379,31 @@ test_that("cv works", { expect_false(is.null(bst$record_evals)) }) +test_that("CVBooster$reset_parameter() works as expected", { + dtrain <- lgb.Dataset(train$data, label = train$label) + n_folds <- 2L + cv_bst <- lgb.cv( + params = list( + objective = "regression" + , min_data = 1L + , num_leaves = 7L + , verbose = VERBOSITY + ) + , data = dtrain + , nrounds = 3L + , nfold = n_folds + ) + expect_true(methods::is(cv_bst, "lgb.CVBooster")) + expect_length(cv_bst$boosters, n_folds) + for (bst in cv_bst$boosters) { + expect_equal(bst[["booster"]]$params[["num_leaves"]], 7L) + } + cv_bst$reset_parameter(list(num_leaves = 11L)) + for (bst in cv_bst$boosters) { + expect_equal(bst[["booster"]]$params[["num_leaves"]], 11L) + } +}) + test_that("lgb.cv() rejects negative or 0 value passed to nrounds", { dtrain <- lgb.Dataset(train$data, label = train$label) params <- list( @@ -435,9 +464,10 @@ test_that("lightgbm.cv() gives the correct best_score and best_iter for a metric , metric = "auc,binary_error" , learning_rate = 1.5 , num_leaves = 5L + , verbose = VERBOSITY ) ) - expect_is(cv_bst, "lgb.CVBooster") + expect_true(methods::is(cv_bst, "lgb.CVBooster")) expect_named( cv_bst$record_evals , c("start_iter", "valid") @@ -475,7 +505,7 @@ test_that("lgb.cv() fit on linearly-relatead data improves when using linear lea , params = params , nfold = 5L ) - expect_is(cv_bst, "lgb.CVBooster") + expect_true(methods::is(cv_bst, "lgb.CVBooster")) dtrain <- .new_dataset() cv_bst_linear <- lgb.cv( @@ -484,7 +514,7 @@ test_that("lgb.cv() fit on linearly-relatead data improves when using linear lea , params = utils::modifyList(params, list(linear_tree = TRUE)) , nfold = 5L ) - expect_is(cv_bst_linear, "lgb.CVBooster") + expect_true(methods::is(cv_bst_linear, "lgb.CVBooster")) expect_true(cv_bst_linear$best_score < cv_bst$best_score) }) @@ -495,6 +525,7 @@ test_that("lgb.cv() respects showsd argument", { objective = 
"regression" , metric = "l2" , min_data = 1L + , verbose = VERBOSITY ) nrounds <- 5L set.seed(708L) @@ -519,12 +550,144 @@ test_that("lgb.cv() respects showsd argument", { evals_showsd[["eval"]] , evals_no_showsd[["eval"]] ) - expect_is(evals_showsd[["eval_err"]], "list") + expect_true(methods::is(evals_showsd[["eval_err"]], "list")) expect_equal(length(evals_showsd[["eval_err"]]), nrounds) expect_identical(evals_no_showsd[["eval_err"]], list()) }) -context("lgb.train()") +test_that("lgb.cv() raises an informative error for unrecognized objectives", { + dtrain <- lgb.Dataset( + data = train$data + , label = train$label + ) + expect_error({ + capture.output({ + bst <- lgb.cv( + data = dtrain + , params = list( + objective_type = "not_a_real_objective" + , verbosity = VERBOSITY + ) + ) + }, type = "message") + }, regexp = "Unknown objective type name: not_a_real_objective") +}) + +test_that("lgb.cv() respects parameter aliases for objective", { + nrounds <- 3L + nfold <- 4L + dtrain <- lgb.Dataset( + data = train$data + , label = train$label + ) + cv_bst <- lgb.cv( + data = dtrain + , params = list( + num_leaves = 5L + , application = "binary" + , num_iterations = nrounds + , verbose = VERBOSITY + ) + , nfold = nfold + ) + expect_equal(cv_bst$best_iter, nrounds) + expect_named(cv_bst$record_evals[["valid"]], "binary_logloss") + expect_length(cv_bst$record_evals[["valid"]][["binary_logloss"]][["eval"]], nrounds) + expect_length(cv_bst$boosters, nfold) +}) + +test_that("lgb.cv() prefers objective in params to keyword argument", { + data("EuStockMarkets") + cv_bst <- lgb.cv( + data = lgb.Dataset( + data = EuStockMarkets[, c("SMI", "CAC", "FTSE")] + , label = EuStockMarkets[, "DAX"] + ) + , params = list( + application = "regression_l1" + , verbosity = VERBOSITY + ) + , nrounds = 5L + , obj = "regression_l2" + ) + for (bst_list in cv_bst$boosters) { + bst <- bst_list[["booster"]] + expect_equal(bst$params$objective, "regression_l1") + # NOTE: using save_model_to_string() since that is the simplest public API in the R package + # allowing access to the "objective" attribute of the Booster object on the C++ side + model_txt_lines <- strsplit( + x = bst$save_model_to_string() + , split = "\n" + )[[1L]] + expect_true(any(model_txt_lines == "objective=regression_l1")) + expect_false(any(model_txt_lines == "objective=regression_l2")) + } +}) + +test_that("lgb.cv() respects parameter aliases for metric", { + nrounds <- 3L + nfold <- 4L + dtrain <- lgb.Dataset( + data = train$data + , label = train$label + ) + cv_bst <- lgb.cv( + data = dtrain + , params = list( + num_leaves = 5L + , objective = "binary" + , num_iterations = nrounds + , metric_types = c("auc", "binary_logloss") + , verbose = VERBOSITY + ) + , nfold = nfold + ) + expect_equal(cv_bst$best_iter, nrounds) + expect_named(cv_bst$record_evals[["valid"]], c("auc", "binary_logloss")) + expect_length(cv_bst$record_evals[["valid"]][["binary_logloss"]][["eval"]], nrounds) + expect_length(cv_bst$record_evals[["valid"]][["auc"]][["eval"]], nrounds) + expect_length(cv_bst$boosters, nfold) +}) + +test_that("lgb.cv() respects eval_train_metric argument", { + dtrain <- lgb.Dataset(train$data, label = train$label) + params <- list( + objective = "regression" + , metric = "l2" + , min_data = 1L + , verbose = VERBOSITY + ) + nrounds <- 5L + set.seed(708L) + bst_train <- lgb.cv( + params = params + , data = dtrain + , nrounds = nrounds + , nfold = 3L + , showsd = FALSE + , eval_train_metric = TRUE + ) + set.seed(708L) + bst_no_train <- lgb.cv( + params = 
params + , data = dtrain + , nrounds = nrounds + , nfold = 3L + , showsd = FALSE + , eval_train_metric = FALSE + ) + expect_equal( + bst_train$record_evals[["valid"]][["l2"]] + , bst_no_train$record_evals[["valid"]][["l2"]] + ) + expect_true("train" %in% names(bst_train$record_evals)) + expect_false("train" %in% names(bst_no_train$record_evals)) + expect_true(methods::is(bst_train$record_evals[["train"]][["l2"]][["eval"]], "list")) + expect_equal( + length(bst_train$record_evals[["train"]][["l2"]][["eval"]]) + , nrounds + ) +}) test_that("lgb.train() works as expected with multiple eval metrics", { metrics <- c("binary_error", "auc", "binary_logloss") @@ -538,6 +701,7 @@ test_that("lgb.train() works as expected with multiple eval metrics", { objective = "binary" , metric = metrics , learning_rate = 1.0 + , verbose = VERBOSITY ) , valids = list( "train" = lgb.Dataset( @@ -555,9 +719,105 @@ test_that("lgb.train() works as expected with multiple eval metrics", { ) }) +test_that("lgb.train() raises an informative error for unrecognized objectives", { + dtrain <- lgb.Dataset( + data = train$data + , label = train$label + ) + expect_error({ + capture.output({ + bst <- lgb.train( + data = dtrain + , params = list( + objective_type = "not_a_real_objective" + , verbosity = VERBOSITY + ) + ) + }, type = "message") + }, regexp = "Unknown objective type name: not_a_real_objective") +}) + +test_that("lgb.train() respects parameter aliases for objective", { + nrounds <- 3L + dtrain <- lgb.Dataset( + data = train$data + , label = train$label + ) + bst <- lgb.train( + data = dtrain + , params = list( + num_leaves = 5L + , application = "binary" + , num_iterations = nrounds + , verbose = VERBOSITY + ) + , valids = list( + "the_training_data" = dtrain + ) + ) + expect_named(bst$record_evals[["the_training_data"]], "binary_logloss") + expect_length(bst$record_evals[["the_training_data"]][["binary_logloss"]][["eval"]], nrounds) + expect_equal(bst$params[["objective"]], "binary") +}) + +test_that("lgb.train() prefers objective in params to keyword argument", { + data("EuStockMarkets") + bst <- lgb.train( + data = lgb.Dataset( + data = EuStockMarkets[, c("SMI", "CAC", "FTSE")] + , label = EuStockMarkets[, "DAX"] + ) + , params = list( + loss = "regression_l1" + , verbosity = VERBOSITY + ) + , nrounds = 5L + , obj = "regression_l2" + ) + expect_equal(bst$params$objective, "regression_l1") + # NOTE: using save_model_to_string() since that is the simplest public API in the R package + # allowing access to the "objective" attribute of the Booster object on the C++ side + model_txt_lines <- strsplit( + x = bst$save_model_to_string() + , split = "\n" + )[[1L]] + expect_true(any(model_txt_lines == "objective=regression_l1")) + expect_false(any(model_txt_lines == "objective=regression_l2")) +}) + +test_that("lgb.train() respects parameter aliases for metric", { + nrounds <- 3L + dtrain <- lgb.Dataset( + data = train$data + , label = train$label + ) + bst <- lgb.train( + data = dtrain + , params = list( + num_leaves = 5L + , objective = "binary" + , num_iterations = nrounds + , metric_types = c("auc", "binary_logloss") + , verbose = VERBOSITY + ) + , valids = list( + "train" = dtrain + ) + ) + record_results <- bst$record_evals[["train"]] + expect_equal(sort(names(record_results)), c("auc", "binary_logloss")) + expect_length(record_results[["auc"]][["eval"]], nrounds) + expect_length(record_results[["binary_logloss"]][["eval"]], nrounds) + expect_equal(bst$params[["metric"]], list("auc", "binary_logloss")) +}) + 
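# A short illustrative sketch (not part of the patch) of the alias handling that
# the tests above pin down, assuming the agaricus-based `train` object loaded at
# the top of this test file: aliases such as "application" and "metric_types" are
# resolved to the main parameter names "objective" and "metric", and values given
# in `params` take precedence over the matching keyword arguments such as `obj`.
library(lightgbm)
bst <- lgb.train(
    data = lgb.Dataset(train$data, label = train$label)
    , params = list(
        application = "binary"    # alias, stored back as params$objective
        , metric_types = "auc"    # alias, stored back as params$metric
        , verbose = -1L
    )
    , nrounds = 2L
    , obj = "regression"          # ignored: the value in params wins
)
print(bst$params$objective)  # "binary"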
test_that("lgb.train() rejects negative or 0 value passed to nrounds", { dtrain <- lgb.Dataset(train$data, label = train$label) - params <- list(objective = "regression", metric = "l2,l1") + params <- list( + objective = "regression" + , metric = "l2,l1" + , verbose = VERBOSITY + ) for (nround_value in c(-10L, 0L)) { expect_error({ bst <- lgb.train( @@ -584,7 +844,7 @@ test_that("lgb.train() accepts nrounds as either a top-level argument or paramet objective = "regression" , metric = "l2" , num_leaves = 5L - , save_name = tempfile(fileext = ".model") + , verbose = VERBOSITY ) ) @@ -599,7 +859,7 @@ test_that("lgb.train() accepts nrounds as either a top-level argument or paramet , metric = "l2" , num_leaves = 5L , nrounds = nrounds - , save_name = tempfile(fileext = ".model") + , verbose = VERBOSITY ) ) @@ -615,13 +875,13 @@ test_that("lgb.train() accepts nrounds as either a top-level argument or paramet , metric = "l2" , num_leaves = 5L , nrounds = nrounds - , save_name = tempfile(fileext = ".model") + , verbose = VERBOSITY ) ) top_level_l2 <- top_level_bst$eval_train()[[1L]][["value"]] - params_l2 <- param_bst$eval_train()[[1L]][["value"]] - both_l2 <- both_customized$eval_train()[[1L]][["value"]] + params_l2 <- param_bst$eval_train()[[1L]][["value"]] + both_l2 <- both_customized$eval_train()[[1L]][["value"]] # check type just to be sure the subsetting didn't return a NULL expect_true(is.numeric(top_level_l2)) @@ -651,7 +911,11 @@ test_that("lgb.train() throws an informative error if 'data' is not an lgb.Datas for (val in bad_values) { expect_error({ bst <- lgb.train( - params = list(objective = "regression", metric = "l2,l1") + params = list( + objective = "regression" + , metric = "l2,l1" + , verbose = VERBOSITY + ) , data = val , 10L ) @@ -666,7 +930,11 @@ test_that("lgb.train() throws an informative error if 'valids' is not a list of ) expect_error({ bst <- lgb.train( - params = list(objective = "regression", metric = "l2,l1") + params = list( + objective = "regression" + , metric = "l2,l1" + , verbose = VERBOSITY + ) , data = lgb.Dataset(train$data, label = train$label) , 10L , valids = valids @@ -681,7 +949,11 @@ test_that("lgb.train() errors if 'valids' is a list of lgb.Dataset objects but s ) expect_error({ bst <- lgb.train( - params = list(objective = "regression", metric = "l2,l1") + params = list( + objective = "regression" + , metric = "l2,l1" + , verbose = VERBOSITY + ) , data = lgb.Dataset(train$data, label = train$label) , 10L , valids = valids @@ -696,7 +968,11 @@ test_that("lgb.train() throws an informative error if 'valids' contains lgb.Data ) expect_error({ bst <- lgb.train( - params = list(objective = "regression", metric = "l2,l1") + params = list( + objective = "regression" + , metric = "l2,l1" + , verbose = VERBOSITY + ) , data = lgb.Dataset(train$data, label = train$label) , 10L , valids = valids @@ -715,6 +991,7 @@ test_that("lgb.train() works with force_col_wise and force_row_wise", { objective = "binary" , metric = "binary_error" , force_col_wise = TRUE + , verbose = VERBOSITY ) bst_col_wise <- lgb.train( params = params @@ -726,6 +1003,7 @@ test_that("lgb.train() works with force_col_wise and force_row_wise", { objective = "binary" , metric = "binary_error" , force_row_wise = TRUE + , verbose = VERBOSITY ) bst_row_wise <- lgb.train( params = params @@ -764,6 +1042,7 @@ test_that("lgb.train() works as expected with sparse features", { objective = "binary" , min_data = 1L , min_data_in_bin = 1L + , verbose = VERBOSITY ) , data = dtrain , nrounds = nrounds @@ 
-804,6 +1083,7 @@ test_that("lgb.train() works with early stopping for classification", { params = list( objective = "binary" , metric = "binary_error" + , verbose = VERBOSITY ) , data = dtrain , nrounds = nrounds @@ -822,11 +1102,12 @@ test_that("lgb.train() works with early stopping for classification", { # train with early stopping # ############################# early_stopping_rounds <- 5L - bst <- lgb.train( + bst <- lgb.train( params = list( objective = "binary" , metric = "binary_error" , early_stopping_rounds = early_stopping_rounds + , verbose = VERBOSITY ) , data = dtrain , nrounds = nrounds @@ -875,6 +1156,7 @@ test_that("lgb.train() treats early_stopping_rounds<=0 as disabling early stoppi params = list( objective = "binary" , metric = "binary_error" + , verbose = VERBOSITY ) , data = dtrain , nrounds = nrounds @@ -898,6 +1180,7 @@ test_that("lgb.train() treats early_stopping_rounds<=0 as disabling early stoppi objective = "binary" , metric = "binary_error" , n_iter_no_change = value + , verbose = VERBOSITY ) , data = dtrain , nrounds = nrounds @@ -931,12 +1214,13 @@ test_that("lgb.train() works with early stopping for classification with a metri ############################# early_stopping_rounds <- 5L # the harsh max_depth guarantees that AUC improves over at least the first few iterations - bst_auc <- lgb.train( + bst_auc <- lgb.train( params = list( objective = "binary" , metric = "auc" , max_depth = 3L , early_stopping_rounds = early_stopping_rounds + , verbose = VERBOSITY ) , data = dtrain , nrounds = nrounds @@ -944,12 +1228,13 @@ test_that("lgb.train() works with early stopping for classification with a metri "valid1" = dvalid ) ) - bst_binary_error <- lgb.train( + bst_binary_error <- lgb.train( params = list( objective = "binary" , metric = "binary_error" , max_depth = 3L , early_stopping_rounds = early_stopping_rounds + , verbose = VERBOSITY ) , data = dtrain , nrounds = nrounds @@ -1008,6 +1293,7 @@ test_that("lgb.train() works with early stopping for regression", { params = list( objective = "regression" , metric = "rmse" + , verbose = VERBOSITY ) , data = dtrain , nrounds = nrounds @@ -1026,11 +1312,12 @@ test_that("lgb.train() works with early stopping for regression", { # train with early stopping # ############################# early_stopping_rounds <- 5L - bst <- lgb.train( + bst <- lgb.train( params = list( objective = "regression" , metric = "rmse" , early_stopping_rounds = early_stopping_rounds + , verbose = VERBOSITY ) , data = dtrain , nrounds = nrounds @@ -1065,6 +1352,7 @@ test_that("lgb.train() does not stop early if early_stopping_rounds is not given params = list( objective = "regression" , metric = "None" + , verbose = VERBOSITY ) , data = DTRAIN_RANDOM_REGRESSION , nrounds = nrounds @@ -1108,12 +1396,14 @@ test_that("If first_metric_only is not given or is FALSE, lgb.train() decides to objective = "regression" , metric = "None" , early_stopping_rounds = early_stopping_rounds + , verbose = VERBOSITY ) , list( objective = "regression" , metric = "None" , early_stopping_rounds = early_stopping_rounds , first_metric_only = FALSE + , verbose = VERBOSITY ) ) @@ -1176,6 +1466,7 @@ test_that("If first_metric_only is TRUE, lgb.train() decides to stop early based , metric = "None" , early_stopping_rounds = early_stopping_rounds , first_metric_only = TRUE + , verbose = VERBOSITY ) , data = DTRAIN_RANDOM_REGRESSION , nrounds = nrounds @@ -1221,6 +1512,7 @@ test_that("lgb.train() works when a mixture of functions and strings are passed params = list( objective = 
"regression" , metric = "None" + , verbose = VERBOSITY ) , data = DTRAIN_RANDOM_REGRESSION , nrounds = nrounds @@ -1276,6 +1568,7 @@ test_that("lgb.train() works when a list of strings or a character vector is pas params = list( objective = "binary" , metric = "None" + , verbose = VERBOSITY ) , data = DTRAIN_RANDOM_CLASSIFICATION , nrounds = nrounds @@ -1312,6 +1605,7 @@ test_that("lgb.train() works when you specify both 'metric' and 'eval' with stri params = list( objective = "binary" , metric = "binary_error" + , verbose = VERBOSITY ) , data = DTRAIN_RANDOM_CLASSIFICATION , nrounds = nrounds @@ -1343,6 +1637,7 @@ test_that("lgb.train() works when you give a function for eval", { params = list( objective = "binary" , metric = "None" + , verbose = VERBOSITY ) , data = DTRAIN_RANDOM_CLASSIFICATION , nrounds = nrounds @@ -1381,7 +1676,7 @@ test_that("lgb.train() works with early stopping for regression with a metric th # train with early stopping # ############################# early_stopping_rounds <- 5L - bst <- lgb.train( + bst <- lgb.train( params = list( objective = "regression" , metric = c( @@ -1391,6 +1686,7 @@ test_that("lgb.train() works with early stopping for regression with a metric th ) , min_data_in_bin = 5L , early_stopping_rounds = early_stopping_rounds + , verbose = VERBOSITY ) , data = dtrain , nrounds = nrounds @@ -1430,6 +1726,7 @@ test_that("lgb.train() supports non-ASCII feature names", { , obj = "regression" , params = list( metric = "rmse" + , verbose = VERBOSITY ) , colnames = feature_names ) @@ -1465,9 +1762,11 @@ test_that("lgb.train() works with integer, double, and numeric data", { , label = y , params = list( objective = "regression" - , min_data = 1L + , min_data_in_bin = 1L + , min_data_in_leaf = 1L , learning_rate = 0.01 , seed = 708L + , verbose = VERBOSITY ) , nrounds = nrounds ) @@ -1484,6 +1783,60 @@ test_that("lgb.train() works with integer, double, and numeric data", { } }) +test_that("lgb.train() updates params based on keyword arguments", { + dtrain <- lgb.Dataset( + data = matrix(rnorm(400L), ncol = 4L) + , label = rnorm(100L) + ) + + # defaults from keyword arguments should be used if not specified in params + invisible( + capture.output({ + bst <- lgb.train( + data = dtrain + , obj = "regression" + , params = list() + ) + }) + ) + expect_equal(bst$params[["verbosity"]], 1L) + expect_equal(bst$params[["num_iterations"]], 100L) + + # main param names should be preferred to keyword arguments + invisible( + capture.output({ + bst <- lgb.train( + data = dtrain + , obj = "regression" + , params = list( + "verbosity" = 5L + , "num_iterations" = 2L + ) + ) + }) + ) + expect_equal(bst$params[["verbosity"]], 5L) + expect_equal(bst$params[["num_iterations"]], 2L) + + # aliases should be preferred to keyword arguments, and converted to main parameter name + invisible( + capture.output({ + bst <- lgb.train( + data = dtrain + , obj = "regression" + , params = list( + "verbose" = 5L + , "num_boost_round" = 2L + ) + ) + }) + ) + expect_equal(bst$params[["verbosity"]], 5L) + expect_false("verbose" %in% bst$params) + expect_equal(bst$params[["num_iterations"]], 2L) + expect_false("num_boost_round" %in% bst$params) +}) + test_that("when early stopping is not activated, best_iter and best_score come from valids and not training data", { set.seed(708L) trainDF <- data.frame( @@ -1512,6 +1865,7 @@ test_that("when early stopping is not activated, best_iter and best_score come f , metric = "rmse" , learning_rate = 1.5 , num_leaves = 5L + , verbose = VERBOSITY ) # example 
1: two valids, neither are the training data @@ -1671,6 +2025,7 @@ test_that("lightgbm.train() gives the correct best_score and best_iter for a met , metric = "auc" , learning_rate = 1.5 , num_leaves = 5L + , verbose = VERBOSITY ) ) # note that "something-random-we-would-not-hardcode" was recognized as the training @@ -1727,7 +2082,6 @@ test_that("using lightgbm() without early stopping, best_iter and best_score com , num_leaves = 5L ) , verbose = -7L - , save_name = tempfile(fileext = ".model") ) # when verbose <= 0 is passed to lightgbm(), 'valids' is passed through to lgb.train() # untouched. If you set verbose to > 0, the training data will still be first but called "train" @@ -1752,6 +2106,7 @@ test_that("lgb.cv() works when you specify both 'metric' and 'eval' with strings params = list( objective = "binary" , metric = "binary_error" + , verbose = VERBOSITY ) , data = DTRAIN_RANDOM_CLASSIFICATION , nrounds = nrounds @@ -1785,6 +2140,7 @@ test_that("lgb.cv() works when you give a function for eval", { params = list( objective = "binary" , metric = "None" + , verbose = VERBOSITY ) , data = DTRAIN_RANDOM_CLASSIFICATION , nfold = nfolds @@ -1810,6 +2166,7 @@ test_that("If first_metric_only is TRUE, lgb.cv() decides to stop early based on , metric = "None" , early_stopping_rounds = early_stopping_rounds , first_metric_only = TRUE + , verbose = VERBOSITY ) , data = DTRAIN_RANDOM_REGRESSION , nfold = nfolds @@ -1866,6 +2223,7 @@ test_that("early stopping works with lgb.cv()", { , metric = "None" , early_stopping_rounds = early_stopping_rounds , first_metric_only = TRUE + , verbose = VERBOSITY ) , data = DTRAIN_RANDOM_REGRESSION , nfold = nfolds @@ -1899,9 +2257,135 @@ test_that("early stopping works with lgb.cv()", { length(bst$record_evals[["valid"]][["increasing_metric"]][["eval"]]) , early_stopping_rounds + 1L ) + + # every booster's predict method should use best_iter as num_iteration in predict + random_data <- as.matrix(rnorm(10L), ncol = 1L, drop = FALSE) + for (x in bst$boosters) { + expect_equal(x$booster$best_iter, bst$best_iter) + expect_gt(x$booster$current_iter(), bst$best_iter) + preds_iter <- predict(x$booster, random_data, num_iteration = bst$best_iter) + preds_no_iter <- predict(x$booster, random_data) + expect_equal(preds_iter, preds_no_iter) + } +}) + +test_that("lgb.cv() respects changes to logging verbosity", { + dtrain <- lgb.Dataset( + data = train$data + , label = train$label + ) + # (verbose = 1) should be INFO and WARNING level logs + lgb_cv_logs <- capture.output({ + cv_bst <- lgb.cv( + params = list() + , nfold = 2L + , nrounds = 5L + , data = dtrain + , obj = "binary" + , verbose = 1L + ) + }) + expect_true(any(grepl("\\[LightGBM\\] \\[Info\\]", lgb_cv_logs))) + expect_true(any(grepl("\\[LightGBM\\] \\[Warning\\]", lgb_cv_logs))) + + # (verbose = 0) should be WARNING level logs only + lgb_cv_logs <- capture.output({ + cv_bst <- lgb.cv( + params = list() + , nfold = 2L + , nrounds = 5L + , data = dtrain + , obj = "binary" + , verbose = 0L + ) + }) + expect_false(any(grepl("\\[LightGBM\\] \\[Info\\]", lgb_cv_logs))) + expect_true(any(grepl("\\[LightGBM\\] \\[Warning\\]", lgb_cv_logs))) + + # (verbose = -1) no logs + lgb_cv_logs <- capture.output({ + cv_bst <- lgb.cv( + params = list() + , nfold = 2L + , nrounds = 5L + , data = dtrain + , obj = "binary" + , verbose = -1L + ) + }) + # NOTE: this is not length(lgb_cv_logs) == 0 because lightgbm's + # dependencies might print other messages + expect_false(any(grepl("\\[LightGBM\\] \\[Info\\]", lgb_cv_logs))) + 
expect_false(any(grepl("\\[LightGBM\\] \\[Warning\\]", lgb_cv_logs))) }) -context("linear learner") +test_that("lgb.cv() updates params based on keyword arguments", { + dtrain <- lgb.Dataset( + data = matrix(rnorm(400L), ncol = 4L) + , label = rnorm(100L) + ) + + # defaults from keyword arguments should be used if not specified in params + invisible( + capture.output({ + cv_bst <- lgb.cv( + data = dtrain + , obj = "regression" + , params = list() + , nfold = 2L + ) + }) + ) + + for (bst in cv_bst$boosters) { + bst_params <- bst[["booster"]]$params + expect_equal(bst_params[["verbosity"]], 1L) + expect_equal(bst_params[["num_iterations"]], 100L) + } + + # main param names should be preferred to keyword arguments + invisible( + capture.output({ + cv_bst <- lgb.cv( + data = dtrain + , obj = "regression" + , params = list( + "verbosity" = 5L + , "num_iterations" = 2L + ) + , nfold = 2L + ) + }) + ) + for (bst in cv_bst$boosters) { + bst_params <- bst[["booster"]]$params + expect_equal(bst_params[["verbosity"]], 5L) + expect_equal(bst_params[["num_iterations"]], 2L) + } + + # aliases should be preferred to keyword arguments, and converted to main parameter name + invisible( + capture.output({ + cv_bst <- lgb.cv( + data = dtrain + , obj = "regression" + , params = list( + "verbose" = 5L + , "num_boost_round" = 2L + ) + , nfold = 2L + ) + }) + ) + for (bst in cv_bst$boosters) { + bst_params <- bst[["booster"]]$params + expect_equal(bst_params[["verbosity"]], 5L) + expect_false("verbose" %in% names(bst_params)) + expect_equal(bst_params[["num_iterations"]], 2L) + expect_false("num_boost_round" %in% names(bst_params)) + } + +}) test_that("lgb.train() fit on linearly-related data improves when using linear learners", { set.seed(708L) @@ -1915,7 +2399,7 @@ test_that("lgb.train() fit on linearly-related data improves when using linear params <- list( objective = "regression" - , verbose = -1L + , verbose = VERBOSITY , metric = "mse" , seed = 0L , num_leaves = 2L @@ -1945,11 +2429,11 @@ test_that("lgb.train() fit on linearly-related data improves when using linear }) -test_that("lgb.train() w/ linear learner fails already-constructed dataset with linear=false", { +test_that("lgb.train() with linear learner fails already-constructed dataset with linear=false", { set.seed(708L) params <- list( objective = "regression" - , verbose = -1L + , verbose = VERBOSITY , metric = "mse" , seed = 0L , num_leaves = 2L @@ -1961,11 +2445,13 @@ test_that("lgb.train() w/ linear learner fails already-constructed dataset with ) dtrain$construct() expect_error({ - bst_linear <- lgb.train( - data = dtrain - , nrounds = 10L - , params = utils::modifyList(params, list(linear_tree = TRUE)) - ) + capture.output({ + bst_linear <- lgb.train( + data = dtrain + , nrounds = 10L + , params = utils::modifyList(params, list(linear_tree = TRUE)) + ) + }, type = "message") }, regexp = "Cannot change linear_tree after constructed Dataset handle") }) @@ -1986,7 +2472,7 @@ test_that("lgb.train() works with linear learners even if Dataset has missing va params <- list( objective = "regression" - , verbose = -1L + , verbose = VERBOSITY , metric = "mse" , seed = 0L , num_leaves = 2L @@ -2032,7 +2518,7 @@ test_that("lgb.train() works with linear learners, bagging, and a Dataset that h params <- list( objective = "regression" - , verbose = -1L + , verbose = VERBOSITY , metric = "mse" , seed = 0L , num_leaves = 2L @@ -2142,8 +2628,6 @@ test_that("lgb.train() works with linear learners when Dataset has categorical f expect_true(bst_lin_last_mse <
bst_last_mse) }) -context("interaction constraints") - test_that("lgb.train() throws an informative error if interaction_constraints is not a list", { dtrain <- lgb.Dataset(train$data, label = train$label) params <- list(objective = "regression", interaction_constraints = "[1,2],[3]") @@ -2187,7 +2671,11 @@ test_that(paste0("lgb.train() gives same result when interaction_constraints is set.seed(1L) dtrain <- lgb.Dataset(train$data, label = train$label) - params <- list(objective = "regression", interaction_constraints = list(c(1L, 2L), 3L)) + params <- list( + objective = "regression" + , interaction_constraints = list(c(1L, 2L), 3L) + , verbose = VERBOSITY + ) bst <- lightgbm( data = dtrain , params = params @@ -2196,7 +2684,11 @@ test_that(paste0("lgb.train() gives same result when interaction_constraints is pred1 <- bst$predict(test$data) cnames <- colnames(train$data) - params <- list(objective = "regression", interaction_constraints = list(c(cnames[[1L]], cnames[[2L]]), cnames[[3L]])) + params <- list( + objective = "regression" + , interaction_constraints = list(c(cnames[[1L]], cnames[[2L]]), cnames[[3L]]) + , verbose = VERBOSITY + ) bst <- lightgbm( data = dtrain , params = params @@ -2204,7 +2696,11 @@ test_that(paste0("lgb.train() gives same result when interaction_constraints is ) pred2 <- bst$predict(test$data) - params <- list(objective = "regression", interaction_constraints = list(c(cnames[[1L]], cnames[[2L]]), 3L)) + params <- list( + objective = "regression" + , interaction_constraints = list(c(cnames[[1L]], cnames[[2L]]), 3L) + , verbose = VERBOSITY + ) bst <- lightgbm( data = dtrain , params = params @@ -2221,7 +2717,11 @@ test_that(paste0("lgb.train() gives same results when using interaction_constrai set.seed(1L) dtrain <- lgb.Dataset(train$data, label = train$label) - params <- list(objective = "regression", interaction_constraints = list(c(1L, 2L), 3L)) + params <- list( + objective = "regression" + , interaction_constraints = list(c(1L, 2L), 3L) + , verbose = VERBOSITY + ) bst <- lightgbm( data = dtrain , params = params @@ -2230,8 +2730,11 @@ test_that(paste0("lgb.train() gives same results when using interaction_constrai pred1 <- bst$predict(test$data) new_colnames <- paste0(colnames(train$data), "_x") - params <- list(objective = "regression" - , interaction_constraints = list(c(new_colnames[1L], new_colnames[2L]), new_colnames[3L])) + params <- list( + objective = "regression" + , interaction_constraints = list(c(new_colnames[1L], new_colnames[2L]), new_colnames[3L]) + , verbose = VERBOSITY + ) bst <- lightgbm( data = dtrain , params = params @@ -2244,8 +2747,6 @@ test_that(paste0("lgb.train() gives same results when using interaction_constrai }) -context("monotone constraints") - .generate_trainset_for_monotone_constraints_tests <- function(x3_to_categorical) { n_samples <- 3000L x1_positively_correlated_with_y <- runif(n = n_samples, min = 0.0, max = 1.0) @@ -2376,6 +2877,7 @@ for (x3_to_categorical in c(TRUE, FALSE)) { , monotone_constraints = c(1L, -1L, 0L) , monotone_constraints_method = monotone_constraints_method , use_missing = FALSE + , verbose = VERBOSITY ) constrained_model <- lgb.train( params = params @@ -2392,3 +2894,617 @@ for (x3_to_categorical in c(TRUE, FALSE)) { }) } } + +test_that("lightgbm() accepts objective as function argument and under params", { + bst1 <- lightgbm( + data = train$data + , label = train$label + , params = list(objective = "regression_l1") + , nrounds = 5L + , verbose = VERBOSITY + ) + expect_equal(bst1$params$objective, 
"regression_l1") + model_txt_lines <- strsplit( + x = bst1$save_model_to_string() + , split = "\n" + )[[1L]] + expect_true(any(model_txt_lines == "objective=regression_l1")) + expect_false(any(model_txt_lines == "objective=regression_l2")) + + bst2 <- lightgbm( + data = train$data + , label = train$label + , objective = "regression_l1" + , nrounds = 5L + , verbose = VERBOSITY + ) + expect_equal(bst2$params$objective, "regression_l1") + model_txt_lines <- strsplit( + x = bst2$save_model_to_string() + , split = "\n" + )[[1L]] + expect_true(any(model_txt_lines == "objective=regression_l1")) + expect_false(any(model_txt_lines == "objective=regression_l2")) +}) + +test_that("lightgbm() prioritizes objective under params over objective as function argument", { + bst1 <- lightgbm( + data = train$data + , label = train$label + , objective = "regression" + , params = list(objective = "regression_l1") + , nrounds = 5L + , verbose = VERBOSITY + ) + expect_equal(bst1$params$objective, "regression_l1") + model_txt_lines <- strsplit( + x = bst1$save_model_to_string() + , split = "\n" + )[[1L]] + expect_true(any(model_txt_lines == "objective=regression_l1")) + expect_false(any(model_txt_lines == "objective=regression_l2")) + + bst2 <- lightgbm( + data = train$data + , label = train$label + , objective = "regression" + , params = list(loss = "regression_l1") + , nrounds = 5L + , verbose = VERBOSITY + ) + expect_equal(bst2$params$objective, "regression_l1") + model_txt_lines <- strsplit( + x = bst2$save_model_to_string() + , split = "\n" + )[[1L]] + expect_true(any(model_txt_lines == "objective=regression_l1")) + expect_false(any(model_txt_lines == "objective=regression_l2")) +}) + +test_that("lightgbm() accepts init_score as function argument", { + bst1 <- lightgbm( + data = train$data + , label = train$label + , objective = "binary" + , nrounds = 5L + , verbose = VERBOSITY + ) + pred1 <- predict(bst1, train$data, type = "raw") + + bst2 <- lightgbm( + data = train$data + , label = train$label + , init_score = pred1 + , objective = "binary" + , nrounds = 5L + , verbose = VERBOSITY + ) + pred2 <- predict(bst2, train$data, type = "raw") + + expect_true(any(pred1 != pred2)) +}) + +test_that("lightgbm() defaults to 'regression' objective if objective not otherwise provided", { + bst <- lightgbm( + data = train$data + , label = train$label + , nrounds = 5L + , verbose = VERBOSITY + ) + expect_equal(bst$params$objective, "regression") + model_txt_lines <- strsplit( + x = bst$save_model_to_string() + , split = "\n" + )[[1L]] + expect_true(any(model_txt_lines == "objective=regression")) + expect_false(any(model_txt_lines == "objective=regression_l1")) +}) + +test_that("lightgbm() accepts 'num_threads' as either top-level argument or under params", { + bst <- lightgbm( + data = train$data + , label = train$label + , nrounds = 5L + , verbose = VERBOSITY + , num_threads = 1L + ) + expect_equal(bst$params$num_threads, 1L) + model_txt_lines <- strsplit( + x = bst$save_model_to_string() + , split = "\n" + )[[1L]] + expect_true(any(grepl("\\[num_threads: 1\\]", model_txt_lines))) + + bst <- lightgbm( + data = train$data + , label = train$label + , nrounds = 5L + , verbose = VERBOSITY + , params = list(num_threads = 1L) + ) + expect_equal(bst$params$num_threads, 1L) + model_txt_lines <- strsplit( + x = bst$save_model_to_string() + , split = "\n" + )[[1L]] + expect_true(any(grepl("\\[num_threads: 1\\]", model_txt_lines))) + + bst <- lightgbm( + data = train$data + , label = train$label + , nrounds = 5L + , verbose = 
VERBOSITY + , num_threads = 10L + , params = list(num_threads = 1L) + ) + expect_equal(bst$params$num_threads, 1L) + model_txt_lines <- strsplit( + x = bst$save_model_to_string() + , split = "\n" + )[[1L]] + expect_true(any(grepl("\\[num_threads: 1\\]", model_txt_lines))) +}) + +test_that("lightgbm() accepts 'weight' and 'weights'", { + data(mtcars) + X <- as.matrix(mtcars[, -1L]) + y <- as.numeric(mtcars[, 1L]) + w <- rep(1.0, nrow(X)) + model <- lightgbm( + X + , y + , weights = w + , obj = "regression" + , nrounds = 5L + , verbose = VERBOSITY + , params = list( + min_data_in_bin = 1L + , min_data_in_leaf = 1L + ) + ) + expect_equal(model$.__enclos_env__$private$train_set$get_field("weight"), w) + + # Avoid a bad CRAN check due to partial argument matches + lgb_args <- list( + X + , y + , weight = w + , obj = "regression" + , nrounds = 5L + , verbose = -1L + ) + model <- do.call(lightgbm, lgb_args) + expect_equal(model$.__enclos_env__$private$train_set$get_field("weight"), w) +}) + +.assert_has_expected_logs <- function(log_txt, lgb_info, lgb_warn, early_stopping, valid_eval_msg) { + expect_identical( + object = any(grepl("\\[LightGBM\\] \\[Info\\]", log_txt)) + , expected = lgb_info + ) + expect_identical( + object = any(grepl("\\[LightGBM\\] \\[Warning\\]", log_txt)) + , expected = lgb_warn + ) + expect_identical( + object = any(grepl("Will train until there is no improvement in 5 rounds", log_txt)) + , expected = early_stopping + ) + expect_identical( + object = any(grepl("Did not meet early stopping", log_txt)) + , expected = early_stopping + ) + expect_identical( + object = any(grepl("valid's auc\\:[0-9]+", log_txt)) + , expected = valid_eval_msg + ) +} + +.assert_has_expected_record_evals <- function(fitted_model) { + record_evals <- fitted_model$record_evals + expect_equal(record_evals$start_iter, 1L) + if (inherits(fitted_model, "lgb.CVBooster")) { + expected_valid_auc <- c(0.979056, 0.9844697, 0.9900813, 0.9908026, 0.9935588) + } else { + expected_valid_auc <- c(0.9805752, 0.9805752, 0.9934957, 0.9934957, 0.9949372) + } + expect_equal( + object = unlist(record_evals[["valid"]][["auc"]][["eval"]]) + , expected = expected_valid_auc + , tolerance = TOLERANCE + ) + expect_named(record_evals, c("start_iter", "valid"), ignore.order = TRUE, ignore.case = FALSE) + expect_equal(record_evals[["valid"]][["auc"]][["eval_err"]], list()) +} + +.train_for_verbosity_test <- function(train_function, verbose_kwarg, verbose_param) { + set.seed(708L) + nrounds <- 5L + params <- list( + num_leaves = 5L + , objective = "binary" + , metric = "auc" + , early_stopping_round = nrounds + ) + if (!is.null(verbose_param)) { + params[["verbose"]] <- verbose_param + } + train_kwargs <- list( + params = params + , nrounds = nrounds + ) + if (!is.null(verbose_kwarg)) { + train_kwargs[["verbose"]] <- verbose_kwarg + } + function_name <- deparse(substitute(train_function)) + if (function_name == "lgb.train") { + train_kwargs[["data"]] <- lgb.Dataset( + data = train$data + , label = train$label + ) + train_kwargs[["valids"]] <- list( + "valid" = lgb.Dataset(data = test$data, label = test$label) + ) + } else if (function_name == "lightgbm") { + train_kwargs[["data"]] <- train$data + train_kwargs[["label"]] <- train$label + train_kwargs[["valids"]] <- list( + "valid" = lgb.Dataset(data = test$data, label = test$label) + ) + } else if (function_name == "lgb.cv") { + train_kwargs[["data"]] <- lgb.Dataset( + data = train$data + , label = train$label + ) + train_kwargs[["nfold"]] <- 3L + train_kwargs[["showsd"]] <- 
FALSE + } + log_txt <- capture.output({ + bst <- do.call( + what = train_function + , args = train_kwargs + ) + }) + return(list(booster = bst, logs = log_txt)) +} + +test_that("lgb.train() only prints eval metrics when expected to", { + + # regardless of value passed to keyword argument 'verbose', value in params + # should take precedence + for (verbose_keyword_arg in c(-5L, -1L, 0L, 1L, 5L)) { + + # (verbose = -1) should not be any logs, should be record evals + out <- .train_for_verbosity_test( + train_function = lgb.train + , verbose_kwarg = verbose_keyword_arg + , verbose_param = -1L + ) + .assert_has_expected_logs( + log_txt = out[["logs"]] + , lgb_info = FALSE + , lgb_warn = FALSE + , early_stopping = FALSE + , valid_eval_msg = FALSE + ) + .assert_has_expected_record_evals( + fitted_model = out[["booster"]] + ) + + # (verbose = 0) should be only WARN-level LightGBM logs + out <- .train_for_verbosity_test( + train_function = lgb.train + , verbose_kwarg = verbose_keyword_arg + , verbose_param = 0L + ) + .assert_has_expected_logs( + log_txt = out[["logs"]] + , lgb_info = FALSE + , lgb_warn = TRUE + , early_stopping = FALSE + , valid_eval_msg = FALSE + ) + .assert_has_expected_record_evals( + fitted_model = out[["booster"]] + ) + + # (verbose > 0) should be INFO- and WARN-level LightGBM logs, and record eval messages + out <- .train_for_verbosity_test( + train_function = lgb.train + , verbose_kwarg = verbose_keyword_arg + , verbose_param = 1L + ) + .assert_has_expected_logs( + log_txt = out[["logs"]] + , lgb_info = TRUE + , lgb_warn = TRUE + , early_stopping = TRUE + , valid_eval_msg = TRUE + ) + .assert_has_expected_record_evals( + fitted_model = out[["booster"]] + ) + } + + # if verbosity isn't specified in `params`, changing keyword argument `verbose` should + # alter what messages are printed + + # (verbose = -1) should not be any logs, should be record evals + out <- .train_for_verbosity_test( + train_function = lgb.train + , verbose_kwarg = -1L + , verbose_param = NULL + ) + .assert_has_expected_logs( + log_txt = out[["logs"]] + , lgb_info = FALSE + , lgb_warn = FALSE + , early_stopping = FALSE + , valid_eval_msg = FALSE + ) + .assert_has_expected_record_evals( + fitted_model = out[["booster"]] + ) + + # (verbose = 0) should be only WARN-level LightGBM logs + out <- .train_for_verbosity_test( + train_function = lgb.train + , verbose_kwarg = 0L + , verbose_param = NULL + ) + .assert_has_expected_logs( + log_txt = out[["logs"]] + , lgb_info = FALSE + , lgb_warn = TRUE + , early_stopping = FALSE + , valid_eval_msg = FALSE + ) + .assert_has_expected_record_evals( + fitted_model = out[["booster"]] + ) + + # (verbose > 0) should be INFO- and WARN-level LightGBM logs, and record eval messages + out <- .train_for_verbosity_test( + train_function = lgb.train + , verbose_kwarg = 1L + , verbose_param = NULL + ) + .assert_has_expected_logs( + log_txt = out[["logs"]] + , lgb_info = TRUE + , lgb_warn = TRUE + , early_stopping = TRUE + , valid_eval_msg = TRUE + ) + .assert_has_expected_record_evals( + fitted_model = out[["booster"]] + ) +}) + +test_that("lightgbm() only prints eval metrics when expected to", { + + # regardless of value passed to keyword argument 'verbose', value in params + # should take precedence + for (verbose_keyword_arg in c(-5L, -1L, 0L, 1L, 5L)) { + + # (verbose = -1) should not be any logs, train should not be in valids + out <- .train_for_verbosity_test( + train_function = lightgbm + , verbose_kwarg = verbose_keyword_arg + , verbose_param = -1L + ) + 
.assert_has_expected_logs( + log_txt = out[["logs"]] + , lgb_info = FALSE + , lgb_warn = FALSE + , early_stopping = FALSE + , valid_eval_msg = FALSE + ) + .assert_has_expected_record_evals( + fitted_model = out[["booster"]] + ) + + # (verbose = 0) should be only WARN-level LightGBM logs, train should not be in valids + out <- .train_for_verbosity_test( + train_function = lightgbm + , verbose_kwarg = verbose_keyword_arg + , verbose_param = 0L + ) + .assert_has_expected_logs( + log_txt = out[["logs"]] + , lgb_info = FALSE + , lgb_warn = TRUE + , early_stopping = FALSE + , valid_eval_msg = FALSE + ) + .assert_has_expected_record_evals( + fitted_model = out[["booster"]] + ) + + # (verbose > 0) should be INFO- and WARN-level LightGBM logs, and record eval messages, and + # train should be in valids + out <- .train_for_verbosity_test( + train_function = lightgbm + , verbose_kwarg = verbose_keyword_arg + , verbose_param = 1L + ) + .assert_has_expected_logs( + log_txt = out[["logs"]] + , lgb_info = TRUE + , lgb_warn = TRUE + , early_stopping = TRUE + , valid_eval_msg = TRUE + ) + .assert_has_expected_record_evals( + fitted_model = out[["booster"]] + ) + } + + # if verbosity isn't specified in `params`, changing keyword argument `verbose` should + # alter what messages are printed + + # (verbose = -1) should not be any logs, train should not be in valids + out <- .train_for_verbosity_test( + train_function = lightgbm + , verbose_kwarg = -1L + , verbose_param = NULL + ) + .assert_has_expected_logs( + log_txt = out[["logs"]] + , lgb_info = FALSE + , lgb_warn = FALSE + , early_stopping = FALSE + , valid_eval_msg = FALSE + ) + .assert_has_expected_record_evals( + fitted_model = out[["booster"]] + ) + + # (verbose = 0) should be only WARN-level LightGBM logs, train should not be in valids + out <- .train_for_verbosity_test( + train_function = lightgbm + , verbose_kwarg = 0L + , verbose_param = NULL + ) + .assert_has_expected_logs( + log_txt = out[["logs"]] + , lgb_info = FALSE + , lgb_warn = TRUE + , early_stopping = FALSE + , valid_eval_msg = FALSE + ) + .assert_has_expected_record_evals( + fitted_model = out[["booster"]] + ) + + # (verbose > 0) should be INFO- and WARN-level LightGBM logs, and record eval messages, and + # train should be in valids + out <- .train_for_verbosity_test( + train_function = lightgbm + , verbose_kwarg = 1L + , verbose_param = NULL + ) + .assert_has_expected_logs( + log_txt = out[["logs"]] + , lgb_info = TRUE + , lgb_warn = TRUE + , early_stopping = TRUE + , valid_eval_msg = TRUE + ) + .assert_has_expected_record_evals( + fitted_model = out[["booster"]] + ) +}) + +test_that("lgb.cv() only prints eval metrics when expected to", { + + # regardless of value passed to keyword argument 'verbose', value in params + # should take precedence + for (verbose_keyword_arg in c(-5L, -1L, 0L, 1L, 5L)) { + + # (verbose = -1) should not be any logs, should be record evals + out <- .train_for_verbosity_test( + verbose_kwarg = verbose_keyword_arg + , verbose_param = -1L + , train_function = lgb.cv + ) + .assert_has_expected_logs( + log_txt = out[["logs"]] + , lgb_info = FALSE + , lgb_warn = FALSE + , early_stopping = FALSE + , valid_eval_msg = FALSE + ) + .assert_has_expected_record_evals( + fitted_model = out[["booster"]] + ) + + # (verbose = 0) should be only WARN-level LightGBM logs + out <- .train_for_verbosity_test( + verbose_kwarg = verbose_keyword_arg + , verbose_param = 0L + , train_function = lgb.cv + ) + .assert_has_expected_logs( + log_txt = out[["logs"]] + , lgb_info = FALSE + , 
lgb_warn = TRUE + , early_stopping = FALSE + , valid_eval_msg = FALSE + ) + .assert_has_expected_record_evals( + fitted_model = out[["booster"]] + ) + + # (verbose > 0) should be INFO- and WARN-level LightGBM logs, and record eval messages + out <- .train_for_verbosity_test( + verbose_kwarg = verbose_keyword_arg + , verbose_param = 1L + , train_function = lgb.cv + ) + .assert_has_expected_logs( + log_txt = out[["logs"]] + , lgb_info = TRUE + , lgb_warn = TRUE + , early_stopping = TRUE + , valid_eval_msg = TRUE + ) + .assert_has_expected_record_evals( + fitted_model = out[["booster"]] + ) + } + + # if verbosity isn't specified in `params`, changing keyword argument `verbose` should + # alter what messages are printed + + # (verbose = -1) should not be any logs, should be record evals + out <- .train_for_verbosity_test( + verbose_kwarg = -1L + , verbose_param = NULL + , train_function = lgb.cv + ) + .assert_has_expected_logs( + log_txt = out[["logs"]] + , lgb_info = FALSE + , lgb_warn = FALSE + , early_stopping = FALSE + , valid_eval_msg = FALSE + ) + .assert_has_expected_record_evals( + fitted_model = out[["booster"]] + ) + + # (verbose = 0) should be only WARN-level LightGBM logs + out <- .train_for_verbosity_test( + verbose_kwarg = 0L + , verbose_param = NULL + , train_function = lgb.cv + ) + .assert_has_expected_logs( + log_txt = out[["logs"]] + , lgb_info = FALSE + , lgb_warn = TRUE + , early_stopping = FALSE + , valid_eval_msg = FALSE + ) + .assert_has_expected_record_evals( + fitted_model = out[["booster"]] + ) + + # (verbose > 0) should be INFO- and WARN-level LightGBM logs, and record eval messages + out <- .train_for_verbosity_test( + verbose_kwarg = 1L + , verbose_param = NULL + , train_function = lgb.cv + ) + .assert_has_expected_logs( + log_txt = out[["logs"]] + , lgb_info = TRUE + , lgb_warn = TRUE + , early_stopping = TRUE + , valid_eval_msg = TRUE + ) + .assert_has_expected_record_evals( + fitted_model = out[["booster"]] + ) +}) diff --git a/R-package/tests/testthat/test_custom_objective.R b/R-package/tests/testthat/test_custom_objective.R index 54f5c300907a..974430e1ab41 100644 --- a/R-package/tests/testthat/test_custom_objective.R +++ b/R-package/tests/testthat/test_custom_objective.R @@ -1,4 +1,6 @@ -context("Test models with custom objective") +VERBOSITY <- as.integer( + Sys.getenv("LIGHTGBM_TEST_VERBOSITY", "-1") +) data(agaricus.train, package = "lightgbm") data(agaricus.test, package = "lightgbm") @@ -36,6 +38,7 @@ param <- list( , learning_rate = 1.0 , objective = logregobj , metric = "auc" + , verbose = VERBOSITY ) num_round <- 10L @@ -50,6 +53,7 @@ test_that("using a custom objective, custom eval, and no other metrics works", { params = list( num_leaves = 8L , learning_rate = 1.0 + , verbose = VERBOSITY ) , data = dtrain , nrounds = 4L @@ -67,3 +71,19 @@ test_that("using a custom objective, custom eval, and no other metrics works", { expect_true(eval_results[["name"]] == "error") expect_false(eval_results[["higher_better"]]) }) + +test_that("using a custom objective that returns wrong shape grad or hess raises an informative error", { + bad_grad <- function(preds, dtrain) { + return(list(grad = numeric(0L), hess = rep(1.0, length(preds)))) + } + bad_hess <- function(preds, dtrain) { + return(list(grad = rep(1.0, length(preds)), hess = numeric(0L))) + } + params <- list(num_leaves = 3L, verbose = VERBOSITY) + expect_error({ + lgb.train(params = params, data = dtrain, obj = bad_grad) + }, sprintf("Expected custom objective
function to return grad with length %d, got 0.", nrow(dtrain))) + expect_error({ + lgb.train(params = params, data = dtrain, obj = bad_hess) + }, sprintf("Expected custom objective function to return hess with length %d, got 0.", nrow(dtrain))) +}) diff --git a/R-package/tests/testthat/test_dataset.R b/R-package/tests/testthat/test_dataset.R index 52515440d7fb..7be64daedf60 100644 --- a/R-package/tests/testthat/test_dataset.R +++ b/R-package/tests/testthat/test_dataset.R @@ -1,4 +1,6 @@ -context("testing lgb.Dataset functionality") +VERBOSITY <- as.integer( + Sys.getenv("LIGHTGBM_TEST_VERBOSITY", "-1") +) data(agaricus.train, package = "lightgbm") train_data <- agaricus.train$data[seq_len(1000L), ] @@ -10,7 +12,13 @@ test_label <- agaricus.test$label[1L:100L] test_that("lgb.Dataset: basic construction, saving, loading", { # from sparse matrix - dtest1 <- lgb.Dataset(test_data, label = test_label) + dtest1 <- lgb.Dataset( + test_data + , label = test_label + , params = list( + verbose = VERBOSITY + ) + ) # from dense matrix dtest2 <- lgb.Dataset(as.matrix(test_data), label = test_label) expect_equal(get_field(dtest1, "label"), get_field(dtest2, "label")) @@ -19,7 +27,12 @@ test_that("lgb.Dataset: basic construction, saving, loading", { tmp_file <- tempfile("lgb.Dataset_") lgb.Dataset.save(dtest1, tmp_file) # read from a local file - dtest3 <- lgb.Dataset(tmp_file) + dtest3 <- lgb.Dataset( + tmp_file + , params = list( + verbose = VERBOSITY + ) + ) lgb.Dataset.construct(dtest3) unlink(tmp_file) expect_equal(get_field(dtest1, "label"), get_field(dtest3, "label")) @@ -144,7 +157,10 @@ test_that("Dataset$set_reference() updates categorical_feature, colnames, and pr dtest$set_reference(dtrain) # after setting reference to dtrain, those attributes should have dtrain's values - expect_is(dtest$.__enclos_env__$private$predictor, "lgb.Predictor") + expect_true(methods::is( + dtest$.__enclos_env__$private$predictor + , "lgb.Predictor" + )) expect_identical( dtest$.__enclos_env__$private$predictor$.__enclos_env__$private$handle , dtrain$.__enclos_env__$private$predictor$.__enclos_env__$private$handle @@ -195,7 +211,7 @@ test_that("lgb.Dataset: Dataset should be able to construct from matrix and retu , lightgbm:::lgb.params2str(params = list()) , ref_handle ) - expect_is(handle, "externalptr") + expect_true(methods::is(handle, "externalptr")) expect_false(is.null(handle)) .Call(LGBM_DatasetFree_R, handle) handle <- NULL @@ -210,7 +226,9 @@ test_that("cpp errors should be raised as proper R errors", { , init_score = seq_len(10L) ) expect_error({ - dtrain$construct() + capture.output({ + dtrain$construct() + }, type = "message") }, regexp = "Initial score size doesn't match data size") }) @@ -353,6 +371,9 @@ test_that("lgb.Dataset: should be able to run lgb.train() immediately after usin dtest <- lgb.Dataset( data = test_data , label = test_label + , params = list( + verbose = VERBOSITY + ) ) tmp_file <- tempfile(pattern = "lgb.Dataset_") lgb.Dataset.save( @@ -368,6 +389,7 @@ test_that("lgb.Dataset: should be able to run lgb.train() immediately after usin , metric = "binary_logloss" , num_leaves = 5L , learning_rate = 1.0 + , verbose = VERBOSITY ) # should be able to train right away @@ -383,6 +405,9 @@ test_that("lgb.Dataset: should be able to run lgb.cv() immediately after using l dtest <- lgb.Dataset( data = test_data , label = test_label + , params = list( + verbosity = VERBOSITY + ) ) tmp_file <- tempfile(pattern = "lgb.Dataset_") lgb.Dataset.save( @@ -398,6 +423,8 @@ test_that("lgb.Dataset: should 
be able to run lgb.cv() immediately after using l , metric = "binary_logloss" , num_leaves = 5L , learning_rate = 1.0 + , num_iterations = 5L + , verbosity = VERBOSITY ) # should be able to train right away @@ -406,7 +433,7 @@ test_that("lgb.Dataset: should be able to run lgb.cv() immediately after using l , data = dtest_read_in ) - expect_is(bst, "lgb.CVBooster") + expect_true(methods::is(bst, "lgb.CVBooster")) }) test_that("lgb.Dataset: should be able to use and retrieve long feature names", { @@ -440,7 +467,10 @@ test_that("lgb.Dataset: should be able to create a Dataset from a text file with dtrain <- lgb.Dataset( data = train_file - , params = list(header = TRUE) + , params = list( + header = TRUE + , verbosity = VERBOSITY + ) ) dtrain$construct() expect_identical(dtrain$get_colnames(), c("x1", "x2")) @@ -461,7 +491,10 @@ test_that("lgb.Dataset: should be able to create a Dataset from a text file with dtrain <- lgb.Dataset( data = train_file - , params = list(header = FALSE) + , params = list( + header = FALSE + , verbosity = VERBOSITY + ) ) dtrain$construct() expect_identical(dtrain$get_colnames(), c("Column_0", "Column_1")) @@ -492,6 +525,9 @@ test_that("Dataset: method calls on a Dataset with a null handle should raise an expect_error({ dtrain$get_colnames() }, regexp = "cannot get column names before dataset has been constructed") + expect_error({ + dtrain$get_feature_num_bin(1L) + }, regexp = "Cannot get number of bins in feature before constructing Dataset.") expect_error({ dtrain$save_binary(fname = tempfile(fileext = ".bin")) }, regexp = "Attempting to create a Dataset without any raw data") @@ -516,3 +552,68 @@ test_that("Dataset: method calls on a Dataset with a null handle should raise an dtrain$set_reference(reference = dvalid) }, regexp = "cannot get column names before dataset has been constructed") }) + +test_that("lgb.Dataset$get_feature_num_bin() works", { + raw_df <- data.frame( + all_random = runif(100L) + , two_vals = rep(c(1.0, 2.0), 50L) + , three_vals = c(rep(c(0.0, 1.0, 2.0), 33L), 0.0) + , two_vals_plus_missing = c(rep(c(1.0, 2.0), 49L), NA_real_, NA_real_) + , all_zero = rep(0.0, 100L) + , categorical = sample.int(2L, 100L, replace = TRUE) + ) + n_features <- ncol(raw_df) + raw_mat <- data.matrix(raw_df) + min_data_in_bin <- 2L + ds <- lgb.Dataset( + raw_mat + , params = list(min_data_in_bin = min_data_in_bin) + , categorical_feature = n_features + ) + ds$construct() + expected_num_bins <- c( + 100L %/% min_data_in_bin + 1L # extra bin for zero + , 3L # 0, 1, 2 + , 3L # 0, 1, 2 + , 4L # 0, 1, 2 + NA + , 0L # unused + , 3L # 1, 2 + NA + ) + actual_num_bins <- sapply(1L:n_features, ds$get_feature_num_bin) + expect_identical(actual_num_bins, expected_num_bins) + # test using defined feature names + bins_by_name <- sapply(colnames(raw_mat), ds$get_feature_num_bin) + expect_identical(unname(bins_by_name), expected_num_bins) + # test using default feature names + no_names_mat <- raw_mat + colnames(no_names_mat) <- NULL + ds_no_names <- lgb.Dataset( + no_names_mat + , params = list(min_data_in_bin = min_data_in_bin) + , categorical_feature = n_features + ) + ds_no_names$construct() + default_names <- lapply( + X = seq(1L, ncol(raw_mat)) + , FUN = function(i) { + sprintf("Column_%d", i - 1L) + } + ) + bins_by_default_name <- sapply(default_names, ds_no_names$get_feature_num_bin) + expect_identical(bins_by_default_name, expected_num_bins) +}) + +test_that("lgb.Dataset can be constructed with categorical features and without colnames", { + # check that dataset can be 
constructed + raw_mat <- matrix(rep(c(0L, 1L), 50L), ncol = 1L) + ds <- lgb.Dataset(raw_mat, categorical_feature = 1L)$construct() + sparse_mat <- as(raw_mat, "dgCMatrix") + ds2 <- lgb.Dataset(sparse_mat, categorical_feature = 1L)$construct() + # check that the column names are the default ones + expect_equal(ds$.__enclos_env__$private$colnames, "Column_0") + expect_equal(ds2$.__enclos_env__$private$colnames, "Column_0") + # check for error when index is greater than the number of columns + expect_error({ + lgb.Dataset(raw_mat, categorical_feature = 2L)$construct() + }, regexp = "supplied a too large value in categorical_feature: 2 but only 1 features") +}) diff --git a/R-package/tests/testthat/test_learning_to_rank.R b/R-package/tests/testthat/test_learning_to_rank.R index d0966692f5ba..8e49e2d4d567 100644 --- a/R-package/tests/testthat/test_learning_to_rank.R +++ b/R-package/tests/testthat/test_learning_to_rank.R @@ -1,9 +1,10 @@ -context("Learning to rank") +VERBOSITY <- as.integer( + Sys.getenv("LIGHTGBM_TEST_VERBOSITY", "-1") +) # numerical tolerance to use when checking metric values TOLERANCE <- 1e-06 -ON_SOLARIS <- Sys.info()["sysname"] == "SunOS" ON_32_BIT_WINDOWS <- .Platform$OS.type == "windows" && .Machine$sizeof.pointer != 8L test_that("learning-to-rank with lgb.train() works as expected", { @@ -18,13 +19,14 @@ test_that("learning-to-rank with lgb.train() works as expected", { , group = rep(150L, 40L) ) ndcg_at <- "1,2,3" - eval_names <- paste0("ndcg@", strsplit(ndcg_at, ",")[[1L]]) + eval_names <- paste0("ndcg@", strsplit(ndcg_at, ",")[[1L]]) params <- list( objective = "lambdarank" , metric = "ndcg" , ndcg_at = ndcg_at , lambdarank_truncation_level = 3L , learning_rate = 0.001 + , verbose = VERBOSITY ) model <- lgb.train( params = params @@ -47,9 +49,17 @@ test_that("learning-to-rank with lgb.train() works as expected", { expect_true(result[["higher_better"]]) expect_identical(result[["data_name"]], "training") } - expect_identical(sapply(eval_results, function(x) {x$name}), eval_names) + expect_identical( + sapply( + X = eval_results + , FUN = function(x) { + x$name + } + ) + , eval_names + ) expect_equal(eval_results[[1L]][["value"]], 0.775) - if (!(ON_SOLARIS || ON_32_BIT_WINDOWS)) { + if (!ON_32_BIT_WINDOWS) { expect_true(abs(eval_results[[2L]][["value"]] - 0.745986) < TOLERANCE) expect_true(abs(eval_results[[3L]][["value"]] - 0.7351959) < TOLERANCE) } @@ -57,8 +67,8 @@ test_that("learning-to-rank with lgb.train() works as expected", { test_that("learning-to-rank with lgb.cv() works as expected", { testthat::skip_if( - ON_SOLARIS || ON_32_BIT_WINDOWS - , message = "Skipping on Solaris and 32-bit Windows" + ON_32_BIT_WINDOWS + , message = "Skipping on 32-bit Windows" ) set.seed(708L) data(agaricus.train, package = "lightgbm") @@ -71,7 +81,7 @@ test_that("learning-to-rank with lgb.cv() works as expected", { , group = rep(150L, 40L) ) ndcg_at <- "1,2,3" - eval_names <- paste0("ndcg@", strsplit(ndcg_at, ",")[[1L]]) + eval_names <- paste0("ndcg@", strsplit(ndcg_at, ",")[[1L]]) params <- list( objective = "lambdarank" , metric = "ndcg" @@ -80,6 +90,7 @@ test_that("learning-to-rank with lgb.cv() works as expected", { , label_gain = "0,1,3" , min_data = 1L , learning_rate = 0.01 + , verbose = VERBOSITY ) nfold <- 4L nrounds <- 10L @@ -89,12 +100,12 @@ test_that("learning-to-rank with lgb.cv() works as expected", { , nrounds = nrounds , nfold = nfold ) - expect_is(cv_bst, "lgb.CVBooster") + expect_true(methods::is(cv_bst, "lgb.CVBooster")) expect_equal(length(cv_bst$boosters), 
nfold) # "valid" should contain results for each metric eval_results <- cv_bst$record_evals[["valid"]] - eval_names <- c("ndcg@1", "ndcg@2", "ndcg@3") + eval_names <- c("ndcg@1", "ndcg@2", "ndcg@3") expect_identical(names(eval_results), eval_names) # check that best score and iter make sense (0.0 < nDCG < 1.0) diff --git a/R-package/tests/testthat/test_lgb.Booster.R b/R-package/tests/testthat/test_lgb.Booster.R index 1b357f794468..8208ef416a65 100644 --- a/R-package/tests/testthat/test_lgb.Booster.R +++ b/R-package/tests/testthat/test_lgb.Booster.R @@ -1,4 +1,6 @@ -context("Booster") +VERBOSITY <- as.integer( + Sys.getenv("LIGHTGBM_TEST_VERBOSITY", "-1") +) ON_WINDOWS <- .Platform$OS.type == "windows" TOLERANCE <- 1e-6 @@ -12,7 +14,7 @@ test_that("Booster$finalize() should not fail", { , params = list( objective = "regression" ) - , verbose = -1L + , verbose = VERBOSITY , nrounds = 3L ) expect_true(lgb.is.Booster(bst)) @@ -27,8 +29,6 @@ test_that("Booster$finalize() should not fail", { expect_true(lgb.is.null.handle(bst$.__enclos_env__$private$handle)) }) -context("lgb.get.eval.result") - test_that("lgb.get.eval.result() should throw an informative error if booster is not an lgb.Booster", { bad_inputs <- list( matrix(1.0:10.0, 2L, 5L) @@ -65,6 +65,7 @@ test_that("lgb.get.eval.result() should throw an informative error for incorrect , metric = "l2" , min_data = 1L , learning_rate = 1.0 + , verbose = VERBOSITY ) , data = dtrain , nrounds = 5L @@ -98,6 +99,7 @@ test_that("lgb.get.eval.result() should throw an informative error for incorrect , metric = "l2" , min_data = 1L , learning_rate = 1.0 + , verbose = VERBOSITY ) , data = dtrain , nrounds = 5L @@ -118,8 +120,6 @@ test_that("lgb.get.eval.result() should throw an informative error for incorrect }, regexp = "Only the following eval_names exist for dataset.*\\: \\[l2\\]", fixed = FALSE) }) -context("lgb.load()") - test_that("lgb.load() gives the expected error messages given different incorrect inputs", { set.seed(708L) data(agaricus.train, package = "lightgbm") @@ -133,9 +133,9 @@ test_that("lgb.load() gives the expected error messages given different incorrec objective = "binary" , num_leaves = 4L , learning_rate = 1.0 + , verbose = VERBOSITY ) , nrounds = 2L - , save_name = tempfile(fileext = ".model") ) # you have to give model_str or filename @@ -179,9 +179,9 @@ test_that("Loading a Booster from a text file works", { num_leaves = 4L , learning_rate = 1.0 , objective = "binary" + , verbose = VERBOSITY ) , nrounds = 2L - , save_name = tempfile(fileext = ".model") ) expect_true(lgb.is.Booster(bst)) @@ -221,6 +221,7 @@ test_that("boosters with linear models at leaves can be written to text file and data = dtrain , nrounds = 10L , params = params + , verbose = VERBOSITY ) expect_true(lgb.is.Booster(bst)) @@ -254,9 +255,9 @@ test_that("Loading a Booster from a string works", { num_leaves = 4L , learning_rate = 1.0 , objective = "binary" + , verbose = VERBOSITY ) , nrounds = 2L - , save_name = tempfile(fileext = ".model") ) expect_true(lgb.is.Booster(bst)) @@ -288,13 +289,12 @@ test_that("Saving a large model to string should work", { , objective = "binary" ) , nrounds = 500L - , save_name = tempfile(fileext = ".model") - , verbose = -1L + , verbose = VERBOSITY ) pred <- predict(bst, train$data) - pred_leaf_indx <- predict(bst, train$data, predleaf = TRUE) - pred_raw_score <- predict(bst, train$data, rawscore = TRUE) + pred_leaf_indx <- predict(bst, train$data, type = "leaf") + pred_raw_score <- predict(bst, train$data, type = "raw") 
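# The `type` argument used just above replaces the older logical flags on
# predict(): type = "raw" covers the removed rawscore = TRUE, and type = "leaf"
# covers predleaf = TRUE. As an illustrative check (not part of the patch,
# using only objects already defined in this test, with `expected_prob` as a
# hypothetical helper name): for the "binary" objective, the default prediction
# is approximately the sigmoid of the raw score.
expected_prob <- 1.0 / (1.0 + exp(-pred_raw_score))
stopifnot(max(abs(pred - expected_prob)) < 1e-10)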
model_string <- bst$save_model_to_string() # make sure this test is still producing a model bigger than the default @@ -312,8 +312,8 @@ test_that("Saving a large model to string should work", { model_str = model_string ) pred2 <- predict(bst2, train$data) - pred2_leaf_indx <- predict(bst2, train$data, predleaf = TRUE) - pred2_raw_score <- predict(bst2, train$data, rawscore = TRUE) + pred2_leaf_indx <- predict(bst2, train$data, type = "leaf") + pred2_raw_score <- predict(bst2, train$data, type = "raw") expect_identical(pred, pred2) expect_identical(pred_leaf_indx, pred2_leaf_indx) expect_identical(pred_raw_score, pred2_raw_score) @@ -332,8 +332,7 @@ test_that("Saving a large model to JSON should work", { , objective = "binary" ) , nrounds = 200L - , save_name = tempfile(fileext = ".model") - , verbose = -1L + , verbose = VERBOSITY ) model_json <- bst$dump_model() @@ -360,9 +359,9 @@ test_that("If a string and a file are both passed to lgb.load() the file is used num_leaves = 4L , learning_rate = 1.0 , objective = "binary" + , verbose = VERBOSITY ) , nrounds = 2L - , save_name = tempfile(fileext = ".model") ) expect_true(lgb.is.Booster(bst)) @@ -383,8 +382,6 @@ test_that("If a string and a file are both passed to lgb.load() the file is used expect_identical(pred, pred2) }) -context("Booster") - test_that("Creating a Booster from a Dataset should work", { set.seed(708L) data(agaricus.train, package = "lightgbm") @@ -396,6 +393,7 @@ test_that("Creating a Booster from a Dataset should work", { bst <- Booster$new( params = list( objective = "binary" + , verbose = VERBOSITY ), train_set = dtrain ) @@ -416,9 +414,9 @@ test_that("Creating a Booster from a Dataset with an existing predictor should w num_leaves = 4L , learning_rate = 1.0 , objective = "binary" + , verbose = VERBOSITY ) , nrounds = nrounds - , save_name = tempfile(fileext = ".model") ) data(agaricus.test, package = "lightgbm") dtest <- Dataset$new( @@ -428,6 +426,9 @@ test_that("Creating a Booster from a Dataset with an existing predictor should w ) bst_from_ds <- Booster$new( train_set = dtest + , params = list( + verbose = VERBOSITY + ) ) expect_true(lgb.is.Booster(bst)) expect_equal(bst$current_iter(), nrounds) @@ -449,6 +450,7 @@ test_that("Booster$eval() should work on a Dataset stored in a binary file", { objective = "regression" , metric = "l2" , num_leaves = 4L + , verbose = VERBOSITY ) , data = dtrain , nrounds = 2L @@ -478,6 +480,7 @@ test_that("Booster$eval() should work on a Dataset stored in a binary file", { eval_from_file <- bst$eval( data = lgb.Dataset( data = test_file + , params = list(verbose = VERBOSITY) )$construct() , name = "test" ) @@ -505,9 +508,9 @@ test_that("Booster$rollback_one_iter() should work as expected", { num_leaves = 4L , learning_rate = 1.0 , objective = "binary" + , verbose = VERBOSITY ) , nrounds = nrounds - , save_name = tempfile(fileext = ".model") ) expect_equal(bst$current_iter(), nrounds) expect_true(lgb.is.Booster(bst)) @@ -539,9 +542,9 @@ test_that("Booster$update() passing a train_set works as expected", { num_leaves = 4L , learning_rate = 1.0 , objective = "binary" + , verbose = VERBOSITY ) , nrounds = nrounds - , save_name = tempfile(fileext = ".model") ) expect_true(lgb.is.Booster(bst)) expect_equal(bst$current_iter(), nrounds) @@ -549,6 +552,7 @@ test_that("Booster$update() passing a train_set works as expected", { train_set = Dataset$new( data = agaricus.train$data , label = agaricus.train$label + , params = list(verbose = VERBOSITY) ) ) expect_true(lgb.is.Booster(bst)) @@ -562,9 
+566,9 @@ test_that("Booster$update() passing a train_set works as expected", { num_leaves = 4L , learning_rate = 1.0 , objective = "binary" + , verbose = VERBOSITY ) , nrounds = nrounds + 1L - , save_name = tempfile(fileext = ".model") ) expect_true(lgb.is.Booster(bst2)) expect_equal(bst2$current_iter(), nrounds + 1L) @@ -587,9 +591,9 @@ test_that("Booster$update() throws an informative error if you provide a non-Dat num_leaves = 4L , learning_rate = 1.0 , objective = "binary" + , verbose = VERBOSITY ) , nrounds = nrounds - , save_name = tempfile(fileext = ".model") ) expect_error({ bst$update( @@ -614,6 +618,7 @@ test_that("Booster should store parameters and Booster$reset_parameter() should , metric = c("multi_logloss", "multi_error") , boosting = "gbdt" , num_class = 5L + , verbose = VERBOSITY ) bst <- Booster$new( params = params @@ -640,6 +645,7 @@ test_that("Booster$params should include dataset params, before and after Booste objective = "binary" , max_depth = 4L , bagging_fraction = 0.8 + , verbose = VERBOSITY ) bst <- Booster$new( params = params @@ -651,6 +657,7 @@ test_that("Booster$params should include dataset params, before and after Booste objective = "binary" , max_depth = 4L , bagging_fraction = 0.8 + , verbose = VERBOSITY , max_bin = 17L ) ) @@ -661,14 +668,13 @@ test_that("Booster$params should include dataset params, before and after Booste objective = "binary" , max_depth = 4L , bagging_fraction = 0.9 + , verbose = VERBOSITY , max_bin = 17L ) expect_identical(ret_bst$params, expected_params) expect_identical(bst$params, expected_params) }) -context("save_model") - test_that("Saving a model with different feature importance types works", { set.seed(708L) data(agaricus.train, package = "lightgbm") @@ -680,9 +686,9 @@ test_that("Saving a model with different feature importance types works", { num_leaves = 4L , learning_rate = 1.0 , objective = "binary" + , verbose = VERBOSITY ) , nrounds = 2L - , save_name = tempfile(fileext = ".model") ) expect_true(lgb.is.Booster(bst)) @@ -735,15 +741,19 @@ test_that("Saving a model with unknown importance type fails", { num_leaves = 4L , learning_rate = 1.0 , objective = "binary" + , verbose = VERBOSITY ) , nrounds = 2L - , save_name = tempfile(fileext = ".model") ) expect_true(lgb.is.Booster(bst)) UNSUPPORTED_IMPORTANCE <- 2L expect_error({ - model_string <- bst$save_model_to_string(feature_importance_type = UNSUPPORTED_IMPORTANCE) + capture.output({ + model_string <- bst$save_model_to_string( + feature_importance_type = UNSUPPORTED_IMPORTANCE + ) + }, type = "message") }, "Unknown importance type") }) @@ -770,27 +780,27 @@ test_that("all parameters are stored correctly with save_model_to_string()", { ) , data = dtrain , nrounds = nrounds - , verbose = 0L + , verbose = VERBOSITY ) model_str <- bst$save_model_to_string() params_in_file <- .params_from_model_string(model_str = model_str) # parameters should match what was passed from the R package - expect_equal(sum(grepl(pattern = "^\\[metric\\:", x = params_in_file)), 1L) + expect_equal(sum(startsWith(params_in_file, "[metric:")), 1L) expect_equal(sum(params_in_file == "[metric: l2]"), 1L) - expect_equal(sum(grepl(pattern = "^\\[num_iterations\\:", x = params_in_file)), 1L) + expect_equal(sum(startsWith(params_in_file, "[num_iterations:")), 1L) expect_equal(sum(params_in_file == "[num_iterations: 4]"), 1L) - expect_equal(sum(grepl(pattern = "^\\[objective\\:", x = params_in_file)), 1L) + expect_equal(sum(startsWith(params_in_file, "[objective:")), 1L) expect_equal(sum(params_in_file 
== "[objective: regression]"), 1L) - expect_equal(sum(grepl(pattern = "^\\[verbosity\\:", x = params_in_file)), 1L) - expect_equal(sum(params_in_file == "[verbosity: 0]"), 1L) + expect_equal(sum(startsWith(params_in_file, "[verbosity:")), 1L) + expect_equal(sum(params_in_file == sprintf("[verbosity: %i]", VERBOSITY)), 1L) # early stopping should be off by default - expect_equal(sum(grepl(pattern = "^\\[early_stopping_round\\:", x = params_in_file)), 1L) + expect_equal(sum(startsWith(params_in_file, "[early_stopping_round:")), 1L) expect_equal(sum(params_in_file == "[early_stopping_round: 0]"), 1L) }) @@ -833,7 +843,7 @@ test_that("early_stopping, num_iterations are stored correctly in model string e , valids = list( "random_valid" = dvalid ) - , verbose = 0L + , verbose = VERBOSITY ) model_str <- bst$save_model_to_string() @@ -841,15 +851,15 @@ test_that("early_stopping, num_iterations are stored correctly in model string e # parameters should match what was passed from the R package, and the "main" (non-alias) # params values in `params` should be preferred to keyword argumentts or aliases - expect_equal(sum(grepl(pattern = "^\\[num_iterations\\:", x = params_in_file)), 1L) + expect_equal(sum(startsWith(params_in_file, "[num_iterations:")), 1L) expect_equal(sum(params_in_file == sprintf("[num_iterations: %s]", num_iterations)), 1L) - expect_equal(sum(grepl(pattern = "^\\[early_stopping_round\\:", x = params_in_file)), 1L) + expect_equal(sum(startsWith(params_in_file, "[early_stopping_round:")), 1L) expect_equal(sum(params_in_file == sprintf("[early_stopping_round: %s]", early_stopping_round)), 1L) # none of the aliases shouold have been written to the model file - expect_equal(sum(grepl(pattern = "^\\[num_boost_round\\:", x = params_in_file)), 0L) - expect_equal(sum(grepl(pattern = "^\\[n_iter\\:", x = params_in_file)), 0L) - expect_equal(sum(grepl(pattern = "^\\[n_iter_no_change\\:", x = params_in_file)), 0L) + expect_equal(sum(startsWith(params_in_file, "[num_boost_round:")), 0L) + expect_equal(sum(startsWith(params_in_file, "[n_iter:")), 0L) + expect_equal(sum(startsWith(params_in_file, "[n_iter_no_change:")), 0L) }) @@ -864,7 +874,7 @@ test_that("Booster: method calls Booster with a null handle should raise an info , num_leaves = 8L ) , data = dtrain - , verbose = -1L + , verbose = VERBOSITY , nrounds = 5L , valids = list( train = dtrain @@ -937,8 +947,90 @@ test_that("Booster$new() using a Dataset with a null handle should raise an info rm(dtrain) dtrain <- readRDS(tmp_file) expect_error({ - bst <- Booster$new(train_set = dtrain) - }, regexp = "lgb.Booster: cannot create Booster handle") + bst <- Booster$new( + train_set = dtrain + , params = list( + verbose = VERBOSITY + ) + ) + }, regexp = "Attempting to create a Dataset without any raw data") +}) + +test_that("Booster$new() raises informative errors for malformed inputs", { + data(agaricus.train, package = "lightgbm") + train <- agaricus.train + dtrain <- lgb.Dataset(train$data, label = train$label) + + # no inputs + expect_error({ + Booster$new() + }, regexp = "lgb.Booster: Need at least either training dataset, model file, or model_str") + + # unrecognized objective + expect_error({ + capture.output({ + Booster$new( + params = list(objective = "not_a_real_objective") + , train_set = dtrain + ) + }, type = "message") + }, regexp = "Unknown objective type name: not_a_real_objective") + + # train_set is not a Dataset + expect_error({ + Booster$new( + train_set = data.table::data.table(rnorm(1L:10L)) + ) + }, regexp = "lgb.Booster: 
Can only use lgb.Dataset as training data") + + # model file isn't a string + expect_error({ + Booster$new( + modelfile = list() + ) + }, regexp = "lgb.Booster: Can only use a string as model file path") + + # model file doesn't exist + expect_error({ + capture.output({ + Booster$new( + params = list() + , modelfile = "file-that-does-not-exist.model" + ) + }, type = "message") + }, regexp = "Could not open file-that-does-not-exist.model") + + # model file doesn't contain a valid LightGBM model + model_file <- tempfile(fileext = ".model") + writeLines( + text = c("make", "good", "predictions") + , con = model_file + ) + expect_error({ + capture.output({ + Booster$new( + params = list() + , modelfile = model_file + ) + }, type = "message") + }, regexp = "Unknown model format or submodel type in model file") + + # malformed model string + expect_error({ + capture.output({ + Booster$new( + params = list() + , model_str = "a\nb\n" + ) + }, type = "message") + }, regexp = "Model file doesn't specify the number of classes") + + # model string isn't character or raw + expect_error({ + Booster$new( + model_str = numeric() + ) + }, regexp = "lgb.Booster: Can only use a character/raw vector as model_str") }) # this is almost identical to the test above it, but for lgb.cv(). A lot of code @@ -969,6 +1061,7 @@ test_that("lgb.cv() correctly handles passing through params to the model file", , n_iter = n_iter , early_stopping_round = early_stopping_round , n_iter_no_change = n_iter_no_change + , verbose = VERBOSITY ) cv_bst <- lgb.cv( @@ -977,7 +1070,7 @@ test_that("lgb.cv() correctly handles passing through params to the model file", , nrounds = nrounds_kwarg , early_stopping_rounds = early_stopping_round_kwarg , nfold = 3L - , verbose = 0L + , verbose = VERBOSITY ) for (bst in cv_bst$boosters) { @@ -986,21 +1079,19 @@ test_that("lgb.cv() correctly handles passing through params to the model file", # parameters should match what was passed from the R package, and the "main" (non-alias) # params values in `params` should be preferred to keyword arguments or aliases - expect_equal(sum(grepl(pattern = "^\\[num_iterations\\:", x = params_in_file)), 1L) + expect_equal(sum(startsWith(params_in_file, "[num_iterations:")), 1L) expect_equal(sum(params_in_file == sprintf("[num_iterations: %s]", num_iterations)), 1L) - expect_equal(sum(grepl(pattern = "^\\[early_stopping_round\\:", x = params_in_file)), 1L) + expect_equal(sum(startsWith(params_in_file, "[early_stopping_round:")), 1L) expect_equal(sum(params_in_file == sprintf("[early_stopping_round: %s]", early_stopping_round)), 1L) # none of the aliases should have been written to the model file - expect_equal(sum(grepl(pattern = "^\\[num_boost_round\\:", x = params_in_file)), 0L) - expect_equal(sum(grepl(pattern = "^\\[n_iter\\:", x = params_in_file)), 0L) - expect_equal(sum(grepl(pattern = "^\\[n_iter_no_change\\:", x = params_in_file)), 0L) + expect_equal(sum(startsWith(params_in_file, "[num_boost_round:")), 0L) + expect_equal(sum(startsWith(params_in_file, "[n_iter:")), 0L) + expect_equal(sum(startsWith(params_in_file, "[n_iter_no_change:")), 0L) } }) -context("saveRDS.lgb.Booster() and readRDS.lgb.Booster()") - test_that("params (including dataset params) should be stored in .rds file for Booster", { data(agaricus.train, package = "lightgbm") dtrain <- lgb.Dataset( @@ -1014,6 +1105,7 @@ test_that("params (including dataset params) should be stored in .rds file for B objective = "binary" , max_depth = 4L , bagging_fraction = 0.8 + , verbose = VERBOSITY ) 
bst <- Booster$new( params = params @@ -1029,13 +1121,12 @@ test_that("params (including dataset params) should be stored in .rds file for B objective = "binary" , max_depth = 4L , bagging_fraction = 0.8 + , verbose = VERBOSITY , max_bin = 17L ) ) }) -context("saveRDS and readRDS work on Booster") - test_that("params (including dataset params) should be stored in .rds file for Booster", { data(agaricus.train, package = "lightgbm") dtrain <- lgb.Dataset( @@ -1049,6 +1140,7 @@ test_that("params (including dataset params) should be stored in .rds file for B objective = "binary" , max_depth = 4L , bagging_fraction = 0.8 + , verbose = VERBOSITY ) bst <- Booster$new( params = params @@ -1064,6 +1156,7 @@ test_that("params (including dataset params) should be stored in .rds file for B objective = "binary" , max_depth = 4L , bagging_fraction = 0.8 + , verbose = VERBOSITY , max_bin = 17L ) ) @@ -1071,7 +1164,15 @@ test_that("Handle is automatically restored when calling predict", { data(agaricus.train, package = "lightgbm") - bst <- lightgbm(agaricus.train$data, agaricus.train$label, nrounds = 5L, obj = "binary") + bst <- lightgbm( + agaricus.train$data + , agaricus.train$label + , nrounds = 5L + , obj = "binary" + , params = list( + verbose = VERBOSITY + ) + ) bst_file <- tempfile(fileext = ".rds") saveRDS(bst, file = bst_file) @@ -1092,7 +1193,7 @@ test_that("boosters with linear models at leaves work with saveRDS.lgb.Booster a params <- list( objective = "regression" - , verbose = -1L + , verbose = VERBOSITY , metric = "mse" , seed = 0L , num_leaves = 2L @@ -1114,7 +1215,9 @@ test_that("boosters with linear models at leaves work with saveRDS.lgb.Booster a rm(bst) # load the booster and make predictions...should be the same - expect_warning({bst2 <- readRDS.lgb.Booster(file = model_file)}) + expect_warning({ + bst2 <- readRDS.lgb.Booster(file = model_file) + }) preds2 <- predict(bst2, X) expect_identical(preds, preds2) }) @@ -1129,7 +1232,7 @@ test_that("boosters with linear models at leaves can be written to RDS and re-lo params <- list( objective = "regression" - , verbose = -1L + , verbose = VERBOSITY , metric = "mse" , seed = 0L , num_leaves = 2L @@ -1164,33 +1267,80 @@ test_that("Booster's print, show, and summary work correctly", { ) } + .has_expected_content_for_fitted_model <- function(printed_txt) { + expect_true(any(startsWith(printed_txt, "LightGBM Model"))) + expect_true(any(startsWith(printed_txt, "Fitted to dataset"))) + } + + .has_expected_content_for_finalized_model <- function(printed_txt) { + expect_true(any(grepl("^LightGBM Model$", printed_txt))) + expect_true(any(grepl("Booster handle is invalid", printed_txt))) + } + .check_methods_work <- function(model) { - # should work for fitted models - ret <- print(model) + #--- should work for fitted models ---# + + # print() + log_txt <- capture.output({ + ret <- print(model) + }) .have_same_handle(ret, model) - ret <- show(model) + .has_expected_content_for_fitted_model(log_txt) + + # show() + log_txt <- capture.output({ + ret <- show(model) + }) expect_null(ret) - ret <- summary(model) + .has_expected_content_for_fitted_model(log_txt) + + # summary() + log_txt <- capture.output({ + ret <- summary(model) + }) .have_same_handle(ret, model) + .has_expected_content_for_fitted_model(log_txt) - # should not fail for finalized models + #--- should not fail for finalized models ---# model$finalize() + + # print() + log_txt <- 
capture.output({ + ret <- print(model) + }) + .has_expected_content_for_finalized_model(log_txt) + + # show() .have_same_handle(ret, model) - ret <- show(model) + log_txt <- capture.output({ + ret <- show(model) + }) expect_null(ret) - ret <- summary(model) + .has_expected_content_for_finalized_model(log_txt) + + # summary() + log_txt <- capture.output({ + ret <- summary(model) + }) .have_same_handle(ret, model) + .has_expected_content_for_finalized_model(log_txt) } data("mtcars") model <- lgb.train( - params = list(objective = "regression") + params = list( + objective = "regression" + , min_data_in_leaf = 1L + ) , data = lgb.Dataset( as.matrix(mtcars[, -1L]) - , label = mtcars$mpg) - , verbose = 0L + , label = mtcars$mpg + , params = list( + min_data_in_bin = 1L + ) + ) + , verbose = VERBOSITY , nrounds = 5L ) .check_methods_work(model) @@ -1202,7 +1352,7 @@ test_that("Booster's print, show, and summary work correctly", { as.matrix(iris[, -5L]) , label = as.numeric(factor(iris$Species)) - 1.0 ) - , verbose = 0L + , verbose = VERBOSITY , nrounds = 5L ) .check_methods_work(model) @@ -1235,7 +1385,7 @@ test_that("Booster's print, show, and summary work correctly", { ) , obj = .logregobj , eval = .evalerror - , verbose = 0L + , verbose = VERBOSITY , nrounds = 5L ) @@ -1245,11 +1395,18 @@ test_that("Booster's print, show, and summary work correctly", { test_that("LGBM_BoosterGetNumFeature_R returns correct outputs", { data("mtcars") model <- lgb.train( - params = list(objective = "regression") + params = list( + objective = "regression" + , min_data_in_leaf = 1L + ) , data = lgb.Dataset( as.matrix(mtcars[, -1L]) - , label = mtcars$mpg) - , verbose = 0L + , label = mtcars$mpg + , params = list( + min_data_in_bin = 1L + ) + ) + , verbose = VERBOSITY , nrounds = 5L ) ncols <- .Call(LGBM_BoosterGetNumFeature_R, model$.__enclos_env__$private$handle) @@ -1262,7 +1419,7 @@ test_that("LGBM_BoosterGetNumFeature_R returns correct outputs", { as.matrix(iris[, -5L]) , label = as.numeric(factor(iris$Species)) - 1.0 ) - , verbose = 0L + , verbose = VERBOSITY , nrounds = 5L ) ncols <- .Call(LGBM_BoosterGetNumFeature_R, model$.__enclos_env__$private$handle) diff --git a/R-package/tests/testthat/test_lgb.convert_with_rules.R b/R-package/tests/testthat/test_lgb.convert_with_rules.R index 546ab9663f4f..39438f0ec5cd 100644 --- a/R-package/tests/testthat/test_lgb.convert_with_rules.R +++ b/R-package/tests/testthat/test_lgb.convert_with_rules.R @@ -1,5 +1,3 @@ -context("lgb.convert_with_rules()") - test_that("lgb.convert_with_rules() rejects inputs that are not a data.table or data.frame", { bad_inputs <- list( matrix(1.0:10.0, 2L, 5L) @@ -37,7 +35,7 @@ test_that("lgb.convert_with_rules() should work correctly for a dataset with onl expect_identical(converted_dataset[["col2"]], c(1L, 1L, 2L)) # rules should be returned and correct rules <- conversion_result$rules - expect_is(rules, "list") + expect_true(methods::is(rules, "list")) expect_length(rules, ncol(input_data)) expect_identical(rules[["col1"]], c("a" = 1L, "b" = 2L, "c" = 3L)) expect_identical(rules[["col2"]], c("green" = 1L, "red" = 2L)) @@ -62,7 +60,7 @@ test_that("lgb.convert_with_rules() should work correctly for a dataset with onl expect_identical(converted_dataset[["col2"]], c(1L, 1L, 2L)) # rules should be returned and correct rules <- conversion_result$rules - expect_is(rules, "list") + expect_true(methods::is(rules, "list")) expect_length(rules, ncol(input_data)) expect_identical(rules[["col1"]], c("a" = 1L, "b" = 2L, "c" = 3L)) 
expect_identical(rules[["col2"]], c("green" = 1L, "red" = 2L)) @@ -106,7 +104,7 @@ test_that("lgb.convert_with_rules() should work correctly for a dataset with num expect_identical(converted_dataset[["factor_col"]], c(1L, 1L, 2L)) # rules should be returned and correct rules <- conversion_result$rules - expect_is(rules, "list") + expect_true(methods::is(rules, "list")) expect_length(rules, 2L) expect_identical(rules[["character_col"]], c("a" = 1L, "b" = 2L, "c" = 3L)) expect_identical(rules[["factor_col"]], c("n" = 1L, "y" = 2L)) @@ -164,7 +162,7 @@ test_that("lgb.convert_with_rules() should convert missing values to the expecte # rules should be returned and correct rules <- conversion_result$rules - expect_is(rules, "list") + expect_true(methods::is(rules, "list")) expect_length(rules, 3L) expect_identical(rules[["character_col"]], c("a" = 1L, "c" = 2L)) expect_identical(rules[["factor_col"]], c("n" = 1L, "y" = 2L)) diff --git a/R-package/tests/testthat/test_lgb.importance.R b/R-package/tests/testthat/test_lgb.importance.R index c0e1d6e8ca82..7dcf75613fb4 100644 --- a/R-package/tests/testthat/test_lgb.importance.R +++ b/R-package/tests/testthat/test_lgb.importance.R @@ -1,5 +1,3 @@ -context("lgb.importance") - test_that("lgb.importance() should reject bad inputs", { bad_inputs <- list( .Machine$integer.max diff --git a/R-package/tests/testthat/test_lgb.interprete.R b/R-package/tests/testthat/test_lgb.interprete.R index 9c03165f181f..29ac110accbc 100644 --- a/R-package/tests/testthat/test_lgb.interprete.R +++ b/R-package/tests/testthat/test_lgb.interprete.R @@ -2,8 +2,6 @@ VERBOSITY <- as.integer( Sys.getenv("LIGHTGBM_TEST_VERBOSITY", "-1") ) -context("lgb.interpete") - .sigmoid <- function(x) { 1.0 / (1.0 + exp(-x)) } diff --git a/R-package/tests/testthat/test_lgb.plot.importance.R b/R-package/tests/testthat/test_lgb.plot.importance.R index 5cbd04dab83f..1a1e2b0d5398 100644 --- a/R-package/tests/testthat/test_lgb.plot.importance.R +++ b/R-package/tests/testthat/test_lgb.plot.importance.R @@ -2,8 +2,6 @@ VERBOSITY <- as.integer( Sys.getenv("LIGHTGBM_TEST_VERBOSITY", "-1") ) -context("lgb.plot.importance()") - test_that("lgb.plot.importance() should run without error for well-formed inputs", { data(agaricus.train, package = "lightgbm") train <- agaricus.train diff --git a/R-package/tests/testthat/test_lgb.plot.interpretation.R b/R-package/tests/testthat/test_lgb.plot.interpretation.R index 931ab9e2fcf8..bb8009d3595b 100644 --- a/R-package/tests/testthat/test_lgb.plot.interpretation.R +++ b/R-package/tests/testthat/test_lgb.plot.interpretation.R @@ -2,8 +2,6 @@ VERBOSITY <- as.integer( Sys.getenv("LIGHTGBM_TEST_VERBOSITY", "-1") ) -context("lgb.plot.interpretation") - .sigmoid <- function(x) { 1.0 / (1.0 + exp(-x)) } diff --git a/R-package/tests/testthat/test_lgb.unloader.R b/R-package/tests/testthat/test_lgb.unloader.R deleted file mode 100644 index 58087bc63429..000000000000 --- a/R-package/tests/testthat/test_lgb.unloader.R +++ /dev/null @@ -1,59 +0,0 @@ -VERBOSITY <- as.integer( - Sys.getenv("LIGHTGBM_TEST_VERBOSITY", "-1") -) - -context("lgb.unloader") - -test_that("lgb.unloader works as expected", { - data(agaricus.train, package = "lightgbm") - train <- agaricus.train - dtrain <- lgb.Dataset(train$data, label = train$label) - bst <- lgb.train( - params = list( - objective = "regression" - , metric = "l2" - , min_data = 1L - , learning_rate = 1.0 - , verbosity = VERBOSITY - ) - , data = dtrain - , nrounds = 1L - ) - expect_true(exists("bst")) - result <- lgb.unloader(restore = TRUE, 
wipe = TRUE, envir = environment()) - expect_false(exists("bst")) - expect_null(result) -}) - -test_that("lgb.unloader finds all boosters and removes them", { - data(agaricus.train, package = "lightgbm") - train <- agaricus.train - dtrain <- lgb.Dataset(train$data, label = train$label) - bst1 <- lgb.train( - params = list( - objective = "regression" - , metric = "l2" - , min_data = 1L - , learning_rate = 1.0 - , verbosity = VERBOSITY - ) - , data = dtrain - , nrounds = 1L - ) - bst2 <- lgb.train( - params = list( - objective = "regression" - , metric = "l2" - , min_data = 1L - , learning_rate = 1.0 - , verbosity = VERBOSITY - ) - , data = dtrain - , nrounds = 1L - ) - expect_true(exists("bst1")) - expect_true(exists("bst2")) - lgb.unloader(restore = TRUE, wipe = TRUE, envir = environment()) - expect_false(exists("bst1")) - expect_false(exists("bst2")) -}) diff --git a/R-package/tests/testthat/test_metrics.R b/R-package/tests/testthat/test_metrics.R index 73f60c5f8043..2974ec0130ff 100644 --- a/R-package/tests/testthat/test_metrics.R +++ b/R-package/tests/testthat/test_metrics.R @@ -1,5 +1,3 @@ -context(".METRICS_HIGHER_BETTER()") - test_that(".METRICS_HIGHER_BETTER() should be well formed", { metrics <- .METRICS_HIGHER_BETTER() metric_names <- names(.METRICS_HIGHER_BETTER()) @@ -8,5 +6,5 @@ test_that(".METRICS_HIGHER_BETTER() should be well formed", { # no metrics should be repeated expect_true(length(unique(metric_names)) == length(metrics)) # should not be any NAs - expect_false(any(is.na(metrics))) + expect_false(anyNA(metrics)) }) diff --git a/R-package/tests/testthat/test_parameters.R b/R-package/tests/testthat/test_parameters.R index 170a7b900ac8..610db6bd9d4b 100644 --- a/R-package/tests/testthat/test_parameters.R +++ b/R-package/tests/testthat/test_parameters.R @@ -1,6 +1,3 @@ - -context("feature penalties") - data(agaricus.train, package = "lightgbm") data(agaricus.test, package = "lightgbm") train <- agaricus.train @@ -26,7 +23,6 @@ test_that("Feature penalties work properly", { ) , nrounds = 5L , verbose = -1L - , save_name = tempfile(fileext = ".model") ) }) @@ -47,8 +43,6 @@ test_that("Feature penalties work properly", { expect_length(var_gain[[length(var_gain)]], 0L) }) -context("parameter aliases") - test_that(".PARAMETER_ALIASES() returns a named list of character vectors, where names are unique", { param_aliases <- .PARAMETER_ALIASES() expect_identical(class(param_aliases), "list") @@ -64,6 +58,37 @@ test_that(".PARAMETER_ALIASES() returns a named list of character vectors, where expect_equal(sort(param_aliases[["task"]]), c("task", "task_type")) }) +test_that(".PARAMETER_ALIASES() uses the internal session cache", { + + cache_key <- "PARAMETER_ALIASES" + + # clear cache, so this test isn't reliant on the order unit tests are run in + if (exists(cache_key, where = .lgb_session_cache_env)) { + rm(list = cache_key, envir = .lgb_session_cache_env) + } + expect_false(exists(cache_key, where = .lgb_session_cache_env)) + + # check that result looks correct for at least one parameter + iter_aliases <- .PARAMETER_ALIASES()[["num_iterations"]] + expect_true(is.character(iter_aliases)) + expect_true(all(c("num_round", "nrounds") %in% iter_aliases)) + + # patch the cache to check that .PARAMETER_ALIASES() checks it + assign( + x = cache_key + , value = list(num_iterations = c("test", "other_test")) + , envir = .lgb_session_cache_env + ) + iter_aliases <- .PARAMETER_ALIASES()[["num_iterations"]] + expect_equal(iter_aliases, c("test", "other_test")) + + # re-set cache so this doesn't 
interfere with other unit tests + if (exists(cache_key, where = .lgb_session_cache_env)) { + rm(list = cache_key, envir = .lgb_session_cache_env) + } + expect_false(exists(cache_key, where = .lgb_session_cache_env)) +}) + test_that("training should warn if you use 'dart' boosting, specified with 'boosting' or aliases", { for (boosting_param in .PARAMETER_ALIASES()[["boosting"]]) { params <- list( @@ -80,7 +105,6 @@ test_that("training should warn if you use 'dart' boosting, specified with 'boos , params = params , nrounds = 5L , verbose = -1L - , save_name = tempfile(fileext = ".model") ) }, regexp = "Early stopping is not available in 'dart' mode") } diff --git a/R-package/tests/testthat/test_utils.R b/R-package/tests/testthat/test_utils.R index ed351af5a3f9..a0866b00d6b4 100644 --- a/R-package/tests/testthat/test_utils.R +++ b/R-package/tests/testthat/test_utils.R @@ -1,5 +1,3 @@ -context("lgb.params2str") - test_that("lgb.params2str() works as expected for empty lists", { out_str <- lgb.params2str( params = list() @@ -26,7 +24,16 @@ test_that("lgb.params2str() works as expected for a key in params with multiple ) }) -context("lgb.check.eval") +test_that("lgb.params2str() passes through duplicated params", { + out_str <- lgb.params2str( + params = list( + objective = "regression" + , bagging_fraction = 0.8 + , bagging_fraction = 0.5 # nolint: duplicate_argument + ) + ) + expect_equal(out_str, "objective=regression bagging_fraction=0.8 bagging_fraction=0.5") +}) test_that("lgb.check.eval works as expected with no metric", { params <- lgb.check.eval( @@ -73,8 +80,6 @@ test_that("lgb.check.eval drops duplicate metrics and preserves order", { expect_identical(params[["metric"]], list("l1", "l2", "rmse")) }) -context("lgb.check.wrapper_param") - test_that("lgb.check.wrapper_param() uses passed-in keyword arg if no alias found in params", { kwarg_val <- sample(seq_len(100L), size = 1L) params <- lgb.check.wrapper_param( diff --git a/R-package/tests/testthat/test_weighted_loss.R b/R-package/tests/testthat/test_weighted_loss.R index 2d11f8f23f3b..d00399548560 100644 --- a/R-package/tests/testthat/test_weighted_loss.R +++ b/R-package/tests/testthat/test_weighted_loss.R @@ -2,8 +2,6 @@ VERBOSITY <- as.integer( Sys.getenv("LIGHTGBM_TEST_VERBOSITY", "-1") ) -context("Case weights are respected") - test_that("Gamma regression reacts on 'weight'", { n <- 100L set.seed(87L) diff --git a/README.md b/README.md index c7dd0ed366fd..1cb7b9019ff5 100644 --- a/README.md +++ b/README.md @@ -69,6 +69,8 @@ Julia-package: https://github.com/IQVIA-ML/LightGBM.jl JPMML (Java PMML converter): https://github.com/jpmml/jpmml-lightgbm +Nyoka (Python PMML converter): https://github.com/SoftwareAG/nyoka + Treelite (model compiler for efficient deployment): https://github.com/dmlc/treelite lleaves (LLVM-based model compiler for efficient inference): https://github.com/siboehm/lleaves diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 000000000000..869fdfe2b246 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,41 @@ + + +## Security + +Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 
+ +If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below. + +## Reporting Security Issues + +**Please do not report security vulnerabilities through public GitHub issues.** + +Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report). + +If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey). + +You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). + +Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: + + * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) + * Full paths of source file(s) related to the manifestation of the issue + * The location of the affected source code (tag/branch/commit or direct URL) + * Any special configuration required to reproduce the issue + * Step-by-step instructions to reproduce the issue + * Proof-of-concept or exploit code (if possible) + * Impact of the issue, including how an attacker might exploit the issue + +This information will help us triage your report more quickly. + +If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs. + +## Preferred Languages + +We prefer all communications to be in English. + +## Policy + +Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd). 
+ + diff --git a/VERSION.txt b/VERSION.txt index 6903f9251ad2..49678dc178bd 100644 --- a/VERSION.txt +++ b/VERSION.txt @@ -1 +1 @@ -3.3.1.99 +3.3.2.99 diff --git a/build_r.R b/build_r.R index dbf225e25f70..64ca29989db9 100644 --- a/build_r.R +++ b/build_r.R @@ -24,7 +24,7 @@ TEMP_SOURCE_DIR <- file.path(TEMP_R_DIR, "src") , "make_args" = character(0L) ) for (arg in args) { - if (any(grepl("^\\-j[0-9]+", arg))) { + if (any(grepl("^\\-j[0-9]+", arg))) { # nolint: non_portable_path out_list[["make_args"]] <- arg } else if (any(grepl("=", arg))) { split_arg <- strsplit(arg, "=")[[1L]] @@ -70,7 +70,7 @@ unrecognized_args <- setdiff(given_args, recognized_args) if (length(unrecognized_args) > 0L) { msg <- paste0( "Unrecognized arguments: " - , paste0(unrecognized_args, collapse = ", ") + , toString(unrecognized_args) ) stop(msg) } @@ -146,7 +146,7 @@ if (length(parsed_args[["make_args"]]) > 0L) { on_windows <- .Platform$OS.type == "windows" has_processx <- suppressMessages({ suppressWarnings({ - require("processx") # nolint + require("processx") # nolint: undesirable_function }) }) if (has_processx && on_windows) { @@ -404,7 +404,7 @@ dynlib_line <- grep( ) c_api_contents <- readLines(file.path(TEMP_SOURCE_DIR, "src", "lightgbm_R.h")) -c_api_contents <- c_api_contents[grepl("^LIGHTGBM_C_EXPORT", c_api_contents)] +c_api_contents <- c_api_contents[startsWith(c_api_contents, "LIGHTGBM_C_EXPORT")] c_api_contents <- gsub( pattern = "LIGHTGBM_C_EXPORT SEXP " , replacement = "" @@ -417,7 +417,7 @@ c_api_symbols <- gsub( ) dynlib_statement <- paste0( "useDynLib(lib_lightgbm, " - , paste0(c_api_symbols, collapse = ", ") + , toString(c_api_symbols) , ")" ) namespace_contents[dynlib_line] <- dynlib_statement diff --git a/docker/dockerfile-python b/docker/dockerfile-python index ae37c93329ef..3e473f5e9686 100644 --- a/docker/dockerfile-python +++ b/docker/dockerfile-python @@ -1,6 +1,6 @@ FROM ubuntu:16.04 -ARG CONDA_DIR=/opt/conda +ARG CONDA_DIR=/opt/miniforge ENV PATH $CONDA_DIR/bin:$PATH RUN apt-get update && \ @@ -13,8 +13,8 @@ RUN apt-get update && \ curl \ git && \ # python environment - curl -sL https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -o conda.sh && \ - /bin/bash conda.sh -f -b -p $CONDA_DIR && \ + curl -sL https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh -o miniforge.sh && \ + /bin/bash miniforge.sh -f -b -p $CONDA_DIR && \ export PATH="$CONDA_DIR/bin:$PATH" && \ conda config --set always_yes yes --set changeps1 no && \ # lightgbm diff --git a/docker/gpu/README.md b/docker/gpu/README.md index 13ff2bc686c9..cff4c555774c 100644 --- a/docker/gpu/README.md +++ b/docker/gpu/README.md @@ -18,7 +18,7 @@ LightGBM can be utilized in GPU and CPU modes and via Python. ## Contents - LightGBM (cpu + gpu) -- Python 3.8 (conda) + scikit-learn, notebooks, pandas, matplotlib +- Python (conda) + scikit-learn, notebooks, pandas, matplotlib Running the container starts a Jupyter Notebook at `localhost:8888`. 
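As an illustrative aside to the GPU docker README above (not part of the patch itself): once the container is up, a quick smoke test for the GPU build is to train a tiny model with ``device_type="gpu"`` from the Jupyter Notebook. This is a minimal sketch assuming the image's OpenCL drivers are configured as the Dockerfile sets them up:

.. code:: python

    import lightgbm as lgb
    import numpy as np

    rng = np.random.default_rng(0)
    X = rng.random((1000, 10))
    y = rng.random(1000)

    # "device_type": "gpu" exercises the OpenCL build shipped in this image;
    # if OpenCL is misconfigured, training raises an error rather than silently using the CPU
    params = {"objective": "regression", "device_type": "gpu", "verbose": -1}
    bst = lgb.train(params, lgb.Dataset(X, label=y), num_boost_round=5)
    print(bst.num_trees())  # 5 if GPU training succeeded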
diff --git a/docker/gpu/dockerfile.gpu b/docker/gpu/dockerfile.gpu index cc1aa43a02f2..bac9d97b2c2b 100644 --- a/docker/gpu/dockerfile.gpu +++ b/docker/gpu/dockerfile.gpu @@ -63,18 +63,18 @@ RUN mkdir -p /etc/OpenCL/vendors && \ # CONDA ################################################################################################################# -ARG CONDA_DIR=/opt/conda +ARG CONDA_DIR=/opt/miniforge # add to path ENV PATH $CONDA_DIR/bin:$PATH -# Install miniconda +# Install miniforge RUN echo "export PATH=$CONDA_DIR/bin:"'$PATH' > /etc/profile.d/conda.sh && \ - curl -sL https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -o ~/miniconda.sh && \ - /bin/bash ~/miniconda.sh -b -p $CONDA_DIR && \ - rm ~/miniconda.sh + curl -sL https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh -o ~/miniforge.sh && \ + /bin/bash ~/miniforge.sh -b -p $CONDA_DIR && \ + rm ~/miniforge.sh RUN conda config --set always_yes yes --set changeps1 no && \ - conda create -y -q -n py3 python=3.8 mkl numpy scipy scikit-learn jupyter notebook ipython pandas matplotlib + conda create -y -q -n py3 numpy scipy scikit-learn jupyter notebook ipython pandas matplotlib ################################################################################################################# # LightGBM diff --git a/docs/Advanced-Topics.rst b/docs/Advanced-Topics.rst index 8ef239d22de4..d1787b998479 100644 --- a/docs/Advanced-Topics.rst +++ b/docs/Advanced-Topics.rst @@ -23,8 +23,10 @@ Categorical Feature Support - Use ``categorical_feature`` to specify the categorical features. Refer to the parameter ``categorical_feature`` in `Parameters <./Parameters.rst#categorical_feature>`__. -- Categorical features must be encoded as non-negative integers (``int``) less than ``Int32.MaxValue`` (2147483647). +- Categorical features will be cast to ``int32`` (integer codes will be extracted from pandas categoricals in the Python-package) so they must be encoded as non-negative integers (negative values will be treated as missing) + less than ``Int32.MaxValue`` (2147483647). It is best to use a contiguous range of integers starting from zero. + Floating point numbers in categorical features will be rounded towards 0. - Use ``min_data_per_group``, ``cat_smooth`` to deal with over-fitting (when ``#data`` is small or ``#category`` is large).
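To make the encoding rules in the hunk above concrete, here is a minimal sketch (an editorial illustration, not part of the documentation source): categorical columns may be supplied either as pandas categoricals, whose integer codes are extracted automatically, or directly as non-negative integer codes:

.. code:: python

    import numpy as np
    import pandas as pd
    import lightgbm as lgb

    X = pd.DataFrame({
        # integer codes are extracted from the pandas categorical automatically
        "color": pd.Categorical(["red", "green", "blue", "green"] * 25),
        # already valid: non-negative int32 codes, contiguous and starting from zero
        "size_code": np.tile(np.array([0, 1, 2, 1], dtype=np.int32), 25),
    })
    y = np.random.default_rng(42).random(100)

    # both columns are treated as categorical features when the Dataset is constructed
    ds = lgb.Dataset(X, label=y, categorical_feature=["color", "size_code"])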
diff --git a/docs/FAQ.rst b/docs/FAQ.rst index 63c729b829b4..9f86b882e0a1 100644 --- a/docs/FAQ.rst +++ b/docs/FAQ.rst @@ -23,6 +23,8 @@ You may also ping a member of the core team according to the relevant area of ex - `@guolinke `__ **Guolin Ke** (C++ code / R-package / Python-package) - `@chivee `__ **Qiwei Ye** (C++ code / Python-package) - `@shiyu1994 `__ **Yu Shi** (C++ code / Python-package) +- `@tongwu-msft `__ **Tong Wu** (C++ code / Python-package) +- `@hzy46 `__ **Zhiyuan He** (C++ code / Python-package) - `@btrotta `__ **Belinda Trotta** (C++ code) - `@Laurae2 `__ **Damien Soukhavong** (R-package) - `@jameslamb `__ **James Lamb** (R-package / Dask-package) @@ -31,8 +33,6 @@ You may also ping a member of the core team according to the relevant area of ex - `@henry0312 `__ **Tsukasa Omoto** (Python-package) - `@StrikerRUS `__ **Nikita Titov** (Python-package) - `@huanzhang12 `__ **Huan Zhang** (GPU support) -- `@tongwu-msft `__ **Tong Wu** (C++ code / Python-package) -- `@hzy46 `__ **Zhiyuan He** (C++ code / Python-package) Please include as much of the following information as possible when submitting a critical issue: @@ -211,6 +211,30 @@ See `Microsoft/LightGBM#3060 `__. +16. LightGBM crashes randomly or operating system hangs during or after running LightGBM. +----------------------------------------------------------------------------------------- + +**Possible Cause**: This behavior may indicate that you have multiple OpenMP libraries installed on your machine and they conflict with each other, similar to ``FAQ #10``. + +If you are using any Python package that depends on ``threadpoolctl``, you may also see the following warning in your logs in this case: + +.. code-block:: console + + /root/miniconda/envs/test-env/lib/python3.8/site-packages/threadpoolctl.py:546: RuntimeWarning: + Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at + the same time. Both libraries are known to be incompatible and this + can cause random crashes or deadlocks on Linux when loaded in the + same Python program. + Using threadpoolctl may cause crashes or deadlocks. For more + information and possible workarounds, please see + https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md + +A detailed description of conflicts between multiple OpenMP instances is provided in the `following document `__. + +**Solution**: Assuming you are using LightGBM Python-package and conda as a package manager, we strongly recommend using the ``conda-forge`` channel as the only source of all your Python package installations because it contains built-in patches to work around OpenMP conflicts. Some other workarounds are listed `here `__. + +If this is not your case, then you should find the conflicting OpenMP library installations on your own and leave only one of them. + ------ R-package ^^^^^^^^^ @@ -223,9 +247,9 @@ R-package 1. Any training command using LightGBM does not work after an error occurred during the training of a previous LightGBM model. ------------------------------------------------------------------------------------------------------------------------------ -Run ``lgb.unloader(wipe = TRUE)`` in the R console, and recreate the LightGBM datasets (this will wipe all LightGBM-related variables). -Due to the pointers, choosing to not wipe variables will not fix the error. -This is a known issue: `Microsoft/LightGBM#698 `__.
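As an illustrative companion to FAQ #16 above (an editorial sketch, not part of the FAQ text): ``threadpoolctl``, the library that emits the warning quoted there, can also be used to list the OpenMP runtimes loaded in the current process; seeing more than one distinct library is a sign of the conflict described:

.. code:: python

    import threadpoolctl

    # each entry describes one threadpool-providing library loaded in this process
    openmp_libs = [
        info for info in threadpoolctl.threadpool_info()
        if info.get("user_api") == "openmp"
    ]
    # more than one distinct filepath here suggests conflicting OpenMP runtimes
    for lib in openmp_libs:
        print(lib["filepath"], lib.get("internal_api"))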
+In older versions of the R package (prior to ``v3.3.0``), this could happen occasionally and the solution was to run ``lgb.unloader(wipe = TRUE)`` to remove all LightGBM-related objects. Some conversation about this can be found in `Microsoft/LightGBM#698 `__. + +That is no longer necessary as of ``v3.3.0``, and the function ``lgb.unloader()`` has since been removed from the R package. 2. I used ``setinfo()``, tried to print my ``lgb.Dataset``, and now the R console froze! ---------------------------------------------------------------------------------------- @@ -309,3 +333,10 @@ Therefore, the first thing you should try in case of segfaults is **compiling fr For the OS-specific prerequisites see `this guide `__. Also, feel free to post a new issue in our GitHub repository. We always look at each case individually and try to find a root cause. + +4. I would like to install LightGBM from conda. What channel should I choose? +----------------------------------------------------------------------------- + +We strongly recommend installation from the ``conda-forge`` channel and not from the ``default`` one for several reasons. +The main ones are shorter delays for new releases, a greater number of supported architectures, and better handling of dependency conflicts; in particular, the built-in workaround for OpenMP conflicts is crucial for LightGBM. +More details can be found in `this comment `__. diff --git a/docs/GPU-Targets.rst b/docs/GPU-Targets.rst index aa2d0ac3cd9d..9c3cac7c814a 100644 --- a/docs/GPU-Targets.rst +++ b/docs/GPU-Targets.rst @@ -19,6 +19,8 @@ You can find below a table of correspondence: +---------------------------+-----------------+-----------------+-----------------+--------------+ | AMD APP SDK \* | Supported | Not Supported | Supported | Not Supported| +---------------------------+-----------------+-----------------+-----------------+--------------+ +| `PoCL`_ | Supported | Not Supported | Supported | Not Supported| ++---------------------------+-----------------+-----------------+-----------------+--------------+ | `NVIDIA CUDA Toolkit`_ | Not Supported | Not Supported | Not Supported | Supported | +---------------------------+-----------------+-----------------+-----------------+--------------+ @@ -167,4 +169,6 @@ Known issues: .. _clinfo: https://github.com/Oblomov/clinfo -.. _GPUCapsViewer: http://www.ozone3d.net/gpu_caps_viewer/ +.. _GPUCapsViewer: https://www.ozone3d.net/gpu_caps_viewer/ + +.. _PoCL: http://portablecl.org/ diff --git a/docs/GPU-Windows.rst b/docs/GPU-Windows.rst index 90772ddaf2c3..ccc1aab7c1f5 100644 --- a/docs/GPU-Windows.rst +++ b/docs/GPU-Windows.rst @@ -598,13 +598,13 @@ And open an issue in GitHub `here`_ with that log. .. _Khronos official OpenCL headers: https://github.com/KhronosGroup/OpenCL-Headers -.. _this: http://iweb.dl.sourceforge.net/project/mingw-w64/Toolchains%20targetting%20Win32/Personal%20Builds/mingw-builds/installer/mingw-w64-install.exe +.. _this: https://sourceforge.net/projects/mingw-w64/files/Toolchains%20targetting%20Win32/Personal%20Builds/mingw-builds/installer/mingw-w64-install.exe/download .. _Boost: https://www.boost.org/users/history/ -.. _Prebuilt Boost x86_64: https://mirror.linux-ia64.org/fedora/linux/releases/32/Everything/x86_64/os/Packages/m/mingw64-boost-static-1.66.0-6.fc32.noarch.rpm +.. _Prebuilt Boost x86_64: https://www.rpmfind.net/linux/fedora/linux/development/rawhide/Everything/x86_64/os/Packages/m/mingw64-boost-static-1.75.0-7.fc37.noarch.rpm -.. 
_Prebuilt Boost i686: https://mirror.linux-ia64.org/fedora/linux/releases/32/Everything/x86_64/os/Packages/m/mingw32-boost-static-1.66.0-6.fc32.noarch.rpm +.. _Prebuilt Boost i686: https://www.rpmfind.net/linux/fedora/linux/development/rawhide/Everything/x86_64/os/Packages/m/mingw32-boost-static-1.75.0-7.fc37.noarch.rpm .. _7zip: https://www.7-zip.org/download.html diff --git a/docs/Installation-Guide.rst b/docs/Installation-Guide.rst index 6d6456ea71e0..596daf80dc52 100644 --- a/docs/Installation-Guide.rst +++ b/docs/Installation-Guide.rst @@ -636,6 +636,8 @@ To build LightGBM CUDA version, run the following commands: cmake -DUSE_CUDA=1 .. make -j4 +Recently, a new CUDA version with better efficiency has been implemented as an experimental feature. To build the new CUDA version, replace ``-DUSE_CUDA`` with ``-DUSE_CUDA_EXP`` in the above commands. Please note that the new version requires **CUDA** 10.0 or later. + **Note**: glibc >= 2.14 is required. **Note**: In some rare cases you may need to install OpenMP runtime library separately (use your package manager and search for ``lib[g|i]omp`` for doing this). diff --git a/docs/Parallel-Learning-Guide.rst b/docs/Parallel-Learning-Guide.rst index 2fe895d5d3a2..f220ebe7c28c 100644 --- a/docs/Parallel-Learning-Guide.rst +++ b/docs/Parallel-Learning-Guide.rst @@ -230,6 +230,41 @@ You could edit your firewall rules to allow communication between any of the wor * the port ``local_listen_port`` is not open on any of the worker hosts * any machine has multiple Dask worker processes running on it +Using Custom Objective Functions with Dask +****************************************** + +It is possible to customize the boosting process by providing a custom objective function written in Python. +See the Dask API's documentation for details on how to implement such functions. + +.. warning:: + + Custom objective functions used with ``lightgbm.dask`` will be called by each worker process on only that worker's local data. + +Follow the example below to use a custom implementation of the ``regression_l2`` objective. + +.. code:: python + + import dask.array as da + import lightgbm as lgb + import numpy as np + from distributed import Client, LocalCluster + + cluster = LocalCluster(n_workers=2) + client = Client(cluster) + + X = da.random.random((1000, 10), (500, 10)) + y = da.random.random((1000,), (500,)) + + def custom_l2_obj(y_true, y_pred): + grad = y_pred - y_true + hess = np.ones(len(y_true)) + return grad, hess + + dask_model = lgb.DaskLGBMRegressor( + objective=custom_l2_obj + ) + dask_model.fit(X, y) + Prediction with Dask '''''''''''''''''''' diff --git a/docs/Parameters-Tuning.rst b/docs/Parameters-Tuning.rst index 0171f456c967..ece235f6e6c0 100644 --- a/docs/Parameters-Tuning.rst +++ b/docs/Parameters-Tuning.rst @@ -108,9 +108,9 @@ Use Early Stopping If early stopping is enabled, after each boosting round the model's training accuracy is evaluated against a validation set that contains data not available to the training process. That accuracy is then compared to the accuracy as of the previous boosting round. If the model's accuracy fails to improve for some number of consecutive rounds, LightGBM stops the training process. -That "number of consecutive rounds" is controlled by the parameter ``early_stopping_rounds``. For example, ``early_stopping_rounds=1`` says "the first time accuracy on the validation set does not improve, stop training". +That "number of consecutive rounds" is controlled by the parameter ``early_stopping_round``. 
For example, ``early_stopping_round=1`` says "the first time accuracy on the validation set does not improve, stop training". -Set ``early_stopping_rounds`` and provide a validation set to possibly reduce training time. +Set ``early_stopping_round`` and provide a validation set to possibly reduce training time. Consider Fewer Splits ''''''''''''''''''''' diff --git a/docs/Parameters.rst b/docs/Parameters.rst index b69142d6839d..152b7fbf616d 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -199,7 +199,7 @@ Core Parameters - **Note**: please **don't** change this during training, especially when running multiple jobs simultaneously by external packages, otherwise it may cause undesirable errors -- ``device_type`` :raw-html:`🔗︎`, default = ``cpu``, type = enum, options: ``cpu``, ``gpu``, ``cuda``, aliases: ``device`` +- ``device_type`` :raw-html:`🔗︎`, default = ``cpu``, type = enum, options: ``cpu``, ``gpu``, ``cuda``, ``cuda_exp``, aliases: ``device`` - device for the tree learning, you can use GPU to achieve faster learning @@ -209,6 +209,10 @@ Core Parameters - **Note**: refer to `Installation Guide <./Installation-Guide.rst#build-gpu-version>`__ to build LightGBM with GPU support + - **Note**: ``cuda_exp`` is an experimental CUDA version; the installation guide for ``cuda_exp`` is identical to that for ``cuda`` + + - **Note**: ``cuda_exp`` is faster than ``cuda`` and will replace ``cuda`` in the future + - ``seed`` :raw-html:`🔗︎`, default = ``None``, type = int, aliases: ``random_seed``, ``random_state`` - this seed is used to generate other seeds, e.g. ``data_random_seed``, ``feature_fraction_seed``, etc. @@ -590,7 +594,7 @@ Learning Control Parameters - larger values give stronger regularization - - the weight of each node is ``(n / path_smooth) * w + w_p / (n / path_smooth + 1)``, where ``n`` is the number of samples in the node, ``w`` is the optimal node weight to minimise the loss (approximately ``-sum_gradients / sum_hessians``), and ``w_p`` is the weight of the parent node + - the weight of each node is ``w * (n / path_smooth) / (n / path_smooth + 1) + w_p / (n / path_smooth + 1)``, where ``n`` is the number of samples in the node, ``w`` is the optimal node weight to minimise the loss (approximately ``-sum_gradients / sum_hessians``), and ``w_p`` is the weight of the parent node - note that the parent output ``w_p`` itself has smoothing applied, unless it is the root node, so that the smoothing effect accumulates with the tree depth @@ -660,6 +664,8 @@ Dataset Parameters - the linear model at each leaf includes all the numerical features in that leaf's branch + - the first tree has constant leaf values + - categorical features are used for splits as normal but are not used in the linear models - missing values should not be encoded as ``0``. Use ``np.nan`` for Python, ``NA`` for the CLI, and ``NA``, ``NA_real_``, or ``NA_integer_`` for R @@ -780,6 +786,8 @@ Dataset Parameters - **Note**: index starts from ``0`` and it doesn't count the label column when passing type is ``int``, e.g. when label is column\_0, and weight is column\_1, the correct parameter is ``weight=0`` + - **Note**: weights should be non-negative + - ``group_column`` :raw-html:`🔗︎`, default = ``""``, type = int or string, aliases: ``group``, ``group_id``, ``query_column``, ``query``, ``query_id`` - used to specify the query/group id column @@ -816,7 +824,7 @@ Dataset Parameters - add a prefix ``name:`` for column name, e.g. 
``categorical_feature=name:c1,c2,c3`` means c1, c2 and c3 are categorical features - - **Note**: only supports categorical with ``int`` type (not applicable for data represented as pandas DataFrame in Python-package) + - **Note**: all values will be cast to ``int32`` (integer codes will be extracted from pandas categoricals in the Python-package) - **Note**: index starts from ``0`` and it doesn't count the label column when passing type is ``int`` @@ -828,6 +836,8 @@ Dataset Parameters - **Note**: the output cannot be monotonically constrained with respect to a categorical feature + - **Note**: floating point numbers in categorical features will be rounded towards 0 + - ``forcedbins_filename`` :raw-html:`🔗︎`, default = ``""``, type = string - path to a ``.json`` file that specifies bin upper bounds for some or all features @@ -1272,6 +1282,8 @@ The initial score file corresponds with data file line by line, and has per scor And if the name of data file is ``train.txt``, the initial score file should be named as ``train.txt.init`` and placed in the same folder as the data file. In this case, LightGBM will auto load initial score file if it exists. +If binary data files exist for raw data file ``train.txt``, for example in the name ``train.txt.bin``, then the initial score file should be named as ``train.txt.bin.init``. + Weight Data ~~~~~~~~~~~ @@ -1284,7 +1296,8 @@ LightGBM supports weighted training. It uses an additional file to store weight 0.8 ... -It means the weight of the first data row is ``1.0``, second is ``0.5``, and so on. +It means the weight of the first data row is ``1.0``, second is ``0.5``, and so on. Weights should be non-negative. + The weight file corresponds with data file line by line, and has per weight per line. And if the name of data file is ``train.txt``, the weight file should be named as ``train.txt.weight`` and placed in the same folder as the data file. diff --git a/docs/Python-Intro.rst b/docs/Python-Intro.rst index 090bbc1c3b54..3c1cb1557e3f 100644 --- a/docs/Python-Intro.rst +++ b/docs/Python-Intro.rst @@ -228,18 +228,18 @@ Early stopping requires at least one set in ``valid_sets``. If there is more tha .. code:: python - bst = lgb.train(param, train_data, num_round, valid_sets=valid_sets, early_stopping_rounds=5) + bst = lgb.train(param, train_data, num_round, valid_sets=valid_sets, callbacks=[lgb.early_stopping(stopping_rounds=5)]) bst.save_model('model.txt', num_iteration=bst.best_iteration) The model will train until the validation score stops improving. -Validation score needs to improve at least every ``early_stopping_rounds`` to continue training. +Validation score needs to improve at least every ``stopping_rounds`` to continue training. -The index of iteration that has the best performance will be saved in the ``best_iteration`` field if early stopping logic is enabled by setting ``early_stopping_rounds``. +The index of iteration that has the best performance will be saved in the ``best_iteration`` field if early stopping logic is enabled by passing the ``early_stopping`` callback. Note that ``train()`` will return a model from the best iteration. This works with both metrics to minimize (L2, log loss, etc.) and to maximize (NDCG, AUC, etc.). Note that if you specify more than one evaluation metric, all of them will be used for early stopping. -However, you can change this behavior and make LightGBM check only the first metric for early stopping by passing ``first_metric_only=True`` in ``param`` or ``early_stopping`` callback constructor. 
diff --git a/docs/README.rst b/docs/README.rst
index e41fe8803715..c4f5ace3d4fe 100644
--- a/docs/README.rst
+++ b/docs/README.rst
@@ -13,20 +13,57 @@ After each commit on ``master``, documentation is updated and published to `Read
 
 Build
 -----
 
-You can build the documentation locally. Just install Doxygen and run in ``docs`` folder
+It is not necessary to re-build this documentation while modifying LightGBM's source code.
+The HTML files generated using ``Sphinx`` are not checked into source control.
+However, you may want to build them locally during development to test changes.
+
+Docker
+^^^^^^
+
+The most reliable way to build the documentation locally is with Docker, using `the same images Read the Docs uses `_.
+
+Run the following from the root of this repository to pull the relevant image and run a container locally.
+
+.. code:: sh
+
+    docker run \
+        --rm \
+        --user=0 \
+        -v $(pwd):/opt/LightGBM \
+        --env C_API=true \
+        --env CONDA=/opt/miniforge \
+        --env READTHEDOCS=true \
+        --workdir=/opt/LightGBM/docs \
+        --entrypoint="" \
+        -it readthedocs/build:ubuntu-20.04-2021.09.23 \
+        /bin/bash build-docs.sh
+
+When that command completes, open ``docs/_build/html/index.html`` in your browser.
+
+.. note::
+
+    The navigation in these locally-built docs does not link to the local copy of the R documentation. To view the local version of the R docs, open ``docs/_build/html/R/index.html`` in your browser.
+
+Without Docker
+^^^^^^^^^^^^^^
+
+You can build the documentation locally without Docker. Just install Doxygen and run the following in the ``docs`` folder
 
 .. code:: sh
 
-    pip install -r requirements.txt
+    pip install breathe sphinx 'sphinx_rtd_theme>=0.5'
     make html
 
-Unfortunately, documentation for R code is built only on our site, and commands above will not build it for you locally.
+Note that this will not build the R documentation. If you need it, consider using common R utilities for documentation generation,
+or use the Docker-based approach described above to build the R documentation locally.
+
+Optionally, you may also install ``scikit-learn`` and get richer documentation for the classes in ``Scikit-learn API``.
 
 If you face any problems with the Doxygen installation or simply do not need documentation for C code, it is possible to build the documentation without it:
 
.. 
code:: sh - pip install -r requirements_base.txt + pip install sphinx 'sphinx_rtd_theme>=0.5' export C_API=NO || set C_API=NO make html diff --git a/docs/_static/js/script.js b/docs/_static/js/script.js index f3e59258f53b..d6f5b4125057 100644 --- a/docs/_static/js/script.js +++ b/docs/_static/js/script.js @@ -2,6 +2,12 @@ $(function() { /* Use wider container for the page content */ $('.wy-nav-content').each(function() { this.style.setProperty('max-width', 'none', 'important'); }); + /* List each class property item on a new line + https://github.com/microsoft/LightGBM/issues/5073 */ + if(window.location.pathname.toLocaleLowerCase().indexOf('pythonapi') != -1) { + $('.py.property').each(function() { this.style.setProperty('display', 'inline', 'important'); }); + } + /* Point to the same version of R API as the current docs version */ var current_version_elems = $('.rst-current-version'); if(current_version_elems.length !== 0) { @@ -56,9 +62,9 @@ $(function() { /* Initialize artifacts badge */ modifyBadge('./_static/images/artifacts-fetching.svg', '#'); /* Fetch latest buildId and construct artifacts badge */ - $.getJSON('https://dev.azure.com/lightgbm-ci/lightgbm-ci/_apis/build/builds?branchName=refs/heads/master&resultFilter=succeeded&queryOrder=finishTimeDescending&%24top=1&api-version=5.0-preview.5', function(data) { + $.getJSON('https://dev.azure.com/lightgbm-ci/lightgbm-ci/_apis/build/builds?branchName=refs/heads/master&resultFilter=succeeded&queryOrder=finishTimeDescending&%24top=1&api-version=7.1-preview.7', function(data) { modifyBadge('./_static/images/artifacts-download.svg', - 'https://dev.azure.com/lightgbm-ci/lightgbm-ci/_apis/build/builds/' + data['value'][0]['id'] + '/artifacts?artifactName=PackageAssets&api-version=5.0-preview.5&%24format=zip'); + 'https://dev.azure.com/lightgbm-ci/lightgbm-ci/_apis/build/builds/' + data['value'][0]['id'] + '/artifacts?artifactName=PackageAssets&api-version=7.1-preview.5&%24format=zip'); }); } }); diff --git a/docs/build-docs.sh b/docs/build-docs.sh new file mode 100644 index 000000000000..689a30df9962 --- /dev/null +++ b/docs/build-docs.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +rm -f ./_FIRST_RUN.flag + +export PATH="${CONDA}/bin:${PATH}" + +curl \ + -sL \ + -o ${HOME}/miniforge.sh \ + https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh + +/bin/bash ${HOME}/miniforge.sh -b -p ${CONDA} + +conda config --set always_yes yes --set changeps1 no +conda update -q -y conda + +conda env create \ + --name docs-env \ + --file env.yml || exit -1 + +source activate docs-env +make clean html || exit -1 + +echo "Done building docs. Open docs/_build/html/index.html in a web browser to view them." diff --git a/docs/conf.py b/docs/conf.py index 0c2a5d183c82..c97bf7c7fd62 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -103,8 +103,11 @@ def run(self) -> List: 'pandas', 'scipy', 'scipy.sparse', - 'sklearn' ] +try: + import sklearn +except ImportError: + autodoc_mock_imports.append('sklearn') # hide type hints in API docs autodoc_typehints = "none" @@ -147,7 +150,7 @@ def run(self) -> List: # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. -language = None +language = 'en' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. @@ -259,21 +262,6 @@ def generate_r_docs(app: Sphinx) -> None: The application object representing the Sphinx process. 
""" commands = f""" - /home/docs/.conda/bin/conda create \ - -q \ - -y \ - -c conda-forge \ - --override-channels \ - -n r_env \ - r-base=4.1.0=hb67fd72_2 \ - r-data.table=1.14.0=r41hcfec24a_0 \ - r-jsonlite=1.7.2=r41hcfec24a_0 \ - r-knitr=1.35=r41hc72bb7e_0 \ - r-matrix=1.3_4=r41he454529_0 \ - r-pkgdown=1.6.1=r41hc72bb7e_0 \ - r-rmarkdown=2.11=r41hc72bb7e_0 \ - r-roxygen2=7.1.1=r41h03ef668_0 - source /home/docs/.conda/bin/activate r_env export TAR=/bin/tar cd {CURR_PATH.parent} export R_LIBS="$CONDA_PREFIX/lib/R/library" @@ -298,6 +286,7 @@ def generate_r_docs(app: Sphinx) -> None: cd {CURR_PATH.parent} """ try: + print("Building R-package documentation") # Warning! The following code can cause buffer overflows on RTD. # Consider suppressing output completely if RTD project silently fails. # Refer to https://github.com/svenevs/exhale @@ -311,6 +300,7 @@ def generate_r_docs(app: Sphinx) -> None: raise RuntimeError(output) else: print(output) + print("Done building R-package documentation") except BaseException as e: raise Exception(f"An error has occurred while generating documentation for R-package\n{e}") diff --git a/docs/env.yml b/docs/env.yml new file mode 100644 index 000000000000..4a6afbc097fe --- /dev/null +++ b/docs/env.yml @@ -0,0 +1,18 @@ +name: docs-env +channels: + - nodefaults + - conda-forge +dependencies: + - breathe + - python=3.9 + - r-base=4.1.3 + - r-data.table=1.14.2 + - r-jsonlite=1.7.2 + - r-knitr=1.37 + - r-matrix=1.4_0 + - r-pkgdown=1.6.1 + - r-rmarkdown=2.11 + - r-roxygen2=7.2.0 + - scikit-learn + - sphinx + - "sphinx_rtd_theme>=0.5" diff --git a/docs/requirements.txt b/docs/requirements.txt deleted file mode 100644 index 17896e0c7283..000000000000 --- a/docs/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ --r requirements_base.txt -breathe diff --git a/docs/requirements_base.txt b/docs/requirements_base.txt deleted file mode 100644 index baebc41b5e1c..000000000000 --- a/docs/requirements_base.txt +++ /dev/null @@ -1,2 +0,0 @@ -sphinx -sphinx_rtd_theme >= 0.5 diff --git a/examples/python-guide/advanced_example.py b/examples/python-guide/advanced_example.py index 54b62cdb1563..6c2a42ce2bf6 100644 --- a/examples/python-guide/advanced_example.py +++ b/examples/python-guide/advanced_example.py @@ -1,4 +1,5 @@ # coding: utf-8 +import copy import json import pickle from pathlib import Path @@ -159,11 +160,14 @@ def binary_error(preds, train_data): return 'error', np.mean(labels != (preds > 0.5)), False -gbm = lgb.train(params, +# Pass custom objective function through params +params_custom_obj = copy.deepcopy(params) +params_custom_obj['objective'] = loglikelihood + +gbm = lgb.train(params_custom_obj, lgb_train, num_boost_round=10, init_model=gbm, - fobj=loglikelihood, feval=binary_error, valid_sets=lgb_eval) @@ -183,11 +187,14 @@ def accuracy(preds, train_data): return 'accuracy', np.mean(labels == (preds > 0.5)), True -gbm = lgb.train(params, +# Pass custom objective function through params +params_custom_obj = copy.deepcopy(params) +params_custom_obj['objective'] = loglikelihood + +gbm = lgb.train(params_custom_obj, lgb_train, num_boost_round=10, init_model=gbm, - fobj=loglikelihood, feval=[binary_error, accuracy], valid_sets=lgb_eval) diff --git a/examples/python-guide/notebooks/interactive_plot_example.ipynb b/examples/python-guide/notebooks/interactive_plot_example.ipynb index ac7d85550928..3090f4a6531f 100644 --- a/examples/python-guide/notebooks/interactive_plot_example.ipynb +++ b/examples/python-guide/notebooks/interactive_plot_example.ipynb @@ -148,8 +148,10 @@ 
" valid_sets=[lgb_train, lgb_test],\n", " feature_name=[f'f{i + 1}' for i in range(X_train.shape[-1])],\n", " categorical_feature=[21],\n", - " evals_result=evals_result,\n", - " callbacks=[lgb.log_evaluation(10)])" + " callbacks=[\n", + " lgb.log_evaluation(10),\n", + " lgb.record_evaluation(evals_result)\n", + " ])" ] }, { diff --git a/examples/python-guide/plot_example.py b/examples/python-guide/plot_example.py index de70565e1e72..d85fcaa411a1 100644 --- a/examples/python-guide/plot_example.py +++ b/examples/python-guide/plot_example.py @@ -36,14 +36,18 @@ print('Starting training...') # train -gbm = lgb.train(params, - lgb_train, - num_boost_round=100, - valid_sets=[lgb_train, lgb_test], - feature_name=[f'f{i + 1}' for i in range(X_train.shape[-1])], - categorical_feature=[21], - evals_result=evals_result, - callbacks=[lgb.log_evaluation(10)]) +gbm = lgb.train( + params, + lgb_train, + num_boost_round=100, + valid_sets=[lgb_train, lgb_test], + feature_name=[f'f{i + 1}' for i in range(X_train.shape[-1])], + categorical_feature=[21], + callbacks=[ + lgb.log_evaluation(10), + lgb.record_evaluation(evals_result) + ] +) print('Plotting metrics recorded during training...') ax = lgb.plot_metric(evals_result, metric='l1') diff --git a/examples/python-guide/simple_example.py b/examples/python-guide/simple_example.py index 48af051903db..79c4f70938bc 100644 --- a/examples/python-guide/simple_example.py +++ b/examples/python-guide/simple_example.py @@ -40,7 +40,7 @@ lgb_train, num_boost_round=20, valid_sets=lgb_eval, - early_stopping_rounds=5) + callbacks=[lgb.early_stopping(stopping_rounds=5)]) print('Saving model...') # save model to file diff --git a/external_libs/compute b/external_libs/compute index 36c89134d401..36350b7de849 160000 --- a/external_libs/compute +++ b/external_libs/compute @@ -1 +1 @@ -Subproject commit 36c89134d4013b2e5e45bc55656a18bd6141995a +Subproject commit 36350b7de849300bd3d72a05d8bf890ca405a014 diff --git a/external_libs/eigen b/external_libs/eigen index 8ba1b0f41a79..3147391d946b 160000 --- a/external_libs/eigen +++ b/external_libs/eigen @@ -1 +1 @@ -Subproject commit 8ba1b0f41a7950dc3e1d4ed75859e36c73311235 +Subproject commit 3147391d946bb4b6c68edd901f2add6ac1f31f8c diff --git a/external_libs/fmt b/external_libs/fmt index cc09f1a6798c..b6f4ceaed0a0 160000 --- a/external_libs/fmt +++ b/external_libs/fmt @@ -1 +1 @@ -Subproject commit cc09f1a6798c085c325569ef466bcdcffdc266d4 +Subproject commit b6f4ceaed0a0a24ccf575fab6c56dd50ccf6f1a9 diff --git a/helpers/parameter_generator.py b/helpers/parameter_generator.py index 4932a1a07316..abc770bc8e43 100644 --- a/helpers/parameter_generator.py +++ b/helpers/parameter_generator.py @@ -34,6 +34,8 @@ def get_parameter_infos( member_infos: List[List[Dict[str, List]]] = [] with open(config_hpp) as config_hpp_file: for line in config_hpp_file: + if line.strip() in {"#ifndef __NVCC__", "#endif // __NVCC__"}: + continue if "#pragma region Parameters" in line: is_inparameter = True elif "#pragma region" in line and "Parameters" in line: diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index 3192d9eed478..66be49a911e4 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -119,6 +119,23 @@ class BinMapper { } } + /*! 
+ * \brief Maximum categorical value
+ * \return Maximum categorical value for categorical features, 0 for numerical features
+ */
+  inline int MaxCatValue() const {
+    if (bin_2_categorical_.size() == 0) {
+      return 0;
+    }
+    int max_cat_value = bin_2_categorical_[0];
+    for (size_t i = 1; i < bin_2_categorical_.size(); ++i) {
+      if (bin_2_categorical_[i] > max_cat_value) {
+        max_cat_value = bin_2_categorical_[i];
+      }
+    }
+    return max_cat_value;
+  }
+
 /*!
  * \brief Get sizes in byte of this object
  */
@@ -379,6 +396,10 @@ class Bin {
   * \brief Deep copy the bin
   */
  virtual Bin* Clone() = 0;
+
+  virtual const void* GetColWiseData(uint8_t* bit_type, bool* is_sparse, std::vector<BinIterator*>* bin_iterator, const int num_threads) const = 0;
+
+  virtual const void* GetColWiseData(uint8_t* bit_type, bool* is_sparse, BinIterator** bin_iterator) const = 0;
 };
 
@@ -452,6 +473,14 @@ class MultiValBin {
  static constexpr double multi_val_bin_sparse_threshold = 0.25f;
 
  virtual MultiValBin* Clone() = 0;
+
+  #ifdef USE_CUDA_EXP
+  virtual const void* GetRowWiseData(uint8_t* bit_type,
+    size_t* total_size,
+    bool* is_sparse,
+    const void** out_data_ptr,
+    uint8_t* data_ptr_bit_type) const = 0;
+  #endif  // USE_CUDA_EXP
 };
 
 inline uint32_t BinMapper::ValueToBin(double value) const {
diff --git a/include/LightGBM/c_api.h b/include/LightGBM/c_api.h
index e0df38eeb82e..8e4d8d4d8602 100644
--- a/include/LightGBM/c_api.h
+++ b/include/LightGBM/c_api.h
@@ -118,7 +118,8 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromFile(const char* filename,
 * \param ncol Number of columns
 * \param num_per_col Size of each sampling column
 * \param num_sample_row Number of sampled rows
- * \param num_total_row Number of total rows
+ * \param num_local_row Total number of rows local to this machine
+ * \param num_dist_row Total number of distributed rows
 * \param parameters Additional parameters
 * \param[out] out Created dataset
 * \return 0 when succeed, -1 when failure happens
@@ -128,7 +129,8 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromSampledColumn(double** sample_data,
                                                           int32_t ncol,
                                                           const int* num_per_col,
                                                           int32_t num_sample_row,
-                                                          int32_t num_total_row,
+                                                          int32_t num_local_row,
+                                                          int64_t num_dist_row,
                                                           const char* parameters,
                                                           DatasetHandle* out);
 
@@ -432,6 +434,17 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetGetNumData(DatasetHandle handle,
 
LIGHTGBM_C_EXPORT int LGBM_DatasetGetNumFeature(DatasetHandle handle,
                                                int* out);
 
+/*!
+ * \brief Get the number of bins for a feature.
+ * \param handle Handle of dataset
+ * \param feature Index of the feature
+ * \param[out] out The address to hold the number of bins
+ * \return 0 when succeed, -1 when failure happens
+ */
+LIGHTGBM_C_EXPORT int LGBM_DatasetGetFeatureNumBin(DatasetHandle handle,
+                                                   int feature,
+                                                   int* out);
+
 /*!
 * \brief Add features from ``source`` to ``target``.
 * \param target The handle of the dataset to add features to
@@ -572,6 +585,9 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterRefit(BoosterHandle handle,
 
 /*!
 * \brief Update the model by specifying gradient and Hessian directly
 *        (this can be used to support customized loss functions).
+ * \note
+ * The length of the arrays referenced by ``grad`` and ``hess`` must be equal to
+ * ``num_class * num_train_data``; this is not verified by the library, so the caller must ensure it.
 * \param handle Handle of booster
 * \param grad The first order derivative (gradient) statistics
 * \param hess The second order derivative (Hessian) statistics
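On the Python side, the same length requirement applies to custom objective functions: the returned gradient and Hessian arrays must have ``num_class * num_train_data`` entries. As a sketch, this is the shape of the ``loglikelihood`` objective used in ``examples/python-guide/advanced_example.py`` (a binary task, so ``num_class`` is effectively 1):

.. code:: python

    import numpy as np

    def loglikelihood(preds, train_data):
        labels = train_data.get_label()
        preds = 1.0 / (1.0 + np.exp(-preds))  # raw scores -> probabilities
        grad = preds - labels                 # len(grad) == num_train_data
        hess = preds * (1.0 - preds)          # len(hess) == num_train_data
        return grad, hess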
@@ -664,6 +680,17 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterGetFeatureNames(BoosterHandle handle,
                                                  size_t* out_buffer_len,
                                                  char** out_strs);
 
+/*!
+ * \brief Check that the feature names of the data match the ones used to train the booster.
+ * \param handle Handle of booster
+ * \param data_names Array with the feature names in the data
+ * \param data_num_features Number of features in the data
+ * \return 0 when succeed, -1 when failure happens
+ */
+LIGHTGBM_C_EXPORT int LGBM_BoosterValidateFeatureNames(BoosterHandle handle,
+                                                       const char** data_names,
+                                                       int data_num_features);
+
 /*!
 * \brief Get number of features.
 * \param handle Handle of booster
@@ -827,7 +854,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForCSR(BoosterHandle handle,
 * \param indices Pointer to column indices for CSR or row indices for CSC
 * \param data Pointer to the data space
 * \param data_type Type of ``data`` pointer, can be ``C_API_DTYPE_FLOAT32`` or ``C_API_DTYPE_FLOAT64``
- * \param nindptr Number of rows in the matrix + 1
+ * \param nindptr Number of entries in ``indptr``
 * \param nelem Number of nonzero elements in the matrix
 * \param num_col_or_row Number of columns for CSR or number of rows for CSC
 * \param predict_type What should be predicted, only feature contributions supported currently
@@ -836,7 +863,7 @@
 * \param num_iteration Number of iterations for prediction, <= 0 means no limit
 * \param parameter Other parameters for prediction, e.g. early stopping for prediction
 * \param matrix_type Type of matrix input and output, can be ``C_API_MATRIX_TYPE_CSR`` or ``C_API_MATRIX_TYPE_CSC``
- * \param[out] out_len Length of output indices and data
+ * \param[out] out_len Length of output data and output indptr (pointer to an array with two entries in which to write them)
 * \param[out] out_indptr Pointer to output row headers for CSR or column headers for CSC
 * \param[out] out_indices Pointer to sparse column indices for CSR or row indices for CSC
 * \param[out] out_data Pointer to sparse data space
diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h
index 21e0fa185f6c..00677a675054 100644
--- a/include/LightGBM/config.h
+++ b/include/LightGBM/config.h
@@ -81,9 +81,11 @@ struct Config {
  static void KV2Map(std::unordered_map<std::string, std::string>* params, const char* kv);
  static std::unordered_map<std::string, std::string> Str2Map(const char* parameters);
 
+  #ifndef __NVCC__
  #pragma region Parameters
 
  #pragma region Core Parameters
+  #endif  // __NVCC__
 
  // [no-save]
  // [doc-only]
@@ -204,12 +206,14 @@ struct Config {
  // [doc-only]
  // type = enum
-  // options = cpu, gpu, cuda
+  // options = cpu, gpu, cuda, cuda_exp
  // alias = device
  // desc = device for the tree learning, you can use GPU to achieve faster learning
  // desc = **Note**: it is recommended to use the smaller ``max_bin`` (e.g. 63) to get a better speedup
  // desc = **Note**: for faster speed, GPU uses 32-bit floating point to sum up by default, so this may affect the accuracy for some tasks. You can set ``gpu_use_dp=true`` to enable 64-bit floating point, but it will slow down the training
  // desc = **Note**: refer to `Installation Guide <./Installation-Guide.rst#build-gpu-version>`__ to build LightGBM with GPU support
+  // desc = **Note**: ``cuda_exp`` is an experimental CUDA version, the installation guide for ``cuda_exp`` is identical to that of ``cuda``
+  // desc = **Note**: ``cuda_exp`` is faster than ``cuda`` and will replace ``cuda`` in the future
  std::string device_type = "cpu";
 
  // [doc-only]
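Since ``device_type`` ultimately reaches users as a plain training parameter, a short illustrative sketch may help (values are examples only; any of the options listed above can be substituted):

.. code:: python

    params = {
        "objective": "regression",
        "device_type": "cuda_exp",  # experimental CUDA version; "cpu", "gpu" and "cuda" also work
        "max_bin": 63,              # smaller max_bin is recommended for better GPU speed
    }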
@@ -228,9 +232,11 @@ struct Config {
  // desc = **Note**: to avoid potential instability due to numerical issues, please set ``force_col_wise=true`` or ``force_row_wise=true`` when setting ``deterministic=true``
  bool deterministic = false;
 
+  #ifndef __NVCC__
  #pragma endregion
 
  #pragma region Learning Control Parameters
+  #endif  // __NVCC__
 
  // desc = used only with ``cpu`` device type
  // desc = set this to ``true`` to force col-wise histogram building
@@ -525,7 +531,7 @@ struct Config {
  // desc = if set to zero, no smoothing is applied
  // desc = if ``path_smooth > 0`` then ``min_data_in_leaf`` must be at least ``2``
  // desc = larger values give stronger regularization
-  // descl2 = the weight of each node is ``(n / path_smooth) * w + w_p / (n / path_smooth + 1)``, where ``n`` is the number of samples in the node, ``w`` is the optimal node weight to minimise the loss (approximately ``-sum_gradients / sum_hessians``), and ``w_p`` is the weight of the parent node
+  // descl2 = the weight of each node is ``w * (n / path_smooth) / (n / path_smooth + 1) + w_p / (n / path_smooth + 1)``, where ``n`` is the number of samples in the node, ``w`` is the optimal node weight to minimise the loss (approximately ``-sum_gradients / sum_hessians``), and ``w_p`` is the weight of the parent node
  // descl2 = note that the parent output ``w_p`` itself has smoothing applied, unless it is the root node, so that the smoothing effect accumulates with the tree depth
  double path_smooth = 0;
 
@@ -568,16 +574,19 @@ struct Config {
  // desc = **Note**: can be used only in CLI version
  int snapshot_freq = -1;
 
+  #ifndef __NVCC__
  #pragma endregion
 
  #pragma region IO Parameters
 
  #pragma region Dataset Parameters
+  #endif  // __NVCC__
 
  // alias = linear_trees
  // desc = fit piecewise linear gradient boosting tree
  // descl2 = tree splits are chosen in the usual way, but the model at each leaf is linear instead of constant
  // descl2 = the linear model at each leaf includes all the numerical features in that leaf's branch
+  // descl2 = the first tree has constant leaf values
  // descl2 = categorical features are used for splits as normal but are not used in the linear models
  // descl2 = missing values should not be encoded as ``0``. Use ``np.nan`` for Python, ``NA`` for the CLI, and ``NA``, ``NA_real_``, or ``NA_integer_`` for R
  // descl2 = it is recommended to rescale data before training so that features have similar mean and standard deviation
@@ -670,6 +679,7 @@ struct Config {
  // desc = add a prefix ``name:`` for column name, e.g. ``weight=name:weight``
  // desc = **Note**: works only in case of loading data directly from text file
  // desc = **Note**: index starts from ``0`` and it doesn't count the label column when passing type is ``int``, e.g. 
when label is column\_0, and weight is column\_1, the correct parameter is ``weight=0`` + // desc = **Note**: weights should be non-negative std::string weight_column = ""; // type = int or string @@ -697,12 +707,13 @@ struct Config { // desc = used to specify categorical features // desc = use number for index, e.g. ``categorical_feature=0,1,2`` means column\_0, column\_1 and column\_2 are categorical features // desc = add a prefix ``name:`` for column name, e.g. ``categorical_feature=name:c1,c2,c3`` means c1, c2 and c3 are categorical features - // desc = **Note**: only supports categorical with ``int`` type (not applicable for data represented as pandas DataFrame in Python-package) + // desc = **Note**: all values will be cast to ``int32`` (integer codes will be extracted from pandas categoricals in the Python-package) // desc = **Note**: index starts from ``0`` and it doesn't count the label column when passing type is ``int`` // desc = **Note**: all values should be less than ``Int32.MaxValue`` (2147483647) // desc = **Note**: using large values could be memory consuming. Tree decision rule works best when categorical features are presented by consecutive integers starting from zero // desc = **Note**: all negative values will be treated as **missing values** // desc = **Note**: the output cannot be monotonically constrained with respect to a categorical feature + // desc = **Note**: floating point numbers in categorical features will be rounded towards 0 std::string categorical_feature = ""; // desc = path to a ``.json`` file that specifies bin upper bounds for some or all features @@ -726,9 +737,11 @@ struct Config { // desc = **Note**: ``lightgbm-transform`` is not maintained by LightGBM's maintainers. Bug reports or feature requests should go to `issues page `__ std::string parser_config_file = ""; + #ifndef __NVCC__ #pragma endregion #pragma region Predict Parameters + #endif // __NVCC__ // [no-save] // desc = used only in ``prediction`` task @@ -798,9 +811,11 @@ struct Config { // desc = **Note**: can be used only in CLI version std::string output_result = "LightGBM_predict_result.txt"; + #ifndef __NVCC__ #pragma endregion #pragma region Convert Parameters + #endif // __NVCC__ // [no-save] // desc = used only in ``convert_model`` task @@ -816,11 +831,13 @@ struct Config { // desc = **Note**: can be used only in CLI version std::string convert_model = "gbdt_prediction.cpp"; + #ifndef __NVCC__ #pragma endregion #pragma endregion #pragma region Objective Parameters + #endif // __NVCC__ // desc = used only in ``rank_xendcg`` objective // desc = random seed for objectives, if random process is needed @@ -908,9 +925,11 @@ struct Config { // desc = used only in ``lambdarank`` application where ``lambdarank_unbiased = true`` double lambdarank_bias_p_norm = 0.5; + #ifndef __NVCC__ #pragma endregion #pragma region Metric Parameters + #endif // __NVCC__ // [doc-only] // alias = metrics, metric_types @@ -982,9 +1001,11 @@ struct Config { // desc = if not specified, will use equal weights for all classes std::vector auc_mu_weights; + #ifndef __NVCC__ #pragma endregion #pragma region Network Parameters + #endif // __NVCC__ // check = >0 // alias = num_machine @@ -1013,9 +1034,11 @@ struct Config { // desc = list of machines in the following format: ``ip1:port1,ip2:port2`` std::string machines = ""; + #ifndef __NVCC__ #pragma endregion #pragma region GPU Parameters + #endif // __NVCC__ // desc = OpenCL platform ID. 
Usually each GPU vendor exposes one OpenCL platform
  // desc = ``-1`` means the system-wide default platform
@@ -1036,9 +1059,11 @@ struct Config {
  // desc = **Note**: can be used only in CUDA implementation
  int num_gpu = 1;
 
+  #ifndef __NVCC__
  #pragma endregion
 
  #pragma endregion
+  #endif  // __NVCC__
 
  size_t file_load_progress_interval_bytes = size_t(10) * 1024 * 1024 * 1024;
diff --git a/include/LightGBM/cuda/cuda_algorithms.hpp b/include/LightGBM/cuda/cuda_algorithms.hpp
new file mode 100644
index 000000000000..a4e91eb9bf38
--- /dev/null
+++ b/include/LightGBM/cuda/cuda_algorithms.hpp
@@ -0,0 +1,390 @@
+/*!
+ * Copyright (c) 2021 Microsoft Corporation. All rights reserved.
+ * Licensed under the MIT License. See LICENSE file in the project root for license information.
+ */
+
+#ifndef LIGHTGBM_CUDA_CUDA_ALGORITHMS_HPP_
+#define LIGHTGBM_CUDA_CUDA_ALGORITHMS_HPP_
+
+#ifdef USE_CUDA_EXP
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+
+#define NUM_BANKS_DATA_PARTITION (16)
+#define LOG_NUM_BANKS_DATA_PARTITION (4)
+#define GLOBAL_PREFIX_SUM_BLOCK_SIZE (1024)
+
+#define BITONIC_SORT_NUM_ELEMENTS (1024)
+#define BITONIC_SORT_DEPTH (11)
+#define BITONIC_SORT_QUERY_ITEM_BLOCK_SIZE (10)
+
+#define CONFLICT_FREE_INDEX(n) \
+  ((n) + ((n) >> LOG_NUM_BANKS_DATA_PARTITION)) \
+
+namespace LightGBM {
+
+template <typename T>
+__device__ __forceinline__ T ShufflePrefixSum(T value, T* shared_mem_buffer) {
+  const uint32_t mask = 0xffffffff;
+  const uint32_t warpLane = threadIdx.x % warpSize;
+  const uint32_t warpID = threadIdx.x / warpSize;
+  const uint32_t num_warp = blockDim.x / warpSize;
+  for (uint32_t offset = 1; offset < warpSize; offset <<= 1) {
+    const T other_value = __shfl_up_sync(mask, value, offset);
+    if (warpLane >= offset) {
+      value += other_value;
+    }
+  }
+  if (warpLane == warpSize - 1) {
+    shared_mem_buffer[warpID] = value;
+  }
+  __syncthreads();
+  if (warpID == 0) {
+    T warp_sum = (warpLane < num_warp ? shared_mem_buffer[warpLane] : 0);
+    for (uint32_t offset = 1; offset < warpSize; offset <<= 1) {
+      const T other_warp_sum = __shfl_up_sync(mask, warp_sum, offset);
+      if (warpLane >= offset) {
+        warp_sum += other_warp_sum;
+      }
+    }
+    shared_mem_buffer[warpLane] = warp_sum;
+  }
+  __syncthreads();
+  const T warp_base = warpID == 0 ? 0 : shared_mem_buffer[warpID - 1];
+  return warp_base + value;
+}
+
+template <typename T>
+__device__ __forceinline__ T ShufflePrefixSumExclusive(T value, T* shared_mem_buffer) {
+  const uint32_t mask = 0xffffffff;
+  const uint32_t warpLane = threadIdx.x % warpSize;
+  const uint32_t warpID = threadIdx.x / warpSize;
+  const uint32_t num_warp = blockDim.x / warpSize;
+  for (uint32_t offset = 1; offset < warpSize; offset <<= 1) {
+    const T other_value = __shfl_up_sync(mask, value, offset);
+    if (warpLane >= offset) {
+      value += other_value;
+    }
+  }
+  if (warpLane == warpSize - 1) {
+    shared_mem_buffer[warpID] = value;
+  }
+  __syncthreads();
+  if (warpID == 0) {
+    T warp_sum = (warpLane < num_warp ? shared_mem_buffer[warpLane] : 0);
+    for (uint32_t offset = 1; offset < warpSize; offset <<= 1) {
+      const T other_warp_sum = __shfl_up_sync(mask, warp_sum, offset);
+      if (warpLane >= offset) {
+        warp_sum += other_warp_sum;
+      }
+    }
+    shared_mem_buffer[warpLane] = warp_sum;
+  }
+  __syncthreads();
+  const T warp_base = warpID == 0 ? 0 : shared_mem_buffer[warpID - 1];
+  const T inclusive_result = warp_base + value;
+  if (threadIdx.x % warpSize == warpSize - 1) {
+    shared_mem_buffer[warpLane] = inclusive_result;
+  }
+  __syncthreads();
+  T exclusive_result = __shfl_up_sync(mask, inclusive_result, 1);
+  if (threadIdx.x == 0) {
+    exclusive_result = 0;
+  } else if (threadIdx.x % warpSize == 0) {
+    exclusive_result = shared_mem_buffer[warpLane - 1];
+  }
+  return exclusive_result;
+}
+
+template <typename T>
+void ShufflePrefixSumGlobal(T* values, size_t len, T* block_prefix_sum_buffer);
+
+template <typename T>
+__device__ __forceinline__ T ShuffleReduceSumWarp(T value, const data_size_t len) {
+  if (len > 0) {
+    const uint32_t mask = 0xffffffff;
+    for (int offset = warpSize / 2; offset > 0; offset >>= 1) {
+      value += __shfl_down_sync(mask, value, offset);
+    }
+  }
+  return value;
+}
+
+// reduce values from a 1-dimensional block (block size must be no greater than 1024)
+template <typename T>
+__device__ __forceinline__ T ShuffleReduceSum(T value, T* shared_mem_buffer, const size_t len) {
+  const uint32_t warpLane = threadIdx.x % warpSize;
+  const uint32_t warpID = threadIdx.x / warpSize;
+  const data_size_t warp_len = min(static_cast<data_size_t>(warpSize), static_cast<data_size_t>(len) - static_cast<data_size_t>(warpID * warpSize));
+  value = ShuffleReduceSumWarp(value, warp_len);
+  if (warpLane == 0) {
+    shared_mem_buffer[warpID] = value;
+  }
+  __syncthreads();
+  const data_size_t num_warp = static_cast<data_size_t>((len + warpSize - 1) / warpSize);
+  if (warpID == 0) {
+    value = (warpLane < num_warp ? shared_mem_buffer[warpLane] : 0);
+    value = ShuffleReduceSumWarp(value, num_warp);
+  }
+  return value;
+}
+
+template <typename T>
+__device__ __forceinline__ T ShuffleReduceMaxWarp(T value, const data_size_t len) {
+  if (len > 0) {
+    const uint32_t mask = 0xffffffff;
+    for (int offset = warpSize / 2; offset > 0; offset >>= 1) {
+      value = max(value, __shfl_down_sync(mask, value, offset));
+    }
+  }
+  return value;
+}
+
+// reduce values from a 1-dimensional block (block size must be no greater than 1024)
+template <typename T>
+__device__ __forceinline__ T ShuffleReduceMax(T value, T* shared_mem_buffer, const size_t len) {
+  const uint32_t warpLane = threadIdx.x % warpSize;
+  const uint32_t warpID = threadIdx.x / warpSize;
+  const data_size_t warp_len = min(static_cast<data_size_t>(warpSize), static_cast<data_size_t>(len) - static_cast<data_size_t>(warpID * warpSize));
+  value = ShuffleReduceMaxWarp(value, warp_len);
+  if (warpLane == 0) {
+    shared_mem_buffer[warpID] = value;
+  }
+  __syncthreads();
+  const data_size_t num_warp = static_cast<data_size_t>((len + warpSize - 1) / warpSize);
+  if (warpID == 0) {
+    value = (warpLane < num_warp ? shared_mem_buffer[warpLane] : 0);
+    value = ShuffleReduceMaxWarp(value, num_warp);
+  }
+  return value;
+}
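+
+// How GlobalMemoryPrefixSum (below) computes an in-place prefix sum over an
+// array too large for shared memory: each thread first accumulates the sum of
+// its own contiguous slice of `array`; ShufflePrefixSumExclusive then converts
+// those per-thread sums into exclusive offsets; finally every thread adds its
+// offset to the first element of its slice and completes a sequential running
+// sum over the remaining elements of the slice.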
+
+// calculate prefix sum values within a 1-dimensional block in global memory, exclusively
+template <typename T>
+__device__ __forceinline__ void GlobalMemoryPrefixSum(T* array, const size_t len) {
+  const size_t num_values_per_thread = (len + blockDim.x - 1) / blockDim.x;
+  const size_t start = threadIdx.x * num_values_per_thread;
+  const size_t end = min(start + num_values_per_thread, len);
+  T thread_sum = 0;
+  for (size_t index = start; index < end; ++index) {
+    thread_sum += array[index];
+  }
+  __shared__ T shared_mem[32];
+  const T thread_base = ShufflePrefixSumExclusive(thread_sum, shared_mem);
+  if (start < end) {
+    array[start] += thread_base;
+  }
+  for (size_t index = start + 1; index < end; ++index) {
+    array[index] += array[index - 1];
+  }
+}
+
+template <typename VAL_T, typename INDEX_T, bool ASCENDING>
+__device__ __forceinline__ void BitonicArgSort_1024(const VAL_T* scores, INDEX_T* indices, const INDEX_T num_items) {
+  INDEX_T depth = 1;
+  INDEX_T num_items_aligned = 1;
+  INDEX_T num_items_ref = num_items - 1;
+  while (num_items_ref > 0) {
+    num_items_ref >>= 1;
+    num_items_aligned <<= 1;
+    ++depth;
+  }
+  for (INDEX_T outer_depth = depth - 1; outer_depth >= 1; --outer_depth) {
+    const INDEX_T outer_segment_length = 1 << (depth - outer_depth);
+    const INDEX_T outer_segment_index = threadIdx.x / outer_segment_length;
+    const bool ascending = ASCENDING ? (outer_segment_index % 2 == 0) : (outer_segment_index % 2 > 0);
+    for (INDEX_T inner_depth = outer_depth; inner_depth < depth; ++inner_depth) {
+      const INDEX_T segment_length = 1 << (depth - inner_depth);
+      const INDEX_T half_segment_length = segment_length >> 1;
+      const INDEX_T half_segment_index = threadIdx.x / half_segment_length;
+      if (threadIdx.x < num_items_aligned) {
+        if (half_segment_index % 2 == 0) {
+          const INDEX_T index_to_compare = threadIdx.x + half_segment_length;
+          if ((scores[indices[threadIdx.x]] > scores[indices[index_to_compare]]) == ascending) {
+            const INDEX_T index = indices[threadIdx.x];
+            indices[threadIdx.x] = indices[index_to_compare];
+            indices[index_to_compare] = index;
+          }
+        }
+      }
+      __syncthreads();
+    }
+  }
+}
+
+template <typename VAL_T, typename INDEX_T, bool ASCENDING, uint32_t BLOCK_DIM, uint32_t MAX_DEPTH>
+__device__ void BitonicArgSortDevice(const VAL_T* values, INDEX_T* indices, const int len) {
+  __shared__ VAL_T shared_values[BLOCK_DIM];
+  __shared__ INDEX_T shared_indices[BLOCK_DIM];
+  int len_to_shift = len - 1;
+  int max_depth = 1;
+  while (len_to_shift > 0) {
+    len_to_shift >>= 1;
+    ++max_depth;
+  }
+  const int num_blocks = (len + static_cast<int>(BLOCK_DIM) - 1) / static_cast<int>(BLOCK_DIM);
+  for (int block_index = 0; block_index < num_blocks; ++block_index) {
+    const int this_index = block_index * static_cast<int>(BLOCK_DIM) + static_cast<int>(threadIdx.x);
+    if (this_index < len) {
+      shared_values[threadIdx.x] = values[this_index];
+      shared_indices[threadIdx.x] = this_index;
+    } else {
+      shared_indices[threadIdx.x] = len;
+    }
+    __syncthreads();
+    for (int depth = max_depth - 1; depth > max_depth - static_cast<int>(MAX_DEPTH); --depth) {
+      const int segment_length = (1 << (max_depth - depth));
+      const int segment_index = this_index / segment_length;
+      const bool ascending = ASCENDING ? (segment_index % 2 == 0) : (segment_index % 2 == 1);
+      {
+        const int half_segment_length = (segment_length >> 1);
+        const int half_segment_index = this_index / half_segment_length;
+        const int num_total_segment = (len + segment_length - 1) / segment_length;
+        const int offset = (segment_index == num_total_segment - 1 && ascending == ASCENDING) ?
+          (num_total_segment * segment_length - len) : 0;
+        if (half_segment_index % 2 == 0) {
+          const int segment_start = segment_index * segment_length;
+          if (this_index >= offset + segment_start) {
+            const int other_index = static_cast<int>(threadIdx.x) + half_segment_length - offset;
+            const INDEX_T this_data_index = shared_indices[threadIdx.x];
+            const INDEX_T other_data_index = shared_indices[other_index];
+            const VAL_T this_value = shared_values[threadIdx.x];
+            const VAL_T other_value = shared_values[other_index];
+            if (other_data_index < len && (this_value > other_value) == ascending) {
+              shared_indices[threadIdx.x] = other_data_index;
+              shared_indices[other_index] = this_data_index;
+              shared_values[threadIdx.x] = other_value;
+              shared_values[other_index] = this_value;
+            }
+          }
+        }
+        __syncthreads();
+      }
+      for (int inner_depth = depth + 1; inner_depth < max_depth; ++inner_depth) {
+        const int half_segment_length = (1 << (max_depth - inner_depth - 1));
+        const int half_segment_index = this_index / half_segment_length;
+        if (half_segment_index % 2 == 0) {
+          const int other_index = static_cast<int>(threadIdx.x) + half_segment_length;
+          const INDEX_T this_data_index = shared_indices[threadIdx.x];
+          const INDEX_T other_data_index = shared_indices[other_index];
+          const VAL_T this_value = shared_values[threadIdx.x];
+          const VAL_T other_value = shared_values[other_index];
+          if (other_data_index < len && (this_value > other_value) == ascending) {
+            shared_indices[threadIdx.x] = other_data_index;
+            shared_indices[other_index] = this_data_index;
+            shared_values[threadIdx.x] = other_value;
+            shared_values[other_index] = this_value;
+          }
+        }
+        __syncthreads();
+      }
+    }
+    if (this_index < len) {
+      indices[this_index] = shared_indices[threadIdx.x];
+    }
+    __syncthreads();
+  }
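+
+  // The remaining merge depths have segments longer than one thread block, so
+  // the compare-and-swap steps below operate directly on `values`/`indices` in
+  // global memory; once the half-segment length fits inside a single block
+  // again, the data is staged back into shared memory for the final inner
+  // depths of each iteration.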
+  for (int depth = max_depth - static_cast<int>(MAX_DEPTH); depth >= 1; --depth) {
+    const int segment_length = (1 << (max_depth - depth));
+    {
+      const int num_total_segment = (len + segment_length - 1) / segment_length;
+      const int half_segment_length = (segment_length >> 1);
+      for (int block_index = 0; block_index < num_blocks; ++block_index) {
+        const int this_index = block_index * static_cast<int>(BLOCK_DIM) + static_cast<int>(threadIdx.x);
+        const int segment_index = this_index / segment_length;
+        const int half_segment_index = this_index / half_segment_length;
+        const bool ascending = ASCENDING ? (segment_index % 2 == 0) : (segment_index % 2 == 1);
+        const int offset = (segment_index == num_total_segment - 1 && ascending == ASCENDING) ?
+          (num_total_segment * segment_length - len) : 0;
+        if (half_segment_index % 2 == 0) {
+          const int segment_start = segment_index * segment_length;
+          if (this_index >= offset + segment_start) {
+            const int other_index = this_index + half_segment_length - offset;
+            if (other_index < len) {
+              const INDEX_T this_data_index = indices[this_index];
+              const INDEX_T other_data_index = indices[other_index];
+              const VAL_T this_value = values[this_data_index];
+              const VAL_T other_value = values[other_data_index];
+              if ((this_value > other_value) == ascending) {
+                indices[this_index] = other_data_index;
+                indices[other_index] = this_data_index;
+              }
+            }
+          }
+        }
+      }
+      __syncthreads();
+    }
+    for (int inner_depth = depth + 1; inner_depth <= max_depth - static_cast<int>(MAX_DEPTH); ++inner_depth) {
+      const int half_segment_length = (1 << (max_depth - inner_depth - 1));
+      for (int block_index = 0; block_index < num_blocks; ++block_index) {
+        const int this_index = block_index * static_cast<int>(BLOCK_DIM) + static_cast<int>(threadIdx.x);
+        const int segment_index = this_index / segment_length;
+        const int half_segment_index = this_index / half_segment_length;
+        const bool ascending = ASCENDING ? (segment_index % 2 == 0) : (segment_index % 2 == 1);
+        if (half_segment_index % 2 == 0) {
+          const int other_index = this_index + half_segment_length;
+          if (other_index < len) {
+            const INDEX_T this_data_index = indices[this_index];
+            const INDEX_T other_data_index = indices[other_index];
+            const VAL_T this_value = values[this_data_index];
+            const VAL_T other_value = values[other_data_index];
+            if ((this_value > other_value) == ascending) {
+              indices[this_index] = other_data_index;
+              indices[other_index] = this_data_index;
+            }
+          }
+        }
+        __syncthreads();
+      }
+    }
+    for (int block_index = 0; block_index < num_blocks; ++block_index) {
+      const int this_index = block_index * static_cast<int>(BLOCK_DIM) + static_cast<int>(threadIdx.x);
+      const int segment_index = this_index / segment_length;
+      const bool ascending = ASCENDING ? (segment_index % 2 == 0) : (segment_index % 2 == 1);
+      if (this_index < len) {
+        const INDEX_T index = indices[this_index];
+        shared_values[threadIdx.x] = values[index];
+        shared_indices[threadIdx.x] = index;
+      } else {
+        shared_indices[threadIdx.x] = len;
+      }
+      __syncthreads();
+      for (int inner_depth = max_depth - static_cast<int>(MAX_DEPTH) + 1; inner_depth < max_depth; ++inner_depth) {
+        const int half_segment_length = (1 << (max_depth - inner_depth - 1));
+        const int half_segment_index = this_index / half_segment_length;
+        if (half_segment_index % 2 == 0) {
+          const int other_index = static_cast<int>(threadIdx.x) + half_segment_length;
+          const INDEX_T this_data_index = shared_indices[threadIdx.x];
+          const INDEX_T other_data_index = shared_indices[other_index];
+          const VAL_T this_value = shared_values[threadIdx.x];
+          const VAL_T other_value = shared_values[other_index];
+          if (other_data_index < len && (this_value > other_value) == ascending) {
+            shared_indices[threadIdx.x] = other_data_index;
+            shared_indices[other_index] = this_data_index;
+            shared_values[threadIdx.x] = other_value;
+            shared_values[other_index] = this_value;
+          }
+        }
+        __syncthreads();
+      }
+      if (this_index < len) {
+        indices[this_index] = shared_indices[threadIdx.x];
+      }
+      __syncthreads();
+    }
+  }
+}
+
+}  // namespace LightGBM
+
+#endif  // USE_CUDA_EXP
+#endif  // LIGHTGBM_CUDA_CUDA_ALGORITHMS_HPP_
diff --git a/include/LightGBM/cuda/cuda_column_data.hpp b/include/LightGBM/cuda/cuda_column_data.hpp
new file mode 100644
index 000000000000..5438f0103abc
--- /dev/null
+++ b/include/LightGBM/cuda/cuda_column_data.hpp
@@ -0,0 +1,140 @@
+/*!
+ * Copyright (c) 2021 Microsoft Corporation. All rights reserved.
+ * Licensed under the MIT License. See LICENSE file in the project root for license information.
+ */ + +#ifdef USE_CUDA_EXP + +#ifndef LIGHTGBM_CUDA_COLUMN_DATA_HPP_ +#define LIGHTGBM_CUDA_COLUMN_DATA_HPP_ + +#include +#include +#include +#include + +#include + +namespace LightGBM { + +class CUDAColumnData { + public: + CUDAColumnData(const data_size_t num_data, const int gpu_device_id); + + ~CUDAColumnData(); + + void Init(const int num_columns, + const std::vector& column_data, + const std::vector& column_bin_iterator, + const std::vector& column_bit_type, + const std::vector& feature_max_bin, + const std::vector& feature_min_bin, + const std::vector& feature_offset, + const std::vector& feature_most_freq_bin, + const std::vector& feature_default_bin, + const std::vector& feature_missing_is_zero, + const std::vector& feature_missing_is_na, + const std::vector& feature_mfb_is_zero, + const std::vector& feature_mfb_is_na, + const std::vector& feature_to_column); + + const void* GetColumnData(const int column_index) const { return data_by_column_[column_index]; } + + void CopySubrow(const CUDAColumnData* full_set, const data_size_t* used_indices, const data_size_t num_used_indices); + + void* const* cuda_data_by_column() const { return cuda_data_by_column_; } + + uint32_t feature_min_bin(const int feature_index) const { return feature_min_bin_[feature_index]; } + + uint32_t feature_max_bin(const int feature_index) const { return feature_max_bin_[feature_index]; } + + uint32_t feature_offset(const int feature_index) const { return feature_offset_[feature_index]; } + + uint32_t feature_most_freq_bin(const int feature_index) const { return feature_most_freq_bin_[feature_index]; } + + uint32_t feature_default_bin(const int feature_index) const { return feature_default_bin_[feature_index]; } + + uint8_t feature_missing_is_zero(const int feature_index) const { return feature_missing_is_zero_[feature_index]; } + + uint8_t feature_missing_is_na(const int feature_index) const { return feature_missing_is_na_[feature_index]; } + + uint8_t feature_mfb_is_zero(const int feature_index) const { return feature_mfb_is_zero_[feature_index]; } + + uint8_t feature_mfb_is_na(const int feature_index) const { return feature_mfb_is_na_[feature_index]; } + + const uint32_t* cuda_feature_min_bin() const { return cuda_feature_min_bin_; } + + const uint32_t* cuda_feature_max_bin() const { return cuda_feature_max_bin_; } + + const uint32_t* cuda_feature_offset() const { return cuda_feature_offset_; } + + const uint32_t* cuda_feature_most_freq_bin() const { return cuda_feature_most_freq_bin_; } + + const uint32_t* cuda_feature_default_bin() const { return cuda_feature_default_bin_; } + + const uint8_t* cuda_feature_missing_is_zero() const { return cuda_feature_missing_is_zero_; } + + const uint8_t* cuda_feature_missing_is_na() const { return cuda_feature_missing_is_na_; } + + const uint8_t* cuda_feature_mfb_is_zero() const { return cuda_feature_mfb_is_zero_; } + + const uint8_t* cuda_feature_mfb_is_na() const { return cuda_feature_mfb_is_na_; } + + const int* cuda_feature_to_column() const { return cuda_feature_to_column_; } + + const uint8_t* cuda_column_bit_type() const { return cuda_column_bit_type_; } + + int feature_to_column(const int feature_index) const { return feature_to_column_[feature_index]; } + + uint8_t column_bit_type(const int column_index) const { return column_bit_type_[column_index]; } + + private: + template + void InitOneColumnData(const void* in_column_data, BinIterator* bin_iterator, void** out_column_data_pointer); + + void LaunchCopySubrowKernel(void* const* in_cuda_data_by_column); + + 
void InitColumnMetaInfo(); + + void ResizeWhenCopySubrow(const data_size_t num_used_indices); + + int num_threads_; + data_size_t num_data_; + int num_columns_; + std::vector column_bit_type_; + std::vector feature_min_bin_; + std::vector feature_max_bin_; + std::vector feature_offset_; + std::vector feature_most_freq_bin_; + std::vector feature_default_bin_; + std::vector feature_missing_is_zero_; + std::vector feature_missing_is_na_; + std::vector feature_mfb_is_zero_; + std::vector feature_mfb_is_na_; + void** cuda_data_by_column_; + std::vector feature_to_column_; + std::vector data_by_column_; + + uint8_t* cuda_column_bit_type_; + uint32_t* cuda_feature_min_bin_; + uint32_t* cuda_feature_max_bin_; + uint32_t* cuda_feature_offset_; + uint32_t* cuda_feature_most_freq_bin_; + uint32_t* cuda_feature_default_bin_; + uint8_t* cuda_feature_missing_is_zero_; + uint8_t* cuda_feature_missing_is_na_; + uint8_t* cuda_feature_mfb_is_zero_; + uint8_t* cuda_feature_mfb_is_na_; + int* cuda_feature_to_column_; + + // used when bagging with subset + data_size_t* cuda_used_indices_; + data_size_t num_used_indices_; + data_size_t cur_subset_buffer_size_; +}; + +} // namespace LightGBM + +#endif // LIGHTGBM_CUDA_COLUMN_DATA_HPP_ + +#endif // USE_CUDA_EXP diff --git a/include/LightGBM/cuda/cuda_metadata.hpp b/include/LightGBM/cuda/cuda_metadata.hpp new file mode 100644 index 000000000000..a72d03f02592 --- /dev/null +++ b/include/LightGBM/cuda/cuda_metadata.hpp @@ -0,0 +1,58 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. + */ + +#ifdef USE_CUDA_EXP + +#ifndef LIGHTGBM_CUDA_META_DATA_HPP_ +#define LIGHTGBM_CUDA_META_DATA_HPP_ + +#include +#include + +#include + +namespace LightGBM { + +class CUDAMetadata { + public: + explicit CUDAMetadata(const int gpu_device_id); + + ~CUDAMetadata(); + + void Init(const std::vector& label, + const std::vector& weight, + const std::vector& query_boundaries, + const std::vector& query_weights, + const std::vector& init_score); + + void SetLabel(const label_t* label, data_size_t len); + + void SetWeights(const label_t* weights, data_size_t len); + + void SetQuery(const data_size_t* query, const label_t* query_weights, data_size_t num_queries); + + void SetInitScore(const double* init_score, data_size_t len); + + const label_t* cuda_label() const { return cuda_label_; } + + const label_t* cuda_weights() const { return cuda_weights_; } + + const data_size_t* cuda_query_boundaries() const { return cuda_query_boundaries_; } + + const label_t* cuda_query_weights() const { return cuda_query_weights_; } + + private: + label_t* cuda_label_; + label_t* cuda_weights_; + data_size_t* cuda_query_boundaries_; + label_t* cuda_query_weights_; + double* cuda_init_score_; +}; + +} // namespace LightGBM + +#endif // LIGHTGBM_CUDA_META_DATA_HPP_ + +#endif // USE_CUDA_EXP diff --git a/include/LightGBM/cuda/cuda_random.hpp b/include/LightGBM/cuda/cuda_random.hpp new file mode 100644 index 000000000000..1f07d64452da --- /dev/null +++ b/include/LightGBM/cuda/cuda_random.hpp @@ -0,0 +1,74 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. + */ +#ifndef LIGHTGBM_CUDA_CUDA_RANDOM_HPP_ +#define LIGHTGBM_CUDA_CUDA_RANDOM_HPP_ + +#ifdef USE_CUDA_EXP + +#include +#include + +namespace LightGBM { + +/*! 
+* \brief A wrapper for random generator +*/ +class CUDARandom { + public: + /*! + * \brief Set specific seed + */ + __device__ void SetSeed(int seed) { + x = seed; + } + /*! + * \brief Generate random integer, int16 range. [0, 65536] + * \param lower_bound lower bound + * \param upper_bound upper bound + * \return The random integer between [lower_bound, upper_bound) + */ + __device__ inline int NextShort(int lower_bound, int upper_bound) { + return (RandInt16()) % (upper_bound - lower_bound) + lower_bound; + } + + /*! + * \brief Generate random integer, int32 range + * \param lower_bound lower bound + * \param upper_bound upper bound + * \return The random integer between [lower_bound, upper_bound) + */ + __device__ inline int NextInt(int lower_bound, int upper_bound) { + return (RandInt32()) % (upper_bound - lower_bound) + lower_bound; + } + + /*! + * \brief Generate random float data + * \return The random float between [0.0, 1.0) + */ + __device__ inline float NextFloat() { + // get random float in [0,1) + return static_cast(RandInt16()) / (32768.0f); + } + + private: + __device__ inline int RandInt16() { + x = (214013 * x + 2531011); + return static_cast((x >> 16) & 0x7FFF); + } + + __device__ inline int RandInt32() { + x = (214013 * x + 2531011); + return static_cast(x & 0x7FFFFFFF); + } + + unsigned int x = 123456789; +}; + + +} // namespace LightGBM + +#endif // USE_CUDA_EXP + +#endif // LIGHTGBM_CUDA_CUDA_RANDOM_HPP_ diff --git a/include/LightGBM/cuda/cuda_row_data.hpp b/include/LightGBM/cuda/cuda_row_data.hpp new file mode 100644 index 000000000000..8f5e2f8a0e03 --- /dev/null +++ b/include/LightGBM/cuda/cuda_row_data.hpp @@ -0,0 +1,179 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. 
+ */ + +#ifdef USE_CUDA_EXP + +#ifndef LIGHTGBM_CUDA_ROW_DATA_HPP_ +#define LIGHTGBM_CUDA_ROW_DATA_HPP_ + +#include +#include +#include +#include +#include +#include + +#include + +#define COPY_SUBROW_BLOCK_SIZE_ROW_DATA (1024) + +#if CUDART_VERSION == 10000 +#define DP_SHARED_HIST_SIZE (5560) +#else +#define DP_SHARED_HIST_SIZE (6144) +#endif +#define SP_SHARED_HIST_SIZE (DP_SHARED_HIST_SIZE * 2) + +namespace LightGBM { + +class CUDARowData { + public: + CUDARowData(const Dataset* train_data, + const TrainingShareStates* train_share_state, + const int gpu_device_id, + const bool gpu_use_dp); + + ~CUDARowData(); + + void Init(const Dataset* train_data, + TrainingShareStates* train_share_state); + + void CopySubrow(const CUDARowData* full_set, const data_size_t* used_indices, const data_size_t num_used_indices); + + void CopySubcol(const CUDARowData* full_set, const std::vector& is_feature_used, const Dataset* train_data); + + void CopySubrowAndSubcol(const CUDARowData* full_set, const data_size_t* used_indices, + const data_size_t num_used_indices, const std::vector& is_feature_used, const Dataset* train_data); + + template + const BIN_TYPE* GetBin() const; + + template + const PTR_TYPE* GetPartitionPtr() const; + + template + const PTR_TYPE* GetRowPtr() const; + + int NumLargeBinPartition() const { return static_cast(large_bin_partitions_.size()); } + + int num_feature_partitions() const { return num_feature_partitions_; } + + int max_num_column_per_partition() const { return max_num_column_per_partition_; } + + bool is_sparse() const { return is_sparse_; } + + uint8_t bit_type() const { return bit_type_; } + + uint8_t row_ptr_bit_type() const { return row_ptr_bit_type_; } + + const int* cuda_feature_partition_column_index_offsets() const { return cuda_feature_partition_column_index_offsets_; } + + const uint32_t* cuda_column_hist_offsets() const { return cuda_column_hist_offsets_; } + + const uint32_t* cuda_partition_hist_offsets() const { return cuda_partition_hist_offsets_; } + + int shared_hist_size() const { return shared_hist_size_; } + + private: + void DivideCUDAFeatureGroups(const Dataset* train_data, TrainingShareStates* share_state); + + template + void GetDenseDataPartitioned(const BIN_TYPE* row_wise_data, std::vector* partitioned_data); + + template + void GetSparseDataPartitioned(const BIN_TYPE* row_wise_data, + const ROW_PTR_TYPE* row_ptr, + std::vector>* partitioned_data, + std::vector>* partitioned_row_ptr, + std::vector* partition_ptr); + + template + void InitSparseData(const BIN_TYPE* host_data, + const ROW_PTR_TYPE* host_row_ptr, + BIN_TYPE** cuda_data, + ROW_PTR_TYPE** cuda_row_ptr, + ROW_PTR_TYPE** cuda_partition_ptr); + + /*! \brief number of threads to use */ + int num_threads_; + /*! \brief number of training data */ + data_size_t num_data_; + /*! \brief number of bins of all features */ + int num_total_bin_; + /*! \brief number of feature groups in dataset */ + int num_feature_group_; + /*! \brief number of features in dataset */ + int num_feature_; + /*! \brief number of bits used to store each bin value */ + uint8_t bit_type_; + /*! \brief number of bits used to store each row pointer value */ + uint8_t row_ptr_bit_type_; + /*! \brief is sparse row wise data */ + bool is_sparse_; + /*! \brief start column index of each feature partition */ + std::vector feature_partition_column_index_offsets_; + /*! \brief histogram offset of each column */ + std::vector column_hist_offsets_; + /*! 
\brief histogram offset of each partition */
+  std::vector<uint32_t> partition_hist_offsets_;
+  /*! \brief maximum number of columns among all feature partitions */
+  int max_num_column_per_partition_;
+  /*! \brief number of partitions */
+  int num_feature_partitions_;
+  /*! \brief used when bagging with subset, number of used indices */
+  data_size_t num_used_indices_;
+  /*! \brief used when bagging with subset, number of total elements */
+  uint64_t num_total_elements_;
+  /*! \brief used when bagging with column subset, the size of maximum number of feature partitions */
+  int cur_num_feature_partition_buffer_size_;
+  /*! \brief CUDA device ID */
+  int gpu_device_id_;
+  /*! \brief index of partitions with large bins whose histograms cannot fit into shared memory, each large bin partition contains a single column */
+  std::vector<int> large_bin_partitions_;
+  /*! \brief index of partitions with small bins */
+  std::vector<int> small_bin_partitions_;
+  /*! \brief shared memory size used by histogram */
+  int shared_hist_size_;
+  /*! \brief whether to use double precision in histograms per block */
+  bool gpu_use_dp_;
+
+  // CUDA memory
+
+  /*! \brief row-wise data stored in CUDA, 8 bits */
+  uint8_t* cuda_data_uint8_t_;
+  /*! \brief row-wise data stored in CUDA, 16 bits */
+  uint16_t* cuda_data_uint16_t_;
+  /*! \brief row-wise data stored in CUDA, 32 bits */
+  uint32_t* cuda_data_uint32_t_;
+  /*! \brief row pointer stored in CUDA, 16 bits */
+  uint16_t* cuda_row_ptr_uint16_t_;
+  /*! \brief row pointer stored in CUDA, 32 bits */
+  uint32_t* cuda_row_ptr_uint32_t_;
+  /*! \brief row pointer stored in CUDA, 64 bits */
+  uint64_t* cuda_row_ptr_uint64_t_;
+  /*! \brief partition bin offsets, 16 bits */
+  uint16_t* cuda_partition_ptr_uint16_t_;
+  /*! \brief partition bin offsets, 32 bits */
+  uint32_t* cuda_partition_ptr_uint32_t_;
+  /*! \brief partition bin offsets, 64 bits */
+  uint64_t* cuda_partition_ptr_uint64_t_;
+  /*! \brief start column index of each feature partition */
+  int* cuda_feature_partition_column_index_offsets_;
+  /*! \brief histogram offset of each column */
+  uint32_t* cuda_column_hist_offsets_;
+  /*! \brief histogram offset of each partition */
+  uint32_t* cuda_partition_hist_offsets_;
+  /*! \brief block buffer when calculating prefix sum */
+  uint16_t* cuda_block_buffer_uint16_t_;
+  /*! \brief block buffer when calculating prefix sum */
+  uint32_t* cuda_block_buffer_uint32_t_;
+  /*! \brief block buffer when calculating prefix sum */
+  uint64_t* cuda_block_buffer_uint64_t_;
+};
+
+}  // namespace LightGBM
+#endif  // LIGHTGBM_CUDA_ROW_DATA_HPP_
+
+#endif  // USE_CUDA_EXP
diff --git a/include/LightGBM/cuda/cuda_split_info.hpp b/include/LightGBM/cuda/cuda_split_info.hpp
new file mode 100644
index 000000000000..5c525b431548
--- /dev/null
+++ b/include/LightGBM/cuda/cuda_split_info.hpp
@@ -0,0 +1,105 @@
+/*!
+ * Copyright (c) 2021 Microsoft Corporation. All rights reserved.
+ * Licensed under the MIT License. See LICENSE file in the project root for
+ * license information.
+ */ + +#ifdef USE_CUDA_EXP + +#ifndef LIGHTGBM_CUDA_CUDA_SPLIT_INFO_HPP_ +#define LIGHTGBM_CUDA_CUDA_SPLIT_INFO_HPP_ + +#include + +namespace LightGBM { + +class CUDASplitInfo { + public: + bool is_valid; + int leaf_index; + double gain; + int inner_feature_index; + uint32_t threshold; + bool default_left; + + double left_sum_gradients; + double left_sum_hessians; + data_size_t left_count; + double left_gain; + double left_value; + + double right_sum_gradients; + double right_sum_hessians; + data_size_t right_count; + double right_gain; + double right_value; + + int num_cat_threshold = 0; + uint32_t* cat_threshold = nullptr; + int* cat_threshold_real = nullptr; + + __device__ CUDASplitInfo() { + num_cat_threshold = 0; + cat_threshold = nullptr; + cat_threshold_real = nullptr; + } + + __device__ ~CUDASplitInfo() { + if (num_cat_threshold > 0) { + if (cat_threshold != nullptr) { + cudaFree(cat_threshold); + } + if (cat_threshold_real != nullptr) { + cudaFree(cat_threshold_real); + } + } + } + + __device__ CUDASplitInfo& operator=(const CUDASplitInfo& other) { + is_valid = other.is_valid; + leaf_index = other.leaf_index; + gain = other.gain; + inner_feature_index = other.inner_feature_index; + threshold = other.threshold; + default_left = other.default_left; + + left_sum_gradients = other.left_sum_gradients; + left_sum_hessians = other.left_sum_hessians; + left_count = other.left_count; + left_gain = other.left_gain; + left_value = other.left_value; + + right_sum_gradients = other.right_sum_gradients; + right_sum_hessians = other.right_sum_hessians; + right_count = other.right_count; + right_gain = other.right_gain; + right_value = other.right_value; + + num_cat_threshold = other.num_cat_threshold; + if (num_cat_threshold > 0 && cat_threshold == nullptr) { + cat_threshold = new uint32_t[num_cat_threshold]; + } + if (num_cat_threshold > 0 && cat_threshold_real == nullptr) { + cat_threshold_real = new int[num_cat_threshold]; + } + if (num_cat_threshold > 0) { + if (other.cat_threshold != nullptr) { + for (int i = 0; i < num_cat_threshold; ++i) { + cat_threshold[i] = other.cat_threshold[i]; + } + } + if (other.cat_threshold_real != nullptr) { + for (int i = 0; i < num_cat_threshold; ++i) { + cat_threshold_real[i] = other.cat_threshold_real[i]; + } + } + } + return *this; + } +}; + +} // namespace LightGBM + +#endif // LIGHTGBM_CUDA_CUDA_SPLIT_INFO_HPP_ + +#endif // USE_CUDA_EXP diff --git a/include/LightGBM/cuda/cuda_tree.hpp b/include/LightGBM/cuda/cuda_tree.hpp new file mode 100644 index 000000000000..aa09df4140f1 --- /dev/null +++ b/include/LightGBM/cuda/cuda_tree.hpp @@ -0,0 +1,142 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. + */ + +#ifdef USE_CUDA_EXP + +#ifndef LIGHTGBM_CUDA_CUDA_TREE_HPP_ +#define LIGHTGBM_CUDA_CUDA_TREE_HPP_ + +#include +#include +#include +#include + +namespace LightGBM { + +__device__ void SetDecisionTypeCUDA(int8_t* decision_type, bool input, int8_t mask); + +__device__ void SetMissingTypeCUDA(int8_t* decision_type, int8_t input); + +__device__ bool GetDecisionTypeCUDA(int8_t decision_type, int8_t mask); + +__device__ int8_t GetMissingTypeCUDA(int8_t decision_type); + +__device__ bool IsZeroCUDA(double fval); + +class CUDATree : public Tree { + public: + /*! 
+ * \brief Constructor + * \param max_leaves The number of max leaves + * \param track_branch_features Whether to keep track of ancestors of leaf nodes + * \param is_linear Whether the tree has linear models at each leaf + */ + explicit CUDATree(int max_leaves, bool track_branch_features, bool is_linear, + const int gpu_device_id, const bool has_categorical_feature); + + explicit CUDATree(const Tree* host_tree); + + ~CUDATree() noexcept; + + int Split(const int leaf_index, + const int real_feature_index, + const double real_threshold, + const MissingType missing_type, + const CUDASplitInfo* cuda_split_info); + + int SplitCategorical( + const int leaf_index, + const int real_feature_index, + const MissingType missing_type, + const CUDASplitInfo* cuda_split_info, + uint32_t* cuda_bitset, + size_t cuda_bitset_len, + uint32_t* cuda_bitset_inner, + size_t cuda_bitset_inner_len); + + const int* cuda_leaf_parent() const { return cuda_leaf_parent_; } + + const int* cuda_left_child() const { return cuda_left_child_; } + + const int* cuda_right_child() const { return cuda_right_child_; } + + const int* cuda_split_feature_inner() const { return cuda_split_feature_inner_; } + + const int* cuda_split_feature() const { return cuda_split_feature_; } + + const uint32_t* cuda_threshold_in_bin() const { return cuda_threshold_in_bin_; } + + const double* cuda_threshold() const { return cuda_threshold_; } + + const int8_t* cuda_decision_type() const { return cuda_decision_type_; } + + const double* cuda_leaf_value() const { return cuda_leaf_value_; } + + double* cuda_leaf_value_ref() { return cuda_leaf_value_; } + + inline void Shrinkage(double rate) override; + + inline void AddBias(double val) override; + + void ToHost(); + + void SyncLeafOutputFromHostToCUDA(); + + void SyncLeafOutputFromCUDAToHost(); + + private: + void InitCUDAMemory(); + + void InitCUDA(); + + void LaunchSplitKernel(const int leaf_index, + const int real_feature_index, + const double real_threshold, + const MissingType missing_type, + const CUDASplitInfo* cuda_split_info); + + void LaunchSplitCategoricalKernel( + const int leaf_index, + const int real_feature_index, + const MissingType missing_type, + const CUDASplitInfo* cuda_split_info, + size_t cuda_bitset_len, + size_t cuda_bitset_inner_len); + + void LaunchShrinkageKernel(const double rate); + + void LaunchAddBiasKernel(const double val); + + int* cuda_left_child_; + int* cuda_right_child_; + int* cuda_split_feature_inner_; + int* cuda_split_feature_; + int* cuda_leaf_depth_; + int* cuda_leaf_parent_; + uint32_t* cuda_threshold_in_bin_; + double* cuda_threshold_; + double* cuda_internal_weight_; + double* cuda_internal_value_; + int8_t* cuda_decision_type_; + double* cuda_leaf_value_; + data_size_t* cuda_leaf_count_; + double* cuda_leaf_weight_; + data_size_t* cuda_internal_count_; + float* cuda_split_gain_; + CUDAVector cuda_bitset_; + CUDAVector cuda_bitset_inner_; + CUDAVector cuda_cat_boundaries_; + CUDAVector cuda_cat_boundaries_inner_; + + cudaStream_t cuda_stream_; + + const int num_threads_per_block_add_prediction_to_score_; +}; + +} // namespace LightGBM + +#endif // LIGHTGBM_CUDA_CUDA_TREE_HPP_ + +#endif // USE_CUDA_EXP diff --git a/include/LightGBM/cuda/cuda_utils.h b/include/LightGBM/cuda/cuda_utils.h index 1054e09daf18..ee88c52a0404 100644 --- a/include/LightGBM/cuda/cuda_utils.h +++ b/include/LightGBM/cuda/cuda_utils.h @@ -1,16 +1,25 @@ /*! - * Copyright (c) 2020 IBM Corporation. All rights reserved. + * Copyright (c) 2020-2021 IBM Corporation, Microsoft Corporation. 
All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ + #ifndef LIGHTGBM_CUDA_CUDA_UTILS_H_ #define LIGHTGBM_CUDA_CUDA_UTILS_H_ -#ifdef USE_CUDA - +#if defined(USE_CUDA) || defined(USE_CUDA_EXP) #include #include #include +#endif // USE_CUDA || USE_CUDA_EXP + +#ifdef USE_CUDA_EXP +#include +#include +#endif // USE_CUDA_EXP +namespace LightGBM { + +#if defined(USE_CUDA) || defined(USE_CUDA_EXP) #define CUDASUCCESS_OR_FATAL(ans) { gpuAssert((ans), __FILE__, __LINE__); } inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) { if (code != cudaSuccess) { @@ -18,7 +27,157 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = if (abort) exit(code); } } +#endif // USE_CUDA || USE_CUDA_EXP + +#ifdef USE_CUDA_EXP +#define CUDASUCCESS_OR_FATAL_OUTER(ans) { gpuAssert((ans), file, line); } + +void SetCUDADevice(int gpu_device_id, const char* file, int line); + +template +void AllocateCUDAMemory(T** out_ptr, size_t size, const char* file, const int line) { + void* tmp_ptr = nullptr; + CUDASUCCESS_OR_FATAL_OUTER(cudaMalloc(&tmp_ptr, size * sizeof(T))); + *out_ptr = reinterpret_cast(tmp_ptr); +} + +template +void CopyFromHostToCUDADevice(T* dst_ptr, const T* src_ptr, size_t size, const char* file, const int line) { + void* void_dst_ptr = reinterpret_cast(dst_ptr); + const void* void_src_ptr = reinterpret_cast(src_ptr); + size_t size_in_bytes = size * sizeof(T); + CUDASUCCESS_OR_FATAL_OUTER(cudaMemcpy(void_dst_ptr, void_src_ptr, size_in_bytes, cudaMemcpyHostToDevice)); +} + +template +void InitCUDAMemoryFromHostMemory(T** dst_ptr, const T* src_ptr, size_t size, const char* file, const int line) { + AllocateCUDAMemory(dst_ptr, size, file, line); + CopyFromHostToCUDADevice(*dst_ptr, src_ptr, size, file, line); +} + +template +void CopyFromCUDADeviceToHost(T* dst_ptr, const T* src_ptr, size_t size, const char* file, const int line) { + void* void_dst_ptr = reinterpret_cast(dst_ptr); + const void* void_src_ptr = reinterpret_cast(src_ptr); + size_t size_in_bytes = size * sizeof(T); + CUDASUCCESS_OR_FATAL_OUTER(cudaMemcpy(void_dst_ptr, void_src_ptr, size_in_bytes, cudaMemcpyDeviceToHost)); +} + +template +void CopyFromCUDADeviceToHostAsync(T* dst_ptr, const T* src_ptr, size_t size, cudaStream_t stream, const char* file, const int line) { + void* void_dst_ptr = reinterpret_cast(dst_ptr); + const void* void_src_ptr = reinterpret_cast(src_ptr); + size_t size_in_bytes = size * sizeof(T); + CUDASUCCESS_OR_FATAL_OUTER(cudaMemcpyAsync(void_dst_ptr, void_src_ptr, size_in_bytes, cudaMemcpyDeviceToHost, stream)); +} + +template +void CopyFromCUDADeviceToCUDADevice(T* dst_ptr, const T* src_ptr, size_t size, const char* file, const int line) { + void* void_dst_ptr = reinterpret_cast(dst_ptr); + const void* void_src_ptr = reinterpret_cast(src_ptr); + size_t size_in_bytes = size * sizeof(T); + CUDASUCCESS_OR_FATAL_OUTER(cudaMemcpy(void_dst_ptr, void_src_ptr, size_in_bytes, cudaMemcpyDeviceToDevice)); +} + +template +void CopyFromCUDADeviceToCUDADeviceAsync(T* dst_ptr, const T* src_ptr, size_t size, const char* file, const int line) { + void* void_dst_ptr = reinterpret_cast(dst_ptr); + const void* void_src_ptr = reinterpret_cast(src_ptr); + size_t size_in_bytes = size * sizeof(T); + CUDASUCCESS_OR_FATAL_OUTER(cudaMemcpyAsync(void_dst_ptr, void_src_ptr, size_in_bytes, cudaMemcpyDeviceToDevice)); +} + +void SynchronizeCUDADevice(const char* file, const int line); + +template +void SetCUDAMemory(T* 
dst_ptr, int value, size_t size, const char* file, const int line) { + CUDASUCCESS_OR_FATAL_OUTER(cudaMemset(reinterpret_cast(dst_ptr), value, size * sizeof(T))); + SynchronizeCUDADevice(file, line); +} + +template +void DeallocateCUDAMemory(T** ptr, const char* file, const int line) { + if (*ptr != nullptr) { + CUDASUCCESS_OR_FATAL_OUTER(cudaFree(reinterpret_cast(*ptr))); + *ptr = nullptr; + } +} + +void PrintLastCUDAError(); + +template +class CUDAVector { + public: + CUDAVector() { + size_ = 0; + data_ = nullptr; + } + + explicit CUDAVector(size_t size) { + size_ = size; + AllocateCUDAMemory(&data_, size_, __FILE__, __LINE__); + } + + void Resize(size_t size) { + if (size == 0) { + Clear(); + } + T* new_data = nullptr; + AllocateCUDAMemory(&new_data, size, __FILE__, __LINE__); + if (size_ > 0 && data_ != nullptr) { + CopyFromCUDADeviceToCUDADevice(new_data, data_, size, __FILE__, __LINE__); + } + DeallocateCUDAMemory(&data_, __FILE__, __LINE__); + data_ = new_data; + size_ = size; + } + + void Clear() { + if (size_ > 0 && data_ != nullptr) { + DeallocateCUDAMemory(&data_, __FILE__, __LINE__); + } + size_ = 0; + } + + void PushBack(const T* values, size_t len) { + T* new_data = nullptr; + AllocateCUDAMemory(&new_data, size_ + len, __FILE__, __LINE__); + if (size_ > 0 && data_ != nullptr) { + CopyFromCUDADeviceToCUDADevice(new_data, data_, size_, __FILE__, __LINE__); + } + CopyFromCUDADeviceToCUDADevice(new_data + size_, values, len, __FILE__, __LINE__); + DeallocateCUDAMemory(&data_, __FILE__, __LINE__); + size_ += len; + data_ = new_data; + } + + size_t Size() { + return size_; + } + + ~CUDAVector() { + DeallocateCUDAMemory(&data_, __FILE__, __LINE__); + } + + std::vector ToHost() { + std::vector host_vector(size_); + if (size_ > 0 && data_ != nullptr) { + CopyFromCUDADeviceToHost(host_vector.data(), data_, size_, __FILE__, __LINE__); + } + return host_vector; + } + + T* RawData() { + return data_; + } + + private: + T* data_; + size_t size_; +}; + +#endif // USE_CUDA_EXP -#endif // USE_CUDA +} // namespace LightGBM #endif // LIGHTGBM_CUDA_CUDA_UTILS_H_ diff --git a/include/LightGBM/cuda/vector_cudahost.h b/include/LightGBM/cuda/vector_cudahost.h index bd488d793d09..7c6e219cbbd9 100644 --- a/include/LightGBM/cuda/vector_cudahost.h +++ b/include/LightGBM/cuda/vector_cudahost.h @@ -1,5 +1,5 @@ /*! - * Copyright (c) 2020 IBM Corporation. All rights reserved. + * Copyright (c) 2020 IBM Corporation, Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. 
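[Reviewer aside on the cuda_utils.h hunk above: the templated copy helpers all follow the same pattern, a typed device pointer, an element count, and __FILE__/__LINE__ forwarded so CUDA failures are reported at the call site. A minimal round-trip sketch using only the signatures declared above; the function name and buffer contents are illustrative, not part of this PR:]

    // Hypothetical usage of the helpers declared in cuda_utils.h above.
    // Assumes the LightGBM namespace is open or the names are qualified.
    #include <vector>

    void RoundTripExample() {
      std::vector<int> host_values{1, 2, 3, 4};
      int* device_values = nullptr;
      // Allocate device memory and copy host -> device.
      AllocateCUDAMemory(&device_values, host_values.size(), __FILE__, __LINE__);
      CopyFromHostToCUDADevice(device_values, host_values.data(), host_values.size(), __FILE__, __LINE__);
      // ... launch kernels that read or write device_values here ...
      // Copy device -> host and release the device buffer (pointer is nulled).
      CopyFromCUDADeviceToHost(host_values.data(), device_values, host_values.size(), __FILE__, __LINE__);
      DeallocateCUDAMemory(&device_values, __FILE__, __LINE__);
    }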
*/ #ifndef LIGHTGBM_CUDA_VECTOR_CUDAHOST_H_ @@ -7,7 +7,7 @@ #include -#ifdef USE_CUDA +#if defined(USE_CUDA) || defined(USE_CUDA_EXP) #include #include #endif @@ -42,8 +42,8 @@ struct CHAllocator { T* allocate(std::size_t n) { T* ptr; if (n == 0) return NULL; - n = (n + kAlignedSize - 1) & -kAlignedSize; - #ifdef USE_CUDA + n = SIZE_ALIGNED(n); + #if defined(USE_CUDA) || defined(USE_CUDA_EXP) if (LGBM_config_::current_device == lgbm_device_cuda) { cudaError_t ret = cudaHostAlloc(&ptr, n*sizeof(T), cudaHostAllocPortable); if (ret != cudaSuccess) { @@ -62,7 +62,7 @@ struct CHAllocator { void deallocate(T* p, std::size_t n) { (void)n; // UNUSED if (p == NULL) return; - #ifdef USE_CUDA + #if defined(USE_CUDA) || defined(USE_CUDA_EXP) if (LGBM_config_::current_device == lgbm_device_cuda) { cudaPointerAttributes attributes; cudaPointerGetAttributes(&attributes, p); diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index cf19429322ee..e94eefb979e3 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -22,6 +22,9 @@ #include #include +#include +#include + namespace LightGBM { /*! \brief forward declaration */ @@ -144,6 +147,9 @@ class Metadata { queries_[idx] = static_cast<data_size_t>(value); } + /*! \brief Load initial scores from file */ + void LoadInitialScore(const std::string& data_filename); + /*! * \brief Get weights, if not exists, will return nullptr * \return Pointer of weights @@ -211,9 +217,15 @@ class Metadata { /*! \brief Disable copy */ Metadata(const Metadata&) = delete; + #ifdef USE_CUDA_EXP + + CUDAMetadata* cuda_metadata() const { return cuda_metadata_.get(); } + + void CreateCUDAMetadata(const int gpu_device_id); + + #endif // USE_CUDA_EXP + private: - /*! \brief Load initial scores from file */ - void LoadInitialScore(); /*! \brief Load weights from file */ void LoadWeights(); /*! \brief Load query boundaries from file */ @@ -247,6 +259,9 @@ class Metadata { bool weight_load_from_file_; bool query_load_from_file_; bool init_score_load_from_file_; + #ifdef USE_CUDA_EXP + std::unique_ptr<CUDAMetadata> cuda_metadata_; + #endif // USE_CUDA_EXP }; @@ -623,6 +638,21 @@ class Dataset { return feature_groups_[group]->FeatureGroupData(); } + const void* GetColWiseData( + const int feature_group_index, + const int sub_feature_index, + uint8_t* bit_type, + bool* is_sparse, + std::vector<BinIterator*>* bin_iterator, + const int num_threads) const; + + const void* GetColWiseData( + const int feature_group_index, + const int sub_feature_index, + uint8_t* bit_type, + bool* is_sparse, + BinIterator** bin_iterator) const; + inline double RealThreshold(int i, uint32_t threshold) const { const int group = feature2group_[i]; const int sub_feature = feature2subfeature_[i]; @@ -636,6 +666,12 @@ class Dataset { return feature_groups_[group]->bin_mappers_[sub_feature]->ValueToBin(threshold_double); } + inline int MaxRealCatValue(int i) const { + const int group = feature2group_[i]; + const int sub_feature = feature2subfeature_[i]; + return feature_groups_[group]->bin_mappers_[sub_feature]->MaxCatValue(); + } + /*!
* \brief Get meta data pointer * \return Pointer of meta data @@ -739,7 +775,29 @@ class Dataset { return raw_data_[numeric_feature_map_[feat_ind]].data(); } + inline uint32_t feature_max_bin(const int inner_feature_index) const { + const int feature_group_index = Feature2Group(inner_feature_index); + const int sub_feature_index = feature2subfeature_[inner_feature_index]; + return feature_groups_[feature_group_index]->feature_max_bin(sub_feature_index); + } + + inline uint32_t feature_min_bin(const int inner_feature_index) const { + const int feature_group_index = Feature2Group(inner_feature_index); + const int sub_feature_index = feature2subfeature_[inner_feature_index]; + return feature_groups_[feature_group_index]->feature_min_bin(sub_feature_index); + } + + #ifdef USE_CUDA_EXP + + const CUDAColumnData* cuda_column_data() const { + return cuda_column_data_.get(); + } + + #endif // USE_CUDA_EXP + private: + void CreateCUDAColumnData(); + std::string data_filename_; /*! \brief Store used features */ std::vector> feature_groups_; @@ -780,6 +838,13 @@ class Dataset { /*! map feature (inner index) to its index in the list of numeric (non-categorical) features */ std::vector numeric_feature_map_; int num_numeric_features_; + std::string device_type_; + int gpu_device_id_; + + #ifdef USE_CUDA_EXP + std::unique_ptr cuda_column_data_; + #endif // USE_CUDA_EXP + std::string parser_config_str_; }; diff --git a/include/LightGBM/dataset_loader.h b/include/LightGBM/dataset_loader.h index 27bea113b052..8b04e8327ff0 100644 --- a/include/LightGBM/dataset_loader.h +++ b/include/LightGBM/dataset_loader.h @@ -7,6 +7,7 @@ #include +#include #include #include #include @@ -28,8 +29,12 @@ class DatasetLoader { LIGHTGBM_EXPORT Dataset* LoadFromFileAlignWithOtherDataset(const char* filename, const Dataset* train_data); LIGHTGBM_EXPORT Dataset* ConstructFromSampleData(double** sample_values, - int** sample_indices, int num_col, const int* num_per_col, - size_t total_sample_size, data_size_t num_data); + int** sample_indices, + int num_col, + const int* num_per_col, + size_t total_sample_size, + data_size_t num_local_data, + int64_t num_dist_data); /*! \brief Disable copy */ DatasetLoader& operator=(const DatasetLoader&) = delete; @@ -63,6 +68,16 @@ class DatasetLoader { /*! \brief Check can load from binary file */ std::string CheckCanLoadFromBin(const char* filename); + /*! \brief Check the number of bins for categorical features. + * The number of bins for categorical features may exceed the configured maximum value. + * Log warnings when such cases happen. + * + * \param bin_mappers the bin_mappers of all features + * \param max_bin max_bin from Config + * \param max_bin_by_feature max_bin_by_feature from Config + */ + void CheckCategoricalFeatureNumBin(const std::vector>& bin_mappers, const int max_bin, const std::vector& max_bin_by_feature) const; + const Config& config_; /*! 
\brief Random generator*/ Random random_; diff --git a/include/LightGBM/feature_group.h b/include/LightGBM/feature_group.h index 285667c70518..66cc09ed6527 100644 --- a/include/LightGBM/feature_group.h +++ b/include/LightGBM/feature_group.h @@ -478,6 +478,50 @@ class FeatureGroup { } } + const void* GetColWiseData(const int sub_feature_index, + uint8_t* bit_type, + bool* is_sparse, + std::vector* bin_iterator, + const int num_threads) const { + if (sub_feature_index >= 0) { + CHECK(is_multi_val_); + return multi_bin_data_[sub_feature_index]->GetColWiseData(bit_type, is_sparse, bin_iterator, num_threads); + } else { + CHECK(!is_multi_val_); + return bin_data_->GetColWiseData(bit_type, is_sparse, bin_iterator, num_threads); + } + } + + const void* GetColWiseData(const int sub_feature_index, + uint8_t* bit_type, + bool* is_sparse, + BinIterator** bin_iterator) const { + if (sub_feature_index >= 0) { + CHECK(is_multi_val_); + return multi_bin_data_[sub_feature_index]->GetColWiseData(bit_type, is_sparse, bin_iterator); + } else { + CHECK(!is_multi_val_); + return bin_data_->GetColWiseData(bit_type, is_sparse, bin_iterator); + } + } + + uint32_t feature_max_bin(const int sub_feature_index) { + if (!is_multi_val_) { + return bin_offsets_[sub_feature_index + 1] - 1; + } else { + int addi = bin_mappers_[sub_feature_index]->GetMostFreqBin() == 0 ? 0 : 1; + return bin_mappers_[sub_feature_index]->num_bin() - 1 + addi; + } + } + + uint32_t feature_min_bin(const int sub_feature_index) { + if (!is_multi_val_) { + return bin_offsets_[sub_feature_index]; + } else { + return 1; + } + } + private: void CreateBinData(int num_data, bool is_multi_val, bool force_dense, bool force_sparse) { if (is_multi_val) { diff --git a/include/LightGBM/meta.h b/include/LightGBM/meta.h index 3452f28d8ebc..ee97090cbe0a 100644 --- a/include/LightGBM/meta.h +++ b/include/LightGBM/meta.h @@ -49,6 +49,8 @@ typedef float label_t; const score_t kMinScore = -std::numeric_limits::infinity(); +const score_t kMaxScore = std::numeric_limits::infinity(); + const score_t kEpsilon = 1e-15f; const double kZeroThreshold = 1e-35f; diff --git a/include/LightGBM/train_share_states.h b/include/LightGBM/train_share_states.h index b2a08ff413ff..5c14c7d51a47 100644 --- a/include/LightGBM/train_share_states.h +++ b/include/LightGBM/train_share_states.h @@ -125,6 +125,25 @@ class MultiValBinWrapper { is_subrow_copied_ = is_subrow_copied; } + + #ifdef USE_CUDA_EXP + const void* GetRowWiseData( + uint8_t* bit_type, + size_t* total_size, + bool* is_sparse, + const void** out_data_ptr, + uint8_t* data_ptr_bit_type) const { + if (multi_val_bin_ == nullptr) { + *bit_type = 0; + *total_size = 0; + *is_sparse = false; + return nullptr; + } else { + return multi_val_bin_->GetRowWiseData(bit_type, total_size, is_sparse, out_data_ptr, data_ptr_bit_type); + } + } + #endif // USE_CUDA_EXP + private: bool is_use_subcol_ = false; bool is_use_subrow_ = false; @@ -162,7 +181,11 @@ struct TrainingShareStates { int num_hist_total_bin() { return num_hist_total_bin_; } - const std::vector& feature_hist_offsets() { return feature_hist_offsets_; } + const std::vector& feature_hist_offsets() const { return feature_hist_offsets_; } + + #ifdef USE_CUDA_EXP + const std::vector& column_hist_offsets() const { return column_hist_offsets_; } + #endif // USE_CUDA_EXP bool IsSparseRowwise() { return (multi_val_bin_wrapper_ != nullptr && multi_val_bin_wrapper_->IsSparse()); @@ -211,8 +234,29 @@ struct TrainingShareStates { } } + + #ifdef USE_CUDA_EXP + const void* 
GetRowWiseData(uint8_t* bit_type, + size_t* total_size, + bool* is_sparse, + const void** out_data_ptr, + uint8_t* data_ptr_bit_type) { + if (multi_val_bin_wrapper_ != nullptr) { + return multi_val_bin_wrapper_->GetRowWiseData(bit_type, total_size, is_sparse, out_data_ptr, data_ptr_bit_type); + } else { + *bit_type = 0; + *total_size = 0; + *is_sparse = false; + return nullptr; + } + } + #endif // USE_CUDA_EXP + private: std::vector feature_hist_offsets_; + #ifdef USE_CUDA_EXP + std::vector column_hist_offsets_; + #endif // USE_CUDA_EXP int num_hist_total_bin_ = 0; std::unique_ptr multi_val_bin_wrapper_; std::vector> hist_buf_; diff --git a/include/LightGBM/tree.h b/include/LightGBM/tree.h index 0ebd5621eb41..6ff0370e2ea6 100644 --- a/include/LightGBM/tree.h +++ b/include/LightGBM/tree.h @@ -39,7 +39,7 @@ class Tree { */ Tree(const char* str, size_t* used_len); - ~Tree() noexcept = default; + virtual ~Tree() noexcept = default; /*! * \brief Performing a split on tree leaves. @@ -100,7 +100,7 @@ class Tree { * \param num_data Number of total data * \param score Will add prediction to score */ - void AddPredictionToScore(const Dataset* data, + virtual void AddPredictionToScore(const Dataset* data, data_size_t num_data, double* score) const; @@ -111,7 +111,7 @@ class Tree { * \param num_data Number of total data * \param score Will add prediction to score */ - void AddPredictionToScore(const Dataset* data, + virtual void AddPredictionToScore(const Dataset* data, const data_size_t* used_data_indices, data_size_t num_data, double* score) const; @@ -184,7 +184,7 @@ class Tree { * shrinkage rate (a.k.a learning rate) is used to tune the training process * \param rate The factor of shrinkage */ - inline void Shrinkage(double rate) { + virtual inline void Shrinkage(double rate) { #pragma omp parallel for schedule(static, 1024) if (num_leaves_ >= 2048) for (int i = 0; i < num_leaves_ - 1; ++i) { leaf_value_[i] = MaybeRoundToZero(leaf_value_[i] * rate); @@ -209,7 +209,7 @@ class Tree { inline double shrinkage() const { return shrinkage_; } - inline void AddBias(double val) { + virtual inline void AddBias(double val) { #pragma omp parallel for schedule(static, 1024) if (num_leaves_ >= 2048) for (int i = 0; i < num_leaves_ - 1; ++i) { leaf_value_[i] = MaybeRoundToZero(leaf_value_[i] + val); @@ -319,11 +319,15 @@ class Tree { inline bool is_linear() const { return is_linear_; } + #ifdef USE_CUDA_EXP + inline bool is_cuda_tree() const { return is_cuda_tree_; } + #endif // USE_CUDA_EXP + inline void SetIsLinear(bool is_linear) { is_linear_ = is_linear; } - private: + protected: std::string NumericalDecisionIfElse(int node) const; std::string CategoricalDecisionIfElse(int node) const; @@ -528,6 +532,10 @@ class Tree { std::vector> leaf_features_; /* \brief features used in leaf linear models; indexing is relative to used_features_ */ std::vector> leaf_features_inner_; + #ifdef USE_CUDA_EXP + /*! 
\brief Marks whether this tree is a CUDATree */ + bool is_cuda_tree_; + #endif // USE_CUDA_EXP }; inline void Tree::Split(int leaf, int feature, int real_feature, diff --git a/include/LightGBM/utils/common.h b/include/LightGBM/utils/common.h index 1e47700d0a61..3125f6b9e9ca 100644 --- a/include/LightGBM/utils/common.h +++ b/include/LightGBM/utils/common.h @@ -5,9 +5,6 @@ #ifndef LIGHTGBM_UTILS_COMMON_H_ #define LIGHTGBM_UTILS_COMMON_H_ -#if ((defined(sun) || defined(__sun)) && (defined(__SVR4) || defined(__svr4__))) -#include -#endif #include #include #include @@ -32,11 +29,9 @@ #include #include -#if (!((defined(sun) || defined(__sun)) && (defined(__SVR4) || defined(__svr4__)))) #define FMT_HEADER_ONLY -#include "../../../external_libs/fmt/include/fmt/format.h" -#endif #include "../../../external_libs/fast_double_parser/include/fast_double_parser.h" +#include "../../../external_libs/fmt/include/fmt/format.h" #ifdef _MSC_VER #include @@ -1195,7 +1190,6 @@ inline static std::vector StringToArray(const std::string& str, char delimite return ret; } -#if (!((defined(sun) || defined(__sun)) && (defined(__SVR4) || defined(__svr4__)))) /*! * Safely formats a value onto a buffer according to a format string and null-terminates it. * @@ -1260,7 +1254,6 @@ inline static std::string ArrayToString(const std::vector& arr, size_t n) { } return str_buf.str(); } -#endif // (!((defined(sun) || defined(__sun)) && (defined(__SVR4) || defined(__svr4__)))) } // namespace CommonC diff --git a/include/LightGBM/utils/common_legacy_solaris.h b/include/LightGBM/utils/common_legacy_solaris.h deleted file mode 100644 index 97f977108fc6..000000000000 --- a/include/LightGBM/utils/common_legacy_solaris.h +++ /dev/null @@ -1,160 +0,0 @@ -/*! - * Copyright (c) 2016 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for license information. - */ -/*! - * This file is meant to be used ONLY IN SOLARIS! - * The newer code that replaced it is faster and safe regarding locale! 
- */ -#ifndef LIGHTGBM_UTILS_COMMON_LEGACY_SOLARIS_H_ -#define LIGHTGBM_UTILS_COMMON_LEGACY_SOLARIS_H_ - -#include - -#include -#include -#include -#include -#include - -namespace LightGBM { - -namespace CommonLegacy { - -inline static unsigned CountDecimalDigit32(uint32_t n) { - if (n < 10) return 1; - else if (n < 100) return 2; - else if (n < 1000) return 3; - else if (n < 10000) return 4; - else if (n < 100000) return 5; - else if (n < 1000000) return 6; - else if (n < 10000000) return 7; - else if (n < 100000000) return 8; - else if (n < 1000000000) return 9; - else - return 10; -} - -inline static void Uint32ToStr(uint32_t value, char* buffer) { - const char kDigitsLut[200] = { - '0', '0', '0', '1', '0', '2', '0', '3', '0', '4', '0', '5', '0', '6', '0', '7', '0', '8', '0', '9', - '1', '0', '1', '1', '1', '2', '1', '3', '1', '4', '1', '5', '1', '6', '1', '7', '1', '8', '1', '9', - '2', '0', '2', '1', '2', '2', '2', '3', '2', '4', '2', '5', '2', '6', '2', '7', '2', '8', '2', '9', - '3', '0', '3', '1', '3', '2', '3', '3', '3', '4', '3', '5', '3', '6', '3', '7', '3', '8', '3', '9', - '4', '0', '4', '1', '4', '2', '4', '3', '4', '4', '4', '5', '4', '6', '4', '7', '4', '8', '4', '9', - '5', '0', '5', '1', '5', '2', '5', '3', '5', '4', '5', '5', '5', '6', '5', '7', '5', '8', '5', '9', - '6', '0', '6', '1', '6', '2', '6', '3', '6', '4', '6', '5', '6', '6', '6', '7', '6', '8', '6', '9', - '7', '0', '7', '1', '7', '2', '7', '3', '7', '4', '7', '5', '7', '6', '7', '7', '7', '8', '7', '9', - '8', '0', '8', '1', '8', '2', '8', '3', '8', '4', '8', '5', '8', '6', '8', '7', '8', '8', '8', '9', - '9', '0', '9', '1', '9', '2', '9', '3', '9', '4', '9', '5', '9', '6', '9', '7', '9', '8', '9', '9' - }; - unsigned digit = CountDecimalDigit32(value); - buffer += digit; - *buffer = '\0'; - - while (value >= 100) { - const unsigned i = (value % 100) << 1; - value /= 100; - *--buffer = kDigitsLut[i + 1]; - *--buffer = kDigitsLut[i]; - } - - if (value < 10) { - *--buffer = static_cast(value) + '0'; - } else { - const unsigned i = value << 1; - *--buffer = kDigitsLut[i + 1]; - *--buffer = kDigitsLut[i]; - } -} - -inline static void Int32ToStr(int32_t value, char* buffer) { - uint32_t u = static_cast(value); - if (value < 0) { - *buffer++ = '-'; - u = ~u + 1; - } - Uint32ToStr(u, buffer); -} - -inline static void DoubleToStr(double value, char* buffer, size_t buffer_len) { - int num_chars = snprintf(buffer, buffer_len, "%.17g", value); - CHECK_GE(num_chars, 0); -} - - -template -struct __TToStringHelperFast { - void operator()(T value, char* buffer, size_t) const { - Int32ToStr(value, buffer); - } -}; - -template -struct __TToStringHelperFast { - void operator()(T value, char* buffer, size_t buf_len) const { - int num_chars = snprintf(buffer, buf_len, "%g", value); - CHECK_GE(num_chars, 0); - } -}; - -template -struct __TToStringHelperFast { - void operator()(T value, char* buffer, size_t) const { - Uint32ToStr(value, buffer); - } -}; - -template -inline static std::string _ArrayToStringFast(const std::vector& arr, size_t n) { - if (arr.empty() || n == 0) { - return std::string(""); - } - __TToStringHelperFast::value, std::is_unsigned::value> helper; - const size_t buf_len = 16; - std::vector buffer(buf_len); - std::stringstream str_buf; - helper(arr[0], buffer.data(), buf_len); - str_buf << buffer.data(); - for (size_t i = 1; i < std::min(n, arr.size()); ++i) { - helper(arr[i], buffer.data(), buf_len); - str_buf << ' ' << buffer.data(); - } - return str_buf.str(); -} - -inline static std::string 
_ArrayToString(const std::vector<double>& arr, size_t n) { - if (arr.empty() || n == 0) { - return std::string(""); - } - const size_t buf_len = 32; - std::vector<char> buffer(buf_len); - std::stringstream str_buf; - DoubleToStr(arr[0], buffer.data(), buf_len); - str_buf << buffer.data(); - for (size_t i = 1; i < std::min(n, arr.size()); ++i) { - DoubleToStr(arr[i], buffer.data(), buf_len); - str_buf << ' ' << buffer.data(); - } - return str_buf.str(); -} - - -template <bool high_precision_output = false, typename T> -inline static typename std::enable_if<high_precision_output == false, std::string>::type -ArrayToString(const std::vector<T>& arr, size_t n) { - return _ArrayToStringFast(arr, n); -} - -template <bool high_precision_output, typename T> -inline static typename std::enable_if< -(high_precision_output == true) && (std::is_same<T, double>::value), std::string>::type -ArrayToString(const std::vector<T>& arr, size_t n) { - return _ArrayToString(arr, n); -} - -} // namespace CommonLegacy - -} // namespace LightGBM - -#endif // LIGHTGBM_UTILS_COMMON_LEGACY_SOLARIS_H_ diff --git a/python-package/README.rst b/python-package/README.rst index 318ad7daedff..aaa55bc668b9 100644 --- a/python-package/README.rst +++ b/python-package/README.rst @@ -30,7 +30,7 @@ For **Linux** users, **glibc** >= 2.14 is required. Also, in some rare cases, wh For **macOS** (we provide wheels for 3 newest macOS versions) users: -- Starting from version 2.2.1, the library file in distribution wheels is built by the **Apple Clang** (Xcode_8.3.3 for versions 2.2.1 - 2.3.1, Xcode_9.4.1 for versions 2.3.2 - 3.3.1 and Xcode_10.3 from version 4.0.0) compiler. This means that you don't need to install the **gcc** compiler anymore. Instead of that you need to install the **OpenMP** library, which is required for running LightGBM on the system with the **Apple Clang** compiler. You can install the **OpenMP** library by the following command: ``brew install libomp``. +- Starting from version 2.2.1, the library file in distribution wheels is built by the **Apple Clang** (Xcode_8.3.3 for versions 2.2.1 - 2.3.1, Xcode_9.4.1 for versions 2.3.2 - 3.3.2 and Xcode_10.3 from version 4.0.0) compiler. This means that you don't need to install the **gcc** compiler anymore. Instead of that you need to install the **OpenMP** library, which is required for running LightGBM on the system with the **Apple Clang** compiler. You can install the **OpenMP** library by the following command: ``brew install libomp``. - For version smaller than 2.2.1 and not smaller than 2.1.2, **gcc-8** with **OpenMP** support must be installed first. Refer to `Installation Guide `__ for installation of **gcc-8** with **OpenMP** support. @@ -123,6 +123,8 @@ All requirements from `Build from Sources section <#build-from-sources>`__ apply **CUDA** library (version 9.0 or higher) is needed: details for installation can be found in `Installation Guide `__. +Recently, a new CUDA version with better efficiency was implemented as an experimental feature. To build the new CUDA version, replace ``--cuda`` with ``--cuda-exp`` in the above commands. Please note that the new version requires **CUDA** 10.0 or later. + Build HDFS Version ~~~~~~~~~~~~~~~~~~ @@ -164,6 +166,8 @@ Install from `conda-forge channel `_ If you use ``conda`` to manage Python dependencies, you can install LightGBM using ``conda install``. +We strongly recommend installation from the ``conda-forge`` channel rather than the ``default`` one for several reasons. The main ones are shorter delays for new releases, a greater number of supported architectures, and better handling of dependency conflicts; in particular, the workaround for OpenMP is crucial for LightGBM.
More details can be found in `this comment `_. + **Note**: The `lightgbm conda-forge feedstock `_ is not maintained by LightGBM maintainers. .. code:: sh @@ -196,6 +200,8 @@ Run ``python setup.py install --gpu`` to enable GPU support. All requirements fr Run ``python setup.py install --cuda`` to enable CUDA support. All requirements from `Build CUDA Version section <#build-cuda-version>`__ apply for this installation option as well. +Run ``python setup.py install --cuda-exp`` to enable the new experimental version of CUDA support. All requirements from `Build CUDA Version section <#build-cuda-version>`__ apply for this installation option as well. + Run ``python setup.py install --hdfs`` to enable HDFS support. All requirements from `Build HDFS Version section <#build-hdfs-version>`__ apply for this installation option as well. Run ``python setup.py install --bit32``, if you want to use 32-bit version. All requirements from `Build 32-bit Version with 32-bit Python section <#build-32-bit-version-with-32-bit-python>`__ apply for this installation option as well. diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 64f1cb31edaa..3c35b1aa21da 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -7,7 +7,6 @@ from collections import OrderedDict from copy import deepcopy from functools import wraps -from logging import Logger from os import SEEK_END from os.path import getsize from pathlib import Path @@ -17,14 +16,13 @@ import numpy as np import scipy.sparse -from .compat import (PANDAS_INSTALLED, concat, dt_DataTable, is_dtype_sparse, pd_CategoricalDtype, pd_DataFrame, - pd_Series) +from .compat import PANDAS_INSTALLED, concat, dt_DataTable, pd_CategoricalDtype, pd_DataFrame, pd_Series from .libpath import find_lib_path ZERO_THRESHOLD = 1e-35 -def _get_sample_count(total_nrow: int, params: str): +def _get_sample_count(total_nrow: int, params: str) -> int: sample_cnt = ctypes.c_int(0) _safe_call(_LIB.LGBM_GetSampleCount( ctypes.c_int32(total_nrow), @@ -42,21 +40,37 @@ def warning(self, msg: str) -> None: warnings.warn(msg, stacklevel=3) -_LOGGER: Union[_DummyLogger, Logger] = _DummyLogger() +_LOGGER: Any = _DummyLogger() +_INFO_METHOD_NAME = "info" +_WARNING_METHOD_NAME = "warning" -def register_logger(logger: Logger) -> None: +def register_logger( + logger: Any, info_method_name: str = "info", warning_method_name: str = "warning" +) -> None: """Register custom logger. Parameters ---------- - logger : logging.Logger + logger : Any Custom logger. + info_method_name : str, optional (default="info") + Method used to log info messages. + warning_method_name : str, optional (default="warning") + Method used to log warning messages. 
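[Reviewer aside on the register_logger change above: any object with appropriately named callables is now accepted, not just a logging.Logger. A short usage sketch; the PrintLogger class and the commented structlog-style call are illustrative, not part of the library:]

    import lightgbm as lgb

    class PrintLogger:
        # Any object works, as long as it exposes the configured method names.
        def info(self, msg: str) -> None:
            print(f"[INFO] {msg}")

        def warning(self, msg: str) -> None:
            print(f"[WARN] {msg}")

    lgb.register_logger(PrintLogger())
    # For loggers whose methods are named differently, point at them explicitly:
    # lgb.register_logger(my_logger, info_method_name="msg", warning_method_name="msg")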
""" - if not isinstance(logger, Logger): - raise TypeError("Logger should inherit logging.Logger class") - global _LOGGER + def _has_method(logger: Any, method_name: str) -> bool: + return callable(getattr(logger, method_name, None)) + + if not _has_method(logger, info_method_name) or not _has_method(logger, warning_method_name): + raise TypeError( + f"Logger must provide '{info_method_name}' and '{warning_method_name}' method" + ) + + global _LOGGER, _INFO_METHOD_NAME, _WARNING_METHOD_NAME _LOGGER = logger + _INFO_METHOD_NAME = info_method_name + _WARNING_METHOD_NAME = warning_method_name def _normalize_native_string(func: Callable[[str], None]) -> Callable[[str], None]: @@ -77,16 +91,16 @@ def wrapper(msg: str) -> None: def _log_info(msg: str) -> None: - _LOGGER.info(msg) + getattr(_LOGGER, _INFO_METHOD_NAME)(msg) def _log_warning(msg: str) -> None: - _LOGGER.warning(msg) + getattr(_LOGGER, _WARNING_METHOD_NAME)(msg) @_normalize_native_string def _log_native(msg: str) -> None: - _LOGGER.info(msg) + getattr(_LOGGER, _INFO_METHOD_NAME)(msg) def _log_callback(msg: bytes) -> None: @@ -94,7 +108,7 @@ def _log_callback(msg: bytes) -> None: _log_native(str(msg.decode('utf-8'))) -def _load_lib(): +def _load_lib() -> Optional[ctypes.CDLL]: """Load LightGBM library.""" lib_path = find_lib_path() if len(lib_path) == 0: @@ -127,7 +141,7 @@ def _safe_call(ret: int) -> None: raise LightGBMError(_LIB.LGBM_GetLastError().decode('utf-8')) -def is_numeric(obj): +def is_numeric(obj: Any) -> bool: """Check whether object is a number or not, include numpy number, etc.""" try: float(obj) @@ -138,12 +152,12 @@ def is_numeric(obj): return False -def is_numpy_1d_array(data): +def is_numpy_1d_array(data: Any) -> bool: """Check whether data is a numpy 1-D array.""" return isinstance(data, np.ndarray) and len(data.shape) == 1 -def is_numpy_column_array(data): +def is_numpy_column_array(data: Any) -> bool: """Check whether data is a column numpy array.""" if not isinstance(data, np.ndarray): return False @@ -158,7 +172,7 @@ def cast_numpy_array_to_dtype(array, dtype): return array.astype(dtype=dtype, copy=False) -def is_1d_list(data): +def is_1d_list(data: Any) -> bool: """Check whether data is a 1-D list.""" return isinstance(data, list) and (not data or is_numeric(data[0])) @@ -184,8 +198,7 @@ def list_to_1d_numpy(data, dtype=np.float32, name='list'): elif is_1d_list(data): return np.array(data, dtype=dtype, copy=False) elif isinstance(data, pd_Series): - if _get_bad_pandas_dtypes([data.dtypes]): - raise ValueError('Series.dtypes must be int, float or bool') + _check_for_bad_pandas_dtypes(data.to_frame().dtypes) return np.array(data, dtype=dtype, copy=False) # SparseArray should be supported as well else: raise TypeError(f"Wrong type({type(data).__name__}) for {name}.\n" @@ -218,8 +231,7 @@ def _data_to_2d_numpy(data: Any, dtype: type = np.float32, name: str = 'list') - if _is_2d_list(data): return np.array(data, dtype=dtype) if isinstance(data, pd_DataFrame): - if _get_bad_pandas_dtypes(data.dtypes): - raise ValueError('DataFrame.dtypes must be int, float or bool') + _check_for_bad_pandas_dtypes(data.dtypes) return cast_numpy_array_to_dtype(data.values, dtype) raise TypeError(f"Wrong type({type(data).__name__}) for {name}.\n" "It should be list of lists, numpy 2-D array or pandas DataFrame") @@ -267,7 +279,7 @@ def c_array(ctype, values): return (ctype * len(values))(*values) -def json_default_with_numpy(obj): +def json_default_with_numpy(obj: Any) -> Any: """Convert numpy classes to JSON serializable objects.""" 
if isinstance(obj, (np.integer, np.floating, np.bool_)): return obj.item() @@ -277,7 +289,7 @@ def json_default_with_numpy(obj): return obj -def param_dict_to_str(data): +def param_dict_to_str(data: Optional[Dict[str, Any]]) -> str: """Convert Python dictionary to string, which is passed to C API.""" if data is None or not data: return "" @@ -396,21 +408,27 @@ def _choose_param_value(main_param_name: str, params: Dict[str, Any], default_va # avoid side effects on passed-in parameters params = deepcopy(params) - # find a value, and remove other aliases with .pop() - # prefer the value of 'main_param_name' if it exists, otherwise search the aliases - found_value = None + aliases = _ConfigAliases.get(main_param_name) - {main_param_name} + + # if main_param_name was provided, keep that value and remove all aliases if main_param_name in params.keys(): - found_value = params[main_param_name] + for param in aliases: + params.pop(param, None) + return params - for param in _ConfigAliases.get(main_param_name): - val = params.pop(param, None) - if found_value is None and val is not None: - found_value = val + # if main param name was not found, search for an alias + for param in aliases: + if param in params.keys(): + params[main_param_name] = params[param] + break - if found_value is not None: - params[main_param_name] = found_value - else: - params[main_param_name] = default_value + if main_param_name in params.keys(): + for param in aliases: + params.pop(param, None) + return params + + # neither of main_param_name, aliases were found + params[main_param_name] = default_value return params @@ -501,15 +519,23 @@ def c_int_array(data): return (ptr_data, type_data, data) # return `data` to avoid the temporary copy is freed -def _get_bad_pandas_dtypes(dtypes): - pandas_dtype_mapper = {'int8': 'int', 'int16': 'int', 'int32': 'int', - 'int64': 'int', 'uint8': 'int', 'uint16': 'int', - 'uint32': 'int', 'uint64': 'int', 'bool': 'int', - 'float16': 'float', 'float32': 'float', 'float64': 'float'} - bad_indices = [i for i, dtype in enumerate(dtypes) if (dtype.name not in pandas_dtype_mapper - and (not is_dtype_sparse(dtype) - or dtype.subtype.name not in pandas_dtype_mapper))] - return bad_indices +def _check_for_bad_pandas_dtypes(pandas_dtypes_series): + float128 = getattr(np, 'float128', type(None)) + + def is_allowed_numpy_dtype(dtype): + return ( + issubclass(dtype, (np.integer, np.floating, np.bool_)) + and not issubclass(dtype, (np.timedelta64, float128)) + ) + + bad_pandas_dtypes = [ + f'{column_name}: {pandas_dtype}' + for column_name, pandas_dtype in pandas_dtypes_series.iteritems() + if not is_allowed_numpy_dtype(pandas_dtype.type) + ] + if bad_pandas_dtypes: + raise ValueError('pandas dtypes must be int, float or bool.\n' + f'Fields with bad pandas dtypes: {", ".join(bad_pandas_dtypes)}') def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorical): @@ -517,7 +543,7 @@ def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorica if len(data.shape) != 2 or data.shape[0] < 1: raise ValueError('Input data must be 2 dimensional and non empty.') if feature_name == 'auto' or feature_name is None: - data = data.rename(columns=str) + data = data.rename(columns=str, copy=False) cat_cols = [col for col, dtype in zip(data.columns, data.dtypes) if isinstance(dtype, pd_CategoricalDtype)] cat_cols_not_ordered = [col for col in cat_cols if not data[col].cat.ordered] if pandas_categorical is None: # train dataset @@ -529,7 +555,7 @@ def _data_from_pandas(data, 
feature_name, categorical_feature, pandas_categorica if list(data[col].cat.categories) != list(category): data[col] = data[col].cat.set_categories(category) if len(cat_cols): # cat_cols is list - data = data.copy() # not alter origin DataFrame + data = data.copy(deep=False) # not alter origin DataFrame data[cat_cols] = data[cat_cols].apply(lambda x: x.cat.codes).replace({-1: np.nan}) if categorical_feature is not None: if feature_name is None: @@ -540,15 +566,11 @@ def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorica categorical_feature = list(categorical_feature) if feature_name == 'auto': feature_name = list(data.columns) - bad_indices = _get_bad_pandas_dtypes(data.dtypes) - if bad_indices: - bad_index_cols_str = ', '.join(data.columns[bad_indices]) - raise ValueError("DataFrame.dtypes for data must be int, float or bool.\n" - "Did not expect the data types in the following fields: " - f"{bad_index_cols_str}") - data = data.values - if data.dtype != np.float32 and data.dtype != np.float64: - data = data.astype(np.float32) + _check_for_bad_pandas_dtypes(data.dtypes) + df_dtypes = [dtype.type for dtype in data.dtypes] + df_dtypes.append(np.float32) # so that the target dtype considers floats + target_dtype = np.find_common_type(df_dtypes, []) + data = data.astype(target_dtype, copy=False).values else: if feature_name == 'auto': feature_name = None @@ -561,8 +583,7 @@ def _label_from_pandas(label): if isinstance(label, pd_DataFrame): if len(label.columns) > 1: raise ValueError('DataFrame for label cannot have multiple columns') - if _get_bad_pandas_dtypes(label.dtypes): - raise ValueError('DataFrame.dtypes for label must be int, float or bool') + _check_for_bad_pandas_dtypes(label.dtypes) label = np.ravel(label.values.astype(np.float32, copy=False)) return label @@ -723,7 +744,7 @@ def __init__(self, model_file=None, booster_handle=None, pred_parameter=None): pred_parameter = {} if pred_parameter is None else pred_parameter self.pred_parameter = param_dict_to_str(pred_parameter) - def __del__(self): + def __del__(self) -> None: try: if self.__is_manage_handle: _safe_call(_LIB.LGBM_BoosterFree(self.handle)) @@ -735,9 +756,17 @@ def __getstate__(self): this.pop('handle', None) return this - def predict(self, data, start_iteration=0, num_iteration=-1, - raw_score=False, pred_leaf=False, pred_contrib=False, data_has_header=False, - is_reshape=True): + def predict( + self, + data, + start_iteration: int = 0, + num_iteration: int = -1, + raw_score: bool = False, + pred_leaf: bool = False, + pred_contrib: bool = False, + data_has_header: bool = False, + validate_features: bool = False + ): """Predict logic. Parameters @@ -758,8 +787,9 @@ def predict(self, data, start_iteration=0, num_iteration=-1, data_has_header : bool, optional (default=False) Whether data has header. Used only for txt data. - is_reshape : bool, optional (default=True) - Whether to reshape to (nrow, ncol). + validate_features : bool, optional (default=False) + If True, ensure that the features used to predict match the ones used to train. + Used only if data is pandas DataFrame. 
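[Reviewer aside on the dtype-promotion change in _data_from_pandas above: the per-column dtypes plus a float32 candidate are fed to np.find_common_type, so integer frames are upcast only as far as needed instead of always to float32. A small illustration, assuming NumPy's standard promotion rules:]

    import numpy as np

    # int8 and float32 columns: float32 already represents both exactly enough.
    print(np.find_common_type([np.int8, np.float32, np.float32], []))   # float32
    # int64 cannot be represented in float32, so promotion continues to float64.
    print(np.find_common_type([np.int64, np.float32, np.float32], []))  # float64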
Returns ------- @@ -769,6 +799,17 @@ def predict(self, data, start_iteration=0, num_iteration=-1, """ if isinstance(data, Dataset): raise TypeError("Cannot use Dataset instance for prediction, please use raw data instead") + elif isinstance(data, pd_DataFrame) and validate_features: + data_names = [str(x) for x in data.columns] + ptr_names = (ctypes.c_char_p * len(data_names))() + ptr_names[:] = [x.encode('utf-8') for x in data_names] + _safe_call( + _LIB.LGBM_BoosterValidateFeatureNames( + self.handle, + ptr_names, + ctypes.c_int(len(data_names)), + ) + ) data = _data_from_pandas(data, None, None, self.pandas_categorical)[0] predict_type = C_API_PREDICT_NORMAL if raw_score: @@ -816,7 +857,7 @@ def predict(self, data, start_iteration=0, num_iteration=-1, if pred_leaf: preds = preds.astype(np.int32) is_sparse = scipy.sparse.issparse(preds) or isinstance(preds, list) - if is_reshape and not is_sparse and preds.size != nrow: + if not is_sparse and preds.size != nrow: if preds.size % nrow == 0: preds = preds.reshape(nrow, -1) else: @@ -1104,7 +1145,7 @@ def inner_predict_sparse(csc, start_iteration, num_iteration, predict_type): raise ValueError("Wrong length for predict results") return preds, nrow - def current_iteration(self): + def current_iteration(self) -> int: """Get the index of the current iteration. Returns @@ -1138,7 +1179,7 @@ def __init__(self, data, label=None, reference=None, reference : Dataset or None, optional (default=None) If this is Dataset for validation, training data should be used as reference. weight : list, numpy 1-D array, pandas Series or None, optional (default=None) - Weight for each instance. + Weight for each instance. Weights should be non-negative. group : list, numpy 1-D array, pandas Series or None, optional (default=None) Group/query data. Only used in the learning-to-rank task. @@ -1155,10 +1196,11 @@ def __init__(self, data, label=None, reference=None, If list of int, interpreted as indices. If list of str, interpreted as feature names (need to specify ``feature_name`` as well). If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used. - All values in categorical features should be less than int32 max value (2147483647). + All values in categorical features will be cast to int32 and thus should be less than int32 max value (2147483647). Large values could be memory consuming. Consider using consecutive integers starting from zero. All negative values in categorical features will be treated as missing values. The output cannot be monotonically constrained with respect to a categorical feature. + Floating point numbers in categorical features will be rounded towards 0. params : dict or None, optional (default=None) Other parameters for Dataset. free_raw_data : bool, optional (default=True) @@ -1185,7 +1227,7 @@ def __init__(self, data, label=None, reference=None, self.version = 0 self._start_row = 0 # Used when pushing rows one by one. - def __del__(self): + def __del__(self) -> None: try: self._free_handle() except AttributeError: @@ -1302,6 +1344,7 @@ def _init_from_sample( num_per_col_ptr, ctypes.c_int32(sample_cnt), ctypes.c_int32(total_nrow), + ctypes.c_int64(total_nrow), c_str(params_str), ctypes.byref(self.handle), )) @@ -1335,12 +1378,12 @@ def _push_rows(self, data: np.ndarray) -> 'Dataset': self._start_row += nrow return self - def get_params(self): + def get_params(self) -> Dict[str, Any]: """Get the used parameters in the Dataset. 
Returns ------- - params : dict or None + params : dict The used parameters in this Dataset object. """ if self.params is not None: @@ -1367,8 +1410,10 @@ def get_params(self): "weight_column", "zero_as_missing") return {k: v for k, v in self.params.items() if k in dataset_params} + else: + return {} - def _free_handle(self): + def _free_handle(self) -> "Dataset": if self.handle is not None: _safe_call(_LIB.LGBM_DatasetFree(self.handle)) self.handle = None @@ -1386,8 +1431,8 @@ def _set_init_score_by_predictor(self, predictor, data, used_indices=None): if predictor is not None: init_score = predictor.predict(data, raw_score=True, - data_has_header=data_has_header, - is_reshape=False) + data_has_header=data_has_header) + init_score = init_score.ravel() if used_indices is not None: assert not self.need_slice if isinstance(data, (str, Path)): @@ -1745,7 +1790,7 @@ def _compare_params_for_warning( return False return True - def construct(self): + def construct(self) -> "Dataset": """Lazy init. Returns @@ -1804,6 +1849,7 @@ def construct(self): feature_name=self.feature_name, categorical_feature=self.categorical_feature, params=self.params) if self.free_raw_data: self.data = None + self.feature_name = self.get_feature_name() return self def create_valid(self, data, label=None, weight=None, group=None, init_score=None, params=None): @@ -1817,7 +1863,7 @@ def create_valid(self, data, label=None, weight=None, group=None, init_score=Non label : list, numpy 1-D array, pandas Series / one-column DataFrame or None, optional (default=None) Label of the data. weight : list, numpy 1-D array, pandas Series or None, optional (default=None) - Weight for each instance. + Weight for each instance. Weights should be non-negative. group : list, numpy 1-D array, pandas Series or None, optional (default=None) Group/query data. Only used in the learning-to-rank task. @@ -1841,7 +1887,11 @@ def create_valid(self, data, label=None, weight=None, group=None, init_score=Non ret.pandas_categorical = self.pandas_categorical return ret - def subset(self, used_indices, params=None): + def subset( + self, + used_indices: List[int], + params: Optional[Dict[str, Any]] = None + ) -> "Dataset": """Get subset of current Dataset. Parameters @@ -1866,7 +1916,7 @@ def subset(self, used_indices, params=None): ret.used_indices = sorted(used_indices) return ret - def save_binary(self, filename): + def save_binary(self, filename: Union[str, Path]) -> "Dataset": """Save Dataset to a binary file. .. note:: @@ -1916,7 +1966,7 @@ def update(): raise LightGBMError(_LIB.LGBM_GetLastError().decode('utf-8')) return self - def _reverse_update_params(self): + def _reverse_update_params(self) -> "Dataset": if self.handle is None: self.params = deepcopy(self.params_back_up) self.params_back_up = None @@ -1981,7 +2031,7 @@ def set_field(self, field_name, data): self.version += 1 return self - def get_field(self, field_name): + def get_field(self, field_name: str) -> Optional[np.ndarray]: """Get property from the Dataset. Parameters @@ -2024,7 +2074,10 @@ def get_field(self, field_name): arr = arr.reshape((num_data, num_classes), order='F') return arr - def set_categorical_feature(self, categorical_feature): + def set_categorical_feature( + self, + categorical_feature: Union[List[int], List[str]] + ) -> "Dataset": """Set categorical features. 
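[Reviewer aside on the new Dataset.feature_num_bin() above: it accepts either a feature index or a feature name, and requires a constructed Dataset. A hedged usage sketch with synthetic data and the auto-generated column names:]

    import numpy as np
    import lightgbm as lgb

    X = np.random.default_rng(0).random((100, 2))
    y = np.random.default_rng(1).integers(0, 2, size=100)
    ds = lgb.Dataset(X, label=y).construct()  # must be constructed first
    print(ds.feature_num_bin(0))             # look up by feature index
    print(ds.feature_num_bin("Column_0"))    # look up by auto-generated name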
Parameters @@ -2102,7 +2155,7 @@ def set_reference(self, reference): raise LightGBMError("Cannot set reference after freed raw data, " "set free_raw_data=False when construct Dataset to avoid this.") - def set_feature_name(self, feature_name): + def set_feature_name(self, feature_name: List[str]) -> "Dataset": """Set feature name. Parameters @@ -2153,7 +2206,7 @@ def set_weight(self, weight): Parameters ---------- weight : list, numpy 1-D array, pandas Series or None - Weight to be set for each data point. + Weight to be set for each data point. Weights should be non-negative. Returns ------- @@ -2211,7 +2264,7 @@ def set_group(self, group): self.set_field('group', group) return self - def get_feature_name(self): + def get_feature_name(self) -> List[str]: """Get the names of columns (features) in the Dataset. Returns @@ -2268,7 +2321,7 @@ def get_weight(self): Returns ------- weight : numpy array or None - Weight for each data point from the Dataset. + Weight for each data point from the Dataset. Weights should be non-negative. """ if self.weight is None: self.weight = self.get_field('weight') @@ -2337,7 +2390,7 @@ def get_group(self): self.group = np.diff(self.group) return self.group - def num_data(self): + def num_data(self) -> int: """Get the number of rows in the Dataset. Returns @@ -2353,7 +2406,7 @@ def num_data(self): else: raise LightGBMError("Cannot get num_data before construct dataset") - def num_feature(self): + def num_feature(self) -> int: """Get the number of columns (features) in the Dataset. Returns @@ -2369,6 +2422,30 @@ def num_feature(self): else: raise LightGBMError("Cannot get num_feature before construct dataset") + def feature_num_bin(self, feature: Union[int, str]) -> int: + """Get the number of bins for a feature. + + Parameters + ---------- + feature : int or str + Index or name of the feature. + + Returns + ------- + number_of_bins : int + The number of constructed bins for the feature in the Dataset. + """ + if self.handle is not None: + if isinstance(feature, str): + feature = self.feature_name.index(feature) + ret = ctypes.c_int(0) + _safe_call(_LIB.LGBM_DatasetGetFeatureNumBin(self.handle, + ctypes.c_int(feature), + ctypes.byref(ret))) + return ret.value + else: + raise LightGBMError("Cannot get feature_num_bin before construct dataset") + def get_ref_chain(self, ref_limit=100): """Get a chain of Dataset objects. @@ -2399,7 +2476,7 @@ def get_ref_chain(self, ref_limit=100): break return ref_chain - def add_features_from(self, other): + def add_features_from(self, other: "Dataset") -> "Dataset": """Add features from other Dataset to the current Dataset. Both Datasets must be constructed before calling this method. @@ -2488,7 +2565,7 @@ def add_features_from(self, other): self.pandas_categorical = None return self - def _dump_text(self, filename): + def _dump_text(self, filename: Union[str, Path]) -> "Dataset": """Save Dataset to a text file. This format cannot be loaded back in by LightGBM, but is useful for debugging purposes. @@ -2509,10 +2586,22 @@ def _dump_text(self, filename): return self +_LGBM_CustomObjectiveFunction = Callable[ + [np.ndarray, Dataset], + Tuple[np.ndarray, np.ndarray] +] + + class Booster: """Booster in LightGBM.""" - def __init__(self, params=None, train_set=None, model_file=None, model_str=None): + def __init__( + self, + params: Optional[Dict[str, Any]] = None, + train_set: Optional[Dataset] = None, + model_file: Optional[Union[str, Path]] = None, + model_str: Optional[str] = None + ): """Initialize the Booster. 
Parameters @@ -2530,7 +2619,6 @@ def __init__(self, params=None, train_set=None, model_file=None, model_str=None) self.network = False self.__need_reload_eval_info = True self._train_data_name = "training" - self.__attr = {} self.__set_objective_to_none = False self.best_iteration = -1 self.best_score = {} @@ -2625,7 +2713,7 @@ def __init__(self, params=None, train_set=None, model_file=None, model_str=None) 'to create Booster instance') self.params = params - def __del__(self): + def __del__(self) -> None: try: if self.network: self.free_network() @@ -2637,10 +2725,10 @@ def __del__(self): except AttributeError: pass - def __copy__(self): + def __copy__(self) -> "Booster": return self.__deepcopy__(None) - def __deepcopy__(self, _): + def __deepcopy__(self, _) -> "Booster": model_str = self.model_to_string(num_iteration=-1) booster = Booster(model_str=model_str) return booster @@ -2666,7 +2754,7 @@ def __setstate__(self, state): state['handle'] = handle self.__dict__.update(state) - def free_dataset(self): + def free_dataset(self) -> "Booster": """Free Booster's Datasets. Returns @@ -2679,7 +2767,7 @@ def free_dataset(self): self.__num_dataset = 0 return self - def _free_buffer(self): + def _free_buffer(self) -> "Booster": self.__inner_predict_buffer = [] self.__is_predicted_cur_iter = [] return self @@ -2718,7 +2806,7 @@ def set_network( self.network = True return self - def free_network(self): + def free_network(self) -> "Booster": """Free Booster's network. Returns @@ -2730,7 +2818,7 @@ def free_network(self): self.network = False return self - def trees_to_dataframe(self): + def trees_to_dataframe(self) -> pd_DataFrame: """Parse the fitted model and return in an easy-to-read pandas DataFrame. The returned DataFrame has the following columns. @@ -2750,7 +2838,7 @@ def trees_to_dataframe(self): - ``missing_direction`` : str, split direction that missing values should go to. ``None`` for leaf nodes. - ``missing_type`` : str, describes what types of values are treated as missing. - ``value`` : float64, predicted value for this leaf node, multiplied by the learning rate. - - ``weight`` : float64 or int64, sum of hessian (second-order derivative of objective), summed over observations that fall in this node. + - ``weight`` : float64 or int64, sum of Hessian (second-order derivative of objective), summed over observations that fall in this node. - ``count`` : int64, number of records in the training data that fall into this node. Returns @@ -2866,7 +2954,7 @@ def tree_dict_to_node_list(tree, node_depth=1, tree_index=None, return pd_DataFrame(model_list, columns=model_list[0].keys()) - def set_train_data_name(self, name): + def set_train_data_name(self, name: str) -> "Booster": """Set the name to the training Dataset. Parameters @@ -2882,7 +2970,7 @@ def set_train_data_name(self, name): self._train_data_name = name return self - def add_valid(self, data, name): + def add_valid(self, data: Dataset, name: str) -> "Booster": """Add validation data. Parameters @@ -2912,7 +3000,7 @@ def add_valid(self, data, name): self.__is_predicted_cur_iter.append(False) return self - def reset_parameter(self, params): + def reset_parameter(self, params: Dict[str, Any]) -> "Booster": """Reset parameters of Booster. 
Parameters @@ -2933,7 +3021,11 @@ def reset_parameter(self, params): self.params.update(params) return self - def update(self, train_set=None, fobj=None): + def update( + self, + train_set: Optional[Dataset] = None, + fobj: Optional[_LGBM_CustomObjectiveFunction] = None + ) -> bool: """Update Booster for one iteration. Parameters @@ -2946,22 +3038,21 @@ def update(self, train_set=None, fobj=None): Should accept two parameters: preds, train_data, and return (grad, hess). - preds : numpy 1-D array + preds : numpy 1-D array or numpy 2-D array (for multi-class task) The predicted values. Predicted values are returned before any transformation, e.g. they are raw margin instead of probability of positive class for binary task. train_data : Dataset The training dataset. - grad : list, numpy 1-D array or pandas Series + grad : numpy 1-D array or numpy 2-D array (for multi-class task) The value of the first order derivative (gradient) of the loss with respect to the elements of preds for each sample point. - hess : list, numpy 1-D array or pandas Series + hess : numpy 1-D array or numpy 2-D array (for multi-class task) The value of the second order derivative (Hessian) of the loss with respect to the elements of preds for each sample point. - For multi-class task, the preds is group by class_id first, then group by row_id. - If you want to get i-th row preds in j-th class, the access way is score[j * num_data + i] - and you should group grad and hess in this way as well. + For multi-class task, preds are numpy 2-D array of shape = [n_samples, n_classes], + and grad and hess should be returned in the same format. Returns ------- @@ -3001,23 +3092,26 @@ def update(self, train_set=None, fobj=None): grad, hess = fobj(self.__inner_predict(0), self.train_set) return self.__boost(grad, hess) - def __boost(self, grad, hess): + def __boost( + self, + grad: np.ndarray, + hess: np.ndarray + ) -> bool: """Boost Booster for one iteration with customized gradient statistics. .. note:: Score is returned before any transformation, e.g. it is raw margin instead of probability of positive class for binary task. - For multi-class task, the score is group by class_id first, then group by row_id. - If you want to get i-th row score in j-th class, the access way is score[j * num_data + i] - and you should group grad and hess in this way as well. + For multi-class task, score are numpy 2-D array of shape = [n_samples, n_classes], + and grad and hess should be returned in the same format. Parameters ---------- - grad : list, numpy 1-D array or pandas Series + grad : numpy 1-D array or numpy 2-D array (for multi-class task) The value of the first order derivative (gradient) of the loss with respect to the elements of score for each sample point. - hess : list, numpy 1-D array or pandas Series + hess : numpy 1-D array or numpy 2-D array (for multi-class task) The value of the second order derivative (Hessian) of the loss with respect to the elements of score for each sample point. @@ -3026,12 +3120,22 @@ def __boost(self, grad, hess): is_finished : bool Whether the boost was successfully finished. 
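To make the new 2-D contract above concrete, here is a minimal sketch of a custom multi-class objective that returns grad and hess with shape = [n_samples, n_classes]; the softmax cross-entropy math and the diagonal Hessian approximation are illustrative choices, not part of this patch.

import numpy as np
import lightgbm as lgb

def softmax_objective(preds, train_data):
    # under the new contract, preds for a multi-class task arrive as a
    # 2-D array of raw scores: shape = [n_samples, n_classes]
    labels = train_data.get_label().astype(int)
    # row-wise softmax of the raw scores
    shifted = np.exp(preds - preds.max(axis=1, keepdims=True))
    prob = shifted / shifted.sum(axis=1, keepdims=True)
    onehot = np.zeros_like(prob)
    onehot[np.arange(labels.size), labels] = 1.0
    grad = prob - onehot        # same 2-D shape as preds
    hess = prob * (1.0 - prob)  # diagonal Hessian approximation, also 2-D
    return grad, hess

As the ``__boost`` hunk below shows, the booster ravels these arrays in Fortran order internally, so returning them in the documented 2-D layout is all a user has to do; under this patch the same callable can also be passed directly as ``params["objective"]`` to ``train()``.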
""" + if self.__num_class > 1: + grad = grad.ravel(order='F') + hess = hess.ravel(order='F') grad = list_to_1d_numpy(grad, name='gradient') hess = list_to_1d_numpy(hess, name='hessian') assert grad.flags.c_contiguous assert hess.flags.c_contiguous if len(grad) != len(hess): - raise ValueError(f"Lengths of gradient({len(grad)}) and hessian({len(hess)}) don't match") + raise ValueError(f"Lengths of gradient ({len(grad)}) and Hessian ({len(hess)}) don't match") + num_train_data = self.train_set.num_data() + if len(grad) != num_train_data * self.__num_class: + raise ValueError( + f"Lengths of gradient ({len(grad)}) and Hessian ({len(hess)}) " + f"don't match training data length ({num_train_data}) * " + f"number of models per one iteration ({self.__num_class})" + ) is_finished = ctypes.c_int(0) _safe_call(_LIB.LGBM_BoosterUpdateOneIterCustom( self.handle, @@ -3041,7 +3145,7 @@ def __boost(self, grad, hess): self.__is_predicted_cur_iter = [False for _ in range(self.__num_dataset)] return is_finished.value == 1 - def rollback_one_iter(self): + def rollback_one_iter(self) -> "Booster": """Rollback one iteration. Returns @@ -3054,7 +3158,7 @@ def rollback_one_iter(self): self.__is_predicted_cur_iter = [False for _ in range(self.__num_dataset)] return self - def current_iteration(self): + def current_iteration(self) -> int: """Get the index of the current iteration. Returns @@ -3068,7 +3172,7 @@ def current_iteration(self): ctypes.byref(out_cur_iter))) return out_cur_iter.value - def num_model_per_iteration(self): + def num_model_per_iteration(self) -> int: """Get number of models per iteration. Returns @@ -3082,7 +3186,7 @@ def num_model_per_iteration(self): ctypes.byref(model_per_iter))) return model_per_iter.value - def num_trees(self): + def num_trees(self) -> int: """Get number of weak sub-models. Returns @@ -3096,12 +3200,12 @@ def num_trees(self): ctypes.byref(num_trees))) return num_trees.value - def upper_bound(self): + def upper_bound(self) -> float: """Get upper bound value of a model. Returns ------- - upper_bound : double + upper_bound : float Upper bound value of the model. """ ret = ctypes.c_double(0) @@ -3110,12 +3214,12 @@ def upper_bound(self): ctypes.byref(ret))) return ret.value - def lower_bound(self): + def lower_bound(self) -> float: """Get lower bound value of a model. Returns ------- - lower_bound : double + lower_bound : float Lower bound value of the model. """ ret = ctypes.c_double(0) @@ -3133,17 +3237,18 @@ def eval(self, data, name, feval=None): Data for the evaluating. name : str Name of the data. - feval : callable or None, optional (default=None) + feval : callable, list of callable, or None, optional (default=None) Customized evaluation function. - Should accept two parameters: preds, eval_data, + Each evaluation function should accept two parameters: preds, eval_data, and return (eval_name, eval_result, is_higher_better) or list of such tuples. - preds : numpy 1-D array + preds : numpy 1-D array or numpy 2-D array (for multi-class task) The predicted values. - If ``fobj`` is specified, predicted values are returned before any transformation, + For multi-class task, preds are numpy 2-D array of shape = [n_samples, n_classes]. + If custom objective function is used, predicted values are returned before any transformation, e.g. they are raw margin instead of probability of positive class for binary task in this case. eval_data : Dataset - The evaluation dataset. + A ``Dataset`` to evaluate. eval_name : str The name of evaluation function (without whitespace). 
eval_result : float @@ -3151,9 +3256,6 @@ def eval(self, data, name, feval=None): is_higher_better : bool Is eval result higher better, e.g. AUC is ``is_higher_better``. - For multi-class task, the preds is group by class_id first, then group by row_id. - If you want to get i-th row preds in j-th class, the access way is preds[j * num_data + i]. - Returns ------- result : list @@ -3181,16 +3283,17 @@ def eval_train(self, feval=None): Parameters ---------- - feval : callable or None, optional (default=None) + feval : callable, list of callable, or None, optional (default=None) Customized evaluation function. - Should accept two parameters: preds, train_data, + Each evaluation function should accept two parameters: preds, eval_data, and return (eval_name, eval_result, is_higher_better) or list of such tuples. - preds : numpy 1-D array + preds : numpy 1-D array or numpy 2-D array (for multi-class task) The predicted values. - If ``fobj`` is specified, predicted values are returned before any transformation, + For multi-class task, preds are numpy 2-D array of shape = [n_samples, n_classes]. + If custom objective function is used, predicted values are returned before any transformation, e.g. they are raw margin instead of probability of positive class for binary task in this case. - train_data : Dataset + eval_data : Dataset The training dataset. eval_name : str The name of evaluation function (without whitespace). @@ -3199,9 +3302,6 @@ def eval_train(self, feval=None): is_higher_better : bool Is eval result higher better, e.g. AUC is ``is_higher_better``. - For multi-class task, the preds is group by class_id first, then group by row_id. - If you want to get i-th row preds in j-th class, the access way is preds[j * num_data + i]. - Returns ------- result : list @@ -3214,16 +3314,17 @@ def eval_valid(self, feval=None): Parameters ---------- - feval : callable or None, optional (default=None) + feval : callable, list of callable, or None, optional (default=None) Customized evaluation function. - Should accept two parameters: preds, valid_data, + Each evaluation function should accept two parameters: preds, eval_data, and return (eval_name, eval_result, is_higher_better) or list of such tuples. - preds : numpy 1-D array + preds : numpy 1-D array or numpy 2-D array (for multi-class task) The predicted values. - If ``fobj`` is specified, predicted values are returned before any transformation, + For multi-class task, preds are numpy 2-D array of shape = [n_samples, n_classes]. + If custom objective function is used, predicted values are returned before any transformation, e.g. they are raw margin instead of probability of positive class for binary task in this case. - valid_data : Dataset + eval_data : Dataset The validation dataset. eval_name : str The name of evaluation function (without whitespace). @@ -3232,9 +3333,6 @@ def eval_valid(self, feval=None): is_higher_better : bool Is eval result higher better, e.g. AUC is ``is_higher_better``. - For multi-class task, the preds is group by class_id first, then group by row_id. - If you want to get i-th row preds in j-th class, the access way is preds[j * num_data + i]. - Returns ------- result : list @@ -3300,7 +3398,7 @@ def shuffle_models(self, start_iteration=0, end_iteration=-1): ctypes.c_int(end_iteration))) return self - def model_from_string(self, model_str): + def model_from_string(self, model_str: str) -> "Booster": """Load Booster from a string. 
Parameters @@ -3446,9 +3544,18 @@ def dump_model(self, num_iteration=None, start_iteration=0, importance_type='spl default=json_default_with_numpy)) return ret - def predict(self, data, start_iteration=0, num_iteration=None, - raw_score=False, pred_leaf=False, pred_contrib=False, - data_has_header=False, is_reshape=True, **kwargs): + def predict( + self, + data, + start_iteration: int = 0, + num_iteration: Optional[int] = None, + raw_score: bool = False, + pred_leaf: bool = False, + pred_contrib: bool = False, + data_has_header: bool = False, + validate_features: bool = False, + **kwargs: Any + ): """Make a prediction. Parameters @@ -3482,8 +3589,9 @@ def predict(self, data, start_iteration=0, num_iteration=None, data_has_header : bool, optional (default=False) Whether the data has header. Used only if data is str. - is_reshape : bool, optional (default=True) - If True, result is reshaped to [nrow, ncol]. + validate_features : bool, optional (default=False) + If True, ensure that the features used to predict match the ones used to train. + Used only if data is pandas DataFrame. **kwargs Other parameters for the prediction. @@ -3501,9 +3609,24 @@ def predict(self, data, start_iteration=0, num_iteration=None, num_iteration = -1 return predictor.predict(data, start_iteration, num_iteration, raw_score, pred_leaf, pred_contrib, - data_has_header, is_reshape) + data_has_header, validate_features) - def refit(self, data, label, decay_rate=0.9, **kwargs): + def refit( + self, + data, + label, + decay_rate=0.9, + reference=None, + weight=None, + group=None, + init_score=None, + feature_name='auto', + categorical_feature='auto', + dataset_params=None, + free_raw_data=True, + validate_features=False, + **kwargs + ): """Refit the existing Booster by new data. Parameters @@ -3516,6 +3639,38 @@ def refit(self, data, label, decay_rate=0.9, **kwargs): decay_rate : float, optional (default=0.9) Decay rate of refit, will use ``leaf_output = decay_rate * old_leaf_output + (1.0 - decay_rate) * new_leaf_output`` to refit trees. + reference : Dataset or None, optional (default=None) + Reference for ``data``. + weight : list, numpy 1-D array, pandas Series or None, optional (default=None) + Weight for each ``data`` instance. Weights should be non-negative. + group : list, numpy 1-D array, pandas Series or None, optional (default=None) + Group/query size for ``data``. + Only used in the learning-to-rank task. + sum(group) = n_samples. + For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, + where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc. + init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), or None, optional (default=None) + Init score for ``data``. + feature_name : list of str, or 'auto', optional (default="auto") + Feature names for ``data``. + If 'auto' and data is pandas DataFrame, data columns names are used. + categorical_feature : list of str or int, or 'auto', optional (default="auto") + Categorical features for ``data``. + If list of int, interpreted as indices. + If list of str, interpreted as feature names (need to specify ``feature_name`` as well). + If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used. + All values in categorical features will be cast to int32 and thus should be less than int32 max value (2147483647). 
+ Large values could be memory consuming. Consider using consecutive integers starting from zero. + All negative values in categorical features will be treated as missing values. + The output cannot be monotonically constrained with respect to a categorical feature. + Floating point numbers in categorical features will be rounded towards 0. + dataset_params : dict or None, optional (default=None) + Other parameters for Dataset ``data``. + free_raw_data : bool, optional (default=True) + If True, raw data is freed after constructing inner Dataset for ``data``. + validate_features : bool, optional (default=False) + If True, ensure that the features used to refit the model match the original ones. + Used only if data is pandas DataFrame. **kwargs Other parameters for refit. These parameters will be passed to ``predict`` method. @@ -3527,8 +3682,10 @@ def refit(self, data, label, decay_rate=0.9, **kwargs): """ if self.__set_objective_to_none: raise LightGBMError('Cannot refit due to null objective function.') + if dataset_params is None: + dataset_params = {} predictor = self._to_predictor(deepcopy(kwargs)) - leaf_preds = predictor.predict(data, -1, pred_leaf=True) + leaf_preds = predictor.predict(data, -1, pred_leaf=True, validate_features=validate_features) nrow, ncol = leaf_preds.shape out_is_linear = ctypes.c_int(0) _safe_call(_LIB.LGBM_BoosterGetLinear( @@ -3540,7 +3697,19 @@ def refit(self, data, label, decay_rate=0.9, **kwargs): default_value=None ) new_params["linear_tree"] = bool(out_is_linear.value) - train_set = Dataset(data, label, params=new_params) + new_params.update(dataset_params) + train_set = Dataset( + data=data, + label=label, + reference=reference, + weight=weight, + group=group, + init_score=init_score, + feature_name=feature_name, + categorical_feature=categorical_feature, + params=new_params, + free_raw_data=free_raw_data, + ) new_params['refit_decay_rate'] = decay_rate new_booster = Booster(new_params, train_set) # Copy models @@ -3555,10 +3724,9 @@ def refit(self, data, label, decay_rate=0.9, **kwargs): ctypes.c_int32(nrow), ctypes.c_int32(ncol))) new_booster.network = self.network - new_booster.__attr = self.__attr.copy() return new_booster - def get_leaf_output(self, tree_id, leaf_id): + def get_leaf_output(self, tree_id: int, leaf_id: int) -> float: """Get the output of a leaf. Parameters @@ -3587,7 +3755,7 @@ def _to_predictor(self, pred_parameter=None): predictor.pandas_categorical = self.pandas_categorical return predictor - def num_feature(self): + def num_feature(self) -> int: """Get number of features. Returns @@ -3601,7 +3769,7 @@ def num_feature(self): ctypes.byref(out_num_feature))) return out_num_feature.value - def feature_name(self): + def feature_name(self) -> List[str]: """Get names of features. Returns @@ -3639,7 +3807,11 @@ def feature_name(self): ptr_string_buffers)) return [string_buffers[i].value.decode('utf-8') for i in range(num_feature)] - def feature_importance(self, importance_type='split', iteration=None): + def feature_importance( + self, + importance_type: str = 'split', + iteration: Optional[int] = None + ) -> np.ndarray: """Get feature importances. 
Parameters @@ -3667,7 +3839,7 @@ def feature_importance(self, importance_type='split', iteration=None): ctypes.c_int(iteration), ctypes.c_int(importance_type_int), result.ctypes.data_as(ctypes.POINTER(ctypes.c_double)))) - if importance_type_int == 0: + if importance_type_int == C_API_FEATURE_IMPORTANCE_SPLIT: return result.astype(np.int32) else: return result @@ -3779,7 +3951,7 @@ def __inner_eval(self, data_name, data_idx, feval=None): ret.append((data_name, eval_name, val, is_higher_better)) return ret - def __inner_predict(self, data_idx): + def __inner_predict(self, data_idx: int): """Predict for training and validation dataset.""" if data_idx >= self.__num_dataset: raise ValueError("Data_idx should be smaller than number of dataset") @@ -3801,9 +3973,13 @@ def __inner_predict(self, data_idx): if tmp_out_len.value != len(self.__inner_predict_buffer[data_idx]): raise ValueError(f"Wrong length of predict results for data {data_idx}") self.__is_predicted_cur_iter[data_idx] = True - return self.__inner_predict_buffer[data_idx] + result = self.__inner_predict_buffer[data_idx] + if self.__num_class > 1: + num_data = result.size // self.__num_class + result = result.reshape(num_data, self.__num_class, order='F') + return result - def __get_eval_info(self): + def __get_eval_info(self) -> None: """Get inner evaluation count and names.""" if self.__need_reload_eval_info: self.__need_reload_eval_info = False @@ -3851,42 +4027,3 @@ def __get_eval_info(self): self.__higher_better_inner_eval = [ name.startswith(('auc', 'ndcg@', 'map@', 'average_precision')) for name in self.__name_inner_eval ] - - def attr(self, key): - """Get attribute string from the Booster. - - Parameters - ---------- - key : str - The name of the attribute. - - Returns - ------- - value : str or None - The attribute value. - Returns None if attribute does not exist. - """ - return self.__attr.get(key, None) - - def set_attr(self, **kwargs): - """Set attributes to the Booster. - - Parameters - ---------- - **kwargs - The attributes to set. - Setting a value to None deletes an attribute. - - Returns - ------- - self : Booster - Booster with set attributes. 
- """ - for key, value in kwargs.items(): - if value is not None: - if not isinstance(value, str): - raise ValueError("Only string values are accepted") - self.__attr[key] = value - else: - self.__attr.pop(key, None) - return self diff --git a/python-package/lightgbm/callback.py b/python-package/lightgbm/callback.py index 45e21f298480..05539b6396ac 100644 --- a/python-package/lightgbm/callback.py +++ b/python-package/lightgbm/callback.py @@ -12,14 +12,6 @@ ] -def _gt_delta(curr_score: float, best_score: float, delta: float) -> bool: - return curr_score > best_score + delta - - -def _lt_delta(curr_score: float, best_score: float, delta: float) -> bool: - return curr_score < best_score - delta - - class EarlyStopException(Exception): """Exception of early stopping.""" @@ -62,7 +54,23 @@ def _format_eval_result(value: _EvalResultTuple, show_stdv: bool = True) -> str: raise ValueError("Wrong metric value") -def log_evaluation(period: int = 1, show_stdv: bool = True) -> Callable: +class _LogEvaluationCallback: + """Internal log evaluation callable class.""" + + def __init__(self, period: int = 1, show_stdv: bool = True) -> None: + self.order = 10 + self.before_iteration = False + + self.period = period + self.show_stdv = show_stdv + + def __call__(self, env: CallbackEnv) -> None: + if self.period > 0 and env.evaluation_result_list and (env.iteration + 1) % self.period == 0: + result = '\t'.join([_format_eval_result(x, self.show_stdv) for x in env.evaluation_result_list]) + _log_info(f'[{env.iteration + 1}]\t{result}') + + +def log_evaluation(period: int = 1, show_stdv: bool = True) -> _LogEvaluationCallback: """Create a callback that logs the evaluation results. By default, standard output resource is used. @@ -82,15 +90,50 @@ def log_evaluation(period: int = 1, show_stdv: bool = True) -> Callable: Returns ------- - callback : callable + callback : _LogEvaluationCallback The callback that logs the evaluation results every ``period`` boosting iteration(s). 
""" - def _callback(env: CallbackEnv) -> None: - if period > 0 and env.evaluation_result_list and (env.iteration + 1) % period == 0: - result = '\t'.join([_format_eval_result(x, show_stdv) for x in env.evaluation_result_list]) - _log_info(f'[{env.iteration + 1}]\t{result}') - _callback.order = 10 # type: ignore - return _callback + return _LogEvaluationCallback(period=period, show_stdv=show_stdv) + + +class _RecordEvaluationCallback: + """Internal record evaluation callable class.""" + + def __init__(self, eval_result: Dict[str, Dict[str, List[Any]]]) -> None: + self.order = 20 + self.before_iteration = False + + if not isinstance(eval_result, dict): + raise TypeError('eval_result should be a dictionary') + self.eval_result = eval_result + + def _init(self, env: CallbackEnv) -> None: + self.eval_result.clear() + for item in env.evaluation_result_list: + if len(item) == 4: # regular train + data_name, eval_name = item[:2] + else: # cv + data_name, eval_name = item[1].split() + self.eval_result.setdefault(data_name, collections.OrderedDict()) + if len(item) == 4: + self.eval_result[data_name].setdefault(eval_name, []) + else: + self.eval_result[data_name].setdefault(f'{eval_name}-mean', []) + self.eval_result[data_name].setdefault(f'{eval_name}-stdv', []) + + def __call__(self, env: CallbackEnv) -> None: + if env.iteration == env.begin_iteration: + self._init(env) + for item in env.evaluation_result_list: + if len(item) == 4: + data_name, eval_name, result = item[:3] + self.eval_result[data_name][eval_name].append(result) + else: + data_name, eval_name = item[1].split() + res_mean = item[2] + res_stdv = item[4] + self.eval_result[data_name][f'{eval_name}-mean'].append(res_mean) + self.eval_result[data_name][f'{eval_name}-stdv'].append(res_stdv) def record_evaluation(eval_result: Dict[str, Dict[str, List[Any]]]) -> Callable: @@ -123,214 +166,223 @@ def record_evaluation(eval_result: Dict[str, Dict[str, List[Any]]]) -> Callable: Returns ------- - callback : callable + callback : _RecordEvaluationCallback The callback that records the evaluation history into the passed dictionary. """ - if not isinstance(eval_result, dict): - raise TypeError('eval_result should be a dictionary') + return _RecordEvaluationCallback(eval_result=eval_result) - def _init(env: CallbackEnv) -> None: - eval_result.clear() - for data_name, eval_name, _, _ in env.evaluation_result_list: - eval_result.setdefault(data_name, collections.OrderedDict()) - eval_result[data_name].setdefault(eval_name, []) - def _callback(env: CallbackEnv) -> None: - if env.iteration == env.begin_iteration: - _init(env) - for data_name, eval_name, result, _ in env.evaluation_result_list: - eval_result[data_name][eval_name].append(result) - _callback.order = 20 # type: ignore - return _callback +class _ResetParameterCallback: + """Internal reset parameter callable class.""" + def __init__(self, **kwargs: Union[list, Callable]) -> None: + self.order = 10 + self.before_iteration = True -def reset_parameter(**kwargs: Union[list, Callable]) -> Callable: - """Create a callback that resets the parameter after the first iteration. + self.kwargs = kwargs - .. note:: - - The initial parameter will still take in-effect on first iteration. - - Parameters - ---------- - **kwargs : value should be list or callable - List of parameters for each boosting round - or a callable that calculates the parameter in terms of - current number of round (e.g. yields learning rate decay). - If list lst, parameter = lst[current_round]. 
- If callable func, parameter = func(current_round). - - Returns - ------- - callback : callable - The callback that resets the parameter after the first iteration. - """ - def _callback(env: CallbackEnv) -> None: + def __call__(self, env: CallbackEnv) -> None: new_parameters = {} - for key, value in kwargs.items(): + for key, value in self.kwargs.items(): if isinstance(value, list): if len(value) != env.end_iteration - env.begin_iteration: - raise ValueError(f"Length of list {key!r} has to equal to 'num_boost_round'.") + raise ValueError(f"Length of list {key!r} has to be equal to 'num_boost_round'.") new_param = value[env.iteration - env.begin_iteration] - else: + elif callable(value): new_param = value(env.iteration - env.begin_iteration) + else: + raise ValueError("Only list and callable values are supported " + "as a mapping from boosting round index to new parameter value.") if new_param != env.params.get(key, None): new_parameters[key] = new_param if new_parameters: env.model.reset_parameter(new_parameters) env.params.update(new_parameters) - _callback.before_iteration = True # type: ignore - _callback.order = 10 # type: ignore - return _callback -def early_stopping(stopping_rounds: int, first_metric_only: bool = False, verbose: bool = True, min_delta: Union[float, List[float]] = 0.0) -> Callable: - """Create a callback that activates early stopping. +def reset_parameter(**kwargs: Union[list, Callable]) -> Callable: + """Create a callback that resets the parameter after the first iteration. - Activates early stopping. - The model will train until the validation score doesn't improve by at least ``min_delta``. - Validation score needs to improve at least every ``stopping_rounds`` round(s) - to continue training. - Requires at least one validation data and one metric. - If there's more than one, will check all of them. But the training data is ignored anyway. - To check only the first metric set ``first_metric_only`` to True. - The index of iteration that has the best performance will be saved in the ``best_iteration`` attribute of a model. + .. note:: + + The initial parameter will still take effect on the first iteration. Parameters ---------- - stopping_rounds : int - The possible number of rounds without the trend occurrence. - first_metric_only : bool, optional (default=False) - Whether to use only the first metric for early stopping. - verbose : bool, optional (default=True) - Whether to log message with early stopping information. - By default, standard output resource is used. - Use ``register_logger()`` function to register a custom logger. - min_delta : float or list of float, optional (default=0.0) - Minimum improvement in score to keep training. - If float, this single value is used for all metrics. - If list, its length should match the total number of metrics. + **kwargs : value should be list or callable + List of parameters for each boosting round + or a callable that calculates the parameter in terms of + current number of round (e.g. yields learning rate decay). + If list lst, parameter = lst[current_round]. + If callable func, parameter = func(current_round). Returns ------- - callback : callable - The callback that activates early stopping. + callback : _ResetParameterCallback + The callback that resets the parameter after the first iteration.
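A usage sketch for the callable form, implementing a simple learning-rate decay (toy data and the decay schedule are illustrative):

import numpy as np
import lightgbm as lgb

X = np.random.rand(100, 5)
y = np.random.rand(100)
train_ds = lgb.Dataset(X, label=y)

booster = lgb.train(
    {"objective": "regression", "learning_rate": 0.1},
    train_ds,
    num_boost_round=50,
    # the callable receives the zero-based round index and returns the new value
    callbacks=[lgb.reset_parameter(learning_rate=lambda i: 0.1 * (0.99 ** i))],
)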
""" - best_score = [] - best_iter = [] - best_score_list: list = [] - cmp_op = [] - enabled = True - first_metric = '' - - def _init(env: CallbackEnv) -> None: - nonlocal best_score - nonlocal best_iter - nonlocal best_score_list - nonlocal cmp_op - nonlocal enabled - nonlocal first_metric - enabled = not any(env.params.get(boost_alias, "") == 'dart' for boost_alias - in _ConfigAliases.get("boosting")) - if not enabled: + return _ResetParameterCallback(**kwargs) + + +class _EarlyStoppingCallback: + """Internal early stopping callable class.""" + + def __init__( + self, + stopping_rounds: int, + first_metric_only: bool = False, + verbose: bool = True, + min_delta: Union[float, List[float]] = 0.0 + ) -> None: + self.order = 30 + self.before_iteration = False + + self.stopping_rounds = stopping_rounds + self.first_metric_only = first_metric_only + self.verbose = verbose + self.min_delta = min_delta + + self.enabled = True + self._reset_storages() + + def _reset_storages(self) -> None: + self.best_score = [] + self.best_iter = [] + self.best_score_list = [] + self.cmp_op = [] + self.first_metric = '' + + def _gt_delta(self, curr_score: float, best_score: float, delta: float) -> bool: + return curr_score > best_score + delta + + def _lt_delta(self, curr_score: float, best_score: float, delta: float) -> bool: + return curr_score < best_score - delta + + def _init(self, env: CallbackEnv) -> None: + self.enabled = not any(env.params.get(boost_alias, "") == 'dart' for boost_alias + in _ConfigAliases.get("boosting")) + if not self.enabled: _log_warning('Early stopping is not available in dart mode') return if not env.evaluation_result_list: raise ValueError('For early stopping, ' 'at least one dataset and eval metric is required for evaluation') - if verbose: - _log_info(f"Training until validation scores don't improve for {stopping_rounds} rounds") + if self.stopping_rounds <= 0: + raise ValueError("stopping_rounds should be greater than zero.") + + if self.verbose: + _log_info(f"Training until validation scores don't improve for {self.stopping_rounds} rounds") - # reset storages - best_score = [] - best_iter = [] - best_score_list = [] - cmp_op = [] - first_metric = '' + self._reset_storages() n_metrics = len(set(m[1] for m in env.evaluation_result_list)) n_datasets = len(env.evaluation_result_list) // n_metrics - if isinstance(min_delta, list): - if not all(t >= 0 for t in min_delta): + if isinstance(self.min_delta, list): + if not all(t >= 0 for t in self.min_delta): raise ValueError('Values for early stopping min_delta must be non-negative.') - if len(min_delta) == 0: - if verbose: + if len(self.min_delta) == 0: + if self.verbose: _log_info('Disabling min_delta for early stopping.') deltas = [0.0] * n_datasets * n_metrics - elif len(min_delta) == 1: - if verbose: - _log_info(f'Using {min_delta[0]} as min_delta for all metrics.') - deltas = min_delta * n_datasets * n_metrics + elif len(self.min_delta) == 1: + if self.verbose: + _log_info(f'Using {self.min_delta[0]} as min_delta for all metrics.') + deltas = self.min_delta * n_datasets * n_metrics else: - if len(min_delta) != n_metrics: + if len(self.min_delta) != n_metrics: raise ValueError('Must provide a single value for min_delta or as many as metrics.') - if first_metric_only and verbose: - _log_info(f'Using only {min_delta[0]} as early stopping min_delta.') - deltas = min_delta * n_datasets + if self.first_metric_only and self.verbose: + _log_info(f'Using only {self.min_delta[0]} as early stopping min_delta.') + deltas = self.min_delta * 
n_datasets else: - if min_delta < 0: + if self.min_delta < 0: raise ValueError('Early stopping min_delta must be non-negative.') - if min_delta > 0 and n_metrics > 1 and not first_metric_only and verbose: - _log_info(f'Using {min_delta} as min_delta for all metrics.') - deltas = [min_delta] * n_datasets * n_metrics + if self.min_delta > 0 and n_metrics > 1 and not self.first_metric_only and self.verbose: + _log_info(f'Using {self.min_delta} as min_delta for all metrics.') + deltas = [self.min_delta] * n_datasets * n_metrics # split is needed for " " case (e.g. "train l1") - first_metric = env.evaluation_result_list[0][1].split(" ")[-1] + self.first_metric = env.evaluation_result_list[0][1].split(" ")[-1] for eval_ret, delta in zip(env.evaluation_result_list, deltas): - best_iter.append(0) - best_score_list.append(None) + self.best_iter.append(0) + self.best_score_list.append(None) if eval_ret[3]: # greater is better - best_score.append(float('-inf')) - cmp_op.append(partial(_gt_delta, delta=delta)) + self.best_score.append(float('-inf')) + self.cmp_op.append(partial(self._gt_delta, delta=delta)) else: - best_score.append(float('inf')) - cmp_op.append(partial(_lt_delta, delta=delta)) + self.best_score.append(float('inf')) + self.cmp_op.append(partial(self._lt_delta, delta=delta)) - def _final_iteration_check(env: CallbackEnv, eval_name_splitted: List[str], i: int) -> None: - nonlocal best_iter - nonlocal best_score_list + def _final_iteration_check(self, env: CallbackEnv, eval_name_splitted: List[str], i: int) -> None: if env.iteration == env.end_iteration - 1: - if verbose: - best_score_str = '\t'.join([_format_eval_result(x) for x in best_score_list[i]]) + if self.verbose: + best_score_str = '\t'.join([_format_eval_result(x) for x in self.best_score_list[i]]) _log_info('Did not meet early stopping. ' - f'Best iteration is:\n[{best_iter[i] + 1}]\t{best_score_str}') - if first_metric_only: + f'Best iteration is:\n[{self.best_iter[i] + 1}]\t{best_score_str}') + if self.first_metric_only: _log_info(f"Evaluated only: {eval_name_splitted[-1]}") - raise EarlyStopException(best_iter[i], best_score_list[i]) - - def _callback(env: CallbackEnv) -> None: - nonlocal best_score - nonlocal best_iter - nonlocal best_score_list - nonlocal cmp_op - nonlocal enabled - nonlocal first_metric + raise EarlyStopException(self.best_iter[i], self.best_score_list[i]) + + def __call__(self, env: CallbackEnv) -> None: if env.iteration == env.begin_iteration: - _init(env) - if not enabled: + self._init(env) + if not self.enabled: return for i in range(len(env.evaluation_result_list)): score = env.evaluation_result_list[i][2] - if best_score_list[i] is None or cmp_op[i](score, best_score[i]): - best_score[i] = score - best_iter[i] = env.iteration - best_score_list[i] = env.evaluation_result_list + if self.best_score_list[i] is None or self.cmp_op[i](score, self.best_score[i]): + self.best_score[i] = score + self.best_iter[i] = env.iteration + self.best_score_list[i] = env.evaluation_result_list # split is needed for " " case (e.g. 
"train l1") eval_name_splitted = env.evaluation_result_list[i][1].split(" ") - if first_metric_only and first_metric != eval_name_splitted[-1]: + if self.first_metric_only and self.first_metric != eval_name_splitted[-1]: continue # use only the first metric for early stopping if ((env.evaluation_result_list[i][0] == "cv_agg" and eval_name_splitted[0] == "train" - or env.evaluation_result_list[i][0] == env.model._train_data_name)): - _final_iteration_check(env, eval_name_splitted, i) + or env.evaluation_result_list[i][0] == env.model._train_data_name)): + self._final_iteration_check(env, eval_name_splitted, i) continue # train data for lgb.cv or sklearn wrapper (underlying lgb.train) - elif env.iteration - best_iter[i] >= stopping_rounds: - if verbose: - eval_result_str = '\t'.join([_format_eval_result(x) for x in best_score_list[i]]) - _log_info(f"Early stopping, best iteration is:\n[{best_iter[i] + 1}]\t{eval_result_str}") - if first_metric_only: + elif env.iteration - self.best_iter[i] >= self.stopping_rounds: + if self.verbose: + eval_result_str = '\t'.join([_format_eval_result(x) for x in self.best_score_list[i]]) + _log_info(f"Early stopping, best iteration is:\n[{self.best_iter[i] + 1}]\t{eval_result_str}") + if self.first_metric_only: _log_info(f"Evaluated only: {eval_name_splitted[-1]}") - raise EarlyStopException(best_iter[i], best_score_list[i]) - _final_iteration_check(env, eval_name_splitted, i) - _callback.order = 30 # type: ignore - return _callback + raise EarlyStopException(self.best_iter[i], self.best_score_list[i]) + self._final_iteration_check(env, eval_name_splitted, i) + + +def early_stopping(stopping_rounds: int, first_metric_only: bool = False, verbose: bool = True, min_delta: Union[float, List[float]] = 0.0) -> _EarlyStoppingCallback: + """Create a callback that activates early stopping. + + Activates early stopping. + The model will train until the validation score doesn't improve by at least ``min_delta``. + Validation score needs to improve at least every ``stopping_rounds`` round(s) + to continue training. + Requires at least one validation data and one metric. + If there's more than one, will check all of them. But the training data is ignored anyway. + To check only the first metric set ``first_metric_only`` to True. + The index of iteration that has the best performance will be saved in the ``best_iteration`` attribute of a model. + + Parameters + ---------- + stopping_rounds : int + The possible number of rounds without the trend occurrence. + first_metric_only : bool, optional (default=False) + Whether to use only the first metric for early stopping. + verbose : bool, optional (default=True) + Whether to log message with early stopping information. + By default, standard output resource is used. + Use ``register_logger()`` function to register a custom logger. + min_delta : float or list of float, optional (default=0.0) + Minimum improvement in score to keep training. + If float, this single value is used for all metrics. + If list, its length should match the total number of metrics. + + Returns + ------- + callback : _EarlyStoppingCallback + The callback that activates early stopping. 
+ """ + return _EarlyStoppingCallback(stopping_rounds=stopping_rounds, first_metric_only=first_metric_only, verbose=verbose, min_delta=min_delta) diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py index fb97621806db..65c044ffc883 100644 --- a/python-package/lightgbm/compat.py +++ b/python-package/lightgbm/compat.py @@ -6,7 +6,6 @@ from pandas import DataFrame as pd_DataFrame from pandas import Series as pd_Series from pandas import concat - from pandas.api.types import is_sparse as is_dtype_sparse try: from pandas import CategoricalDtype as pd_CategoricalDtype except ImportError: @@ -34,7 +33,6 @@ def __init__(self, *args, **kwargs): pass concat = None - is_dtype_sparse = None """matplotlib""" try: @@ -77,9 +75,9 @@ def __init__(self, *args, **kwargs): from sklearn.utils.validation import assert_all_finite, check_array, check_X_y try: from sklearn.exceptions import NotFittedError - from sklearn.model_selection import GroupKFold, StratifiedKFold + from sklearn.model_selection import BaseCrossValidator, GroupKFold, StratifiedKFold except ImportError: - from sklearn.cross_validation import GroupKFold, StratifiedKFold + from sklearn.cross_validation import BaseCrossValidator, GroupKFold, StratifiedKFold from sklearn.utils.validation import NotFittedError try: from sklearn.utils.validation import _check_sample_weight @@ -92,6 +90,7 @@ def _check_sample_weight(sample_weight, X, dtype=None): return sample_weight SKLEARN_INSTALLED = True + _LGBMBaseCrossValidator = BaseCrossValidator _LGBMModelBase = BaseEstimator _LGBMRegressorBase = RegressorMixin _LGBMClassifierBase = ClassifierMixin @@ -176,3 +175,21 @@ class dask_Series: # type: ignore def __init__(self, *args, **kwargs): pass + +"""cpu_count()""" +try: + from joblib import cpu_count + + def _LGBMCpuCount(only_physical_cores: bool = True): + return cpu_count(only_physical_cores=only_physical_cores) +except ImportError: + try: + from psutil import cpu_count + + def _LGBMCpuCount(only_physical_cores: bool = True): + return cpu_count(logical=not only_physical_cores) + except ImportError: + from multiprocessing import cpu_count + + def _LGBMCpuCount(only_physical_cores: bool = True): + return cpu_count() diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py index 062422286a47..2152ac8e35cb 100644 --- a/python-package/lightgbm/dask.py +++ b/python-package/lightgbm/dask.py @@ -22,7 +22,8 @@ dask_Array, dask_array_from_delayed, dask_bag_from_delayed, dask_DataFrame, dask_Series, default_client, delayed, pd_DataFrame, pd_Series, wait) from .sklearn import (LGBMClassifier, LGBMModel, LGBMRanker, LGBMRegressor, _LGBM_ScikitCustomEvalFunction, - _lgbmmodel_doc_custom_eval_note, _lgbmmodel_doc_fit, _lgbmmodel_doc_predict) + _LGBM_ScikitCustomObjectiveFunction, _lgbmmodel_doc_custom_eval_note, _lgbmmodel_doc_fit, + _lgbmmodel_doc_predict) _DaskCollection = Union[dask_Array, dask_DataFrame, dask_Series] _DaskMatrixLike = Union[dask_Array, dask_DataFrame] @@ -423,7 +424,7 @@ def _train( model_factory : lightgbm.LGBMClassifier, lightgbm.LGBMRegressor, or lightgbm.LGBMRanker class Class of the local underlying model. sample_weight : Dask Array or Dask Series of shape = [n_samples] or None, optional (default=None) - Weights of training data. + Weights of training data. Weights should be non-negative. 
init_score : Dask Array or Dask Series of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task), or Dask Array or Dask DataFrame of shape = [n_samples, n_classes] (for multi-class task), or None, optional (default=None) Init score of training data. group : Dask Array or Dask Series or None, optional (default=None) @@ -440,7 +441,7 @@ def _train( eval_names : list of str, or None, optional (default=None) Names of eval_set. eval_sample_weight : list of Dask Array or Dask Series, or None, optional (default=None) - Weights for each validation set in eval_set. + Weights for each validation set in eval_set. Weights should be non-negative. eval_class_weight : list of dict or str, or None, optional (default=None) Class weights, one dict or str for each validation set in eval_set. eval_init_score : list of Dask Array, Dask Series or Dask DataFrame (for multi-class task), or None, optional (default=None) @@ -1099,7 +1100,7 @@ def __init__( learning_rate: float = 0.1, n_estimators: int = 100, subsample_for_bin: int = 200000, - objective: Optional[str] = None, + objective: Optional[Union[str, _LGBM_ScikitCustomObjectiveFunction]] = None, class_weight: Optional[Union[dict, str]] = None, min_split_gain: float = 0., min_child_weight: float = 1e-3, @@ -1110,7 +1111,7 @@ def __init__( reg_alpha: float = 0., reg_lambda: float = 0., random_state: Optional[Union[int, np.random.RandomState]] = None, - n_jobs: int = -1, + n_jobs: Optional[int] = None, importance_type: str = 'split', client: Optional[Client] = None, **kwargs: Any @@ -1142,16 +1143,12 @@ def __init__( _base_doc = LGBMClassifier.__init__.__doc__ _before_kwargs, _kwargs, _after_kwargs = _base_doc.partition('**kwargs') # type: ignore - _base_doc = f""" + __init__.__doc__ = f""" {_before_kwargs}client : dask.distributed.Client or None, optional (default=None) {' ':4}Dask client. If ``None``, ``distributed.default_client()`` will be used at runtime. The Dask client used by this class will not be saved if the model object is pickled. {_kwargs}{_after_kwargs} """ - # the note on custom objective functions in LGBMModel.__init__ is not - # currently relevant for the Dask estimators - __init__.__doc__ = _base_doc[:_base_doc.find('Note\n')] - def __getstate__(self) -> Dict[Any, Any]: return self._lgb_dask_getstate() @@ -1275,7 +1272,7 @@ def __init__( learning_rate: float = 0.1, n_estimators: int = 100, subsample_for_bin: int = 200000, - objective: Optional[str] = None, + objective: Optional[Union[str, _LGBM_ScikitCustomObjectiveFunction]] = None, class_weight: Optional[Union[dict, str]] = None, min_split_gain: float = 0., min_child_weight: float = 1e-3, @@ -1286,7 +1283,7 @@ def __init__( reg_alpha: float = 0., reg_lambda: float = 0., random_state: Optional[Union[int, np.random.RandomState]] = None, - n_jobs: int = -1, + n_jobs: Optional[int] = None, importance_type: str = 'split', client: Optional[Client] = None, **kwargs: Any @@ -1318,14 +1315,11 @@ def __init__( _base_doc = LGBMRegressor.__init__.__doc__ _before_kwargs, _kwargs, _after_kwargs = _base_doc.partition('**kwargs') # type: ignore - _base_doc = f""" + __init__.__doc__ = f""" {_before_kwargs}client : dask.distributed.Client or None, optional (default=None) {' ':4}Dask client. If ``None``, ``distributed.default_client()`` will be used at runtime. The Dask client used by this class will not be saved if the model object is pickled. 
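A sketch of how the ``client`` parameter documented here is wired up in practice; the local cluster setup and data sizes are illustrative only.

import dask.array as da
import lightgbm as lgb
from distributed import Client, LocalCluster

cluster = LocalCluster(n_workers=2)
client = Client(cluster)

X = da.random.random((1000, 10), chunks=(100, 10))
y = da.random.random((1000,), chunks=(100,))

# pass the client explicitly; with client=None, distributed.default_client()
# is looked up at runtime instead
model = lgb.DaskLGBMRegressor(client=client, n_estimators=50)
model.fit(X, y)
preds = model.predict(X)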
{_kwargs}{_after_kwargs} """ - # the note on custom objective functions in LGBMModel.__init__ is not - # currently relevant for the Dask estimators - __init__.__doc__ = _base_doc[:_base_doc.find('Note\n')] def __getstate__(self) -> Dict[Any, Any]: return self._lgb_dask_getstate() @@ -1431,7 +1425,7 @@ def __init__( learning_rate: float = 0.1, n_estimators: int = 100, subsample_for_bin: int = 200000, - objective: Optional[str] = None, + objective: Optional[Union[str, _LGBM_ScikitCustomObjectiveFunction]] = None, class_weight: Optional[Union[dict, str]] = None, min_split_gain: float = 0., min_child_weight: float = 1e-3, @@ -1442,7 +1436,7 @@ def __init__( reg_alpha: float = 0., reg_lambda: float = 0., random_state: Optional[Union[int, np.random.RandomState]] = None, - n_jobs: int = -1, + n_jobs: Optional[int] = None, importance_type: str = 'split', client: Optional[Client] = None, **kwargs: Any @@ -1474,16 +1468,12 @@ def __init__( _base_doc = LGBMRanker.__init__.__doc__ _before_kwargs, _kwargs, _after_kwargs = _base_doc.partition('**kwargs') # type: ignore - _base_doc = f""" + __init__.__doc__ = f""" {_before_kwargs}client : dask.distributed.Client or None, optional (default=None) {' ':4}Dask client. If ``None``, ``distributed.default_client()`` will be used at runtime. The Dask client used by this class will not be saved if the model object is pickled. {_kwargs}{_after_kwargs} """ - # the note on custom objective functions in LGBMModel.__init__ is not - # currently relevant for the Dask estimators - __init__.__doc__ = _base_doc[:_base_doc.find('Note\n')] - def __getstate__(self) -> Dict[Any, Any]: return self._lgb_dask_getstate() diff --git a/python-package/lightgbm/engine.py b/python-package/lightgbm/engine.py index 6ad65117237b..e3827e726eea 100644 --- a/python-package/lightgbm/engine.py +++ b/python-package/lightgbm/engine.py @@ -4,23 +4,25 @@ import copy from operator import attrgetter from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union import numpy as np from . 
import callback -from .basic import Booster, Dataset, LightGBMError, _ArrayLike, _ConfigAliases, _InnerPredictor, _log_warning -from .compat import SKLEARN_INSTALLED, _LGBMGroupKFold, _LGBMStratifiedKFold +from .basic import (Booster, Dataset, LightGBMError, _choose_param_value, _ConfigAliases, _InnerPredictor, + _LGBM_CustomObjectiveFunction, _log_warning) +from .compat import SKLEARN_INSTALLED, _LGBMBaseCrossValidator, _LGBMGroupKFold, _LGBMStratifiedKFold -_LGBM_CustomObjectiveFunction = Callable[ - [np.ndarray, Dataset], - Tuple[_ArrayLike, _ArrayLike] -] _LGBM_CustomMetricFunction = Callable[ [np.ndarray, Dataset], Tuple[str, float, bool] ] +_LGBM_PreprocFunction = Callable[ + [Dataset, Dataset, Dict[str, Any]], + Tuple[Dataset, Dataset, Dict[str, Any]] +] + def train( params: Dict[str, Any], @@ -28,13 +30,10 @@ def train( num_boost_round: int = 100, valid_sets: Optional[List[Dataset]] = None, valid_names: Optional[List[str]] = None, - fobj: Optional[_LGBM_CustomObjectiveFunction] = None, feval: Optional[Union[_LGBM_CustomMetricFunction, List[_LGBM_CustomMetricFunction]]] = None, init_model: Optional[Union[str, Path, Booster]] = None, feature_name: Union[List[str], str] = 'auto', categorical_feature: Union[List[str], List[int], str] = 'auto', - early_stopping_rounds: Optional[int] = None, - evals_result: Optional[Dict[str, Any]] = None, keep_training_booster: bool = False, callbacks: Optional[List[Callable]] = None ) -> Booster: @@ -43,7 +42,8 @@ def train( Parameters ---------- params : dict - Parameters for training. + Parameters for training. Values passed through ``params`` take precedence over those + supplied via arguments. train_set : Dataset Data to be trained on. num_boost_round : int, optional (default=100) @@ -52,39 +52,18 @@ def train( List of data to be evaluated on during training. valid_names : list of str, or None, optional (default=None) Names of ``valid_sets``. - fobj : callable or None, optional (default=None) - Customized objective function. - Should accept two parameters: preds, train_data, - and return (grad, hess). - - preds : numpy 1-D array - The predicted values. - Predicted values are returned before any transformation, - e.g. they are raw margin instead of probability of positive class for binary task. - train_data : Dataset - The training dataset. - grad : list, numpy 1-D array or pandas Series - The value of the first order derivative (gradient) of the loss - with respect to the elements of preds for each sample point. - hess : list, numpy 1-D array or pandas Series - The value of the second order derivative (Hessian) of the loss - with respect to the elements of preds for each sample point. - - For multi-class task, the preds is group by class_id first, then group by row_id. - If you want to get i-th row preds in j-th class, the access way is score[j * num_data + i] - and you should group grad and hess in this way as well. - feval : callable, list of callable, or None, optional (default=None) Customized evaluation function. - Each evaluation function should accept two parameters: preds, train_data, + Each evaluation function should accept two parameters: preds, eval_data, and return (eval_name, eval_result, is_higher_better) or list of such tuples. - preds : numpy 1-D array + preds : numpy 1-D array or numpy 2-D array (for multi-class task) The predicted values. - If ``fobj`` is specified, predicted values are returned before any transformation, + For multi-class task, preds are numpy 2-D array of shape = [n_samples, n_classes]. 
+ If custom objective function is used, predicted values are returned before any transformation, e.g. they are raw margin instead of probability of positive class for binary task in this case. - train_data : Dataset - The training dataset. + eval_data : Dataset + A ``Dataset`` to evaluate. eval_name : str The name of evaluation function (without whitespaces). eval_result : float @@ -92,8 +71,6 @@ def train( is_higher_better : bool Is eval result higher better, e.g. AUC is ``is_higher_better``. - For multi-class task, the preds is group by class_id first, then group by row_id. - If you want to get i-th row preds in j-th class, the access way is preds[j * num_data + i]. To ignore the default metric corresponding to the used objective, set the ``metric`` parameter to the string ``"None"`` in ``params``. init_model : str, pathlib.Path, Booster or None, optional (default=None) @@ -106,32 +83,11 @@ def train( If list of int, interpreted as indices. If list of str, interpreted as feature names (need to specify ``feature_name`` as well). If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used. - All values in categorical features should be less than int32 max value (2147483647). + All values in categorical features will be cast to int32 and thus should be less than int32 max value (2147483647). Large values could be memory consuming. Consider using consecutive integers starting from zero. All negative values in categorical features will be treated as missing values. The output cannot be monotonically constrained with respect to a categorical feature. - early_stopping_rounds : int or None, optional (default=None) - Activates early stopping. The model will train until the validation score stops improving. - Validation score needs to improve at least every ``early_stopping_rounds`` round(s) - to continue training. - Requires at least one validation data and one metric. - If there's more than one, will check all of them. But the training data is ignored anyway. - To check only the first metric, set the ``first_metric_only`` parameter to ``True`` in ``params``. - The index of iteration that has the best performance will be saved in the ``best_iteration`` field - if early stopping logic is enabled by setting ``early_stopping_rounds``. - evals_result : dict or None, optional (default=None) - Dictionary used to store all evaluation results of all the items in ``valid_sets``. - This should be initialized outside of your call to ``train()`` and should be empty. - Any initial contents of the dictionary will be deleted. - - .. rubric:: Example - - With a ``valid_sets`` = [valid_set, train_set], - ``valid_names`` = ['eval', 'train'] - and a ``params`` = {'metric': 'logloss'} - returns {'train': {'logloss': ['0.48253', '0.35953', ...]}, - 'eval': {'logloss': ['0.480385', '0.357756', ...]}}. - + Floating point numbers in categorical features will be rounded towards 0. keep_training_booster : bool, optional (default=False) Whether the returned Booster will be used to keep training. If False, the returned value will be converted into _InnerPredictor before returning. @@ -143,6 +99,27 @@ def train( List of callback functions that are applied at each iteration. See Callbacks in Python API for more information. + Note + ---- + A custom objective function can be provided for the ``objective`` parameter. + It should accept two parameters: preds, train_data and return (grad, hess). + + preds : numpy 1-D array or numpy 2-D array (for multi-class task) + The predicted values. 
+ Predicted values are returned before any transformation, + e.g. they are raw margin instead of probability of positive class for binary task. + train_data : Dataset + The training dataset. + grad : numpy 1-D array or numpy 2-D array (for multi-class task) + The value of the first order derivative (gradient) of the loss + with respect to the elements of preds for each sample point. + hess : numpy 1-D array or numpy 2-D array (for multi-class task) + The value of the second order derivative (Hessian) of the loss + with respect to the elements of preds for each sample point. + + For multi-class task, preds are numpy 2-D array of shape = [n_samples, n_classes], + and grad and hess should be returned in the same format. + Returns ------- booster : Booster @@ -150,23 +127,28 @@ def train( """ # create predictor first params = copy.deepcopy(params) - if fobj is not None: - for obj_alias in _ConfigAliases.get("objective"): - params.pop(obj_alias, None) - params['objective'] = 'none' + params = _choose_param_value( + main_param_name='objective', + params=params, + default_value=None + ) + fobj: Optional[_LGBM_CustomObjectiveFunction] = None + if callable(params["objective"]): + fobj = params["objective"] + params["objective"] = 'none' for alias in _ConfigAliases.get("num_iterations"): if alias in params: num_boost_round = params.pop(alias) _log_warning(f"Found `{alias}` in params. Will use it instead of argument") params["num_iterations"] = num_boost_round - # show deprecation warning only for early stop argument, setting early stop via global params should still be possible - if early_stopping_rounds is not None and early_stopping_rounds > 0: - _log_warning("'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. " - "Pass 'early_stopping()' callback via 'callbacks' argument instead.") - for alias in _ConfigAliases.get("early_stopping_round"): - if alias in params: - early_stopping_rounds = params.pop(alias) - params["early_stopping_round"] = early_stopping_rounds + # setting early stopping via global params should be possible + params = _choose_param_value( + main_param_name="early_stopping_round", + params=params, + default_value=None + ) + if params["early_stopping_round"] is None: + params.pop("early_stopping_round") first_metric_only = params.get('first_metric_only', False) if num_boost_round <= 0: @@ -217,14 +199,18 @@ def train( cb.__dict__.setdefault('order', i - len(callbacks)) callbacks_set = set(callbacks) - # Most of legacy advanced options becomes callbacks - if early_stopping_rounds is not None and early_stopping_rounds > 0: - callbacks_set.add(callback.early_stopping(early_stopping_rounds, first_metric_only)) - - if evals_result is not None: - _log_warning("'evals_result' argument is deprecated and will be removed in a future release of LightGBM. 
" - "Pass 'record_evaluation()' callback via 'callbacks' argument instead.") - callbacks_set.add(callback.record_evaluation(evals_result)) + if "early_stopping_round" in params: + callbacks_set.add( + callback.early_stopping( + stopping_rounds=params["early_stopping_round"], + first_metric_only=first_metric_only, + verbose=_choose_param_value( + main_param_name="verbosity", + params=params, + default_value=1 + ).pop("verbosity") > 0 + ) + ) callbacks_before_iter_set = {cb for cb in callbacks_set if getattr(cb, 'before_iteration', False)} callbacks_after_iter_set = callbacks_set - callbacks_before_iter_set @@ -305,13 +291,13 @@ def __init__(self): self.boosters = [] self.best_iteration = -1 - def _append(self, booster): + def _append(self, booster: Booster) -> None: """Add a booster to CVBooster.""" self.boosters.append(booster) - def __getattr__(self, name): + def __getattr__(self, name: str) -> Callable[[Any, Any], List[Any]]: """Redirect methods call of CVBooster.""" - def handler_function(*args, **kwargs): + def handler_function(*args: Any, **kwargs: Any) -> List[Any]: """Call methods with each booster, and concatenate their results.""" ret = [] for booster in self.boosters: @@ -320,8 +306,17 @@ def handler_function(*args, **kwargs): return handler_function -def _make_n_folds(full_data, folds, nfold, params, seed, fpreproc=None, stratified=True, - shuffle=True, eval_train_metric=False): +def _make_n_folds( + full_data: Dataset, + folds: Optional[Union[Iterable[Tuple[np.ndarray, np.ndarray]], _LGBMBaseCrossValidator]], + nfold: int, + params: Dict[str, Any], + seed: int, + fpreproc: Optional[_LGBM_PreprocFunction] = None, + stratified: bool = True, + shuffle: bool = True, + eval_train_metric: bool = False +) -> CVBooster: """Make a n-fold list of Booster from random indices.""" full_data = full_data.construct() num_data = full_data.num_data() @@ -380,35 +375,47 @@ def _make_n_folds(full_data, folds, nfold, params, seed, fpreproc=None, stratifi return ret -def _agg_cv_result(raw_results, eval_train_metric=False): +def _agg_cv_result( + raw_results: List[List[Tuple[str, str, float, bool]]] +) -> List[Tuple[str, str, float, bool, float]]: """Aggregate cross-validation results.""" cvmap = collections.OrderedDict() metric_type = {} for one_result in raw_results: for one_line in one_result: - if eval_train_metric: - key = f"{one_line[0]} {one_line[1]}" - else: - key = one_line[1] + key = f"{one_line[0]} {one_line[1]}" metric_type[key] = one_line[3] cvmap.setdefault(key, []) cvmap[key].append(one_line[2]) return [('cv_agg', k, np.mean(v), metric_type[k], np.std(v)) for k, v in cvmap.items()] -def cv(params, train_set, num_boost_round=100, - folds=None, nfold=5, stratified=True, shuffle=True, - metrics=None, fobj=None, feval=None, init_model=None, - feature_name='auto', categorical_feature='auto', - early_stopping_rounds=None, fpreproc=None, - seed=0, callbacks=None, eval_train_metric=False, - return_cvbooster=False): +def cv( + params: Dict[str, Any], + train_set: Dataset, + num_boost_round: int = 100, + folds: Optional[Union[Iterable[Tuple[np.ndarray, np.ndarray]], _LGBMBaseCrossValidator]] = None, + nfold: int = 5, + stratified: bool = True, + shuffle: bool = True, + metrics: Optional[Union[str, List[str]]] = None, + feval: Optional[Union[_LGBM_CustomMetricFunction, List[_LGBM_CustomMetricFunction]]] = None, + init_model: Optional[Union[str, Path, Booster]] = None, + feature_name: Union[str, List[str]] = 'auto', + categorical_feature: Union[str, List[str], List[int]] = 'auto', + fpreproc: 
Optional[_LGBM_PreprocFunction] = None, + seed: int = 0, + callbacks: Optional[List[Callable]] = None, + eval_train_metric: bool = False, + return_cvbooster: bool = False +) -> Dict[str, Any]: """Perform the cross-validation with given parameters. Parameters ---------- params : dict - Parameters for Booster. + Parameters for training. Values passed through ``params`` take precedence over those + supplied via arguments. train_set : Dataset Data to be trained on. num_boost_round : int, optional (default=100) @@ -428,39 +435,18 @@ def cv(params, train_set, num_boost_round=100, metrics : str, list of str, or None, optional (default=None) Evaluation metrics to be monitored while CV. If not None, the metric in ``params`` will be overridden. - fobj : callable or None, optional (default=None) - Customized objective function. - Should accept two parameters: preds, train_data, - and return (grad, hess). - - preds : numpy 1-D array - The predicted values. - Predicted values are returned before any transformation, - e.g. they are raw margin instead of probability of positive class for binary task. - train_data : Dataset - The training dataset. - grad : list, numpy 1-D array or pandas Series - The value of the first order derivative (gradient) of the loss - with respect to the elements of preds for each sample point. - hess : list, numpy 1-D array or pandas Series - The value of the second order derivative (Hessian) of the loss - with respect to the elements of preds for each sample point. - - For multi-class task, the preds is group by class_id first, then group by row_id. - If you want to get i-th row preds in j-th class, the access way is score[j * num_data + i] - and you should group grad and hess in this way as well. - feval : callable, list of callable, or None, optional (default=None) Customized evaluation function. - Each evaluation function should accept two parameters: preds, train_data, + Each evaluation function should accept two parameters: preds, eval_data, and return (eval_name, eval_result, is_higher_better) or list of such tuples. - preds : numpy 1-D array + preds : numpy 1-D array or numpy 2-D array (for multi-class task) The predicted values. - If ``fobj`` is specified, predicted values are returned before any transformation, + For multi-class task, preds are numpy 2-D array of shape = [n_samples, n_classes]. + If custom objective function is used, predicted values are returned before any transformation, e.g. they are raw margin instead of probability of positive class for binary task in this case. - train_data : Dataset - The training dataset. + eval_data : Dataset + A ``Dataset`` to evaluate. eval_name : str The name of evaluation function (without whitespace). eval_result : float @@ -468,8 +454,6 @@ def cv(params, train_set, num_boost_round=100, is_higher_better : bool Is eval result higher better, e.g. AUC is ``is_higher_better``. - For multi-class task, the preds is group by class_id first, then group by row_id. - If you want to get i-th row preds in j-th class, the access way is preds[j * num_data + i]. To ignore the default metric corresponding to the used objective, set ``metrics`` to the string ``"None"``. init_model : str, pathlib.Path, Booster or None, optional (default=None) @@ -482,17 +466,11 @@ def cv(params, train_set, num_boost_round=100, If list of int, interpreted as indices. If list of str, interpreted as feature names (need to specify ``feature_name`` as well). If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used. 
- All values in categorical features should be less than int32 max value (2147483647). + All values in categorical features will be cast to int32 and thus should be less than int32 max value (2147483647). Large values could be memory consuming. Consider using consecutive integers starting from zero. All negative values in categorical features will be treated as missing values. The output cannot be monotonically constrained with respect to a categorical feature. - early_stopping_rounds : int or None, optional (default=None) - Activates early stopping. - CV score needs to improve at least every ``early_stopping_rounds`` round(s) - to continue. - Requires at least one metric. If there's more than one, will check all of them. - To check only the first metric, set the ``first_metric_only`` parameter to ``True`` in ``params``. - Last entry in evaluation history is the one from the best iteration. + Floating point numbers in categorical features will be rounded towards 0. fpreproc : callable or None, optional (default=None) Preprocessing function that takes (dtrain, dtest, params) and returns transformed versions of those. @@ -507,6 +485,27 @@ def cv(params, train_set, num_boost_round=100, return_cvbooster : bool, optional (default=False) Whether to return Booster models trained on each fold through ``CVBooster``. + Note + ---- + A custom objective function can be provided for the ``objective`` parameter. + It should accept two parameters, preds and train_data, and return (grad, hess). + + preds : numpy 1-D array or numpy 2-D array (for multi-class task) + The predicted values. + Predicted values are returned before any transformation, + e.g. they are raw margin instead of probability of positive class for binary task. + train_data : Dataset + The training dataset. + grad : numpy 1-D array or numpy 2-D array (for multi-class task) + The value of the first order derivative (gradient) of the loss + with respect to the elements of preds for each sample point. + hess : numpy 1-D array or numpy 2-D array (for multi-class task) + The value of the second order derivative (Hessian) of the loss + with respect to the elements of preds for each sample point. + + For multi-class task, preds are numpy 2-D array of shape = [n_samples, n_classes], + and grad and hess should be returned in the same format. + Returns ------- eval_hist : dict @@ -519,24 +518,29 @@ def cv(params, train_set, num_boost_round=100, """ if not isinstance(train_set, Dataset): raise TypeError("Training only accepts Dataset object") - params = copy.deepcopy(params) - if fobj is not None: - for obj_alias in _ConfigAliases.get("objective"): - params.pop(obj_alias, None) - params['objective'] = 'none' + params = _choose_param_value( + main_param_name='objective', + params=params, + default_value=None + ) + fobj: Optional[_LGBM_CustomObjectiveFunction] = None + if callable(params["objective"]): + fobj = params["objective"] + params["objective"] = 'none' for alias in _ConfigAliases.get("num_iterations"): if alias in params: _log_warning(f"Found '{alias}' in params. Will use it instead of 'num_boost_round' argument") num_boost_round = params.pop(alias) params["num_iterations"] = num_boost_round - if early_stopping_rounds is not None and early_stopping_rounds > 0: - _log_warning("'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. 
" - "Pass 'early_stopping()' callback via 'callbacks' argument instead.") - for alias in _ConfigAliases.get("early_stopping_round"): - if alias in params: - early_stopping_rounds = params.pop(alias) - params["early_stopping_round"] = early_stopping_rounds + # setting early stopping via global params should be possible + params = _choose_param_value( + main_param_name="early_stopping_round", + params=params, + default_value=None + ) + if params["early_stopping_round"] is None: + params.pop("early_stopping_round") first_metric_only = params.get('first_metric_only', False) if num_boost_round <= 0: @@ -571,8 +575,19 @@ def cv(params, train_set, num_boost_round=100, for i, cb in enumerate(callbacks): cb.__dict__.setdefault('order', i - len(callbacks)) callbacks = set(callbacks) - if early_stopping_rounds is not None and early_stopping_rounds > 0: - callbacks.add(callback.early_stopping(early_stopping_rounds, first_metric_only, verbose=False)) + + if "early_stopping_round" in params: + callbacks.add( + callback.early_stopping( + stopping_rounds=params["early_stopping_round"], + first_metric_only=first_metric_only, + verbose=_choose_param_value( + main_param_name="verbosity", + params=params, + default_value=1 + ).pop("verbosity") > 0 + ) + ) callbacks_before_iter = {cb for cb in callbacks if getattr(cb, 'before_iteration', False)} callbacks_after_iter = callbacks - callbacks_before_iter @@ -588,7 +603,7 @@ def cv(params, train_set, num_boost_round=100, end_iteration=num_boost_round, evaluation_result_list=None)) cvfolds.update(fobj=fobj) - res = _agg_cv_result(cvfolds.eval_valid(feval), eval_train_metric) + res = _agg_cv_result(cvfolds.eval_valid(feval)) for _, key, mean, _, std in res: results[f'{key}-mean'].append(mean) results[f'{key}-stdv'].append(std) @@ -602,6 +617,8 @@ def cv(params, train_set, num_boost_round=100, evaluation_result_list=res)) except callback.EarlyStopException as earlyStopException: cvfolds.best_iteration = earlyStopException.best_iteration + 1 + for bst in cvfolds.boosters: + bst.best_iteration = cvfolds.best_iteration for k in results: results[k] = results[k][:cvfolds.best_iteration] break diff --git a/python-package/lightgbm/plotting.py b/python-package/lightgbm/plotting.py index eb625c8a1193..f7d35045d21f 100644 --- a/python-package/lightgbm/plotting.py +++ b/python-package/lightgbm/plotting.py @@ -556,7 +556,7 @@ def create_tree_digraph( - ``'internal_count'`` : number of records from the training data that fall into this non-leaf node - ``'internal_weight'`` : total weight of all nodes that fall into this non-leaf node - ``'leaf_count'`` : number of records from the training data that fall into this leaf node - - ``'leaf_weight'`` : total weight (sum of hessian) of all observations that fall into this leaf node + - ``'leaf_weight'`` : total weight (sum of Hessian) of all observations that fall into this leaf node - ``'data_percentage'`` : percentage of training data that fall into this node precision : int or None, optional (default=3) Used to restrict the display of floating point values to a certain precision. 
@@ -649,7 +649,7 @@ def plot_tree( - ``'internal_count'`` : number of records from the training data that fall into this non-leaf node - ``'internal_weight'`` : total weight of all nodes that fall into this non-leaf node - ``'leaf_count'`` : number of records from the training data that fall into this leaf node - - ``'leaf_weight'`` : total weight (sum of hessian) of all observations that fall into this leaf node + - ``'leaf_weight'`` : total weight (sum of Hessian) of all observations that fall into this leaf node - ``'data_percentage'`` : percentage of training data that fall into this node precision : int or None, optional (default=3) Used to restrict the display of floating point values to a certain precision. diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index fa1769897736..1f320886ebfc 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -6,12 +6,12 @@ import numpy as np -from .basic import Booster, Dataset, LightGBMError, _ArrayLike, _choose_param_value, _ConfigAliases, _log_warning +from .basic import Booster, Dataset, LightGBMError, _choose_param_value, _ConfigAliases, _log_warning from .callback import record_evaluation from .compat import (SKLEARN_INSTALLED, LGBMNotFittedError, _LGBMAssertAllFinite, _LGBMCheckArray, _LGBMCheckClassificationTargets, _LGBMCheckSampleWeight, _LGBMCheckXY, _LGBMClassifierBase, - _LGBMComputeSampleWeight, _LGBMLabelEncoder, _LGBMModelBase, _LGBMRegressorBase, dt_DataTable, - pd_DataFrame) + _LGBMComputeSampleWeight, _LGBMCpuCount, _LGBMLabelEncoder, _LGBMModelBase, _LGBMRegressorBase, + dt_DataTable, pd_DataFrame) from .engine import train _EvalResultType = Tuple[str, float, bool] @@ -19,11 +19,15 @@ _LGBM_ScikitCustomObjectiveFunction = Union[ Callable[ [np.ndarray, np.ndarray], - Tuple[_ArrayLike, _ArrayLike] + Tuple[np.ndarray, np.ndarray] ], Callable[ [np.ndarray, np.ndarray, np.ndarray], - Tuple[_ArrayLike, _ArrayLike] + Tuple[np.ndarray, np.ndarray] + ], + Callable[ + [np.ndarray, np.ndarray, np.ndarray, np.ndarray], + Tuple[np.ndarray, np.ndarray] ], ] _LGBM_ScikitCustomEvalFunction = Union[ @@ -54,52 +58,56 @@ def __init__(self, func: _LGBM_ScikitCustomObjectiveFunction): Parameters ---------- func : callable - Expects a callable with signature ``func(y_true, y_pred)`` or ``func(y_true, y_pred, group)`` + Expects a callable with one of the following signatures: + ``func(y_true, y_pred)``, + ``func(y_true, y_pred, weight)`` + or ``func(y_true, y_pred, weight, group)`` and returns (grad, hess): y_true : numpy 1-D array of shape = [n_samples] The target values. - y_pred : numpy 1-D array of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task) + y_pred : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task) The predicted values. Predicted values are returned before any transformation, e.g. they are raw margin instead of probability of positive class for binary task. + weight : numpy 1-D array of shape = [n_samples] + The weight of samples. Weights should be non-negative. group : numpy 1-D array Group/query data. Only used in the learning-to-rank task. sum(group) = n_samples. For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc.
- grad : list, numpy 1-D array or pandas Series of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task) + grad : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task) The value of the first order derivative (gradient) of the loss with respect to the elements of y_pred for each sample point. - hess : list, numpy 1-D array or pandas Series of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task) + hess : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task) The value of the second order derivative (Hessian) of the loss with respect to the elements of y_pred for each sample point. .. note:: - For multi-class task, the y_pred is group by class_id first, then group by row_id. - If you want to get i-th row y_pred in j-th class, the access way is y_pred[j * num_data + i] - and you should group grad and hess in this way as well. + For multi-class task, y_pred is a numpy 2-D array of shape = [n_samples, n_classes], + and grad and hess should be returned in the same format. """ self.func = func - def __call__(self, preds, dataset): + def __call__(self, preds: np.ndarray, dataset: Dataset) -> Tuple[np.ndarray, np.ndarray]: """Call passed function with appropriate arguments. Parameters ---------- - preds : numpy 1-D array of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task) + preds : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task) The predicted values. dataset : Dataset The training dataset. Returns ------- - grad : list, numpy 1-D array or pandas Series of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task) + grad : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task) The value of the first order derivative (gradient) of the loss with respect to the elements of preds for each sample point. - hess : list, numpy 1-D array or pandas Series of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task) + hess : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task) The value of the second order derivative (Hessian) of the loss with respect to the elements of preds for each sample point. 
""" @@ -108,26 +116,11 @@ def __call__(self, preds, dataset): if argc == 2: grad, hess = self.func(labels, preds) elif argc == 3: - grad, hess = self.func(labels, preds, dataset.get_group()) + grad, hess = self.func(labels, preds, dataset.get_weight()) + elif argc == 4: + grad, hess = self.func(labels, preds, dataset.get_weight(), dataset.get_group()) else: - raise TypeError(f"Self-defined objective function should have 2 or 3 arguments, got {argc}") - """weighted for objective""" - weight = dataset.get_weight() - if weight is not None: - """only one class""" - if len(weight) == len(grad): - grad = np.multiply(grad, weight) - hess = np.multiply(hess, weight) - else: - num_data = len(weight) - num_class = len(grad) // num_data - if num_class * num_data != len(grad): - raise ValueError("Length of grad and hess should equal to num_class * num_data") - for k in range(num_class): - for i in range(num_data): - idx = k * num_data + i - grad[idx] *= weight[i] - hess[idx] *= weight[i] + raise TypeError(f"Self-defined objective function should have 2, 3 or 4 arguments, got {argc}") return grad, hess @@ -152,12 +145,12 @@ def __init__(self, func: _LGBM_ScikitCustomEvalFunction): y_true : numpy 1-D array of shape = [n_samples] The target values. - y_pred : numpy 1-D array of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task) + y_pred : numpy 1-D array of shape = [n_samples] or numpy 2-D array shape = [n_samples, n_classes] (for multi-class task) The predicted values. In case of custom ``objective``, predicted values are returned before any transformation, e.g. they are raw margin instead of probability of positive class for binary task in this case. weight : numpy 1-D array of shape = [n_samples] - The weight of samples. + The weight of samples. Weights should be non-negative. group : numpy 1-D array Group/query data. Only used in the learning-to-rank task. @@ -170,20 +163,15 @@ def __init__(self, func: _LGBM_ScikitCustomEvalFunction): The eval result. is_higher_better : bool Is eval result higher better, e.g. AUC is ``is_higher_better``. - - .. note:: - - For multi-class task, the y_pred is group by class_id first, then group by row_id. - If you want to get i-th row y_pred in j-th class, the access way is y_pred[j * num_data + i]. """ self.func = func - def __call__(self, preds, dataset): + def __call__(self, preds: np.ndarray, dataset: Dataset) -> Tuple[str, float, bool]: """Call passed function with appropriate arguments. Parameters ---------- - preds : numpy 1-D array of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task) + preds : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task) The predicted values. dataset : Dataset The training dataset. @@ -223,7 +211,7 @@ def __call__(self, preds, dataset): y : {y_shape} The target values (class labels in classification, real numbers in regression). sample_weight : {sample_weight_shape} - Weights of training data. + Weights of training data. Weights should be non-negative. init_score : {init_score_shape} Init score of training data. group : {group_shape} @@ -237,7 +225,7 @@ def __call__(self, preds, dataset): eval_names : list of str, or None, optional (default=None) Names of eval_set. eval_sample_weight : {eval_sample_weight_shape} - Weights of eval data. + Weights of eval data. Weights should be non-negative. eval_class_weight : list or None, optional (default=None) Class weights of eval data. 
eval_init_score : {eval_init_score_shape} @@ -258,10 +246,11 @@ def __call__(self, preds, dataset): If list of int, interpreted as indices. If list of str, interpreted as feature names (need to specify ``feature_name`` as well). If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used. - All values in categorical features should be less than int32 max value (2147483647). + All values in categorical features will be cast to int32 and thus should be less than int32 max value (2147483647). Large values could be memory consuming. Consider using consecutive integers starting from zero. All negative values in categorical features will be treated as missing values. The output cannot be monotonically constrained with respect to a categorical feature. + Floating point numbers in categorical features will be rounded towards 0. callbacks : list of callable, or None, optional (default=None) List of callback functions that are applied at each iteration. See Callbacks in Python API for more information. @@ -286,12 +275,12 @@ def __call__(self, preds, dataset): y_true : numpy 1-D array of shape = [n_samples] The target values. - y_pred : numpy 1-D array of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task) + y_pred : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task) The predicted values. In case of custom ``objective``, predicted values are returned before any transformation, e.g. they are raw margin instead of probability of positive class for binary task in this case. weight : numpy 1-D array of shape = [n_samples] - The weight of samples. + The weight of samples. Weights should be non-negative. group : numpy 1-D array Group/query data. Only used in the learning-to-rank task. @@ -304,9 +293,6 @@ def __call__(self, preds, dataset): The eval result. is_higher_better : bool Is eval result higher better, e.g. AUC is ``is_higher_better``. - - For multi-class task, the y_pred is group by class_id first, then group by row_id. - If you want to get i-th row y_pred in j-th class, the access way is y_pred[j * num_data + i]. """ _lgbmmodel_doc_predict = ( @@ -340,6 +326,9 @@ def __call__(self, preds, dataset): Note that unlike the shap package, with ``pred_contrib`` we return a matrix with an extra column, where the last column is the expected value. + validate_features : bool, optional (default=False) + If True, ensure that the features used to predict match the ones used to train. + Used only if data is pandas DataFrame. **kwargs Other parameters for the prediction. @@ -377,7 +366,7 @@ def __init__( reg_alpha: float = 0., reg_lambda: float = 0., random_state: Optional[Union[int, np.random.RandomState]] = None, - n_jobs: int = -1, + n_jobs: Optional[int] = None, importance_type: str = 'split', **kwargs ): @@ -422,7 +411,7 @@ def __init__( min_split_gain : float, optional (default=0.) Minimum loss reduction required to make a further partition on a leaf node of the tree. min_child_weight : float, optional (default=1e-3) - Minimum sum of instance weight (hessian) needed in a child (leaf). + Minimum sum of instance weight (Hessian) needed in a child (leaf). min_child_samples : int, optional (default=20) Minimum number of data needed in a child (leaf). subsample : float, optional (default=1.) @@ -440,8 +429,18 @@ def __init__( If int, this number is used to seed the C++ code. If RandomState object (numpy), a random integer is picked based on its state to seed the C++ code. 
If None, default seeds in C++ code are used. - n_jobs : int, optional (default=-1) - Number of parallel threads to use for training (can be changed at prediction time). + n_jobs : int or None, optional (default=None) + Number of parallel threads to use for training (can be changed at prediction time by + passing it as an extra keyword argument). + + For better performance, it is recommended to set this to the number of physical cores + in the CPU. + + Negative integers are interpreted as following joblib's formula (n_cpus + 1 + n_jobs), just like + scikit-learn (so e.g. -1 means using all threads). A value of zero corresponds to the default number of + threads configured for OpenMP in the system. A value of ``None`` (the default) corresponds + to using the number of physical cores in the system (its correct detection requires + either the ``joblib`` or the ``psutil`` util libraries to be installed). importance_type : str, optional (default='split') The type of feature importance to be filled into ``feature_importances_``. If 'split', result contains numbers of times the feature is used in a model. @@ -458,31 +457,33 @@ def __init__( ---- A custom objective function can be provided for the ``objective`` parameter. In this case, it should have the signature - ``objective(y_true, y_pred) -> grad, hess`` or - ``objective(y_true, y_pred, group) -> grad, hess``: + ``objective(y_true, y_pred) -> grad, hess``, + ``objective(y_true, y_pred, weight) -> grad, hess`` + or ``objective(y_true, y_pred, weight, group) -> grad, hess``: y_true : numpy 1-D array of shape = [n_samples] The target values. - y_pred : numpy 1-D array of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task) + y_pred : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task) The predicted values. Predicted values are returned before any transformation, e.g. they are raw margin instead of probability of positive class for binary task. + weight : numpy 1-D array of shape = [n_samples] + The weight of samples. Weights should be non-negative. group : numpy 1-D array Group/query data. Only used in the learning-to-rank task. sum(group) = n_samples. For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc. - grad : list, numpy 1-D array or pandas Series of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task) + grad : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task) The value of the first order derivative (gradient) of the loss with respect to the elements of y_pred for each sample point. - hess : list, numpy 1-D array or pandas Series of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task) + hess : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task) The value of the second order derivative (Hessian) of the loss with respect to the elements of y_pred for each sample point. - For multi-class task, the y_pred is group by class_id first, then group by row_id. - If you want to get i-th row y_pred in j-th class, the access way is y_pred[j * num_data + i] - and you should group grad and hess in this way as well. 
+ For multi-class task, y_pred is a numpy 2-D array of shape = [n_samples, n_classes], + and grad and hess should be returned in the same format. """ if not SKLEARN_INSTALLED: raise LightGBMError('scikit-learn is required for lightgbm.sklearn. ' @@ -521,7 +522,7 @@ def __init__( self._n_classes = None self.set_params(**kwargs) - def _more_tags(self): + def _more_tags(self) -> Dict[str, Any]: return { 'allow_nan': True, 'X_types': ['2darray', 'sparse', '1dlabels'], @@ -536,7 +537,7 @@ def _more_tags(self): def __sklearn_is_fitted__(self) -> bool: return getattr(self, "fitted_", False) - def get_params(self, deep=True): + def get_params(self, deep: bool = True) -> Dict[str, Any]: """Get parameters for this estimator. Parameters @@ -554,7 +555,7 @@ def get_params(self, deep=True): params.update(self._other_params) return params - def set_params(self, **params): + def set_params(self, **params: Any) -> "LGBMModel": """Set the parameters of this estimator. Parameters @@ -612,11 +613,10 @@ def _process_params(self, stage: str) -> Dict[str, Any]: raise ValueError("Unknown LGBMModel type.") if callable(self._objective): if stage == "fit": - self._fobj = _ObjectiveFunctionWrapper(self._objective) - params['objective'] = 'None' # objective = nullptr for unknown objective + params['objective'] = _ObjectiveFunctionWrapper(self._objective) + else: + params['objective'] = 'None' else: - if stage == "fit": - self._fobj = None params['objective'] = self._objective params.pop('importance_type', None) @@ -651,8 +651,34 @@ def _process_params(self, stage: str) -> Dict[str, Any]: # overwrite default metric by explicitly set metric params = _choose_param_value("metric", params, original_metric) + # use joblib conventions for negative n_jobs, just like scikit-learn + # at predict time, this is handled later due to the order of parameter updates + if stage == "fit": + params = _choose_param_value("num_threads", params, self.n_jobs) + params["num_threads"] = self._process_n_jobs(params["num_threads"]) + return params + def _process_n_jobs(self, n_jobs: Optional[int]) -> int: + """Convert special values of n_jobs to their actual values according to the formulas that apply. + + Parameters + ---------- + n_jobs : int or None + The original value of n_jobs, potentially having special values such as 'None' or + negative integers. + + Returns + ------- + n_jobs : int + The value of n_jobs with special values converted to actual number of threads. 
+ """ + if n_jobs is None: + n_jobs = _LGBMCpuCount(only_physical_cores=True) + elif n_jobs < 0: + n_jobs = max(_LGBMCpuCount(only_physical_cores=False) + 1 + n_jobs, 1) + return n_jobs + def fit( self, X, @@ -710,14 +736,9 @@ def fit( # copy for consistency self._n_features_in = self._n_features - def _construct_dataset(X, y, sample_weight, init_score, group, params, - categorical_feature='auto'): - return Dataset(X, label=y, weight=sample_weight, group=group, - init_score=init_score, params=params, - categorical_feature=categorical_feature) - - train_set = _construct_dataset(_X, _y, sample_weight, init_score, group, params, - categorical_feature=categorical_feature) + train_set = Dataset(data=_X, label=_y, weight=sample_weight, group=group, + init_score=init_score, categorical_feature=categorical_feature, + params=params) valid_sets = [] if eval_set is not None: @@ -751,8 +772,10 @@ def _get_meta_data(collection, name, i): valid_weight = np.multiply(valid_weight, valid_class_sample_weight) valid_init_score = _get_meta_data(eval_init_score, 'eval_init_score', i) valid_group = _get_meta_data(eval_group, 'eval_group', i) - valid_set = _construct_dataset(valid_data[0], valid_data[1], - valid_weight, valid_init_score, valid_group, params) + valid_set = Dataset(data=valid_data[0], label=valid_data[1], weight=valid_weight, + group=valid_group, init_score=valid_init_score, + categorical_feature='auto', params=params) + valid_sets.append(valid_set) if isinstance(init_model, LGBMModel): @@ -772,7 +795,6 @@ def _get_meta_data(collection, name, i): num_boost_round=self.n_estimators, valid_sets=valid_sets, valid_names=eval_names, - fobj=self._fobj, feval=eval_metrics_callable, init_model=init_model, feature_name=feature_name, @@ -801,8 +823,17 @@ def _get_meta_data(collection, name, i): eval_group_shape="list of array, or None, optional (default=None)" ) + "\n\n" + _lgbmmodel_doc_custom_eval_note - def predict(self, X, raw_score=False, start_iteration=0, num_iteration=None, - pred_leaf=False, pred_contrib=False, **kwargs): + def predict( + self, + X, + raw_score: bool = False, + start_iteration: int = 0, + num_iteration: Optional[int] = None, + pred_leaf: bool = False, + pred_contrib: bool = False, + validate_features: bool = False, + **kwargs: Any + ): """Docstring is set after definition, using a template.""" if not self.__sklearn_is_fitted__(): raise LGBMNotFittedError("Estimator not fitted, call fit before exploiting the model.") @@ -828,8 +859,15 @@ def predict(self, X, raw_score=False, start_iteration=0, num_iteration=None, ): predict_params.pop(alias, None) predict_params.update(kwargs) + + # number of threads can have values with special meaning which is only applied + # in the scikit-learn interface, these should not reach the c++ side as-is + predict_params = _choose_param_value("num_threads", predict_params, self.n_jobs) + predict_params["num_threads"] = self._process_n_jobs(predict_params["num_threads"]) + return self._Booster.predict(X, raw_score=raw_score, start_iteration=start_iteration, num_iteration=num_iteration, - pred_leaf=pred_leaf, pred_contrib=pred_contrib, **predict_params) + pred_leaf=pred_leaf, pred_contrib=pred_contrib, validate_features=validate_features, + **predict_params) predict.__doc__ = _lgbmmodel_doc_predict.format( description="Return the predicted value for each sample.", @@ -841,14 +879,14 @@ def predict(self, X, raw_score=False, start_iteration=0, num_iteration=None, ) @property - def n_features_(self): + def n_features_(self) -> int: """:obj:`int`: The number 
of features of fitted model.""" if not self.__sklearn_is_fitted__(): raise LGBMNotFittedError('No n_features found. Need to call fit beforehand.') return self._n_features @property - def n_features_in_(self): + def n_features_in_(self) -> int: """:obj:`int`: The number of features of fitted model.""" if not self.__sklearn_is_fitted__(): raise LGBMNotFittedError('No n_features_in found. Need to call fit beforehand.') @@ -862,14 +900,14 @@ def best_score_(self): return self._best_score @property - def best_iteration_(self): + def best_iteration_(self) -> int: """:obj:`int`: The best iteration of fitted model if ``early_stopping()`` callback has been specified.""" if not self.__sklearn_is_fitted__(): raise LGBMNotFittedError('No best_iteration found. Need to call fit with early_stopping callback beforehand.') return self._best_iteration @property - def objective_(self): + def objective_(self) -> Union[str, _LGBM_ScikitCustomObjectiveFunction]: """:obj:`str` or :obj:`callable`: The concrete objective used while fitting this model.""" if not self.__sklearn_is_fitted__(): raise LGBMNotFittedError('No objective found. Need to call fit beforehand.') @@ -1062,11 +1100,21 @@ def fit( fit.__doc__ = (_base_doc[:_base_doc.find('eval_group :')] + _base_doc[_base_doc.find('eval_metric :'):]) - def predict(self, X, raw_score=False, start_iteration=0, num_iteration=None, - pred_leaf=False, pred_contrib=False, **kwargs): + def predict( + self, + X, + raw_score: bool = False, + start_iteration: int = 0, + num_iteration: Optional[int] = None, + pred_leaf: bool = False, + pred_contrib: bool = False, + validate_features: bool = False, + **kwargs: Any + ): """Docstring is inherited from the LGBMModel.""" result = self.predict_proba(X, raw_score, start_iteration, num_iteration, - pred_leaf, pred_contrib, **kwargs) + pred_leaf, pred_contrib, validate_features, + **kwargs) if callable(self._objective) or raw_score or pred_leaf or pred_contrib: return result else: @@ -1075,10 +1123,19 @@ def predict(self, X, raw_score=False, start_iteration=0, num_iteration=None, predict.__doc__ = LGBMModel.predict.__doc__ - def predict_proba(self, X, raw_score=False, start_iteration=0, num_iteration=None, - pred_leaf=False, pred_contrib=False, **kwargs): + def predict_proba( + self, + X, + raw_score: bool = False, + start_iteration: int = 0, + num_iteration: Optional[int] = None, + pred_leaf: bool = False, + pred_contrib: bool = False, + validate_features: bool = False, + **kwargs: Any + ): """Docstring is set after definition, using a template.""" - result = super().predict(X, raw_score, start_iteration, num_iteration, pred_leaf, pred_contrib, **kwargs) + result = super().predict(X, raw_score, start_iteration, num_iteration, pred_leaf, pred_contrib, validate_features, **kwargs) if callable(self._objective) and not (raw_score or pred_leaf or pred_contrib): _log_warning("Cannot compute class probabilities or labels " "due to the usage of customized objective function.\n" @@ -1106,7 +1163,7 @@ def classes_(self): return self._classes @property - def n_classes_(self): + def n_classes_(self) -> int: """:obj:`int`: The number of classes.""" if not self.__sklearn_is_fitted__(): raise LGBMNotFittedError('No classes found. 
Need to call fit beforehand.') diff --git a/python-package/setup.py b/python-package/setup.py index c234f65232e8..61dd423a60a8 100644 --- a/python-package/setup.py +++ b/python-package/setup.py @@ -21,6 +21,7 @@ ('integrated-opencl', None, 'Compile integrated OpenCL version'), ('gpu', 'g', 'Compile GPU version'), ('cuda', None, 'Compile CUDA version'), + ('cuda-exp', None, 'Compile CUDA Experimental version'), ('mpi', None, 'Compile MPI version'), ('nomp', None, 'Compile version without OpenMP support'), ('hdfs', 'h', 'Compile HDFS version'), @@ -104,6 +105,7 @@ def compile_cpp( use_mingw: bool = False, use_gpu: bool = False, use_cuda: bool = False, + use_cuda_exp: bool = False, use_mpi: bool = False, use_hdfs: bool = False, boost_root: Optional[str] = None, @@ -144,6 +146,8 @@ def compile_cpp( cmake_cmd.append(f"-DOpenCL_LIBRARY={opencl_library}") elif use_cuda: cmake_cmd.append("-DUSE_CUDA=ON") + elif use_cuda_exp: + cmake_cmd.append("-DUSE_CUDA_EXP=ON") if use_mpi: cmake_cmd.append("-DUSE_MPI=ON") if nomp: @@ -163,7 +167,7 @@ def compile_cpp( else: status = 1 lib_path = CURRENT_DIR / "compile" / "windows" / "x64" / "DLL" / "lib_lightgbm.dll" - if not any((use_gpu, use_cuda, use_mpi, use_hdfs, nomp, bit32, integrated_opencl)): + if not any((use_gpu, use_cuda, use_cuda_exp, use_mpi, use_hdfs, nomp, bit32, integrated_opencl)): logger.info("Starting to compile with MSBuild from existing solution file.") platform_toolsets = ("v143", "v142", "v141", "v140") for pt in platform_toolsets: @@ -227,6 +231,7 @@ def initialize_options(self) -> None: self.integrated_opencl = False self.gpu = False self.cuda = False + self.cuda_exp = False self.boost_root = None self.boost_dir = None self.boost_include_dir = None @@ -250,7 +255,7 @@ def run(self) -> None: LOG_PATH.touch() if not self.precompile: copy_files(integrated_opencl=self.integrated_opencl, use_gpu=self.gpu) - compile_cpp(use_mingw=self.mingw, use_gpu=self.gpu, use_cuda=self.cuda, use_mpi=self.mpi, + compile_cpp(use_mingw=self.mingw, use_gpu=self.gpu, use_cuda=self.cuda, use_cuda_exp=self.cuda_exp, use_mpi=self.mpi, use_hdfs=self.hdfs, boost_root=self.boost_root, boost_dir=self.boost_dir, boost_include_dir=self.boost_include_dir, boost_librarydir=self.boost_librarydir, opencl_include_dir=self.opencl_include_dir, opencl_library=self.opencl_library, @@ -270,6 +275,7 @@ def initialize_options(self) -> None: self.integrated_opencl = False self.gpu = False self.cuda = False + self.cuda_exp = False self.boost_root = None self.boost_dir = None self.boost_include_dir = None @@ -291,6 +297,7 @@ def finalize_options(self) -> None: install.integrated_opencl = self.integrated_opencl install.gpu = self.gpu install.cuda = self.cuda + install.cuda_exp = self.cuda_exp install.boost_root = self.boost_root install.boost_dir = self.boost_dir install.boost_include_dir = self.boost_include_dir @@ -340,6 +347,7 @@ def run(self) -> None: version=version, description='LightGBM Python Package', long_description=readme, + python_requires='>=3.6', install_requires=[ 'wheel', 'numpy', @@ -379,4 +387,5 @@ def run(self) -> None: 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', + 'Programming Language :: Python :: 3.10', 'Topic :: Scientific/Engineering :: Artificial Intelligence']) diff --git a/src/application/application.cpp b/src/application/application.cpp index b7f55a2ec0e4..7f7fc816cf4b 100644 --- a/src/application/application.cpp +++ b/src/application/application.cpp @@ -36,7 +36,7 @@ 
Application::Application(int argc, char** argv) { Log::Fatal("No training/prediction data, application quit"); } - if (config_.device_type == std::string("cuda")) { + if (config_.device_type == std::string("cuda") || config_.device_type == std::string("cuda_exp")) { LGBM_config_::current_device = lgbm_device_cuda; } } @@ -198,7 +198,7 @@ void Application::InitTrain() { for (size_t i = 0; i < valid_datas_.size(); ++i) { boosting_->AddValidDataset(valid_datas_[i].get(), Common::ConstPtrInVectorWrapper<Metric>(valid_metrics_[i])); - Log::Debug("Number of data points in validation set #%zu: %zu", i + 1, valid_datas_[i]->num_data()); + Log::Debug("Number of data points in validation set #%zu: %d", i + 1, valid_datas_[i]->num_data()); } Log::Info("Finished initializing training"); } diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 5b31865748f1..39ea84d49fdf 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -65,7 +65,7 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective es_first_metric_only_ = config_->first_metric_only; shrinkage_rate_ = config_->learning_rate; - if (config_->device_type == std::string("cuda")) { + if (config_->device_type == std::string("cuda") || config_->device_type == std::string("cuda_exp")) { LGBM_config_::current_learner = use_cuda_learner; } @@ -391,7 +391,7 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { auto grad = gradients + offset; auto hess = hessians + offset; // need to copy gradients for bagging subset. - if (is_use_subset_ && bag_data_cnt_ < num_data_) { + if (is_use_subset_ && bag_data_cnt_ < num_data_ && config_->device_type != std::string("cuda_exp")) { for (int i = 0; i < bag_data_cnt_; ++i) { gradients_[offset + i] = grad[bag_data_indices_[i]]; hessians_[offset + i] = hess[bag_data_indices_[i]]; @@ -419,20 +419,15 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { } else { // only add default score one-time if (models_.size() < static_cast<size_t>(num_tree_per_iteration_)) { - double output = 0.0; - if (!class_need_train_[cur_tree_id]) { - if (objective_function_ != nullptr) { - output = objective_function_->BoostFromScore(cur_tree_id); + if (objective_function_ != nullptr && !config_->boost_from_average && !train_score_updater_->has_init_score()) { + init_scores[cur_tree_id] = ObtainAutomaticInitialScore(objective_function_, cur_tree_id); + // updates scores + train_score_updater_->AddScore(init_scores[cur_tree_id], cur_tree_id); + for (auto& score_updater : valid_score_updater_) { + score_updater->AddScore(init_scores[cur_tree_id], cur_tree_id); } - } else { - output = init_scores[cur_tree_id]; - } - new_tree->AsConstantTree(output); - // updates scores - train_score_updater_->AddScore(output, cur_tree_id); - for (auto& score_updater : valid_score_updater_) { - score_updater->AddScore(output, cur_tree_id); } + new_tree->AsConstantTree(init_scores[cur_tree_id]); } } // add model @@ -810,15 +805,17 @@ void GBDT::ResetBaggingConfig(const Config* config, bool is_change_dataset) { double average_bag_rate = (static_cast<double>(bag_data_cnt_) / num_data_) / config->bagging_freq; is_use_subset_ = false; - const int group_threshold_usesubset = 100; - if (average_bag_rate <= 0.5 - && (train_data_->num_feature_groups() < group_threshold_usesubset)) { - if (tmp_subset_ == nullptr || is_change_dataset) { - tmp_subset_.reset(new Dataset(bag_data_cnt_)); - tmp_subset_->CopyFeatureMapperFrom(train_data_); + if (config_->device_type != std::string("cuda_exp")) { + const 
int group_threshold_usesubset = 100; + if (average_bag_rate <= 0.5 + && (train_data_->num_feature_groups() < group_threshold_usesubset)) { + if (tmp_subset_ == nullptr || is_change_dataset) { + tmp_subset_.reset(new Dataset(bag_data_cnt_)); + tmp_subset_->CopyFeatureMapperFrom(train_data_); + } + is_use_subset_ = true; + Log::Debug("Use subset for bagging"); } - is_use_subset_ = true; - Log::Debug("Use subset for bagging"); } need_re_bagging_ = true; diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h index efeacfbfaef0..abd94a0c4022 100644 --- a/src/boosting/gbdt.h +++ b/src/boosting/gbdt.h @@ -193,7 +193,7 @@ class GBDT : public GBDTBase { if (data_idx > 0) { num_data = valid_score_updater_[data_idx - 1]->num_data(); } - return num_data * num_class_; + return static_cast<int64_t>(num_data) * num_class_; } /*! @@ -488,7 +488,7 @@ class GBDT : public GBDTBase { /*! \brief Parser config file content */ std::string parser_config_str_ = ""; -#ifdef USE_CUDA +#if defined(USE_CUDA) || defined(USE_CUDA_EXP) /*! \brief First order derivative of training data */ std::vector<score_t, Common::AlignmentAllocator<score_t, kAlignedSize>> gradients_; /*! \brief Second order derivative of training data */ diff --git a/src/c_api.cpp b/src/c_api.cpp index d8a8deaf57b0..d86862060917 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -974,13 +974,13 @@ int LGBM_DatasetCreateFromFile(const char* filename, API_END(); } - int LGBM_DatasetCreateFromSampledColumn(double** sample_data, int** sample_indices, int32_t ncol, const int* num_per_col, int32_t num_sample_row, - int32_t num_total_row, + int32_t num_local_row, + int64_t num_dist_row, const char* parameters, DatasetHandle* out) { API_BEGIN(); @@ -989,13 +989,16 @@ int LGBM_DatasetCreateFromSampledColumn(double** sample_data, config.Set(param); OMP_SET_NUM_THREADS(config.num_threads); DatasetLoader loader(config, nullptr, 1, nullptr); - *out = loader.ConstructFromSampleData(sample_data, sample_indices, ncol, num_per_col, num_sample_row, - static_cast<data_size_t>(num_total_row)); + *out = loader.ConstructFromSampleData(sample_data, + sample_indices, + ncol, + num_per_col, + num_sample_row, + static_cast<data_size_t>(num_local_row), + num_dist_row); API_END(); } - int LGBM_DatasetCreateByReference(const DatasetHandle reference, int64_t num_total_row, DatasetHandle* out) { @@ -1141,7 +1144,9 @@ int LGBM_DatasetCreateFromMats(int32_t nmat, Vector2Ptr<int>(&sample_idx).data(), ncol, VectorSize<double>(sample_values).data(), - sample_cnt, total_nrow)); + sample_cnt, + total_nrow, + total_nrow)); } else { ret.reset(new Dataset(total_nrow)); ret->CreateValid( @@ -1216,7 +1221,9 @@ int LGBM_DatasetCreateFromCSR(const void* indptr, Vector2Ptr<int>(&sample_idx).data(), static_cast<int>(num_col), VectorSize<double>(sample_values).data(), - sample_cnt, nrow)); + sample_cnt, + nrow, + nrow)); } else { ret.reset(new Dataset(nrow)); ret->CreateValid( @@ -1283,7 +1290,9 @@ int LGBM_DatasetCreateFromCSRFunc(void* get_row_funptr, Vector2Ptr<int>(&sample_idx).data(), static_cast<int>(num_col), VectorSize<double>(sample_values).data(), - sample_cnt, nrow)); + sample_cnt, + nrow, + nrow)); } else { ret.reset(new Dataset(nrow)); ret->CreateValid( @@ -1355,7 +1364,9 @@ int LGBM_DatasetCreateFromCSC(const void* col_ptr, Vector2Ptr<int>(&sample_idx).data(), static_cast<int>(sample_values.size()), VectorSize<double>(sample_values).data(), - sample_cnt, nrow)); + sample_cnt, + nrow, + nrow)); } else { ret.reset(new Dataset(nrow)); ret->CreateValid( @@ -1550,6 +1561,25 @@ int LGBM_DatasetGetNumFeature(DatasetHandle handle, API_END(); } +int LGBM_DatasetGetFeatureNumBin(DatasetHandle handle, + int feature, + int* out) { + API_BEGIN(); + auto dataset = 
reinterpret_cast<Dataset*>(handle); + int num_features = dataset->num_total_features(); + if (feature < 0 || feature >= num_features) { + Log::Fatal("Tried to retrieve number of bins for feature index %d, " + "but the valid feature indices are [0, %d].", feature, num_features - 1); + } + int inner_idx = dataset->InnerFeatureIndex(feature); + if (inner_idx >= 0) { + *out = dataset->FeatureNumBin(inner_idx); + } else { + *out = 0; + } + API_END(); +} + int LGBM_DatasetAddFeaturesFrom(DatasetHandle target, DatasetHandle source) { API_BEGIN(); @@ -1966,17 +1996,17 @@ int LGBM_BoosterPredictSparseOutput(BoosterHandle handle, int LGBM_BoosterFreePredictSparse(void* indptr, int32_t* indices, void* data, int indptr_type, int data_type) { API_BEGIN(); if (indptr_type == C_API_DTYPE_INT32) { - delete reinterpret_cast<int32_t*>(indptr); + delete[] reinterpret_cast<int32_t*>(indptr); } else if (indptr_type == C_API_DTYPE_INT64) { - delete reinterpret_cast<int64_t*>(indptr); + delete[] reinterpret_cast<int64_t*>(indptr); } else { Log::Fatal("Unknown indptr type in LGBM_BoosterFreePredictSparse"); } - delete indices; + delete[] indices; if (data_type == C_API_DTYPE_FLOAT32) { - delete reinterpret_cast<float*>(data); + delete[] reinterpret_cast<float*>(data); } else if (data_type == C_API_DTYPE_FLOAT64) { - delete reinterpret_cast<double*>(data); + delete[] reinterpret_cast<double*>(data); } else { Log::Fatal("Unknown data type in LGBM_BoosterFreePredictSparse"); } @@ -2110,6 +2140,27 @@ int LGBM_BoosterPredictForCSC(BoosterHandle handle, API_END(); } +int LGBM_BoosterValidateFeatureNames(BoosterHandle handle, + const char** data_names, + int data_num_features) { + API_BEGIN(); + int booster_num_features; + size_t out_buffer_len; + LGBM_BoosterGetFeatureNames(handle, 0, &booster_num_features, 0, &out_buffer_len, nullptr); + if (booster_num_features != data_num_features) { + Log::Fatal("Model was trained on %d features, but got %d input features to predict.", booster_num_features, data_num_features); + } + std::vector<std::vector<char>> tmp_names(booster_num_features, std::vector<char>(out_buffer_len)); + std::vector<char*> booster_names = Vector2Ptr<char>(&tmp_names); + LGBM_BoosterGetFeatureNames(handle, data_num_features, &booster_num_features, out_buffer_len, &out_buffer_len, booster_names.data()); + for (int i = 0; i < booster_num_features; ++i) { + if (strcmp(data_names[i], booster_names[i]) != 0) { + Log::Fatal("Expected '%s' at position %d but found '%s'", booster_names[i], i, data_names[i]); + } + } + API_END(); +} + int LGBM_BoosterPredictForMat(BoosterHandle handle, const void* data, int data_type, diff --git a/src/cuda/cuda_algorithms.cu b/src/cuda/cuda_algorithms.cu new file mode 100644 index 000000000000..9bc52ceaedc8 --- /dev/null +++ b/src/cuda/cuda_algorithms.cu @@ -0,0 +1,82 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. 
+ */ + +#ifdef USE_CUDA_EXP + +#include + +namespace LightGBM { + +template +__global__ void ShufflePrefixSumGlobalKernel(T* values, size_t len, T* block_prefix_sum_buffer) { + __shared__ T shared_mem_buffer[32]; + const size_t index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + T value = 0; + if (index < len) { + value = values[index]; + } + const T prefix_sum_value = ShufflePrefixSum(value, shared_mem_buffer); + values[index] = prefix_sum_value; + if (threadIdx.x == blockDim.x - 1) { + block_prefix_sum_buffer[blockIdx.x] = prefix_sum_value; + } +} + +template +__global__ void ShufflePrefixSumGlobalReduceBlockKernel(T* block_prefix_sum_buffer, int num_blocks) { + __shared__ T shared_mem_buffer[32]; + const int num_blocks_per_thread = (num_blocks + GLOBAL_PREFIX_SUM_BLOCK_SIZE - 2) / (GLOBAL_PREFIX_SUM_BLOCK_SIZE - 1); + int thread_block_start = threadIdx.x == 0 ? 0 : (threadIdx.x - 1) * num_blocks_per_thread; + int thread_block_end = threadIdx.x == 0 ? 0 : min(thread_block_start + num_blocks_per_thread, num_blocks); + T base = 0; + for (int block_index = thread_block_start; block_index < thread_block_end; ++block_index) { + base += block_prefix_sum_buffer[block_index]; + } + base = ShufflePrefixSum(base, shared_mem_buffer); + thread_block_start = threadIdx.x == blockDim.x - 1 ? 0 : threadIdx.x * num_blocks_per_thread; + thread_block_end = threadIdx.x == blockDim.x - 1 ? 0 : min(thread_block_start + num_blocks_per_thread, num_blocks); + for (int block_index = thread_block_start + 1; block_index < thread_block_end; ++block_index) { + block_prefix_sum_buffer[block_index] += block_prefix_sum_buffer[block_index - 1]; + } + for (int block_index = thread_block_start; block_index < thread_block_end; ++block_index) { + block_prefix_sum_buffer[block_index] += base; + } +} + +template +__global__ void ShufflePrefixSumGlobalAddBase(size_t len, const T* block_prefix_sum_buffer, T* values) { + const T base = blockIdx.x == 0 ? 0 : block_prefix_sum_buffer[blockIdx.x - 1]; + const size_t index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + if (index < len) { + values[index] += base; + } +} + +template +void ShufflePrefixSumGlobalInner(T* values, size_t len, T* block_prefix_sum_buffer) { + const int num_blocks = (static_cast(len) + GLOBAL_PREFIX_SUM_BLOCK_SIZE - 1) / GLOBAL_PREFIX_SUM_BLOCK_SIZE; + ShufflePrefixSumGlobalKernel<<>>(values, len, block_prefix_sum_buffer); + ShufflePrefixSumGlobalReduceBlockKernel<<<1, GLOBAL_PREFIX_SUM_BLOCK_SIZE>>>(block_prefix_sum_buffer, num_blocks); + ShufflePrefixSumGlobalAddBase<<>>(len, block_prefix_sum_buffer, values); +} + +template <> +void ShufflePrefixSumGlobal(uint16_t* values, size_t len, uint16_t* block_prefix_sum_buffer) { + ShufflePrefixSumGlobalInner(values, len, block_prefix_sum_buffer); +} + +template <> +void ShufflePrefixSumGlobal(uint32_t* values, size_t len, uint32_t* block_prefix_sum_buffer) { + ShufflePrefixSumGlobalInner(values, len, block_prefix_sum_buffer); +} + +template <> +void ShufflePrefixSumGlobal(uint64_t* values, size_t len, uint64_t* block_prefix_sum_buffer) { + ShufflePrefixSumGlobalInner(values, len, block_prefix_sum_buffer); +} + +} // namespace LightGBM + +#endif // USE_CUDA_EXP diff --git a/src/cuda/cuda_utils.cpp b/src/cuda/cuda_utils.cpp new file mode 100644 index 000000000000..bab1e1b8ff37 --- /dev/null +++ b/src/cuda/cuda_utils.cpp @@ -0,0 +1,31 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. 
See LICENSE file in the project root for license information. + */ + +#ifdef USE_CUDA_EXP + +#include <LightGBM/cuda/cuda_utils.h> + +namespace LightGBM { + +void SynchronizeCUDADevice(const char* file, const int line) { + gpuAssert(cudaDeviceSynchronize(), file, line); +} + +void PrintLastCUDAError() { + const char* error_name = cudaGetErrorName(cudaGetLastError()); + Log::Fatal(error_name); +} + +void SetCUDADevice(int gpu_device_id, const char* file, int line) { + int cur_gpu_device_id = 0; + CUDASUCCESS_OR_FATAL_OUTER(cudaGetDevice(&cur_gpu_device_id)); + if (cur_gpu_device_id != gpu_device_id) { + CUDASUCCESS_OR_FATAL_OUTER(cudaSetDevice(gpu_device_id)); + } +} + +} // namespace LightGBM + +#endif // USE_CUDA_EXP diff --git a/src/io/config.cpp b/src/io/config.cpp index a42b392dac3e..298ce79e661d 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -128,6 +128,8 @@ void GetDeviceType(const std::unordered_map<std::string, std::string>& params, s *device_type = "gpu"; } else if (value == std::string("cuda")) { *device_type = "cuda"; + } else if (value == std::string("cuda_exp")) { + *device_type = "cuda_exp"; } else { Log::Fatal("Unknown device type %s", value.c_str()); } @@ -162,18 +164,18 @@ void Config::GetAucMuWeights() { } else { auc_mu_weights_matrix = std::vector<std::vector<double>> (num_class, std::vector<double>(num_class, 0)); if (auc_mu_weights.size() != static_cast<size_t>(num_class * num_class)) { - Log::Fatal("auc_mu_weights must have %d elements, but found %d", num_class * num_class, auc_mu_weights.size()); + Log::Fatal("auc_mu_weights must have %d elements, but found %zu", num_class * num_class, auc_mu_weights.size()); } for (size_t i = 0; i < static_cast<size_t>(num_class); ++i) { for (size_t j = 0; j < static_cast<size_t>(num_class); ++j) { if (i == j) { auc_mu_weights_matrix[i][j] = 0; if (std::fabs(auc_mu_weights[i * num_class + j]) > kZeroThreshold) { - Log::Info("AUC-mu matrix must have zeros on diagonal. Overwriting value in position %d of auc_mu_weights with 0.", i * num_class + j); + Log::Info("AUC-mu matrix must have zeros on diagonal. Overwriting value in position %zu of auc_mu_weights with 0.", i * num_class + j); } } else { if (std::fabs(auc_mu_weights[i * num_class + j]) < kZeroThreshold) { - Log::Fatal("AUC-mu matrix must have non-zero values for non-diagonal entries. Found zero value in position %d of auc_mu_weights.", i * num_class + j); + Log::Fatal("AUC-mu matrix must have non-zero values for non-diagonal entries. 
Found zero value in position %zu of auc_mu_weights.", i * num_class + j); } auc_mu_weights_matrix[i][j] = auc_mu_weights[i * num_class + j]; } @@ -208,7 +210,7 @@ void Config::Set(const std::unordered_map& params) { GetObjectiveType(params, &objective); GetMetricType(params, objective, &metric); GetDeviceType(params, &device_type); - if (device_type == std::string("cuda")) { + if (device_type == std::string("cuda") || device_type == std::string("cuda_exp")) { LGBM_config_::current_device = lgbm_device_cuda; } GetTreeLearnerType(params, &tree_learner); @@ -331,13 +333,20 @@ void Config::CheckParamConflict() { num_leaves = static_cast(full_num_leaves); } } - // force col-wise for gpu & CUDA if (device_type == std::string("gpu") || device_type == std::string("cuda")) { + // force col-wise for gpu, and cuda version force_col_wise = true; force_row_wise = false; if (deterministic) { Log::Warning("Although \"deterministic\" is set, the results ran by GPU may be non-deterministic."); } + } else if (device_type == std::string("cuda_exp")) { + // force row-wise for cuda_exp version + force_col_wise = false; + force_row_wise = true; + if (deterministic) { + Log::Warning("Although \"deterministic\" is set, the results ran by GPU may be non-deterministic."); + } } // force gpu_use_dp for CUDA if (device_type == std::string("cuda") && !gpu_use_dp) { diff --git a/src/io/cuda/cuda_column_data.cpp b/src/io/cuda/cuda_column_data.cpp new file mode 100644 index 000000000000..c4b0bb62e584 --- /dev/null +++ b/src/io/cuda/cuda_column_data.cpp @@ -0,0 +1,311 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. + */ + +#ifdef USE_CUDA_EXP + +#include + +namespace LightGBM { + +CUDAColumnData::CUDAColumnData(const data_size_t num_data, const int gpu_device_id) { + num_threads_ = OMP_NUM_THREADS(); + num_data_ = num_data; + if (gpu_device_id >= 0) { + SetCUDADevice(gpu_device_id, __FILE__, __LINE__); + } else { + SetCUDADevice(0, __FILE__, __LINE__); + } + cuda_used_indices_ = nullptr; + cuda_data_by_column_ = nullptr; + cuda_column_bit_type_ = nullptr; + cuda_feature_min_bin_ = nullptr; + cuda_feature_max_bin_ = nullptr; + cuda_feature_offset_ = nullptr; + cuda_feature_most_freq_bin_ = nullptr; + cuda_feature_default_bin_ = nullptr; + cuda_feature_missing_is_zero_ = nullptr; + cuda_feature_missing_is_na_ = nullptr; + cuda_feature_mfb_is_zero_ = nullptr; + cuda_feature_mfb_is_na_ = nullptr; + cuda_feature_to_column_ = nullptr; + data_by_column_.clear(); +} + +CUDAColumnData::~CUDAColumnData() { + DeallocateCUDAMemory(&cuda_used_indices_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_data_by_column_, __FILE__, __LINE__); + for (size_t i = 0; i < data_by_column_.size(); ++i) { + DeallocateCUDAMemory(&data_by_column_[i], __FILE__, __LINE__); + } + DeallocateCUDAMemory(&cuda_column_bit_type_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_feature_min_bin_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_feature_max_bin_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_feature_offset_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_feature_most_freq_bin_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_feature_default_bin_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_feature_missing_is_zero_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_feature_missing_is_na_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_feature_mfb_is_zero_, __FILE__, __LINE__); + 
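// [editor's note] The CUDA memory helpers used throughout this destructor,
// and gpuAssert() inside SynchronizeCUDADevice() earlier, presumably wrap
// CUDA status codes. A conventional checker of this kind (an assumption for
// illustration, not the verbatim LightGBM definition) amounts to:
//   if (code != cudaSuccess) {
//     fprintf(stderr, "%s at %s:%d\n", cudaGetErrorString(code), file, line);
//     abort();
//   }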
+  DeallocateCUDAMemory(&cuda_feature_mfb_is_na_, __FILE__, __LINE__);
+  DeallocateCUDAMemory(&cuda_feature_to_column_, __FILE__, __LINE__);
+  DeallocateCUDAMemory(&cuda_used_indices_, __FILE__, __LINE__);
+}
+
+template <bool IS_4BIT, bool IS_SPARSE, typename BIN_TYPE>
+void CUDAColumnData::InitOneColumnData(const void* in_column_data, BinIterator* bin_iterator, void** out_column_data_pointer) {
+  BIN_TYPE* cuda_column_data = nullptr;
+  if (!IS_SPARSE) {
+    if (IS_4BIT) {
+      std::vector<BIN_TYPE> expanded_column_data(num_data_, 0);
+      const BIN_TYPE* in_column_data_reinterpreted = reinterpret_cast<const BIN_TYPE*>(in_column_data);
+      for (data_size_t i = 0; i < num_data_; ++i) {
+        expanded_column_data[i] = static_cast<BIN_TYPE>((in_column_data_reinterpreted[i >> 1] >> ((i & 1) << 2)) & 0xf);
+      }
+      InitCUDAMemoryFromHostMemory(&cuda_column_data,
+                                   expanded_column_data.data(),
+                                   static_cast<size_t>(num_data_),
+                                   __FILE__,
+                                   __LINE__);
+    } else {
+      InitCUDAMemoryFromHostMemory(&cuda_column_data,
+                                   reinterpret_cast<const BIN_TYPE*>(in_column_data),
+                                   static_cast<size_t>(num_data_),
+                                   __FILE__,
+                                   __LINE__);
+    }
+  } else {
+    // need to iterate bin iterator
+    std::vector<BIN_TYPE> expanded_column_data(num_data_, 0);
+    for (data_size_t i = 0; i < num_data_; ++i) {
+      expanded_column_data[i] = static_cast<BIN_TYPE>(bin_iterator->RawGet(i));
+    }
+    InitCUDAMemoryFromHostMemory(&cuda_column_data,
+                                 expanded_column_data.data(),
+                                 static_cast<size_t>(num_data_),
+                                 __FILE__,
+                                 __LINE__);
+  }
+  *out_column_data_pointer = reinterpret_cast<void*>(cuda_column_data);
+}
+
+void CUDAColumnData::Init(const int num_columns,
+                          const std::vector<const void*>& column_data,
+                          const std::vector<BinIterator*>& column_bin_iterator,
+                          const std::vector<uint8_t>& column_bit_type,
+                          const std::vector<uint32_t>& feature_max_bin,
+                          const std::vector<uint32_t>& feature_min_bin,
+                          const std::vector<uint32_t>& feature_offset,
+                          const std::vector<uint32_t>& feature_most_freq_bin,
+                          const std::vector<uint32_t>& feature_default_bin,
+                          const std::vector<uint8_t>& feature_missing_is_zero,
+                          const std::vector<uint8_t>& feature_missing_is_na,
+                          const std::vector<uint8_t>& feature_mfb_is_zero,
+                          const std::vector<uint8_t>& feature_mfb_is_na,
+                          const std::vector<int>& feature_to_column) {
+  num_columns_ = num_columns;
+  column_bit_type_ = column_bit_type;
+  feature_max_bin_ = feature_max_bin;
+  feature_min_bin_ = feature_min_bin;
+  feature_offset_ = feature_offset;
+  feature_most_freq_bin_ = feature_most_freq_bin;
+  feature_default_bin_ = feature_default_bin;
+  feature_missing_is_zero_ = feature_missing_is_zero;
+  feature_missing_is_na_ = feature_missing_is_na;
+  feature_mfb_is_zero_ = feature_mfb_is_zero;
+  feature_mfb_is_na_ = feature_mfb_is_na;
+  data_by_column_.resize(num_columns_, nullptr);
+  OMP_INIT_EX();
+  #pragma omp parallel for schedule(static) num_threads(num_threads_)
+  for (int column_index = 0; column_index < num_columns_; ++column_index) {
+    OMP_LOOP_EX_BEGIN();
+    const int8_t bit_type = column_bit_type[column_index];
+    if (column_data[column_index] != nullptr) {
+      // is dense column
+      if (bit_type == 4) {
+        column_bit_type_[column_index] = 8;
+        InitOneColumnData<true, false, uint8_t>(column_data[column_index], nullptr, &data_by_column_[column_index]);
+      } else if (bit_type == 8) {
+        InitOneColumnData<false, false, uint8_t>(column_data[column_index], nullptr, &data_by_column_[column_index]);
+      } else if (bit_type == 16) {
+        InitOneColumnData<false, false, uint16_t>(column_data[column_index], nullptr, &data_by_column_[column_index]);
+      } else if (bit_type == 32) {
+        InitOneColumnData<false, false, uint32_t>(column_data[column_index], nullptr, &data_by_column_[column_index]);
+      } else {
+        Log::Fatal("Unknown column bit type %d", bit_type);
+      }
+    } else {
+      // is sparse column
+      if (bit_type == 8) {
+        InitOneColumnData<false, true, uint8_t>(nullptr, column_bin_iterator[column_index], &data_by_column_[column_index]);
+      }
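// [editor's note] Worked example for the IS_4BIT branch in InitOneColumnData()
// above: two 4-bit bins are packed per byte, so row i lives in byte (i >> 1),
// and ((i & 1) << 2) selects the low (shift 0) or high (shift 4) nibble.
// For packed bytes {0x21, 0x43}: row 0 -> 0x21 & 0xf = 1,
// row 1 -> (0x21 >> 4) & 0xf = 2, row 2 -> 0x43 & 0xf = 3,
// row 3 -> (0x43 >> 4) & 0xf = 4.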
+      else if (bit_type == 16) {
+        InitOneColumnData<false, true, uint16_t>(nullptr, column_bin_iterator[column_index], &data_by_column_[column_index]);
+      } else if (bit_type == 32) {
+        InitOneColumnData<false, true, uint32_t>(nullptr, column_bin_iterator[column_index], &data_by_column_[column_index]);
+      } else {
+        Log::Fatal("Unknown column bit type %d", bit_type);
+      }
+    }
+    OMP_LOOP_EX_END();
+  }
+  OMP_THROW_EX();
+  feature_to_column_ = feature_to_column;
+  InitCUDAMemoryFromHostMemory(&cuda_data_by_column_,
+                               data_by_column_.data(),
+                               data_by_column_.size(),
+                               __FILE__,
+                               __LINE__);
+  InitColumnMetaInfo();
+}
+
+void CUDAColumnData::CopySubrow(
+  const CUDAColumnData* full_set,
+  const data_size_t* used_indices,
+  const data_size_t num_used_indices) {
+  num_threads_ = full_set->num_threads_;
+  num_columns_ = full_set->num_columns_;
+  column_bit_type_ = full_set->column_bit_type_;
+  feature_min_bin_ = full_set->feature_min_bin_;
+  feature_max_bin_ = full_set->feature_max_bin_;
+  feature_offset_ = full_set->feature_offset_;
+  feature_most_freq_bin_ = full_set->feature_most_freq_bin_;
+  feature_default_bin_ = full_set->feature_default_bin_;
+  feature_missing_is_zero_ = full_set->feature_missing_is_zero_;
+  feature_missing_is_na_ = full_set->feature_missing_is_na_;
+  feature_mfb_is_zero_ = full_set->feature_mfb_is_zero_;
+  feature_mfb_is_na_ = full_set->feature_mfb_is_na_;
+  feature_to_column_ = full_set->feature_to_column_;
+  if (cuda_used_indices_ == nullptr) {
+    // initialize the subset cuda column data
+    const size_t num_used_indices_size = static_cast<size_t>(num_used_indices);
+    AllocateCUDAMemory(&cuda_used_indices_, num_used_indices_size, __FILE__, __LINE__);
+    data_by_column_.resize(num_columns_, nullptr);
+    OMP_INIT_EX();
+    #pragma omp parallel for schedule(static) num_threads(num_threads_)
+    for (int column_index = 0; column_index < num_columns_; ++column_index) {
+      OMP_LOOP_EX_BEGIN();
+      const uint8_t bit_type = column_bit_type_[column_index];
+      if (bit_type == 8) {
+        uint8_t* column_data = nullptr;
+        AllocateCUDAMemory(&column_data, num_used_indices_size, __FILE__, __LINE__);
+        data_by_column_[column_index] = reinterpret_cast<void*>(column_data);
+      } else if (bit_type == 16) {
+        uint16_t* column_data = nullptr;
+        AllocateCUDAMemory(&column_data, num_used_indices_size, __FILE__, __LINE__);
+        data_by_column_[column_index] = reinterpret_cast<void*>(column_data);
+      } else if (bit_type == 32) {
+        uint32_t* column_data = nullptr;
+        AllocateCUDAMemory(&column_data, num_used_indices_size, __FILE__, __LINE__);
+        data_by_column_[column_index] = reinterpret_cast<void*>(column_data);
+      }
+      OMP_LOOP_EX_END();
+    }
+    OMP_THROW_EX();
+    InitCUDAMemoryFromHostMemory(&cuda_data_by_column_, data_by_column_.data(), data_by_column_.size(), __FILE__, __LINE__);
+    InitColumnMetaInfo();
+    cur_subset_buffer_size_ = num_used_indices;
+  } else {
+    if (num_used_indices > cur_subset_buffer_size_) {
+      ResizeWhenCopySubrow(num_used_indices);
+      cur_subset_buffer_size_ = num_used_indices;
+    }
+  }
+  CopyFromHostToCUDADevice(cuda_used_indices_, used_indices, static_cast<size_t>(num_used_indices), __FILE__, __LINE__);
+  num_used_indices_ = num_used_indices;
+  LaunchCopySubrowKernel(full_set->cuda_data_by_column());
+}
+
+void CUDAColumnData::ResizeWhenCopySubrow(const data_size_t num_used_indices) {
+  const size_t num_used_indices_size = static_cast<size_t>(num_used_indices);
+  DeallocateCUDAMemory(&cuda_used_indices_, __FILE__, __LINE__);
+  AllocateCUDAMemory(&cuda_used_indices_, num_used_indices_size, __FILE__, __LINE__);
+  OMP_INIT_EX();
+  #pragma omp parallel for schedule(static) num_threads(num_threads_)
+ for (int column_index = 0; column_index < num_columns_; ++column_index) { + OMP_LOOP_EX_BEGIN(); + const uint8_t bit_type = column_bit_type_[column_index]; + if (bit_type == 8) { + uint8_t* column_data = reinterpret_cast(data_by_column_[column_index]); + DeallocateCUDAMemory(&column_data, __FILE__, __LINE__); + AllocateCUDAMemory(&column_data, num_used_indices_size, __FILE__, __LINE__); + data_by_column_[column_index] = reinterpret_cast(column_data); + } else if (bit_type == 16) { + uint16_t* column_data = reinterpret_cast(data_by_column_[column_index]); + DeallocateCUDAMemory(&column_data, __FILE__, __LINE__); + AllocateCUDAMemory(&column_data, num_used_indices_size, __FILE__, __LINE__); + data_by_column_[column_index] = reinterpret_cast(column_data); + } else if (bit_type == 32) { + uint32_t* column_data = reinterpret_cast(data_by_column_[column_index]); + DeallocateCUDAMemory(&column_data, __FILE__, __LINE__); + AllocateCUDAMemory(&column_data, num_used_indices_size, __FILE__, __LINE__); + data_by_column_[column_index] = reinterpret_cast(column_data); + } + OMP_LOOP_EX_END(); + } + OMP_THROW_EX(); + DeallocateCUDAMemory(&cuda_data_by_column_, __FILE__, __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_data_by_column_, data_by_column_.data(), data_by_column_.size(), __FILE__, __LINE__); +} + +void CUDAColumnData::InitColumnMetaInfo() { + InitCUDAMemoryFromHostMemory(&cuda_column_bit_type_, + column_bit_type_.data(), + column_bit_type_.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_feature_max_bin_, + feature_max_bin_.data(), + feature_max_bin_.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_feature_min_bin_, + feature_min_bin_.data(), + feature_min_bin_.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_feature_offset_, + feature_offset_.data(), + feature_offset_.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_feature_most_freq_bin_, + feature_most_freq_bin_.data(), + feature_most_freq_bin_.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_feature_default_bin_, + feature_default_bin_.data(), + feature_default_bin_.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_feature_missing_is_zero_, + feature_missing_is_zero_.data(), + feature_missing_is_zero_.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_feature_missing_is_na_, + feature_missing_is_na_.data(), + feature_missing_is_na_.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_feature_mfb_is_zero_, + feature_mfb_is_zero_.data(), + feature_mfb_is_zero_.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_feature_mfb_is_na_, + feature_mfb_is_na_.data(), + feature_mfb_is_na_.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_feature_to_column_, + feature_to_column_.data(), + feature_to_column_.size(), + __FILE__, + __LINE__); +} + +} // namespace LightGBM + +#endif // USE_CUDA_EXP diff --git a/src/io/cuda/cuda_column_data.cu b/src/io/cuda/cuda_column_data.cu new file mode 100644 index 000000000000..3ab70e9a5758 --- /dev/null +++ b/src/io/cuda/cuda_column_data.cu @@ -0,0 +1,61 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. 
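// [editor's note -- illustration, not part of the diff] CopySubrow() and
// ResizeWhenCopySubrow() above implement a grow-only buffer policy: device
// allocations are reused across bagging iterations and redone only when a
// larger subset arrives (num_used_indices > cur_subset_buffer_size_). The
// same policy in miniature, as plain host code with hypothetical names:
#include <cstddef>
#include <vector>

class GrowOnlyBuffer {
 public:
  // Returns a buffer of at least n bytes, reallocating only on growth,
  // which avoids per-iteration allocator churn when subsets shrink.
  char* Request(std::size_t n) {
    if (n > buf_.size()) {
      buf_.resize(n);  // the CUDA version frees and re-allocates device memory here
    }
    return buf_.data();
  }

 private:
  std::vector<char> buf_;
};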
+ */ + + +#ifdef USE_CUDA_EXP + +#include + +#define COPY_SUBROW_BLOCK_SIZE_COLUMN_DATA (1024) + +namespace LightGBM { + +__global__ void CopySubrowKernel_ColumnData( + void* const* in_cuda_data_by_column, + const uint8_t* cuda_column_bit_type, + const data_size_t* cuda_used_indices, + const data_size_t num_used_indices, + const int num_column, + void** out_cuda_data_by_column) { + const data_size_t local_data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + if (local_data_index < num_used_indices) { + for (int column_index = 0; column_index < num_column; ++column_index) { + const void* in_column_data = in_cuda_data_by_column[column_index]; + void* out_column_data = out_cuda_data_by_column[column_index]; + const uint8_t bit_type = cuda_column_bit_type[column_index]; + if (bit_type == 8) { + const uint8_t* true_in_column_data = reinterpret_cast(in_column_data); + uint8_t* true_out_column_data = reinterpret_cast(out_column_data); + const data_size_t global_data_index = cuda_used_indices[local_data_index]; + true_out_column_data[local_data_index] = true_in_column_data[global_data_index]; + } else if (bit_type == 16) { + const uint16_t* true_in_column_data = reinterpret_cast(in_column_data); + uint16_t* true_out_column_data = reinterpret_cast(out_column_data); + const data_size_t global_data_index = cuda_used_indices[local_data_index]; + true_out_column_data[local_data_index] = true_in_column_data[global_data_index]; + } else if (bit_type == 32) { + const uint32_t* true_in_column_data = reinterpret_cast(in_column_data); + uint32_t* true_out_column_data = reinterpret_cast(out_column_data); + const data_size_t global_data_index = cuda_used_indices[local_data_index]; + true_out_column_data[local_data_index] = true_in_column_data[global_data_index]; + } + } + } +} + +void CUDAColumnData::LaunchCopySubrowKernel(void* const* in_cuda_data_by_column) { + const int num_blocks = (num_used_indices_ + COPY_SUBROW_BLOCK_SIZE_COLUMN_DATA - 1) / COPY_SUBROW_BLOCK_SIZE_COLUMN_DATA; + CopySubrowKernel_ColumnData<<>>( + in_cuda_data_by_column, + cuda_column_bit_type_, + cuda_used_indices_, + num_used_indices_, + num_columns_, + cuda_data_by_column_); +} + +} // namespace LightGBM + +#endif // USE_CUDA_EXP diff --git a/src/io/cuda/cuda_metadata.cpp b/src/io/cuda/cuda_metadata.cpp new file mode 100644 index 000000000000..2a3dd380254a --- /dev/null +++ b/src/io/cuda/cuda_metadata.cpp @@ -0,0 +1,92 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. 
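// [editor's note -- illustration, not part of the diff] CopySubrowKernel_ColumnData
// above is an index gather: thread t writes out[t] = in[used_indices[t]] for
// every column, dispatching on the column's bit width. The core pattern,
// stripped of that dispatch (hypothetical names):
__global__ void GatherKernelSketch(const uint8_t* in, const int* indices,
                                   int num_indices, uint8_t* out) {
  const int t = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
  if (t < num_indices) {
    out[t] = in[indices[t]];  // writes are coalesced; reads are gathered
  }
}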
+ */ + +#ifdef USE_CUDA_EXP + +#include + +namespace LightGBM { + +CUDAMetadata::CUDAMetadata(const int gpu_device_id) { + if (gpu_device_id >= 0) { + SetCUDADevice(gpu_device_id, __FILE__, __LINE__); + } else { + SetCUDADevice(0, __FILE__, __LINE__); + } + cuda_label_ = nullptr; + cuda_weights_ = nullptr; + cuda_query_boundaries_ = nullptr; + cuda_query_weights_ = nullptr; + cuda_init_score_ = nullptr; +} + +CUDAMetadata::~CUDAMetadata() { + DeallocateCUDAMemory(&cuda_label_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_weights_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_query_boundaries_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_query_weights_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_init_score_, __FILE__, __LINE__); +} + +void CUDAMetadata::Init(const std::vector& label, + const std::vector& weight, + const std::vector& query_boundaries, + const std::vector& query_weights, + const std::vector& init_score) { + if (label.size() == 0) { + cuda_label_ = nullptr; + } else { + InitCUDAMemoryFromHostMemory(&cuda_label_, label.data(), label.size(), __FILE__, __LINE__); + } + if (weight.size() == 0) { + cuda_weights_ = nullptr; + } else { + InitCUDAMemoryFromHostMemory(&cuda_weights_, weight.data(), weight.size(), __FILE__, __LINE__); + } + if (query_boundaries.size() == 0) { + cuda_query_boundaries_ = nullptr; + } else { + InitCUDAMemoryFromHostMemory(&cuda_query_boundaries_, query_boundaries.data(), query_boundaries.size(), __FILE__, __LINE__); + } + if (query_weights.size() == 0) { + cuda_query_weights_ = nullptr; + } else { + InitCUDAMemoryFromHostMemory(&cuda_query_weights_, query_weights.data(), query_weights.size(), __FILE__, __LINE__); + } + if (init_score.size() == 0) { + cuda_init_score_ = nullptr; + } else { + InitCUDAMemoryFromHostMemory(&cuda_init_score_, init_score.data(), init_score.size(), __FILE__, __LINE__); + } + SynchronizeCUDADevice(__FILE__, __LINE__); +} + +void CUDAMetadata::SetLabel(const label_t* label, data_size_t len) { + DeallocateCUDAMemory(&cuda_label_, __FILE__, __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_label_, label, static_cast(len), __FILE__, __LINE__); +} + +void CUDAMetadata::SetWeights(const label_t* weights, data_size_t len) { + DeallocateCUDAMemory(&cuda_weights_, __FILE__, __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_weights_, weights, static_cast(len), __FILE__, __LINE__); +} + +void CUDAMetadata::SetQuery(const data_size_t* query_boundaries, const label_t* query_weights, data_size_t num_queries) { + DeallocateCUDAMemory(&cuda_query_boundaries_, __FILE__, __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_query_boundaries_, query_boundaries, static_cast(num_queries) + 1, __FILE__, __LINE__); + if (query_weights != nullptr) { + DeallocateCUDAMemory(&cuda_query_weights_, __FILE__, __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_query_weights_, query_weights, static_cast(num_queries), __FILE__, __LINE__); + } +} + +void CUDAMetadata::SetInitScore(const double* init_score, data_size_t len) { + DeallocateCUDAMemory(&cuda_init_score_, __FILE__, __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_init_score_, init_score, static_cast(len), __FILE__, __LINE__); +} + +} // namespace LightGBM + +#endif // USE_CUDA_EXP diff --git a/src/io/cuda/cuda_row_data.cpp b/src/io/cuda/cuda_row_data.cpp new file mode 100644 index 000000000000..3c66a164d9d4 --- /dev/null +++ b/src/io/cuda/cuda_row_data.cpp @@ -0,0 +1,477 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. 
See LICENSE file in the project root for license information. + */ + +#ifdef USE_CUDA_EXP + +#include + +namespace LightGBM { + +CUDARowData::CUDARowData(const Dataset* train_data, + const TrainingShareStates* train_share_state, + const int gpu_device_id, + const bool gpu_use_dp): +gpu_device_id_(gpu_device_id), +gpu_use_dp_(gpu_use_dp) { + num_threads_ = OMP_NUM_THREADS(); + num_data_ = train_data->num_data(); + const auto& feature_hist_offsets = train_share_state->feature_hist_offsets(); + if (gpu_use_dp_) { + shared_hist_size_ = DP_SHARED_HIST_SIZE; + } else { + shared_hist_size_ = SP_SHARED_HIST_SIZE; + } + if (feature_hist_offsets.empty()) { + num_total_bin_ = 0; + } else { + num_total_bin_ = static_cast(feature_hist_offsets.back()); + } + num_feature_group_ = train_data->num_feature_groups(); + num_feature_ = train_data->num_features(); + if (gpu_device_id >= 0) { + SetCUDADevice(gpu_device_id, __FILE__, __LINE__); + } else { + SetCUDADevice(0, __FILE__, __LINE__); + } + cuda_data_uint8_t_ = nullptr; + cuda_data_uint16_t_ = nullptr; + cuda_data_uint32_t_ = nullptr; + cuda_row_ptr_uint16_t_ = nullptr; + cuda_row_ptr_uint32_t_ = nullptr; + cuda_row_ptr_uint64_t_ = nullptr; + cuda_partition_ptr_uint16_t_ = nullptr; + cuda_partition_ptr_uint32_t_ = nullptr; + cuda_partition_ptr_uint64_t_ = nullptr; + cuda_feature_partition_column_index_offsets_ = nullptr; + cuda_column_hist_offsets_ = nullptr; + cuda_partition_hist_offsets_ = nullptr; + cuda_block_buffer_uint16_t_ = nullptr; + cuda_block_buffer_uint32_t_ = nullptr; + cuda_block_buffer_uint64_t_ = nullptr; +} + +CUDARowData::~CUDARowData() { + DeallocateCUDAMemory(&cuda_data_uint8_t_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_data_uint16_t_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_data_uint32_t_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_row_ptr_uint16_t_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_row_ptr_uint32_t_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_row_ptr_uint64_t_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_feature_partition_column_index_offsets_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_column_hist_offsets_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_partition_hist_offsets_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_block_buffer_uint16_t_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_block_buffer_uint32_t_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_block_buffer_uint64_t_, __FILE__, __LINE__); +} + +void CUDARowData::Init(const Dataset* train_data, TrainingShareStates* train_share_state) { + if (num_feature_ == 0) { + return; + } + DivideCUDAFeatureGroups(train_data, train_share_state); + bit_type_ = 0; + size_t total_size = 0; + const void* host_row_ptr = nullptr; + row_ptr_bit_type_ = 0; + const void* host_data = train_share_state->GetRowWiseData(&bit_type_, &total_size, &is_sparse_, &host_row_ptr, &row_ptr_bit_type_); + if (bit_type_ == 8) { + if (!is_sparse_) { + std::vector partitioned_data; + GetDenseDataPartitioned(reinterpret_cast(host_data), &partitioned_data); + InitCUDAMemoryFromHostMemory(&cuda_data_uint8_t_, partitioned_data.data(), total_size, __FILE__, __LINE__); + } else { + if (row_ptr_bit_type_ == 16) { + InitSparseData( + reinterpret_cast(host_data), + reinterpret_cast(host_row_ptr), + &cuda_data_uint8_t_, + &cuda_row_ptr_uint16_t_, + &cuda_partition_ptr_uint16_t_); + } else if (row_ptr_bit_type_ == 32) { + InitSparseData( + reinterpret_cast(host_data), + reinterpret_cast(host_row_ptr), + &cuda_data_uint8_t_, + 
+          &cuda_row_ptr_uint32_t_,
+          &cuda_partition_ptr_uint32_t_);
+      } else if (row_ptr_bit_type_ == 64) {
+        InitSparseData(
+          reinterpret_cast<const uint8_t*>(host_data),
+          reinterpret_cast<const uint64_t*>(host_row_ptr),
+          &cuda_data_uint8_t_,
+          &cuda_row_ptr_uint64_t_,
+          &cuda_partition_ptr_uint64_t_);
+      } else {
+        Log::Fatal("Unknown data ptr bit type %d", row_ptr_bit_type_);
+      }
+    }
+  } else if (bit_type_ == 16) {
+    if (!is_sparse_) {
+      std::vector<uint16_t> partitioned_data;
+      GetDenseDataPartitioned(reinterpret_cast<const uint16_t*>(host_data), &partitioned_data);
+      InitCUDAMemoryFromHostMemory(&cuda_data_uint16_t_, partitioned_data.data(), total_size, __FILE__, __LINE__);
+    } else {
+      if (row_ptr_bit_type_ == 16) {
+        InitSparseData(
+          reinterpret_cast<const uint16_t*>(host_data),
+          reinterpret_cast<const uint16_t*>(host_row_ptr),
+          &cuda_data_uint16_t_,
+          &cuda_row_ptr_uint16_t_,
+          &cuda_partition_ptr_uint16_t_);
+      } else if (row_ptr_bit_type_ == 32) {
+        InitSparseData(
+          reinterpret_cast<const uint16_t*>(host_data),
+          reinterpret_cast<const uint32_t*>(host_row_ptr),
+          &cuda_data_uint16_t_,
+          &cuda_row_ptr_uint32_t_,
+          &cuda_partition_ptr_uint32_t_);
+      } else if (row_ptr_bit_type_ == 64) {
+        InitSparseData(
+          reinterpret_cast<const uint16_t*>(host_data),
+          reinterpret_cast<const uint64_t*>(host_row_ptr),
+          &cuda_data_uint16_t_,
+          &cuda_row_ptr_uint64_t_,
+          &cuda_partition_ptr_uint64_t_);
+      } else {
+        Log::Fatal("Unknown data ptr bit type %d", row_ptr_bit_type_);
+      }
+    }
+  } else if (bit_type_ == 32) {
+    if (!is_sparse_) {
+      std::vector<uint32_t> partitioned_data;
+      GetDenseDataPartitioned(reinterpret_cast<const uint32_t*>(host_data), &partitioned_data);
+      InitCUDAMemoryFromHostMemory(&cuda_data_uint32_t_, partitioned_data.data(), total_size, __FILE__, __LINE__);
+    } else {
+      if (row_ptr_bit_type_ == 16) {
+        InitSparseData(
+          reinterpret_cast<const uint32_t*>(host_data),
+          reinterpret_cast<const uint16_t*>(host_row_ptr),
+          &cuda_data_uint32_t_,
+          &cuda_row_ptr_uint16_t_,
+          &cuda_partition_ptr_uint16_t_);
+      } else if (row_ptr_bit_type_ == 32) {
+        InitSparseData(
+          reinterpret_cast<const uint32_t*>(host_data),
+          reinterpret_cast<const uint32_t*>(host_row_ptr),
+          &cuda_data_uint32_t_,
+          &cuda_row_ptr_uint32_t_,
+          &cuda_partition_ptr_uint32_t_);
+      } else if (row_ptr_bit_type_ == 64) {
+        InitSparseData(
+          reinterpret_cast<const uint32_t*>(host_data),
+          reinterpret_cast<const uint64_t*>(host_row_ptr),
+          &cuda_data_uint32_t_,
+          &cuda_row_ptr_uint64_t_,
+          &cuda_partition_ptr_uint64_t_);
+      } else {
+        Log::Fatal("Unknown data ptr bit type %d", row_ptr_bit_type_);
+      }
+    }
+  } else {
+    Log::Fatal("Unknown bit type = %d", bit_type_);
+  }
+  SynchronizeCUDADevice(__FILE__, __LINE__);
+}
+
+void CUDARowData::DivideCUDAFeatureGroups(const Dataset* train_data, TrainingShareStates* share_state) {
+  const uint32_t max_num_bin_per_partition = shared_hist_size_ / 2;
+  const std::vector<uint32_t>& column_hist_offsets = share_state->column_hist_offsets();
+  std::vector<int> feature_group_num_feature_offsets;
+  int offsets = 0;
+  int prev_group_index = -1;
+  for (int feature_index = 0; feature_index < num_feature_; ++feature_index) {
+    const int feature_group_index = train_data->Feature2Group(feature_index);
+    if (prev_group_index == -1 || feature_group_index != prev_group_index) {
+      feature_group_num_feature_offsets.emplace_back(offsets);
+      prev_group_index = feature_group_index;
+    }
+    ++offsets;
+  }
+  CHECK_EQ(offsets, num_feature_);
+  feature_group_num_feature_offsets.emplace_back(offsets);
+
+  uint32_t start_hist_offset = 0;
+  feature_partition_column_index_offsets_.clear();
+  column_hist_offsets_.clear();
+  partition_hist_offsets_.clear();
+  feature_partition_column_index_offsets_.emplace_back(0);
+  partition_hist_offsets_.emplace_back(0);
+  const int num_feature_groups = train_data->num_feature_groups();
+  int
 column_index = 0;
+  num_feature_partitions_ = 0;
+  large_bin_partitions_.clear();
+  small_bin_partitions_.clear();
+  for (int feature_group_index = 0; feature_group_index < num_feature_groups; ++feature_group_index) {
+    if (!train_data->IsMultiGroup(feature_group_index)) {
+      const uint32_t column_feature_hist_start = column_hist_offsets[column_index];
+      const uint32_t column_feature_hist_end = column_hist_offsets[column_index + 1];
+      const uint32_t num_bin_in_dense_group = column_feature_hist_end - column_feature_hist_start;
+
+      // if one column has too many bins, use a separate partition for that column
+      if (num_bin_in_dense_group > max_num_bin_per_partition) {
+        feature_partition_column_index_offsets_.emplace_back(column_index + 1);
+        start_hist_offset = column_feature_hist_end;
+        partition_hist_offsets_.emplace_back(start_hist_offset);
+        large_bin_partitions_.emplace_back(num_feature_partitions_);
+        ++num_feature_partitions_;
+        column_hist_offsets_.emplace_back(0);
+        ++column_index;
+        continue;
+      }
+
+      // check whether adding this column exceeds the maximum number of bins per partition
+      const uint32_t cur_hist_num_bin = column_feature_hist_end - start_hist_offset;
+      if (cur_hist_num_bin > max_num_bin_per_partition) {
+        feature_partition_column_index_offsets_.emplace_back(column_index);
+        start_hist_offset = column_feature_hist_start;
+        partition_hist_offsets_.emplace_back(start_hist_offset);
+        small_bin_partitions_.emplace_back(num_feature_partitions_);
+        ++num_feature_partitions_;
+      }
+      column_hist_offsets_.emplace_back(column_hist_offsets[column_index] - start_hist_offset);
+      if (feature_group_index == num_feature_groups - 1) {
+        feature_partition_column_index_offsets_.emplace_back(column_index + 1);
+        partition_hist_offsets_.emplace_back(column_hist_offsets.back());
+        small_bin_partitions_.emplace_back(num_feature_partitions_);
+        ++num_feature_partitions_;
+      }
+      ++column_index;
+    } else {
+      const int group_feature_index_start = feature_group_num_feature_offsets[feature_group_index];
+      const int num_feature_in_group = feature_group_num_feature_offsets[feature_group_index + 1] - group_feature_index_start;
+      for (int sub_feature_index = 0; sub_feature_index < num_feature_in_group; ++sub_feature_index) {
+        const int feature_index = group_feature_index_start + sub_feature_index;
+        const uint32_t column_feature_hist_start = column_hist_offsets[column_index];
+        const uint32_t column_feature_hist_end = column_hist_offsets[column_index + 1];
+        const uint32_t num_bin_in_dense_group = column_feature_hist_end - column_feature_hist_start;
+
+        // if one column has too many bins, use a separate partition for that column
+        if (num_bin_in_dense_group > max_num_bin_per_partition) {
+          feature_partition_column_index_offsets_.emplace_back(column_index + 1);
+          start_hist_offset = column_feature_hist_end;
+          partition_hist_offsets_.emplace_back(start_hist_offset);
+          large_bin_partitions_.emplace_back(num_feature_partitions_);
+          ++num_feature_partitions_;
+          column_hist_offsets_.emplace_back(0);
+          ++column_index;
+          continue;
+        }
+
+        // check whether adding this column exceeds the maximum number of bins per partition
+        const uint32_t cur_hist_num_bin = column_feature_hist_end - start_hist_offset;
+        if (cur_hist_num_bin > max_num_bin_per_partition) {
+          feature_partition_column_index_offsets_.emplace_back(column_index);
+          start_hist_offset = column_feature_hist_start;
+          partition_hist_offsets_.emplace_back(start_hist_offset);
+          small_bin_partitions_.emplace_back(num_feature_partitions_);
+          ++num_feature_partitions_;
+        }
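// [editor's note] Worked example of the greedy packing above, assuming
// shared_hist_size_ = 12 so max_num_bin_per_partition = 6, with per-column
// bin counts {9, 4, 4} (column_hist_offsets = {0, 9, 13, 17}):
//   - column 0: 9 > 6 bins, so it closes as its own large-bin partition {0};
//   - column 1: 13 - 9 = 4 <= 6, so it stays in the open partition;
//   - column 2: 17 - 9 = 8 > 6, so partition {1} is closed first and column 2
//     ends up in the final small-bin partition {2}.
// Each partition's histogram then fits in one thread block's shared memory
// (the divide-by-2 presumably accounts for paired gradient/hessian entries
// per bin).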
column_hist_offsets_.emplace_back(column_hist_offsets[column_index] - start_hist_offset); + if (feature_group_index == num_feature_groups - 1 && sub_feature_index == num_feature_in_group - 1) { + CHECK_EQ(feature_index, num_feature_ - 1); + feature_partition_column_index_offsets_.emplace_back(column_index + 1); + partition_hist_offsets_.emplace_back(column_hist_offsets.back()); + small_bin_partitions_.emplace_back(num_feature_partitions_); + ++num_feature_partitions_; + } + ++column_index; + } + } + } + column_hist_offsets_.emplace_back(column_hist_offsets.back() - start_hist_offset); + max_num_column_per_partition_ = 0; + for (size_t i = 0; i < feature_partition_column_index_offsets_.size() - 1; ++i) { + const int num_column = feature_partition_column_index_offsets_[i + 1] - feature_partition_column_index_offsets_[i]; + if (num_column > max_num_column_per_partition_) { + max_num_column_per_partition_ = num_column; + } + } + + InitCUDAMemoryFromHostMemory(&cuda_feature_partition_column_index_offsets_, + feature_partition_column_index_offsets_.data(), + feature_partition_column_index_offsets_.size(), + __FILE__, + __LINE__); + + InitCUDAMemoryFromHostMemory(&cuda_column_hist_offsets_, + column_hist_offsets_.data(), + column_hist_offsets_.size(), + __FILE__, + __LINE__); + + InitCUDAMemoryFromHostMemory(&cuda_partition_hist_offsets_, + partition_hist_offsets_.data(), + partition_hist_offsets_.size(), + __FILE__, + __LINE__); +} + +template +void CUDARowData::GetDenseDataPartitioned(const BIN_TYPE* row_wise_data, std::vector* partitioned_data) { + const int num_total_columns = feature_partition_column_index_offsets_.back(); + partitioned_data->resize(static_cast(num_total_columns) * static_cast(num_data_), 0); + BIN_TYPE* out_data = partitioned_data->data(); + Threading::For(0, num_data_, 512, + [this, num_total_columns, row_wise_data, out_data] (int /*thread_index*/, data_size_t start, data_size_t end) { + for (size_t i = 0; i < feature_partition_column_index_offsets_.size() - 1; ++i) { + const int num_prev_columns = static_cast(feature_partition_column_index_offsets_[i]); + const size_t offset = static_cast(num_data_) * static_cast(num_prev_columns); + const int partition_column_start = feature_partition_column_index_offsets_[i]; + const int partition_column_end = feature_partition_column_index_offsets_[i + 1]; + const int num_columns_in_cur_partition = partition_column_end - partition_column_start; + for (data_size_t data_index = start; data_index < end; ++data_index) { + const size_t data_offset = offset + data_index * num_columns_in_cur_partition; + const size_t read_data_offset = static_cast(data_index) * num_total_columns; + for (int column_index = 0; column_index < num_columns_in_cur_partition; ++column_index) { + const size_t true_column_index = read_data_offset + column_index + partition_column_start; + const BIN_TYPE bin = row_wise_data[true_column_index]; + out_data[data_offset + column_index] = bin; + } + } + } + }); +} + +template +void CUDARowData::GetSparseDataPartitioned( + const BIN_TYPE* row_wise_data, + const DATA_PTR_TYPE* row_ptr, + std::vector>* partitioned_data, + std::vector>* partitioned_row_ptr, + std::vector* partition_ptr) { + const int num_partitions = static_cast(feature_partition_column_index_offsets_.size()) - 1; + partitioned_data->resize(num_partitions); + partitioned_row_ptr->resize(num_partitions); + std::vector thread_max_elements_per_row(num_threads_, 0); + Threading::For(0, num_partitions, 1, + [partitioned_data, partitioned_row_ptr, row_ptr, 
row_wise_data, &thread_max_elements_per_row, this] (int thread_index, int start, int end) { + for (int partition_index = start; partition_index < end; ++partition_index) { + std::vector& data_for_this_partition = partitioned_data->at(partition_index); + std::vector& row_ptr_for_this_partition = partitioned_row_ptr->at(partition_index); + const int partition_hist_start = partition_hist_offsets_[partition_index]; + const int partition_hist_end = partition_hist_offsets_[partition_index + 1]; + DATA_PTR_TYPE offset = 0; + row_ptr_for_this_partition.clear(); + data_for_this_partition.clear(); + row_ptr_for_this_partition.emplace_back(offset); + for (data_size_t data_index = 0; data_index < num_data_; ++data_index) { + const DATA_PTR_TYPE row_start = row_ptr[data_index]; + const DATA_PTR_TYPE row_end = row_ptr[data_index + 1]; + const BIN_TYPE* row_data_start = row_wise_data + row_start; + const BIN_TYPE* row_data_end = row_wise_data + row_end; + const size_t partition_start_in_row = std::lower_bound(row_data_start, row_data_end, partition_hist_start) - row_data_start; + const size_t partition_end_in_row = std::lower_bound(row_data_start, row_data_end, partition_hist_end) - row_data_start; + for (size_t pos = partition_start_in_row; pos < partition_end_in_row; ++pos) { + const BIN_TYPE bin = row_data_start[pos]; + CHECK_GE(bin, static_cast(partition_hist_start)); + data_for_this_partition.emplace_back(bin - partition_hist_start); + } + CHECK_GE(partition_end_in_row, partition_start_in_row); + const data_size_t num_elements_in_row = partition_end_in_row - partition_start_in_row; + offset += static_cast(num_elements_in_row); + row_ptr_for_this_partition.emplace_back(offset); + if (num_elements_in_row > thread_max_elements_per_row[thread_index]) { + thread_max_elements_per_row[thread_index] = num_elements_in_row; + } + } + } + }); + partition_ptr->clear(); + DATA_PTR_TYPE offset = 0; + partition_ptr->emplace_back(offset); + for (size_t i = 0; i < partitioned_row_ptr->size(); ++i) { + offset += partitioned_row_ptr->at(i).back(); + partition_ptr->emplace_back(offset); + } + max_num_column_per_partition_ = 0; + for (int thread_index = 0; thread_index < num_threads_; ++thread_index) { + if (thread_max_elements_per_row[thread_index] > max_num_column_per_partition_) { + max_num_column_per_partition_ = thread_max_elements_per_row[thread_index]; + } + } +} + +template +void CUDARowData::InitSparseData(const BIN_TYPE* host_data, + const ROW_PTR_TYPE* host_row_ptr, + BIN_TYPE** cuda_data, + ROW_PTR_TYPE** cuda_row_ptr, + ROW_PTR_TYPE** cuda_partition_ptr) { + std::vector> partitioned_data; + std::vector> partitioned_data_ptr; + std::vector partition_ptr; + GetSparseDataPartitioned(host_data, host_row_ptr, &partitioned_data, &partitioned_data_ptr, &partition_ptr); + InitCUDAMemoryFromHostMemory(cuda_partition_ptr, partition_ptr.data(), partition_ptr.size(), __FILE__, __LINE__); + AllocateCUDAMemory(cuda_data, partition_ptr.back(), __FILE__, __LINE__); + AllocateCUDAMemory(cuda_row_ptr, (num_data_ + 1) * partitioned_data_ptr.size(), __FILE__, __LINE__); + for (size_t i = 0; i < partitioned_data.size(); ++i) { + const std::vector& data_ptr_for_this_partition = partitioned_data_ptr[i]; + const std::vector& data_for_this_partition = partitioned_data[i]; + CopyFromHostToCUDADevice((*cuda_data) + partition_ptr[i], data_for_this_partition.data(), data_for_this_partition.size(), __FILE__, __LINE__); + CopyFromHostToCUDADevice((*cuda_row_ptr) + i * (num_data_ + 1), data_ptr_for_this_partition.data(), 
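// [editor's note] GetSparseDataPartitioned() above exploits the fact that bin
// values within each CSR row are stored in ascending order: the slice of a
// row falling in [partition_hist_start, partition_hist_end) is located with
// two std::lower_bound binary searches. E.g. for row bins {2, 7, 11, 30} and
// a partition covering bins [5, 20), lower_bound yields positions 1 and 3,
// so bins 7 and 11 (re-based to 2 and 6) go to this partition.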
data_ptr_for_this_partition.size(), __FILE__, __LINE__); + } +} + +template +const BIN_TYPE* CUDARowData::GetBin() const { + if (bit_type_ == 8) { + return reinterpret_cast(cuda_data_uint8_t_); + } else if (bit_type_ == 16) { + return reinterpret_cast(cuda_data_uint16_t_); + } else if (bit_type_ == 32) { + return reinterpret_cast(cuda_data_uint32_t_); + } else { + Log::Fatal("Unknown bit_type %d for GetBin.", bit_type_); + } +} + +template const uint8_t* CUDARowData::GetBin() const; + +template const uint16_t* CUDARowData::GetBin() const; + +template const uint32_t* CUDARowData::GetBin() const; + +template +const PTR_TYPE* CUDARowData::GetRowPtr() const { + if (row_ptr_bit_type_ == 16) { + return reinterpret_cast(cuda_row_ptr_uint16_t_); + } else if (row_ptr_bit_type_ == 32) { + return reinterpret_cast(cuda_row_ptr_uint32_t_); + } else if (row_ptr_bit_type_ == 64) { + return reinterpret_cast(cuda_row_ptr_uint64_t_); + } else { + Log::Fatal("Unknown row_ptr_bit_type = %d for GetRowPtr.", row_ptr_bit_type_); + } +} + +template const uint16_t* CUDARowData::GetRowPtr() const; + +template const uint32_t* CUDARowData::GetRowPtr() const; + +template const uint64_t* CUDARowData::GetRowPtr() const; + +template +const PTR_TYPE* CUDARowData::GetPartitionPtr() const { + if (row_ptr_bit_type_ == 16) { + return reinterpret_cast(cuda_partition_ptr_uint16_t_); + } else if (row_ptr_bit_type_ == 32) { + return reinterpret_cast(cuda_partition_ptr_uint32_t_); + } else if (row_ptr_bit_type_ == 64) { + return reinterpret_cast(cuda_partition_ptr_uint64_t_); + } else { + Log::Fatal("Unknown row_ptr_bit_type = %d for GetPartitionPtr.", row_ptr_bit_type_); + } +} + +template const uint16_t* CUDARowData::GetPartitionPtr() const; + +template const uint32_t* CUDARowData::GetPartitionPtr() const; + +template const uint64_t* CUDARowData::GetPartitionPtr() const; + +} // namespace LightGBM + +#endif // USE_CUDA_EXP diff --git a/src/io/cuda/cuda_tree.cpp b/src/io/cuda/cuda_tree.cpp new file mode 100644 index 000000000000..975edb44c9d5 --- /dev/null +++ b/src/io/cuda/cuda_tree.cpp @@ -0,0 +1,310 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. 
+ */ + +#ifdef USE_CUDA_EXP + +#include + +namespace LightGBM { + +CUDATree::CUDATree(int max_leaves, bool track_branch_features, bool is_linear, + const int gpu_device_id, const bool has_categorical_feature): +Tree(max_leaves, track_branch_features, is_linear), +num_threads_per_block_add_prediction_to_score_(1024) { + is_cuda_tree_ = true; + if (gpu_device_id >= 0) { + SetCUDADevice(gpu_device_id, __FILE__, __LINE__); + } else { + SetCUDADevice(0, __FILE__, __LINE__); + } + if (has_categorical_feature) { + cuda_cat_boundaries_.Resize(max_leaves); + cuda_cat_boundaries_inner_.Resize(max_leaves); + } + InitCUDAMemory(); +} + +CUDATree::CUDATree(const Tree* host_tree): + Tree(*host_tree), + num_threads_per_block_add_prediction_to_score_(1024) { + is_cuda_tree_ = true; + InitCUDA(); +} + +CUDATree::~CUDATree() { + DeallocateCUDAMemory(&cuda_left_child_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_right_child_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_split_feature_inner_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_split_feature_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_leaf_depth_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_leaf_parent_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_threshold_in_bin_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_threshold_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_internal_weight_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_internal_value_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_decision_type_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_leaf_value_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_leaf_count_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_leaf_weight_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_internal_count_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_split_gain_, __FILE__, __LINE__); + gpuAssert(cudaStreamDestroy(cuda_stream_), __FILE__, __LINE__); +} + +void CUDATree::InitCUDAMemory() { + AllocateCUDAMemory(&cuda_left_child_, + static_cast(max_leaves_), + __FILE__, + __LINE__); + AllocateCUDAMemory(&cuda_right_child_, + static_cast(max_leaves_), + __FILE__, + __LINE__); + AllocateCUDAMemory(&cuda_split_feature_inner_, + static_cast(max_leaves_), + __FILE__, + __LINE__); + AllocateCUDAMemory(&cuda_split_feature_, + static_cast(max_leaves_), + __FILE__, + __LINE__); + AllocateCUDAMemory(&cuda_leaf_depth_, + static_cast(max_leaves_), + __FILE__, + __LINE__); + AllocateCUDAMemory(&cuda_leaf_parent_, + static_cast(max_leaves_), + __FILE__, + __LINE__); + AllocateCUDAMemory(&cuda_threshold_in_bin_, + static_cast(max_leaves_), + __FILE__, + __LINE__); + AllocateCUDAMemory(&cuda_threshold_, + static_cast(max_leaves_), + __FILE__, + __LINE__); + AllocateCUDAMemory(&cuda_decision_type_, + static_cast(max_leaves_), + __FILE__, + __LINE__); + AllocateCUDAMemory(&cuda_leaf_value_, + static_cast(max_leaves_), + __FILE__, + __LINE__); + AllocateCUDAMemory(&cuda_internal_weight_, + static_cast(max_leaves_), + __FILE__, + __LINE__); + AllocateCUDAMemory(&cuda_internal_value_, + static_cast(max_leaves_), + __FILE__, + __LINE__); + AllocateCUDAMemory(&cuda_leaf_weight_, + static_cast(max_leaves_), + __FILE__, + __LINE__); + AllocateCUDAMemory(&cuda_leaf_count_, + static_cast(max_leaves_), + __FILE__, + __LINE__); + AllocateCUDAMemory(&cuda_internal_count_, + static_cast(max_leaves_), + __FILE__, + __LINE__); + AllocateCUDAMemory(&cuda_split_gain_, + static_cast(max_leaves_), + __FILE__, + __LINE__); + SetCUDAMemory(cuda_leaf_value_, 0.0f, 1, __FILE__, 
__LINE__); + SetCUDAMemory(cuda_leaf_weight_, 0.0f, 1, __FILE__, __LINE__); + SetCUDAMemory(cuda_leaf_parent_, -1, 1, __FILE__, __LINE__); + CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_stream_)); + SynchronizeCUDADevice(__FILE__, __LINE__); +} + +void CUDATree::InitCUDA() { + InitCUDAMemoryFromHostMemory(&cuda_left_child_, + left_child_.data(), + left_child_.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_right_child_, + right_child_.data(), + right_child_.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_split_feature_inner_, + split_feature_inner_.data(), + split_feature_inner_.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_split_feature_, + split_feature_.data(), + split_feature_.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_threshold_in_bin_, + threshold_in_bin_.data(), + threshold_in_bin_.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_threshold_, + threshold_.data(), + threshold_.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_leaf_depth_, + leaf_depth_.data(), + leaf_depth_.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_decision_type_, + decision_type_.data(), + decision_type_.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_internal_weight_, + internal_weight_.data(), + internal_weight_.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_internal_value_, + internal_value_.data(), + internal_value_.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_internal_count_, + internal_count_.data(), + internal_count_.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_leaf_count_, + leaf_count_.data(), + leaf_count_.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_split_gain_, + split_gain_.data(), + split_gain_.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_leaf_value_, + leaf_value_.data(), + leaf_value_.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_leaf_weight_, + leaf_weight_.data(), + leaf_weight_.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_leaf_parent_, + leaf_parent_.data(), + leaf_parent_.size(), + __FILE__, + __LINE__); + CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_stream_)); + SynchronizeCUDADevice(__FILE__, __LINE__); +} + +int CUDATree::Split(const int leaf_index, + const int real_feature_index, + const double real_threshold, + const MissingType missing_type, + const CUDASplitInfo* cuda_split_info) { + LaunchSplitKernel(leaf_index, real_feature_index, real_threshold, missing_type, cuda_split_info); + ++num_leaves_; + return num_leaves_ - 1; +} + +int CUDATree::SplitCategorical(const int leaf_index, + const int real_feature_index, + const MissingType missing_type, + const CUDASplitInfo* cuda_split_info, + uint32_t* cuda_bitset, + size_t cuda_bitset_len, + uint32_t* cuda_bitset_inner, + size_t cuda_bitset_inner_len) { + LaunchSplitCategoricalKernel(leaf_index, real_feature_index, + missing_type, cuda_split_info, + cuda_bitset_len, cuda_bitset_inner_len); + cuda_bitset_.PushBack(cuda_bitset, cuda_bitset_len); + cuda_bitset_inner_.PushBack(cuda_bitset_inner, cuda_bitset_inner_len); + ++num_leaves_; + ++num_cat_; + return num_leaves_ - 1; +} + +inline void CUDATree::Shrinkage(double rate) { + Tree::Shrinkage(rate); + LaunchShrinkageKernel(rate); +} + +inline void CUDATree::AddBias(double val) { + Tree::AddBias(val); + LaunchAddBiasKernel(val); +} + +void 
CUDATree::ToHost() { + left_child_.resize(max_leaves_ - 1); + right_child_.resize(max_leaves_ - 1); + split_feature_inner_.resize(max_leaves_ - 1); + split_feature_.resize(max_leaves_ - 1); + threshold_in_bin_.resize(max_leaves_ - 1); + threshold_.resize(max_leaves_ - 1); + decision_type_.resize(max_leaves_ - 1, 0); + split_gain_.resize(max_leaves_ - 1); + leaf_parent_.resize(max_leaves_); + leaf_value_.resize(max_leaves_); + leaf_weight_.resize(max_leaves_); + leaf_count_.resize(max_leaves_); + internal_value_.resize(max_leaves_ - 1); + internal_weight_.resize(max_leaves_ - 1); + internal_count_.resize(max_leaves_ - 1); + leaf_depth_.resize(max_leaves_); + + const size_t num_leaves_size = static_cast(num_leaves_); + CopyFromCUDADeviceToHost(left_child_.data(), cuda_left_child_, num_leaves_size - 1, __FILE__, __LINE__); + CopyFromCUDADeviceToHost(right_child_.data(), cuda_right_child_, num_leaves_size - 1, __FILE__, __LINE__); + CopyFromCUDADeviceToHost(split_feature_inner_.data(), cuda_split_feature_inner_, num_leaves_size - 1, __FILE__, __LINE__); + CopyFromCUDADeviceToHost(split_feature_.data(), cuda_split_feature_, num_leaves_size - 1, __FILE__, __LINE__); + CopyFromCUDADeviceToHost(threshold_in_bin_.data(), cuda_threshold_in_bin_, num_leaves_size - 1, __FILE__, __LINE__); + CopyFromCUDADeviceToHost(threshold_.data(), cuda_threshold_, num_leaves_size - 1, __FILE__, __LINE__); + CopyFromCUDADeviceToHost(decision_type_.data(), cuda_decision_type_, num_leaves_size - 1, __FILE__, __LINE__); + CopyFromCUDADeviceToHost(split_gain_.data(), cuda_split_gain_, num_leaves_size - 1, __FILE__, __LINE__); + CopyFromCUDADeviceToHost(leaf_parent_.data(), cuda_leaf_parent_, num_leaves_size - 1, __FILE__, __LINE__); + CopyFromCUDADeviceToHost(leaf_value_.data(), cuda_leaf_value_, num_leaves_size, __FILE__, __LINE__); + CopyFromCUDADeviceToHost(leaf_weight_.data(), cuda_leaf_weight_, num_leaves_size, __FILE__, __LINE__); + CopyFromCUDADeviceToHost(leaf_count_.data(), cuda_leaf_count_, num_leaves_size, __FILE__, __LINE__); + CopyFromCUDADeviceToHost(internal_value_.data(), cuda_internal_value_, num_leaves_size - 1, __FILE__, __LINE__); + CopyFromCUDADeviceToHost(internal_weight_.data(), cuda_internal_weight_, num_leaves_size - 1, __FILE__, __LINE__); + CopyFromCUDADeviceToHost(internal_count_.data(), cuda_internal_count_, num_leaves_size - 1, __FILE__, __LINE__); + CopyFromCUDADeviceToHost(leaf_depth_.data(), cuda_leaf_depth_, num_leaves_size, __FILE__, __LINE__); + + if (num_cat_ > 0) { + cuda_cat_boundaries_inner_.Resize(num_cat_ + 1); + cuda_cat_boundaries_.Resize(num_cat_ + 1); + cat_boundaries_ = cuda_cat_boundaries_.ToHost(); + cat_boundaries_inner_ = cuda_cat_boundaries_inner_.ToHost(); + cat_threshold_ = cuda_bitset_.ToHost(); + cat_threshold_inner_ = cuda_bitset_inner_.ToHost(); + } + + SynchronizeCUDADevice(__FILE__, __LINE__); +} + +void CUDATree::SyncLeafOutputFromHostToCUDA() { + CopyFromHostToCUDADevice(cuda_leaf_value_, leaf_value_.data(), leaf_value_.size(), __FILE__, __LINE__); +} + +void CUDATree::SyncLeafOutputFromCUDAToHost() { + CopyFromCUDADeviceToHost(leaf_value_.data(), cuda_leaf_value_, leaf_value_.size(), __FILE__, __LINE__); +} + + +} // namespace LightGBM + +#endif // USE_CUDA_EXP diff --git a/src/io/cuda/cuda_tree.cu b/src/io/cuda/cuda_tree.cu new file mode 100644 index 000000000000..d51308ae9942 --- /dev/null +++ b/src/io/cuda/cuda_tree.cu @@ -0,0 +1,308 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. 
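// [editor's note] The resize/copy sizes in ToHost() above follow from binary
// tree bookkeeping: a tree with num_leaves_ leaves has exactly num_leaves_ - 1
// internal nodes, so per-leaf arrays (leaf_value_, leaf_count_, ...) copy
// num_leaves_ entries while per-split arrays (threshold_, left_child_, ...)
// copy num_leaves_ - 1; e.g. num_leaves_ = 4 means 3 splits were performed.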
See LICENSE file in the project root for license information. + */ + + +#ifdef USE_CUDA_EXP + +#include + +namespace LightGBM { + +__device__ void SetDecisionTypeCUDA(int8_t* decision_type, bool input, int8_t mask) { + if (input) { + (*decision_type) |= mask; + } else { + (*decision_type) &= (127 - mask); + } +} + +__device__ void SetMissingTypeCUDA(int8_t* decision_type, int8_t input) { + (*decision_type) &= 3; + (*decision_type) |= (input << 2); +} + +__device__ bool GetDecisionTypeCUDA(int8_t decision_type, int8_t mask) { + return (decision_type & mask) > 0; +} + +__device__ int8_t GetMissingTypeCUDA(int8_t decision_type) { + return (decision_type >> 2) & 3; +} + +__device__ bool IsZeroCUDA(double fval) { + return (fval >= -kZeroThreshold && fval <= kZeroThreshold); +} + +__global__ void SplitKernel( // split information + const int leaf_index, + const int real_feature_index, + const double real_threshold, + const MissingType missing_type, + const CUDASplitInfo* cuda_split_info, + // tree structure + const int num_leaves, + int* leaf_parent, + int* leaf_depth, + int* left_child, + int* right_child, + int* split_feature_inner, + int* split_feature, + float* split_gain, + double* internal_weight, + double* internal_value, + data_size_t* internal_count, + double* leaf_weight, + double* leaf_value, + data_size_t* leaf_count, + int8_t* decision_type, + uint32_t* threshold_in_bin, + double* threshold) { + const int new_node_index = num_leaves - 1; + const int thread_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + const int parent_index = leaf_parent[leaf_index]; + if (thread_index == 0) { + if (parent_index >= 0) { + // if cur node is left child + if (left_child[parent_index] == ~leaf_index) { + left_child[parent_index] = new_node_index; + } else { + right_child[parent_index] = new_node_index; + } + } + left_child[new_node_index] = ~leaf_index; + right_child[new_node_index] = ~num_leaves; + leaf_parent[leaf_index] = new_node_index; + leaf_parent[num_leaves] = new_node_index; + } else if (thread_index == 1) { + // add new node + split_feature_inner[new_node_index] = cuda_split_info->inner_feature_index; + } else if (thread_index == 2) { + split_feature[new_node_index] = real_feature_index; + } else if (thread_index == 3) { + split_gain[new_node_index] = static_cast(cuda_split_info->gain); + } else if (thread_index == 4) { + // save current leaf value to internal node before change + internal_weight[new_node_index] = leaf_weight[leaf_index]; + leaf_weight[leaf_index] = cuda_split_info->left_sum_hessians; + } else if (thread_index == 5) { + internal_value[new_node_index] = leaf_value[leaf_index]; + leaf_value[leaf_index] = isnan(cuda_split_info->left_value) ? 0.0f : cuda_split_info->left_value; + } else if (thread_index == 6) { + internal_count[new_node_index] = cuda_split_info->left_count + cuda_split_info->right_count; + } else if (thread_index == 7) { + leaf_count[leaf_index] = cuda_split_info->left_count; + } else if (thread_index == 8) { + leaf_value[num_leaves] = isnan(cuda_split_info->right_value) ? 
0.0f : cuda_split_info->right_value; + } else if (thread_index == 9) { + leaf_weight[num_leaves] = cuda_split_info->right_sum_hessians; + } else if (thread_index == 10) { + leaf_count[num_leaves] = cuda_split_info->right_count; + } else if (thread_index == 11) { + // update leaf depth + leaf_depth[num_leaves] = leaf_depth[leaf_index] + 1; + leaf_depth[leaf_index]++; + } else if (thread_index == 12) { + decision_type[new_node_index] = 0; + SetDecisionTypeCUDA(&decision_type[new_node_index], false, kCategoricalMask); + SetDecisionTypeCUDA(&decision_type[new_node_index], cuda_split_info->default_left, kDefaultLeftMask); + SetMissingTypeCUDA(&decision_type[new_node_index], static_cast(missing_type)); + } else if (thread_index == 13) { + threshold_in_bin[new_node_index] = cuda_split_info->threshold; + } else if (thread_index == 14) { + threshold[new_node_index] = real_threshold; + } +} + +void CUDATree::LaunchSplitKernel(const int leaf_index, + const int real_feature_index, + const double real_threshold, + const MissingType missing_type, + const CUDASplitInfo* cuda_split_info) { + SplitKernel<<<3, 5, 0, cuda_stream_>>>( + // split information + leaf_index, + real_feature_index, + real_threshold, + missing_type, + cuda_split_info, + // tree structure + num_leaves_, + cuda_leaf_parent_, + cuda_leaf_depth_, + cuda_left_child_, + cuda_right_child_, + cuda_split_feature_inner_, + cuda_split_feature_, + cuda_split_gain_, + cuda_internal_weight_, + cuda_internal_value_, + cuda_internal_count_, + cuda_leaf_weight_, + cuda_leaf_value_, + cuda_leaf_count_, + cuda_decision_type_, + cuda_threshold_in_bin_, + cuda_threshold_); +} + +__global__ void SplitCategoricalKernel( // split information + const int leaf_index, + const int real_feature_index, + const MissingType missing_type, + const CUDASplitInfo* cuda_split_info, + // tree structure + const int num_leaves, + int* leaf_parent, + int* leaf_depth, + int* left_child, + int* right_child, + int* split_feature_inner, + int* split_feature, + float* split_gain, + double* internal_weight, + double* internal_value, + data_size_t* internal_count, + double* leaf_weight, + double* leaf_value, + data_size_t* leaf_count, + int8_t* decision_type, + uint32_t* threshold_in_bin, + double* threshold, + size_t cuda_bitset_len, + size_t cuda_bitset_inner_len, + int num_cat, + int* cuda_cat_boundaries, + int* cuda_cat_boundaries_inner) { + const int new_node_index = num_leaves - 1; + const int thread_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + const int parent_index = leaf_parent[leaf_index]; + if (thread_index == 0) { + if (parent_index >= 0) { + // if cur node is left child + if (left_child[parent_index] == ~leaf_index) { + left_child[parent_index] = new_node_index; + } else { + right_child[parent_index] = new_node_index; + } + } + left_child[new_node_index] = ~leaf_index; + right_child[new_node_index] = ~num_leaves; + leaf_parent[leaf_index] = new_node_index; + leaf_parent[num_leaves] = new_node_index; + } else if (thread_index == 1) { + // add new node + split_feature_inner[new_node_index] = cuda_split_info->inner_feature_index; + } else if (thread_index == 2) { + split_feature[new_node_index] = real_feature_index; + } else if (thread_index == 3) { + split_gain[new_node_index] = static_cast(cuda_split_info->gain); + } else if (thread_index == 4) { + // save current leaf value to internal node before change + internal_weight[new_node_index] = leaf_weight[leaf_index]; + leaf_weight[leaf_index] = cuda_split_info->left_sum_hessians; + } else if 
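// [editor's note] SplitKernel above parallelizes over fields, not data: each
// thread_index branch (0..14) performs one small update to the tree
// structure, and the <<<3, 5, 0, cuda_stream_>>> launch supplies exactly
// 3 * 5 = 15 threads. The categorical variant has branches 0..16 and is
// launched with <<<3, 6, ...>>> = 18 threads (one to spare). Keeping these
// updates on-device avoids a host round-trip per split.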
(thread_index == 5) { + internal_value[new_node_index] = leaf_value[leaf_index]; + leaf_value[leaf_index] = isnan(cuda_split_info->left_value) ? 0.0f : cuda_split_info->left_value; + } else if (thread_index == 6) { + internal_count[new_node_index] = cuda_split_info->left_count + cuda_split_info->right_count; + } else if (thread_index == 7) { + leaf_count[leaf_index] = cuda_split_info->left_count; + } else if (thread_index == 8) { + leaf_value[num_leaves] = isnan(cuda_split_info->right_value) ? 0.0f : cuda_split_info->right_value; + } else if (thread_index == 9) { + leaf_weight[num_leaves] = cuda_split_info->right_sum_hessians; + } else if (thread_index == 10) { + leaf_count[num_leaves] = cuda_split_info->right_count; + } else if (thread_index == 11) { + // update leaf depth + leaf_depth[num_leaves] = leaf_depth[leaf_index] + 1; + leaf_depth[leaf_index]++; + } else if (thread_index == 12) { + decision_type[new_node_index] = 0; + SetDecisionTypeCUDA(&decision_type[new_node_index], true, kCategoricalMask); + SetMissingTypeCUDA(&decision_type[new_node_index], static_cast(missing_type)); + } else if (thread_index == 13) { + threshold_in_bin[new_node_index] = num_cat; + } else if (thread_index == 14) { + threshold[new_node_index] = num_cat; + } else if (thread_index == 15) { + if (num_cat == 0) { + cuda_cat_boundaries[num_cat] = 0; + } + cuda_cat_boundaries[num_cat + 1] = cuda_cat_boundaries[num_cat] + cuda_bitset_len; + } else if (thread_index == 16) { + if (num_cat == 0) { + cuda_cat_boundaries_inner[num_cat] = 0; + } + cuda_cat_boundaries_inner[num_cat + 1] = cuda_cat_boundaries_inner[num_cat] + cuda_bitset_inner_len; + } +} + +void CUDATree::LaunchSplitCategoricalKernel(const int leaf_index, + const int real_feature_index, + const MissingType missing_type, + const CUDASplitInfo* cuda_split_info, + size_t cuda_bitset_len, + size_t cuda_bitset_inner_len) { + SplitCategoricalKernel<<<3, 6, 0, cuda_stream_>>>( + // split information + leaf_index, + real_feature_index, + missing_type, + cuda_split_info, + // tree structure + num_leaves_, + cuda_leaf_parent_, + cuda_leaf_depth_, + cuda_left_child_, + cuda_right_child_, + cuda_split_feature_inner_, + cuda_split_feature_, + cuda_split_gain_, + cuda_internal_weight_, + cuda_internal_value_, + cuda_internal_count_, + cuda_leaf_weight_, + cuda_leaf_value_, + cuda_leaf_count_, + cuda_decision_type_, + cuda_threshold_in_bin_, + cuda_threshold_, + cuda_bitset_len, + cuda_bitset_inner_len, + num_cat_, + cuda_cat_boundaries_.RawData(), + cuda_cat_boundaries_inner_.RawData()); +} + +__global__ void ShrinkageKernel(const double rate, double* cuda_leaf_value, const int num_leaves) { + const int leaf_index = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + if (leaf_index < num_leaves) { + cuda_leaf_value[leaf_index] *= rate; + } +} + +void CUDATree::LaunchShrinkageKernel(const double rate) { + const int num_threads_per_block = 1024; + const int num_blocks = (num_leaves_ + num_threads_per_block - 1) / num_threads_per_block; + ShrinkageKernel<<>>(rate, cuda_leaf_value_, num_leaves_); +} + +__global__ void AddBiasKernel(const double val, double* cuda_leaf_value, const int num_leaves) { + const int leaf_index = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + if (leaf_index < num_leaves) { + cuda_leaf_value[leaf_index] += val; + } +} + +void CUDATree::LaunchAddBiasKernel(const double val) { + const int num_threads_per_block = 1024; + const int num_blocks = (num_leaves_ + num_threads_per_block - 1) / num_threads_per_block; + AddBiasKernel<<>>(val, 
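For categorical splits the stored threshold is not a bin value: threads 13 and 14 write num_cat, an index into the category-bitset arrays, and threads 15 and 16 extend cuda_cat_boundaries(_inner), a CSR-style offsets array in which boundaries[k + 1] - boundaries[k] is the bitset length of the k-th categorical split. A host-side sketch of the append step, with standalone names rather than the actual member layout:

#include <cstddef>
#include <vector>

void AppendCategoricalSplit(std::vector<int>* boundaries, int num_cat, size_t bitset_len) {
  if (num_cat == 0) {
    boundaries->push_back(0);  // offset of the very first bitset
  }
  boundaries->push_back(boundaries->back() + static_cast<int>(bitset_len));
}

int main() {
  std::vector<int> boundaries;
  AppendCategoricalSplit(&boundaries, 0, 4);  // boundaries: 0, 4
  AppendCategoricalSplit(&boundaries, 1, 2);  // boundaries: 0, 4, 6
  return boundaries.back() == 6 ? 0 : 1;
}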
cuda_leaf_value_, num_leaves_); +} + +} // namespace LightGBM + +#endif // USE_CUDA_EXP diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index e5cabe682caa..a407f8af3d80 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -333,23 +333,25 @@ void Dataset::Construct(std::vector>* bin_mappers, } if (used_features.empty()) { Log::Warning( - "There are no meaningful features, as all feature values are " - "constant."); + "There are no meaningful features which satisfy the provided configuration. " + "Decreasing Dataset parameters min_data_in_bin or min_data_in_leaf and re-constructing " + "Dataset might resolve this warning."); } auto features_in_group = NoGroup(used_features); auto is_sparse = io_config.is_enable_sparse; - if (io_config.device_type == std::string("cuda")) { + if (io_config.device_type == std::string("cuda") || io_config.device_type == std::string("cuda_exp")) { LGBM_config_::current_device = lgbm_device_cuda; - if (is_sparse) { + if ((io_config.device_type == std::string("cuda") || io_config.device_type == std::string("cuda_exp")) && is_sparse) { Log::Warning("Using sparse features with CUDA is currently not supported."); + is_sparse = false; } - is_sparse = false; } std::vector group_is_multi_val(used_features.size(), 0); if (io_config.enable_bundle && !used_features.empty()) { - bool lgbm_is_gpu_used = io_config.device_type == std::string("gpu") || io_config.device_type == std::string("cuda"); + bool lgbm_is_gpu_used = io_config.device_type == std::string("gpu") || io_config.device_type == std::string("cuda") + || io_config.device_type == std::string("cuda_exp"); features_in_group = FastFeatureBundling( *bin_mappers, sample_non_zero_indices, sample_values, num_per_col, num_sample_col, static_cast(total_sample_cnt), @@ -425,6 +427,8 @@ void Dataset::Construct(std::vector>* bin_mappers, ++num_numeric_features_; } } + device_type_ = io_config.device_type; + gpu_device_id_ = io_config.gpu_device_id; } void Dataset::FinishLoad() { @@ -436,6 +440,14 @@ void Dataset::FinishLoad() { feature_groups_[i]->FinishLoad(); } } + #ifdef USE_CUDA_EXP + if (device_type_ == std::string("cuda_exp")) { + CreateCUDAColumnData(); + metadata_.CreateCUDAMetadata(gpu_device_id_); + } else { + cuda_column_data_.reset(nullptr); + } + #endif // USE_CUDA_EXP is_finish_load_ = true; } @@ -767,6 +779,8 @@ void Dataset::CreateValid(const Dataset* dataset) { label_idx_ = dataset->label_idx_; real_feature_idx_ = dataset->real_feature_idx_; forced_bin_bounds_ = dataset->forced_bin_bounds_; + device_type_ = dataset->device_type_; + gpu_device_id_ = dataset->gpu_device_id_; } void Dataset::ReSize(data_size_t num_data) { @@ -832,6 +846,19 @@ void Dataset::CopySubrow(const Dataset* fullset, } } } + // update CUDA storage for column data and metadata + device_type_ = fullset->device_type_; + gpu_device_id_ = fullset->gpu_device_id_; + + #ifdef USE_CUDA_EXP + if (device_type_ == std::string("cuda_exp")) { + if (cuda_column_data_ == nullptr) { + cuda_column_data_.reset(new CUDAColumnData(fullset->num_data(), gpu_device_id_)); + metadata_.CreateCUDAMetadata(gpu_device_id_); + } + cuda_column_data_->CopySubrow(fullset->cuda_column_data(), used_indices, num_used_indices); + } + #endif // USE_CUDA_EXP } bool Dataset::SetFloatField(const char* field_name, const float* field_data, @@ -1469,6 +1496,169 @@ void Dataset::AddFeaturesFrom(Dataset* other) { raw_data_.push_back(other->raw_data_[i]); } } + #ifdef USE_CUDA_EXP + if (device_type_ == std::string("cuda_exp")) { + CreateCUDAColumnData(); + } else { + 
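Both element-wise kernels above (shrinkage and bias) size their grids with the usual ceiling-division idiom, so every leaf gets exactly one thread and the in-kernel bound check discards the remainder of the last block. A minimal sketch of the computation:

#include <cassert>

constexpr int kThreadsPerBlock = 1024;

// ceil(num_leaves / kThreadsPerBlock) without floating point
int NumBlocks(int num_leaves) {
  return (num_leaves + kThreadsPerBlock - 1) / kThreadsPerBlock;
}

int main() {
  assert(NumBlocks(1) == 1);
  assert(NumBlocks(1024) == 1);
  assert(NumBlocks(1025) == 2);
  return 0;
}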
cuda_column_data_ = nullptr; + } + #endif // USE_CUDA_EXP } +const void* Dataset::GetColWiseData( + const int feature_group_index, + const int sub_feature_index, + uint8_t* bit_type, + bool* is_sparse, + std::vector* bin_iterator, + const int num_threads) const { + return feature_groups_[feature_group_index]->GetColWiseData(sub_feature_index, bit_type, is_sparse, bin_iterator, num_threads); +} + +const void* Dataset::GetColWiseData( + const int feature_group_index, + const int sub_feature_index, + uint8_t* bit_type, + bool* is_sparse, + BinIterator** bin_iterator) const { + return feature_groups_[feature_group_index]->GetColWiseData(sub_feature_index, bit_type, is_sparse, bin_iterator); +} + +#ifdef USE_CUDA_EXP +void Dataset::CreateCUDAColumnData() { + cuda_column_data_.reset(new CUDAColumnData(num_data_, gpu_device_id_)); + int num_columns = 0; + std::vector column_data; + std::vector column_bin_iterator; + std::vector column_bit_type; + int feature_index = 0; + std::vector feature_to_column(num_features_, -1); + std::vector feature_max_bins(num_features_, 0); + std::vector feature_min_bins(num_features_, 0); + std::vector feature_offsets(num_features_, 0); + std::vector feature_most_freq_bins(num_features_, 0); + std::vector feature_default_bin(num_features_, 0); + std::vector feature_missing_is_zero(num_features_, 0); + std::vector feature_missing_is_na(num_features_, 0); + std::vector feature_mfb_is_zero(num_features_, 0); + std::vector feature_mfb_is_na(num_features_, 0); + for (int feature_group_index = 0; feature_group_index < num_groups_; ++feature_group_index) { + if (feature_groups_[feature_group_index]->is_multi_val_) { + for (int sub_feature_index = 0; sub_feature_index < feature_groups_[feature_group_index]->num_feature_; ++sub_feature_index) { + uint8_t bit_type = 0; + bool is_sparse = false; + BinIterator* bin_iterator = nullptr; + const void* one_column_data = GetColWiseData(feature_group_index, + sub_feature_index, + &bit_type, + &is_sparse, + &bin_iterator); + column_data.emplace_back(one_column_data); + column_bin_iterator.emplace_back(bin_iterator); + column_bit_type.emplace_back(bit_type); + feature_to_column[feature_index] = num_columns; + ++num_columns; + const BinMapper* feature_bin_mapper = FeatureBinMapper(feature_index); + feature_max_bins[feature_index] = feature_max_bin(feature_index); + feature_min_bins[feature_index] = feature_min_bin(feature_index); + const uint32_t most_freq_bin = feature_bin_mapper->GetMostFreqBin(); + feature_offsets[feature_index] = static_cast(most_freq_bin == 0); + feature_most_freq_bins[feature_index] = most_freq_bin; + feature_default_bin[feature_index] = feature_bin_mapper->GetDefaultBin(); + if (feature_bin_mapper->missing_type() == MissingType::Zero) { + feature_missing_is_zero[feature_index] = 1; + feature_missing_is_na[feature_index] = 0; + if (feature_default_bin[feature_index] == feature_most_freq_bins[feature_index]) { + feature_mfb_is_zero[feature_index] = 1; + } else { + feature_mfb_is_zero[feature_index] = 0; + } + feature_mfb_is_na[feature_index] = 0; + } else if (feature_bin_mapper->missing_type() == MissingType::NaN) { + feature_missing_is_zero[feature_index] = 0; + feature_missing_is_na[feature_index] = 1; + feature_mfb_is_zero[feature_index] = 0; + if (feature_most_freq_bins[feature_index] + feature_min_bins[feature_index] == feature_max_bins[feature_index] && + feature_most_freq_bins[feature_index] > 0) { + feature_mfb_is_na[feature_index] = 1; + } else { + feature_mfb_is_na[feature_index] = 0; + } + } else { + 
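CreateCUDAColumnData flattens each feature's BinMapper into four byte flags consumed by the CUDA column store: missing_is_zero, missing_is_na, mfb_is_zero, and mfb_is_na, where mfb is the most-frequent bin that dense column formats store implicitly. A host-side sketch of the derivation, matching the branches of CreateCUDAColumnData but with standalone names:

#include <cstdint>

enum class MissingType { None, Zero, NaN };

struct MissingFlags {
  uint8_t missing_is_zero = 0;
  uint8_t missing_is_na = 0;
  uint8_t mfb_is_zero = 0;
  uint8_t mfb_is_na = 0;
};

MissingFlags DeriveFlags(MissingType type, uint32_t default_bin, uint32_t most_freq_bin,
                         uint32_t min_bin, uint32_t max_bin) {
  MissingFlags f;
  if (type == MissingType::Zero) {
    f.missing_is_zero = 1;
    // the most-frequent bin represents "zero" iff it coincides with the default bin
    f.mfb_is_zero = (default_bin == most_freq_bin) ? 1 : 0;
  } else if (type == MissingType::NaN) {
    f.missing_is_na = 1;
    // NaN occupies the last bin; the most-frequent bin is "NaN" iff it maps there
    f.mfb_is_na = (most_freq_bin + min_bin == max_bin && most_freq_bin > 0) ? 1 : 0;
  }
  return f;
}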
feature_missing_is_zero[feature_index] = 0; + feature_missing_is_na[feature_index] = 0; + feature_mfb_is_zero[feature_index] = 0; + feature_mfb_is_na[feature_index] = 0; + } + ++feature_index; + } + } else { + uint8_t bit_type = 0; + bool is_sparse = false; + BinIterator* bin_iterator = nullptr; + const void* one_column_data = GetColWiseData(feature_group_index, + -1, + &bit_type, + &is_sparse, + &bin_iterator); + column_data.emplace_back(one_column_data); + column_bin_iterator.emplace_back(bin_iterator); + column_bit_type.emplace_back(bit_type); + for (int sub_feature_index = 0; sub_feature_index < feature_groups_[feature_group_index]->num_feature_; ++sub_feature_index) { + feature_to_column[feature_index] = num_columns; + const BinMapper* feature_bin_mapper = FeatureBinMapper(feature_index); + feature_max_bins[feature_index] = feature_max_bin(feature_index); + feature_min_bins[feature_index] = feature_min_bin(feature_index); + const uint32_t most_freq_bin = feature_bin_mapper->GetMostFreqBin(); + feature_offsets[feature_index] = static_cast(most_freq_bin == 0); + feature_most_freq_bins[feature_index] = most_freq_bin; + feature_default_bin[feature_index] = feature_bin_mapper->GetDefaultBin(); + if (feature_bin_mapper->missing_type() == MissingType::Zero) { + feature_missing_is_zero[feature_index] = 1; + feature_missing_is_na[feature_index] = 0; + if (feature_default_bin[feature_index] == feature_most_freq_bins[feature_index]) { + feature_mfb_is_zero[feature_index] = 1; + } else { + feature_mfb_is_zero[feature_index] = 0; + } + feature_mfb_is_na[feature_index] = 0; + } else if (feature_bin_mapper->missing_type() == MissingType::NaN) { + feature_missing_is_zero[feature_index] = 0; + feature_missing_is_na[feature_index] = 1; + feature_mfb_is_zero[feature_index] = 0; + if (feature_most_freq_bins[feature_index] + feature_min_bins[feature_index] == feature_max_bins[feature_index] && + feature_most_freq_bins[feature_index] > 0) { + feature_mfb_is_na[feature_index] = 1; + } else { + feature_mfb_is_na[feature_index] = 0; + } + } else { + feature_missing_is_zero[feature_index] = 0; + feature_missing_is_na[feature_index] = 0; + feature_mfb_is_zero[feature_index] = 0; + feature_mfb_is_na[feature_index] = 0; + } + ++feature_index; + } + ++num_columns; + } + } + cuda_column_data_->Init(num_columns, + column_data, + column_bin_iterator, + column_bit_type, + feature_max_bins, + feature_min_bins, + feature_offsets, + feature_most_freq_bins, + feature_default_bin, + feature_missing_is_zero, + feature_missing_is_na, + feature_mfb_is_zero, + feature_mfb_is_na, + feature_to_column); +} + +#endif // USE_CUDA_EXP + } // namespace LightGBM diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index 1beff696f904..246424600b03 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -234,8 +234,9 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac auto sample_data = SampleTextDataFromMemory(text_data); CheckSampleSize(sample_data.size(), static_cast(dataset->num_data_)); - // construct feature bin mappers + // construct feature bin mappers & clear sample data ConstructBinMappersFromTextData(rank, num_machines, sample_data, parser.get(), dataset.get()); + std::vector().swap(sample_data); if (dataset->has_raw()) { dataset->ResizeRaw(dataset->num_data_); } @@ -254,8 +255,9 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac } CheckSampleSize(sample_data.size(), static_cast(dataset->num_data_)); - // construct feature bin 
mappers + // construct feature bin mappers & clear sample data ConstructBinMappersFromTextData(rank, num_machines, sample_data, parser.get(), dataset.get()); + std::vector().swap(sample_data); if (dataset->has_raw()) { dataset->ResizeRaw(dataset->num_data_); } @@ -270,6 +272,21 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac is_load_from_binary = true; Log::Info("Load from binary file %s", bin_filename.c_str()); dataset.reset(LoadFromBinFile(filename, bin_filename.c_str(), rank, num_machines, &num_global_data, &used_data_indices)); + + // checks whether there's an initial score file when loaded from binary data files + // the initial score file should have the suffix ".bin.init" + dataset->metadata_.LoadInitialScore(bin_filename); + + dataset->device_type_ = config_.device_type; + dataset->gpu_device_id_ = config_.gpu_device_id; + #ifdef USE_CUDA_EXP + if (config_.device_type == std::string("cuda_exp")) { + dataset->CreateCUDAColumnData(); + dataset->metadata_.CreateCUDAMetadata(dataset->gpu_device_id_); + } else { + dataset->cuda_column_data_ = nullptr; + } + #endif // USE_CUDA_EXP } // check meta data dataset->metadata_.CheckOrPartition(num_global_data, used_data_indices); @@ -289,7 +306,7 @@ Dataset* DatasetLoader::LoadFromFileAlignWithOtherDataset(const char* filename, auto bin_filename = CheckCanLoadFromBin(filename); if (bin_filename.size() == 0) { auto parser = std::unique_ptr(Parser::CreateParser(filename, config_.header, 0, label_idx_, - config_.precise_float_parser, dataset->parser_config_str_)); + config_.precise_float_parser, train_data->parser_config_str_)); if (parser == nullptr) { Log::Fatal("Could not recognize data format of %s", filename); } @@ -326,6 +343,9 @@ Dataset* DatasetLoader::LoadFromFileAlignWithOtherDataset(const char* filename, } else { // load data from binary file dataset.reset(LoadFromBinFile(filename, bin_filename.c_str(), 0, 1, &num_global_data, &used_data_indices)); + // checks whether there's an initial score file when loaded from binary data files + // the initial score file should have the suffix ".bin.init" + dataset->metadata_.LoadInitialScore(bin_filename); } // no need to check validation data // check meta data @@ -589,7 +609,7 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b read_cnt = reader->Read(buffer.data(), size_of_feature); if (read_cnt != size_of_feature) { - Log::Fatal("Binary file error: feature %d is incorrect, read count: %d", i, read_cnt); + Log::Fatal("Binary file error: feature %d is incorrect, read count: %zu", i, read_cnt); } dataset->feature_groups_.emplace_back(std::unique_ptr( new FeatureGroup(buffer.data(), @@ -619,7 +639,7 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b for (int i = 0; i < dataset->num_data(); ++i) { read_cnt = reader->Read(buffer.data(), row_size); if (read_cnt != row_size) { - Log::Fatal("Binary file error: row %d of raw data is incorrect, read count: %zu", i, read_cnt); + Log::Fatal("Binary file error: row %d of raw data is incorrect, read count: %zu", i, read_cnt); } mem_ptr = buffer.data(); const float* tmp_ptr_raw_row = reinterpret_cast(mem_ptr); @@ -637,11 +657,14 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b return dataset.release(); } - Dataset* DatasetLoader::ConstructFromSampleData(double** sample_values, - int** sample_indices, int num_col, const int* num_per_col, - size_t total_sample_size, data_size_t num_data) { - CheckSampleSize(total_sample_size, 
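Once the bin mappers are constructed the raw text sample is dead weight, so the loader releases it immediately with the swap-with-a-temporary idiom; unlike clear(), swapping against an empty vector actually returns the allocation to the allocator. A minimal sketch, with std::string elements standing in for the sampled rows:

#include <string>
#include <vector>

int main() {
  std::vector<std::string> sample_data(100000, "some,sampled,row");
  sample_data.clear();                           // size becomes 0, capacity is kept
  std::vector<std::string>().swap(sample_data);  // capacity is actually released
  return sample_data.empty() ? 0 : 1;
}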
static_cast(num_data)); + int** sample_indices, + int num_col, + const int* num_per_col, + size_t total_sample_size, + data_size_t num_local_data, + int64_t num_dist_data) { + CheckSampleSize(total_sample_size, static_cast(num_dist_data)); int num_total_features = num_col; if (Network::num_machines() > 1) { num_total_features = Network::GlobalSyncUpByMax(num_total_features); @@ -665,7 +688,7 @@ Dataset* DatasetLoader::ConstructFromSampleData(double** sample_values, std::vector> forced_bin_bounds = DatasetLoader::GetForcedBins(forced_bins_path, num_col, categorical_features_); const data_size_t filter_cnt = static_cast( - static_cast(config_.min_data_in_leaf * total_sample_size) / num_data); + static_cast(config_.min_data_in_leaf * total_sample_size) / num_dist_data); if (Network::num_machines() == 1) { // if only one machine, find bin locally OMP_INIT_EX(); @@ -785,10 +808,11 @@ Dataset* DatasetLoader::ConstructFromSampleData(double** sample_values, cp_ptr += bin_mappers[i]->SizesInByte(); } } - auto dataset = std::unique_ptr(new Dataset(num_data)); + CheckCategoricalFeatureNumBin(bin_mappers, config_.max_bin, config_.max_bin_by_feature); + auto dataset = std::unique_ptr(new Dataset(num_local_data)); dataset->Construct(&bin_mappers, num_total_features, forced_bin_bounds, sample_indices, sample_values, num_per_col, num_col, total_sample_size, config_); if (dataset->has_raw()) { - dataset->ResizeRaw(num_data); + dataset->ResizeRaw(num_local_data); } dataset->set_feature_names(feature_names_); return dataset.release(); @@ -1164,6 +1188,7 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines, cp_ptr += bin_mappers[i]->SizesInByte(); } } + CheckCategoricalFeatureNumBin(bin_mappers, config_.max_bin, config_.max_bin_by_feature); dataset->Construct(&bin_mappers, dataset->num_total_features_, forced_bin_bounds, Common::Vector2Ptr(&sample_indices).data(), Common::Vector2Ptr(&sample_values).data(), Common::VectorSize(sample_indices).data(), static_cast(sample_indices.size()), sample_data.size(), config_); @@ -1235,7 +1260,7 @@ void DatasetLoader::ExtractFeaturesFromMemory(std::vector* text_dat } else { OMP_INIT_EX(); // if need to prediction with initial model - std::vector init_score(dataset->num_data_ * num_class_); + std::vector init_score(static_cast(dataset->num_data_) * num_class_); #pragma omp parallel for schedule(static) private(oneline_features) firstprivate(tmp_label, feature_row) for (data_size_t i = 0; i < dataset->num_data_; ++i) { OMP_LOOP_EX_BEGIN(); @@ -1302,7 +1327,7 @@ void DatasetLoader::ExtractFeaturesFromFile(const char* filename, const Parser* const std::vector& used_data_indices, Dataset* dataset) { std::vector init_score; if (predict_fun_) { - init_score = std::vector(dataset->num_data_ * num_class_); + init_score = std::vector(static_cast(dataset->num_data_) * num_class_); } std::function&)> process_fun = [this, &init_score, &parser, &dataset] @@ -1443,4 +1468,44 @@ std::vector> DatasetLoader::GetForcedBins(std::string forced return forced_bins; } +void DatasetLoader::CheckCategoricalFeatureNumBin( + const std::vector>& bin_mappers, + const int max_bin, const std::vector& max_bin_by_feature) const { + bool need_warning = false; + if (bin_mappers.size() < 1024) { + for (size_t i = 0; i < bin_mappers.size(); ++i) { + const int max_bin_for_this_feature = max_bin_by_feature.empty() ? 
max_bin : max_bin_by_feature[i]; + if (bin_mappers[i] != nullptr && bin_mappers[i]->bin_type() == BinType::CategoricalBin && bin_mappers[i]->num_bin() > max_bin_for_this_feature) { + need_warning = true; + break; + } + } + } else { + const int num_threads = OMP_NUM_THREADS(); + std::vector thread_need_warning(num_threads, false); + Threading::For(0, bin_mappers.size(), 1, + [&bin_mappers, &thread_need_warning, &max_bin_by_feature, max_bin] (int thread_index, size_t start, size_t end) { + for (size_t i = start; i < end; ++i) { + thread_need_warning[thread_index] = false; + const int max_bin_for_this_feature = max_bin_by_feature.empty() ? max_bin : max_bin_by_feature[i]; + if (bin_mappers[i] != nullptr && bin_mappers[i]->bin_type() == BinType::CategoricalBin && bin_mappers[i]->num_bin() > max_bin_for_this_feature) { + thread_need_warning[thread_index] = true; + break; + } + } + }); + for (int thread_index = 0; thread_index < num_threads; ++thread_index) { + if (thread_need_warning[thread_index]) { + need_warning = true; + break; + } + } + } + + if (need_warning) { + Log::Warning("Categorical features with more bins than the configured maximum bin number found."); + Log::Warning("For categorical features, max_bin and max_bin_by_feature may be ignored with a large number of categories."); + } +} + } // namespace LightGBM diff --git a/src/io/dense_bin.cpp b/src/io/dense_bin.cpp new file mode 100644 index 000000000000..89475f57110a --- /dev/null +++ b/src/io/dense_bin.cpp @@ -0,0 +1,103 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. + */ + +#include "dense_bin.hpp" + +namespace LightGBM { + +template <> +const void* DenseBin::GetColWiseData( + uint8_t* bit_type, + bool* is_sparse, + std::vector* bin_iterator, + const int /*num_threads*/) const { + *is_sparse = false; + *bit_type = 8; + bin_iterator->clear(); + return reinterpret_cast(data_.data()); +} + +template <> +const void* DenseBin::GetColWiseData( + uint8_t* bit_type, + bool* is_sparse, + std::vector* bin_iterator, + const int /*num_threads*/) const { + *is_sparse = false; + *bit_type = 16; + bin_iterator->clear(); + return reinterpret_cast(data_.data()); +} + +template <> +const void* DenseBin::GetColWiseData( + uint8_t* bit_type, + bool* is_sparse, + std::vector* bin_iterator, + const int /*num_threads*/) const { + *is_sparse = false; + *bit_type = 32; + bin_iterator->clear(); + return reinterpret_cast(data_.data()); +} + +template <> +const void* DenseBin::GetColWiseData( + uint8_t* bit_type, + bool* is_sparse, + std::vector* bin_iterator, + const int /*num_threads*/) const { + *is_sparse = false; + *bit_type = 4; + bin_iterator->clear(); + return reinterpret_cast(data_.data()); +} + +template <> +const void* DenseBin::GetColWiseData( + uint8_t* bit_type, + bool* is_sparse, + BinIterator** bin_iterator) const { + *is_sparse = false; + *bit_type = 8; + *bin_iterator = nullptr; + return reinterpret_cast(data_.data()); +} + +template <> +const void* DenseBin::GetColWiseData( + uint8_t* bit_type, + bool* is_sparse, + BinIterator** bin_iterator) const { + *is_sparse = false; + *bit_type = 16; + *bin_iterator = nullptr; + return reinterpret_cast(data_.data()); +} + +template <> +const void* DenseBin::GetColWiseData( + uint8_t* bit_type, + bool* is_sparse, + BinIterator** bin_iterator) const { + *is_sparse = false; + *bit_type = 32; + *bin_iterator = nullptr; + return reinterpret_cast(data_.data()); +} + +template 
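CheckCategoricalFeatureNumBin switches to a threaded scan once there are at least 1024 features: each worker raises only its own flag, and the flags are OR-reduced serially afterwards, which avoids atomics or locks on the hot path. A minimal OpenMP sketch of the same pattern (standalone names, not LightGBM's Threading::For wrapper; assumes OpenMP is available):

#include <omp.h>
#include <vector>

bool AnyExceedsMaxBin(const std::vector<int>& num_bins, int max_bin) {
  const int num_threads = omp_get_max_threads();
  std::vector<char> thread_flag(num_threads, 0);  // one slot per thread: no sharing
  #pragma omp parallel for schedule(static)
  for (int i = 0; i < static_cast<int>(num_bins.size()); ++i) {
    if (num_bins[i] > max_bin) {
      thread_flag[omp_get_thread_num()] = 1;
    }
  }
  for (int t = 0; t < num_threads; ++t) {  // serial OR-reduction
    if (thread_flag[t]) return true;
  }
  return false;
}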
<> +const void* DenseBin::GetColWiseData( + uint8_t* bit_type, + bool* is_sparse, + BinIterator** bin_iterator) const { + *is_sparse = false; + *bit_type = 4; + *bin_iterator = nullptr; + return reinterpret_cast(data_.data()); +} + +} // namespace LightGBM diff --git a/src/io/dense_bin.hpp b/src/io/dense_bin.hpp index 20ffd724e34c..5d95d9dc6073 100644 --- a/src/io/dense_bin.hpp +++ b/src/io/dense_bin.hpp @@ -461,9 +461,13 @@ class DenseBin : public Bin { DenseBin* Clone() override; + const void* GetColWiseData(uint8_t* bit_type, bool* is_sparse, std::vector* bin_iterator, const int num_threads) const override; + + const void* GetColWiseData(uint8_t* bit_type, bool* is_sparse, BinIterator** bin_iterator) const override; + private: data_size_t num_data_; -#ifdef USE_CUDA +#if defined(USE_CUDA) || defined(USE_CUDA_EXP) std::vector> data_; #else std::vector> data_; diff --git a/src/io/metadata.cpp b/src/io/metadata.cpp index 49fc834b87df..185487629022 100644 --- a/src/io/metadata.cpp +++ b/src/io/metadata.cpp @@ -18,6 +18,9 @@ Metadata::Metadata() { weight_load_from_file_ = false; query_load_from_file_ = false; init_score_load_from_file_ = false; + #ifdef USE_CUDA_EXP + cuda_metadata_ = nullptr; + #endif // USE_CUDA_EXP } void Metadata::Init(const char* data_filename) { @@ -26,7 +29,7 @@ void Metadata::Init(const char* data_filename) { LoadQueryBoundaries(); LoadWeights(); LoadQueryWeights(); - LoadInitialScore(); + LoadInitialScore(data_filename_); } Metadata::~Metadata() { @@ -302,6 +305,11 @@ void Metadata::SetInitScore(const double* init_score, data_size_t len) { init_score_[i] = Common::AvoidInf(init_score[i]); } init_score_load_from_file_ = false; + #ifdef USE_CUDA_EXP + if (cuda_metadata_ != nullptr) { + cuda_metadata_->SetInitScore(init_score_.data(), len); + } + #endif // USE_CUDA_EXP } void Metadata::SetLabel(const label_t* label, data_size_t len) { @@ -318,6 +326,11 @@ void Metadata::SetLabel(const label_t* label, data_size_t len) { for (data_size_t i = 0; i < num_data_; ++i) { label_[i] = Common::AvoidInf(label[i]); } + #ifdef USE_CUDA_EXP + if (cuda_metadata_ != nullptr) { + cuda_metadata_->SetLabel(label_.data(), len); + } + #endif // USE_CUDA_EXP } void Metadata::SetWeights(const label_t* weights, data_size_t len) { @@ -340,6 +353,11 @@ void Metadata::SetWeights(const label_t* weights, data_size_t len) { } LoadQueryWeights(); weight_load_from_file_ = false; + #ifdef USE_CUDA_EXP + if (cuda_metadata_ != nullptr) { + cuda_metadata_->SetWeights(weights_.data(), len); + } + #endif // USE_CUDA_EXP } void Metadata::SetQuery(const data_size_t* query, data_size_t len) { @@ -366,6 +384,16 @@ void Metadata::SetQuery(const data_size_t* query, data_size_t len) { } LoadQueryWeights(); query_load_from_file_ = false; + #ifdef USE_CUDA_EXP + if (cuda_metadata_ != nullptr) { + if (query_weights_.size() > 0) { + CHECK_EQ(query_weights_.size(), static_cast(num_queries_)); + cuda_metadata_->SetQuery(query_boundaries_.data(), query_weights_.data(), num_queries_); + } else { + cuda_metadata_->SetQuery(query_boundaries_.data(), nullptr, num_queries_); + } + } + #endif // USE_CUDA_EXP } void Metadata::LoadWeights() { @@ -390,10 +418,10 @@ void Metadata::LoadWeights() { weight_load_from_file_ = true; } -void Metadata::LoadInitialScore() { +void Metadata::LoadInitialScore(const std::string& data_filename) { num_init_score_ = 0; - std::string init_score_filename(data_filename_); - init_score_filename = std::string(data_filename_); + std::string init_score_filename(data_filename); + init_score_filename = 
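The GetColWiseData contract splits by representation: dense bins return the raw bin buffer plus its element width in bit_type (4, 8, 16, or 32) and no iterator, while sparse bins (further down in this diff) return nullptr and hand out BinIterator objects instead, so callers must branch on is_sparse first. A consumer-side sketch of reading a dense column, assuming the 4-bit layout packs two bins per byte with the low nibble holding the even row:

#include <cstddef>
#include <cstdint>

uint32_t ReadDenseBin(const void* data, uint8_t bit_type, size_t row) {
  switch (bit_type) {
    case 8:  return static_cast<const uint8_t*>(data)[row];
    case 16: return static_cast<const uint16_t*>(data)[row];
    case 32: return static_cast<const uint32_t*>(data)[row];
    case 4: {  // two 4-bit bins per byte, low nibble = even row
      const uint8_t byte = static_cast<const uint8_t*>(data)[row >> 1];
      return (row & 1) ? (byte >> 4) : (byte & 0xf);
    }
    default: return 0;
  }
}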
std::string(data_filename); // default init_score file name init_score_filename.append(".init"); TextReader reader(init_score_filename.c_str(), false); @@ -472,6 +500,13 @@ void Metadata::LoadQueryWeights() { } } +#ifdef USE_CUDA_EXP +void Metadata::CreateCUDAMetadata(const int gpu_device_id) { + cuda_metadata_.reset(new CUDAMetadata(gpu_device_id)); + cuda_metadata_->Init(label_, weights_, query_boundaries_, query_weights_, init_score_); +} +#endif // USE_CUDA_EXP + void Metadata::LoadFromMemory(const void* memory) { const char* mem_ptr = reinterpret_cast(memory); diff --git a/src/io/multi_val_dense_bin.cpp b/src/io/multi_val_dense_bin.cpp new file mode 100644 index 000000000000..f6cf41b9bb21 --- /dev/null +++ b/src/io/multi_val_dense_bin.cpp @@ -0,0 +1,64 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. + */ + +#include "multi_val_dense_bin.hpp" + +namespace LightGBM { + + +#ifdef USE_CUDA_EXP +template <> +const void* MultiValDenseBin::GetRowWiseData(uint8_t* bit_type, + size_t* total_size, + bool* is_sparse, + const void** out_data_ptr, + uint8_t* data_ptr_bit_type) const { + const uint8_t* to_return = data_.data(); + *bit_type = 8; + *total_size = static_cast(num_data_) * static_cast(num_feature_); + CHECK_EQ(*total_size, data_.size()); + *is_sparse = false; + *out_data_ptr = nullptr; + *data_ptr_bit_type = 0; + return to_return; +} + +template <> +const void* MultiValDenseBin::GetRowWiseData(uint8_t* bit_type, + size_t* total_size, + bool* is_sparse, + const void** out_data_ptr, + uint8_t* data_ptr_bit_type) const { + const uint16_t* data_ptr = data_.data(); + const uint8_t* to_return = reinterpret_cast(data_ptr); + *bit_type = 16; + *total_size = static_cast(num_data_) * static_cast(num_feature_); + CHECK_EQ(*total_size, data_.size()); + *is_sparse = false; + *out_data_ptr = nullptr; + *data_ptr_bit_type = 0; + return to_return; +} + +template <> +const void* MultiValDenseBin::GetRowWiseData(uint8_t* bit_type, + size_t* total_size, + bool* is_sparse, + const void** out_data_ptr, + uint8_t* data_ptr_bit_type) const { + const uint32_t* data_ptr = data_.data(); + const uint8_t* to_return = reinterpret_cast(data_ptr); + *bit_type = 32; + *total_size = static_cast(num_data_) * static_cast(num_feature_); + CHECK_EQ(*total_size, data_.size()); + *is_sparse = false; + *out_data_ptr = nullptr; + *data_ptr_bit_type = 0; + return to_return; +} + +#endif // USE_CUDA_EXP + +} // namespace LightGBM diff --git a/src/io/multi_val_dense_bin.hpp b/src/io/multi_val_dense_bin.hpp index 9559e38b7f72..8de9cf305952 100644 --- a/src/io/multi_val_dense_bin.hpp +++ b/src/io/multi_val_dense_bin.hpp @@ -7,6 +7,7 @@ #include #include +#include #include #include @@ -210,6 +211,14 @@ class MultiValDenseBin : public MultiValBin { MultiValDenseBin* Clone() override; + #ifdef USE_CUDA_EXP + const void* GetRowWiseData(uint8_t* bit_type, + size_t* total_size, + bool* is_sparse, + const void** out_data_ptr, + uint8_t* data_ptr_bit_type) const override; + #endif // USE_CUDA_EXP + private: data_size_t num_data_; int num_bin_; @@ -229,4 +238,5 @@ MultiValDenseBin* MultiValDenseBin::Clone() { } } // namespace LightGBM + #endif // LIGHTGBM_IO_MULTI_VAL_DENSE_BIN_HPP_ diff --git a/src/io/multi_val_sparse_bin.cpp b/src/io/multi_val_sparse_bin.cpp new file mode 100644 index 000000000000..359bf31c1053 --- /dev/null +++ b/src/io/multi_val_sparse_bin.cpp @@ -0,0 +1,158 @@ +/*! 
+ * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. + */ + +#include "multi_val_sparse_bin.hpp" + +namespace LightGBM { + +#ifdef USE_CUDA_EXP + +template <> +const void* MultiValSparseBin::GetRowWiseData( + uint8_t* bit_type, + size_t* total_size, + bool* is_sparse, + const void** out_data_ptr, + uint8_t* data_ptr_bit_type) const { + const uint8_t* to_return = data_.data(); + *bit_type = 8; + *total_size = data_.size(); + *is_sparse = true; + *out_data_ptr = reinterpret_cast(row_ptr_.data()); + *data_ptr_bit_type = 16; + return to_return; +} + +template <> +const void* MultiValSparseBin::GetRowWiseData( + uint8_t* bit_type, + size_t* total_size, + bool* is_sparse, + const void** out_data_ptr, + uint8_t* data_ptr_bit_type) const { + const uint8_t* to_return = reinterpret_cast(data_.data()); + *bit_type = 16; + *total_size = data_.size(); + *is_sparse = true; + *out_data_ptr = reinterpret_cast(row_ptr_.data()); + *data_ptr_bit_type = 16; + return to_return; +} + +template <> +const void* MultiValSparseBin::GetRowWiseData( + uint8_t* bit_type, + size_t* total_size, + bool* is_sparse, + const void** out_data_ptr, + uint8_t* data_ptr_bit_type) const { + const uint8_t* to_return = reinterpret_cast(data_.data()); + *bit_type = 32; + *total_size = data_.size(); + *is_sparse = true; + *out_data_ptr = reinterpret_cast(row_ptr_.data()); + *data_ptr_bit_type = 16; + return to_return; +} + +template <> +const void* MultiValSparseBin::GetRowWiseData( + uint8_t* bit_type, + size_t* total_size, + bool* is_sparse, + const void** out_data_ptr, + uint8_t* data_ptr_bit_type) const { + const uint8_t* to_return = data_.data(); + *bit_type = 8; + *total_size = data_.size(); + *is_sparse = true; + *out_data_ptr = reinterpret_cast(row_ptr_.data()); + *data_ptr_bit_type = 32; + return to_return; +} + +template <> +const void* MultiValSparseBin::GetRowWiseData( + uint8_t* bit_type, + size_t* total_size, + bool* is_sparse, + const void** out_data_ptr, + uint8_t* data_ptr_bit_type) const { + const uint8_t* to_return = reinterpret_cast(data_.data()); + *bit_type = 16; + *total_size = data_.size(); + *is_sparse = true; + *out_data_ptr = reinterpret_cast(row_ptr_.data()); + *data_ptr_bit_type = 32; + return to_return; +} + +template <> +const void* MultiValSparseBin::GetRowWiseData( + uint8_t* bit_type, + size_t* total_size, + bool* is_sparse, + const void** out_data_ptr, + uint8_t* data_ptr_bit_type) const { + const uint8_t* to_return = reinterpret_cast(data_.data()); + *bit_type = 32; + *total_size = data_.size(); + *is_sparse = true; + *out_data_ptr = reinterpret_cast(row_ptr_.data()); + *data_ptr_bit_type = 32; + return to_return; +} + +template <> +const void* MultiValSparseBin::GetRowWiseData( + uint8_t* bit_type, + size_t* total_size, + bool* is_sparse, + const void** out_data_ptr, + uint8_t* data_ptr_bit_type) const { + const uint8_t* to_return = data_.data(); + *bit_type = 8; + *total_size = data_.size(); + *is_sparse = true; + *out_data_ptr = reinterpret_cast(row_ptr_.data()); + *data_ptr_bit_type = 64; + return to_return; +} + +template <> +const void* MultiValSparseBin::GetRowWiseData( + uint8_t* bit_type, + size_t* total_size, + bool* is_sparse, + const void** out_data_ptr, + uint8_t* data_ptr_bit_type) const { + const uint8_t* to_return = reinterpret_cast(data_.data()); + *bit_type = 16; + *total_size = data_.size(); + *is_sparse = true; + *out_data_ptr = reinterpret_cast(row_ptr_.data()); + 
*data_ptr_bit_type = 64; + return to_return; +} + +template <> +const void* MultiValSparseBin::GetRowWiseData( + uint8_t* bit_type, + size_t* total_size, + bool* is_sparse, + const void** out_data_ptr, + uint8_t* data_ptr_bit_type) const { + const uint8_t* to_return = reinterpret_cast(data_.data()); + *bit_type = 32; + *total_size = data_.size(); + *is_sparse = true; + *out_data_ptr = reinterpret_cast(row_ptr_.data()); + *data_ptr_bit_type = 64; + return to_return; +} + +#endif // USE_CUDA_EXP + +} // namespace LightGBM diff --git a/src/io/multi_val_sparse_bin.hpp b/src/io/multi_val_sparse_bin.hpp index 1699380732c6..80acbb681ab6 100644 --- a/src/io/multi_val_sparse_bin.hpp +++ b/src/io/multi_val_sparse_bin.hpp @@ -7,6 +7,7 @@ #include #include +#include #include #include @@ -290,6 +291,15 @@ class MultiValSparseBin : public MultiValBin { MultiValSparseBin* Clone() override; + + #ifdef USE_CUDA_EXP + const void* GetRowWiseData(uint8_t* bit_type, + size_t* total_size, + bool* is_sparse, + const void** out_data_ptr, + uint8_t* data_ptr_bit_type) const override; + #endif // USE_CUDA_EXP + private: data_size_t num_data_; int num_bin_; @@ -317,4 +327,5 @@ MultiValSparseBin* MultiValSparseBin::Clone() { } } // namespace LightGBM + #endif // LIGHTGBM_IO_MULTI_VAL_SPARSE_BIN_HPP_ diff --git a/src/io/sparse_bin.cpp b/src/io/sparse_bin.cpp new file mode 100644 index 000000000000..8c45fd512a04 --- /dev/null +++ b/src/io/sparse_bin.cpp @@ -0,0 +1,86 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. + */ + +#include "sparse_bin.hpp" + +namespace LightGBM { + +template <> +const void* SparseBin::GetColWiseData( + uint8_t* bit_type, + bool* is_sparse, + std::vector* bin_iterator, + const int num_threads) const { + *is_sparse = true; + *bit_type = 8; + for (int thread_index = 0; thread_index < num_threads; ++thread_index) { + bin_iterator->emplace_back(new SparseBinIterator(this, 0)); + } + return nullptr; +} + +template <> +const void* SparseBin::GetColWiseData( + uint8_t* bit_type, + bool* is_sparse, + std::vector* bin_iterator, + const int num_threads) const { + *is_sparse = true; + *bit_type = 16; + for (int thread_index = 0; thread_index < num_threads; ++thread_index) { + bin_iterator->emplace_back(new SparseBinIterator(this, 0)); + } + return nullptr; +} + +template <> +const void* SparseBin::GetColWiseData( + uint8_t* bit_type, + bool* is_sparse, + std::vector* bin_iterator, + const int num_threads) const { + *is_sparse = true; + *bit_type = 32; + for (int thread_index = 0; thread_index < num_threads; ++thread_index) { + bin_iterator->emplace_back(new SparseBinIterator(this, 0)); + } + return nullptr; +} + +template <> +const void* SparseBin::GetColWiseData( + uint8_t* bit_type, + bool* is_sparse, + BinIterator** bin_iterator) const { + *is_sparse = true; + *bit_type = 8; + *bin_iterator = new SparseBinIterator(this, 0); + return nullptr; +} + +template <> +const void* SparseBin::GetColWiseData( + uint8_t* bit_type, + bool* is_sparse, + BinIterator** bin_iterator) const { + *is_sparse = true; + *bit_type = 16; + *bin_iterator = new SparseBinIterator(this, 0); + return nullptr; +} + +template <> +const void* SparseBin::GetColWiseData( + uint8_t* bit_type, + bool* is_sparse, + BinIterator** bin_iterator) const { + *is_sparse = true; + *bit_type = 32; + *bin_iterator = new SparseBinIterator(this, 0); + return nullptr; +} + +} // namespace LightGBM diff --git 
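The nine specializations above enumerate every combination of row-pointer width (uint16_t, uint32_t, uint64_t, reported through data_ptr_bit_type) and bin-value width (uint8_t, uint16_t, uint32_t, reported through bit_type), so the CUDA side can reinterpret both buffers without templates. A sketch of what the descriptor describes, with hypothetical names:

#include <cstddef>
#include <cstdint>

// CSR-style view over a multi-value sparse bin: value width and offset width
// vary independently and are declared by the two bit-type fields.
struct RowWiseView {
  const void* values;          // bin values, element width = bit_type bits
  const void* row_ptr;         // per-row offsets, element width = data_ptr_bit_type bits
  uint8_t bit_type;            // 8, 16, or 32
  uint8_t data_ptr_bit_type;   // 16, 32, or 64
};

// Number of values stored for `row`, read through the declared offset width.
size_t RowLength(const RowWiseView& v, size_t row) {
  switch (v.data_ptr_bit_type) {
    case 16: { const uint16_t* p = static_cast<const uint16_t*>(v.row_ptr); return p[row + 1] - p[row]; }
    case 32: { const uint32_t* p = static_cast<const uint32_t*>(v.row_ptr); return p[row + 1] - p[row]; }
    case 64: { const uint64_t* p = static_cast<const uint64_t*>(v.row_ptr); return static_cast<size_t>(p[row + 1] - p[row]); }
    default: return 0;
  }
}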
a/src/io/sparse_bin.hpp b/src/io/sparse_bin.hpp index e4259a48862b..40a4856934b5 100644 --- a/src/io/sparse_bin.hpp +++ b/src/io/sparse_bin.hpp @@ -620,6 +620,10 @@ class SparseBin : public Bin { } } + const void* GetColWiseData(uint8_t* bit_type, bool* is_sparse, std::vector* bin_iterator, const int num_threads) const override; + + const void* GetColWiseData(uint8_t* bit_type, bool* is_sparse, BinIterator** bin_iterator) const override; + private: data_size_t num_data_; std::vector> @@ -665,4 +669,5 @@ BinIterator* SparseBin::GetIterator(uint32_t min_bin, uint32_t max_bin, } } // namespace LightGBM + #endif // LightGBM_IO_SPARSE_BIN_HPP_ diff --git a/src/io/train_share_states.cpp b/src/io/train_share_states.cpp index 478c520f1c68..199424733f80 100644 --- a/src/io/train_share_states.cpp +++ b/src/io/train_share_states.cpp @@ -382,6 +382,9 @@ void TrainingShareStates::CalcBinOffsets(const std::vector(feature_hist_offsets_.back()); } + #ifdef USE_CUDA_EXP + column_hist_offsets_ = *offsets; + #endif // USE_CUDA_EXP } void TrainingShareStates::SetMultiValBin(MultiValBin* bin, data_size_t num_data, diff --git a/src/io/tree.cpp b/src/io/tree.cpp index e3c770491ff6..6cd0d50b4da7 100644 --- a/src/io/tree.cpp +++ b/src/io/tree.cpp @@ -53,6 +53,9 @@ Tree::Tree(int max_leaves, bool track_branch_features, bool is_linear) leaf_features_.resize(max_leaves_); leaf_features_inner_.resize(max_leaves_); } + #ifdef USE_CUDA_EXP + is_cuda_tree_ = false; + #endif // USE_CUDA_EXP } int Tree::Split(int leaf, int feature, int real_feature, uint32_t threshold_bin, @@ -337,11 +340,7 @@ std::string Tree::ToString() const { std::stringstream str_buf; Common::C_stringstream(str_buf); - #if ((defined(sun) || defined(__sun)) && (defined(__SVR4) || defined(__svr4__))) - using CommonLegacy::ArrayToString; // Slower & unsafe regarding locale. - #else using CommonC::ArrayToString; - #endif str_buf << "num_leaves=" << num_leaves_ << '\n'; str_buf << "num_cat=" << num_cat_ << '\n'; @@ -521,6 +520,7 @@ std::string Tree::NodeToJSON(int index) const { std::string Tree::NumericalDecisionIfElse(int node) const { std::stringstream str_buf; Common::C_stringstream(str_buf); + str_buf << std::setprecision(std::numeric_limits::digits10 + 2); uint8_t missing_type = GetMissingType(decision_type_[node]); bool default_left = GetDecisionType(decision_type_[node], kDefaultLeftMask); if (missing_type == MissingType::None @@ -734,6 +734,10 @@ Tree::Tree(const char* str, size_t* used_len) { is_linear_ = false; } + #ifdef USE_CUDA_EXP + is_cuda_tree_ = false; + #endif // USE_CUDA_EXP + if ((num_leaves_ <= 1) && !is_linear_) { return; } diff --git a/src/network/ifaddrs_patch.cpp b/src/network/ifaddrs_patch.cpp deleted file mode 100644 index f7f30a8e4770..000000000000 --- a/src/network/ifaddrs_patch.cpp +++ /dev/null @@ -1,128 +0,0 @@ -/*! -* Copyright (c) 2009, Jay Loden, Dave Daeschler, Giampaolo Rodola. - * Licensed under the BSD 3-Clause License. 
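The new setprecision call in Tree::NumericalDecisionIfElse prints thresholds with digits10 + 2 significant digits, which for IEEE-754 double is 15 + 2 = 17, the same as max_digits10; that is exactly enough for the generated if-else code to parse back to the identical double. A small round-trip check:

#include <iostream>
#include <limits>
#include <sstream>

int main() {
  const double threshold = 0.1 + 0.2;  // 0.30000000000000004, not representable as "0.3"
  std::ostringstream out;
  out.precision(std::numeric_limits<double>::digits10 + 2);  // 17 for double
  out << threshold;
  double parsed = 0.0;
  std::istringstream(out.str()) >> parsed;
  std::cout << (parsed == threshold ? "round-trip ok" : "mismatch") << "\n";
  return 0;
}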
- * See https://github.com/giampaolo/psutil/blob/master/LICENSE - */ -#if (defined(sun) || defined(__sun)) && (defined(__SVR4) || defined(__svr4__)) - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "ifaddrs_patch.h" - -#define MAX(x, y) ((x) > (y)?(x):(y)) -#define SIZE(p) MAX((p).ss_len, sizeof(p)) - - -static struct sockaddr * -sa_dup(struct sockaddr_storage *sa1) { - struct sockaddr *sa2; - size_t sz = sizeof(struct sockaddr_storage); - sa2 = (struct sockaddr *) calloc(1, sz); - memcpy(sa2, sa1, sz); - return(sa2); -} - - -void freeifaddrs(struct ifaddrs *ifp) { - if (NULL == ifp) return; - free(ifp->ifa_name); - free(ifp->ifa_addr); - free(ifp->ifa_netmask); - free(ifp->ifa_dstaddr); - freeifaddrs(ifp->ifa_next); - free(ifp); -} - - -int getifaddrs(struct ifaddrs **ifap) { - int sd = -1; - char *ccp, *ecp; - struct lifconf ifc; - struct lifreq *ifr; - struct lifnum lifn; - struct ifaddrs *cifa = NULL; /* current */ - struct ifaddrs *pifa = NULL; /* previous */ - const size_t IFREQSZ = sizeof(struct lifreq); - - sd = socket(AF_INET, SOCK_STREAM, 0); - if (sd < 0) - goto error; - - ifc.lifc_buf = NULL; - *ifap = NULL; - /* find how much memory to allocate for the SIOCGLIFCONF call */ - lifn.lifn_family = AF_UNSPEC; - lifn.lifn_flags = 0; - if (ioctl(sd, SIOCGLIFNUM, &lifn) < 0) - goto error; - - /* Sun and Apple code likes to pad the interface count here in case interfaces - * are coming up between calls */ - lifn.lifn_count += 4; - - ifc.lifc_family = AF_UNSPEC; - ifc.lifc_len = lifn.lifn_count * sizeof(struct lifreq); - ifc.lifc_buf = static_cast(calloc(1, ifc.lifc_len)); - if (ioctl(sd, SIOCGLIFCONF, &ifc) < 0) - goto error; - - ccp = reinterpret_cast(ifc.lifc_req); - ecp = ccp + ifc.lifc_len; - - while (ccp < ecp) { - ifr = (struct lifreq *) ccp; - cifa = (struct ifaddrs *) calloc(1, sizeof(struct ifaddrs)); - cifa->ifa_next = NULL; - cifa->ifa_name = strdup(ifr->lifr_name); - - if (pifa == NULL) { - *ifap = cifa; /* first one */ - } else { - pifa->ifa_next = cifa; - } - - if (ioctl(sd, SIOCGLIFADDR, ifr, IFREQSZ) < 0) - goto error; - cifa->ifa_addr = sa_dup(&ifr->lifr_addr); - - if (ioctl(sd, SIOCGLIFNETMASK, ifr, IFREQSZ) < 0) - goto error; - cifa->ifa_netmask = sa_dup(&ifr->lifr_addr); - - cifa->ifa_flags = 0; - cifa->ifa_dstaddr = NULL; - - if (0 == ioctl(sd, SIOCGLIFFLAGS, ifr)) /* optional */ - cifa->ifa_flags = ifr->lifr_flags; - - if (ioctl(sd, SIOCGLIFDSTADDR, ifr, IFREQSZ) < 0) { - if (0 == ioctl(sd, SIOCGLIFBRDADDR, ifr, IFREQSZ)) - cifa->ifa_dstaddr = sa_dup(&ifr->lifr_addr); - } else { - cifa->ifa_dstaddr = sa_dup(&ifr->lifr_addr); - } - - pifa = cifa; - ccp += IFREQSZ; - } - free(ifc.lifc_buf); - close(sd); - return 0; -error: - if (ifc.lifc_buf != NULL) - free(ifc.lifc_buf); - if (sd != -1) - close(sd); - freeifaddrs(*ifap); - return (-1); -} - -#endif // (defined(sun) || defined(__sun)) && (defined(__SVR4) || defined(__svr4__)) diff --git a/src/network/ifaddrs_patch.h b/src/network/ifaddrs_patch.h deleted file mode 100644 index 1b9f60d7c4b1..000000000000 --- a/src/network/ifaddrs_patch.h +++ /dev/null @@ -1,34 +0,0 @@ -/*! - * Copyright (c) 2009, Jay Loden, Dave Daeschler, Giampaolo Rodola. - * Licensed under the BSD 3-Clause License. 
- * See https://github.com/giampaolo/psutil/blob/master/LICENSE - */ - -/* - * - https://lists.samba.org/archive/samba-technical/2009-February/063079.html - * - https://github.com/giampaolo/psutil/blob/master/psutil/arch/solaris/v10/ifaddrs.h - */ - -#ifndef LIGHTGBM_NETWORK_IFADDRS_PATCH_H_ -#define LIGHTGBM_NETWORK_IFADDRS_PATCH_H_ - -#include -#include - -#undef ifa_dstaddr -#undef ifa_broadaddr -#define ifa_broadaddr ifa_dstaddr - -struct ifaddrs { - struct ifaddrs *ifa_next; - char *ifa_name; - unsigned int ifa_flags; - struct sockaddr *ifa_addr; - struct sockaddr *ifa_netmask; - struct sockaddr *ifa_dstaddr; -}; - -extern int getifaddrs(struct ifaddrs **); -extern void freeifaddrs(struct ifaddrs *); - -#endif // LIGHTGBM_NETWORK_IFADDRS_PATCH_H_ diff --git a/src/network/linker_topo.cpp b/src/network/linker_topo.cpp index caafa87e7d72..fccfb1e63829 100644 --- a/src/network/linker_topo.cpp +++ b/src/network/linker_topo.cpp @@ -155,7 +155,7 @@ RecursiveHalvingMap RecursiveHalvingMap::Construct(int rank, int num_machines) { rec_map.ranks[i] = next_node_idx; // get receive block information const int recv_block_start = cur_group_idx / distance[i]; - rec_map.recv_block_start[i] = group_block_start[recv_block_start * distance[i]]; + rec_map.recv_block_start[i] = group_block_start[static_cast(recv_block_start) * distance[i]]; int recv_block_len = 0; // accumulate block len for (int j = 0; j < distance[i]; ++j) { @@ -164,7 +164,7 @@ RecursiveHalvingMap RecursiveHalvingMap::Construct(int rank, int num_machines) { rec_map.recv_block_len[i] = recv_block_len; // get send block information const int send_block_start = (cur_group_idx + dir * distance[i]) / distance[i]; - rec_map.send_block_start[i] = group_block_start[send_block_start * distance[i]]; + rec_map.send_block_start[i] = group_block_start[static_cast(send_block_start) * distance[i]]; int send_block_len = 0; // accumulate block len for (int j = 0; j < distance[i]; ++j) { diff --git a/src/network/linkers_socket.cpp b/src/network/linkers_socket.cpp index 4e9c374316cf..03c737559b3a 100644 --- a/src/network/linkers_socket.cpp +++ b/src/network/linkers_socket.cpp @@ -118,7 +118,7 @@ void Linkers::ParseMachineList(const std::string& machines, const std::string& f "Please check machine_list_filename or machines parameter"); } if (client_ips_.size() != static_cast(num_machines_)) { - Log::Warning("World size is larger than the machine_list size, change world size to %d", client_ips_.size()); + Log::Warning("World size is larger than the machine_list size, change world size to %zu", client_ips_.size()); num_machines_ = static_cast(client_ips_.size()); } } diff --git a/src/network/socket_wrapper.hpp b/src/network/socket_wrapper.hpp index caa12c279133..37850cc10edc 100644 --- a/src/network/socket_wrapper.hpp +++ b/src/network/socket_wrapper.hpp @@ -35,12 +35,7 @@ #include #include -// ifaddrs.h is not available on Solaris 10 -#if (defined(sun) || defined(__sun)) && (defined(__SVR4) || defined(__svr4__)) - #include "ifaddrs_patch.h" -#else - #include -#endif +#include #endif // defined(_WIN32) @@ -60,8 +55,8 @@ const int INVALID_SOCKET = -1; #endif #ifdef _WIN32 -#ifndef _MSC_VER -// not using visual studio in windows +// existence of inet_pton is checked in CMakeLists.txt and configure.win, then stored in WIN_HAS_INET_PTON +#ifndef WIN_HAS_INET_PTON inline int inet_pton(int af, const char *src, void *dst) { struct sockaddr_storage ss; int size = sizeof(ss); diff --git a/src/objective/binary_objective.hpp b/src/objective/binary_objective.hpp index 
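The linker_topo.cpp changes promote one operand to size_t before the multiplication, so the index product is computed in 64 bits instead of overflowing int for large block starts. A sketch of the pattern, assuming a 64-bit size_t:

#include <cstddef>

// Cast one operand *before* multiplying; casting the int product afterwards
// would be too late, since the overflow has already happened.
size_t BlockIndex(int block_start, int distance) {
  return static_cast<size_t>(block_start) * distance;
}

int main() {
  // 70000 * 70000 overflows 32-bit int; promoted to size_t it is exact.
  return BlockIndex(70000, 70000) == 4900000000ULL ? 0 : 1;
}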
52be93eeeca1..2f9d93509362 100644 --- a/src/objective/binary_objective.hpp +++ b/src/objective/binary_objective.hpp @@ -34,7 +34,7 @@ class BinaryLogloss: public ObjectiveFunction { } is_pos_ = is_pos; if (is_pos_ == nullptr) { - is_pos_ = [](label_t label) {return label > 0; }; + is_pos_ = [](label_t label) { return label > 0; }; } } diff --git a/src/objective/multiclass_objective.hpp b/src/objective/multiclass_objective.hpp index 88aa0ee040e6..0ee8d359cd67 100644 --- a/src/objective/multiclass_objective.hpp +++ b/src/objective/multiclass_objective.hpp @@ -121,7 +121,7 @@ class MulticlassSoftmax: public ObjectiveFunction { if (label_int_[i] == k) { gradients[idx] = static_cast((p - 1.0f) * weights_[i]); } else { - gradients[idx] = static_cast((p) * weights_[i]); + gradients[idx] = static_cast(p * weights_[i]); } hessians[idx] = static_cast((factor_ * p * (1.0f - p))* weights_[i]); } diff --git a/src/objective/regression_objective.hpp b/src/objective/regression_objective.hpp index e711da012066..d2f988d5b72a 100644 --- a/src/objective/regression_objective.hpp +++ b/src/objective/regression_objective.hpp @@ -24,7 +24,7 @@ namespace LightGBM { for (data_size_t i = 0; i < cnt_data; ++i) { \ ref_data[i] = data_reader(i); \ } \ - const double float_pos = (1.0f - alpha) * cnt_data; \ + const double float_pos = static_cast(1.0 - alpha) * cnt_data; \ const data_size_t pos = static_cast(float_pos); \ if (pos < 1) { \ return ref_data[ArrayArgs::ArgMax(ref_data)]; \ @@ -135,7 +135,7 @@ class RegressionL2loss: public ObjectiveFunction { } else { #pragma omp parallel for schedule(static) for (data_size_t i = 0; i < num_data_; ++i) { - gradients[i] = static_cast((score[i] - label_[i]) * weights_[i]); + gradients[i] = static_cast(static_cast((score[i] - label_[i])) * weights_[i]); hessians[i] = static_cast(weights_[i]); } } @@ -176,7 +176,7 @@ class RegressionL2loss: public ObjectiveFunction { if (weights_ != nullptr) { #pragma omp parallel for schedule(static) reduction(+:suml, sumw) if (!deterministic_) for (data_size_t i = 0; i < num_data_; ++i) { - suml += label_[i] * weights_[i]; + suml += static_cast(label_[i]) * weights_[i]; sumw += weights_[i]; } } else { @@ -330,7 +330,7 @@ class RegressionHuberLoss: public RegressionL2loss { if (std::abs(diff) <= alpha_) { gradients[i] = static_cast(diff * weights_[i]); } else { - gradients[i] = static_cast(Common::Sign(diff) * weights_[i] * alpha_); + gradients[i] = static_cast(Common::Sign(diff) * static_cast(weights_[i]) * alpha_); } hessians[i] = static_cast(weights_[i]); } @@ -439,17 +439,20 @@ class RegressionPoissonLoss: public RegressionL2loss { */ void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override { + double exp_max_delta_step_ = std::exp(max_delta_step_); if (weights_ == nullptr) { #pragma omp parallel for schedule(static) for (data_size_t i = 0; i < num_data_; ++i) { - gradients[i] = static_cast(std::exp(score[i]) - label_[i]); - hessians[i] = static_cast(std::exp(score[i] + max_delta_step_)); + double exp_score = std::exp(score[i]); + gradients[i] = static_cast(exp_score - label_[i]); + hessians[i] = static_cast(exp_score * exp_max_delta_step_); } } else { #pragma omp parallel for schedule(static) for (data_size_t i = 0; i < num_data_; ++i) { - gradients[i] = static_cast((std::exp(score[i]) - label_[i]) * weights_[i]); - hessians[i] = static_cast(std::exp(score[i] + max_delta_step_) * weights_[i]); + double exp_score = std::exp(score[i]); + gradients[i] = static_cast((exp_score - label_[i]) * weights_[i]); + 
hessians[i] = static_cast(exp_score * exp_max_delta_step_ * weights_[i]); } } } @@ -689,14 +692,16 @@ class RegressionGammaLoss : public RegressionPoissonLoss { if (weights_ == nullptr) { #pragma omp parallel for schedule(static) for (data_size_t i = 0; i < num_data_; ++i) { - gradients[i] = static_cast(1.0 - label_[i] * std::exp(-score[i])); - hessians[i] = static_cast(label_[i] * std::exp(-score[i])); + double exp_score = std::exp(-score[i]); + gradients[i] = static_cast(1.0 - label_[i] * exp_score); + hessians[i] = static_cast(label_[i] * exp_score); } } else { #pragma omp parallel for schedule(static) for (data_size_t i = 0; i < num_data_; ++i) { - gradients[i] = static_cast((1.0 - label_[i] * std::exp(-score[i])) * weights_[i]); - hessians[i] = static_cast(label_[i] * std::exp(-score[i]) * weights_[i]); + double exp_score = std::exp(-score[i]); + gradients[i] = static_cast((1.0 - label_[i] * exp_score) * weights_[i]); + hessians[i] = static_cast(label_[i] * exp_score * weights_[i]); } } } @@ -725,16 +730,20 @@ class RegressionTweedieLoss: public RegressionPoissonLoss { if (weights_ == nullptr) { #pragma omp parallel for schedule(static) for (data_size_t i = 0; i < num_data_; ++i) { - gradients[i] = static_cast(-label_[i] * std::exp((1 - rho_) * score[i]) + std::exp((2 - rho_) * score[i])); - hessians[i] = static_cast(-label_[i] * (1 - rho_) * std::exp((1 - rho_) * score[i]) + - (2 - rho_) * std::exp((2 - rho_) * score[i])); + double exp_1_score = std::exp((1 - rho_) * score[i]); + double exp_2_score = std::exp((2 - rho_) * score[i]); + gradients[i] = static_cast(-label_[i] * exp_1_score + exp_2_score); + hessians[i] = static_cast(-label_[i] * (1 - rho_) * exp_1_score + + (2 - rho_) * exp_2_score); } } else { #pragma omp parallel for schedule(static) for (data_size_t i = 0; i < num_data_; ++i) { - gradients[i] = static_cast((-label_[i] * std::exp((1 - rho_) * score[i]) + std::exp((2 - rho_) * score[i])) * weights_[i]); - hessians[i] = static_cast((-label_[i] * (1 - rho_) * std::exp((1 - rho_) * score[i]) + - (2 - rho_) * std::exp((2 - rho_) * score[i])) * weights_[i]); + double exp_1_score = std::exp((1 - rho_) * score[i]); + double exp_2_score = std::exp((2 - rho_) * score[i]); + gradients[i] = static_cast((-label_[i] * exp_1_score + exp_2_score) * weights_[i]); + hessians[i] = static_cast((-label_[i] * (1 - rho_) * exp_1_score + + (2 - rho_) * exp_2_score) * weights_[i]); } } } diff --git a/src/objective/xentropy_objective.hpp b/src/objective/xentropy_objective.hpp index baee5bf991e4..513ccc1c2462 100644 --- a/src/objective/xentropy_objective.hpp +++ b/src/objective/xentropy_objective.hpp @@ -117,7 +117,7 @@ class CrossEntropy: public ObjectiveFunction { #pragma omp parallel for schedule(static) reduction(+:suml, sumw) if (!deterministic_) for (data_size_t i = 0; i < num_data_; ++i) { - suml += label_[i] * weights_[i]; + suml += static_cast(label_[i]) * weights_[i]; sumw += weights_[i]; } } else { @@ -247,7 +247,7 @@ class CrossEntropyLambda: public ObjectiveFunction { #pragma omp parallel for schedule(static) reduction(+:suml, sumw) if (!deterministic_) for (data_size_t i = 0; i < num_data_; ++i) { - suml += label_[i] * weights_[i]; + suml += static_cast(label_[i]) * weights_[i]; sumw += weights_[i]; } } else { diff --git a/src/treelearner/cost_effective_gradient_boosting.hpp b/src/treelearner/cost_effective_gradient_boosting.hpp index 4bc149148c79..4c29deb82de4 100644 --- a/src/treelearner/cost_effective_gradient_boosting.hpp +++ 
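The objective refactors above all hoist exponentials: for the Poisson loss, exp(score + max_delta_step) factors as exp(score) * exp(max_delta_step), so the loop-invariant factor is computed once and exp(score) is shared between gradient and hessian, one std::exp call per row instead of two (the Gamma and Tweedie objectives get the analogous treatment). A sketch of the unweighted Poisson case, with standalone names:

#include <cmath>
#include <cstddef>
#include <vector>

void PoissonGradients(const std::vector<double>& score, const std::vector<float>& label,
                      double max_delta_step,
                      std::vector<float>* gradients, std::vector<float>* hessians) {
  const double exp_max_delta_step = std::exp(max_delta_step);  // loop invariant
  for (size_t i = 0; i < score.size(); ++i) {
    const double exp_score = std::exp(score[i]);  // reused by both outputs
    (*gradients)[i] = static_cast<float>(exp_score - label[i]);
    (*hessians)[i] = static_cast<float>(exp_score * exp_max_delta_step);
  }
}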
b/src/treelearner/cost_effective_gradient_boosting.hpp @@ -10,6 +10,7 @@ #include #include #include +#include #include @@ -32,6 +33,7 @@ class CostEfficientGradientBoosting { return true; } } + void Init() { auto train_data = tree_learner_->train_data_; if (!init_) { @@ -63,7 +65,18 @@ class CostEfficientGradientBoosting { } init_ = true; } - double DetlaGain(int feature_index, int real_fidx, int leaf_index, + + void BeforeTrain() { + // clear the splits in splits_per_leaf_ + Threading::For(0, splits_per_leaf_.size(), 1024, + [this] (int /*thread_index*/, size_t start, size_t end) { + for (size_t i = start; i < end; ++i) { + splits_per_leaf_[i].Reset(); + } + }); + } + + double DeltaGain(int feature_index, int real_fidx, int leaf_index, int num_data_in_leaf, SplitInfo split_info) { auto config = tree_learner_->config_; double delta = @@ -82,6 +95,7 @@ class CostEfficientGradientBoosting { feature_index] = split_info; return delta; } + void UpdateLeafBestSplits(Tree* tree, int best_leaf, const SplitInfo* best_split_info, std::vector* best_split_per_leaf) { diff --git a/src/treelearner/cuda/cuda_best_split_finder.cpp b/src/treelearner/cuda/cuda_best_split_finder.cpp new file mode 100644 index 000000000000..51589a673aa8 --- /dev/null +++ b/src/treelearner/cuda/cuda_best_split_finder.cpp @@ -0,0 +1,369 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. + */ + +#ifdef USE_CUDA_EXP + +#include + +#include "cuda_best_split_finder.hpp" +#include "cuda_leaf_splits.hpp" + +namespace LightGBM { + +CUDABestSplitFinder::CUDABestSplitFinder( + const hist_t* cuda_hist, + const Dataset* train_data, + const std::vector& feature_hist_offsets, + const Config* config): + num_features_(train_data->num_features()), + num_leaves_(config->num_leaves), + feature_hist_offsets_(feature_hist_offsets), + lambda_l1_(config->lambda_l1), + lambda_l2_(config->lambda_l2), + min_data_in_leaf_(config->min_data_in_leaf), + min_sum_hessian_in_leaf_(config->min_sum_hessian_in_leaf), + min_gain_to_split_(config->min_gain_to_split), + cat_smooth_(config->cat_smooth), + cat_l2_(config->cat_l2), + max_cat_threshold_(config->max_cat_threshold), + min_data_per_group_(config->min_data_per_group), + max_cat_to_onehot_(config->max_cat_to_onehot), + extra_trees_(config->extra_trees), + extra_seed_(config->extra_seed), + use_smoothing_(config->path_smooth > 0), + path_smooth_(config->path_smooth), + num_total_bin_(feature_hist_offsets.empty() ? 
diff --git a/src/treelearner/cuda/cuda_best_split_finder.cpp b/src/treelearner/cuda/cuda_best_split_finder.cpp
new file mode 100644
index 000000000000..51589a673aa8
--- /dev/null
+++ b/src/treelearner/cuda/cuda_best_split_finder.cpp
@@ -0,0 +1,369 @@
+/*!
+ * Copyright (c) 2021 Microsoft Corporation. All rights reserved.
+ * Licensed under the MIT License. See LICENSE file in the project root for
+ * license information.
+ */
+
+#ifdef USE_CUDA_EXP
+
+#include <algorithm>
+
+#include "cuda_best_split_finder.hpp"
+#include "cuda_leaf_splits.hpp"
+
+namespace LightGBM {
+
+CUDABestSplitFinder::CUDABestSplitFinder(
+  const hist_t* cuda_hist,
+  const Dataset* train_data,
+  const std::vector<uint32_t>& feature_hist_offsets,
+  const Config* config):
+  num_features_(train_data->num_features()),
+  num_leaves_(config->num_leaves),
+  feature_hist_offsets_(feature_hist_offsets),
+  lambda_l1_(config->lambda_l1),
+  lambda_l2_(config->lambda_l2),
+  min_data_in_leaf_(config->min_data_in_leaf),
+  min_sum_hessian_in_leaf_(config->min_sum_hessian_in_leaf),
+  min_gain_to_split_(config->min_gain_to_split),
+  cat_smooth_(config->cat_smooth),
+  cat_l2_(config->cat_l2),
+  max_cat_threshold_(config->max_cat_threshold),
+  min_data_per_group_(config->min_data_per_group),
+  max_cat_to_onehot_(config->max_cat_to_onehot),
+  extra_trees_(config->extra_trees),
+  extra_seed_(config->extra_seed),
+  use_smoothing_(config->path_smooth > 0),
+  path_smooth_(config->path_smooth),
+  num_total_bin_(feature_hist_offsets.empty() ? 0 : static_cast<int>(feature_hist_offsets.back())),
+  cuda_hist_(cuda_hist) {
+  InitFeatureMetaInfo(train_data);
+  cuda_leaf_best_split_info_ = nullptr;
+  cuda_best_split_info_ = nullptr;
+  cuda_best_split_info_buffer_ = nullptr;
+  cuda_is_feature_used_bytree_ = nullptr;
+}
+
+CUDABestSplitFinder::~CUDABestSplitFinder() {
+  DeallocateCUDAMemory<CUDASplitInfo>(&cuda_leaf_best_split_info_, __FILE__, __LINE__);
+  DeallocateCUDAMemory<CUDASplitInfo>(&cuda_best_split_info_, __FILE__, __LINE__);
+  DeallocateCUDAMemory<int>(&cuda_best_split_info_buffer_, __FILE__, __LINE__);
+  cuda_split_find_tasks_.Clear();
+  DeallocateCUDAMemory<int8_t>(&cuda_is_feature_used_bytree_, __FILE__, __LINE__);
+  gpuAssert(cudaStreamDestroy(cuda_streams_[0]), __FILE__, __LINE__);
+  gpuAssert(cudaStreamDestroy(cuda_streams_[1]), __FILE__, __LINE__);
+  cuda_streams_.clear();
+  cuda_streams_.shrink_to_fit();
+}
+
+void CUDABestSplitFinder::InitFeatureMetaInfo(const Dataset* train_data) {
+  feature_missing_type_.resize(num_features_);
+  feature_mfb_offsets_.resize(num_features_);
+  feature_default_bins_.resize(num_features_);
+  feature_num_bins_.resize(num_features_);
+  max_num_bin_in_feature_ = 0;
+  has_categorical_feature_ = false;
+  max_num_categorical_bin_ = 0;
+  is_categorical_.resize(train_data->num_features(), 0);
+  for (int inner_feature_index = 0; inner_feature_index < num_features_; ++inner_feature_index) {
+    const BinMapper* bin_mapper = train_data->FeatureBinMapper(inner_feature_index);
+    if (bin_mapper->bin_type() == BinType::CategoricalBin) {
+      has_categorical_feature_ = true;
+      is_categorical_[inner_feature_index] = 1;
+      if (bin_mapper->num_bin() > max_num_categorical_bin_) {
+        max_num_categorical_bin_ = bin_mapper->num_bin();
+      }
+    }
+    const MissingType missing_type = bin_mapper->missing_type();
+    feature_missing_type_[inner_feature_index] = missing_type;
+    feature_mfb_offsets_[inner_feature_index] = static_cast<uint8_t>(bin_mapper->GetMostFreqBin() == 0);
+    feature_default_bins_[inner_feature_index] = bin_mapper->GetDefaultBin();
+    feature_num_bins_[inner_feature_index] = static_cast<uint32_t>(bin_mapper->num_bin());
+    const int num_bin_hist = bin_mapper->num_bin() - feature_mfb_offsets_[inner_feature_index];
+    if (num_bin_hist > max_num_bin_in_feature_) {
+      max_num_bin_in_feature_ = num_bin_hist;
+    }
+  }
+  if (max_num_bin_in_feature_ > NUM_THREADS_PER_BLOCK_BEST_SPLIT_FINDER) {
+    use_global_memory_ = true;
+  } else {
+    use_global_memory_ = false;
+  }
+}
+
+void CUDABestSplitFinder::Init() {
+  InitCUDAFeatureMetaInfo();
+  cuda_streams_.resize(2);
+  CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_streams_[0]));
+  CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_streams_[1]));
+  AllocateCUDAMemory<int>(&cuda_best_split_info_buffer_, 8, __FILE__, __LINE__);
+  if (use_global_memory_) {
+    AllocateCUDAMemory<hist_t>(&cuda_feature_hist_grad_buffer_, static_cast<size_t>(num_total_bin_), __FILE__, __LINE__);
+    AllocateCUDAMemory<hist_t>(&cuda_feature_hist_hess_buffer_, static_cast<size_t>(num_total_bin_), __FILE__, __LINE__);
+    if (has_categorical_feature_) {
+      AllocateCUDAMemory<hist_t>(&cuda_feature_hist_stat_buffer_, static_cast<size_t>(num_total_bin_), __FILE__, __LINE__);
+      AllocateCUDAMemory<data_size_t>(&cuda_feature_hist_index_buffer_, static_cast<size_t>(num_total_bin_), __FILE__, __LINE__);
+    }
+  }
+}
+
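InitCUDAFeatureMetaInfo() below expands every feature into one or two SplitFindTask entries; a task is a single pass over one feature's histogram, and numerical features with missing values get both a forward and a reverse pass. The struct sketch here is reconstructed from the fields assigned in that loop for orientation only; the authoritative definition lives in cuda_best_split_finder.hpp and its member types may differ.

// Orientation-only sketch of the per-pass task descriptor; reconstructed from
// the assignments in InitCUDAFeatureMetaInfo(), so field types are assumptions.
#include <cstdint>

struct SplitFindTask {
  int inner_feature_index;       // which feature's histogram this pass scans
  bool reverse;                  // scan bins right-to-left instead of left-to-right
  bool skip_default_bin;         // skip the default bin (MissingType::Zero)
  bool na_as_missing;            // fold NaN handling into the scan (MissingType::NaN)
  bool assume_out_default_left;  // send missing values to the left child
  bool is_categorical;           // categorical feature
  bool is_one_hot;               // one-vs-rest categorical split (num_bin <= max_cat_to_onehot)
  uint32_t hist_offset;          // offset of this feature inside the histogram buffer
  uint8_t mfb_offset;            // 1 when the most frequent bin is bin 0 and is not materialized
  uint32_t default_bin;          // bin index holding the feature's default value
  uint32_t num_bin;              // number of bins of this feature
};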
+void CUDABestSplitFinder::InitCUDAFeatureMetaInfo() {
+  AllocateCUDAMemory<int8_t>(&cuda_is_feature_used_bytree_, static_cast<size_t>(num_features_), __FILE__, __LINE__);
+
+  // initialize split find task information (a split find task is one pass through the histogram of a feature)
+  num_tasks_ = 0;
+  for (int inner_feature_index = 0; inner_feature_index < num_features_; ++inner_feature_index) {
+    const uint32_t num_bin = feature_num_bins_[inner_feature_index];
+    const MissingType missing_type = feature_missing_type_[inner_feature_index];
+    if (num_bin > 2 && missing_type != MissingType::None && !is_categorical_[inner_feature_index]) {
+      num_tasks_ += 2;
+    } else {
+      ++num_tasks_;
+    }
+  }
+  split_find_tasks_.resize(num_tasks_);
+  split_find_tasks_.shrink_to_fit();
+  int cur_task_index = 0;
+  for (int inner_feature_index = 0; inner_feature_index < num_features_; ++inner_feature_index) {
+    const uint32_t num_bin = feature_num_bins_[inner_feature_index];
+    const MissingType missing_type = feature_missing_type_[inner_feature_index];
+    if (num_bin > 2 && missing_type != MissingType::None && !is_categorical_[inner_feature_index]) {
+      if (missing_type == MissingType::Zero) {
+        SplitFindTask* new_task = &split_find_tasks_[cur_task_index];
+        new_task->reverse = false;
+        new_task->skip_default_bin = true;
+        new_task->na_as_missing = false;
+        new_task->inner_feature_index = inner_feature_index;
+        new_task->assume_out_default_left = false;
+        new_task->is_categorical = false;
+        uint32_t num_bin = feature_num_bins_[inner_feature_index];
+        new_task->is_one_hot = false;
+        new_task->hist_offset = feature_hist_offsets_[inner_feature_index];
+        new_task->mfb_offset = feature_mfb_offsets_[inner_feature_index];
+        new_task->default_bin = feature_default_bins_[inner_feature_index];
+        new_task->num_bin = num_bin;
+        ++cur_task_index;
+
+        new_task = &split_find_tasks_[cur_task_index];
+        new_task->reverse = true;
+        new_task->skip_default_bin = true;
+        new_task->na_as_missing = false;
+        new_task->inner_feature_index = inner_feature_index;
+        new_task->assume_out_default_left = true;
+        new_task->is_categorical = false;
+        num_bin = feature_num_bins_[inner_feature_index];
+        new_task->is_one_hot = false;
+        new_task->hist_offset = feature_hist_offsets_[inner_feature_index];
+        new_task->default_bin = feature_default_bins_[inner_feature_index];
+        new_task->mfb_offset = feature_mfb_offsets_[inner_feature_index];
+        new_task->num_bin = num_bin;
+        ++cur_task_index;
+      } else {
+        SplitFindTask* new_task = &split_find_tasks_[cur_task_index];
+        new_task->reverse = false;
+        new_task->skip_default_bin = false;
+        new_task->na_as_missing = true;
+        new_task->inner_feature_index = inner_feature_index;
+        new_task->assume_out_default_left = false;
+        new_task->is_categorical = false;
+        uint32_t num_bin = feature_num_bins_[inner_feature_index];
+        new_task->is_one_hot = false;
+        new_task->hist_offset = feature_hist_offsets_[inner_feature_index];
+        new_task->mfb_offset = feature_mfb_offsets_[inner_feature_index];
+        new_task->default_bin = feature_default_bins_[inner_feature_index];
+        new_task->num_bin = num_bin;
+        ++cur_task_index;
+
+        new_task = &split_find_tasks_[cur_task_index];
+        new_task->reverse = true;
+        new_task->skip_default_bin = false;
+        new_task->na_as_missing = true;
+        new_task->inner_feature_index = inner_feature_index;
+        new_task->assume_out_default_left = true;
+        new_task->is_categorical = false;
+        num_bin = feature_num_bins_[inner_feature_index];
+        new_task->is_one_hot = false;
+        new_task->hist_offset = feature_hist_offsets_[inner_feature_index];
+        new_task->mfb_offset = feature_mfb_offsets_[inner_feature_index];
+        new_task->default_bin = feature_default_bins_[inner_feature_index];
+        new_task->num_bin = num_bin;
+        ++cur_task_index;
+      }
+    } else {
+      SplitFindTask& new_task = split_find_tasks_[cur_task_index];
+      const uint32_t num_bin = feature_num_bins_[inner_feature_index];
+      if (is_categorical_[inner_feature_index]) {
+        new_task.reverse = false;
+        new_task.is_categorical = true;
+        new_task.is_one_hot = (static_cast<int>(num_bin) <= max_cat_to_onehot_);
+      } else {
+        new_task.reverse = true;
+        new_task.is_categorical = false;
+        new_task.is_one_hot = false;
+      }
+      new_task.skip_default_bin = false;
+      new_task.na_as_missing = false;
+      new_task.inner_feature_index = inner_feature_index;
+      if (missing_type != MissingType::NaN && !is_categorical_[inner_feature_index]) {
+        new_task.assume_out_default_left = true;
+      } else {
+        new_task.assume_out_default_left = false;
+      }
+      new_task.hist_offset = feature_hist_offsets_[inner_feature_index];
+      new_task.mfb_offset = feature_mfb_offsets_[inner_feature_index];
+      new_task.default_bin = feature_default_bins_[inner_feature_index];
+      new_task.num_bin = num_bin;
+      ++cur_task_index;
+    }
+  }
+  CHECK_EQ(cur_task_index, static_cast<int>(split_find_tasks_.size()));
+
+  if (extra_trees_) {
+    cuda_randoms_.Resize(num_tasks_ * 2);
+    LaunchInitCUDARandomKernel();
+  }
+
+  const int num_task_blocks = (num_tasks_ + NUM_TASKS_PER_SYNC_BLOCK - 1) / NUM_TASKS_PER_SYNC_BLOCK;
+  const size_t cuda_best_leaf_split_info_buffer_size = static_cast<size_t>(num_task_blocks) * static_cast<size_t>(num_leaves_);
+
+  AllocateCUDAMemory<CUDASplitInfo>(&cuda_leaf_best_split_info_,
+                                    cuda_best_leaf_split_info_buffer_size,
+                                    __FILE__,
+                                    __LINE__);
+
+  cuda_split_find_tasks_.Resize(num_tasks_);
+  CopyFromHostToCUDADevice<SplitFindTask>(cuda_split_find_tasks_.RawData(),
+                                          split_find_tasks_.data(),
+                                          split_find_tasks_.size(),
+                                          __FILE__,
+                                          __LINE__);
+
+  const size_t output_buffer_size = 2 * static_cast<size_t>(num_tasks_);
+  AllocateCUDAMemory<CUDASplitInfo>(&cuda_best_split_info_, output_buffer_size, __FILE__, __LINE__);
+
+  max_num_categories_in_split_ = std::min(max_cat_threshold_, max_num_categorical_bin_ / 2);
+  AllocateCUDAMemory<uint32_t>(&cuda_cat_threshold_feature_, max_num_categories_in_split_ * output_buffer_size, __FILE__, __LINE__);
+  AllocateCUDAMemory<int>(&cuda_cat_threshold_real_feature_, max_num_categories_in_split_ * output_buffer_size, __FILE__, __LINE__);
+  AllocateCUDAMemory<uint32_t>(&cuda_cat_threshold_leaf_, max_num_categories_in_split_ * cuda_best_leaf_split_info_buffer_size, __FILE__, __LINE__);
+  AllocateCUDAMemory<int>(&cuda_cat_threshold_real_leaf_, max_num_categories_in_split_ * cuda_best_leaf_split_info_buffer_size, __FILE__, __LINE__);
+  AllocateCatVectors(cuda_leaf_best_split_info_, cuda_cat_threshold_leaf_, cuda_cat_threshold_real_leaf_, cuda_best_leaf_split_info_buffer_size);
+  AllocateCatVectors(cuda_best_split_info_, cuda_cat_threshold_feature_, cuda_cat_threshold_real_feature_, output_buffer_size);
+}
+
+void CUDABestSplitFinder::ResetTrainingData(
+  const hist_t* cuda_hist,
+  const Dataset* train_data,
+  const std::vector<uint32_t>& feature_hist_offsets) {
+  cuda_hist_ = cuda_hist;
+  num_features_ = train_data->num_features();
+  feature_hist_offsets_ = feature_hist_offsets;
+  InitFeatureMetaInfo(train_data);
+  DeallocateCUDAMemory<int8_t>(&cuda_is_feature_used_bytree_, __FILE__, __LINE__);
+  DeallocateCUDAMemory<CUDASplitInfo>(&cuda_best_split_info_, __FILE__, __LINE__);
+  InitCUDAFeatureMetaInfo();
+}
+
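ResetConfig() below re-derives the same buffer sizes after a config change, so the arithmetic is worth pinning down once. A standalone check with made-up values (kTasksPerSyncBlock stands in for the real compile-time constant NUM_TASKS_PER_SYNC_BLOCK):

// Standalone check of the sizing arithmetic used above and below;
// all parameter values here are made up for illustration.
#include <algorithm>
#include <cstdio>

int main() {
  const int kTasksPerSyncBlock = 1024;   // stand-in for NUM_TASKS_PER_SYNC_BLOCK
  const int num_tasks = 1500;
  const int num_leaves = 31;
  // integer ceil-division: ceil(1500 / 1024) == 2
  const int num_task_blocks = (num_tasks + kTasksPerSyncBlock - 1) / kTasksPerSyncBlock;
  // one CUDASplitInfo slot per (sync block, leaf) pair: 2 * 31 == 62
  const size_t leaf_buffer_size =
      static_cast<size_t>(num_task_blocks) * static_cast<size_t>(num_leaves);
  // categories per split capped by max_cat_threshold and by half the bins:
  // min(32, 12 / 2) == 6
  const int max_cat_threshold = 32;
  const int max_num_categorical_bin = 12;
  const int max_num_categories_in_split =
      std::min(max_cat_threshold, max_num_categorical_bin / 2);
  std::printf("%d %zu %d\n", num_task_blocks, leaf_buffer_size, max_num_categories_in_split);
  return 0;
}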
+void CUDABestSplitFinder::ResetConfig(const Config* config, const hist_t* cuda_hist) {
+  num_leaves_ = config->num_leaves;
+  lambda_l1_ = config->lambda_l1;
+  lambda_l2_ = config->lambda_l2;
+  min_data_in_leaf_ = config->min_data_in_leaf;
+  min_sum_hessian_in_leaf_ = config->min_sum_hessian_in_leaf;
+  min_gain_to_split_ = config->min_gain_to_split;
+  cat_smooth_ = config->cat_smooth;
+  cat_l2_ = config->cat_l2;
+  max_cat_threshold_ = config->max_cat_threshold;
+  min_data_per_group_ = config->min_data_per_group;
+  max_cat_to_onehot_ = config->max_cat_to_onehot;
+  extra_trees_ = config->extra_trees;
+  extra_seed_ = config->extra_seed;
+  use_smoothing_ = (config->path_smooth > 0.0f);
+  path_smooth_ = config->path_smooth;
+  cuda_hist_ = cuda_hist;
+
+  const int num_task_blocks = (num_tasks_ + NUM_TASKS_PER_SYNC_BLOCK - 1) / NUM_TASKS_PER_SYNC_BLOCK;
+  size_t cuda_best_leaf_split_info_buffer_size = static_cast<size_t>(num_task_blocks) * static_cast<size_t>(num_leaves_);
+  DeallocateCUDAMemory<CUDASplitInfo>(&cuda_leaf_best_split_info_, __FILE__, __LINE__);
+  AllocateCUDAMemory<CUDASplitInfo>(&cuda_leaf_best_split_info_,
+                                    cuda_best_leaf_split_info_buffer_size,
+                                    __FILE__,
+                                    __LINE__);
+  max_num_categories_in_split_ = std::min(max_cat_threshold_, max_num_categorical_bin_ / 2);
+  size_t total_cat_threshold_size = max_num_categories_in_split_ * cuda_best_leaf_split_info_buffer_size;
+  DeallocateCUDAMemory<uint32_t>(&cuda_cat_threshold_leaf_, __FILE__, __LINE__);
+  DeallocateCUDAMemory<int>(&cuda_cat_threshold_real_leaf_, __FILE__, __LINE__);
+  AllocateCUDAMemory<uint32_t>(&cuda_cat_threshold_leaf_, total_cat_threshold_size, __FILE__, __LINE__);
+  AllocateCUDAMemory<int>(&cuda_cat_threshold_real_leaf_, total_cat_threshold_size, __FILE__, __LINE__);
+  AllocateCatVectors(cuda_leaf_best_split_info_, cuda_cat_threshold_leaf_, cuda_cat_threshold_real_leaf_, cuda_best_leaf_split_info_buffer_size);
+
+  cuda_best_leaf_split_info_buffer_size = 2 * static_cast<size_t>(num_tasks_);
+  total_cat_threshold_size = max_num_categories_in_split_ * cuda_best_leaf_split_info_buffer_size;
+  DeallocateCUDAMemory<uint32_t>(&cuda_cat_threshold_feature_, __FILE__, __LINE__);
+  DeallocateCUDAMemory<int>(&cuda_cat_threshold_real_feature_, __FILE__, __LINE__);
+  AllocateCUDAMemory<uint32_t>(&cuda_cat_threshold_feature_, total_cat_threshold_size, __FILE__, __LINE__);
+  AllocateCUDAMemory<int>(&cuda_cat_threshold_real_feature_, total_cat_threshold_size, __FILE__, __LINE__);
+  AllocateCatVectors(cuda_best_split_info_, cuda_cat_threshold_feature_, cuda_cat_threshold_real_feature_, cuda_best_leaf_split_info_buffer_size);
+}
+
+void CUDABestSplitFinder::BeforeTrain(const std::vector<int8_t>& is_feature_used_bytree) {
+  CopyFromHostToCUDADevice<int8_t>(cuda_is_feature_used_bytree_,
+                                   is_feature_used_bytree.data(),
+                                   is_feature_used_bytree.size(), __FILE__, __LINE__);
+}
+
+void CUDABestSplitFinder::FindBestSplitsForLeaf(
+  const CUDALeafSplitsStruct* smaller_leaf_splits,
+  const CUDALeafSplitsStruct* larger_leaf_splits,
+  const int smaller_leaf_index,
+  const int larger_leaf_index,
+  const data_size_t num_data_in_smaller_leaf,
+  const data_size_t num_data_in_larger_leaf,
+  const double sum_hessians_in_smaller_leaf,
+  const double sum_hessians_in_larger_leaf) {
+  const bool is_smaller_leaf_valid = (num_data_in_smaller_leaf > min_data_in_leaf_ &&
+                                      sum_hessians_in_smaller_leaf > min_sum_hessian_in_leaf_);
+  const bool is_larger_leaf_valid = (num_data_in_larger_leaf > min_data_in_leaf_ &&
+                                     sum_hessians_in_larger_leaf > min_sum_hessian_in_leaf_ && larger_leaf_index >= 0);
+  LaunchFindBestSplitsForLeafKernel(smaller_leaf_splits, larger_leaf_splits,
+                                    smaller_leaf_index, larger_leaf_index, is_smaller_leaf_valid, is_larger_leaf_valid);
+  global_timer.Start("CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel");
+  LaunchSyncBestSplitForLeafKernel(smaller_leaf_index, larger_leaf_index, is_smaller_leaf_valid, is_larger_leaf_valid);
+  SynchronizeCUDADevice(__FILE__, __LINE__);
+  global_timer.Stop("CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel");
+}
+
+const CUDASplitInfo* CUDABestSplitFinder::FindBestFromAllSplits(
+
const int cur_num_leaves, + const int smaller_leaf_index, + const int larger_leaf_index, + int* smaller_leaf_best_split_feature, + uint32_t* smaller_leaf_best_split_threshold, + uint8_t* smaller_leaf_best_split_default_left, + int* larger_leaf_best_split_feature, + uint32_t* larger_leaf_best_split_threshold, + uint8_t* larger_leaf_best_split_default_left, + int* best_leaf_index, + int* num_cat_threshold) { + LaunchFindBestFromAllSplitsKernel( + cur_num_leaves, + smaller_leaf_index, + larger_leaf_index, + smaller_leaf_best_split_feature, + smaller_leaf_best_split_threshold, + smaller_leaf_best_split_default_left, + larger_leaf_best_split_feature, + larger_leaf_best_split_threshold, + larger_leaf_best_split_default_left, + best_leaf_index, + num_cat_threshold); + SynchronizeCUDADevice(__FILE__, __LINE__); + return cuda_leaf_best_split_info_ + (*best_leaf_index); +} + +void CUDABestSplitFinder::AllocateCatVectors(CUDASplitInfo* cuda_split_infos, uint32_t* cat_threshold_vec, int* cat_threshold_real_vec, size_t len) { + LaunchAllocateCatVectorsKernel(cuda_split_infos, cat_threshold_vec, cat_threshold_real_vec, len); +} + +} // namespace LightGBM + +#endif // USE_CUDA_EXP diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu new file mode 100644 index 000000000000..e11fe436a320 --- /dev/null +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -0,0 +1,1800 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. + */ + +#ifdef USE_CUDA_EXP + +#include + +#include +#include "cuda_best_split_finder.hpp" + +namespace LightGBM { + +__device__ void ReduceBestGainWarp(double gain, bool found, uint32_t thread_index, double* out_gain, bool* out_found, uint32_t* out_thread_index) { + const uint32_t mask = 0xffffffff; + const uint32_t warpLane = threadIdx.x % warpSize; + for (uint32_t offset = warpSize / 2; offset > 0; offset >>= 1) { + const bool other_found = __shfl_down_sync(mask, found, offset); + const double other_gain = __shfl_down_sync(mask, gain, offset); + const uint32_t other_thread_index = __shfl_down_sync(mask, thread_index, offset); + if ((other_found && found && other_gain > gain) || (!found && other_found)) { + found = other_found; + gain = other_gain; + thread_index = other_thread_index; + } + } + if (warpLane == 0) { + *out_gain = gain; + *out_found = found; + *out_thread_index = thread_index; + } +} + +__device__ uint32_t ReduceBestGainBlock(double gain, bool found, uint32_t thread_index) { + const uint32_t mask = 0xffffffff; + for (uint32_t offset = warpSize / 2; offset > 0; offset >>= 1) { + const bool other_found = __shfl_down_sync(mask, found, offset); + const double other_gain = __shfl_down_sync(mask, gain, offset); + const uint32_t other_thread_index = __shfl_down_sync(mask, thread_index, offset); + if ((other_found && found && other_gain > gain) || (!found && other_found)) { + found = other_found; + gain = other_gain; + thread_index = other_thread_index; + } + } + return thread_index; +} + +__device__ uint32_t ReduceBestGain(double gain, bool found, uint32_t thread_index, + double* shared_gain_buffer, bool* shared_found_buffer, uint32_t* shared_thread_index_buffer) { + const uint32_t warpID = threadIdx.x / warpSize; + const uint32_t warpLane = threadIdx.x % warpSize; + const uint32_t num_warp = blockDim.x / warpSize; + ReduceBestGainWarp(gain, found, thread_index, shared_gain_buffer + warpID, 
shared_found_buffer + warpID, shared_thread_index_buffer + warpID); + __syncthreads(); + if (warpID == 0) { + gain = warpLane < num_warp ? shared_gain_buffer[warpLane] : kMinScore; + found = warpLane < num_warp ? shared_found_buffer[warpLane] : false; + thread_index = warpLane < num_warp ? shared_thread_index_buffer[warpLane] : 0; + thread_index = ReduceBestGainBlock(gain, found, thread_index); + } + return thread_index; +} + +__device__ void ReduceBestGainForLeaves(double* gain, int* leaves, int cuda_cur_num_leaves) { + const unsigned int tid = threadIdx.x; + for (unsigned int s = 1; s < cuda_cur_num_leaves; s *= 2) { + if (tid % (2 * s) == 0 && (tid + s) < cuda_cur_num_leaves) { + const uint32_t tid_s = tid + s; + if ((leaves[tid] == -1 && leaves[tid_s] != -1) || (leaves[tid] != -1 && leaves[tid_s] != -1 && gain[tid_s] > gain[tid])) { + gain[tid] = gain[tid_s]; + leaves[tid] = leaves[tid_s]; + } + } + __syncthreads(); + } +} + +__device__ void ReduceBestGainForLeavesWarp(double gain, int leaf_index, double* out_gain, int* out_leaf_index) { + const uint32_t mask = 0xffffffff; + const uint32_t warpLane = threadIdx.x % warpSize; + for (uint32_t offset = warpSize / 2; offset > 0; offset >>= 1) { + const int other_leaf_index = __shfl_down_sync(mask, leaf_index, offset); + const double other_gain = __shfl_down_sync(mask, gain, offset); + if ((leaf_index != -1 && other_leaf_index != -1 && other_gain > gain) || (leaf_index == -1 && other_leaf_index != -1)) { + gain = other_gain; + leaf_index = other_leaf_index; + } + } + if (warpLane == 0) { + *out_gain = gain; + *out_leaf_index = leaf_index; + } +} + +__device__ int ReduceBestGainForLeavesBlock(double gain, int leaf_index) { + const uint32_t mask = 0xffffffff; + for (uint32_t offset = warpSize / 2; offset > 0; offset >>= 1) { + const int other_leaf_index = __shfl_down_sync(mask, leaf_index, offset); + const double other_gain = __shfl_down_sync(mask, gain, offset); + if ((leaf_index != -1 && other_leaf_index != -1 && other_gain > gain) || (leaf_index == -1 && other_leaf_index != -1)) { + gain = other_gain; + leaf_index = other_leaf_index; + } + } + return leaf_index; +} + +__device__ int ReduceBestGainForLeaves(double gain, int leaf_index, double* shared_gain_buffer, int* shared_leaf_index_buffer) { + const uint32_t warpID = threadIdx.x / warpSize; + const uint32_t warpLane = threadIdx.x % warpSize; + const uint32_t num_warp = blockDim.x / warpSize; + ReduceBestGainForLeavesWarp(gain, leaf_index, shared_gain_buffer + warpID, shared_leaf_index_buffer + warpID); + __syncthreads(); + if (warpID == 0) { + gain = warpLane < num_warp ? shared_gain_buffer[warpLane] : kMinScore; + leaf_index = warpLane < num_warp ? 
shared_leaf_index_buffer[warpLane] : -1; + leaf_index = ReduceBestGainForLeavesBlock(gain, leaf_index); + } + return leaf_index; +} + +template +__device__ void FindBestSplitsForLeafKernelInner( + // input feature information + const hist_t* feature_hist_ptr, + // input task information + const SplitFindTask* task, + CUDARandom* cuda_random, + // input config parameter values + const double lambda_l1, + const double lambda_l2, + const double path_smooth, + const data_size_t min_data_in_leaf, + const double min_sum_hessian_in_leaf, + const double min_gain_to_split, + // input parent node information + const double parent_gain, + const double sum_gradients, + const double sum_hessians, + const data_size_t num_data, + const double parent_output, + // output parameters + CUDASplitInfo* cuda_best_split_info) { + const double cnt_factor = num_data / sum_hessians; + const double min_gain_shift = parent_gain + min_gain_to_split; + + cuda_best_split_info->is_valid = false; + + hist_t local_grad_hist = 0.0f; + hist_t local_hess_hist = 0.0f; + double local_gain = 0.0f; + bool threshold_found = false; + uint32_t threshold_value = 0; + __shared__ int rand_threshold; + if (USE_RAND && threadIdx.x == 0) { + if (task->num_bin - 2 > 0) { + rand_threshold = cuda_random->NextInt(0, task->num_bin - 2); + } + } + __shared__ uint32_t best_thread_index; + __shared__ double shared_double_buffer[32]; + __shared__ bool shared_bool_buffer[32]; + __shared__ uint32_t shared_int_buffer[32]; + const unsigned int threadIdx_x = threadIdx.x; + const bool skip_sum = REVERSE ? + (task->skip_default_bin && (task->num_bin - 1 - threadIdx_x) == static_cast(task->default_bin)) : + (task->skip_default_bin && (threadIdx_x + task->mfb_offset) == static_cast(task->default_bin)); + const uint32_t feature_num_bin_minus_offset = task->num_bin - task->mfb_offset; + if (!REVERSE) { + if (task->na_as_missing && task->mfb_offset == 1) { + if (threadIdx_x < static_cast(task->num_bin) && threadIdx_x > 0) { + const unsigned int bin_offset = (threadIdx_x - 1) << 1; + local_grad_hist = feature_hist_ptr[bin_offset]; + local_hess_hist = feature_hist_ptr[bin_offset + 1]; + } + } else { + if (threadIdx_x < feature_num_bin_minus_offset && !skip_sum) { + const unsigned int bin_offset = threadIdx_x << 1; + local_grad_hist = feature_hist_ptr[bin_offset]; + local_hess_hist = feature_hist_ptr[bin_offset + 1]; + } + } + } else { + if (threadIdx_x >= static_cast(task->na_as_missing) && + threadIdx_x < feature_num_bin_minus_offset && !skip_sum) { + const unsigned int read_index = feature_num_bin_minus_offset - 1 - threadIdx_x; + const unsigned int bin_offset = read_index << 1; + local_grad_hist = feature_hist_ptr[bin_offset]; + local_hess_hist = feature_hist_ptr[bin_offset + 1]; + } + } + __syncthreads(); + if (!REVERSE && task->na_as_missing && task->mfb_offset == 1) { + const hist_t sum_gradients_non_default = ShuffleReduceSum(local_grad_hist, shared_double_buffer, blockDim.x); + __syncthreads(); + const hist_t sum_hessians_non_default = ShuffleReduceSum(local_hess_hist, shared_double_buffer, blockDim.x); + if (threadIdx_x == 0) { + local_grad_hist += (sum_gradients - sum_gradients_non_default); + local_hess_hist += (sum_hessians - sum_hessians_non_default); + } + } + if (threadIdx_x == 0) { + local_hess_hist += kEpsilon; + } + local_gain = kMinScore; + local_grad_hist = ShufflePrefixSum(local_grad_hist, shared_double_buffer); + __syncthreads(); + local_hess_hist = ShufflePrefixSum(local_hess_hist, shared_double_buffer); + if (REVERSE) { + if (threadIdx_x 
>= static_cast(task->na_as_missing) && threadIdx_x <= task->num_bin - 2 && !skip_sum) { + const double sum_right_gradient = local_grad_hist; + const double sum_right_hessian = local_hess_hist; + const data_size_t right_count = static_cast(__double2int_rn(sum_right_hessian * cnt_factor)); + const double sum_left_gradient = sum_gradients - sum_right_gradient; + const double sum_left_hessian = sum_hessians - sum_right_hessian; + const data_size_t left_count = num_data - right_count; + if (sum_left_hessian >= min_sum_hessian_in_leaf && left_count >= min_data_in_leaf && + sum_right_hessian >= min_sum_hessian_in_leaf && right_count >= min_data_in_leaf && + (!USE_RAND || static_cast(task->num_bin - 2 - threadIdx_x) == rand_threshold)) { + double current_gain = CUDALeafSplits::GetSplitGains( + sum_left_gradient, sum_left_hessian, sum_right_gradient, + sum_right_hessian, lambda_l1, + lambda_l2, path_smooth, left_count, right_count, parent_output); + // gain with split is worse than without split + if (current_gain > min_gain_shift) { + local_gain = current_gain - min_gain_shift; + threshold_value = static_cast(task->num_bin - 2 - threadIdx_x); + threshold_found = true; + } + } + } + } else { + const uint32_t end = (task->na_as_missing && task->mfb_offset == 1) ? static_cast(task->num_bin - 2) : feature_num_bin_minus_offset - 2; + if (threadIdx_x <= end && !skip_sum) { + const double sum_left_gradient = local_grad_hist; + const double sum_left_hessian = local_hess_hist; + const data_size_t left_count = static_cast(__double2int_rn(sum_left_hessian * cnt_factor)); + const double sum_right_gradient = sum_gradients - sum_left_gradient; + const double sum_right_hessian = sum_hessians - sum_left_hessian; + const data_size_t right_count = num_data - left_count; + if (sum_left_hessian >= min_sum_hessian_in_leaf && left_count >= min_data_in_leaf && + sum_right_hessian >= min_sum_hessian_in_leaf && right_count >= min_data_in_leaf && + (!USE_RAND || static_cast(threadIdx_x + task->mfb_offset) == rand_threshold)) { + double current_gain = CUDALeafSplits::GetSplitGains( + sum_left_gradient, sum_left_hessian, sum_right_gradient, + sum_right_hessian, lambda_l1, + lambda_l2, path_smooth, left_count, right_count, parent_output); + // gain with split is worse than without split + if (current_gain > min_gain_shift) { + local_gain = current_gain - min_gain_shift; + threshold_value = (task->na_as_missing && task->mfb_offset == 1) ? 
+ static_cast(threadIdx_x) : + static_cast(threadIdx_x + task->mfb_offset); + threshold_found = true; + } + } + } + } + __syncthreads(); + const uint32_t result = ReduceBestGain(local_gain, threshold_found, threadIdx_x, shared_double_buffer, shared_bool_buffer, shared_int_buffer); + if (threadIdx_x == 0) { + best_thread_index = result; + } + __syncthreads(); + if (threshold_found && threadIdx_x == best_thread_index) { + cuda_best_split_info->is_valid = true; + cuda_best_split_info->threshold = threshold_value; + cuda_best_split_info->gain = local_gain; + cuda_best_split_info->default_left = task->assume_out_default_left; + if (REVERSE) { + const double sum_right_gradient = local_grad_hist; + const double sum_right_hessian = local_hess_hist - kEpsilon; + const data_size_t right_count = static_cast(__double2int_rn(sum_right_hessian * cnt_factor)); + const double sum_left_gradient = sum_gradients - sum_right_gradient; + const double sum_left_hessian = sum_hessians - sum_right_hessian - kEpsilon; + const data_size_t left_count = num_data - right_count; + const double left_output = CUDALeafSplits::CalculateSplittedLeafOutput(sum_left_gradient, + sum_left_hessian, lambda_l1, lambda_l2, path_smooth, left_count, parent_output); + const double right_output = CUDALeafSplits::CalculateSplittedLeafOutput(sum_right_gradient, + sum_right_hessian, lambda_l1, lambda_l2, path_smooth, right_count, parent_output); + cuda_best_split_info->left_sum_gradients = sum_left_gradient; + cuda_best_split_info->left_sum_hessians = sum_left_hessian; + cuda_best_split_info->left_count = left_count; + cuda_best_split_info->right_sum_gradients = sum_right_gradient; + cuda_best_split_info->right_sum_hessians = sum_right_hessian; + cuda_best_split_info->right_count = right_count; + cuda_best_split_info->left_value = left_output; + cuda_best_split_info->left_gain = CUDALeafSplits::GetLeafGainGivenOutput(sum_left_gradient, + sum_left_hessian, lambda_l1, lambda_l2, left_output); + cuda_best_split_info->right_value = right_output; + cuda_best_split_info->right_gain = CUDALeafSplits::GetLeafGainGivenOutput(sum_right_gradient, + sum_right_hessian, lambda_l1, lambda_l2, right_output); + } else { + const double sum_left_gradient = local_grad_hist; + const double sum_left_hessian = local_hess_hist - kEpsilon; + const data_size_t left_count = static_cast(__double2int_rn(sum_left_hessian * cnt_factor)); + const double sum_right_gradient = sum_gradients - sum_left_gradient; + const double sum_right_hessian = sum_hessians - sum_left_hessian - kEpsilon; + const data_size_t right_count = num_data - left_count; + const double left_output = CUDALeafSplits::CalculateSplittedLeafOutput(sum_left_gradient, + sum_left_hessian, lambda_l1, lambda_l2, path_smooth, left_count, parent_output); + const double right_output = CUDALeafSplits::CalculateSplittedLeafOutput(sum_right_gradient, + sum_right_hessian, lambda_l1, lambda_l2, path_smooth, right_count, parent_output); + cuda_best_split_info->left_sum_gradients = sum_left_gradient; + cuda_best_split_info->left_sum_hessians = sum_left_hessian; + cuda_best_split_info->left_count = left_count; + cuda_best_split_info->right_sum_gradients = sum_right_gradient; + cuda_best_split_info->right_sum_hessians = sum_right_hessian; + cuda_best_split_info->right_count = right_count; + cuda_best_split_info->left_value = left_output; + cuda_best_split_info->left_gain = CUDALeafSplits::GetLeafGainGivenOutput(sum_left_gradient, + sum_left_hessian, lambda_l1, lambda_l2, left_output); + cuda_best_split_info->right_value = 
right_output; + cuda_best_split_info->right_gain = CUDALeafSplits::GetLeafGainGivenOutput(sum_right_gradient, + sum_right_hessian, lambda_l1, lambda_l2, right_output); + } + } +} + +template +__device__ void FindBestSplitsForLeafKernelCategoricalInner( + // input feature information + const hist_t* feature_hist_ptr, + // input task information + const SplitFindTask* task, + CUDARandom* cuda_random, + // input config parameter values + const double lambda_l1, + const double lambda_l2, + const double path_smooth, + const data_size_t min_data_in_leaf, + const double min_sum_hessian_in_leaf, + const double min_gain_to_split, + const double cat_smooth, + const double cat_l2, + const int max_cat_threshold, + const int min_data_per_group, + // input parent node information + const double parent_gain, + const double sum_gradients, + const double sum_hessians, + const data_size_t num_data, + const double parent_output, + // output parameters + CUDASplitInfo* cuda_best_split_info) { + __shared__ double shared_gain_buffer[32]; + __shared__ bool shared_found_buffer[32]; + __shared__ uint32_t shared_thread_index_buffer[32]; + __shared__ uint32_t best_thread_index; + const double cnt_factor = num_data / sum_hessians; + const double min_gain_shift = parent_gain + min_gain_to_split; + double l2 = lambda_l2; + + double local_gain = min_gain_shift; + bool threshold_found = false; + + cuda_best_split_info->is_valid = false; + + const int bin_start = 1 - task->mfb_offset; + const int bin_end = task->num_bin - task->mfb_offset; + const int threadIdx_x = static_cast(threadIdx.x); + + __shared__ int rand_threshold; + + if (task->is_one_hot) { + if (USE_RAND && threadIdx.x == 0) { + rand_threshold = 0; + if (bin_end > bin_start) { + rand_threshold = cuda_random->NextInt(bin_start, bin_end); + } + } + __syncthreads(); + if (threadIdx_x >= bin_start && threadIdx_x < bin_end) { + const int bin_offset = (threadIdx_x << 1); + const hist_t grad = feature_hist_ptr[bin_offset]; + const hist_t hess = feature_hist_ptr[bin_offset + 1]; + data_size_t cnt = + static_cast(__double2int_rn(hess * cnt_factor)); + if (cnt >= min_data_in_leaf && hess >= min_sum_hessian_in_leaf) { + const data_size_t other_count = num_data - cnt; + if (other_count >= min_data_in_leaf) { + const double sum_other_hessian = sum_hessians - hess - kEpsilon; + if (sum_other_hessian >= min_sum_hessian_in_leaf && (!USE_RAND || static_cast(threadIdx_x) == rand_threshold)) { + const double sum_other_gradient = sum_gradients - grad; + double current_gain = CUDALeafSplits::GetSplitGains( + sum_other_gradient, sum_other_hessian, grad, + hess + kEpsilon, lambda_l1, + l2, path_smooth, other_count, cnt, parent_output); + if (current_gain > min_gain_shift) { + local_gain = current_gain; + threshold_found = true; + } + } + } + } + } + __syncthreads(); + const uint32_t result = ReduceBestGain(local_gain, threshold_found, threadIdx_x, shared_gain_buffer, shared_found_buffer, shared_thread_index_buffer); + if (threadIdx_x == 0) { + best_thread_index = result; + } + __syncthreads(); + if (threshold_found && threadIdx_x == best_thread_index) { + cuda_best_split_info->is_valid = true; + cuda_best_split_info->num_cat_threshold = 1; + cuda_best_split_info->gain = local_gain - min_gain_shift; + *(cuda_best_split_info->cat_threshold) = static_cast(threadIdx_x + task->mfb_offset); + cuda_best_split_info->default_left = false; + const int bin_offset = (threadIdx_x << 1); + const hist_t sum_left_gradient = feature_hist_ptr[bin_offset]; + const hist_t sum_left_hessian = 
feature_hist_ptr[bin_offset + 1]; + const data_size_t left_count = static_cast(__double2int_rn(sum_left_hessian * cnt_factor)); + const double sum_right_gradient = sum_gradients - sum_left_gradient; + const double sum_right_hessian = sum_hessians - sum_left_hessian; + const data_size_t right_count = static_cast(__double2int_rn(sum_right_hessian * cnt_factor)); + const double left_output = CUDALeafSplits::CalculateSplittedLeafOutput(sum_left_gradient, + sum_left_hessian, lambda_l1, l2, path_smooth, left_count, parent_output); + const double right_output = CUDALeafSplits::CalculateSplittedLeafOutput(sum_right_gradient, + sum_right_hessian, lambda_l1, l2, path_smooth, right_count, parent_output); + cuda_best_split_info->left_sum_gradients = sum_left_gradient; + cuda_best_split_info->left_sum_hessians = sum_left_hessian; + cuda_best_split_info->left_count = left_count; + cuda_best_split_info->right_sum_gradients = sum_right_gradient; + cuda_best_split_info->right_sum_hessians = sum_right_hessian; + cuda_best_split_info->right_count = right_count; + cuda_best_split_info->left_value = left_output; + cuda_best_split_info->left_gain = CUDALeafSplits::GetLeafGainGivenOutput(sum_left_gradient, + sum_left_hessian, lambda_l1, l2, left_output); + cuda_best_split_info->right_value = right_output; + cuda_best_split_info->right_gain = CUDALeafSplits::GetLeafGainGivenOutput(sum_right_gradient, + sum_right_hessian, lambda_l1, l2, right_output); + } + } else { + __shared__ double shared_value_buffer[NUM_THREADS_PER_BLOCK_BEST_SPLIT_FINDER]; + __shared__ int16_t shared_index_buffer[NUM_THREADS_PER_BLOCK_BEST_SPLIT_FINDER]; + __shared__ uint16_t shared_mem_buffer_uint16[32]; + __shared__ double shared_mem_buffer_double[32]; + __shared__ int used_bin; + l2 += cat_l2; + uint16_t is_valid_bin = 0; + int best_dir = 0; + double best_sum_left_gradient = 0.0f; + double best_sum_left_hessian = 0.0f; + if (threadIdx_x >= bin_start && threadIdx_x < bin_end) { + const int bin_offset = (threadIdx_x << 1); + const double hess = feature_hist_ptr[bin_offset + 1]; + if (__double2int_rn(hess * cnt_factor) >= cat_smooth) { + const double grad = feature_hist_ptr[bin_offset]; + shared_value_buffer[threadIdx_x] = grad / (hess + cat_smooth); + is_valid_bin = 1; + } else { + shared_value_buffer[threadIdx_x] = kMaxScore; + } + } else { + shared_value_buffer[threadIdx_x] = kMaxScore; + } + shared_index_buffer[threadIdx_x] = threadIdx_x; + __syncthreads(); + const int local_used_bin = ShuffleReduceSum(is_valid_bin, shared_mem_buffer_uint16, blockDim.x); + if (threadIdx_x == 0) { + used_bin = local_used_bin; + } + __syncthreads(); + BitonicArgSort_1024(shared_value_buffer, shared_index_buffer, bin_end); + __syncthreads(); + const int max_num_cat = min(max_cat_threshold, (used_bin + 1) / 2); + + if (USE_RAND) { + rand_threshold = 0; + const int max_threshold = max(min(max_num_cat, used_bin) - 1, 0); + if (max_threshold > 0) { + rand_threshold = cuda_random->NextInt(0, max_threshold); + } + } + + // left to right + double grad = 0.0f; + double hess = 0.0f; + if (threadIdx_x < used_bin && threadIdx_x < max_num_cat) { + const int bin_offset = (shared_index_buffer[threadIdx_x] << 1); + grad = feature_hist_ptr[bin_offset]; + hess = feature_hist_ptr[bin_offset + 1]; + } + if (threadIdx_x == 0) { + hess += kEpsilon; + } + __syncthreads(); + double sum_left_gradient = ShufflePrefixSum(grad, shared_mem_buffer_double); + __syncthreads(); + double sum_left_hessian = ShufflePrefixSum(hess, shared_mem_buffer_double); + if (threadIdx_x < used_bin && 
threadIdx_x < max_num_cat) { + const data_size_t left_count = static_cast(__double2int_rn(sum_left_hessian * cnt_factor)); + const double sum_right_gradient = sum_gradients - sum_left_gradient; + const double sum_right_hessian = sum_hessians - sum_left_hessian; + const data_size_t right_count = num_data - left_count; + if (sum_left_hessian >= min_sum_hessian_in_leaf && left_count >= min_data_in_leaf && + sum_right_hessian >= min_sum_hessian_in_leaf && right_count >= min_data_in_leaf && + (!USE_RAND || threadIdx_x == static_cast(rand_threshold))) { + double current_gain = CUDALeafSplits::GetSplitGains( + sum_left_gradient, sum_left_hessian, sum_right_gradient, + sum_right_hessian, lambda_l1, + l2, path_smooth, left_count, right_count, parent_output); + // gain with split is worse than without split + if (current_gain > local_gain) { + local_gain = current_gain; + threshold_found = true; + best_dir = 1; + best_sum_left_gradient = sum_left_gradient; + best_sum_left_hessian = sum_left_hessian; + } + } + } + __syncthreads(); + + // right to left + grad = 0.0f; + hess = 0.0f; + if (threadIdx_x < used_bin && threadIdx_x < max_num_cat) { + const int bin_offset = (shared_index_buffer[used_bin - 1 - threadIdx_x] << 1); + grad = feature_hist_ptr[bin_offset]; + hess = feature_hist_ptr[bin_offset + 1]; + } + if (threadIdx_x == 0) { + hess += kEpsilon; + } + __syncthreads(); + sum_left_gradient = ShufflePrefixSum(grad, shared_mem_buffer_double); + __syncthreads(); + sum_left_hessian = ShufflePrefixSum(hess, shared_mem_buffer_double); + if (threadIdx_x < used_bin && threadIdx_x < max_num_cat) { + const data_size_t left_count = static_cast(__double2int_rn(sum_left_hessian * cnt_factor)); + const double sum_right_gradient = sum_gradients - sum_left_gradient; + const double sum_right_hessian = sum_hessians - sum_left_hessian; + const data_size_t right_count = num_data - left_count; + if (sum_left_hessian >= min_sum_hessian_in_leaf && left_count >= min_data_in_leaf && + sum_right_hessian >= min_sum_hessian_in_leaf && right_count >= min_data_in_leaf && + (!USE_RAND || threadIdx_x == static_cast(rand_threshold))) { + double current_gain = CUDALeafSplits::GetSplitGains( + sum_left_gradient, sum_left_hessian, sum_right_gradient, + sum_right_hessian, lambda_l1, + l2, path_smooth, left_count, right_count, parent_output); + // gain with split is worse than without split + if (current_gain > local_gain) { + local_gain = current_gain; + threshold_found = true; + best_dir = -1; + best_sum_left_gradient = sum_left_gradient; + best_sum_left_hessian = sum_left_hessian; + } + } + } + __syncthreads(); + + const uint32_t result = ReduceBestGain(local_gain, threshold_found, threadIdx_x, shared_gain_buffer, shared_found_buffer, shared_thread_index_buffer); + if (threadIdx_x == 0) { + best_thread_index = result; + } + __syncthreads(); + if (threshold_found && threadIdx_x == best_thread_index) { + cuda_best_split_info->is_valid = true; + cuda_best_split_info->num_cat_threshold = threadIdx_x + 1; + cuda_best_split_info->gain = local_gain - min_gain_shift; + if (best_dir == 1) { + for (int i = 0; i < threadIdx_x + 1; ++i) { + (cuda_best_split_info->cat_threshold)[i] = shared_index_buffer[i] + task->mfb_offset; + } + } else { + for (int i = 0; i < threadIdx_x + 1; ++i) { + (cuda_best_split_info->cat_threshold)[i] = shared_index_buffer[used_bin - 1 - i] + task->mfb_offset; + } + } + cuda_best_split_info->default_left = false; + const hist_t sum_left_gradient = best_sum_left_gradient; + const hist_t sum_left_hessian = 
best_sum_left_hessian; + const data_size_t left_count = static_cast(__double2int_rn(sum_left_hessian * cnt_factor)); + const double sum_right_gradient = sum_gradients - sum_left_gradient; + const double sum_right_hessian = sum_hessians - sum_left_hessian; + const data_size_t right_count = static_cast(__double2int_rn(sum_right_hessian * cnt_factor)); + const double left_output = CUDALeafSplits::CalculateSplittedLeafOutput(sum_left_gradient, + sum_left_hessian, lambda_l1, l2, path_smooth, left_count, parent_output); + const double right_output = CUDALeafSplits::CalculateSplittedLeafOutput(sum_right_gradient, + sum_right_hessian, lambda_l1, l2, path_smooth, right_count, parent_output); + cuda_best_split_info->left_sum_gradients = sum_left_gradient; + cuda_best_split_info->left_sum_hessians = sum_left_hessian; + cuda_best_split_info->left_count = left_count; + cuda_best_split_info->right_sum_gradients = sum_right_gradient; + cuda_best_split_info->right_sum_hessians = sum_right_hessian; + cuda_best_split_info->right_count = right_count; + cuda_best_split_info->left_value = left_output; + cuda_best_split_info->left_gain = CUDALeafSplits::GetLeafGainGivenOutput(sum_left_gradient, + sum_left_hessian, lambda_l1, l2, left_output); + cuda_best_split_info->right_value = right_output; + cuda_best_split_info->right_gain = CUDALeafSplits::GetLeafGainGivenOutput(sum_right_gradient, + sum_right_hessian, lambda_l1, l2, right_output); + } + } +} + +template +__global__ void FindBestSplitsForLeafKernel( + // input feature information + const int8_t* is_feature_used_bytree, + // input task information + const int num_tasks, + const SplitFindTask* tasks, + CUDARandom* cuda_randoms, + // input leaf information + const CUDALeafSplitsStruct* smaller_leaf_splits, + const CUDALeafSplitsStruct* larger_leaf_splits, + // input config parameter values + const data_size_t min_data_in_leaf, + const double min_sum_hessian_in_leaf, + const double min_gain_to_split, + const double lambda_l1, + const double lambda_l2, + const double path_smooth, + const double cat_smooth, + const double cat_l2, + const int max_cat_threshold, + const int min_data_per_group, + // output + CUDASplitInfo* cuda_best_split_info) { + const unsigned int task_index = blockIdx.x; + const SplitFindTask* task = tasks + task_index; + const int inner_feature_index = task->inner_feature_index; + const double parent_gain = IS_LARGER ? larger_leaf_splits->gain : smaller_leaf_splits->gain; + const double sum_gradients = IS_LARGER ? larger_leaf_splits->sum_of_gradients : smaller_leaf_splits->sum_of_gradients; + const double sum_hessians = (IS_LARGER ? larger_leaf_splits->sum_of_hessians : smaller_leaf_splits->sum_of_hessians) + 2 * kEpsilon; + const data_size_t num_data = IS_LARGER ? larger_leaf_splits->num_data_in_leaf : smaller_leaf_splits->num_data_in_leaf; + const double parent_output = IS_LARGER ? larger_leaf_splits->leaf_value : smaller_leaf_splits->leaf_value; + const unsigned int output_offset = IS_LARGER ? (task_index + num_tasks) : task_index; + CUDASplitInfo* out = cuda_best_split_info + output_offset; + CUDARandom* cuda_random = USE_RAND ? + (IS_LARGER ? cuda_randoms + task_index * 2 + 1 : cuda_randoms + task_index * 2) : nullptr; + if (is_feature_used_bytree[inner_feature_index]) { + const hist_t* hist_ptr = (IS_LARGER ? 
larger_leaf_splits->hist_in_leaf : smaller_leaf_splits->hist_in_leaf) + task->hist_offset * 2; + if (task->is_categorical) { + FindBestSplitsForLeafKernelCategoricalInner( + // input feature information + hist_ptr, + // input task information + task, + cuda_random, + // input config parameter values + lambda_l1, + lambda_l2, + path_smooth, + min_data_in_leaf, + min_sum_hessian_in_leaf, + min_gain_to_split, + cat_smooth, + cat_l2, + max_cat_threshold, + min_data_per_group, + // input parent node information + parent_gain, + sum_gradients, + sum_hessians, + num_data, + parent_output, + // output parameters + out); + } else { + if (!task->reverse) { + FindBestSplitsForLeafKernelInner( + // input feature information + hist_ptr, + // input task information + task, + cuda_random, + // input config parameter values + lambda_l1, + lambda_l2, + path_smooth, + min_data_in_leaf, + min_sum_hessian_in_leaf, + min_gain_to_split, + // input parent node information + parent_gain, + sum_gradients, + sum_hessians, + num_data, + parent_output, + // output parameters + out); + } else { + FindBestSplitsForLeafKernelInner( + // input feature information + hist_ptr, + // input task information + task, + cuda_random, + // input config parameter values + lambda_l1, + lambda_l2, + path_smooth, + min_data_in_leaf, + min_sum_hessian_in_leaf, + min_gain_to_split, + // input parent node information + parent_gain, + sum_gradients, + sum_hessians, + num_data, + parent_output, + // output parameters + out); + } + } + } else { + out->is_valid = false; + } +} + +template +__device__ void FindBestSplitsForLeafKernelInner_GlobalMemory( + // input feature information + const hist_t* feature_hist_ptr, + // input task information + const SplitFindTask* task, + CUDARandom* cuda_random, + // input config parameter values + const double lambda_l1, + const double lambda_l2, + const double path_smooth, + const data_size_t min_data_in_leaf, + const double min_sum_hessian_in_leaf, + const double min_gain_to_split, + // input parent node information + const double parent_gain, + const double sum_gradients, + const double sum_hessians, + const data_size_t num_data, + const double parent_output, + // output parameters + CUDASplitInfo* cuda_best_split_info, + // buffer + hist_t* hist_grad_buffer_ptr, + hist_t* hist_hess_buffer_ptr) { + const double cnt_factor = num_data / sum_hessians; + const double min_gain_shift = parent_gain + min_gain_to_split; + + cuda_best_split_info->is_valid = false; + double local_gain = 0.0f; + bool threshold_found = false; + uint32_t threshold_value = 0; + __shared__ int rand_threshold; + if (USE_RAND && threadIdx.x == 0) { + if (task->num_bin - 2 > 0) { + rand_threshold = cuda_random->NextInt(0, task->num_bin - 2); + } + } + __shared__ uint32_t best_thread_index; + __shared__ double shared_double_buffer[32]; + __shared__ bool shared_found_buffer[32]; + __shared__ uint32_t shared_thread_index_buffer[32]; + const unsigned int threadIdx_x = threadIdx.x; + const uint32_t feature_num_bin_minus_offset = task->num_bin - task->mfb_offset; + if (!REVERSE) { + if (task->na_as_missing && task->mfb_offset == 1) { + uint32_t bin_start = threadIdx_x > 0 ? 
threadIdx_x : blockDim.x; + hist_t thread_sum_gradients = 0.0f; + hist_t thread_sum_hessians = 0.0f; + for (unsigned int bin = bin_start; bin < static_cast(task->num_bin); bin += blockDim.x) { + const unsigned int bin_offset = (bin - 1) << 1; + const hist_t grad = feature_hist_ptr[bin_offset]; + const hist_t hess = feature_hist_ptr[bin_offset + 1]; + hist_grad_buffer_ptr[bin] = grad; + hist_hess_buffer_ptr[bin] = hess; + thread_sum_gradients += grad; + thread_sum_hessians += hess; + } + const hist_t sum_gradients_non_default = ShuffleReduceSum(thread_sum_gradients, shared_double_buffer, blockDim.x); + __syncthreads(); + const hist_t sum_hessians_non_default = ShuffleReduceSum(thread_sum_hessians, shared_double_buffer, blockDim.x); + if (threadIdx_x == 0) { + hist_grad_buffer_ptr[0] = sum_gradients - sum_gradients_non_default; + hist_hess_buffer_ptr[0] = sum_hessians - sum_hessians_non_default; + } + } else { + for (unsigned int bin = threadIdx_x; bin < feature_num_bin_minus_offset; bin += blockDim.x) { + const bool skip_sum = + (task->skip_default_bin && (bin + task->mfb_offset) == static_cast(task->default_bin)); + if (!skip_sum) { + const unsigned int bin_offset = bin << 1; + hist_grad_buffer_ptr[bin] = feature_hist_ptr[bin_offset]; + hist_hess_buffer_ptr[bin] = feature_hist_ptr[bin_offset + 1]; + } else { + hist_grad_buffer_ptr[bin] = 0.0f; + hist_hess_buffer_ptr[bin] = 0.0f; + } + } + } + } else { + for (unsigned int bin = threadIdx_x; bin < feature_num_bin_minus_offset; bin += blockDim.x) { + const bool skip_sum = bin >= static_cast(task->na_as_missing) && + (task->skip_default_bin && (task->num_bin - 1 - bin) == static_cast(task->default_bin)); + if (!skip_sum) { + const unsigned int read_index = feature_num_bin_minus_offset - 1 - bin; + const unsigned int bin_offset = read_index << 1; + hist_grad_buffer_ptr[bin] = feature_hist_ptr[bin_offset]; + hist_hess_buffer_ptr[bin] = feature_hist_ptr[bin_offset + 1]; + } else { + hist_grad_buffer_ptr[bin] = 0.0f; + hist_hess_buffer_ptr[bin] = 0.0f; + } + } + } + __syncthreads(); + if (threadIdx_x == 0) { + hist_hess_buffer_ptr[0] += kEpsilon; + } + local_gain = kMinScore; + GlobalMemoryPrefixSum(hist_grad_buffer_ptr, static_cast(feature_num_bin_minus_offset)); + __syncthreads(); + GlobalMemoryPrefixSum(hist_hess_buffer_ptr, static_cast(feature_num_bin_minus_offset)); + if (REVERSE) { + for (unsigned int bin = threadIdx_x; bin < feature_num_bin_minus_offset; bin += blockDim.x) { + const bool skip_sum = (bin >= static_cast(task->na_as_missing) && + (task->skip_default_bin && (task->num_bin - 1 - bin) == static_cast(task->default_bin))); + if (!skip_sum) { + const double sum_right_gradient = hist_grad_buffer_ptr[bin]; + const double sum_right_hessian = hist_hess_buffer_ptr[bin]; + const data_size_t right_count = static_cast(__double2int_rn(sum_right_hessian * cnt_factor)); + const double sum_left_gradient = sum_gradients - sum_right_gradient; + const double sum_left_hessian = sum_hessians - sum_right_hessian; + const data_size_t left_count = num_data - right_count; + if (sum_left_hessian >= min_sum_hessian_in_leaf && left_count >= min_data_in_leaf && + sum_right_hessian >= min_sum_hessian_in_leaf && right_count >= min_data_in_leaf && + (!USE_RAND || static_cast(task->num_bin - 2 - bin) == rand_threshold)) { + double current_gain = CUDALeafSplits::GetSplitGains( + sum_left_gradient, sum_left_hessian, sum_right_gradient, + sum_right_hessian, lambda_l1, + lambda_l2, path_smooth, left_count, right_count, parent_output); + // gain with split is worse 
than without split + if (current_gain > min_gain_shift) { + local_gain = current_gain - min_gain_shift; + threshold_value = static_cast(task->num_bin - 2 - bin); + threshold_found = true; + } + } + } + } + } else { + const uint32_t end = (task->na_as_missing && task->mfb_offset == 1) ? static_cast(task->num_bin - 2) : feature_num_bin_minus_offset - 2; + for (unsigned int bin = threadIdx_x; bin <= end; bin += blockDim.x) { + const bool skip_sum = + (task->skip_default_bin && (bin + task->mfb_offset) == static_cast(task->default_bin)); + if (!skip_sum) { + const double sum_left_gradient = hist_grad_buffer_ptr[bin]; + const double sum_left_hessian = hist_hess_buffer_ptr[bin]; + const data_size_t left_count = static_cast(__double2int_rn(sum_left_hessian * cnt_factor)); + const double sum_right_gradient = sum_gradients - sum_left_gradient; + const double sum_right_hessian = sum_hessians - sum_left_hessian; + const data_size_t right_count = num_data - left_count; + if (sum_left_hessian >= min_sum_hessian_in_leaf && left_count >= min_data_in_leaf && + sum_right_hessian >= min_sum_hessian_in_leaf && right_count >= min_data_in_leaf && + (!USE_RAND || static_cast(bin + task->mfb_offset) == rand_threshold)) { + double current_gain = CUDALeafSplits::GetSplitGains( + sum_left_gradient, sum_left_hessian, sum_right_gradient, + sum_right_hessian, lambda_l1, + lambda_l2, path_smooth, left_count, right_count, parent_output); + // gain with split is worse than without split + if (current_gain > min_gain_shift) { + local_gain = current_gain - min_gain_shift; + threshold_value = (task->na_as_missing && task->mfb_offset == 1) ? + bin : static_cast(bin + task->mfb_offset); + threshold_found = true; + } + } + } + } + } + __syncthreads(); + const uint32_t result = ReduceBestGain(local_gain, threshold_found, threadIdx_x, shared_double_buffer, shared_found_buffer, shared_thread_index_buffer); + if (threadIdx_x == 0) { + best_thread_index = result; + } + __syncthreads(); + if (threshold_found && threadIdx_x == best_thread_index) { + cuda_best_split_info->is_valid = true; + cuda_best_split_info->threshold = threshold_value; + cuda_best_split_info->gain = local_gain; + cuda_best_split_info->default_left = task->assume_out_default_left; + if (REVERSE) { + const unsigned int best_bin = static_cast(task->num_bin - 2 - threshold_value); + const double sum_right_gradient = hist_grad_buffer_ptr[best_bin]; + const double sum_right_hessian = hist_hess_buffer_ptr[best_bin] - kEpsilon; + const data_size_t right_count = static_cast(__double2int_rn(sum_right_hessian * cnt_factor)); + const double sum_left_gradient = sum_gradients - sum_right_gradient; + const double sum_left_hessian = sum_hessians - sum_right_hessian - kEpsilon; + const data_size_t left_count = num_data - right_count; + const double left_output = CUDALeafSplits::CalculateSplittedLeafOutput(sum_left_gradient, + sum_left_hessian, lambda_l1, lambda_l2, path_smooth, left_count, parent_output); + const double right_output = CUDALeafSplits::CalculateSplittedLeafOutput(sum_right_gradient, + sum_right_hessian, lambda_l1, lambda_l2, path_smooth, right_count, parent_output); + cuda_best_split_info->left_sum_gradients = sum_left_gradient; + cuda_best_split_info->left_sum_hessians = sum_left_hessian; + cuda_best_split_info->left_count = left_count; + cuda_best_split_info->right_sum_gradients = sum_right_gradient; + cuda_best_split_info->right_sum_hessians = sum_right_hessian; + cuda_best_split_info->right_count = right_count; + cuda_best_split_info->left_value = left_output; 
+      cuda_best_split_info->left_gain = CUDALeafSplits::GetLeafGainGivenOutput<USE_L1>(sum_left_gradient,
+        sum_left_hessian, lambda_l1, lambda_l2, left_output);
+      cuda_best_split_info->right_value = right_output;
+      cuda_best_split_info->right_gain = CUDALeafSplits::GetLeafGainGivenOutput<USE_L1>(sum_right_gradient,
+        sum_right_hessian, lambda_l1, lambda_l2, right_output);
+    } else {
+      const unsigned int best_bin = (task->na_as_missing && task->mfb_offset == 1) ?
+        threshold_value : static_cast<uint32_t>(threshold_value - task->mfb_offset);
+      const double sum_left_gradient = hist_grad_buffer_ptr[best_bin];
+      const double sum_left_hessian = hist_hess_buffer_ptr[best_bin] - kEpsilon;
+      const data_size_t left_count = static_cast<data_size_t>(__double2int_rn(sum_left_hessian * cnt_factor));
+      const double sum_right_gradient = sum_gradients - sum_left_gradient;
+      const double sum_right_hessian = sum_hessians - sum_left_hessian - kEpsilon;
+      const data_size_t right_count = num_data - left_count;
+      const double left_output = CUDALeafSplits::CalculateSplittedLeafOutput<USE_L1, USE_SMOOTHING>(sum_left_gradient,
+        sum_left_hessian, lambda_l1, lambda_l2, path_smooth, left_count, parent_output);
+      const double right_output = CUDALeafSplits::CalculateSplittedLeafOutput<USE_L1, USE_SMOOTHING>(sum_right_gradient,
+        sum_right_hessian, lambda_l1, lambda_l2, path_smooth, right_count, parent_output);
+      cuda_best_split_info->left_sum_gradients = sum_left_gradient;
+      cuda_best_split_info->left_sum_hessians = sum_left_hessian;
+      cuda_best_split_info->left_count = left_count;
+      cuda_best_split_info->right_sum_gradients = sum_right_gradient;
+      cuda_best_split_info->right_sum_hessians = sum_right_hessian;
+      cuda_best_split_info->right_count = right_count;
+      cuda_best_split_info->left_value = left_output;
+      cuda_best_split_info->left_gain = CUDALeafSplits::GetLeafGainGivenOutput<USE_L1>(sum_left_gradient,
+        sum_left_hessian, lambda_l1, lambda_l2, left_output);
+      cuda_best_split_info->right_value = right_output;
+      cuda_best_split_info->right_gain = CUDALeafSplits::GetLeafGainGivenOutput<USE_L1>(sum_right_gradient,
+        sum_right_hessian, lambda_l1, lambda_l2, right_output);
+    }
+  }
+}
+
+template <bool USE_RAND, bool USE_L1, bool USE_SMOOTHING>
+__device__ void FindBestSplitsForLeafKernelCategoricalInner_GlobalMemory(
+  // input feature information
+  const hist_t* feature_hist_ptr,
+  // input task information
+  const SplitFindTask* task,
+  CUDARandom* cuda_random,
+  // input config parameter values
+  const double lambda_l1,
+  const double lambda_l2,
+  const double path_smooth,
+  const data_size_t min_data_in_leaf,
+  const double min_sum_hessian_in_leaf,
+  const double min_gain_to_split,
+  const double cat_smooth,
+  const double cat_l2,
+  const int max_cat_threshold,
+  const int min_data_per_group,
+  // input parent node information
+  const double parent_gain,
+  const double sum_gradients,
+  const double sum_hessians,
+  const data_size_t num_data,
+  const double parent_output,
+  // buffer
+  hist_t* hist_grad_buffer_ptr,
+  hist_t* hist_hess_buffer_ptr,
+  hist_t* hist_stat_buffer_ptr,
+  data_size_t* hist_index_buffer_ptr,
+  // output parameters
+  CUDASplitInfo* cuda_best_split_info) {
+  __shared__ double shared_gain_buffer[32];
+  __shared__ bool shared_found_buffer[32];
+  __shared__ uint32_t shared_thread_index_buffer[32];
+  __shared__ uint32_t best_thread_index;
+  const double cnt_factor = num_data / sum_hessians;
+  const double min_gain_shift = parent_gain + min_gain_to_split;
+  double l2 = lambda_l2;
+
+  double local_gain = kMinScore;
+  bool threshold_found = false;
+
+  cuda_best_split_info->is_valid = false;
+
+  __shared__ int rand_threshold;
+
+  const int bin_start = 1 - task->mfb_offset;
+  const int bin_end = task->num_bin - task->mfb_offset;
+  int best_threshold = -1;
+  const int threadIdx_x = static_cast<int>(threadIdx.x);
+  if (task->is_one_hot) {
+    if (USE_RAND && threadIdx.x == 0) {
+      rand_threshold = 0;
+      if (bin_end > bin_start) {
+        rand_threshold = cuda_random->NextInt(bin_start, bin_end);
+      }
+    }
+    __syncthreads();
+    for (int bin = bin_start + threadIdx_x; bin < bin_end; bin += static_cast<int>(blockDim.x)) {
+      const int bin_offset = (bin << 1);
+      const hist_t grad = feature_hist_ptr[bin_offset];
+      const hist_t hess = feature_hist_ptr[bin_offset + 1];
+      data_size_t cnt =
+        static_cast<data_size_t>(__double2int_rn(hess * cnt_factor));
+      if (cnt >= min_data_in_leaf && hess >= min_sum_hessian_in_leaf) {
+        const data_size_t other_count = num_data - cnt;
+        if (other_count >= min_data_in_leaf) {
+          const double sum_other_hessian = sum_hessians - hess - kEpsilon;
+          if (sum_other_hessian >= min_sum_hessian_in_leaf && (!USE_RAND || bin == rand_threshold)) {
+            const double sum_other_gradient = sum_gradients - grad;
+            double current_gain = CUDALeafSplits::GetSplitGains<USE_L1, USE_SMOOTHING>(
+              sum_other_gradient, sum_other_hessian, grad,
+              hess + kEpsilon, lambda_l1,
+              l2, path_smooth, other_count, cnt, parent_output);
+            if (current_gain > min_gain_shift) {
+              best_threshold = bin;
+              local_gain = current_gain - min_gain_shift;
+              threshold_found = true;
+            }
+          }
+        }
+      }
+    }
+    __syncthreads();
+    const uint32_t result = ReduceBestGain(local_gain, threshold_found, threadIdx_x, shared_gain_buffer, shared_found_buffer, shared_thread_index_buffer);
+    if (threadIdx_x == 0) {
+      best_thread_index = result;
+    }
+    __syncthreads();
+    if (threshold_found && threadIdx_x == best_thread_index) {
+      cuda_best_split_info->is_valid = true;
+      cuda_best_split_info->num_cat_threshold = 1;
+      cuda_best_split_info->cat_threshold = new uint32_t[1];
+      *(cuda_best_split_info->cat_threshold) = static_cast<uint32_t>(best_threshold);
+      cuda_best_split_info->default_left = false;
+      const int bin_offset = (best_threshold << 1);
+      const hist_t sum_left_gradient = feature_hist_ptr[bin_offset];
+      const hist_t sum_left_hessian = feature_hist_ptr[bin_offset + 1];
+      const data_size_t left_count = static_cast<data_size_t>(__double2int_rn(sum_left_hessian * cnt_factor));
+      const double sum_right_gradient = sum_gradients - sum_left_gradient;
+      const double sum_right_hessian = sum_hessians - sum_left_hessian;
+      const data_size_t right_count = static_cast<data_size_t>(__double2int_rn(sum_right_hessian * cnt_factor));
+      const double left_output = CUDALeafSplits::CalculateSplittedLeafOutput<USE_L1, USE_SMOOTHING>(sum_left_gradient,
+        sum_left_hessian, lambda_l1, l2, path_smooth, left_count, parent_output);
+      const double right_output = CUDALeafSplits::CalculateSplittedLeafOutput<USE_L1, USE_SMOOTHING>(sum_right_gradient,
+        sum_right_hessian, lambda_l1, l2, path_smooth, right_count, parent_output);
+      cuda_best_split_info->left_sum_gradients = sum_left_gradient;
+      cuda_best_split_info->left_sum_hessians = sum_left_hessian;
+      cuda_best_split_info->left_count = left_count;
+      cuda_best_split_info->right_sum_gradients = sum_right_gradient;
+      cuda_best_split_info->right_sum_hessians = sum_right_hessian;
+      cuda_best_split_info->right_count = right_count;
+      cuda_best_split_info->left_value = left_output;
+      cuda_best_split_info->left_gain = CUDALeafSplits::GetLeafGainGivenOutput<USE_L1>(sum_left_gradient,
+        sum_left_hessian, lambda_l1, l2, left_output);
+      cuda_best_split_info->right_value = right_output;
+      cuda_best_split_info->right_gain = CUDALeafSplits::GetLeafGainGivenOutput<USE_L1>(sum_right_gradient,
+        sum_right_hessian, lambda_l1, l2, right_output);
+    }
+  } else {
+    __shared__ uint16_t shared_mem_buffer_uint16[32];
+    __shared__ int used_bin;
+    l2 += cat_l2;
+    uint16_t is_valid_bin = 0;
+    int best_dir = 0;
+    double best_sum_left_gradient = 0.0f;
+    double best_sum_left_hessian = 0.0f;
+    for (int bin = 0; bin < bin_end; bin += static_cast<int>(blockDim.x)) {
+      if (bin >= bin_start) {
+        const int bin_offset = (bin << 1);
+        const double hess = feature_hist_ptr[bin_offset + 1];
+        if (__double2int_rn(hess * cnt_factor) >= cat_smooth) {
+          const double grad = feature_hist_ptr[bin_offset];
+          hist_stat_buffer_ptr[bin] = grad / (hess + cat_smooth);
+          hist_index_buffer_ptr[bin] = threadIdx_x;
+          is_valid_bin = 1;
+        } else {
+          hist_stat_buffer_ptr[bin] = kMaxScore;
+          hist_index_buffer_ptr[bin] = -1;
+        }
+      }
+    }
+    __syncthreads();
+    const int local_used_bin = ShuffleReduceSum(is_valid_bin, shared_mem_buffer_uint16, blockDim.x);
+    if (threadIdx_x == 0) {
+      used_bin = local_used_bin;
+    }
+    __syncthreads();
+    BitonicArgSortDevice(
+      hist_stat_buffer_ptr, hist_index_buffer_ptr, task->num_bin - task->mfb_offset);
+    const int max_num_cat = min(max_cat_threshold, (used_bin + 1) / 2);
+    if (USE_RAND) {
+      rand_threshold = 0;
+      const int max_threshold = max(min(max_num_cat, used_bin) - 1, 0);
+      if (max_threshold > 0) {
+        rand_threshold = cuda_random->NextInt(0, max_threshold);
+      }
+    }
+    __syncthreads();
+
+    // left to right
+    for (int bin = static_cast<int>(threadIdx_x); bin < used_bin && bin < max_num_cat; bin += static_cast<int>(blockDim.x)) {
+      const int bin_offset = (hist_index_buffer_ptr[bin] << 1);
+      hist_grad_buffer_ptr[bin] = feature_hist_ptr[bin_offset];
+      hist_hess_buffer_ptr[bin] = feature_hist_ptr[bin_offset + 1];
+    }
+    if (threadIdx_x == 0) {
+      hist_hess_buffer_ptr[0] += kEpsilon;
+    }
+    __syncthreads();
+    GlobalMemoryPrefixSum(hist_grad_buffer_ptr, static_cast<size_t>(bin_end));
+    __syncthreads();
+    GlobalMemoryPrefixSum(hist_hess_buffer_ptr, static_cast<size_t>(bin_end));
+    for (int bin = static_cast<int>(threadIdx_x); bin < used_bin && bin < max_num_cat; bin += static_cast<int>(blockDim.x)) {
+      const double sum_left_gradient = hist_grad_buffer_ptr[bin];
+      const double sum_left_hessian = hist_hess_buffer_ptr[bin];
+      const data_size_t left_count = static_cast<data_size_t>(__double2int_rn(sum_left_hessian * cnt_factor));
+      const double sum_right_gradient = sum_gradients - sum_left_gradient;
+      const double sum_right_hessian = sum_hessians - sum_left_hessian;
+      const data_size_t right_count = num_data - left_count;
+      if (sum_left_hessian >= min_sum_hessian_in_leaf && left_count >= min_data_in_leaf &&
+        sum_right_hessian >= min_sum_hessian_in_leaf && right_count >= min_data_in_leaf) {
+        double current_gain = CUDALeafSplits::GetSplitGains<USE_L1, USE_SMOOTHING>(
+          sum_left_gradient, sum_left_hessian, sum_right_gradient,
+          sum_right_hessian, lambda_l1,
+          l2, path_smooth, left_count, right_count, parent_output);
+        // keep this split only if its gain beats the no-split gain by at least min_gain_to_split
+        if (current_gain > min_gain_shift) {
+          local_gain = current_gain - min_gain_shift;
+          threshold_found = true;
+          best_dir = 1;
+          best_sum_left_gradient = sum_left_gradient;
+          best_sum_left_hessian = sum_left_hessian;
+          best_threshold = bin;
+        }
+      }
+    }
+    __syncthreads();
+
+    // right to left
+    for (int bin = static_cast<int>(threadIdx_x); bin < used_bin && bin < max_num_cat; bin += static_cast<int>(blockDim.x)) {
+      const int bin_offset = (hist_index_buffer_ptr[used_bin - 1 - bin] << 1);
+      hist_grad_buffer_ptr[bin] = feature_hist_ptr[bin_offset];
+      hist_hess_buffer_ptr[bin] = feature_hist_ptr[bin_offset + 1];
+    }
+    if (threadIdx_x == 0) {
+      hist_hess_buffer_ptr[0] += kEpsilon;
+    }
+    __syncthreads();
+    GlobalMemoryPrefixSum(hist_grad_buffer_ptr, static_cast<size_t>(bin_end));
+    __syncthreads();
+    GlobalMemoryPrefixSum(hist_hess_buffer_ptr, static_cast<size_t>(bin_end));
+    for (int bin = static_cast<int>(threadIdx_x); bin < used_bin && bin < max_num_cat; bin += static_cast<int>(blockDim.x)) {
+      const double sum_left_gradient = hist_grad_buffer_ptr[bin];
+      const double sum_left_hessian = hist_hess_buffer_ptr[bin];
+      const data_size_t left_count = static_cast<data_size_t>(__double2int_rn(sum_left_hessian * cnt_factor));
+      const double sum_right_gradient = sum_gradients - sum_left_gradient;
+      const double sum_right_hessian = sum_hessians - sum_left_hessian;
+      const data_size_t right_count = num_data - left_count;
+      if (sum_left_hessian >= min_sum_hessian_in_leaf && left_count >= min_data_in_leaf &&
+        sum_right_hessian >= min_sum_hessian_in_leaf && right_count >= min_data_in_leaf) {
+        double current_gain = CUDALeafSplits::GetSplitGains<USE_L1, USE_SMOOTHING>(
+          sum_left_gradient, sum_left_hessian, sum_right_gradient,
+          sum_right_hessian, lambda_l1,
+          l2, path_smooth, left_count, right_count, parent_output);
+        // keep this split only if its gain beats the no-split gain by at least min_gain_to_split
+        if (current_gain > min_gain_shift) {
+          local_gain = current_gain - min_gain_shift;
+          threshold_found = true;
+          best_dir = -1;
+          best_sum_left_gradient = sum_left_gradient;
+          best_sum_left_hessian = sum_left_hessian;
+          best_threshold = bin;
+        }
+      }
+    }
+    __syncthreads();
+
+    const uint32_t result = ReduceBestGain(local_gain, threshold_found, threadIdx_x, shared_gain_buffer, shared_found_buffer, shared_thread_index_buffer);
+    if (threadIdx_x == 0) {
+      best_thread_index = result;
+    }
+    __syncthreads();
+    if (threshold_found && threadIdx_x == best_thread_index) {
+      cuda_best_split_info->is_valid = true;
+      cuda_best_split_info->num_cat_threshold = best_threshold + 1;
+      cuda_best_split_info->cat_threshold = new uint32_t[best_threshold + 1];
+      cuda_best_split_info->gain = local_gain;
+      if (best_dir == 1) {
+        for (int i = 0; i < best_threshold + 1; ++i) {
+          (cuda_best_split_info->cat_threshold)[i] = hist_index_buffer_ptr[i] + task->mfb_offset;
+        }
+      } else {
+        for (int i = 0; i < best_threshold + 1; ++i) {
+          (cuda_best_split_info->cat_threshold)[i] = hist_index_buffer_ptr[used_bin - 1 - i] + task->mfb_offset;
+        }
+      }
+      cuda_best_split_info->default_left = false;
+      const hist_t sum_left_gradient = best_sum_left_gradient;
+      const hist_t sum_left_hessian = best_sum_left_hessian;
+      const data_size_t left_count = static_cast<data_size_t>(__double2int_rn(sum_left_hessian * cnt_factor));
+      const double sum_right_gradient = sum_gradients - sum_left_gradient;
+      const double sum_right_hessian = sum_hessians - sum_left_hessian;
+      const data_size_t right_count = static_cast<data_size_t>(__double2int_rn(sum_right_hessian * cnt_factor));
+      const double left_output = CUDALeafSplits::CalculateSplittedLeafOutput<USE_L1, USE_SMOOTHING>(sum_left_gradient,
+        sum_left_hessian, lambda_l1, l2, path_smooth, left_count, parent_output);
+      const double right_output = CUDALeafSplits::CalculateSplittedLeafOutput<USE_L1, USE_SMOOTHING>(sum_right_gradient,
+        sum_right_hessian, lambda_l1, l2, path_smooth, right_count, parent_output);
+      cuda_best_split_info->left_sum_gradients = sum_left_gradient;
+      cuda_best_split_info->left_sum_hessians = sum_left_hessian;
+      cuda_best_split_info->left_count = left_count;
+      cuda_best_split_info->right_sum_gradients = sum_right_gradient;
+      cuda_best_split_info->right_sum_hessians = sum_right_hessian;
+      cuda_best_split_info->right_count = right_count;
+      cuda_best_split_info->left_value = left_output;
+      cuda_best_split_info->left_gain = CUDALeafSplits::GetLeafGainGivenOutput<USE_L1>(sum_left_gradient,
+        sum_left_hessian, lambda_l1, l2, left_output);
+      cuda_best_split_info->right_value = right_output;
+      cuda_best_split_info->right_gain = CUDALeafSplits::GetLeafGainGivenOutput<USE_L1>(sum_right_gradient,
+        sum_right_hessian, lambda_l1, l2, right_output);
+    }
+  }
+}
+
+template <bool USE_RAND, bool USE_L1, bool USE_SMOOTHING, bool IS_LARGER>
+__global__ void FindBestSplitsForLeafKernel_GlobalMemory(
+  // input feature information
+  const int8_t* is_feature_used_bytree,
+  // input task information
+  const int num_tasks,
+  const SplitFindTask* tasks,
+  CUDARandom* cuda_randoms,
+  // input leaf information
+  const CUDALeafSplitsStruct* smaller_leaf_splits,
+  const CUDALeafSplitsStruct* larger_leaf_splits,
+  // input config parameter values
+  const data_size_t min_data_in_leaf,
+  const double min_sum_hessian_in_leaf,
+  const double min_gain_to_split,
+  const double lambda_l1,
+  const double lambda_l2,
+  const double path_smooth,
+  const double cat_smooth,
+  const double cat_l2,
+  const int max_cat_threshold,
+  const int min_data_per_group,
+  // output
+  CUDASplitInfo* cuda_best_split_info,
+  // buffer
+  hist_t* feature_hist_grad_buffer,
+  hist_t* feature_hist_hess_buffer,
+  hist_t* feature_hist_stat_buffer,
+  data_size_t* feature_hist_index_buffer) {
+  const unsigned int task_index = blockIdx.x;
+  const SplitFindTask* task = tasks + task_index;
+  const double parent_gain = IS_LARGER ? larger_leaf_splits->gain : smaller_leaf_splits->gain;
+  const double sum_gradients = IS_LARGER ? larger_leaf_splits->sum_of_gradients : smaller_leaf_splits->sum_of_gradients;
+  const double sum_hessians = (IS_LARGER ? larger_leaf_splits->sum_of_hessians : smaller_leaf_splits->sum_of_hessians) + 2 * kEpsilon;
+  const data_size_t num_data = IS_LARGER ? larger_leaf_splits->num_data_in_leaf : smaller_leaf_splits->num_data_in_leaf;
+  const double parent_output = IS_LARGER ? larger_leaf_splits->leaf_value : smaller_leaf_splits->leaf_value;
+  const unsigned int output_offset = IS_LARGER ? (task_index + num_tasks) : task_index;
+  CUDASplitInfo* out = cuda_best_split_info + output_offset;
+  CUDARandom* cuda_random = USE_RAND ?
+    (IS_LARGER ? cuda_randoms + task_index * 2 + 1 : cuda_randoms + task_index * 2) : nullptr;
+  if (is_feature_used_bytree[task->inner_feature_index]) {
+    const uint32_t hist_offset = task->hist_offset;
+    const hist_t* hist_ptr = (IS_LARGER ? larger_leaf_splits->hist_in_leaf : smaller_leaf_splits->hist_in_leaf) + hist_offset * 2;
+    hist_t* hist_grad_buffer_ptr = feature_hist_grad_buffer + hist_offset * 2;
+    hist_t* hist_hess_buffer_ptr = feature_hist_hess_buffer + hist_offset * 2;
+    hist_t* hist_stat_buffer_ptr = feature_hist_stat_buffer + hist_offset * 2;
+    data_size_t* hist_index_buffer_ptr = feature_hist_index_buffer + hist_offset * 2;
+    if (task->is_categorical) {
+      FindBestSplitsForLeafKernelCategoricalInner_GlobalMemory<USE_RAND, USE_L1, USE_SMOOTHING>(
+        // input feature information
+        hist_ptr,
+        // input task information
+        task,
+        cuda_random,
+        // input config parameter values
+        lambda_l1,
+        lambda_l2,
+        path_smooth,
+        min_data_in_leaf,
+        min_sum_hessian_in_leaf,
+        min_gain_to_split,
+        cat_smooth,
+        cat_l2,
+        max_cat_threshold,
+        min_data_per_group,
+        // input parent node information
+        parent_gain,
+        sum_gradients,
+        sum_hessians,
+        num_data,
+        parent_output,
+        // buffer
+        hist_grad_buffer_ptr,
+        hist_hess_buffer_ptr,
+        hist_stat_buffer_ptr,
+        hist_index_buffer_ptr,
+        // output parameters
+        out);
+    } else {
+      if (!task->reverse) {
+        FindBestSplitsForLeafKernelInner_GlobalMemory<USE_RAND, USE_L1, USE_SMOOTHING, false>(
+          // input feature information
+          hist_ptr,
+          // input task information
+          task,
+          cuda_random,
+          // input config parameter values
+          lambda_l1,
+          lambda_l2,
+          path_smooth,
+          min_data_in_leaf,
+          min_sum_hessian_in_leaf,
+          min_gain_to_split,
+          // input parent node information
+          parent_gain,
+          sum_gradients,
+          sum_hessians,
+          num_data,
+          parent_output,
+          // output parameters
+          out,
+          // buffer
+          hist_grad_buffer_ptr,
+          hist_hess_buffer_ptr);
+      } else {
+        FindBestSplitsForLeafKernelInner_GlobalMemory<USE_RAND, USE_L1, USE_SMOOTHING, true>(
+          // input feature information
+          hist_ptr,
+          // input task information
+          task,
+          cuda_random,
+          // input config parameter values
+          lambda_l1,
+          lambda_l2,
+          path_smooth,
+          min_data_in_leaf,
+          min_sum_hessian_in_leaf,
+          min_gain_to_split,
+          // input parent node information
+          parent_gain,
+          sum_gradients,
+          sum_hessians,
+          num_data,
+          parent_output,
+          // output parameters
+          out,
+          // buffer
+          hist_grad_buffer_ptr,
+          hist_hess_buffer_ptr);
+      }
+    }
+  } else {
+    out->is_valid = false;
+  }
+}
+
+#define LaunchFindBestSplitsForLeafKernel_PARAMS \
+  const CUDALeafSplitsStruct* smaller_leaf_splits, \
+  const CUDALeafSplitsStruct* larger_leaf_splits, \
+  const int smaller_leaf_index, \
+  const int larger_leaf_index, \
+  const bool is_smaller_leaf_valid, \
+  const bool is_larger_leaf_valid
+
+#define LaunchFindBestSplitsForLeafKernel_ARGS \
+  smaller_leaf_splits, \
+  larger_leaf_splits, \
+  smaller_leaf_index, \
+  larger_leaf_index, \
+  is_smaller_leaf_valid, \
+  is_larger_leaf_valid
+
+#define FindBestSplitsForLeafKernel_ARGS \
+  cuda_is_feature_used_bytree_, \
+  num_tasks_, \
+  cuda_split_find_tasks_.RawData(), \
+  cuda_randoms_.RawData(), \
+  smaller_leaf_splits, \
+  larger_leaf_splits, \
+  min_data_in_leaf_, \
+  min_sum_hessian_in_leaf_, \
+  min_gain_to_split_, \
+  lambda_l1_, \
+  lambda_l2_, \
+  path_smooth_, \
+  cat_smooth_, \
+  cat_l2_, \
+  max_cat_threshold_, \
+  min_data_per_group_, \
+  cuda_best_split_info_
+
+#define GlobalMemory_Buffer_ARGS \
+  cuda_feature_hist_grad_buffer_, \
+  cuda_feature_hist_hess_buffer_, \
+  cuda_feature_hist_stat_buffer_, \
+  cuda_feature_hist_index_buffer_
+
+void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernel(LaunchFindBestSplitsForLeafKernel_PARAMS) {
+  if (!is_smaller_leaf_valid && !is_larger_leaf_valid) {
+    return;
+  }
+  if (!extra_trees_) {
+    LaunchFindBestSplitsForLeafKernelInner0<false>(LaunchFindBestSplitsForLeafKernel_ARGS);
+  } else {
+    LaunchFindBestSplitsForLeafKernelInner0<true>(LaunchFindBestSplitsForLeafKernel_ARGS);
+  }
+}
+
+template <bool USE_RAND>
+void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernelInner0(LaunchFindBestSplitsForLeafKernel_PARAMS) {
+  if (lambda_l1_ <= 0.0f) {
+    LaunchFindBestSplitsForLeafKernelInner1<USE_RAND, false>(LaunchFindBestSplitsForLeafKernel_ARGS);
+  } else {
+    LaunchFindBestSplitsForLeafKernelInner1<USE_RAND, true>(LaunchFindBestSplitsForLeafKernel_ARGS);
+  }
+}
+
+template <bool USE_RAND, bool USE_L1>
+void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernelInner1(LaunchFindBestSplitsForLeafKernel_PARAMS) {
+  if (!use_smoothing_) {
+    LaunchFindBestSplitsForLeafKernelInner2<USE_RAND, USE_L1, false>(LaunchFindBestSplitsForLeafKernel_ARGS);
+  } else {
+    LaunchFindBestSplitsForLeafKernelInner2<USE_RAND, USE_L1, true>(LaunchFindBestSplitsForLeafKernel_ARGS);
+  }
+}
+
+template <bool USE_RAND, bool USE_L1, bool USE_SMOOTHING>
+void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernelInner2(LaunchFindBestSplitsForLeafKernel_PARAMS) {
+  if (!use_global_memory_) {
+    if (is_smaller_leaf_valid) {
+      FindBestSplitsForLeafKernel<USE_RAND, USE_L1, USE_SMOOTHING, false>
+        <<<num_tasks_, NUM_THREADS_PER_BLOCK_BEST_SPLIT_FINDER, 0, cuda_streams_[0]>>>
+        (FindBestSplitsForLeafKernel_ARGS);
+    }
+    SynchronizeCUDADevice(__FILE__, __LINE__);
+    if (is_larger_leaf_valid) {
+      FindBestSplitsForLeafKernel<USE_RAND, USE_L1, USE_SMOOTHING, true>
+        <<<num_tasks_, NUM_THREADS_PER_BLOCK_BEST_SPLIT_FINDER, 0, cuda_streams_[1]>>>
+        (FindBestSplitsForLeafKernel_ARGS);
+    }
+  } else {
+    if (is_smaller_leaf_valid) {
+      FindBestSplitsForLeafKernel_GlobalMemory<USE_RAND, USE_L1, USE_SMOOTHING, false>
+        <<<num_tasks_, NUM_THREADS_PER_BLOCK_BEST_SPLIT_FINDER, 0, cuda_streams_[0]>>>
+        (FindBestSplitsForLeafKernel_ARGS, GlobalMemory_Buffer_ARGS);
+    }
+    SynchronizeCUDADevice(__FILE__, __LINE__);
+    if (is_larger_leaf_valid) {
+      FindBestSplitsForLeafKernel_GlobalMemory<USE_RAND, USE_L1, USE_SMOOTHING, true>
+        <<<num_tasks_, NUM_THREADS_PER_BLOCK_BEST_SPLIT_FINDER, 0, cuda_streams_[1]>>>
+        (FindBestSplitsForLeafKernel_ARGS, GlobalMemory_Buffer_ARGS);
+    }
+  }
+}
+
+#undef LaunchFindBestSplitsForLeafKernel_PARAMS
+#undef FindBestSplitsForLeafKernel_ARGS
+#undef GlobalMemory_Buffer_ARGS
+
+__device__ void ReduceBestSplit(bool* found, double* gain, uint32_t* shared_read_index,
+  uint32_t num_features_aligned) {
+  const uint32_t threadIdx_x = threadIdx.x;
+  for (unsigned int s = 1; s < num_features_aligned; s <<= 1) {
+    if (threadIdx_x % (2 * s) == 0 && (threadIdx_x + s) < num_features_aligned) {
+      const uint32_t pos_to_compare = threadIdx_x + s;
+      if ((!found[threadIdx_x] && found[pos_to_compare]) ||
+        (found[threadIdx_x] && found[pos_to_compare] && gain[threadIdx_x] < gain[pos_to_compare])) {
+        found[threadIdx_x] = found[pos_to_compare];
+        gain[threadIdx_x] = gain[pos_to_compare];
+        shared_read_index[threadIdx_x] = shared_read_index[pos_to_compare];
+      }
+    }
+    __syncthreads();
+  }
+}
+
+__global__ void SyncBestSplitForLeafKernel(const int smaller_leaf_index, const int larger_leaf_index,
+  CUDASplitInfo* cuda_leaf_best_split_info,
+  // input parameters
+  const SplitFindTask* tasks,
+  const CUDASplitInfo* cuda_best_split_info,
+  const int num_tasks,
+  const int num_tasks_aligned,
+  const int num_blocks_per_leaf,
+  const bool larger_only,
+  const int num_leaves) {
+  __shared__ double shared_gain_buffer[32];
+  __shared__ bool shared_found_buffer[32];
+  __shared__ uint32_t shared_thread_index_buffer[32];
+  const uint32_t threadIdx_x = threadIdx.x;
+  const uint32_t blockIdx_x = blockIdx.x;
+
+  bool best_found = false;
+  double best_gain = kMinScore;
+  uint32_t shared_read_index = 0;
+
+  const bool is_smaller = (blockIdx_x < static_cast<uint32_t>(num_blocks_per_leaf) && !larger_only);
+  const uint32_t leaf_block_index = (is_smaller || larger_only) ? blockIdx_x : (blockIdx_x - static_cast<uint32_t>(num_blocks_per_leaf));
+  const int task_index = static_cast<int>(leaf_block_index * blockDim.x + threadIdx_x);
+  const uint32_t read_index = is_smaller ?
static_cast(task_index) : static_cast(task_index + num_tasks); + if (task_index < num_tasks) { + best_found = cuda_best_split_info[read_index].is_valid; + best_gain = cuda_best_split_info[read_index].gain; + shared_read_index = read_index; + } else { + best_found = false; + } + + __syncthreads(); + const uint32_t best_read_index = ReduceBestGain(best_gain, best_found, shared_read_index, + shared_gain_buffer, shared_found_buffer, shared_thread_index_buffer); + if (threadIdx.x == 0) { + const int leaf_index_ref = is_smaller ? smaller_leaf_index : larger_leaf_index; + const unsigned buffer_write_pos = static_cast(leaf_index_ref) + leaf_block_index * num_leaves; + CUDASplitInfo* cuda_split_info = cuda_leaf_best_split_info + buffer_write_pos; + const CUDASplitInfo* best_split_info = cuda_best_split_info + best_read_index; + if (best_split_info->is_valid) { + *cuda_split_info = *best_split_info; + cuda_split_info->inner_feature_index = is_smaller ? tasks[best_read_index].inner_feature_index : + tasks[static_cast(best_read_index) - num_tasks].inner_feature_index; + cuda_split_info->is_valid = true; + } else { + cuda_split_info->gain = kMinScore; + cuda_split_info->is_valid = false; + } + } +} + +__global__ void SyncBestSplitForLeafKernelAllBlocks( + const int smaller_leaf_index, + const int larger_leaf_index, + const unsigned int num_blocks_per_leaf, + const int num_leaves, + CUDASplitInfo* cuda_leaf_best_split_info, + const bool larger_only) { + if (!larger_only) { + if (blockIdx.x == 0) { + CUDASplitInfo* smaller_leaf_split_info = cuda_leaf_best_split_info + smaller_leaf_index; + for (unsigned int block_index = 1; block_index < num_blocks_per_leaf; ++block_index) { + const unsigned int leaf_read_pos = static_cast(smaller_leaf_index) + block_index * static_cast(num_leaves); + const CUDASplitInfo* other_split_info = cuda_leaf_best_split_info + leaf_read_pos; + if ((other_split_info->is_valid && smaller_leaf_split_info->is_valid && + other_split_info->gain > smaller_leaf_split_info->gain) || + (!smaller_leaf_split_info->is_valid && other_split_info->is_valid)) { + *smaller_leaf_split_info = *other_split_info; + } + } + } + } + if (larger_leaf_index >= 0) { + if (blockIdx.x == 1 || larger_only) { + CUDASplitInfo* larger_leaf_split_info = cuda_leaf_best_split_info + larger_leaf_index; + for (unsigned int block_index = 1; block_index < num_blocks_per_leaf; ++block_index) { + const unsigned int leaf_read_pos = static_cast(larger_leaf_index) + block_index * static_cast(num_leaves); + const CUDASplitInfo* other_split_info = cuda_leaf_best_split_info + leaf_read_pos; + if ((other_split_info->is_valid && larger_leaf_split_info->is_valid && + other_split_info->gain > larger_leaf_split_info->gain) || + (!larger_leaf_split_info->is_valid && other_split_info->is_valid)) { + *larger_leaf_split_info = *other_split_info; + } + } + } + } +} + +__global__ void SetInvalidLeafSplitInfoKernel( + CUDASplitInfo* cuda_leaf_best_split_info, + const bool is_smaller_leaf_valid, + const bool is_larger_leaf_valid, + const int smaller_leaf_index, + const int larger_leaf_index) { + if (!is_smaller_leaf_valid) { + cuda_leaf_best_split_info[smaller_leaf_index].is_valid = false; + } + if (!is_larger_leaf_valid && larger_leaf_index >= 0) { + cuda_leaf_best_split_info[larger_leaf_index].is_valid = false; + } +} + +void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel( + const int host_smaller_leaf_index, + const int host_larger_leaf_index, + const bool is_smaller_leaf_valid, + const bool is_larger_leaf_valid) { + if 
(!is_smaller_leaf_valid || !is_larger_leaf_valid) { + SetInvalidLeafSplitInfoKernel<<<1, 1>>>( + cuda_leaf_best_split_info_, + is_smaller_leaf_valid, is_larger_leaf_valid, + host_smaller_leaf_index, host_larger_leaf_index); + } + if (!is_smaller_leaf_valid && !is_larger_leaf_valid) { + return; + } + int num_tasks = num_tasks_; + int num_tasks_aligned = 1; + num_tasks -= 1; + while (num_tasks > 0) { + num_tasks_aligned <<= 1; + num_tasks >>= 1; + } + const int num_blocks_per_leaf = (num_tasks_ + NUM_TASKS_PER_SYNC_BLOCK - 1) / NUM_TASKS_PER_SYNC_BLOCK; + if (host_larger_leaf_index >= 0 && is_smaller_leaf_valid && is_larger_leaf_valid) { + SyncBestSplitForLeafKernel<<>>( + host_smaller_leaf_index, + host_larger_leaf_index, + cuda_leaf_best_split_info_, + cuda_split_find_tasks_.RawData(), + cuda_best_split_info_, + num_tasks_, + num_tasks_aligned, + num_blocks_per_leaf, + false, + num_leaves_); + if (num_blocks_per_leaf > 1) { + SyncBestSplitForLeafKernelAllBlocks<<<1, 1, 0, cuda_streams_[0]>>>( + host_smaller_leaf_index, + host_larger_leaf_index, + num_blocks_per_leaf, + num_leaves_, + cuda_leaf_best_split_info_, + false); + } + SynchronizeCUDADevice(__FILE__, __LINE__); + SyncBestSplitForLeafKernel<<>>( + host_smaller_leaf_index, + host_larger_leaf_index, + cuda_leaf_best_split_info_, + cuda_split_find_tasks_.RawData(), + cuda_best_split_info_, + num_tasks_, + num_tasks_aligned, + num_blocks_per_leaf, + true, + num_leaves_); + if (num_blocks_per_leaf > 1) { + SyncBestSplitForLeafKernelAllBlocks<<<1, 1, 0, cuda_streams_[1]>>>( + host_smaller_leaf_index, + host_larger_leaf_index, + num_blocks_per_leaf, + num_leaves_, + cuda_leaf_best_split_info_, + true); + } + } else { + const bool larger_only = (!is_smaller_leaf_valid && is_larger_leaf_valid); + SyncBestSplitForLeafKernel<<>>( + host_smaller_leaf_index, + host_larger_leaf_index, + cuda_leaf_best_split_info_, + cuda_split_find_tasks_.RawData(), + cuda_best_split_info_, + num_tasks_, + num_tasks_aligned, + num_blocks_per_leaf, + larger_only, + num_leaves_); + if (num_blocks_per_leaf > 1) { + SynchronizeCUDADevice(__FILE__, __LINE__); + SyncBestSplitForLeafKernelAllBlocks<<<1, 1>>>( + host_smaller_leaf_index, + host_larger_leaf_index, + num_blocks_per_leaf, + num_leaves_, + cuda_leaf_best_split_info_, + larger_only); + } + } +} + +__global__ void FindBestFromAllSplitsKernel(const int cur_num_leaves, + CUDASplitInfo* cuda_leaf_best_split_info, + int* cuda_best_split_info_buffer) { + __shared__ double gain_shared_buffer[32]; + __shared__ int leaf_index_shared_buffer[32]; + double thread_best_gain = kMinScore; + int thread_best_leaf_index = -1; + const int threadIdx_x = static_cast(threadIdx.x); + for (int leaf_index = threadIdx_x; leaf_index < cur_num_leaves; leaf_index += static_cast(blockDim.x)) { + const double leaf_best_gain = cuda_leaf_best_split_info[leaf_index].gain; + if (cuda_leaf_best_split_info[leaf_index].is_valid && leaf_best_gain > thread_best_gain) { + thread_best_gain = leaf_best_gain; + thread_best_leaf_index = leaf_index; + } + } + const int best_leaf_index = ReduceBestGainForLeaves(thread_best_gain, thread_best_leaf_index, gain_shared_buffer, leaf_index_shared_buffer); + if (threadIdx_x == 0) { + cuda_best_split_info_buffer[6] = best_leaf_index; + if (best_leaf_index != -1) { + cuda_leaf_best_split_info[best_leaf_index].is_valid = false; + cuda_leaf_best_split_info[cur_num_leaves].is_valid = false; + cuda_best_split_info_buffer[7] = cuda_leaf_best_split_info[best_leaf_index].num_cat_threshold; + } + } +} + +__global__ void 
PrepareLeafBestSplitInfo(const int smaller_leaf_index, const int larger_leaf_index, + int* cuda_best_split_info_buffer, + const CUDASplitInfo* cuda_leaf_best_split_info) { + const unsigned int threadIdx_x = blockIdx.x; + if (threadIdx_x == 0) { + cuda_best_split_info_buffer[0] = cuda_leaf_best_split_info[smaller_leaf_index].inner_feature_index; + } else if (threadIdx_x == 1) { + cuda_best_split_info_buffer[1] = cuda_leaf_best_split_info[smaller_leaf_index].threshold; + } else if (threadIdx_x == 2) { + cuda_best_split_info_buffer[2] = cuda_leaf_best_split_info[smaller_leaf_index].default_left; + } + if (larger_leaf_index >= 0) { + if (threadIdx_x == 3) { + cuda_best_split_info_buffer[3] = cuda_leaf_best_split_info[larger_leaf_index].inner_feature_index; + } else if (threadIdx_x == 4) { + cuda_best_split_info_buffer[4] = cuda_leaf_best_split_info[larger_leaf_index].threshold; + } else if (threadIdx_x == 5) { + cuda_best_split_info_buffer[5] = cuda_leaf_best_split_info[larger_leaf_index].default_left; + } + } +} + +void CUDABestSplitFinder::LaunchFindBestFromAllSplitsKernel( + const int cur_num_leaves, + const int smaller_leaf_index, const int larger_leaf_index, + int* smaller_leaf_best_split_feature, + uint32_t* smaller_leaf_best_split_threshold, + uint8_t* smaller_leaf_best_split_default_left, + int* larger_leaf_best_split_feature, + uint32_t* larger_leaf_best_split_threshold, + uint8_t* larger_leaf_best_split_default_left, + int* best_leaf_index, + int* num_cat_threshold) { + FindBestFromAllSplitsKernel<<<1, NUM_THREADS_FIND_BEST_LEAF, 0, cuda_streams_[1]>>>(cur_num_leaves, + cuda_leaf_best_split_info_, + cuda_best_split_info_buffer_); + PrepareLeafBestSplitInfo<<<6, 1, 0, cuda_streams_[0]>>>(smaller_leaf_index, larger_leaf_index, + cuda_best_split_info_buffer_, + cuda_leaf_best_split_info_); + std::vector host_leaf_best_split_info_buffer(8, 0); + SynchronizeCUDADevice(__FILE__, __LINE__); + CopyFromCUDADeviceToHost(host_leaf_best_split_info_buffer.data(), cuda_best_split_info_buffer_, 8, __FILE__, __LINE__); + *smaller_leaf_best_split_feature = host_leaf_best_split_info_buffer[0]; + *smaller_leaf_best_split_threshold = static_cast(host_leaf_best_split_info_buffer[1]); + *smaller_leaf_best_split_default_left = static_cast(host_leaf_best_split_info_buffer[2]); + if (larger_leaf_index >= 0) { + *larger_leaf_best_split_feature = host_leaf_best_split_info_buffer[3]; + *larger_leaf_best_split_threshold = static_cast(host_leaf_best_split_info_buffer[4]); + *larger_leaf_best_split_default_left = static_cast(host_leaf_best_split_info_buffer[5]); + } + *best_leaf_index = host_leaf_best_split_info_buffer[6]; + *num_cat_threshold = host_leaf_best_split_info_buffer[7]; +} + +__global__ void AllocateCatVectorsKernel( + CUDASplitInfo* cuda_split_infos, size_t len, + const int max_num_categories_in_split, + const bool has_categorical_feature, + uint32_t* cat_threshold_vec, + int* cat_threshold_real_vec) { + const size_t i = threadIdx.x + blockIdx.x * blockDim.x; + if (i < len) { + if (has_categorical_feature) { + cuda_split_infos[i].cat_threshold = cat_threshold_vec + i * max_num_categories_in_split; + cuda_split_infos[i].cat_threshold_real = cat_threshold_real_vec + i * max_num_categories_in_split; + cuda_split_infos[i].num_cat_threshold = 0; + } else { + cuda_split_infos[i].cat_threshold = nullptr; + cuda_split_infos[i].cat_threshold_real = nullptr; + cuda_split_infos[i].num_cat_threshold = 0; + } + } +} + +void CUDABestSplitFinder::LaunchAllocateCatVectorsKernel( + CUDASplitInfo* cuda_split_infos, 
uint32_t* cat_threshold_vec, int* cat_threshold_real_vec, size_t len) { + const int num_blocks = (static_cast(len) + NUM_THREADS_PER_BLOCK_BEST_SPLIT_FINDER - 1) / NUM_THREADS_PER_BLOCK_BEST_SPLIT_FINDER; + AllocateCatVectorsKernel<<>>( + cuda_split_infos, len, max_num_categories_in_split_, has_categorical_feature_, cat_threshold_vec, cat_threshold_real_vec); +} + +__global__ void InitCUDARandomKernel( + const int seed, + const int num_tasks, + CUDARandom* cuda_randoms) { + const int task_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + if (task_index < num_tasks) { + cuda_randoms[task_index].SetSeed(seed + task_index); + } +} + +void CUDABestSplitFinder::LaunchInitCUDARandomKernel() { + const int num_blocks = (static_cast(cuda_randoms_.Size()) + + NUM_THREADS_PER_BLOCK_BEST_SPLIT_FINDER - 1) / NUM_THREADS_PER_BLOCK_BEST_SPLIT_FINDER; + InitCUDARandomKernel<<>>(extra_seed_, + static_cast(cuda_randoms_.Size()), cuda_randoms_.RawData()); +} + +} // namespace LightGBM + +#endif // USE_CUDA_EXP diff --git a/src/treelearner/cuda/cuda_best_split_finder.hpp b/src/treelearner/cuda/cuda_best_split_finder.hpp new file mode 100644 index 000000000000..3efc6011c83b --- /dev/null +++ b/src/treelearner/cuda/cuda_best_split_finder.hpp @@ -0,0 +1,206 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. + */ + +#ifndef LIGHTGBM_TREELEARNER_CUDA_CUDA_BEST_SPLIT_FINDER_HPP_ +#define LIGHTGBM_TREELEARNER_CUDA_CUDA_BEST_SPLIT_FINDER_HPP_ + +#ifdef USE_CUDA_EXP + +#include +#include + +#include + +#include +#include + +#include "cuda_leaf_splits.hpp" + +#define NUM_THREADS_PER_BLOCK_BEST_SPLIT_FINDER (256) +#define NUM_THREADS_FIND_BEST_LEAF (256) +#define NUM_TASKS_PER_SYNC_BLOCK (1024) + +namespace LightGBM { + +struct SplitFindTask { + int inner_feature_index; + bool reverse; + bool skip_default_bin; + bool na_as_missing; + bool assume_out_default_left; + bool is_categorical; + bool is_one_hot; + uint32_t hist_offset; + uint8_t mfb_offset; + uint32_t num_bin; + uint32_t default_bin; + int rand_threshold; +}; + +class CUDABestSplitFinder { + public: + CUDABestSplitFinder( + const hist_t* cuda_hist, + const Dataset* train_data, + const std::vector& feature_hist_offsets, + const Config* config); + + ~CUDABestSplitFinder(); + + void InitFeatureMetaInfo(const Dataset* train_data); + + void Init(); + + void InitCUDAFeatureMetaInfo(); + + void BeforeTrain(const std::vector& is_feature_used_bytree); + + void FindBestSplitsForLeaf( + const CUDALeafSplitsStruct* smaller_leaf_splits, + const CUDALeafSplitsStruct* larger_leaf_splits, + const int smaller_leaf_index, + const int larger_leaf_index, + const data_size_t num_data_in_smaller_leaf, + const data_size_t num_data_in_larger_leaf, + const double sum_hessians_in_smaller_leaf, + const double sum_hessians_in_larger_leaf); + + const CUDASplitInfo* FindBestFromAllSplits( + const int cur_num_leaves, + const int smaller_leaf_index, + const int larger_leaf_index, + int* smaller_leaf_best_split_feature, + uint32_t* smaller_leaf_best_split_threshold, + uint8_t* smaller_leaf_best_split_default_left, + int* larger_leaf_best_split_feature, + uint32_t* larger_leaf_best_split_threshold, + uint8_t* larger_leaf_best_split_default_left, + int* best_leaf_index, + int* num_cat_threshold); + + void ResetTrainingData( + const hist_t* cuda_hist, + const Dataset* train_data, + const std::vector& feature_hist_offsets); + + void ResetConfig(const Config* 
config, const hist_t* cuda_hist); + + private: + #define LaunchFindBestSplitsForLeafKernel_PARAMS \ + const CUDALeafSplitsStruct* smaller_leaf_splits, \ + const CUDALeafSplitsStruct* larger_leaf_splits, \ + const int smaller_leaf_index, \ + const int larger_leaf_index, \ + const bool is_smaller_leaf_valid, \ + const bool is_larger_leaf_valid + + void LaunchFindBestSplitsForLeafKernel(LaunchFindBestSplitsForLeafKernel_PARAMS); + + template + void LaunchFindBestSplitsForLeafKernelInner0(LaunchFindBestSplitsForLeafKernel_PARAMS); + + template + void LaunchFindBestSplitsForLeafKernelInner1(LaunchFindBestSplitsForLeafKernel_PARAMS); + + template + void LaunchFindBestSplitsForLeafKernelInner2(LaunchFindBestSplitsForLeafKernel_PARAMS); + + #undef LaunchFindBestSplitsForLeafKernel_PARAMS + + void LaunchSyncBestSplitForLeafKernel( + const int host_smaller_leaf_index, + const int host_larger_leaf_index, + const bool is_smaller_leaf_valid, + const bool is_larger_leaf_valid); + + void LaunchFindBestFromAllSplitsKernel( + const int cur_num_leaves, + const int smaller_leaf_index, + const int larger_leaf_index, + int* smaller_leaf_best_split_feature, + uint32_t* smaller_leaf_best_split_threshold, + uint8_t* smaller_leaf_best_split_default_left, + int* larger_leaf_best_split_feature, + uint32_t* larger_leaf_best_split_threshold, + uint8_t* larger_leaf_best_split_default_left, + int* best_leaf_index, + data_size_t* num_cat_threshold); + + void AllocateCatVectors(CUDASplitInfo* cuda_split_infos, uint32_t* cat_threshold_vec, int* cat_threshold_real_vec, size_t len); + + void LaunchAllocateCatVectorsKernel(CUDASplitInfo* cuda_split_infos, uint32_t* cat_threshold_vec, int* cat_threshold_real_vec, size_t len); + + void LaunchInitCUDARandomKernel(); + + // Host memory + int num_features_; + int num_leaves_; + int max_num_bin_in_feature_; + std::vector feature_hist_offsets_; + std::vector feature_mfb_offsets_; + std::vector feature_default_bins_; + std::vector feature_num_bins_; + std::vector feature_missing_type_; + double lambda_l1_; + double lambda_l2_; + data_size_t min_data_in_leaf_; + double min_sum_hessian_in_leaf_; + double min_gain_to_split_; + double cat_smooth_; + double cat_l2_; + int max_cat_threshold_; + int min_data_per_group_; + int max_cat_to_onehot_; + bool extra_trees_; + int extra_seed_; + bool use_smoothing_; + double path_smooth_; + std::vector cuda_streams_; + // for best split find tasks + std::vector split_find_tasks_; + int num_tasks_; + // use global memory + bool use_global_memory_; + // number of total bins in the dataset + const int num_total_bin_; + // has categorical feature + bool has_categorical_feature_; + // maximum number of bins of categorical features + int max_num_categorical_bin_; + // marks whether a feature is categorical + std::vector is_categorical_; + + // CUDA memory, held by this object + // for per leaf best split information + CUDASplitInfo* cuda_leaf_best_split_info_; + // for best split information when finding best split + CUDASplitInfo* cuda_best_split_info_; + // best split information buffer, to be copied to host + int* cuda_best_split_info_buffer_; + // find best split task information + CUDAVector cuda_split_find_tasks_; + int8_t* cuda_is_feature_used_bytree_; + // used when finding best split with global memory + hist_t* cuda_feature_hist_grad_buffer_; + hist_t* cuda_feature_hist_hess_buffer_; + hist_t* cuda_feature_hist_stat_buffer_; + data_size_t* cuda_feature_hist_index_buffer_; + uint32_t* cuda_cat_threshold_leaf_; + int* 
cuda_cat_threshold_real_leaf_; + uint32_t* cuda_cat_threshold_feature_; + int* cuda_cat_threshold_real_feature_; + int max_num_categories_in_split_; + // used for extremely randomized trees + CUDAVector cuda_randoms_; + + // CUDA memory, held by other object + const hist_t* cuda_hist_; +}; + +} // namespace LightGBM + +#endif // USE_CUDA_EXP +#endif // LIGHTGBM_TREELEARNER_CUDA_CUDA_BEST_SPLIT_FINDER_HPP_ diff --git a/src/treelearner/cuda/cuda_data_partition.cpp b/src/treelearner/cuda/cuda_data_partition.cpp new file mode 100644 index 000000000000..2321d1112c52 --- /dev/null +++ b/src/treelearner/cuda/cuda_data_partition.cpp @@ -0,0 +1,402 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. + */ + +#ifdef USE_CUDA_EXP + +#include +#include + +#include "cuda_data_partition.hpp" + +namespace LightGBM { + +CUDADataPartition::CUDADataPartition( + const Dataset* train_data, + const int num_total_bin, + const int num_leaves, + const int num_threads, + hist_t* cuda_hist): + + num_data_(train_data->num_data()), + num_features_(train_data->num_features()), + num_total_bin_(num_total_bin), + num_leaves_(num_leaves), + num_threads_(num_threads), + cuda_hist_(cuda_hist) { + CalcBlockDim(num_data_); + max_num_split_indices_blocks_ = grid_dim_; + cur_num_leaves_ = 1; + cuda_column_data_ = train_data->cuda_column_data(); + + is_categorical_feature_.resize(train_data->num_features(), false); + is_single_feature_in_column_.resize(train_data->num_features(), false); + for (int feature_index = 0; feature_index < train_data->num_features(); ++feature_index) { + if (train_data->FeatureBinMapper(feature_index)->bin_type() == BinType::CategoricalBin) { + is_categorical_feature_[feature_index] = true; + } + const int feature_group_index = train_data->Feature2Group(feature_index); + if (!train_data->IsMultiGroup(feature_group_index)) { + if ((feature_index == 0 || train_data->Feature2Group(feature_index - 1) != feature_group_index) && + (feature_index == train_data->num_features() - 1 || train_data->Feature2Group(feature_index + 1) != feature_group_index)) { + is_single_feature_in_column_[feature_index] = true; + } + } else { + is_single_feature_in_column_[feature_index] = true; + } + } + + cuda_data_indices_ = nullptr; + cuda_leaf_data_start_ = nullptr; + cuda_leaf_data_end_ = nullptr; + cuda_leaf_num_data_ = nullptr; + cuda_hist_pool_ = nullptr; + cuda_leaf_output_ = nullptr; + cuda_block_to_left_offset_ = nullptr; + cuda_data_index_to_leaf_index_ = nullptr; + cuda_block_data_to_left_offset_ = nullptr; + cuda_block_data_to_right_offset_ = nullptr; + cuda_out_data_indices_in_leaf_ = nullptr; + cuda_split_info_buffer_ = nullptr; + cuda_num_data_ = nullptr; + cuda_add_train_score_ = nullptr; +} + +CUDADataPartition::~CUDADataPartition() { + DeallocateCUDAMemory(&cuda_data_indices_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_leaf_data_start_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_leaf_data_end_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_leaf_num_data_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_hist_pool_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_leaf_output_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_block_to_left_offset_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_data_index_to_leaf_index_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_block_data_to_left_offset_, __FILE__, __LINE__); + 
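+  // [editorial aside] AllocateCUDAMemory/DeallocateCUDAMemory are LightGBM CUDA utility
+  // helpers defined outside this diff; __FILE__ and __LINE__ are forwarded so a failing
+  // cudaMalloc or cudaFree can be traced to the exact call site. As a rough,
+  // non-authoritative sketch only -- assuming the helper simply wraps cudaFree(),
+  // reports the forwarded call site on failure, and nulls the pointer -- the
+  // deallocation side presumably amounts to:
+  //
+  //   #include <cstdio>
+  //   #include <cstdlib>
+  //   #include <cuda_runtime.h>
+  //
+  //   template <typename T>
+  //   void DeallocateCUDAMemory(T** ptr, const char* file, const int line) {
+  //     if (*ptr != nullptr) {
+  //       const cudaError_t status = cudaFree(reinterpret_cast<void*>(*ptr));
+  //       if (status != cudaSuccess) {
+  //         std::fprintf(stderr, "[CUDA] %s at %s:%d\n",
+  //                      cudaGetErrorString(status), file, line);
+  //         std::abort();
+  //       }
+  //       *ptr = nullptr;  // guard against double free
+  //     }
+  //   }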
DeallocateCUDAMemory(&cuda_block_data_to_right_offset_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_out_data_indices_in_leaf_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_split_info_buffer_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_num_data_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_add_train_score_, __FILE__, __LINE__); + CUDASUCCESS_OR_FATAL(cudaStreamDestroy(cuda_streams_[0])); + CUDASUCCESS_OR_FATAL(cudaStreamDestroy(cuda_streams_[1])); + CUDASUCCESS_OR_FATAL(cudaStreamDestroy(cuda_streams_[2])); + CUDASUCCESS_OR_FATAL(cudaStreamDestroy(cuda_streams_[3])); + cuda_streams_.clear(); + cuda_streams_.shrink_to_fit(); +} + +void CUDADataPartition::Init() { + // allocate CUDA memory + AllocateCUDAMemory(&cuda_data_indices_, static_cast(num_data_), __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_leaf_data_start_, static_cast(num_leaves_), __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_leaf_data_end_, static_cast(num_leaves_), __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_leaf_num_data_, static_cast(num_leaves_), __FILE__, __LINE__); + // leave some space for alignment + AllocateCUDAMemory(&cuda_block_to_left_offset_, static_cast(num_data_), __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_data_index_to_leaf_index_, static_cast(num_data_), __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_block_data_to_left_offset_, static_cast(max_num_split_indices_blocks_) + 1, __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_block_data_to_right_offset_, static_cast(max_num_split_indices_blocks_) + 1, __FILE__, __LINE__); + SetCUDAMemory(cuda_block_data_to_left_offset_, 0, static_cast(max_num_split_indices_blocks_) + 1, __FILE__, __LINE__); + SetCUDAMemory(cuda_block_data_to_right_offset_, 0, static_cast(max_num_split_indices_blocks_) + 1, __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_out_data_indices_in_leaf_, static_cast(num_data_), __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_hist_pool_, static_cast(num_leaves_), __FILE__, __LINE__); + CopyFromHostToCUDADevice(cuda_hist_pool_, &cuda_hist_, 1, __FILE__, __LINE__); + + AllocateCUDAMemory(&cuda_split_info_buffer_, 16, __FILE__, __LINE__); + + AllocateCUDAMemory(&cuda_leaf_output_, static_cast(num_leaves_), __FILE__, __LINE__); + + cuda_streams_.resize(4); + gpuAssert(cudaStreamCreate(&cuda_streams_[0]), __FILE__, __LINE__); + gpuAssert(cudaStreamCreate(&cuda_streams_[1]), __FILE__, __LINE__); + gpuAssert(cudaStreamCreate(&cuda_streams_[2]), __FILE__, __LINE__); + gpuAssert(cudaStreamCreate(&cuda_streams_[3]), __FILE__, __LINE__); + + InitCUDAMemoryFromHostMemory(&cuda_num_data_, &num_data_, 1, __FILE__, __LINE__); + add_train_score_.resize(num_data_, 0.0f); + AllocateCUDAMemory(&cuda_add_train_score_, static_cast(num_data_), __FILE__, __LINE__); + use_bagging_ = false; + used_indices_ = nullptr; +} + +void CUDADataPartition::BeforeTrain() { + if (!use_bagging_) { + LaunchFillDataIndicesBeforeTrain(); + } + SetCUDAMemory(cuda_leaf_num_data_, 0, static_cast(num_leaves_), __FILE__, __LINE__); + SetCUDAMemory(cuda_leaf_data_start_, 0, static_cast(num_leaves_), __FILE__, __LINE__); + SetCUDAMemory(cuda_leaf_data_end_, 0, static_cast(num_leaves_), __FILE__, __LINE__); + SynchronizeCUDADevice(__FILE__, __LINE__); + if (!use_bagging_) { + CopyFromCUDADeviceToCUDADevice(cuda_leaf_num_data_, cuda_num_data_, 1, __FILE__, __LINE__); + CopyFromCUDADeviceToCUDADevice(cuda_leaf_data_end_, cuda_num_data_, 1, __FILE__, __LINE__); + } else { + CopyFromHostToCUDADevice(cuda_leaf_num_data_, &num_used_indices_, 1, __FILE__, __LINE__); + 
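+    // [editorial aside] Before the first split, leaf 0 owns every used row, so
+    // cuda_leaf_num_data_[0] (set just above) and cuda_leaf_data_end_[0] (set just
+    // below) both receive num_used_indices_ under bagging, or num_data_ otherwise.
+    // A host-side sketch of the invariant being established (hypothetical names,
+    // illustration only):
+    //
+    //   #include <cassert>
+    //
+    //   // each leaf owns the index range [data_start, data_end), with
+    //   // num_data == data_end - data_start
+    //   inline void CheckRootLeafOwnsAllRows(int data_start0, int data_end0,
+    //                                        int num_data0, int num_used_rows) {
+    //     assert(data_start0 == 0);
+    //     assert(data_end0 == num_used_rows);
+    //     assert(num_data0 == data_end0 - data_start0);
+    //   }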
CopyFromHostToCUDADevice(cuda_leaf_data_end_, &num_used_indices_, 1, __FILE__, __LINE__); + } + SynchronizeCUDADevice(__FILE__, __LINE__); + CopyFromHostToCUDADevice(cuda_hist_pool_, &cuda_hist_, 1, __FILE__, __LINE__); +} + +void CUDADataPartition::Split( + // input best split info + const CUDASplitInfo* best_split_info, + const int left_leaf_index, + const int right_leaf_index, + const int leaf_best_split_feature, + const uint32_t leaf_best_split_threshold, + const uint32_t* categorical_bitset, + const int categorical_bitset_len, + const uint8_t leaf_best_split_default_left, + const data_size_t num_data_in_leaf, + const data_size_t leaf_data_start, + // for leaf information update + CUDALeafSplitsStruct* smaller_leaf_splits, + CUDALeafSplitsStruct* larger_leaf_splits, + // gather information for CPU, used for launching kernels + data_size_t* left_leaf_num_data, + data_size_t* right_leaf_num_data, + data_size_t* left_leaf_start, + data_size_t* right_leaf_start, + double* left_leaf_sum_of_hessians, + double* right_leaf_sum_of_hessians, + double* left_leaf_sum_of_gradients, + double* right_leaf_sum_of_gradients) { + CalcBlockDim(num_data_in_leaf); + global_timer.Start("GenDataToLeftBitVector"); + GenDataToLeftBitVector(num_data_in_leaf, + leaf_best_split_feature, + leaf_best_split_threshold, + categorical_bitset, + categorical_bitset_len, + leaf_best_split_default_left, + leaf_data_start, + left_leaf_index, + right_leaf_index); + global_timer.Stop("GenDataToLeftBitVector"); + global_timer.Start("SplitInner"); + + SplitInner(num_data_in_leaf, + best_split_info, + left_leaf_index, + right_leaf_index, + smaller_leaf_splits, + larger_leaf_splits, + left_leaf_num_data, + right_leaf_num_data, + left_leaf_start, + right_leaf_start, + left_leaf_sum_of_hessians, + right_leaf_sum_of_hessians, + left_leaf_sum_of_gradients, + right_leaf_sum_of_gradients); + global_timer.Stop("SplitInner"); +} + +void CUDADataPartition::GenDataToLeftBitVector( + const data_size_t num_data_in_leaf, + const int split_feature_index, + const uint32_t split_threshold, + const uint32_t* categorical_bitset, + const int categorical_bitset_len, + const uint8_t split_default_left, + const data_size_t leaf_data_start, + const int left_leaf_index, + const int right_leaf_index) { + if (is_categorical_feature_[split_feature_index]) { + LaunchGenDataToLeftBitVectorCategoricalKernel( + num_data_in_leaf, + split_feature_index, + categorical_bitset, + categorical_bitset_len, + split_default_left, + leaf_data_start, + left_leaf_index, + right_leaf_index); + } else { + LaunchGenDataToLeftBitVectorKernel( + num_data_in_leaf, + split_feature_index, + split_threshold, + split_default_left, + leaf_data_start, + left_leaf_index, + right_leaf_index); + } +} + +void CUDADataPartition::SplitInner( + const data_size_t num_data_in_leaf, + const CUDASplitInfo* best_split_info, + const int left_leaf_index, + const int right_leaf_index, + // for leaf splits information update + CUDALeafSplitsStruct* smaller_leaf_splits, + CUDALeafSplitsStruct* larger_leaf_splits, + data_size_t* left_leaf_num_data, + data_size_t* right_leaf_num_data, + data_size_t* left_leaf_start, + data_size_t* right_leaf_start, + double* left_leaf_sum_of_hessians, + double* right_leaf_sum_of_hessians, + double* left_leaf_sum_of_gradients, + double* right_leaf_sum_of_gradients) { + LaunchSplitInnerKernel( + num_data_in_leaf, + best_split_info, + left_leaf_index, + right_leaf_index, + smaller_leaf_splits, + larger_leaf_splits, + left_leaf_num_data, + right_leaf_num_data, + 
left_leaf_start, + right_leaf_start, + left_leaf_sum_of_hessians, + right_leaf_sum_of_hessians, + left_leaf_sum_of_gradients, + right_leaf_sum_of_gradients); + ++cur_num_leaves_; +} + +void CUDADataPartition::UpdateTrainScore(const Tree* tree, double* scores) { + const CUDATree* cuda_tree = nullptr; + std::unique_ptr cuda_tree_ptr; + if (tree->is_cuda_tree()) { + cuda_tree = reinterpret_cast(tree); + } else { + cuda_tree_ptr.reset(new CUDATree(tree)); + cuda_tree = cuda_tree_ptr.get(); + } + const data_size_t num_data_in_root = root_num_data(); + if (use_bagging_) { + // we need restore the order of indices in cuda_data_indices_ + CopyFromHostToCUDADevice(cuda_data_indices_, used_indices_, static_cast(num_used_indices_), __FILE__, __LINE__); + } + LaunchAddPredictionToScoreKernel(cuda_tree->cuda_leaf_value(), cuda_add_train_score_); + CopyFromCUDADeviceToHost(add_train_score_.data(), + cuda_add_train_score_, static_cast(num_data_in_root), __FILE__, __LINE__); + if (!use_bagging_) { + OMP_INIT_EX(); + #pragma omp parallel for schedule(static) num_threads(num_threads_) + for (data_size_t data_index = 0; data_index < num_data_in_root; ++data_index) { + OMP_LOOP_EX_BEGIN(); + scores[data_index] += add_train_score_[data_index]; + OMP_LOOP_EX_END(); + } + OMP_THROW_EX(); + } else { + OMP_INIT_EX(); + #pragma omp parallel for schedule(static) num_threads(num_threads_) + for (data_size_t data_index = 0; data_index < num_data_in_root; ++data_index) { + OMP_LOOP_EX_BEGIN(); + scores[used_indices_[data_index]] += add_train_score_[data_index]; + OMP_LOOP_EX_END(); + } + OMP_THROW_EX(); + } +} + +void CUDADataPartition::CalcBlockDim(const data_size_t num_data_in_leaf) { + const int min_num_blocks = num_data_in_leaf <= 100 ? 1 : 80; + const int num_blocks = std::max(min_num_blocks, (num_data_in_leaf + SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION - 1) / SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION); + int split_indices_block_size_data_partition = (num_data_in_leaf + num_blocks - 1) / num_blocks - 1; + CHECK_GT(split_indices_block_size_data_partition, 0); + int split_indices_block_size_data_partition_aligned = 1; + while (split_indices_block_size_data_partition > 0) { + split_indices_block_size_data_partition_aligned <<= 1; + split_indices_block_size_data_partition >>= 1; + } + const int num_blocks_final = (num_data_in_leaf + split_indices_block_size_data_partition_aligned - 1) / split_indices_block_size_data_partition_aligned; + grid_dim_ = num_blocks_final; + block_dim_ = split_indices_block_size_data_partition_aligned; +} + +void CUDADataPartition::SetUsedDataIndices(const data_size_t* used_indices, const data_size_t num_used_indices) { + use_bagging_ = true; + num_used_indices_ = num_used_indices; + used_indices_ = used_indices; + CopyFromHostToCUDADevice(cuda_data_indices_, used_indices, static_cast(num_used_indices), __FILE__, __LINE__); + LaunchFillDataIndexToLeafIndex(); +} + +void CUDADataPartition::ResetTrainingData(const Dataset* train_data, const int num_total_bin, hist_t* cuda_hist) { + const data_size_t old_num_data = num_data_; + num_data_ = train_data->num_data(); + num_features_ = train_data->num_features(); + num_total_bin_ = num_total_bin; + cuda_column_data_ = train_data->cuda_column_data(); + cuda_hist_ = cuda_hist; + CopyFromHostToCUDADevice(cuda_hist_pool_, &cuda_hist_, 1, __FILE__, __LINE__); + CopyFromHostToCUDADevice(cuda_num_data_, &num_data_, 1, __FILE__, __LINE__); + if (num_data_ > old_num_data) { + CalcBlockDim(num_data_); + const int old_max_num_split_indices_blocks = 
max_num_split_indices_blocks_; + max_num_split_indices_blocks_ = grid_dim_; + if (max_num_split_indices_blocks_ > old_max_num_split_indices_blocks) { + DeallocateCUDAMemory(&cuda_block_data_to_left_offset_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_block_data_to_right_offset_, __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_block_data_to_left_offset_, static_cast(max_num_split_indices_blocks_) + 1, __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_block_data_to_right_offset_, static_cast(max_num_split_indices_blocks_) + 1, __FILE__, __LINE__); + SetCUDAMemory(cuda_block_data_to_left_offset_, 0, static_cast(max_num_split_indices_blocks_) + 1, __FILE__, __LINE__); + SetCUDAMemory(cuda_block_data_to_right_offset_, 0, static_cast(max_num_split_indices_blocks_) + 1, __FILE__, __LINE__); + } + DeallocateCUDAMemory(&cuda_data_indices_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_block_to_left_offset_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_data_index_to_leaf_index_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_out_data_indices_in_leaf_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_add_train_score_, __FILE__, __LINE__); + add_train_score_.resize(num_data_, 0.0f); + + AllocateCUDAMemory(&cuda_data_indices_, static_cast(num_data_), __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_block_to_left_offset_, static_cast(num_data_), __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_data_index_to_leaf_index_, static_cast(num_data_), __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_out_data_indices_in_leaf_, static_cast(num_data_), __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_add_train_score_, static_cast(num_data_), __FILE__, __LINE__); + } + used_indices_ = nullptr; + use_bagging_ = false; + num_used_indices_ = 0; + cur_num_leaves_ = 1; +} + +void CUDADataPartition::ResetConfig(const Config* config, hist_t* cuda_hist) { + num_threads_ = OMP_NUM_THREADS(); + num_leaves_ = config->num_leaves; + cuda_hist_ = cuda_hist; + DeallocateCUDAMemory(&cuda_leaf_data_start_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_leaf_data_end_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_leaf_num_data_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_hist_pool_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_leaf_output_, __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_leaf_data_start_, static_cast(num_leaves_), __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_leaf_data_end_, static_cast(num_leaves_), __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_leaf_num_data_, static_cast(num_leaves_), __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_hist_pool_, static_cast(num_leaves_), __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_leaf_output_, static_cast(num_leaves_), __FILE__, __LINE__); +} + +void CUDADataPartition::SetBaggingSubset(const Dataset* subset) { + num_used_indices_ = subset->num_data(); + used_indices_ = nullptr; + use_bagging_ = true; + cuda_column_data_ = subset->cuda_column_data(); +} + +void CUDADataPartition::ResetByLeafPred(const std::vector& leaf_pred, int num_leaves) { + if (leaf_pred.size() != static_cast(num_data_)) { + DeallocateCUDAMemory(&cuda_data_index_to_leaf_index_, __FILE__, __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_data_index_to_leaf_index_, leaf_pred.data(), leaf_pred.size(), __FILE__, __LINE__); + num_data_ = static_cast(leaf_pred.size()); + } else { + CopyFromHostToCUDADevice(cuda_data_index_to_leaf_index_, leaf_pred.data(), leaf_pred.size(), __FILE__, __LINE__); + } + num_leaves_ = num_leaves; + cur_num_leaves_ = num_leaves; +} + +} // namespace 
LightGBM + +#endif // USE_CUDA_EXP diff --git a/src/treelearner/cuda/cuda_data_partition.cu b/src/treelearner/cuda/cuda_data_partition.cu new file mode 100644 index 000000000000..a2cd87eac6cf --- /dev/null +++ b/src/treelearner/cuda/cuda_data_partition.cu @@ -0,0 +1,1074 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. + */ + +#ifdef USE_CUDA_EXP + +#include "cuda_data_partition.hpp" + +#include +#include + +#include +#include + +namespace LightGBM { + +__global__ void FillDataIndicesBeforeTrainKernel(const data_size_t num_data, + data_size_t* data_indices, int* cuda_data_index_to_leaf_index) { + const unsigned int data_index = threadIdx.x + blockIdx.x * blockDim.x; + if (data_index < num_data) { + data_indices[data_index] = data_index; + cuda_data_index_to_leaf_index[data_index] = 0; + } +} + +__global__ void FillDataIndexToLeafIndexKernel( + const data_size_t num_data, + const data_size_t* data_indices, + int* data_index_to_leaf_index) { + const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + if (data_index < num_data) { + data_index_to_leaf_index[data_indices[data_index]] = 0; + } +} + +void CUDADataPartition::LaunchFillDataIndicesBeforeTrain() { + const data_size_t num_data_in_root = root_num_data(); + const int num_blocks = (num_data_in_root + FILL_INDICES_BLOCK_SIZE_DATA_PARTITION - 1) / FILL_INDICES_BLOCK_SIZE_DATA_PARTITION; + FillDataIndicesBeforeTrainKernel<<>>(num_data_in_root, cuda_data_indices_, cuda_data_index_to_leaf_index_); +} + +void CUDADataPartition::LaunchFillDataIndexToLeafIndex() { + const data_size_t num_data_in_root = root_num_data(); + const int num_blocks = (num_data_in_root + FILL_INDICES_BLOCK_SIZE_DATA_PARTITION - 1) / FILL_INDICES_BLOCK_SIZE_DATA_PARTITION; + FillDataIndexToLeafIndexKernel<<>>(num_data_in_root, cuda_data_indices_, cuda_data_index_to_leaf_index_); +} + +__device__ __forceinline__ void PrepareOffset(const data_size_t num_data_in_leaf, uint16_t* block_to_left_offset, + data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, + const uint16_t thread_to_left_offset_cnt, uint16_t* shared_mem_buffer) { + const unsigned int threadIdx_x = threadIdx.x; + const unsigned int blockDim_x = blockDim.x; + const uint16_t thread_to_left_offset = ShufflePrefixSum(thread_to_left_offset_cnt, shared_mem_buffer); + const data_size_t num_data_in_block = (blockIdx.x + 1) * blockDim_x <= num_data_in_leaf ? 
+    num_data_in_leaf - static_cast<data_size_t>(blockIdx.x * blockDim_x);
+  if (static_cast<data_size_t>(threadIdx_x) < num_data_in_block) {
+    block_to_left_offset[threadIdx_x] = thread_to_left_offset;
+  }
+  if (threadIdx_x == blockDim_x - 1) {
+    if (num_data_in_block > 0) {
+      const data_size_t data_to_left = static_cast<data_size_t>(thread_to_left_offset);
+      block_to_left_offset_buffer[blockIdx.x + 1] = data_to_left;
+      block_to_right_offset_buffer[blockIdx.x + 1] = num_data_in_block - data_to_left;
+    } else {
+      block_to_left_offset_buffer[blockIdx.x + 1] = 0;
+      block_to_right_offset_buffer[blockIdx.x + 1] = 0;
+    }
+  }
+}
+
+template <typename T>
+__device__ bool CUDAFindInBitset(const uint32_t* bits, int n, T pos) {
+  int i1 = pos / 32;
+  if (i1 >= n) {
+    return false;
+  }
+  int i2 = pos % 32;
+  return (bits[i1] >> i2) & 1;
+}
+
+#define UpdateDataIndexToLeafIndexKernel_PARAMS \
+  const BIN_TYPE* column_data, \
+  const data_size_t num_data_in_leaf, \
+  const data_size_t* data_indices_in_leaf, \
+  const uint32_t th, \
+  const uint32_t t_zero_bin, \
+  const uint32_t max_bin, \
+  const uint32_t min_bin, \
+  const int left_leaf_index, \
+  const int right_leaf_index, \
+  const int default_leaf_index, \
+  const int missing_default_leaf_index
+
+#define UpdateDataIndexToLeafIndex_ARGS \
+  column_data, \
+  num_data_in_leaf, \
+  data_indices_in_leaf, th, \
+  t_zero_bin, \
+  max_bin, \
+  min_bin, \
+  left_leaf_index, \
+  right_leaf_index, \
+  default_leaf_index, \
+  missing_default_leaf_index
+
+template <typename BIN_TYPE, bool MIN_IS_MAX, bool MISSING_IS_ZERO, bool MISSING_IS_NA, bool MFB_IS_ZERO, bool MFB_IS_NA, bool MAX_TO_LEFT, bool USE_MIN_BIN>
+__global__ void UpdateDataIndexToLeafIndexKernel(
+  UpdateDataIndexToLeafIndexKernel_PARAMS,
+  int* cuda_data_index_to_leaf_index) {
+  const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (local_data_index < num_data_in_leaf) {
+    const unsigned int global_data_index = data_indices_in_leaf[local_data_index];
+    const uint32_t bin = static_cast<uint32_t>(column_data[global_data_index]);
+    if (!MIN_IS_MAX) {
+      if ((MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) ||
+        (MISSING_IS_NA && !MFB_IS_NA && bin == max_bin)) {
+        cuda_data_index_to_leaf_index[global_data_index] = missing_default_leaf_index;
+      } else if ((USE_MIN_BIN && (bin < min_bin || bin > max_bin)) ||
+        (!USE_MIN_BIN && bin == 0)) {
+        if ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO && MFB_IS_ZERO)) {
+          cuda_data_index_to_leaf_index[global_data_index] = missing_default_leaf_index;
+        } else {
+          cuda_data_index_to_leaf_index[global_data_index] = default_leaf_index;
+        }
+      } else if (bin > th) {
+        cuda_data_index_to_leaf_index[global_data_index] = right_leaf_index;
+      } else {
+        cuda_data_index_to_leaf_index[global_data_index] = left_leaf_index;
+      }
+    } else {
+      if (MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) {
+        cuda_data_index_to_leaf_index[global_data_index] = missing_default_leaf_index;
+      } else if (bin != max_bin) {
+        if ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO && MFB_IS_ZERO)) {
+          cuda_data_index_to_leaf_index[global_data_index] = missing_default_leaf_index;
+        } else {
+          cuda_data_index_to_leaf_index[global_data_index] = default_leaf_index;
+        }
+      } else {
+        if (MISSING_IS_NA && !MFB_IS_NA) {
+          cuda_data_index_to_leaf_index[global_data_index] = missing_default_leaf_index;
+        } else {
+          if (!MAX_TO_LEFT) {
+            cuda_data_index_to_leaf_index[global_data_index] = right_leaf_index;
+          } else {
+            cuda_data_index_to_leaf_index[global_data_index] = left_leaf_index;
+          }
+        }
+      }
+    }
+  }
+}
+
+template <typename BIN_TYPE>
+void CUDADataPartition::LaunchUpdateDataIndexToLeafIndexKernel(
+  UpdateDataIndexToLeafIndexKernel_PARAMS,
+  const bool missing_is_zero,
+  const bool missing_is_na,
+  const bool mfb_is_zero,
+  const bool mfb_is_na,
+  const bool max_to_left,
+  const bool is_single_feature_in_column) {
+  if (min_bin < max_bin) {
+    if (!missing_is_zero) {
+      LaunchUpdateDataIndexToLeafIndexKernel_Inner0<BIN_TYPE, false, false>
+        (UpdateDataIndexToLeafIndex_ARGS, missing_is_na, mfb_is_zero, mfb_is_na, max_to_left, is_single_feature_in_column);
+    } else {
+      LaunchUpdateDataIndexToLeafIndexKernel_Inner0<BIN_TYPE, false, true>
+        (UpdateDataIndexToLeafIndex_ARGS, missing_is_na, mfb_is_zero, mfb_is_na, max_to_left, is_single_feature_in_column);
+    }
+  } else {
+    if (!missing_is_zero) {
+      LaunchUpdateDataIndexToLeafIndexKernel_Inner0<BIN_TYPE, true, false>
+        (UpdateDataIndexToLeafIndex_ARGS, missing_is_na, mfb_is_zero, mfb_is_na, max_to_left, is_single_feature_in_column);
+    } else {
+      LaunchUpdateDataIndexToLeafIndexKernel_Inner0<BIN_TYPE, true, true>
+        (UpdateDataIndexToLeafIndex_ARGS, missing_is_na, mfb_is_zero, mfb_is_na, max_to_left, is_single_feature_in_column);
+    }
+  }
+}
+
+template <typename BIN_TYPE, bool MIN_IS_MAX, bool MISSING_IS_ZERO>
+void CUDADataPartition::LaunchUpdateDataIndexToLeafIndexKernel_Inner0(
+  UpdateDataIndexToLeafIndexKernel_PARAMS,
+  const bool missing_is_na,
+  const bool mfb_is_zero,
+  const bool mfb_is_na,
+  const bool max_to_left,
+  const bool is_single_feature_in_column) {
+  if (!missing_is_na) {
+    LaunchUpdateDataIndexToLeafIndexKernel_Inner1<BIN_TYPE, MIN_IS_MAX, MISSING_IS_ZERO, false>
+      (UpdateDataIndexToLeafIndex_ARGS, mfb_is_zero, mfb_is_na, max_to_left, is_single_feature_in_column);
+  } else {
+    LaunchUpdateDataIndexToLeafIndexKernel_Inner1<BIN_TYPE, MIN_IS_MAX, MISSING_IS_ZERO, true>
+      (UpdateDataIndexToLeafIndex_ARGS, mfb_is_zero, mfb_is_na, max_to_left, is_single_feature_in_column);
+  }
+}
+
+template <typename BIN_TYPE, bool MIN_IS_MAX, bool MISSING_IS_ZERO, bool MISSING_IS_NA>
+void CUDADataPartition::LaunchUpdateDataIndexToLeafIndexKernel_Inner1(
+  UpdateDataIndexToLeafIndexKernel_PARAMS,
+  const bool mfb_is_zero,
+  const bool mfb_is_na,
+  const bool max_to_left,
+  const bool is_single_feature_in_column) {
+  if (!mfb_is_zero) {
+    LaunchUpdateDataIndexToLeafIndexKernel_Inner2<BIN_TYPE, MIN_IS_MAX, MISSING_IS_ZERO, MISSING_IS_NA, false>
+      (UpdateDataIndexToLeafIndex_ARGS, mfb_is_na, max_to_left, is_single_feature_in_column);
+  } else {
+    LaunchUpdateDataIndexToLeafIndexKernel_Inner2<BIN_TYPE, MIN_IS_MAX, MISSING_IS_ZERO, MISSING_IS_NA, true>
+      (UpdateDataIndexToLeafIndex_ARGS, mfb_is_na, max_to_left, is_single_feature_in_column);
+  }
+}
+
+template <typename BIN_TYPE, bool MIN_IS_MAX, bool MISSING_IS_ZERO, bool MISSING_IS_NA, bool MFB_IS_ZERO>
+void CUDADataPartition::LaunchUpdateDataIndexToLeafIndexKernel_Inner2(
+  UpdateDataIndexToLeafIndexKernel_PARAMS,
+  const bool mfb_is_na,
+  const bool max_to_left,
+  const bool is_single_feature_in_column) {
+  if (!mfb_is_na) {
+    LaunchUpdateDataIndexToLeafIndexKernel_Inner3<BIN_TYPE, MIN_IS_MAX, MISSING_IS_ZERO, MISSING_IS_NA, MFB_IS_ZERO, false>
+      (UpdateDataIndexToLeafIndex_ARGS, max_to_left, is_single_feature_in_column);
+  } else {
+    LaunchUpdateDataIndexToLeafIndexKernel_Inner3<BIN_TYPE, MIN_IS_MAX, MISSING_IS_ZERO, MISSING_IS_NA, MFB_IS_ZERO, true>
+      (UpdateDataIndexToLeafIndex_ARGS, max_to_left, is_single_feature_in_column);
+  }
+}
+
+template <typename BIN_TYPE, bool MIN_IS_MAX, bool MISSING_IS_ZERO, bool MISSING_IS_NA, bool MFB_IS_ZERO, bool MFB_IS_NA>
+void CUDADataPartition::LaunchUpdateDataIndexToLeafIndexKernel_Inner3(
+  UpdateDataIndexToLeafIndexKernel_PARAMS,
+  const bool max_to_left,
+  const bool is_single_feature_in_column) {
+  if (!max_to_left) {
+    LaunchUpdateDataIndexToLeafIndexKernel_Inner4<BIN_TYPE, MIN_IS_MAX, MISSING_IS_ZERO, MISSING_IS_NA, MFB_IS_ZERO, MFB_IS_NA, false>
+      (UpdateDataIndexToLeafIndex_ARGS, is_single_feature_in_column);
+  } else {
+    LaunchUpdateDataIndexToLeafIndexKernel_Inner4<BIN_TYPE, MIN_IS_MAX, MISSING_IS_ZERO, MISSING_IS_NA, MFB_IS_ZERO, MFB_IS_NA, true>
+      (UpdateDataIndexToLeafIndex_ARGS, is_single_feature_in_column);
+  }
+}
+
+template <typename BIN_TYPE, bool MIN_IS_MAX, bool MISSING_IS_ZERO, bool MISSING_IS_NA, bool MFB_IS_ZERO, bool MFB_IS_NA, bool MAX_TO_LEFT>
+void CUDADataPartition::LaunchUpdateDataIndexToLeafIndexKernel_Inner4(
+  UpdateDataIndexToLeafIndexKernel_PARAMS,
+  const bool is_single_feature_in_column) {
+  if (!is_single_feature_in_column) {
+    UpdateDataIndexToLeafIndexKernel<BIN_TYPE, MIN_IS_MAX, MISSING_IS_ZERO, MISSING_IS_NA, MFB_IS_ZERO, MFB_IS_NA, MAX_TO_LEFT, true>
+      <<<grid_dim_, block_dim_>>>(
+        UpdateDataIndexToLeafIndex_ARGS,
+        cuda_data_index_to_leaf_index_);
+  } else {
+    UpdateDataIndexToLeafIndexKernel<BIN_TYPE, MIN_IS_MAX, MISSING_IS_ZERO, MISSING_IS_NA, MFB_IS_ZERO, MFB_IS_NA, MAX_TO_LEFT, false>
+      <<<grid_dim_, block_dim_>>>(
+        UpdateDataIndexToLeafIndex_ARGS,
+        cuda_data_index_to_leaf_index_);
+  }
+}
+
+#define GenDataToLeftBitVectorKernel_PARMS \
+  const BIN_TYPE* column_data, \
+  const data_size_t num_data_in_leaf, \
+  const data_size_t* data_indices_in_leaf, \
+  const uint32_t th, \
+  const uint32_t t_zero_bin, \
+  const uint32_t max_bin, \
+  const uint32_t min_bin, \
+  const uint8_t split_default_to_left, \
+  const uint8_t split_missing_default_to_left
+
+#define GenBitVector_ARGS \
+  column_data, \
+  num_data_in_leaf, \
+  data_indices_in_leaf, \
+  th, \
+  t_zero_bin, \
+  max_bin, \
+  min_bin, \
+  split_default_to_left, \
+  split_missing_default_to_left
+
+template <typename BIN_TYPE, bool MIN_IS_MAX, bool MISSING_IS_ZERO, bool MISSING_IS_NA, bool MFB_IS_ZERO, bool MFB_IS_NA, bool MAX_TO_LEFT, bool USE_MIN_BIN>
+__global__ void GenDataToLeftBitVectorKernel(
+  GenDataToLeftBitVectorKernel_PARMS,
+  uint16_t* block_to_left_offset,
+  data_size_t* block_to_left_offset_buffer,
+  data_size_t* block_to_right_offset_buffer) {
+  __shared__ uint16_t shared_mem_buffer[32];
+  uint16_t thread_to_left_offset_cnt = 0;
+  const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (local_data_index < num_data_in_leaf) {
+    const unsigned int global_data_index = data_indices_in_leaf[local_data_index];
+    const uint32_t bin = static_cast<uint32_t>(column_data[global_data_index]);
+    if (!MIN_IS_MAX) {
+      if ((MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) ||
+        (MISSING_IS_NA && !MFB_IS_NA && bin == max_bin)) {
+        thread_to_left_offset_cnt = split_missing_default_to_left;
+      } else if ((USE_MIN_BIN && (bin < min_bin || bin > max_bin)) ||
+        (!USE_MIN_BIN && bin == 0)) {
+        if ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO && MFB_IS_ZERO)) {
+          thread_to_left_offset_cnt = split_missing_default_to_left;
+        } else {
+          thread_to_left_offset_cnt = split_default_to_left;
+        }
+      } else if (bin <= th) {
+        thread_to_left_offset_cnt = 1;
+      }
+    } else {
+      if (MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) {
+        thread_to_left_offset_cnt = split_missing_default_to_left;
+      } else if (bin != max_bin) {
+        if ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO && MFB_IS_ZERO)) {
+          thread_to_left_offset_cnt = split_missing_default_to_left;
+        } else {
+          thread_to_left_offset_cnt = split_default_to_left;
+        }
+      } else {
+        if (MISSING_IS_NA && !MFB_IS_NA) {
+          thread_to_left_offset_cnt = split_missing_default_to_left;
+        } else if (MAX_TO_LEFT) {
+          thread_to_left_offset_cnt = 1;
+        }
+      }
+    }
+  }
+  __syncthreads();
+  PrepareOffset(num_data_in_leaf, block_to_left_offset + blockIdx.x * blockDim.x, block_to_left_offset_buffer, block_to_right_offset_buffer,
+    thread_to_left_offset_cnt, shared_mem_buffer);
+}
+
+template <typename BIN_TYPE>
+void CUDADataPartition::LaunchGenDataToLeftBitVectorKernelInner(
+  GenDataToLeftBitVectorKernel_PARMS,
+  const bool missing_is_zero,
+  const bool missing_is_na,
+  const bool mfb_is_zero,
+  const bool mfb_is_na,
+  const bool max_bin_to_left,
+  const bool is_single_feature_in_column) {
+  if (min_bin < max_bin) {
+    if (!missing_is_zero) {
+      LaunchGenDataToLeftBitVectorKernelInner0<BIN_TYPE, false, false>
+        (GenBitVector_ARGS, missing_is_na, mfb_is_zero, mfb_is_na, max_bin_to_left, is_single_feature_in_column);
+    } else {
+      LaunchGenDataToLeftBitVectorKernelInner0<BIN_TYPE, false, true>
+        (GenBitVector_ARGS, missing_is_na, mfb_is_zero, mfb_is_na, max_bin_to_left, is_single_feature_in_column);
+    }
+  } else {
+    if (!missing_is_zero) {
+      LaunchGenDataToLeftBitVectorKernelInner0<BIN_TYPE, true, false>
+        (GenBitVector_ARGS, missing_is_na, mfb_is_zero, mfb_is_na, max_bin_to_left, is_single_feature_in_column);
+    } else {
+      LaunchGenDataToLeftBitVectorKernelInner0<BIN_TYPE, true, true>
+        (GenBitVector_ARGS, missing_is_na, mfb_is_zero, mfb_is_na, max_bin_to_left, is_single_feature_in_column);
+    }
+  }
+}
+
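The Inner0 through Inner4 layers below peel off one runtime boolean per call and re-emit it as a template argument, so each flag combination compiles to a specialized kernel in which the missing-value handling is a compile-time constant rather than a data-dependent branch. A minimal sketch of the same runtime-to-compile-time dispatch technique, with hypothetical names (ProcessBin stands in for the specialized kernel):

#include <cstdio>

// Specialized worker: the flag is a compile-time constant, so the dead
// branch is eliminated entirely by the compiler.
template <bool MISSING_IS_ZERO>
void ProcessBin(int bin) {
  if (MISSING_IS_ZERO && bin == 0) {
    std::printf("bin %d routed to the missing bucket\n", bin);
  } else {
    std::printf("bin %d routed normally\n", bin);
  }
}

// Runtime-to-compile-time dispatch, one boolean per layer, mirroring the
// LaunchGenDataToLeftBitVectorKernelInner* chain above.
void LaunchProcessBin(int bin, bool missing_is_zero) {
  if (missing_is_zero) {
    ProcessBin<true>(bin);
  } else {
    ProcessBin<false>(bin);
  }
}

int main() {
  LaunchProcessBin(0, true);
  LaunchProcessBin(3, false);
  return 0;
}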
+template +void CUDADataPartition::LaunchGenDataToLeftBitVectorKernelInner0( + GenDataToLeftBitVectorKernel_PARMS, + const bool missing_is_na, + const bool mfb_is_zero, + const bool mfb_is_na, + const bool max_bin_to_left, + const bool is_single_feature_in_column) { + if (!missing_is_na) { + LaunchGenDataToLeftBitVectorKernelInner1 + (GenBitVector_ARGS, mfb_is_zero, mfb_is_na, max_bin_to_left, is_single_feature_in_column); + } else { + LaunchGenDataToLeftBitVectorKernelInner1 + (GenBitVector_ARGS, mfb_is_zero, mfb_is_na, max_bin_to_left, is_single_feature_in_column); + } +} + +template +void CUDADataPartition::LaunchGenDataToLeftBitVectorKernelInner1( + GenDataToLeftBitVectorKernel_PARMS, + const bool mfb_is_zero, + const bool mfb_is_na, + const bool max_bin_to_left, + const bool is_single_feature_in_column) { + if (!mfb_is_zero) { + LaunchGenDataToLeftBitVectorKernelInner2 + (GenBitVector_ARGS, mfb_is_na, max_bin_to_left, is_single_feature_in_column); + } else { + LaunchGenDataToLeftBitVectorKernelInner2 + (GenBitVector_ARGS, mfb_is_na, max_bin_to_left, is_single_feature_in_column); + } +} + +template +void CUDADataPartition::LaunchGenDataToLeftBitVectorKernelInner2( + GenDataToLeftBitVectorKernel_PARMS, + const bool mfb_is_na, + const bool max_bin_to_left, + const bool is_single_feature_in_column) { + if (!mfb_is_na) { + LaunchGenDataToLeftBitVectorKernelInner3 + + (GenBitVector_ARGS, max_bin_to_left, is_single_feature_in_column); + } else { + LaunchGenDataToLeftBitVectorKernelInner3 + + (GenBitVector_ARGS, max_bin_to_left, is_single_feature_in_column); + } +} + +template +void CUDADataPartition::LaunchGenDataToLeftBitVectorKernelInner3( + GenDataToLeftBitVectorKernel_PARMS, + const bool max_bin_to_left, + const bool is_single_feature_in_column) { + if (!max_bin_to_left) { + LaunchGenDataToLeftBitVectorKernelInner4 + + (GenBitVector_ARGS, is_single_feature_in_column); + } else { + LaunchGenDataToLeftBitVectorKernelInner4 + + (GenBitVector_ARGS, is_single_feature_in_column); + } +} + +template +void CUDADataPartition::LaunchGenDataToLeftBitVectorKernelInner4( + GenDataToLeftBitVectorKernel_PARMS, + const bool is_single_feature_in_column) { + if (!is_single_feature_in_column) { + GenDataToLeftBitVectorKernel + + <<>>(GenBitVector_ARGS, + cuda_block_to_left_offset_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_); + } else { + GenDataToLeftBitVectorKernel + + <<>>(GenBitVector_ARGS, + cuda_block_to_left_offset_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_); + } +} + +void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel( + const data_size_t num_data_in_leaf, + const int split_feature_index, + const uint32_t split_threshold, + const uint8_t split_default_left, + const data_size_t leaf_data_start, + const int left_leaf_index, + const int right_leaf_index) { + const bool missing_is_zero = static_cast(cuda_column_data_->feature_missing_is_zero(split_feature_index)); + const bool missing_is_na = static_cast(cuda_column_data_->feature_missing_is_na(split_feature_index)); + const bool mfb_is_zero = static_cast(cuda_column_data_->feature_mfb_is_zero(split_feature_index)); + const bool mfb_is_na = static_cast(cuda_column_data_->feature_mfb_is_na(split_feature_index)); + const bool is_single_feature_in_column = is_single_feature_in_column_[split_feature_index]; + const uint32_t default_bin = cuda_column_data_->feature_default_bin(split_feature_index); + const uint32_t most_freq_bin = cuda_column_data_->feature_most_freq_bin(split_feature_index); + const 
uint32_t min_bin = is_single_feature_in_column ? 1 : cuda_column_data_->feature_min_bin(split_feature_index); + const uint32_t max_bin = cuda_column_data_->feature_max_bin(split_feature_index); + uint32_t th = split_threshold + min_bin; + uint32_t t_zero_bin = min_bin + default_bin; + if (most_freq_bin == 0) { + --th; + --t_zero_bin; + } + uint8_t split_default_to_left = 0; + uint8_t split_missing_default_to_left = 0; + int default_leaf_index = right_leaf_index; + int missing_default_leaf_index = right_leaf_index; + if (most_freq_bin <= split_threshold) { + split_default_to_left = 1; + default_leaf_index = left_leaf_index; + } + if (missing_is_zero || missing_is_na) { + if (split_default_left) { + split_missing_default_to_left = 1; + missing_default_leaf_index = left_leaf_index; + } + } + const int column_index = cuda_column_data_->feature_to_column(split_feature_index); + const uint8_t bit_type = cuda_column_data_->column_bit_type(column_index); + + const bool max_bin_to_left = (max_bin <= th); + + const data_size_t* data_indices_in_leaf = cuda_data_indices_ + leaf_data_start; + const void* column_data_pointer = cuda_column_data_->GetColumnData(column_index); + + if (bit_type == 8) { + const uint8_t* column_data = reinterpret_cast(column_data_pointer); + LaunchGenDataToLeftBitVectorKernelInner( + GenBitVector_ARGS, + missing_is_zero, + missing_is_na, + mfb_is_zero, + mfb_is_na, + max_bin_to_left, + is_single_feature_in_column); + LaunchUpdateDataIndexToLeafIndexKernel( + UpdateDataIndexToLeafIndex_ARGS, + missing_is_zero, + missing_is_na, + mfb_is_zero, + mfb_is_na, + max_bin_to_left, + is_single_feature_in_column); + } else if (bit_type == 16) { + const uint16_t* column_data = reinterpret_cast(column_data_pointer); + LaunchGenDataToLeftBitVectorKernelInner( + GenBitVector_ARGS, + missing_is_zero, + missing_is_na, + mfb_is_zero, + mfb_is_na, + max_bin_to_left, + is_single_feature_in_column); + LaunchUpdateDataIndexToLeafIndexKernel( + UpdateDataIndexToLeafIndex_ARGS, + missing_is_zero, + missing_is_na, + mfb_is_zero, + mfb_is_na, + max_bin_to_left, + is_single_feature_in_column); + } else if (bit_type == 32) { + const uint32_t* column_data = reinterpret_cast(column_data_pointer); + LaunchGenDataToLeftBitVectorKernelInner( + GenBitVector_ARGS, + missing_is_zero, + missing_is_na, + mfb_is_zero, + mfb_is_na, + max_bin_to_left, + is_single_feature_in_column); + LaunchUpdateDataIndexToLeafIndexKernel( + UpdateDataIndexToLeafIndex_ARGS, + missing_is_zero, + missing_is_na, + mfb_is_zero, + mfb_is_na, + max_bin_to_left, + is_single_feature_in_column); + } +} + +#undef UpdateDataIndexToLeafIndexKernel_PARAMS +#undef UpdateDataIndexToLeafIndex_ARGS +#undef GenDataToLeftBitVectorKernel_PARMS +#undef GenBitVector_ARGS + +template +__global__ void UpdateDataIndexToLeafIndexKernel_Categorical( + const data_size_t num_data_in_leaf, const data_size_t* data_indices_in_leaf, + const uint32_t* bitset, const int bitset_len, const BIN_TYPE* column_data, + // values from feature + const uint32_t max_bin, const uint32_t min_bin, const int8_t mfb_offset, + int* cuda_data_index_to_leaf_index, const int left_leaf_index, const int right_leaf_index, + const int default_leaf_index) { + const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; + if (local_data_index < num_data_in_leaf) { + const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; + const uint32_t bin = static_cast(column_data[global_data_index]); + if (USE_MIN_BIN && (bin < min_bin || bin > max_bin)) { + 
cuda_data_index_to_leaf_index[global_data_index] = default_leaf_index; + } else if (!USE_MIN_BIN && bin == 0) { + cuda_data_index_to_leaf_index[global_data_index] = default_leaf_index; + } else if (CUDAFindInBitset(bitset, bitset_len, bin - min_bin + mfb_offset)) { + cuda_data_index_to_leaf_index[global_data_index] = left_leaf_index; + } else { + cuda_data_index_to_leaf_index[global_data_index] = right_leaf_index; + } + } +} + +// for categorical features +template +__global__ void GenDataToLeftBitVectorKernel_Categorical( + const data_size_t num_data_in_leaf, const data_size_t* data_indices_in_leaf, + const uint32_t* bitset, int bitset_len, const BIN_TYPE* column_data, + // values from feature + const uint32_t max_bin, const uint32_t min_bin, const int8_t mfb_offset, + const uint8_t split_default_to_left, + uint16_t* block_to_left_offset, + data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer) { + __shared__ uint16_t shared_mem_buffer[32]; + uint16_t thread_to_left_offset_cnt = 0; + const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; + if (local_data_index < num_data_in_leaf) { + const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; + const uint32_t bin = static_cast(column_data[global_data_index]); + if (USE_MIN_BIN && (bin < min_bin || bin > max_bin)) { + thread_to_left_offset_cnt = split_default_to_left; + } else if (!USE_MIN_BIN && bin == 0) { + thread_to_left_offset_cnt = split_default_to_left; + } else if (CUDAFindInBitset(bitset, bitset_len, bin - min_bin + mfb_offset)) { + thread_to_left_offset_cnt = 1; + } + } + __syncthreads(); + PrepareOffset(num_data_in_leaf, block_to_left_offset + blockIdx.x * blockDim.x, block_to_left_offset_buffer, block_to_right_offset_buffer, + thread_to_left_offset_cnt, shared_mem_buffer); +} + +#define GenBitVector_Categorical_ARGS \ + num_data_in_leaf, data_indices_in_leaf, \ + bitset, bitset_len, \ + column_data, max_bin, min_bin, mfb_offset, split_default_to_left, \ + cuda_block_to_left_offset_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_ + +#define UpdateDataIndexToLeafIndex_Categorical_ARGS \ + num_data_in_leaf, data_indices_in_leaf, \ + bitset, bitset_len, \ + column_data, max_bin, min_bin, mfb_offset, \ + cuda_data_index_to_leaf_index_, left_leaf_index, right_leaf_index, default_leaf_index + +void CUDADataPartition::LaunchGenDataToLeftBitVectorCategoricalKernel( + const data_size_t num_data_in_leaf, + const int split_feature_index, + const uint32_t* bitset, + const int bitset_len, + const uint8_t split_default_left, + const data_size_t leaf_data_start, + const int left_leaf_index, + const int right_leaf_index) { + const data_size_t* data_indices_in_leaf = cuda_data_indices_ + leaf_data_start; + const int column_index = cuda_column_data_->feature_to_column(split_feature_index); + const uint8_t bit_type = cuda_column_data_->column_bit_type(column_index); + const bool is_single_feature_in_column = is_single_feature_in_column_[split_feature_index]; + const uint32_t min_bin = is_single_feature_in_column ? 
1 : cuda_column_data_->feature_min_bin(split_feature_index); + const uint32_t max_bin = cuda_column_data_->feature_max_bin(split_feature_index); + const uint32_t most_freq_bin = cuda_column_data_->feature_most_freq_bin(split_feature_index); + const uint32_t default_bin = cuda_column_data_->feature_default_bin(split_feature_index); + const void* column_data_pointer = cuda_column_data_->GetColumnData(column_index); + const int8_t mfb_offset = static_cast(most_freq_bin == 0); + std::vector host_bitset(bitset_len, 0); + CopyFromCUDADeviceToHost(host_bitset.data(), bitset, bitset_len, __FILE__, __LINE__); + uint8_t split_default_to_left = 0; + int default_leaf_index = right_leaf_index; + if (most_freq_bin > 0 && Common::FindInBitset(host_bitset.data(), bitset_len, most_freq_bin)) { + split_default_to_left = 1; + default_leaf_index = left_leaf_index; + } + if (bit_type == 8) { + const uint8_t* column_data = reinterpret_cast(column_data_pointer); + if (is_single_feature_in_column) { + GenDataToLeftBitVectorKernel_Categorical<<>>(GenBitVector_Categorical_ARGS); + UpdateDataIndexToLeafIndexKernel_Categorical<<>>(UpdateDataIndexToLeafIndex_Categorical_ARGS); + } else { + GenDataToLeftBitVectorKernel_Categorical<<>>(GenBitVector_Categorical_ARGS); + UpdateDataIndexToLeafIndexKernel_Categorical<<>>(UpdateDataIndexToLeafIndex_Categorical_ARGS); + } + } else if (bit_type == 16) { + const uint16_t* column_data = reinterpret_cast(column_data_pointer); + if (is_single_feature_in_column) { + GenDataToLeftBitVectorKernel_Categorical<<>>(GenBitVector_Categorical_ARGS); + UpdateDataIndexToLeafIndexKernel_Categorical<<>>(UpdateDataIndexToLeafIndex_Categorical_ARGS); + } else { + GenDataToLeftBitVectorKernel_Categorical<<>>(GenBitVector_Categorical_ARGS); + UpdateDataIndexToLeafIndexKernel_Categorical<<>>(UpdateDataIndexToLeafIndex_Categorical_ARGS); + } + } else if (bit_type == 32) { + const uint32_t* column_data = reinterpret_cast(column_data_pointer); + if (is_single_feature_in_column) { + GenDataToLeftBitVectorKernel_Categorical<<>>(GenBitVector_Categorical_ARGS); + UpdateDataIndexToLeafIndexKernel_Categorical<<>>(UpdateDataIndexToLeafIndex_Categorical_ARGS); + } else { + GenDataToLeftBitVectorKernel_Categorical<<>>(GenBitVector_Categorical_ARGS); + UpdateDataIndexToLeafIndexKernel_Categorical<<>>(UpdateDataIndexToLeafIndex_Categorical_ARGS); + } + } +} + +#undef GenBitVector_Categorical_ARGS +#undef UpdateDataIndexToLeafIndex_Categorical_ARGS + +__global__ void AggregateBlockOffsetKernel0( + const int left_leaf_index, + const int right_leaf_index, + data_size_t* block_to_left_offset_buffer, + data_size_t* block_to_right_offset_buffer, data_size_t* cuda_leaf_data_start, + data_size_t* cuda_leaf_data_end, data_size_t* cuda_leaf_num_data, const data_size_t* cuda_data_indices, + const data_size_t num_blocks) { + __shared__ uint32_t shared_mem_buffer[32]; + __shared__ uint32_t to_left_total_count; + const data_size_t num_data_in_leaf = cuda_leaf_num_data[left_leaf_index]; + const unsigned int blockDim_x = blockDim.x; + const unsigned int threadIdx_x = threadIdx.x; + const data_size_t num_blocks_plus_1 = num_blocks + 1; + const uint32_t num_blocks_per_thread = (num_blocks_plus_1 + blockDim_x - 1) / blockDim_x; + const uint32_t remain = num_blocks_plus_1 - ((num_blocks_per_thread - 1) * blockDim_x); + const uint32_t remain_offset = remain * num_blocks_per_thread; + uint32_t thread_start_block_index = 0; + uint32_t thread_end_block_index = 0; + if (threadIdx_x < remain) { + thread_start_block_index = threadIdx_x * 
num_blocks_per_thread; + thread_end_block_index = min(thread_start_block_index + num_blocks_per_thread, num_blocks_plus_1); + } else { + thread_start_block_index = remain_offset + (num_blocks_per_thread - 1) * (threadIdx_x - remain); + thread_end_block_index = min(thread_start_block_index + num_blocks_per_thread - 1, num_blocks_plus_1); + } + if (threadIdx.x == 0) { + block_to_right_offset_buffer[0] = 0; + } + __syncthreads(); + for (uint32_t block_index = thread_start_block_index + 1; block_index < thread_end_block_index; ++block_index) { + block_to_left_offset_buffer[block_index] += block_to_left_offset_buffer[block_index - 1]; + block_to_right_offset_buffer[block_index] += block_to_right_offset_buffer[block_index - 1]; + } + __syncthreads(); + uint32_t block_to_left_offset = 0; + uint32_t block_to_right_offset = 0; + if (thread_start_block_index < thread_end_block_index && thread_start_block_index > 1) { + block_to_left_offset = block_to_left_offset_buffer[thread_start_block_index - 1]; + block_to_right_offset = block_to_right_offset_buffer[thread_start_block_index - 1]; + } + block_to_left_offset = ShufflePrefixSum(block_to_left_offset, shared_mem_buffer); + __syncthreads(); + block_to_right_offset = ShufflePrefixSum(block_to_right_offset, shared_mem_buffer); + if (threadIdx_x == blockDim_x - 1) { + to_left_total_count = block_to_left_offset + block_to_left_offset_buffer[num_blocks]; + } + __syncthreads(); + const uint32_t to_left_thread_block_offset = block_to_left_offset; + const uint32_t to_right_thread_block_offset = block_to_right_offset + to_left_total_count; + for (uint32_t block_index = thread_start_block_index; block_index < thread_end_block_index; ++block_index) { + block_to_left_offset_buffer[block_index] += to_left_thread_block_offset; + block_to_right_offset_buffer[block_index] += to_right_thread_block_offset; + } + __syncthreads(); + if (blockIdx.x == 0 && threadIdx.x == 0) { + const data_size_t old_leaf_data_end = cuda_leaf_data_end[left_leaf_index]; + cuda_leaf_data_end[left_leaf_index] = cuda_leaf_data_start[left_leaf_index] + static_cast(to_left_total_count); + cuda_leaf_num_data[left_leaf_index] = static_cast(to_left_total_count); + cuda_leaf_data_start[right_leaf_index] = cuda_leaf_data_end[left_leaf_index]; + cuda_leaf_data_end[right_leaf_index] = old_leaf_data_end; + cuda_leaf_num_data[right_leaf_index] = num_data_in_leaf - static_cast(to_left_total_count); + } +} + +__global__ void AggregateBlockOffsetKernel1( + const int left_leaf_index, + const int right_leaf_index, + data_size_t* block_to_left_offset_buffer, + data_size_t* block_to_right_offset_buffer, data_size_t* cuda_leaf_data_start, + data_size_t* cuda_leaf_data_end, data_size_t* cuda_leaf_num_data, const data_size_t* cuda_data_indices, + const data_size_t num_blocks) { + __shared__ uint32_t shared_mem_buffer[32]; + __shared__ uint32_t to_left_total_count; + const data_size_t num_data_in_leaf = cuda_leaf_num_data[left_leaf_index]; + const unsigned int threadIdx_x = threadIdx.x; + uint32_t block_to_left_offset = 0; + uint32_t block_to_right_offset = 0; + if (threadIdx_x < static_cast(num_blocks)) { + block_to_left_offset = block_to_left_offset_buffer[threadIdx_x + 1]; + block_to_right_offset = block_to_right_offset_buffer[threadIdx_x + 1]; + } + block_to_left_offset = ShufflePrefixSum(block_to_left_offset, shared_mem_buffer); + __syncthreads(); + block_to_right_offset = ShufflePrefixSum(block_to_right_offset, shared_mem_buffer); + if (threadIdx.x == blockDim.x - 1) { + to_left_total_count = 
block_to_left_offset; + } + __syncthreads(); + if (threadIdx_x < static_cast(num_blocks)) { + block_to_left_offset_buffer[threadIdx_x + 1] = block_to_left_offset; + block_to_right_offset_buffer[threadIdx_x + 1] = block_to_right_offset + to_left_total_count; + } + if (threadIdx_x == 0) { + block_to_right_offset_buffer[0] = to_left_total_count; + } + __syncthreads(); + if (blockIdx.x == 0 && threadIdx.x == 0) { + const data_size_t old_leaf_data_end = cuda_leaf_data_end[left_leaf_index]; + cuda_leaf_data_end[left_leaf_index] = cuda_leaf_data_start[left_leaf_index] + static_cast(to_left_total_count); + cuda_leaf_num_data[left_leaf_index] = static_cast(to_left_total_count); + cuda_leaf_data_start[right_leaf_index] = cuda_leaf_data_end[left_leaf_index]; + cuda_leaf_data_end[right_leaf_index] = old_leaf_data_end; + cuda_leaf_num_data[right_leaf_index] = num_data_in_leaf - static_cast(to_left_total_count); + } +} + +__global__ void SplitTreeStructureKernel(const int left_leaf_index, + const int right_leaf_index, + data_size_t* block_to_left_offset_buffer, + data_size_t* block_to_right_offset_buffer, data_size_t* cuda_leaf_data_start, + data_size_t* cuda_leaf_data_end, data_size_t* cuda_leaf_num_data, const data_size_t* cuda_data_indices, + const CUDASplitInfo* best_split_info, + // for leaf splits information update + CUDALeafSplitsStruct* smaller_leaf_splits, + CUDALeafSplitsStruct* larger_leaf_splits, + const int num_total_bin, + hist_t* cuda_hist, hist_t** cuda_hist_pool, + double* cuda_leaf_output, + int* cuda_split_info_buffer) { + const unsigned int to_left_total_cnt = cuda_leaf_num_data[left_leaf_index]; + double* cuda_split_info_buffer_for_hessians = reinterpret_cast(cuda_split_info_buffer + 8); + const unsigned int global_thread_index = blockIdx.x * blockDim.x + threadIdx.x; + if (global_thread_index == 0) { + cuda_leaf_output[left_leaf_index] = best_split_info->left_value; + } else if (global_thread_index == 1) { + cuda_leaf_output[right_leaf_index] = best_split_info->right_value; + } else if (global_thread_index == 2) { + cuda_split_info_buffer[0] = left_leaf_index; + } else if (global_thread_index == 3) { + cuda_split_info_buffer[1] = cuda_leaf_num_data[left_leaf_index]; + } else if (global_thread_index == 4) { + cuda_split_info_buffer[2] = cuda_leaf_data_start[left_leaf_index]; + } else if (global_thread_index == 5) { + cuda_split_info_buffer[3] = right_leaf_index; + } else if (global_thread_index == 6) { + cuda_split_info_buffer[4] = cuda_leaf_num_data[right_leaf_index]; + } else if (global_thread_index == 7) { + cuda_split_info_buffer[5] = cuda_leaf_data_start[right_leaf_index]; + } else if (global_thread_index == 8) { + cuda_split_info_buffer_for_hessians[0] = best_split_info->left_sum_hessians; + cuda_split_info_buffer_for_hessians[2] = best_split_info->left_sum_gradients; + } else if (global_thread_index == 9) { + cuda_split_info_buffer_for_hessians[1] = best_split_info->right_sum_hessians; + cuda_split_info_buffer_for_hessians[3] = best_split_info->right_sum_gradients; + } + + if (cuda_leaf_num_data[left_leaf_index] < cuda_leaf_num_data[right_leaf_index]) { + if (global_thread_index == 0) { + hist_t* parent_hist_ptr = cuda_hist_pool[left_leaf_index]; + cuda_hist_pool[right_leaf_index] = parent_hist_ptr; + cuda_hist_pool[left_leaf_index] = cuda_hist + 2 * right_leaf_index * num_total_bin; + smaller_leaf_splits->hist_in_leaf = cuda_hist_pool[left_leaf_index]; + larger_leaf_splits->hist_in_leaf = cuda_hist_pool[right_leaf_index]; + } else if (global_thread_index == 1) { + 
smaller_leaf_splits->sum_of_gradients = best_split_info->left_sum_gradients; + } else if (global_thread_index == 2) { + smaller_leaf_splits->sum_of_hessians = best_split_info->left_sum_hessians; + } else if (global_thread_index == 3) { + smaller_leaf_splits->num_data_in_leaf = to_left_total_cnt; + } else if (global_thread_index == 4) { + smaller_leaf_splits->gain = best_split_info->left_gain; + } else if (global_thread_index == 5) { + smaller_leaf_splits->leaf_value = best_split_info->left_value; + } else if (global_thread_index == 6) { + smaller_leaf_splits->data_indices_in_leaf = cuda_data_indices; + } else if (global_thread_index == 7) { + larger_leaf_splits->leaf_index = right_leaf_index; + } else if (global_thread_index == 8) { + larger_leaf_splits->sum_of_gradients = best_split_info->right_sum_gradients; + } else if (global_thread_index == 9) { + larger_leaf_splits->sum_of_hessians = best_split_info->right_sum_hessians; + } else if (global_thread_index == 10) { + larger_leaf_splits->num_data_in_leaf = cuda_leaf_num_data[right_leaf_index]; + } else if (global_thread_index == 11) { + larger_leaf_splits->gain = best_split_info->right_gain; + } else if (global_thread_index == 12) { + larger_leaf_splits->leaf_value = best_split_info->right_value; + } else if (global_thread_index == 13) { + larger_leaf_splits->data_indices_in_leaf = cuda_data_indices + cuda_leaf_num_data[left_leaf_index]; + } else if (global_thread_index == 14) { + cuda_split_info_buffer[6] = left_leaf_index; + } else if (global_thread_index == 15) { + cuda_split_info_buffer[7] = right_leaf_index; + } else if (global_thread_index == 16) { + smaller_leaf_splits->leaf_index = left_leaf_index; + } + } else { + if (global_thread_index == 0) { + larger_leaf_splits->leaf_index = left_leaf_index; + } else if (global_thread_index == 1) { + larger_leaf_splits->sum_of_gradients = best_split_info->left_sum_gradients; + } else if (global_thread_index == 2) { + larger_leaf_splits->sum_of_hessians = best_split_info->left_sum_hessians; + } else if (global_thread_index == 3) { + larger_leaf_splits->num_data_in_leaf = to_left_total_cnt; + } else if (global_thread_index == 4) { + larger_leaf_splits->gain = best_split_info->left_gain; + } else if (global_thread_index == 5) { + larger_leaf_splits->leaf_value = best_split_info->left_value; + } else if (global_thread_index == 6) { + larger_leaf_splits->data_indices_in_leaf = cuda_data_indices; + } else if (global_thread_index == 7) { + smaller_leaf_splits->leaf_index = right_leaf_index; + } else if (global_thread_index == 8) { + smaller_leaf_splits->sum_of_gradients = best_split_info->right_sum_gradients; + } else if (global_thread_index == 9) { + smaller_leaf_splits->sum_of_hessians = best_split_info->right_sum_hessians; + } else if (global_thread_index == 10) { + smaller_leaf_splits->num_data_in_leaf = cuda_leaf_num_data[right_leaf_index]; + } else if (global_thread_index == 11) { + smaller_leaf_splits->gain = best_split_info->right_gain; + } else if (global_thread_index == 12) { + smaller_leaf_splits->leaf_value = best_split_info->right_value; + } else if (global_thread_index == 13) { + smaller_leaf_splits->data_indices_in_leaf = cuda_data_indices + cuda_leaf_num_data[left_leaf_index]; + } else if (global_thread_index == 14) { + cuda_hist_pool[right_leaf_index] = cuda_hist + 2 * right_leaf_index * num_total_bin; + smaller_leaf_splits->hist_in_leaf = cuda_hist_pool[right_leaf_index]; + } else if (global_thread_index == 15) { + larger_leaf_splits->hist_in_leaf = 
cuda_hist_pool[left_leaf_index]; + } else if (global_thread_index == 16) { + cuda_split_info_buffer[6] = right_leaf_index; + } else if (global_thread_index == 17) { + cuda_split_info_buffer[7] = left_leaf_index; + } + } +} + +__global__ void SplitInnerKernel(const int left_leaf_index, const int right_leaf_index, + const data_size_t* cuda_leaf_data_start, const data_size_t* cuda_leaf_num_data, + const data_size_t* cuda_data_indices, + const data_size_t* block_to_left_offset_buffer, const data_size_t* block_to_right_offset_buffer, + const uint16_t* block_to_left_offset, data_size_t* out_data_indices_in_leaf) { + const data_size_t leaf_num_data_offset = cuda_leaf_data_start[left_leaf_index]; + const data_size_t num_data_in_leaf = cuda_leaf_num_data[left_leaf_index] + cuda_leaf_num_data[right_leaf_index]; + const unsigned int threadIdx_x = threadIdx.x; + const unsigned int blockDim_x = blockDim.x; + const unsigned int global_thread_index = blockIdx.x * blockDim_x + threadIdx_x; + const data_size_t* cuda_data_indices_in_leaf = cuda_data_indices + leaf_num_data_offset; + const uint16_t* block_to_left_offset_ptr = block_to_left_offset + blockIdx.x * blockDim_x; + const uint32_t to_right_block_offset = block_to_right_offset_buffer[blockIdx.x]; + const uint32_t to_left_block_offset = block_to_left_offset_buffer[blockIdx.x]; + data_size_t* left_out_data_indices_in_leaf = out_data_indices_in_leaf + to_left_block_offset; + data_size_t* right_out_data_indices_in_leaf = out_data_indices_in_leaf + to_right_block_offset; + if (static_cast(global_thread_index) < num_data_in_leaf) { + const uint32_t thread_to_left_offset = (threadIdx_x == 0 ? 0 : block_to_left_offset_ptr[threadIdx_x - 1]); + const bool to_left = block_to_left_offset_ptr[threadIdx_x] > thread_to_left_offset; + if (to_left) { + left_out_data_indices_in_leaf[thread_to_left_offset] = cuda_data_indices_in_leaf[global_thread_index]; + } else { + const uint32_t thread_to_right_offset = threadIdx.x - thread_to_left_offset; + right_out_data_indices_in_leaf[thread_to_right_offset] = cuda_data_indices_in_leaf[global_thread_index]; + } + } +} + +__global__ void CopyDataIndicesKernel( + const data_size_t num_data_in_leaf, + const data_size_t* out_data_indices_in_leaf, + data_size_t* cuda_data_indices) { + const unsigned int threadIdx_x = threadIdx.x; + const unsigned int global_thread_index = blockIdx.x * blockDim.x + threadIdx_x; + if (global_thread_index < num_data_in_leaf) { + cuda_data_indices[global_thread_index] = out_data_indices_in_leaf[global_thread_index]; + } +} + +void CUDADataPartition::LaunchSplitInnerKernel( + const data_size_t num_data_in_leaf, + const CUDASplitInfo* best_split_info, + const int left_leaf_index, + const int right_leaf_index, + // for leaf splits information update + CUDALeafSplitsStruct* smaller_leaf_splits, + CUDALeafSplitsStruct* larger_leaf_splits, + data_size_t* left_leaf_num_data_ref, + data_size_t* right_leaf_num_data_ref, + data_size_t* left_leaf_start_ref, + data_size_t* right_leaf_start_ref, + double* left_leaf_sum_of_hessians_ref, + double* right_leaf_sum_of_hessians_ref, + double* left_leaf_sum_of_gradients_ref, + double* right_leaf_sum_of_gradients_ref) { + int num_blocks_final_ref = grid_dim_ - 1; + int num_blocks_final_aligned = 1; + while (num_blocks_final_ref > 0) { + num_blocks_final_aligned <<= 1; + num_blocks_final_ref >>= 1; + } + global_timer.Start("CUDADataPartition::AggregateBlockOffsetKernel"); + + if (grid_dim_ > AGGREGATE_BLOCK_SIZE_DATA_PARTITION) { + AggregateBlockOffsetKernel0<<<1, 
AGGREGATE_BLOCK_SIZE_DATA_PARTITION, 0, cuda_streams_[0]>>>( + left_leaf_index, + right_leaf_index, + cuda_block_data_to_left_offset_, + cuda_block_data_to_right_offset_, cuda_leaf_data_start_, cuda_leaf_data_end_, + cuda_leaf_num_data_, cuda_data_indices_, + grid_dim_); + } else { + AggregateBlockOffsetKernel1<<<1, num_blocks_final_aligned, 0, cuda_streams_[0]>>>( + left_leaf_index, + right_leaf_index, + cuda_block_data_to_left_offset_, + cuda_block_data_to_right_offset_, cuda_leaf_data_start_, cuda_leaf_data_end_, + cuda_leaf_num_data_, cuda_data_indices_, + grid_dim_); + } + SynchronizeCUDADevice(__FILE__, __LINE__); + global_timer.Stop("CUDADataPartition::AggregateBlockOffsetKernel"); + global_timer.Start("CUDADataPartition::SplitInnerKernel"); + SplitInnerKernel<<>>( + left_leaf_index, right_leaf_index, cuda_leaf_data_start_, cuda_leaf_num_data_, cuda_data_indices_, + cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, cuda_block_to_left_offset_, + cuda_out_data_indices_in_leaf_); + global_timer.Stop("CUDADataPartition::SplitInnerKernel"); + SynchronizeCUDADevice(__FILE__, __LINE__); + + global_timer.Start("CUDADataPartition::SplitTreeStructureKernel"); + SplitTreeStructureKernel<<<4, 5, 0, cuda_streams_[0]>>>(left_leaf_index, right_leaf_index, + cuda_block_data_to_left_offset_, + cuda_block_data_to_right_offset_, cuda_leaf_data_start_, cuda_leaf_data_end_, + cuda_leaf_num_data_, cuda_out_data_indices_in_leaf_, + best_split_info, + smaller_leaf_splits, + larger_leaf_splits, + num_total_bin_, + cuda_hist_, + cuda_hist_pool_, + cuda_leaf_output_, cuda_split_info_buffer_); + global_timer.Stop("CUDADataPartition::SplitTreeStructureKernel"); + std::vector cpu_split_info_buffer(16); + const double* cpu_sum_hessians_info = reinterpret_cast(cpu_split_info_buffer.data() + 8); + global_timer.Start("CUDADataPartition::CopyFromCUDADeviceToHostAsync"); + CopyFromCUDADeviceToHostAsync(cpu_split_info_buffer.data(), cuda_split_info_buffer_, 16, cuda_streams_[0], __FILE__, __LINE__); + SynchronizeCUDADevice(__FILE__, __LINE__); + global_timer.Stop("CUDADataPartition::CopyFromCUDADeviceToHostAsync"); + const data_size_t left_leaf_num_data = cpu_split_info_buffer[1]; + const data_size_t left_leaf_data_start = cpu_split_info_buffer[2]; + const data_size_t right_leaf_num_data = cpu_split_info_buffer[4]; + global_timer.Start("CUDADataPartition::CopyDataIndicesKernel"); + CopyDataIndicesKernel<<>>( + left_leaf_num_data + right_leaf_num_data, cuda_out_data_indices_in_leaf_, cuda_data_indices_ + left_leaf_data_start); + global_timer.Stop("CUDADataPartition::CopyDataIndicesKernel"); + const data_size_t right_leaf_data_start = cpu_split_info_buffer[5]; + *left_leaf_num_data_ref = left_leaf_num_data; + *left_leaf_start_ref = left_leaf_data_start; + *right_leaf_num_data_ref = right_leaf_num_data; + *right_leaf_start_ref = right_leaf_data_start; + *left_leaf_sum_of_hessians_ref = cpu_sum_hessians_info[0]; + *right_leaf_sum_of_hessians_ref = cpu_sum_hessians_info[1]; + *left_leaf_sum_of_gradients_ref = cpu_sum_hessians_info[2]; + *right_leaf_sum_of_gradients_ref = cpu_sum_hessians_info[3]; +} + +template +__global__ void AddPredictionToScoreKernel( + const data_size_t* data_indices_in_leaf, + const double* leaf_value, double* cuda_scores, + const int* cuda_data_index_to_leaf_index, const data_size_t num_data) { + const unsigned int threadIdx_x = threadIdx.x; + const unsigned int blockIdx_x = blockIdx.x; + const unsigned int blockDim_x = blockDim.x; + const data_size_t local_data_index = 
static_cast(blockIdx_x * blockDim_x + threadIdx_x); + if (local_data_index < num_data) { + if (USE_BAGGING) { + const data_size_t global_data_index = data_indices_in_leaf[local_data_index]; + const int leaf_index = cuda_data_index_to_leaf_index[global_data_index]; + const double leaf_prediction_value = leaf_value[leaf_index]; + cuda_scores[local_data_index] = leaf_prediction_value; + } else { + const int leaf_index = cuda_data_index_to_leaf_index[local_data_index]; + const double leaf_prediction_value = leaf_value[leaf_index]; + cuda_scores[local_data_index] = leaf_prediction_value; + } + } +} + +void CUDADataPartition::LaunchAddPredictionToScoreKernel(const double* leaf_value, double* cuda_scores) { + global_timer.Start("CUDADataPartition::AddPredictionToScoreKernel"); + const data_size_t num_data_in_root = root_num_data(); + const int num_blocks = (num_data_in_root + FILL_INDICES_BLOCK_SIZE_DATA_PARTITION - 1) / FILL_INDICES_BLOCK_SIZE_DATA_PARTITION; + if (use_bagging_) { + AddPredictionToScoreKernel<<>>( + cuda_data_indices_, leaf_value, cuda_scores, cuda_data_index_to_leaf_index_, num_data_in_root); + } else { + AddPredictionToScoreKernel<<>>( + cuda_data_indices_, leaf_value, cuda_scores, cuda_data_index_to_leaf_index_, num_data_in_root); + } + SynchronizeCUDADevice(__FILE__, __LINE__); + global_timer.Stop("CUDADataPartition::AddPredictionToScoreKernel"); +} + +} // namespace LightGBM + +#endif // USE_CUDA_EXP diff --git a/src/treelearner/cuda/cuda_data_partition.hpp b/src/treelearner/cuda/cuda_data_partition.hpp new file mode 100644 index 000000000000..c4c58f3ebac0 --- /dev/null +++ b/src/treelearner/cuda/cuda_data_partition.hpp @@ -0,0 +1,394 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. 
+ */ +#ifndef LIGHTGBM_TREELEARNER_CUDA_CUDA_DATA_PARTITION_HPP_ +#define LIGHTGBM_TREELEARNER_CUDA_CUDA_DATA_PARTITION_HPP_ + +#ifdef USE_CUDA_EXP + +#include +#include +#include + +#include + +#include +#include +#include + +#include "cuda_leaf_splits.hpp" + +#define FILL_INDICES_BLOCK_SIZE_DATA_PARTITION (1024) +#define SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION (1024) +#define AGGREGATE_BLOCK_SIZE_DATA_PARTITION (1024) + +namespace LightGBM { + +class CUDADataPartition { + public: + CUDADataPartition( + const Dataset* train_data, + const int num_total_bin, + const int num_leaves, + const int num_threads, + hist_t* cuda_hist); + + ~CUDADataPartition(); + + void Init(); + + void BeforeTrain(); + + void Split( + // input best split info + const CUDASplitInfo* best_split_info, + const int left_leaf_index, + const int right_leaf_index, + const int leaf_best_split_feature, + const uint32_t leaf_best_split_threshold, + const uint32_t* categorical_bitset, + const int categorical_bitset_len, + const uint8_t leaf_best_split_default_left, + const data_size_t num_data_in_leaf, + const data_size_t leaf_data_start, + // for leaf information update + CUDALeafSplitsStruct* smaller_leaf_splits, + CUDALeafSplitsStruct* larger_leaf_splits, + // gather information for CPU, used for launching kernels + data_size_t* left_leaf_num_data, + data_size_t* right_leaf_num_data, + data_size_t* left_leaf_start, + data_size_t* right_leaf_start, + double* left_leaf_sum_of_hessians, + double* right_leaf_sum_of_hessians, + double* left_leaf_sum_of_gradients, + double* right_leaf_sum_of_gradients); + + void UpdateTrainScore(const Tree* tree, double* cuda_scores); + + void SetUsedDataIndices(const data_size_t* used_indices, const data_size_t num_used_indices); + + void SetBaggingSubset(const Dataset* subset); + + void ResetTrainingData(const Dataset* train_data, const int num_total_bin, hist_t* cuda_hist); + + void ResetConfig(const Config* config, hist_t* cuda_hist); + + void ResetByLeafPred(const std::vector& leaf_pred, int num_leaves); + + data_size_t root_num_data() const { + if (use_bagging_) { + return num_used_indices_; + } else { + return num_data_; + } + } + + const data_size_t* cuda_data_indices() const { return cuda_data_indices_; } + + const data_size_t* cuda_leaf_num_data() const { return cuda_leaf_num_data_; } + + const data_size_t* cuda_leaf_data_start() const { return cuda_leaf_data_start_; } + + const int* cuda_data_index_to_leaf_index() const { return cuda_data_index_to_leaf_index_; } + + bool use_bagging() const { return use_bagging_; } + + private: + void CalcBlockDim(const data_size_t num_data_in_leaf); + + void GenDataToLeftBitVector( + const data_size_t num_data_in_leaf, + const int split_feature_index, + const uint32_t split_threshold, + const uint32_t* categorical_bitset, + const int categorical_bitset_len, + const uint8_t split_default_left, + const data_size_t leaf_data_start, + const int left_leaf_index, + const int right_leaf_index); + + void SplitInner( + // input best split info + const data_size_t num_data_in_leaf, + const CUDASplitInfo* best_split_info, + const int left_leaf_index, + const int right_leaf_index, + // for leaf splits information update + CUDALeafSplitsStruct* smaller_leaf_splits, + CUDALeafSplitsStruct* larger_leaf_splits, + // gather information for CPU, used for launching kernels + data_size_t* left_leaf_num_data, + data_size_t* right_leaf_num_data, + data_size_t* left_leaf_start, + data_size_t* right_leaf_start, + double* left_leaf_sum_of_hessians, + double* 
right_leaf_sum_of_hessians, + double* left_leaf_sum_of_gradients, + double* right_leaf_sum_of_gradients); + + // kernel launch functions + void LaunchFillDataIndicesBeforeTrain(); + + void LaunchSplitInnerKernel( + // input best split info + const data_size_t num_data_in_leaf, + const CUDASplitInfo* best_split_info, + const int left_leaf_index, + const int right_leaf_index, + // for leaf splits information update + CUDALeafSplitsStruct* smaller_leaf_splits, + CUDALeafSplitsStruct* larger_leaf_splits, + // gather information for CPU, used for launching kernels + data_size_t* left_leaf_num_data, + data_size_t* right_leaf_num_data, + data_size_t* left_leaf_start, + data_size_t* right_leaf_start, + double* left_leaf_sum_of_hessians, + double* right_leaf_sum_of_hessians, + double* left_leaf_sum_of_gradients, + double* right_leaf_sum_of_gradients); + + void LaunchGenDataToLeftBitVectorKernel( + const data_size_t num_data_in_leaf, + const int split_feature_index, + const uint32_t split_threshold, + const uint8_t split_default_left, + const data_size_t leaf_data_start, + const int left_leaf_index, + const int right_leaf_index); + + void LaunchGenDataToLeftBitVectorCategoricalKernel( + const data_size_t num_data_in_leaf, + const int split_feature_index, + const uint32_t* bitset, + const int bitset_len, + const uint8_t split_default_left, + const data_size_t leaf_data_start, + const int left_leaf_index, + const int right_leaf_index); + +#define GenDataToLeftBitVectorKernel_PARMS \ + const BIN_TYPE* column_data, \ + const data_size_t num_data_in_leaf, \ + const data_size_t* data_indices_in_leaf, \ + const uint32_t th, \ + const uint32_t t_zero_bin, \ + const uint32_t max_bin, \ + const uint32_t min_bin, \ + const uint8_t split_default_to_left, \ + const uint8_t split_missing_default_to_left + + template + void LaunchGenDataToLeftBitVectorKernelInner( + GenDataToLeftBitVectorKernel_PARMS, + const bool missing_is_zero, + const bool missing_is_na, + const bool mfb_is_zero, + const bool mfb_is_na, + const bool max_bin_to_left, + const bool is_single_feature_in_column); + + template + void LaunchGenDataToLeftBitVectorKernelInner0( + GenDataToLeftBitVectorKernel_PARMS, + const bool missing_is_na, + const bool mfb_is_zero, + const bool mfb_is_na, + const bool max_bin_to_left, + const bool is_single_feature_in_column); + + template + void LaunchGenDataToLeftBitVectorKernelInner1( + GenDataToLeftBitVectorKernel_PARMS, + const bool mfb_is_zero, + const bool mfb_is_na, + const bool max_bin_to_left, + const bool is_single_feature_in_column); + + template + void LaunchGenDataToLeftBitVectorKernelInner2( + GenDataToLeftBitVectorKernel_PARMS, + const bool mfb_is_na, + const bool max_bin_to_left, + const bool is_single_feature_in_column); + + template + void LaunchGenDataToLeftBitVectorKernelInner3( + GenDataToLeftBitVectorKernel_PARMS, + const bool max_bin_to_left, + const bool is_single_feature_in_column); + + template + void LaunchGenDataToLeftBitVectorKernelInner4( + GenDataToLeftBitVectorKernel_PARMS, + const bool is_single_feature_in_column); + +#undef GenDataToLeftBitVectorKernel_PARMS + +#define UpdateDataIndexToLeafIndexKernel_PARAMS \ + const BIN_TYPE* column_data, \ + const data_size_t num_data_in_leaf, \ + const data_size_t* data_indices_in_leaf, \ + const uint32_t th, \ + const uint32_t t_zero_bin, \ + const uint32_t max_bin_ref, \ + const uint32_t min_bin_ref, \ + const int left_leaf_index, \ + const int right_leaf_index, \ + const int default_leaf_index, \ + const int missing_default_leaf_index + + 
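Both parameter-list macros in this header exist so that the shared argument list is spelled out exactly once: every layer of the Launch*_Inner* chain re-declares it by expanding the macro and therefore cannot drift out of sync when a parameter is added or removed. A minimal sketch of the technique, with hypothetical names (SHARED_PARAMS, Worker, and Dispatch are illustrative, not part of this patch):

// The shared argument list is written once; every declaration that expands
// the macro picks up changes automatically.
#define SHARED_PARAMS \
  const int* data, \
  int num_data

template <bool FLAG>
void Worker(SHARED_PARAMS) {
  // A real implementation would branch on the compile-time FLAG here.
  (void)data;
  (void)num_data;
}

inline void Dispatch(SHARED_PARAMS, bool flag) {
  if (flag) {
    Worker<true>(data, num_data);   // the macro's parameter names double as argument names
  } else {
    Worker<false>(data, num_data);
  }
}

#undef SHARED_PARAMS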
template + void LaunchUpdateDataIndexToLeafIndexKernel( + UpdateDataIndexToLeafIndexKernel_PARAMS, + const bool missing_is_zero, + const bool missing_is_na, + const bool mfb_is_zero, + const bool mfb_is_na, + const bool max_to_left, + const bool is_single_feature_in_column); + + template + void LaunchUpdateDataIndexToLeafIndexKernel_Inner0( + UpdateDataIndexToLeafIndexKernel_PARAMS, + const bool missing_is_na, + const bool mfb_is_zero, + const bool mfb_is_na, + const bool max_to_left, + const bool is_single_feature_in_column); + + template + void LaunchUpdateDataIndexToLeafIndexKernel_Inner1( + UpdateDataIndexToLeafIndexKernel_PARAMS, + const bool mfb_is_zero, + const bool mfb_is_na, + const bool max_to_left, + const bool is_single_feature_in_column); + + template + void LaunchUpdateDataIndexToLeafIndexKernel_Inner2( + UpdateDataIndexToLeafIndexKernel_PARAMS, + const bool mfb_is_na, + const bool max_to_left, + const bool is_single_feature_in_column); + + template + void LaunchUpdateDataIndexToLeafIndexKernel_Inner3( + UpdateDataIndexToLeafIndexKernel_PARAMS, + const bool max_to_left, + const bool is_single_feature_in_column); + + template + void LaunchUpdateDataIndexToLeafIndexKernel_Inner4( + UpdateDataIndexToLeafIndexKernel_PARAMS, + const bool is_single_feature_in_column); + +#undef UpdateDataIndexToLeafIndexKernel_PARAMS + + void LaunchAddPredictionToScoreKernel(const double* leaf_value, double* cuda_scores); + + void LaunchFillDataIndexToLeafIndex(); + + // Host memory + + // dataset information + /*! \brief number of training data */ + data_size_t num_data_; + /*! \brief number of features in training data */ + int num_features_; + /*! \brief number of total bins in training data */ + int num_total_bin_; + /*! \brief bin data stored by column */ + const CUDAColumnData* cuda_column_data_; + /*! \brief grid dimension when splitting one leaf */ + int grid_dim_; + /*! \brief block dimension when splitting one leaf */ + int block_dim_; + /*! \brief add train score buffer in host */ + mutable std::vector add_train_score_; + /*! \brief data indices used in this iteration */ + const data_size_t* used_indices_; + /*! \brief marks whether a feature is a categorical feature */ + std::vector is_categorical_feature_; + /*! \brief marks whether a feature is the only feature in its group */ + std::vector is_single_feature_in_column_; + + // config information + /*! \brief maximum number of leaves in a tree */ + int num_leaves_; + /*! \brief number of threads */ + int num_threads_; + + // per iteration information + /*! \brief whether bagging is used in this iteration */ + bool use_bagging_; + /*! \brief number of used data indices in this iteration */ + data_size_t num_used_indices_; + + // tree structure information + /*! \brief current number of leaves in tree */ + int cur_num_leaves_; + + // split algorithm related + /*! \brief maximum number of blocks to aggregate after finding bit vector by blocks */ + int max_num_split_indices_blocks_; + + // CUDA streams + /*! \brief cuda streams used for asynchronizing kernel computing and memory copy */ + std::vector cuda_streams_; + + + // CUDA memory, held by this object + + // tree structure information + /*! \brief data indices by leaf */ + data_size_t* cuda_data_indices_; + /*! \brief start position of each leaf in cuda_data_indices_ */ + data_size_t* cuda_leaf_data_start_; + /*! \brief end position of each leaf in cuda_data_indices_ */ + data_size_t* cuda_leaf_data_end_; + /*! 
\brief number of data in each leaf */ + data_size_t* cuda_leaf_num_data_; + /*! \brief records the histogram of each leaf */ + hist_t** cuda_hist_pool_; + /*! \brief records the value of each leaf */ + double* cuda_leaf_output_; + + // split data algorithm related + uint16_t* cuda_block_to_left_offset_; + /*! \brief maps data index to leaf index, for adding scores to training data set */ + int* cuda_data_index_to_leaf_index_; + /*! \brief prefix sum of number of data going to left in all blocks */ + data_size_t* cuda_block_data_to_left_offset_; + /*! \brief prefix sum of number of data going to right in all blocks */ + data_size_t* cuda_block_data_to_right_offset_; + /*! \brief buffer for splitting data indices, will be copied back to cuda_data_indices_ after split */ + data_size_t* cuda_out_data_indices_in_leaf_; + + // split tree structure algorithm related + /*! \brief buffer to store split information, prepared to be copied to cpu */ + int* cuda_split_info_buffer_; + + // dataset information + /*! \brief number of data in training set, for intialization of cuda_leaf_num_data_ and cuda_leaf_data_end_ */ + data_size_t* cuda_num_data_; + + // for train score update + /*! \brief added train score buffer in CUDA */ + double* cuda_add_train_score_; + + + // CUDA memory, held by other object + + // dataset information + /*! \brief beginning of histograms, for initialization of cuda_hist_pool_ */ + hist_t* cuda_hist_; +}; + +} // namespace LightGBM + +#endif // USE_CUDA_EXP +#endif // LIGHTGBM_TREELEARNER_CUDA_CUDA_DATA_PARTITION_HPP_ diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cpp b/src/treelearner/cuda/cuda_histogram_constructor.cpp new file mode 100644 index 000000000000..83227165af19 --- /dev/null +++ b/src/treelearner/cuda/cuda_histogram_constructor.cpp @@ -0,0 +1,196 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. 
+ */ + +#ifdef USE_CUDA_EXP + +#include "cuda_histogram_constructor.hpp" + +#include + +namespace LightGBM { + +CUDAHistogramConstructor::CUDAHistogramConstructor( + const Dataset* train_data, + const int num_leaves, + const int num_threads, + const std::vector& feature_hist_offsets, + const int min_data_in_leaf, + const double min_sum_hessian_in_leaf, + const int gpu_device_id, + const bool gpu_use_dp): + num_data_(train_data->num_data()), + num_features_(train_data->num_features()), + num_leaves_(num_leaves), + num_threads_(num_threads), + min_data_in_leaf_(min_data_in_leaf), + min_sum_hessian_in_leaf_(min_sum_hessian_in_leaf), + gpu_device_id_(gpu_device_id), + gpu_use_dp_(gpu_use_dp) { + InitFeatureMetaInfo(train_data, feature_hist_offsets); + cuda_row_data_.reset(nullptr); + cuda_feature_num_bins_ = nullptr; + cuda_feature_hist_offsets_ = nullptr; + cuda_feature_most_freq_bins_ = nullptr; + cuda_hist_ = nullptr; + cuda_need_fix_histogram_features_ = nullptr; + cuda_need_fix_histogram_features_num_bin_aligned_ = nullptr; +} + +CUDAHistogramConstructor::~CUDAHistogramConstructor() { + DeallocateCUDAMemory(&cuda_feature_num_bins_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_feature_hist_offsets_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_feature_most_freq_bins_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_hist_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_need_fix_histogram_features_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_need_fix_histogram_features_num_bin_aligned_, __FILE__, __LINE__); + gpuAssert(cudaStreamDestroy(cuda_stream_), __FILE__, __LINE__); +} + +void CUDAHistogramConstructor::InitFeatureMetaInfo(const Dataset* train_data, const std::vector& feature_hist_offsets) { + need_fix_histogram_features_.clear(); + need_fix_histogram_features_num_bin_aligend_.clear(); + feature_num_bins_.clear(); + feature_most_freq_bins_.clear(); + for (int feature_index = 0; feature_index < train_data->num_features(); ++feature_index) { + const BinMapper* bin_mapper = train_data->FeatureBinMapper(feature_index); + const uint32_t most_freq_bin = bin_mapper->GetMostFreqBin(); + if (most_freq_bin != 0) { + need_fix_histogram_features_.emplace_back(feature_index); + uint32_t num_bin_ref = static_cast(bin_mapper->num_bin()) - 1; + uint32_t num_bin_aligned = 1; + while (num_bin_ref > 0) { + num_bin_aligned <<= 1; + num_bin_ref >>= 1; + } + need_fix_histogram_features_num_bin_aligend_.emplace_back(num_bin_aligned); + } + feature_num_bins_.emplace_back(static_cast(bin_mapper->num_bin())); + feature_most_freq_bins_.emplace_back(most_freq_bin); + } + feature_hist_offsets_.clear(); + for (size_t i = 0; i < feature_hist_offsets.size(); ++i) { + feature_hist_offsets_.emplace_back(feature_hist_offsets[i]); + } + if (feature_hist_offsets.empty()) { + num_total_bin_ = 0; + } else { + num_total_bin_ = static_cast(feature_hist_offsets.back()); + } +} + +void CUDAHistogramConstructor::BeforeTrain(const score_t* gradients, const score_t* hessians) { + cuda_gradients_ = gradients; + cuda_hessians_ = hessians; + SetCUDAMemory(cuda_hist_, 0, num_total_bin_ * 2 * num_leaves_, __FILE__, __LINE__); +} + +void CUDAHistogramConstructor::Init(const Dataset* train_data, TrainingShareStates* share_state) { + AllocateCUDAMemory(&cuda_hist_, num_total_bin_ * 2 * num_leaves_, __FILE__, __LINE__); + SetCUDAMemory(cuda_hist_, 0, num_total_bin_ * 2 * num_leaves_, __FILE__, __LINE__); + + InitCUDAMemoryFromHostMemory(&cuda_feature_num_bins_, + feature_num_bins_.data(), feature_num_bins_.size(), 
__FILE__, __LINE__); + + InitCUDAMemoryFromHostMemory(&cuda_feature_hist_offsets_, + feature_hist_offsets_.data(), feature_hist_offsets_.size(), __FILE__, __LINE__); + + InitCUDAMemoryFromHostMemory(&cuda_feature_most_freq_bins_, + feature_most_freq_bins_.data(), feature_most_freq_bins_.size(), __FILE__, __LINE__); + + cuda_row_data_.reset(new CUDARowData(train_data, share_state, gpu_device_id_, gpu_use_dp_)); + cuda_row_data_->Init(train_data, share_state); + + CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_stream_)); + + InitCUDAMemoryFromHostMemory(&cuda_need_fix_histogram_features_, need_fix_histogram_features_.data(), need_fix_histogram_features_.size(), __FILE__, __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_need_fix_histogram_features_num_bin_aligned_, need_fix_histogram_features_num_bin_aligend_.data(), + need_fix_histogram_features_num_bin_aligend_.size(), __FILE__, __LINE__); + + if (cuda_row_data_->NumLargeBinPartition() > 0) { + int grid_dim_x = 0, grid_dim_y = 0, block_dim_x = 0, block_dim_y = 0; + CalcConstructHistogramKernelDim(&grid_dim_x, &grid_dim_y, &block_dim_x, &block_dim_y, num_data_); + const size_t buffer_size = static_cast(grid_dim_y) * static_cast(num_total_bin_) * 2; + AllocateCUDAMemory(&cuda_hist_buffer_, buffer_size, __FILE__, __LINE__); + } +} + +void CUDAHistogramConstructor::ConstructHistogramForLeaf( + const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, + const CUDALeafSplitsStruct* cuda_larger_leaf_splits, + const data_size_t num_data_in_smaller_leaf, + const data_size_t num_data_in_larger_leaf, + const double sum_hessians_in_smaller_leaf, + const double sum_hessians_in_larger_leaf) { + if ((num_data_in_smaller_leaf <= min_data_in_leaf_ || sum_hessians_in_smaller_leaf <= min_sum_hessian_in_leaf_) && + (num_data_in_larger_leaf <= min_data_in_leaf_ || sum_hessians_in_larger_leaf <= min_sum_hessian_in_leaf_)) { + return; + } + LaunchConstructHistogramKernel(cuda_smaller_leaf_splits, num_data_in_smaller_leaf); + SynchronizeCUDADevice(__FILE__, __LINE__); + global_timer.Start("CUDAHistogramConstructor::ConstructHistogramForLeaf::LaunchSubtractHistogramKernel"); + LaunchSubtractHistogramKernel(cuda_smaller_leaf_splits, cuda_larger_leaf_splits); + global_timer.Stop("CUDAHistogramConstructor::ConstructHistogramForLeaf::LaunchSubtractHistogramKernel"); +} + +void CUDAHistogramConstructor::CalcConstructHistogramKernelDim( + int* grid_dim_x, + int* grid_dim_y, + int* block_dim_x, + int* block_dim_y, + const data_size_t num_data_in_smaller_leaf) { + *block_dim_x = cuda_row_data_->max_num_column_per_partition(); + *block_dim_y = NUM_THRADS_PER_BLOCK / cuda_row_data_->max_num_column_per_partition(); + *grid_dim_x = cuda_row_data_->num_feature_partitions(); + *grid_dim_y = std::max(min_grid_dim_y_, + ((num_data_in_smaller_leaf + NUM_DATA_PER_THREAD - 1) / NUM_DATA_PER_THREAD + (*block_dim_y) - 1) / (*block_dim_y)); +} + +void CUDAHistogramConstructor::ResetTrainingData(const Dataset* train_data, TrainingShareStates* share_states) { + num_data_ = train_data->num_data(); + num_features_ = train_data->num_features(); + InitFeatureMetaInfo(train_data, share_states->feature_hist_offsets()); + if (feature_num_bins_.size() > 0) { + DeallocateCUDAMemory(&cuda_feature_num_bins_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_feature_hist_offsets_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_feature_most_freq_bins_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_need_fix_histogram_features_, __FILE__, __LINE__); + 
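      // Note: ResetTrainingData() mirrors Init(). The new dataset may bin features
      // differently, so every feature-meta buffer and the histogram memory are freed
      // here and re-created below. Histograms store interleaved (gradient, hessian)
      // pairs, which is why the re-allocations size cuda_hist_ as
      // num_total_bin_ * 2 * num_leaves_ entries.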
DeallocateCUDAMemory(&cuda_need_fix_histogram_features_num_bin_aligned_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_hist_, __FILE__, __LINE__); + } + + AllocateCUDAMemory(&cuda_hist_, num_total_bin_ * 2 * num_leaves_, __FILE__, __LINE__); + SetCUDAMemory(cuda_hist_, 0, num_total_bin_ * 2 * num_leaves_, __FILE__, __LINE__); + + InitCUDAMemoryFromHostMemory(&cuda_feature_num_bins_, + feature_num_bins_.data(), feature_num_bins_.size(), __FILE__, __LINE__); + + InitCUDAMemoryFromHostMemory(&cuda_feature_hist_offsets_, + feature_hist_offsets_.data(), feature_hist_offsets_.size(), __FILE__, __LINE__); + + InitCUDAMemoryFromHostMemory(&cuda_feature_most_freq_bins_, + feature_most_freq_bins_.data(), feature_most_freq_bins_.size(), __FILE__, __LINE__); + + cuda_row_data_.reset(new CUDARowData(train_data, share_states, gpu_device_id_, gpu_use_dp_)); + cuda_row_data_->Init(train_data, share_states); + + InitCUDAMemoryFromHostMemory(&cuda_need_fix_histogram_features_, need_fix_histogram_features_.data(), need_fix_histogram_features_.size(), __FILE__, __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_need_fix_histogram_features_num_bin_aligned_, need_fix_histogram_features_num_bin_aligend_.data(), + need_fix_histogram_features_num_bin_aligend_.size(), __FILE__, __LINE__); +} + +void CUDAHistogramConstructor::ResetConfig(const Config* config) { + num_threads_ = OMP_NUM_THREADS(); + num_leaves_ = config->num_leaves; + min_data_in_leaf_ = config->min_data_in_leaf; + min_sum_hessian_in_leaf_ = config->min_sum_hessian_in_leaf; + DeallocateCUDAMemory(&cuda_hist_, __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_hist_, num_total_bin_ * 2 * num_leaves_, __FILE__, __LINE__); + SetCUDAMemory(cuda_hist_, 0, num_total_bin_ * 2 * num_leaves_, __FILE__, __LINE__); +} + +} // namespace LightGBM + +#endif // USE_CUDA_EXP diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cu b/src/treelearner/cuda/cuda_histogram_constructor.cu new file mode 100644 index 000000000000..e1888f0c4b66 --- /dev/null +++ b/src/treelearner/cuda/cuda_histogram_constructor.cu @@ -0,0 +1,432 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. 
+ */ + +#ifdef USE_CUDA_EXP + +#include "cuda_histogram_constructor.hpp" + +#include + +#include + +namespace LightGBM { + +template +__global__ void CUDAConstructHistogramDenseKernel( + const CUDALeafSplitsStruct* smaller_leaf_splits, + const score_t* cuda_gradients, + const score_t* cuda_hessians, + const BIN_TYPE* data, + const uint32_t* column_hist_offsets, + const uint32_t* column_hist_offsets_full, + const int* feature_partition_column_index_offsets, + const data_size_t num_data) { + const int dim_y = static_cast(gridDim.y * blockDim.y); + const data_size_t num_data_in_smaller_leaf = smaller_leaf_splits->num_data_in_leaf; + const data_size_t num_data_per_thread = (num_data_in_smaller_leaf + dim_y - 1) / dim_y; + const data_size_t* data_indices_ref = smaller_leaf_splits->data_indices_in_leaf; + __shared__ HIST_TYPE shared_hist[SHARED_HIST_SIZE]; + const unsigned int num_threads_per_block = blockDim.x * blockDim.y; + const int partition_column_start = feature_partition_column_index_offsets[blockIdx.x]; + const int partition_column_end = feature_partition_column_index_offsets[blockIdx.x + 1]; + const BIN_TYPE* data_ptr = data + static_cast(partition_column_start) * num_data; + const int num_columns_in_partition = partition_column_end - partition_column_start; + const uint32_t partition_hist_start = column_hist_offsets_full[blockIdx.x]; + const uint32_t partition_hist_end = column_hist_offsets_full[blockIdx.x + 1]; + const uint32_t num_items_in_partition = (partition_hist_end - partition_hist_start) << 1; + const unsigned int thread_idx = threadIdx.x + threadIdx.y * blockDim.x; + for (unsigned int i = thread_idx; i < num_items_in_partition; i += num_threads_per_block) { + shared_hist[i] = 0.0f; + } + __syncthreads(); + const unsigned int blockIdx_y = blockIdx.y; + const data_size_t block_start = (static_cast(blockIdx_y) * blockDim.y) * num_data_per_thread; + const data_size_t* data_indices_ref_this_block = data_indices_ref + block_start; + data_size_t block_num_data = max(0, min(num_data_in_smaller_leaf - block_start, num_data_per_thread * static_cast(blockDim.y))); + const int column_index = static_cast(threadIdx.x) + partition_column_start; + if (threadIdx.x < static_cast(num_columns_in_partition)) { + HIST_TYPE* shared_hist_ptr = shared_hist + (column_hist_offsets[column_index] << 1); + for (data_size_t inner_data_index = static_cast(threadIdx.y); inner_data_index < block_num_data; inner_data_index += blockDim.y) { + const data_size_t data_index = data_indices_ref_this_block[inner_data_index]; + const score_t grad = cuda_gradients[data_index]; + const score_t hess = cuda_hessians[data_index]; + const uint32_t bin = static_cast(data_ptr[static_cast(data_index) * num_columns_in_partition + threadIdx.x]); + const uint32_t pos = bin << 1; + HIST_TYPE* pos_ptr = shared_hist_ptr + pos; + atomicAdd_block(pos_ptr, grad); + atomicAdd_block(pos_ptr + 1, hess); + } + } + __syncthreads(); + hist_t* feature_histogram_ptr = smaller_leaf_splits->hist_in_leaf + (partition_hist_start << 1); + for (unsigned int i = thread_idx; i < num_items_in_partition; i += num_threads_per_block) { + atomicAdd_system(feature_histogram_ptr + i, shared_hist[i]); + } +} + +template +__global__ void CUDAConstructHistogramSparseKernel( + const CUDALeafSplitsStruct* smaller_leaf_splits, + const score_t* cuda_gradients, + const score_t* cuda_hessians, + const BIN_TYPE* data, + const DATA_PTR_TYPE* row_ptr, + const DATA_PTR_TYPE* partition_ptr, + const uint32_t* column_hist_offsets_full, + const data_size_t num_data) { + 
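  // Same scheme as the dense kernel above: each thread block first accumulates a
  // private histogram in shared memory with block-scoped atomics (atomicAdd_block),
  // then flushes it into the leaf's global histogram with atomicAdd_system. The
  // sparse input is read through a CSR-style layout: row_ptr holds the [start, end)
  // range of each row's bins within this feature partition, and partition_ptr holds
  // the offset of the partition's bin values inside `data`.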
const int dim_y = static_cast(gridDim.y * blockDim.y); + const data_size_t num_data_in_smaller_leaf = smaller_leaf_splits->num_data_in_leaf; + const data_size_t num_data_per_thread = (num_data_in_smaller_leaf + dim_y - 1) / dim_y; + const data_size_t* data_indices_ref = smaller_leaf_splits->data_indices_in_leaf; + __shared__ HIST_TYPE shared_hist[SHARED_HIST_SIZE]; + const unsigned int num_threads_per_block = blockDim.x * blockDim.y; + const DATA_PTR_TYPE* block_row_ptr = row_ptr + static_cast(blockIdx.x) * (num_data + 1); + const BIN_TYPE* data_ptr = data + partition_ptr[blockIdx.x]; + const uint32_t partition_hist_start = column_hist_offsets_full[blockIdx.x]; + const uint32_t partition_hist_end = column_hist_offsets_full[blockIdx.x + 1]; + const uint32_t num_items_in_partition = (partition_hist_end - partition_hist_start) << 1; + const unsigned int thread_idx = threadIdx.x + threadIdx.y * blockDim.x; + for (unsigned int i = thread_idx; i < num_items_in_partition; i += num_threads_per_block) { + shared_hist[i] = 0.0f; + } + __syncthreads(); + const unsigned int threadIdx_y = threadIdx.y; + const unsigned int blockIdx_y = blockIdx.y; + const data_size_t block_start = (blockIdx_y * blockDim.y) * num_data_per_thread; + const data_size_t* data_indices_ref_this_block = data_indices_ref + block_start; + data_size_t block_num_data = max(0, min(num_data_in_smaller_leaf - block_start, num_data_per_thread * static_cast(blockDim.y))); + const data_size_t num_iteration_total = (block_num_data + blockDim.y - 1) / blockDim.y; + const data_size_t remainder = block_num_data % blockDim.y; + const data_size_t num_iteration_this = remainder == 0 ? num_iteration_total : num_iteration_total - static_cast(threadIdx_y >= remainder); + data_size_t inner_data_index = static_cast(threadIdx_y); + for (data_size_t i = 0; i < num_iteration_this; ++i) { + const data_size_t data_index = data_indices_ref_this_block[inner_data_index]; + const DATA_PTR_TYPE row_start = block_row_ptr[data_index]; + const DATA_PTR_TYPE row_end = block_row_ptr[data_index + 1]; + const DATA_PTR_TYPE row_size = row_end - row_start; + if (threadIdx.x < row_size) { + const score_t grad = cuda_gradients[data_index]; + const score_t hess = cuda_hessians[data_index]; + const uint32_t bin = static_cast(data_ptr[row_start + threadIdx.x]); + const uint32_t pos = bin << 1; + HIST_TYPE* pos_ptr = shared_hist + pos; + atomicAdd_block(pos_ptr, grad); + atomicAdd_block(pos_ptr + 1, hess); + } + inner_data_index += blockDim.y; + } + __syncthreads(); + hist_t* feature_histogram_ptr = smaller_leaf_splits->hist_in_leaf + (partition_hist_start << 1); + for (unsigned int i = thread_idx; i < num_items_in_partition; i += num_threads_per_block) { + atomicAdd_system(feature_histogram_ptr + i, shared_hist[i]); + } +} + +template +__global__ void CUDAConstructHistogramDenseKernel_GlobalMemory( + const CUDALeafSplitsStruct* smaller_leaf_splits, + const score_t* cuda_gradients, + const score_t* cuda_hessians, + const BIN_TYPE* data, + const uint32_t* column_hist_offsets, + const uint32_t* column_hist_offsets_full, + const int* feature_partition_column_index_offsets, + const data_size_t num_data, + float* global_hist_buffer) { + const int dim_y = static_cast(gridDim.y * blockDim.y); + const data_size_t num_data_in_smaller_leaf = smaller_leaf_splits->num_data_in_leaf; + const data_size_t num_data_per_thread = (num_data_in_smaller_leaf + dim_y - 1) / dim_y; + const data_size_t* data_indices_ref = smaller_leaf_splits->data_indices_in_leaf; + const unsigned int 
num_threads_per_block = blockDim.x * blockDim.y; + const int partition_column_start = feature_partition_column_index_offsets[blockIdx.x]; + const int partition_column_end = feature_partition_column_index_offsets[blockIdx.x + 1]; + const BIN_TYPE* data_ptr = data + static_cast(partition_column_start) * num_data; + const int num_columns_in_partition = partition_column_end - partition_column_start; + const uint32_t partition_hist_start = column_hist_offsets_full[blockIdx.x]; + const uint32_t partition_hist_end = column_hist_offsets_full[blockIdx.x + 1]; + const uint32_t num_items_in_partition = (partition_hist_end - partition_hist_start) << 1; + const unsigned int thread_idx = threadIdx.x + threadIdx.y * blockDim.x; + const int num_total_bin = column_hist_offsets_full[gridDim.x]; + float* shared_hist = global_hist_buffer + (blockIdx.y * num_total_bin + partition_hist_start) * 2; + for (unsigned int i = thread_idx; i < num_items_in_partition; i += num_threads_per_block) { + shared_hist[i] = 0.0f; + } + __syncthreads(); + const unsigned int threadIdx_y = threadIdx.y; + const unsigned int blockIdx_y = blockIdx.y; + const data_size_t block_start = (static_cast(blockIdx_y) * blockDim.y) * num_data_per_thread; + const data_size_t* data_indices_ref_this_block = data_indices_ref + block_start; + data_size_t block_num_data = max(0, min(num_data_in_smaller_leaf - block_start, num_data_per_thread * static_cast(blockDim.y))); + const data_size_t num_iteration_total = (block_num_data + blockDim.y - 1) / blockDim.y; + const data_size_t remainder = block_num_data % blockDim.y; + const data_size_t num_iteration_this = remainder == 0 ? num_iteration_total : num_iteration_total - static_cast(threadIdx_y >= remainder); + data_size_t inner_data_index = static_cast(threadIdx_y); + const int column_index = static_cast(threadIdx.x) + partition_column_start; + if (threadIdx.x < static_cast(num_columns_in_partition)) { + float* shared_hist_ptr = shared_hist + (column_hist_offsets[column_index] << 1); + for (data_size_t i = 0; i < num_iteration_this; ++i) { + const data_size_t data_index = data_indices_ref_this_block[inner_data_index]; + const score_t grad = cuda_gradients[data_index]; + const score_t hess = cuda_hessians[data_index]; + const uint32_t bin = static_cast(data_ptr[static_cast(data_index) * num_columns_in_partition + threadIdx.x]); + const uint32_t pos = bin << 1; + float* pos_ptr = shared_hist_ptr + pos; + atomicAdd_block(pos_ptr, grad); + atomicAdd_block(pos_ptr + 1, hess); + inner_data_index += blockDim.y; + } + } + __syncthreads(); + hist_t* feature_histogram_ptr = smaller_leaf_splits->hist_in_leaf + (partition_hist_start << 1); + for (unsigned int i = thread_idx; i < num_items_in_partition; i += num_threads_per_block) { + atomicAdd_system(feature_histogram_ptr + i, shared_hist[i]); + } +} + +template +__global__ void CUDAConstructHistogramSparseKernel_GlobalMemory( + const CUDALeafSplitsStruct* smaller_leaf_splits, + const score_t* cuda_gradients, + const score_t* cuda_hessians, + const BIN_TYPE* data, + const DATA_PTR_TYPE* row_ptr, + const DATA_PTR_TYPE* partition_ptr, + const uint32_t* column_hist_offsets_full, + const data_size_t num_data, + float* global_hist_buffer) { + const int dim_y = static_cast(gridDim.y * blockDim.y); + const data_size_t num_data_in_smaller_leaf = smaller_leaf_splits->num_data_in_leaf; + const data_size_t num_data_per_thread = (num_data_in_smaller_leaf + dim_y - 1) / dim_y; + const data_size_t* data_indices_ref = smaller_leaf_splits->data_indices_in_leaf; + const unsigned 
int num_threads_per_block = blockDim.x * blockDim.y; + const DATA_PTR_TYPE* block_row_ptr = row_ptr + static_cast(blockIdx.x) * (num_data + 1); + const BIN_TYPE* data_ptr = data + partition_ptr[blockIdx.x]; + const uint32_t partition_hist_start = column_hist_offsets_full[blockIdx.x]; + const uint32_t partition_hist_end = column_hist_offsets_full[blockIdx.x + 1]; + const uint32_t num_items_in_partition = (partition_hist_end - partition_hist_start) << 1; + const unsigned int thread_idx = threadIdx.x + threadIdx.y * blockDim.x; + const int num_total_bin = column_hist_offsets_full[gridDim.x]; + float* shared_hist = global_hist_buffer + (blockIdx.y * num_total_bin + partition_hist_start) * 2; + for (unsigned int i = thread_idx; i < num_items_in_partition; i += num_threads_per_block) { + shared_hist[i] = 0.0f; + } + __syncthreads(); + const unsigned int threadIdx_y = threadIdx.y; + const unsigned int blockIdx_y = blockIdx.y; + const data_size_t block_start = (blockIdx_y * blockDim.y) * num_data_per_thread; + const data_size_t* data_indices_ref_this_block = data_indices_ref + block_start; + data_size_t block_num_data = max(0, min(num_data_in_smaller_leaf - block_start, num_data_per_thread * static_cast(blockDim.y))); + const data_size_t num_iteration_total = (block_num_data + blockDim.y - 1) / blockDim.y; + const data_size_t remainder = block_num_data % blockDim.y; + const data_size_t num_iteration_this = remainder == 0 ? num_iteration_total : num_iteration_total - static_cast(threadIdx_y >= remainder); + data_size_t inner_data_index = static_cast(threadIdx_y); + for (data_size_t i = 0; i < num_iteration_this; ++i) { + const data_size_t data_index = data_indices_ref_this_block[inner_data_index]; + const DATA_PTR_TYPE row_start = block_row_ptr[data_index]; + const DATA_PTR_TYPE row_end = block_row_ptr[data_index + 1]; + const DATA_PTR_TYPE row_size = row_end - row_start; + if (threadIdx.x < row_size) { + const score_t grad = cuda_gradients[data_index]; + const score_t hess = cuda_hessians[data_index]; + const uint32_t bin = static_cast(data_ptr[row_start + threadIdx.x]); + const uint32_t pos = bin << 1; + float* pos_ptr = shared_hist + pos; + atomicAdd_block(pos_ptr, grad); + atomicAdd_block(pos_ptr + 1, hess); + } + inner_data_index += blockDim.y; + } + __syncthreads(); + hist_t* feature_histogram_ptr = smaller_leaf_splits->hist_in_leaf + (partition_hist_start << 1); + for (unsigned int i = thread_idx; i < num_items_in_partition; i += num_threads_per_block) { + atomicAdd_system(feature_histogram_ptr + i, shared_hist[i]); + } +} + +void CUDAHistogramConstructor::LaunchConstructHistogramKernel( + const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, + const data_size_t num_data_in_smaller_leaf) { + if (cuda_row_data_->shared_hist_size() == DP_SHARED_HIST_SIZE && gpu_use_dp_) { + LaunchConstructHistogramKernelInner(cuda_smaller_leaf_splits, num_data_in_smaller_leaf); + } else if (cuda_row_data_->shared_hist_size() == SP_SHARED_HIST_SIZE && !gpu_use_dp_) { + LaunchConstructHistogramKernelInner(cuda_smaller_leaf_splits, num_data_in_smaller_leaf); + } else { + Log::Fatal("Unknown shared histogram size %d", cuda_row_data_->shared_hist_size()); + } +} + +template +void CUDAHistogramConstructor::LaunchConstructHistogramKernelInner( + const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, + const data_size_t num_data_in_smaller_leaf) { + if (cuda_row_data_->bit_type() == 8) { + LaunchConstructHistogramKernelInner0(cuda_smaller_leaf_splits, num_data_in_smaller_leaf); + } else if (cuda_row_data_->bit_type() 
== 16) {
+    LaunchConstructHistogramKernelInner0<HIST_TYPE, SHARED_HIST_SIZE, uint16_t>(cuda_smaller_leaf_splits, num_data_in_smaller_leaf);
+  } else if (cuda_row_data_->bit_type() == 32) {
+    LaunchConstructHistogramKernelInner0<HIST_TYPE, SHARED_HIST_SIZE, uint32_t>(cuda_smaller_leaf_splits, num_data_in_smaller_leaf);
+  } else {
+    Log::Fatal("Unknown bit_type = %d", cuda_row_data_->bit_type());
+  }
+}
+
+template <typename HIST_TYPE, size_t SHARED_HIST_SIZE, typename BIN_TYPE>
+void CUDAHistogramConstructor::LaunchConstructHistogramKernelInner0(
+  const CUDALeafSplitsStruct* cuda_smaller_leaf_splits,
+  const data_size_t num_data_in_smaller_leaf) {
+  if (cuda_row_data_->row_ptr_bit_type() == 16) {
+    LaunchConstructHistogramKernelInner1<HIST_TYPE, SHARED_HIST_SIZE, BIN_TYPE, uint16_t>(cuda_smaller_leaf_splits, num_data_in_smaller_leaf);
+  } else if (cuda_row_data_->row_ptr_bit_type() == 32) {
+    LaunchConstructHistogramKernelInner1<HIST_TYPE, SHARED_HIST_SIZE, BIN_TYPE, uint32_t>(cuda_smaller_leaf_splits, num_data_in_smaller_leaf);
+  } else if (cuda_row_data_->row_ptr_bit_type() == 64) {
+    LaunchConstructHistogramKernelInner1<HIST_TYPE, SHARED_HIST_SIZE, BIN_TYPE, uint64_t>(cuda_smaller_leaf_splits, num_data_in_smaller_leaf);
+  } else {
+    if (!cuda_row_data_->is_sparse()) {
+      // dense data have no row pointers, so the row-pointer type argument is unused here
+      LaunchConstructHistogramKernelInner1<HIST_TYPE, SHARED_HIST_SIZE, BIN_TYPE, uint16_t>(cuda_smaller_leaf_splits, num_data_in_smaller_leaf);
+    } else {
+      Log::Fatal("Unknown row_ptr_bit_type = %d", cuda_row_data_->row_ptr_bit_type());
+    }
+  }
+}
+
+template <typename HIST_TYPE, size_t SHARED_HIST_SIZE, typename BIN_TYPE, typename DATA_PTR_TYPE>
+void CUDAHistogramConstructor::LaunchConstructHistogramKernelInner1(
+  const CUDALeafSplitsStruct* cuda_smaller_leaf_splits,
+  const data_size_t num_data_in_smaller_leaf) {
+  if (cuda_row_data_->NumLargeBinPartition() == 0) {
+    LaunchConstructHistogramKernelInner2<HIST_TYPE, SHARED_HIST_SIZE, BIN_TYPE, DATA_PTR_TYPE, false>(cuda_smaller_leaf_splits, num_data_in_smaller_leaf);
+  } else {
+    LaunchConstructHistogramKernelInner2<HIST_TYPE, SHARED_HIST_SIZE, BIN_TYPE, DATA_PTR_TYPE, true>(cuda_smaller_leaf_splits, num_data_in_smaller_leaf);
+  }
+}
+
+template <typename HIST_TYPE, size_t SHARED_HIST_SIZE, typename BIN_TYPE, typename DATA_PTR_TYPE, bool USE_GLOBAL_MEM_BUFFER>
+void CUDAHistogramConstructor::LaunchConstructHistogramKernelInner2(
+  const CUDALeafSplitsStruct* cuda_smaller_leaf_splits,
+  const data_size_t num_data_in_smaller_leaf) {
+  int grid_dim_x = 0;
+  int grid_dim_y = 0;
+  int block_dim_x = 0;
+  int block_dim_y = 0;
+  CalcConstructHistogramKernelDim(&grid_dim_x, &grid_dim_y, &block_dim_x, &block_dim_y, num_data_in_smaller_leaf);
+  dim3 grid_dim(grid_dim_x, grid_dim_y);
+  dim3 block_dim(block_dim_x, block_dim_y);
+  if (!USE_GLOBAL_MEM_BUFFER) {
+    if (cuda_row_data_->is_sparse()) {
+      CUDAConstructHistogramSparseKernel<HIST_TYPE, SHARED_HIST_SIZE, BIN_TYPE, DATA_PTR_TYPE><<<grid_dim, block_dim, 0, cuda_stream_>>>(
+        cuda_smaller_leaf_splits,
+        cuda_gradients_, cuda_hessians_,
+        cuda_row_data_->GetBin(),
+        cuda_row_data_->GetRowPtr(),
+        cuda_row_data_->GetPartitionPtr(),
+        cuda_row_data_->cuda_partition_hist_offsets(),
+        num_data_);
+    } else {
+      CUDAConstructHistogramDenseKernel<HIST_TYPE, SHARED_HIST_SIZE, BIN_TYPE><<<grid_dim, block_dim, 0, cuda_stream_>>>(
+        cuda_smaller_leaf_splits,
+        cuda_gradients_, cuda_hessians_,
+        cuda_row_data_->GetBin(),
+        cuda_row_data_->cuda_column_hist_offsets(),
+        cuda_row_data_->cuda_partition_hist_offsets(),
+        cuda_row_data_->cuda_feature_partition_column_index_offsets(),
+        num_data_);
+    }
+  } else {
+    if (cuda_row_data_->is_sparse()) {
+      CUDAConstructHistogramSparseKernel_GlobalMemory<BIN_TYPE, DATA_PTR_TYPE><<<grid_dim, block_dim, 0, cuda_stream_>>>(
+        cuda_smaller_leaf_splits,
+        cuda_gradients_, cuda_hessians_,
+        cuda_row_data_->GetBin(),
+        cuda_row_data_->GetRowPtr(),
+        cuda_row_data_->GetPartitionPtr(),
+        cuda_row_data_->cuda_partition_hist_offsets(),
+        num_data_,
+        cuda_hist_buffer_);
+    } else {
+      CUDAConstructHistogramDenseKernel_GlobalMemory<BIN_TYPE><<<grid_dim, block_dim, 0, cuda_stream_>>>(
+        cuda_smaller_leaf_splits,
+        cuda_gradients_, cuda_hessians_,
+        cuda_row_data_->GetBin(),
+        cuda_row_data_->cuda_column_hist_offsets(),
+        cuda_row_data_->cuda_partition_hist_offsets(),
+        cuda_row_data_->cuda_feature_partition_column_index_offsets(),
+        num_data_,
+        cuda_hist_buffer_);
+    }
+  }
+}
+
+__global__ void SubtractHistogramKernel(
+  const int num_total_bin,
const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, + const CUDALeafSplitsStruct* cuda_larger_leaf_splits) { + const unsigned int global_thread_index = threadIdx.x + blockIdx.x * blockDim.x; + const int cuda_larger_leaf_index = cuda_larger_leaf_splits->leaf_index; + if (cuda_larger_leaf_index >= 0) { + const hist_t* smaller_leaf_hist = cuda_smaller_leaf_splits->hist_in_leaf; + hist_t* larger_leaf_hist = cuda_larger_leaf_splits->hist_in_leaf; + if (global_thread_index < 2 * num_total_bin) { + larger_leaf_hist[global_thread_index] -= smaller_leaf_hist[global_thread_index]; + } + } +} + +__global__ void FixHistogramKernel( + const uint32_t* cuda_feature_num_bins, + const uint32_t* cuda_feature_hist_offsets, + const uint32_t* cuda_feature_most_freq_bins, + const int* cuda_need_fix_histogram_features, + const uint32_t* cuda_need_fix_histogram_features_num_bin_aligned, + const CUDALeafSplitsStruct* cuda_smaller_leaf_splits) { + __shared__ hist_t shared_mem_buffer[32]; + const unsigned int blockIdx_x = blockIdx.x; + const int feature_index = cuda_need_fix_histogram_features[blockIdx_x]; + const uint32_t num_bin_aligned = cuda_need_fix_histogram_features_num_bin_aligned[blockIdx_x]; + const uint32_t feature_hist_offset = cuda_feature_hist_offsets[feature_index]; + const uint32_t most_freq_bin = cuda_feature_most_freq_bins[feature_index]; + const double leaf_sum_gradients = cuda_smaller_leaf_splits->sum_of_gradients; + const double leaf_sum_hessians = cuda_smaller_leaf_splits->sum_of_hessians; + hist_t* feature_hist = cuda_smaller_leaf_splits->hist_in_leaf + feature_hist_offset * 2; + const unsigned int threadIdx_x = threadIdx.x; + const uint32_t num_bin = cuda_feature_num_bins[feature_index]; + const uint32_t hist_pos = threadIdx_x << 1; + const hist_t bin_gradient = (threadIdx_x < num_bin && threadIdx_x != most_freq_bin) ? feature_hist[hist_pos] : 0.0f; + const hist_t bin_hessian = (threadIdx_x < num_bin && threadIdx_x != most_freq_bin) ? 
feature_hist[hist_pos + 1] : 0.0f; + const hist_t sum_gradient = ShuffleReduceSum(bin_gradient, shared_mem_buffer, num_bin_aligned); + const hist_t sum_hessian = ShuffleReduceSum(bin_hessian, shared_mem_buffer, num_bin_aligned); + if (threadIdx_x == 0) { + feature_hist[most_freq_bin << 1] = leaf_sum_gradients - sum_gradient; + feature_hist[(most_freq_bin << 1) + 1] = leaf_sum_hessians - sum_hessian; + } +} + +void CUDAHistogramConstructor::LaunchSubtractHistogramKernel( + const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, + const CUDALeafSplitsStruct* cuda_larger_leaf_splits) { + const int num_subtract_threads = 2 * num_total_bin_; + const int num_subtract_blocks = (num_subtract_threads + SUBTRACT_BLOCK_SIZE - 1) / SUBTRACT_BLOCK_SIZE; + global_timer.Start("CUDAHistogramConstructor::FixHistogramKernel"); + if (need_fix_histogram_features_.size() > 0) { + FixHistogramKernel<<>>( + cuda_feature_num_bins_, + cuda_feature_hist_offsets_, + cuda_feature_most_freq_bins_, + cuda_need_fix_histogram_features_, + cuda_need_fix_histogram_features_num_bin_aligned_, + cuda_smaller_leaf_splits); + } + global_timer.Stop("CUDAHistogramConstructor::FixHistogramKernel"); + global_timer.Start("CUDAHistogramConstructor::SubtractHistogramKernel"); + SubtractHistogramKernel<<>>( + num_total_bin_, + cuda_smaller_leaf_splits, + cuda_larger_leaf_splits); + global_timer.Stop("CUDAHistogramConstructor::SubtractHistogramKernel"); +} + +} // namespace LightGBM + +#endif // USE_CUDA_EXP diff --git a/src/treelearner/cuda/cuda_histogram_constructor.hpp b/src/treelearner/cuda/cuda_histogram_constructor.hpp new file mode 100644 index 000000000000..e364003ed934 --- /dev/null +++ b/src/treelearner/cuda/cuda_histogram_constructor.hpp @@ -0,0 +1,169 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. 
+ */ +#ifndef LIGHTGBM_TREELEARNER_CUDA_CUDA_HISTOGRAM_CONSTRUCTOR_HPP_ +#define LIGHTGBM_TREELEARNER_CUDA_CUDA_HISTOGRAM_CONSTRUCTOR_HPP_ + +#ifdef USE_CUDA_EXP + +#include +#include +#include + +#include +#include + +#include "cuda_leaf_splits.hpp" + +#define NUM_DATA_PER_THREAD (400) +#define NUM_THRADS_PER_BLOCK (504) +#define NUM_FEATURE_PER_THREAD_GROUP (28) +#define SUBTRACT_BLOCK_SIZE (1024) +#define FIX_HISTOGRAM_SHARED_MEM_SIZE (1024) +#define FIX_HISTOGRAM_BLOCK_SIZE (512) +#define USED_HISTOGRAM_BUFFER_NUM (8) + +namespace LightGBM { + +class CUDAHistogramConstructor { + public: + CUDAHistogramConstructor( + const Dataset* train_data, + const int num_leaves, + const int num_threads, + const std::vector& feature_hist_offsets, + const int min_data_in_leaf, + const double min_sum_hessian_in_leaf, + const int gpu_device_id, + const bool gpu_use_dp); + + ~CUDAHistogramConstructor(); + + void Init(const Dataset* train_data, TrainingShareStates* share_state); + + void ConstructHistogramForLeaf( + const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, + const CUDALeafSplitsStruct* cuda_larger_leaf_splits, + const data_size_t num_data_in_smaller_leaf, + const data_size_t num_data_in_larger_leaf, + const double sum_hessians_in_smaller_leaf, + const double sum_hessians_in_larger_leaf); + + void ResetTrainingData(const Dataset* train_data, TrainingShareStates* share_states); + + void ResetConfig(const Config* config); + + void BeforeTrain(const score_t* gradients, const score_t* hessians); + + const hist_t* cuda_hist() const { return cuda_hist_; } + + hist_t* cuda_hist_pointer() { return cuda_hist_; } + + private: + void InitFeatureMetaInfo(const Dataset* train_data, const std::vector& feature_hist_offsets); + + void CalcConstructHistogramKernelDim( + int* grid_dim_x, + int* grid_dim_y, + int* block_dim_x, + int* block_dim_y, + const data_size_t num_data_in_smaller_leaf); + + template + void LaunchConstructHistogramKernelInner( + const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, + const data_size_t num_data_in_smaller_leaf); + + template + void LaunchConstructHistogramKernelInner0( + const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, + const data_size_t num_data_in_smaller_leaf); + + template + void LaunchConstructHistogramKernelInner1( + const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, + const data_size_t num_data_in_smaller_leaf); + + template + void LaunchConstructHistogramKernelInner2( + const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, + const data_size_t num_data_in_smaller_leaf); + + void LaunchConstructHistogramKernel( + const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, + const data_size_t num_data_in_smaller_leaf); + + void LaunchSubtractHistogramKernel( + const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, + const CUDALeafSplitsStruct* cuda_larger_leaf_splits); + + // Host memory + + /*! \brief size of training data */ + data_size_t num_data_; + /*! \brief number of features in training data */ + int num_features_; + /*! \brief maximum number of leaves */ + int num_leaves_; + /*! \brief number of threads */ + int num_threads_; + /*! \brief total number of bins in histogram */ + int num_total_bin_; + /*! \brief number of bins per feature */ + std::vector feature_num_bins_; + /*! \brief offsets in histogram of all features */ + std::vector feature_hist_offsets_; + /*! \brief most frequent bins in each feature */ + std::vector feature_most_freq_bins_; + /*! \brief minimum number of data allowed per leaf */ + int min_data_in_leaf_; + /*! 
\brief minimum sum value of hessians allowed per leaf */ + double min_sum_hessian_in_leaf_; + /*! \brief cuda stream for histogram construction */ + cudaStream_t cuda_stream_; + /*! \brief indices of feature whose histograms need to be fixed */ + std::vector need_fix_histogram_features_; + /*! \brief aligned number of bins of the features whose histograms need to be fixed */ + std::vector need_fix_histogram_features_num_bin_aligend_; + /*! \brief minimum number of blocks allowed in the y dimension */ + const int min_grid_dim_y_ = 160; + + + // CUDA memory, held by this object + + /*! \brief CUDA row wise data */ + std::unique_ptr cuda_row_data_; + /*! \brief number of bins per feature */ + uint32_t* cuda_feature_num_bins_; + /*! \brief offsets in histogram of all features */ + uint32_t* cuda_feature_hist_offsets_; + /*! \brief most frequent bins in each feature */ + uint32_t* cuda_feature_most_freq_bins_; + /*! \brief CUDA histograms */ + hist_t* cuda_hist_; + /*! \brief CUDA histograms buffer for each block */ + float* cuda_hist_buffer_; + /*! \brief indices of feature whose histograms need to be fixed */ + int* cuda_need_fix_histogram_features_; + /*! \brief aligned number of bins of the features whose histograms need to be fixed */ + uint32_t* cuda_need_fix_histogram_features_num_bin_aligned_; + + // CUDA memory, held by other object + + /*! \brief gradients on CUDA */ + const score_t* cuda_gradients_; + /*! \brief hessians on CUDA */ + const score_t* cuda_hessians_; + + /*! \brief GPU device index */ + const int gpu_device_id_; + /*! \brief use double precision histogram per block */ + const bool gpu_use_dp_; +}; + +} // namespace LightGBM + +#endif // USE_CUDA_EXP +#endif // LIGHTGBM_TREELEARNER_CUDA_CUDA_HISTOGRAM_CONSTRUCTOR_HPP_ diff --git a/src/treelearner/cuda/cuda_leaf_splits.cpp b/src/treelearner/cuda/cuda_leaf_splits.cpp new file mode 100644 index 000000000000..9d093f0f164b --- /dev/null +++ b/src/treelearner/cuda/cuda_leaf_splits.cpp @@ -0,0 +1,71 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. 
+ */ + +#ifdef USE_CUDA_EXP + +#include "cuda_leaf_splits.hpp" + +namespace LightGBM { + +CUDALeafSplits::CUDALeafSplits(const data_size_t num_data): +num_data_(num_data) { + cuda_struct_ = nullptr; + cuda_sum_of_gradients_buffer_ = nullptr; + cuda_sum_of_hessians_buffer_ = nullptr; +} + +CUDALeafSplits::~CUDALeafSplits() { + DeallocateCUDAMemory(&cuda_struct_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_sum_of_gradients_buffer_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_sum_of_hessians_buffer_, __FILE__, __LINE__); +} + +void CUDALeafSplits::Init() { + num_blocks_init_from_gradients_ = (num_data_ + NUM_THRADS_PER_BLOCK_LEAF_SPLITS - 1) / NUM_THRADS_PER_BLOCK_LEAF_SPLITS; + + // allocate more memory for sum reduction in CUDA + // only the first element records the final sum + AllocateCUDAMemory(&cuda_sum_of_gradients_buffer_, num_blocks_init_from_gradients_, __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_sum_of_hessians_buffer_, num_blocks_init_from_gradients_, __FILE__, __LINE__); + + AllocateCUDAMemory(&cuda_struct_, 1, __FILE__, __LINE__); +} + +void CUDALeafSplits::InitValues() { + LaunchInitValuesEmptyKernel(); + SynchronizeCUDADevice(__FILE__, __LINE__); +} + +void CUDALeafSplits::InitValues( + const double lambda_l1, const double lambda_l2, + const score_t* cuda_gradients, const score_t* cuda_hessians, + const data_size_t* cuda_bagging_data_indices, const data_size_t* cuda_data_indices_in_leaf, + const data_size_t num_used_indices, hist_t* cuda_hist_in_leaf, double* root_sum_hessians) { + cuda_gradients_ = cuda_gradients; + cuda_hessians_ = cuda_hessians; + SetCUDAMemory(cuda_sum_of_gradients_buffer_, 0, num_blocks_init_from_gradients_, __FILE__, __LINE__); + SetCUDAMemory(cuda_sum_of_hessians_buffer_, 0, num_blocks_init_from_gradients_, __FILE__, __LINE__); + LaunchInitValuesKernal(lambda_l1, lambda_l2, cuda_bagging_data_indices, cuda_data_indices_in_leaf, num_used_indices, cuda_hist_in_leaf); + CopyFromCUDADeviceToHost(root_sum_hessians, cuda_sum_of_hessians_buffer_, 1, __FILE__, __LINE__); + SynchronizeCUDADevice(__FILE__, __LINE__); +} + +void CUDALeafSplits::Resize(const data_size_t num_data) { + if (num_data > num_data_) { + DeallocateCUDAMemory(&cuda_sum_of_gradients_buffer_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_sum_of_hessians_buffer_, __FILE__, __LINE__); + num_blocks_init_from_gradients_ = (num_data + NUM_THRADS_PER_BLOCK_LEAF_SPLITS - 1) / NUM_THRADS_PER_BLOCK_LEAF_SPLITS; + AllocateCUDAMemory(&cuda_sum_of_gradients_buffer_, num_blocks_init_from_gradients_, __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_sum_of_hessians_buffer_, num_blocks_init_from_gradients_, __FILE__, __LINE__); + } else { + num_blocks_init_from_gradients_ = (num_data + NUM_THRADS_PER_BLOCK_LEAF_SPLITS - 1) / NUM_THRADS_PER_BLOCK_LEAF_SPLITS; + } + num_data_ = num_data; +} + +} // namespace LightGBM + +#endif // USE_CUDA_EXP diff --git a/src/treelearner/cuda/cuda_leaf_splits.cu b/src/treelearner/cuda/cuda_leaf_splits.cu new file mode 100644 index 000000000000..15c2983ef1d2 --- /dev/null +++ b/src/treelearner/cuda/cuda_leaf_splits.cu @@ -0,0 +1,129 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. 
+ */ + + +#ifdef USE_CUDA_EXP + +#include "cuda_leaf_splits.hpp" +#include + +namespace LightGBM { + +template +__global__ void CUDAInitValuesKernel1(const score_t* cuda_gradients, const score_t* cuda_hessians, + const data_size_t num_data, const data_size_t* cuda_bagging_data_indices, + double* cuda_sum_of_gradients, double* cuda_sum_of_hessians) { + __shared__ double shared_mem_buffer[32]; + const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + double gradient = 0.0f; + double hessian = 0.0f; + if (data_index < num_data) { + gradient = USE_INDICES ? cuda_gradients[cuda_bagging_data_indices[data_index]] : cuda_gradients[data_index]; + hessian = USE_INDICES ? cuda_hessians[cuda_bagging_data_indices[data_index]] : cuda_hessians[data_index]; + } + const double block_sum_gradient = ShuffleReduceSum(gradient, shared_mem_buffer, blockDim.x); + __syncthreads(); + const double block_sum_hessian = ShuffleReduceSum(hessian, shared_mem_buffer, blockDim.x); + if (threadIdx.x == 0) { + cuda_sum_of_gradients[blockIdx.x] += block_sum_gradient; + cuda_sum_of_hessians[blockIdx.x] += block_sum_hessian; + } +} + +__global__ void CUDAInitValuesKernel2( + const double lambda_l1, + const double lambda_l2, + const int num_blocks_to_reduce, + double* cuda_sum_of_gradients, + double* cuda_sum_of_hessians, + const data_size_t num_data, + const data_size_t* cuda_data_indices_in_leaf, + hist_t* cuda_hist_in_leaf, + CUDALeafSplitsStruct* cuda_struct) { + __shared__ double shared_mem_buffer[32]; + double thread_sum_of_gradients = 0.0f; + double thread_sum_of_hessians = 0.0f; + for (int block_index = static_cast(threadIdx.x); block_index < num_blocks_to_reduce; block_index += static_cast(blockDim.x)) { + thread_sum_of_gradients += cuda_sum_of_gradients[block_index]; + thread_sum_of_hessians += cuda_sum_of_hessians[block_index]; + } + const double sum_of_gradients = ShuffleReduceSum(thread_sum_of_gradients, shared_mem_buffer, blockDim.x); + __syncthreads(); + const double sum_of_hessians = ShuffleReduceSum(thread_sum_of_hessians, shared_mem_buffer, blockDim.x); + if (threadIdx.x == 0) { + cuda_sum_of_hessians[0] = sum_of_hessians; + cuda_struct->leaf_index = 0; + cuda_struct->sum_of_gradients = sum_of_gradients; + cuda_struct->sum_of_hessians = sum_of_hessians; + cuda_struct->num_data_in_leaf = num_data; + const bool use_l1 = lambda_l1 > 0.0f; + if (!use_l1) { + // no smoothing on root node + cuda_struct->gain = CUDALeafSplits::GetLeafGain(sum_of_gradients, sum_of_hessians, lambda_l1, lambda_l2, 0.0f, 0, 0.0f); + } else { + // no smoothing on root node + cuda_struct->gain = CUDALeafSplits::GetLeafGain(sum_of_gradients, sum_of_hessians, lambda_l1, lambda_l2, 0.0f, 0, 0.0f); + } + if (!use_l1) { + // no smoothing on root node + cuda_struct->leaf_value = + CUDALeafSplits::CalculateSplittedLeafOutput(sum_of_gradients, sum_of_hessians, lambda_l1, lambda_l2, 0.0f, 0, 0.0f); + } else { + // no smoothing on root node + cuda_struct->leaf_value = + CUDALeafSplits::CalculateSplittedLeafOutput(sum_of_gradients, sum_of_hessians, lambda_l1, lambda_l2, 0.0f, 0, 0.0f); + } + cuda_struct->data_indices_in_leaf = cuda_data_indices_in_leaf; + cuda_struct->hist_in_leaf = cuda_hist_in_leaf; + } +} + +__global__ void InitValuesEmptyKernel(CUDALeafSplitsStruct* cuda_struct) { + cuda_struct->leaf_index = -1; + cuda_struct->sum_of_gradients = 0.0f; + cuda_struct->sum_of_hessians = 0.0f; + cuda_struct->num_data_in_leaf = 0; + cuda_struct->gain = 0.0f; + cuda_struct->leaf_value = 0.0f; + 
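  // The buffer pointers are cleared as well: until the first split is performed the
  // "larger" leaf does not exist (leaf_index == -1 above), so this struct must not
  // alias any data-index or histogram memory.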
cuda_struct->data_indices_in_leaf = nullptr; + cuda_struct->hist_in_leaf = nullptr; +} + +void CUDALeafSplits::LaunchInitValuesEmptyKernel() { + InitValuesEmptyKernel<<<1, 1>>>(cuda_struct_); +} + +void CUDALeafSplits::LaunchInitValuesKernal( + const double lambda_l1, const double lambda_l2, + const data_size_t* cuda_bagging_data_indices, + const data_size_t* cuda_data_indices_in_leaf, + const data_size_t num_used_indices, + hist_t* cuda_hist_in_leaf) { + if (cuda_bagging_data_indices == nullptr) { + CUDAInitValuesKernel1<<>>( + cuda_gradients_, cuda_hessians_, num_used_indices, nullptr, cuda_sum_of_gradients_buffer_, + cuda_sum_of_hessians_buffer_); + } else { + CUDAInitValuesKernel1<<>>( + cuda_gradients_, cuda_hessians_, num_used_indices, cuda_bagging_data_indices, cuda_sum_of_gradients_buffer_, + cuda_sum_of_hessians_buffer_); + } + SynchronizeCUDADevice(__FILE__, __LINE__); + CUDAInitValuesKernel2<<<1, NUM_THRADS_PER_BLOCK_LEAF_SPLITS>>>( + lambda_l1, lambda_l2, + num_blocks_init_from_gradients_, + cuda_sum_of_gradients_buffer_, + cuda_sum_of_hessians_buffer_, + num_used_indices, + cuda_data_indices_in_leaf, + cuda_hist_in_leaf, + cuda_struct_); + SynchronizeCUDADevice(__FILE__, __LINE__); +} + +} // namespace LightGBM + +#endif // USE_CUDA_EXP diff --git a/src/treelearner/cuda/cuda_leaf_splits.hpp b/src/treelearner/cuda/cuda_leaf_splits.hpp new file mode 100644 index 000000000000..fe04cf5bcace --- /dev/null +++ b/src/treelearner/cuda/cuda_leaf_splits.hpp @@ -0,0 +1,160 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. + */ +#ifndef LIGHTGBM_TREELEARNER_CUDA_CUDA_LEAF_SPLITS_HPP_ +#define LIGHTGBM_TREELEARNER_CUDA_CUDA_LEAF_SPLITS_HPP_ + +#ifdef USE_CUDA_EXP + +#include +#include +#include +#include + +#define NUM_THRADS_PER_BLOCK_LEAF_SPLITS (1024) +#define NUM_DATA_THREAD_ADD_LEAF_SPLITS (6) + +namespace LightGBM { + +struct CUDALeafSplitsStruct { + public: + int leaf_index; + double sum_of_gradients; + double sum_of_hessians; + data_size_t num_data_in_leaf; + double gain; + double leaf_value; + const data_size_t* data_indices_in_leaf; + hist_t* hist_in_leaf; +}; + +class CUDALeafSplits { + public: + explicit CUDALeafSplits(const data_size_t num_data); + + ~CUDALeafSplits(); + + void Init(); + + void InitValues( + const double lambda_l1, const double lambda_l2, + const score_t* cuda_gradients, const score_t* cuda_hessians, + const data_size_t* cuda_bagging_data_indices, + const data_size_t* cuda_data_indices_in_leaf, const data_size_t num_used_indices, + hist_t* cuda_hist_in_leaf, double* root_sum_hessians); + + void InitValues(); + + const CUDALeafSplitsStruct* GetCUDAStruct() const { return cuda_struct_; } + + CUDALeafSplitsStruct* GetCUDAStructRef() { return cuda_struct_; } + + void Resize(const data_size_t num_data); + + __device__ static double ThresholdL1(double s, double l1) { + const double reg_s = fmax(0.0, fabs(s) - l1); + if (s >= 0.0f) { + return reg_s; + } else { + return -reg_s; + } + } + + template + __device__ static double CalculateSplittedLeafOutput(double sum_gradients, + double sum_hessians, double l1, double l2, + double path_smooth, data_size_t num_data, + double parent_output) { + double ret; + if (USE_L1) { + ret = -ThresholdL1(sum_gradients, l1) / (sum_hessians + l2); + } else { + ret = -sum_gradients / (sum_hessians + l2); + } + if (USE_SMOOTHING) { + ret = ret * (num_data / path_smooth) / (num_data / path_smooth + 1) \ + + 
parent_output / (num_data / path_smooth + 1); + } + return ret; + } + + template + __device__ static double GetLeafGainGivenOutput(double sum_gradients, + double sum_hessians, double l1, + double l2, double output) { + if (USE_L1) { + const double sg_l1 = ThresholdL1(sum_gradients, l1); + return -(2.0 * sg_l1 * output + (sum_hessians + l2) * output * output); + } else { + return -(2.0 * sum_gradients * output + + (sum_hessians + l2) * output * output); + } + } + + template + __device__ static double GetLeafGain(double sum_gradients, double sum_hessians, + double l1, double l2, + double path_smooth, data_size_t num_data, + double parent_output) { + if (!USE_SMOOTHING) { + if (USE_L1) { + const double sg_l1 = ThresholdL1(sum_gradients, l1); + return (sg_l1 * sg_l1) / (sum_hessians + l2); + } else { + return (sum_gradients * sum_gradients) / (sum_hessians + l2); + } + } else { + const double output = CalculateSplittedLeafOutput( + sum_gradients, sum_hessians, l1, l2, path_smooth, num_data, parent_output); + return GetLeafGainGivenOutput(sum_gradients, sum_hessians, l1, l2, output); + } + } + + template + __device__ static double GetSplitGains(double sum_left_gradients, + double sum_left_hessians, + double sum_right_gradients, + double sum_right_hessians, + double l1, double l2, + double path_smooth, + data_size_t left_count, + data_size_t right_count, + double parent_output) { + return GetLeafGain(sum_left_gradients, + sum_left_hessians, + l1, l2, path_smooth, left_count, parent_output) + + GetLeafGain(sum_right_gradients, + sum_right_hessians, + l1, l2, path_smooth, right_count, parent_output); + } + + private: + void LaunchInitValuesEmptyKernel(); + + void LaunchInitValuesKernal( + const double lambda_l1, const double lambda_l2, + const data_size_t* cuda_bagging_data_indices, + const data_size_t* cuda_data_indices_in_leaf, + const data_size_t num_used_indices, + hist_t* cuda_hist_in_leaf); + + // Host memory + data_size_t num_data_; + int num_blocks_init_from_gradients_; + + // CUDA memory, held by this object + CUDALeafSplitsStruct* cuda_struct_; + double* cuda_sum_of_gradients_buffer_; + double* cuda_sum_of_hessians_buffer_; + + // CUDA memory, held by other object + const score_t* cuda_gradients_; + const score_t* cuda_hessians_; +}; + +} // namespace LightGBM + +#endif // USE_CUDA_EXP +#endif // LIGHTGBM_TREELEARNER_CUDA_CUDA_LEAF_SPLITS_HPP_ diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp new file mode 100644 index 000000000000..f8a4fcd92f9f --- /dev/null +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp @@ -0,0 +1,464 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. 
+ */ + +#ifdef USE_CUDA_EXP + +#include "cuda_single_gpu_tree_learner.hpp" + +#include +#include +#include +#include +#include + +#include +#include + +namespace LightGBM { + +CUDASingleGPUTreeLearner::CUDASingleGPUTreeLearner(const Config* config): SerialTreeLearner(config) { + cuda_gradients_ = nullptr; + cuda_hessians_ = nullptr; +} + +CUDASingleGPUTreeLearner::~CUDASingleGPUTreeLearner() { + DeallocateCUDAMemory(&cuda_gradients_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_hessians_, __FILE__, __LINE__); +} + +void CUDASingleGPUTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) { + SerialTreeLearner::Init(train_data, is_constant_hessian); + num_threads_ = OMP_NUM_THREADS(); + // use the first gpu by default + gpu_device_id_ = config_->gpu_device_id >= 0 ? config_->gpu_device_id : 0; + SetCUDADevice(gpu_device_id_, __FILE__, __LINE__); + + cuda_smaller_leaf_splits_.reset(new CUDALeafSplits(num_data_)); + cuda_smaller_leaf_splits_->Init(); + cuda_larger_leaf_splits_.reset(new CUDALeafSplits(num_data_)); + cuda_larger_leaf_splits_->Init(); + + cuda_histogram_constructor_.reset(new CUDAHistogramConstructor(train_data_, config_->num_leaves, num_threads_, + share_state_->feature_hist_offsets(), + config_->min_data_in_leaf, config_->min_sum_hessian_in_leaf, gpu_device_id_, config_->gpu_use_dp)); + cuda_histogram_constructor_->Init(train_data_, share_state_.get()); + + const auto& feature_hist_offsets = share_state_->feature_hist_offsets(); + const int num_total_bin = feature_hist_offsets.empty() ? 0 : static_cast(feature_hist_offsets.back()); + cuda_data_partition_.reset(new CUDADataPartition( + train_data_, num_total_bin, config_->num_leaves, num_threads_, + cuda_histogram_constructor_->cuda_hist_pointer())); + cuda_data_partition_->Init(); + + cuda_best_split_finder_.reset(new CUDABestSplitFinder(cuda_histogram_constructor_->cuda_hist(), + train_data_, this->share_state_->feature_hist_offsets(), config_)); + cuda_best_split_finder_->Init(); + + leaf_best_split_feature_.resize(config_->num_leaves, -1); + leaf_best_split_threshold_.resize(config_->num_leaves, 0); + leaf_best_split_default_left_.resize(config_->num_leaves, 0); + leaf_num_data_.resize(config_->num_leaves, 0); + leaf_data_start_.resize(config_->num_leaves, 0); + leaf_sum_hessians_.resize(config_->num_leaves, 0.0f); + + AllocateCUDAMemory(&cuda_gradients_, static_cast(num_data_), __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_hessians_, static_cast(num_data_), __FILE__, __LINE__); + AllocateBitset(); + + cuda_leaf_gradient_stat_buffer_ = nullptr; + cuda_leaf_hessian_stat_buffer_ = nullptr; + leaf_stat_buffer_size_ = 0; + num_cat_threshold_ = 0; +} + +void CUDASingleGPUTreeLearner::BeforeTrain() { + const data_size_t root_num_data = cuda_data_partition_->root_num_data(); + CopyFromHostToCUDADevice(cuda_gradients_, gradients_, static_cast(num_data_), __FILE__, __LINE__); + CopyFromHostToCUDADevice(cuda_hessians_, hessians_, static_cast(num_data_), __FILE__, __LINE__); + const data_size_t* leaf_splits_init_indices = + cuda_data_partition_->use_bagging() ? 
cuda_data_partition_->cuda_data_indices() : nullptr; + cuda_data_partition_->BeforeTrain(); + cuda_smaller_leaf_splits_->InitValues( + config_->lambda_l1, + config_->lambda_l2, + cuda_gradients_, + cuda_hessians_, + leaf_splits_init_indices, + cuda_data_partition_->cuda_data_indices(), + root_num_data, + cuda_histogram_constructor_->cuda_hist_pointer(), + &leaf_sum_hessians_[0]); + leaf_num_data_[0] = root_num_data; + cuda_larger_leaf_splits_->InitValues(); + cuda_histogram_constructor_->BeforeTrain(cuda_gradients_, cuda_hessians_); + col_sampler_.ResetByTree(); + cuda_best_split_finder_->BeforeTrain(col_sampler_.is_feature_used_bytree()); + leaf_data_start_[0] = 0; + smaller_leaf_index_ = 0; + larger_leaf_index_ = -1; +} + +void CUDASingleGPUTreeLearner::AddPredictionToScore(const Tree* tree, double* out_score) const { + cuda_data_partition_->UpdateTrainScore(tree, out_score); +} + +Tree* CUDASingleGPUTreeLearner::Train(const score_t* gradients, + const score_t* hessians, bool /*is_first_tree*/) { + gradients_ = gradients; + hessians_ = hessians; + global_timer.Start("CUDASingleGPUTreeLearner::BeforeTrain"); + BeforeTrain(); + global_timer.Stop("CUDASingleGPUTreeLearner::BeforeTrain"); + const bool track_branch_features = !(config_->interaction_constraints_vector.empty()); + std::unique_ptr tree(new CUDATree(config_->num_leaves, track_branch_features, + config_->linear_tree, config_->gpu_device_id, has_categorical_feature_)); + for (int i = 0; i < config_->num_leaves - 1; ++i) { + global_timer.Start("CUDASingleGPUTreeLearner::ConstructHistogramForLeaf"); + const data_size_t num_data_in_smaller_leaf = leaf_num_data_[smaller_leaf_index_]; + const data_size_t num_data_in_larger_leaf = larger_leaf_index_ < 0 ? 0 : leaf_num_data_[larger_leaf_index_]; + const double sum_hessians_in_smaller_leaf = leaf_sum_hessians_[smaller_leaf_index_]; + const double sum_hessians_in_larger_leaf = larger_leaf_index_ < 0 ? 
0 : leaf_sum_hessians_[larger_leaf_index_]; + cuda_histogram_constructor_->ConstructHistogramForLeaf( + cuda_smaller_leaf_splits_->GetCUDAStruct(), + cuda_larger_leaf_splits_->GetCUDAStruct(), + num_data_in_smaller_leaf, + num_data_in_larger_leaf, + sum_hessians_in_smaller_leaf, + sum_hessians_in_larger_leaf); + global_timer.Stop("CUDASingleGPUTreeLearner::ConstructHistogramForLeaf"); + global_timer.Start("CUDASingleGPUTreeLearner::FindBestSplitsForLeaf"); + cuda_best_split_finder_->FindBestSplitsForLeaf( + cuda_smaller_leaf_splits_->GetCUDAStruct(), + cuda_larger_leaf_splits_->GetCUDAStruct(), + smaller_leaf_index_, larger_leaf_index_, + num_data_in_smaller_leaf, num_data_in_larger_leaf, + sum_hessians_in_smaller_leaf, sum_hessians_in_larger_leaf); + global_timer.Stop("CUDASingleGPUTreeLearner::FindBestSplitsForLeaf"); + global_timer.Start("CUDASingleGPUTreeLearner::FindBestFromAllSplits"); + const CUDASplitInfo* best_split_info = nullptr; + if (larger_leaf_index_ >= 0) { + best_split_info = cuda_best_split_finder_->FindBestFromAllSplits( + tree->num_leaves(), + smaller_leaf_index_, + larger_leaf_index_, + &leaf_best_split_feature_[smaller_leaf_index_], + &leaf_best_split_threshold_[smaller_leaf_index_], + &leaf_best_split_default_left_[smaller_leaf_index_], + &leaf_best_split_feature_[larger_leaf_index_], + &leaf_best_split_threshold_[larger_leaf_index_], + &leaf_best_split_default_left_[larger_leaf_index_], + &best_leaf_index_, + &num_cat_threshold_); + } else { + best_split_info = cuda_best_split_finder_->FindBestFromAllSplits( + tree->num_leaves(), + smaller_leaf_index_, + larger_leaf_index_, + &leaf_best_split_feature_[smaller_leaf_index_], + &leaf_best_split_threshold_[smaller_leaf_index_], + &leaf_best_split_default_left_[smaller_leaf_index_], + nullptr, + nullptr, + nullptr, + &best_leaf_index_, + &num_cat_threshold_); + } + global_timer.Stop("CUDASingleGPUTreeLearner::FindBestFromAllSplits"); + + if (best_leaf_index_ == -1) { + Log::Warning("No further splits with positive gain, training stopped with %d leaves.", (i + 1)); + break; + } + + global_timer.Start("CUDASingleGPUTreeLearner::Split"); + if (num_cat_threshold_ > 0) { + ConstructBitsetForCategoricalSplit(best_split_info); + } + + int right_leaf_index = 0; + if (train_data_->FeatureBinMapper(leaf_best_split_feature_[best_leaf_index_])->bin_type() == BinType::CategoricalBin) { + right_leaf_index = tree->SplitCategorical(best_leaf_index_, + train_data_->RealFeatureIndex(leaf_best_split_feature_[best_leaf_index_]), + train_data_->FeatureBinMapper(leaf_best_split_feature_[best_leaf_index_])->missing_type(), + best_split_info, + cuda_bitset_, + cuda_bitset_len_, + cuda_bitset_inner_, + cuda_bitset_inner_len_); + } else { + right_leaf_index = tree->Split(best_leaf_index_, + train_data_->RealFeatureIndex(leaf_best_split_feature_[best_leaf_index_]), + train_data_->RealThreshold(leaf_best_split_feature_[best_leaf_index_], + leaf_best_split_threshold_[best_leaf_index_]), + train_data_->FeatureBinMapper(leaf_best_split_feature_[best_leaf_index_])->missing_type(), + best_split_info); + } + + double sum_left_gradients = 0.0f; + double sum_right_gradients = 0.0f; + cuda_data_partition_->Split(best_split_info, + best_leaf_index_, + right_leaf_index, + leaf_best_split_feature_[best_leaf_index_], + leaf_best_split_threshold_[best_leaf_index_], + cuda_bitset_inner_, + static_cast(cuda_bitset_inner_len_), + leaf_best_split_default_left_[best_leaf_index_], + leaf_num_data_[best_leaf_index_], + leaf_data_start_[best_leaf_index_], + 
cuda_smaller_leaf_splits_->GetCUDAStructRef(), + cuda_larger_leaf_splits_->GetCUDAStructRef(), + &leaf_num_data_[best_leaf_index_], + &leaf_num_data_[right_leaf_index], + &leaf_data_start_[best_leaf_index_], + &leaf_data_start_[right_leaf_index], + &leaf_sum_hessians_[best_leaf_index_], + &leaf_sum_hessians_[right_leaf_index], + &sum_left_gradients, + &sum_right_gradients); + #ifdef DEBUG + CheckSplitValid(best_leaf_index_, right_leaf_index, sum_left_gradients, sum_right_gradients); + #endif // DEBUG + smaller_leaf_index_ = (leaf_num_data_[best_leaf_index_] < leaf_num_data_[right_leaf_index] ? best_leaf_index_ : right_leaf_index); + larger_leaf_index_ = (smaller_leaf_index_ == best_leaf_index_ ? right_leaf_index : best_leaf_index_); + global_timer.Stop("CUDASingleGPUTreeLearner::Split"); + } + SynchronizeCUDADevice(__FILE__, __LINE__); + tree->ToHost(); + return tree.release(); +} + +void CUDASingleGPUTreeLearner::ResetTrainingData( + const Dataset* train_data, + bool is_constant_hessian) { + SerialTreeLearner::ResetTrainingData(train_data, is_constant_hessian); + CHECK_EQ(num_features_, train_data_->num_features()); + cuda_histogram_constructor_->ResetTrainingData(train_data, share_state_.get()); + cuda_data_partition_->ResetTrainingData(train_data, + static_cast(share_state_->feature_hist_offsets().back()), + cuda_histogram_constructor_->cuda_hist_pointer()); + cuda_best_split_finder_->ResetTrainingData( + cuda_histogram_constructor_->cuda_hist(), + train_data, + share_state_->feature_hist_offsets()); + cuda_smaller_leaf_splits_->Resize(num_data_); + cuda_larger_leaf_splits_->Resize(num_data_); + CHECK_EQ(is_constant_hessian, share_state_->is_constant_hessian); + DeallocateCUDAMemory(&cuda_gradients_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_hessians_, __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_gradients_, static_cast(num_data_), __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_hessians_, static_cast(num_data_), __FILE__, __LINE__); +} + +void CUDASingleGPUTreeLearner::ResetConfig(const Config* config) { + const int old_num_leaves = config_->num_leaves; + SerialTreeLearner::ResetConfig(config); + if (config_->gpu_device_id >= 0 && config_->gpu_device_id != gpu_device_id_) { + Log::Fatal("Changing gpu device ID by resetting configuration parameter is not allowed for CUDA tree learner."); + } + num_threads_ = OMP_NUM_THREADS(); + if (config_->num_leaves != old_num_leaves) { + leaf_best_split_feature_.resize(config_->num_leaves, -1); + leaf_best_split_threshold_.resize(config_->num_leaves, 0); + leaf_best_split_default_left_.resize(config_->num_leaves, 0); + leaf_num_data_.resize(config_->num_leaves, 0); + leaf_data_start_.resize(config_->num_leaves, 0); + leaf_sum_hessians_.resize(config_->num_leaves, 0.0f); + } + cuda_histogram_constructor_->ResetConfig(config); + cuda_best_split_finder_->ResetConfig(config, cuda_histogram_constructor_->cuda_hist()); + cuda_data_partition_->ResetConfig(config, cuda_histogram_constructor_->cuda_hist_pointer()); +} + +void CUDASingleGPUTreeLearner::SetBaggingData(const Dataset* /*subset*/, + const data_size_t* used_indices, data_size_t num_data) { + cuda_data_partition_->SetUsedDataIndices(used_indices, num_data); +} + +void CUDASingleGPUTreeLearner::RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj, std::function residual_getter, + data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const { + CHECK(tree->is_cuda_tree()); + CUDATree* cuda_tree = reinterpret_cast(tree); + if (obj != nullptr && 
obj->IsRenewTreeOutput()) { + CHECK_LE(cuda_tree->num_leaves(), data_partition_->num_leaves()); + const data_size_t* bag_mapper = nullptr; + if (total_num_data != num_data_) { + CHECK_EQ(bag_cnt, num_data_); + bag_mapper = bag_indices; + } + std::vector n_nozeroworker_perleaf(tree->num_leaves(), 1); + int num_machines = Network::num_machines(); + #pragma omp parallel for schedule(static) + for (int i = 0; i < tree->num_leaves(); ++i) { + const double output = static_cast(tree->LeafOutput(i)); + data_size_t cnt_leaf_data = leaf_num_data_[i]; + std::vector index_mapper(cnt_leaf_data, -1); + CopyFromCUDADeviceToHost(index_mapper.data(), + cuda_data_partition_->cuda_data_indices() + leaf_data_start_[i], + static_cast(cnt_leaf_data), __FILE__, __LINE__); + if (cnt_leaf_data > 0) { + const double new_output = obj->RenewTreeOutput(output, residual_getter, index_mapper.data(), bag_mapper, cnt_leaf_data); + tree->SetLeafOutput(i, new_output); + } else { + CHECK_GT(num_machines, 1); + tree->SetLeafOutput(i, 0.0); + n_nozeroworker_perleaf[i] = 0; + } + } + if (num_machines > 1) { + std::vector outputs(tree->num_leaves()); + for (int i = 0; i < tree->num_leaves(); ++i) { + outputs[i] = static_cast(tree->LeafOutput(i)); + } + outputs = Network::GlobalSum(&outputs); + n_nozeroworker_perleaf = Network::GlobalSum(&n_nozeroworker_perleaf); + for (int i = 0; i < tree->num_leaves(); ++i) { + tree->SetLeafOutput(i, outputs[i] / n_nozeroworker_perleaf[i]); + } + } + } + cuda_tree->SyncLeafOutputFromHostToCUDA(); +} + +Tree* CUDASingleGPUTreeLearner::FitByExistingTree(const Tree* old_tree, const score_t* gradients, const score_t* hessians) const { + std::unique_ptr cuda_tree(new CUDATree(old_tree)); + SetCUDAMemory(cuda_leaf_gradient_stat_buffer_, 0, static_cast(old_tree->num_leaves()), __FILE__, __LINE__); + SetCUDAMemory(cuda_leaf_hessian_stat_buffer_, 0, static_cast(old_tree->num_leaves()), __FILE__, __LINE__); + ReduceLeafStat(cuda_tree.get(), gradients, hessians, cuda_data_partition_->cuda_data_indices()); + cuda_tree->SyncLeafOutputFromCUDAToHost(); + return cuda_tree.release(); +} + +Tree* CUDASingleGPUTreeLearner::FitByExistingTree(const Tree* old_tree, const std::vector& leaf_pred, + const score_t* gradients, const score_t* hessians) const { + cuda_data_partition_->ResetByLeafPred(leaf_pred, old_tree->num_leaves()); + refit_num_data_ = static_cast(leaf_pred.size()); + data_size_t buffer_size = static_cast(old_tree->num_leaves()); + if (old_tree->num_leaves() > 2048) { + const int num_block = (refit_num_data_ + CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE - 1) / CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE; + buffer_size *= static_cast(num_block + 1); + } + if (buffer_size != leaf_stat_buffer_size_) { + if (leaf_stat_buffer_size_ != 0) { + DeallocateCUDAMemory(&cuda_leaf_gradient_stat_buffer_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_leaf_hessian_stat_buffer_, __FILE__, __LINE__); + } + AllocateCUDAMemory(&cuda_leaf_gradient_stat_buffer_, static_cast(buffer_size), __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_leaf_hessian_stat_buffer_, static_cast(buffer_size), __FILE__, __LINE__); + } + return FitByExistingTree(old_tree, gradients, hessians); +} + +void CUDASingleGPUTreeLearner::ReduceLeafStat( + CUDATree* old_tree, const score_t* gradients, const score_t* hessians, const data_size_t* num_data_in_leaf) const { + LaunchReduceLeafStatKernel(gradients, hessians, num_data_in_leaf, old_tree->cuda_leaf_parent(), + old_tree->cuda_left_child(), old_tree->cuda_right_child(), + old_tree->num_leaves(), 
refit_num_data_, old_tree->cuda_leaf_value_ref(), old_tree->shrinkage()); +} + +void CUDASingleGPUTreeLearner::ConstructBitsetForCategoricalSplit( + const CUDASplitInfo* best_split_info) { + LaunchConstructBitsetForCategoricalSplitKernel(best_split_info); +} + +void CUDASingleGPUTreeLearner::AllocateBitset() { + has_categorical_feature_ = false; + categorical_bin_offsets_.clear(); + categorical_bin_offsets_.push_back(0); + categorical_bin_to_value_.clear(); + for (int i = 0; i < train_data_->num_features(); ++i) { + const BinMapper* bin_mapper = train_data_->FeatureBinMapper(i); + if (bin_mapper->bin_type() == BinType::CategoricalBin) { + has_categorical_feature_ = true; + break; + } + } + if (has_categorical_feature_) { + int max_cat_value = 0; + int max_cat_num_bin = 0; + for (int i = 0; i < train_data_->num_features(); ++i) { + const BinMapper* bin_mapper = train_data_->FeatureBinMapper(i); + if (bin_mapper->bin_type() == BinType::CategoricalBin) { + max_cat_value = std::max(bin_mapper->MaxCatValue(), max_cat_value); + max_cat_num_bin = std::max(bin_mapper->num_bin(), max_cat_num_bin); + } + } + // std::max(..., 1UL) to avoid error in the case when there are NaN's in the categorical values + const size_t cuda_bitset_max_size = std::max(static_cast((max_cat_value + 31) / 32), 1UL); + const size_t cuda_bitset_inner_max_size = std::max(static_cast((max_cat_num_bin + 31) / 32), 1UL); + AllocateCUDAMemory(&cuda_bitset_, cuda_bitset_max_size, __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_bitset_inner_, cuda_bitset_inner_max_size, __FILE__, __LINE__); + const int max_cat_in_split = std::min(config_->max_cat_threshold, max_cat_num_bin / 2); + const int num_blocks = (max_cat_in_split + CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE - 1) / CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE; + AllocateCUDAMemory(&cuda_block_bitset_len_buffer_, num_blocks, __FILE__, __LINE__); + + for (int i = 0; i < train_data_->num_features(); ++i) { + const BinMapper* bin_mapper = train_data_->FeatureBinMapper(i); + if (bin_mapper->bin_type() == BinType::CategoricalBin) { + categorical_bin_offsets_.push_back(bin_mapper->num_bin()); + } else { + categorical_bin_offsets_.push_back(0); + } + } + for (size_t i = 1; i < categorical_bin_offsets_.size(); ++i) { + categorical_bin_offsets_[i] += categorical_bin_offsets_[i - 1]; + } + categorical_bin_to_value_.resize(categorical_bin_offsets_.back(), 0); + for (int i = 0; i < train_data_->num_features(); ++i) { + const BinMapper* bin_mapper = train_data_->FeatureBinMapper(i); + if (bin_mapper->bin_type() == BinType::CategoricalBin) { + const int offset = categorical_bin_offsets_[i]; + for (int bin = 0; bin < bin_mapper->num_bin(); ++bin) { + categorical_bin_to_value_[offset + bin] = static_cast(bin_mapper->BinToValue(bin)); + } + } + } + InitCUDAMemoryFromHostMemory(&cuda_categorical_bin_offsets_, categorical_bin_offsets_.data(), categorical_bin_offsets_.size(), __FILE__, __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_categorical_bin_to_value_, categorical_bin_to_value_.data(), categorical_bin_to_value_.size(), __FILE__, __LINE__); + } else { + cuda_bitset_ = nullptr; + cuda_bitset_inner_ = nullptr; + } + cuda_bitset_len_ = 0; + cuda_bitset_inner_len_ = 0; +} + +#ifdef DEBUG +void CUDASingleGPUTreeLearner::CheckSplitValid( + const int left_leaf, + const int right_leaf, + const double split_sum_left_gradients, + const double split_sum_right_gradients) { + std::vector left_data_indices(leaf_num_data_[left_leaf]); + std::vector right_data_indices(leaf_num_data_[right_leaf]); + 
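// The bitset layout used for categorical splits: category value v is stored in
// 32-bit word (v / 32) at bit (v % 32), which is why AllocateBitset() above sizes
// the buffers as (max_cat_value + 31) / 32 and (max_cat_num_bin + 31) / 32 words.
// A minimal host-side sketch of that encoding (illustrative only, kept out of the
// build with #if 0; MakeBitset and InBitset are hypothetical helper names):
#if 0
#include <cstdint>
#include <vector>

std::vector<uint32_t> MakeBitset(const std::vector<int>& categories, int max_cat_value) {
  std::vector<uint32_t> bits((max_cat_value + 31) / 32, 0U);
  for (int v : categories) {
    bits[v / 32] |= (1U << (v % 32));  // same word/bit layout as CUDAConstructBitsetKernel
  }
  return bits;
}

bool InBitset(const std::vector<uint32_t>& bits, int v) {
  // a sample goes to the left child iff its category value is present in the chosen set
  return static_cast<size_t>(v / 32) < bits.size() && ((bits[v / 32] >> (v % 32)) & 1U);
}
#endif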
CopyFromCUDADeviceToHost(left_data_indices.data(), + cuda_data_partition_->cuda_data_indices() + leaf_data_start_[left_leaf], + leaf_num_data_[left_leaf], __FILE__, __LINE__); + CopyFromCUDADeviceToHost(right_data_indices.data(), + cuda_data_partition_->cuda_data_indices() + leaf_data_start_[right_leaf], + leaf_num_data_[right_leaf], __FILE__, __LINE__); + double sum_left_gradients = 0.0f, sum_left_hessians = 0.0f; + double sum_right_gradients = 0.0f, sum_right_hessians = 0.0f; + for (size_t i = 0; i < left_data_indices.size(); ++i) { + const data_size_t index = left_data_indices[i]; + sum_left_gradients += gradients_[index]; + sum_left_hessians += hessians_[index]; + } + for (size_t i = 0; i < right_data_indices.size(); ++i) { + const data_size_t index = right_data_indices[i]; + sum_right_gradients += gradients_[index]; + sum_right_hessians += hessians_[index]; + } + CHECK_LE(std::fabs(sum_left_gradients - split_sum_left_gradients), 1e-6f); + CHECK_LE(std::fabs(sum_left_hessians - leaf_sum_hessians_[left_leaf]), 1e-6f); + CHECK_LE(std::fabs(sum_right_gradients - split_sum_right_gradients), 1e-6f); + CHECK_LE(std::fabs(sum_right_hessians - leaf_sum_hessians_[right_leaf]), 1e-6f); +} +#endif // DEBUG + +} // namespace LightGBM + +#endif // USE_CUDA_EXP diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cu b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cu new file mode 100644 index 000000000000..f4a87de499cb --- /dev/null +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cu @@ -0,0 +1,261 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. + */ + +#ifdef USE_CUDA_EXP + +#include + +#include "cuda_single_gpu_tree_learner.hpp" + +#include + +namespace LightGBM { + +__global__ void ReduceLeafStatKernel_SharedMemory( + const score_t* gradients, + const score_t* hessians, + const int num_leaves, + const data_size_t num_data, + const int* data_index_to_leaf_index, + double* leaf_grad_stat_buffer, + double* leaf_hess_stat_buffer) { + extern __shared__ double shared_mem[]; + double* shared_grad_sum = shared_mem; + double* shared_hess_sum = shared_mem + num_leaves; + const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + for (int leaf_index = static_cast(threadIdx.x); leaf_index < num_leaves; leaf_index += static_cast(blockDim.x)) { + shared_grad_sum[leaf_index] = 0.0f; + shared_hess_sum[leaf_index] = 0.0f; + } + __syncthreads(); + if (data_index < num_data) { + const int leaf_index = data_index_to_leaf_index[data_index]; + atomicAdd_block(shared_grad_sum + leaf_index, gradients[data_index]); + atomicAdd_block(shared_hess_sum + leaf_index, hessians[data_index]); + } + __syncthreads(); + for (int leaf_index = static_cast(threadIdx.x); leaf_index < num_leaves; leaf_index += static_cast(blockDim.x)) { + atomicAdd_system(leaf_grad_stat_buffer + leaf_index, shared_grad_sum[leaf_index]); + atomicAdd_system(leaf_hess_stat_buffer + leaf_index, shared_hess_sum[leaf_index]); + } +} + +__global__ void ReduceLeafStatKernel_GlobalMemory( + const score_t* gradients, + const score_t* hessians, + const int num_leaves, + const data_size_t num_data, + const int* data_index_to_leaf_index, + double* leaf_grad_stat_buffer, + double* leaf_hess_stat_buffer) { + const size_t offset = static_cast(num_leaves) * (blockIdx.x + 1); + double* grad_sum = leaf_grad_stat_buffer + offset; + double* hess_sum = leaf_hess_stat_buffer + offset; + const 
data_size_t data_index = static_cast<data_size_t>(threadIdx.x + blockIdx.x * blockDim.x); + for (int leaf_index = static_cast<int>(threadIdx.x); leaf_index < num_leaves; leaf_index += static_cast<int>(blockDim.x)) { + grad_sum[leaf_index] = 0.0f; + hess_sum[leaf_index] = 0.0f; + } + __syncthreads(); + if (data_index < num_data) { + const int leaf_index = data_index_to_leaf_index[data_index]; + atomicAdd_block(grad_sum + leaf_index, gradients[data_index]); + atomicAdd_block(hess_sum + leaf_index, hessians[data_index]); + } + __syncthreads(); + for (int leaf_index = static_cast<int>(threadIdx.x); leaf_index < num_leaves; leaf_index += static_cast<int>(blockDim.x)) { + atomicAdd_system(leaf_grad_stat_buffer + leaf_index, grad_sum[leaf_index]); + atomicAdd_system(leaf_hess_stat_buffer + leaf_index, hess_sum[leaf_index]); + } +} + +template <bool USE_L1, bool USE_SMOOTHING> +__global__ void CalcRefitLeafOutputKernel( + const int num_leaves, + const double* leaf_grad_stat_buffer, + const double* leaf_hess_stat_buffer, + const data_size_t* num_data_in_leaf, + const int* leaf_parent, + const int* left_child, + const int* right_child, + const double lambda_l1, + const double lambda_l2, + const double path_smooth, + const double shrinkage_rate, + const double refit_decay_rate, + double* leaf_value) { + const int leaf_index = static_cast<int>(threadIdx.x + blockIdx.x * blockDim.x); + if (leaf_index < num_leaves) { + const double sum_gradients = leaf_grad_stat_buffer[leaf_index]; + const double sum_hessians = leaf_hess_stat_buffer[leaf_index]; + const data_size_t num_data = num_data_in_leaf[leaf_index]; + const double old_leaf_value = leaf_value[leaf_index]; + double new_leaf_value = 0.0f; + if (!USE_SMOOTHING) { + new_leaf_value = CUDALeafSplits::CalculateSplittedLeafOutput<USE_L1, false>(sum_gradients, sum_hessians, lambda_l1, lambda_l2, 0.0f, 0, 0.0f); + } else { + const int parent = leaf_parent[leaf_index]; + if (parent >= 0) { + const int sibling = left_child[parent] == leaf_index ? right_child[parent] : left_child[parent]; + const double sum_gradients_of_parent = sum_gradients + leaf_grad_stat_buffer[sibling]; + const double sum_hessians_of_parent = sum_hessians + leaf_hess_stat_buffer[sibling]; + const data_size_t num_data_in_parent = num_data + num_data_in_leaf[sibling]; + const double parent_output = + CUDALeafSplits::CalculateSplittedLeafOutput<USE_L1, false>( + sum_gradients_of_parent, sum_hessians_of_parent, lambda_l1, lambda_l2, 0.0f, 0, 0.0f); + new_leaf_value = CUDALeafSplits::CalculateSplittedLeafOutput<USE_L1, true>( + sum_gradients, sum_hessians, lambda_l1, lambda_l2, path_smooth, num_data_in_parent, parent_output); + } else { + new_leaf_value = CUDALeafSplits::CalculateSplittedLeafOutput<USE_L1, false>(sum_gradients, sum_hessians, lambda_l1, lambda_l2, 0.0f, 0, 0.0f); + } + } + if (isnan(new_leaf_value)) { + new_leaf_value = 0.0f; + } else { + new_leaf_value *= shrinkage_rate; + } + leaf_value[leaf_index] = refit_decay_rate * old_leaf_value + (1.0f - refit_decay_rate) * new_leaf_value; + } +} + +void CUDASingleGPUTreeLearner::LaunchReduceLeafStatKernel( + const score_t* gradients, const score_t* hessians, const data_size_t* num_data_in_leaf, + const int* leaf_parent, const int* left_child, const int* right_child, const int num_leaves, + const data_size_t num_data, double* cuda_leaf_value, const double shrinkage_rate) const { + int num_block = (num_data + CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE - 1) / CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE; + if (num_leaves <= 2048) { + ReduceLeafStatKernel_SharedMemory<<<num_block, CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE, 2 * num_leaves * sizeof(double)>>>( + gradients, hessians, num_leaves, num_data, cuda_data_partition_->cuda_data_index_to_leaf_index(), + cuda_leaf_gradient_stat_buffer_, cuda_leaf_hessian_stat_buffer_); + } else { + ReduceLeafStatKernel_GlobalMemory<<<num_block, CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE>>>( + gradients, hessians, num_leaves, num_data, cuda_data_partition_->cuda_data_index_to_leaf_index(), + cuda_leaf_gradient_stat_buffer_, cuda_leaf_hessian_stat_buffer_); + } + const bool use_l1 = config_->lambda_l1 > 0.0f; + const bool use_smoothing = config_->path_smooth > 0.0f; + num_block = (num_leaves + CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE - 1) / CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE; + + #define CalcRefitLeafOutputKernel_ARGS \ + num_leaves, cuda_leaf_gradient_stat_buffer_, cuda_leaf_hessian_stat_buffer_, num_data_in_leaf, \ + leaf_parent, left_child, right_child, \ + config_->lambda_l1, config_->lambda_l2, config_->path_smooth, \ + shrinkage_rate, config_->refit_decay_rate, cuda_leaf_value + + if (!use_l1) { + if (!use_smoothing) { + CalcRefitLeafOutputKernel<false, false> + <<<num_block, CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE>>>(CalcRefitLeafOutputKernel_ARGS); + } else { + CalcRefitLeafOutputKernel<false, true> + <<<num_block, CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE>>>(CalcRefitLeafOutputKernel_ARGS); + } + } else { + if (!use_smoothing) { + CalcRefitLeafOutputKernel<true, false> + <<<num_block, CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE>>>(CalcRefitLeafOutputKernel_ARGS); + } else { + CalcRefitLeafOutputKernel<true, true> + <<<num_block, CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE>>>(CalcRefitLeafOutputKernel_ARGS); + } + } +} + +template <typename T, bool IS_INNER> +__global__ void CalcBitsetLenKernel(const CUDASplitInfo* best_split_info, size_t* out_len_buffer) { + __shared__ size_t shared_mem_buffer[32]; + const T* vals = nullptr; + if (IS_INNER) { + vals = reinterpret_cast<const T*>(best_split_info->cat_threshold); + } else { + vals = reinterpret_cast<const T*>(best_split_info->cat_threshold_real); + } + const int i = static_cast<int>(threadIdx.x + blockIdx.x * blockDim.x); + size_t len = 0; + if (i < best_split_info->num_cat_threshold) { + const T val = vals[i]; + len = (val / 32) + 1; + } + const size_t block_max_len = ShuffleReduceMax(len, shared_mem_buffer, blockDim.x); + if (threadIdx.x == 0) { + out_len_buffer[blockIdx.x] = block_max_len; + } +} + +__global__ void 
ReduceBlockMaxLen(size_t* out_len_buffer, const int num_blocks) { + __shared__ size_t shared_mem_buffer[32]; + size_t max_len = 0; + for (int i = static_cast<int>(threadIdx.x); i < num_blocks; i += static_cast<int>(blockDim.x)) { + max_len = max(out_len_buffer[i], max_len); + } + const size_t all_max_len = ShuffleReduceMax(max_len, shared_mem_buffer, blockDim.x); + if (threadIdx.x == 0) { + out_len_buffer[0] = all_max_len; + } +} + +template <typename T, bool IS_INNER> +__global__ void CUDAConstructBitsetKernel(const CUDASplitInfo* best_split_info, uint32_t* out, size_t cuda_bitset_len) { + const T* vals = nullptr; + if (IS_INNER) { + vals = reinterpret_cast<const T*>(best_split_info->cat_threshold); + } else { + vals = reinterpret_cast<const T*>(best_split_info->cat_threshold_real); + } + const int i = static_cast<int>(threadIdx.x + blockIdx.x * blockDim.x); + if (i < best_split_info->num_cat_threshold) { + const T val = vals[i]; + // can use add instead of or here, because each bit will only be added once + atomicAdd_system(out + (val / 32), (0x1 << (val % 32))); + } +} + +__global__ void SetRealThresholdKernel( + const CUDASplitInfo* best_split_info, + const int* categorical_bin_to_value, + const int* categorical_bin_offsets) { + const int num_cat_threshold = best_split_info->num_cat_threshold; + const int* categorical_bin_to_value_ptr = categorical_bin_to_value + categorical_bin_offsets[best_split_info->inner_feature_index]; + int* cat_threshold_real = best_split_info->cat_threshold_real; + const uint32_t* cat_threshold = best_split_info->cat_threshold; + const int index = static_cast<int>(threadIdx.x + blockIdx.x * blockDim.x); + if (index < num_cat_threshold) { + cat_threshold_real[index] = categorical_bin_to_value_ptr[cat_threshold[index]]; + } +} + +template <typename T, bool IS_INNER> +void CUDAConstructBitset(const CUDASplitInfo* best_split_info, const int num_cat_threshold, uint32_t* out, size_t bitset_len) { + const int num_blocks = (num_cat_threshold + CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE - 1) / CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE; + // clear the bitset vector first + SetCUDAMemory(out, 0, bitset_len, __FILE__, __LINE__); + CUDAConstructBitsetKernel<T, IS_INNER><<<num_blocks, CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE>>>(best_split_info, out, bitset_len); +} + +template <typename T, bool IS_INNER> +size_t CUDABitsetLen(const CUDASplitInfo* best_split_info, const int num_cat_threshold, size_t* out_len_buffer) { + const int num_blocks = (num_cat_threshold + CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE - 1) / CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE; + CalcBitsetLenKernel<T, IS_INNER><<<num_blocks, CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE>>>(best_split_info, out_len_buffer); + ReduceBlockMaxLen<<<1, CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE>>>(out_len_buffer, num_blocks); + size_t host_max_len = 0; + CopyFromCUDADeviceToHost(&host_max_len, out_len_buffer, 1, __FILE__, __LINE__); + return host_max_len; +} + +void CUDASingleGPUTreeLearner::LaunchConstructBitsetForCategoricalSplitKernel( + const CUDASplitInfo* best_split_info) { + const int num_blocks = (num_cat_threshold_ + CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE - 1) / CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE; + SetRealThresholdKernel<<<num_blocks, CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE>>> + (best_split_info, cuda_categorical_bin_to_value_, cuda_categorical_bin_offsets_); + cuda_bitset_inner_len_ = CUDABitsetLen<uint32_t, true>(best_split_info, num_cat_threshold_, cuda_block_bitset_len_buffer_); + CUDAConstructBitset<uint32_t, true>(best_split_info, num_cat_threshold_, cuda_bitset_inner_, cuda_bitset_inner_len_); + cuda_bitset_len_ = CUDABitsetLen<int, false>(best_split_info, num_cat_threshold_, cuda_block_bitset_len_buffer_); + CUDAConstructBitset<int, false>(best_split_info, num_cat_threshold_, cuda_bitset_, cuda_bitset_len_); +} + +} // namespace LightGBM + +#endif // USE_CUDA_EXP diff --git 
a/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp new file mode 100644 index 000000000000..1b32b14cfba1 --- /dev/null +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp @@ -0,0 +1,143 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. + */ +#ifndef LIGHTGBM_TREELEARNER_CUDA_CUDA_SINGLE_GPU_TREE_LEARNER_HPP_ +#define LIGHTGBM_TREELEARNER_CUDA_CUDA_SINGLE_GPU_TREE_LEARNER_HPP_ + +#include <memory> +#include <vector> + +#ifdef USE_CUDA_EXP + +#include "cuda_leaf_splits.hpp" +#include "cuda_histogram_constructor.hpp" +#include "cuda_data_partition.hpp" +#include "cuda_best_split_finder.hpp" + +#include "../serial_tree_learner.h" + +namespace LightGBM { + +#define CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE (1024) + +class CUDASingleGPUTreeLearner: public SerialTreeLearner { + public: + explicit CUDASingleGPUTreeLearner(const Config* config); + + ~CUDASingleGPUTreeLearner(); + + void Init(const Dataset* train_data, bool is_constant_hessian) override; + + void ResetTrainingData(const Dataset* train_data, + bool is_constant_hessian) override; + + Tree* Train(const score_t* gradients, const score_t *hessians, bool is_first_tree) override; + + void SetBaggingData(const Dataset* subset, const data_size_t* used_indices, data_size_t num_data) override; + + void AddPredictionToScore(const Tree* tree, double* out_score) const override; + + void RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj, std::function<double(const label_t*, int)> residual_getter, + data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const override; + + void ResetConfig(const Config* config) override; + + Tree* FitByExistingTree(const Tree* old_tree, const score_t* gradients, const score_t* hessians) const override; + + Tree* FitByExistingTree(const Tree* old_tree, const std::vector<int>& leaf_pred, + const score_t* gradients, const score_t* hessians) const override; + + protected: + void BeforeTrain() override; + + void ReduceLeafStat(CUDATree* old_tree, const score_t* gradients, const score_t* hessians, const data_size_t* num_data_in_leaf) const; + + void LaunchReduceLeafStatKernel(const score_t* gradients, const score_t* hessians, const data_size_t* num_data_in_leaf, + const int* leaf_parent, const int* left_child, const int* right_child, + const int num_leaves, const data_size_t num_data, double* cuda_leaf_value, const double shrinkage_rate) const; + + void ConstructBitsetForCategoricalSplit(const CUDASplitInfo* best_split_info); + + void LaunchConstructBitsetForCategoricalSplitKernel(const CUDASplitInfo* best_split_info); + + void AllocateBitset(); + + #ifdef DEBUG + void CheckSplitValid( + const int left_leaf, const int right_leaf, + const double sum_left_gradients, const double sum_right_gradients); + #endif // DEBUG + + // GPU device ID + int gpu_device_id_; + // number of threads on CPU + int num_threads_; + + // CUDA components for tree training + + // leaf splits information for smaller and larger leaves + std::unique_ptr<CUDALeafSplits> cuda_smaller_leaf_splits_; + std::unique_ptr<CUDALeafSplits> cuda_larger_leaf_splits_; + // data partition that partitions data indices into different leaves + std::unique_ptr<CUDADataPartition> cuda_data_partition_; + // for histogram construction + std::unique_ptr<CUDAHistogramConstructor> cuda_histogram_constructor_; + // for best split information finding, given the histograms + std::unique_ptr<CUDABestSplitFinder> cuda_best_split_finder_; + + std::vector<int> leaf_best_split_feature_; + std::vector<uint32_t> leaf_best_split_threshold_; + std::vector<uint8_t> leaf_best_split_default_left_; + std::vector<data_size_t> leaf_num_data_; + std::vector<data_size_t> leaf_data_start_; + std::vector<double> leaf_sum_hessians_; + int smaller_leaf_index_; + int larger_leaf_index_; + int best_leaf_index_; + int num_cat_threshold_; + bool has_categorical_feature_; + + std::vector<int> categorical_bin_to_value_; + std::vector<int> categorical_bin_offsets_; + + mutable double* cuda_leaf_gradient_stat_buffer_; + mutable double* cuda_leaf_hessian_stat_buffer_; + mutable data_size_t leaf_stat_buffer_size_; + mutable data_size_t refit_num_data_; + uint32_t* cuda_bitset_; + size_t cuda_bitset_len_; + uint32_t* cuda_bitset_inner_; + size_t cuda_bitset_inner_len_; + size_t* cuda_block_bitset_len_buffer_; + int* cuda_categorical_bin_to_value_; + int* cuda_categorical_bin_offsets_; + + /*! \brief gradients on CUDA */ + score_t* cuda_gradients_; + /*! \brief hessians on CUDA */ + score_t* cuda_hessians_; +}; + +} // namespace LightGBM + +#else // USE_CUDA_EXP + +// When GPU support is not compiled in, quit with an error message + +namespace LightGBM { + +class CUDASingleGPUTreeLearner: public SerialTreeLearner { + public: + #pragma warning(disable : 4702) + explicit CUDASingleGPUTreeLearner(const Config* tree_config) : SerialTreeLearner(tree_config) { + Log::Fatal("CUDA Tree Learner experimental version was not enabled in this build.\n" + "Please recompile with CMake option -DUSE_CUDA_EXP=1"); + } +}; + +} // namespace LightGBM + +#endif // USE_CUDA_EXP +#endif // LIGHTGBM_TREELEARNER_CUDA_CUDA_SINGLE_GPU_TREE_LEARNER_HPP_ diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp index 5e4ccaeb9a52..a6bd4c47ae06 100644 --- a/src/treelearner/cuda_tree_learner.cpp +++ b/src/treelearner/cuda_tree_learner.cpp @@ -63,6 +63,43 @@ CUDATreeLearner::CUDATreeLearner(const Config* config) } CUDATreeLearner::~CUDATreeLearner() { + #pragma omp parallel for schedule(static, num_gpu_) + + for (int device_id = 0; device_id < num_gpu_; ++device_id) { + CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); + + if (device_features_[device_id] != NULL) { + CUDASUCCESS_OR_FATAL(cudaFree(device_features_[device_id])); + } + + if (device_gradients_[device_id] != NULL) { + CUDASUCCESS_OR_FATAL(cudaFree(device_gradients_[device_id])); + } + + if (device_hessians_[device_id] != NULL) { + CUDASUCCESS_OR_FATAL(cudaFree(device_hessians_[device_id])); + } + + if (device_feature_masks_[device_id] != NULL) { + CUDASUCCESS_OR_FATAL(cudaFree(device_feature_masks_[device_id])); + } + + if (device_data_indices_[device_id] != NULL) { + CUDASUCCESS_OR_FATAL(cudaFree(device_data_indices_[device_id])); + } + + if (sync_counters_[device_id] != NULL) { + CUDASUCCESS_OR_FATAL(cudaFree(sync_counters_[device_id])); + } + + if (device_subhistograms_[device_id] != NULL) { + CUDASUCCESS_OR_FATAL(cudaFree(device_subhistograms_[device_id])); + } + + if (device_histogram_outputs_[device_id] != NULL) { + CUDASUCCESS_OR_FATAL(cudaFree(device_histogram_outputs_[device_id])); + } + } } @@ -469,10 +506,30 @@ void CUDATreeLearner::InitGPU(int num_gpu) { } else { Log::Fatal("bin size %d cannot run on GPU", max_num_bin_); } - if (max_num_bin_ == 65) { + + // ignore the feature groups that contain categorical features when producing warnings about max_bin. + // these groups may contain a larger number of bins due to categorical features, not due to the setting of max_bin. 
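// The loop below walks features in feature-group order and tracks, for every group
// that contains no categorical feature, that group's total bin count; only the
// maximum over those purely numerical groups is compared against 65/17 when the
// max_bin performance warnings are produced. The same idea restated on plain
// vectors (illustrative sketch only, kept out of the build with #if 0; the
// function name is hypothetical):
#if 0
#include <algorithm>
#include <vector>

int MaxNumBinIgnoringCategoricalGroups(const std::vector<int>& group_of_feature,
                                       const std::vector<bool>& feature_is_categorical,
                                       const std::vector<int>& num_bin_of_group) {
  std::vector<bool> group_has_categorical(num_bin_of_group.size(), false);
  for (size_t f = 0; f < group_of_feature.size(); ++f) {
    if (feature_is_categorical[f]) {
      group_has_categorical[group_of_feature[f]] = true;  // taint the whole group
    }
  }
  int max_num_bin = 0;
  for (size_t g = 0; g < num_bin_of_group.size(); ++g) {
    if (!group_has_categorical[g]) {
      // only groups whose bin count is driven by max_bin are considered
      max_num_bin = std::max(max_num_bin, num_bin_of_group[g]);
    }
  }
  return max_num_bin;
}
#endif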
+ int max_num_bin_no_categorical = 0; + int cur_feature_group = 0; + bool categorical_feature_found = false; + for (int inner_feature_index = 0; inner_feature_index < num_features_; ++inner_feature_index) { + const int feature_group = train_data_->Feature2Group(inner_feature_index); + const BinMapper* feature_bin_mapper = train_data_->FeatureBinMapper(inner_feature_index); + if (feature_bin_mapper->bin_type() == BinType::CategoricalBin) { + categorical_feature_found = true; + } + if (feature_group != cur_feature_group || inner_feature_index == num_features_ - 1) { + if (!categorical_feature_found) { + max_num_bin_no_categorical = std::max(max_num_bin_no_categorical, train_data_->FeatureGroupNumBin(cur_feature_group)); + } + categorical_feature_found = false; + cur_feature_group = feature_group; + } + } + if (max_num_bin_no_categorical == 65) { Log::Warning("Setting max_bin to 63 is suggested for best performance"); } - if (max_num_bin_ == 17) { + if (max_num_bin_no_categorical == 17) { Log::Warning("Setting max_bin to 15 is suggested for best performance"); } diff --git a/src/treelearner/gpu_tree_learner.cpp b/src/treelearner/gpu_tree_learner.cpp index 30ba26aa7221..ead5027a95e4 100644 --- a/src/treelearner/gpu_tree_learner.cpp +++ b/src/treelearner/gpu_tree_learner.cpp @@ -245,7 +245,7 @@ void GPUTreeLearner::AllocateGPUMemory() { } // allocate memory for all features (FIXME: 4 GB barrier on some devices, need to split to multiple buffers) device_features_.reset(); - device_features_ = std::unique_ptr<boost::compute::vector<Feature4>>(new boost::compute::vector<Feature4>(num_dense_feature4_ * num_data_, ctx_)); + device_features_ = std::unique_ptr<boost::compute::vector<Feature4>>(new boost::compute::vector<Feature4>((uint64_t)num_dense_feature4_ * num_data_, ctx_)); // unpin old buffer if necessary before destructing them if (ptr_pinned_gradients_) { queue_.enqueue_unmap_buffer(pinned_gradients_, ptr_pinned_gradients_); @@ -427,7 +427,7 @@ void GPUTreeLearner::AllocateGPUMemory() { } #pragma omp critical queue_.enqueue_write_buffer(device_features_->get_buffer(), - i * num_data_ * sizeof(Feature4), num_data_ * sizeof(Feature4), host4); + (uint64_t)i * num_data_ * sizeof(Feature4), num_data_ * sizeof(Feature4), host4); #if GPU_DEBUG >= 1 printf("first example of feature-group tuple is: %d %d %d %d\n", host4[0].s[0], host4[0].s[1], host4[0].s[2], host4[0].s[3]); printf("Feature-groups copied to device with multipliers "); @@ -503,7 +503,7 @@ void GPUTreeLearner::AllocateGPUMemory() { } // copying the last 1 to (dword_features - 1) feature-groups in the last tuple queue_.enqueue_write_buffer(device_features_->get_buffer(), - (num_dense_feature4_ - 1) * num_data_ * sizeof(Feature4), num_data_ * sizeof(Feature4), host4); + (num_dense_feature4_ - 1) * (uint64_t)num_data_ * sizeof(Feature4), num_data_ * sizeof(Feature4), host4); #if GPU_DEBUG >= 1 printf("Last features copied to device\n"); #endif @@ -719,10 +719,30 @@ void GPUTreeLearner::InitGPU(int platform_id, int device_id) { } else { Log::Fatal("bin size %d cannot run on GPU", max_num_bin_); } - if (max_num_bin_ == 65) { + + // ignore the feature groups that contain categorical features when producing warnings about max_bin. + // these groups may contain a larger number of bins due to categorical features, not due to the setting of max_bin. 
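// The OpenCL-based GPUTreeLearner::InitGPU() below repeats the same
// categorical-aware scan as the CUDA learner above (see the sketch there);
// only groups without categorical features feed the 65/17 max_bin warnings.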
+ int max_num_bin_no_categorical = 0; + int cur_feature_group = 0; + bool categorical_feature_found = false; + for (int inner_feature_index = 0; inner_feature_index < num_features_; ++inner_feature_index) { + const int feature_group = train_data_->Feature2Group(inner_feature_index); + const BinMapper* feature_bin_mapper = train_data_->FeatureBinMapper(inner_feature_index); + if (feature_bin_mapper->bin_type() == BinType::CategoricalBin) { + categorical_feature_found = true; + } + if (feature_group != cur_feature_group || inner_feature_index == num_features_ - 1) { + if (!categorical_feature_found) { + max_num_bin_no_categorical = std::max(max_num_bin_no_categorical, train_data_->FeatureGroupNumBin(cur_feature_group)); + } + categorical_feature_found = false; + cur_feature_group = feature_group; + } + } + if (max_num_bin_no_categorical == 65) { Log::Warning("Setting max_bin to 63 is suggested for best performance"); } - if (max_num_bin_ == 17) { + if (max_num_bin_no_categorical == 17) { Log::Warning("Setting max_bin to 15 is suggested for best performance"); } ctx_ = boost::compute::context(dev_); diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 304c712f0723..402889d3a561 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -278,6 +278,10 @@ void SerialTreeLearner::BeforeTrain() { } larger_leaf_splits_->Init(); + + if (cegb_ != nullptr) { + cegb_->BeforeTrain(); + } } bool SerialTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) { @@ -772,7 +776,7 @@ void SerialTreeLearner::ComputeBestSplitForFeature( new_split.feature = real_fidx; if (cegb_ != nullptr) { new_split.gain -= - cegb_->DetlaGain(feature_index, real_fidx, leaf_splits->leaf_index(), + cegb_->DeltaGain(feature_index, real_fidx, leaf_splits->leaf_index(), num_data, new_split); } if (new_split.monotone_type != 0) { diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index 7dfadf05d119..7d05debbc12b 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -206,12 +206,12 @@ class SerialTreeLearner: public TreeLearner { std::unique_ptr<LeafSplits> smaller_leaf_splits_; /*! \brief stores best thresholds for all feature for larger leaf */ std::unique_ptr<LeafSplits> larger_leaf_splits_; -#ifdef USE_GPU +#if defined(USE_GPU) /*! \brief gradients of current iteration, ordered for cache optimized, aligned to 4K page */ std::vector<score_t, Common::AlignmentAllocator<score_t, kAlignedSize>> ordered_gradients_; /*! \brief hessians of current iteration, ordered for cache optimized, aligned to 4K page */ std::vector<score_t, Common::AlignmentAllocator<score_t, kAlignedSize>> ordered_hessians_; -#elif USE_CUDA +#elif defined(USE_CUDA) || defined(USE_CUDA_EXP) /*! \brief gradients of current iteration, ordered for cache optimized */ std::vector<score_t, CHAllocator<score_t>> ordered_gradients_; /*! 
\brief hessians of current iteration, ordered for cache optimized */ diff --git a/src/treelearner/tree_learner.cpp b/src/treelearner/tree_learner.cpp index ed13f646c388..ee3d16a51c1b 100644 --- a/src/treelearner/tree_learner.cpp +++ b/src/treelearner/tree_learner.cpp @@ -9,6 +9,7 @@ #include "linear_tree_learner.h" #include "parallel_tree_learner.h" #include "serial_tree_learner.h" +#include "cuda/cuda_single_gpu_tree_learner.hpp" namespace LightGBM { @@ -48,6 +49,16 @@ TreeLearner* TreeLearner::CreateTreeLearner(const std::string& learner_type, con } else if (learner_type == std::string("voting")) { return new VotingParallelTreeLearner(config); } + } else if (device_type == std::string("cuda_exp")) { + if (learner_type == std::string("serial")) { + if (config->num_gpu == 1) { + return new CUDASingleGPUTreeLearner(config); + } else { + Log::Fatal("cuda_exp only supports training on a single GPU."); + } + } else { + Log::Fatal("cuda_exp only supports training on a single machine."); + } } return nullptr; } diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index 18a8403eba85..93f0b215874d 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -1,6 +1,8 @@ # coding: utf-8 import filecmp import numbers +import re +from os import getenv from pathlib import Path import numpy as np @@ -12,7 +14,7 @@ import lightgbm as lgb from lightgbm.compat import PANDAS_INSTALLED, pd_DataFrame, pd_Series -from .utils import load_breast_cancer +from .utils import dummy_obj, load_breast_cancer, mse_obj def test_basic(tmp_path): @@ -46,8 +48,9 @@ def test_basic(tmp_path): assert bst.current_iteration() == 20 assert bst.num_trees() == 20 assert bst.num_model_per_iteration() == 1 - assert bst.lower_bound() == pytest.approx(-2.9040190126976606) - assert bst.upper_bound() == pytest.approx(3.3182142872462883) + if getenv('TASK', '') != 'cuda_exp': + assert bst.lower_bound() == pytest.approx(-2.9040190126976606) + assert bst.upper_bound() == pytest.approx(3.3182142872462883) tname = tmp_path / "svm_light.dat" model_file = tmp_path / "model.txt" @@ -510,6 +513,72 @@ def test_choose_param_value(): assert original_params == expected_params +def test_choose_param_value_preserves_nones(): + + # preserves None found for main param and still removes aliases + params = lgb.basic._choose_param_value( + main_param_name="num_threads", + params={ + "num_threads": None, + "n_jobs": 4, + "objective": "regression" + }, + default_value=2 + ) + assert params == {"num_threads": None, "objective": "regression"} + + # correctly chooses value when only an alias is provided + params = lgb.basic._choose_param_value( + main_param_name="num_threads", + params={ + "n_jobs": None, + "objective": "regression" + }, + default_value=2 + ) + assert params == {"num_threads": None, "objective": "regression"} + + # adds None if that's given as the default and param not found + params = lgb.basic._choose_param_value( + main_param_name="min_data_in_leaf", + params={ + "objective": "regression" + }, + default_value=None + ) + assert params == {"objective": "regression", "min_data_in_leaf": None} + + +@pytest.mark.parametrize("objective_alias", lgb.basic._ConfigAliases.get("objective")) +def test_choose_param_value_objective(objective_alias): + # If callable is found in objective + params = {objective_alias: dummy_obj} + params = lgb.basic._choose_param_value( + main_param_name="objective", + params=params, + default_value=None + ) + assert params['objective'] == 
dummy_obj + + # Value in params should be preferred to the default_value passed from keyword arguments + params = {objective_alias: dummy_obj} + params = lgb.basic._choose_param_value( + main_param_name="objective", + params=params, + default_value=mse_obj + ) + assert params['objective'] == dummy_obj + + # None of objective or its aliases in params, but default_value is callable. + params = {} + params = lgb.basic._choose_param_value( + main_param_name="objective", + params=params, + default_value=mse_obj + ) + assert params['objective'] == mse_obj + + @pytest.mark.parametrize('collection', ['1d_np', '2d_np', 'pd_float', 'pd_str', '1d_list', '2d_list']) @pytest.mark.parametrize('dtype', [np.float32, np.float64]) def test_list_to_1d_numpy(collection, dtype): @@ -579,3 +648,111 @@ def test_param_aliases(): assert all(len(i) >= 1 for i in aliases.values()) assert all(k in v for k, v in aliases.items()) assert lgb.basic._ConfigAliases.get('config', 'task') == {'config', 'config_file', 'task', 'task_type'} + + +def _bad_gradients(preds, _): + return np.random.randn(len(preds) + 1), np.random.rand(len(preds) + 1) + + +def _good_gradients(preds, _): + return np.random.randn(*preds.shape), np.random.rand(*preds.shape) + + +def test_custom_objective_safety(): + nrows = 100 + X = np.random.randn(nrows, 5) + y_binary = np.arange(nrows) % 2 + classes = [0, 1, 2] + nclass = len(classes) + y_multiclass = np.arange(nrows) % nclass + ds_binary = lgb.Dataset(X, y_binary).construct() + ds_multiclass = lgb.Dataset(X, y_multiclass).construct() + bad_bst_binary = lgb.Booster({'objective': "none"}, ds_binary) + good_bst_binary = lgb.Booster({'objective': "none"}, ds_binary) + bad_bst_multi = lgb.Booster({'objective': "none", "num_class": nclass}, ds_multiclass) + good_bst_multi = lgb.Booster({'objective': "none", "num_class": nclass}, ds_multiclass) + good_bst_binary.update(fobj=_good_gradients) + with pytest.raises(ValueError, match=re.escape("number of models per one iteration (1)")): + bad_bst_binary.update(fobj=_bad_gradients) + good_bst_multi.update(fobj=_good_gradients) + with pytest.raises(ValueError, match=re.escape(f"number of models per one iteration ({nclass})")): + bad_bst_multi.update(fobj=_bad_gradients) + + +@pytest.mark.parametrize('dtype', [np.float32, np.float64]) +@pytest.mark.parametrize('feature_name', [['x1', 'x2'], 'auto']) +def test_no_copy_when_single_float_dtype_dataframe(dtype, feature_name): + pd = pytest.importorskip('pandas') + X = np.random.rand(10, 2).astype(dtype) + df = pd.DataFrame(X) + built_data = lgb.basic._data_from_pandas(df, feature_name, None, None)[0] + assert built_data.dtype == dtype + assert np.shares_memory(X, built_data) + + +@pytest.mark.parametrize('feature_name', [['x1'], [42], 'auto']) +def test_categorical_code_conversion_doesnt_modify_original_data(feature_name): + pd = pytest.importorskip('pandas') + X = np.random.choice(['a', 'b'], 100).reshape(-1, 1) + column_name = 'a' if feature_name == 'auto' else feature_name[0] + df = pd.DataFrame(X.copy(), columns=[column_name], dtype='category') + data = lgb.basic._data_from_pandas(df, feature_name, None, None)[0] + # check that the original data wasn't modified + np.testing.assert_equal(df[column_name], X[:, 0]) + # check that the built data has the codes + np.testing.assert_equal(df[column_name].cat.codes, data[:, 0]) + + +@pytest.mark.parametrize('min_data_in_bin', [2, 10]) +def test_feature_num_bin(min_data_in_bin): + X = np.vstack([ + np.random.rand(100), + np.array([1, 2] * 50), + np.array([0, 1, 2] * 33 + 
[0]), + np.array([1, 2] * 49 + 2 * [np.nan]), + np.zeros(100), + np.random.choice([0, 1], 100), + ]).T + n_continuous = X.shape[1] - 1 + feature_name = [f'x{i}' for i in range(n_continuous)] + ['cat1'] + ds_kwargs = dict( + params={'min_data_in_bin': min_data_in_bin}, + categorical_feature=[n_continuous], # last feature + ) + ds = lgb.Dataset(X, feature_name=feature_name, **ds_kwargs).construct() + expected_num_bins = [ + 100 // min_data_in_bin + 1, # extra bin for zero + 3, # 0, 1, 2 + 3, # 0, 1, 2 + 4, # 0, 1, 2 + nan + 0, # unused + 3, # 0, 1 + nan + ] + actual_num_bins = [ds.feature_num_bin(i) for i in range(X.shape[1])] + assert actual_num_bins == expected_num_bins + # test using defined feature names + bins_by_name = [ds.feature_num_bin(name) for name in feature_name] + assert bins_by_name == expected_num_bins + # test using default feature names + ds_no_names = lgb.Dataset(X, **ds_kwargs).construct() + default_names = [f'Column_{i}' for i in range(X.shape[1])] + bins_by_default_name = [ds_no_names.feature_num_bin(name) for name in default_names] + assert bins_by_default_name == expected_num_bins + # check for feature indices outside of range + num_features = X.shape[1] + with pytest.raises( + lgb.basic.LightGBMError, + match=( + f'Tried to retrieve number of bins for feature index {num_features}, ' + f'but the valid feature indices are \\[0, {num_features - 1}\\].' + ) + ): + ds.feature_num_bin(num_features) + + +def test_feature_num_bin_with_max_bin_by_feature(): + X = np.random.rand(100, 3) + max_bin_by_feature = np.random.randint(3, 30, size=X.shape[1]) + ds = lgb.Dataset(X, params={'max_bin_by_feature': max_bin_by_feature}).construct() + actual_num_bins = [ds.feature_num_bin(i) for i in range(X.shape[1])] + np.testing.assert_equal(actual_num_bins, max_bin_by_feature) diff --git a/tests/python_package_test/test_callback.py b/tests/python_package_test/test_callback.py new file mode 100644 index 000000000000..1a101fd6799b --- /dev/null +++ b/tests/python_package_test/test_callback.py @@ -0,0 +1,73 @@ +# coding: utf-8 +import pytest + +import lightgbm as lgb + +from .utils import pickle_obj, unpickle_obj + +SERIALIZERS = ["pickle", "joblib", "cloudpickle"] + + +def pickle_and_unpickle_object(obj, serializer): + with lgb.basic._TempFile() as tmp_file: + pickle_obj( + obj=obj, + filepath=tmp_file.name, + serializer=serializer + ) + obj_from_disk = unpickle_obj( + filepath=tmp_file.name, + serializer=serializer + ) + return obj_from_disk + + +def reset_feature_fraction(boosting_round): + return 0.6 if boosting_round < 15 else 0.8 + + +@pytest.mark.parametrize('serializer', SERIALIZERS) +def test_early_stopping_callback_is_picklable(serializer): + rounds = 5 + callback = lgb.early_stopping(stopping_rounds=rounds) + callback_from_disk = pickle_and_unpickle_object(obj=callback, serializer=serializer) + assert callback_from_disk.order == 30 + assert callback_from_disk.before_iteration is False + assert callback.stopping_rounds == callback_from_disk.stopping_rounds + assert callback.stopping_rounds == rounds + + +@pytest.mark.parametrize('serializer', SERIALIZERS) +def test_log_evaluation_callback_is_picklable(serializer): + periods = 42 + callback = lgb.log_evaluation(period=periods) + callback_from_disk = pickle_and_unpickle_object(obj=callback, serializer=serializer) + assert callback_from_disk.order == 10 + assert callback_from_disk.before_iteration is False + assert callback.period == callback_from_disk.period + assert callback.period == periods + + 
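# The picklability tests in this file rely on LightGBM callbacks being ordinary
# callable objects whose configuration lives in instance attributes ('order',
# 'before_iteration', and the constructor arguments), so pickle, joblib and
# cloudpickle can all round-trip them. A minimal stand-in sketching the same
# protocol (illustrative only; _ToyCallback is not part of lightgbm):
import pickle


class _ToyCallback:
    order = 10
    before_iteration = False

    def __init__(self, period):
        self.period = period

    def __call__(self, env):
        pass  # a real callback would read fields of the CallbackEnv here


assert pickle.loads(pickle.dumps(_ToyCallback(42))).period == 42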
+@pytest.mark.parametrize('serializer', SERIALIZERS) +def test_record_evaluation_callback_is_picklable(serializer): + results = {} + callback = lgb.record_evaluation(eval_result=results) + callback_from_disk = pickle_and_unpickle_object(obj=callback, serializer=serializer) + assert callback_from_disk.order == 20 + assert callback_from_disk.before_iteration is False + assert callback.eval_result == callback_from_disk.eval_result + assert callback.eval_result is results + + +@pytest.mark.parametrize('serializer', SERIALIZERS) +def test_reset_parameter_callback_is_picklable(serializer): + params = { + 'bagging_fraction': [0.7] * 5 + [0.6] * 5, + 'feature_fraction': reset_feature_fraction + } + callback = lgb.reset_parameter(**params) + callback_from_disk = pickle_and_unpickle_object(obj=callback, serializer=serializer) + assert callback_from_disk.order == 10 + assert callback_from_disk.before_iteration is True + assert callback.kwargs == callback_from_disk.kwargs + assert callback.kwargs == params diff --git a/tests/python_package_test/test_consistency.py b/tests/python_package_test/test_consistency.py index a9cb8436a847..bcc6f3834b1c 100644 --- a/tests/python_package_test/test_consistency.py +++ b/tests/python_package_test/test_consistency.py @@ -21,7 +21,7 @@ def __init__(self, directory, prefix, config_file='train.conf'): if line and not line.startswith('#'): key, value = [token.strip() for token in line.split('=')] if 'early_stopping' not in key: # disable early_stopping - self.params[key] = value if key != 'num_trees' else int(value) + self.params[key] = value if key not in {'num_trees', 'num_threads'} else int(value) def load_dataset(self, suffix, is_sparse=False): filename = str(self.path(suffix)) @@ -84,7 +84,7 @@ def test_binary_linear(): X_test, _, X_test_fn = fd.load_dataset('.test') weight_train = fd.load_field('.train.weight') lgb_train = lgb.Dataset(X_train, y_train, params=fd.params, weight=weight_train) - gbm = lgb.LGBMClassifier(**fd.params) + gbm = lgb.LGBMClassifier(**fd.params, n_jobs=0) gbm.fit(X_train, y_train, sample_weight=weight_train) sk_pred = gbm.predict_proba(X_test)[:, 1] fd.train_predict_check(lgb_train, X_test, X_test_fn, sk_pred) diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index b4a948070420..6bdf3ca50b2c 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -2,7 +2,6 @@ """Tests for lightgbm.dask module""" import inspect -import pickle import random import socket from itertools import groupby @@ -15,6 +14,8 @@ import lightgbm as lgb +from .utils import sklearn_multiclass_custom_objective + if not platform.startswith('linux'): pytest.skip('lightgbm.dask is currently supported in Linux environments', allow_module_level=True) if machine() != 'x86_64': @@ -22,24 +23,18 @@ if not lgb.compat.DASK_INSTALLED: pytest.skip('Dask is not installed', allow_module_level=True) -import cloudpickle import dask.array as da import dask.dataframe as dd -import joblib import numpy as np import pandas as pd import sklearn.utils.estimator_checks as sklearn_checks from dask.array.utils import assert_eq from dask.distributed import Client, LocalCluster, default_client, wait -from pkg_resources import parse_version from scipy.sparse import csc_matrix, csr_matrix from scipy.stats import spearmanr -from sklearn import __version__ as sk_version from sklearn.datasets import make_blobs, make_regression -from .utils import make_ranking - -sk_version = parse_version(sk_version) +from .utils import 
make_ranking, pickle_obj, unpickle_obj tasks = ['binary-classification', 'multiclass-classification', 'regression', 'ranking'] distributed_training_algorithms = ['data', 'voting'] @@ -61,7 +56,8 @@ pytestmark = [ pytest.mark.skipif(getenv('TASK', '') == 'mpi', reason='Fails to run with MPI interface'), - pytest.mark.skipif(getenv('TASK', '') == 'gpu', reason='Fails to run with GPU interface') + pytest.mark.skipif(getenv('TASK', '') == 'gpu', reason='Fails to run with GPU interface'), + pytest.mark.skipif(getenv('TASK', '') == 'cuda_exp', reason='Fails to run with CUDA Experimental interface') ] @@ -236,30 +232,17 @@ def _constant_metric(y_true, y_pred): return metric_name, value, is_higher_better -def _pickle(obj, filepath, serializer): - if serializer == 'pickle': - with open(filepath, 'wb') as f: - pickle.dump(obj, f) - elif serializer == 'joblib': - joblib.dump(obj, filepath) - elif serializer == 'cloudpickle': - with open(filepath, 'wb') as f: - cloudpickle.dump(obj, f) - else: - raise ValueError(f'Unrecognized serializer type: {serializer}') - - -def _unpickle(filepath, serializer): - if serializer == 'pickle': - with open(filepath, 'rb') as f: - return pickle.load(f) - elif serializer == 'joblib': - return joblib.load(filepath) - elif serializer == 'cloudpickle': - with open(filepath, 'rb') as f: - return cloudpickle.load(f) - else: - raise ValueError(f'Unrecognized serializer type: {serializer}') +def _objective_least_squares(y_true, y_pred): + grad = y_pred - y_true + hess = np.ones(len(y_true)) + return grad, hess + + +def _objective_logistic_regression(y_true, y_pred): + y_pred = 1.0 / (1.0 + np.exp(-y_pred)) + grad = y_pred - y_true + hess = y_pred * (1.0 - y_pred) + return grad, hess @pytest.mark.parametrize('output', data_output) @@ -455,6 +438,79 @@ def test_classifier_pred_contrib(output, task, cluster): assert len(np.unique(preds_with_contrib[:, base_value_col]) == 1) +@pytest.mark.parametrize('output', data_output) +@pytest.mark.parametrize('task', ['binary-classification', 'multiclass-classification']) +def test_classifier_custom_objective(output, task, cluster): + with Client(cluster) as client: + X, y, w, _, dX, dy, dw, _ = _create_data( + objective=task, + output=output, + ) + + params = { + "n_estimators": 50, + "num_leaves": 31, + "verbose": -1, + "seed": 708, + "deterministic": True, + "force_col_wise": True + } + + if task == 'binary-classification': + params.update({ + 'objective': _objective_logistic_regression, + }) + elif task == 'multiclass-classification': + params.update({ + 'objective': sklearn_multiclass_custom_objective, + 'num_classes': 3 + }) + + dask_classifier = lgb.DaskLGBMClassifier( + client=client, + time_out=5, + tree_learner='data', + **params + ) + dask_classifier = dask_classifier.fit(dX, dy, sample_weight=dw) + dask_classifier_local = dask_classifier.to_local() + p1_raw = dask_classifier.predict(dX, raw_score=True).compute() + p1_raw_local = dask_classifier_local.predict(X, raw_score=True) + + local_classifier = lgb.LGBMClassifier(**params) + local_classifier.fit(X, y, sample_weight=w) + p2_raw = local_classifier.predict(X, raw_score=True) + + # with a custom objective, prediction result is a raw score instead of predicted class + if task == 'binary-classification': + p1_proba = 1.0 / (1.0 + np.exp(-p1_raw)) + p1_class = (p1_proba > 0.5).astype(np.int64) + p1_proba_local = 1.0 / (1.0 + np.exp(-p1_raw_local)) + p1_class_local = (p1_proba_local > 0.5).astype(np.int64) + p2_proba = 1.0 / (1.0 + np.exp(-p2_raw)) + p2_class = (p2_proba > 
0.5).astype(np.int64) + elif task == 'multiclass-classification': + p1_proba = np.exp(p1_raw) / np.sum(np.exp(p1_raw), axis=1).reshape(-1, 1) + p1_class = p1_proba.argmax(axis=1) + p1_proba_local = np.exp(p1_raw_local) / np.sum(np.exp(p1_raw_local), axis=1).reshape(-1, 1) + p1_class_local = p1_proba_local.argmax(axis=1) + p2_proba = np.exp(p2_raw) / np.sum(np.exp(p2_raw), axis=1).reshape(-1, 1) + p2_class = p2_proba.argmax(axis=1) + + # function should have been preserved + assert callable(dask_classifier.objective_) + assert callable(dask_classifier_local.objective_) + + # should correctly classify every sample + assert_eq(p1_class, y) + assert_eq(p1_class_local, y) + assert_eq(p2_class, y) + + # probability estimates should be similar + assert_eq(p1_proba, p2_proba, atol=0.03) + assert_eq(p1_proba, p1_proba_local) + + def test_group_workers_by_host(): hosts = [f'0.0.0.{i}' for i in range(2)] workers = [f'tcp://{host}:{p}' for p in range(2) for host in hosts] @@ -700,6 +756,56 @@ def test_regressor_quantile(output, alpha, cluster): assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == '==' +@pytest.mark.parametrize('output', data_output) +def test_regressor_custom_objective(output, cluster): + with Client(cluster) as client: + X, y, w, _, dX, dy, dw, _ = _create_data( + objective='regression', + output=output + ) + + params = { + "n_estimators": 10, + "num_leaves": 10, + "objective": _objective_least_squares + } + + dask_regressor = lgb.DaskLGBMRegressor( + client=client, + time_out=5, + tree_learner='data', + **params + ) + dask_regressor = dask_regressor.fit(dX, dy, sample_weight=dw) + dask_regressor_local = dask_regressor.to_local() + p1 = dask_regressor.predict(dX) + p1_local = dask_regressor_local.predict(X) + s1_local = dask_regressor_local.score(X, y) + s1 = _r2_score(dy, p1) + p1 = p1.compute() + + local_regressor = lgb.LGBMRegressor(**params) + local_regressor.fit(X, y, sample_weight=w) + p2 = local_regressor.predict(X) + s2 = local_regressor.score(X, y) + + # function should have been preserved + assert callable(dask_regressor.objective_) + assert callable(dask_regressor_local.objective_) + + # Scores should be the same + assert_eq(s1, s2, atol=0.01) + assert_eq(s1, s1_local) + + # local and Dask predictions should be the same + assert_eq(p1, p1_local) + + # predictions should be better than random + assert_precision = {"rtol": 0.5, "atol": 50.} + assert_eq(p1, y, **assert_precision) + assert_eq(p2, y, **assert_precision) + + @pytest.mark.parametrize('output', ['array', 'dataframe', 'dataframe-with-categorical']) @pytest.mark.parametrize('group', [None, group_sizes]) @pytest.mark.parametrize('boosting_type', boosting_types) @@ -808,6 +914,67 @@ def test_ranker(output, group, boosting_type, tree_learner, cluster): assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == '==' +@pytest.mark.parametrize('output', ['array', 'dataframe', 'dataframe-with-categorical']) +def test_ranker_custom_objective(output, cluster): + with Client(cluster) as client: + if output == 'dataframe-with-categorical': + X, y, w, g, dX, dy, dw, dg = _create_data( + objective='ranking', + output=output, + group=group_sizes, + n_features=1, + n_informative=1 + ) + else: + X, y, w, g, dX, dy, dw, dg = _create_data( + objective='ranking', + output=output, + group=group_sizes + ) + + # rebalance small dask.Array dataset for better performance. 
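# persist() materializes the lazy collections on the cluster and wait() blocks
# until they are computed; client.rebalance() then spreads the concrete
# partitions across workers, so a small dataset does not sit on a single worker.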
+ if output == 'array': + dX = dX.persist() + dy = dy.persist() + dw = dw.persist() + dg = dg.persist() + _ = wait([dX, dy, dw, dg]) + client.rebalance() + + params = { + "random_state": 42, + "n_estimators": 50, + "num_leaves": 20, + "min_child_samples": 1, + "objective": _objective_least_squares + } + + dask_ranker = lgb.DaskLGBMRanker( + client=client, + time_out=5, + tree_learner_type="data", + **params + ) + dask_ranker = dask_ranker.fit(dX, dy, sample_weight=dw, group=dg) + rnkvec_dask = dask_ranker.predict(dX).compute() + dask_ranker_local = dask_ranker.to_local() + rnkvec_dask_local = dask_ranker_local.predict(X) + + local_ranker = lgb.LGBMRanker(**params) + local_ranker.fit(X, y, sample_weight=w, group=g) + rnkvec_local = local_ranker.predict(X) + + # distributed ranker should be able to rank decently well with the least-squares objective + # and should have high rank correlation with scores from serial ranker. + assert spearmanr(rnkvec_dask, y).correlation > 0.6 + assert spearmanr(rnkvec_dask, rnkvec_local).correlation > 0.8 + assert_eq(rnkvec_dask, rnkvec_dask_local) + + # function should have been preserved + assert callable(dask_ranker.objective_) + assert callable(dask_ranker_local.objective_) + + @pytest.mark.parametrize('task', tasks) @pytest.mark.parametrize('output', data_output) @pytest.mark.parametrize('eval_sizes', [[0.5, 1, 1.5], [0]]) @@ -1146,23 +1313,23 @@ def test_model_and_local_version_are_picklable_whether_or_not_client_set_explici assert getattr(local_model, "client", None) is None tmp_file = tmp_path / "model-1.pkl" - _pickle( + pickle_obj( obj=dask_model, filepath=tmp_file, serializer=serializer ) - model_from_disk = _unpickle( + model_from_disk = unpickle_obj( filepath=tmp_file, serializer=serializer ) local_tmp_file = tmp_path / "local-model-1.pkl" - _pickle( + pickle_obj( obj=local_model, filepath=local_tmp_file, serializer=serializer ) - local_model_from_disk = _unpickle( + local_model_from_disk = unpickle_obj( filepath=local_tmp_file, serializer=serializer ) @@ -1202,23 +1369,23 @@ def test_model_and_local_version_are_picklable_whether_or_not_client_set_explici local_model.client_ tmp_file2 = tmp_path / "model-2.pkl" - _pickle( + pickle_obj( obj=dask_model, filepath=tmp_file2, serializer=serializer ) - fitted_model_from_disk = _unpickle( + fitted_model_from_disk = unpickle_obj( filepath=tmp_file2, serializer=serializer ) local_tmp_file2 = tmp_path / "local-model-2.pkl" - _pickle( + pickle_obj( obj=local_model, filepath=local_tmp_file2, serializer=serializer ) - local_fitted_model_from_disk = _unpickle( + local_fitted_model_from_disk = unpickle_obj( filepath=local_tmp_file2, serializer=serializer ) @@ -1638,10 +1805,7 @@ def test_sklearn_integration(estimator, check, cluster): @pytest.mark.parametrize("estimator", list(_tested_estimators())) def test_parameters_default_constructible(estimator): name = estimator.__class__.__name__ - if sk_version >= parse_version("0.24"): - Estimator = estimator - else: - Estimator = estimator.__class__ + Estimator = estimator sklearn_checks.check_parameters_default_constructible(name, Estimator) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 4dd57f311b23..3eb6186efd5c 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -6,25 +6,32 @@ import pickle import platform import random +from os import getenv from pathlib import Path import numpy as np import psutil import pytest from scipy.sparse import csr_matrix, 
isspmatrix_csc, isspmatrix_csr -from sklearn.datasets import load_svmlight_file, make_multilabel_classification +from sklearn.datasets import load_svmlight_file, make_blobs, make_multilabel_classification from sklearn.metrics import average_precision_score, log_loss, mean_absolute_error, mean_squared_error, roc_auc_score from sklearn.model_selection import GroupKFold, TimeSeriesSplit, train_test_split import lightgbm as lgb +from lightgbm.compat import PANDAS_INSTALLED, pd_DataFrame -from .utils import load_boston, load_breast_cancer, load_digits, load_iris, make_synthetic_regression +from .utils import (dummy_obj, load_boston, load_breast_cancer, load_digits, load_iris, logistic_sigmoid, + make_synthetic_regression, mse_obj, sklearn_multiclass_custom_objective, softmax) decreasing_generator = itertools.count(0, -1) -def dummy_obj(preds, train_data): - return np.ones(preds.shape), np.ones(preds.shape) +def logloss_obj(preds, train_data): + y_true = train_data.get_label() + y_pred = logistic_sigmoid(preds) + grad = y_pred - y_true + hess = y_pred * (1.0 - y_pred) + return grad, hess def multi_logloss(y_true, y_pred): @@ -62,10 +69,13 @@ def test_binary(): lgb_train = lgb.Dataset(X_train, y_train) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) evals_result = {} - gbm = lgb.train(params, lgb_train, - num_boost_round=20, - valid_sets=lgb_eval, - evals_result=evals_result) + gbm = lgb.train( + params, + lgb_train, + num_boost_round=20, + valid_sets=lgb_eval, + callbacks=[lgb.record_evaluation(evals_result)] + ) ret = log_loss(y_test, gbm.predict(X_test)) assert ret < 0.14 assert len(evals_result['valid_0']['binary_logloss']) == 50 @@ -88,10 +98,13 @@ def test_rf(): lgb_train = lgb.Dataset(X_train, y_train) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) evals_result = {} - gbm = lgb.train(params, lgb_train, - num_boost_round=50, - valid_sets=lgb_eval, - evals_result=evals_result) + gbm = lgb.train( + params, + lgb_train, + num_boost_round=50, + valid_sets=lgb_eval, + callbacks=[lgb.record_evaluation(evals_result)] + ) ret = log_loss(y_test, gbm.predict(X_test)) assert ret < 0.19 assert evals_result['valid_0']['binary_logloss'][-1] == pytest.approx(ret) @@ -107,10 +120,13 @@ def test_regression(): lgb_train = lgb.Dataset(X_train, y_train) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) evals_result = {} - gbm = lgb.train(params, lgb_train, - num_boost_round=50, - valid_sets=lgb_eval, - evals_result=evals_result) + gbm = lgb.train( + params, + lgb_train, + num_boost_round=50, + valid_sets=lgb_eval, + callbacks=[lgb.record_evaluation(evals_result)] + ) ret = mean_squared_error(y_test, gbm.predict(X_test)) assert ret < 7 assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret) @@ -132,10 +148,13 @@ def test_missing_value_handle(): 'boost_from_average': False } evals_result = {} - gbm = lgb.train(params, lgb_train, - num_boost_round=20, - valid_sets=lgb_eval, - evals_result=evals_result) + gbm = lgb.train( + params, + lgb_train, + num_boost_round=20, + valid_sets=lgb_eval, + callbacks=[lgb.record_evaluation(evals_result)] + ) ret = mean_squared_error(y_train, gbm.predict(X_train)) assert ret < 0.005 assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret) @@ -157,10 +176,13 @@ def test_missing_value_handle_more_na(): 'boost_from_average': False } evals_result = {} - gbm = lgb.train(params, lgb_train, - num_boost_round=20, - valid_sets=lgb_eval, - evals_result=evals_result) + gbm = lgb.train( + params, + lgb_train, + num_boost_round=20, + 
valid_sets=lgb_eval, + callbacks=[lgb.record_evaluation(evals_result)] + ) ret = mean_squared_error(y_train, gbm.predict(X_train)) assert ret < 0.005 assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret) @@ -187,10 +209,13 @@ def test_missing_value_handle_na(): 'zero_as_missing': False } evals_result = {} - gbm = lgb.train(params, lgb_train, - num_boost_round=1, - valid_sets=lgb_eval, - evals_result=evals_result) + gbm = lgb.train( + params, + lgb_train, + num_boost_round=1, + valid_sets=lgb_eval, + callbacks=[lgb.record_evaluation(evals_result)] + ) pred = gbm.predict(X_train) np.testing.assert_allclose(pred, y) ret = roc_auc_score(y_train, pred) @@ -219,10 +244,13 @@ def test_missing_value_handle_zero(): 'zero_as_missing': True } evals_result = {} - gbm = lgb.train(params, lgb_train, - num_boost_round=1, - valid_sets=lgb_eval, - evals_result=evals_result) + gbm = lgb.train( + params, + lgb_train, + num_boost_round=1, + valid_sets=lgb_eval, + callbacks=[lgb.record_evaluation(evals_result)] + ) pred = gbm.predict(X_train) np.testing.assert_allclose(pred, y) ret = roc_auc_score(y_train, pred) @@ -251,10 +279,13 @@ def test_missing_value_handle_none(): 'use_missing': False } evals_result = {} - gbm = lgb.train(params, lgb_train, - num_boost_round=1, - valid_sets=lgb_eval, - evals_result=evals_result) + gbm = lgb.train( + params, + lgb_train, + num_boost_round=1, + valid_sets=lgb_eval, + callbacks=[lgb.record_evaluation(evals_result)] + ) pred = gbm.predict(X_train) assert pred[0] == pytest.approx(pred[1]) assert pred[-1] == pytest.approx(pred[0]) @@ -289,10 +320,13 @@ def test_categorical_handle(): 'categorical_column': 0 } evals_result = {} - gbm = lgb.train(params, lgb_train, - num_boost_round=1, - valid_sets=lgb_eval, - evals_result=evals_result) + gbm = lgb.train( + params, + lgb_train, + num_boost_round=1, + valid_sets=lgb_eval, + callbacks=[lgb.record_evaluation(evals_result)] + ) pred = gbm.predict(X_train) np.testing.assert_allclose(pred, y) ret = roc_auc_score(y_train, pred) @@ -326,10 +360,13 @@ def test_categorical_handle_na(): 'categorical_column': 0 } evals_result = {} - gbm = lgb.train(params, lgb_train, - num_boost_round=1, - valid_sets=lgb_eval, - evals_result=evals_result) + gbm = lgb.train( + params, + lgb_train, + num_boost_round=1, + valid_sets=lgb_eval, + callbacks=[lgb.record_evaluation(evals_result)] + ) pred = gbm.predict(X_train) np.testing.assert_allclose(pred, y) ret = roc_auc_score(y_train, pred) @@ -363,10 +400,13 @@ def test_categorical_non_zero_inputs(): 'categorical_column': 0 } evals_result = {} - gbm = lgb.train(params, lgb_train, - num_boost_round=1, - valid_sets=lgb_eval, - evals_result=evals_result) + gbm = lgb.train( + params, + lgb_train, + num_boost_round=1, + valid_sets=lgb_eval, + callbacks=[lgb.record_evaluation(evals_result)] + ) pred = gbm.predict(X_train) np.testing.assert_allclose(pred, y) ret = roc_auc_score(y_train, pred) @@ -386,10 +426,13 @@ def test_multiclass(): lgb_train = lgb.Dataset(X_train, y_train, params=params) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, params=params) evals_result = {} - gbm = lgb.train(params, lgb_train, - num_boost_round=50, - valid_sets=lgb_eval, - evals_result=evals_result) + gbm = lgb.train( + params, + lgb_train, + num_boost_round=50, + valid_sets=lgb_eval, + callbacks=[lgb.record_evaluation(evals_result)] + ) ret = multi_logloss(y_test, gbm.predict(X_test)) assert ret < 0.16 assert evals_result['valid_0']['multi_logloss'][-1] == pytest.approx(ret) @@ -414,10 +457,13 @@ def 
test_multiclass_rf(): lgb_train = lgb.Dataset(X_train, y_train, params=params) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, params=params) evals_result = {} - gbm = lgb.train(params, lgb_train, - num_boost_round=50, - valid_sets=lgb_eval, - evals_result=evals_result) + gbm = lgb.train( + params, + lgb_train, + num_boost_round=50, + valid_sets=lgb_eval, + callbacks=[lgb.record_evaluation(evals_result)] + ) ret = multi_logloss(y_test, gbm.predict(X_test)) assert ret < 0.23 assert evals_result['valid_0']['multi_logloss'][-1] == pytest.approx(ret) @@ -456,8 +502,16 @@ def test_multi_class_error(): est = lgb.train(params, lgb_data, num_boost_round=10) predict_default = est.predict(X) results = {} - est = lgb.train(dict(params, multi_error_top_k=1), lgb_data, num_boost_round=10, - valid_sets=[lgb_data], evals_result=results) + est = lgb.train( + dict( + params, + multi_error_top_k=1 + ), + lgb_data, + num_boost_round=10, + valid_sets=[lgb_data], + callbacks=[lgb.record_evaluation(results)] + ) predict_1 = est.predict(X) # check that default gives same result as k = 1 np.testing.assert_allclose(predict_1, predict_default) @@ -466,15 +520,31 @@ def test_multi_class_error(): assert results['training']['multi_error'][-1] == pytest.approx(err) # check against independent calculation for k = 2 results = {} - est = lgb.train(dict(params, multi_error_top_k=2), lgb_data, num_boost_round=10, - valid_sets=[lgb_data], evals_result=results) + est = lgb.train( + dict( + params, + multi_error_top_k=2 + ), + lgb_data, + num_boost_round=10, + valid_sets=[lgb_data], + callbacks=[lgb.record_evaluation(results)] + ) predict_2 = est.predict(X) err = top_k_error(y, predict_2, 2) assert results['training']['multi_error@2'][-1] == pytest.approx(err) # check against independent calculation for k = 10 results = {} - est = lgb.train(dict(params, multi_error_top_k=10), lgb_data, num_boost_round=10, - valid_sets=[lgb_data], evals_result=results) + est = lgb.train( + dict( + params, + multi_error_top_k=10 + ), + lgb_data, + num_boost_round=10, + valid_sets=[lgb_data], + callbacks=[lgb.record_evaluation(results)] + ) predict_3 = est.predict(X) err = top_k_error(y, predict_3, 10) assert results['training']['multi_error@10'][-1] == pytest.approx(err) @@ -484,15 +554,29 @@ def test_multi_class_error(): lgb_data = lgb.Dataset(X, label=y) params['num_classes'] = 2 results = {} - lgb.train(params, lgb_data, num_boost_round=10, - valid_sets=[lgb_data], evals_result=results) + lgb.train( + params, + lgb_data, + num_boost_round=10, + valid_sets=[lgb_data], + callbacks=[lgb.record_evaluation(results)] + ) assert results['training']['multi_error'][-1] == pytest.approx(1) results = {} - lgb.train(dict(params, multi_error_top_k=2), lgb_data, num_boost_round=10, - valid_sets=[lgb_data], evals_result=results) + lgb.train( + dict( + params, + multi_error_top_k=2 + ), + lgb_data, + num_boost_round=10, + valid_sets=[lgb_data], + callbacks=[lgb.record_evaluation(results)] + ) assert results['training']['multi_error@2'][-1] == pytest.approx(0) +@pytest.mark.skipif(getenv('TASK', '') == 'cuda_exp', reason='Skip due to differences in implementation details of CUDA Experimental version') def test_auc_mu(): # should give same result as binary auc for 2 classes X, y = load_digits(n_class=10, return_X_y=True) @@ -505,13 +589,25 @@ def test_auc_mu(): 'num_classes': 2, 'seed': 0} results_auc_mu = {} - lgb.train(params, lgb_X, num_boost_round=10, valid_sets=[lgb_X], evals_result=results_auc_mu) + lgb.train( + params, + lgb_X, + 
num_boost_round=10, + valid_sets=[lgb_X], + callbacks=[lgb.record_evaluation(results_auc_mu)] + ) params = {'objective': 'binary', 'metric': 'auc', 'verbose': -1, 'seed': 0} results_auc = {} - lgb.train(params, lgb_X, num_boost_round=10, valid_sets=[lgb_X], evals_result=results_auc) + lgb.train( + params, + lgb_X, + num_boost_round=10, + valid_sets=[lgb_X], + callbacks=[lgb.record_evaluation(results_auc)] + ) np.testing.assert_allclose(results_auc_mu['training']['auc_mu'], results_auc['training']['auc']) # test the case where all predictions are equal lgb_X = lgb.Dataset(X[:10], label=y_new[:10]) @@ -522,7 +618,13 @@ def test_auc_mu(): 'min_data_in_leaf': 20, 'seed': 0} results_auc_mu = {} - lgb.train(params, lgb_X, num_boost_round=10, valid_sets=[lgb_X], evals_result=results_auc_mu) + lgb.train( + params, + lgb_X, + num_boost_round=10, + valid_sets=[lgb_X], + callbacks=[lgb.record_evaluation(results_auc_mu)] + ) assert results_auc_mu['training']['auc_mu'][-1] == pytest.approx(0.5) # test that weighted data gives different auc_mu lgb_X = lgb.Dataset(X, label=y) @@ -530,15 +632,31 @@ def test_auc_mu(): results_unweighted = {} results_weighted = {} params = dict(params, num_classes=10, num_leaves=5) - lgb.train(params, lgb_X, num_boost_round=10, valid_sets=[lgb_X], evals_result=results_unweighted) - lgb.train(params, lgb_X_weighted, num_boost_round=10, valid_sets=[lgb_X_weighted], - evals_result=results_weighted) + lgb.train( + params, + lgb_X, + num_boost_round=10, + valid_sets=[lgb_X], + callbacks=[lgb.record_evaluation(results_unweighted)] + ) + lgb.train( + params, + lgb_X_weighted, + num_boost_round=10, + valid_sets=[lgb_X_weighted], + callbacks=[lgb.record_evaluation(results_weighted)] + ) assert results_weighted['training']['auc_mu'][-1] < 1 assert results_unweighted['training']['auc_mu'][-1] != results_weighted['training']['auc_mu'][-1] # test that equal data weights give same auc_mu as unweighted data lgb_X_weighted = lgb.Dataset(X, label=y, weight=np.ones(y.shape) * 0.5) - lgb.train(params, lgb_X_weighted, num_boost_round=10, valid_sets=[lgb_X_weighted], - evals_result=results_weighted) + lgb.train( + params, + lgb_X_weighted, + num_boost_round=10, + valid_sets=[lgb_X_weighted], + callbacks=[lgb.record_evaluation(results_weighted)] + ) assert results_unweighted['training']['auc_mu'][-1] == pytest.approx( results_weighted['training']['auc_mu'][-1], abs=1e-5) # should give 1 when accuracy = 1 @@ -551,7 +669,13 @@ def test_auc_mu(): 'min_data_in_leaf': 1, 'verbose': -1} results = {} - lgb.train(params, lgb_X, num_boost_round=100, valid_sets=[lgb_X], evals_result=results) + lgb.train( + params, + lgb_X, + num_boost_round=100, + valid_sets=[lgb_X], + callbacks=[lgb.record_evaluation(results)] + ) assert results['training']['auc_mu'][-1] == pytest.approx(1) # test loading class weights Xy = np.loadtxt( @@ -567,10 +691,22 @@ def test_auc_mu(): 'verbose': -1, 'seed': 0} results_weight = {} - lgb.train(params, lgb_X, num_boost_round=5, valid_sets=[lgb_X], evals_result=results_weight) + lgb.train( + params, + lgb_X, + num_boost_round=5, + valid_sets=[lgb_X], + callbacks=[lgb.record_evaluation(results_weight)] + ) params['auc_mu_weights'] = [] results_no_weight = {} - lgb.train(params, lgb_X, num_boost_round=5, valid_sets=[lgb_X], evals_result=results_no_weight) + lgb.train( + params, + lgb_X, + num_boost_round=5, + valid_sets=[lgb_X], + callbacks=[lgb.record_evaluation(results_no_weight)] + ) assert results_weight['training']['auc_mu'][-1] != results_no_weight['training']['auc_mu'][-1] @@ 
-613,7 +749,7 @@ def test_early_stopping(): num_boost_round=10, valid_sets=lgb_eval, valid_names=valid_set_name, - early_stopping_rounds=5) + callbacks=[lgb.early_stopping(stopping_rounds=5)]) assert gbm.best_iteration == 10 assert valid_set_name in gbm.best_score assert 'binary_logloss' in gbm.best_score[valid_set_name] @@ -622,12 +758,42 @@ def test_early_stopping(): num_boost_round=40, valid_sets=lgb_eval, valid_names=valid_set_name, - early_stopping_rounds=5) + callbacks=[lgb.early_stopping(stopping_rounds=5)]) assert gbm.best_iteration <= 39 assert valid_set_name in gbm.best_score assert 'binary_logloss' in gbm.best_score[valid_set_name] +@pytest.mark.parametrize('first_metric_only', [True, False]) +def test_early_stopping_via_global_params(first_metric_only): + X, y = load_breast_cancer(return_X_y=True) + num_trees = 5 + params = { + 'num_trees': num_trees, + 'objective': 'binary', + 'metric': 'None', + 'verbose': -1, + 'early_stopping_round': 2, + 'first_metric_only': first_metric_only + } + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) + lgb_train = lgb.Dataset(X_train, y_train) + lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) + valid_set_name = 'valid_set' + gbm = lgb.train(params, + lgb_train, + feval=[decreasing_metric, constant_metric], + valid_sets=lgb_eval, + valid_names=valid_set_name) + if first_metric_only: + assert gbm.best_iteration == num_trees + else: + assert gbm.best_iteration == 1 + assert valid_set_name in gbm.best_score + assert 'decreasing_metric' in gbm.best_score[valid_set_name] + assert 'error' in gbm.best_score[valid_set_name] + + @pytest.mark.parametrize('first_only', [True, False]) @pytest.mark.parametrize('single_metric', [True, False]) @pytest.mark.parametrize('greater_is_better', [True, False]) @@ -678,15 +844,21 @@ def test_early_stopping_min_delta(first_only, single_metric, greater_is_better): ) # regular early stopping - train_kwargs['callbacks'] = [lgb.callback.early_stopping(10, first_only, verbose=0)] evals_result = {} - bst = lgb.train(evals_result=evals_result, **train_kwargs) + train_kwargs['callbacks'] = [ + lgb.callback.early_stopping(10, first_only, verbose=False), + lgb.record_evaluation(evals_result) + ] + bst = lgb.train(**train_kwargs) scores = np.vstack(list(evals_result['valid'].values())).T # positive min_delta - train_kwargs['callbacks'] = [lgb.callback.early_stopping(10, first_only, verbose=0, min_delta=min_delta)] delta_result = {} - delta_bst = lgb.train(evals_result=delta_result, **train_kwargs) + train_kwargs['callbacks'] = [ + lgb.callback.early_stopping(10, first_only, verbose=False, min_delta=min_delta), + lgb.record_evaluation(delta_result) + ] + delta_bst = lgb.train(**train_kwargs) delta_scores = np.vstack(list(delta_result['valid'].values())).T if first_only: @@ -717,13 +889,16 @@ def test_continue_train(): model_name = 'model.txt' init_gbm.save_model(model_name) evals_result = {} - gbm = lgb.train(params, lgb_train, - num_boost_round=30, - valid_sets=lgb_eval, - # test custom eval metrics - feval=(lambda p, d: ('custom_mae', mean_absolute_error(p, d.get_label()), False)), - evals_result=evals_result, - init_model='model.txt') + gbm = lgb.train( + params, + lgb_train, + num_boost_round=30, + valid_sets=lgb_eval, + # test custom eval metrics + feval=(lambda p, d: ('custom_mae', mean_absolute_error(p, d.get_label()), False)), + callbacks=[lgb.record_evaluation(evals_result)], + init_model='model.txt' + ) ret = mean_absolute_error(y_test, gbm.predict(X_test)) assert ret 
< 2.0 assert evals_result['valid_0']['l1'][-1] == pytest.approx(ret) @@ -757,11 +932,14 @@ def test_continue_train_dart(): lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, free_raw_data=False) init_gbm = lgb.train(params, lgb_train, num_boost_round=50) evals_result = {} - gbm = lgb.train(params, lgb_train, - num_boost_round=50, - valid_sets=lgb_eval, - evals_result=evals_result, - init_model=init_gbm) + gbm = lgb.train( + params, + lgb_train, + num_boost_round=50, + valid_sets=lgb_eval, + callbacks=[lgb.record_evaluation(evals_result)], + init_model=init_gbm + ) ret = mean_absolute_error(y_test, gbm.predict(X_test)) assert ret < 2.0 assert evals_result['valid_0']['l1'][-1] == pytest.approx(ret) @@ -780,11 +958,14 @@ def test_continue_train_multiclass(): lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, params=params, free_raw_data=False) init_gbm = lgb.train(params, lgb_train, num_boost_round=20) evals_result = {} - gbm = lgb.train(params, lgb_train, - num_boost_round=30, - valid_sets=lgb_eval, - evals_result=evals_result, - init_model=init_gbm) + gbm = lgb.train( + params, + lgb_train, + num_boost_round=30, + valid_sets=lgb_eval, + callbacks=[lgb.record_evaluation(evals_result)], + init_model=init_gbm + ) ret = multi_logloss(y_test, gbm.predict(X_test)) assert ret < 0.1 assert evals_result['valid_0']['multi_logloss'][-1] == pytest.approx(ret) @@ -798,15 +979,15 @@ def test_cv(): params_with_metric = {'metric': 'l2', 'verbose': -1} cv_res = lgb.cv(params_with_metric, lgb_train, num_boost_round=10, nfold=3, stratified=False, shuffle=False, metrics='l1') - assert 'l1-mean' in cv_res - assert 'l2-mean' not in cv_res - assert len(cv_res['l1-mean']) == 10 + assert 'valid l1-mean' in cv_res + assert 'valid l2-mean' not in cv_res + assert len(cv_res['valid l1-mean']) == 10 # shuffle = True, callbacks cv_res = lgb.cv(params, lgb_train, num_boost_round=10, nfold=3, stratified=False, shuffle=True, metrics='l1', callbacks=[lgb.reset_parameter(learning_rate=lambda i: 0.1 - 0.001 * i)]) - assert 'l1-mean' in cv_res - assert len(cv_res['l1-mean']) == 10 + assert 'valid l1-mean' in cv_res + assert len(cv_res['valid l1-mean']) == 10 # enable display training loss cv_res = lgb.cv(params_with_metric, lgb_train, num_boost_round=10, nfold=3, stratified=False, shuffle=False, @@ -822,7 +1003,7 @@ def test_cv(): folds = tss.split(X_train) cv_res_gen = lgb.cv(params_with_metric, lgb_train, num_boost_round=10, folds=folds) cv_res_obj = lgb.cv(params_with_metric, lgb_train, num_boost_round=10, folds=tss) - np.testing.assert_allclose(cv_res_gen['l2-mean'], cv_res_obj['l2-mean']) + np.testing.assert_allclose(cv_res_gen['valid l2-mean'], cv_res_obj['valid l2-mean']) # LambdaRank rank_example_dir = Path(__file__).absolute().parents[2] / 'examples' / 'lambdarank' X_train, y_train = load_svmlight_file(str(rank_example_dir / 'rank.train')) @@ -832,15 +1013,15 @@ def test_cv(): # ... with l2 metric cv_res_lambda = lgb.cv(params_lambdarank, lgb_train, num_boost_round=10, nfold=3, metrics='l2') assert len(cv_res_lambda) == 2 - assert not np.isnan(cv_res_lambda['l2-mean']).any() + assert not np.isnan(cv_res_lambda['valid l2-mean']).any() # ... 
with NDCG (default) metric cv_res_lambda = lgb.cv(params_lambdarank, lgb_train, num_boost_round=10, nfold=3) assert len(cv_res_lambda) == 2 - assert not np.isnan(cv_res_lambda['ndcg@3-mean']).any() + assert not np.isnan(cv_res_lambda['valid ndcg@3-mean']).any() # self defined folds with lambdarank cv_res_lambda_obj = lgb.cv(params_lambdarank, lgb_train, num_boost_round=10, folds=GroupKFold(n_splits=3)) - np.testing.assert_allclose(cv_res_lambda['ndcg@3-mean'], cv_res_lambda_obj['ndcg@3-mean']) + np.testing.assert_allclose(cv_res_lambda['valid ndcg@3-mean'], cv_res_lambda_obj['valid ndcg@3-mean']) def test_cvbooster(): @@ -851,24 +1032,30 @@ def test_cvbooster(): 'metric': 'binary_logloss', 'verbose': -1, } + nfold = 3 lgb_train = lgb.Dataset(X_train, y_train) # with early stopping cv_res = lgb.cv(params, lgb_train, num_boost_round=25, - early_stopping_rounds=5, - nfold=3, + nfold=nfold, + callbacks=[lgb.early_stopping(stopping_rounds=5)], return_cvbooster=True) assert 'cvbooster' in cv_res cvb = cv_res['cvbooster'] assert isinstance(cvb, lgb.CVBooster) assert isinstance(cvb.boosters, list) - assert len(cvb.boosters) == 3 + assert len(cvb.boosters) == nfold assert all(isinstance(bst, lgb.Booster) for bst in cvb.boosters) assert cvb.best_iteration > 0 # predict by each fold booster - preds = cvb.predict(X_test, num_iteration=cvb.best_iteration) + preds = cvb.predict(X_test) assert isinstance(preds, list) - assert len(preds) == 3 + assert len(preds) == nfold + # check that each booster predicted using the best iteration + for fold_preds, bst in zip(preds, cvb.boosters): + assert bst.best_iteration == cvb.best_iteration + expected = bst.predict(X_test, num_iteration=cvb.best_iteration) + np.testing.assert_allclose(fold_preds, expected) # fold averaging avg_pred = np.mean(preds, axis=0) ret = log_loss(y_test, avg_pred) @@ -1034,20 +1221,15 @@ def test_pandas_categorical(): def test_pandas_sparse(): pd = pytest.importorskip("pandas") - try: - from pandas.arrays import SparseArray - except ImportError: # support old versions - from pandas import SparseArray - X = pd.DataFrame({"A": SparseArray(np.random.permutation([0, 1, 2] * 100)), - "B": SparseArray(np.random.permutation([0.0, 0.1, 0.2, -0.1, 0.2] * 60)), - "C": SparseArray(np.random.permutation([True, False] * 150))}) - y = pd.Series(SparseArray(np.random.permutation([0, 1] * 150))) - X_test = pd.DataFrame({"A": SparseArray(np.random.permutation([0, 2] * 30)), - "B": SparseArray(np.random.permutation([0.0, 0.1, 0.2, -0.1] * 15)), - "C": SparseArray(np.random.permutation([True, False] * 30))}) - if pd.__version__ >= '0.24.0': - for dtype in pd.concat([X.dtypes, X_test.dtypes, pd.Series(y.dtypes)]): - assert pd.api.types.is_sparse(dtype) + X = pd.DataFrame({"A": pd.arrays.SparseArray(np.random.permutation([0, 1, 2] * 100)), + "B": pd.arrays.SparseArray(np.random.permutation([0.0, 0.1, 0.2, -0.1, 0.2] * 60)), + "C": pd.arrays.SparseArray(np.random.permutation([True, False] * 150))}) + y = pd.Series(pd.arrays.SparseArray(np.random.permutation([0, 1] * 150))) + X_test = pd.DataFrame({"A": pd.arrays.SparseArray(np.random.permutation([0, 2] * 30)), + "B": pd.arrays.SparseArray(np.random.permutation([0.0, 0.1, 0.2, -0.1] * 15)), + "C": pd.arrays.SparseArray(np.random.permutation([True, False] * 30))}) + for dtype in pd.concat([X.dtypes, X_test.dtypes, pd.Series(y.dtypes)]): + assert pd.api.types.is_sparse(dtype) params = { 'objective': 'binary', 'verbose': -1 @@ -1071,9 +1253,13 @@ def test_reference_chain(): tmp_dat_val = 
tmp_dat.subset(np.arange(80, 100)).subset(np.arange(18)) params = {'objective': 'regression_l2', 'metric': 'rmse'} evals_result = {} - lgb.train(params, tmp_dat_train, num_boost_round=20, - valid_sets=[tmp_dat_train, tmp_dat_val], - evals_result=evals_result) + lgb.train( + params, + tmp_dat_train, + num_boost_round=20, + valid_sets=[tmp_dat_train, tmp_dat_val], + callbacks=[lgb.record_evaluation(evals_result)] + ) assert len(evals_result['training']['rmse']) == 20 assert len(evals_result['valid_1']['rmse']) == 20 @@ -1296,6 +1482,18 @@ def test_init_with_subset(): assert subset_data_4.get_data() == "lgb_train_data.bin" +def test_training_on_constructed_subset_without_params(): + X = np.random.random((100, 10)) + y = np.random.random(100) + lgb_data = lgb.Dataset(X, y) + subset_indices = [1, 2, 3, 4] + subset = lgb_data.subset(subset_indices).construct() + bst = lgb.train({}, subset, num_boost_round=1) + assert subset.get_params() == {} + assert subset.num_data() == len(subset_indices) + assert bst.current_iteration() == 1 + + def generate_trainset_for_monotone_constraints_tests(x3_to_category=True): number_of_dpoints = 3000 x1_positively_correlated_with_y = np.random.random(size=number_of_dpoints) @@ -1322,6 +1520,7 @@ def generate_trainset_for_monotone_constraints_tests(x3_to_category=True): return trainset +@pytest.mark.skipif(getenv('TASK', '') == 'cuda_exp', reason='Monotone constraints are not yet supported by CUDA Experimental version') @pytest.mark.parametrize("test_with_categorical_variable", [True, False]) def test_monotone_constraints(test_with_categorical_variable): def is_increasing(y): @@ -1411,6 +1610,7 @@ def has_interaction(treef): assert are_interactions_enforced(constrained_model, feature_sets) +@pytest.mark.skipif(getenv('TASK', '') == 'cuda_exp', reason='Monotone constraints are not yet supported by CUDA Experimental version') def test_monotone_penalty(): def are_first_splits_non_monotone(tree, n, monotone_constraints): if n <= 0: @@ -1450,6 +1650,7 @@ def are_there_monotone_splits(tree, monotone_constraints): # test if a penalty as high as the depth indeed prohibits all monotone splits +@pytest.mark.skipif(getenv('TASK', '') == 'cuda_exp', reason='Monotone constraints are not yet supported by CUDA Experimental version') def test_monotone_penalty_max(): max_depth = 5 monotone_constraints = [1, -1, 0] @@ -1545,6 +1746,40 @@ def test_refit(): assert err_pred > new_err_pred +def test_refit_dataset_params(): + # check refit accepts dataset_params + X, y = load_breast_cancer(return_X_y=True) + lgb_train = lgb.Dataset(X, y, init_score=np.zeros(y.size)) + train_params = { + 'objective': 'binary', + 'verbose': -1, + 'seed': 123 + } + gbm = lgb.train(train_params, lgb_train, num_boost_round=10) + non_weight_err_pred = log_loss(y, gbm.predict(X)) + refit_weight = np.random.rand(y.shape[0]) + dataset_params = { + 'max_bin': 260, + 'min_data_in_bin': 5, + 'data_random_seed': 123, + } + new_gbm = gbm.refit( + data=X, + label=y, + weight=refit_weight, + dataset_params=dataset_params, + decay_rate=0.0, + ) + weight_err_pred = log_loss(y, new_gbm.predict(X)) + train_set_params = new_gbm.train_set.get_params() + stored_weights = new_gbm.train_set.get_weight() + assert weight_err_pred != non_weight_err_pred + assert train_set_params["max_bin"] == 260 + assert train_set_params["min_data_in_bin"] == 5 + assert train_set_params["data_random_seed"] == 123 + np.testing.assert_allclose(stored_weights, refit_weight) + + def test_mape_rf(): X, y = load_boston(return_X_y=True) params = { @@ 
-1653,8 +1888,8 @@ def preprocess_data(dtrain, dtest, params): dataset = lgb.Dataset(X, y, free_raw_data=False) params = {'objective': 'multiclass', 'num_class': 3, 'verbose': -1} results = lgb.cv(params, dataset, num_boost_round=10, fpreproc=preprocess_data) - assert 'multi_logloss-mean' in results - assert len(results['multi_logloss-mean']) == 10 + assert 'valid multi_logloss-mean' in results + assert len(results['valid multi_logloss-mean']) == 10 def test_metrics(): @@ -1664,7 +1899,7 @@ def test_metrics(): lgb_valid = lgb.Dataset(X_test, y_test, reference=lgb_train) evals_result = {} - params_verbose = {'verbose': -1} + params_dummy_obj_verbose = {'verbose': -1, 'objective': dummy_obj} params_obj_verbose = {'objective': 'binary', 'verbose': -1} params_obj_metric_log_verbose = {'objective': 'binary', 'metric': 'binary_logloss', 'verbose': -1} params_obj_metric_err_verbose = {'objective': 'binary', 'metric': 'binary_error', 'verbose': -1} @@ -1673,59 +1908,62 @@ def test_metrics(): 'metric': ['binary_logloss', 'binary_error'], 'verbose': -1} params_obj_metric_none_verbose = {'objective': 'binary', 'metric': 'None', 'verbose': -1} - params_metric_log_verbose = {'metric': 'binary_logloss', 'verbose': -1} - params_metric_err_verbose = {'metric': 'binary_error', 'verbose': -1} - params_metric_inv_verbose = {'metric_types': 'invalid_metric', 'verbose': -1} - params_metric_multi_verbose = {'metric': ['binary_logloss', 'binary_error'], 'verbose': -1} - params_metric_none_verbose = {'metric': 'None', 'verbose': -1} + params_dummy_obj_metric_log_verbose = {'objective': dummy_obj, 'metric': 'binary_logloss', 'verbose': -1} + params_dummy_obj_metric_err_verbose = {'objective': dummy_obj, 'metric': 'binary_error', 'verbose': -1} + params_dummy_obj_metric_inv_verbose = {'objective': dummy_obj, 'metric_types': 'invalid_metric', 'verbose': -1} + params_dummy_obj_metric_multi_verbose = {'objective': dummy_obj, 'metric': ['binary_logloss', 'binary_error'], 'verbose': -1} + params_dummy_obj_metric_none_verbose = {'objective': dummy_obj, 'metric': 'None', 'verbose': -1} def get_cv_result(params=params_obj_verbose, **kwargs): return lgb.cv(params, lgb_train, num_boost_round=2, **kwargs) def train_booster(params=params_obj_verbose, **kwargs): - lgb.train(params, lgb_train, - num_boost_round=2, - valid_sets=[lgb_valid], - evals_result=evals_result, - **kwargs) + lgb.train( + params, + lgb_train, + num_boost_round=2, + valid_sets=[lgb_valid], + callbacks=[lgb.record_evaluation(evals_result)], + **kwargs + ) - # no fobj, no feval + # no custom objective, no feval # default metric res = get_cv_result() assert len(res) == 2 - assert 'binary_logloss-mean' in res + assert 'valid binary_logloss-mean' in res # non-default metric in params res = get_cv_result(params=params_obj_metric_err_verbose) assert len(res) == 2 - assert 'binary_error-mean' in res + assert 'valid binary_error-mean' in res # default metric in args res = get_cv_result(metrics='binary_logloss') assert len(res) == 2 - assert 'binary_logloss-mean' in res + assert 'valid binary_logloss-mean' in res # non-default metric in args res = get_cv_result(metrics='binary_error') assert len(res) == 2 - assert 'binary_error-mean' in res + assert 'valid binary_error-mean' in res # metric in args overwrites one in params res = get_cv_result(params=params_obj_metric_inv_verbose, metrics='binary_error') assert len(res) == 2 - assert 'binary_error-mean' in res + assert 'valid binary_error-mean' in res # multiple metrics in params res = 
get_cv_result(params=params_obj_metric_multi_verbose) assert len(res) == 4 - assert 'binary_logloss-mean' in res - assert 'binary_error-mean' in res + assert 'valid binary_logloss-mean' in res + assert 'valid binary_error-mean' in res # multiple metrics in args res = get_cv_result(metrics=['binary_logloss', 'binary_error']) assert len(res) == 4 - assert 'binary_logloss-mean' in res - assert 'binary_error-mean' in res + assert 'valid binary_logloss-mean' in res + assert 'valid binary_error-mean' in res # remove default metric by 'None' in list res = get_cv_result(metrics=['None']) @@ -1736,136 +1974,136 @@ def train_booster(params=params_obj_verbose, **kwargs): res = get_cv_result(metrics=na_alias) assert len(res) == 0 - # fobj, no feval + # custom objective, no feval # no default metric - res = get_cv_result(params=params_verbose, fobj=dummy_obj) + res = get_cv_result(params=params_dummy_obj_verbose) assert len(res) == 0 # metric in params - res = get_cv_result(params=params_metric_err_verbose, fobj=dummy_obj) + res = get_cv_result(params=params_dummy_obj_metric_err_verbose) assert len(res) == 2 - assert 'binary_error-mean' in res + assert 'valid binary_error-mean' in res # metric in args - res = get_cv_result(params=params_verbose, fobj=dummy_obj, metrics='binary_error') + res = get_cv_result(params=params_dummy_obj_verbose, metrics='binary_error') assert len(res) == 2 - assert 'binary_error-mean' in res + assert 'valid binary_error-mean' in res # metric in args overwrites its alias in params - res = get_cv_result(params=params_metric_inv_verbose, fobj=dummy_obj, metrics='binary_error') + res = get_cv_result(params=params_dummy_obj_metric_inv_verbose, metrics='binary_error') assert len(res) == 2 - assert 'binary_error-mean' in res + assert 'valid binary_error-mean' in res # multiple metrics in params - res = get_cv_result(params=params_metric_multi_verbose, fobj=dummy_obj) + res = get_cv_result(params=params_dummy_obj_metric_multi_verbose) assert len(res) == 4 - assert 'binary_logloss-mean' in res - assert 'binary_error-mean' in res + assert 'valid binary_logloss-mean' in res + assert 'valid binary_error-mean' in res # multiple metrics in args - res = get_cv_result(params=params_verbose, fobj=dummy_obj, + res = get_cv_result(params=params_dummy_obj_verbose, metrics=['binary_logloss', 'binary_error']) assert len(res) == 4 - assert 'binary_logloss-mean' in res - assert 'binary_error-mean' in res + assert 'valid binary_logloss-mean' in res + assert 'valid binary_error-mean' in res - # no fobj, feval + # no custom objective, feval # default metric with custom one res = get_cv_result(feval=constant_metric) assert len(res) == 4 - assert 'binary_logloss-mean' in res - assert 'error-mean' in res + assert 'valid binary_logloss-mean' in res + assert 'valid error-mean' in res # non-default metric in params with custom one res = get_cv_result(params=params_obj_metric_err_verbose, feval=constant_metric) assert len(res) == 4 - assert 'binary_error-mean' in res - assert 'error-mean' in res + assert 'valid binary_error-mean' in res + assert 'valid error-mean' in res # default metric in args with custom one res = get_cv_result(metrics='binary_logloss', feval=constant_metric) assert len(res) == 4 - assert 'binary_logloss-mean' in res - assert 'error-mean' in res + assert 'valid binary_logloss-mean' in res + assert 'valid error-mean' in res # non-default metric in args with custom one res = get_cv_result(metrics='binary_error', feval=constant_metric) assert len(res) == 4 - assert 'binary_error-mean' in res
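
# The key renames above reflect a change in lgb.cv(): entries in the returned
# dict are now prefixed with the dataset name ('valid <metric>-mean', plus
# 'train <metric>-mean' when eval_train_metric=True). A self-contained sketch
# of the new key scheme on synthetic data (illustrative, not part of the
# test suite):
import numpy as np
import lightgbm as lgb

rng = np.random.RandomState(0)
X, y = rng.rand(100, 5), rng.rand(100)
cv_hist = lgb.cv({'objective': 'l2', 'verbose': -1}, lgb.Dataset(X, y),
                 num_boost_round=5, nfold=3, stratified=False)
assert 'valid l2-mean' in cv_hist
assert 'valid l2-stdv' in cv_hist
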
- assert 'error-mean' in res + assert 'valid binary_error-mean' in res + assert 'valid error-mean' in res # metric in args overwrites one in params, custom one is evaluated too res = get_cv_result(params=params_obj_metric_inv_verbose, metrics='binary_error', feval=constant_metric) assert len(res) == 4 - assert 'binary_error-mean' in res - assert 'error-mean' in res + assert 'valid binary_error-mean' in res + assert 'valid error-mean' in res # multiple metrics in params with custom one res = get_cv_result(params=params_obj_metric_multi_verbose, feval=constant_metric) assert len(res) == 6 - assert 'binary_logloss-mean' in res - assert 'binary_error-mean' in res - assert 'error-mean' in res + assert 'valid binary_logloss-mean' in res + assert 'valid binary_error-mean' in res + assert 'valid error-mean' in res # multiple metrics in args with custom one res = get_cv_result(metrics=['binary_logloss', 'binary_error'], feval=constant_metric) assert len(res) == 6 - assert 'binary_logloss-mean' in res - assert 'binary_error-mean' in res - assert 'error-mean' in res + assert 'valid binary_logloss-mean' in res + assert 'valid binary_error-mean' in res + assert 'valid error-mean' in res # custom metric is evaluated despite 'None' being passed res = get_cv_result(metrics=['None'], feval=constant_metric) assert len(res) == 2 - assert 'error-mean' in res + assert 'valid error-mean' in res - # fobj, feval + # custom objective, feval # no default metric, only custom one - res = get_cv_result(params=params_verbose, fobj=dummy_obj, feval=constant_metric) + res = get_cv_result(params=params_dummy_obj_verbose, feval=constant_metric) assert len(res) == 2 - assert 'error-mean' in res + assert 'valid error-mean' in res # metric in params with custom one - res = get_cv_result(params=params_metric_err_verbose, fobj=dummy_obj, feval=constant_metric) + res = get_cv_result(params=params_dummy_obj_metric_err_verbose, feval=constant_metric) assert len(res) == 4 - assert 'binary_error-mean' in res - assert 'error-mean' in res + assert 'valid binary_error-mean' in res + assert 'valid error-mean' in res # metric in args with custom one - res = get_cv_result(params=params_verbose, fobj=dummy_obj, + res = get_cv_result(params=params_dummy_obj_verbose, feval=constant_metric, metrics='binary_error') assert len(res) == 4 - assert 'binary_error-mean' in res - assert 'error-mean' in res + assert 'valid binary_error-mean' in res + assert 'valid error-mean' in res # metric in args overwrites one in params, custom one is evaluated too - res = get_cv_result(params=params_metric_inv_verbose, fobj=dummy_obj, + res = get_cv_result(params=params_dummy_obj_metric_inv_verbose, feval=constant_metric, metrics='binary_error') assert len(res) == 4 - assert 'binary_error-mean' in res - assert 'error-mean' in res + assert 'valid binary_error-mean' in res + assert 'valid error-mean' in res # multiple metrics in params with custom one - res = get_cv_result(params=params_metric_multi_verbose, fobj=dummy_obj, feval=constant_metric) + res = get_cv_result(params=params_dummy_obj_metric_multi_verbose, feval=constant_metric) assert len(res) == 6 - assert 'binary_logloss-mean' in res - assert 'binary_error-mean' in res - assert 'error-mean' in res + assert 'valid binary_logloss-mean' in res + assert 'valid binary_error-mean' in res + assert 'valid error-mean' in res # multiple metrics in args with custom one - res = get_cv_result(params=params_verbose, fobj=dummy_obj, feval=constant_metric, + res = get_cv_result(params=params_dummy_obj_verbose, 
feval=constant_metric, metrics=['binary_logloss', 'binary_error']) assert len(res) == 6 - assert 'binary_logloss-mean' in res - assert 'binary_error-mean' in res - assert 'error-mean' in res + assert 'valid binary_logloss-mean' in res + assert 'valid binary_error-mean' in res + assert 'valid error-mean' in res # custom metric is evaluated despite 'None' being passed - res = get_cv_result(params=params_metric_none_verbose, fobj=dummy_obj, feval=constant_metric) + res = get_cv_result(params=params_dummy_obj_metric_none_verbose, feval=constant_metric) assert len(res) == 2 - assert 'error-mean' in res + assert 'valid error-mean' in res - # no fobj, no feval + # no custom objective, no feval # default metric train_booster() assert len(evals_result['valid_0']) == 1 @@ -1893,23 +2131,23 @@ def train_booster(params=params_obj_verbose, **kwargs): train_booster(params=params) assert len(evals_result) == 0 - # fobj, no feval + # custom objective, no feval # no default metric - train_booster(params=params_verbose, fobj=dummy_obj) + train_booster(params=params_dummy_obj_verbose) assert len(evals_result) == 0 # metric in params - train_booster(params=params_metric_log_verbose, fobj=dummy_obj) + train_booster(params=params_dummy_obj_metric_log_verbose) assert len(evals_result['valid_0']) == 1 assert 'binary_logloss' in evals_result['valid_0'] # multiple metrics in params - train_booster(params=params_metric_multi_verbose, fobj=dummy_obj) + train_booster(params=params_dummy_obj_metric_multi_verbose) assert len(evals_result['valid_0']) == 2 assert 'binary_logloss' in evals_result['valid_0'] assert 'binary_error' in evals_result['valid_0'] - # no fobj, feval + # no custom objective, feval # default metric with custom one train_booster(feval=constant_metric) assert len(evals_result['valid_0']) == 2 @@ -1940,27 +2178,27 @@ def train_booster(params=params_obj_verbose, **kwargs): assert len(evals_result) == 1 assert 'error' in evals_result['valid_0'] - # fobj, feval + # custom objective, feval # no default metric, only custom one - train_booster(params=params_verbose, fobj=dummy_obj, feval=constant_metric) + train_booster(params=params_dummy_obj_verbose, feval=constant_metric) assert len(evals_result['valid_0']) == 1 assert 'error' in evals_result['valid_0'] # metric in params with custom one - train_booster(params=params_metric_log_verbose, fobj=dummy_obj, feval=constant_metric) + train_booster(params=params_dummy_obj_metric_log_verbose, feval=constant_metric) assert len(evals_result['valid_0']) == 2 assert 'binary_logloss' in evals_result['valid_0'] assert 'error' in evals_result['valid_0'] # multiple metrics in params with custom one - train_booster(params=params_metric_multi_verbose, fobj=dummy_obj, feval=constant_metric) + train_booster(params=params_dummy_obj_metric_multi_verbose, feval=constant_metric) assert len(evals_result['valid_0']) == 3 assert 'binary_logloss' in evals_result['valid_0'] assert 'binary_error' in evals_result['valid_0'] assert 'error' in evals_result['valid_0'] # custom metric is evaluated despite 'None' being passed - train_booster(params=params_metric_none_verbose, fobj=dummy_obj, feval=constant_metric) + train_booster(params=params_dummy_obj_metric_none_verbose, feval=constant_metric) assert len(evals_result) == 1 assert 'error' in evals_result['valid_0'] @@ -1969,33 +2207,36 @@ def train_booster(params=params_obj_verbose, **kwargs): obj_multi_aliases = ['multiclass', 'softmax', 'multiclassova', 'multiclass_ova', 'ova', 'ovr'] for obj_multi_alias in obj_multi_aliases: + # Custom 
objective replaces multiclass params_obj_class_3_verbose = {'objective': obj_multi_alias, 'num_class': 3, 'verbose': -1} - params_obj_class_1_verbose = {'objective': obj_multi_alias, 'num_class': 1, 'verbose': -1} + params_dummy_obj_class_3_verbose = {'objective': dummy_obj, 'num_class': 3, 'verbose': -1} + params_dummy_obj_class_1_verbose = {'objective': dummy_obj, 'num_class': 1, 'verbose': -1} params_obj_verbose = {'objective': obj_multi_alias, 'verbose': -1} + params_dummy_obj_verbose = {'objective': dummy_obj, 'verbose': -1} # multiclass default metric res = get_cv_result(params_obj_class_3_verbose) assert len(res) == 2 - assert 'multi_logloss-mean' in res + assert 'valid multi_logloss-mean' in res # multiclass default metric with custom one res = get_cv_result(params_obj_class_3_verbose, feval=constant_metric) assert len(res) == 4 - assert 'multi_logloss-mean' in res - assert 'error-mean' in res + assert 'valid multi_logloss-mean' in res + assert 'valid error-mean' in res # multiclass metric alias with custom one for custom objective - res = get_cv_result(params_obj_class_3_verbose, fobj=dummy_obj, feval=constant_metric) + res = get_cv_result(params_dummy_obj_class_3_verbose, feval=constant_metric) assert len(res) == 2 - assert 'error-mean' in res + assert 'valid error-mean' in res # no metric for invalid class_num - res = get_cv_result(params_obj_class_1_verbose, fobj=dummy_obj) + res = get_cv_result(params_dummy_obj_class_1_verbose) assert len(res) == 0 # custom metric for invalid class_num - res = get_cv_result(params_obj_class_1_verbose, fobj=dummy_obj, feval=constant_metric) + res = get_cv_result(params_dummy_obj_class_1_verbose, feval=constant_metric) assert len(res) == 2 - assert 'error-mean' in res + assert 'valid error-mean' in res # multiclass metric alias with custom one with invalid class_num with pytest.raises(lgb.basic.LightGBMError): - get_cv_result(params_obj_class_1_verbose, metrics=obj_multi_alias, - fobj=dummy_obj, feval=constant_metric) + get_cv_result(params_dummy_obj_class_1_verbose, metrics=obj_multi_alias, + feval=constant_metric) # multiclass default metric without num_class with pytest.raises(lgb.basic.LightGBMError): get_cv_result(params_obj_verbose) @@ -2003,11 +2244,11 @@ def train_booster(params=params_obj_verbose, **kwargs): # multiclass metric alias res = get_cv_result(params_obj_class_3_verbose, metrics=metric_multi_alias) assert len(res) == 2 - assert 'multi_logloss-mean' in res + assert 'valid multi_logloss-mean' in res # multiclass metric res = get_cv_result(params_obj_class_3_verbose, metrics='multi_error') assert len(res) == 2 - assert 'multi_error-mean' in res + assert 'valid multi_error-mean' in res # non-valid metric for multiclass objective with pytest.raises(lgb.basic.LightGBMError): get_cv_result(params_obj_class_3_verbose, metrics='binary_logloss') @@ -2016,20 +2257,20 @@ def train_booster(params=params_obj_verbose, **kwargs): with pytest.raises(lgb.basic.LightGBMError): get_cv_result(params_class_3_verbose) # no metric with non-default num_class for custom objective - res = get_cv_result(params_class_3_verbose, fobj=dummy_obj) + res = get_cv_result(params_dummy_obj_class_3_verbose) assert len(res) == 0 for metric_multi_alias in obj_multi_aliases + ['multi_logloss']: # multiclass metric alias for custom objective - res = get_cv_result(params_class_3_verbose, metrics=metric_multi_alias, fobj=dummy_obj) + res = get_cv_result(params_dummy_obj_class_3_verbose, metrics=metric_multi_alias) assert len(res) == 2 - assert 'multi_logloss-mean' in 
res + assert 'valid multi_logloss-mean' in res # multiclass metric for custom objective - res = get_cv_result(params_class_3_verbose, metrics='multi_error', fobj=dummy_obj) + res = get_cv_result(params_dummy_obj_class_3_verbose, metrics='multi_error') assert len(res) == 2 - assert 'multi_error-mean' in res + assert 'valid multi_error-mean' in res # binary metric with non-default num_class for custom objective with pytest.raises(lgb.basic.LightGBMError): - get_cv_result(params_class_3_verbose, metrics='binary_error', fobj=dummy_obj) + get_cv_result(params_dummy_obj_class_3_verbose, metrics='binary_error') def test_multiple_feval_train(): @@ -2048,7 +2289,8 @@ def test_multiple_feval_train(): valid_sets=validation_dataset, num_boost_round=5, feval=[constant_metric, decreasing_metric], - evals_result=evals_result) + callbacks=[lgb.record_evaluation(evals_result)] + ) assert len(evals_result['valid_0']) == 3 assert 'binary_logloss' in evals_result['valid_0'] @@ -2056,6 +2298,97 @@ def test_multiple_feval_train(): assert 'decreasing_metric' in evals_result['valid_0'] +def test_objective_callable_train_binary_classification(): + X, y = load_breast_cancer(return_X_y=True) + params = { + 'verbose': -1, + 'objective': logloss_obj, + 'learning_rate': 0.01 + } + train_dataset = lgb.Dataset(X, y) + booster = lgb.train( + params=params, + train_set=train_dataset, + num_boost_round=20 + ) + y_pred = logistic_sigmoid(booster.predict(X)) + logloss_error = log_loss(y, y_pred) + rocauc_error = roc_auc_score(y, y_pred) + assert booster.params['objective'] == 'none' + assert logloss_error == pytest.approx(0.547907) + assert rocauc_error == pytest.approx(0.995944) + + +def test_objective_callable_train_regression(): + X, y = make_synthetic_regression() + params = { + 'verbose': -1, + 'objective': mse_obj + } + lgb_train = lgb.Dataset(X, y) + booster = lgb.train( + params, + lgb_train, + num_boost_round=20 + ) + y_pred = booster.predict(X) + mse_error = mean_squared_error(y, y_pred) + assert booster.params['objective'] == 'none' + assert mse_error == pytest.approx(286.724194) + + +def test_objective_callable_cv_binary_classification(): + X, y = load_breast_cancer(return_X_y=True) + params = { + 'verbose': -1, + 'objective': logloss_obj, + 'learning_rate': 0.01 + } + train_dataset = lgb.Dataset(X, y) + cv_res = lgb.cv( + params, + train_dataset, + num_boost_round=20, + nfold=3, + return_cvbooster=True + ) + cv_booster = cv_res['cvbooster'].boosters + cv_logloss_errors = [ + log_loss(y, logistic_sigmoid(cb.predict(X))) < 0.56 for cb in cv_booster + ] + cv_objs = [ + cb.params['objective'] == 'none' for cb in cv_booster + ] + assert all(cv_objs) + assert all(cv_logloss_errors) + + +def test_objective_callable_cv_regression(): + X, y = make_synthetic_regression() + lgb_train = lgb.Dataset(X, y) + params = { + 'verbose': -1, + 'objective': mse_obj + } + cv_res = lgb.cv( + params, + lgb_train, + num_boost_round=20, + nfold=3, + stratified=False, + return_cvbooster=True + ) + cv_booster = cv_res['cvbooster'].boosters + cv_mse_errors = [ + mean_squared_error(y, cb.predict(X)) < 463 for cb in cv_booster + ] + cv_objs = [ + cb.params['objective'] == 'none' for cb in cv_booster + ] + assert all(cv_objs) + assert all(cv_mse_errors) + + def test_multiple_feval_cv(): X, y = load_breast_cancer(return_X_y=True) @@ -2071,12 +2404,12 @@ def test_multiple_feval_cv(): # Expect three metrics but mean and stdv for each metric assert len(cv_results) == 6 - assert 'binary_logloss-mean' in cv_results - assert 'error-mean' in cv_results 
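
# `mse_obj`, used by the test_objective_callable_* tests above, is imported
# from this package's .utils module and is not shown in this diff; a sketch
# consistent with the Dataset-style (preds, train_data) signature those tests
# rely on (an assumption about the helper, not its verbatim body):
import numpy as np

def mse_obj(y_pred, train_data):
    y_true = train_data.get_label()
    grad = y_pred - y_true       # first derivative of 0.5 * (y_pred - y_true) ** 2
    hess = np.ones_like(y_pred)  # second derivative is constant
    return grad, hess
# When such a callable is passed through params['objective'], LightGBM trains
# with it and stores 'none' as the objective, which the tests above check via
# booster.params['objective'] == 'none'.
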
- assert 'decreasing_metric-mean' in cv_results - assert 'binary_logloss-stdv' in cv_results - assert 'error-stdv' in cv_results - assert 'decreasing_metric-stdv' in cv_results + assert 'valid binary_logloss-mean' in cv_results + assert 'valid error-mean' in cv_results + assert 'valid decreasing_metric-mean' in cv_results + assert 'valid binary_logloss-stdv' in cv_results + assert 'valid error-stdv' in cv_results + assert 'valid decreasing_metric-stdv' in cv_results def test_default_objective_and_metric(): @@ -2100,6 +2433,70 @@ def test_default_objective_and_metric(): assert len(evals_result['valid_0']['l2']) == 5 +@pytest.mark.parametrize('use_weight', [True, False]) +def test_multiclass_custom_objective(use_weight): + def custom_obj(y_pred, ds): + y_true = ds.get_label() + weight = ds.get_weight() + grad, hess = sklearn_multiclass_custom_objective(y_true, y_pred, weight) + return grad, hess + + centers = [[-4, -4], [4, 4], [-4, 4]] + X, y = make_blobs(n_samples=1_000, centers=centers, random_state=42) + weight = np.full_like(y, 2) + ds = lgb.Dataset(X, y) + if use_weight: + ds.set_weight(weight) + params = {'objective': 'multiclass', 'num_class': 3, 'num_leaves': 7} + builtin_obj_bst = lgb.train(params, ds, num_boost_round=10) + builtin_obj_preds = builtin_obj_bst.predict(X) + + params['objective'] = custom_obj + custom_obj_bst = lgb.train(params, ds, num_boost_round=10) + custom_obj_preds = softmax(custom_obj_bst.predict(X)) + + np.testing.assert_allclose(builtin_obj_preds, custom_obj_preds, rtol=0.01) + + +@pytest.mark.parametrize('use_weight', [True, False]) +def test_multiclass_custom_eval(use_weight): + def custom_eval(y_pred, ds): + y_true = ds.get_label() + weight = ds.get_weight() # weight is None when not set + loss = log_loss(y_true, y_pred, sample_weight=weight) + return 'custom_logloss', loss, False + + centers = [[-4, -4], [4, 4], [-4, 4]] + X, y = make_blobs(n_samples=1_000, centers=centers, random_state=42) + weight = np.full_like(y, 2) + X_train, X_valid, y_train, y_valid, weight_train, weight_valid = train_test_split( + X, y, weight, test_size=0.2, random_state=0 + ) + train_ds = lgb.Dataset(X_train, y_train) + valid_ds = lgb.Dataset(X_valid, y_valid, reference=train_ds) + if use_weight: + train_ds.set_weight(weight_train) + valid_ds.set_weight(weight_valid) + params = {'objective': 'multiclass', 'num_class': 3, 'num_leaves': 7} + eval_result = {} + bst = lgb.train( + params, + train_ds, + num_boost_round=10, + valid_sets=[train_ds, valid_ds], + valid_names=['train', 'valid'], + feval=custom_eval, + callbacks=[lgb.record_evaluation(eval_result)], + keep_training_booster=True, + ) + + for key, ds in zip(['train', 'valid'], [train_ds, valid_ds]): + np.testing.assert_allclose(eval_result[key]['multi_logloss'], eval_result[key]['custom_logloss']) + _, metric, value, _ = bst.eval(ds, key, feval=custom_eval)[1] # first element is multi_logloss + assert metric == 'custom_logloss' + np.testing.assert_allclose(value, eval_result[key][metric][-1]) + + @pytest.mark.skipif(psutil.virtual_memory().available / 1024 / 1024 / 1024 < 3, reason='not enough RAM') def test_model_size(): X, y = make_synthetic_regression() @@ -2128,6 +2525,7 @@ def test_model_size(): pytest.skipTest('not enough RAM') +@pytest.mark.skipif(getenv('TASK', '') == 'cuda_exp', reason='Skip due to differences in implementation details of CUDA Experimental version') def test_get_split_value_histogram(): X, y = load_boston(return_X_y=True) lgb_train = lgb.Dataset(X, y, categorical_feature=[2]) @@ -2192,22 +2590,22 @@ 
def test_get_split_value_histogram(): np.testing.assert_array_equal(hist_idx, hist_name) np.testing.assert_allclose(bins_idx, bins_name) # test bins string type - if np.__version__ > '1.11.0': - hist_vals, bin_edges = gbm.get_split_value_histogram(0, bins='auto') - hist = gbm.get_split_value_histogram(0, bins='auto', xgboost_style=True) - if lgb.compat.PANDAS_INSTALLED: - mask = hist_vals > 0 - np.testing.assert_array_equal(hist_vals[mask], hist['Count'].values) - np.testing.assert_allclose(bin_edges[1:][mask], hist['SplitValue'].values) - else: - mask = hist_vals > 0 - np.testing.assert_array_equal(hist_vals[mask], hist[:, 1]) - np.testing.assert_allclose(bin_edges[1:][mask], hist[:, 0]) + hist_vals, bin_edges = gbm.get_split_value_histogram(0, bins='auto') + hist = gbm.get_split_value_histogram(0, bins='auto', xgboost_style=True) + if lgb.compat.PANDAS_INSTALLED: + mask = hist_vals > 0 + np.testing.assert_array_equal(hist_vals[mask], hist['Count'].values) + np.testing.assert_allclose(bin_edges[1:][mask], hist['SplitValue'].values) + else: + mask = hist_vals > 0 + np.testing.assert_array_equal(hist_vals[mask], hist[:, 1]) + np.testing.assert_allclose(bin_edges[1:][mask], hist[:, 0]) # test histogram is disabled for categorical features with pytest.raises(lgb.basic.LightGBMError): gbm.get_split_value_histogram(2) +@pytest.mark.skipif(getenv('TASK', '') == 'cuda_exp', reason='Skip due to differences in implementation details of CUDA Experimental version') def test_early_stopping_for_only_first_metric(): def metrics_combination_train_regression(valid_sets, metric_list, assumed_iteration, @@ -2220,9 +2618,14 @@ def metrics_combination_train_regression(valid_sets, metric_list, assumed_iterat 'verbose': -1, 'seed': 123 } - gbm = lgb.train(dict(params, first_metric_only=first_metric_only), lgb_train, - num_boost_round=25, valid_sets=valid_sets, feval=feval, - early_stopping_rounds=5) + gbm = lgb.train( + params, + lgb_train, + num_boost_round=25, + valid_sets=valid_sets, + feval=feval, + callbacks=[lgb.early_stopping(stopping_rounds=5, first_metric_only=first_metric_only)] + ) assert assumed_iteration == gbm.best_iteration def metrics_combination_cv_regression(metric_list, assumed_iteration, @@ -2236,11 +2639,15 @@ def metrics_combination_cv_regression(metric_list, assumed_iteration, 'seed': 123, 'gpu_use_dp': True } - ret = lgb.cv(dict(params, first_metric_only=first_metric_only), - train_set=lgb_train, num_boost_round=25, - stratified=False, feval=feval, - early_stopping_rounds=5, - eval_train_metric=eval_train_metric) + ret = lgb.cv( + params, + train_set=lgb_train, + num_boost_round=25, + stratified=False, + feval=feval, + callbacks=[lgb.early_stopping(stopping_rounds=5, first_metric_only=first_metric_only)], + eval_train_metric=eval_train_metric + ) assert assumed_iteration == len(ret[list(ret.keys())[0]]) X, y = load_boston(return_X_y=True) @@ -2334,10 +2741,13 @@ def test_node_level_subcol(): lgb_train = lgb.Dataset(X_train, y_train) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) evals_result = {} - gbm = lgb.train(params, lgb_train, - num_boost_round=25, - valid_sets=lgb_eval, - evals_result=evals_result) + gbm = lgb.train( + params, + lgb_train, + num_boost_round=25, + valid_sets=lgb_eval, + callbacks=[lgb.record_evaluation(evals_result)] + ) ret = log_loss(y_test, gbm.predict(X_test)) assert ret < 0.14 assert evals_result['valid_0']['binary_logloss'][-1] == pytest.approx(ret) @@ -2602,6 +3012,7 @@ def _imptcs_to_numpy(X, impcts_dict): assert tree_df.loc[0, col] is None 
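
# Throughout this diff, the deprecated `early_stopping_rounds` and
# `evals_result` keyword arguments of lgb.train() are replaced by callbacks.
# A self-contained sketch of the callback-based pattern used by the
# metrics_combination_* helpers above (synthetic data, illustrative only):
import numpy as np
import lightgbm as lgb

rng = np.random.RandomState(0)
X, y = rng.rand(200, 5), rng.rand(200)
train_ds = lgb.Dataset(X[:150], y[:150])
valid_ds = lgb.Dataset(X[150:], y[150:], reference=train_ds)
evals_result = {}
bst = lgb.train(
    {'objective': 'l2', 'verbose': -1},
    train_ds,
    num_boost_round=50,
    valid_sets=[valid_ds],
    callbacks=[
        lgb.early_stopping(stopping_rounds=5),  # replaces early_stopping_rounds=5
        lgb.record_evaluation(evals_result),    # replaces evals_result=evals_result
    ],
)
assert 'l2' in evals_result['valid_0']
# bst.best_iteration holds the round chosen by the early-stopping callback
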
+@pytest.mark.skipif(getenv('TASK', '') == 'cuda_exp', reason='Interaction constraints are not yet supported by CUDA Experimental version') def test_interaction_constraints(): X, y = load_boston(return_X_y=True) num_features = X.shape[1] @@ -2650,8 +3061,17 @@ def test_linear_trees(tmp_path): pred1 = est.predict(x) lgb_train = lgb.Dataset(x, label=y) res = {} - est = lgb.train(dict(params, linear_tree=True), lgb_train, num_boost_round=10, evals_result=res, - valid_sets=[lgb_train], valid_names=['train']) + est = lgb.train( + dict( + params, + linear_tree=True + ), + lgb_train, + num_boost_round=10, + valid_sets=[lgb_train], + valid_names=['train'], + callbacks=[lgb.record_evaluation(res)] + ) pred2 = est.predict(x) assert res['train']['l2'][-1] == pytest.approx(mean_squared_error(y, pred2), abs=1e-1) assert mean_squared_error(y, pred2) < mean_squared_error(y, pred1) @@ -2662,15 +3082,35 @@ def test_linear_trees(tmp_path): pred1 = est.predict(x) lgb_train = lgb.Dataset(x, label=y) res = {} - est = lgb.train(dict(params, linear_tree=True), lgb_train, num_boost_round=10, evals_result=res, - valid_sets=[lgb_train], valid_names=['train']) + est = lgb.train( + dict( + params, + linear_tree=True + ), + lgb_train, + num_boost_round=10, + valid_sets=[lgb_train], + valid_names=['train'], + callbacks=[lgb.record_evaluation(res)] + ) pred2 = est.predict(x) assert res['train']['l2'][-1] == pytest.approx(mean_squared_error(y, pred2), abs=1e-1) assert mean_squared_error(y, pred2) < mean_squared_error(y, pred1) # test again with bagging res = {} - est = lgb.train(dict(params, linear_tree=True, subsample=0.8, bagging_freq=1), lgb_train, - num_boost_round=10, evals_result=res, valid_sets=[lgb_train], valid_names=['train']) + est = lgb.train( + dict( + params, + linear_tree=True, + subsample=0.8, + bagging_freq=1 + ), + lgb_train, + num_boost_round=10, + valid_sets=[lgb_train], + valid_names=['train'], + callbacks=[lgb.record_evaluation(res)] + ) pred = est.predict(x) assert res['train']['l2'][-1] == pytest.approx(mean_squared_error(y, pred), abs=1e-1) # test with a feature that has only one non-nan value @@ -2679,8 +3119,19 @@ def test_linear_trees(tmp_path): y[500:] += 10 lgb_train = lgb.Dataset(x, label=y) res = {} - est = lgb.train(dict(params, linear_tree=True, subsample=0.8, bagging_freq=1), lgb_train, - num_boost_round=10, evals_result=res, valid_sets=[lgb_train], valid_names=['train']) + est = lgb.train( + dict( + params, + linear_tree=True, + subsample=0.8, + bagging_freq=1 + ), + lgb_train, + num_boost_round=10, + valid_sets=[lgb_train], + valid_names=['train'], + callbacks=[lgb.record_evaluation(res)] + ) pred = est.predict(x) assert res['train']['l2'][-1] == pytest.approx(mean_squared_error(y, pred), abs=1e-1) # test with a categorical feature @@ -2762,8 +3213,14 @@ def inner_test(X, y, params, early_stopping_rounds): X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) train_data = lgb.Dataset(X_train, label=y_train) valid_data = lgb.Dataset(X_test, label=y_test) - booster = lgb.train(params, train_data, num_boost_round=50, early_stopping_rounds=early_stopping_rounds, - valid_sets=[valid_data]) + callbacks = [lgb.early_stopping(early_stopping_rounds)] if early_stopping_rounds is not None else [] + booster = lgb.train( + params, + train_data, + num_boost_round=50, + valid_sets=[valid_data], + callbacks=callbacks + ) # test that the predict once with all iterations equals summed results with start_iteration and num_iteration all_pred = booster.predict(X, 
raw_score=True) @@ -2852,7 +3309,13 @@ def test_average_precision_metric(): } res = {} lgb_X = lgb.Dataset(X, label=y) - est = lgb.train(params, lgb_X, num_boost_round=10, valid_sets=[lgb_X], evals_result=res) + est = lgb.train( + params, + lgb_X, + num_boost_round=10, + valid_sets=[lgb_X], + callbacks=[lgb.record_evaluation(res)] + ) ap = res['training']['average_precision'][-1] pred = est.predict(X) sklearn_ap = average_precision_score(y, pred) @@ -2861,7 +3324,13 @@ def test_average_precision_metric(): y = y.copy() y[:] = 1 lgb_X = lgb.Dataset(X, label=y) - lgb.train(params, lgb_X, num_boost_round=1, valid_sets=[lgb_X], evals_result=res) + lgb.train( + params, + lgb_X, + num_boost_round=1, + valid_sets=[lgb_X], + callbacks=[lgb.record_evaluation(res)] + ) assert res['training']['average_precision'][-1] == pytest.approx(1) @@ -2938,6 +3407,7 @@ def hook(obj): assert "LV" in dumped_model_str +@pytest.mark.skipif(getenv('TASK', '') == 'cuda_exp', reason='Forced splits are not yet supported by CUDA Experimental version') def test_force_split_with_feature_fraction(tmp_path): X, y = load_boston(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) @@ -2973,3 +3443,224 @@ def test_force_split_with_feature_fraction(tmp_path): for tree in tree_info: tree_structure = tree["tree_structure"] assert tree_structure['split_feature'] == 0 + + +def test_record_evaluation_with_train(): + X, y = make_synthetic_regression() + ds = lgb.Dataset(X, y) + eval_result = {} + callbacks = [lgb.record_evaluation(eval_result)] + params = {'objective': 'l2', 'num_leaves': 3} + num_boost_round = 5 + bst = lgb.train(params, ds, num_boost_round=num_boost_round, valid_sets=[ds], callbacks=callbacks) + assert list(eval_result.keys()) == ['training'] + train_mses = [] + for i in range(num_boost_round): + pred = bst.predict(X, num_iteration=i + 1) + mse = mean_squared_error(y, pred) + train_mses.append(mse) + np.testing.assert_allclose(eval_result['training']['l2'], train_mses) + + +@pytest.mark.parametrize('train_metric', [False, True]) +def test_record_evaluation_with_cv(train_metric): + X, y = make_synthetic_regression() + ds = lgb.Dataset(X, y) + eval_result = {} + callbacks = [lgb.record_evaluation(eval_result)] + metrics = ['l2', 'rmse'] + params = {'objective': 'l2', 'num_leaves': 3, 'metric': metrics} + cv_hist = lgb.cv(params, ds, num_boost_round=5, stratified=False, callbacks=callbacks, eval_train_metric=train_metric) + expected_datasets = {'valid'} + if train_metric: + expected_datasets.add('train') + assert set(eval_result.keys()) == expected_datasets + for dataset in expected_datasets: + for metric in metrics: + for agg in ('mean', 'stdv'): + key = f'{dataset} {metric}-{agg}' + np.testing.assert_allclose( + cv_hist[key], eval_result[dataset][f'{metric}-{agg}'] + ) + + +def test_pandas_with_numpy_regular_dtypes(): + pd = pytest.importorskip('pandas') + uints = ['uint8', 'uint16', 'uint32', 'uint64'] + ints = ['int8', 'int16', 'int32', 'int64'] + bool_and_floats = ['bool', 'float16', 'float32', 'float64'] + rng = np.random.RandomState(42) + + n_samples = 100 + # data as float64 + df = pd.DataFrame({ + 'x1': rng.randint(0, 2, n_samples), + 'x2': rng.randint(1, 3, n_samples), + 'x3': 10 * rng.randint(1, 3, n_samples), + 'x4': 100 * rng.randint(1, 3, n_samples), + }) + df = df.astype(np.float64) + y = df['x1'] * (df['x2'] + df['x3'] + df['x4']) + ds = lgb.Dataset(df, y) + params = {'objective': 'l2', 'num_leaves': 31, 'min_child_samples': 1} + bst = 
lgb.train(params, ds, num_boost_round=5) + preds = bst.predict(df) + + # test all features were used + assert bst.trees_to_dataframe()['split_feature'].nunique() == df.shape[1] + # test the score is better than predicting the mean + baseline = np.full_like(y, y.mean()) + assert mean_squared_error(y, preds) < mean_squared_error(y, baseline) + + # test all predictions are equal using different input dtypes + for target_dtypes in [uints, ints, bool_and_floats]: + df2 = df.astype({f'x{i}': dtype for i, dtype in enumerate(target_dtypes, start=1)}) + assert df2.dtypes.tolist() == target_dtypes + ds2 = lgb.Dataset(df2, y) + bst2 = lgb.train(params, ds2, num_boost_round=5) + preds2 = bst2.predict(df2) + np.testing.assert_allclose(preds, preds2) + + +def test_pandas_nullable_dtypes(): + pd = pytest.importorskip('pandas') + rng = np.random.RandomState(0) + df = pd.DataFrame({ + 'x1': rng.randint(1, 3, size=100), + 'x2': np.linspace(-1, 1, 100), + 'x3': pd.arrays.SparseArray(rng.randint(0, 11, size=100)), + 'x4': rng.rand(100) < 0.5, + }) + # introduce some missing values + df.loc[1, 'x1'] = np.nan + df.loc[2, 'x2'] = np.nan + df.loc[3, 'x4'] = np.nan + # the previous line turns x4 into object dtype in recent versions of pandas + df['x4'] = df['x4'].astype(np.float64) + y = df['x1'] * df['x2'] + df['x3'] * (1 + df['x4']) + y = y.fillna(0) + + # train with regular dtypes + params = {'objective': 'l2', 'num_leaves': 31, 'min_child_samples': 1} + ds = lgb.Dataset(df, y) + bst = lgb.train(params, ds, num_boost_round=5) + preds = bst.predict(df) + + # convert to nullable dtypes + df2 = df.copy() + df2['x1'] = df2['x1'].astype('Int32') + df2['x2'] = df2['x2'].astype('Float64') + df2['x4'] = df2['x4'].astype('boolean') + + # test training succeeds + ds_nullable_dtypes = lgb.Dataset(df2, y) + bst_nullable_dtypes = lgb.train(params, ds_nullable_dtypes, num_boost_round=5) + preds_nullable_dtypes = bst_nullable_dtypes.predict(df2) + + trees_df = bst_nullable_dtypes.trees_to_dataframe() + # test all features were used + assert trees_df['split_feature'].nunique() == df.shape[1] + # test the score is better than predicting the mean + baseline = np.full_like(y, y.mean()) + assert mean_squared_error(y, preds) < mean_squared_error(y, baseline) + + # test equal predictions + np.testing.assert_allclose(preds, preds_nullable_dtypes) + + +def test_boost_from_average_with_single_leaf_trees(): + # test data are taken from bug report + # https://github.com/microsoft/LightGBM/issues/4708 + X = np.array([ + [1021.0589, 1018.9578], + [1023.85754, 1018.7854], + [1024.5468, 1018.88513], + [1019.02954, 1018.88513], + [1016.79926, 1018.88513], + [1007.6, 1018.88513]], dtype=np.float32) + y = np.array([1023.8, 1024.6, 1024.4, 1023.8, 1022.0, 1014.4], dtype=np.float32) + params = { + "extra_trees": True, + "min_data_in_bin": 1, + "extra_seed": 7, + "objective": "regression", + "verbose": -1, + "boost_from_average": True, + "min_data_in_leaf": 1, + } + train_set = lgb.Dataset(X, y) + model = lgb.train(params=params, train_set=train_set, num_boost_round=10) + + preds = model.predict(X) + mean_preds = np.mean(preds) + assert y.min() <= mean_preds <= y.max() + + +def test_cegb_split_buffer_clean(): + # modified from https://github.com/microsoft/LightGBM/issues/3679#issuecomment-938652811 + # and https://github.com/microsoft/LightGBM/pull/5087 + # test that the ``splits_per_leaf_`` of CEGB is cleaned before training a new tree + # which is done in the fix #5164 + # without the fix: + # Check failed: (best_split_info.left_count) > (0) + +
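+    # (background on the CEGB parameters set below, per the LightGBM docs:
+    #  'cegb_tradeoff' is a multiplier applied to all cost-effective
+    #  gradient boosting penalties, 'cegb_penalty_split' charges a fixed
+    #  cost for every split, and 'cegb_penalty_feature_coupled' charges a
+    #  one-time cost the first time each feature is used in the model)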
R, C = 1000, 100 + seed = 29 + np.random.seed(seed) + data = np.random.randn(R, C) + for i in range(1, C): + data[i] += data[0] * np.random.randn() + + N = int(0.8 * len(data)) + train_data = data[:N] + test_data = data[N:] + train_y = np.sum(train_data, axis=1) + test_y = np.sum(test_data, axis=1) + + train = lgb.Dataset(train_data, train_y, free_raw_data=True) + + params = { + 'boosting_type': 'gbdt', + 'objective': 'regression', + 'max_bin': 255, + 'num_leaves': 31, + 'seed': 0, + 'learning_rate': 0.1, + 'min_data_in_leaf': 0, + 'verbose': -1, + 'min_split_gain': 1000.0, + 'cegb_penalty_feature_coupled': 5 * np.arange(C), + 'cegb_penalty_split': 0.0002, + 'cegb_tradeoff': 10.0, + 'force_col_wise': True, + } + + model = lgb.train(params, train, num_boost_round=10) + predicts = model.predict(test_data) + rmse = np.sqrt(mean_squared_error(test_y, predicts)) + assert rmse < 10.0 + + +@pytest.mark.skipif(not PANDAS_INSTALLED, reason='pandas is not installed') +def test_validate_features(): + X, y = make_synthetic_regression() + features = ['x1', 'x2', 'x3', 'x4'] + df = pd_DataFrame(X, columns=features) + ds = lgb.Dataset(df, y) + bst = lgb.train({'num_leaves': 15, 'verbose': -1}, ds, num_boost_round=10) + assert bst.feature_name() == features + + # try to predict with a different feature + df2 = df.rename(columns={'x3': 'z'}) + with pytest.raises(lgb.basic.LightGBMError, match="Expected 'x3' at position 2 but found 'z'"): + bst.predict(df2, validate_features=True) + + # check that disabling the check doesn't raise the error + bst.predict(df2, validate_features=False) + + # try to refit with a different feature + with pytest.raises(lgb.basic.LightGBMError, match="Expected 'x3' at position 2 but found 'z'"): + bst.refit(df2, y, validate_features=True) + + # check that disabling the check doesn't raise the error + bst.refit(df2, y, validate_features=False) diff --git a/tests/python_package_test/test_plotting.py b/tests/python_package_test/test_plotting.py index e5e8b6eeda65..8b61a2e47cdb 100644 --- a/tests/python_package_test/test_plotting.py +++ b/tests/python_package_test/test_plotting.py @@ -198,7 +198,7 @@ def test_plot_metrics(params, breast_cancer_split, train_data): valid_sets=[train_data, test_data], valid_names=['v1', 'v2'], num_boost_round=10, - evals_result=evals_result0) + callbacks=[lgb.record_evaluation(evals_result0)]) with pytest.warns(UserWarning, match="More than one metric available, picking one to plot."): ax0 = lgb.plot_metric(evals_result0) assert isinstance(ax0, matplotlib.axes.Axes) @@ -258,7 +258,7 @@ def test_plot_metrics(params, breast_cancer_split, train_data): evals_result1 = {} lgb.train(params, train_data, num_boost_round=10, - evals_result=evals_result1) + callbacks=[lgb.record_evaluation(evals_result1)]) with pytest.raises(ValueError, match="eval results cannot be empty."): lgb.plot_metric(evals_result1) diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index d996808a271a..e7d8c8de4a6a 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -1,36 +1,48 @@ # coding: utf-8 import itertools import math +import re +from functools import partial +from os import getenv from pathlib import Path import joblib import numpy as np import pytest -from pkg_resources import parse_version -from sklearn import __version__ as sk_version from sklearn.base import clone -from sklearn.datasets import load_svmlight_file, make_multilabel_classification +from sklearn.datasets import 
load_svmlight_file, make_blobs, make_multilabel_classification +from sklearn.ensemble import StackingClassifier, StackingRegressor from sklearn.metrics import log_loss, mean_squared_error from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split from sklearn.multioutput import ClassifierChain, MultiOutputClassifier, MultiOutputRegressor, RegressorChain -from sklearn.utils.estimator_checks import check_parameters_default_constructible +from sklearn.utils.estimator_checks import parametrize_with_checks from sklearn.utils.validation import check_is_fitted import lightgbm as lgb +from lightgbm.compat import PANDAS_INSTALLED, pd_DataFrame from .utils import (load_boston, load_breast_cancer, load_digits, load_iris, load_linnerud, make_ranking, - make_synthetic_regression) + make_synthetic_regression, sklearn_multiclass_custom_objective, softmax) -sk_version = parse_version(sk_version) -if sk_version < parse_version("0.23"): - import warnings +decreasing_generator = itertools.count(0, -1) +task_to_model_factory = { + 'ranking': lgb.LGBMRanker, + 'classification': lgb.LGBMClassifier, + 'regression': lgb.LGBMRegressor, +} - from sklearn.exceptions import SkipTestWarning - from sklearn.utils.estimator_checks import SkipTest, _yield_all_checks -else: - from sklearn.utils.estimator_checks import parametrize_with_checks -decreasing_generator = itertools.count(0, -1) +def _create_data(task): + if task == 'ranking': + X, y, g = make_ranking(n_features=4) + g = np.bincount(g) + elif task == 'classification': + X, y = load_iris(return_X_y=True) + g = None + elif task == 'regression': + X, y = make_synthetic_regression() + g = None + return X, y, g class UnpicklableCallback: @@ -38,7 +50,7 @@ def __reduce__(self): raise Exception("This class in not picklable") def __call__(self, env): - env.model.set_attr(attr_set_inside_callback=str(env.iteration * 10)) + env.model.attr_set_inside_callback = env.iteration * 10 def custom_asymmetric_obj(y_true, y_pred): @@ -109,6 +121,7 @@ def test_regression(): assert gbm.evals_result_['valid_0']['l2'][gbm.best_iteration_ - 1] == pytest.approx(ret) +@pytest.mark.skipif(getenv('TASK', '') == 'cuda_exp', reason='Skip due to differences in implementation details of CUDA Experimental version') def test_multiclass(): X, y = load_digits(n_class=10, return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) @@ -121,6 +134,7 @@ def test_multiclass(): assert gbm.evals_result_['valid_0']['multi_logloss'][gbm.best_iteration_ - 1] == pytest.approx(ret) +@pytest.mark.skipif(getenv('TASK', '') == 'cuda_exp', reason='Skip due to differences in implementation details of CUDA Experimental version') def test_lambdarank(): rank_example_dir = Path(__file__).absolute().parents[2] / 'examples' / 'lambdarank' X_train, y_train = load_svmlight_file(str(rank_example_dir / 'rank.train')) @@ -259,11 +273,7 @@ def test_dart(): assert score <= 1. 
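For reference, the _create_data() helper and task_to_model_factory mapping introduced above are combined in the parametrized tests later in this file; a minimal sketch of the intended pattern (the hard-coded 'regression' literal is illustrative — pytest.mark.parametrize supplies the task in the real tests):

task = 'regression'
X, y, g = _create_data(task)
model = task_to_model_factory[task](n_estimators=5, num_leaves=7)
if task == 'ranking':
    # only the ranking task uses 'g', the per-query group sizes
    model.fit(X, y, group=g)
else:
    model.fit(X, y)
preds = model.predict(X)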
-# sklearn <0.23 does not have a stacking classifier and n_features_in_ property -@pytest.mark.skipif(sk_version < parse_version("0.23"), reason='scikit-learn version is less than 0.23') def test_stacking_classifier(): - from sklearn.ensemble import StackingClassifier - X, y = load_iris(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) classifiers = [('gbm1', lgb.LGBMClassifier(n_estimators=3)), @@ -284,11 +294,7 @@ def test_stacking_classifier(): assert all(clf.classes_ == clf.named_estimators_['gbm1'].classes_) -# sklearn <0.23 does not have a stacking regressor and n_features_in_ property -@pytest.mark.skipif(sk_version < parse_version('0.23'), reason='scikit-learn version is less than 0.23') def test_stacking_regressor(): - from sklearn.ensemble import StackingRegressor - X, y = load_boston(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) regressors = [('gbm1', lgb.LGBMRegressor(n_estimators=3)), @@ -373,8 +379,6 @@ def test_random_search(): assert score <= 1. -# sklearn < 0.22 does not have the post fit attribute: classes_ -@pytest.mark.skipif(sk_version < parse_version('0.22'), reason='scikit-learn version is less than 0.22') def test_multioutput_classifier(): n_outputs = 3 X, y = make_multilabel_classification(n_samples=100, n_features=20, @@ -394,8 +398,6 @@ def test_multioutput_classifier(): assert isinstance(classifier.booster_, lgb.Booster) -# sklearn < 0.23 does not have as_frame parameter -@pytest.mark.skipif(sk_version < parse_version('0.23'), reason='scikit-learn version is less than 0.23') def test_multioutput_regressor(): bunch = load_linnerud(as_frame=True) # returns a Bunch instance X, y = bunch['data'], bunch['target'] @@ -412,8 +414,6 @@ def test_multioutput_regressor(): assert isinstance(regressor.booster_, lgb.Booster) -# sklearn < 0.22 does not have the post fit attribute: classes_ -@pytest.mark.skipif(sk_version < parse_version('0.22'), reason='scikit-learn version is less than 0.22') def test_classifier_chain(): n_outputs = 3 X, y = make_multilabel_classification(n_samples=100, n_features=20, @@ -435,8 +435,6 @@ def test_classifier_chain(): assert isinstance(classifier.booster_, lgb.Booster) -# sklearn < 0.23 does not have as_frame parameter -@pytest.mark.skipif(sk_version < parse_version('0.23'), reason='scikit-learn version is less than 0.23') def test_regressor_chain(): bunch = load_linnerud(as_frame=True) # returns a Bunch instance X, y = bunch['data'], bunch['target'] @@ -518,7 +516,7 @@ def test_non_serializable_objects_in_callbacks(tmp_path): X, y = make_synthetic_regression() gbm = lgb.LGBMRegressor(n_estimators=5) gbm.fit(X, y, callbacks=[unpicklable_callback]) - assert gbm.booster_.attr('attr_set_inside_callback') == '40' + assert gbm.booster_.attr_set_inside_callback == 40 def test_random_state_object(): @@ -634,20 +632,15 @@ def test_pandas_categorical(): def test_pandas_sparse(): pd = pytest.importorskip("pandas") - try: - from pandas.arrays import SparseArray - except ImportError: # support old versions - from pandas import SparseArray - X = pd.DataFrame({"A": SparseArray(np.random.permutation([0, 1, 2] * 100)), - "B": SparseArray(np.random.permutation([0.0, 0.1, 0.2, -0.1, 0.2] * 60)), - "C": SparseArray(np.random.permutation([True, False] * 150))}) - y = pd.Series(SparseArray(np.random.permutation([0, 1] * 150))) - X_test = pd.DataFrame({"A": SparseArray(np.random.permutation([0, 2] * 30)), - "B": SparseArray(np.random.permutation([0.0, 0.1, 0.2, -0.1] * 15)), - 
"C": SparseArray(np.random.permutation([True, False] * 30))}) - if pd.__version__ >= '0.24.0': - for dtype in pd.concat([X.dtypes, X_test.dtypes, pd.Series(y.dtypes)]): - assert pd.api.types.is_sparse(dtype) + X = pd.DataFrame({"A": pd.arrays.SparseArray(np.random.permutation([0, 1, 2] * 100)), + "B": pd.arrays.SparseArray(np.random.permutation([0.0, 0.1, 0.2, -0.1, 0.2] * 60)), + "C": pd.arrays.SparseArray(np.random.permutation([True, False] * 150))}) + y = pd.Series(pd.arrays.SparseArray(np.random.permutation([0, 1] * 150))) + X_test = pd.DataFrame({"A": pd.arrays.SparseArray(np.random.permutation([0, 2] * 30)), + "B": pd.arrays.SparseArray(np.random.permutation([0.0, 0.1, 0.2, -0.1] * 15)), + "C": pd.arrays.SparseArray(np.random.permutation([True, False] * 30))}) + for dtype in pd.concat([X.dtypes, X_test.dtypes, pd.Series(y.dtypes)]): + assert pd.api.types.is_sparse(dtype) gbm = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y) pred_sparse = gbm.predict(X_test, raw_score=True) if hasattr(X_test, 'sparse'): @@ -1088,19 +1081,6 @@ def test_multiple_eval_metrics(): assert 'binary_logloss' in gbm.evals_result_['training'] -def test_inf_handle(): - nrows = 100 - ncols = 10 - X = np.random.randn(nrows, ncols) - y = np.random.randn(nrows) + np.full(nrows, 1e30) - weight = np.full(nrows, 1e10) - params = {'n_estimators': 20, 'verbose': -1} - params_fit = {'X': X, 'y': y, 'sample_weight': weight, 'eval_set': (X, y), - 'callbacks': [lgb.early_stopping(5)]} - gbm = lgb.LGBMRegressor(**params).fit(**params_fit) - np.testing.assert_allclose(gbm.evals_result_['training']['l2'], np.inf) - - def test_nan_handle(): nrows = 100 ncols = 10 @@ -1114,6 +1094,7 @@ def test_nan_handle(): np.testing.assert_allclose(gbm.evals_result_['training']['l2'], np.nan) +@pytest.mark.skipif(getenv('TASK', '') == 'cuda_exp', reason='Skip due to differences in implementation details of CUDA Experimental version') def test_first_metric_only(): def fit_and_check(eval_set_names, metric_names, assumed_iteration, first_metric_only): @@ -1271,8 +1252,6 @@ def test_actual_number_of_trees(): np.testing.assert_array_equal(gbm.predict(np.array(X) * 10), y) -# sklearn < 0.22 requires passing "attributes" argument -@pytest.mark.skipif(sk_version < parse_version('0.22'), reason='scikit-learn version is less than 0.22') def test_check_is_fitted(): X, y = load_digits(n_class=2, return_X_y=True) est = lgb.LGBMModel(n_estimators=5, objective="binary") @@ -1291,65 +1270,16 @@ def test_check_is_fitted(): check_is_fitted(model) -def _tested_estimators(): - for Estimator in [lgb.sklearn.LGBMClassifier, lgb.sklearn.LGBMRegressor]: - yield Estimator() - - -if sk_version < parse_version("0.23"): - def _generate_checks_per_estimator(check_generator, estimators): - for estimator in estimators: - name = estimator.__class__.__name__ - for check in check_generator(name, estimator): - yield estimator, check - - @pytest.mark.skipif( - sk_version < parse_version("0.21"), reason="scikit-learn version is less than 0.21" - ) - @pytest.mark.parametrize( - "estimator, check", - _generate_checks_per_estimator(_yield_all_checks, _tested_estimators()), - ) - def test_sklearn_integration(estimator, check): - xfail_checks = estimator._get_tags()["_xfail_checks"] - check_name = check.__name__ if hasattr(check, "__name__") else check.func.__name__ - if xfail_checks and check_name in xfail_checks: - warnings.warn(xfail_checks[check_name], SkipTestWarning) - raise SkipTest - estimator.set_params(min_child_samples=1, min_data_in_bin=1) - name = 
estimator.__class__.__name__ - check(name, estimator) -else: - @parametrize_with_checks(list(_tested_estimators())) - def test_sklearn_integration(estimator, check, request): - estimator.set_params(min_child_samples=1, min_data_in_bin=1) - check(estimator) - - -@pytest.mark.skipif( - sk_version >= parse_version("0.24"), - reason="Default constructible check included in common check from 0.24" -) -@pytest.mark.parametrize("estimator", list(_tested_estimators())) -def test_parameters_default_constructible(estimator): - name, Estimator = estimator.__class__.__name__, estimator.__class__ - # Test that estimators are default-constructible - check_parameters_default_constructible(name, Estimator) +@parametrize_with_checks([lgb.LGBMClassifier(), lgb.LGBMRegressor()]) +def test_sklearn_integration(estimator, check): + estimator.set_params(min_child_samples=1, min_data_in_bin=1) + check(estimator) @pytest.mark.parametrize('task', ['classification', 'ranking', 'regression']) def test_training_succeeds_when_data_is_dataframe_and_label_is_column_array(task): pd = pytest.importorskip("pandas") - if task == 'ranking': - X, y, g = make_ranking() - g = np.bincount(g) - model_factory = lgb.LGBMRanker - elif task == 'classification': - X, y = load_iris(return_X_y=True) - model_factory = lgb.LGBMClassifier - elif task == 'regression': - X, y = make_synthetic_regression() - model_factory = lgb.LGBMRegressor + X, y, g = _create_data(task) X = pd.DataFrame(X) y_col_array = y.reshape(-1, 1) params = { @@ -1357,6 +1287,7 @@ def test_training_succeeds_when_data_is_dataframe_and_label_is_column_array(task 'num_leaves': 3, 'random_state': 0 } + model_factory = task_to_model_factory[task] with pytest.warns(UserWarning, match='column-vector'): if task == 'ranking': model_1d = model_factory(**params).fit(X, y, group=g) @@ -1368,3 +1299,109 @@ def test_training_succeeds_when_data_is_dataframe_and_label_is_column_array(task preds_1d = model_1d.predict(X) preds_2d = model_2d.predict(X) np.testing.assert_array_equal(preds_1d, preds_2d) + + +@pytest.mark.parametrize('use_weight', [True, False]) +def test_multiclass_custom_objective(use_weight): + centers = [[-4, -4], [4, 4], [-4, 4]] + X, y = make_blobs(n_samples=1_000, centers=centers, random_state=42) + weight = np.full_like(y, 2) if use_weight else None + params = {'n_estimators': 10, 'num_leaves': 7} + builtin_obj_model = lgb.LGBMClassifier(**params) + builtin_obj_model.fit(X, y, sample_weight=weight) + builtin_obj_preds = builtin_obj_model.predict_proba(X) + + custom_obj_model = lgb.LGBMClassifier(objective=sklearn_multiclass_custom_objective, **params) + custom_obj_model.fit(X, y, sample_weight=weight) + custom_obj_preds = softmax(custom_obj_model.predict(X, raw_score=True)) + + np.testing.assert_allclose(builtin_obj_preds, custom_obj_preds, rtol=0.01) + assert not callable(builtin_obj_model.objective_) + assert callable(custom_obj_model.objective_) + + +@pytest.mark.parametrize('use_weight', [True, False]) +def test_multiclass_custom_eval(use_weight): + def custom_eval(y_true, y_pred, weight): + loss = log_loss(y_true, y_pred, sample_weight=weight) + return 'custom_logloss', loss, False + + centers = [[-4, -4], [4, 4], [-4, 4]] + X, y = make_blobs(n_samples=1_000, centers=centers, random_state=42) + train_test_split_func = partial(train_test_split, test_size=0.2, random_state=0) + X_train, X_valid, y_train, y_valid = train_test_split_func(X, y) + if use_weight: + weight = np.full_like(y, 2) + weight_train, weight_valid = train_test_split_func(weight) + else: + 
weight_train = None + weight_valid = None + params = {'objective': 'multiclass', 'num_class': 3, 'num_leaves': 7} + model = lgb.LGBMClassifier(**params) + model.fit( + X_train, + y_train, + sample_weight=weight_train, + eval_set=[(X_train, y_train), (X_valid, y_valid)], + eval_names=['train', 'valid'], + eval_sample_weight=[weight_train, weight_valid], + eval_metric=custom_eval, + ) + eval_result = model.evals_result_ + train_ds = (X_train, y_train, weight_train) + valid_ds = (X_valid, y_valid, weight_valid) + for key, (X, y_true, weight) in zip(['train', 'valid'], [train_ds, valid_ds]): + np.testing.assert_allclose( + eval_result[key]['multi_logloss'], eval_result[key]['custom_logloss'] + ) + y_pred = model.predict_proba(X) + _, metric_value, _ = custom_eval(y_true, y_pred, weight) + np.testing.assert_allclose(metric_value, eval_result[key]['custom_logloss'][-1]) + + +def test_negative_n_jobs(tmp_path): + n_threads = joblib.cpu_count() + if n_threads <= 1: + return None + # 'val_minus_two' here is the expected number of threads for n_jobs=-2 + val_minus_two = n_threads - 1 + X, y = load_breast_cancer(return_X_y=True) + # Note: according to joblib's formula, a value of n_jobs=-2 means + # "use all but one thread" (formula: n_cpus + 1 + n_jobs) + gbm = lgb.LGBMClassifier(n_estimators=2, verbose=-1, n_jobs=-2).fit(X, y) + gbm.booster_.save_model(tmp_path / "model.txt") + with open(tmp_path / "model.txt", "r") as f: + model_txt = f.read() + assert bool(re.search(rf"\[num_threads: {val_minus_two}\]", model_txt)) + + +def test_default_n_jobs(tmp_path): + n_cores = joblib.cpu_count(only_physical_cores=True) + X, y = load_breast_cancer(return_X_y=True) + gbm = lgb.LGBMClassifier(n_estimators=2, verbose=-1, n_jobs=None).fit(X, y) + gbm.booster_.save_model(tmp_path / "model.txt") + with open(tmp_path / "model.txt", "r") as f: + model_txt = f.read() + assert bool(re.search(rf"\[num_threads: {n_cores}\]", model_txt)) + + +@pytest.mark.skipif(not PANDAS_INSTALLED, reason='pandas is not installed') +@pytest.mark.parametrize('task', ['classification', 'ranking', 'regression']) +def test_validate_features(task): + X, y, g = _create_data(task) + features = ['x1', 'x2', 'x3', 'x4'] + df = pd_DataFrame(X, columns=features) + model = task_to_model_factory[task](n_estimators=10, num_leaves=15, verbose=-1) + if task == 'ranking': + model.fit(df, y, group=g) + else: + model.fit(df, y) + assert model.feature_name_ == features + + # try to predict with a different feature + df2 = df.rename(columns={'x2': 'z'}) + with pytest.raises(lgb.basic.LightGBMError, match="Expected 'x2' at position 1 but found 'z'"): + model.predict(df2, validate_features=True) + + # check that disabling the check doesn't raise the error + model.predict(df2, validate_features=False) diff --git a/tests/python_package_test/test_utilities.py b/tests/python_package_test/test_utilities.py index be57d585f695..9c8cd23519fc 100644 --- a/tests/python_package_test/test_utilities.py +++ b/tests/python_package_test/test_utilities.py @@ -2,6 +2,7 @@ import logging import numpy as np +import pytest import lightgbm as lgb @@ -43,7 +44,7 @@ def dummy_metric(_, __): lgb.plot_metric(eval_records) expected_log = r""" -INFO | [LightGBM] [Warning] There are no meaningful features, as all feature values are constant. +INFO | [LightGBM] [Warning] There are no meaningful features which satisfy the provided configuration. Decreasing Dataset parameters min_data_in_bin or min_data_in_leaf and re-constructing Dataset might resolve this warning. 
INFO | [LightGBM] [Info] Number of positive: 2, number of negative: 2 INFO | [LightGBM] [Info] Total Bins 0 INFO | [LightGBM] [Info] Number of data points in the train set: 4, number of used features: 0 @@ -97,3 +98,70 @@ def dummy_metric(_, __): actual_log_wo_gpu_stuff.append(line) assert "\n".join(actual_log_wo_gpu_stuff) == expected_log + + +def test_register_invalid_logger(): + class LoggerWithoutInfoMethod: + def warning(self, msg: str) -> None: + print(msg) + + class LoggerWithoutWarningMethod: + def info(self, msg: str) -> None: + print(msg) + + class LoggerWithAttributeNotCallable: + def __init__(self): + self.info = 1 + self.warning = 2 + + expected_error_message = "Logger must provide 'info' and 'warning' method" + + with pytest.raises(TypeError, match=expected_error_message): + lgb.register_logger(LoggerWithoutInfoMethod()) + + with pytest.raises(TypeError, match=expected_error_message): + lgb.register_logger(LoggerWithoutWarningMethod()) + + with pytest.raises(TypeError, match=expected_error_message): + lgb.register_logger(LoggerWithAttributeNotCallable()) + + +def test_register_custom_logger(): + logged_messages = [] + + class CustomLogger: + def custom_info(self, msg: str) -> None: + logged_messages.append(msg) + + def custom_warning(self, msg: str) -> None: + logged_messages.append(msg) + + custom_logger = CustomLogger() + lgb.register_logger( + custom_logger, + info_method_name="custom_info", + warning_method_name="custom_warning" + ) + + lgb.basic._log_info("info message") + lgb.basic._log_warning("warning message") + + expected_log = ["info message", "warning message"] + assert logged_messages == expected_log + + logged_messages = [] + X = np.array([[1, 2, 3], + [1, 2, 4], + [1, 2, 4], + [1, 2, 3]], + dtype=np.float32) + y = np.array([0, 1, 1, 0]) + lgb_data = lgb.Dataset(X, y) + lgb.train( + {'objective': 'binary', 'metric': 'auc'}, + lgb_data, + num_boost_round=10, + valid_sets=[lgb_data], + categorical_feature=[1] + ) + assert logged_messages, "custom logger was not called" diff --git a/tests/python_package_test/utils.py b/tests/python_package_test/utils.py index 21d5f2cd542a..fc142ede9fe7 100644 --- a/tests/python_package_test/utils.py +++ b/tests/python_package_test/utils.py @@ -1,6 +1,9 @@ # coding: utf-8 +import pickle from functools import lru_cache +import cloudpickle +import joblib import numpy as np import sklearn.datasets from sklearn.utils import check_random_state @@ -114,3 +117,65 @@ def make_ranking(n_samples=100, n_features=20, n_informative=5, gmax=2, @lru_cache(maxsize=None) def make_synthetic_regression(n_samples=100): return sklearn.datasets.make_regression(n_samples, n_features=4, n_informative=2, random_state=42) + + +def dummy_obj(preds, train_data): + return np.ones(preds.shape), np.ones(preds.shape) + + +def mse_obj(y_pred, dtrain): + y_true = dtrain.get_label() + grad = (y_pred - y_true) + hess = np.ones(len(grad)) + return grad, hess + + +def softmax(x): + row_wise_max = np.max(x, axis=1).reshape(-1, 1) + exp_x = np.exp(x - row_wise_max) + return exp_x / np.sum(exp_x, axis=1).reshape(-1, 1) + + +def logistic_sigmoid(x): + return 1.0 / (1.0 + np.exp(-x)) + + +def sklearn_multiclass_custom_objective(y_true, y_pred, weight=None): + num_rows, num_class = y_pred.shape + prob = softmax(y_pred) + grad_update = np.zeros_like(prob) + grad_update[np.arange(num_rows), y_true.astype(np.int32)] = -1.0 + grad = prob + grad_update + factor = num_class / (num_class - 1) + hess = factor * prob * (1 - prob) + if weight is not None: + weight2d = 
weight.reshape(-1, 1) + grad *= weight2d + hess *= weight2d + return grad, hess + + +def pickle_obj(obj, filepath, serializer): + if serializer == 'pickle': + with open(filepath, 'wb') as f: + pickle.dump(obj, f) + elif serializer == 'joblib': + joblib.dump(obj, filepath) + elif serializer == 'cloudpickle': + with open(filepath, 'wb') as f: + cloudpickle.dump(obj, f) + else: + raise ValueError(f'Unrecognized serializer type: {serializer}') + + +def unpickle_obj(filepath, serializer): + if serializer == 'pickle': + with open(filepath, 'rb') as f: + return pickle.load(f) + elif serializer == 'joblib': + return joblib.load(filepath) + elif serializer == 'cloudpickle': + with open(filepath, 'rb') as f: + return cloudpickle.load(f) + else: + raise ValueError(f'Unrecognized serializer type: {serializer}') diff --git a/windows/LightGBM.vcxproj b/windows/LightGBM.vcxproj index 59b589a40d51..bd0ad783f59f 100644 --- a/windows/LightGBM.vcxproj +++ b/windows/LightGBM.vcxproj @@ -317,10 +317,14 @@ + + + + diff --git a/windows/LightGBM.vcxproj.filters b/windows/LightGBM.vcxproj.filters index 0f48c7564580..b0e3d7744f3e 100644 --- a/windows/LightGBM.vcxproj.filters +++ b/windows/LightGBM.vcxproj.filters @@ -326,5 +326,17 @@ src\treelearner + + src\io + + + src\io + + + src\io + + + src\io + \ No newline at end of file
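As a closing usage note for the pickle_obj() / unpickle_obj() helpers added to tests/python_package_test/utils.py above: a round-trip sketch, assuming both helpers are in scope and that joblib and cloudpickle are installed (the file name and model settings are illustrative):

import numpy as np
import lightgbm as lgb

X = np.random.rand(50, 2)
y = np.random.rand(50)
model = lgb.LGBMRegressor(n_estimators=2, min_child_samples=1).fit(X, y)
for serializer in ('pickle', 'joblib', 'cloudpickle'):
    path = f'model.{serializer}'
    pickle_obj(obj=model, filepath=path, serializer=serializer)
    restored = unpickle_obj(filepath=path, serializer=serializer)
    # a faithful round trip must reproduce the original predictions exactly
    np.testing.assert_allclose(restored.predict(X), model.predict(X))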