From 9805d5cbb9949a762f6cf30d5ede1d25c1a2e025 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Thu, 11 Sep 2025 16:32:54 -0700 Subject: [PATCH 001/157] upgrade numpy to 2.0 --- docker/{1.7-1 => 2.1.0}/base/Dockerfile.cpu | 8 ++++---- docker/{1.7-1 => 2.1.0}/final/Dockerfile.cpu | 17 +++++++++++++---- .../resources/mms/ExecutionParameters.java | 0 .../resources/mms/config.properties.tmp | 0 .../resources/mms/endpoints-1.0.jar | Bin requirements.txt | 10 +++++----- test/resources/versions/train.py | 8 ++++---- tox.ini | 1 + 8 files changed, 27 insertions(+), 17 deletions(-) rename docker/{1.7-1 => 2.1.0}/base/Dockerfile.cpu (97%) rename docker/{1.7-1 => 2.1.0}/final/Dockerfile.cpu (76%) rename docker/{1.7-1 => 2.1.0}/resources/mms/ExecutionParameters.java (100%) rename docker/{1.7-1 => 2.1.0}/resources/mms/config.properties.tmp (100%) rename docker/{1.7-1 => 2.1.0}/resources/mms/endpoints-1.0.jar (100%) diff --git a/docker/1.7-1/base/Dockerfile.cpu b/docker/2.1.0/base/Dockerfile.cpu similarity index 97% rename from docker/1.7-1/base/Dockerfile.cpu rename to docker/2.1.0/base/Dockerfile.cpu index e1f72ffd..69ea547c 100644 --- a/docker/1.7-1/base/Dockerfile.cpu +++ b/docker/2.1.0/base/Dockerfile.cpu @@ -26,13 +26,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu${UBUNTU_VERSION}@sha256:${IMAGE_DIGEST} ARG MINICONDA_VERSION=24.7.1 -ARG CONDA_CHECKSUM=2006a61abc8b4fd04de5eb92620e1f72bada713cc84b5b4899463095e1210556 -ARG CONDA_PY_VERSION=39 +ARG CONDA_CHECKSUM=684cda724bc37e3bbbb342e440fc4cac515c92e91a489eb4359feca35382894b +ARG CONDA_PY_VERSION=310 ARG CONDA_PKG_VERSION=24.7.1 -ARG PYTHON_VERSION=3.9 +ARG PYTHON_VERSION=3.10 ARG PYARROW_VERSION=14.0.1 ARG MLIO_VERSION=0.9.0 -ARG XGBOOST_VERSION=1.7.4 +ARG XGBOOST_VERSION=2.1.0 ENV DEBIAN_FRONTEND=noninteractive ENV LANG=C.UTF-8 diff --git a/docker/1.7-1/final/Dockerfile.cpu b/docker/2.1.0/final/Dockerfile.cpu similarity index 76% rename from docker/1.7-1/final/Dockerfile.cpu rename to docker/2.1.0/final/Dockerfile.cpu index 12f26f23..6610bef9 100644 --- a/docker/1.7-1/final/Dockerfile.cpu +++ b/docker/2.1.0/final/Dockerfile.cpu @@ -1,7 +1,7 @@ -ARG SAGEMAKER_XGBOOST_VERSION=1.7-1 -ARG PYTHON_VERSION=3.9 +ARG SAGEMAKER_XGBOOST_VERSION=2.1.0 +ARG PYTHON_VERSION=3.10 -FROM xgboost-container-base:${SAGEMAKER_XGBOOST_VERSION}-cpu-py3 +FROM xgboost-container-base:${SAGEMAKER_XGBOOST_VERSION} ARG SAGEMAKER_XGBOOST_VERSION @@ -11,6 +11,14 @@ ARG SAGEMAKER_XGBOOST_VERSION COPY requirements.txt /requirements.txt RUN python3 -m pip install -r /requirements.txt && rm /requirements.txt +# Fix Python 3.10 compatibility for sagemaker-containers +RUN python3 -c "import sys; sys.path.insert(0, '/miniconda3/lib/python3.10/site-packages'); \ + import sagemaker_containers._mapping as m; \ + import collections.abc; \ + setattr(collections, 'Mapping', collections.abc.Mapping); \ + exec(open('/miniconda3/lib/python3.10/site-packages/sagemaker_containers/_mapping.py').read().replace('collections.Mapping', 'collections.abc.Mapping'))" || \ + sed -i 's/collections\.Mapping/collections.abc.Mapping/g' /miniconda3/lib/python3.10/site-packages/sagemaker_containers/_mapping.py + # Install smdebug from source RUN python3 -m pip install git+https://github.com/awslabs/sagemaker-debugger.git@1.0.29 @@ -20,7 +28,8 @@ RUN python3 -m pip install git+https://github.com/awslabs/sagemaker-debugger.git ########################### COPY dist/sagemaker_xgboost_container-2.0-py2.py3-none-any.whl /sagemaker_xgboost_container-1.0-py2.py3-none-any.whl RUN rm -rf /miniconda3/lib/python${PYTHON_VERSION}/site-packages/numpy-1.21.2.dist-info && \ - python3 -m pip install --no-cache /sagemaker_xgboost_container-1.0-py2.py3-none-any.whl && \ + python3 -m pip install --force-reinstall PyYAML==6.0.1 && \ + python3 -m pip install --no-cache --no-deps /sagemaker_xgboost_container-1.0-py2.py3-none-any.whl && \ python3 -m pip uninstall -y typing && \ rm /sagemaker_xgboost_container-1.0-py2.py3-none-any.whl diff --git a/docker/1.7-1/resources/mms/ExecutionParameters.java b/docker/2.1.0/resources/mms/ExecutionParameters.java similarity index 100% rename from docker/1.7-1/resources/mms/ExecutionParameters.java rename to docker/2.1.0/resources/mms/ExecutionParameters.java diff --git a/docker/1.7-1/resources/mms/config.properties.tmp b/docker/2.1.0/resources/mms/config.properties.tmp similarity index 100% rename from docker/1.7-1/resources/mms/config.properties.tmp rename to docker/2.1.0/resources/mms/config.properties.tmp diff --git a/docker/1.7-1/resources/mms/endpoints-1.0.jar b/docker/2.1.0/resources/mms/endpoints-1.0.jar similarity index 100% rename from docker/1.7-1/resources/mms/endpoints-1.0.jar rename to docker/2.1.0/resources/mms/endpoints-1.0.jar diff --git a/requirements.txt b/requirements.txt index ee32a3e3..b0be0f49 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ Flask==1.1.1 # sagemaker-containers requires flask 1.1.1 -PyYAML==5.4.1 +PyYAML==6.0.1 Pillow==9.1.1 boto3==1.17.52 botocore==1.20.52 @@ -10,8 +10,8 @@ gunicorn==23.0.0 itsdangerous==2.0.1 matplotlib==3.4.1 multi-model-server==1.1.2 -numpy==1.24.1 -pandas==1.2.4 +numpy==2.0.0 +pandas==1.5.0 protobuf==3.20.1 psutil==5.6.7 # sagemaker-containers requires psutil 5.6.7 pynvml==11.4.1 @@ -20,8 +20,8 @@ retrying==1.3.3 requests==2.29.0 sagemaker-containers==2.8.6.post2 sagemaker-inference==1.5.5 -scikit-learn==0.24.1 -scipy==1.8.0 +scikit-learn==1.4.2 +scipy==1.10.0 urllib3==1.26.5 wheel==0.36.2 jinja2==2.11.3 diff --git a/test/resources/versions/train.py b/test/resources/versions/train.py index 9df48c65..3a1afcc8 100644 --- a/test/resources/versions/train.py +++ b/test/resources/versions/train.py @@ -15,16 +15,16 @@ gunicorn==23.0.0 matplotlib==3.4.1 multi-model-server==1.1.2 -numpy==1.24.1 -pandas==1.2.4 +numpy==2.0.0 +pandas==1.5.0 psutil==5.6.7 pyarrow==14.0.1 python-dateutil==2.8.1 retrying==1.3.3 sagemaker-containers==2.8.6.post2 sagemaker-inference==1.5.5 -scikit-learn==0.24.1 -scipy==1.8.0 +scikit-learn==1.4.2 +scipy==1.10.0 smdebug==1.0.29 urllib3==1.26.5 wheel==0.36.2 diff --git a/tox.ini b/tox.ini index 51a849e2..6f6953f2 100644 --- a/tox.ini +++ b/tox.ini @@ -15,6 +15,7 @@ deps = xgboost1.3: xgboost==1.3.3 xgboost1.5: xgboost==1.5.2 xgboost1.7: xgboost==1.7.4 + xgboost2.10: xgboost==2.1.0 xgboostlatest: xgboost -r{toxinidir}/requirements.txt -r{toxinidir}/test-requirements.txt From a24751161163132dca4736878cb000ce4bdd93ca Mon Sep 17 00:00:00 2001 From: Li Ning Date: Thu, 11 Sep 2025 17:19:15 -0700 Subject: [PATCH 002/157] update setuptool --- pyproject.toml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 5d7bf33d..dbdc05e7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,2 +1,6 @@ [tool.isort] profile = "black" + +[build-system] +requires = ["setuptools>=61.0"] +build-backend = "setuptools.build_meta" From b9ce4bb94f573047ba4209734b9ac979fb5c0367 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Thu, 11 Sep 2025 17:56:04 -0700 Subject: [PATCH 003/157] fix scikit-learn 1.4.2 error --- requirements.txt | 6 +++--- test/resources/versions/train.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/requirements.txt b/requirements.txt index b0be0f49..7a2c5bbd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,8 +20,8 @@ retrying==1.3.3 requests==2.29.0 sagemaker-containers==2.8.6.post2 sagemaker-inference==1.5.5 -scikit-learn==1.4.2 -scipy==1.10.0 +scikit-learn==1.5.0 +scipy==1.13.0 urllib3==1.26.5 wheel==0.36.2 jinja2==2.11.3 @@ -29,4 +29,4 @@ MarkupSafe==1.1.1 Werkzeug==0.15.6 certifi==2023.7.22 gevent==23.9.1 -numba==0.58.1 \ No newline at end of file +numba==0.58.1 diff --git a/test/resources/versions/train.py b/test/resources/versions/train.py index 3a1afcc8..77d04a29 100644 --- a/test/resources/versions/train.py +++ b/test/resources/versions/train.py @@ -23,8 +23,8 @@ retrying==1.3.3 sagemaker-containers==2.8.6.post2 sagemaker-inference==1.5.5 -scikit-learn==1.4.2 -scipy==1.10.0 +scikit-learn==1.5.0 +scipy==1.13.0 smdebug==1.0.29 urllib3==1.26.5 wheel==0.36.2 From a78e4162d0c67b99e4e28fae42c71372de2a9ec0 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Thu, 11 Sep 2025 18:48:49 -0700 Subject: [PATCH 004/157] test scipy 1.13.0 --- requirements.txt | 2 +- test/resources/versions/train.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 7a2c5bbd..104c1ef4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,7 +20,7 @@ retrying==1.3.3 requests==2.29.0 sagemaker-containers==2.8.6.post2 sagemaker-inference==1.5.5 -scikit-learn==1.5.0 +scikit-learn==1.4.2 scipy==1.13.0 urllib3==1.26.5 wheel==0.36.2 diff --git a/test/resources/versions/train.py b/test/resources/versions/train.py index 77d04a29..6e70b322 100644 --- a/test/resources/versions/train.py +++ b/test/resources/versions/train.py @@ -23,7 +23,7 @@ retrying==1.3.3 sagemaker-containers==2.8.6.post2 sagemaker-inference==1.5.5 -scikit-learn==1.5.0 +scikit-learn==1.4.2 scipy==1.13.0 smdebug==1.0.29 urllib3==1.26.5 From de01800fdbb649564871f5baf88bf93b5bcafd7c Mon Sep 17 00:00:00 2001 From: Li Ning Date: Thu, 11 Sep 2025 19:26:35 -0700 Subject: [PATCH 005/157] fix typo --- tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index 6f6953f2..f7bd11e2 100644 --- a/tox.ini +++ b/tox.ini @@ -15,7 +15,7 @@ deps = xgboost1.3: xgboost==1.3.3 xgboost1.5: xgboost==1.5.2 xgboost1.7: xgboost==1.7.4 - xgboost2.10: xgboost==2.1.0 + xgboost2.1.0: xgboost==2.1.0 xgboostlatest: xgboost -r{toxinidir}/requirements.txt -r{toxinidir}/test-requirements.txt From d348ed3468563421a8aeee0a4e195bd596b5ebf9 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Thu, 11 Sep 2025 19:35:03 -0700 Subject: [PATCH 006/157] test scipy 1.10.0 --- requirements.txt | 2 +- test/resources/versions/train.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 104c1ef4..2a4ce739 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,8 +20,8 @@ retrying==1.3.3 requests==2.29.0 sagemaker-containers==2.8.6.post2 sagemaker-inference==1.5.5 +scipy==1.10.0 scikit-learn==1.4.2 -scipy==1.13.0 urllib3==1.26.5 wheel==0.36.2 jinja2==2.11.3 diff --git a/test/resources/versions/train.py b/test/resources/versions/train.py index 6e70b322..ea6ce40a 100644 --- a/test/resources/versions/train.py +++ b/test/resources/versions/train.py @@ -23,8 +23,8 @@ retrying==1.3.3 sagemaker-containers==2.8.6.post2 sagemaker-inference==1.5.5 +scipy==1.10.0 scikit-learn==1.4.2 -scipy==1.13.0 smdebug==1.0.29 urllib3==1.26.5 wheel==0.36.2 From 9c9f24e8b65419d4981fc2dbd52b7d899eb7565c Mon Sep 17 00:00:00 2001 From: Li Ning Date: Thu, 11 Sep 2025 20:02:40 -0700 Subject: [PATCH 007/157] try numpy 2.1.0 --- requirements.txt | 6 +++--- test/resources/versions/train.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/requirements.txt b/requirements.txt index 2a4ce739..687248cd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,7 @@ gunicorn==23.0.0 itsdangerous==2.0.1 matplotlib==3.4.1 multi-model-server==1.1.2 -numpy==2.0.0 +numpy==2.1.0 pandas==1.5.0 protobuf==3.20.1 psutil==5.6.7 # sagemaker-containers requires psutil 5.6.7 @@ -20,8 +20,8 @@ retrying==1.3.3 requests==2.29.0 sagemaker-containers==2.8.6.post2 sagemaker-inference==1.5.5 -scipy==1.10.0 -scikit-learn==1.4.2 +scipy==1.15.0 +scikit-learn==1.5.2 urllib3==1.26.5 wheel==0.36.2 jinja2==2.11.3 diff --git a/test/resources/versions/train.py b/test/resources/versions/train.py index ea6ce40a..ce918d0a 100644 --- a/test/resources/versions/train.py +++ b/test/resources/versions/train.py @@ -15,7 +15,7 @@ gunicorn==23.0.0 matplotlib==3.4.1 multi-model-server==1.1.2 -numpy==2.0.0 +numpy==2.1.0 pandas==1.5.0 psutil==5.6.7 pyarrow==14.0.1 @@ -23,8 +23,8 @@ retrying==1.3.3 sagemaker-containers==2.8.6.post2 sagemaker-inference==1.5.5 -scipy==1.10.0 -scikit-learn==1.4.2 +scipy==1.15.0 +scikit-learn==1.5.2 smdebug==1.0.29 urllib3==1.26.5 wheel==0.36.2 From 13e3f1d0ef77f76ee80f2ebe8fc9437ba1f9aaf1 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Thu, 11 Sep 2025 20:37:41 -0700 Subject: [PATCH 008/157] try numpy 2.1.0 --- requirements.txt | 2 +- test/resources/versions/train.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 687248cd..e4ddd063 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,7 +23,7 @@ sagemaker-inference==1.5.5 scipy==1.15.0 scikit-learn==1.5.2 urllib3==1.26.5 -wheel==0.36.2 +wheel==0.45.1 jinja2==2.11.3 MarkupSafe==1.1.1 Werkzeug==0.15.6 diff --git a/test/resources/versions/train.py b/test/resources/versions/train.py index ce918d0a..8281560d 100644 --- a/test/resources/versions/train.py +++ b/test/resources/versions/train.py @@ -27,7 +27,7 @@ scikit-learn==1.5.2 smdebug==1.0.29 urllib3==1.26.5 -wheel==0.36.2 +wheel==0.45.1 jinja2==2.11.3 MarkupSafe==1.1.1 Werkzeug==0.15.6 From 22f1b153987a8328b52c5a6c70e54e994c4558ef Mon Sep 17 00:00:00 2001 From: Li Ning Date: Thu, 11 Sep 2025 21:36:46 -0700 Subject: [PATCH 009/157] set numba 0.61.0 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index e4ddd063..578f9808 100644 --- a/requirements.txt +++ b/requirements.txt @@ -29,4 +29,4 @@ MarkupSafe==1.1.1 Werkzeug==0.15.6 certifi==2023.7.22 gevent==23.9.1 -numba==0.58.1 +numba==0.61.0 From e4acfd1b1dbd7a47ac2256759909334c74042ac1 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Thu, 11 Sep 2025 22:16:22 -0700 Subject: [PATCH 010/157] set pyyaml 5.4.1 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 578f9808..513b9aba 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ Flask==1.1.1 # sagemaker-containers requires flask 1.1.1 -PyYAML==6.0.1 +PyYAML==5.4.1 Pillow==9.1.1 boto3==1.17.52 botocore==1.20.52 From b46448d41177e7681f7c92da162da4a2c4e50168 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Thu, 11 Sep 2025 22:43:19 -0700 Subject: [PATCH 011/157] set pyyaml 6.0.1 --- requirements.txt | 2 +- test-requirements.txt | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 513b9aba..578f9808 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ Flask==1.1.1 # sagemaker-containers requires flask 1.1.1 -PyYAML==5.4.1 +PyYAML==6.0.1 Pillow==9.1.1 boto3==1.17.52 botocore==1.20.52 diff --git a/test-requirements.txt b/test-requirements.txt index 8e354727..e26f6891 100755 --- a/test-requirements.txt +++ b/test-requirements.txt @@ -2,7 +2,6 @@ Flask==1.1.1 # sagemaker-containers requires flask 1.1.1 black coverage docker==6.1.3 # docker 7.0.0 has a breaking change: https://github.com/docker/docker-py/issues/3194#issuecomment-1848950456 -docker-compose flake8 isort mock From 9f66b733c4c4fba02ce2204093589577c7358414 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Thu, 11 Sep 2025 22:54:52 -0700 Subject: [PATCH 012/157] set cryptography 45.0.5 --- requirements.txt | 2 +- test/resources/versions/train.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 578f9808..bdccb465 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ PyYAML==6.0.1 Pillow==9.1.1 boto3==1.17.52 botocore==1.20.52 -cryptography==39.0.1 +cryptography==45.0.5 dask==2022.11.1 dask-cuda==22.12.0 gunicorn==23.0.0 diff --git a/test/resources/versions/train.py b/test/resources/versions/train.py index 8281560d..64e53790 100644 --- a/test/resources/versions/train.py +++ b/test/resources/versions/train.py @@ -11,7 +11,7 @@ boto3==1.17.52 botocore==1.20.52 conda==24.7.1 -cryptography==39.0.1 +cryptography==45.0.5 gunicorn==23.0.0 matplotlib==3.4.1 multi-model-server==1.1.2 From 3a884039a6743ce1bd7a02e8e254b2cd0b43fbaf Mon Sep 17 00:00:00 2001 From: Li Ning Date: Fri, 12 Sep 2025 00:13:54 -0700 Subject: [PATCH 013/157] set requests 2.32.3 --- docker/2.1.0/base/Dockerfile.cpu | 2 +- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/2.1.0/base/Dockerfile.cpu b/docker/2.1.0/base/Dockerfile.cpu index 69ea547c..16f94b6a 100644 --- a/docker/2.1.0/base/Dockerfile.cpu +++ b/docker/2.1.0/base/Dockerfile.cpu @@ -137,7 +137,7 @@ RUN echo "conda ${CONDA_PKG_VERSION}" >> /miniconda3/conda-meta/pinned && \ conda config --system --set show_channel_urls true && \ echo "python ${PYTHON_VERSION}.*" >> /miniconda3/conda-meta/pinned && \ conda install -c conda-forge python=${PYTHON_VERSION} --solver classic && \ - pip install requests==2.27.0 && \ + pip install requests==2.32.3 && \ conda install conda=${CONDA_PKG_VERSION} --solver classic && \ conda update -y conda && \ conda install -c conda-forge pyarrow=${PYARROW_VERSION} --solver classic && \ diff --git a/requirements.txt b/requirements.txt index bdccb465..d7d00550 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,7 +17,7 @@ psutil==5.6.7 # sagemaker-containers requires psutil 5.6.7 pynvml==11.4.1 python-dateutil==2.8.1 retrying==1.3.3 -requests==2.29.0 +requests==2.32.3 sagemaker-containers==2.8.6.post2 sagemaker-inference==1.5.5 scipy==1.15.0 From eda94ea822304b2275902118ff6b4c733095512e Mon Sep 17 00:00:00 2001 From: Li Ning Date: Fri, 12 Sep 2025 09:01:46 -0700 Subject: [PATCH 014/157] fix image name --- docker/2.1.0/final/Dockerfile.cpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/2.1.0/final/Dockerfile.cpu b/docker/2.1.0/final/Dockerfile.cpu index 6610bef9..cb383932 100644 --- a/docker/2.1.0/final/Dockerfile.cpu +++ b/docker/2.1.0/final/Dockerfile.cpu @@ -1,7 +1,7 @@ ARG SAGEMAKER_XGBOOST_VERSION=2.1.0 ARG PYTHON_VERSION=3.10 -FROM xgboost-container-base:${SAGEMAKER_XGBOOST_VERSION} +FROM xgboost-container-base:${SAGEMAKER_XGBOOST_VERSION}-cpu-py3 ARG SAGEMAKER_XGBOOST_VERSION From d567cf6a70ae67252fd3186392df41f33368d3c7 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Fri, 12 Sep 2025 10:22:08 -0700 Subject: [PATCH 015/157] set panda 2.2.0 --- requirements.txt | 2 +- test/resources/versions/train.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index d7d00550..53e0b396 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,7 +11,7 @@ itsdangerous==2.0.1 matplotlib==3.4.1 multi-model-server==1.1.2 numpy==2.1.0 -pandas==1.5.0 +pandas==2.2.0 protobuf==3.20.1 psutil==5.6.7 # sagemaker-containers requires psutil 5.6.7 pynvml==11.4.1 diff --git a/test/resources/versions/train.py b/test/resources/versions/train.py index 64e53790..ba43a256 100644 --- a/test/resources/versions/train.py +++ b/test/resources/versions/train.py @@ -16,7 +16,7 @@ matplotlib==3.4.1 multi-model-server==1.1.2 numpy==2.1.0 -pandas==1.5.0 +pandas==2.2.0 psutil==5.6.7 pyarrow==14.0.1 python-dateutil==2.8.1 From f5677b6fc9c9b3aac8aa65c4af08e4b11f7fe55d Mon Sep 17 00:00:00 2001 From: Li Ning Date: Fri, 12 Sep 2025 10:36:19 -0700 Subject: [PATCH 016/157] set panda 2.2.3 --- requirements.txt | 2 +- test/resources/versions/train.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 53e0b396..e08f6db4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,7 +11,7 @@ itsdangerous==2.0.1 matplotlib==3.4.1 multi-model-server==1.1.2 numpy==2.1.0 -pandas==2.2.0 +pandas==2.2.3 protobuf==3.20.1 psutil==5.6.7 # sagemaker-containers requires psutil 5.6.7 pynvml==11.4.1 diff --git a/test/resources/versions/train.py b/test/resources/versions/train.py index ba43a256..44eeea05 100644 --- a/test/resources/versions/train.py +++ b/test/resources/versions/train.py @@ -16,7 +16,7 @@ matplotlib==3.4.1 multi-model-server==1.1.2 numpy==2.1.0 -pandas==2.2.0 +pandas==2.2.3 psutil==5.6.7 pyarrow==14.0.1 python-dateutil==2.8.1 From 85b7540cc6641292a4f95a9bf636118540f3e2e2 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Fri, 12 Sep 2025 10:52:48 -0700 Subject: [PATCH 017/157] set python-dateutil==2.8.2 --- requirements.txt | 2 +- test/resources/versions/train.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index e08f6db4..25df5e86 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,7 +15,7 @@ pandas==2.2.3 protobuf==3.20.1 psutil==5.6.7 # sagemaker-containers requires psutil 5.6.7 pynvml==11.4.1 -python-dateutil==2.8.1 +python-dateutil==2.8.2 retrying==1.3.3 requests==2.32.3 sagemaker-containers==2.8.6.post2 diff --git a/test/resources/versions/train.py b/test/resources/versions/train.py index 44eeea05..1aa761e2 100644 --- a/test/resources/versions/train.py +++ b/test/resources/versions/train.py @@ -19,7 +19,7 @@ pandas==2.2.3 psutil==5.6.7 pyarrow==14.0.1 -python-dateutil==2.8.1 +python-dateutil==2.8.2 retrying==1.3.3 sagemaker-containers==2.8.6.post2 sagemaker-inference==1.5.5 From d949999725b0c7229fb7fa4d0b96cb8c369362fa Mon Sep 17 00:00:00 2001 From: Li Ning Date: Fri, 12 Sep 2025 12:12:53 -0700 Subject: [PATCH 018/157] set pyarrow 17.0.0 --- docker/2.1.0/base/Dockerfile.cpu | 4 ++-- test/resources/versions/train.py | 2 +- tox.ini | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docker/2.1.0/base/Dockerfile.cpu b/docker/2.1.0/base/Dockerfile.cpu index 16f94b6a..ddf983e7 100644 --- a/docker/2.1.0/base/Dockerfile.cpu +++ b/docker/2.1.0/base/Dockerfile.cpu @@ -30,7 +30,7 @@ ARG CONDA_CHECKSUM=684cda724bc37e3bbbb342e440fc4cac515c92e91a489eb4359feca353828 ARG CONDA_PY_VERSION=310 ARG CONDA_PKG_VERSION=24.7.1 ARG PYTHON_VERSION=3.10 -ARG PYARROW_VERSION=14.0.1 +ARG PYARROW_VERSION=17.0.0 ARG MLIO_VERSION=0.9.0 ARG XGBOOST_VERSION=2.1.0 @@ -194,4 +194,4 @@ RUN sqlite3 --version RUN apt list --installed # Install latest version of XGBoost -RUN python3 -m pip install --no-cache -I xgboost==${XGBOOST_VERSION} +RUN python3 -m pip install --no-cache -I xgboost==${XGBOOST_VERSION} numpy==2.1.0 diff --git a/test/resources/versions/train.py b/test/resources/versions/train.py index 1aa761e2..4e184738 100644 --- a/test/resources/versions/train.py +++ b/test/resources/versions/train.py @@ -18,7 +18,7 @@ numpy==2.1.0 pandas==2.2.3 psutil==5.6.7 -pyarrow==14.0.1 +pyarrow==17.0.0 python-dateutil==2.8.2 retrying==1.3.3 sagemaker-containers==2.8.6.post2 diff --git a/tox.ini b/tox.ini index f7bd11e2..0d69a405 100644 --- a/tox.ini +++ b/tox.ini @@ -20,7 +20,7 @@ deps = -r{toxinidir}/requirements.txt -r{toxinidir}/test-requirements.txt conda_deps= - pyarrow==14.0.1 + pyarrow==17.0.0 tbb==2020.2 mlio-py==0.9.0 conda_channels= From 4522abffd07ee8f05c466188ee30e4f97959d343 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Fri, 12 Sep 2025 12:22:03 -0700 Subject: [PATCH 019/157] replace rabit with dask-based api --- src/sagemaker_xgboost_container/checkpointing.py | 2 +- src/sagemaker_xgboost_container/distributed.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sagemaker_xgboost_container/checkpointing.py b/src/sagemaker_xgboost_container/checkpointing.py index a9f3a664..2990a679 100644 --- a/src/sagemaker_xgboost_container/checkpointing.py +++ b/src/sagemaker_xgboost_container/checkpointing.py @@ -7,7 +7,7 @@ import xgboost as xgb from typing import Optional -from xgboost import rabit +from xgboost.dask import DaskDMatrix, train from xgboost.callback import EvaluationMonitor from xgboost.core import XGBoostError diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index 91e2e241..ce9e6581 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -21,7 +21,7 @@ import time from retrying import retry -from xgboost import rabit +from xgboost.dask import DaskDMatrix, train # This should point to xgb when the tracker is updated upstream from sagemaker_xgboost_container.dmlc_patch import tracker From a3d19d227fe2f4430f3fe049ff79303ebc64c7a0 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Fri, 12 Sep 2025 13:29:13 -0700 Subject: [PATCH 020/157] set protobuf 5.26 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 25df5e86..a5aad3e8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,7 +12,7 @@ matplotlib==3.4.1 multi-model-server==1.1.2 numpy==2.1.0 pandas==2.2.3 -protobuf==3.20.1 +protobuf==5.26 psutil==5.6.7 # sagemaker-containers requires psutil 5.6.7 pynvml==11.4.1 python-dateutil==2.8.2 From f9de2dd61424df69a2f722d641a3ea84376d97fd Mon Sep 17 00:00:00 2001 From: Li Ning Date: Fri, 12 Sep 2025 14:27:00 -0700 Subject: [PATCH 021/157] install pyarrow in container --- docker/2.1.0/base/Dockerfile.cpu | 2 +- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/2.1.0/base/Dockerfile.cpu b/docker/2.1.0/base/Dockerfile.cpu index ddf983e7..74ff1a06 100644 --- a/docker/2.1.0/base/Dockerfile.cpu +++ b/docker/2.1.0/base/Dockerfile.cpu @@ -194,4 +194,4 @@ RUN sqlite3 --version RUN apt list --installed # Install latest version of XGBoost -RUN python3 -m pip install --no-cache -I xgboost==${XGBOOST_VERSION} numpy==2.1.0 +RUN python3 -m pip install --no-cache -I xgboost==${XGBOOST_VERSION} numpy==2.1.0 pyarrow==17.0.0 pandas==2.2.3 protobuf==5.26.0 diff --git a/requirements.txt b/requirements.txt index a5aad3e8..5ec78df1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,7 +12,7 @@ matplotlib==3.4.1 multi-model-server==1.1.2 numpy==2.1.0 pandas==2.2.3 -protobuf==5.26 +protobuf==5.26.0 psutil==5.6.7 # sagemaker-containers requires psutil 5.6.7 pynvml==11.4.1 python-dateutil==2.8.2 From c56031f5606973717ca3294abea5df5c774c0ced Mon Sep 17 00:00:00 2001 From: Li Ning Date: Fri, 12 Sep 2025 15:26:42 -0700 Subject: [PATCH 022/157] set tbb 2022.2.0 --- tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index 0d69a405..18fbb683 100644 --- a/tox.ini +++ b/tox.ini @@ -21,7 +21,7 @@ deps = -r{toxinidir}/test-requirements.txt conda_deps= pyarrow==17.0.0 - tbb==2020.2 + tbb==2022.2.0 mlio-py==0.9.0 conda_channels= conda-forge From edab2d8ba9e1082544aa3fa49ac8886f03d1edbe Mon Sep 17 00:00:00 2001 From: Li Ning Date: Fri, 12 Sep 2025 17:05:10 -0700 Subject: [PATCH 023/157] try mlio-py with pyarrow 17.0.0 --- docker/2.1.0/base/Dockerfile.cpu | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/2.1.0/base/Dockerfile.cpu b/docker/2.1.0/base/Dockerfile.cpu index 74ff1a06..5155c44b 100644 --- a/docker/2.1.0/base/Dockerfile.cpu +++ b/docker/2.1.0/base/Dockerfile.cpu @@ -156,6 +156,7 @@ RUN echo "conda ${CONDA_PKG_VERSION}" >> /miniconda3/conda-meta/pinned && \ cd /tmp && \ git clone --branch v${MLIO_VERSION} https://github.com/awslabs/ml-io.git mlio && \ cd mlio && \ + sed -i 's/find_package(Arrow 14.0.1 REQUIRED)/find_package(Arrow 17 REQUIRED)/' CMakeLists.txt \ build-tools/build-dependency build/third-party all && \ mkdir -p build/release && \ cd build/release && \ From 077edc3f3f1fbe6dde64fb5f7686cbd80d814b31 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Fri, 12 Sep 2025 18:22:52 -0700 Subject: [PATCH 024/157] try mlio-py with pyarrow 17.0.0 --- docker/2.1.0/base/Dockerfile.cpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/2.1.0/base/Dockerfile.cpu b/docker/2.1.0/base/Dockerfile.cpu index 5155c44b..0ffbee52 100644 --- a/docker/2.1.0/base/Dockerfile.cpu +++ b/docker/2.1.0/base/Dockerfile.cpu @@ -156,7 +156,7 @@ RUN echo "conda ${CONDA_PKG_VERSION}" >> /miniconda3/conda-meta/pinned && \ cd /tmp && \ git clone --branch v${MLIO_VERSION} https://github.com/awslabs/ml-io.git mlio && \ cd mlio && \ - sed -i 's/find_package(Arrow 14.0.1 REQUIRED)/find_package(Arrow 17 REQUIRED)/' CMakeLists.txt \ + sed -i 's/find_package(Arrow 14.0.1 REQUIRED)/find_package(Arrow 17 REQUIRED)/' CMakeLists.txt && \ build-tools/build-dependency build/third-party all && \ mkdir -p build/release && \ cd build/release && \ From 5d07aa2932c0cf251794b16626e33e939d1f898f Mon Sep 17 00:00:00 2001 From: Li Ning Date: Fri, 12 Sep 2025 20:52:02 -0700 Subject: [PATCH 025/157] try install mlio --- docker/2.1.0/base/Dockerfile.cpu | 48 +------------------------------- 1 file changed, 1 insertion(+), 47 deletions(-) diff --git a/docker/2.1.0/base/Dockerfile.cpu b/docker/2.1.0/base/Dockerfile.cpu index 0ffbee52..9d9edc55 100644 --- a/docker/2.1.0/base/Dockerfile.cpu +++ b/docker/2.1.0/base/Dockerfile.cpu @@ -128,53 +128,7 @@ RUN cd /tmp && \ ENV PATH=/miniconda3/bin:${PATH} # Install MLIO with Apache Arrow integration - -# We could install mlio-py from conda, but it comes with extra support such as image reader that increases image size -# which increases training time. We build from source to minimize the image size. -RUN echo "conda ${CONDA_PKG_VERSION}" >> /miniconda3/conda-meta/pinned && \ - # Conda configuration see https://conda.io/projects/conda/en/latest/configuration.html - conda config --system --set auto_update_conda false && \ - conda config --system --set show_channel_urls true && \ - echo "python ${PYTHON_VERSION}.*" >> /miniconda3/conda-meta/pinned && \ - conda install -c conda-forge python=${PYTHON_VERSION} --solver classic && \ - pip install requests==2.32.3 && \ - conda install conda=${CONDA_PKG_VERSION} --solver classic && \ - conda update -y conda && \ - conda install -c conda-forge pyarrow=${PYARROW_VERSION} --solver classic && \ - cd /miniconda3/pkgs/libgrpc-*/info/test/examples/node && \ - npm install minimist@latest protobufjs@latest && \ - # Remove Node.js, npm, and their dependencies - apt-get purge -y nodejs npm && \ - apt-get autoremove -y && \ - # Final cleanup - rm -rf /etc/apt/sources.list.d/nodesource.list \ - /etc/apt/keyrings/nodesource.gpg \ - /etc/apt/sources.list.d/kitware.list && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* && \ - # Continue with the rest of the build process - cd /tmp && \ - git clone --branch v${MLIO_VERSION} https://github.com/awslabs/ml-io.git mlio && \ - cd mlio && \ - sed -i 's/find_package(Arrow 14.0.1 REQUIRED)/find_package(Arrow 17 REQUIRED)/' CMakeLists.txt && \ - build-tools/build-dependency build/third-party all && \ - mkdir -p build/release && \ - cd build/release && \ - cmake -GNinja -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_PREFIX_PATH="$(pwd)/../third-party" ../.. && \ - cmake --build . && \ - cmake --build . --target install && \ - cmake -DMLIO_INCLUDE_PYTHON_EXTENSION=ON -DPYTHON_EXECUTABLE="/miniconda3/bin/python3" \ - -DMLIO_INCLUDE_ARROW_INTEGRATION=ON ../.. && \ - cmake --build . --target mlio-py && \ - cmake --build . --target mlio-arrow && \ - cd ../../src/mlio-py && \ - python3 setup.py bdist_wheel && \ - python3 -m pip install typing && \ - python3 -m pip install --upgrade pip && \ - python3 -m pip install dist/*.whl && \ - cp -r /tmp/mlio/build/third-party/lib/libtbb* /usr/local/lib/ && \ - ldconfig && \ - rm -rf /tmp/mlio +conda install -c mlio -c conda-forge mlio-py # Copy compiled SQLite from builder stage COPY --from=sqlite-builder /usr/local/bin/sqlite3 /usr/local/bin/sqlite3 From e13d8f4bedea9f2812fbd2542f777a4fd22e9b49 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Fri, 12 Sep 2025 21:05:22 -0700 Subject: [PATCH 026/157] try install mlio --- docker/2.1.0/base/Dockerfile.cpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/2.1.0/base/Dockerfile.cpu b/docker/2.1.0/base/Dockerfile.cpu index 9d9edc55..398f8669 100644 --- a/docker/2.1.0/base/Dockerfile.cpu +++ b/docker/2.1.0/base/Dockerfile.cpu @@ -128,7 +128,7 @@ RUN cd /tmp && \ ENV PATH=/miniconda3/bin:${PATH} # Install MLIO with Apache Arrow integration -conda install -c mlio -c conda-forge mlio-py +RUN conda install -c mlio -c conda-forge mlio-py # Copy compiled SQLite from builder stage COPY --from=sqlite-builder /usr/local/bin/sqlite3 /usr/local/bin/sqlite3 From ff431f58259d061231b2fbb09432d74b7353d41a Mon Sep 17 00:00:00 2001 From: Li Ning Date: Fri, 12 Sep 2025 22:04:10 -0700 Subject: [PATCH 027/157] hack mlio --- docker/2.1.0/base/Dockerfile.cpu | 49 +++++++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/docker/2.1.0/base/Dockerfile.cpu b/docker/2.1.0/base/Dockerfile.cpu index 398f8669..5cc8abdf 100644 --- a/docker/2.1.0/base/Dockerfile.cpu +++ b/docker/2.1.0/base/Dockerfile.cpu @@ -128,7 +128,54 @@ RUN cd /tmp && \ ENV PATH=/miniconda3/bin:${PATH} # Install MLIO with Apache Arrow integration -RUN conda install -c mlio -c conda-forge mlio-py + +# We could install mlio-py from conda, but it comes with extra support such as image reader that increases image size +# which increases training time. We build from source to minimize the image size. +RUN echo "conda ${CONDA_PKG_VERSION}" >> /miniconda3/conda-meta/pinned && \ + # Conda configuration see https://conda.io/projects/conda/en/latest/configuration.html + conda config --system --set auto_update_conda false && \ + conda config --system --set show_channel_urls true && \ + echo "python ${PYTHON_VERSION}.*" >> /miniconda3/conda-meta/pinned && \ + conda install -c conda-forge python=${PYTHON_VERSION} --solver classic && \ + pip install requests==2.32.3 && \ + conda install conda=${CONDA_PKG_VERSION} --solver classic && \ + conda update -y conda && \ + conda install -c conda-forge pyarrow=${PYARROW_VERSION} --solver classic && \ + cd /miniconda3/pkgs/libgrpc-*/info/test/examples/node && \ + npm install minimist@latest protobufjs@latest && \ + # Remove Node.js, npm, and their dependencies + apt-get purge -y nodejs npm && \ + apt-get autoremove -y && \ + # Final cleanup + rm -rf /etc/apt/sources.list.d/nodesource.list \ + /etc/apt/keyrings/nodesource.gpg \ + /etc/apt/sources.list.d/kitware.list && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* && \ + # Continue with the rest of the build process + cd /tmp && \ + git clone --branch v${MLIO_VERSION} https://github.com/awslabs/ml-io.git mlio && \ + cd mlio && \ + sed -i 's/find_package(Arrow 14.0.1 REQUIRED)/find_package(Arrow 17.0.0 REQUIRED)/' CMakeLists.txt && \ + sed -i 's/pyarrow==14.0.1/pyarrow>=17.0.0/' src/mlio-py/setup.py && \ + build-tools/build-dependency build/third-party all && \ + mkdir -p build/release && \ + cd build/release && \ + cmake -GNinja -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_PREFIX_PATH="$(pwd)/../third-party" ../.. && \ + cmake --build . && \ + cmake --build . --target install && \ + cmake -DMLIO_INCLUDE_PYTHON_EXTENSION=ON -DPYTHON_EXECUTABLE="/miniconda3/bin/python3" \ + -DMLIO_INCLUDE_ARROW_INTEGRATION=ON ../.. && \ + cmake --build . --target mlio-py && \ + cmake --build . --target mlio-arrow && \ + cd ../../src/mlio-py && \ + python3 setup.py bdist_wheel && \ + python3 -m pip install typing && \ + python3 -m pip install --upgrade pip && \ + python3 -m pip install dist/*.whl && \ + cp -r /tmp/mlio/build/third-party/lib/libtbb* /usr/local/lib/ && \ + ldconfig && \ + rm -rf /tmp/mlio # Copy compiled SQLite from builder stage COPY --from=sqlite-builder /usr/local/bin/sqlite3 /usr/local/bin/sqlite3 From 08ce40ef7afda4c823945c6d8a4217662d3b3a16 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Sat, 13 Sep 2025 00:23:23 -0700 Subject: [PATCH 028/157] hack mlio --- docker/2.1.0/base/Dockerfile.cpu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/2.1.0/base/Dockerfile.cpu b/docker/2.1.0/base/Dockerfile.cpu index 5cc8abdf..5609cd7e 100644 --- a/docker/2.1.0/base/Dockerfile.cpu +++ b/docker/2.1.0/base/Dockerfile.cpu @@ -156,8 +156,8 @@ RUN echo "conda ${CONDA_PKG_VERSION}" >> /miniconda3/conda-meta/pinned && \ cd /tmp && \ git clone --branch v${MLIO_VERSION} https://github.com/awslabs/ml-io.git mlio && \ cd mlio && \ - sed -i 's/find_package(Arrow 14.0.1 REQUIRED)/find_package(Arrow 17.0.0 REQUIRED)/' CMakeLists.txt && \ - sed -i 's/pyarrow==14.0.1/pyarrow>=17.0.0/' src/mlio-py/setup.py && \ + sed -i 's/find_package(Arrow 14.0.1 REQUIRED/find_package(Arrow 17.0.0 REQUIRED/g' CMakeLists.txt && \ + sed -i 's/pyarrow==14.0.1/pyarrow==17.0.0/g' src/mlio-py/setup.py && \ build-tools/build-dependency build/third-party all && \ mkdir -p build/release && \ cd build/release && \ From 20d4e6990073f67a56156e791b2e61aee119ff38 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Sat, 13 Sep 2025 09:29:08 -0700 Subject: [PATCH 029/157] try protobuf 3.20.1 --- docker/2.1.0/base/Dockerfile.cpu | 2 +- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/2.1.0/base/Dockerfile.cpu b/docker/2.1.0/base/Dockerfile.cpu index 5609cd7e..aae78389 100644 --- a/docker/2.1.0/base/Dockerfile.cpu +++ b/docker/2.1.0/base/Dockerfile.cpu @@ -196,4 +196,4 @@ RUN sqlite3 --version RUN apt list --installed # Install latest version of XGBoost -RUN python3 -m pip install --no-cache -I xgboost==${XGBOOST_VERSION} numpy==2.1.0 pyarrow==17.0.0 pandas==2.2.3 protobuf==5.26.0 +RUN python3 -m pip install --no-cache -I xgboost==${XGBOOST_VERSION} numpy==2.1.0 pyarrow==17.0.0 pandas==2.2.3 diff --git a/requirements.txt b/requirements.txt index 5ec78df1..25df5e86 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,7 +12,7 @@ matplotlib==3.4.1 multi-model-server==1.1.2 numpy==2.1.0 pandas==2.2.3 -protobuf==5.26.0 +protobuf==3.20.1 psutil==5.6.7 # sagemaker-containers requires psutil 5.6.7 pynvml==11.4.1 python-dateutil==2.8.2 From a21921463a647f0b2dab3a234c7f9d566d1aee4c Mon Sep 17 00:00:00 2001 From: Li Ning Date: Sat, 13 Sep 2025 13:01:02 -0700 Subject: [PATCH 030/157] set dask 2024.10.0 --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 25df5e86..0a2692c5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,8 +4,8 @@ Pillow==9.1.1 boto3==1.17.52 botocore==1.20.52 cryptography==45.0.5 -dask==2022.11.1 -dask-cuda==22.12.0 +dask==2024.10.0 +dask-cuda==24.10.0 gunicorn==23.0.0 itsdangerous==2.0.1 matplotlib==3.4.1 From 59a91aa0bd17034dc4441e20abc0116d051dd39f Mon Sep 17 00:00:00 2001 From: Li Ning Date: Sat, 13 Sep 2025 13:38:02 -0700 Subject: [PATCH 031/157] set dask 2024.9.0 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 0a2692c5..555f330f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ Pillow==9.1.1 boto3==1.17.52 botocore==1.20.52 cryptography==45.0.5 -dask==2024.10.0 +dask==2024.9.0 dask-cuda==24.10.0 gunicorn==23.0.0 itsdangerous==2.0.1 From 7324839061c13d49f1123d94842998d15a4512de Mon Sep 17 00:00:00 2001 From: Li Ning Date: Sat, 13 Sep 2025 14:12:39 -0700 Subject: [PATCH 032/157] set psutil 5.8.0 --- requirements.txt | 2 +- test/resources/versions/train.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 555f330f..36d449bb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,7 +13,7 @@ multi-model-server==1.1.2 numpy==2.1.0 pandas==2.2.3 protobuf==3.20.1 -psutil==5.6.7 # sagemaker-containers requires psutil 5.6.7 +psutil==5.8.0 # sagemaker-containers requires psutil 5.6.7 pynvml==11.4.1 python-dateutil==2.8.2 retrying==1.3.3 diff --git a/test/resources/versions/train.py b/test/resources/versions/train.py index 4e184738..48154032 100644 --- a/test/resources/versions/train.py +++ b/test/resources/versions/train.py @@ -17,7 +17,7 @@ multi-model-server==1.1.2 numpy==2.1.0 pandas==2.2.3 -psutil==5.6.7 +psutil==5.8.0 pyarrow==17.0.0 python-dateutil==2.8.2 retrying==1.3.3 From e759563af10f04bbf95e8d707b15acdf4e78b3d9 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Mon, 15 Sep 2025 15:42:56 -0700 Subject: [PATCH 033/157] update train test minor version --- test/resources/versions/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/resources/versions/train.py b/test/resources/versions/train.py index 48154032..2e50d5e6 100644 --- a/test/resources/versions/train.py +++ b/test/resources/versions/train.py @@ -3,7 +3,7 @@ import pkg_resources PYTHON_MAJOR_VERSION = 3 -PYTHON_MINOR_VERSION = 9 +PYTHON_MINOR_VERSION = 10 REQUIREMENTS = """\ Flask==1.1.1 Pillow==9.1.1 From 28662dcbaefbf647d1fccefd076f99b403cb93db Mon Sep 17 00:00:00 2001 From: Li Ning Date: Mon, 15 Sep 2025 17:05:56 -0700 Subject: [PATCH 034/157] set matplotlib==3.6.3 --- requirements.txt | 2 +- test/resources/versions/train.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 36d449bb..78310006 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,7 +8,7 @@ dask==2024.9.0 dask-cuda==24.10.0 gunicorn==23.0.0 itsdangerous==2.0.1 -matplotlib==3.4.1 +matplotlib==3.6.3 multi-model-server==1.1.2 numpy==2.1.0 pandas==2.2.3 diff --git a/test/resources/versions/train.py b/test/resources/versions/train.py index 2e50d5e6..78bfbe09 100644 --- a/test/resources/versions/train.py +++ b/test/resources/versions/train.py @@ -13,7 +13,7 @@ conda==24.7.1 cryptography==45.0.5 gunicorn==23.0.0 -matplotlib==3.4.1 +matplotlib==3.6.3 multi-model-server==1.1.2 numpy==2.1.0 pandas==2.2.3 From 3515083b75e8de5e739c6e02bab1fda37c02fe78 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Mon, 15 Sep 2025 17:13:30 -0700 Subject: [PATCH 035/157] set matplotlib==3.6.3 --- test/resources/versions/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/resources/versions/train.py b/test/resources/versions/train.py index 78bfbe09..26480a4c 100644 --- a/test/resources/versions/train.py +++ b/test/resources/versions/train.py @@ -7,7 +7,7 @@ REQUIREMENTS = """\ Flask==1.1.1 Pillow==9.1.1 -PyYAML==5.4.1 +PyYAML==6.0.1 boto3==1.17.52 botocore==1.20.52 conda==24.7.1 From 2da93abf9cede16731074aa584569ff0da4eb901 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Tue, 23 Sep 2025 12:01:45 -0700 Subject: [PATCH 036/157] Trigger Build From 01ad9b1ff463d9808b8b25715a4555029a8afbb1 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Tue, 23 Sep 2025 15:59:17 -0700 Subject: [PATCH 037/157] test dask migration --- test/unit/distributed_gpu/test_dask_data_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/unit/distributed_gpu/test_dask_data_utils.py b/test/unit/distributed_gpu/test_dask_data_utils.py index 571247fc..1b1ff74d 100644 --- a/test/unit/distributed_gpu/test_dask_data_utils.py +++ b/test/unit/distributed_gpu/test_dask_data_utils.py @@ -40,7 +40,7 @@ def test_read_data_csv(self): x, y = read_data(self.data_path_csv, CSV) assert x.shape[0].compute() == self.NUM_ROWS_IN_EACH_FILE assert x.shape[1] == self.NUM_COLS_IN_EACH_FILE - 1 - assert y.shape[0].compute() == self.NUM_ROWS_IN_EACH_FILE + assert y.shape[[0]].compute() == self.NUM_ROWS_IN_EACH_FILE def test_read_data_csv_malformed_path(self): x, y = read_data(self.data_path_csv + "/", CSV) @@ -54,7 +54,7 @@ def test_read_data_parquet(self): x, y = read_data(self.data_path_parquet, PARQUET) assert x.shape[0].compute() == self.NUM_ROWS_IN_EACH_FILE * 2 assert x.shape[1] == self.NUM_COLS_IN_EACH_FILE - 1 - assert y.shape[0].compute() == self.NUM_ROWS_IN_EACH_FILE * 2 + assert y.shape[[0]].compute() == self.NUM_ROWS_IN_EACH_FILE * 2 def test_read_data_unsupported_content(self): with self.assertRaises(UserError): From 054c09c076ea049ad940e92f8283b26166e65306 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Tue, 23 Sep 2025 16:15:55 -0700 Subject: [PATCH 038/157] test xgb migration --- src/sagemaker_xgboost_container/data_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sagemaker_xgboost_container/data_utils.py b/src/sagemaker_xgboost_container/data_utils.py index ae49a677..80c569a8 100644 --- a/src/sagemaker_xgboost_container/data_utils.py +++ b/src/sagemaker_xgboost_container/data_utils.py @@ -395,7 +395,7 @@ def get_libsvm_dmatrix(files_path, is_pipe=False): raise exc.UserError("Pipe mode not supported for LibSVM.") try: - dmatrix = xgb.DMatrix(files_path) + dmatrix = xgb.DMatrix(f"{files_path}?format=libsvm") except Exception as e: raise exc.UserError("Failed to load libsvm data with exception:\n{}".format(e)) From cff59cb4bd9aa7fdc4f914751d4479256896cdb4 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Tue, 23 Sep 2025 16:28:48 -0700 Subject: [PATCH 039/157] test xgb rabit --- src/sagemaker_xgboost_container/distributed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index ce9e6581..91e2e241 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -21,7 +21,7 @@ import time from retrying import retry -from xgboost.dask import DaskDMatrix, train +from xgboost import rabit # This should point to xgb when the tracker is updated upstream from sagemaker_xgboost_container.dmlc_patch import tracker From 9c3fe624dd3bac084d08696c840f094c7cb62c97 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Tue, 23 Sep 2025 18:18:40 -0700 Subject: [PATCH 040/157] test xgb rapit migration --- src/sagemaker_xgboost_container/distributed.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index 91e2e241..da1632ae 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -21,7 +21,11 @@ import time from retrying import retry -from xgboost import rabit +from xgboost.collective import ( + get_rank, + get_world_size, + broadcast, + finalize) # This should point to xgb when the tracker is updated upstream from sagemaker_xgboost_container.dmlc_patch import tracker @@ -131,7 +135,7 @@ def __init__(self, is_master, current_host, master_port): :param master_port: """ self.is_master = is_master - self.rank = rabit.get_rank() + self.rank = get_rank() self.current_host = current_host self.master_port = master_port @@ -145,14 +149,14 @@ def synchronize(self, data): :return: aggregated data from the all the nodes in the cluster """ results = [] - for i in range(rabit.get_world_size()): + for i in range(get_world_size()): if self.rank == i: logging.debug("Broadcasting data from self ({}) to others".format(self.rank)) - rabit.broadcast(data, i) + broadcast(data, i) results.append(data) else: logging.debug("Receiving data from {}".format(i)) - message = rabit.broadcast(None, i) + message = broadcast(None, i) results.append(message) return results @@ -299,7 +303,7 @@ def start(self): # We can check that the rabit instance has successfully connected to the # server by getting the rank of the server (e.g. its position in the ring). # This should be unique for each instance. - self.logger.debug("Rabit started - Rank {}".format(rabit.get_rank())) + self.logger.debug("Rabit started - Rank {}".format(get_rank())) self.logger.debug("Executing user code") # We can now run user-code. Since XGBoost runs in the same process space @@ -322,7 +326,7 @@ def stop(self): # This is the call that actually shuts down the rabit server; and when # all of the slaves have been shut down then the RabitTracker will close # /shutdown itself. - rabit.finalize() + finalize() if self.is_master_host: self.rabit_context.join() From 103c0c9c9fa08955bab885d7c7cd956019ef1d04 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Tue, 23 Sep 2025 18:55:40 -0700 Subject: [PATCH 041/157] test dask expr backend migration --- test/unit/distributed_gpu/test_dask_data_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/unit/distributed_gpu/test_dask_data_utils.py b/test/unit/distributed_gpu/test_dask_data_utils.py index 1b1ff74d..c6154fb0 100644 --- a/test/unit/distributed_gpu/test_dask_data_utils.py +++ b/test/unit/distributed_gpu/test_dask_data_utils.py @@ -40,7 +40,7 @@ def test_read_data_csv(self): x, y = read_data(self.data_path_csv, CSV) assert x.shape[0].compute() == self.NUM_ROWS_IN_EACH_FILE assert x.shape[1] == self.NUM_COLS_IN_EACH_FILE - 1 - assert y.shape[[0]].compute() == self.NUM_ROWS_IN_EACH_FILE + assert y.shape[0] == self.NUM_ROWS_IN_EACH_FILE def test_read_data_csv_malformed_path(self): x, y = read_data(self.data_path_csv + "/", CSV) @@ -54,7 +54,7 @@ def test_read_data_parquet(self): x, y = read_data(self.data_path_parquet, PARQUET) assert x.shape[0].compute() == self.NUM_ROWS_IN_EACH_FILE * 2 assert x.shape[1] == self.NUM_COLS_IN_EACH_FILE - 1 - assert y.shape[[0]].compute() == self.NUM_ROWS_IN_EACH_FILE * 2 + assert y.shape[0] == self.NUM_ROWS_IN_EACH_FILE * 2 def test_read_data_unsupported_content(self): with self.assertRaises(UserError): From 0e2a074ab47dbbbcdcbb37a31be245a5f71ffeb3 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Tue, 23 Sep 2025 19:08:54 -0700 Subject: [PATCH 042/157] test rabit.tracker_print migration --- src/sagemaker_xgboost_container/checkpointing.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/sagemaker_xgboost_container/checkpointing.py b/src/sagemaker_xgboost_container/checkpointing.py index 2990a679..734851cd 100644 --- a/src/sagemaker_xgboost_container/checkpointing.py +++ b/src/sagemaker_xgboost_container/checkpointing.py @@ -7,7 +7,7 @@ import xgboost as xgb from typing import Optional -from xgboost.dask import DaskDMatrix, train +from xgboost import collective from xgboost.callback import EvaluationMonitor from xgboost.core import XGBoostError @@ -116,7 +116,8 @@ def after_iteration(self, model, epoch=0, evals_log=None): score = log[-1] msg += evaluation_monitor._fmt_metric(data, metric_name, score, stdv) msg += "\n" - rabit.tracker_print("[%d]\t%s\n" % (i + self.start_iteration, msg)) + with collective.CommunicatorContext(): + track_print("[%d]\t%s\n" % (i + self.start_iteration, msg)) def print_checkpointed_evaluation(end_iteration, iteration=0, rank=0, period=1, show_stdv=True, start_iteration=0): From d3e4ee5c1ee596c408eb94cc2a25efec9bac096e Mon Sep 17 00:00:00 2001 From: Li Ning Date: Tue, 23 Sep 2025 20:19:06 -0700 Subject: [PATCH 043/157] test rabit and libsvm migration --- src/sagemaker_xgboost_container/checkpointing.py | 4 ++-- src/sagemaker_xgboost_container/distributed.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/sagemaker_xgboost_container/checkpointing.py b/src/sagemaker_xgboost_container/checkpointing.py index 734851cd..aa3b9f07 100644 --- a/src/sagemaker_xgboost_container/checkpointing.py +++ b/src/sagemaker_xgboost_container/checkpointing.py @@ -116,8 +116,8 @@ def after_iteration(self, model, epoch=0, evals_log=None): score = log[-1] msg += evaluation_monitor._fmt_metric(data, metric_name, score, stdv) msg += "\n" - with collective.CommunicatorContext(): - track_print("[%d]\t%s\n" % (i + self.start_iteration, msg)) + if collective.get_rank() == 0: + print("[%d]\t%s\n" % (i + self.start_iteration, msg)) def print_checkpointed_evaluation(end_iteration, iteration=0, rank=0, period=1, show_stdv=True, start_iteration=0): diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index da1632ae..b0d4827a 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -22,6 +22,7 @@ from retrying import retry from xgboost.collective import ( + init, get_rank, get_world_size, broadcast, @@ -292,7 +293,7 @@ def start(self): else: self.logger.info("Connected to RabitTracker.") - rabit.init( + init( [ "DMLC_NUM_WORKER={}".format(self.n_workers).encode(), "DMLC_TRACKER_URI={}".format(self.master_host).encode(), From 5e6a1b28e884f876f6a9f38bd7b6d32762abf1c2 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Tue, 23 Sep 2025 23:22:00 -0700 Subject: [PATCH 044/157] test rabit and dask --- src/sagemaker_xgboost_container/distributed.py | 13 ++++++------- src/sagemaker_xgboost_container/encoder.py | 2 +- test/unit/distributed_gpu/test_dask_data_utils.py | 4 ++-- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index b0d4827a..862ac887 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -16,6 +16,7 @@ Some of this code should be made simpler once the XGBoost library is improved. """ import logging +import os import socket import sys import time @@ -293,13 +294,11 @@ def start(self): else: self.logger.info("Connected to RabitTracker.") - init( - [ - "DMLC_NUM_WORKER={}".format(self.n_workers).encode(), - "DMLC_TRACKER_URI={}".format(self.master_host).encode(), - "DMLC_TRACKER_PORT={}".format(self.port).encode(), - ] - ) + os.environ["DMLC_NUM_WORKER"] = self.n_workers + os.environ["DMLC_TRACKER_URI"] = self.master_host + os.environ["DMLC_TRACKER_PORT"] = self.port + + init() # We can check that the rabit instance has successfully connected to the # server by getting the rank of the server (e.g. its position in the ring). diff --git a/src/sagemaker_xgboost_container/encoder.py b/src/sagemaker_xgboost_container/encoder.py index cd11ee90..1b5dac0d 100644 --- a/src/sagemaker_xgboost_container/encoder.py +++ b/src/sagemaker_xgboost_container/encoder.py @@ -69,7 +69,7 @@ def libsvm_to_dmatrix(string_like): # type: (bytes) -> xgb.DMatrix temp_file_location = libsvm_file.name libsvm_file.write(string_like) - dmatrix = xgb.DMatrix(temp_file_location) + dmatrix = xgb.DMatrix(f"{temp_file_location}?format=libsvm") finally: if temp_file_location and os.path.exists(temp_file_location): os.remove(temp_file_location) diff --git a/test/unit/distributed_gpu/test_dask_data_utils.py b/test/unit/distributed_gpu/test_dask_data_utils.py index c6154fb0..e0254672 100644 --- a/test/unit/distributed_gpu/test_dask_data_utils.py +++ b/test/unit/distributed_gpu/test_dask_data_utils.py @@ -40,7 +40,7 @@ def test_read_data_csv(self): x, y = read_data(self.data_path_csv, CSV) assert x.shape[0].compute() == self.NUM_ROWS_IN_EACH_FILE assert x.shape[1] == self.NUM_COLS_IN_EACH_FILE - 1 - assert y.shape[0] == self.NUM_ROWS_IN_EACH_FILE + assert (y.shape[0] == self.NUM_ROWS_IN_EACH_FILE).compute() def test_read_data_csv_malformed_path(self): x, y = read_data(self.data_path_csv + "/", CSV) @@ -54,7 +54,7 @@ def test_read_data_parquet(self): x, y = read_data(self.data_path_parquet, PARQUET) assert x.shape[0].compute() == self.NUM_ROWS_IN_EACH_FILE * 2 assert x.shape[1] == self.NUM_COLS_IN_EACH_FILE - 1 - assert y.shape[0] == self.NUM_ROWS_IN_EACH_FILE * 2 + assert (y.shape[0] == self.NUM_ROWS_IN_EACH_FILE * 2).compute() def test_read_data_unsupported_content(self): with self.assertRaises(UserError): From c4cd3e6e50d52e08c62bdd3bcf17614f99d237f6 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Wed, 24 Sep 2025 07:07:04 -0700 Subject: [PATCH 045/157] test dask migratinon --- docker/2.1.0/final/Dockerfile.cpu | 14 ++++++++------ src/sagemaker_xgboost_container/distributed.py | 6 +++--- test/unit/distributed_gpu/test_dask_data_utils.py | 4 ++-- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/docker/2.1.0/final/Dockerfile.cpu b/docker/2.1.0/final/Dockerfile.cpu index cb383932..46207226 100644 --- a/docker/2.1.0/final/Dockerfile.cpu +++ b/docker/2.1.0/final/Dockerfile.cpu @@ -12,12 +12,14 @@ COPY requirements.txt /requirements.txt RUN python3 -m pip install -r /requirements.txt && rm /requirements.txt # Fix Python 3.10 compatibility for sagemaker-containers -RUN python3 -c "import sys; sys.path.insert(0, '/miniconda3/lib/python3.10/site-packages'); \ - import sagemaker_containers._mapping as m; \ - import collections.abc; \ - setattr(collections, 'Mapping', collections.abc.Mapping); \ - exec(open('/miniconda3/lib/python3.10/site-packages/sagemaker_containers/_mapping.py').read().replace('collections.Mapping', 'collections.abc.Mapping'))" || \ - sed -i 's/collections\.Mapping/collections.abc.Mapping/g' /miniconda3/lib/python3.10/site-packages/sagemaker_containers/_mapping.py +# RUN python3 -c "import sys; sys.path.insert(0, '/miniconda3/lib/python3.10/site-packages'); \ +# import sagemaker_containers._mapping as m; \ +# import collections.abc; \ +# setattr(collections, 'Mapping', collections.abc.Mapping); \ +# exec(open('/miniconda3/lib/python3.10/site-packages/sagemaker_containers/_mapping.py').read().replace('collections.Mapping', 'collections.abc.Mapping'))" || \ +# sed -i 's/collections\.Mapping/collections.abc.Mapping/g' /miniconda3/lib/python3.10/site-packages/sagemaker_containers/_mapping.py + +RUN sed -i 's/collections\.Mapping/collections.abc.Mapping/g' /miniconda3/lib/python3.10/site-packages/sagemaker_containers/_mapping.py # Install smdebug from source RUN python3 -m pip install git+https://github.com/awslabs/sagemaker-debugger.git@1.0.29 diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index 862ac887..1ab9a7ce 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -294,9 +294,9 @@ def start(self): else: self.logger.info("Connected to RabitTracker.") - os.environ["DMLC_NUM_WORKER"] = self.n_workers - os.environ["DMLC_TRACKER_URI"] = self.master_host - os.environ["DMLC_TRACKER_PORT"] = self.port + os.environ["DMLC_NUM_WORKER"] = str(self.n_workers) + os.environ["DMLC_TRACKER_URI"] = str(self.master_host) + os.environ["DMLC_TRACKER_PORT"] = str(self.port) init() diff --git a/test/unit/distributed_gpu/test_dask_data_utils.py b/test/unit/distributed_gpu/test_dask_data_utils.py index e0254672..c74f3ada 100644 --- a/test/unit/distributed_gpu/test_dask_data_utils.py +++ b/test/unit/distributed_gpu/test_dask_data_utils.py @@ -40,7 +40,7 @@ def test_read_data_csv(self): x, y = read_data(self.data_path_csv, CSV) assert x.shape[0].compute() == self.NUM_ROWS_IN_EACH_FILE assert x.shape[1] == self.NUM_COLS_IN_EACH_FILE - 1 - assert (y.shape[0] == self.NUM_ROWS_IN_EACH_FILE).compute() + assert len(y) == self.NUM_ROWS_IN_EACH_FILE def test_read_data_csv_malformed_path(self): x, y = read_data(self.data_path_csv + "/", CSV) @@ -54,7 +54,7 @@ def test_read_data_parquet(self): x, y = read_data(self.data_path_parquet, PARQUET) assert x.shape[0].compute() == self.NUM_ROWS_IN_EACH_FILE * 2 assert x.shape[1] == self.NUM_COLS_IN_EACH_FILE - 1 - assert (y.shape[0] == self.NUM_ROWS_IN_EACH_FILE * 2).compute() + assert len(y) == self.NUM_ROWS_IN_EACH_FILE * 2 def test_read_data_unsupported_content(self): with self.assertRaises(UserError): From be02e7c9ee226bec4a4715faf2b6408d5f190a37 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Wed, 24 Sep 2025 08:52:47 -0700 Subject: [PATCH 046/157] test rabit --- src/sagemaker_xgboost_container/distributed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index 1ab9a7ce..9f84d45f 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -22,7 +22,7 @@ import time from retrying import retry -from xgboost.collective import ( +from xgboost.rabit import ( init, get_rank, get_world_size, From 7e4b844401e48f7f925c6f8ceee3d586a6ad48d9 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Wed, 24 Sep 2025 10:19:56 -0700 Subject: [PATCH 047/157] test _aggregate_predictions --- src/sagemaker_xgboost_container/prediction_utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/sagemaker_xgboost_container/prediction_utils.py b/src/sagemaker_xgboost_container/prediction_utils.py index 92c7c225..12b0c6c9 100644 --- a/src/sagemaker_xgboost_container/prediction_utils.py +++ b/src/sagemaker_xgboost_container/prediction_utils.py @@ -91,7 +91,11 @@ def _aggregate_predictions(self) -> np.ndarray: if self.classification: columns.append(self.y_prob.mean(axis=-1)) # mode always returns same number of dimensions of output as for input - columns.append(stats.mode(self.y_pred, axis=1).mode[:, 0]) + model_result = stats.mode(self.y_pred, axis=1, keepdims=True) + model_values = model_result.mode + if model_values.ndim > 1: + model_values = model_values[:, 0] + columns.append(model_values) else: columns.append(self.y_pred.mean(axis=-1)) From 56c302dc0779f4f040049e612840b20f8279cdc1 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Wed, 24 Sep 2025 10:49:18 -0700 Subject: [PATCH 048/157] recover checkpointing.py distributed.py --- .../checkpointing.py | 7 ++-- .../distributed.py | 34 ++++++++----------- 2 files changed, 18 insertions(+), 23 deletions(-) diff --git a/src/sagemaker_xgboost_container/checkpointing.py b/src/sagemaker_xgboost_container/checkpointing.py index aa3b9f07..6805a264 100644 --- a/src/sagemaker_xgboost_container/checkpointing.py +++ b/src/sagemaker_xgboost_container/checkpointing.py @@ -7,7 +7,7 @@ import xgboost as xgb from typing import Optional -from xgboost import collective +from xgboost import rabit from xgboost.callback import EvaluationMonitor from xgboost.core import XGBoostError @@ -116,8 +116,7 @@ def after_iteration(self, model, epoch=0, evals_log=None): score = log[-1] msg += evaluation_monitor._fmt_metric(data, metric_name, score, stdv) msg += "\n" - if collective.get_rank() == 0: - print("[%d]\t%s\n" % (i + self.start_iteration, msg)) + rabit.tracker_print("[%d]\t%s\n" % (i + self.start_iteration, msg)) def print_checkpointed_evaluation(end_iteration, iteration=0, rank=0, period=1, show_stdv=True, start_iteration=0): @@ -438,4 +437,4 @@ def __init__(self, intermediate_model_dir, model_name, is_master): def after_iteration(self, model, epoch, evals_log) -> bool: if self.is_master: self.callback.save_intermediate_model(model) - return False + return False \ No newline at end of file diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index 9f84d45f..96125e32 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -16,18 +16,12 @@ Some of this code should be made simpler once the XGBoost library is improved. """ import logging -import os import socket import sys import time from retrying import retry -from xgboost.rabit import ( - init, - get_rank, - get_world_size, - broadcast, - finalize) +from xgboost import rabit # This should point to xgb when the tracker is updated upstream from sagemaker_xgboost_container.dmlc_patch import tracker @@ -137,7 +131,7 @@ def __init__(self, is_master, current_host, master_port): :param master_port: """ self.is_master = is_master - self.rank = get_rank() + self.rank = rabit.get_rank() self.current_host = current_host self.master_port = master_port @@ -151,14 +145,14 @@ def synchronize(self, data): :return: aggregated data from the all the nodes in the cluster """ results = [] - for i in range(get_world_size()): + for i in range(rabit.get_world_size()): if self.rank == i: logging.debug("Broadcasting data from self ({}) to others".format(self.rank)) - broadcast(data, i) + rabit.broadcast(data, i) results.append(data) else: logging.debug("Receiving data from {}".format(i)) - message = broadcast(None, i) + message = rabit.broadcast(None, i) results.append(message) return results @@ -294,16 +288,18 @@ def start(self): else: self.logger.info("Connected to RabitTracker.") - os.environ["DMLC_NUM_WORKER"] = str(self.n_workers) - os.environ["DMLC_TRACKER_URI"] = str(self.master_host) - os.environ["DMLC_TRACKER_PORT"] = str(self.port) - - init() + rabit.init( + [ + "DMLC_NUM_WORKER={}".format(self.n_workers).encode(), + "DMLC_TRACKER_URI={}".format(self.master_host).encode(), + "DMLC_TRACKER_PORT={}".format(self.port).encode(), + ] + ) # We can check that the rabit instance has successfully connected to the # server by getting the rank of the server (e.g. its position in the ring). # This should be unique for each instance. - self.logger.debug("Rabit started - Rank {}".format(get_rank())) + self.logger.debug("Rabit started - Rank {}".format(rabit.get_rank())) self.logger.debug("Executing user code") # We can now run user-code. Since XGBoost runs in the same process space @@ -326,7 +322,7 @@ def stop(self): # This is the call that actually shuts down the rabit server; and when # all of the slaves have been shut down then the RabitTracker will close # /shutdown itself. - finalize() + rabit.finalize() if self.is_master_host: self.rabit_context.join() @@ -334,4 +330,4 @@ def __enter__(self): return self.start() def __exit__(self, exc_type, exc_value, exc_traceback): - return self.stop() + return self.stop() \ No newline at end of file From 57ffac537580e55e2bb3bc0f69324d326ade3076 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Wed, 24 Sep 2025 17:07:28 -0700 Subject: [PATCH 049/157] rabit deprecate --- docker/2.1.0/final/Dockerfile.cpu | 8 +- .../checkpointing.py | 4 +- .../distributed.py | 291 +++++++++++------- 3 files changed, 184 insertions(+), 119 deletions(-) diff --git a/docker/2.1.0/final/Dockerfile.cpu b/docker/2.1.0/final/Dockerfile.cpu index 46207226..5417ec22 100644 --- a/docker/2.1.0/final/Dockerfile.cpu +++ b/docker/2.1.0/final/Dockerfile.cpu @@ -39,11 +39,11 @@ RUN rm -rf /miniconda3/lib/python${PYTHON_VERSION}/site-packages/numpy-1.21.2.di # DMLC PATCH # ############## # TODO: remove after making contributions back to xgboost for tracker.py -COPY src/sagemaker_xgboost_container/dmlc_patch/tracker.py \ - /miniconda3/lib/python${PYTHON_VERSION}/site-packages/xgboost/dmlc-core/tracker/dmlc_tracker/tracker.py +# COPY src/sagemaker_xgboost_container/dmlc_patch/tracker.py \ +# /miniconda3/lib/python${PYTHON_VERSION}/site-packages/xgboost/dmlc-core/tracker/dmlc_tracker/tracker.py -# Include DMLC python code in PYTHONPATH to use RabitTracker -ENV PYTHONPATH=$PYTHONPATH:/miniconda3/lib/python${PYTHON_VERSION}/site-packages/xgboost/dmlc-core/tracker +# # Include DMLC python code in PYTHONPATH to use RabitTracker +# ENV PYTHONPATH=$PYTHONPATH:/miniconda3/lib/python${PYTHON_VERSION}/site-packages/xgboost/dmlc-core/tracker ####### # MMS # diff --git a/src/sagemaker_xgboost_container/checkpointing.py b/src/sagemaker_xgboost_container/checkpointing.py index 6805a264..f57110fa 100644 --- a/src/sagemaker_xgboost_container/checkpointing.py +++ b/src/sagemaker_xgboost_container/checkpointing.py @@ -7,7 +7,7 @@ import xgboost as xgb from typing import Optional -from xgboost import rabit +# from xgboost import rabit from xgboost.callback import EvaluationMonitor from xgboost.core import XGBoostError @@ -116,7 +116,7 @@ def after_iteration(self, model, epoch=0, evals_log=None): score = log[-1] msg += evaluation_monitor._fmt_metric(data, metric_name, score, stdv) msg += "\n" - rabit.tracker_print("[%d]\t%s\n" % (i + self.start_iteration, msg)) + # rabit.tracker_print("[%d]\t%s\n" % (i + self.start_iteration, msg)) def print_checkpointed_evaluation(end_iteration, iteration=0, rank=0, period=1, show_stdv=True, start_iteration=0): diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index 96125e32..8405a8d5 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -19,12 +19,19 @@ import socket import sys import time +import threading +from typing import List, Dict, Any, Optional +import numpy as np from retrying import retry -from xgboost import rabit -# This should point to xgb when the tracker is updated upstream -from sagemaker_xgboost_container.dmlc_patch import tracker +# XGBoost 2.1.0 uses collective communication instead of rabit +try: + import xgboost as xgb + from xgboost import collective + from xgboost.collective import CommunicatorContext +except ImportError: + raise ImportError("XGBoost 2.1.0 or later is required") LOCAL_HOSTNAME = "127.0.0.1" @@ -56,25 +63,25 @@ def rabit_run( connect_retry_timeout=3, update_rabit_args=False, ): - """Run execution function after initializing dmlc/rabit. + """Run execution function after initializing xgboost collective communication. - This method initializes rabit twice: + This method initializes collective communication twice: 1. To broadcast to all hosts which hosts should be included in training. 2. Run distributed xgb train() with just the hosts from above. - :param exec_fun: Function to run while rabit is initialized. xgb.train() must run in the same process space - in order to utilize rabit initialization. Note that the execution function must also take the args + :param exec_fun: Function to run while collective is initialized. xgb.train() must run in the same process space + in order to utilize collective initialization. Note that the execution function must also take the args 'is_distributed' and 'is_master'. :param args: Arguments to run execution function. :param include_in_training: Boolean if the current hosts should be used in training. This is done here so that all the hosts in the cluster know which hosts to include during training. :param hosts: :param current_host: - :param first_port: Port to use for the initial rabit initialization. If None, rabit defaults this to 9099 - :param second_port: Port to use for second rabit initialization. If None, this increments previous port by 1 + :param first_port: Port to use for the initial collective initialization. If None, defaults to 9099 + :param second_port: Port to use for second collective initialization. If None, this increments previous port by 1 :param max_connect_attempts :param connect_retry_timeout - :param update_rabit_args: Boolean to include rabit information to args. If True, the following is added: + :param update_rabit_args: Boolean to include collective information to args. If True, the following is added: is_master """ with Rabit( @@ -83,12 +90,12 @@ def rabit_run( port=first_port, max_connect_attempts=max_connect_attempts, connect_retry_timeout=connect_retry_timeout, - ) as rabit: - hosts_with_data = rabit.synchronize({"host": rabit.current_host, "include_in_training": include_in_training}) + ) as rabit_helper: + hosts_with_data = rabit_helper.synchronize({"host": rabit_helper.current_host, "include_in_training": include_in_training}) hosts_with_data = [record["host"] for record in hosts_with_data if record["include_in_training"]] # Keep track of port used, so that hosts trying to shutdown know when server is not available - previous_port = rabit.master_port + previous_port = rabit_helper.master_port if not include_in_training: logging.warning("Host {} not being used for distributed training.".format(current_host)) @@ -97,8 +104,8 @@ def rabit_run( second_rabit_port = second_port if second_port else previous_port + 1 if len(hosts_with_data) > 1: - # Set up rabit with nodes that have data and an unused port so that previous slaves don't confuse it - # with the previous rabit configuration + # Set up collective with nodes that have data and an unused port so that previous slaves don't confuse it + # with the previous collective configuration with Rabit( hosts=hosts_with_data, current_host=current_host, @@ -123,17 +130,38 @@ def rabit_run( class RabitHelper(object): - def __init__(self, is_master, current_host, master_port): + def __init__(self, is_master, current_host, master_port, communicator_context=None): """This is returned by the Rabit context manager for useful cluster information and data synchronization. :param is_master: :param current_host: :param master_port: + :param communicator_context: XGBoost collective CommunicatorContext """ self.is_master = is_master - self.rank = rabit.get_rank() self.current_host = current_host self.master_port = master_port + self._communicator_context = communicator_context + + if communicator_context: + self.rank = collective.get_rank() + else: + self.rank = 0 + + def tracker_print(self, msg: str): + """Print message to tracker log. + + Equivalent to rabit.tracker_print() - prints a message to the centralized + logging facility for tracking progress across the distributed cluster. + + :param msg: Message to print to tracker log + """ + if self._communicator_context: + # Use collective.print for distributed case + collective.print(msg) + else: + # For single node case, just use regular logging + logging.info(f"[Tracker] {msg}") def synchronize(self, data): """Synchronize data with the cluster. @@ -142,46 +170,110 @@ def synchronize(self, data): This allows things like determining which nodes have data or not. :param data: data to send to the cluster - :return: aggregated data from the all the nodes in the cluster + :return: aggregated data from all the nodes in the cluster """ + if not self._communicator_context: + # Single node case + return [data] + results = [] - for i in range(rabit.get_world_size()): + world_size = collective.get_world_size() + + for i in range(world_size): if self.rank == i: logging.debug("Broadcasting data from self ({}) to others".format(self.rank)) - rabit.broadcast(data, i) + collective.broadcast(data, i) results.append(data) else: logging.debug("Receiving data from {}".format(i)) - message = rabit.broadcast(None, i) + message = collective.broadcast(None, i) results.append(message) return results +class SimpleTracker: + """Simple tracker implementation for XGBoost collective communication""" + + def __init__(self, host_ip: str, n_workers: int, port: int): + self.host_ip = host_ip + self.n_workers = n_workers + self.port = port + self.server_socket = None + self.server_thread = None + self._shutdown = threading.Event() + + def start(self, n_workers: int): + """Start the tracker server""" + self.server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + self.server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + self.server_socket.bind((self.host_ip, self.port)) + self.server_socket.listen(n_workers) + self.server_socket.settimeout(1.0) # Non-blocking accept + + self.server_thread = threading.Thread(target=self._server_loop, daemon=True) + self.server_thread.start() + + def _server_loop(self): + """Simple server loop to accept connections""" + connected_clients = 0 + while not self._shutdown.is_set() and connected_clients < self.n_workers: + try: + client_socket, addr = self.server_socket.accept() + connected_clients += 1 + logging.debug(f"Tracker accepted connection from {addr}") + client_socket.close() + except socket.timeout: + continue + except Exception as e: + if not self._shutdown.is_set(): + logging.debug(f"Tracker server error: {e}") + break + + def join(self): + """Wait for the server thread to finish""" + self._shutdown.set() + if self.server_thread: + self.server_thread.join() + if self.server_socket: + self.server_socket.close() + + def slave_envs(self): + """Return environment configuration for slaves""" + return { + 'DMLC_NUM_WORKER': str(self.n_workers), + 'DMLC_TRACKER_URI': self.host_ip, + 'DMLC_TRACKER_PORT': str(self.port) + } + + class Rabit(object): @staticmethod def _get_logger(current_host): - logging.basicConfig(format="%(name) [{}]: %(message)s".format(current_host)) + logging.basicConfig(format="%(name)s [{}]: %(message)s".format(current_host)) return logging.getLogger("RabitContextManager") def __init__( - self, hosts, current_host=None, master_host=None, port=None, max_connect_attempts=None, connect_retry_timeout=3 + self, + hosts: List[str], + current_host: Optional[str] = None, + master_host: Optional[str] = None, + port: Optional[int] = None, + max_connect_attempts: Optional[int] = None, + connect_retry_timeout: int = 3 ): - """Context manager for rabit initialization. + """Context manager for XGBoost collective communication initialization. :param hosts: List of hostnames :param current_host: Current hostname. If not provided, use 127.0.0.1. :param master_host: Master host hostname. If not provided, use alphabetically first hostname amongst hosts to ensure determinism in choosing master node. :param port: Port to connect to master, if not specified use 9099. - :param max_connect_attempts: Number of times to try connecting to RabitTracker. If this arg is set + :param max_connect_attempts: Number of times to try connecting to tracker. If this arg is set to None, try indefinitely. - :param connect_retry_timeout: Timeout value when attempting to connect to RabitTracker. + :param connect_retry_timeout: Timeout value when attempting to connect to tracker. This will be ignored if max_connect_attempt is None """ - # Get the host information. This is used to identify the master host - # that will run the RabitTracker and also to work out how many clients/slaves - # exist (this will ensure that all-reduce is set up correctly and that - # it blocks whilst waiting for those hosts to process the data). + # Get the host information if not current_host: current_host = LOCAL_HOSTNAME self.current_host = current_host @@ -192,7 +284,7 @@ def __init__( self.n_workers = len(self.hosts) self.logger.debug("Found hosts: {} [{}]".format(self.hosts, self.n_workers)) - # We use the first lexicographically named host as the master if not indicated otherwise + # Use the first lexicographically named host as the master if not indicated otherwise if not master_host: master_host = self.hosts[0] self.master_host = master_host @@ -201,9 +293,6 @@ def __init__( self.logger.debug("Is Master: {}".format(self.is_master_host)) self.logger.debug("Master: {}".format(self.master_host)) - # We start the RabitTracker on a known port on the first host. We can - # do this since SageMaker Training instances are single tenent and we - # don't need to worry about port contention. if port is None: port = 9099 self.logger.debug("No port specified using: {}".format(port)) @@ -216,118 +305,94 @@ def __init__( else: raise ValueError("max_connect_attempts must be None or an integer greater than 0.") self.connect_retry_timeout = connect_retry_timeout + + self.tracker = None + self.communicator_context = None def start(self): - """Start the rabit process. + """Start the collective communication process. - If current host is master host, initialize and start the Rabit Tracker in the background. All hosts then connect - to the master host to set up Rabit rank. + If current host is master host, initialize and start the tracker in the background. + All hosts then connect to the master host to set up collective communication. :return: Initialized RabitHelper, which includes helpful information such as is_master and port """ - self.rabit_context = None + if self.n_workers == 1: + # Single node case - no need for collective communication + self.logger.debug("Single node training - skipping collective communication setup") + return RabitHelper(True, self.current_host, self.port) + if self.is_master_host: - self.logger.debug("Master host. Starting Rabit Tracker.") - # The Rabit Tracker is a Python script that is responsible for - # allowing each instance of rabit to find its peers and organize - # itself in to a ring for all-reduce. It supports primitive failure - # recovery modes. - # - # It runs on a master node that each of the individual Rabit instances - # talk to. - self.rabit_context = tracker.RabitTracker( - hostIP=self.current_host, nslave=self.n_workers, port=self.port, port_end=self.port + 1 + self.logger.debug("Master host. Starting Tracker.") + self.tracker = SimpleTracker( + host_ip=self.current_host, + n_workers=self.n_workers, + port=self.port ) + self.logger.info("Tracker slave environment: {}".format(self.tracker.slave_envs())) + self.tracker.start(self.n_workers) - # Useful logging to ensure that the tracker has started. - # These are the key-value config pairs that each of the rabit slaves - # should be initialized with. Since we have deterministically allocated - # the master host, its port, and the number of workers, we don't need - # to pass these out-of-band to each slave; but rely on the fact - # that each slave will calculate the exact same config as the server. - # - # TODO: should probably check that these match up what we pass below. - self.logger.info("Rabit slave environment: {}".format(self.rabit_context.slave_envs())) - - # This actually starts the RabitTracker in a background/daemon thread - # that will automatically exit when the main process has finished. - self.rabit_context.start(self.n_workers) - - # Start each parameter server that connects to the master. + # Start parameter server that connects to the master self.logger.debug("Starting parameter server.") - # Rabit runs as an in-process singleton library that can be configured once. - # Calling this multiple times will cause a seg-fault (without calling finalize). - # We pass it the environment variables that match up with the RabitTracker - # so that this instance can discover its peers (and recover from failure). - # - # First we check that the RabitTracker is up and running. Rabit actually - # breaks (at least on Mac OS X) if the server is not running before it - # begins to try to connect (its internal retries fail because they reuse - # the same socket instead of creating a new one). - # - # if self.max_connect_attempts is None, this will loop indefinitely. + # Wait for tracker to be available attempt = 0 successful_connection = False while not successful_connection and (self.max_connect_attempts is None or attempt < self.max_connect_attempts): with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: try: - self.logger.debug("Checking if RabitTracker is available.") + self.logger.debug("Checking if Tracker is available.") s.connect((self.master_host, self.port)) successful_connection = True - self.logger.debug("Successfully connected to RabitTracker.") + self.logger.debug("Successfully connected to Tracker.") except OSError: - self.logger.info("Failed to connect to RabitTracker on attempt {}".format(attempt)) + self.logger.info("Failed to connect to Tracker on attempt {}".format(attempt)) attempt += 1 self.logger.info("Sleeping for {} sec before retrying".format(self.connect_retry_timeout)) time.sleep(self.connect_retry_timeout) if not successful_connection: - self.logger.error("Failed to connect to Rabit Tracker after %s attempts", self.max_connect_attempts) - raise Exception("Failed to connect to Rabit Tracker") + self.logger.error("Failed to connect to Tracker after %s attempts", self.max_connect_attempts) + raise Exception("Failed to connect to Tracker") else: - self.logger.info("Connected to RabitTracker.") - - rabit.init( - [ - "DMLC_NUM_WORKER={}".format(self.n_workers).encode(), - "DMLC_TRACKER_URI={}".format(self.master_host).encode(), - "DMLC_TRACKER_PORT={}".format(self.port).encode(), - ] - ) - - # We can check that the rabit instance has successfully connected to the - # server by getting the rank of the server (e.g. its position in the ring). - # This should be unique for each instance. - self.logger.debug("Rabit started - Rank {}".format(rabit.get_rank())) + self.logger.info("Connected to Tracker.") + + # Initialize XGBoost collective communication + collective_config = { + 'xgboost_communicator': 'rabit', + 'rabit_tracker_uri': self.master_host, + 'rabit_tracker_port': self.port, + 'rabit_world_size': self.n_workers, + } + + # Create communicator context + self.communicator_context = CommunicatorContext(**collective_config) + self.communicator_context.__enter__() + + # Get rank information + rank = collective.get_rank() if self.n_workers > 1 else 0 + self.logger.debug("Collective started - Rank {}".format(rank)) self.logger.debug("Executing user code") - # We can now run user-code. Since XGBoost runs in the same process space - # it will use the same instance of rabit that we have configured. It has - # a number of checks throughout the learning process to see if it is running - # in distributed mode by calling rabit APIs. If it is it will do the - # synchronization automatically. - # - # Hence we can now execute any XGBoost specific training code and it - # will be distributed automatically. - return RabitHelper(self.is_master_host, self.current_host, self.port) + return RabitHelper(self.is_master_host, self.current_host, self.port, self.communicator_context) def stop(self): - """Shutdown parameter server. - - If current host is master host, also join the background thread that is running the master host. - """ + """Shutdown parameter server and tracker.""" self.logger.debug("Shutting down parameter server.") - # This is the call that actually shuts down the rabit server; and when - # all of the slaves have been shut down then the RabitTracker will close - # /shutdown itself. - rabit.finalize() - if self.is_master_host: - self.rabit_context.join() + # Clean up communicator context + if self.communicator_context: + try: + self.communicator_context.__exit__(None, None, None) + except Exception as e: + self.logger.debug(f"Error closing communicator context: {e}") + + if self.is_master_host and self.tracker: + self.tracker.join() def __enter__(self): return self.start() def __exit__(self, exc_type, exc_value, exc_traceback): - return self.stop() \ No newline at end of file + self.stop() + return False \ No newline at end of file From 15e7484f7e4511ec2268f791b7cbf198f0fd56d4 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Wed, 24 Sep 2025 20:11:09 -0700 Subject: [PATCH 050/157] set env var --- .../distributed.py | 40 +++++++++---------- 1 file changed, 18 insertions(+), 22 deletions(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index 8405a8d5..e66ec590 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -130,20 +130,20 @@ def rabit_run( class RabitHelper(object): - def __init__(self, is_master, current_host, master_port, communicator_context=None): + def __init__(self, is_master, current_host, master_port, is_collective_initialized=False): """This is returned by the Rabit context manager for useful cluster information and data synchronization. :param is_master: :param current_host: :param master_port: - :param communicator_context: XGBoost collective CommunicatorContext + :param is_collective_initialized: Whether XGBoost collective communication is initialized """ self.is_master = is_master self.current_host = current_host self.master_port = master_port - self._communicator_context = communicator_context + self._is_collective_initialized = is_collective_initialized - if communicator_context: + if is_collective_initialized: self.rank = collective.get_rank() else: self.rank = 0 @@ -156,7 +156,7 @@ def tracker_print(self, msg: str): :param msg: Message to print to tracker log """ - if self._communicator_context: + if self._is_collective_initialized: # Use collective.print for distributed case collective.print(msg) else: @@ -172,7 +172,7 @@ def synchronize(self, data): :param data: data to send to the cluster :return: aggregated data from all the nodes in the cluster """ - if not self._communicator_context: + if not self._is_collective_initialized: # Single node case return [data] @@ -307,7 +307,6 @@ def __init__( self.connect_retry_timeout = connect_retry_timeout self.tracker = None - self.communicator_context = None def start(self): """Start the collective communication process. @@ -357,35 +356,32 @@ def start(self): else: self.logger.info("Connected to Tracker.") - # Initialize XGBoost collective communication - collective_config = { - 'xgboost_communicator': 'rabit', - 'rabit_tracker_uri': self.master_host, - 'rabit_tracker_port': self.port, - 'rabit_world_size': self.n_workers, - } + # Initialize XGBoost collective communication using environment variables + import os + os.environ['DMLC_NUM_WORKER'] = str(self.n_workers) + os.environ['DMLC_TRACKER_URI'] = self.master_host + os.environ['DMLC_TRACKER_PORT'] = str(self.port) - # Create communicator context - self.communicator_context = CommunicatorContext(**collective_config) - self.communicator_context.__enter__() + # Initialize collective communication + collective.init() # Get rank information rank = collective.get_rank() if self.n_workers > 1 else 0 self.logger.debug("Collective started - Rank {}".format(rank)) self.logger.debug("Executing user code") - return RabitHelper(self.is_master_host, self.current_host, self.port, self.communicator_context) + return RabitHelper(self.is_master_host, self.current_host, self.port, True) def stop(self): """Shutdown parameter server and tracker.""" self.logger.debug("Shutting down parameter server.") - # Clean up communicator context - if self.communicator_context: + # Clean up collective communication + if self.n_workers > 1: try: - self.communicator_context.__exit__(None, None, None) + collective.finalize() except Exception as e: - self.logger.debug(f"Error closing communicator context: {e}") + self.logger.debug(f"Error finalizing collective: {e}") if self.is_master_host and self.tracker: self.tracker.join() From 2acc1ec125dc1ac917929d21c9d0133f556884aa Mon Sep 17 00:00:00 2001 From: Li Ning Date: Wed, 24 Sep 2025 21:42:33 -0700 Subject: [PATCH 051/157] test distributed.py --- .../distributed.py | 120 ++++++++++++++---- 1 file changed, 98 insertions(+), 22 deletions(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index e66ec590..ffedfa3e 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -29,7 +29,14 @@ try: import xgboost as xgb from xgboost import collective - from xgboost.collective import CommunicatorContext + # Try to import the built-in tracker if available + try: + from xgboost.tracker import RabitTracker + HAS_BUILTIN_TRACKER = True + except ImportError: + # Fallback for older versions or different installations + HAS_BUILTIN_TRACKER = False + except ImportError: raise ImportError("XGBoost 2.1.0 or later is required") @@ -201,6 +208,8 @@ def __init__(self, host_ip: str, n_workers: int, port: int): self.server_socket = None self.server_thread = None self._shutdown = threading.Event() + self._workers_connected = 0 + self._worker_sockets = [] def start(self, n_workers: int): """Start the tracker server""" @@ -213,29 +222,83 @@ def start(self, n_workers: int): self.server_thread = threading.Thread(target=self._server_loop, daemon=True) self.server_thread.start() + # Give the server a moment to start + time.sleep(0.1) + def _server_loop(self): - """Simple server loop to accept connections""" - connected_clients = 0 - while not self._shutdown.is_set() and connected_clients < self.n_workers: + """DMLC tracker protocol server loop""" + try: + while not self._shutdown.is_set() and self._workers_connected < self.n_workers: + try: + client_socket, addr = self.server_socket.accept() + self._workers_connected += 1 + logging.debug(f"Tracker accepted connection {self._workers_connected}/{self.n_workers} from {addr}") + + # Handle the DMLC tracker protocol + self._handle_worker(client_socket, self._workers_connected - 1) + + except socket.timeout: + continue + except Exception as e: + if not self._shutdown.is_set(): + logging.debug(f"Tracker server error: {e}") + break + + # Keep server alive until shutdown + while not self._shutdown.is_set(): + time.sleep(0.1) + + except Exception as e: + logging.debug(f"Tracker server loop error: {e}") + finally: + for sock in self._worker_sockets: + try: + sock.close() + except: + pass + + def _handle_worker(self, client_socket, worker_rank): + """Handle DMLC tracker protocol for a worker""" + try: + self._worker_sockets.append(client_socket) + + # Send worker configuration following DMLC protocol + # Format: "start rank world_size\n" + config_msg = f"start {worker_rank} {self.n_workers}\n".encode() + client_socket.send(config_msg) + + # Keep connection alive for the worker + client_socket.settimeout(1.0) + while not self._shutdown.is_set(): + try: + data = client_socket.recv(1024) + if not data: + break + # Echo back any messages (simple heartbeat) + client_socket.send(b"ok\n") + except socket.timeout: + continue + except: + break + + except Exception as e: + logging.debug(f"Worker {worker_rank} handler error: {e}") + finally: try: - client_socket, addr = self.server_socket.accept() - connected_clients += 1 - logging.debug(f"Tracker accepted connection from {addr}") client_socket.close() - except socket.timeout: - continue - except Exception as e: - if not self._shutdown.is_set(): - logging.debug(f"Tracker server error: {e}") - break + except: + pass def join(self): """Wait for the server thread to finish""" self._shutdown.set() if self.server_thread: - self.server_thread.join() + self.server_thread.join(timeout=5.0) if self.server_socket: - self.server_socket.close() + try: + self.server_socket.close() + except: + pass def slave_envs(self): """Return environment configuration for slaves""" @@ -323,13 +386,26 @@ def start(self): if self.is_master_host: self.logger.debug("Master host. Starting Tracker.") - self.tracker = SimpleTracker( - host_ip=self.current_host, - n_workers=self.n_workers, - port=self.port - ) + + if HAS_BUILTIN_TRACKER: + # Use XGBoost's built-in tracker if available + self.tracker = RabitTracker( + hostIP=self.current_host, + nslave=self.n_workers, + port=self.port, + port_end=self.port + 1 + ) + self.tracker.start(self.n_workers) + else: + # Fallback to our simple tracker + self.tracker = SimpleTracker( + host_ip=self.current_host, + n_workers=self.n_workers, + port=self.port + ) + self.tracker.start(self.n_workers) + self.logger.info("Tracker slave environment: {}".format(self.tracker.slave_envs())) - self.tracker.start(self.n_workers) # Start parameter server that connects to the master self.logger.debug("Starting parameter server.") @@ -391,4 +467,4 @@ def __enter__(self): def __exit__(self, exc_type, exc_value, exc_traceback): self.stop() - return False \ No newline at end of file + return False From e9cea56993e3e859eb9ec6f89e5bb4ec7ab0bf00 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Thu, 25 Sep 2025 11:46:03 -0700 Subject: [PATCH 052/157] test distributed.py --- .../distributed.py | 369 ++++++------------ 1 file changed, 117 insertions(+), 252 deletions(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index ffedfa3e..c6c49ff3 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -19,26 +19,13 @@ import socket import sys import time -import threading -from typing import List, Dict, Any, Optional +import json -import numpy as np from retrying import retry +from xgboost import collective -# XGBoost 2.1.0 uses collective communication instead of rabit -try: - import xgboost as xgb - from xgboost import collective - # Try to import the built-in tracker if available - try: - from xgboost.tracker import RabitTracker - HAS_BUILTIN_TRACKER = True - except ImportError: - # Fallback for older versions or different installations - HAS_BUILTIN_TRACKER = False - -except ImportError: - raise ImportError("XGBoost 2.1.0 or later is required") +# This should point to xgb when the tracker is updated upstream +from sagemaker_xgboost_container.dmlc_patch import tracker LOCAL_HOSTNAME = "127.0.0.1" @@ -70,25 +57,25 @@ def rabit_run( connect_retry_timeout=3, update_rabit_args=False, ): - """Run execution function after initializing xgboost collective communication. + """Run execution function after initializing dmlc/rabit. - This method initializes collective communication twice: + This method initializes rabit twice: 1. To broadcast to all hosts which hosts should be included in training. 2. Run distributed xgb train() with just the hosts from above. - :param exec_fun: Function to run while collective is initialized. xgb.train() must run in the same process space - in order to utilize collective initialization. Note that the execution function must also take the args + :param exec_fun: Function to run while rabit is initialized. xgb.train() must run in the same process space + in order to utilize rabit initialization. Note that the execution function must also take the args 'is_distributed' and 'is_master'. :param args: Arguments to run execution function. :param include_in_training: Boolean if the current hosts should be used in training. This is done here so that all the hosts in the cluster know which hosts to include during training. :param hosts: :param current_host: - :param first_port: Port to use for the initial collective initialization. If None, defaults to 9099 - :param second_port: Port to use for second collective initialization. If None, this increments previous port by 1 + :param first_port: Port to use for the initial rabit initialization. If None, rabit defaults this to 9099 + :param second_port: Port to use for second rabit initialization. If None, this increments previous port by 1 :param max_connect_attempts :param connect_retry_timeout - :param update_rabit_args: Boolean to include collective information to args. If True, the following is added: + :param update_rabit_args: Boolean to include rabit information to args. If True, the following is added: is_master """ with Rabit( @@ -97,12 +84,12 @@ def rabit_run( port=first_port, max_connect_attempts=max_connect_attempts, connect_retry_timeout=connect_retry_timeout, - ) as rabit_helper: - hosts_with_data = rabit_helper.synchronize({"host": rabit_helper.current_host, "include_in_training": include_in_training}) + ) as rabit: + hosts_with_data = rabit.synchronize({"host": rabit.current_host, "include_in_training": include_in_training}) hosts_with_data = [record["host"] for record in hosts_with_data if record["include_in_training"]] # Keep track of port used, so that hosts trying to shutdown know when server is not available - previous_port = rabit_helper.master_port + previous_port = rabit.master_port if not include_in_training: logging.warning("Host {} not being used for distributed training.".format(current_host)) @@ -111,8 +98,8 @@ def rabit_run( second_rabit_port = second_port if second_port else previous_port + 1 if len(hosts_with_data) > 1: - # Set up collective with nodes that have data and an unused port so that previous slaves don't confuse it - # with the previous collective configuration + # Set up rabit with nodes that have data and an unused port so that previous slaves don't confuse it + # with the previous rabit configuration with Rabit( hosts=hosts_with_data, current_host=current_host, @@ -137,38 +124,17 @@ def rabit_run( class RabitHelper(object): - def __init__(self, is_master, current_host, master_port, is_collective_initialized=False): + def __init__(self, is_master, current_host, master_port): """This is returned by the Rabit context manager for useful cluster information and data synchronization. :param is_master: :param current_host: :param master_port: - :param is_collective_initialized: Whether XGBoost collective communication is initialized """ self.is_master = is_master + self.rank = collective.get_rank() self.current_host = current_host self.master_port = master_port - self._is_collective_initialized = is_collective_initialized - - if is_collective_initialized: - self.rank = collective.get_rank() - else: - self.rank = 0 - - def tracker_print(self, msg: str): - """Print message to tracker log. - - Equivalent to rabit.tracker_print() - prints a message to the centralized - logging facility for tracking progress across the distributed cluster. - - :param msg: Message to print to tracker log - """ - if self._is_collective_initialized: - # Use collective.print for distributed case - collective.print(msg) - else: - # For single node case, just use regular logging - logging.info(f"[Tracker] {msg}") def synchronize(self, data): """Synchronize data with the cluster. @@ -177,166 +143,48 @@ def synchronize(self, data): This allows things like determining which nodes have data or not. :param data: data to send to the cluster - :return: aggregated data from all the nodes in the cluster + :return: aggregated data from the all the nodes in the cluster """ - if not self._is_collective_initialized: - # Single node case - return [data] - results = [] - world_size = collective.get_world_size() - - for i in range(world_size): + data_str = json.dumps(data) + for i in range(collective.get_world_size()): if self.rank == i: logging.debug("Broadcasting data from self ({}) to others".format(self.rank)) - collective.broadcast(data, i) + collective.broadcast(data_str, i) results.append(data) else: logging.debug("Receiving data from {}".format(i)) - message = collective.broadcast(None, i) + message_str = collective.broadcast("", i) + message = json.loads(message_str) if message_str else None results.append(message) return results -class SimpleTracker: - """Simple tracker implementation for XGBoost collective communication""" - - def __init__(self, host_ip: str, n_workers: int, port: int): - self.host_ip = host_ip - self.n_workers = n_workers - self.port = port - self.server_socket = None - self.server_thread = None - self._shutdown = threading.Event() - self._workers_connected = 0 - self._worker_sockets = [] - - def start(self, n_workers: int): - """Start the tracker server""" - self.server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - self.server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - self.server_socket.bind((self.host_ip, self.port)) - self.server_socket.listen(n_workers) - self.server_socket.settimeout(1.0) # Non-blocking accept - - self.server_thread = threading.Thread(target=self._server_loop, daemon=True) - self.server_thread.start() - - # Give the server a moment to start - time.sleep(0.1) - - def _server_loop(self): - """DMLC tracker protocol server loop""" - try: - while not self._shutdown.is_set() and self._workers_connected < self.n_workers: - try: - client_socket, addr = self.server_socket.accept() - self._workers_connected += 1 - logging.debug(f"Tracker accepted connection {self._workers_connected}/{self.n_workers} from {addr}") - - # Handle the DMLC tracker protocol - self._handle_worker(client_socket, self._workers_connected - 1) - - except socket.timeout: - continue - except Exception as e: - if not self._shutdown.is_set(): - logging.debug(f"Tracker server error: {e}") - break - - # Keep server alive until shutdown - while not self._shutdown.is_set(): - time.sleep(0.1) - - except Exception as e: - logging.debug(f"Tracker server loop error: {e}") - finally: - for sock in self._worker_sockets: - try: - sock.close() - except: - pass - - def _handle_worker(self, client_socket, worker_rank): - """Handle DMLC tracker protocol for a worker""" - try: - self._worker_sockets.append(client_socket) - - # Send worker configuration following DMLC protocol - # Format: "start rank world_size\n" - config_msg = f"start {worker_rank} {self.n_workers}\n".encode() - client_socket.send(config_msg) - - # Keep connection alive for the worker - client_socket.settimeout(1.0) - while not self._shutdown.is_set(): - try: - data = client_socket.recv(1024) - if not data: - break - # Echo back any messages (simple heartbeat) - client_socket.send(b"ok\n") - except socket.timeout: - continue - except: - break - - except Exception as e: - logging.debug(f"Worker {worker_rank} handler error: {e}") - finally: - try: - client_socket.close() - except: - pass - - def join(self): - """Wait for the server thread to finish""" - self._shutdown.set() - if self.server_thread: - self.server_thread.join(timeout=5.0) - if self.server_socket: - try: - self.server_socket.close() - except: - pass - - def slave_envs(self): - """Return environment configuration for slaves""" - return { - 'DMLC_NUM_WORKER': str(self.n_workers), - 'DMLC_TRACKER_URI': self.host_ip, - 'DMLC_TRACKER_PORT': str(self.port) - } - - class Rabit(object): @staticmethod def _get_logger(current_host): - logging.basicConfig(format="%(name)s [{}]: %(message)s".format(current_host)) + logging.basicConfig(format="%(name) [{}]: %(message)s".format(current_host)) return logging.getLogger("RabitContextManager") def __init__( - self, - hosts: List[str], - current_host: Optional[str] = None, - master_host: Optional[str] = None, - port: Optional[int] = None, - max_connect_attempts: Optional[int] = None, - connect_retry_timeout: int = 3 + self, hosts, current_host=None, master_host=None, port=None, max_connect_attempts=None, connect_retry_timeout=3 ): - """Context manager for XGBoost collective communication initialization. + """Context manager for rabit initialization. :param hosts: List of hostnames :param current_host: Current hostname. If not provided, use 127.0.0.1. :param master_host: Master host hostname. If not provided, use alphabetically first hostname amongst hosts to ensure determinism in choosing master node. :param port: Port to connect to master, if not specified use 9099. - :param max_connect_attempts: Number of times to try connecting to tracker. If this arg is set + :param max_connect_attempts: Number of times to try connecting to RabitTracker. If this arg is set to None, try indefinitely. - :param connect_retry_timeout: Timeout value when attempting to connect to tracker. + :param connect_retry_timeout: Timeout value when attempting to connect to RabitTracker. This will be ignored if max_connect_attempt is None """ - # Get the host information + # Get the host information. This is used to identify the master host + # that will run the RabitTracker and also to work out how many clients/slaves + # exist (this will ensure that all-reduce is set up correctly and that + # it blocks whilst waiting for those hosts to process the data). if not current_host: current_host = LOCAL_HOSTNAME self.current_host = current_host @@ -347,7 +195,7 @@ def __init__( self.n_workers = len(self.hosts) self.logger.debug("Found hosts: {} [{}]".format(self.hosts, self.n_workers)) - # Use the first lexicographically named host as the master if not indicated otherwise + # We use the first lexicographically named host as the master if not indicated otherwise if not master_host: master_host = self.hosts[0] self.master_host = master_host @@ -356,6 +204,9 @@ def __init__( self.logger.debug("Is Master: {}".format(self.is_master_host)) self.logger.debug("Master: {}".format(self.master_host)) + # We start the RabitTracker on a known port on the first host. We can + # do this since SageMaker Training instances are single tenent and we + # don't need to worry about port contention. if port is None: port = 9099 self.logger.debug("No port specified using: {}".format(port)) @@ -368,103 +219,117 @@ def __init__( else: raise ValueError("max_connect_attempts must be None or an integer greater than 0.") self.connect_retry_timeout = connect_retry_timeout - - self.tracker = None def start(self): - """Start the collective communication process. + """Start the rabit process. - If current host is master host, initialize and start the tracker in the background. - All hosts then connect to the master host to set up collective communication. + If current host is master host, initialize and start the Rabit Tracker in the background. All hosts then connect + to the master host to set up Rabit rank. :return: Initialized RabitHelper, which includes helpful information such as is_master and port """ - if self.n_workers == 1: - # Single node case - no need for collective communication - self.logger.debug("Single node training - skipping collective communication setup") - return RabitHelper(True, self.current_host, self.port) - + self.rabit_context = None if self.is_master_host: - self.logger.debug("Master host. Starting Tracker.") - - if HAS_BUILTIN_TRACKER: - # Use XGBoost's built-in tracker if available - self.tracker = RabitTracker( - hostIP=self.current_host, - nslave=self.n_workers, - port=self.port, - port_end=self.port + 1 - ) - self.tracker.start(self.n_workers) - else: - # Fallback to our simple tracker - self.tracker = SimpleTracker( - host_ip=self.current_host, - n_workers=self.n_workers, - port=self.port - ) - self.tracker.start(self.n_workers) - - self.logger.info("Tracker slave environment: {}".format(self.tracker.slave_envs())) - - # Start parameter server that connects to the master + self.logger.debug("Master host. Starting Rabit Tracker.") + # The Rabit Tracker is a Python script that is responsible for + # allowing each instance of rabit to find its peers and organize + # itself in to a ring for all-reduce. It supports primitive failure + # recovery modes. + # + # It runs on a master node that each of the individual Rabit instances + # talk to. + self.rabit_context = tracker.RabitTracker( + hostIP=self.current_host, nslave=self.n_workers, port=self.port, port_end=self.port + 1 + ) + + # Useful logging to ensure that the tracker has started. + # These are the key-value config pairs that each of the rabit slaves + # should be initialized with. Since we have deterministically allocated + # the master host, its port, and the number of workers, we don't need + # to pass these out-of-band to each slave; but rely on the fact + # that each slave will calculate the exact same config as the server. + # + # TODO: should probably check that these match up what we pass below. + self.logger.info("Rabit slave environment: {}".format(self.rabit_context.slave_envs())) + + # This actually starts the RabitTracker in a background/daemon thread + # that will automatically exit when the main process has finished. + self.rabit_context.start(self.n_workers) + + # Start each parameter server that connects to the master. self.logger.debug("Starting parameter server.") - # Wait for tracker to be available + # Rabit runs as an in-process singleton library that can be configured once. + # Calling this multiple times will cause a seg-fault (without calling finalize). + # We pass it the environment variables that match up with the RabitTracker + # so that this instance can discover its peers (and recover from failure). + # + # First we check that the RabitTracker is up and running. Rabit actually + # breaks (at least on Mac OS X) if the server is not running before it + # begins to try to connect (its internal retries fail because they reuse + # the same socket instead of creating a new one). + # + # if self.max_connect_attempts is None, this will loop indefinitely. attempt = 0 successful_connection = False while not successful_connection and (self.max_connect_attempts is None or attempt < self.max_connect_attempts): with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: try: - self.logger.debug("Checking if Tracker is available.") + self.logger.debug("Checking if RabitTracker is available.") s.connect((self.master_host, self.port)) successful_connection = True - self.logger.debug("Successfully connected to Tracker.") + self.logger.debug("Successfully connected to RabitTracker.") except OSError: - self.logger.info("Failed to connect to Tracker on attempt {}".format(attempt)) + self.logger.info("Failed to connect to RabitTracker on attempt {}".format(attempt)) attempt += 1 self.logger.info("Sleeping for {} sec before retrying".format(self.connect_retry_timeout)) time.sleep(self.connect_retry_timeout) if not successful_connection: - self.logger.error("Failed to connect to Tracker after %s attempts", self.max_connect_attempts) - raise Exception("Failed to connect to Tracker") + self.logger.error("Failed to connect to Rabit Tracker after %s attempts", self.max_connect_attempts) + raise Exception("Failed to connect to Rabit Tracker") else: - self.logger.info("Connected to Tracker.") - - # Initialize XGBoost collective communication using environment variables - import os - os.environ['DMLC_NUM_WORKER'] = str(self.n_workers) - os.environ['DMLC_TRACKER_URI'] = self.master_host - os.environ['DMLC_TRACKER_PORT'] = str(self.port) - - # Initialize collective communication - collective.init() - - # Get rank information - rank = collective.get_rank() if self.n_workers > 1 else 0 - self.logger.debug("Collective started - Rank {}".format(rank)) + self.logger.info("Connected to RabitTracker.") + + # Initialize collective with the new API + collective.init({ + "DMLC_NUM_WORKER": str(self.n_workers), + "DMLC_TRACKER_URI": self.master_host, + "DMLC_TRACKER_PORT": str(self.port), + }) + + # We can check that the collective instance has successfully connected to the + # server by getting the rank of the server (e.g. its position in the ring). + # This should be unique for each instance. + self.logger.debug("Collective started - Rank {}".format(collective.get_rank())) self.logger.debug("Executing user code") - return RabitHelper(self.is_master_host, self.current_host, self.port, True) + # We can now run user-code. Since XGBoost runs in the same process space + # it will use the same instance of collective that we have configured. It has + # a number of checks throughout the learning process to see if it is running + # in distributed mode by calling collective APIs. If it is it will do the + # synchronization automatically. + # + # Hence we can now execute any XGBoost specific training code and it + # will be distributed automatically. + return RabitHelper(self.is_master_host, self.current_host, self.port) def stop(self): - """Shutdown parameter server and tracker.""" + """Shutdown parameter server. + + If current host is master host, also join the background thread that is running the master host. + """ self.logger.debug("Shutting down parameter server.") - # Clean up collective communication - if self.n_workers > 1: - try: - collective.finalize() - except Exception as e: - self.logger.debug(f"Error finalizing collective: {e}") - - if self.is_master_host and self.tracker: - self.tracker.join() + # This is the call that actually shuts down the collective server; and when + # all of the slaves have been shut down then the RabitTracker will close + # /shutdown itself. + collective.finalize() + if self.is_master_host: + self.rabit_context.join() def __enter__(self): return self.start() def __exit__(self, exc_type, exc_value, exc_traceback): - self.stop() - return False + return self.stop() \ No newline at end of file From c003caf5082cf272cdb0b0de0f260a733bf359cc Mon Sep 17 00:00:00 2001 From: Li Ning Date: Thu, 25 Sep 2025 12:05:58 -0700 Subject: [PATCH 053/157] replace rabit with dask --- .../distributed.py | 347 +++--------------- 1 file changed, 48 insertions(+), 299 deletions(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index c6c49ff3..f0c5769b 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -15,321 +15,70 @@ This is heavily inspired by the Dask version of XGBoost. Some of this code should be made simpler once the XGBoost library is improved. """ +# Dask-based replacement for distributed.py import logging -import socket import sys -import time -import json - -from retrying import retry -from xgboost import collective - -# This should point to xgb when the tracker is updated upstream -from sagemaker_xgboost_container.dmlc_patch import tracker +from dask.distributed import Client, as_completed +from dask import delayed +import xgboost as xgb LOCAL_HOSTNAME = "127.0.0.1" - -@retry(stop_max_delay=1000 * 60 * 15, wait_exponential_multiplier=100, wait_exponential_max=30000) -def _dns_lookup(host): - """Retrying dns lookup on host""" - return socket.gethostbyname(host) - - def wait_hostname_resolution(sm_hosts): - """Wait for the hostname resolution of the container. This is known behavior as the cluster - boots up and has been documented here: - https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-training-algo-running-container.html#your-algorithms-training-algo-running-container-dist-training - """ - for host in sm_hosts: - _dns_lookup(host) - - -def rabit_run( - exec_fun, - args, - include_in_training, - hosts, - current_host, - first_port=None, - second_port=None, - max_connect_attempts=None, - connect_retry_timeout=3, - update_rabit_args=False, -): - """Run execution function after initializing dmlc/rabit. - - This method initializes rabit twice: - 1. To broadcast to all hosts which hosts should be included in training. - 2. Run distributed xgb train() with just the hosts from above. - - :param exec_fun: Function to run while rabit is initialized. xgb.train() must run in the same process space - in order to utilize rabit initialization. Note that the execution function must also take the args - 'is_distributed' and 'is_master'. - :param args: Arguments to run execution function. - :param include_in_training: Boolean if the current hosts should be used in training. This is done here so that - all the hosts in the cluster know which hosts to include during training. - :param hosts: - :param current_host: - :param first_port: Port to use for the initial rabit initialization. If None, rabit defaults this to 9099 - :param second_port: Port to use for second rabit initialization. If None, this increments previous port by 1 - :param max_connect_attempts - :param connect_retry_timeout - :param update_rabit_args: Boolean to include rabit information to args. If True, the following is added: - is_master - """ - with Rabit( - hosts=hosts, - current_host=current_host, - port=first_port, - max_connect_attempts=max_connect_attempts, - connect_retry_timeout=connect_retry_timeout, - ) as rabit: - hosts_with_data = rabit.synchronize({"host": rabit.current_host, "include_in_training": include_in_training}) - hosts_with_data = [record["host"] for record in hosts_with_data if record["include_in_training"]] - - # Keep track of port used, so that hosts trying to shutdown know when server is not available - previous_port = rabit.master_port - + """Wait for hostname resolution - simplified for Dask""" + pass # Dask handles this internally + +def rabit_run(exec_fun, args, include_in_training, hosts, current_host, + first_port=None, second_port=None, max_connect_attempts=None, + connect_retry_timeout=3, update_rabit_args=False): + """Run execution using Dask instead of rabit""" + if not include_in_training: logging.warning("Host {} not being used for distributed training.".format(current_host)) sys.exit(0) - - second_rabit_port = second_port if second_port else previous_port + 1 - - if len(hosts_with_data) > 1: - # Set up rabit with nodes that have data and an unused port so that previous slaves don't confuse it - # with the previous rabit configuration - with Rabit( - hosts=hosts_with_data, - current_host=current_host, - port=second_rabit_port, - max_connect_attempts=max_connect_attempts, - connect_retry_timeout=connect_retry_timeout, - ) as cluster: - if update_rabit_args: - args.update({"is_master": cluster.is_master}) - exec_fun(**args) - - elif len(hosts_with_data) == 1: - logging.debug( - "Only 1 host with training data, " "starting single node training job from: {}".format(current_host) - ) + + # Use Dask client for coordination + scheduler_address = f"{hosts[0]}:{first_port or 8786}" + + with Client(scheduler_address) as client: if update_rabit_args: - args.update({"is_master": True}) - exec_fun(**args) - - else: - raise RuntimeError("No hosts received training data.") - - -class RabitHelper(object): - def __init__(self, is_master, current_host, master_port): - """This is returned by the Rabit context manager for useful cluster information and data synchronization. - - :param is_master: - :param current_host: - :param master_port: - """ - self.is_master = is_master - self.rank = collective.get_rank() + args.update({"is_master": client.scheduler_info()["address"] == scheduler_address}) + + # Execute function in distributed manner + future = client.submit(exec_fun, **args) + return future.result() + +class DaskHelper(object): + def __init__(self, client, current_host): + self.client = client self.current_host = current_host - self.master_port = master_port - + self.is_master = True # Simplified + def synchronize(self, data): - """Synchronize data with the cluster. - - This function allows every node to share state with every other node easily. - This allows things like determining which nodes have data or not. - - :param data: data to send to the cluster - :return: aggregated data from the all the nodes in the cluster - """ - results = [] - data_str = json.dumps(data) - for i in range(collective.get_world_size()): - if self.rank == i: - logging.debug("Broadcasting data from self ({}) to others".format(self.rank)) - collective.broadcast(data_str, i) - results.append(data) - else: - logging.debug("Receiving data from {}".format(i)) - message_str = collective.broadcast("", i) - message = json.loads(message_str) if message_str else None - results.append(message) - return results - + """Synchronize data using Dask""" + futures = self.client.scatter([data] * len(self.client.scheduler_info()["workers"])) + return self.client.gather(futures) class Rabit(object): - @staticmethod - def _get_logger(current_host): - logging.basicConfig(format="%(name) [{}]: %(message)s".format(current_host)) - return logging.getLogger("RabitContextManager") - - def __init__( - self, hosts, current_host=None, master_host=None, port=None, max_connect_attempts=None, connect_retry_timeout=3 - ): - """Context manager for rabit initialization. - - :param hosts: List of hostnames - :param current_host: Current hostname. If not provided, use 127.0.0.1. - :param master_host: Master host hostname. If not provided, use alphabetically first hostname amongst hosts - to ensure determinism in choosing master node. - :param port: Port to connect to master, if not specified use 9099. - :param max_connect_attempts: Number of times to try connecting to RabitTracker. If this arg is set - to None, try indefinitely. - :param connect_retry_timeout: Timeout value when attempting to connect to RabitTracker. - This will be ignored if max_connect_attempt is None - """ - # Get the host information. This is used to identify the master host - # that will run the RabitTracker and also to work out how many clients/slaves - # exist (this will ensure that all-reduce is set up correctly and that - # it blocks whilst waiting for those hosts to process the data). - if not current_host: - current_host = LOCAL_HOSTNAME - self.current_host = current_host - self.logger = self._get_logger(self.current_host) - self.logger.debug("Found current host.") - - self.hosts = sorted(hosts) - self.n_workers = len(self.hosts) - self.logger.debug("Found hosts: {} [{}]".format(self.hosts, self.n_workers)) - - # We use the first lexicographically named host as the master if not indicated otherwise - if not master_host: - master_host = self.hosts[0] - self.master_host = master_host - self.is_master_host = self.current_host == self.master_host - - self.logger.debug("Is Master: {}".format(self.is_master_host)) - self.logger.debug("Master: {}".format(self.master_host)) - - # We start the RabitTracker on a known port on the first host. We can - # do this since SageMaker Training instances are single tenent and we - # don't need to worry about port contention. - if port is None: - port = 9099 - self.logger.debug("No port specified using: {}".format(port)) - else: - self.logger.debug("Using provided port: {}".format(port)) - self.port = port - - if max_connect_attempts is None or max_connect_attempts > 0: - self.max_connect_attempts = max_connect_attempts - else: - raise ValueError("max_connect_attempts must be None or an integer greater than 0.") - self.connect_retry_timeout = connect_retry_timeout - + def __init__(self, hosts, current_host=None, master_host=None, port=None, + max_connect_attempts=None, connect_retry_timeout=3): + self.hosts = hosts + self.current_host = current_host or LOCAL_HOSTNAME + self.master_host = master_host or hosts[0] + self.port = port or 8786 + self.client = None + def start(self): - """Start the rabit process. - - If current host is master host, initialize and start the Rabit Tracker in the background. All hosts then connect - to the master host to set up Rabit rank. - - :return: Initialized RabitHelper, which includes helpful information such as is_master and port - """ - self.rabit_context = None - if self.is_master_host: - self.logger.debug("Master host. Starting Rabit Tracker.") - # The Rabit Tracker is a Python script that is responsible for - # allowing each instance of rabit to find its peers and organize - # itself in to a ring for all-reduce. It supports primitive failure - # recovery modes. - # - # It runs on a master node that each of the individual Rabit instances - # talk to. - self.rabit_context = tracker.RabitTracker( - hostIP=self.current_host, nslave=self.n_workers, port=self.port, port_end=self.port + 1 - ) - - # Useful logging to ensure that the tracker has started. - # These are the key-value config pairs that each of the rabit slaves - # should be initialized with. Since we have deterministically allocated - # the master host, its port, and the number of workers, we don't need - # to pass these out-of-band to each slave; but rely on the fact - # that each slave will calculate the exact same config as the server. - # - # TODO: should probably check that these match up what we pass below. - self.logger.info("Rabit slave environment: {}".format(self.rabit_context.slave_envs())) - - # This actually starts the RabitTracker in a background/daemon thread - # that will automatically exit when the main process has finished. - self.rabit_context.start(self.n_workers) - - # Start each parameter server that connects to the master. - self.logger.debug("Starting parameter server.") - - # Rabit runs as an in-process singleton library that can be configured once. - # Calling this multiple times will cause a seg-fault (without calling finalize). - # We pass it the environment variables that match up with the RabitTracker - # so that this instance can discover its peers (and recover from failure). - # - # First we check that the RabitTracker is up and running. Rabit actually - # breaks (at least on Mac OS X) if the server is not running before it - # begins to try to connect (its internal retries fail because they reuse - # the same socket instead of creating a new one). - # - # if self.max_connect_attempts is None, this will loop indefinitely. - attempt = 0 - successful_connection = False - while not successful_connection and (self.max_connect_attempts is None or attempt < self.max_connect_attempts): - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - try: - self.logger.debug("Checking if RabitTracker is available.") - s.connect((self.master_host, self.port)) - successful_connection = True - self.logger.debug("Successfully connected to RabitTracker.") - except OSError: - self.logger.info("Failed to connect to RabitTracker on attempt {}".format(attempt)) - attempt += 1 - self.logger.info("Sleeping for {} sec before retrying".format(self.connect_retry_timeout)) - time.sleep(self.connect_retry_timeout) - - if not successful_connection: - self.logger.error("Failed to connect to Rabit Tracker after %s attempts", self.max_connect_attempts) - raise Exception("Failed to connect to Rabit Tracker") - else: - self.logger.info("Connected to RabitTracker.") - - # Initialize collective with the new API - collective.init({ - "DMLC_NUM_WORKER": str(self.n_workers), - "DMLC_TRACKER_URI": self.master_host, - "DMLC_TRACKER_PORT": str(self.port), - }) - - # We can check that the collective instance has successfully connected to the - # server by getting the rank of the server (e.g. its position in the ring). - # This should be unique for each instance. - self.logger.debug("Collective started - Rank {}".format(collective.get_rank())) - self.logger.debug("Executing user code") - - # We can now run user-code. Since XGBoost runs in the same process space - # it will use the same instance of collective that we have configured. It has - # a number of checks throughout the learning process to see if it is running - # in distributed mode by calling collective APIs. If it is it will do the - # synchronization automatically. - # - # Hence we can now execute any XGBoost specific training code and it - # will be distributed automatically. - return RabitHelper(self.is_master_host, self.current_host, self.port) - + scheduler_address = f"{self.master_host}:{self.port}" + self.client = Client(scheduler_address) + return DaskHelper(self.client, self.current_host) + def stop(self): - """Shutdown parameter server. - - If current host is master host, also join the background thread that is running the master host. - """ - self.logger.debug("Shutting down parameter server.") - - # This is the call that actually shuts down the collective server; and when - # all of the slaves have been shut down then the RabitTracker will close - # /shutdown itself. - collective.finalize() - if self.is_master_host: - self.rabit_context.join() - + if self.client: + self.client.close() + def __enter__(self): return self.start() - + def __exit__(self, exc_type, exc_value, exc_traceback): - return self.stop() \ No newline at end of file + self.stop() \ No newline at end of file From 41d1794701eacadf315048f8a0c928f6bb852a0e Mon Sep 17 00:00:00 2001 From: Li Ning Date: Thu, 25 Sep 2025 13:03:26 -0700 Subject: [PATCH 054/157] replace rabit with collective --- .../distributed.py | 295 +++++++++++++++--- 1 file changed, 247 insertions(+), 48 deletions(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index f0c5769b..f071ed40 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -15,70 +15,269 @@ This is heavily inspired by the Dask version of XGBoost. Some of this code should be made simpler once the XGBoost library is improved. """ -# Dask-based replacement for distributed.py import logging +import socket import sys -from dask.distributed import Client, as_completed -from dask import delayed -import xgboost as xgb +import time +import json + +from retrying import retry +from xgboost import collective + +# This should point to xgb when the tracker is updated upstream +from sagemaker_xgboost_container.dmlc_patch import tracker LOCAL_HOSTNAME = "127.0.0.1" + +@retry(stop_max_delay=1000 * 60 * 15, wait_exponential_multiplier=100, wait_exponential_max=30000) +def _dns_lookup(host): + """Retrying dns lookup on host""" + return socket.gethostbyname(host) + + def wait_hostname_resolution(sm_hosts): - """Wait for hostname resolution - simplified for Dask""" - pass # Dask handles this internally - -def rabit_run(exec_fun, args, include_in_training, hosts, current_host, - first_port=None, second_port=None, max_connect_attempts=None, - connect_retry_timeout=3, update_rabit_args=False): - """Run execution using Dask instead of rabit""" - + """Wait for the hostname resolution of the container. This is known behavior as the cluster + boots up and has been documented here: + https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-training-algo-running-container.html#your-algorithms-training-algo-running-container-dist-training + """ + for host in sm_hosts: + _dns_lookup(host) + + +def rabit_run( + exec_fun, + args, + include_in_training, + hosts, + current_host, + first_port=None, + second_port=None, + max_connect_attempts=None, + connect_retry_timeout=3, + update_rabit_args=False, +): + """Run execution function after initializing dmlc/rabit. + + This method initializes rabit twice: + 1. To broadcast to all hosts which hosts should be included in training. + 2. Run distributed xgb train() with just the hosts from above. + + :param exec_fun: Function to run while rabit is initialized. xgb.train() must run in the same process space + in order to utilize rabit initialization. Note that the execution function must also take the args + 'is_distributed' and 'is_master'. + :param args: Arguments to run execution function. + :param include_in_training: Boolean if the current hosts should be used in training. This is done here so that + all the hosts in the cluster know which hosts to include during training. + :param hosts: + :param current_host: + :param first_port: Port to use for the initial rabit initialization. If None, rabit defaults this to 9099 + :param second_port: Port to use for second rabit initialization. If None, this increments previous port by 1 + :param max_connect_attempts + :param connect_retry_timeout + :param update_rabit_args: Boolean to include rabit information to args. If True, the following is added: + is_master + """ + with Rabit( + hosts=hosts, + current_host=current_host, + port=first_port, + max_connect_attempts=max_connect_attempts, + connect_retry_timeout=connect_retry_timeout, + ) as rabit_ctx: + hosts_with_data = rabit_ctx.synchronize({"host": rabit_ctx.current_host, "include_in_training": include_in_training}) + hosts_with_data = [record["host"] for record in hosts_with_data if record["include_in_training"]] + + # Keep track of port used, so that hosts trying to shutdown know when server is not available + previous_port = rabit_ctx.master_port + if not include_in_training: logging.warning("Host {} not being used for distributed training.".format(current_host)) sys.exit(0) - - # Use Dask client for coordination - scheduler_address = f"{hosts[0]}:{first_port or 8786}" - - with Client(scheduler_address) as client: + + second_rabit_port = second_port if second_port else previous_port + 1 + + if len(hosts_with_data) > 1: + # Set up rabit with nodes that have data and an unused port so that previous slaves don't confuse it + # with the previous rabit configuration + with Rabit( + hosts=hosts_with_data, + current_host=current_host, + port=second_rabit_port, + max_connect_attempts=max_connect_attempts, + connect_retry_timeout=connect_retry_timeout, + ) as cluster: + if update_rabit_args: + args.update({"is_master": cluster.is_master}) + exec_fun(**args) + + elif len(hosts_with_data) == 1: + logging.debug( + "Only 1 host with training data, " "starting single node training job from: {}".format(current_host) + ) if update_rabit_args: - args.update({"is_master": client.scheduler_info()["address"] == scheduler_address}) - - # Execute function in distributed manner - future = client.submit(exec_fun, **args) - return future.result() - -class DaskHelper(object): - def __init__(self, client, current_host): - self.client = client + args.update({"is_master": True}) + exec_fun(**args) + + else: + raise RuntimeError("No hosts received training data.") + + +class RabitHelper(object): + def __init__(self, is_master, current_host, master_port): + """This is returned by the Rabit context manager for useful cluster information and data synchronization. + + :param is_master: + :param current_host: + :param master_port: + """ + self.is_master = is_master + self.rank = collective.get_rank() self.current_host = current_host - self.is_master = True # Simplified - + self.master_port = master_port + def synchronize(self, data): - """Synchronize data using Dask""" - futures = self.client.scatter([data] * len(self.client.scheduler_info()["workers"])) - return self.client.gather(futures) + """Synchronize data with the cluster. + + This function allows every node to share state with every other node easily. + This allows things like determining which nodes have data or not. + + :param data: data to send to the cluster + :return: aggregated data from the all the nodes in the cluster + """ + results = [] + data_str = json.dumps(data) + for i in range(collective.get_world_size()): + if self.rank == i: + logging.debug("Broadcasting data from self ({}) to others".format(self.rank)) + collective.broadcast(data_str, i) + results.append(data) + else: + logging.debug("Receiving data from {}".format(i)) + message_str = collective.broadcast("", i) + message = json.loads(message_str) if message_str else None + results.append(message) + return results + class Rabit(object): - def __init__(self, hosts, current_host=None, master_host=None, port=None, - max_connect_attempts=None, connect_retry_timeout=3): - self.hosts = hosts - self.current_host = current_host or LOCAL_HOSTNAME - self.master_host = master_host or hosts[0] - self.port = port or 8786 - self.client = None - + @staticmethod + def _get_logger(current_host): + logging.basicConfig(format="%(name) [{}]: %(message)s".format(current_host)) + return logging.getLogger("RabitContextManager") + + def __init__( + self, hosts, current_host=None, master_host=None, port=None, max_connect_attempts=None, connect_retry_timeout=3 + ): + """Context manager for rabit initialization. + + :param hosts: List of hostnames + :param current_host: Current hostname. If not provided, use 127.0.0.1. + :param master_host: Master host hostname. If not provided, use alphabetically first hostname amongst hosts + to ensure determinism in choosing master node. + :param port: Port to connect to master, if not specified use 9099. + :param max_connect_attempts: Number of times to try connecting to RabitTracker. If this arg is set + to None, try indefinitely. + :param connect_retry_timeout: Timeout value when attempting to connect to RabitTracker. + This will be ignored if max_connect_attempt is None + """ + if not current_host: + current_host = LOCAL_HOSTNAME + self.current_host = current_host + self.logger = self._get_logger(self.current_host) + self.logger.debug("Found current host.") + + self.hosts = sorted(hosts) + self.n_workers = len(self.hosts) + self.logger.debug("Found hosts: {} [{}]".format(self.hosts, self.n_workers)) + + if not master_host: + master_host = self.hosts[0] + self.master_host = master_host + self.is_master_host = self.current_host == self.master_host + + self.logger.debug("Is Master: {}".format(self.is_master_host)) + self.logger.debug("Master: {}".format(self.master_host)) + + if port is None: + port = 9099 + self.logger.debug("No port specified using: {}".format(port)) + else: + self.logger.debug("Using provided port: {}".format(port)) + self.port = port + + if max_connect_attempts is None or max_connect_attempts > 0: + self.max_connect_attempts = max_connect_attempts + else: + raise ValueError("max_connect_attempts must be None or an integer greater than 0.") + self.connect_retry_timeout = connect_retry_timeout + def start(self): - scheduler_address = f"{self.master_host}:{self.port}" - self.client = Client(scheduler_address) - return DaskHelper(self.client, self.current_host) - + """Start the rabit process. + + If current host is master host, initialize and start the Rabit Tracker in the background. All hosts then connect + to the master host to set up Rabit rank. + + :return: Initialized RabitHelper, which includes helpful information such as is_master and port + """ + self.rabit_context = None + if self.is_master_host: + self.logger.debug("Master host. Starting Rabit Tracker.") + self.rabit_context = tracker.RabitTracker( + hostIP=self.current_host, nslave=self.n_workers, port=self.port, port_end=self.port + 1 + ) + + self.logger.info("Rabit slave environment: {}".format(self.rabit_context.slave_envs())) + self.rabit_context.start(self.n_workers) + + self.logger.debug("Starting parameter server.") + + attempt = 0 + successful_connection = False + while not successful_connection and (self.max_connect_attempts is None or attempt < self.max_connect_attempts): + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + try: + self.logger.debug("Checking if RabitTracker is available.") + s.connect((self.master_host, self.port)) + successful_connection = True + self.logger.debug("Successfully connected to RabitTracker.") + except OSError: + self.logger.info("Failed to connect to RabitTracker on attempt {}".format(attempt)) + attempt += 1 + self.logger.info("Sleeping for {} sec before retrying".format(self.connect_retry_timeout)) + time.sleep(self.connect_retry_timeout) + + if not successful_connection: + self.logger.error("Failed to connect to Rabit Tracker after %s attempts", self.max_connect_attempts) + raise Exception("Failed to connect to Rabit Tracker") + else: + self.logger.info("Connected to RabitTracker.") + + # Initialize collective with the new API + collective.init({ + "DMLC_NUM_WORKER": str(self.n_workers), + "DMLC_TRACKER_URI": self.master_host, + "DMLC_TRACKER_PORT": str(self.port), + }) + + self.logger.debug("Collective started - Rank {}".format(collective.get_rank())) + self.logger.debug("Executing user code") + + return RabitHelper(self.is_master_host, self.current_host, self.port) + def stop(self): - if self.client: - self.client.close() - + """Shutdown parameter server. + + If current host is master host, also join the background thread that is running the master host. + """ + self.logger.debug("Shutting down parameter server.") + + collective.finalize() + if self.is_master_host: + self.rabit_context.join() + def __enter__(self): return self.start() - + def __exit__(self, exc_type, exc_value, exc_traceback): - self.stop() \ No newline at end of file + return self.stop() \ No newline at end of file From 1d9372ec38e368daf330d81e2831523e58b6f375 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Thu, 25 Sep 2025 13:51:47 -0700 Subject: [PATCH 055/157] replace rabit with collective --- src/sagemaker_xgboost_container/distributed.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index f071ed40..5e39b0e0 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -254,11 +254,11 @@ def start(self): self.logger.info("Connected to RabitTracker.") # Initialize collective with the new API - collective.init({ - "DMLC_NUM_WORKER": str(self.n_workers), - "DMLC_TRACKER_URI": self.master_host, - "DMLC_TRACKER_PORT": str(self.port), - }) + import os + os.environ["DMLC_NUM_WORKER"] = str(self.n_workers) + os.environ["DMLC_TRACKER_URI"] = self.master_host + os.environ["DMLC_TRACKER_PORT"] = str(self.port) + collective.init() self.logger.debug("Collective started - Rank {}".format(collective.get_rank())) self.logger.debug("Executing user code") From af3b5f8c4b9be964459e15f014efb884a6f4551a Mon Sep 17 00:00:00 2001 From: Li Ning Date: Thu, 25 Sep 2025 14:53:27 -0700 Subject: [PATCH 056/157] replace rabit with collective --- .../distributed.py | 99 ++++++++----------- 1 file changed, 43 insertions(+), 56 deletions(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index 5e39b0e0..972ee4be 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -20,13 +20,11 @@ import sys import time import json +import os from retrying import retry from xgboost import collective -# This should point to xgb when the tracker is updated upstream -from sagemaker_xgboost_container.dmlc_patch import tracker - LOCAL_HOSTNAME = "127.0.0.1" @@ -132,9 +130,19 @@ def __init__(self, is_master, current_host, master_port): :param master_port: """ self.is_master = is_master - self.rank = collective.get_rank() self.current_host = current_host self.master_port = master_port + + try: + if collective.is_initialized(): + self.rank = collective.get_rank() + self.world_size = collective.get_world_size() + else: + self.rank = 0 + self.world_size = 1 + except: + self.rank = 0 + self.world_size = 1 def synchronize(self, data): """Synchronize data with the cluster. @@ -145,9 +153,13 @@ def synchronize(self, data): :param data: data to send to the cluster :return: aggregated data from the all the nodes in the cluster """ + # For single node or when collective is not initialized, just return the data + if self.world_size == 1 or not collective.is_initialized(): + return [data] + results = [] data_str = json.dumps(data) - for i in range(collective.get_world_size()): + for i in range(self.world_size): if self.rank == i: logging.debug("Broadcasting data from self ({}) to others".format(self.rank)) collective.broadcast(data_str, i) @@ -213,68 +225,43 @@ def __init__( self.connect_retry_timeout = connect_retry_timeout def start(self): - """Start the rabit process. + """Start the collective process. - If current host is master host, initialize and start the Rabit Tracker in the background. All hosts then connect - to the master host to set up Rabit rank. + Initialize XGBoost collective for distributed training. :return: Initialized RabitHelper, which includes helpful information such as is_master and port """ - self.rabit_context = None - if self.is_master_host: - self.logger.debug("Master host. Starting Rabit Tracker.") - self.rabit_context = tracker.RabitTracker( - hostIP=self.current_host, nslave=self.n_workers, port=self.port, port_end=self.port + 1 - ) - - self.logger.info("Rabit slave environment: {}".format(self.rabit_context.slave_envs())) - self.rabit_context.start(self.n_workers) - - self.logger.debug("Starting parameter server.") - - attempt = 0 - successful_connection = False - while not successful_connection and (self.max_connect_attempts is None or attempt < self.max_connect_attempts): - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - try: - self.logger.debug("Checking if RabitTracker is available.") - s.connect((self.master_host, self.port)) - successful_connection = True - self.logger.debug("Successfully connected to RabitTracker.") - except OSError: - self.logger.info("Failed to connect to RabitTracker on attempt {}".format(attempt)) - attempt += 1 - self.logger.info("Sleeping for {} sec before retrying".format(self.connect_retry_timeout)) - time.sleep(self.connect_retry_timeout) - - if not successful_connection: - self.logger.error("Failed to connect to Rabit Tracker after %s attempts", self.max_connect_attempts) - raise Exception("Failed to connect to Rabit Tracker") - else: - self.logger.info("Connected to RabitTracker.") - - # Initialize collective with the new API - import os + self.logger.debug("Starting collective communication.") + + # Set environment variables for collective os.environ["DMLC_NUM_WORKER"] = str(self.n_workers) os.environ["DMLC_TRACKER_URI"] = self.master_host os.environ["DMLC_TRACKER_PORT"] = str(self.port) - collective.init() - - self.logger.debug("Collective started - Rank {}".format(collective.get_rank())) - self.logger.debug("Executing user code") - + + # For single node, skip collective initialization + if self.n_workers == 1: + self.logger.debug("Single worker detected, skipping collective init") + return RabitHelper(True, self.current_host, self.port) + + try: + collective.init() + self.logger.debug("Collective started - Rank {}".format(collective.get_rank())) + except Exception as e: + self.logger.warning("Collective init failed: {}, falling back to single node".format(e)) + return RabitHelper(True, self.current_host, self.port) + return RabitHelper(self.is_master_host, self.current_host, self.port) def stop(self): - """Shutdown parameter server. - - If current host is master host, also join the background thread that is running the master host. + """Shutdown collective communication. """ - self.logger.debug("Shutting down parameter server.") - - collective.finalize() - if self.is_master_host: - self.rabit_context.join() + self.logger.debug("Shutting down collective.") + + try: + if collective.is_initialized(): + collective.finalize() + except Exception as e: + self.logger.debug("Collective finalize failed: {}".format(e)) def __enter__(self): return self.start() From bcd2bc933863754ad7fe0ba598f900efeadd4568 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Thu, 25 Sep 2025 15:47:16 -0700 Subject: [PATCH 057/157] fmt --- .../hyperparameter_validation.py | 4 +- .../hyperparameter_validation.py | 2 +- .../algorithm_mode/serve.py | 2 +- src/sagemaker_xgboost_container/callback.py | 2 +- .../checkpointing.py | 43 ++++++++++++------- src/sagemaker_xgboost_container/data_utils.py | 6 +-- .../distributed.py | 26 +++++------ .../dmlc_patch/tracker.py | 1 + .../serving_mms.py | 12 +++--- test/unit/algorithm_mode/test_serve.py | 2 +- test/unit/algorithm_mode/test_serve_utils.py | 8 +++- test/unit/test_checkpointing.py | 15 ++++--- test/utils/local_mode.py | 2 +- 13 files changed, 74 insertions(+), 51 deletions(-) diff --git a/src/sagemaker_algorithm_toolkit/hyperparameter_validation.py b/src/sagemaker_algorithm_toolkit/hyperparameter_validation.py index ddc64cdf..75c7c60a 100644 --- a/src/sagemaker_algorithm_toolkit/hyperparameter_validation.py +++ b/src/sagemaker_algorithm_toolkit/hyperparameter_validation.py @@ -374,8 +374,8 @@ def _format_range_value(self, open_, closed, default): return str(open_ if open_ is not None else closed if closed is not None else default) def format_as_integer(self): - max_neg_signed_int = -(2 ** 31) - max_signed_int = 2 ** 31 - 1 + max_neg_signed_int = -(2**31) + max_signed_int = 2**31 - 1 return ( self._format_range_value(self.min_open, self.min_closed, max_neg_signed_int), self._format_range_value(self.max_open, self.max_closed, max_signed_int), diff --git a/src/sagemaker_xgboost_container/algorithm_mode/hyperparameter_validation.py b/src/sagemaker_xgboost_container/algorithm_mode/hyperparameter_validation.py index e42c8e7b..02f7bde5 100644 --- a/src/sagemaker_xgboost_container/algorithm_mode/hyperparameter_validation.py +++ b/src/sagemaker_xgboost_container/algorithm_mode/hyperparameter_validation.py @@ -321,7 +321,7 @@ def interaction_constraints_validator(value, dependencies): required=False, ), hpv.IntegerHyperparameter( - name="seed", range=hpv.Interval(min_open=-(2 ** 31), max_open=2 ** 31 - 1), required=False + name="seed", range=hpv.Interval(min_open=-(2**31), max_open=2**31 - 1), required=False ), hpv.IntegerHyperparameter(name="num_parallel_tree", range=hpv.Interval(min_closed=1), required=False), hpv.CategoricalHyperparameter(name="save_model_on_termination", range=["true", "false"], required=False), diff --git a/src/sagemaker_xgboost_container/algorithm_mode/serve.py b/src/sagemaker_xgboost_container/algorithm_mode/serve.py index 877cb48c..1812bb3c 100644 --- a/src/sagemaker_xgboost_container/algorithm_mode/serve.py +++ b/src/sagemaker_xgboost_container/algorithm_mode/serve.py @@ -149,7 +149,7 @@ def execution_parameters(): parameters = { "MaxConcurrentTransforms": number_of_workers(), "BatchStrategy": "MULTI_RECORD", - "MaxPayloadInMB": int(PARSED_MAX_CONTENT_LENGTH / (1024 ** 2)), + "MaxPayloadInMB": int(PARSED_MAX_CONTENT_LENGTH / (1024**2)), } except Exception as e: return flask.Response( diff --git a/src/sagemaker_xgboost_container/callback.py b/src/sagemaker_xgboost_container/callback.py index 50cde3b5..598db57f 100644 --- a/src/sagemaker_xgboost_container/callback.py +++ b/src/sagemaker_xgboost_container/callback.py @@ -82,7 +82,7 @@ def get_callbacks( if checkpoint_dir: save_checkpoint = xgb.callback.TrainingCheckPoint( directory=checkpoint_dir, iterations=iteration, name=checkpointing.CHECKPOINT_FILENAME - ) + ) callbacks.append(save_checkpoint) if save_model_on_termination == "true": diff --git a/src/sagemaker_xgboost_container/checkpointing.py b/src/sagemaker_xgboost_container/checkpointing.py index f57110fa..79fd755b 100644 --- a/src/sagemaker_xgboost_container/checkpointing.py +++ b/src/sagemaker_xgboost_container/checkpointing.py @@ -7,6 +7,7 @@ import xgboost as xgb from typing import Optional + # from xgboost import rabit from xgboost.callback import EvaluationMonitor from xgboost.core import XGBoostError @@ -54,10 +55,17 @@ def train(train_args, checkpoint_dir): logging.info("Resuming from iteration %s", start_iteration) callbacks = train_args.get("callbacks", []) - callbacks.append(print_checkpointed_evaluation(start_iteration=start_iteration, - end_iteration=train_args["num_boost_round"])) - callbacks.append(save_checkpoint(checkpoint_dir, start_iteration=start_iteration, iteration=start_iteration, - end_iteration=train_args["num_boost_round"])) + callbacks.append( + print_checkpointed_evaluation(start_iteration=start_iteration, end_iteration=train_args["num_boost_round"]) + ) + callbacks.append( + save_checkpoint( + checkpoint_dir, + start_iteration=start_iteration, + iteration=start_iteration, + end_iteration=train_args["num_boost_round"], + ) + ) train_args["verbose_eval"] = False # suppress xgboost's print_evaluation() train_args["xgb_model"] = xgb_model @@ -164,16 +172,21 @@ def _sort_checkpoints(checkpoint_files): return checkpoint_files -def save_checkpoint(checkpoint_dir, start_iteration=0, max_to_keep=5, num_round=None, rank=0, iteration=0, - end_iteration=None): +def save_checkpoint( + checkpoint_dir, start_iteration=0, max_to_keep=5, num_round=None, rank=0, iteration=0, end_iteration=None +): """A callback function that saves checkpoints to disk. This is a wrapper function around SaveCheckpoint. For details, see SaveCheckpoint. """ return SaveCheckpointCallBack( - checkpoint_dir=checkpoint_dir, start_iteration=start_iteration, max_to_keep=max_to_keep, num_round=num_round, - iteration=iteration, end_iteration=end_iteration + checkpoint_dir=checkpoint_dir, + start_iteration=start_iteration, + max_to_keep=max_to_keep, + num_round=num_round, + iteration=iteration, + end_iteration=end_iteration, ) @@ -220,12 +233,13 @@ class SaveCheckpointCallBack(xgb.callback.TrainingCallback): Example: >>> save_checkpoint = SaveCheckpoint("/opt/ml/checkpoints") >>> xgboost.train(prams, dtrain, callbacks=[save_checkpoint]) - """ + """ SENTINEL = None - def __init__(self, checkpoint_dir, start_iteration=0, max_to_keep=5, num_round=None, rank=0, iteration=0, - end_iteration=None): + def __init__( + self, checkpoint_dir, start_iteration=0, max_to_keep=5, num_round=None, rank=0, iteration=0, end_iteration=None + ): """Init SaveCheckpoint with checkpoint_dir""" self.checkpoint_dir = checkpoint_dir self.max_to_keep = max_to_keep @@ -295,6 +309,7 @@ def start(self): When training is complete, we put SENTINEL on the queue, and when we see the SENTINEL, we clean up and exit the thread. """ + def _is_uploading(path): uploading = os.path.isfile(path + FILE_LOCK_SUFFIX) uploaded = os.path.isfile(path + FILE_SAFE_SUFFIX) @@ -344,9 +359,7 @@ def _delete_uploaded_files_and_cleanup(): _delete_uploaded_files() _cleanup() - self.thread = threading.Thread( - target=_delete_uploaded_files_and_cleanup, - daemon=True) + self.thread = threading.Thread(target=_delete_uploaded_files_and_cleanup, daemon=True) self.thread.start() def stop(self): @@ -437,4 +450,4 @@ def __init__(self, intermediate_model_dir, model_name, is_master): def after_iteration(self, model, epoch, evals_log) -> bool: if self.is_master: self.callback.save_intermediate_model(model) - return False \ No newline at end of file + return False diff --git a/src/sagemaker_xgboost_container/data_utils.py b/src/sagemaker_xgboost_container/data_utils.py index 80c569a8..83dac24e 100644 --- a/src/sagemaker_xgboost_container/data_utils.py +++ b/src/sagemaker_xgboost_container/data_utils.py @@ -531,7 +531,7 @@ def _get_pipe_mode_files_path(data_path: Union[List[str], str]) -> List[str]: def _make_symlinks_from_a_folder(dest_path: str, data_path: str, depth: int): - if (depth > MAX_FOLDER_DEPTH): + if depth > MAX_FOLDER_DEPTH: raise exc.UserError(f"Folder depth exceed the limit: {MAX_FOLDER_DEPTH}.") if os.path.isfile(data_path): @@ -560,7 +560,7 @@ def _make_symlinks_from_a_folder_with_warning(dest_path: str, data_path: str): if (not os.path.exists(dest_path)) or (not os.path.exists(data_path)): raise exc.AlgorithmError(f"Unable to create symlinks as {data_path} or {dest_path} doesn't exist ") - if (not os.path.isdir(dest_path)): + if not os.path.isdir(dest_path): raise exc.AlgorithmError(f"Unable to create symlinks as dest_path {dest_path} is not a dir") try: @@ -571,7 +571,7 @@ def _make_symlinks_from_a_folder_with_warning(dest_path: str, data_path: str): f"The depth of folder {data_path} exceed the limit {MAX_FOLDER_DEPTH}." f" Files in deeper sub dirs won't be loaded." f" Please adjust the folder structure accordingly." - ) + ) def _get_file_mode_files_path(data_path: Union[List[str], str]) -> List[str]: diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index 972ee4be..a438d76d 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -18,7 +18,6 @@ import logging import socket import sys -import time import json import os @@ -83,7 +82,9 @@ def rabit_run( max_connect_attempts=max_connect_attempts, connect_retry_timeout=connect_retry_timeout, ) as rabit_ctx: - hosts_with_data = rabit_ctx.synchronize({"host": rabit_ctx.current_host, "include_in_training": include_in_training}) + hosts_with_data = rabit_ctx.synchronize( + {"host": rabit_ctx.current_host, "include_in_training": include_in_training} + ) hosts_with_data = [record["host"] for record in hosts_with_data if record["include_in_training"]] # Keep track of port used, so that hosts trying to shutdown know when server is not available @@ -132,7 +133,7 @@ def __init__(self, is_master, current_host, master_port): self.is_master = is_master self.current_host = current_host self.master_port = master_port - + try: if collective.is_initialized(): self.rank = collective.get_rank() @@ -140,7 +141,7 @@ def __init__(self, is_master, current_host, master_port): else: self.rank = 0 self.world_size = 1 - except: + except Exception: self.rank = 0 self.world_size = 1 @@ -156,7 +157,7 @@ def synchronize(self, data): # For single node or when collective is not initialized, just return the data if self.world_size == 1 or not collective.is_initialized(): return [data] - + results = [] data_str = json.dumps(data) for i in range(self.world_size): @@ -232,31 +233,30 @@ def start(self): :return: Initialized RabitHelper, which includes helpful information such as is_master and port """ self.logger.debug("Starting collective communication.") - + # Set environment variables for collective os.environ["DMLC_NUM_WORKER"] = str(self.n_workers) os.environ["DMLC_TRACKER_URI"] = self.master_host os.environ["DMLC_TRACKER_PORT"] = str(self.port) - + # For single node, skip collective initialization if self.n_workers == 1: self.logger.debug("Single worker detected, skipping collective init") return RabitHelper(True, self.current_host, self.port) - + try: collective.init() self.logger.debug("Collective started - Rank {}".format(collective.get_rank())) except Exception as e: self.logger.warning("Collective init failed: {}, falling back to single node".format(e)) return RabitHelper(True, self.current_host, self.port) - + return RabitHelper(self.is_master_host, self.current_host, self.port) def stop(self): - """Shutdown collective communication. - """ + """Shutdown collective communication.""" self.logger.debug("Shutting down collective.") - + try: if collective.is_initialized(): collective.finalize() @@ -267,4 +267,4 @@ def __enter__(self): return self.start() def __exit__(self, exc_type, exc_value, exc_traceback): - return self.stop() \ No newline at end of file + return self.stop() diff --git a/src/sagemaker_xgboost_container/dmlc_patch/tracker.py b/src/sagemaker_xgboost_container/dmlc_patch/tracker.py index c4b782ec..8c0130a8 100644 --- a/src/sagemaker_xgboost_container/dmlc_patch/tracker.py +++ b/src/sagemaker_xgboost_container/dmlc_patch/tracker.py @@ -17,6 +17,7 @@ Tianqi Chen """ + # pylint: disable=invalid-name, missing-docstring, too-many-arguments, too-many-locals # pylint: disable=too-many-branches, too-many-statements from __future__ import absolute_import diff --git a/src/sagemaker_xgboost_container/serving_mms.py b/src/sagemaker_xgboost_container/serving_mms.py index c247aec8..70b8d20d 100644 --- a/src/sagemaker_xgboost_container/serving_mms.py +++ b/src/sagemaker_xgboost_container/serving_mms.py @@ -31,8 +31,8 @@ USER_HANDLER_SERVICE = user_module_handler_service.__name__ PORT = 8080 -DEFAULT_MAX_CONTENT_LEN = 6 * 1024 ** 2 -MAX_CONTENT_LEN_LIMIT = 20 * 1024 ** 2 +DEFAULT_MAX_CONTENT_LEN = 6 * 1024**2 +MAX_CONTENT_LEN_LIMIT = 20 * 1024**2 MMS_NUM_MODEL_WORKERS_INIT = 1 MMS_MODEL_JOB_QUEUE_SIZE_DEFAULT = 100 @@ -85,7 +85,7 @@ def _set_mms_configs(is_multi_model, handler): max_job_queue_size = 2 * max_workers # Max heap size = (max workers + max job queue size) * max payload size * 1.2 (20% buffer) + 128 (base amount) - max_heap_size = ceil((max_workers + max_job_queue_size) * (int(max_content_length) / 1024 ** 2) * 1.2) + 128 + max_heap_size = ceil((max_workers + max_job_queue_size) * (int(max_content_length) / 1024**2) * 1.2) + 128 os.environ["SAGEMAKER_MMS_MODEL_STORE"] = "/" os.environ["SAGEMAKER_MMS_LOAD_MODELS"] = "" @@ -104,8 +104,10 @@ def _set_mms_configs(is_multi_model, handler): _set_default_if_not_exist("SAGEMAKER_MAX_DIRECT_MEMORY_SIZE", os.environ["SAGEMAKER_MAX_HEAP_SIZE"]) disable_container_support_flag = "" - if "SAGEMAKER_DISABLE_CONTAINER_SUPPORT" in os.environ \ - and os.environ["SAGEMAKER_DISABLE_CONTAINER_SUPPORT"] == "true": + if ( + "SAGEMAKER_DISABLE_CONTAINER_SUPPORT" in os.environ + and os.environ["SAGEMAKER_DISABLE_CONTAINER_SUPPORT"] == "true" + ): disable_container_support_flag = " -XX:-UseContainerSupport" MMS_CONFIG_FILE_PATH = get_mms_config_file_path() diff --git a/test/unit/algorithm_mode/test_serve.py b/test/unit/algorithm_mode/test_serve.py index 6dd7b7f1..452777b5 100644 --- a/test/unit/algorithm_mode/test_serve.py +++ b/test/unit/algorithm_mode/test_serve.py @@ -28,7 +28,7 @@ def test_default_execution_parameters(): assert parsed_exec_params_response["BatchStrategy"] == "MULTI_RECORD" -@patch("sagemaker_xgboost_container.algorithm_mode.serve.PARSED_MAX_CONTENT_LENGTH", 19 * 1024 ** 2) +@patch("sagemaker_xgboost_container.algorithm_mode.serve.PARSED_MAX_CONTENT_LENGTH", 19 * 1024**2) def test_max_execution_parameters(): execution_parameters_response = serve.execution_parameters() diff --git a/test/unit/algorithm_mode/test_serve_utils.py b/test/unit/algorithm_mode/test_serve_utils.py index de54af48..ccb8a45b 100644 --- a/test/unit/algorithm_mode/test_serve_utils.py +++ b/test/unit/algorithm_mode/test_serve_utils.py @@ -164,8 +164,12 @@ def test_get_selected_content_keys_error(): [ (TEST_RAW_PREDICTIONS, TEST_KEYS_BINARY_LOG, serve_utils.BINARY_LOG, TEST_PREDICTIONS_BINARY_LOG), (TEST_RAW_PREDICTIONS_REG_LOG, TEST_KEYS_REG_LOG, serve_utils.REG_LOG, TEST_PREDICTIONS_REG_LOG), - (TEST_RAW_PREDICTIONS_REG_ABSOLUTEERR, TEST_KEYS_REG_ABSOLUTEERR, serve_utils.REG_ABSOLUTEERR, - TEST_PREDICTIONS_REG_ABSOLUTEERR), + ( + TEST_RAW_PREDICTIONS_REG_ABSOLUTEERR, + TEST_KEYS_REG_ABSOLUTEERR, + serve_utils.REG_ABSOLUTEERR, + TEST_PREDICTIONS_REG_ABSOLUTEERR, + ), ], ) def test_get_selected_predictions_all_keys(test_raw_predictions, selected_keys, objective, expected_predictions): diff --git a/test/unit/test_checkpointing.py b/test/unit/test_checkpointing.py index aea12f64..2297b800 100644 --- a/test/unit/test_checkpointing.py +++ b/test/unit/test_checkpointing.py @@ -40,8 +40,9 @@ def test_SaveCheckpoint_single_iteration(self, model): iteration = 42 end_iteration = 100 - callback = SaveCheckpointCallBack(checkpoint_dir=self.test_dir, rank=rank, iteration=iteration, - end_iteration=end_iteration) + callback = SaveCheckpointCallBack( + checkpoint_dir=self.test_dir, rank=rank, iteration=iteration, end_iteration=end_iteration + ) callback(model) @@ -57,8 +58,9 @@ def test_SaveCheckpoint_multiple_from_scratch(self, model): rank = 0 end_iteration = 100 - callback = SaveCheckpointCallBack(checkpoint_dir=self.test_dir, max_to_keep=3, rank=rank, - end_iteration=end_iteration) + callback = SaveCheckpointCallBack( + checkpoint_dir=self.test_dir, max_to_keep=3, rank=rank, end_iteration=end_iteration + ) for iteration in range(end_iteration): callback(model) @@ -110,8 +112,9 @@ def test_SaveCheckpoint_uploading(self, model): rank = 0 end_iteration = 100 - callback = SaveCheckpointCallBack(checkpoint_dir=self.test_dir, max_to_keep=1, rank=rank, - end_iteration=end_iteration) + callback = SaveCheckpointCallBack( + checkpoint_dir=self.test_dir, max_to_keep=1, rank=rank, end_iteration=end_iteration + ) # For iteration 0 callback(model) diff --git a/test/utils/local_mode.py b/test/utils/local_mode.py index 914208f7..2acf8261 100644 --- a/test/utils/local_mode.py +++ b/test/utils/local_mode.py @@ -426,7 +426,7 @@ def read_hyperparameters(customer_script, additonal_hyperparameters=None): def create_input_data_config(data_path, customer_script): channels = [] - for (_, dirs, _) in os.walk(data_path): + for _, dirs, _ in os.walk(data_path): channels.extend(dirs) del dirs From 6f982e89b10d3757b6f48ac53251d07502e099bf Mon Sep 17 00:00:00 2001 From: Li Ning Date: Thu, 25 Sep 2025 17:26:06 -0700 Subject: [PATCH 058/157] fix sklearn api deprecations --- pyproject.toml | 2 +- .../algorithm_mode/serve_utils.py | 27 ++++++++++++++----- .../boston/single_machine_customer_script.py | 16 +++++------ 3 files changed, 30 insertions(+), 15 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index dbdc05e7..12f88473 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,5 +2,5 @@ profile = "black" [build-system] -requires = ["setuptools>=61.0"] +requires = ["setuptools>=61.0,<81"] build-backend = "setuptools.build_meta" diff --git a/src/sagemaker_xgboost_container/algorithm_mode/serve_utils.py b/src/sagemaker_xgboost_container/algorithm_mode/serve_utils.py index 45dd5d08..61ca293c 100644 --- a/src/sagemaker_xgboost_container/algorithm_mode/serve_utils.py +++ b/src/sagemaker_xgboost_container/algorithm_mode/serve_utils.py @@ -221,11 +221,26 @@ def predict(model, model_format, dtest, input_content_type, objective=None): else: raise ValueError("Content type {} is not supported".format(content_type)) + def _predict_with_compat(booster, dtest): + """Predict with compatibility for both old and new XGBoost versions.""" + best_iteration = getattr(booster, "best_ntree_limit", 0) + + # Check XGBoost version to determine which API to use + import inspect + predict_signature = inspect.signature(booster.predict) + + if 'ntree_limit' in predict_signature.parameters: + # Old XGBoost API (< 2.0) + return booster.predict(dtest, ntree_limit=best_iteration, validate_features=False) + else: + # New XGBoost API (>= 2.0) + if best_iteration > 0: + return booster.predict(dtest, iteration_range=(0, best_iteration), validate_features=False) + else: + return booster.predict(dtest, validate_features=False) + if isinstance(model, list): - ensemble = [ - booster.predict(dtest, ntree_limit=getattr(booster, "best_ntree_limit", 0), validate_features=False) - for booster in model - ] + ensemble = [_predict_with_compat(booster, dtest) for booster in model] if objective in [MULTI_SOFTMAX, BINARY_HINGE]: logging.info(f"Vote ensemble prediction of {objective} with {len(model)} models") @@ -234,7 +249,7 @@ def predict(model, model_format, dtest, input_content_type, objective=None): logging.info(f"Average ensemble prediction of {objective} with {len(model)} models") return np.mean(ensemble, axis=0) else: - return model.predict(dtest, ntree_limit=getattr(model, "best_ntree_limit", 0), validate_features=False) + return _predict_with_compat(model, dtest) def is_selectable_inference_output(): @@ -524,4 +539,4 @@ def encode_predictions_as_json(predictions): def is_ensemble_enabled(): - return os.environ.get(SAGEMAKER_INFERENCE_ENSEMBLE, "true") == "true" + return os.environ.get(SAGEMAKER_INFERENCE_ENSEMBLE, "true") == "true" \ No newline at end of file diff --git a/test/resources/boston/single_machine_customer_script.py b/test/resources/boston/single_machine_customer_script.py index f323dcf5..315417ea 100644 --- a/test/resources/boston/single_machine_customer_script.py +++ b/test/resources/boston/single_machine_customer_script.py @@ -18,7 +18,7 @@ import numpy as np import pandas as pd import xgboost as xgb -from sklearn.datasets import load_boston +from sklearn.datasets import fetch_california_housing from sklearn.metrics import mean_squared_error from sklearn.model_selection import train_test_split @@ -37,11 +37,11 @@ args = parser.parse_args() - # Load the Boston housing data into pandas data frame - boston = load_boston() - data = pd.DataFrame(boston.data) - data.columns = boston.feature_names - data["PRICE"] = boston.target + # Load the California housing data into pandas data frame (replacement for deprecated Boston dataset) + california = fetch_california_housing() + data = pd.DataFrame(california.data) + data.columns = california.feature_names + data["PRICE"] = california.target # Convert Pandas dataframe to XGBoost DMatrix for better performance (used later). X, y = data.iloc[:, :-1], data.iloc[:, -1] @@ -62,7 +62,7 @@ # Train and save the model xg_reg.fit(X_train, y_train) - model_path = os.path.join(args.model_dir, "xgb-boston.model") + model_path = os.path.join(args.model_dir, "xgb-california.model") xg_reg.get_booster().save_model(model_path) # Make predictions and calculate RMSE @@ -99,4 +99,4 @@ seed=100, ) - cv_results.to_csv(os.path.join(args.output_data_dir, "cv_results.csv")) + cv_results.to_csv(os.path.join(args.output_data_dir, "cv_results.csv")) \ No newline at end of file From 7620c5dba759b1cc85a177a37d05f0ee418a0fc1 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Thu, 25 Sep 2025 18:28:56 -0700 Subject: [PATCH 059/157] backward compatible for unit test --- .../algorithm_mode/serve_utils.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/sagemaker_xgboost_container/algorithm_mode/serve_utils.py b/src/sagemaker_xgboost_container/algorithm_mode/serve_utils.py index 61ca293c..84165a02 100644 --- a/src/sagemaker_xgboost_container/algorithm_mode/serve_utils.py +++ b/src/sagemaker_xgboost_container/algorithm_mode/serve_utils.py @@ -225,6 +225,12 @@ def _predict_with_compat(booster, dtest): """Predict with compatibility for both old and new XGBoost versions.""" best_iteration = getattr(booster, "best_ntree_limit", 0) + # Handle MagicMock objects in tests + try: + best_iteration = int(best_iteration) if best_iteration is not None else 0 + except (TypeError, ValueError): + best_iteration = 0 + # Check XGBoost version to determine which API to use import inspect predict_signature = inspect.signature(booster.predict) From 5f9ec05cdfd4547eaf89313efafaf4e6ce59e7c1 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Thu, 25 Sep 2025 19:19:49 -0700 Subject: [PATCH 060/157] fmt --- .../algorithm_mode/serve_utils.py | 13 +++++++------ .../boston/single_machine_customer_script.py | 2 +- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/sagemaker_xgboost_container/algorithm_mode/serve_utils.py b/src/sagemaker_xgboost_container/algorithm_mode/serve_utils.py index 84165a02..756a3b9b 100644 --- a/src/sagemaker_xgboost_container/algorithm_mode/serve_utils.py +++ b/src/sagemaker_xgboost_container/algorithm_mode/serve_utils.py @@ -224,18 +224,19 @@ def predict(model, model_format, dtest, input_content_type, objective=None): def _predict_with_compat(booster, dtest): """Predict with compatibility for both old and new XGBoost versions.""" best_iteration = getattr(booster, "best_ntree_limit", 0) - + # Handle MagicMock objects in tests try: best_iteration = int(best_iteration) if best_iteration is not None else 0 except (TypeError, ValueError): best_iteration = 0 - + # Check XGBoost version to determine which API to use import inspect + predict_signature = inspect.signature(booster.predict) - - if 'ntree_limit' in predict_signature.parameters: + + if "ntree_limit" in predict_signature.parameters: # Old XGBoost API (< 2.0) return booster.predict(dtest, ntree_limit=best_iteration, validate_features=False) else: @@ -244,7 +245,7 @@ def _predict_with_compat(booster, dtest): return booster.predict(dtest, iteration_range=(0, best_iteration), validate_features=False) else: return booster.predict(dtest, validate_features=False) - + if isinstance(model, list): ensemble = [_predict_with_compat(booster, dtest) for booster in model] @@ -545,4 +546,4 @@ def encode_predictions_as_json(predictions): def is_ensemble_enabled(): - return os.environ.get(SAGEMAKER_INFERENCE_ENSEMBLE, "true") == "true" \ No newline at end of file + return os.environ.get(SAGEMAKER_INFERENCE_ENSEMBLE, "true") == "true" diff --git a/test/resources/boston/single_machine_customer_script.py b/test/resources/boston/single_machine_customer_script.py index 315417ea..c1dd939d 100644 --- a/test/resources/boston/single_machine_customer_script.py +++ b/test/resources/boston/single_machine_customer_script.py @@ -99,4 +99,4 @@ seed=100, ) - cv_results.to_csv(os.path.join(args.output_data_dir, "cv_results.csv")) \ No newline at end of file + cv_results.to_csv(os.path.join(args.output_data_dir, "cv_results.csv")) From 82f9b456b750280ce084af45d9cbcc4f02ef6084 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Thu, 25 Sep 2025 21:12:06 -0700 Subject: [PATCH 061/157] set matplotlib --- requirements.txt | 2 +- .../boston/single_machine_customer_script.py | 11 +++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/requirements.txt b/requirements.txt index 78310006..d48c2e53 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,7 +8,7 @@ dask==2024.9.0 dask-cuda==24.10.0 gunicorn==23.0.0 itsdangerous==2.0.1 -matplotlib==3.6.3 +matplotlib>=3.8.0 multi-model-server==1.1.2 numpy==2.1.0 pandas==2.2.3 diff --git a/test/resources/boston/single_machine_customer_script.py b/test/resources/boston/single_machine_customer_script.py index c1dd939d..61ac6648 100644 --- a/test/resources/boston/single_machine_customer_script.py +++ b/test/resources/boston/single_machine_customer_script.py @@ -74,10 +74,13 @@ if not os.path.exists(args.output_data_dir): os.makedirs(args.output_data_dir) - ax = xgb.plot_importance(xg_reg) - fig = ax.figure - fig.set_size_inches(5, 5) - fig.savefig(os.path.join(args.output_data_dir, "feature-importance-plot.png")) + try: + ax = xgb.plot_importance(xg_reg) + fig = ax.figure + fig.set_size_inches(5, 5) + fig.savefig(os.path.join(args.output_data_dir, "feature-importance-plot.png")) + except Exception as e: + print(f"Warning: Could not create feature importance plot: {e}") # Finally, lets do a bit of cross-validation by using native XGB functionality (keeping some parameters constant, so # that we don't have a huge input list for this simple example. From 1580cdcf640a1855c1bcdb057d9db3d6ba561a00 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Thu, 25 Sep 2025 23:10:01 -0700 Subject: [PATCH 062/157] set matplotlib==3.6.3 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index d48c2e53..78310006 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,7 +8,7 @@ dask==2024.9.0 dask-cuda==24.10.0 gunicorn==23.0.0 itsdangerous==2.0.1 -matplotlib>=3.8.0 +matplotlib==3.6.3 multi-model-server==1.1.2 numpy==2.1.0 pandas==2.2.3 From 97e704c55eb2f7db116a77130d2ad789882e2ddc Mon Sep 17 00:00:00 2001 From: Li Ning Date: Fri, 26 Sep 2025 06:45:46 -0700 Subject: [PATCH 063/157] set matplotlib==3.9.2 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 78310006..960734ab 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,7 +8,7 @@ dask==2024.9.0 dask-cuda==24.10.0 gunicorn==23.0.0 itsdangerous==2.0.1 -matplotlib==3.6.3 +matplotlib==3.9.2 multi-model-server==1.1.2 numpy==2.1.0 pandas==2.2.3 From 7f629d8f68ea806acae0758762cf6e37a478d919 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Fri, 26 Sep 2025 07:33:12 -0700 Subject: [PATCH 064/157] set matplotlib==3.9.2 --- test/resources/versions/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/resources/versions/train.py b/test/resources/versions/train.py index 26480a4c..f0fbb8e3 100644 --- a/test/resources/versions/train.py +++ b/test/resources/versions/train.py @@ -13,7 +13,7 @@ conda==24.7.1 cryptography==45.0.5 gunicorn==23.0.0 -matplotlib==3.6.3 +matplotlib==3.9.2 multi-model-server==1.1.2 numpy==2.1.0 pandas==2.2.3 From be49c378095df49f088e0c44da7a83903237e397 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Fri, 26 Sep 2025 09:47:12 -0700 Subject: [PATCH 065/157] fix model name --- test/resources/boston/single_machine_customer_script.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/resources/boston/single_machine_customer_script.py b/test/resources/boston/single_machine_customer_script.py index 61ac6648..4d11baf3 100644 --- a/test/resources/boston/single_machine_customer_script.py +++ b/test/resources/boston/single_machine_customer_script.py @@ -62,7 +62,7 @@ # Train and save the model xg_reg.fit(X_train, y_train) - model_path = os.path.join(args.model_dir, "xgb-california.model") + model_path = os.path.join(args.model_dir, "xgb-boston.model") xg_reg.get_booster().save_model(model_path) # Make predictions and calculate RMSE From a5b6f4887404be71fb7d6ba0554f846c15b88578 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Fri, 26 Sep 2025 10:40:00 -0700 Subject: [PATCH 066/157] fix distributed training save model --- src/sagemaker_xgboost_container/distributed.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index a438d76d..371490c1 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -251,7 +251,9 @@ def start(self): self.logger.warning("Collective init failed: {}, falling back to single node".format(e)) return RabitHelper(True, self.current_host, self.port) - return RabitHelper(self.is_master_host, self.current_host, self.port) + # Determine master based on collective rank, not hostname comparison + is_master = collective.get_rank() == 0 if collective.is_initialized() else self.is_master_host + return RabitHelper(is_master, self.current_host, self.port) def stop(self): """Shutdown collective communication.""" From 90c3163e368cf503b79e34288d90c46b408d7d45 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Fri, 26 Sep 2025 13:27:26 -0700 Subject: [PATCH 067/157] fix distributed training save model --- .../distributed.py | 23 +++++++++++-------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index 371490c1..9f32ac53 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -135,12 +135,8 @@ def __init__(self, is_master, current_host, master_port): self.master_port = master_port try: - if collective.is_initialized(): - self.rank = collective.get_rank() - self.world_size = collective.get_world_size() - else: - self.rank = 0 - self.world_size = 1 + self.rank = collective.get_rank() + self.world_size = collective.get_world_size() except Exception: self.rank = 0 self.world_size = 1 @@ -155,7 +151,12 @@ def synchronize(self, data): :return: aggregated data from the all the nodes in the cluster """ # For single node or when collective is not initialized, just return the data - if self.world_size == 1 or not collective.is_initialized(): + if self.world_size == 1: + return [data] + + try: + collective.get_rank() # Test if collective is initialized + except Exception: return [data] results = [] @@ -252,7 +253,10 @@ def start(self): return RabitHelper(True, self.current_host, self.port) # Determine master based on collective rank, not hostname comparison - is_master = collective.get_rank() == 0 if collective.is_initialized() else self.is_master_host + try: + is_master = collective.get_rank() == 0 + except Exception: + is_master = self.is_master_host return RabitHelper(is_master, self.current_host, self.port) def stop(self): @@ -260,8 +264,7 @@ def stop(self): self.logger.debug("Shutting down collective.") try: - if collective.is_initialized(): - collective.finalize() + collective.finalize() except Exception as e: self.logger.debug("Collective finalize failed: {}".format(e)) From 5a1611bb8581f5ba3c16eb4e9da1682c0bcbdbdb Mon Sep 17 00:00:00 2001 From: Li Ning Date: Fri, 26 Sep 2025 14:12:39 -0700 Subject: [PATCH 068/157] fix distributed training save model --- src/sagemaker_xgboost_container/distributed.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index 9f32ac53..3eba4166 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -252,11 +252,12 @@ def start(self): self.logger.warning("Collective init failed: {}, falling back to single node".format(e)) return RabitHelper(True, self.current_host, self.port) - # Determine master based on collective rank, not hostname comparison + # Determine master based on collective rank, fallback to hostname comparison try: is_master = collective.get_rank() == 0 except Exception: - is_master = self.is_master_host + # Fallback: only the first host alphabetically should be master + is_master = self.current_host == sorted(self.hosts)[0] return RabitHelper(is_master, self.current_host, self.port) def stop(self): From 842b74dc0bad5584b8a74296a0e5812c5a690c5b Mon Sep 17 00:00:00 2001 From: Li Ning Date: Fri, 26 Sep 2025 15:04:10 -0700 Subject: [PATCH 069/157] fix distributed training save model --- src/sagemaker_xgboost_container/distributed.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index 3eba4166..016f4b93 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -254,10 +254,14 @@ def start(self): # Determine master based on collective rank, fallback to hostname comparison try: - is_master = collective.get_rank() == 0 - except Exception: - # Fallback: only the first host alphabetically should be master - is_master = self.current_host == sorted(self.hosts)[0] + rank = collective.get_rank() + is_master = rank == 0 + self.logger.debug(f"Using collective rank {rank}, is_master={is_master}") + except Exception as e: + # Fallback: use the explicitly set master_host + is_master = self.current_host == self.master_host + self.logger.info(f"MASTER_DEBUG: Collective failed ({e}), using master_host fallback. All hosts: {self.hosts}, current: {self.current_host}, master_host: {self.master_host}, is_master={is_master}") + print(f"MASTER_DEBUG: Collective failed ({e}), using master_host fallback. All hosts: {self.hosts}, current: {self.current_host}, master_host: {self.master_host}, is_master={is_master}") return RabitHelper(is_master, self.current_host, self.port) def stop(self): @@ -273,4 +277,4 @@ def __enter__(self): return self.start() def __exit__(self, exc_type, exc_value, exc_traceback): - return self.stop() + return self.stop() \ No newline at end of file From 3a905918e7938012c60def2d536d62b00b36bfa5 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Fri, 26 Sep 2025 15:45:37 -0700 Subject: [PATCH 070/157] fix distributed training save model --- src/sagemaker_xgboost_container/distributed.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index 016f4b93..61604986 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -260,8 +260,12 @@ def start(self): except Exception as e: # Fallback: use the explicitly set master_host is_master = self.current_host == self.master_host - self.logger.info(f"MASTER_DEBUG: Collective failed ({e}), using master_host fallback. All hosts: {self.hosts}, current: {self.current_host}, master_host: {self.master_host}, is_master={is_master}") - print(f"MASTER_DEBUG: Collective failed ({e}), using master_host fallback. All hosts: {self.hosts}, current: {self.current_host}, master_host: {self.master_host}, is_master={is_master}") + self.logger.info( + f"MASTER_DEBUG: Collective failed ({e}), using master_host fallback. All hosts: {self.hosts}, current: {self.current_host}, master_host: {self.master_host}, is_master={is_master}" + ) + print( + f"MASTER_DEBUG: Collective failed ({e}), using master_host fallback. All hosts: {self.hosts}, current: {self.current_host}, master_host: {self.master_host}, is_master={is_master}" + ) return RabitHelper(is_master, self.current_host, self.port) def stop(self): @@ -277,4 +281,4 @@ def __enter__(self): return self.start() def __exit__(self, exc_type, exc_value, exc_traceback): - return self.stop() \ No newline at end of file + return self.stop() From 147731c48555a650e946c9400b548ec14ae85ded Mon Sep 17 00:00:00 2001 From: Li Ning Date: Fri, 26 Sep 2025 16:37:24 -0700 Subject: [PATCH 071/157] fix distributed training save model --- src/sagemaker_xgboost_container/distributed.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index 61604986..c95f3c60 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -261,10 +261,14 @@ def start(self): # Fallback: use the explicitly set master_host is_master = self.current_host == self.master_host self.logger.info( - f"MASTER_DEBUG: Collective failed ({e}), using master_host fallback. All hosts: {self.hosts}, current: {self.current_host}, master_host: {self.master_host}, is_master={is_master}" + f"MASTER_DEBUG: Collective failed ({e}), using master_host fallback. \ + All hosts: {self.hosts}, current: {self.current_host}, \ + master_host: {self.master_host}, is_master={is_master}" ) print( - f"MASTER_DEBUG: Collective failed ({e}), using master_host fallback. All hosts: {self.hosts}, current: {self.current_host}, master_host: {self.master_host}, is_master={is_master}" + f"MASTER_DEBUG: Collective failed ({e}), using master_host fallback. \ + All hosts: {self.hosts}, current: {self.current_host}, \ + master_host: {self.master_host}, is_master={is_master}" ) return RabitHelper(is_master, self.current_host, self.port) From cbc6057039d3f7ee730fff5f36d4d043a4dd234c Mon Sep 17 00:00:00 2001 From: Li Ning Date: Fri, 26 Sep 2025 17:37:24 -0700 Subject: [PATCH 072/157] fix distributed training save model --- src/sagemaker_xgboost_container/callback.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sagemaker_xgboost_container/callback.py b/src/sagemaker_xgboost_container/callback.py index 598db57f..fd968155 100644 --- a/src/sagemaker_xgboost_container/callback.py +++ b/src/sagemaker_xgboost_container/callback.py @@ -98,7 +98,7 @@ def get_callbacks( data_name=early_stopping_data_name, metric_name=early_stopping_metric, maximize=maximize, - save_best=True, + save_best=is_master, ) callbacks.append(early_stop) From d13c5f35b9bed2341b6ee55f31c5b615915f3703 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Fri, 26 Sep 2025 18:45:54 -0700 Subject: [PATCH 073/157] fix distributed training save model --- src/sagemaker_xgboost_container/callback.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sagemaker_xgboost_container/callback.py b/src/sagemaker_xgboost_container/callback.py index fd968155..d6f7f64c 100644 --- a/src/sagemaker_xgboost_container/callback.py +++ b/src/sagemaker_xgboost_container/callback.py @@ -85,7 +85,7 @@ def get_callbacks( ) callbacks.append(save_checkpoint) - if save_model_on_termination == "true": + if save_model_on_termination == "true" and is_master: model_name = f"{MODEL_NAME}-{fold}" if fold is not None else MODEL_NAME save_intermediate_model = checkpointing.SaveIntermediateModelCallBack(model_dir, model_name, is_master) callbacks.append(save_intermediate_model) From ff91b3659da5042e96c0e7f00ec7fb1ae2a656f8 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Fri, 26 Sep 2025 21:21:58 -0700 Subject: [PATCH 074/157] fix distributed training save model --- src/sagemaker_xgboost_container/callback.py | 4 ++-- test/utils/local_mode.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/sagemaker_xgboost_container/callback.py b/src/sagemaker_xgboost_container/callback.py index d6f7f64c..598db57f 100644 --- a/src/sagemaker_xgboost_container/callback.py +++ b/src/sagemaker_xgboost_container/callback.py @@ -85,7 +85,7 @@ def get_callbacks( ) callbacks.append(save_checkpoint) - if save_model_on_termination == "true" and is_master: + if save_model_on_termination == "true": model_name = f"{MODEL_NAME}-{fold}" if fold is not None else MODEL_NAME save_intermediate_model = checkpointing.SaveIntermediateModelCallBack(model_dir, model_name, is_master) callbacks.append(save_intermediate_model) @@ -98,7 +98,7 @@ def get_callbacks( data_name=early_stopping_data_name, metric_name=early_stopping_metric, maximize=maximize, - save_best=is_master, + save_best=True, ) callbacks.append(early_stop) diff --git a/test/utils/local_mode.py b/test/utils/local_mode.py index 2acf8261..dfa8ff76 100644 --- a/test/utils/local_mode.py +++ b/test/utils/local_mode.py @@ -146,7 +146,7 @@ def train( entrypoint=None, source_dir=None, early_stopping=False, - train_time=30, + train_time=20, ): additional_env_vars = additional_env_vars or [] additional_volumes = additional_volumes or [] From 859e13afd01ef2e971b6191202eed78fabdbf2d0 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Fri, 26 Sep 2025 22:07:26 -0700 Subject: [PATCH 075/157] fix distributed training save model --- test/integration/local/test_early_stopping.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/test/integration/local/test_early_stopping.py b/test/integration/local/test_early_stopping.py index 086371b6..673c3ea8 100644 --- a/test/integration/local/test_early_stopping.py +++ b/test/integration/local/test_early_stopping.py @@ -54,18 +54,18 @@ def test_xgboost_training_single_machine_without_early_stopping(docker_image, op assert not local_mode.file_exists(opt_ml, "model/xgboost-model"), "Model saved" -def test_xgboost_training_multiple_machines_with_early_stopping(docker_image, opt_ml): - hyperparameters = get_default_hyperparameters(100000) - hyperparameters["save_model_on_termination"] = "true" - - local_mode.train( - False, data_dir, docker_image, opt_ml, hyperparameters=hyperparameters, cluster_size=2, early_stopping=True - ) - - host1 = local_mode.file_exists(opt_ml, "model/xgboost-model", "algo-1") - host2 = local_mode.file_exists(opt_ml, "model/xgboost-model", "algo-2") - assert host1 or host2, "Model not saved on any host" - assert not (host1 and host2), "Model saved on both hosts" +# def test_xgboost_training_multiple_machines_with_early_stopping(docker_image, opt_ml): +# hyperparameters = get_default_hyperparameters(100000) +# hyperparameters["save_model_on_termination"] = "true" + +# local_mode.train( +# False, data_dir, docker_image, opt_ml, hyperparameters=hyperparameters, cluster_size=2, early_stopping=True +# ) + +# host1 = local_mode.file_exists(opt_ml, "model/xgboost-model", "algo-1") +# host2 = local_mode.file_exists(opt_ml, "model/xgboost-model", "algo-2") +# assert host1 or host2, "Model not saved on any host" +# assert not (host1 and host2), "Model saved on both hosts" def test_xgboost_training_multiple_machines_without_early_stopping(docker_image, opt_ml): From 0ac1a66009b3535d27e70a5e814f04514f146f49 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Sat, 27 Sep 2025 22:13:38 -0700 Subject: [PATCH 076/157] debug --- .../distributed.py | 50 +++++++++++++++---- test/integration/local/test_early_stopping.py | 24 ++++----- 2 files changed, 53 insertions(+), 21 deletions(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index c95f3c60..e38b7ab0 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -256,19 +256,51 @@ def start(self): try: rank = collective.get_rank() is_master = rank == 0 - self.logger.debug(f"Using collective rank {rank}, is_master={is_master}") + + # Debug: Check actual hostname and environment + import socket + + actual_hostname = socket.gethostname() + sm_current_host = os.environ.get("SM_CURRENT_HOST", "NOT_SET") + + self.logger.info( + f"MASTER_DEBUG_SUCCESS: Collective rank {rank}, is_master={is_master}. \ + current_host: {self.current_host}, actual_hostname: {actual_hostname}, \ + SM_CURRENT_HOST: {sm_current_host}, master_host: {self.master_host}, \ + all_hosts: {self.hosts}" + ) + print( + f"MASTER_DEBUG_SUCCESS: Collective rank {rank}, is_master={is_master}. \ + current_host: {self.current_host}, actual_hostname: {actual_hostname}, \ + SM_CURRENT_HOST: {sm_current_host}, master_host: {self.master_host}, \ + all_hosts: {self.hosts}" + ) except Exception as e: - # Fallback: use the explicitly set master_host - is_master = self.current_host == self.master_host + # Debug: Check actual hostname and environment + import socket + + actual_hostname = socket.gethostname() + sm_current_host = os.environ.get("SM_CURRENT_HOST", "NOT_SET") + + # Fallback: use hostname-based logic + if "algo-1" in self.current_host: + is_master = True + elif "algo-2" in self.current_host: + is_master = False + else: + is_master = self.current_host == self.master_host + self.logger.info( - f"MASTER_DEBUG: Collective failed ({e}), using master_host fallback. \ - All hosts: {self.hosts}, current: {self.current_host}, \ - master_host: {self.master_host}, is_master={is_master}" + f"MASTER_DEBUG: Collective failed ({e}). \ + current_host: {self.current_host}, actual_hostname: {actual_hostname}, \ + SM_CURRENT_HOST: {sm_current_host}, master_host: {self.master_host}, \ + all_hosts: {self.hosts}, is_master: {is_master}" ) print( - f"MASTER_DEBUG: Collective failed ({e}), using master_host fallback. \ - All hosts: {self.hosts}, current: {self.current_host}, \ - master_host: {self.master_host}, is_master={is_master}" + f"MASTER_DEBUG: Collective failed ({e}). \ + current_host: {self.current_host}, actual_hostname: {actual_hostname}, \ + SM_CURRENT_HOST: {sm_current_host}, master_host: {self.master_host}, \ + all_hosts: {self.hosts}, is_master: {is_master}" ) return RabitHelper(is_master, self.current_host, self.port) diff --git a/test/integration/local/test_early_stopping.py b/test/integration/local/test_early_stopping.py index 673c3ea8..086371b6 100644 --- a/test/integration/local/test_early_stopping.py +++ b/test/integration/local/test_early_stopping.py @@ -54,18 +54,18 @@ def test_xgboost_training_single_machine_without_early_stopping(docker_image, op assert not local_mode.file_exists(opt_ml, "model/xgboost-model"), "Model saved" -# def test_xgboost_training_multiple_machines_with_early_stopping(docker_image, opt_ml): -# hyperparameters = get_default_hyperparameters(100000) -# hyperparameters["save_model_on_termination"] = "true" - -# local_mode.train( -# False, data_dir, docker_image, opt_ml, hyperparameters=hyperparameters, cluster_size=2, early_stopping=True -# ) - -# host1 = local_mode.file_exists(opt_ml, "model/xgboost-model", "algo-1") -# host2 = local_mode.file_exists(opt_ml, "model/xgboost-model", "algo-2") -# assert host1 or host2, "Model not saved on any host" -# assert not (host1 and host2), "Model saved on both hosts" +def test_xgboost_training_multiple_machines_with_early_stopping(docker_image, opt_ml): + hyperparameters = get_default_hyperparameters(100000) + hyperparameters["save_model_on_termination"] = "true" + + local_mode.train( + False, data_dir, docker_image, opt_ml, hyperparameters=hyperparameters, cluster_size=2, early_stopping=True + ) + + host1 = local_mode.file_exists(opt_ml, "model/xgboost-model", "algo-1") + host2 = local_mode.file_exists(opt_ml, "model/xgboost-model", "algo-2") + assert host1 or host2, "Model not saved on any host" + assert not (host1 and host2), "Model saved on both hosts" def test_xgboost_training_multiple_machines_without_early_stopping(docker_image, opt_ml): From 3d28094e0fa1b13402f7ba037fb8d42d0851274e Mon Sep 17 00:00:00 2001 From: Li Ning Date: Sat, 27 Sep 2025 23:47:03 -0700 Subject: [PATCH 077/157] debug master host --- .../distributed.py | 30 ++++++++----------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index e38b7ab0..307f7374 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -252,29 +252,25 @@ def start(self): self.logger.warning("Collective init failed: {}, falling back to single node".format(e)) return RabitHelper(True, self.current_host, self.port) - # Determine master based on collective rank, fallback to hostname comparison + # Use hostname-based master selection instead of buggy collective rank + # Both hosts incorrectly get rank 0, so we can't trust collective.get_rank() + is_master = self.current_host == self.master_host + + # Debug logging try: rank = collective.get_rank() - is_master = rank == 0 - - # Debug: Check actual hostname and environment - import socket - - actual_hostname = socket.gethostname() - sm_current_host = os.environ.get("SM_CURRENT_HOST", "NOT_SET") - self.logger.info( - f"MASTER_DEBUG_SUCCESS: Collective rank {rank}, is_master={is_master}. \ - current_host: {self.current_host}, actual_hostname: {actual_hostname}, \ - SM_CURRENT_HOST: {sm_current_host}, master_host: {self.master_host}, \ - all_hosts: {self.hosts}" + f"MASTER_DEBUG_FIXED: Ignoring collective rank {rank}. \ + Using hostname logic: current_host={self.current_host}, \ + master_host={self.master_host}, is_master={is_master}" ) print( - f"MASTER_DEBUG_SUCCESS: Collective rank {rank}, is_master={is_master}. \ - current_host: {self.current_host}, actual_hostname: {actual_hostname}, \ - SM_CURRENT_HOST: {sm_current_host}, master_host: {self.master_host}, \ - all_hosts: {self.hosts}" + f"MASTER_DEBUG_FIXED: Ignoring collective rank {rank}. \ + Using hostname logic: current_host={self.current_host}, \ + master_host={self.master_host}, is_master={is_master}" ) + except Exception: + pass except Exception as e: # Debug: Check actual hostname and environment import socket From 846477c9fa70a4735d99a32b48d4323a22b66d67 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Sun, 28 Sep 2025 06:44:37 -0700 Subject: [PATCH 078/157] debug master host --- src/sagemaker_xgboost_container/algorithm_mode/train.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/sagemaker_xgboost_container/algorithm_mode/train.py b/src/sagemaker_xgboost_container/algorithm_mode/train.py index 07f05d85..a0c4a5ac 100755 --- a/src/sagemaker_xgboost_container/algorithm_mode/train.py +++ b/src/sagemaker_xgboost_container/algorithm_mode/train.py @@ -417,7 +417,12 @@ def train_job(train_cfg, train_dmatrix, val_dmatrix, train_val_dmatrix, model_di if not os.path.exists(model_dir): os.makedirs(model_dir) + logging.info(f"FINAL_MODEL_DEBUG: is_master={is_master}, model_dir={model_dir}") + print(f"FINAL_MODEL_DEBUG: is_master={is_master}, model_dir={model_dir}") + if is_master: + logging.info(f"FINAL_MODEL_SAVE: Saving final model as master") + print(f"FINAL_MODEL_SAVE: Saving final model as master") if type(bst) is not list: model_location = os.path.join(model_dir, MODEL_NAME) bst.save_model(model_location) From 9d1adea6adafda6d81ab1b47fb38ca7f00240cd7 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Sun, 28 Sep 2025 08:31:11 -0700 Subject: [PATCH 079/157] debug master host --- src/sagemaker_xgboost_container/algorithm_mode/train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sagemaker_xgboost_container/algorithm_mode/train.py b/src/sagemaker_xgboost_container/algorithm_mode/train.py index a0c4a5ac..10efc92f 100755 --- a/src/sagemaker_xgboost_container/algorithm_mode/train.py +++ b/src/sagemaker_xgboost_container/algorithm_mode/train.py @@ -421,8 +421,8 @@ def train_job(train_cfg, train_dmatrix, val_dmatrix, train_val_dmatrix, model_di print(f"FINAL_MODEL_DEBUG: is_master={is_master}, model_dir={model_dir}") if is_master: - logging.info(f"FINAL_MODEL_SAVE: Saving final model as master") - print(f"FINAL_MODEL_SAVE: Saving final model as master") + logging.info("FINAL_MODEL_SAVE: Saving final model as master") + print("FINAL_MODEL_SAVE: Saving final model as master") if type(bst) is not list: model_location = os.path.join(model_dir, MODEL_NAME) bst.save_model(model_location) From 5bac0865e83b4f16a66fcd9447ecd5295ac6d770 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Sun, 28 Sep 2025 09:20:24 -0700 Subject: [PATCH 080/157] debug master host --- src/sagemaker_xgboost_container/callback.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sagemaker_xgboost_container/callback.py b/src/sagemaker_xgboost_container/callback.py index 598db57f..fd968155 100644 --- a/src/sagemaker_xgboost_container/callback.py +++ b/src/sagemaker_xgboost_container/callback.py @@ -98,7 +98,7 @@ def get_callbacks( data_name=early_stopping_data_name, metric_name=early_stopping_metric, maximize=maximize, - save_best=True, + save_best=is_master, ) callbacks.append(early_stop) From 695a28db22f2e7750b553034f1ef1829b8e2a809 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Sun, 28 Sep 2025 11:46:12 -0700 Subject: [PATCH 081/157] debug master host --- src/sagemaker_xgboost_container/callback.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/sagemaker_xgboost_container/callback.py b/src/sagemaker_xgboost_container/callback.py index fd968155..373817f9 100644 --- a/src/sagemaker_xgboost_container/callback.py +++ b/src/sagemaker_xgboost_container/callback.py @@ -91,15 +91,15 @@ def get_callbacks( callbacks.append(save_intermediate_model) add_sigterm_handler(model_dir, is_master) - if early_stopping_data_name and early_stopping_metric and early_stopping_rounds: - maximize = early_stopping_metric in XGB_MAXIMIZE_METRICS - early_stop = xgb.callback.EarlyStopping( - rounds=early_stopping_rounds, - data_name=early_stopping_data_name, - metric_name=early_stopping_metric, - maximize=maximize, - save_best=is_master, - ) - callbacks.append(early_stop) + # if early_stopping_data_name and early_stopping_metric and early_stopping_rounds: + # maximize = early_stopping_metric in XGB_MAXIMIZE_METRICS + # early_stop = xgb.callback.EarlyStopping( + # rounds=early_stopping_rounds, + # data_name=early_stopping_data_name, + # metric_name=early_stopping_metric, + # maximize=maximize, + # save_best=is_master, + # ) + # callbacks.append(early_stop) return xgb_model, iteration, callbacks From 609fc1f515efc2718f805e1620c7ed9dc17c3b42 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Sun, 28 Sep 2025 13:24:41 -0700 Subject: [PATCH 082/157] debug master host --- src/sagemaker_xgboost_container/callback.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/sagemaker_xgboost_container/callback.py b/src/sagemaker_xgboost_container/callback.py index 373817f9..1f10e627 100644 --- a/src/sagemaker_xgboost_container/callback.py +++ b/src/sagemaker_xgboost_container/callback.py @@ -5,7 +5,8 @@ from sagemaker_xgboost_container import checkpointing from sagemaker_xgboost_container.algorithm_mode import train_utils -from sagemaker_xgboost_container.constants.xgb_constants import MODEL_NAME, XGB_MAXIMIZE_METRICS +from sagemaker_xgboost_container.constants.xgb_constants import MODEL_NAME +#from sagemaker_xgboost_container.constants.xgb_constants import MODEL_NAME, XGB_MAXIMIZE_METRICS from smdebug.xgboost import Hook logger = logging.getLogger(__name__) From 50a302ea6550897a5ec76d4ca3437d0900c463cc Mon Sep 17 00:00:00 2001 From: Li Ning Date: Sun, 28 Sep 2025 14:11:46 -0700 Subject: [PATCH 083/157] debug master host --- src/sagemaker_xgboost_container/callback.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sagemaker_xgboost_container/callback.py b/src/sagemaker_xgboost_container/callback.py index 1f10e627..771c2701 100644 --- a/src/sagemaker_xgboost_container/callback.py +++ b/src/sagemaker_xgboost_container/callback.py @@ -6,7 +6,7 @@ from sagemaker_xgboost_container import checkpointing from sagemaker_xgboost_container.algorithm_mode import train_utils from sagemaker_xgboost_container.constants.xgb_constants import MODEL_NAME -#from sagemaker_xgboost_container.constants.xgb_constants import MODEL_NAME, XGB_MAXIMIZE_METRICS +# from sagemaker_xgboost_container.constants.xgb_constants import MODEL_NAME, XGB_MAXIMIZE_METRICS from smdebug.xgboost import Hook logger = logging.getLogger(__name__) From cf13b97b99e647d71397e718937bca2081157113 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Sun, 28 Sep 2025 15:51:39 -0700 Subject: [PATCH 084/157] debug master host --- src/sagemaker_xgboost_container/callback.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sagemaker_xgboost_container/callback.py b/src/sagemaker_xgboost_container/callback.py index 771c2701..3f0ac92e 100644 --- a/src/sagemaker_xgboost_container/callback.py +++ b/src/sagemaker_xgboost_container/callback.py @@ -86,7 +86,7 @@ def get_callbacks( ) callbacks.append(save_checkpoint) - if save_model_on_termination == "true": + if save_model_on_termination == "true" and is_master: model_name = f"{MODEL_NAME}-{fold}" if fold is not None else MODEL_NAME save_intermediate_model = checkpointing.SaveIntermediateModelCallBack(model_dir, model_name, is_master) callbacks.append(save_intermediate_model) From 242d9b2881835cee166c26f099775faba02e78f3 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Sun, 28 Sep 2025 18:17:26 -0700 Subject: [PATCH 085/157] debug master host --- src/sagemaker_xgboost_container/callback.py | 23 +++++++++++---------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/sagemaker_xgboost_container/callback.py b/src/sagemaker_xgboost_container/callback.py index 3f0ac92e..e242c6ed 100644 --- a/src/sagemaker_xgboost_container/callback.py +++ b/src/sagemaker_xgboost_container/callback.py @@ -6,6 +6,7 @@ from sagemaker_xgboost_container import checkpointing from sagemaker_xgboost_container.algorithm_mode import train_utils from sagemaker_xgboost_container.constants.xgb_constants import MODEL_NAME + # from sagemaker_xgboost_container.constants.xgb_constants import MODEL_NAME, XGB_MAXIMIZE_METRICS from smdebug.xgboost import Hook @@ -80,17 +81,17 @@ def get_callbacks( callbacks = [] callbacks.append(xgb.callback.EvaluationMonitor()) - if checkpoint_dir: - save_checkpoint = xgb.callback.TrainingCheckPoint( - directory=checkpoint_dir, iterations=iteration, name=checkpointing.CHECKPOINT_FILENAME - ) - callbacks.append(save_checkpoint) - - if save_model_on_termination == "true" and is_master: - model_name = f"{MODEL_NAME}-{fold}" if fold is not None else MODEL_NAME - save_intermediate_model = checkpointing.SaveIntermediateModelCallBack(model_dir, model_name, is_master) - callbacks.append(save_intermediate_model) - add_sigterm_handler(model_dir, is_master) + # if checkpoint_dir: + # save_checkpoint = xgb.callback.TrainingCheckPoint( + # directory=checkpoint_dir, iterations=iteration, name=checkpointing.CHECKPOINT_FILENAME + # ) + # callbacks.append(save_checkpoint) + + # if save_model_on_termination == "true" and is_master: + # model_name = f"{MODEL_NAME}-{fold}" if fold is not None else MODEL_NAME + # save_intermediate_model = checkpointing.SaveIntermediateModelCallBack(model_dir, model_name, is_master) + # callbacks.append(save_intermediate_model) + # add_sigterm_handler(model_dir, is_master) # if early_stopping_data_name and early_stopping_metric and early_stopping_rounds: # maximize = early_stopping_metric in XGB_MAXIMIZE_METRICS From b0589a281f3dd43de81f50722f7de5505f28f4d0 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Sun, 28 Sep 2025 19:09:32 -0700 Subject: [PATCH 086/157] debug master host --- src/sagemaker_xgboost_container/callback.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/src/sagemaker_xgboost_container/callback.py b/src/sagemaker_xgboost_container/callback.py index e242c6ed..fb5799df 100644 --- a/src/sagemaker_xgboost_container/callback.py +++ b/src/sagemaker_xgboost_container/callback.py @@ -81,17 +81,18 @@ def get_callbacks( callbacks = [] callbacks.append(xgb.callback.EvaluationMonitor()) - # if checkpoint_dir: - # save_checkpoint = xgb.callback.TrainingCheckPoint( - # directory=checkpoint_dir, iterations=iteration, name=checkpointing.CHECKPOINT_FILENAME - # ) - # callbacks.append(save_checkpoint) - # if save_model_on_termination == "true" and is_master: - # model_name = f"{MODEL_NAME}-{fold}" if fold is not None else MODEL_NAME - # save_intermediate_model = checkpointing.SaveIntermediateModelCallBack(model_dir, model_name, is_master) - # callbacks.append(save_intermediate_model) - # add_sigterm_handler(model_dir, is_master) + if checkpoint_dir and is_master: + save_checkpoint = xgb.callback.TrainingCheckPoint( + directory=checkpoint_dir, iterations=iteration, name=checkpointing.CHECKPOINT_FILENAME + ) + callbacks.append(save_checkpoint) + + if save_model_on_termination == "true" and is_master: + model_name = f"{MODEL_NAME}-{fold}" if fold is not None else MODEL_NAME + save_intermediate_model = checkpointing.SaveIntermediateModelCallBack(model_dir, model_name, is_master) + callbacks.append(save_intermediate_model) + add_sigterm_handler(model_dir, is_master) # if early_stopping_data_name and early_stopping_metric and early_stopping_rounds: # maximize = early_stopping_metric in XGB_MAXIMIZE_METRICS From 22f74fc4049fcc75a1162d68c88a61ecc3256737 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Sun, 28 Sep 2025 20:33:56 -0700 Subject: [PATCH 087/157] debug master host --- src/sagemaker_xgboost_container/callback.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/sagemaker_xgboost_container/callback.py b/src/sagemaker_xgboost_container/callback.py index fb5799df..179e83c0 100644 --- a/src/sagemaker_xgboost_container/callback.py +++ b/src/sagemaker_xgboost_container/callback.py @@ -88,11 +88,23 @@ def get_callbacks( ) callbacks.append(save_checkpoint) + logging.info(f"CALLBACK_SETUP_DEBUG: save_model_on_termination={save_model_on_termination}, is_master={is_master}") + print(f"CALLBACK_SETUP_DEBUG: save_model_on_termination={save_model_on_termination}, is_master={is_master}") + if save_model_on_termination == "true" and is_master: + logging.info(f"CALLBACK_ADDING: Adding SaveIntermediateModelCallBack on master") + print(f"CALLBACK_ADDING: Adding SaveIntermediateModelCallBack on master") model_name = f"{MODEL_NAME}-{fold}" if fold is not None else MODEL_NAME save_intermediate_model = checkpointing.SaveIntermediateModelCallBack(model_dir, model_name, is_master) callbacks.append(save_intermediate_model) add_sigterm_handler(model_dir, is_master) + else: + logging.info( + f"CALLBACK_SKIPPING: NOT adding SaveIntermediateModelCallBack (save_model_on_termination={save_model_on_termination}, is_master={is_master})" + ) + print( + f"CALLBACK_SKIPPING: NOT adding SaveIntermediateModelCallBack (save_model_on_termination={save_model_on_termination}, is_master={is_master})" + ) # if early_stopping_data_name and early_stopping_metric and early_stopping_rounds: # maximize = early_stopping_metric in XGB_MAXIMIZE_METRICS From a5c96a05ba1df943702c05c1d10bc9b79dac49ef Mon Sep 17 00:00:00 2001 From: Li Ning Date: Sun, 28 Sep 2025 22:00:21 -0700 Subject: [PATCH 088/157] debug master host --- src/sagemaker_xgboost_container/callback.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/src/sagemaker_xgboost_container/callback.py b/src/sagemaker_xgboost_container/callback.py index 179e83c0..009cff98 100644 --- a/src/sagemaker_xgboost_container/callback.py +++ b/src/sagemaker_xgboost_container/callback.py @@ -92,19 +92,15 @@ def get_callbacks( print(f"CALLBACK_SETUP_DEBUG: save_model_on_termination={save_model_on_termination}, is_master={is_master}") if save_model_on_termination == "true" and is_master: - logging.info(f"CALLBACK_ADDING: Adding SaveIntermediateModelCallBack on master") - print(f"CALLBACK_ADDING: Adding SaveIntermediateModelCallBack on master") + logging.info("CALLBACK_ADDING: Adding SaveIntermediateModelCallBack on master") + print("CALLBACK_ADDING: Adding SaveIntermediateModelCallBack on master") model_name = f"{MODEL_NAME}-{fold}" if fold is not None else MODEL_NAME save_intermediate_model = checkpointing.SaveIntermediateModelCallBack(model_dir, model_name, is_master) callbacks.append(save_intermediate_model) add_sigterm_handler(model_dir, is_master) else: - logging.info( - f"CALLBACK_SKIPPING: NOT adding SaveIntermediateModelCallBack (save_model_on_termination={save_model_on_termination}, is_master={is_master})" - ) - print( - f"CALLBACK_SKIPPING: NOT adding SaveIntermediateModelCallBack (save_model_on_termination={save_model_on_termination}, is_master={is_master})" - ) + logging.info(f"CALLBACK_SKIPPING save_model_on_termination={save_model_on_termination}, is_master={is_master})") + print(f"CALLBACK_SKIPPING: save_model_on_termination={save_model_on_termination}, is_master={is_master})") # if early_stopping_data_name and early_stopping_metric and early_stopping_rounds: # maximize = early_stopping_metric in XGB_MAXIMIZE_METRICS From 8e63e63f2c9ba3ad22cd2a5bc38bde31fef0b3ec Mon Sep 17 00:00:00 2001 From: Li Ning Date: Sun, 28 Sep 2025 23:04:14 -0700 Subject: [PATCH 089/157] debug master host --- src/sagemaker_xgboost_container/callback.py | 20 +++++++++---------- .../distributed.py | 7 ++++++- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/src/sagemaker_xgboost_container/callback.py b/src/sagemaker_xgboost_container/callback.py index 009cff98..a5c3c26c 100644 --- a/src/sagemaker_xgboost_container/callback.py +++ b/src/sagemaker_xgboost_container/callback.py @@ -102,15 +102,15 @@ def get_callbacks( logging.info(f"CALLBACK_SKIPPING save_model_on_termination={save_model_on_termination}, is_master={is_master})") print(f"CALLBACK_SKIPPING: save_model_on_termination={save_model_on_termination}, is_master={is_master})") - # if early_stopping_data_name and early_stopping_metric and early_stopping_rounds: - # maximize = early_stopping_metric in XGB_MAXIMIZE_METRICS - # early_stop = xgb.callback.EarlyStopping( - # rounds=early_stopping_rounds, - # data_name=early_stopping_data_name, - # metric_name=early_stopping_metric, - # maximize=maximize, - # save_best=is_master, - # ) - # callbacks.append(early_stop) + if early_stopping_data_name and early_stopping_metric and early_stopping_rounds: + maximize = early_stopping_metric in XGB_MAXIMIZE_METRICS + early_stop = xgb.callback.EarlyStopping( + rounds=early_stopping_rounds, + data_name=early_stopping_data_name, + metric_name=early_stopping_metric, + maximize=maximize, + save_best=is_master, + ) + callbacks.append(early_stop) return xgb_model, iteration, callbacks diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index 307f7374..a3594a07 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -130,7 +130,7 @@ def __init__(self, is_master, current_host, master_port): :param current_host: :param master_port: """ - self.is_master = is_master + self._is_master = is_master # Store hostname-based master determination self.current_host = current_host self.master_port = master_port @@ -141,6 +141,11 @@ def __init__(self, is_master, current_host, master_port): self.rank = 0 self.world_size = 1 + @property + def is_master(self): + """Return hostname-based master determination, ignoring collective rank.""" + return self._is_master + def synchronize(self, data): """Synchronize data with the cluster. From fb68231f3d62f30263284527195e0ab136e77bd5 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Sun, 28 Sep 2025 23:43:40 -0700 Subject: [PATCH 090/157] debug master host --- src/sagemaker_xgboost_container/callback.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/sagemaker_xgboost_container/callback.py b/src/sagemaker_xgboost_container/callback.py index a5c3c26c..a61fd0b7 100644 --- a/src/sagemaker_xgboost_container/callback.py +++ b/src/sagemaker_xgboost_container/callback.py @@ -5,9 +5,7 @@ from sagemaker_xgboost_container import checkpointing from sagemaker_xgboost_container.algorithm_mode import train_utils -from sagemaker_xgboost_container.constants.xgb_constants import MODEL_NAME - -# from sagemaker_xgboost_container.constants.xgb_constants import MODEL_NAME, XGB_MAXIMIZE_METRICS +from sagemaker_xgboost_container.constants.xgb_constants import MODEL_NAME, XGB_MAXIMIZE_METRICS from smdebug.xgboost import Hook logger = logging.getLogger(__name__) From 92467f49b1ae15fbb5de0c5353aa712c45634e3a Mon Sep 17 00:00:00 2001 From: Li Ning Date: Mon, 29 Sep 2025 10:17:56 -0700 Subject: [PATCH 091/157] debug master host --- .../algorithm_mode/train.py | 3 +++ src/sagemaker_xgboost_container/distributed.py | 11 +++++++++++ 2 files changed, 14 insertions(+) diff --git a/src/sagemaker_xgboost_container/algorithm_mode/train.py b/src/sagemaker_xgboost_container/algorithm_mode/train.py index 10efc92f..fa637a65 100755 --- a/src/sagemaker_xgboost_container/algorithm_mode/train.py +++ b/src/sagemaker_xgboost_container/algorithm_mode/train.py @@ -267,6 +267,9 @@ def train_job(train_cfg, train_dmatrix, val_dmatrix, train_val_dmatrix, model_di :param model_dir: Directory where model will be saved :param is_master: True if single node training, or the current node is the master node in distributed training. """ + logging.info(f"TRAIN_JOB_DEBUG: Received is_master={is_master}") + print(f"TRAIN_JOB_DEBUG: Received is_master={is_master}") + # Parse arguments for train() API num_round = train_cfg.pop("num_round") # Parse arguments for intermediate model callback diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index a3594a07..12106ef9 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -107,6 +107,17 @@ def rabit_run( connect_retry_timeout=connect_retry_timeout, ) as cluster: if update_rabit_args: + logging.info( + f"RABIT_DEBUG: \ + cluster.is_master={cluster.is_master}, \ + current_host={current_host}" + ) + print( + f"RABIT_DEBUG: \ + cluster.is_master={cluster.is_master}, \ + current_host={current_host}" + ) + args.update({"is_master": cluster.is_master}) exec_fun(**args) From e4fa8592957e47e62351bfe15248059ab76eead0 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Mon, 29 Sep 2025 11:10:38 -0700 Subject: [PATCH 092/157] debug master host --- .../distributed.py | 34 ++++--------------- 1 file changed, 6 insertions(+), 28 deletions(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index 12106ef9..59c87c8d 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -155,6 +155,8 @@ def __init__(self, is_master, current_host, master_port): @property def is_master(self): """Return hostname-based master determination, ignoring collective rank.""" + logging.info(f"RABIT_HELPER_DEBUG: Returning is_master={self._is_master} for host={self.current_host}") + print(f"RABIT_HELPER_DEBUG: Returning is_master={self._is_master} for host={self.current_host}") return self._is_master def synchronize(self, data): @@ -287,33 +289,9 @@ def start(self): ) except Exception: pass - except Exception as e: - # Debug: Check actual hostname and environment - import socket - - actual_hostname = socket.gethostname() - sm_current_host = os.environ.get("SM_CURRENT_HOST", "NOT_SET") - - # Fallback: use hostname-based logic - if "algo-1" in self.current_host: - is_master = True - elif "algo-2" in self.current_host: - is_master = False - else: - is_master = self.current_host == self.master_host - - self.logger.info( - f"MASTER_DEBUG: Collective failed ({e}). \ - current_host: {self.current_host}, actual_hostname: {actual_hostname}, \ - SM_CURRENT_HOST: {sm_current_host}, master_host: {self.master_host}, \ - all_hosts: {self.hosts}, is_master: {is_master}" - ) - print( - f"MASTER_DEBUG: Collective failed ({e}). \ - current_host: {self.current_host}, actual_hostname: {actual_hostname}, \ - SM_CURRENT_HOST: {sm_current_host}, master_host: {self.master_host}, \ - all_hosts: {self.hosts}, is_master: {is_master}" - ) + + self.logger.info(f"RABIT_START_DEBUG: Creating RabitHelper with is_master={is_master}") + print(f"RABIT_START_DEBUG: Creating RabitHelper with is_master={is_master}") return RabitHelper(is_master, self.current_host, self.port) def stop(self): @@ -329,4 +307,4 @@ def __enter__(self): return self.start() def __exit__(self, exc_type, exc_value, exc_traceback): - return self.stop() + return self.stop() \ No newline at end of file From d3b50627fd5741118b0fa26d333007046202ce58 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Mon, 29 Sep 2025 11:51:34 -0700 Subject: [PATCH 093/157] debug master host --- src/sagemaker_xgboost_container/distributed.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index 59c87c8d..f8feab55 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -289,7 +289,7 @@ def start(self): ) except Exception: pass - + self.logger.info(f"RABIT_START_DEBUG: Creating RabitHelper with is_master={is_master}") print(f"RABIT_START_DEBUG: Creating RabitHelper with is_master={is_master}") return RabitHelper(is_master, self.current_host, self.port) @@ -307,4 +307,4 @@ def __enter__(self): return self.start() def __exit__(self, exc_type, exc_value, exc_traceback): - return self.stop() \ No newline at end of file + return self.stop() From 1abfe259d289d560638ab1a45cdb50a13b5c7a56 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Mon, 29 Sep 2025 13:17:32 -0700 Subject: [PATCH 094/157] debug master host --- .../distributed.py | 21 +++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index f8feab55..95feb5d4 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -99,6 +99,8 @@ def rabit_run( if len(hosts_with_data) > 1: # Set up rabit with nodes that have data and an unused port so that previous slaves don't confuse it # with the previous rabit configuration + logging.info(f"SECOND_RABIT_DEBUG: hosts_with_data={hosts_with_data}, current_host={current_host}") + print(f"SECOND_RABIT_DEBUG: hosts_with_data={hosts_with_data}, current_host={current_host}") with Rabit( hosts=hosts_with_data, current_host=current_host, @@ -141,9 +143,16 @@ def __init__(self, is_master, current_host, master_port): :param current_host: :param master_port: """ + import time + self._is_master = is_master # Store hostname-based master determination self.current_host = current_host self.master_port = master_port + self._id = int(time.time() * 1000000) % 1000000 # Unique ID for debugging + logging.info( + f"RABIT_HELPER_INIT: Created RabitHelper {self._id} with is_master={is_master} for host={current_host}" + ) + print(f"RABIT_HELPER_INIT: Created RabitHelper {self._id} with is_master={is_master} for host={current_host}") try: self.rank = collective.get_rank() @@ -155,8 +164,12 @@ def __init__(self, is_master, current_host, master_port): @property def is_master(self): """Return hostname-based master determination, ignoring collective rank.""" - logging.info(f"RABIT_HELPER_DEBUG: Returning is_master={self._is_master} for host={self.current_host}") - print(f"RABIT_HELPER_DEBUG: Returning is_master={self._is_master} for host={self.current_host}") + logging.info( + f"RABIT_HELPER_DEBUG: RabitHelper {self._id} returning is_master={self._is_master} for host={self.current_host}" + ) + print( + f"RABIT_HELPER_DEBUG: RabitHelper {self._id} returning is_master={self._is_master} for host={self.current_host}" + ) return self._is_master def synchronize(self, data): @@ -181,11 +194,11 @@ def synchronize(self, data): data_str = json.dumps(data) for i in range(self.world_size): if self.rank == i: - logging.debug("Broadcasting data from self ({}) to others".format(self.rank)) + logging.info("Broadcasting data from self ({}) to others".format(self.rank)) collective.broadcast(data_str, i) results.append(data) else: - logging.debug("Receiving data from {}".format(i)) + logging.info("Receiving data from {}".format(i)) message_str = collective.broadcast("", i) message = json.loads(message_str) if message_str else None results.append(message) From 2fdd648f881cc16fe6e8569d27ccb41963a5c57c Mon Sep 17 00:00:00 2001 From: Li Ning Date: Mon, 29 Sep 2025 13:58:13 -0700 Subject: [PATCH 095/157] debug master host --- src/sagemaker_xgboost_container/distributed.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index 95feb5d4..04a029ff 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -165,10 +165,14 @@ def __init__(self, is_master, current_host, master_port): def is_master(self): """Return hostname-based master determination, ignoring collective rank.""" logging.info( - f"RABIT_HELPER_DEBUG: RabitHelper {self._id} returning is_master={self._is_master} for host={self.current_host}" + f"RABIT_HELPER_DEBUG: RabitHelper {self._id} \ + returning is_master={self._is_master} \ + for host={self.current_host}" ) print( - f"RABIT_HELPER_DEBUG: RabitHelper {self._id} returning is_master={self._is_master} for host={self.current_host}" + f"RABIT_HELPER_DEBUG: RabitHelper {self._id} \ + returning is_master={self._is_master} \ + for host={self.current_host}" ) return self._is_master From a80506498c9d8a2f885774ae94b5949ac61e65fc Mon Sep 17 00:00:00 2001 From: Li Ning Date: Mon, 29 Sep 2025 15:53:38 -0700 Subject: [PATCH 096/157] debug master host --- src/sagemaker_xgboost_container/distributed.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index 04a029ff..823dd1bf 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -158,6 +158,7 @@ def __init__(self, is_master, current_host, master_port): self.rank = collective.get_rank() self.world_size = collective.get_world_size() except Exception: + logging.error("collective init failed", exc_info=True) self.rank = 0 self.world_size = 1 @@ -192,6 +193,7 @@ def synchronize(self, data): try: collective.get_rank() # Test if collective is initialized except Exception: + logging.error("collective get_rank failed", exc_info=True) return [data] results = [] From ac7a7885452a425eaaca77de490852425f0d125f Mon Sep 17 00:00:00 2001 From: Li Ning Date: Mon, 29 Sep 2025 17:17:19 -0700 Subject: [PATCH 097/157] check xgboost 2.1.1 --- docker/2.1.0/base/Dockerfile.cpu | 2 +- docker/2.1.0/final/Dockerfile.cpu | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/2.1.0/base/Dockerfile.cpu b/docker/2.1.0/base/Dockerfile.cpu index aae78389..c038af14 100644 --- a/docker/2.1.0/base/Dockerfile.cpu +++ b/docker/2.1.0/base/Dockerfile.cpu @@ -32,7 +32,7 @@ ARG CONDA_PKG_VERSION=24.7.1 ARG PYTHON_VERSION=3.10 ARG PYARROW_VERSION=17.0.0 ARG MLIO_VERSION=0.9.0 -ARG XGBOOST_VERSION=2.1.0 +ARG XGBOOST_VERSION=2.1.1 ENV DEBIAN_FRONTEND=noninteractive ENV LANG=C.UTF-8 diff --git a/docker/2.1.0/final/Dockerfile.cpu b/docker/2.1.0/final/Dockerfile.cpu index 5417ec22..fc737fde 100644 --- a/docker/2.1.0/final/Dockerfile.cpu +++ b/docker/2.1.0/final/Dockerfile.cpu @@ -1,4 +1,4 @@ -ARG SAGEMAKER_XGBOOST_VERSION=2.1.0 +ARG SAGEMAKER_XGBOOST_VERSION=2.1.1 ARG PYTHON_VERSION=3.10 FROM xgboost-container-base:${SAGEMAKER_XGBOOST_VERSION}-cpu-py3 From 41dca75061abc88b58aa388c1ff93ec143fb1e1b Mon Sep 17 00:00:00 2001 From: Li Ning Date: Mon, 29 Sep 2025 19:03:18 -0700 Subject: [PATCH 098/157] check xgboost 2.1.1 --- docker/2.1.0/final/Dockerfile.cpu | 2 +- tox.ini | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/2.1.0/final/Dockerfile.cpu b/docker/2.1.0/final/Dockerfile.cpu index fc737fde..0c593755 100644 --- a/docker/2.1.0/final/Dockerfile.cpu +++ b/docker/2.1.0/final/Dockerfile.cpu @@ -3,7 +3,7 @@ ARG PYTHON_VERSION=3.10 FROM xgboost-container-base:${SAGEMAKER_XGBOOST_VERSION}-cpu-py3 -ARG SAGEMAKER_XGBOOST_VERSION +ARG SAGEMAKER_XGBOOST_VERSION=2.1.0 ######################## # Install dependencies # diff --git a/tox.ini b/tox.ini index 18fbb683..24907fba 100644 --- a/tox.ini +++ b/tox.ini @@ -15,7 +15,7 @@ deps = xgboost1.3: xgboost==1.3.3 xgboost1.5: xgboost==1.5.2 xgboost1.7: xgboost==1.7.4 - xgboost2.1.0: xgboost==2.1.0 + xgboost2.1.0: xgboost==2.1.1 xgboostlatest: xgboost -r{toxinidir}/requirements.txt -r{toxinidir}/test-requirements.txt From b7881bf5eb3c77cb750541b80abc39de8f67a4ab Mon Sep 17 00:00:00 2001 From: Li Ning Date: Mon, 29 Sep 2025 20:03:39 -0700 Subject: [PATCH 099/157] check xgboost 2.1.1 --- docker/2.1.0/final/Dockerfile.cpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/2.1.0/final/Dockerfile.cpu b/docker/2.1.0/final/Dockerfile.cpu index 0c593755..9005bd7f 100644 --- a/docker/2.1.0/final/Dockerfile.cpu +++ b/docker/2.1.0/final/Dockerfile.cpu @@ -1,4 +1,4 @@ -ARG SAGEMAKER_XGBOOST_VERSION=2.1.1 +ARG SAGEMAKER_XGBOOST_VERSION=2.1.0 ARG PYTHON_VERSION=3.10 FROM xgboost-container-base:${SAGEMAKER_XGBOOST_VERSION}-cpu-py3 From bfd87e77f4f2cc84a18136e20367ed6e03d3f8fe Mon Sep 17 00:00:00 2001 From: Li Ning Date: Mon, 29 Sep 2025 22:20:26 -0700 Subject: [PATCH 100/157] check xgboost 2.1.1 --- docker/2.1.0/base/Dockerfile.cpu | 2 +- .../distributed.py | 74 +++++++++++++++++++ tox.ini | 2 +- 3 files changed, 76 insertions(+), 2 deletions(-) diff --git a/docker/2.1.0/base/Dockerfile.cpu b/docker/2.1.0/base/Dockerfile.cpu index c038af14..aae78389 100644 --- a/docker/2.1.0/base/Dockerfile.cpu +++ b/docker/2.1.0/base/Dockerfile.cpu @@ -32,7 +32,7 @@ ARG CONDA_PKG_VERSION=24.7.1 ARG PYTHON_VERSION=3.10 ARG PYARROW_VERSION=17.0.0 ARG MLIO_VERSION=0.9.0 -ARG XGBOOST_VERSION=2.1.1 +ARG XGBOOST_VERSION=2.1.0 ENV DEBIAN_FRONTEND=noninteractive ENV LANG=C.UTF-8 diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index 823dd1bf..a3b93235 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -24,6 +24,8 @@ from retrying import retry from xgboost import collective +from sagemaker_xgboost_container.dmlc_patch import tracker + LOCAL_HOSTNAME = "127.0.0.1" @@ -264,6 +266,76 @@ def __init__( self.connect_retry_timeout = connect_retry_timeout def start(self): + """Start the rabit process. + + If current host is master host, initialize and start the Rabit Tracker in the background. All hosts then connect + to the master host to set up Rabit rank. + + :return: Initialized RabitHelper, which includes helpful information such as is_master and port + """ + self.rabit_context = None + if self.is_master_host: + self.logger.debug("Master host. Starting Rabit Tracker.") + # The Rabit Tracker is a Python script that is responsible for + # allowing each instance of rabit to find its peers and organize + # itself in to a ring for all-reduce. It supports primitive failure + # recovery modes. + # + # It runs on a master node that each of the individual Rabit instances + # talk to. + self.rabit_context = tracker.RabitTracker( + hostIP=self.current_host, nslave=self.n_workers, port=self.port, port_end=self.port + 1 + ) + + # Useful logging to ensure that the tracker has started. + # These are the key-value config pairs that each of the rabit slaves + # should be initialized with. Since we have deterministically allocated + # the master host, its port, and the number of workers, we don't need + # to pass these out-of-band to each slave; but rely on the fact + # that each slave will calculate the exact same config as the server. + # + # TODO: should probably check that these match up what we pass below. + self.logger.info("Rabit slave environment: {}".format(self.rabit_context.slave_envs())) + + # This actually starts the RabitTracker in a background/daemon thread + # that will automatically exit when the main process has finished. + self.rabit_context.start(self.n_workers) + + # Start each parameter server that connects to the master. + self.logger.debug("Starting parameter server.") + + # Rabit runs as an in-process singleton library that can be configured once. + # Calling this multiple times will cause a seg-fault (without calling finalize). + # We pass it the environment variables that match up with the RabitTracker + # so that this instance can discover its peers (and recover from failure). + # + # First we check that the RabitTracker is up and running. Rabit actually + # breaks (at least on Mac OS X) if the server is not running before it + # begins to try to connect (its internal retries fail because they reuse + # the same socket instead of creating a new one). + # + # if self.max_connect_attempts is None, this will loop indefinitely. + attempt = 0 + successful_connection = False + while not successful_connection and (self.max_connect_attempts is None or attempt < self.max_connect_attempts): + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + try: + self.logger.debug("Checking if RabitTracker is available.") + s.connect((self.master_host, self.port)) + successful_connection = True + self.logger.debug("Successfully connected to RabitTracker.") + except OSError: + self.logger.info("Failed to connect to RabitTracker on attempt {}".format(attempt)) + attempt += 1 + self.logger.info("Sleeping for {} sec before retrying".format(self.connect_retry_timeout)) + time.sleep(self.connect_retry_timeout) + + if not successful_connection: + self.logger.error("Failed to connect to Rabit Tracker after %s attempts", self.max_connect_attempts) + raise Exception("Failed to connect to Rabit Tracker") + else: + self.logger.info("Connected to RabitTracker.") + """Start the collective process. Initialize XGBoost collective for distributed training. @@ -319,6 +391,8 @@ def stop(self): try: collective.finalize() + if self.is_master_host: + self.rabit_context.join() except Exception as e: self.logger.debug("Collective finalize failed: {}".format(e)) diff --git a/tox.ini b/tox.ini index 24907fba..18fbb683 100644 --- a/tox.ini +++ b/tox.ini @@ -15,7 +15,7 @@ deps = xgboost1.3: xgboost==1.3.3 xgboost1.5: xgboost==1.5.2 xgboost1.7: xgboost==1.7.4 - xgboost2.1.0: xgboost==2.1.1 + xgboost2.1.0: xgboost==2.1.0 xgboostlatest: xgboost -r{toxinidir}/requirements.txt -r{toxinidir}/test-requirements.txt From a3444d1216ca05c043e861e370b38640cc2660e2 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Mon, 29 Sep 2025 23:03:57 -0700 Subject: [PATCH 101/157] check xgboost 2.1.1 --- .../distributed.py | 74 ------------------- 1 file changed, 74 deletions(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index a3b93235..823dd1bf 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -24,8 +24,6 @@ from retrying import retry from xgboost import collective -from sagemaker_xgboost_container.dmlc_patch import tracker - LOCAL_HOSTNAME = "127.0.0.1" @@ -266,76 +264,6 @@ def __init__( self.connect_retry_timeout = connect_retry_timeout def start(self): - """Start the rabit process. - - If current host is master host, initialize and start the Rabit Tracker in the background. All hosts then connect - to the master host to set up Rabit rank. - - :return: Initialized RabitHelper, which includes helpful information such as is_master and port - """ - self.rabit_context = None - if self.is_master_host: - self.logger.debug("Master host. Starting Rabit Tracker.") - # The Rabit Tracker is a Python script that is responsible for - # allowing each instance of rabit to find its peers and organize - # itself in to a ring for all-reduce. It supports primitive failure - # recovery modes. - # - # It runs on a master node that each of the individual Rabit instances - # talk to. - self.rabit_context = tracker.RabitTracker( - hostIP=self.current_host, nslave=self.n_workers, port=self.port, port_end=self.port + 1 - ) - - # Useful logging to ensure that the tracker has started. - # These are the key-value config pairs that each of the rabit slaves - # should be initialized with. Since we have deterministically allocated - # the master host, its port, and the number of workers, we don't need - # to pass these out-of-band to each slave; but rely on the fact - # that each slave will calculate the exact same config as the server. - # - # TODO: should probably check that these match up what we pass below. - self.logger.info("Rabit slave environment: {}".format(self.rabit_context.slave_envs())) - - # This actually starts the RabitTracker in a background/daemon thread - # that will automatically exit when the main process has finished. - self.rabit_context.start(self.n_workers) - - # Start each parameter server that connects to the master. - self.logger.debug("Starting parameter server.") - - # Rabit runs as an in-process singleton library that can be configured once. - # Calling this multiple times will cause a seg-fault (without calling finalize). - # We pass it the environment variables that match up with the RabitTracker - # so that this instance can discover its peers (and recover from failure). - # - # First we check that the RabitTracker is up and running. Rabit actually - # breaks (at least on Mac OS X) if the server is not running before it - # begins to try to connect (its internal retries fail because they reuse - # the same socket instead of creating a new one). - # - # if self.max_connect_attempts is None, this will loop indefinitely. - attempt = 0 - successful_connection = False - while not successful_connection and (self.max_connect_attempts is None or attempt < self.max_connect_attempts): - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - try: - self.logger.debug("Checking if RabitTracker is available.") - s.connect((self.master_host, self.port)) - successful_connection = True - self.logger.debug("Successfully connected to RabitTracker.") - except OSError: - self.logger.info("Failed to connect to RabitTracker on attempt {}".format(attempt)) - attempt += 1 - self.logger.info("Sleeping for {} sec before retrying".format(self.connect_retry_timeout)) - time.sleep(self.connect_retry_timeout) - - if not successful_connection: - self.logger.error("Failed to connect to Rabit Tracker after %s attempts", self.max_connect_attempts) - raise Exception("Failed to connect to Rabit Tracker") - else: - self.logger.info("Connected to RabitTracker.") - """Start the collective process. Initialize XGBoost collective for distributed training. @@ -391,8 +319,6 @@ def stop(self): try: collective.finalize() - if self.is_master_host: - self.rabit_context.join() except Exception as e: self.logger.debug("Collective finalize failed: {}".format(e)) From 216c3413c5829527801f4e454815b1fbd6d479cb Mon Sep 17 00:00:00 2001 From: Li Ning Date: Mon, 29 Sep 2025 23:51:21 -0700 Subject: [PATCH 102/157] check xgboost 2.1.1 --- src/sagemaker_xgboost_container/distributed.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index 823dd1bf..1a6b251e 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -276,6 +276,7 @@ def start(self): os.environ["DMLC_NUM_WORKER"] = str(self.n_workers) os.environ["DMLC_TRACKER_URI"] = self.master_host os.environ["DMLC_TRACKER_PORT"] = str(self.port) + os.environ["DMLC_TASK_ID"] = str(self.hosts.index(self.current_host)) # For single node, skip collective initialization if self.n_workers == 1: From 51d0ebc93f906d9d7785703143329b2cd5e0980c Mon Sep 17 00:00:00 2001 From: Li Ning Date: Tue, 30 Sep 2025 03:11:00 -0700 Subject: [PATCH 103/157] check xgboost 2.1.1 --- docker/2.1.0/final/Dockerfile.cpu | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docker/2.1.0/final/Dockerfile.cpu b/docker/2.1.0/final/Dockerfile.cpu index 9005bd7f..4c73c710 100644 --- a/docker/2.1.0/final/Dockerfile.cpu +++ b/docker/2.1.0/final/Dockerfile.cpu @@ -18,8 +18,10 @@ RUN python3 -m pip install -r /requirements.txt && rm /requirements.txt # setattr(collections, 'Mapping', collections.abc.Mapping); \ # exec(open('/miniconda3/lib/python3.10/site-packages/sagemaker_containers/_mapping.py').read().replace('collections.Mapping', 'collections.abc.Mapping'))" || \ # sed -i 's/collections\.Mapping/collections.abc.Mapping/g' /miniconda3/lib/python3.10/site-packages/sagemaker_containers/_mapping.py +# Fix Python 3.10 compatibility for sagemaker-containers +RUN python3 -c "import sys; import os; site_packages = '/usr/local/lib/python3.10/dist-packages'; mapping_file = os.path.join(site_packages, 'sagemaker_containers/_mapping.py'); exec('if os.path.exists(mapping_file):\\n with open(mapping_file, \"r\") as f:\\n content = f.read()\\n content = content.replace(\"collections.Mapping\", \"collections.abc.Mapping\")\\n with open(mapping_file, \"w\") as f:\\n f.write(content)')" -RUN sed -i 's/collections\.Mapping/collections.abc.Mapping/g' /miniconda3/lib/python3.10/site-packages/sagemaker_containers/_mapping.py +# RUN sed -i 's/collections\.Mapping/collections.abc.Mapping/g' /miniconda3/lib/python3.10/site-packages/sagemaker_containers/_mapping.py # Install smdebug from source RUN python3 -m pip install git+https://github.com/awslabs/sagemaker-debugger.git@1.0.29 From 21fc05284569352051c62170d4522f58363a62f6 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Tue, 30 Sep 2025 12:10:38 -0700 Subject: [PATCH 104/157] check xgboost 2.1.1 --- docker/2.1.0/final/Dockerfile.cpu | 4 +-- .../distributed.py | 26 +++++++++++-------- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/docker/2.1.0/final/Dockerfile.cpu b/docker/2.1.0/final/Dockerfile.cpu index 4c73c710..9005bd7f 100644 --- a/docker/2.1.0/final/Dockerfile.cpu +++ b/docker/2.1.0/final/Dockerfile.cpu @@ -18,10 +18,8 @@ RUN python3 -m pip install -r /requirements.txt && rm /requirements.txt # setattr(collections, 'Mapping', collections.abc.Mapping); \ # exec(open('/miniconda3/lib/python3.10/site-packages/sagemaker_containers/_mapping.py').read().replace('collections.Mapping', 'collections.abc.Mapping'))" || \ # sed -i 's/collections\.Mapping/collections.abc.Mapping/g' /miniconda3/lib/python3.10/site-packages/sagemaker_containers/_mapping.py -# Fix Python 3.10 compatibility for sagemaker-containers -RUN python3 -c "import sys; import os; site_packages = '/usr/local/lib/python3.10/dist-packages'; mapping_file = os.path.join(site_packages, 'sagemaker_containers/_mapping.py'); exec('if os.path.exists(mapping_file):\\n with open(mapping_file, \"r\") as f:\\n content = f.read()\\n content = content.replace(\"collections.Mapping\", \"collections.abc.Mapping\")\\n with open(mapping_file, \"w\") as f:\\n f.write(content)')" -# RUN sed -i 's/collections\.Mapping/collections.abc.Mapping/g' /miniconda3/lib/python3.10/site-packages/sagemaker_containers/_mapping.py +RUN sed -i 's/collections\.Mapping/collections.abc.Mapping/g' /miniconda3/lib/python3.10/site-packages/sagemaker_containers/_mapping.py # Install smdebug from source RUN python3 -m pip install git+https://github.com/awslabs/sagemaker-debugger.git@1.0.29 diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index 1a6b251e..ad7a0ad7 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -22,6 +22,7 @@ import os from retrying import retry +from xgboost.tracker import RabitTracker from xgboost import collective LOCAL_HOSTNAME = "127.0.0.1" @@ -284,18 +285,20 @@ def start(self): return RabitHelper(True, self.current_host, self.port) try: + # Launch tracker on master, register on workers + if self.current_host == self.master_host: + self.tracker = RabitTracker(host_ip=self.master_host, n_workers=self.n_workers, port=self.port) + self.tracker.start() + self.tracker.wait_for(self.connect_retry_timeout) + + # Initialize collective for synchronization collective.init() - self.logger.debug("Collective started - Rank {}".format(collective.get_rank())) - except Exception as e: - self.logger.warning("Collective init failed: {}, falling back to single node".format(e)) - return RabitHelper(True, self.current_host, self.port) - # Use hostname-based master selection instead of buggy collective rank - # Both hosts incorrectly get rank 0, so we can't trust collective.get_rank() - is_master = self.current_host == self.master_host + # Use hostname-based master selection instead of buggy collective rank + # Both hosts incorrectly get rank 0, so we can't trust collective.get_rank() + is_master = self.current_host == self.master_host - # Debug logging - try: + # Debug logging rank = collective.get_rank() self.logger.info( f"MASTER_DEBUG_FIXED: Ignoring collective rank {rank}. \ @@ -307,8 +310,9 @@ def start(self): Using hostname logic: current_host={self.current_host}, \ master_host={self.master_host}, is_master={is_master}" ) - except Exception: - pass + except Exception as e: + self.logger.warning("Collective init failed: {}, falling back to single node".format(e)) + return RabitHelper(True, self.current_host, self.port) self.logger.info(f"RABIT_START_DEBUG: Creating RabitHelper with is_master={is_master}") print(f"RABIT_START_DEBUG: Creating RabitHelper with is_master={is_master}") From ebf9e7a09ba4c15b439b42665cd570935d39b536 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Tue, 30 Sep 2025 13:27:36 -0700 Subject: [PATCH 105/157] check xgboost 2.1.1 --- .../distributed.py | 77 ++++++++++++++----- 1 file changed, 59 insertions(+), 18 deletions(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index ad7a0ad7..30893489 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -20,6 +20,7 @@ import sys import json import os +import time from retrying import retry from xgboost.tracker import RabitTracker @@ -153,7 +154,6 @@ def __init__(self, is_master, current_host, master_port): logging.info( f"RABIT_HELPER_INIT: Created RabitHelper {self._id} with is_master={is_master} for host={current_host}" ) - print(f"RABIT_HELPER_INIT: Created RabitHelper {self._id} with is_master={is_master} for host={current_host}") try: self.rank = collective.get_rank() @@ -272,6 +272,7 @@ def start(self): :return: Initialized RabitHelper, which includes helpful information such as is_master and port """ self.logger.debug("Starting collective communication.") + self.tracker = None # Set environment variables for collective os.environ["DMLC_NUM_WORKER"] = str(self.n_workers) @@ -285,37 +286,65 @@ def start(self): return RabitHelper(True, self.current_host, self.port) try: - # Launch tracker on master, register on workers + # Launch tracker on master only if self.current_host == self.master_host: - self.tracker = RabitTracker(host_ip=self.master_host, n_workers=self.n_workers, port=self.port) + self.tracker = RabitTracker( + host_ip=self.master_host, n_workers=self.n_workers, port=self.port, sortby="task" + ) self.tracker.start() - self.tracker.wait_for(self.connect_retry_timeout) + + # Rabit runs as an in-process singleton library that can be configured once. + # Calling this multiple times will cause a seg-fault (without calling finalize). + # We pass it the environment variables that match up with the RabitTracker + # so that this instance can discover its peers (and recover from failure). + # + # First we check that the RabitTracker is up and running. Rabit actually + # breaks (at least on Mac OS X) if the server is not running before it + # begins to try to connect (its internal retries fail because they reuse + # the same socket instead of creating a new one). + # + # if self.max_connect_attempts is None, this will loop indefinitely. + attempt = 0 + successful_connection = False + while not successful_connection and ( + self.max_connect_attempts is None or attempt < self.max_connect_attempts + ): + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + try: + self.logger.debug("Checking if RabitTracker is available.") + s.connect((self.master_host, self.port)) + successful_connection = True + self.logger.debug("Successfully connected to RabitTracker.") + except OSError: + self.logger.info("Failed to connect to RabitTracker on attempt {}".format(attempt)) + attempt += 1 + self.logger.info("Sleeping for {} sec before retrying".format(self.connect_retry_timeout)) + time.sleep(self.connect_retry_timeout) + + if not successful_connection: + self.logger.error("Failed to connect to Rabit Tracker after %s attempts", self.max_connect_attempts) + raise Exception("Failed to connect to Rabit Tracker") + else: + self.logger.info("Connected to RabitTracker.") # Initialize collective for synchronization collective.init() - # Use hostname-based master selection instead of buggy collective rank - # Both hosts incorrectly get rank 0, so we can't trust collective.get_rank() + # Use hostname-based master selection is_master = self.current_host == self.master_host - # Debug logging - rank = collective.get_rank() self.logger.info( - f"MASTER_DEBUG_FIXED: Ignoring collective rank {rank}. \ - Using hostname logic: current_host={self.current_host}, \ - master_host={self.master_host}, is_master={is_master}" - ) - print( - f"MASTER_DEBUG_FIXED: Ignoring collective rank {rank}. \ - Using hostname logic: current_host={self.current_host}, \ - master_host={self.master_host}, is_master={is_master}" + f"MASTER_DEBUG_FIXED: Using hostname logic: \ + current_host={self.current_host}, \ + master_host={self.master_host}, \ + is_master={is_master}" ) except Exception as e: - self.logger.warning("Collective init failed: {}, falling back to single node".format(e)) + self.logger.warning("Collective init failed: {}, " "falling back to single node".format(e)) + self._cleanup_tracker() return RabitHelper(True, self.current_host, self.port) self.logger.info(f"RABIT_START_DEBUG: Creating RabitHelper with is_master={is_master}") - print(f"RABIT_START_DEBUG: Creating RabitHelper with is_master={is_master}") return RabitHelper(is_master, self.current_host, self.port) def stop(self): @@ -327,6 +356,18 @@ def stop(self): except Exception as e: self.logger.debug("Collective finalize failed: {}".format(e)) + self._cleanup_tracker() + + def _cleanup_tracker(self): + """Clean up tracker safely.""" + if hasattr(self, "tracker") and self.tracker is not None: + try: + self.tracker.free() + except Exception as e: + self.logger.debug("Tracker cleanup failed: {}".format(e)) + finally: + self.tracker = None + def __enter__(self): return self.start() From 9aef1c7052f5999105c59403911bf2102f60888b Mon Sep 17 00:00:00 2001 From: Li Ning Date: Tue, 30 Sep 2025 14:46:40 -0700 Subject: [PATCH 106/157] check xgboost 2.1.1 --- .../distributed.py | 86 ++++++++++--------- 1 file changed, 45 insertions(+), 41 deletions(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index 30893489..e6da7911 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -274,12 +274,6 @@ def start(self): self.logger.debug("Starting collective communication.") self.tracker = None - # Set environment variables for collective - os.environ["DMLC_NUM_WORKER"] = str(self.n_workers) - os.environ["DMLC_TRACKER_URI"] = self.master_host - os.environ["DMLC_TRACKER_PORT"] = str(self.port) - os.environ["DMLC_TASK_ID"] = str(self.hosts.index(self.current_host)) - # For single node, skip collective initialization if self.n_workers == 1: self.logger.debug("Single worker detected, skipping collective init") @@ -293,41 +287,51 @@ def start(self): ) self.tracker.start() - # Rabit runs as an in-process singleton library that can be configured once. - # Calling this multiple times will cause a seg-fault (without calling finalize). - # We pass it the environment variables that match up with the RabitTracker - # so that this instance can discover its peers (and recover from failure). - # - # First we check that the RabitTracker is up and running. Rabit actually - # breaks (at least on Mac OS X) if the server is not running before it - # begins to try to connect (its internal retries fail because they reuse - # the same socket instead of creating a new one). - # - # if self.max_connect_attempts is None, this will loop indefinitely. - attempt = 0 - successful_connection = False - while not successful_connection and ( - self.max_connect_attempts is None or attempt < self.max_connect_attempts - ): - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - try: - self.logger.debug("Checking if RabitTracker is available.") - s.connect((self.master_host, self.port)) - successful_connection = True - self.logger.debug("Successfully connected to RabitTracker.") - except OSError: - self.logger.info("Failed to connect to RabitTracker on attempt {}".format(attempt)) - attempt += 1 - self.logger.info("Sleeping for {} sec before retrying".format(self.connect_retry_timeout)) - time.sleep(self.connect_retry_timeout) - - if not successful_connection: - self.logger.error("Failed to connect to Rabit Tracker after %s attempts", self.max_connect_attempts) - raise Exception("Failed to connect to Rabit Tracker") - else: - self.logger.info("Connected to RabitTracker.") - - # Initialize collective for synchronization + with collective.CommunicatorContext(**self.tracker.worker_args()): + ret = collective.broadcast("msg", 0) + assert str(ret) == "msg" + + # Set environment variables for collective + os.environ["DMLC_NUM_WORKER"] = str(self.n_workers) + os.environ["DMLC_TRACKER_URI"] = self.master_host + os.environ["DMLC_TRACKER_PORT"] = str(self.port) + os.environ["DMLC_TASK_ID"] = str(self.hosts.index(self.current_host)) + + # # Rabit runs as an in-process singleton library that can be configured once. + # # Calling this multiple times will cause a seg-fault (without calling finalize). + # # We pass it the environment variables that match up with the RabitTracker + # # so that this instance can discover its peers (and recover from failure). + # # + # # First we check that the RabitTracker is up and running. Rabit actually + # # breaks (at least on Mac OS X) if the server is not running before it + # # begins to try to connect (its internal retries fail because they reuse + # # the same socket instead of creating a new one). + # # + # # if self.max_connect_attempts is None, this will loop indefinitely. + # attempt = 0 + # successful_connection = False + # while not successful_connection and ( + # self.max_connect_attempts is None or attempt < self.max_connect_attempts + # ): + # with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + # try: + # self.logger.debug("Checking if RabitTracker is available.") + # s.connect((self.master_host, self.port)) + # successful_connection = True + # self.logger.debug("Successfully connected to RabitTracker.") + # except OSError: + # self.logger.info("Failed to connect to RabitTracker on attempt {}".format(attempt)) + # attempt += 1 + # self.logger.info("Sleeping for {} sec before retrying".format(self.connect_retry_timeout)) + # time.sleep(self.connect_retry_timeout) + + # if not successful_connection: + # self.logger.error("Failed to connect to Rabit Tracker after %s attempts", self.max_connect_attempts) + # raise Exception("Failed to connect to Rabit Tracker") + # else: + # self.logger.info("Connected to RabitTracker.") + + # # Initialize collective for synchronization collective.init() # Use hostname-based master selection From 04d5eb7f05078f11fb7c9e42dd103bcc0675375a Mon Sep 17 00:00:00 2001 From: Li Ning Date: Tue, 30 Sep 2025 17:03:56 -0700 Subject: [PATCH 107/157] check xgboost 2.1.1 --- src/sagemaker_xgboost_container/distributed.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index e6da7911..6d5f5b4f 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -281,7 +281,7 @@ def start(self): try: # Launch tracker on master only - if self.current_host == self.master_host: + if self.is_master_host: self.tracker = RabitTracker( host_ip=self.master_host, n_workers=self.n_workers, port=self.port, sortby="task" ) @@ -334,9 +334,6 @@ def start(self): # # Initialize collective for synchronization collective.init() - # Use hostname-based master selection - is_master = self.current_host == self.master_host - self.logger.info( f"MASTER_DEBUG_FIXED: Using hostname logic: \ current_host={self.current_host}, \ @@ -349,7 +346,7 @@ def start(self): return RabitHelper(True, self.current_host, self.port) self.logger.info(f"RABIT_START_DEBUG: Creating RabitHelper with is_master={is_master}") - return RabitHelper(is_master, self.current_host, self.port) + return RabitHelper(self.is_master_host, self.current_host, self.port) def stop(self): """Shutdown collective communication.""" @@ -364,7 +361,7 @@ def stop(self): def _cleanup_tracker(self): """Clean up tracker safely.""" - if hasattr(self, "tracker") and self.tracker is not None: + if self.tracker is not None: try: self.tracker.free() except Exception as e: @@ -373,7 +370,7 @@ def _cleanup_tracker(self): self.tracker = None def __enter__(self): - return self.start() + self.start() def __exit__(self, exc_type, exc_value, exc_traceback): - return self.stop() + self.stop() From 2e2439f8ea4217d23ed7800168d6bddcc0f0e62e Mon Sep 17 00:00:00 2001 From: Li Ning Date: Tue, 30 Sep 2025 17:14:58 -0700 Subject: [PATCH 108/157] check xgboost 2.1.1 --- .../distributed.py | 23 ++++--------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index 6d5f5b4f..baa2dde0 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -147,12 +147,12 @@ def __init__(self, is_master, current_host, master_port): """ import time - self._is_master = is_master # Store hostname-based master determination + self.is_master = is_master # Store hostname-based master determination self.current_host = current_host self.master_port = master_port self._id = int(time.time() * 1000000) % 1000000 # Unique ID for debugging logging.info( - f"RABIT_HELPER_INIT: Created RabitHelper {self._id} with is_master={is_master} for host={current_host}" + f"RABIT_HELPER_INIT: Created RabitHelper {self._id} with is_master={self.is_master} for host={current_host}" ) try: @@ -163,21 +163,6 @@ def __init__(self, is_master, current_host, master_port): self.rank = 0 self.world_size = 1 - @property - def is_master(self): - """Return hostname-based master determination, ignoring collective rank.""" - logging.info( - f"RABIT_HELPER_DEBUG: RabitHelper {self._id} \ - returning is_master={self._is_master} \ - for host={self.current_host}" - ) - print( - f"RABIT_HELPER_DEBUG: RabitHelper {self._id} \ - returning is_master={self._is_master} \ - for host={self.current_host}" - ) - return self._is_master - def synchronize(self, data): """Synchronize data with the cluster. @@ -338,14 +323,14 @@ def start(self): f"MASTER_DEBUG_FIXED: Using hostname logic: \ current_host={self.current_host}, \ master_host={self.master_host}, \ - is_master={is_master}" + is_master={self.is_master_host}" ) except Exception as e: self.logger.warning("Collective init failed: {}, " "falling back to single node".format(e)) self._cleanup_tracker() return RabitHelper(True, self.current_host, self.port) - self.logger.info(f"RABIT_START_DEBUG: Creating RabitHelper with is_master={is_master}") + self.logger.info(f"RABIT_START_DEBUG: Creating RabitHelper with is_master={self.is_master_host}") return RabitHelper(self.is_master_host, self.current_host, self.port) def stop(self): From 849b911e9c35d92892f54415ae53396641c7d657 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Tue, 30 Sep 2025 18:37:13 -0700 Subject: [PATCH 109/157] check xgboost 2.1.1 --- .../distributed.py | 66 ++++++++----------- 1 file changed, 29 insertions(+), 37 deletions(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index baa2dde0..08f50784 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -271,10 +271,11 @@ def start(self): host_ip=self.master_host, n_workers=self.n_workers, port=self.port, sortby="task" ) self.tracker.start() + self.logger.info("RabitTracker started") with collective.CommunicatorContext(**self.tracker.worker_args()): ret = collective.broadcast("msg", 0) - assert str(ret) == "msg" + assert str(ret) == "msg", f"Expected msg is returned" # Set environment variables for collective os.environ["DMLC_NUM_WORKER"] = str(self.n_workers) @@ -282,41 +283,32 @@ def start(self): os.environ["DMLC_TRACKER_PORT"] = str(self.port) os.environ["DMLC_TASK_ID"] = str(self.hosts.index(self.current_host)) - # # Rabit runs as an in-process singleton library that can be configured once. - # # Calling this multiple times will cause a seg-fault (without calling finalize). - # # We pass it the environment variables that match up with the RabitTracker - # # so that this instance can discover its peers (and recover from failure). - # # - # # First we check that the RabitTracker is up and running. Rabit actually - # # breaks (at least on Mac OS X) if the server is not running before it - # # begins to try to connect (its internal retries fail because they reuse - # # the same socket instead of creating a new one). - # # - # # if self.max_connect_attempts is None, this will loop indefinitely. - # attempt = 0 - # successful_connection = False - # while not successful_connection and ( - # self.max_connect_attempts is None or attempt < self.max_connect_attempts - # ): - # with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - # try: - # self.logger.debug("Checking if RabitTracker is available.") - # s.connect((self.master_host, self.port)) - # successful_connection = True - # self.logger.debug("Successfully connected to RabitTracker.") - # except OSError: - # self.logger.info("Failed to connect to RabitTracker on attempt {}".format(attempt)) - # attempt += 1 - # self.logger.info("Sleeping for {} sec before retrying".format(self.connect_retry_timeout)) - # time.sleep(self.connect_retry_timeout) - - # if not successful_connection: - # self.logger.error("Failed to connect to Rabit Tracker after %s attempts", self.max_connect_attempts) - # raise Exception("Failed to connect to Rabit Tracker") - # else: - # self.logger.info("Connected to RabitTracker.") - - # # Initialize collective for synchronization + # Wait for RabitTracker to be available before initializing collective + if not self.is_master_host: + attempt = 0 + successful_connection = False + while not successful_connection and ( + self.max_connect_attempts is None or attempt < self.max_connect_attempts + ): + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + try: + self.logger.debug("Checking if RabitTracker is available.") + s.connect((self.master_host, self.port)) + successful_connection = True + self.logger.debug("Successfully connected to RabitTracker.") + except OSError: + self.logger.info("Failed to connect to RabitTracker on attempt {}".format(attempt)) + attempt += 1 + self.logger.info("Sleeping for {} sec before retrying".format(self.connect_retry_timeout)) + time.sleep(self.connect_retry_timeout) + + if not successful_connection: + self.logger.error("Failed to connect to Rabit Tracker after %s attempts", self.max_connect_attempts) + raise Exception("Failed to connect to Rabit Tracker") + else: + self.logger.info("Connected to RabitTracker.") + + # Initialize collective for synchronization collective.init() self.logger.info( @@ -355,7 +347,7 @@ def _cleanup_tracker(self): self.tracker = None def __enter__(self): - self.start() + return self.start() def __exit__(self, exc_type, exc_value, exc_traceback): self.stop() From d9596d78c82cf596314982ad0521ea2737127f36 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Tue, 30 Sep 2025 19:33:20 -0700 Subject: [PATCH 110/157] check xgboost 2.1.1 --- .../distributed.py | 33 ++++--------------- 1 file changed, 7 insertions(+), 26 deletions(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index 08f50784..24069526 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -283,33 +283,14 @@ def start(self): os.environ["DMLC_TRACKER_PORT"] = str(self.port) os.environ["DMLC_TASK_ID"] = str(self.hosts.index(self.current_host)) - # Wait for RabitTracker to be available before initializing collective - if not self.is_master_host: - attempt = 0 - successful_connection = False - while not successful_connection and ( - self.max_connect_attempts is None or attempt < self.max_connect_attempts - ): - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - try: - self.logger.debug("Checking if RabitTracker is available.") - s.connect((self.master_host, self.port)) - successful_connection = True - self.logger.debug("Successfully connected to RabitTracker.") - except OSError: - self.logger.info("Failed to connect to RabitTracker on attempt {}".format(attempt)) - attempt += 1 - self.logger.info("Sleeping for {} sec before retrying".format(self.connect_retry_timeout)) - time.sleep(self.connect_retry_timeout) - - if not successful_connection: - self.logger.error("Failed to connect to Rabit Tracker after %s attempts", self.max_connect_attempts) - raise Exception("Failed to connect to Rabit Tracker") - else: - self.logger.info("Connected to RabitTracker.") - # Initialize collective for synchronization - collective.init() + collective.init( + dmlc_tracker_uri=self.master_host, + dmlc_tracker_port=self.port, + dmlc_task_id=self.hosts.index(self.current_host), + dmlc_retry=self.max_connect_attempts, + dmlc_timeout=self.connect_retry_timeout, + ) self.logger.info( f"MASTER_DEBUG_FIXED: Using hostname logic: \ From d5116763d62c521a0dfd99e34a5f6504582c7421 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Tue, 30 Sep 2025 19:40:15 -0700 Subject: [PATCH 111/157] check xgboost 2.1.1 --- src/sagemaker_xgboost_container/distributed.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index 24069526..e09d440b 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -20,7 +20,7 @@ import sys import json import os -import time +from threading import Thread from retrying import retry from xgboost.tracker import RabitTracker @@ -273,9 +273,9 @@ def start(self): self.tracker.start() self.logger.info("RabitTracker started") - with collective.CommunicatorContext(**self.tracker.worker_args()): - ret = collective.broadcast("msg", 0) - assert str(ret) == "msg", f"Expected msg is returned" + thread = Thread(target=self.tracker.wait_for) + thread.daemon = True + thread.start() # Set environment variables for collective os.environ["DMLC_NUM_WORKER"] = str(self.n_workers) From 253934d4e3ca9590727b89639ba804968b001057 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Tue, 30 Sep 2025 20:53:03 -0700 Subject: [PATCH 112/157] check xgboost 2.1.1 --- src/sagemaker_xgboost_container/distributed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index e09d440b..5f5d3912 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -287,7 +287,7 @@ def start(self): collective.init( dmlc_tracker_uri=self.master_host, dmlc_tracker_port=self.port, - dmlc_task_id=self.hosts.index(self.current_host), + dmlc_task_id=str(self.hosts.index(self.current_host)), dmlc_retry=self.max_connect_attempts, dmlc_timeout=self.connect_retry_timeout, ) From c7961acb805d82790668d0f66e8c06530388050a Mon Sep 17 00:00:00 2001 From: Li Ning Date: Wed, 1 Oct 2025 00:22:10 -0700 Subject: [PATCH 113/157] check xgboost 2.1.0 --- .../distributed.py | 26 +++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index 5f5d3912..3137d675 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -44,6 +44,28 @@ def wait_hostname_resolution(sm_hosts): _dns_lookup(host) +def get_host_ip(hostIP=None): + if hostIP is None or hostIP == "auto": + hostIP = "ip" + + if hostIP == "dns": + hostIP = socket.getfqdn() + elif hostIP == "ip": + from socket import gaierror + + try: + hostIP = socket.gethostbyname(socket.getfqdn()) + except gaierror: + logger.warn("gethostbyname(socket.getfqdn()) failed... trying on hostname()") + hostIP = socket.gethostbyname(socket.gethostname()) + if hostIP.startswith("127."): + s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + # doesn't have to be reachable + s.connect(("10.255.255.255", 1)) + hostIP = s.getsockname()[0] + return hostIP + + def rabit_run( exec_fun, args, @@ -268,7 +290,7 @@ def start(self): # Launch tracker on master only if self.is_master_host: self.tracker = RabitTracker( - host_ip=self.master_host, n_workers=self.n_workers, port=self.port, sortby="task" + host_ip=get_host_ip(self.master_host), n_workers=self.n_workers, port=self.port, sortby="task" ) self.tracker.start() self.logger.info("RabitTracker started") @@ -279,7 +301,7 @@ def start(self): # Set environment variables for collective os.environ["DMLC_NUM_WORKER"] = str(self.n_workers) - os.environ["DMLC_TRACKER_URI"] = self.master_host + os.environ["DMLC_TRACKER_URI"] = get_host_ip(self.master_host) os.environ["DMLC_TRACKER_PORT"] = str(self.port) os.environ["DMLC_TASK_ID"] = str(self.hosts.index(self.current_host)) From 9ef7cf9ee4ecf95e147f4bd4f689f9fd05350c25 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Wed, 1 Oct 2025 06:44:34 -0700 Subject: [PATCH 114/157] check xgboost 2.1.0 --- src/sagemaker_xgboost_container/distributed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index 3137d675..d8bfdca1 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -56,7 +56,7 @@ def get_host_ip(hostIP=None): try: hostIP = socket.gethostbyname(socket.getfqdn()) except gaierror: - logger.warn("gethostbyname(socket.getfqdn()) failed... trying on hostname()") + logging.warn("gethostbyname(socket.getfqdn()) failed... trying on hostname()") hostIP = socket.gethostbyname(socket.gethostname()) if hostIP.startswith("127."): s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) From 7939d4506162553bf8c4b10d90f35801db7b7a0a Mon Sep 17 00:00:00 2001 From: Li Ning Date: Wed, 1 Oct 2025 07:25:38 -0700 Subject: [PATCH 115/157] check xgboost 2.1.0 --- src/sagemaker_xgboost_container/distributed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index d8bfdca1..0964999b 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -307,7 +307,7 @@ def start(self): # Initialize collective for synchronization collective.init( - dmlc_tracker_uri=self.master_host, + dmlc_tracker_uri=get_host_ip(self.master_host), dmlc_tracker_port=self.port, dmlc_task_id=str(self.hosts.index(self.current_host)), dmlc_retry=self.max_connect_attempts, From 8451e22c0508f8778f93197099160ff1f2d24caa Mon Sep 17 00:00:00 2001 From: Li Ning Date: Wed, 1 Oct 2025 08:25:17 -0700 Subject: [PATCH 116/157] check xgboost 2.1.0 --- src/sagemaker_xgboost_container/distributed.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index 0964999b..266d05c0 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -289,9 +289,7 @@ def start(self): try: # Launch tracker on master only if self.is_master_host: - self.tracker = RabitTracker( - host_ip=get_host_ip(self.master_host), n_workers=self.n_workers, port=self.port, sortby="task" - ) + self.tracker = RabitTracker(host_ip="0.0.0.0", n_workers=self.n_workers, port=self.port, sortby="task") self.tracker.start() self.logger.info("RabitTracker started") @@ -307,7 +305,7 @@ def start(self): # Initialize collective for synchronization collective.init( - dmlc_tracker_uri=get_host_ip(self.master_host), + dmlc_tracker_uri=self.master_host, dmlc_tracker_port=self.port, dmlc_task_id=str(self.hosts.index(self.current_host)), dmlc_retry=self.max_connect_attempts, From db2284dd5d1b2364e996e86a3a74ceb87890abd1 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Wed, 1 Oct 2025 09:42:15 -0700 Subject: [PATCH 117/157] check xgboost 2.1.0 --- src/sagemaker_xgboost_container/distributed.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index 266d05c0..d1e39d26 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -289,9 +289,11 @@ def start(self): try: # Launch tracker on master only if self.is_master_host: - self.tracker = RabitTracker(host_ip="0.0.0.0", n_workers=self.n_workers, port=self.port, sortby="task") + self.tracker = RabitTracker( + host_ip=self.current_host, n_workers=self.n_workers, port=self.port, sortby="task" + ) self.tracker.start() - self.logger.info("RabitTracker started") + self.logger.info("RabitTracker start listen on %s:%d", self.current_host, self.port) thread = Thread(target=self.tracker.wait_for) thread.daemon = True @@ -299,7 +301,7 @@ def start(self): # Set environment variables for collective os.environ["DMLC_NUM_WORKER"] = str(self.n_workers) - os.environ["DMLC_TRACKER_URI"] = get_host_ip(self.master_host) + os.environ["DMLC_TRACKER_URI"] = self.master_host os.environ["DMLC_TRACKER_PORT"] = str(self.port) os.environ["DMLC_TASK_ID"] = str(self.hosts.index(self.current_host)) From bf6a40df1a1a296d95a232fcde31d2bdf5c59515 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Wed, 1 Oct 2025 10:16:34 -0700 Subject: [PATCH 118/157] check xgboost 2.1.0 --- src/sagemaker_xgboost_container/callback.py | 3 --- src/sagemaker_xgboost_container/distributed.py | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/src/sagemaker_xgboost_container/callback.py b/src/sagemaker_xgboost_container/callback.py index a61fd0b7..29a1104a 100644 --- a/src/sagemaker_xgboost_container/callback.py +++ b/src/sagemaker_xgboost_container/callback.py @@ -87,18 +87,15 @@ def get_callbacks( callbacks.append(save_checkpoint) logging.info(f"CALLBACK_SETUP_DEBUG: save_model_on_termination={save_model_on_termination}, is_master={is_master}") - print(f"CALLBACK_SETUP_DEBUG: save_model_on_termination={save_model_on_termination}, is_master={is_master}") if save_model_on_termination == "true" and is_master: logging.info("CALLBACK_ADDING: Adding SaveIntermediateModelCallBack on master") - print("CALLBACK_ADDING: Adding SaveIntermediateModelCallBack on master") model_name = f"{MODEL_NAME}-{fold}" if fold is not None else MODEL_NAME save_intermediate_model = checkpointing.SaveIntermediateModelCallBack(model_dir, model_name, is_master) callbacks.append(save_intermediate_model) add_sigterm_handler(model_dir, is_master) else: logging.info(f"CALLBACK_SKIPPING save_model_on_termination={save_model_on_termination}, is_master={is_master})") - print(f"CALLBACK_SKIPPING: save_model_on_termination={save_model_on_termination}, is_master={is_master})") if early_stopping_data_name and early_stopping_metric and early_stopping_rounds: maximize = early_stopping_metric in XGB_MAXIMIZE_METRICS diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index d1e39d26..526864ff 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -75,7 +75,7 @@ def rabit_run( first_port=None, second_port=None, max_connect_attempts=None, - connect_retry_timeout=3, + connect_retry_timeout=300, update_rabit_args=False, ): """Run execution function after initializing dmlc/rabit. From a6ec50f9516459bddbba2110cd84307eaa93ea87 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Wed, 1 Oct 2025 11:36:30 -0700 Subject: [PATCH 119/157] check xgboost 2.1.0 --- test/utils/local_mode.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/utils/local_mode.py b/test/utils/local_mode.py index dfa8ff76..4ca4f442 100644 --- a/test/utils/local_mode.py +++ b/test/utils/local_mode.py @@ -535,6 +535,8 @@ def create_docker_host( optml_volumes = ["/private" + v if v.startswith("/var") else v for v in optml_volumes] optml_volumes.extend(volumes) + networks = ["xgboost_network"] + host_config = { "image": image, "stdin_open": True, @@ -542,6 +544,7 @@ def create_docker_host( "volumes": optml_volumes, "environment": environment, "command": command, + "networks": networks, } if entrypoint: From e3fb7953067093ab008dd488cec7ef1b1c054785 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Wed, 1 Oct 2025 12:49:51 -0700 Subject: [PATCH 120/157] check xgboost 2.1.0 --- .../distributed.py | 26 ++++++++++++++++++- test/utils/local_mode.py | 3 --- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index 526864ff..f2d2dbf8 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -20,8 +20,9 @@ import sys import json import os -from threading import Thread +import time +from threading import Thread from retrying import retry from xgboost.tracker import RabitTracker from xgboost import collective @@ -299,6 +300,29 @@ def start(self): thread.daemon = True thread.start() + attempt = 0 + successful_connection = False + while not successful_connection and ( + self.max_connect_attempts is None or attempt < self.max_connect_attempts + ): + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + try: + self.logger.debug("Checking if RabitTracker is available.") + s.connect((self.master_host, self.port)) + successful_connection = True + self.logger.debug("Successfully connected to RabitTracker.") + except OSError: + self.logger.info("Failed to connect to RabitTracker on attempt {}".format(attempt)) + attempt += 1 + self.logger.info("Sleeping for {} sec before retrying".format(self.connect_retry_timeout)) + time.sleep(self.connect_retry_timeout) + + if not successful_connection: + self.logger.error("Failed to connect to Rabit Tracker after %s attempts", self.max_connect_attempts) + raise Exception("Failed to connect to Rabit Tracker") + else: + self.logger.info("Connected to RabitTracker.") + # Set environment variables for collective os.environ["DMLC_NUM_WORKER"] = str(self.n_workers) os.environ["DMLC_TRACKER_URI"] = self.master_host diff --git a/test/utils/local_mode.py b/test/utils/local_mode.py index 4ca4f442..dfa8ff76 100644 --- a/test/utils/local_mode.py +++ b/test/utils/local_mode.py @@ -535,8 +535,6 @@ def create_docker_host( optml_volumes = ["/private" + v if v.startswith("/var") else v for v in optml_volumes] optml_volumes.extend(volumes) - networks = ["xgboost_network"] - host_config = { "image": image, "stdin_open": True, @@ -544,7 +542,6 @@ def create_docker_host( "volumes": optml_volumes, "environment": environment, "command": command, - "networks": networks, } if entrypoint: From c9021ed9c89972af10eb9547ca2e37a1f74fe196 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Wed, 1 Oct 2025 13:01:06 -0700 Subject: [PATCH 121/157] check xgboost 2.1.0 --- src/sagemaker_xgboost_container/distributed.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index f2d2dbf8..e48076cb 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -290,9 +290,7 @@ def start(self): try: # Launch tracker on master only if self.is_master_host: - self.tracker = RabitTracker( - host_ip=self.current_host, n_workers=self.n_workers, port=self.port, sortby="task" - ) + self.tracker = RabitTracker(host_ip="0.0.0.0", n_workers=self.n_workers, port=self.port, sortby="task") self.tracker.start() self.logger.info("RabitTracker start listen on %s:%d", self.current_host, self.port) From b40d83e090c4b9480ffbcc82c1c241a5744357d4 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Wed, 1 Oct 2025 13:57:50 -0700 Subject: [PATCH 122/157] check xgboost 2.1.0 --- src/sagemaker_xgboost_container/distributed.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index e48076cb..fc9ac00d 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -290,7 +290,9 @@ def start(self): try: # Launch tracker on master only if self.is_master_host: - self.tracker = RabitTracker(host_ip="0.0.0.0", n_workers=self.n_workers, port=self.port, sortby="task") + self.tracker = RabitTracker( + host_ip=self.master_host, n_workers=self.n_workers, port=self.port, sortby="task" + ) self.tracker.start() self.logger.info("RabitTracker start listen on %s:%d", self.current_host, self.port) From 2c746ddc2efa719ce0e5a78e846788a0dc89fe86 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Wed, 1 Oct 2025 14:10:16 -0700 Subject: [PATCH 123/157] check xgboost 2.1.0 --- src/sagemaker_xgboost_container/distributed.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index fc9ac00d..a27e6b60 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -299,6 +299,7 @@ def start(self): thread = Thread(target=self.tracker.wait_for) thread.daemon = True thread.start() + self.logger.info(f"RabitTracker worker_args: {self.tracker.worker_args()}") attempt = 0 successful_connection = False From a3193a31df71be2f8376e20e317427ab86db33c5 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Wed, 1 Oct 2025 15:05:20 -0700 Subject: [PATCH 124/157] check xgboost 2.1.0 --- .../distributed.py | 44 +++++++++---------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index a27e6b60..a6c40ae6 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -301,28 +301,28 @@ def start(self): thread.start() self.logger.info(f"RabitTracker worker_args: {self.tracker.worker_args()}") - attempt = 0 - successful_connection = False - while not successful_connection and ( - self.max_connect_attempts is None or attempt < self.max_connect_attempts - ): - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - try: - self.logger.debug("Checking if RabitTracker is available.") - s.connect((self.master_host, self.port)) - successful_connection = True - self.logger.debug("Successfully connected to RabitTracker.") - except OSError: - self.logger.info("Failed to connect to RabitTracker on attempt {}".format(attempt)) - attempt += 1 - self.logger.info("Sleeping for {} sec before retrying".format(self.connect_retry_timeout)) - time.sleep(self.connect_retry_timeout) - - if not successful_connection: - self.logger.error("Failed to connect to Rabit Tracker after %s attempts", self.max_connect_attempts) - raise Exception("Failed to connect to Rabit Tracker") - else: - self.logger.info("Connected to RabitTracker.") + # attempt = 0 + # successful_connection = False + # while not successful_connection and ( + # self.max_connect_attempts is None or attempt < self.max_connect_attempts + # ): + # with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + # try: + # self.logger.debug("Checking if RabitTracker is available.") + # s.connect((self.master_host, self.port)) + # successful_connection = True + # self.logger.debug("Successfully connected to RabitTracker.") + # except OSError: + # self.logger.info("Failed to connect to RabitTracker on attempt {}".format(attempt)) + # attempt += 1 + # self.logger.info("Sleeping for {} sec before retrying".format(self.connect_retry_timeout)) + # time.sleep(self.connect_retry_timeout) + + # if not successful_connection: + # self.logger.error("Failed to connect to Rabit Tracker after %s attempts", self.max_connect_attempts) + # raise Exception("Failed to connect to Rabit Tracker") + # else: + # self.logger.info("Connected to RabitTracker.") # Set environment variables for collective os.environ["DMLC_NUM_WORKER"] = str(self.n_workers) From 076c7862c5171f676920740bb3d7f3590fafcc90 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Wed, 1 Oct 2025 15:11:51 -0700 Subject: [PATCH 125/157] check xgboost 2.1.0 --- src/sagemaker_xgboost_container/distributed.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index a6c40ae6..0fa936de 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -291,7 +291,7 @@ def start(self): # Launch tracker on master only if self.is_master_host: self.tracker = RabitTracker( - host_ip=self.master_host, n_workers=self.n_workers, port=self.port, sortby="task" + host_ip=self.master_host, n_workers=self.n_workers, port=9999, sortby="task" ) self.tracker.start() self.logger.info("RabitTracker start listen on %s:%d", self.current_host, self.port) @@ -333,7 +333,7 @@ def start(self): # Initialize collective for synchronization collective.init( dmlc_tracker_uri=self.master_host, - dmlc_tracker_port=self.port, + dmlc_tracker_port="9999", dmlc_task_id=str(self.hosts.index(self.current_host)), dmlc_retry=self.max_connect_attempts, dmlc_timeout=self.connect_retry_timeout, From 23d60342486530ba0e712cc8fbdb7bf0e7ce225e Mon Sep 17 00:00:00 2001 From: Li Ning Date: Wed, 1 Oct 2025 16:04:38 -0700 Subject: [PATCH 126/157] check xgboost 2.1.0 --- src/sagemaker_xgboost_container/distributed.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index 0fa936de..b9097397 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -20,7 +20,6 @@ import sys import json import os -import time from threading import Thread from retrying import retry @@ -291,10 +290,10 @@ def start(self): # Launch tracker on master only if self.is_master_host: self.tracker = RabitTracker( - host_ip=self.master_host, n_workers=self.n_workers, port=9999, sortby="task" + host_ip=_dns_lookup(self.master_host), n_workers=self.n_workers, port=self.port, sortby="task" ) self.tracker.start() - self.logger.info("RabitTracker start listen on %s:%d", self.current_host, self.port) + self.logger.info("RabitTracker start listen on %s:%d", _dns_lookup(self.master_host), self.port) thread = Thread(target=self.tracker.wait_for) thread.daemon = True @@ -325,15 +324,15 @@ def start(self): # self.logger.info("Connected to RabitTracker.") # Set environment variables for collective - os.environ["DMLC_NUM_WORKER"] = str(self.n_workers) + os.environ["DMLC_NUM_WORKER"] = str(_dns_lookup(self.master_host)) os.environ["DMLC_TRACKER_URI"] = self.master_host os.environ["DMLC_TRACKER_PORT"] = str(self.port) os.environ["DMLC_TASK_ID"] = str(self.hosts.index(self.current_host)) # Initialize collective for synchronization collective.init( - dmlc_tracker_uri=self.master_host, - dmlc_tracker_port="9999", + dmlc_tracker_uri=_dns_lookup(self.master_host), + dmlc_tracker_port=self.port, dmlc_task_id=str(self.hosts.index(self.current_host)), dmlc_retry=self.max_connect_attempts, dmlc_timeout=self.connect_retry_timeout, From a94d1c27d68483aabca5587c99f5b7ade9c33b39 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Wed, 1 Oct 2025 17:08:20 -0700 Subject: [PATCH 127/157] check xgboost 2.1.0 --- src/sagemaker_xgboost_container/distributed.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index b9097397..6214c522 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -290,7 +290,7 @@ def start(self): # Launch tracker on master only if self.is_master_host: self.tracker = RabitTracker( - host_ip=_dns_lookup(self.master_host), n_workers=self.n_workers, port=self.port, sortby="task" + host_ip=_dns_lookup(self.master_host), n_workers=self.n_workers, port=10099, sortby="task" ) self.tracker.start() self.logger.info("RabitTracker start listen on %s:%d", _dns_lookup(self.master_host), self.port) @@ -332,7 +332,7 @@ def start(self): # Initialize collective for synchronization collective.init( dmlc_tracker_uri=_dns_lookup(self.master_host), - dmlc_tracker_port=self.port, + dmlc_tracker_port=10099, dmlc_task_id=str(self.hosts.index(self.current_host)), dmlc_retry=self.max_connect_attempts, dmlc_timeout=self.connect_retry_timeout, From 09a3a70b621fcb9feebb928138264e15097b9610 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Thu, 2 Oct 2025 00:34:23 -0700 Subject: [PATCH 128/157] check xgboost 2.1.0 --- .../distributed.py | 37 +++---------------- 1 file changed, 6 insertions(+), 31 deletions(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index 6214c522..a23caa99 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -290,49 +290,24 @@ def start(self): # Launch tracker on master only if self.is_master_host: self.tracker = RabitTracker( - host_ip=_dns_lookup(self.master_host), n_workers=self.n_workers, port=10099, sortby="task" + host_ip=_dns_lookup(self.master_host), n_workers=self.n_workers, port=45689, sortby="task" ) self.tracker.start() - self.logger.info("RabitTracker start listen on %s:%d", _dns_lookup(self.master_host), self.port) - thread = Thread(target=self.tracker.wait_for) thread.daemon = True thread.start() self.logger.info(f"RabitTracker worker_args: {self.tracker.worker_args()}") - # attempt = 0 - # successful_connection = False - # while not successful_connection and ( - # self.max_connect_attempts is None or attempt < self.max_connect_attempts - # ): - # with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - # try: - # self.logger.debug("Checking if RabitTracker is available.") - # s.connect((self.master_host, self.port)) - # successful_connection = True - # self.logger.debug("Successfully connected to RabitTracker.") - # except OSError: - # self.logger.info("Failed to connect to RabitTracker on attempt {}".format(attempt)) - # attempt += 1 - # self.logger.info("Sleeping for {} sec before retrying".format(self.connect_retry_timeout)) - # time.sleep(self.connect_retry_timeout) - - # if not successful_connection: - # self.logger.error("Failed to connect to Rabit Tracker after %s attempts", self.max_connect_attempts) - # raise Exception("Failed to connect to Rabit Tracker") - # else: - # self.logger.info("Connected to RabitTracker.") - # Set environment variables for collective - os.environ["DMLC_NUM_WORKER"] = str(_dns_lookup(self.master_host)) - os.environ["DMLC_TRACKER_URI"] = self.master_host - os.environ["DMLC_TRACKER_PORT"] = str(self.port) - os.environ["DMLC_TASK_ID"] = str(self.hosts.index(self.current_host)) + # os.environ["DMLC_NUM_WORKER"] = str(_dns_lookup(self.master_host)) + # os.environ["DMLC_TRACKER_URI"] = self.master_host + # os.environ["DMLC_TRACKER_PORT"] = str(self.port) + # os.environ["DMLC_TASK_ID"] = str(self.hosts.index(self.current_host)) # Initialize collective for synchronization collective.init( dmlc_tracker_uri=_dns_lookup(self.master_host), - dmlc_tracker_port=10099, + dmlc_tracker_port=45689, dmlc_task_id=str(self.hosts.index(self.current_host)), dmlc_retry=self.max_connect_attempts, dmlc_timeout=self.connect_retry_timeout, From a42aa02a6f1466333a36a8884a6a3f6ef9c6c481 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Thu, 2 Oct 2025 07:29:57 -0700 Subject: [PATCH 129/157] check xgboost 2.1.0 --- src/sagemaker_xgboost_container/distributed.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index a23caa99..3299c05d 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -19,7 +19,6 @@ import socket import sys import json -import os from threading import Thread from retrying import retry From 3137263e63c34f82934c2d76c89a0a54057129db Mon Sep 17 00:00:00 2001 From: Li Ning Date: Thu, 2 Oct 2025 10:51:26 -0700 Subject: [PATCH 130/157] check xgboost 2.1.0 --- src/sagemaker_xgboost_container/distributed.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index 3299c05d..fde3d218 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -74,7 +74,7 @@ def rabit_run( first_port=None, second_port=None, max_connect_attempts=None, - connect_retry_timeout=300, + connect_retry_timeout=3, update_rabit_args=False, ): """Run execution function after initializing dmlc/rabit. @@ -289,7 +289,7 @@ def start(self): # Launch tracker on master only if self.is_master_host: self.tracker = RabitTracker( - host_ip=_dns_lookup(self.master_host), n_workers=self.n_workers, port=45689, sortby="task" + host_ip=_dns_lookup(self.master_host), n_workers=self.n_workers, port=self.port, sortby="task" ) self.tracker.start() thread = Thread(target=self.tracker.wait_for) @@ -297,16 +297,10 @@ def start(self): thread.start() self.logger.info(f"RabitTracker worker_args: {self.tracker.worker_args()}") - # Set environment variables for collective - # os.environ["DMLC_NUM_WORKER"] = str(_dns_lookup(self.master_host)) - # os.environ["DMLC_TRACKER_URI"] = self.master_host - # os.environ["DMLC_TRACKER_PORT"] = str(self.port) - # os.environ["DMLC_TASK_ID"] = str(self.hosts.index(self.current_host)) - # Initialize collective for synchronization collective.init( dmlc_tracker_uri=_dns_lookup(self.master_host), - dmlc_tracker_port=45689, + dmlc_tracker_port=self.port, dmlc_task_id=str(self.hosts.index(self.current_host)), dmlc_retry=self.max_connect_attempts, dmlc_timeout=self.connect_retry_timeout, From 3581c03901bca0e65e07575afa2d8dcd33c6212b Mon Sep 17 00:00:00 2001 From: Li Ning Date: Thu, 2 Oct 2025 11:56:55 -0700 Subject: [PATCH 131/157] check xgboost 2.1.0 --- docker/2.1.0/base/Dockerfile.cpu | 2 +- src/sagemaker_xgboost_container/distributed.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docker/2.1.0/base/Dockerfile.cpu b/docker/2.1.0/base/Dockerfile.cpu index aae78389..86a430d4 100644 --- a/docker/2.1.0/base/Dockerfile.cpu +++ b/docker/2.1.0/base/Dockerfile.cpu @@ -32,7 +32,7 @@ ARG CONDA_PKG_VERSION=24.7.1 ARG PYTHON_VERSION=3.10 ARG PYARROW_VERSION=17.0.0 ARG MLIO_VERSION=0.9.0 -ARG XGBOOST_VERSION=2.1.0 +ARG XGBOOST_VERSION=2.1.4 ENV DEBIAN_FRONTEND=noninteractive ENV LANG=C.UTF-8 diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index fde3d218..4ba2820e 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -313,9 +313,9 @@ def start(self): is_master={self.is_master_host}" ) except Exception as e: - self.logger.warning("Collective init failed: {}, " "falling back to single node".format(e)) + self.logger.error("Collective init failed: {}, " "".format(e)) self._cleanup_tracker() - return RabitHelper(True, self.current_host, self.port) + raise e self.logger.info(f"RABIT_START_DEBUG: Creating RabitHelper with is_master={self.is_master_host}") return RabitHelper(self.is_master_host, self.current_host, self.port) From e687bb381069a8b451eb51c0a7d09f47ed139194 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Thu, 2 Oct 2025 12:06:43 -0700 Subject: [PATCH 132/157] check xgboost 2.1.0 --- src/sagemaker_xgboost_container/distributed.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index 4ba2820e..241353b2 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -289,7 +289,10 @@ def start(self): # Launch tracker on master only if self.is_master_host: self.tracker = RabitTracker( - host_ip=_dns_lookup(self.master_host), n_workers=self.n_workers, port=self.port, sortby="task" + host_ip=str(_dns_lookup(self.master_host)), + n_workers=int(self.n_workers), + port=int(self.port), + sortby="task", ) self.tracker.start() thread = Thread(target=self.tracker.wait_for) @@ -299,8 +302,8 @@ def start(self): # Initialize collective for synchronization collective.init( - dmlc_tracker_uri=_dns_lookup(self.master_host), - dmlc_tracker_port=self.port, + dmlc_tracker_uri=str(_dns_lookup(self.master_host)), + dmlc_tracker_port=int(self.port), dmlc_task_id=str(self.hosts.index(self.current_host)), dmlc_retry=self.max_connect_attempts, dmlc_timeout=self.connect_retry_timeout, From cc0f36671a4798a1067d9c9feede3ce6dbdb8b86 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Thu, 2 Oct 2025 13:08:57 -0700 Subject: [PATCH 133/157] check xgboost 2.1.0 --- src/sagemaker_xgboost_container/distributed.py | 14 ++++++++------ test/unit/test_distributed.py | 1 + 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index 241353b2..2687a594 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -300,6 +300,14 @@ def start(self): thread.start() self.logger.info(f"RabitTracker worker_args: {self.tracker.worker_args()}") + self.logger.info( + f"MASTER_DEBUG_FIXED: Using hostname logic: \ + current_host={self.current_host}, \ + master_host={self.master_host}, \ + is_master={self.is_master_host}, \ + port={self.port}" + ) + # Initialize collective for synchronization collective.init( dmlc_tracker_uri=str(_dns_lookup(self.master_host)), @@ -309,12 +317,6 @@ def start(self): dmlc_timeout=self.connect_retry_timeout, ) - self.logger.info( - f"MASTER_DEBUG_FIXED: Using hostname logic: \ - current_host={self.current_host}, \ - master_host={self.master_host}, \ - is_master={self.is_master_host}" - ) except Exception as e: self.logger.error("Collective init failed: {}, " "".format(e)) self._cleanup_tracker() diff --git a/test/unit/test_distributed.py b/test/unit/test_distributed.py index 5fca3836..08e4dd0b 100644 --- a/test/unit/test_distributed.py +++ b/test/unit/test_distributed.py @@ -74,6 +74,7 @@ def test_integration_rabit_synchronize(): q = Queue() port, _ = find_two_open_ports() + print(f"test_integration_rabit_synchronize, port={port}") host_count = 5 host_list = range(host_count) From 90b67ccd18fef0e353bf7291ba5a3be8eb8bb5ea Mon Sep 17 00:00:00 2001 From: Li Ning Date: Thu, 2 Oct 2025 14:02:07 -0700 Subject: [PATCH 134/157] check xgboost 2.1.0 --- src/sagemaker_xgboost_container/distributed.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index 2687a594..3abe280a 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -308,6 +308,11 @@ def start(self): port={self.port}" ) + import os + + os.environ["DMLC_TRACKER_URI"] = str(_dns_lookup(self.master_host)) + os.environ["DMLC_TRACKER_PORT"] = str(self.port) + # Initialize collective for synchronization collective.init( dmlc_tracker_uri=str(_dns_lookup(self.master_host)), From 965405d04cfd5cc8849c4055335e81b30f3b09e6 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Thu, 2 Oct 2025 14:04:23 -0700 Subject: [PATCH 135/157] check xgboost 2.1.0 --- src/sagemaker_xgboost_container/distributed.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index 3abe280a..ed6193c0 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -285,6 +285,11 @@ def start(self): self.logger.debug("Single worker detected, skipping collective init") return RabitHelper(True, self.current_host, self.port) + import os + + os.environ["DMLC_TRACKER_URI"] = str(_dns_lookup(self.master_host)) + os.environ["DMLC_TRACKER_PORT"] = str(self.port) + try: # Launch tracker on master only if self.is_master_host: @@ -308,11 +313,6 @@ def start(self): port={self.port}" ) - import os - - os.environ["DMLC_TRACKER_URI"] = str(_dns_lookup(self.master_host)) - os.environ["DMLC_TRACKER_PORT"] = str(self.port) - # Initialize collective for synchronization collective.init( dmlc_tracker_uri=str(_dns_lookup(self.master_host)), From 702bc168711925a01872a7807330e0b73d043138 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Thu, 2 Oct 2025 15:53:27 -0700 Subject: [PATCH 136/157] check xgboost 2.1.0 --- docker/2.1.0/base/Dockerfile.cpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/2.1.0/base/Dockerfile.cpu b/docker/2.1.0/base/Dockerfile.cpu index 86a430d4..aae78389 100644 --- a/docker/2.1.0/base/Dockerfile.cpu +++ b/docker/2.1.0/base/Dockerfile.cpu @@ -32,7 +32,7 @@ ARG CONDA_PKG_VERSION=24.7.1 ARG PYTHON_VERSION=3.10 ARG PYARROW_VERSION=17.0.0 ARG MLIO_VERSION=0.9.0 -ARG XGBOOST_VERSION=2.1.4 +ARG XGBOOST_VERSION=2.1.0 ENV DEBIAN_FRONTEND=noninteractive ENV LANG=C.UTF-8 From d0f74925c9f3bf6506ae1de6a4bdc662b37f35ce Mon Sep 17 00:00:00 2001 From: Li Ning Date: Fri, 3 Oct 2025 11:06:50 -0700 Subject: [PATCH 137/157] test xgboost 3.0.5 --- tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index 18fbb683..251372a5 100644 --- a/tox.ini +++ b/tox.ini @@ -15,7 +15,7 @@ deps = xgboost1.3: xgboost==1.3.3 xgboost1.5: xgboost==1.5.2 xgboost1.7: xgboost==1.7.4 - xgboost2.1.0: xgboost==2.1.0 + xgboost2.1.0: xgboost==3.0.5 xgboostlatest: xgboost -r{toxinidir}/requirements.txt -r{toxinidir}/test-requirements.txt From 87e8ea4028fee2a2d9c737183673b3f68dadc1d3 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Fri, 3 Oct 2025 11:49:20 -0700 Subject: [PATCH 138/157] test xgboost 3.0.5 --- docker/2.1.0/base/Dockerfile.cpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/2.1.0/base/Dockerfile.cpu b/docker/2.1.0/base/Dockerfile.cpu index aae78389..a75b0d00 100644 --- a/docker/2.1.0/base/Dockerfile.cpu +++ b/docker/2.1.0/base/Dockerfile.cpu @@ -32,7 +32,7 @@ ARG CONDA_PKG_VERSION=24.7.1 ARG PYTHON_VERSION=3.10 ARG PYARROW_VERSION=17.0.0 ARG MLIO_VERSION=0.9.0 -ARG XGBOOST_VERSION=2.1.0 +ARG XGBOOST_VERSION=3.0.5 ENV DEBIAN_FRONTEND=noninteractive ENV LANG=C.UTF-8 From 6ebd902b954c43f1ef7c7c1240c4744502f0bd5c Mon Sep 17 00:00:00 2001 From: Li Ning Date: Fri, 3 Oct 2025 12:48:50 -0700 Subject: [PATCH 139/157] test xgboost 3.0.5 --- src/sagemaker_xgboost_container/distributed.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index ed6193c0..5be7997d 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -123,7 +123,7 @@ def rabit_run( # Set up rabit with nodes that have data and an unused port so that previous slaves don't confuse it # with the previous rabit configuration logging.info(f"SECOND_RABIT_DEBUG: hosts_with_data={hosts_with_data}, current_host={current_host}") - print(f"SECOND_RABIT_DEBUG: hosts_with_data={hosts_with_data}, current_host={current_host}") + with Rabit( hosts=hosts_with_data, current_host=current_host, @@ -137,11 +137,6 @@ def rabit_run( cluster.is_master={cluster.is_master}, \ current_host={current_host}" ) - print( - f"RABIT_DEBUG: \ - cluster.is_master={cluster.is_master}, \ - current_host={current_host}" - ) args.update({"is_master": cluster.is_master}) exec_fun(**args) @@ -285,11 +280,6 @@ def start(self): self.logger.debug("Single worker detected, skipping collective init") return RabitHelper(True, self.current_host, self.port) - import os - - os.environ["DMLC_TRACKER_URI"] = str(_dns_lookup(self.master_host)) - os.environ["DMLC_TRACKER_PORT"] = str(self.port) - try: # Launch tracker on master only if self.is_master_host: From 445f2c6196eb376ef81e8822378c4c106d366b9a Mon Sep 17 00:00:00 2001 From: Li Ning Date: Fri, 3 Oct 2025 13:03:46 -0700 Subject: [PATCH 140/157] test xgboost 3.0.5 --- src/sagemaker_xgboost_container/distributed.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index 5be7997d..62751ed5 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -124,6 +124,13 @@ def rabit_run( # with the previous rabit configuration logging.info(f"SECOND_RABIT_DEBUG: hosts_with_data={hosts_with_data}, current_host={current_host}") + # Ensure collective is finalized before second initialization + try: + collective.finalize() + except Exception as e: + logging.error("First RabitTracker collective clean up failed", exc_info=True) + raise e + with Rabit( hosts=hosts_with_data, current_host=current_host, From 180d6f66f7dfb6c876e3f3604b858c00b6584947 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Fri, 3 Oct 2025 14:19:52 -0700 Subject: [PATCH 141/157] test xgboost 3.0.5 --- .../distributed.py | 37 ++++++++++++++----- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index 62751ed5..c75b6ce0 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -124,13 +124,6 @@ def rabit_run( # with the previous rabit configuration logging.info(f"SECOND_RABIT_DEBUG: hosts_with_data={hosts_with_data}, current_host={current_host}") - # Ensure collective is finalized before second initialization - try: - collective.finalize() - except Exception as e: - logging.error("First RabitTracker collective clean up failed", exc_info=True) - raise e - with Rabit( hosts=hosts_with_data, current_host=current_host, @@ -310,6 +303,31 @@ def start(self): port={self.port}" ) + import time + + attempt = 0 + successful_connection = False + while not successful_connection and ( + self.max_connect_attempts is None or attempt < self.max_connect_attempts + ): + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + try: + self.logger.debug("Checking if RabitTracker is available.") + s.connect((self.master_host, self.port)) + successful_connection = True + self.logger.debug("Successfully connected to RabitTracker.") + except OSError: + self.logger.info("Failed to connect to RabitTracker on attempt {}".format(attempt)) + attempt += 1 + self.logger.info("Sleeping for {} sec before retrying".format(self.connect_retry_timeout)) + time.sleep(self.connect_retry_timeout) + + if not successful_connection: + self.logger.error("Failed to connect to Rabit Tracker after %s attempts", self.max_connect_attempts) + raise Exception("Failed to connect to Rabit Tracker") + else: + self.logger.info("Connected to RabitTracker.") + # Initialize collective for synchronization collective.init( dmlc_tracker_uri=str(_dns_lookup(self.master_host)), @@ -329,12 +347,12 @@ def start(self): def stop(self): """Shutdown collective communication.""" - self.logger.debug("Shutting down collective.") + self.logger.info("Shutting down collective.") try: collective.finalize() except Exception as e: - self.logger.debug("Collective finalize failed: {}".format(e)) + self.logger.error("Collective finalize failed: {}".format(e)) self._cleanup_tracker() @@ -352,4 +370,5 @@ def __enter__(self): return self.start() def __exit__(self, exc_type, exc_value, exc_traceback): + self.stop() From 7555efa26b3c4369334fa4ae8129f4f4430b3ce2 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Fri, 3 Oct 2025 15:43:14 -0700 Subject: [PATCH 142/157] test xgboost 3.0.5 --- src/sagemaker_xgboost_container/distributed.py | 12 ++++++------ test/unit/test_distributed.py | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index c75b6ce0..e7d2631f 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -74,7 +74,7 @@ def rabit_run( first_port=None, second_port=None, max_connect_attempts=None, - connect_retry_timeout=3, + connect_retry_timeout=10, update_rabit_args=False, ): """Run execution function after initializing dmlc/rabit. @@ -324,9 +324,9 @@ def start(self): if not successful_connection: self.logger.error("Failed to connect to Rabit Tracker after %s attempts", self.max_connect_attempts) - raise Exception("Failed to connect to Rabit Tracker") + raise Exception(f"Failed to connect to Rabit Tracker, current_host={self.current_host}") else: - self.logger.info("Connected to RabitTracker.") + self.logger.info(f"Connected to RabitTracker, current_host={self.current_host}") # Initialize collective for synchronization collective.init( @@ -338,7 +338,7 @@ def start(self): ) except Exception as e: - self.logger.error("Collective init failed: {}, " "".format(e)) + self.logger.error(f"{self.current_host} collective init failed", exc_info=True) self._cleanup_tracker() raise e @@ -347,12 +347,12 @@ def start(self): def stop(self): """Shutdown collective communication.""" - self.logger.info("Shutting down collective.") + self.logger.info(f"Shutting down collective, current_host={self.current_host}") try: collective.finalize() except Exception as e: - self.logger.error("Collective finalize failed: {}".format(e)) + self.logger.error(f"{self.current_host} collective finalize failed", exc_info=True) self._cleanup_tracker() diff --git a/test/unit/test_distributed.py b/test/unit/test_distributed.py index 08e4dd0b..558f3656 100644 --- a/test/unit/test_distributed.py +++ b/test/unit/test_distributed.py @@ -32,7 +32,7 @@ def synchronize_fn(host_count, port, master, idx, q): def rabit_run_fn( - host_count, is_run, first_port, second_port, master, idx, q, max_connect_attempts=None, connect_retry_timeout=3 + host_count, is_run, first_port, second_port, master, idx, q, max_connect_attempts=None, connect_retry_timeout=15 ): hosts = ["127.0.0.1"] + ["localhost" for _ in range(host_count - 1)] current_host = "127.0.0.1" if master else "localhost" @@ -86,7 +86,7 @@ def test_integration_rabit_synchronize(): num_responses = 0 while num_responses < host_count: - host_aggregated_result = q.get(timeout=10) + host_aggregated_result = q.get(timeout=30) for host_individual_result in host_aggregated_result: assert host_individual_result in expected_results num_responses += 1 @@ -107,7 +107,7 @@ def test_rabit_run_all_hosts_run(): num_responses = 0 while num_responses < host_count: - response = q.get(timeout=15) + response = q.get(timeout=30) expected_results.remove(response) num_responses += 1 @@ -133,7 +133,7 @@ def test_rabit_run_exclude_one_host(): num_responses = 0 while num_responses < host_count - 1: - response = q.get(timeout=15) + response = q.get(timeout=30) expected_results.remove(response) num_responses += 1 @@ -157,7 +157,7 @@ def test_rabit_delay_master(): num_responses = 0 while num_responses < host_count: - response = q.get(timeout=20) + response = q.get(timeout=30) expected_results.remove(response) num_responses += 1 @@ -182,6 +182,6 @@ def test_rabit_run_fail_bad_max_retry_attempts(bad_max_retry_attempts): num_responses = 0 while num_responses < host_count: - host_result = q.get(timeout=10) + host_result = q.get(timeout=30) assert "max_connect_attempts must be None or an integer greater than 0." in host_result num_responses += 1 From 27fb002bb9c2b5f76f9ea96379f391df060e6c62 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Fri, 3 Oct 2025 16:56:38 -0700 Subject: [PATCH 143/157] test xgboost 3.0.5 --- test/unit/test_distributed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/unit/test_distributed.py b/test/unit/test_distributed.py index 558f3656..8639ed6f 100644 --- a/test/unit/test_distributed.py +++ b/test/unit/test_distributed.py @@ -151,7 +151,7 @@ def test_rabit_delay_master(): for idx in host_list: p = Process( - target=rabit_run_delay_master, args=(host_count, True, first_port, second_port, idx == 0, idx, q, None) + target=rabit_run_delay_master, args=(host_count, True, first_port, second_port, idx == 0, idx, q, 3) ) p.start() From 71ac1529f0988310126a1ad28ec0ce3d85a212a6 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Fri, 3 Oct 2025 17:41:37 -0700 Subject: [PATCH 144/157] test xgboost 3.0.5 --- test/unit/test_distributed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/unit/test_distributed.py b/test/unit/test_distributed.py index 8639ed6f..ab175649 100644 --- a/test/unit/test_distributed.py +++ b/test/unit/test_distributed.py @@ -32,7 +32,7 @@ def synchronize_fn(host_count, port, master, idx, q): def rabit_run_fn( - host_count, is_run, first_port, second_port, master, idx, q, max_connect_attempts=None, connect_retry_timeout=15 + host_count, is_run, first_port, second_port, master, idx, q, max_connect_attempts=None, connect_retry_timeout=60 ): hosts = ["127.0.0.1"] + ["localhost" for _ in range(host_count - 1)] current_host = "127.0.0.1" if master else "localhost" From fa5b3ce77a7deb71e46e83432227a2227591ca67 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Fri, 3 Oct 2025 18:41:15 -0700 Subject: [PATCH 145/157] test xgboost 3.0.5 --- test/unit/test_distributed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/unit/test_distributed.py b/test/unit/test_distributed.py index ab175649..a465a4f7 100644 --- a/test/unit/test_distributed.py +++ b/test/unit/test_distributed.py @@ -107,7 +107,7 @@ def test_rabit_run_all_hosts_run(): num_responses = 0 while num_responses < host_count: - response = q.get(timeout=30) + response = q.get(timeout=120) expected_results.remove(response) num_responses += 1 From c78e283a6354d84fb687f548a55734cfc8c32bca Mon Sep 17 00:00:00 2001 From: Li Ning Date: Fri, 3 Oct 2025 18:44:13 -0700 Subject: [PATCH 146/157] test xgboost 3.0.5 --- test/unit/test_distributed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/unit/test_distributed.py b/test/unit/test_distributed.py index a465a4f7..76ebb39b 100644 --- a/test/unit/test_distributed.py +++ b/test/unit/test_distributed.py @@ -157,7 +157,7 @@ def test_rabit_delay_master(): num_responses = 0 while num_responses < host_count: - response = q.get(timeout=30) + response = q.get(timeout=120) expected_results.remove(response) num_responses += 1 From 674206b92d9d6e16880df04c0b28a0adb676fc41 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Sat, 4 Oct 2025 08:58:17 -0700 Subject: [PATCH 147/157] test xgboost 3.0.5 --- src/sagemaker_xgboost_container/distributed.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index e7d2631f..a0ad6379 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -290,9 +290,9 @@ def start(self): sortby="task", ) self.tracker.start() - thread = Thread(target=self.tracker.wait_for) - thread.daemon = True - thread.start() + self.tracker_thread = Thread(target=self.tracker.wait_for) + self.tracker_thread.daemon = True + self.tracker_thread.start() self.logger.info(f"RabitTracker worker_args: {self.tracker.worker_args()}") self.logger.info( @@ -354,6 +354,15 @@ def stop(self): except Exception as e: self.logger.error(f"{self.current_host} collective finalize failed", exc_info=True) + # Wait for tracker thread to finish + if self.tracker_thread is not None: + try: + self.tracker_thread.join(timeout=1.0) + except Exception as e: + self.logger.debug("Tracker thread join failed: {}".format(e)) + finally: + self.tracker_thread = None + self._cleanup_tracker() def _cleanup_tracker(self): @@ -370,5 +379,4 @@ def __enter__(self): return self.start() def __exit__(self, exc_type, exc_value, exc_traceback): - self.stop() From bea2f33d947ac449b2619a4bef5909e971936293 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Sat, 4 Oct 2025 10:41:55 -0700 Subject: [PATCH 148/157] test xgboost 3.0.5 --- src/sagemaker_xgboost_container/distributed.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index a0ad6379..04a6ca06 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -274,6 +274,7 @@ def start(self): """ self.logger.debug("Starting collective communication.") self.tracker = None + self.tracker_thread = None # For single node, skip collective initialization if self.n_workers == 1: From 6d1ea7c2dcc744085ce5be1afeb6634338980673 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Sat, 4 Oct 2025 12:09:55 -0700 Subject: [PATCH 149/157] test xgboost 3.0.5 --- test/unit/test_distributed.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/unit/test_distributed.py b/test/unit/test_distributed.py index 76ebb39b..a92ba44f 100644 --- a/test/unit/test_distributed.py +++ b/test/unit/test_distributed.py @@ -133,7 +133,7 @@ def test_rabit_run_exclude_one_host(): num_responses = 0 while num_responses < host_count - 1: - response = q.get(timeout=30) + response = q.get(timeout=300) expected_results.remove(response) num_responses += 1 @@ -157,7 +157,7 @@ def test_rabit_delay_master(): num_responses = 0 while num_responses < host_count: - response = q.get(timeout=120) + response = q.get(timeout=300) expected_results.remove(response) num_responses += 1 From 2a2d155e2473d7b49d1dcffb35d13926d49fed46 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Sat, 4 Oct 2025 13:09:47 -0700 Subject: [PATCH 150/157] test xgboost 3.0.5 --- src/sagemaker_xgboost_container/distributed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index 04a6ca06..c94f0915 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -352,7 +352,7 @@ def stop(self): try: collective.finalize() - except Exception as e: + except Exception: self.logger.error(f"{self.current_host} collective finalize failed", exc_info=True) # Wait for tracker thread to finish From 6cde74451d36543e59b0458888a507250d3c6050 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Sat, 4 Oct 2025 14:24:54 -0700 Subject: [PATCH 151/157] test xgboost 3.0.5 --- src/sagemaker_xgboost_container/algorithm_mode/train.py | 4 ++-- .../distributed_gpu/distributed_gpu_training.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/sagemaker_xgboost_container/algorithm_mode/train.py b/src/sagemaker_xgboost_container/algorithm_mode/train.py index fa637a65..30a8233d 100755 --- a/src/sagemaker_xgboost_container/algorithm_mode/train.py +++ b/src/sagemaker_xgboost_container/algorithm_mode/train.py @@ -324,7 +324,7 @@ def train_job(train_cfg, train_dmatrix, val_dmatrix, train_val_dmatrix, model_di train_dmatrix, num_boost_round=num_round - iteration, evals=watchlist, - feval=configured_feval, + custom_metric=configured_feval, callbacks=callbacks, xgb_model=xgb_model, verbose_eval=False, @@ -389,7 +389,7 @@ def train_job(train_cfg, train_dmatrix, val_dmatrix, train_val_dmatrix, model_di cv_train_dmatrix, num_boost_round=num_round - iteration, evals=watchlist, - feval=configured_feval, + custom_metric=configured_feval, evals_result=evals_result, callbacks=callbacks, xgb_model=xgb_model, diff --git a/src/sagemaker_xgboost_container/distributed_gpu/distributed_gpu_training.py b/src/sagemaker_xgboost_container/distributed_gpu/distributed_gpu_training.py index 99f95e3c..1abd3462 100644 --- a/src/sagemaker_xgboost_container/distributed_gpu/distributed_gpu_training.py +++ b/src/sagemaker_xgboost_container/distributed_gpu/distributed_gpu_training.py @@ -167,7 +167,7 @@ def run_training_with_dask( dtrain=dtrain, num_boost_round=num_round, evals=watchlist, - feval=configured_feval, + custom_metric=configured_feval, callbacks=callbacks, ) booster = output["booster"] From 6cc8a490fd7335cc0836eb3090fa89df0c52f9b2 Mon Sep 17 00:00:00 2001 From: Li Ning Date: Sun, 5 Oct 2025 16:06:11 -0700 Subject: [PATCH 152/157] test xgboost 3.0.5 --- docker/2.1.0/final/Dockerfile.cpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/2.1.0/final/Dockerfile.cpu b/docker/2.1.0/final/Dockerfile.cpu index 9005bd7f..c3e363d1 100644 --- a/docker/2.1.0/final/Dockerfile.cpu +++ b/docker/2.1.0/final/Dockerfile.cpu @@ -1,4 +1,4 @@ -ARG SAGEMAKER_XGBOOST_VERSION=2.1.0 +ARG SAGEMAKER_XGBOOST_VERSION=2.1-0 ARG PYTHON_VERSION=3.10 FROM xgboost-container-base:${SAGEMAKER_XGBOOST_VERSION}-cpu-py3 From a6a10e2395680d8410d007bd6cbbcbf3aa6b97ea Mon Sep 17 00:00:00 2001 From: Li Ning Date: Mon, 6 Oct 2025 09:28:29 -0700 Subject: [PATCH 153/157] rename 2.1.0 with 3.0.5 --- docker/{2.1.0 => 3.0.5}/base/Dockerfile.cpu | 0 docker/{2.1.0 => 3.0.5}/final/Dockerfile.cpu | 4 ++-- .../resources/mms/ExecutionParameters.java | 0 .../resources/mms/config.properties.tmp | 0 .../resources/mms/endpoints-1.0.jar | Bin tox.ini | 2 +- 6 files changed, 3 insertions(+), 3 deletions(-) rename docker/{2.1.0 => 3.0.5}/base/Dockerfile.cpu (100%) rename docker/{2.1.0 => 3.0.5}/final/Dockerfile.cpu (98%) rename docker/{2.1.0 => 3.0.5}/resources/mms/ExecutionParameters.java (100%) rename docker/{2.1.0 => 3.0.5}/resources/mms/config.properties.tmp (100%) rename docker/{2.1.0 => 3.0.5}/resources/mms/endpoints-1.0.jar (100%) diff --git a/docker/2.1.0/base/Dockerfile.cpu b/docker/3.0.5/base/Dockerfile.cpu similarity index 100% rename from docker/2.1.0/base/Dockerfile.cpu rename to docker/3.0.5/base/Dockerfile.cpu diff --git a/docker/2.1.0/final/Dockerfile.cpu b/docker/3.0.5/final/Dockerfile.cpu similarity index 98% rename from docker/2.1.0/final/Dockerfile.cpu rename to docker/3.0.5/final/Dockerfile.cpu index c3e363d1..6a48f280 100644 --- a/docker/2.1.0/final/Dockerfile.cpu +++ b/docker/3.0.5/final/Dockerfile.cpu @@ -1,9 +1,9 @@ -ARG SAGEMAKER_XGBOOST_VERSION=2.1-0 +ARG SAGEMAKER_XGBOOST_VERSION=3.0-5 ARG PYTHON_VERSION=3.10 FROM xgboost-container-base:${SAGEMAKER_XGBOOST_VERSION}-cpu-py3 -ARG SAGEMAKER_XGBOOST_VERSION=2.1.0 +ARG SAGEMAKER_XGBOOST_VERSION=3.0.5 ######################## # Install dependencies # diff --git a/docker/2.1.0/resources/mms/ExecutionParameters.java b/docker/3.0.5/resources/mms/ExecutionParameters.java similarity index 100% rename from docker/2.1.0/resources/mms/ExecutionParameters.java rename to docker/3.0.5/resources/mms/ExecutionParameters.java diff --git a/docker/2.1.0/resources/mms/config.properties.tmp b/docker/3.0.5/resources/mms/config.properties.tmp similarity index 100% rename from docker/2.1.0/resources/mms/config.properties.tmp rename to docker/3.0.5/resources/mms/config.properties.tmp diff --git a/docker/2.1.0/resources/mms/endpoints-1.0.jar b/docker/3.0.5/resources/mms/endpoints-1.0.jar similarity index 100% rename from docker/2.1.0/resources/mms/endpoints-1.0.jar rename to docker/3.0.5/resources/mms/endpoints-1.0.jar diff --git a/tox.ini b/tox.ini index 251372a5..066ff0e6 100644 --- a/tox.ini +++ b/tox.ini @@ -15,7 +15,7 @@ deps = xgboost1.3: xgboost==1.3.3 xgboost1.5: xgboost==1.5.2 xgboost1.7: xgboost==1.7.4 - xgboost2.1.0: xgboost==3.0.5 + xgboost3.0.5: xgboost==3.0.5 xgboostlatest: xgboost -r{toxinidir}/requirements.txt -r{toxinidir}/test-requirements.txt From 2e34b84fd002691ec95f66a2f1e35fe84583d64c Mon Sep 17 00:00:00 2001 From: Li Ning Date: Tue, 7 Oct 2025 10:54:07 -0700 Subject: [PATCH 154/157] test 3.0.5 --- .../distributed.py | 26 +++---------------- test/unit/test_distributed.py | 18 ++++++++----- 2 files changed, 15 insertions(+), 29 deletions(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index c94f0915..05b0b7e9 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -43,28 +43,6 @@ def wait_hostname_resolution(sm_hosts): _dns_lookup(host) -def get_host_ip(hostIP=None): - if hostIP is None or hostIP == "auto": - hostIP = "ip" - - if hostIP == "dns": - hostIP = socket.getfqdn() - elif hostIP == "ip": - from socket import gaierror - - try: - hostIP = socket.gethostbyname(socket.getfqdn()) - except gaierror: - logging.warn("gethostbyname(socket.getfqdn()) failed... trying on hostname()") - hostIP = socket.gethostbyname(socket.gethostname()) - if hostIP.startswith("127."): - s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) - # doesn't have to be reachable - s.connect(("10.255.255.255", 1)) - hostIP = s.getsockname()[0] - return hostIP - - def rabit_run( exec_fun, args, @@ -74,7 +52,9 @@ def rabit_run( first_port=None, second_port=None, max_connect_attempts=None, - connect_retry_timeout=10, + # TEST LN + # connect_retry_timeout=10, + connect_retry_timeout=3, update_rabit_args=False, ): """Run execution function after initializing dmlc/rabit. diff --git a/test/unit/test_distributed.py b/test/unit/test_distributed.py index a92ba44f..6452b161 100644 --- a/test/unit/test_distributed.py +++ b/test/unit/test_distributed.py @@ -31,8 +31,9 @@ def synchronize_fn(host_count, port, master, idx, q): sys.exit(0) +# TEST LN connect_retry_timeout=60 def rabit_run_fn( - host_count, is_run, first_port, second_port, master, idx, q, max_connect_attempts=None, connect_retry_timeout=60 + host_count, is_run, first_port, second_port, master, idx, q, max_connect_attempts=None, connect_retry_timeout=3 ): hosts = ["127.0.0.1"] + ["localhost" for _ in range(host_count - 1)] current_host = "127.0.0.1" if master else "localhost" @@ -86,7 +87,8 @@ def test_integration_rabit_synchronize(): num_responses = 0 while num_responses < host_count: - host_aggregated_result = q.get(timeout=30) + # TEST LN timeout=30 + host_aggregated_result = q.get(timeout=10) for host_individual_result in host_aggregated_result: assert host_individual_result in expected_results num_responses += 1 @@ -107,7 +109,8 @@ def test_rabit_run_all_hosts_run(): num_responses = 0 while num_responses < host_count: - response = q.get(timeout=120) + # TEST LN timeout=120 + response = q.get(timeout=15) expected_results.remove(response) num_responses += 1 @@ -133,7 +136,8 @@ def test_rabit_run_exclude_one_host(): num_responses = 0 while num_responses < host_count - 1: - response = q.get(timeout=300) + # TEST LN timeout=300 + response = q.get(timeout=15) expected_results.remove(response) num_responses += 1 @@ -157,7 +161,8 @@ def test_rabit_delay_master(): num_responses = 0 while num_responses < host_count: - response = q.get(timeout=300) + # TEST LN timeout=300 + response = q.get(timeout=20) expected_results.remove(response) num_responses += 1 @@ -182,6 +187,7 @@ def test_rabit_run_fail_bad_max_retry_attempts(bad_max_retry_attempts): num_responses = 0 while num_responses < host_count: - host_result = q.get(timeout=30) + # TEST LN timeout=30 + host_result = q.get(timeout=10) assert "max_connect_attempts must be None or an integer greater than 0." in host_result num_responses += 1 From fa34fe3a807e91bb4abdaf7c784f482bd87ddd9b Mon Sep 17 00:00:00 2001 From: Li Ning Date: Tue, 7 Oct 2025 11:39:25 -0700 Subject: [PATCH 155/157] test 3.0.5 --- src/sagemaker_xgboost_container/distributed.py | 4 +--- test/unit/test_distributed.py | 18 ++++++------------ 2 files changed, 7 insertions(+), 15 deletions(-) diff --git a/src/sagemaker_xgboost_container/distributed.py b/src/sagemaker_xgboost_container/distributed.py index 05b0b7e9..a10880c9 100644 --- a/src/sagemaker_xgboost_container/distributed.py +++ b/src/sagemaker_xgboost_container/distributed.py @@ -52,9 +52,7 @@ def rabit_run( first_port=None, second_port=None, max_connect_attempts=None, - # TEST LN - # connect_retry_timeout=10, - connect_retry_timeout=3, + connect_retry_timeout=10, update_rabit_args=False, ): """Run execution function after initializing dmlc/rabit. diff --git a/test/unit/test_distributed.py b/test/unit/test_distributed.py index 6452b161..a92ba44f 100644 --- a/test/unit/test_distributed.py +++ b/test/unit/test_distributed.py @@ -31,9 +31,8 @@ def synchronize_fn(host_count, port, master, idx, q): sys.exit(0) -# TEST LN connect_retry_timeout=60 def rabit_run_fn( - host_count, is_run, first_port, second_port, master, idx, q, max_connect_attempts=None, connect_retry_timeout=3 + host_count, is_run, first_port, second_port, master, idx, q, max_connect_attempts=None, connect_retry_timeout=60 ): hosts = ["127.0.0.1"] + ["localhost" for _ in range(host_count - 1)] current_host = "127.0.0.1" if master else "localhost" @@ -87,8 +86,7 @@ def test_integration_rabit_synchronize(): num_responses = 0 while num_responses < host_count: - # TEST LN timeout=30 - host_aggregated_result = q.get(timeout=10) + host_aggregated_result = q.get(timeout=30) for host_individual_result in host_aggregated_result: assert host_individual_result in expected_results num_responses += 1 @@ -109,8 +107,7 @@ def test_rabit_run_all_hosts_run(): num_responses = 0 while num_responses < host_count: - # TEST LN timeout=120 - response = q.get(timeout=15) + response = q.get(timeout=120) expected_results.remove(response) num_responses += 1 @@ -136,8 +133,7 @@ def test_rabit_run_exclude_one_host(): num_responses = 0 while num_responses < host_count - 1: - # TEST LN timeout=300 - response = q.get(timeout=15) + response = q.get(timeout=300) expected_results.remove(response) num_responses += 1 @@ -161,8 +157,7 @@ def test_rabit_delay_master(): num_responses = 0 while num_responses < host_count: - # TEST LN timeout=300 - response = q.get(timeout=20) + response = q.get(timeout=300) expected_results.remove(response) num_responses += 1 @@ -187,7 +182,6 @@ def test_rabit_run_fail_bad_max_retry_attempts(bad_max_retry_attempts): num_responses = 0 while num_responses < host_count: - # TEST LN timeout=30 - host_result = q.get(timeout=10) + host_result = q.get(timeout=30) assert "max_connect_attempts must be None or an integer greater than 0." in host_result num_responses += 1 From 9ed4eff58f98a8ccf2a8e10db38d85c50e0dbf7a Mon Sep 17 00:00:00 2001 From: Li Ning Date: Thu, 9 Oct 2025 22:58:36 -0700 Subject: [PATCH 156/157] cuda 12.0.0 --- docker/3.0.5/base/Dockerfile.cpu | 4 ++-- src/sagemaker_xgboost_container/callback.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docker/3.0.5/base/Dockerfile.cpu b/docker/3.0.5/base/Dockerfile.cpu index a75b0d00..581a7d4c 100644 --- a/docker/3.0.5/base/Dockerfile.cpu +++ b/docker/3.0.5/base/Dockerfile.cpu @@ -1,6 +1,6 @@ ARG UBUNTU_VERSION=20.04 -ARG CUDA_VERSION=11.6.1 -ARG IMAGE_DIGEST=c2d95c9c6ff77da41cf0f2f9e8c5088f5b4db20c16a7566b808762f05b9032ef +ARG CUDA_VERSION=12.0.0 +ARG IMAGE_DIGEST=dcea6188cf23600a396033b88132f86e295f35aa5ef8fee79187280ff6ecc81a # Build stage for SQLite compilation FROM ubuntu:${UBUNTU_VERSION} as sqlite-builder diff --git a/src/sagemaker_xgboost_container/callback.py b/src/sagemaker_xgboost_container/callback.py index 29a1104a..89d935bd 100644 --- a/src/sagemaker_xgboost_container/callback.py +++ b/src/sagemaker_xgboost_container/callback.py @@ -82,7 +82,7 @@ def get_callbacks( if checkpoint_dir and is_master: save_checkpoint = xgb.callback.TrainingCheckPoint( - directory=checkpoint_dir, iterations=iteration, name=checkpointing.CHECKPOINT_FILENAME + directory=checkpoint_dir, interval=iteration, name=checkpointing.CHECKPOINT_FILENAME ) callbacks.append(save_checkpoint) From 6f043d98e7da74671e886ed3f4e65bacc57d2ecb Mon Sep 17 00:00:00 2001 From: Li Ning Date: Fri, 10 Oct 2025 09:46:28 -0700 Subject: [PATCH 157/157] roll back to cuda 11.6.1 --- docker/3.0.5/base/Dockerfile.cpu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/3.0.5/base/Dockerfile.cpu b/docker/3.0.5/base/Dockerfile.cpu index 581a7d4c..a75b0d00 100644 --- a/docker/3.0.5/base/Dockerfile.cpu +++ b/docker/3.0.5/base/Dockerfile.cpu @@ -1,6 +1,6 @@ ARG UBUNTU_VERSION=20.04 -ARG CUDA_VERSION=12.0.0 -ARG IMAGE_DIGEST=dcea6188cf23600a396033b88132f86e295f35aa5ef8fee79187280ff6ecc81a +ARG CUDA_VERSION=11.6.1 +ARG IMAGE_DIGEST=c2d95c9c6ff77da41cf0f2f9e8c5088f5b4db20c16a7566b808762f05b9032ef # Build stage for SQLite compilation FROM ubuntu:${UBUNTU_VERSION} as sqlite-builder