diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl
index f3193876..d6bc3f9b 100644
--- a/Dockerfile.tmpl
+++ b/Dockerfile.tmpl
@@ -1,66 +1,58 @@
-ARG BASE_IMAGE_REPO \
-    BASE_IMAGE_TAG \
-    CPU_BASE_IMAGE_NAME \
-    GPU_BASE_IMAGE_NAME \
-    LIGHTGBM_VERSION \
-    TORCH_VERSION \
-    TORCHAUDIO_VERSION \
-    TORCHVISION_VERSION \
-    JAX_VERSION
+FROM us-docker.pkg.dev/colab-images/public/runtime:latest
-{{ if eq .Accelerator "gpu" }}
-FROM gcr.io/kaggle-images/python-lightgbm-whl:${GPU_BASE_IMAGE_NAME}-${BASE_IMAGE_TAG}-${LIGHTGBM_VERSION} AS lightgbm_whl
-FROM gcr.io/kaggle-images/python-torch-whl:${GPU_BASE_IMAGE_NAME}-${BASE_IMAGE_TAG}-${TORCH_VERSION} AS torch_whl
-FROM gcr.io/kaggle-images/python-jaxlib-whl:${GPU_BASE_IMAGE_NAME}-${BASE_IMAGE_TAG}-${JAX_VERSION} AS jaxlib_whl
-FROM ${BASE_IMAGE_REPO}/${GPU_BASE_IMAGE_NAME}:${BASE_IMAGE_TAG}
-{{ else }}
-FROM ${BASE_IMAGE_REPO}/${CPU_BASE_IMAGE_NAME}:${BASE_IMAGE_TAG}
-{{ end }}
+ADD kaggle_requirements.txt /kaggle_requirements.txt
+
+# Freeze existing requirements from the base image for critical packages:
+RUN pip freeze | grep -E 'tensorflow|keras|torch|jax|lightgbm' > /colab_requirements.txt
+
+# Merge requirements files:
+RUN cat /colab_requirements.txt >> /requirements.txt
+RUN cat /kaggle_requirements.txt >> /requirements.txt
+
+# TODO: add a GPU-specific requirements.txt.
+# TODO: merge the two files properly (Kaggle pins should override matching Colab pins).
-# Ensures shared libraries installed with conda can be found by the dynamic link loader.
-ENV LIBRARY_PATH="$LIBRARY_PATH:/opt/conda/lib" \
-    LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib"
+# Install uv & Kaggle packages
+RUN curl -LsSf https://astral.sh/uv/install.sh | sh
+RUN export PATH="${HOME}/.local/bin:${PATH}" && uv pip install --system -r /requirements.txt
+ENV PATH="/root/.local/bin:${PATH}"
+# Install manual packages:
+# b/183041606#comment5: the Kaggle data proxy doesn't support these APIs. If the library is missing, it falls back to using a regular BigQuery query to fetch data.
+RUN uv pip uninstall --system google-cloud-bigquery-storage
+
+# NOTE(herbison): uv fails to install this for some reason
+RUN pip install git+https://github.com/Kaggle/learntools
+
+# We install an incompatible pair of libs (shapely<2, libpysal==4.9.2), so we can't put this one in requirements.txt.
+RUN uv pip install --system "libpysal==4.9.2"
+
+# Adding non-package dependencies:
+
+ADD clean-layer.sh /tmp/clean-layer.sh
+ADD patches/nbconvert-extensions.tpl /opt/kaggle/nbconvert-extensions.tpl
+ADD patches/template_conf.json /opt/kaggle/conf.json
+
+# Previously: /opt/conda/lib/python3.10/site-packages
+ARG PACKAGE_PATH=/usr/local/lib/python3.10/dist-packages
+
+# Install GPU-specific non-pip packages.
 {{ if eq .Accelerator "gpu" }}
 ARG CUDA_MAJOR_VERSION \
     CUDA_MINOR_VERSION
 ENV CUDA_MAJOR_VERSION=${CUDA_MAJOR_VERSION} \
     CUDA_MINOR_VERSION=${CUDA_MINOR_VERSION}
+
 # Make sure we are on the right version of CUDA
 RUN update-alternatives --set cuda /usr/local/cuda-$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION
-# NVIDIA binaries from the host are mounted to /opt/bin.
-ENV PATH=/opt/bin:${PATH} \
-    # Add CUDA stubs to LD_LIBRARY_PATH to support building the GPU image on a CPU machine.
-    LD_LIBRARY_PATH_NO_STUBS="$LD_LIBRARY_PATH" \
-    LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64/stubs"
-RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1
-{{ end }}
-# Keep these variables in sync if base image is updated.
-ENV TENSORFLOW_VERSION=2.16.1 \
-    # See https://github.com/tensorflow/io#tensorflow-version-compatibility
-    TENSORFLOW_IO_VERSION=0.37.0
-
-# We need to redefine the ARG here to get the ARG value defined above the FROM instruction.
-# See: https://docs.docker.com/engine/reference/builder/#understand-how-arg-and-from-interact
-ARG LIGHTGBM_VERSION \
-    TORCH_VERSION \
-    TORCHAUDIO_VERSION \
-    TORCHVISION_VERSION \
-    JAX_VERSION
-
-# Disable pesky logs like: KMP_AFFINITY: pid 6121 tid 6121 thread 0 bound to OS proc set 0
-# See: https://stackoverflow.com/questions/57385766/disable-tensorflow-log-information
-ENV KMP_WARNINGS=0 \
-    # Also make the KMP logs noverbose.
-    # https://stackoverflow.com/questions/70250304/stop-tensorflow-from-printing-warning-message
-    KMP_SETTINGS=false \
-    # Remove the pip as the root user warning.
-    PIP_ROOT_USER_ACTION=ignore
+RUN uv pip install --system "pycuda"
+
+# Remove CUDA_VERSION from non-GPU image.
+{{ else }}
+ENV CUDA_VERSION=""
+{{ end }}
-ADD clean-layer.sh /tmp/clean-layer.sh
-ADD patches/nbconvert-extensions.tpl /opt/kaggle/nbconvert-extensions.tpl
-ADD patches/template_conf.json /opt/kaggle/conf.json
 
 # Update GPG key per documentation at https://cloud.google.com/compute/docs/troubleshooting/known-issues
 RUN curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add -
@@ -78,141 +70,18 @@ RUN sed -i "s/httpredir.debian.org/debian.uchicago.edu/" /etc/apt/sources.list &
     apt-get install -y graphviz && pip install graphviz && \
     /tmp/clean-layer.sh
 
-# b/128333086: Set PROJ_DATA to points to the proj4 cartographic library.
-ENV PROJ_DATA=/opt/conda/share/proj
-
-# Install micromamba, setup channels, and replace conda with micromamba
-ENV MAMBA_ROOT_PREFIX=/opt/conda
-RUN curl -L "https://micro.mamba.pm/install.sh" -o /tmp/micromamba-install.sh \
-    && bash /tmp/micromamba-install.sh \
-    && rm /tmp/micromamba-install.sh \
-    && mv ~/.local/bin/micromamba /usr/bin/micromamba \
-    && (!(which conda) || cp /usr/bin/micromamba $(which conda)) \
-    && micromamba config append channels nvidia \
-    && micromamba config append channels rapidsai \
-    && micromamba config append channels conda-forge \
-    && micromamba config set channel_priority flexible \
-    && python -m nb_conda_kernels.install --disable
-
-# Install conda packages not available on pip.
-# When using pip in a conda environment, conda commands should be ran first and then
-# the remaining pip commands: https://www.anaconda.com/using-pip-in-a-conda-environment/
-RUN micromamba install -y mkl cartopy imagemagick pyproj "shapely<2" && \
-    rm -rf /opt/conda/lib/python3.10/site-packages/pyproj/proj_dir/ && \
-    /tmp/clean-layer.sh
-
-# Install spacy
-# b/232247930: uninstall pyarrow to avoid double installation with the GPU specific version.
-# b/341938540: unistall grpc-cpp to allow >=v24.4 cudf and cuml to be installed.
-{{ if eq .Accelerator "gpu" }} -RUN pip uninstall -y pyarrow && \ - micromamba install -vvvy spacy "cudf>=24.4" "cuml>=24.4" cupy cuda-version=$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION && \ - /tmp/clean-layer.sh -{{ else }} -RUN pip install spacy && \ - /tmp/clean-layer.sh -{{ end}} - -# Install PyTorch -# b/356397043: magma-cuda121 is the latest version -{{ if eq .Accelerator "gpu" }} -COPY --from=torch_whl /tmp/whl/*.whl /tmp/torch/ -# b/356397043: We are currently using cuda 12.3, -# but magma-cuda121 is the latest compatible version -RUN micromamba install -y -c pytorch magma-cuda121 && \ - pip install /tmp/torch/*.whl && \ - sudo apt -y install libsox-dev && \ - rm -rf /tmp/torch && \ - /tmp/clean-layer.sh -{{ else }} -RUN pip install \ - torch==$TORCH_VERSION+cpu \ - torchvision==$TORCHVISION_VERSION+cpu \ - torchaudio==$TORCHAUDIO_VERSION+cpu \ - --index-url https://download.pytorch.org/whl/cpu && \ - /tmp/clean-layer.sh -{{ end }} - -# Install LightGBM -{{ if eq .Accelerator "gpu" }} -COPY --from=lightgbm_whl /tmp/whl/*.whl /tmp/lightgbm/ -# Install OpenCL (required by LightGBM GPU version) -RUN apt-get install -y ocl-icd-libopencl1 clinfo && \ - mkdir -p /etc/OpenCL/vendors && \ - echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd && \ - pip install /tmp/lightgbm/*.whl && \ - rm -rf /tmp/lightgbm && \ - /tmp/clean-layer.sh -{{ else }} -RUN pip install lightgbm==$LIGHTGBM_VERSION && \ - /tmp/clean-layer.sh -{{ end }} - -# Install JAX -{{ if eq .Accelerator "gpu" }} -COPY --from=jaxlib_whl /tmp/whl/*.whl /tmp/jax/ -# b/319722433#comment9: Use pip wheels once versions matches our CUDA version. -RUN pip install /tmp/jax/*.whl jax==$JAX_VERSION && \ - /tmp/clean-layer.sh -{{ else }} -RUN pip install jax[cpu] && \ - /tmp/clean-layer.sh -{{ end }} - - -# Install GPU specific packages -{{ if eq .Accelerator "gpu" }} -# Install GPU-only packages -# No specific package for nnabla-ext-cuda 12.x minor versions. -RUN export PATH=/usr/local/cuda/bin:$PATH && \ - export CUDA_ROOT=/usr/local/cuda && \ - pip install pycuda \ - # TODO(379932879): pip resolver fails when not specified. 
-    pynvrtc==9.2 \
-    pynvml && \
-    /tmp/clean-layer.sh
-{{ end }}
-
-RUN apt-get update && \
-    apt-get install -y default-jre && \
-    /tmp/clean-layer.sh
-
-RUN pip install -f http://h2o-release.s3.amazonaws.com/h2o/latest_stable_Py.html h2o && /tmp/clean-layer.sh
-
-RUN pip install \
-    "tensorflow==${TENSORFLOW_VERSION}" \
-    "tensorflow-io==${TENSORFLOW_IO_VERSION}" \
-    tensorflow-probability \
-    tensorflow_decision_forests \
-    tensorflow-text \
-    "tensorflow_hub>=0.16.0" \
-    tf-keras \
-    "keras>3" \
-    keras-cv \
-    keras-nlp && \
-    /tmp/clean-layer.sh
-
 ADD patches/keras_internal.py \
     patches/keras_internal_test.py \
-    /opt/conda/lib/python3.10/site-packages/tensorflow_decision_forests/keras/
+    $PACKAGE_PATH/tensorflow_decision_forests/keras/
-# b/350573866: xgboost v2.1.0 breaks learntools
 RUN apt-get install -y libfreetype6-dev && \
-    apt-get install -y libglib2.0-0 libxext6 libsm6 libxrender1 libfontconfig1 --fix-missing && \
-    rm -rf /opt/conda/lib/python3.10/site-packages/numpy* && \
-    pip install "numpy==1.26.4" && \
-    pip install gensim \
-    textblob \
-    wordcloud \
-    "xgboost==2.0.3" \
-    pydot \
-    hep_ml && \
-    # NLTK Project datasets
-    mkdir -p /usr/share/nltk_data && \
+    apt-get install -y libglib2.0-0 libxext6 libsm6 libxrender1 libfontconfig1 --fix-missing
+
+# NLTK Project datasets
+RUN mkdir -p /usr/share/nltk_data && \
     # NLTK Downloader no longer continues smoothly after an error, so we explicitly list
     # the corpuses that work
-    # "yes | ..." answers yes to the retry prompt in case of an error. See b/133762095.
-    yes | python -m nltk.downloader -d /usr/share/nltk_data abc alpino averaged_perceptron_tagger \
+    python -m nltk.downloader -d /usr/share/nltk_data abc alpino averaged_perceptron_tagger \
     basque_grammars biocreative_ppi bllip_wsj_no_aux \
     book_grammars brown brown_tei cess_cat cess_esp chat80 city_database cmudict \
     comtrans conll2000 conll2002 conll2007 crubadan dependency_treebank \
@@ -225,276 +94,13 @@ RUN apt-get install -y libfreetype6-dev && \
     sentiwordnet shakespeare sinica_treebank smultron snowball_data spanish_grammars \
     state_union stopwords subjectivity swadesh switchboard tagsets timit toolbox treebank \
     twitter_samples udhr2 udhr unicode_samples universal_tagset universal_treebanks_v20 \
-    vader_lexicon verbnet webtext word2vec_sample wordnet wordnet_ic words ycoe && \
-    pip install scikit-image && \
-    pip install opencv-contrib-python opencv-python && \
-    /tmp/clean-layer.sh
-
-RUN pip install cython \
-    fasttext \
-    opencv-contrib-python \
-    opencv-python \
-    "scipy<1.14.0" \
-    # Scikit-learn accelerated library for x86
-    "scikit-learn-intelex>=2023.0.1" \
-    # HDF5 support
-    h5py \
-    # PUDB, for local debugging convenience
-    pudb \
-    imbalanced-learn \
-    # Profiling and other utilities
-    line_profiler \
-    bokeh \
-    numba \
-    datashader \
-    # b/328788268: libpysal 4.10 seems to fail with "module 'shapely' has no attribute 'Geometry'. Did you mean: 'geometry'"
-    "libpysal==4.9.2" \
-    # b/276344496: Install specific version of boto3, because 1.26.103 is broken.
- "boto3==1.26.100" \ - Boruta && \ - # Pandoc is a dependency of deap - apt-get install -y pandoc && \ - /tmp/clean-layer.sh + vader_lexicon verbnet webtext word2vec_sample wordnet wordnet_ic words ycoe RUN apt-get install -y git-lfs && \ - # vtk with dependencies + # vtk dependencies apt-get install -y libgl1-mesa-glx && \ - pip install vtk && \ - # xvfbwrapper with dependencies + # xvfbwrapper dependencies apt-get install -y xvfb && \ - pip install xvfbwrapper && \ - /tmp/clean-layer.sh - -RUN pip install mpld3 \ - gpxpy \ - arrow \ - nilearn \ - nibabel \ - imgaug \ - preprocessing \ - path.py && \ - pip install deap \ - # b/302136621: Fix eli5 import for learntools, newer version require scikit-learn > 1.3 - "tpot==0.12.1" \ - scikit-optimize \ - haversine \ - toolz cytoolz \ - plotly \ - hyperopt \ - langid \ - # Useful data exploration libraries (for missing data and generating reports) - missingno \ - pandas-profiling \ - bayesian-optimization \ - matplotlib-venn \ - pyldavis \ - mlxtend \ - altair \ - ImageHash \ - ecos \ - CVXcanon \ - pymc3 \ - tifffile \ - geojson \ - pydicom \ - wavio \ - SimpleITK \ - squarify \ - fuzzywuzzy \ - python-louvain \ - pyexcel-ods \ - sklearn-pandas \ - prophet \ - holidays \ - holoviews \ - scikit-multilearn \ - leven \ - catboost \ - folium \ - scikit-plot \ - fury dipy \ - plotnine \ - scikit-surprise \ - pymongo \ - eli5 \ - kaggle \ - kagglehub \ - google-generativeai \ - pytest && \ - /tmp/clean-layer.sh - - # Add google PAIR-code Facets -RUN cd /opt/ && git clone https://github.com/PAIR-code/facets && cd facets/ && jupyter nbextension install facets-dist/ --user && \ - export PYTHONPATH=$PYTHONPATH:/opt/facets/facets_overview/python/ && \ - pip install librosa \ - sentencepiece \ - cufflinks \ - lime \ - memory_profiler && \ - /tmp/clean-layer.sh - -RUN pip install annoy \ - category_encoders && \ - # b/183041606#comment5: the Kaggle data proxy doesn't support these APIs. If the library is missing, it falls back to using a regular BigQuery query to fetch data. - pip uninstall -y google-cloud-bigquery-storage && \ - # google-cloud-automl 2.0.0 introduced incompatible API changes, need to pin to 1.0.1 - # After launch this should be installed from pip - pip install git+https://github.com/googleapis/python-aiplatform.git@mb-release \ - google-cloud-automl==1.0.1 \ - google-api-core==1.33.2 \ - google-cloud-bigquery \ - google-cloud-storage && \ - # Split these installations to avoid `pip._vendor.resolvelib.resolvers.ResolutionTooDeep: 200000` - # b/315753846: Unpin translate package. - pip install google-cloud-translate==3.12.1 \ - google-cloud-language==2.* \ - google-cloud-videointelligence==2.* \ - google-cloud-vision==2.* \ - protobuf==3.20.3 \ - # Pandas data reader - pandas-datareader \ - emoji \ - # Add Japanese morphological analysis engine - janome \ - # yellowbrick machine learning visualization library - yellowbrick \ - mlcrate && \ - /tmp/clean-layer.sh - -# b/273059949: The pre-installed nbconvert is slow on html conversions and has to be force-uninstalled. 
-# b/274619697: learntools also requires a specific nbconvert right now
-RUN rm -rf /opt/conda/lib/python3.10/site-packages/{nbconvert,nbclient,mistune,platformdirs}*
-
-RUN pip install bleach \
-    certifi \
-    cycler \
-    decorator \
-    entrypoints \
-    html5lib \
-    ipykernel \
-    ipython \
-    ipython-genutils \
-    ipywidgets==8.1.5 \
-    isoweek \
-    jedi \
-    jsonschema \
-    jupyter-client \
-    jupyter-console \
-    jupyter-core \
-    jupyterlab-lsp \
-    MarkupSafe \
-    mistune \
-    nbformat \
-    notebook \
-    "nbconvert==6.4.5" \
-    papermill \
-    python-lsp-server[all] \
-    olefile \
-    kornia \
-    pandas_summary \
-    pandocfilters \
-    pexpect \
-    pickleshare \
-    Pillow && \
-    # Install openslide and its python binding
-    apt-get install -y openslide-tools && \
-    pip install openslide-python \
-    ptyprocess \
-    Pygments \
-    pyparsing \
-    pytz \
-    PyYAML \
-    pyzmq \
-    qtconsole \
-    six \
-    terminado \
-    tornado \
-    tqdm \
-    traitlets \
-    wcwidth \
-    webencodings \
-    widgetsnbextension \
-    # Require pyarrow newer than https://github.com/advisories/GHSA-5wvp-7f3h-6wmm
-    {{ if eq .Accelerator "gpu" }} pyarrow {{ else }} "pyarrow>=14.0.1" {{ end }}
-
-RUN python -m spacy download en_core_web_sm && python -m spacy download en_core_web_lg && \
-    apt-get update && apt-get install -y ffmpeg && \
-    /tmp/clean-layer.sh
-
- ###########
- #
- # NEW CONTRIBUTORS:
- # Please add new pip/apt installs in this block. Don't forget a "&& \" at the end
- # of all non-final lines. Thanks!
- #
- ###########
-
-RUN rm /opt/conda/lib/python3.10/site-packages/google*/direct_url.json && \
-    rm /opt/conda/lib/python3.10/site-packages/google*/REQUESTED
-# dlib has a libmkl incompatibility:
-# test_dlib_face_detector (test_dlib.TestDLib) ... INTEL MKL ERROR: /opt/conda/bin/../lib/libmkl_avx512.so.2: undefined symbol: mkl_sparse_optimize_bsr_trsm_i8.
-# Intel MKL FATAL ERROR: Cannot load libmkl_avx512.so.2 or libmkl_def.so.2.
-# nnabla breaks protobuf compatibiilty:
-RUN pip install wandb \
-    pyemd \
-    pympler \
-    featuretools \
-    #-e git+https://github.com/SohierDane/BigQuery_Helper#egg=bq_helper \
-    git+https://github.com/Kaggle/learntools \
-    ray \
-    gym \
-    pyarabic \
-    pandasql \
-    # b/302136621: Fix eli5 import for learntools
-    scikit-learn==1.2.2 \
-    # b/329869023 shap 0.45.0 breaks learntools
-    shap==0.44.1 \
-    cesium \
-    rgf_python \
-    jieba \
-    tsfresh \
-    optuna \
-    plotly_express \
-    albumentations \
-    Rtree \
-    accelerate && \
-    apt-get -y install libspatialindex-dev && \
-    # b/370860329: newer versions are not capable with current tensorflow
-    rm -rf /opt/conda/lib/python3.10/site-packages/numpy* && \
-    pip install "numpy==1.26.4" && \
-    pip install pytorch-ignite \
-    bqplot \
-    earthengine-api \
-    transformers \
-    datasets \
-    s3fs \
-    gcsfs \
-    kaggle-environments \
-    # geopandas > v0.14.4 breaks learn tools
-    geopandas==v0.14.4 \
-    "shapely<2" \
-    pydub \
-    pydegensac \
-    torchmetrics \
-    pytorch-lightning \
-    sympy \
-    # flask is used by agents in the simulation competitions.
-    flask \
-    # pycrypto is used by competitions team.
-    pycryptodome \
-    nbdev \
-    easyocr \
-    onnx \
-    tables \
-    openpyxl \
-    timm \
-    torchinfo && \
-    pip install git+https://github.com/facebookresearch/segment-anything.git && \
-    # b/370860329: newer versions are not capable with current tensorflow
-    pip install --no-dependencies fastai fastdownload && \
-    # b/343971718: remove duplicate aiohttp installs, and reinstall it
-    rm -rf /opt/conda/lib/python3.10/site-packages/aiohttp* && \
-    micromamba install --force-reinstall -y aiohttp && \
     /tmp/clean-layer.sh
 
 # Download base easyocr models.
@@ -512,12 +118,7 @@ RUN mkdir -p /root/.EasyOCR/model && \
     /tmp/clean-layer.sh
 
 # Tesseract and some associated utility packages
-RUN apt-get install tesseract-ocr -y && \
-    pip install pytesseract \
-    wand \
-    pdf2image \
-    PyPDF && \
-    /tmp/clean-layer.sh
+RUN apt-get install tesseract-ocr -y
 
 ENV TESSERACT_PATH=/usr/bin/tesseract \
     # For Facets
@@ -525,39 +126,12 @@ ENV TESSERACT_PATH=/usr/bin/tesseract \
     # For Theano with MKL
     MKL_THREADING_LAYER=GNU
 
-# b/308525631: Pin Matplotlib until seaborn can be upgraded
-# to >0.13.0 (now it's stuck by a package conflict with ydata-profiling 4.5.1).
-RUN JAXVER=$(pip freeze | grep -e "^jax==") && \
-    pip install --upgrade \
-    "matplotlib==3.7.5" \
-    # ipympl adds interactive widget support for matplotlib
-    ipympl==0.7.0 \
-    "seaborn==0.12.2" \
-    pyupset \
-    python-dateutil dask dask-expr igraph \
-    pyyaml joblib geopy mne pyshp \
-    pandas \
-    polars \
-    flax \
-    "${JAXVER}" && \
-    /tmp/clean-layer.sh
-
 # Temporary fixes and patches
-# Temporary patch for Dask getting downgraded, which breaks Keras
-RUN pip install --upgrade dask && \
-    # Stop jupyter nbconvert trying to rewrite its folder hierarchy
-    mkdir -p /root/.jupyter && touch /root/.jupyter/jupyter_nbconvert_config.py && touch /root/.jupyter/migrated && \
-    # TODO(b/380921973): Ensure only matplotlib 3.7.5 files are present.
-    rm -r /opt/conda/lib/python3.10/site-packages/matplotlib-3.9.2.dist-info/ && \
+# Stop jupyter nbconvert trying to rewrite its folder hierarchy
+RUN mkdir -p /root/.jupyter && touch /root/.jupyter/jupyter_nbconvert_config.py && touch /root/.jupyter/migrated && \
     mkdir -p /.jupyter && touch /.jupyter/jupyter_nbconvert_config.py && touch /.jupyter/migrated && \
-    # Stop Matplotlib printing junk to the console on first load
-    sed -i "s/^.*Matplotlib is building the font cache using fc-list.*$/# Warning removed by Kaggle/g" /opt/conda/lib/python3.10/site-packages/matplotlib/font_manager.py && \
     # Make matplotlib output in Jupyter notebooks display correctly
     mkdir -p /etc/ipython/ && echo "c = get_config(); c.IPKernelApp.matplotlib = 'inline'" > /etc/ipython/ipython_config.py && \
-    # Temporary patch for broken libpixman 0.38 in conda-forge, symlink to system libpixman 0.34 untile conda package gets updated to 0.38.5 or higher.
-    ln -sf /usr/lib/x86_64-linux-gnu/libpixman-1.so.0.34.0 /opt/conda/lib/libpixman-1.so.0.38.0 && \
-    # b/333854354: pin jupyter-server to version 2.12.5; later versions break LSP (b/333854354)
-    pip install --force-reinstall --no-deps jupyter_server==2.12.5 && \
     /tmp/clean-layer.sh
 
 # Fix to import bq_helper library without downgrading setuptools
@@ -566,50 +140,35 @@ RUN mkdir -p ~/src && git clone https://github.com/SohierDane/BigQuery_Helper ~/
     mv ~/src/BigQuery_Helper/bq_helper.py ~/src/BigQuery_Helper/bq_helper/__init__.py && \
     mv ~/src/BigQuery_Helper/test_helper.py ~/src/BigQuery_Helper/bq_helper/ && \
     sed -i 's/)/packages=["bq_helper"])/g' ~/src/BigQuery_Helper/setup.py && \
-    pip install -e ~/src/BigQuery_Helper && \
+    uv pip install --system -e ~/src/BigQuery_Helper && \
     /tmp/clean-layer.sh
+
+# Install ImageMagick for Wand.
+# https://docs.wand-py.org/en/latest/guide/install.html#install-imagemagick-on-debian-ubuntu
+RUN apt-get install -y libmagickwand-dev
+
+# Override default imagemagick policies
+ADD patches/imagemagick-policy.xml /etc/ImageMagick-6/policy.xml
+
+# Add Kaggle module resolver
+ADD patches/kaggle_module_resolver.py $PACKAGE_PATH/tensorflow_hub/kaggle_module_resolver.py
+RUN sed -i '/from tensorflow_hub import uncompressed_module_resolver/a from tensorflow_hub import kaggle_module_resolver' $PACKAGE_PATH/tensorflow_hub/config.py && \
+    sed -i '/_install_default_resolvers()/a \ \ registry.resolver.add_implementation(kaggle_module_resolver.KaggleFileResolver())' $PACKAGE_PATH/tensorflow_hub/config.py
+
 # Add BigQuery client proxy settings
-ENV PYTHONUSERBASE "/root/.local"
+ENV PYTHONUSERBASE="/root/.local"
 ADD patches/kaggle_gcp.py \
     patches/kaggle_secrets.py \
     patches/kaggle_session.py \
     patches/kaggle_web_client.py \
     patches/kaggle_datasets.py \
     patches/log.py \
-    patches/sitecustomize.py \
-    /root/.local/lib/python3.10/site-packages/
+    $PACKAGE_PATH/
 
-# Override default imagemagick policies
-ADD patches/imagemagick-policy.xml /etc/ImageMagick-6/policy.xml
-
-# Add Kaggle module resolver
-ADD patches/kaggle_module_resolver.py /opt/conda/lib/python3.10/site-packages/tensorflow_hub/kaggle_module_resolver.py
-RUN sed -i '/from tensorflow_hub import uncompressed_module_resolver/a from tensorflow_hub import kaggle_module_resolver' /opt/conda/lib/python3.10/site-packages/tensorflow_hub/config.py && \
-    sed -i '/_install_default_resolvers()/a \ \ registry.resolver.add_implementation(kaggle_module_resolver.KaggleFileResolver())' /opt/conda/lib/python3.10/site-packages/tensorflow_hub/config.py && \
-    # Disable preloaded jupyter modules (they add to startup, and break when they are missing)
-    sed -i /bq_stats/d /etc/ipython/ipython_kernel_config.py && \
-    sed -i /beatrix/d /etc/ipython/ipython_kernel_config.py && \
-    sed -i /bigquery/d /etc/ipython/ipython_kernel_config.py && \
-    sed -i /sql/d /etc/ipython/ipython_kernel_config.py
-
-# Force only one libcusolver
-{{ if eq .Accelerator "gpu" }}
-RUN rm /opt/conda/bin/../lib/libcusolver.so.11 && ln -s /usr/local/cuda/lib64/libcusolver.so.11 /opt/conda/bin/../lib/libcusolver.so.11
-{{ else }}
-RUN ln -s /usr/local/cuda/lib64/libcusolver.so.11 /opt/conda/bin/../lib/libcusolver.so.11
-{{ end }}
-
-# b/270147159: conda ships with a version of libtinfo which is missing version info causing warnings, replace it with a good version.
-RUN rm /opt/conda/lib/libtinfo.so.6 && ln -s /usr/lib/x86_64-linux-gnu/libtinfo.so.6 /opt/conda/lib/libtinfo.so.6 && \
-    # b/276358430: fix Jupyter lsp freezing up the jupyter server
-    pip install "jupyter-lsp==1.5.1"
-
-# Set backend for matplotlib
-ENV MPLBACKEND="agg" \
-    # Set LC_ALL
-    # https://github.com/explosion/spaCy/issues/12872#issuecomment-1661847588
-    LC_ALL="POSIX"
+# TODO: figure out why sitecustomize.py has to live in this location rather than in site-packages.
+# Found by setting PYTHONVERBOSE=1, running python, and checking where the interpreter looked for sitecustomize.
+ADD patches/sitecustomize.py /usr/lib/python3.10/sitecustomize.py
 
 ARG GIT_COMMIT=unknown \
     BUILD_DATE=unknown
@@ -620,10 +179,6 @@ LABEL git-commit=$GIT_COMMIT \
 ENV GIT_COMMIT=${GIT_COMMIT} \
     BUILD_DATE=${BUILD_DATE}
 
-LABEL tensorflow-version=$TENSORFLOW_VERSION \
-    # Used in the Jenkins `Docker GPU Build` step to restrict the images being pruned.
-    kaggle-lang=python
-
 # Correlate current release with the git hash inside the kernel editor by running `!cat /etc/git_commit`.
 RUN echo "$GIT_COMMIT" > /etc/git_commit && echo "$BUILD_DATE" > /etc/build_date
 
@@ -633,3 +188,4 @@ ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH_NO_STUBS" \
     # Add the CUDA home.
     CUDA_HOME=/usr/local/cuda
 {{ end }}
+ENTRYPOINT ["/usr/bin/env"]
diff --git a/Jenkinsfile b/Jenkinsfile
index 93f4753d..4980b956 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -21,66 +21,6 @@ pipeline {
   }
 
   stages {
-    stage('Pre-build Packages from Source') {
-      parallel {
-        stage('torch') {
-          options {
-            timeout(time: 300, unit: 'MINUTES')
-          }
-          steps {
-            sh '''#!/bin/bash
-              set -exo pipefail
-              source config.txt
-              cd packages/
-              ./build_package --base-image $BASE_IMAGE_REPO/$GPU_BASE_IMAGE_NAME:$BASE_IMAGE_TAG \
-                --package torch \
-                --version $TORCH_VERSION \
-                --build-arg TORCHAUDIO_VERSION=$TORCHAUDIO_VERSION \
-                --build-arg TORCHVISION_VERSION=$TORCHVISION_VERSION \
-                --build-arg CUDA_MAJOR_VERSION=$CUDA_MAJOR_VERSION \
-                --build-arg CUDA_MINOR_VERSION=$CUDA_MINOR_VERSION \
-                --push
-            '''
-          }
-        }
-        stage('lightgbm') {
-          options {
-            timeout(time: 10, unit: 'MINUTES')
-          }
-          steps {
-            sh '''#!/bin/bash
-              set -exo pipefail
-              source config.txt
-              cd packages/
-              ./build_package --base-image $BASE_IMAGE_REPO/$GPU_BASE_IMAGE_NAME:$BASE_IMAGE_TAG \
-                --package lightgbm \
-                --version $LIGHTGBM_VERSION \
-                --build-arg CUDA_MAJOR_VERSION=$CUDA_MAJOR_VERSION \
-                --build-arg CUDA_MINOR_VERSION=$CUDA_MINOR_VERSION \
-                --push
-            '''
-          }
-        }
-        stage('jaxlib') {
-          options {
-            timeout(time: 300, unit: 'MINUTES')
-          }
-          steps {
-            sh '''#!/bin/bash
-              set -exo pipefail
-              source config.txt
-              cd packages/
-              ./build_package --base-image $BASE_IMAGE_REPO/$GPU_BASE_IMAGE_NAME:$BASE_IMAGE_TAG \
-                --package jaxlib \
-                --version $JAX_VERSION \
-                --build-arg CUDA_MAJOR_VERSION=$CUDA_MAJOR_VERSION \
-                --build-arg CUDA_MINOR_VERSION=$CUDA_MINOR_VERSION \
-                --push
-            '''
-          }
-        }
-      }
-    }
     stage('Build/Test/Diff') {
       parallel {
         stage('CPU') {
diff --git a/clean-layer.sh b/clean-layer.sh
index d1a048fc..467e1cac 100755
--- a/clean-layer.sh
+++ b/clean-layer.sh
@@ -19,6 +19,4 @@ apt-get clean
 # Ensures the current working directory won't be deleted
 cd /usr/local/src/
 # Delete source files used for building binaries
-rm -rf /usr/local/src/*
-# Delete conda downloaded tarballs
-conda clean -y --tarballs
+rm -rf /usr/local/src/*
\ No newline at end of file
diff --git a/config.txt b/config.txt
index e95a1af1..c0a7711c 100644
--- a/config.txt
+++ b/config.txt
@@ -1,11 +1,2 @@
-BASE_IMAGE_REPO=gcr.io/deeplearning-platform-release
-BASE_IMAGE_TAG=m122
-CPU_BASE_IMAGE_NAME=tf2-cpu.2-16.py310
-GPU_BASE_IMAGE_NAME=tf2-gpu.2-16.py310
-LIGHTGBM_VERSION=4.2.0
-TORCH_VERSION=2.4.0
-TORCHAUDIO_VERSION=2.4.0
-TORCHVISION_VERSION=0.19.0
-JAX_VERSION=0.4.26
 CUDA_MAJOR_VERSION=12
-CUDA_MINOR_VERSION=3
+CUDA_MINOR_VERSION=2
diff --git a/kaggle_requirements.txt b/kaggle_requirements.txt
new file mode 100644
index 00000000..6400d8a1
--- /dev/null
+++ b/kaggle_requirements.txt
@@ -0,0 +1,139 @@
+altair>=5.4.0
+Babel
+Boruta
+Cartopy
+ImageHash
+Janome
+PyArabic
+PyUpSet
+Pympler
+Rtree
+shapely<2
+SimpleITK
+TPOT
+Theano
+Wand
+annoy
+arrow
+bayesian-optimization
+boto3
+catboost
+category-encoders
+cesium
+comm
+cytoolz
+dask-expr
+datasets
+datashader
+deap
+dipy
+docker
+easyocr
+eli5
+emoji
+fasttext
+featuretools
+fiona
+fury
+fuzzywuzzy
+geojson
+# geopandas > v0.14.4 breaks learn tools
+geopandas==v0.14.4
+google-cloud-aiplatform
+# google-cloud-automl 2.0.0 introduced incompatible API changes, need to pin to 1.0.1
+google-cloud-automl==1.0.1
+# b/315753846: Unpin translate package.
+google-cloud-translate==3.12.1
+google-cloud-videointelligence
+google-cloud-vision
+gpxpy
+h2o
+haversine
+hep-ml
+igraph
+ipympl
+ipywidgets==8.1.5
+isoweek
+jedi
+# b/276358430: fix Jupyter lsp freezing up the jupyter server
+jupyter-lsp==1.5.1
+# b/333854354: pin jupyter-server to version 2.12.5; later versions break LSP (b/333854354)
+jupyter_server==2.12.5
+jupyterlab
+jupyterlab-lsp
+kaggle-environments
+kagglehub>=0.3.4
+# Keras 3.6 broke test_keras.py > test_train > keras.datasets.mnist.load_data():
+# See https://github.com/keras-team/keras/commit/dcefb139863505d166dd1325066f329b3033d45a
+keras<3.6
+keras-cv
+keras-nlp
+keras-tuner
+kornia
+langid
+leven
+# b/328788268: libpysal 4.10 seems to fail with "module 'shapely' has no attribute 'Geometry'. Did you mean: 'geometry'"
+libpysal<=4.9.2
+lime
+line_profiler
+mamba
+mlcrate
+mne
+mpld3
+nbdev
+nilearn
+olefile
+onnx
+openslide-bin
+openslide-python
+optuna
+pandas-profiling
+pandasql
+papermill
+path
+path.py
+pdf2image
+plotly-express
+preprocessing
+pudb
+pyLDAvis
+pycryptodome
+pydegensac
+pydicom
+pydub
+pyemd
+pyexcel-ods
+pymc3
+pymongo
+pypdf
+pytesseract
+python-lsp-server
+pytorch-ignite
+pytorch-lightning
+qgrid
+qtconsole
+ray
+rgf-python
+s3fs
+scikit-learn-intelex
+scikit-multilearn
+scikit-optimize
+scikit-plot
+scikit-surprise
+git+https://github.com/facebookresearch/segment-anything.git
+shap
+squarify
+tensorflow-cloud
+tensorflow-io
+tensorflow-text
+tensorflow_decision_forests
+timm
+torchinfo
+torchmetrics
+tsfresh
+vtk
+wandb
+wavio
+xgboost==2.0.3
+xvfbwrapper
+ydata-profiling
diff --git a/test b/test
index ef1ffe3e..c2748e81 100755
--- a/test
+++ b/test
@@ -3,7 +3,7 @@ set -e
 IMAGE_TAG='kaggle/python-build'
 IMAGE_TAG_OVERRIDE=''
-ADDITONAL_OPTS=''
+ADDITONAL_OPTS='--runtime runc ' # Use the CPU runtime by default
 PATTERN='test*.py'
 
 usage() {
@@ -69,8 +69,6 @@ readonly ADDITONAL_OPTS
 readonly PATTERN
 
 set -x
-docker run --rm --net=none -v /tmp/python-build:/tmp/python-build "$IMAGE_TAG" rm -rf /tmp/python-build/*
-docker rm jupyter_test || true
 mkdir -p /tmp/python-build/tmp
 mkdir -p /tmp/python-build/devshm
 mkdir -p /tmp/python-build/working
@@ -97,6 +95,9 @@ fi
 # Note about `--hostname localhost` (b/158137436)
 # hostname defaults to the container name which fails DNS name
 # resolution with --net=none (required to keep tests hermetic). See details in bug.
+#
+# Note about CLOUDSDK_CONFIG=/tmp/.config/gcloud
+# We use the /tmp dir since the filesystem is --read-only and we need writable space for gcloud configs.
 docker run --rm -t --read-only --net=none \
     -e HOME=/tmp -e KAGGLE_DATA_PROXY_TOKEN=test-key \
     -e KAGGLE_USER_SECRETS_TOKEN_KEY=test-secrets-key \
@@ -105,6 +106,7 @@ docker run --rm -t --read-only --net=none \
     -e KAGGLE_DATA_PROXY_PROJECT=test \
     -e TF_FORCE_GPU_ALLOW_GROWTH=true \
     -e XLA_PYTHON_CLIENT_PREALLOCATE=false \
+    -e CLOUDSDK_CONFIG=/tmp/.config/gcloud \
     --hostname localhost \
     --shm-size=2g \
    -v $PWD:/input:ro -v /tmp/python-build/working:/working \
diff --git a/tests/test_cuml.py b/tests/test_cuml.py
index bbb7f0c6..695e47ca 100644
--- a/tests/test_cuml.py
+++ b/tests/test_cuml.py
@@ -6,6 +6,7 @@ class TestCuml(unittest.TestCase):
 
     @gpu_test
     @p100_exempt # b/342143152: cuML(>=24.4v) is inompatible with p100 GPUs.
+    @unittest.skip("b/381287748 cuML is not installed in Colab.")
     def test_pca_fit_transform(self):
         import unittest
         import numpy as np
diff --git a/tests/test_fastai.py b/tests/test_fastai.py
index 0de1f82f..49bce0ac 100644
--- a/tests/test_fastai.py
+++ b/tests/test_fastai.py
@@ -27,8 +27,9 @@ def test_tabular(self):
             "/input/tests/data/train.csv",
             cont_names=["pixel"+str(i) for i in range(784)],
             y_names='label',
-            procs=[FillMissing, Categorify, Normalize])
+            procs=[FillMissing, Categorify, Normalize])
         learn = tabular_learner(dls, layers=[200, 100])
-        learn.fit_one_cycle(n_epoch=1)
+        with learn.no_bar():
+            learn.fit_one_cycle(n_epoch=1)
 
-        self.assertGreater(learn.smooth_loss, 0)
+        self.assertGreater(learn.smooth_loss, 0)
diff --git a/tests/test_lightgbm.py b/tests/test_lightgbm.py
index bcdbb1a6..7001a0a7 100644
--- a/tests/test_lightgbm.py
+++ b/tests/test_lightgbm.py
@@ -34,7 +34,9 @@ def test_cpu(self):
 
         self.assertEqual(1, gbm.best_iteration)
 
+    # TODO(b/381256047): Colab needs to install GPU-enabled lightgbm.
     @gpu_test
+    @unittest.skip("Skipping this test until b/381256047 is resolved.")
     def test_gpu(self):
         lgb_train, lgb_eval = self.load_datasets()
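
Note on the requirements-merge TODO in Dockerfile.tmpl: the two `RUN cat ... >> /requirements.txt` layers only concatenate, so a package pinned in both /colab_requirements.txt and /kaggle_requirements.txt appears twice and the install can pick up the wrong constraint. A minimal sketch of an override-aware merge follows; merge_requirements.py is a hypothetical helper (not part of this change), and its name parsing is deliberately simplified relative to full PEP 508.

# merge_requirements.py (hypothetical, untested sketch)
# Later files win, so Kaggle pins override matching Colab pins.
import re
import sys


def pkg_name(line: str) -> str:
    """Best-effort distribution name for one requirement line."""
    # Drop comments and environment markers, then cut at the first version operator.
    spec = line.split("#")[0].split(";")[0].strip()
    name = re.split(r"[<>=!~\[ ]", spec, maxsplit=1)[0]
    # Normalize: VCS URLs (git+https://...) fall through unchanged and stay unique keys.
    return name.lower().replace("_", "-")


def merge(paths: list[str]) -> list[str]:
    merged: dict[str, str] = {}
    for path in paths:  # order matters: later files override earlier ones
        with open(path) as f:
            for raw in f:
                line = raw.strip()
                if line and not line.startswith("#"):
                    merged[pkg_name(line)] = line
    return sorted(merged.values(), key=str.lower)


if __name__ == "__main__":
    print("\n".join(merge(sys.argv[1:])))

With something along these lines, the two `cat` layers would collapse into a single step such as `RUN python /merge_requirements.py /colab_requirements.txt /kaggle_requirements.txt > /requirements.txt`.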
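
Similarly, for anyone tracing the two `sed` edits that wire the Kaggle module resolver into $PACKAGE_PATH/tensorflow_hub/config.py: the patched file should end up shaped roughly like the sketch below. This is abridged and assumes the tensorflow_hub layout at the time of writing (imports at top, registration triggered from a `_run()`-style entry point); treat it as an assumption to verify against the installed version, not the actual file.

# tensorflow_hub/config.py after the sed edits (abridged, hypothetical sketch)
from tensorflow_hub import registry
from tensorflow_hub import uncompressed_module_resolver
from tensorflow_hub import kaggle_module_resolver  # appended by the first sed


def _run():
  # ... default resolver/policy setup elided ...
  _install_default_resolvers()
  registry.resolver.add_implementation(kaggle_module_resolver.KaggleFileResolver())  # appended by the second sed

One thing worth double-checking during review: `sed '/_install_default_resolvers()/a ...'` appends after every matching line, so if config.py also contains a `def _install_default_resolvers():` line, the registration is inserted inside that function body as well. That is still valid Python and merely registers the resolver twice, but it is easy to miss.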