From f13f634100bccdf249d8d83f8f99536f3ea57f3b Mon Sep 17 00:00:00 2001
From: Johnny Chavez <64660690+calderjo@users.noreply.github.com>
Date: Thu, 19 Sep 2024 09:40:29 -0700
Subject: [PATCH] Mixed bag clean up (#1425)

A handful of things:
1) Combined some layers (the consolidation pattern is sketched after the diff).
2) Made comments referring to bugs use a consistent format.
3) Removed low-usage packages and their tests. Since we recently launched the
package manager, we can confidently remove more unused packages.

The following packages are to be removed:
- pykalman
- preprocessing
- hmmlearn
- gplearn
- geoplot
- polyglot
- ggplot
- descartes
- fitter
- imagecodecs
- wfdb
- hpsklearn
- cleverhans
- osmnx
- pysal
- wordsegment
- vowpalwabbit
- feather
- kmodes
- ortools
- mlens
- vecstack
- Geohash
- geoviews
- s2sphere
- flashtext
- kmapper
- stemming
- hunspell
- spectral
- essentia
- hypertools
- stop_words
- scattertext
- vaex
- blake3
- catalyst (note: no longer maintained since 2022)
---
 Dockerfile.tmpl            | 293 ++++++++++++++-----------------------
 tests/test_catalyst.py     | 158 --------------------
 tests/test_essentia.py     |   7 -
 tests/test_geoviews.py     |  17 ---
 tests/test_ggplot.py       |  12 --
 tests/test_imports.py      |   1 -
 tests/test_kmapper.py      |   7 -
 tests/test_matplotlib.py   |   7 +
 tests/test_pykalman.py     |  47 ------
 tests/test_vaex.py         |  10 --
 tests/test_vowpalwabbit.py |  10 --
 11 files changed, 119 insertions(+), 450 deletions(-)
 delete mode 100644 tests/test_catalyst.py
 delete mode 100644 tests/test_essentia.py
 delete mode 100644 tests/test_geoviews.py
 delete mode 100644 tests/test_ggplot.py
 delete mode 100644 tests/test_kmapper.py
 delete mode 100644 tests/test_pykalman.py
 delete mode 100644 tests/test_vaex.py
 delete mode 100644 tests/test_vowpalwabbit.py

diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl
index 8b3c3382..c0037839 100644
--- a/Dockerfile.tmpl
+++ b/Dockerfile.tmpl
@@ -1,12 +1,12 @@
-ARG BASE_IMAGE_REPO
-ARG BASE_IMAGE_TAG
-ARG CPU_BASE_IMAGE_NAME
-ARG GPU_BASE_IMAGE_NAME
-ARG LIGHTGBM_VERSION
-ARG TORCH_VERSION
-ARG TORCHAUDIO_VERSION
-ARG TORCHVISION_VERSION
-ARG JAX_VERSION
+ARG BASE_IMAGE_REPO \
+    BASE_IMAGE_TAG \
+    CPU_BASE_IMAGE_NAME \
+    GPU_BASE_IMAGE_NAME \
+    LIGHTGBM_VERSION \
+    TORCH_VERSION \
+    TORCHAUDIO_VERSION \
+    TORCHVISION_VERSION \
+    JAX_VERSION

 {{ if eq .Accelerator "gpu" }}
 FROM gcr.io/kaggle-images/python-lightgbm-whl:${GPU_BASE_IMAGE_NAME}-${BASE_IMAGE_TAG}-${LIGHTGBM_VERSION} AS lightgbm_whl
@@ -18,61 +18,50 @@ FROM ${BASE_IMAGE_REPO}/${CPU_BASE_IMAGE_NAME}:${BASE_IMAGE_TAG}
 {{ end }}

 # Ensures shared libraries installed with conda can be found by the dynamic link loader.
-ENV LIBRARY_PATH="$LIBRARY_PATH:/opt/conda/lib"
-ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib"
+ENV LIBRARY_PATH="$LIBRARY_PATH:/opt/conda/lib" \
+    LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib"

 {{ if eq .Accelerator "gpu" }}
-ARG CUDA_MAJOR_VERSION
-ARG CUDA_MINOR_VERSION
-ENV CUDA_MAJOR_VERSION=${CUDA_MAJOR_VERSION}
-ENV CUDA_MINOR_VERSION=${CUDA_MINOR_VERSION}
+ARG CUDA_MAJOR_VERSION \
+    CUDA_MINOR_VERSION
+ENV CUDA_MAJOR_VERSION=${CUDA_MAJOR_VERSION} \
+    CUDA_MINOR_VERSION=${CUDA_MINOR_VERSION}

 # Make sure we are on the right version of CUDA
 RUN update-alternatives --set cuda /usr/local/cuda-$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION

 # NVIDIA binaries from the host are mounted to /opt/bin.
-ENV PATH=/opt/bin:${PATH}
-# Add CUDA stubs to LD_LIBRARY_PATH to support building the GPU image on a CPU machine.
-ENV LD_LIBRARY_PATH_NO_STUBS="$LD_LIBRARY_PATH" -ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64/stubs" +ENV PATH=/opt/bin:${PATH} \ + # Add CUDA stubs to LD_LIBRARY_PATH to support building the GPU image on a CPU machine. + LD_LIBRARY_PATH_NO_STUBS="$LD_LIBRARY_PATH" \ + LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64/stubs" RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 {{ end }} # Keep these variables in sync if base image is updated. -ENV TENSORFLOW_VERSION=2.16.1 -# See https://github.com/tensorflow/io#tensorflow-version-compatibility -ENV TENSORFLOW_IO_VERSION=0.37.0 +ENV TENSORFLOW_VERSION=2.16.1 \ + # See https://github.com/tensorflow/io#tensorflow-version-compatibility + TENSORFLOW_IO_VERSION=0.37.0 # We need to redefine the ARG here to get the ARG value defined above the FROM instruction. # See: https://docs.docker.com/engine/reference/builder/#understand-how-arg-and-from-interact -ARG LIGHTGBM_VERSION -ARG TORCH_VERSION -ARG TORCHAUDIO_VERSION -ARG TORCHVISION_VERSION -ARG JAX_VERSION +ARG LIGHTGBM_VERSION \ + TORCH_VERSION \ + TORCHAUDIO_VERSION \ + TORCHVISION_VERSION \ + JAX_VERSION # Disable pesky logs like: KMP_AFFINITY: pid 6121 tid 6121 thread 0 bound to OS proc set 0 # See: https://stackoverflow.com/questions/57385766/disable-tensorflow-log-information -ENV KMP_WARNINGS=0 -# Also make the KMP logs noverbose. -# https://stackoverflow.com/questions/70250304/stop-tensorflow-from-printing-warning-message -ENV KMP_SETTINGS=false - -# Remove the pip as the root user warning. -ENV PIP_ROOT_USER_ACTION=ignore +ENV KMP_WARNINGS=0 \ + # Also make the KMP logs noverbose. + # https://stackoverflow.com/questions/70250304/stop-tensorflow-from-printing-warning-message + KMP_SETTINGS=false \ + # Remove the pip as the root user warning. + PIP_ROOT_USER_ACTION=ignore ADD clean-layer.sh /tmp/clean-layer.sh ADD patches/nbconvert-extensions.tpl /opt/kaggle/nbconvert-extensions.tpl ADD patches/template_conf.json /opt/kaggle/conf.json -# b/276344496: Install specific version of boto3, because 1.26.103 is broken. -RUN pip install boto3==1.26.100 && \ - /tmp/clean-layer.sh - -{{ if eq .Accelerator "gpu" }} -# b/200968891 Keeps horovod once torch is upgraded. -RUN pip uninstall -y horovod && \ - /tmp/clean-layer.sh -{{ end }} - # Update GPG key per documentation at https://cloud.google.com/compute/docs/troubleshooting/known-issues RUN curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add - RUN curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - @@ -98,7 +87,7 @@ ENV PROJ_DATA=/opt/conda/share/proj RUN conda config --add channels nvidia && \ conda config --add channels rapidsai && \ conda config --set solver libmamba && \ - # b/299991198 remove curl/libcurl install once DLVM base image includes version >= 7.86 + # b/299991198: remove curl/libcurl install once DLVM base image includes version >= 7.86 conda install -c conda-forge mamba curl libcurl && \ # Base image channel order: conda-forge (highest priority), defaults. # End state: rapidsai (highest priority), nvidia, conda-forge, defaults. @@ -178,12 +167,15 @@ RUN export PATH=/usr/local/cuda/bin:$PATH && \ /tmp/clean-layer.sh {{ end }} -# (b/308525631) Pin Matplotlib until seaborn can be upgraded +# b/308525631: Pin Matplotlib until seaborn can be upgraded # to >0.13.0 (now it's stuck by a package conflict with ydata-profiling 4.5.1). 
RUN JAXVER=$(pip freeze | grep -e "^jax==") && \ pip install --upgrade \ "matplotlib<3.8.0" \ + # ipympl adds interactive widget support for matplotlib + ipympl==0.7.0 \ "seaborn==0.12.2" \ + pyupset \ python-dateutil dask dask-expr igraph \ pyyaml joblib geopy mne pyshp \ pandas \ @@ -205,21 +197,17 @@ RUN pip install \ tensorflow_decision_forests \ tensorflow-text \ "tensorflow_hub>=0.16.0" \ - # b/331799280 remove once other packages over to dm-tre - optree \ - tf-keras && \ + tf-keras \ + "keras>3" \ + keras-cv \ + keras-nlp && \ /tmp/clean-layer.sh -ADD patches/keras_internal.py /opt/conda/lib/python3.10/site-packages/tensorflow_decision_forests/keras/keras_internal.py -ADD patches/keras_internal_test.py /opt/conda/lib/python3.10/site-packages/tensorflow_decision_forests/keras/keras_internal_test.py +ADD patches/keras_internal.py \ + patches/keras_internal_test.py \ + /opt/conda/lib/python3.10/site-packages/tensorflow_decision_forests/keras/ -RUN pip install "keras>3" keras-cv keras-nlp && \ - /tmp/clean-layer.sh - -# b/328788268 libpysal 4.10 seems to fail with "module 'shapely' has no attribute 'Geometry'. Did you mean: 'geometry'" -RUN pip install pysal "libpysal==4.9.2" - -# b/350573866 xgboost v2.1.0 breaks learntools +# b/350573866: xgboost v2.1.0 breaks learntools RUN apt-get install -y libfreetype6-dev && \ apt-get install -y libglib2.0-0 libxext6 libsm6 libxrender1 libfontconfig1 --fix-missing && \ pip install gensim \ @@ -247,16 +235,15 @@ RUN apt-get install -y libfreetype6-dev && \ state_union stopwords subjectivity swadesh switchboard tagsets timit toolbox treebank \ twitter_samples udhr2 udhr unicode_samples universal_tagset universal_treebanks_v20 \ vader_lexicon verbnet webtext word2vec_sample wordnet wordnet_ic words ycoe && \ - # Stop-words - pip install stop-words \ - scikit-image && \ + pip install scikit-image && \ + pip install opencv-contrib-python opencv-python && \ /tmp/clean-layer.sh -RUN pip install opencv-contrib-python opencv-python && \ - /tmp/clean-layer.sh - -# Pin scipy until we update JAX b/335003097 -RUN pip install "scipy==1.12.0" \ +RUN pip install cython \ + fasttext \ + opencv-contrib-python \ + opencv-python \ + "scipy<1.14.0" \ # Scikit-learn accelerated library for x86 "scikit-learn-intelex>=2023.0.1" \ # HDF5 support @@ -269,17 +256,18 @@ RUN pip install "scipy==1.12.0" \ bokeh \ numba \ datashader \ - # Boruta (python implementation) + # b/328788268: libpysal 4.10 seems to fail with "module 'shapely' has no attribute 'Geometry'. Did you mean: 'geometry'" + "libpysal==4.9.2" \ + # b/276344496: Install specific version of boto3, because 1.26.103 is broken. 
+ "boto3==1.26.100" \ Boruta && \ # Pandoc is a dependency of deap apt-get install -y pandoc && \ - pip install essentia - -RUN apt-get install -y git-lfs && \ /tmp/clean-layer.sh -# vtk with dependencies -RUN apt-get install -y libgl1-mesa-glx && \ +RUN apt-get install -y git-lfs && \ + # vtk with dependencies + apt-get install -y libgl1-mesa-glx && \ pip install vtk && \ # xvfbwrapper with dependencies apt-get install -y xvfb && \ @@ -295,22 +283,19 @@ RUN pip install mpld3 \ nibabel \ imgaug \ preprocessing \ - path.py \ - Geohash && \ + path.py && \ pip install deap \ - # b/302136621 Fix eli5 import for learntools, newer version require scikit-learn > 1.3 + # b/302136621: Fix eli5 import for learntools, newer version require scikit-learn > 1.3 "tpot==0.12.1" \ scikit-optimize \ haversine \ toolz cytoolz \ plotly \ hyperopt \ - fitter \ langid \ # Useful data exploration libraries (for missing data and generating reports) missingno \ pandas-profiling \ - s2sphere \ bayesian-optimization \ matplotlib-venn \ pyldavis \ @@ -320,32 +305,20 @@ RUN pip install mpld3 \ ecos \ CVXcanon \ pymc3 \ - imagecodecs \ tifffile \ - spectral \ - descartes \ geojson \ pydicom \ wavio \ SimpleITK \ - hmmlearn \ - gplearn \ squarify \ fuzzywuzzy \ python-louvain \ pyexcel-ods \ sklearn-pandas \ - stemming \ - # b/266272046 prophet 1.1.2 breaks the test - prophet==1.1.1 \ - # b/283847935 holidays >0.24 is broken - "holidays==0.24" \ + prophet \ + holidays \ holoviews \ - geoviews \ - hypertools \ - mlens \ scikit-multilearn \ - cleverhans \ leven \ catboost \ folium \ @@ -354,7 +327,6 @@ RUN pip install mpld3 \ plotnine \ scikit-surprise \ pymongo \ - geoplot \ eli5 \ kaggle \ kagglehub \ @@ -362,22 +334,16 @@ RUN pip install mpld3 \ pytest && \ /tmp/clean-layer.sh -RUN rm -rf /opt/conda/lib/python3.10/site-packages/numpy-1.23.5.dist-info* # Add google PAIR-code Facets RUN cd /opt/ && git clone https://github.com/PAIR-code/facets && cd facets/ && jupyter nbextension install facets-dist/ --user && \ export PYTHONPATH=$PYTHONPATH:/opt/facets/facets_overview/python/ && \ - pip install kmodes --no-dependencies && \ pip install librosa \ - polyglot \ sentencepiece \ cufflinks \ lime \ memory_profiler && \ /tmp/clean-layer.sh -RUN pip install cython \ - fasttext && \ - apt-get install -y libhunspell-dev && pip install hunspell RUN pip install annoy \ category_encoders && \ # b/183041606#comment5: the Kaggle data proxy doesn't support these APIs. If the library is missing, it falls back to using a regular BigQuery query to fetch data. @@ -390,33 +356,26 @@ RUN pip install annoy \ google-cloud-bigquery \ google-cloud-storage && \ # Split these installations to avoid `pip._vendor.resolvelib.resolvers.ResolutionTooDeep: 200000` - # TODO(b/315753846) Unpin translate package. + # b/315753846: Unpin translate package. pip install google-cloud-translate==3.12.1 \ google-cloud-language==2.* \ google-cloud-videointelligence==2.* \ google-cloud-vision==2.* \ protobuf==3.20.3 \ - ortools \ - scattertext \ # Pandas data reader pandas-datareader \ - wordsegment \ emoji \ # Add Japanese morphological analysis engine janome \ - wfdb \ - vecstack \ # yellowbrick machine learning visualization library yellowbrick \ mlcrate && \ /tmp/clean-layer.sh -# b/273059949 The pre-installed nbconvert is slow on html conversions and has to be force-uninstalled. -# b/274619697 learntools also requires a specific nbconvert right now +# b/273059949: The pre-installed nbconvert is slow on html conversions and has to be force-uninstalled. 
+# b/274619697: learntools also requires a specific nbconvert right now RUN rm -rf /opt/conda/lib/python3.10/site-packages/{nbconvert,nbclient,mistune,platformdirs}* -# Fix qgrid by pinning ipywidgets https://github.com/quantopian/qgrid/issues/376 -# allennlp \ RUN pip install bleach \ certifi \ cycler \ @@ -426,6 +385,7 @@ RUN pip install bleach \ ipykernel \ ipython \ ipython-genutils \ + # Fix qgrid by pinning ipywidgets https://github.com/quantopian/qgrid/issues/376 ipywidgets==7.7.1 \ isoweek \ jedi \ @@ -447,7 +407,6 @@ RUN pip install bleach \ pandocfilters \ pexpect \ pickleshare \ - # TODO(b/290035631) unpin when EasyOCR did a release. Pillow && \ # Install openslide and its python binding apt-get install -y openslide-tools && \ @@ -469,7 +428,6 @@ RUN pip install bleach \ widgetsnbextension \ # Require pyarrow newer than https://github.com/advisories/GHSA-5wvp-7f3h-6wmm {{ if eq .Accelerator "gpu" }} pyarrow {{ else }} "pyarrow>=14.0.1" {{ end }} \ - feather-format \ fastai RUN python -m spacy download en_core_web_sm && python -m spacy download en_core_web_lg && \ @@ -484,20 +442,14 @@ RUN python -m spacy download en_core_web_sm && python -m spacy download en_core_ # ########### -RUN rm /opt/conda/lib/python3.10/site-packages/google*/direct_url.json -RUN rm /opt/conda/lib/python3.10/site-packages/google*/REQUESTED - +RUN rm /opt/conda/lib/python3.10/site-packages/google*/direct_url.json && \ + rm /opt/conda/lib/python3.10/site-packages/google*/REQUESTED # dlib has a libmkl incompatibility: # test_dlib_face_detector (test_dlib.TestDLib) ... INTEL MKL ERROR: /opt/conda/bin/../lib/libmkl_avx512.so.2: undefined symbol: mkl_sparse_optimize_bsr_trsm_i8. # Intel MKL FATAL ERROR: Cannot load libmkl_avx512.so.2 or libmkl_def.so.2. # nnabla breaks protobuf compatibiilty: -RUN pip install flashtext \ - wandb \ - # b/214080882 blake3 0.3.0 is not compatible with vaex. - blake3==0.2.1 \ - vaex \ +RUN pip install wandb \ pyemd \ - pyupset \ pympler \ featuretools \ #-e git+https://github.com/SohierDane/BigQuery_Helper#egg=bq_helper \ @@ -506,30 +458,24 @@ RUN pip install flashtext \ gym \ pyarabic \ pandasql \ - # b/302136621 Fix eli5 import for learntools + # b/302136621: Fix eli5 import for learntools scikit-learn==1.2.2 \ - hpsklearn \ - kmapper \ - # b/329869023 shap 0.45.0 breaks learntools + # b/329869023 shap 0.45.0 breaks learntools shap==0.44.1 \ cesium \ rgf_python \ jieba \ - # ggplot is broken and main repo does not merge and release https://github.com/yhat/ggpy/pull/668 - https://github.com/hbasria/ggpy/archive/0.11.5.zip \ tsfresh \ - pykalman \ optuna \ plotly_express \ albumentations \ - accelerate \ - # b/290207097 switch back to the pip catalyst package when bug fixed - # https://github.com/catalyst-team/catalyst/issues/1440 - git+https://github.com/Philmod/catalyst.git@fix-fp16#egg=catalyst \ - osmnx && \ + Rtree \ + accelerate && \ apt-get -y install libspatialindex-dev -RUN pip install pytorch-ignite \ +RUN rm -rf /opt/conda/lib/python3.10/site-packages/numpy* && \ + pip install "numpy==1.26.4" && \ + pip install pytorch-ignite \ qgrid \ bqplot \ earthengine-api \ @@ -541,7 +487,6 @@ RUN pip install pytorch-ignite \ # geopandas > v0.14.4 breaks learn tools geopandas==v0.14.4 \ "shapely<2" \ - vowpalwabbit \ pydub \ pydegensac \ torchmetrics \ @@ -552,14 +497,12 @@ RUN pip install pytorch-ignite \ # pycrypto is used by competitions team. 
pycryptodome \ easyocr \ - # ipympl adds interactive widget support for matplotlib - ipympl==0.7.0 \ onnx \ tables \ openpyxl \ timm \ torchinfo && \ - pip install git+https://github.com/facebookresearch/segment-anything.git && \ + pip install git+https://github.com/facebookresearch/segment-anything.git && \ # b/343971718: remove duplicate aiohttp installs, and reinstall it rm -rf /opt/conda/lib/python3.10/site-packages/aiohttp* && \ mamba install --force-reinstall -y aiohttp && \ @@ -586,12 +529,12 @@ RUN apt-get install tesseract-ocr -y && \ pdf2image \ PyPDF && \ /tmp/clean-layer.sh -ENV TESSERACT_PATH=/usr/bin/tesseract -# For Facets -ENV PYTHONPATH=$PYTHONPATH:/opt/facets/facets_overview/python/ -# For Theano with MKL -ENV MKL_THREADING_LAYER=GNU +ENV TESSERACT_PATH=/usr/bin/tesseract \ + # For Facets + PYTHONPATH=$PYTHONPATH:/opt/facets/facets_overview/python/ \ + # For Theano with MKL + MKL_THREADING_LAYER=GNU # Temporary fixes and patches # Temporary patch for Dask getting downgraded, which breaks Keras @@ -605,7 +548,7 @@ RUN pip install --upgrade dask && \ mkdir -p /etc/ipython/ && echo "c = get_config(); c.IPKernelApp.matplotlib = 'inline'" > /etc/ipython/ipython_config.py && \ # Temporary patch for broken libpixman 0.38 in conda-forge, symlink to system libpixman 0.34 untile conda package gets updated to 0.38.5 or higher. ln -sf /usr/lib/x86_64-linux-gnu/libpixman-1.so.0.34.0 /opt/conda/lib/libpixman-1.so.0.38.0 && \ - # pin jupyter-server to version 2.12.5; later versions break LSP (b/333854354) + # b/333854354: pin jupyter-server to version 2.12.5; later versions break LSP (b/333854354) pip install --force-reinstall --no-deps jupyter_server==2.12.5 && \ /tmp/clean-layer.sh @@ -620,13 +563,15 @@ RUN mkdir -p ~/src && git clone https://github.com/SohierDane/BigQuery_Helper ~/ # Add BigQuery client proxy settings ENV PYTHONUSERBASE "/root/.local" -ADD patches/kaggle_gcp.py /root/.local/lib/python3.10/site-packages/kaggle_gcp.py -ADD patches/kaggle_secrets.py /root/.local/lib/python3.10/site-packages/kaggle_secrets.py -ADD patches/kaggle_session.py /root/.local/lib/python3.10/site-packages/kaggle_session.py -ADD patches/kaggle_web_client.py /root/.local/lib/python3.10/site-packages/kaggle_web_client.py -ADD patches/kaggle_datasets.py /root/.local/lib/python3.10/site-packages/kaggle_datasets.py -ADD patches/log.py /root/.local/lib/python3.10/site-packages/log.py -ADD patches/sitecustomize.py /root/.local/lib/python3.10/site-packages/sitecustomize.py +ADD patches/kaggle_gcp.py \ + patches/kaggle_secrets.py \ + patches/kaggle_session.py \ + patches/kaggle_web_client.py \ + patches/kaggle_datasets.py \ + patches/log.py \ + patches/sitecustomize.py \ + /root/.local/lib/python3.10/site-packages/ + # Override default imagemagick policies ADD patches/imagemagick-policy.xml /etc/ImageMagick-6/policy.xml @@ -635,20 +580,6 @@ ADD patches/kaggle_module_resolver.py /opt/conda/lib/python3.10/site-packages/te RUN sed -i '/from tensorflow_hub import uncompressed_module_resolver/a from tensorflow_hub import kaggle_module_resolver' /opt/conda/lib/python3.10/site-packages/tensorflow_hub/config.py && \ sed -i '/_install_default_resolvers()/a \ \ registry.resolver.add_implementation(kaggle_module_resolver.KaggleFileResolver())' /opt/conda/lib/python3.10/site-packages/tensorflow_hub/config.py -# TensorBoard Jupyter extension. Should be replaced with TensorBoard's provided magic once we have -# worker tunneling support in place. 
-# b/139212522 re-enable TensorBoard once solution for slowdown is implemented. -# ENV JUPYTER_CONFIG_DIR "/root/.jupyter/" -# RUN pip install jupyter_tensorboard && \ -# jupyter serverextension enable jupyter_tensorboard && \ -# jupyter tensorboard enable -# ADD patches/tensorboard/notebook.py /opt/conda/lib/python3.10/site-packages/tensorboard/notebook.py - -# Disable unnecessary jupyter extensions -#RUN jupyter-nbextension disable nb_conda --py --sys-prefix && \ -# jupyter-serverextension disable nb_conda --py --sys-prefix && \ -# python -m nb_conda_kernels.install --disable - # Disable preloaded jupyter modules (they add to startup, and break when they are missing) RUN sed -i /bq_stats/d /etc/ipython/ipython_kernel_config.py && \ sed -i /beatrix/d /etc/ipython/ipython_kernel_config.py && \ @@ -662,37 +593,37 @@ RUN rm /opt/conda/bin/../lib/libcusolver.so.11 && ln -s /usr/local/cuda/lib64/li RUN ln -s /usr/local/cuda/lib64/libcusolver.so.11 /opt/conda/bin/../lib/libcusolver.so.11 {{ end }} -# b/270147159 conda ships with a version of libtinfo which is missing version info causing warnings, replace it with a good version. +# b/270147159: conda ships with a version of libtinfo which is missing version info causing warnings, replace it with a good version. RUN rm /opt/conda/lib/libtinfo.so.6 && ln -s /usr/lib/x86_64-linux-gnu/libtinfo.so.6 /opt/conda/lib/libtinfo.so.6 -# b/276358430 fix Jupyter lsp freezing up the jupyter server +# b/276358430: fix Jupyter lsp freezing up the jupyter server RUN pip install "jupyter-lsp==1.5.1" # Set backend for matplotlib -ENV MPLBACKEND "agg" +ENV MPLBACKEND="agg" \ + # Set LC_ALL + # https://github.com/explosion/spaCy/issues/12872#issuecomment-1661847588 + LC_ALL="POSIX" -# Set LC_ALL -# https://github.com/explosion/spaCy/issues/12872#issuecomment-1661847588 -ENV LC_ALL "POSIX" +ARG GIT_COMMIT=unknown \ + BUILD_DATE=unknown -ARG GIT_COMMIT=unknown -ARG BUILD_DATE=unknown +LABEL git-commit=$GIT_COMMIT \ + build-date=$BUILD_DATE -LABEL git-commit=$GIT_COMMIT -LABEL build-date=$BUILD_DATE -ENV GIT_COMMIT=${GIT_COMMIT} -ENV BUILD_DATE=${BUILD_DATE} +ENV GIT_COMMIT=${GIT_COMMIT} \ + BUILD_DATE=${BUILD_DATE} -LABEL tensorflow-version=$TENSORFLOW_VERSION -# Used in the Jenkins `Docker GPU Build` step to restrict the images being pruned. -LABEL kaggle-lang=python +LABEL tensorflow-version=$TENSORFLOW_VERSION \ + # Used in the Jenkins `Docker GPU Build` step to restrict the images being pruned. + kaggle-lang=python # Correlate current release with the git hash inside the kernel editor by running `!cat /etc/git_commit`. RUN echo "$GIT_COMMIT" > /etc/git_commit && echo "$BUILD_DATE" > /etc/build_date {{ if eq .Accelerator "gpu" }} # Remove the CUDA stubs. -ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH_NO_STUBS" -# Add the CUDA home. -ENV CUDA_HOME=/usr/local/cuda +ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH_NO_STUBS" \ + # Add the CUDA home. 
+ CUDA_HOME=/usr/local/cuda {{ end }} diff --git a/tests/test_catalyst.py b/tests/test_catalyst.py deleted file mode 100644 index 3b9c97d4..00000000 --- a/tests/test_catalyst.py +++ /dev/null @@ -1,158 +0,0 @@ -import unittest -import collections -import json -import numpy as np - -import torch -import torch.nn as nn -import torch.nn.functional as F -import torchvision -import torchvision.transforms as transforms - -import catalyst -from catalyst.dl import SupervisedRunner, CheckpointCallback -from catalyst import utils - - -def _to_categorical(y, num_classes=None, dtype='float32'): - """ - Taken from - github.com/keras-team/keras/blob/master/keras/utils/np_utils.py - Converts a class vector (integers) to binary class matrix. - E.g. for use with categorical_crossentropy. - # Arguments - y: class vector to be converted into a matrix - (integers from 0 to num_classes). - num_classes: total number of classes. - dtype: The data type expected by the input, as a string - (`float32`, `float64`, `int32`...) - # Returns - A binary matrix representation of the input. The classes axis - is placed last. - # Example - ```python - # Consider an array of 5 labels out of a set of 3 classes {0, 1, 2}: - > labels - array([0, 2, 1, 2, 0]) - # `to_categorical` converts this into a matrix with as many - # columns as there are classes. The number of rows - # stays the same. - > to_categorical(labels) - array([[ 1., 0., 0.], - [ 0., 0., 1.], - [ 0., 1., 0.], - [ 0., 0., 1.], - [ 1., 0., 0.]], dtype=float32) - ``` - """ - - y = np.array(y, dtype='int') - input_shape = y.shape - if input_shape and input_shape[-1] == 1 and len(input_shape) > 1: - input_shape = tuple(input_shape[:-1]) - y = y.ravel() - if not num_classes: - num_classes = np.max(y) + 1 - n = y.shape[0] - categorical = np.zeros((n, num_classes), dtype=dtype) - categorical[np.arange(n), y] = 1 - output_shape = input_shape + (num_classes,) - categorical = np.reshape(categorical, output_shape) - return categorical - - -class Net(nn.Module): - def __init__(self): - super().__init__() - self.conv1 = nn.Conv2d(1, 20, 5, 1) - self.conv2 = nn.Conv2d(20, 50, 5, 1) - self.fc1 = nn.Linear(4 * 4 * 50, 500) - self.fc2 = nn.Linear(500, 10) - - def forward(self, x): - x = F.relu(self.conv1(x)) - x = F.max_pool2d(x, 2, 2) - x = F.relu(self.conv2(x)) - x = F.max_pool2d(x, 2, 2) - x = x.view(-1, 4 * 4 * 50) - x = F.relu(self.fc1(x)) - x = self.fc2(x) - return x - - -class TestCatalyst(unittest.TestCase): - - def test_version(self): - self.assertIsNotNone(catalyst.__version__) - - def test_mnist(self): - utils.set_global_seed(42) - x_train = np.random.random((100, 1, 28, 28)).astype(np.float32) - y_train = _to_categorical( - np.random.randint(10, size=(100, 1)), - num_classes=10 - ).astype(np.float32) - x_valid = np.random.random((20, 1, 28, 28)).astype(np.float32) - y_valid = _to_categorical( - np.random.randint(10, size=(20, 1)), - num_classes=10 - ).astype(np.float32) - - x_train, y_train, x_valid, y_valid = \ - list(map(torch.tensor, [x_train, y_train, x_valid, y_valid])) - - bs = 32 - num_workers = 4 - data_transform = transforms.ToTensor() - - loaders = collections.OrderedDict() - - trainset = torch.utils.data.TensorDataset(x_train, y_train) - trainloader = torch.utils.data.DataLoader( - trainset, batch_size=bs, - shuffle=True, num_workers=num_workers) - - validset = torch.utils.data.TensorDataset(x_valid, y_valid) - validloader = torch.utils.data.DataLoader( - validset, batch_size=bs, - shuffle=False, num_workers=num_workers) - - loaders["train"] = trainloader - 
loaders["valid"] = validloader - - # experiment setup - num_epochs = 3 - logdir = "./logs" - - # model, criterion, optimizer - model = Net() - criterion = nn.BCEWithLogitsLoss() - optimizer = torch.optim.Adam(model.parameters()) - - # model runner - runner = SupervisedRunner() - - # model training - runner.train( - model=model, - criterion=criterion, - optimizer=optimizer, - loaders=loaders, - logdir=logdir, - num_epochs=num_epochs, - verbose=False, - callbacks=[CheckpointCallback( - logdir, - topk=3, - save_best=True, - loader_key="valid", - metric_key="loss", - minimize=True)] - ) - - with open('./logs/model.storage.json') as f: - metrics = json.load(f) - storage = metrics['storage'] - self.assertEqual(3, len(storage)) - self.assertTrue(storage[0]['metric'] < storage[2]['metric']) - self.assertTrue(storage[0]['metric']< 0.35) diff --git a/tests/test_essentia.py b/tests/test_essentia.py deleted file mode 100644 index 749b9466..00000000 --- a/tests/test_essentia.py +++ /dev/null @@ -1,7 +0,0 @@ -import unittest - -from essentia.standard import Windowing - -class TestEssentia(unittest.TestCase): - def test_windowing(self): - Windowing(type = 'hann') diff --git a/tests/test_geoviews.py b/tests/test_geoviews.py deleted file mode 100644 index 2636cc6f..00000000 --- a/tests/test_geoviews.py +++ /dev/null @@ -1,17 +0,0 @@ -import unittest - -from common import p100_exempt - -class TestGeoviews(unittest.TestCase): - - @p100_exempt # b/342143152: Uses cuDF(>=24.4v), which is no longer capitble with p100 GPUs. - - def test_viz(self): - import geoviews.feature as gf - import holoviews as hv - from cartopy import crs - - hv.extension('matplotlib') - (gf.ocean + gf.land + gf.ocean * gf.land * gf.coastline * gf.borders).options( - 'Feature', projection=crs.Geostationary(), global_extent=True - ).cols(3) diff --git a/tests/test_ggplot.py b/tests/test_ggplot.py deleted file mode 100644 index 30aec29f..00000000 --- a/tests/test_ggplot.py +++ /dev/null @@ -1,12 +0,0 @@ -import unittest -import os.path - -from ggplot import * - -class TestGgplot(unittest.TestCase): - - def test_plot(self): - p = ggplot(aes(x='mpg'), data=mtcars) + geom_histogram() - p.save("myplot.png") - - self.assertTrue(os.path.isfile("myplot.png")) diff --git a/tests/test_imports.py b/tests/test_imports.py index 4977ff9c..b22ebe7a 100644 --- a/tests/test_imports.py +++ b/tests/test_imports.py @@ -4,6 +4,5 @@ class TestImport(unittest.TestCase): # Basic import tests for packages without any. 
def test_basic(self): import bq_helper - import cleverhans import tensorflow_datasets import segment_anything diff --git a/tests/test_kmapper.py b/tests/test_kmapper.py deleted file mode 100644 index c75deea3..00000000 --- a/tests/test_kmapper.py +++ /dev/null @@ -1,7 +0,0 @@ -import unittest - -import kmapper as km - -class TestKMapper(unittest.TestCase): - def test_init(self): - km.KeplerMapper() diff --git a/tests/test_matplotlib.py b/tests/test_matplotlib.py index 1cbc939a..c04f3f23 100644 --- a/tests/test_matplotlib.py +++ b/tests/test_matplotlib.py @@ -1,10 +1,17 @@ import unittest import os.path +from distutils.version import StrictVersion + +import matplotlib import matplotlib.pyplot as plt import numpy as np class TestMatplotlib(unittest.TestCase): + def test_version(self): + # b/308525631: newer versions of Matplotlib causes learntools to fail + self.assertLess(StrictVersion(matplotlib.__version__), StrictVersion("3.8.0")) + def test_plot(self): plt.plot(np.linspace(0,1,50), np.random.rand(50)) plt.savefig("plot1.png") diff --git a/tests/test_pykalman.py b/tests/test_pykalman.py deleted file mode 100644 index 26d86003..00000000 --- a/tests/test_pykalman.py +++ /dev/null @@ -1,47 +0,0 @@ -import unittest -import numpy as np -from pykalman import KalmanFilter -from pykalman import UnscentedKalmanFilter -from pykalman.sqrt import CholeskyKalmanFilter, AdditiveUnscentedKalmanFilter - -class TestPyKalman(unittest.TestCase): - def test_kalman_filter(self): - kf = KalmanFilter(transition_matrices = [[1, 1], [0, 1]], observation_matrices = [[0.1, 0.5], [-0.3, 0.0]]) - measurements = np.asarray([[1,0], [0,0], [0,1]]) # 3 observations - kf = kf.em(measurements, n_iter=5) - (filtered_state_means, filtered_state_covariances) = kf.filter(measurements) - (smoothed_state_means, smoothed_state_covariances) = kf.smooth(measurements) - return filtered_state_means - - def test_kalman_missing(self): - kf = KalmanFilter(transition_matrices = [[1, 1], [0, 1]], observation_matrices = [[0.1, 0.5], [-0.3, 0.0]]) - measurements = np.asarray([[1,0], [0,0], [0,1]]) # 3 observations - measurements = np.ma.asarray(measurements) - measurements[1] = np.ma.masked - kf = kf.em(measurements, n_iter=5) - (filtered_state_means, filtered_state_covariances) = kf.filter(measurements) - (smoothed_state_means, smoothed_state_covariances) = kf.smooth(measurements) - return filtered_state_means - - def test_unscented_kalman(self): - ukf = UnscentedKalmanFilter(lambda x, w: x + np.sin(w), lambda x, v: x + v, transition_covariance=0.1) - (filtered_state_means, filtered_state_covariances) = ukf.filter([0, 1, 2]) - (smoothed_state_means, smoothed_state_covariances) = ukf.smooth([0, 1, 2]) - return filtered_state_means - - def test_online_update(self): - kf = KalmanFilter(transition_matrices = [[1, 1], [0, 1]], observation_matrices = [[0.1, 0.5], [-0.3, 0.0]]) - measurements = np.asarray([[1,0], [0,0], [0,1]]) # 3 observations - measurements = np.ma.asarray(measurements) - measurements[1] = np.ma.masked # measurement at timestep 1 is unobserved - kf = kf.em(measurements, n_iter=5) - (filtered_state_means, filtered_state_covariances) = kf.filter(measurements) - for t in range(1, 3): - filtered_state_means[t], filtered_state_covariances[t] = \ - kf.filter_update(filtered_state_means[t-1], filtered_state_covariances[t-1], measurements[t]) - return filtered_state_means - - def test_robust_sqrt(self): - kf = CholeskyKalmanFilter(transition_matrices = [[1, 1], [0, 1]], observation_matrices = [[0.1, 0.5], [-0.3, 0.0]]) - ukf = 
AdditiveUnscentedKalmanFilter(lambda x, w: x + np.sin(w), lambda x, v: x + v, observation_covariance=0.1) - diff --git a/tests/test_vaex.py b/tests/test_vaex.py deleted file mode 100644 index b64061b0..00000000 --- a/tests/test_vaex.py +++ /dev/null @@ -1,10 +0,0 @@ -import unittest - -import vaex - -class TestVaex(unittest.TestCase): - def test_read_csv(self): - df = vaex.read_csv("/input/tests/data/train.csv") - - self.assertEqual((100, 785), df.shape) - self.assertEqual(10, df['label'].nunique()) \ No newline at end of file diff --git a/tests/test_vowpalwabbit.py b/tests/test_vowpalwabbit.py deleted file mode 100644 index 839aed05..00000000 --- a/tests/test_vowpalwabbit.py +++ /dev/null @@ -1,10 +0,0 @@ -import unittest - -from vowpalwabbit import pyvw - -class TestVowpalwabbit(unittest.TestCase): - def test_basic(self): - vw = pyvw.vw(quiet=True) - ex = vw.example('1 | a b c') - vw.learn(ex) - self.assertGreater(vw.predict(ex), 0)
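
A note on item 1 of the commit message: throughout Dockerfile.tmpl, runs of
single-purpose ARG/ENV instructions are merged into one instruction with
backslash continuations, cutting the number of build steps recorded in the
image. A minimal sketch of the pattern, using placeholder variables (FOO and
BAR are illustrative, not taken from the patch):

    # Before: two ENV instructions, two image-history entries.
    ENV FOO=1
    ENV BAR=2

    # After: one ENV instruction sets both variables. Full-line comments are
    # stripped by the Dockerfile parser before continuations are joined, so
    # they may sit inside the continuation, as the patch does for KMP_SETTINGS.
    ENV FOO=1 \
        # BAR is set within the same instruction
        BAR=2

The same idea applies to ADD: several sources can share one instruction when
the destination is a directory, as in the patches/*.py and keras_internal.py
hunks above.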