diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
deleted file mode 100644
index 02ad5a0e3b..0000000000
--- a/.devcontainer/Dockerfile
+++ /dev/null
@@ -1,28 +0,0 @@
-ARG PYTHON_VERSION
-FROM mcr.microsoft.com/vscode/devcontainers/python:${PYTHON_VERSION}
-
-ARG REMOTE_USER
-ENV HOME="/home/${REMOTE_USER}" \
-    JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64" \
-    PYSPARK_PYTHON="/usr/local/bin/python" \
-    PYSPARK_DRIVER_PYTHON="/usr/local/bin/python"
-
-RUN apt-get update && \
-    apt-get -y install --no-install-recommends software-properties-common && \
-    apt-add-repository 'deb http://security.debian.org/debian-security stretch/updates main' && \
-    apt-get update && \
-    apt-get -y install --no-install-recommends \
-    openjdk-8-jre \
-    cmake
-
-# Switch to non-root user
-USER ${REMOTE_USER}
-WORKDIR ${HOME}
-
-# Setup Jupyter Notebook
-ENV NOTEBOOK_CONFIG="${HOME}/.jupyter/jupyter_notebook_config.py"
-RUN mkdir -p $(dirname ${NOTEBOOK_CONFIG}) && \
-    echo "c.NotebookApp.ip='0.0.0.0'" >> ${NOTEBOOK_CONFIG} && \
-    echo "c.NotebookApp.open_browser=False" >> ${NOTEBOOK_CONFIG} && \
-    echo "c.NotebookApp.allow_origin='*'" >> ${NOTEBOOK_CONFIG}
-EXPOSE 8888
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
index 4b74a526c5..12d6ed8228 100644
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -1,44 +1,50 @@
 {
-    "name": "Recommenders",
-    "build": {
-        "dockerfile": "Dockerfile",
-        "context": "..",
-        "args": {
-            // Python version: 3, 3.6, 3.7
-            "PYTHON_VERSION": "3.7",
-            "REMOTE_USER": "vscode"
-        }
-    },
+    "name": "Recommenders",
+    // Version list: https://github.com/devcontainers/images/tree/main/src/base-ubuntu
+    // Includes: curl, wget, ca-certificates, git, Oh My Zsh!
+    "image": "mcr.microsoft.com/devcontainers/base:ubuntu-24.04",
+    "hostRequirements": {
+        "cpus": 4,
+        "memory": "16gb",
+        "storage": "32gb"
+    },
+    "features": {
+        // https://github.com/devcontainers/features/blob/main/src/anaconda/devcontainer-feature.json
+        "ghcr.io/devcontainers/features/anaconda:1": {
+            "version": "2024.06-1"
+        }
+    },
+    "customizations": {
+        "vscode": {
+            // Set *default* container-specific settings.json values on container create.
+            "settings": {
+                "[python]": {
+                    "editor.defaultFormatter": "ms-python.black-formatter",
+                    "editor.formatOnSave": true,
+                    "editor.codeActionsOnSave": {
+                        "source.organizeImports": "explicit"
+                    }
+                },
+                "isort.args": ["--profile", "black"],
+                "python.analysis.autoImportCompletions": true,
+                "python.defaultInterpreterPath": "/usr/local/conda/envs/Recommenders/bin/python",
+                "python.testing.pytestEnabled": true,
+                // Set the directory where all tests are
+                "python.testing.pytestArgs": ["tests"]
+            },
+            // Add the IDs of extensions you want installed when the container is created.
+            "extensions": [
+                "ms-python.black-formatter", // https://marketplace.visualstudio.com/items?itemName=ms-python.black-formatter
+                "ms-python.isort", // https://marketplace.visualstudio.com/items?itemName=ms-python.isort
+                "ms-python.mypy-type-checker", // https://marketplace.visualstudio.com/items?itemName=ms-python.mypy-type-checker
+                "ms-python.pylint", // https://marketplace.visualstudio.com/items?itemName=ms-python.pylint
+                "ms-python.python", // https://marketplace.visualstudio.com/items?itemName=ms-python.python
+                "ms-toolsai.datawrangler", // https://marketplace.visualstudio.com/items?itemName=ms-toolsai.datawrangler
+                "ms-toolsai.jupyter" // https://marketplace.visualstudio.com/items?itemName=ms-toolsai.jupyter
+            ]
+        }
+    },
 
-    // Set *default* container specific settings.json values on container create.
-    "settings": {
-        "python.pythonPath": "/usr/local/bin/python",
-        "python.languageServer": "Pylance",
-        "python.linting.enabled": true,
-        "python.linting.pylintEnabled": true,
-        "python.formatting.autopep8Path": "/usr/local/py-utils/bin/autopep8",
-        "python.formatting.blackPath": "/usr/local/py-utils/bin/black",
-        "python.formatting.yapfPath": "/usr/local/py-utils/bin/yapf",
-        "python.linting.banditPath": "/usr/local/py-utils/bin/bandit",
-        "python.linting.flake8Path": "/usr/local/py-utils/bin/flake8",
-        "python.linting.mypyPath": "/usr/local/py-utils/bin/mypy",
-        "python.linting.pycodestylePath": "/usr/local/py-utils/bin/pycodestyle",
-        "python.linting.pydocstylePath": "/usr/local/py-utils/bin/pydocstyle",
-        "python.linting.pylintPath": "/usr/local/py-utils/bin/pylint"
-    },
-
-    // Add the IDs of extensions you want installed when the container is created.
-    "extensions": [
-        "ms-python.python",
-        "ms-python.vscode-pylance"
-    ],
-
-    // Use 'forwardPorts' to make a list of ports inside the container available locally.
-    "forwardPorts": [8888],
-
-    // Use 'postCreateCommand' to run commands after the container is created.
-    "postCreateCommand": "pip install -U pip && pip install --user -e .[dev,examples,spark,xlearn]",
-
-    // Comment out connect as root instead. More info: https://aka.ms/vscode-remote/containers/non-root.
-    "remoteUser": "vscode"
+    // Use 'postCreateCommand' to run commands after the container is created.
+    "postCreateCommand": "conda create -n Recommenders -c conda-forge -y python=3.10 openjdk=21 pip && conda init bash && bash -c -i 'conda activate Recommenders && pip install -e .[dev,spark]' && conda config --set auto_activate_base false"
 }
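Editor's note: the rewritten devcontainer provisions a conda environment named `Recommenders` via `postCreateCommand` and points VS Code at its interpreter. A small sanity-check sketch, my addition rather than part of the patch, to run inside the container, assuming the paths configured above:

```python
# Sanity check for the devcontainer; run with the "Recommenders" interpreter.
import subprocess
import sys

# The interpreter should live under the env created by postCreateCommand,
# matching python.defaultInterpreterPath in devcontainer.json.
assert "envs/Recommenders" in sys.executable, sys.executable

# openjdk=21 comes from the same conda env; `java -version` prints to stderr.
java = subprocess.run(["java", "-version"], capture_output=True, text=True)
print(java.stderr.splitlines()[0])  # e.g. 'openjdk version "21..."'
```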
+ "extensions": [ + "ms-python.black-formatter", // https://marketplace.visualstudio.com/items?itemName=ms-python.black-formatter + "ms-python.isort", // https://marketplace.visualstudio.com/items?itemName=ms-python.isort + "ms-python.mypy-type-checker", // https://marketplace.visualstudio.com/items?itemName=ms-python.mypy-type-checker + "ms-python.pylint", // https://marketplace.visualstudio.com/items?itemName=ms-python.pylint + "ms-python.python", // https://marketplace.visualstudio.com/items?itemName=ms-python.python + "ms-toolsai.datawrangler", // https://marketplace.visualstudio.com/items?itemName=ms-toolsai.datawrangler + "ms-toolsai.jupyter" // https://marketplace.visualstudio.com/items?itemName=ms-toolsai.jupyter + ] + } + }, - // Set *default* container specific settings.json values on container create. - "settings": { - "python.pythonPath": "/usr/local/bin/python", - "python.languageServer": "Pylance", - "python.linting.enabled": true, - "python.linting.pylintEnabled": true, - "python.formatting.autopep8Path": "/usr/local/py-utils/bin/autopep8", - "python.formatting.blackPath": "/usr/local/py-utils/bin/black", - "python.formatting.yapfPath": "/usr/local/py-utils/bin/yapf", - "python.linting.banditPath": "/usr/local/py-utils/bin/bandit", - "python.linting.flake8Path": "/usr/local/py-utils/bin/flake8", - "python.linting.mypyPath": "/usr/local/py-utils/bin/mypy", - "python.linting.pycodestylePath": "/usr/local/py-utils/bin/pycodestyle", - "python.linting.pydocstylePath": "/usr/local/py-utils/bin/pydocstyle", - "python.linting.pylintPath": "/usr/local/py-utils/bin/pylint" - }, - - // Add the IDs of extensions you want installed when the container is created. - "extensions": [ - "ms-python.python", - "ms-python.vscode-pylance" - ], - - // Use 'forwardPorts' to make a list of ports inside the container available locally. - "forwardPorts": [8888], - - // Use 'postCreateCommand' to run commands after the container is created. - "postCreateCommand": "pip install -U pip && pip install --user -e .[dev,examples,spark,xlearn]", - - // Comment out connect as root instead. More info: https://aka.ms/vscode-remote/containers/non-root. - "remoteUser": "vscode" + // Use 'postCreateCommand' to run commands after the container is created. 
+ "postCreateCommand": "conda create -n Recommenders -c conda-forge -y python=3.10 openjdk=21 pip && conda init bash && bash -c -i 'conda activate Recommenders && pip install -e .[dev,spark]' && conda config --set auto_activate_base false" } diff --git a/.github/actions/azureml-test/action.yml b/.github/actions/azureml-test/action.yml index 91a437719c..7c44abb37a 100644 --- a/.github/actions/azureml-test/action.yml +++ b/.github/actions/azureml-test/action.yml @@ -15,9 +15,15 @@ inputs: TEST_KIND: required: true description: Type of test - unit or nightly - AZUREML_TEST_CREDENTIALS: + AZUREML_TEST_UMI_CLIENT_ID: required: true - description: Credentials for AzureML login + description: AzureML User-managed identity client ID + AZUREML_TEST_UMI_TENANT_ID: + required: true + description: AzureML User-managed identity tenant ID + AZUREML_TEST_UMI_SUB_ID: + required: true + description: AzureML User-managed identity subscription ID AZUREML_TEST_SUBID: required: true description: AzureML subscription ID @@ -53,7 +59,9 @@ runs: - name: Log in to Azure uses: azure/login@v2 with: - creds: ${{ inputs.AZUREML_TEST_CREDENTIALS }} + client-id: ${{ inputs.AZUREML_TEST_UMI_CLIENT_ID }} + tenant-id: ${{ inputs.AZUREML_TEST_UMI_TENANT_ID }} + subscription-id: ${{ inputs.AZUREML_TEST_UMI_SUB_ID }} - name: Submit tests to AzureML shell: bash run: | diff --git a/.github/actions/get-test-groups/action.yml b/.github/actions/get-test-groups/action.yml index dc50e4b93c..6e87da900f 100644 --- a/.github/actions/get-test-groups/action.yml +++ b/.github/actions/get-test-groups/action.yml @@ -8,7 +8,7 @@ description: "Get test group names from tests_groups.py" inputs: TEST_KIND: required: true - description: Type of test - unit or nightly + description: Type of test - pr gate or nightly TEST_ENV: required: false description: Test environment - cpu, gpu or spark diff --git a/.github/workflows/azureml-cpu-nightly.yml b/.github/workflows/azureml-cpu-nightly.yml index 549926c0d2..b52b7f8d4d 100644 --- a/.github/workflows/azureml-cpu-nightly.yml +++ b/.github/workflows/azureml-cpu-nightly.yml @@ -64,6 +64,8 @@ jobs: needs: get-test-groups name: ${{ join(matrix.*, ', ') }} runs-on: ubuntu-latest + permissions: + id-token: write # This is required for requesting the JWT strategy: max-parallel: 50 # Usage limits: https://docs.github.com/en/actions/learn-github-actions/usage-limits-billing-and-administration matrix: @@ -79,7 +81,9 @@ jobs: EXP_NAME: recommenders-nightly-${{ matrix.test-group }}-python${{ matrix.python-version }}-${{ github.ref }} ENV_NAME: recommenders-${{ github.sha }}-python${{ matrix.python-version }}${{ contains(matrix.test-group, 'gpu') && '-gpu' || '' }}${{ contains(matrix.test-group, 'spark') && '-spark' || '' }} TEST_KIND: 'nightly' - AZUREML_TEST_CREDENTIALS: ${{ secrets.AZUREML_TEST_CREDENTIALS }} + AZUREML_TEST_UMI_CLIENT_ID: ${{ secrets.AZUREML_TEST_UMI_CLIENT_ID }} + AZUREML_TEST_UMI_TENANT_ID: ${{ secrets.AZUREML_TEST_UMI_TENANT_ID }} + AZUREML_TEST_UMI_SUB_ID: ${{ secrets.AZUREML_TEST_UMI_SUB_ID }} AZUREML_TEST_SUBID: ${{ secrets.AZUREML_TEST_SUBID }} PYTHON_VERSION: ${{ matrix.python-version }} TEST_GROUP: ${{ matrix.test-group }} diff --git a/.github/workflows/azureml-gpu-nightly.yml b/.github/workflows/azureml-gpu-nightly.yml index da5417a403..087c18c512 100644 --- a/.github/workflows/azureml-gpu-nightly.yml +++ b/.github/workflows/azureml-gpu-nightly.yml @@ -64,6 +64,8 @@ jobs: needs: get-test-groups name: ${{ join(matrix.*, ', ') }} runs-on: ubuntu-latest + permissions: + id-token: write # 
diff --git a/.github/workflows/azureml-spark-nightly.yml b/.github/workflows/azureml-spark-nightly.yml
index 6c902fcae0..11a0184b21 100644
--- a/.github/workflows/azureml-spark-nightly.yml
+++ b/.github/workflows/azureml-spark-nightly.yml
@@ -63,6 +63,8 @@ jobs:
     needs: get-test-groups
     name: ${{ join(matrix.*, ', ') }}
     runs-on: ubuntu-latest
+    permissions:
+      id-token: write # This is required for requesting the JWT
     strategy:
       max-parallel: 50 # Usage limits: https://docs.github.com/en/actions/learn-github-actions/usage-limits-billing-and-administration
       matrix:
@@ -78,7 +80,9 @@ jobs:
         EXP_NAME: recommenders-nightly-${{ matrix.test-group }}-python${{ matrix.python-version }}-${{ github.ref }}
         ENV_NAME: recommenders-${{ github.sha }}-python${{ matrix.python-version }}${{ contains(matrix.test-group, 'gpu') && '-gpu' || '' }}${{ contains(matrix.test-group, 'spark') && '-spark' || '' }}
         TEST_KIND: 'nightly'
-        AZUREML_TEST_CREDENTIALS: ${{ secrets.AZUREML_TEST_CREDENTIALS }}
+        AZUREML_TEST_UMI_CLIENT_ID: ${{ secrets.AZUREML_TEST_UMI_CLIENT_ID }}
+        AZUREML_TEST_UMI_TENANT_ID: ${{ secrets.AZUREML_TEST_UMI_TENANT_ID }}
+        AZUREML_TEST_UMI_SUB_ID: ${{ secrets.AZUREML_TEST_UMI_SUB_ID }}
         AZUREML_TEST_SUBID: ${{ secrets.AZUREML_TEST_SUBID }}
         PYTHON_VERSION: ${{ matrix.python-version }}
         TEST_GROUP: ${{ matrix.test-group }}
diff --git a/.github/workflows/azureml-unit-tests.yml b/.github/workflows/azureml-unit-tests.yml
index 481d3d80b1..8106b6fbff 100644
--- a/.github/workflows/azureml-unit-tests.yml
+++ b/.github/workflows/azureml-unit-tests.yml
@@ -53,6 +53,8 @@ jobs:
     needs: get-test-groups
     name: ${{ join(matrix.*, ', ') }}
     runs-on: ubuntu-latest
+    permissions:
+      id-token: write # This is required for requesting the JWT
     strategy:
       max-parallel: 50 # Usage limits: https://docs.github.com/en/actions/learn-github-actions/usage-limits-billing-and-administration
       matrix:
@@ -68,7 +70,9 @@ jobs:
         EXP_NAME: recommenders-unit-${{ matrix.test-group }}-python${{ matrix.python-version }}-${{ github.sha }}
         ENV_NAME: recommenders-${{ github.sha }}-python${{ matrix.python-version }}${{ contains(matrix.test-group, 'gpu') && '-gpu' || '' }}${{ contains(matrix.test-group, 'spark') && '-spark' || '' }}
         TEST_KIND: 'unit'
-        AZUREML_TEST_CREDENTIALS: ${{ secrets.AZUREML_TEST_CREDENTIALS }}
+        AZUREML_TEST_UMI_CLIENT_ID: ${{ secrets.AZUREML_TEST_UMI_CLIENT_ID }}
+        AZUREML_TEST_UMI_TENANT_ID: ${{ secrets.AZUREML_TEST_UMI_TENANT_ID }}
+        AZUREML_TEST_UMI_SUB_ID: ${{ secrets.AZUREML_TEST_UMI_SUB_ID }}
         AZUREML_TEST_SUBID: ${{ secrets.AZUREML_TEST_SUBID }}
         PYTHON_VERSION: ${{ matrix.python-version }}
         TEST_GROUP: ${{ matrix.test-group }}
diff --git a/AUTHORS.md b/AUTHORS.md
index 1816f73e27..b70bfa644b 100644
--- a/AUTHORS.md
+++ b/AUTHORS.md
@@ -52,6 +52,8 @@ To contributors: please add your name to the list when you submit a patch to the
 * **[Aaron He](https://github.com/AaronHeee)**
   * Reco utils of NCF
   * Deep dive notebook demonstrating the use of NCF
+* **[Aaron Palpallatoc](https://github.com/ubergonmx)**
+  * Corrected variable in pickle dump in `mind_utils.ipynb` notebook
 * **[Abir Chakraborty](https://github.com/aeroabir)**
   * Self-Attentive Sequential Recommendation (SASRec)
   * Sequential Recommendation Via Personalized Transformer (SSEPT)
diff --git a/README.md b/README.md
index 72ee780040..7f78b29f80 100644
--- a/README.md
+++ b/README.md
@@ -144,7 +144,7 @@ We provide a [benchmark notebook](examples/06_benchmarks/movielens.ipynb) to ill
 
 This project welcomes contributions and suggestions. Before contributing, please see our [contribution guidelines](CONTRIBUTING.md).
 
-This project adheres to [Microsoft's Open Source Code of Conduct](CODE_OF_CONDUCT.md) in order to foster a welcoming and inspiring community for all.
+This project adheres to this [Code of Conduct](CODE_OF_CONDUCT.md) in order to foster a welcoming and inspiring community for all.
 
 ## Build Status
 
diff --git a/SETUP.md b/SETUP.md
index 814118a490..323aefddf2 100644
--- a/SETUP.md
+++ b/SETUP.md
@@ -50,16 +50,19 @@ pip install recommenders[spark]
 # c. Run the notebook.
 ```
 
-## Setup for Azure Databricks
+## Setup for Databricks
 
-The following instructions were tested on Azure Databricks Runtime 12.2 LTS (Apache Spark version 3.3.2) and 11.3 LTS (Apache Spark version 3.3.0).
-As of April 2023, Databricks Runtime 13 is not yet supported as it is on Python 3.10.
+The following instructions were tested on Databricks Runtime 15.4 LTS (Apache Spark version 3.5.0), 14.3 LTS (Apache Spark version 3.5.0), 13.3 LTS (Apache Spark version 3.4.1), and 12.2 LTS (Apache Spark version 3.3.2). We have tested these runtimes on Python 3.9, 3.10, and 3.11.
 
-After an Azure Databricks cluster is provisioned:
+After a Databricks cluster is provisioned:
 ```bash
 # 1. Go to the "Compute" tab on the left of the page, click on the provisioned cluster and then click on "Libraries".
 # 2. Click the "Install new" button.
 # 3. In the popup window, select "PyPI" as the library source. Enter "recommenders[examples]" as the package name. Click "Install" to install the package.
+# 4. Now, repeat step 3 for the packages below:
+#    a. numpy<2.0.0
+#    b. pandera<=0.18.3
+#    c. scipy<=1.13.1
 ```
 
 ### Prepare Azure Databricks for Operationalization
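Editor's note: the same pins can be applied from a notebook cell instead of the Libraries UI. A minimal sketch, assuming a Databricks notebook where the `%pip` magic is available; the bounds are the ones listed in step 4 above:

```python
# In a Databricks notebook cell; Databricks restarts the Python process
# for the installed packages to take effect.
%pip install "recommenders[examples]" "numpy<2.0.0" "pandera<=0.18.3" "scipy<=1.13.1"
```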
diff --git a/examples/01_prepare_data/mind_utils.ipynb b/examples/01_prepare_data/mind_utils.ipynb
index e03a3683d9..7a2d81e6e6 100644
--- a/examples/01_prepare_data/mind_utils.ipynb
+++ b/examples/01_prepare_data/mind_utils.ipynb
@@ -306,7 +306,7 @@
     "    pickle.dump(word_dict, f)\n",
     "    \n",
     "with open(os.path.join(output_path, 'word_dict_all.pkl'), 'wb') as f:\n",
-    "    pickle.dump(word_dict, f)"
+    "    pickle.dump(word_dict_all, f)"
    ]
   },
   {
diff --git a/recommenders/datasets/mind.py b/recommenders/datasets/mind.py
index f396044c65..7295786c2e 100644
--- a/recommenders/datasets/mind.py
+++ b/recommenders/datasets/mind.py
@@ -17,18 +17,6 @@
 )
 
-URL_MIND_LARGE_TRAIN = (
-    "https://mind201910small.blob.core.windows.net/release/MINDlarge_train.zip"
-)
-URL_MIND_LARGE_VALID = (
-    "https://mind201910small.blob.core.windows.net/release/MINDlarge_dev.zip"
-)
-URL_MIND_SMALL_TRAIN = (
-    "https://mind201910small.blob.core.windows.net/release/MINDsmall_train.zip"
-)
-URL_MIND_SMALL_VALID = (
-    "https://mind201910small.blob.core.windows.net/release/MINDsmall_dev.zip"
-)
 URL_MIND_DEMO_TRAIN = (
     "https://recodatasets.z20.web.core.windows.net/newsrec/MINDdemo_train.zip"
 )
@@ -39,6 +27,29 @@
     "https://recodatasets.z20.web.core.windows.net/newsrec/MINDdemo_utils.zip"
 )
 
+URL_MIND_SMALL_TRAIN = (
+    "https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_train.zip"
+)
+URL_MIND_SMALL_VALID = (
+    "https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_dev.zip"
+)
+URL_MIND_SMALL_UTILS = (
+    "https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_utils.zip"
+)
+
+URL_MIND_LARGE_TRAIN = (
+    "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_train.zip"
+)
+URL_MIND_LARGE_VALID = (
+    "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_dev.zip"
+)
+URL_MIND_LARGE_TEST = (
+    "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_test.zip"
+)
+URL_MIND_LARGE_UTILS = (
+    "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_utils.zip"
+)
+
 URL_MIND = {
     "large": (URL_MIND_LARGE_TRAIN, URL_MIND_LARGE_VALID),
     "small": (URL_MIND_SMALL_TRAIN, URL_MIND_SMALL_VALID),
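Editor's note: with MIND now served from the `recodatasets.z20.web.core.windows.net` mirror, the download helpers keep their interface; only the underlying URLs change. A short usage sketch based on the calls exercised by the tests later in this patch:

```python
# Download and extract MIND-small from the new mirror; the API is unchanged.
from recommenders.datasets.mind import download_mind, extract_mind

train_zip, valid_zip = download_mind(size="small", dest_path="./mind_data")
train_path, valid_path = extract_mind(train_zip, valid_zip, clean_zip_file=False)
print(train_path, valid_path)
```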
diff --git a/recommenders/models/deeprec/DataModel/ImplicitCF.py b/recommenders/models/deeprec/DataModel/ImplicitCF.py
index 3cfbb2821f..42bb319c46 100644
--- a/recommenders/models/deeprec/DataModel/ImplicitCF.py
+++ b/recommenders/models/deeprec/DataModel/ImplicitCF.py
@@ -206,6 +206,8 @@ def train_loader(self, batch_size):
         """
 
         def sample_neg(x):
+            if len(x) >= self.n_items:
+                raise ValueError("A user has interacted with every item. Can't find a negative sample.")
             while True:
                 neg_id = random.randint(0, self.n_items - 1)
                 if neg_id not in x:
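Editor's note: the new guard makes the failure mode of `sample_neg` explicit: it rejection-samples random item ids until one falls outside the user's positive set, so a user who has interacted with every item would loop forever. A self-contained sketch of the same pattern, with standalone names rather than the class internals:

```python
import random

def sample_negative(positives: set, n_items: int) -> int:
    """Draw a random item id not in `positives` by rejection sampling."""
    if len(positives) >= n_items:
        # Without this guard the loop below would never terminate.
        raise ValueError("User has interacted with every item; no negative exists.")
    while True:
        neg_id = random.randint(0, n_items - 1)
        if neg_id not in positives:
            return neg_id

print(sample_negative({0, 2, 3}, n_items=5))  # prints 1 or 4
```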
diff --git a/recommenders/models/newsrec/newsrec_utils.py b/recommenders/models/newsrec/newsrec_utils.py
index 429f24b837..48e1ce8f30 100644
--- a/recommenders/models/newsrec/newsrec_utils.py
+++ b/recommenders/models/newsrec/newsrec_utils.py
@@ -310,7 +310,7 @@ def get_mind_data_set(type):
     if type == "large":
         return (
-            "https://mind201910small.blob.core.windows.net/release/",
+            "https://recodatasets.z20.web.core.windows.net/newsrec/",
             "MINDlarge_train.zip",
             "MINDlarge_dev.zip",
             "MINDlarge_utils.zip",
         )
@@ -318,7 +318,7 @@
 
     elif type == "small":
         return (
-            "https://mind201910small.blob.core.windows.net/release/",
+            "https://recodatasets.z20.web.core.windows.net/newsrec/",
             "MINDsmall_train.zip",
             "MINDsmall_dev.zip",
             "MINDsmall_utils.zip",
diff --git a/recommenders/models/sasrec/model.py b/recommenders/models/sasrec/model.py
index 4ac6fa93d4..778eecc9e8 100644
--- a/recommenders/models/sasrec/model.py
+++ b/recommenders/models/sasrec/model.py
@@ -240,7 +240,7 @@ def call(self, x, training, mask):
 
         Args:
             x (tf.Tensor): Input tensor.
-            training (tf.Tensor): Training tensor.
+            training (bool): True if in training mode.
             mask (tf.Tensor): Mask tensor.
 
         Returns:
@@ -305,7 +305,7 @@ def call(self, x, training, mask):
 
         Args:
             x (tf.Tensor): Input tensor.
-            training (tf.Tensor): Training tensor.
+            training (bool): True if in training mode.
             mask (tf.Tensor): Mask tensor.
 
         Returns:
@@ -313,7 +313,7 @@ def call(self, x, training, mask):
         """
 
         for i in range(self.num_layers):
-            x = self.enc_layers[i](x, training, mask)
+            x = self.enc_layers[i](x, training=training, mask=mask)
 
         return x  # (batch_size, input_seq_len, d_model)
 
@@ -689,7 +689,7 @@ def train_step(inp, tar):
         for epoch in range(1, num_epochs + 1):
 
             step_loss = []
-            train_loss.reset_states()
+            train_loss.reset_state()
             for step in tqdm(
                 range(num_steps), total=num_steps, ncols=70, leave=False, unit="b"
             ):
diff --git a/recommenders/models/sasrec/ssept.py b/recommenders/models/sasrec/ssept.py
index dbf7abdce8..15da43082b 100644
--- a/recommenders/models/sasrec/ssept.py
+++ b/recommenders/models/sasrec/ssept.py
@@ -122,7 +122,7 @@ def call(self, x, training):
 
         # --- ATTENTION BLOCKS ---
         seq_attention = seq_embeddings  # (b, s, h1 + h2)
-        seq_attention = self.encoder(seq_attention, training, mask)
+        seq_attention = self.encoder(seq_attention, training=training, mask=mask)
         seq_attention = self.layer_normalization(seq_attention)  # (b, s, h1+h2)
 
         # --- PREDICTION LAYER ---
@@ -197,7 +197,7 @@ def predict(self, inputs):
         seq_embeddings *= mask
         seq_attention = seq_embeddings
-        seq_attention = self.encoder(seq_attention, training, mask)
+        seq_attention = self.encoder(seq_attention, training=training, mask=mask)
         seq_attention = self.layer_normalization(seq_attention)  # (b, s, h1+h2)
 
         seq_emb = tf.reshape(
             seq_attention,
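Editor's note: several of these edits track Keras API changes: `training` and `mask` should now be passed as keyword arguments when calling a layer, and the metric method `reset_states()` was renamed `reset_state()`. A minimal sketch of both patterns against current `tf.keras`, using a generic layer rather than the SASRec encoder itself:

```python
import tensorflow as tf

# Keyword arguments: recent Keras handles `training` and `mask` specially in
# Layer.__call__, so passing them positionally can break or be misrouted.
layer = tf.keras.layers.Dropout(0.5)
x = tf.ones((2, 4))
y = layer(x, training=True)  # keyword form, as in the patch

# Metric state reset: reset_states() was renamed reset_state().
train_loss = tf.keras.metrics.Mean(name="train_loss")
train_loss.update_state([1.0, 2.0, 3.0])
print(float(train_loss.result()))  # 2.0
train_loss.reset_state()
print(float(train_loss.result()))  # 0.0
```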
diff --git a/setup.py b/setup.py
index ef74ab5a84..6fe5e62d60 100644
--- a/setup.py
+++ b/setup.py
@@ -34,18 +34,18 @@
     "locust>=2.12.2,<3",  # requires jinja2
     "memory-profiler>=0.61.0,<1",
    "nltk>=3.8.1,<4",  # requires tqdm
-    "numpy>=1.26.4,<2;python_version>='3.12'",  # https://stackoverflow.com/a/77364602/4505998
+    "numpy>=1.26.4;python_version>='3.12'",  # https://stackoverflow.com/a/77364602/4505998
     "notebook>=6.5.5,<8",  # requires ipykernel, jinja2, jupyter, nbconvert, nbformat, packaging, requests
     "numba>=0.57.0,<1",
-    "numpy<2.0.0",  # FIXME: Remove numpy<2.0.0 once cornac release a version newer than 2.2.1 that resolve ImportError: numpy.core.multiarray failed to import.
     "pandas>2.0.0,<3.0.0",  # requires numpy
     "pandera[strategies]>=0.6.5,<0.18;python_version<='3.8'",  # For generating fake datasets
     "pandera[strategies]>=0.15.0;python_version>='3.9'",
     "retrying>=1.3.4,<2",
     "scikit-learn>=1.2.0,<2",  # requires scipy, and introduce breaking change affects feature_extraction.text.TfidfVectorizer.min_df
     "scikit-surprise>=1.1.3",
-    "scipy>=1.9,<=1.13.1",  # FIXME: Remove scipy<=1.13.1 once cornac release a version newer than 2.2.1. See #2128
     "seaborn>=0.13.0,<1",  # requires matplotlib, packaging
+    "statsmodels<=0.14.1;python_version<='3.8'",
+    "statsmodels>=0.14.4;python_version>='3.9'",
     "transformers>=4.27.0,<5",  # requires packaging, pyyaml, requests, tqdm
 ]
@@ -53,7 +53,9 @@
 extras_require = {
     "gpu": [
         "fastai>=2.7.11,<3",
+        "numpy<1.25.0;python_version<='3.8'",
         "nvidia-ml-py>=11.525.84",
+        "spacy<=3.7.5;python_version<='3.8'",
         "tensorflow>=2.8.4,!=2.9.0.*,!=2.9.1,!=2.9.2,!=2.10.0.*,<2.16; python_version<='3.8'",  # Fixed TF due to constant security problems and breaking changes #2073
         "tensorflow~=2.16; python_version>'3.8'",  # Version needed for python 3.12
         "tf-slim>=1.1.0",  # No python_requires in its setup.py
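Editor's note: the new pins lean on PEP 508 environment markers (`;python_version>='3.9'`) so a single requirement list can vary by interpreter. A small sketch of how such a marker evaluates, using the `packaging` library (an assumption on my part: it is a common transitive dependency here, not something this patch adds):

```python
# Evaluate PEP 508 environment markers like the ones used in setup.py.
from packaging.requirements import Requirement

req = Requirement("statsmodels>=0.14.4;python_version>='3.9'")
print(req.specifier)                                   # >=0.14.4
print(req.marker.evaluate())                           # True on Python >= 3.9
print(req.marker.evaluate({"python_version": "3.8"}))  # False
```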
diff --git a/tests/README.md b/tests/README.md
index df8e3e96d0..893df94c2f 100644
--- a/tests/README.md
+++ b/tests/README.md
@@ -216,28 +216,46 @@ Then, follow the steps below to create the AzureML infrastructure:
     - Name: `azureml-test-workspace`
     - Resource group: `recommenders_project_resources`
     - Location: *Make sure you have enough quota in the location you choose*
-2. Create two new clusters: `cpu-cluster` and `gpu-cluster`. Go to compute, then compute cluster, then new.
+1. Create two new clusters: `cpu-cluster` and `gpu-cluster`. Go to compute, then compute cluster, then new.
     - Select the CPU VM base. Anything above 64GB of RAM, and 8 cores should be fine.
     - Select the GPU VM base. Anything above 56GB of RAM, and 6 cores, and an NVIDIA K80 should be fine.
-3. Add the subscription ID to GitHub action secrets [here](https://github.com/recommenders-team/recommenders/settings/secrets/actions). Create a new repository secret called `AZUREML_TEST_SUBID` and add the subscription ID as the value.
-4. Make sure you have installed [Azure CLI](https://learn.microsoft.com/en-us/cli/azure/install-azure-cli), and that you are logged in: `az login`.
-5. Select your subscription: `az account set -s $AZURE_SUBSCRIPTION_ID`.
-6. Create a Service Principal: `az ad sp create-for-rbac --name $SERVICE_PRINCIPAL_NAME --role contributor --scopes /subscriptions/$AZURE_SUBSCRIPTION_ID --json-auth`. This will output a JSON blob with the credentials of the Service Principal:
-    ```
-    {
-        "clientId": "XXXXXXXXXXXXXXXXXXXXX",
-        "clientSecret": "XXXXXXXXXXXXXXXXXXXXX",
-        "subscriptionId": "XXXXXXXXXXXXXXXXXXXXX",
-        "tenantId": "XXXXXXXXXXXXXXXXXXXXX",
-        "activeDirectoryEndpointUrl": "https://login.microsoftonline.com",
-        "resourceManagerEndpointUrl": "https://management.azure.com/",
-        "activeDirectoryGraphResourceId": "https://graph.windows.net/",
-        "sqlManagementEndpointUrl": "https://management.core.windows.net:8443/",
-        "galleryEndpointUrl": "https://gallery.azure.com/",
-        "managementEndpointUrl": "https://management.core.windows.net/"
-    }
-    ```
-7. Add the output as github's action secret `AZUREML_TEST_CREDENTIALS` under repository's **Settings > Security > Secrets and variables > Actions**.
+1. Add the subscription ID to GitHub action secrets
+   [here](https://github.com/recommenders-team/recommenders/settings/secrets/actions).
+   * Create a new repository secret called `AZUREML_TEST_SUBID` and
+     add the subscription ID as the value.
+1. Set up [login with OpenID Connect
+   (OIDC)](https://github.com/marketplace/actions/azure-login#login-with-openid-connect-oidc-recommended)
+   for GitHub Actions.
+   1. Create a user-assigned managed identity (UMI) and assign the
+      following 3 roles of the AzureML workspace created above to the
+      UMI (See [Create a user-assigned managed
+      identity](https://learn.microsoft.com/en-us/entra/identity/managed-identities-azure-resources/how-manage-user-assigned-managed-identities?pivots=identity-mi-methods-azp#create-a-user-assigned-managed-identity)):
+      * AzureML Compute Operator
+      * AzureML Data Scientist
+      * Reader
+   1. [Create a federated identity credential on the
+      UMI](https://learn.microsoft.com/en-us/entra/workload-id/workload-identity-federation-create-trust-user-assigned-managed-identity?pivots=identity-wif-mi-methods-azp#github-actions-deploying-azure-resources)
+      with the following settings:
+      * Name: A unique name for the federated identity credential
+        within your application.
+      * Issuer: Set to `https://token.actions.githubusercontent.com`
+        for GitHub Actions.
+      * Subject: The subject claim format, e.g.,
+        `repo:recommenders-team/recommenders:ref:refs/heads/`:
+        + `repo:recommenders-team/recommenders:pull_request`
+        + `repo:recommenders-team/recommenders:ref:refs/heads/staging`
+        + `repo:recommenders-team/recommenders:ref:refs/heads/main`
+      * Description: (Optional) A description of the credential.
+      * Audiences: Specifies who can use this credential; for GitHub
+        Actions, use `api://AzureADTokenExchange`.
+1. Create 3 Actions secrets
+   * `AZUREML_TEST_UMI_TENANT_ID`
+   * `AZUREML_TEST_UMI_SUB_ID`
+   * `AZUREML_TEST_UMI_CLIENT_ID`
+
+   and use the UMI's tenant ID, subscription ID, and client ID as the
+   values of the secrets, respectively, under the repository's
+   **Settings > Security > Secrets and variables > Actions**.
 
 ## How to execute tests in your local environment
diff --git a/tests/ci/azureml_tests/post_pytest.py b/tests/ci/azureml_tests/post_pytest.py
index b457e709d2..26472ea469 100644
--- a/tests/ci/azureml_tests/post_pytest.py
+++ b/tests/ci/azureml_tests/post_pytest.py
@@ -89,8 +89,12 @@ def parse_args():
             run_id=run.info.run_id,
             dst_path=args.log_dir,
         )
-        log_path = pathlib.Path("user_logs/std_log.txt")
-        with open(pathlib.Path(args.log_dir) / log_path, "r") as file:
-            print(f"\nDumping logs in {log_path}")
-            print("=====================================")
-            print(file.read())
+        log_path = next(
+            (path for path in pathlib.Path(args.log_dir).rglob("std_log.txt")),
+            None
+        )
+        if log_path is not None:
+            with open(log_path, "r") as file:
+                print(f"\nDumping logs in {log_path}")
+                print("=====================================")
+                print(file.read())
diff --git a/tests/data_validation/recommenders/datasets/test_mind.py b/tests/data_validation/recommenders/datasets/test_mind.py
index d4f5f8c1f4..8d835ad9bd 100644
--- a/tests/data_validation/recommenders/datasets/test_mind.py
+++ b/tests/data_validation/recommenders/datasets/test_mind.py
@@ -27,34 +27,34 @@
             '"0x8D8B8AD5B126C3B"',
         ),
         (
-            "https://mind201910small.blob.core.windows.net/release/MINDsmall_train.zip",
-            "52952752",
-            "0x8D834F2EB31BDEC",
+            "https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_train.zip",
+            "52994575",
+            '"0x8DCC5A830190676"',
         ),
         (
-            "https://mind201910small.blob.core.windows.net/release/MINDsmall_dev.zip",
-            "30945572",
-            "0x8D834F2EBA8D865",
+            "https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_dev.zip",
+            "30948560",
+            '"0x8DCC5A82E182A0F"',
         ),
         (
-            "https://mind201910small.blob.core.windows.net/release/MINDsmall_utils.zip",
+            "https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_utils.zip",
             "155178106",
-            "0x8D87F67F4AEB960",
+            '"0x8D8B8AD5B3677C6"',
         ),
         (
-            "https://mind201910small.blob.core.windows.net/release/MINDlarge_train.zip",
-            "530196631",
-            "0x8D8244E90C15C07",
+            "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_train.zip",
+            "531360717",
+            '"0x8DCC5A8375BDC1D"',
         ),
         (
-            "https://mind201910small.blob.core.windows.net/release/MINDlarge_dev.zip",
-            "103456245",
-            "0x8D8244E92005849",
+            "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_dev.zip",
+            "103592887",
+            '"0x8DCC5A82FE8609C"',
        ),
        (
-            "https://mind201910small.blob.core.windows.net/release/MINDlarge_utils.zip",
+            "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_utils.zip",
             "150359301",
-            "0x8D87F67E6CA4364",
+            '"0x8D8B8AD5B2ED4C9"',
         ),
     ],
 )
@@ -72,14 +72,6 @@ def test_download_mind_demo(tmp):
     assert statinfo.st_size == 10080022
 
 
-def test_download_mind_small(tmp):
-    train_path, valid_path = download_mind(size="small", dest_path=tmp)
-    statinfo = os.stat(train_path)
-    assert statinfo.st_size == 52952752
-    statinfo = os.stat(valid_path)
-    assert statinfo.st_size == 30945572
-
-
 def test_extract_mind_demo(tmp):
     train_zip, valid_zip = download_mind(size="demo", dest_path=tmp)
     train_path, valid_path = extract_mind(train_zip, valid_zip, clean_zip_file=False)
@@ -102,6 +94,14 @@ def test_extract_mind_demo(tmp):
     assert statinfo.st_size == 1044588
 
 
+def test_download_mind_small(tmp):
+    train_path, valid_path = download_mind(size="small", dest_path=tmp)
+    statinfo = os.stat(train_path)
+    assert statinfo.st_size == 52994575
+    statinfo = os.stat(valid_path)
+    assert statinfo.st_size == 30948560
+
+
 def test_extract_mind_small(tmp):
     train_zip, valid_zip = download_mind(size="small", dest_path=tmp)
     train_path, valid_path = extract_mind(train_zip, valid_zip, clean_zip_file=False)
@@ -127,9 +127,9 @@ def test_extract_mind_small(tmp):
 def test_download_mind_large(tmp_path):
     train_path, valid_path = download_mind(size="large", dest_path=tmp_path)
     statinfo = os.stat(train_path)
-    assert statinfo.st_size == 530196631
+    assert statinfo.st_size == 531360717
     statinfo = os.stat(valid_path)
-    assert statinfo.st_size == 103456245
+    assert statinfo.st_size == 103592887
 
 
 def test_extract_mind_large(tmp):
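Editor's note: the new fixture values record each mirrored file's size and its ETag as a quoted string (the `'"0x8DCC..."'` form), matching the raw `ETag` header value. A hedged sketch of how such values can be regenerated with `requests`; this helper is for illustration and is not part of the test suite:

```python
# Fetch the size and ETag of a hosted file, e.g. to refresh test fixtures.
import requests

url = "https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_dev.zip"
resp = requests.head(url, timeout=30)
resp.raise_for_status()
print(resp.headers["Content-Length"])  # expected file size, e.g. 30948560
print(resp.headers["ETag"])            # quoted value, e.g. "0x8DCC5A82E182A0F"
```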
diff --git a/tests/unit/recommenders/models/test_lightfm_utils.py b/tests/unit/recommenders/models/test_lightfm_utils.py
index 2155fb6559..62b9d2ccc8 100644
--- a/tests/unit/recommenders/models/test_lightfm_utils.py
+++ b/tests/unit/recommenders/models/test_lightfm_utils.py
@@ -6,14 +6,17 @@
 import itertools
 import numpy as np
 import pandas as pd
-from lightfm.data import Dataset
-from lightfm import LightFM, cross_validation
-from recommenders.models.lightfm.lightfm_utils import (
-    track_model_metrics,
-    similar_users,
-    similar_items,
-)
+try:
+    from lightfm.data import Dataset
+    from lightfm import LightFM, cross_validation
+    from recommenders.models.lightfm.lightfm_utils import (
+        track_model_metrics,
+        similar_users,
+        similar_items,
+    )
+except ModuleNotFoundError:
+    pass
 
 
 SEEDNO = 42
 
@@ -128,6 +131,7 @@ def sim_items(interactions, fitting):
     )
 
 
+@pytest.mark.experimental
 def test_interactions(interactions):
     train_interactions, test_interactions, item_features, user_features = interactions
     assert train_interactions.shape == (10, 10)
@@ -136,6 +140,7 @@ def test_interactions(interactions):
     assert user_features.shape == (10, 17)
 
 
+@pytest.mark.experimental
 @pytest.mark.skip(reason="Flaky test")
 def test_fitting(fitting):
     output, _ = fitting
@@ -152,9 +157,11 @@ def test_fitting(fitting):
     np.testing.assert_array_equal(output, target)
 
 
+@pytest.mark.experimental
 def test_sim_users(sim_users):
     assert sim_users.shape == (5, 2)
 
 
+@pytest.mark.experimental
 def test_sim_items(sim_items):
     assert sim_items.shape == (5, 2)
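Editor's note: the try/except guard keeps this module importable when `lightfm` is not installed, and the `experimental` marker (assumed to be registered in the pytest configuration to avoid unknown-marker warnings) lets CI deselect these tests. For comparison, `pytest.importorskip` achieves a similar effect by skipping the whole module rather than passing silently on import failure; this is not what the patch does, just an alternative:

```python
# Alternative to the try/except guard: skip the whole module when the
# optional dependency is missing.
import pytest

lightfm = pytest.importorskip("lightfm")  # skips collection if unavailable

from lightfm import LightFM  # safe to import past this point

def test_model_can_be_constructed():
    assert LightFM(no_components=4) is not None
```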