Merge in staging

Signed-off-by: Simon Zhao <[email protected]>
recommenders-team · Nov 12, 2024 · 450bf17 · 450bf17
2 parents 4a544b2 + 12bc1e4
commit 450bf17
Show file tree

Hide file tree

Showing 22 changed files with 215 additions and 164 deletions.
diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
@@ -1,44 +1,50 @@
 {
-	"name": "Recommenders",
-	"build": {
-		"dockerfile": "Dockerfile",
-		"context": "..",
-		"args": { 
-			// Python version: 3, 3.6, 3.7
-			"PYTHON_VERSION": "3.7",
-			"REMOTE_USER": "vscode"
-		}
-	},
+    "name": "Recommenders",
+    // Version list: https://github.com/devcontainers/images/tree/main/src/base-ubuntu
+    // Includes: curl, wget, ca-certificates, git, Oh My Zsh!, and more.
+    "image": "mcr.microsoft.com/devcontainers/base:ubuntu-24.04",
+    "hostRequirements": {
+        "cpus": 4,
+        "memory": "16gb",
+        "storage": "32gb"
+    },
+    "features": {
+        // https://github.com/devcontainers/features/blob/main/src/anaconda/devcontainer-feature.json
+        "ghcr.io/devcontainers/features/anaconda:1": {
+            "version": "2024.06-1"
+        }
+    },
+    "customizations": {
+        "vscode": {
+            // Set *default* container specific settings.json values on container create.
+            "settings": {
+                "[python]": {
+                    "editor.defaultFormatter": "ms-python.black-formatter",
+                    "editor.formatOnSave": true,
+                    "editor.codeActionsOnSave": {
+                        "source.organizeImports": "explicit"
+                    }
+                },
+                "isort.args": ["--profile", "black"],
+                "python.analysis.autoImportCompletions": true,
+                "python.defaultInterpreterPath": "/usr/local/conda/envs/Recommenders/bin/python",
+                "python.testing.pytestEnabled": true,
+                // Set the directory where all tests are located.
+                "python.testing.pytestArgs": ["tests"]
+            },
+            // Add the IDs of extensions you want installed when the container is created.
+            "extensions": [
+                "ms-python.black-formatter",  // https://marketplace.visualstudio.com/items?itemName=ms-python.black-formatter
+                "ms-python.isort",  // https://marketplace.visualstudio.com/items?itemName=ms-python.isort
+                "ms-python.mypy-type-checker",  // https://marketplace.visualstudio.com/items?itemName=ms-python.mypy-type-checker
+                "ms-python.pylint",  // https://marketplace.visualstudio.com/items?itemName=ms-python.pylint
+                "ms-python.python",  // https://marketplace.visualstudio.com/items?itemName=ms-python.python
+                "ms-toolsai.datawrangler",  // https://marketplace.visualstudio.com/items?itemName=ms-toolsai.datawrangler
+                "ms-toolsai.jupyter"  // https://marketplace.visualstudio.com/items?itemName=ms-toolsai.jupyter
+            ]
+        }
+    },
 
-	// Set *default* container specific settings.json values on container create.
-	"settings": { 
-		"python.pythonPath": "/usr/local/bin/python",
-		"python.languageServer": "Pylance",
-		"python.linting.enabled": true,
-		"python.linting.pylintEnabled": true,
-		"python.formatting.autopep8Path": "/usr/local/py-utils/bin/autopep8",
-		"python.formatting.blackPath": "/usr/local/py-utils/bin/black",
-		"python.formatting.yapfPath": "/usr/local/py-utils/bin/yapf",
-		"python.linting.banditPath": "/usr/local/py-utils/bin/bandit",
-		"python.linting.flake8Path": "/usr/local/py-utils/bin/flake8",
-		"python.linting.mypyPath": "/usr/local/py-utils/bin/mypy",
-		"python.linting.pycodestylePath": "/usr/local/py-utils/bin/pycodestyle",
-		"python.linting.pydocstylePath": "/usr/local/py-utils/bin/pydocstyle",
-		"python.linting.pylintPath": "/usr/local/py-utils/bin/pylint"
-	},
-
-	// Add the IDs of extensions you want installed when the container is created.
-	"extensions": [
-		"ms-python.python",
-		"ms-python.vscode-pylance"
-	],
-
-	// Use 'forwardPorts' to make a list of ports inside the container available locally.
-	"forwardPorts": [8888],
-
-	// Use 'postCreateCommand' to run commands after the container is created.
-	"postCreateCommand": "pip install -U pip && pip install --user -e .[dev,examples,spark,xlearn]",
-
-	// Comment out connect as root instead. More info: https://aka.ms/vscode-remote/containers/non-root.
-	"remoteUser": "vscode"
+    // Use 'postCreateCommand' to run commands after the container is created.
+    "postCreateCommand": "conda create -n Recommenders -c conda-forge -y python=3.10 openjdk=21 pip && conda init bash && bash -c -i 'conda activate Recommenders && pip install -e .[dev,spark]' && conda config --set auto_activate_base false"
 }
diff --git a/.github/actions/azureml-test/action.yml b/.github/actions/azureml-test/action.yml
@@ -15,9 +15,15 @@ inputs:
   TEST_KIND:
     required: true
     description: Type of test - unit or nightly
-  AZUREML_TEST_CREDENTIALS:
+  AZUREML_TEST_UMI_CLIENT_ID:
     required: true
-    description: Credentials for AzureML login
+    description: AzureML User-managed identity client ID
+  AZUREML_TEST_UMI_TENANT_ID:
+    required: true
+    description: AzureML User-managed identity tenant ID
+  AZUREML_TEST_UMI_SUB_ID:
+    required: true
+    description: AzureML User-managed identity subscription ID
   AZUREML_TEST_SUBID:
     required: true
     description: AzureML subscription ID
@@ -53,7 +59,9 @@ runs:
     - name: Log in to Azure
       uses: azure/login@v2
       with:
-        creds: ${{ inputs.AZUREML_TEST_CREDENTIALS }}
+        client-id: ${{ inputs.AZUREML_TEST_UMI_CLIENT_ID }}
+        tenant-id: ${{ inputs.AZUREML_TEST_UMI_TENANT_ID }}
+        subscription-id: ${{ inputs.AZUREML_TEST_UMI_SUB_ID }}
     - name: Submit tests to AzureML
       shell: bash
       run: |

diff --git a/.github/actions/get-test-groups/action.yml b/.github/actions/get-test-groups/action.yml
@@ -8,7 +8,7 @@ description: "Get test group names from tests_groups.py"
 inputs:
   TEST_KIND:
     required: true
-    description: Type of test - unit or nightly
+    description: Type of test - pr gate or nightly
   TEST_ENV:
     required: false
     description: Test environment - cpu, gpu or spark

diff --git a/.github/workflows/azureml-cpu-nightly.yml b/.github/workflows/azureml-cpu-nightly.yml
@@ -64,6 +64,8 @@ jobs:
     needs: get-test-groups
     name: ${{ join(matrix.*, ', ') }}
     runs-on: ubuntu-latest
+    permissions:
+      id-token: write # This is required for requesting the JWT
     strategy:
       max-parallel: 50 # Usage limits: https://docs.github.com/en/actions/learn-github-actions/usage-limits-billing-and-administration
       matrix:
@@ -79,7 +81,9 @@ jobs:
           EXP_NAME: recommenders-nightly-${{ matrix.test-group }}-python${{ matrix.python-version }}-${{ github.ref }}
           ENV_NAME: recommenders-${{ github.sha }}-python${{ matrix.python-version }}${{ contains(matrix.test-group, 'gpu') && '-gpu' || '' }}${{ contains(matrix.test-group, 'spark') && '-spark' || '' }}
           TEST_KIND: 'nightly'
-          AZUREML_TEST_CREDENTIALS: ${{ secrets.AZUREML_TEST_CREDENTIALS }}
+          AZUREML_TEST_UMI_CLIENT_ID: ${{ secrets.AZUREML_TEST_UMI_CLIENT_ID }}
+          AZUREML_TEST_UMI_TENANT_ID: ${{ secrets.AZUREML_TEST_UMI_TENANT_ID }}
+          AZUREML_TEST_UMI_SUB_ID: ${{ secrets.AZUREML_TEST_UMI_SUB_ID }}
           AZUREML_TEST_SUBID: ${{ secrets.AZUREML_TEST_SUBID }}
           PYTHON_VERSION: ${{ matrix.python-version }}
           TEST_GROUP: ${{ matrix.test-group }}
diff --git a/.github/workflows/azureml-gpu-nightly.yml b/.github/workflows/azureml-gpu-nightly.yml
@@ -64,6 +64,8 @@ jobs:
     needs: get-test-groups
     name: ${{ join(matrix.*, ', ') }}
     runs-on: ubuntu-latest
+    permissions:
+      id-token: write # This is required for requesting the JWT
     strategy:
       max-parallel: 50 # Usage limits: https://docs.github.com/en/actions/learn-github-actions/usage-limits-billing-and-administration
       matrix:
@@ -79,7 +81,9 @@ jobs:
           EXP_NAME: recommenders-nightly-${{ matrix.test-group }}-python${{ matrix.python-version }}-${{ github.ref }}
           ENV_NAME: recommenders-${{ github.sha }}-python${{ matrix.python-version }}${{ contains(matrix.test-group, 'gpu') && '-gpu' || '' }}${{ contains(matrix.test-group, 'spark') && '-spark' || '' }}
           TEST_KIND: 'nightly'
-          AZUREML_TEST_CREDENTIALS: ${{ secrets.AZUREML_TEST_CREDENTIALS }}
+          AZUREML_TEST_UMI_CLIENT_ID: ${{ secrets.AZUREML_TEST_UMI_CLIENT_ID }}
+          AZUREML_TEST_UMI_TENANT_ID: ${{ secrets.AZUREML_TEST_UMI_TENANT_ID }}
+          AZUREML_TEST_UMI_SUB_ID: ${{ secrets.AZUREML_TEST_UMI_SUB_ID }}
           AZUREML_TEST_SUBID: ${{ secrets.AZUREML_TEST_SUBID }}
           PYTHON_VERSION: ${{ matrix.python-version }}
           TEST_GROUP: ${{ matrix.test-group }}
diff --git a/.github/workflows/azureml-spark-nightly.yml b/.github/workflows/azureml-spark-nightly.yml
@@ -63,6 +63,8 @@ jobs:
     needs: get-test-groups
     name: ${{ join(matrix.*, ', ') }}
     runs-on: ubuntu-latest
+    permissions:
+      id-token: write # This is required for requesting the JWT
     strategy:
       max-parallel: 50 # Usage limits: https://docs.github.com/en/actions/learn-github-actions/usage-limits-billing-and-administration
       matrix:
@@ -78,7 +80,9 @@ jobs:
           EXP_NAME: recommenders-nightly-${{ matrix.test-group }}-python${{ matrix.python-version }}-${{ github.ref }}
           ENV_NAME: recommenders-${{ github.sha }}-python${{ matrix.python-version }}${{ contains(matrix.test-group, 'gpu') && '-gpu' || '' }}${{ contains(matrix.test-group, 'spark') && '-spark' || '' }}
           TEST_KIND: 'nightly'
-          AZUREML_TEST_CREDENTIALS: ${{ secrets.AZUREML_TEST_CREDENTIALS }}
+          AZUREML_TEST_UMI_CLIENT_ID: ${{ secrets.AZUREML_TEST_UMI_CLIENT_ID }}
+          AZUREML_TEST_UMI_TENANT_ID: ${{ secrets.AZUREML_TEST_UMI_TENANT_ID }}
+          AZUREML_TEST_UMI_SUB_ID: ${{ secrets.AZUREML_TEST_UMI_SUB_ID }}
           AZUREML_TEST_SUBID: ${{ secrets.AZUREML_TEST_SUBID }}
           PYTHON_VERSION: ${{ matrix.python-version }}
           TEST_GROUP: ${{ matrix.test-group }}
diff --git a/.github/workflows/azureml-unit-tests.yml b/.github/workflows/azureml-unit-tests.yml
@@ -53,6 +53,8 @@ jobs:
     needs: get-test-groups
     name: ${{ join(matrix.*, ', ') }}
     runs-on: ubuntu-latest
+    permissions:
+      id-token: write # This is required for requesting the JWT
     strategy:
       max-parallel: 50 # Usage limits: https://docs.github.com/en/actions/learn-github-actions/usage-limits-billing-and-administration
       matrix:
@@ -68,7 +70,9 @@ jobs:
           EXP_NAME: recommenders-unit-${{ matrix.test-group }}-python${{ matrix.python-version }}-${{ github.sha }}
           ENV_NAME: recommenders-${{ github.sha }}-python${{ matrix.python-version }}${{ contains(matrix.test-group, 'gpu') && '-gpu' || '' }}${{ contains(matrix.test-group, 'spark') && '-spark' || '' }}
           TEST_KIND: 'unit'
-          AZUREML_TEST_CREDENTIALS: ${{ secrets.AZUREML_TEST_CREDENTIALS }}
+          AZUREML_TEST_UMI_CLIENT_ID: ${{ secrets.AZUREML_TEST_UMI_CLIENT_ID }}
+          AZUREML_TEST_UMI_TENANT_ID: ${{ secrets.AZUREML_TEST_UMI_TENANT_ID }}
+          AZUREML_TEST_UMI_SUB_ID: ${{ secrets.AZUREML_TEST_UMI_SUB_ID }}
           AZUREML_TEST_SUBID: ${{ secrets.AZUREML_TEST_SUBID }}
           PYTHON_VERSION: ${{ matrix.python-version }}
           TEST_GROUP: ${{ matrix.test-group }}

diff --git a/AUTHORS.md b/AUTHORS.md
@@ -52,6 +52,8 @@ To contributors: please add your name to the list when you submit a patch to the
 * **[Aaron He](https://github.com/AaronHeee)**
    * Reco utils of NCF
    * Deep dive notebook demonstrating the use of NCF
+* **[Aaron Palpallatoc](https://github.com/ubergonmx)**
+   * Corrected variable in pickle dump in `mind_utils.ipynb` notebook
 * **[Abir Chakraborty](https://github.com/aeroabir)**
    * Self-Attentive Sequential Recommendation (SASRec)
    * Sequential Recommendation Via Personalized Transformer (SSEPT)

diff --git a/README.md b/README.md
@@ -144,7 +144,7 @@ We provide a [benchmark notebook](examples/06_benchmarks/movielens.ipynb) to ill
 
 This project welcomes contributions and suggestions. Before contributing, please see our [contribution guidelines](CONTRIBUTING.md).
 
-This project adheres to [Microsoft's Open Source Code of Conduct](CODE_OF_CONDUCT.md) in order to foster a welcoming and inspiring community for all.
+This project adheres to this [Code of Conduct](CODE_OF_CONDUCT.md) in order to foster a welcoming and inspiring community for all.
 
 ## Build Status
 

diff --git a/SETUP.md b/SETUP.md
@@ -50,16 +50,19 @@ pip install recommenders[spark]
 #   c. Run the notebook.
 ```
 
-## Setup for Azure Databricks
+## Setup for Databricks
 
-The following instructions were tested on Azure Databricks Runtime 12.2 LTS (Apache Spark version 3.3.2) and 11.3 LTS (Apache Spark version 3.3.0).
-As of April 2023, Databricks Runtime 13 is not yet supported as it is on Python 3.10.
+The following instructions were tested on Databricks Runtime 15.4 LTS (Apache Spark version 3.5.0), 14.3 LTS (Apache Spark version 3.5.0), 13.3 LTS (Apache Spark version 3.4.1), and 12.2 LTS (Apache Spark version 3.3.2). We have tested the runtime on Python 3.9, 3.10, and 3.11.
 
-After an Azure Databricks cluster is provisioned:
+After a Databricks cluster is provisioned:
 ```bash
 # 1. Go to the "Compute" tab on the left of the page, click on the provisioned cluster and then click on "Libraries". 
 # 2. Click the "Install new" button.  
 # 3. In the popup window, select "PyPI" as the library source. Enter "recommenders[examples]" as the package name. Click "Install" to install the package.
+# 4. Repeat step 3 for each of the following packages:
+#   a. numpy<2.0.0
+#   b. pandera<=0.18.3
+#   c. scipy<=1.13.1
 ```
 
 ### Prepare Azure Databricks for Operationalization

diff --git a/examples/01_prepare_data/mind_utils.ipynb b/examples/01_prepare_data/mind_utils.ipynb
@@ -306,7 +306,7 @@
                 "    pickle.dump(word_dict, f)\n",
                 "    \n",
                 "with open(os.path.join(output_path, 'word_dict_all.pkl'), 'wb') as f:\n",
-                "    pickle.dump(word_dict, f)"
+                "    pickle.dump(word_dict_all, f)"
             ]
         },
         {

diff --git a/recommenders/datasets/mind.py b/recommenders/datasets/mind.py
@@ -17,18 +17,6 @@
 )
 
 
-URL_MIND_LARGE_TRAIN = (
-    "https://mind201910small.blob.core.windows.net/release/MINDlarge_train.zip"
-)
-URL_MIND_LARGE_VALID = (
-    "https://mind201910small.blob.core.windows.net/release/MINDlarge_dev.zip"
-)
-URL_MIND_SMALL_TRAIN = (
-    "https://mind201910small.blob.core.windows.net/release/MINDsmall_train.zip"
-)
-URL_MIND_SMALL_VALID = (
-    "https://mind201910small.blob.core.windows.net/release/MINDsmall_dev.zip"
-)
 URL_MIND_DEMO_TRAIN = (
     "https://recodatasets.z20.web.core.windows.net/newsrec/MINDdemo_train.zip"
 )
@@ -39,6 +27,29 @@
     "https://recodatasets.z20.web.core.windows.net/newsrec/MINDdemo_utils.zip"
 )
 
+URL_MIND_SMALL_TRAIN = (
+    "https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_train.zip"
+)
+URL_MIND_SMALL_VALID = (
+    "https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_dev.zip"
+)
+URL_MIND_SMALL_UTILS = (
+    "https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_utils.zip"
+)
+
+URL_MIND_LARGE_TRAIN = (
+    "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_train.zip"
+)
+URL_MIND_LARGE_VALID = (
+    "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_dev.zip"
+)
+URL_MIND_LARGE_TEST = (
+    "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_test.zip"
+)
+URL_MIND_LARGE_UTILS = (
+    "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_utils.zip"
+)
+
 URL_MIND = {
     "large": (URL_MIND_LARGE_TRAIN, URL_MIND_LARGE_VALID),
     "small": (URL_MIND_SMALL_TRAIN, URL_MIND_SMALL_VALID),

diff --git a/recommenders/models/deeprec/DataModel/ImplicitCF.py b/recommenders/models/deeprec/DataModel/ImplicitCF.py
@@ -206,6 +206,8 @@ def train_loader(self, batch_size):
         """
 
         def sample_neg(x):
+            if len(x) >= self.n_items:
+                raise ValueError("A user has voted in every item. Can't find a negative sample.")
             while True:
                 neg_id = random.randint(0, self.n_items - 1)
                 if neg_id not in x:

diff --git a/recommenders/models/newsrec/newsrec_utils.py b/recommenders/models/newsrec/newsrec_utils.py
@@ -310,15 +310,15 @@ def get_mind_data_set(type):
 
     if type == "large":
         return (
-            "https://mind201910small.blob.core.windows.net/release/",
+            "https://recodatasets.z20.web.core.windows.net/newsrec/",
             "MINDlarge_train.zip",
             "MINDlarge_dev.zip",
             "MINDlarge_utils.zip",
         )
 
     elif type == "small":
         return (
-            "https://mind201910small.blob.core.windows.net/release/",
+            "https://recodatasets.z20.web.core.windows.net/newsrec/",
             "MINDsmall_train.zip",
             "MINDsmall_dev.zip",
             "MINDsmall_utils.zip",