diff --git a/examples/01_prepare_data/data_split.ipynb b/examples/01_prepare_data/data_split.ipynb
index 8b74bb3ae2..a9fee08ff7 100644
--- a/examples/01_prepare_data/data_split.ipynb
+++ b/examples/01_prepare_data/data_split.ipynb
@@ -113,7 +113,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "100%|████████████████████████████████████████████████████████████████████████████| 1.93k/1.93k [00:08<00:00, 217KB/s]\n"
+ "100%|██████████████████████████████████████████████████████████████████████████| 1.93k/1.93k [00:01<00:00, 1.82kKB/s]\n"
]
}
],
diff --git a/examples/02_model_collaborative_filtering/baseline_deep_dive.ipynb b/examples/02_model_collaborative_filtering/baseline_deep_dive.ipynb
index 4c658fc6b5..66dca5045d 100644
--- a/examples/02_model_collaborative_filtering/baseline_deep_dive.ipynb
+++ b/examples/02_model_collaborative_filtering/baseline_deep_dive.ipynb
@@ -59,8 +59,9 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "System version: 3.6.8 |Anaconda, Inc.| (default, Feb 11 2019, 15:03:47) [MSC v.1915 64 bit (AMD64)]\n",
- "Pandas version: 0.24.1\n"
+ "System version: 3.9.16 (main, May 15 2023, 23:46:34) \n",
+ "[GCC 11.2.0]\n",
+ "Pandas version: 1.5.3\n"
]
}
],
@@ -69,6 +70,7 @@
"\n",
"import itertools\n",
"import pandas as pd\n",
+ "import scrapbook as sb\n",
"\n",
"from recommenders.utils.notebook_utils import is_jupyter\n",
"from recommenders.datasets import movielens\n",
@@ -79,8 +81,8 @@
" map_at_k, ndcg_at_k, precision_at_k, recall_at_k\n",
")\n",
"\n",
- "print(\"System version: {}\".format(sys.version))\n",
- "print(\"Pandas version: {}\".format(pd.__version__))"
+ "print(f\"System version: {sys.version}\")\n",
+ "print(f\"Pandas version: {pd.__version__}\")"
]
},
{
@@ -100,7 +102,8 @@
},
"outputs": [],
"source": [
- "MOVIELENS_DATA_SIZE = '100k'"
+ "MOVIELENS_DATA_SIZE = \"100k\"\n",
+ "TOP_K = 10"
]
},
{
@@ -108,6 +111,13 @@
"execution_count": 3,
"metadata": {},
"outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|████████████████████████████████████████████████████████████████████████████| 4.81k/4.81k [00:09<00:00, 495KB/s]\n"
+ ]
+ },
{
"data": {
"text/html": [
@@ -184,14 +194,15 @@
"4 166 346 1.0 886397596"
]
},
+ "execution_count": 3,
"metadata": {},
- "output_type": "display_data"
+ "output_type": "execute_result"
}
],
"source": [
"data = movielens.load_pandas_df(\n",
- " size=MOVIELENS_DATA_SIZE,\n",
- " header=['UserId', 'MovieId', 'Rating', 'Timestamp']\n",
+ " size=MOVIELENS_DATA_SIZE, \n",
+ " header=[\"UserId\", \"MovieId\", \"Rating\", \"Timestamp\"]\n",
")\n",
"\n",
"data.head()"
@@ -284,22 +295,23 @@
"4 5 2.868217"
]
},
+ "execution_count": 5,
"metadata": {},
- "output_type": "display_data"
+ "output_type": "execute_result"
}
],
"source": [
"# Calculate avg ratings from the training set\n",
- "users_ratings = train.groupby(['UserId'])['Rating'].mean()\n",
+ "users_ratings = train.groupby([\"UserId\"])[\"Rating\"].mean()\n",
"users_ratings = users_ratings.to_frame().reset_index()\n",
- "users_ratings.rename(columns = {'Rating': 'AvgRating'}, inplace = True)\n",
+ "users_ratings.rename(columns={\"Rating\": \"AvgRating\"}, inplace=True)\n",
"\n",
"users_ratings.head()"
]
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 6,
"metadata": {},
"outputs": [
{
@@ -384,15 +396,16 @@
"12219 1 63 2.0 878543196 3.69697"
]
},
+ "execution_count": 6,
"metadata": {},
- "output_type": "display_data"
+ "output_type": "execute_result"
}
],
"source": [
"# Generate prediction for the test set\n",
- "baseline_predictions = pd.merge(test, users_ratings, on=['UserId'], how='inner')\n",
+ "baseline_predictions = pd.merge(test, users_ratings, on=[\"UserId\"], how=\"inner\")\n",
"\n",
- "baseline_predictions.loc[baseline_predictions['UserId'] == 1].head()"
+ "baseline_predictions.loc[baseline_predictions[\"UserId\"] == 1].head()"
]
},
{
@@ -404,7 +417,7 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 7,
"metadata": {},
"outputs": [
{
@@ -419,13 +432,13 @@
}
],
"source": [
- "baseline_predictions = baseline_predictions[['UserId', 'MovieId', 'AvgRating']]\n",
+ "baseline_predictions = baseline_predictions[[\"UserId\", \"MovieId\", \"AvgRating\"]]\n",
"\n",
"cols = {\n",
- " 'col_user': 'UserId',\n",
- " 'col_item': 'MovieId',\n",
- " 'col_rating': 'Rating',\n",
- " 'col_prediction': 'AvgRating',\n",
+ " \"col_user\": \"UserId\",\n",
+ " \"col_item\": \"MovieId\",\n",
+ " \"col_rating\": \"Rating\",\n",
+ " \"col_prediction\": \"AvgRating\",\n",
"}\n",
"\n",
"eval_rmse = rmse(test, baseline_predictions, **cols)\n",
@@ -459,7 +472,7 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 8,
"metadata": {},
"outputs": [
{
@@ -526,19 +539,20 @@
"4 288 371"
]
},
+ "execution_count": 8,
"metadata": {},
- "output_type": "display_data"
+ "output_type": "execute_result"
}
],
"source": [
- "item_counts = train['MovieId'].value_counts().to_frame().reset_index()\n",
- "item_counts.columns = ['MovieId', 'Count']\n",
+ "item_counts = train[\"MovieId\"].value_counts().to_frame().reset_index()\n",
+ "item_counts.columns = [\"MovieId\", \"Count\"]\n",
"item_counts.head()"
]
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 9,
"metadata": {},
"outputs": [
{
@@ -551,7 +565,7 @@
}
],
"source": [
- "user_item_col = ['UserId', 'MovieId']\n",
+ "user_item_col = [\"UserId\", \"MovieId\"]\n",
"\n",
"# Cross join users and items\n",
"test_users = test['UserId'].unique()\n",
@@ -568,7 +582,7 @@
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": 10,
"metadata": {},
"outputs": [
{
@@ -641,8 +655,9 @@
"4 50 419 598"
]
},
+ "execution_count": 10,
"metadata": {},
- "output_type": "display_data"
+ "output_type": "execute_result"
}
],
"source": [
@@ -653,29 +668,27 @@
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "MAP:\t0.055007\n",
- "NDCG@K:\t0.252864\n",
+ "MAP:\t0.055008\n",
+ "NDCG@K:\t0.252867\n",
"Precision@K:\t0.224628\n",
"Recall@K:\t0.111736\n"
]
}
],
"source": [
- "k = 10\n",
- "\n",
- "cols['col_prediction'] = 'Count'\n",
+ "cols[\"col_prediction\"] = \"Count\"\n",
"\n",
- "eval_map = map_at_k(test, baseline_recommendations, k=k, **cols)\n",
- "eval_ndcg = ndcg_at_k(test, baseline_recommendations, k=k, **cols)\n",
- "eval_precision = precision_at_k(test, baseline_recommendations, k=k, **cols)\n",
- "eval_recall = recall_at_k(test, baseline_recommendations, k=k, **cols)\n",
+ "eval_map = map_at_k(test, baseline_recommendations, k=TOP_K, **cols)\n",
+ "eval_ndcg = ndcg_at_k(test, baseline_recommendations, k=TOP_K, **cols)\n",
+ "eval_precision = precision_at_k(test, baseline_recommendations, k=TOP_K, **cols)\n",
+ "eval_recall = recall_at_k(test, baseline_recommendations, k=TOP_K, **cols)\n",
"\n",
"print(\"MAP:\\t%f\" % eval_map,\n",
" \"NDCG@K:\\t%f\" % eval_ndcg,\n",
@@ -699,87 +712,157 @@
},
{
"cell_type": "code",
- "execution_count": 13,
+ "execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
- "application/papermill.record+json": {
- "map": 0.055007342636635974
+ "application/scrapbook.scrap.json+json": {
+ "data": 0.05500831263949166,
+ "encoder": "json",
+ "name": "map",
+ "version": 1
+ }
+ },
+ "metadata": {
+ "scrapbook": {
+ "data": true,
+ "display": false,
+ "name": "map"
}
},
- "metadata": {},
"output_type": "display_data"
},
{
"data": {
- "application/papermill.record+json": {
- "ndcg": 0.25286402361020544
+ "application/scrapbook.scrap.json+json": {
+ "data": 0.2528673962200594,
+ "encoder": "json",
+ "name": "ndcg",
+ "version": 1
+ }
+ },
+ "metadata": {
+ "scrapbook": {
+ "data": true,
+ "display": false,
+ "name": "ndcg"
}
},
- "metadata": {},
"output_type": "display_data"
},
{
"data": {
- "application/papermill.record+json": {
- "precision": 0.22462845010615715
+ "application/scrapbook.scrap.json+json": {
+ "data": 0.22462845010615715,
+ "encoder": "json",
+ "name": "precision",
+ "version": 1
+ }
+ },
+ "metadata": {
+ "scrapbook": {
+ "data": true,
+ "display": false,
+ "name": "precision"
}
},
- "metadata": {},
"output_type": "display_data"
},
{
"data": {
- "application/papermill.record+json": {
- "recall": 0.1117356507425933
+ "application/scrapbook.scrap.json+json": {
+ "data": 0.1117356507425933,
+ "encoder": "json",
+ "name": "recall",
+ "version": 1
+ }
+ },
+ "metadata": {
+ "scrapbook": {
+ "data": true,
+ "display": false,
+ "name": "recall"
}
},
- "metadata": {},
"output_type": "display_data"
},
{
"data": {
- "application/papermill.record+json": {
- "rmse": 1.044885130655045
+ "application/scrapbook.scrap.json+json": {
+ "data": 1.044885130655045,
+ "encoder": "json",
+ "name": "rmse",
+ "version": 1
+ }
+ },
+ "metadata": {
+ "scrapbook": {
+ "data": true,
+ "display": false,
+ "name": "rmse"
}
},
- "metadata": {},
"output_type": "display_data"
},
{
"data": {
- "application/papermill.record+json": {
- "mae": 0.8369250150730534
+ "application/scrapbook.scrap.json+json": {
+ "data": 0.8369250150730534,
+ "encoder": "json",
+ "name": "mae",
+ "version": 1
+ }
+ },
+ "metadata": {
+ "scrapbook": {
+ "data": true,
+ "display": false,
+ "name": "mae"
}
},
- "metadata": {},
"output_type": "display_data"
},
{
"data": {
- "application/papermill.record+json": {
- "exp_var": 0.1364955485850292
+ "application/scrapbook.scrap.json+json": {
+ "data": 0.1364955485850292,
+ "encoder": "json",
+ "name": "exp_var",
+ "version": 1
+ }
+ },
+ "metadata": {
+ "scrapbook": {
+ "data": true,
+ "display": false,
+ "name": "exp_var"
}
},
- "metadata": {},
"output_type": "display_data"
},
{
"data": {
- "application/papermill.record+json": {
- "rsquared": 0.13649128638749664
+ "application/scrapbook.scrap.json+json": {
+ "data": 0.13649128638749664,
+ "encoder": "json",
+ "name": "rsquared",
+ "version": 1
+ }
+ },
+ "metadata": {
+ "scrapbook": {
+ "data": true,
+ "display": false,
+ "name": "rsquared"
}
},
- "metadata": {},
"output_type": "display_data"
}
],
"source": [
"if is_jupyter():\n",
- " # Record results with papermill for unit-tests\n",
- " import papermill as pm\n",
- " import scrapbook as sb\n",
+ " # Record results with papermill and scrapbook for tests\n",
" sb.glue(\"map\", eval_map)\n",
" sb.glue(\"ndcg\", eval_ndcg)\n",
" sb.glue(\"precision\", eval_precision)\n",
@@ -799,20 +882,14 @@
"[[1](https://dl.acm.org/citation.cfm?id=1401944)] Yehuda Koren,\tFactorization meets the neighborhood: a multifaceted collaborative filtering model, KDD '08 pp. 426-434 2008. \n",
"[[2](https://surprise.readthedocs.io/en/stable/basic_algorithms.html)] Surprise lib, Basic algorithms"
]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
}
],
"metadata": {
+ "celltoolbar": "Tags",
"kernelspec": {
- "display_name": "reco_base",
+ "display_name": "Python (recommenders)",
"language": "python",
- "name": "reco_base"
+ "name": "recommenders"
},
"language_info": {
"codemirror_mode": {
@@ -824,9 +901,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.6.8"
+ "version": "3.9.16"
}
},
"nbformat": 4,
"nbformat_minor": 2
-}
\ No newline at end of file
+}
diff --git a/examples/02_model_collaborative_filtering/sar_deep_dive.ipynb b/examples/02_model_collaborative_filtering/sar_deep_dive.ipynb
index 77bcc4e0d2..f46fc49486 100644
--- a/examples/02_model_collaborative_filtering/sar_deep_dive.ipynb
+++ b/examples/02_model_collaborative_filtering/sar_deep_dive.ipynb
@@ -107,36 +107,25 @@
"cell_type": "code",
"execution_count": 1,
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "System version: 3.6.8 |Anaconda, Inc.| (default, Dec 30 2018, 01:22:34) \n",
- "[GCC 7.3.0]\n",
- "Pandas version: 0.24.2\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"# set the environment path to find Recommenders\n",
"import sys\n",
- "\n",
- "import itertools\n",
"import logging\n",
- "import os\n",
- "\n",
+ "import scipy\n",
"import numpy as np\n",
"import pandas as pd\n",
- "import papermill as pm\n",
+ "import scrapbook as sb\n",
"\n",
"from recommenders.datasets import movielens\n",
"from recommenders.datasets.python_splitters import python_stratified_split\n",
"from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k\n",
"from recommenders.models.sar import SAR\n",
"\n",
- "print(\"System version: {}\".format(sys.version))\n",
- "print(\"Pandas version: {}\".format(pd.__version__))"
+ "print(f\"System version: {sys.version}\")\n",
+ "print(f\"Pandas version: {pd.__version__}\")\n",
+ "print(f\"NumPy version: {np.__version__}\")\n",
+ "print(f\"SciPy version: {scipy.__version__}\")"
]
},
{
@@ -149,11 +138,25 @@
},
"outputs": [],
"source": [
- "# top k items to recommend\n",
+ "# Top k items to recommend\n",
"TOP_K = 10\n",
"\n",
"# Select MovieLens data size: 100k, 1m, 10m, or 20m\n",
- "MOVIELENS_DATA_SIZE = '100k'"
+ "MOVIELENS_DATA_SIZE = \"100k\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# set log level to DEBUG\n",
+ "logging.basicConfig(\n",
+ " level=logging.DEBUG,\n",
+ " format=\"%(asctime)s %(levelname)-8s %(message)s\",\n",
+ " datefmt=\"%Y-%m-%d %H:%M:%S\",\n",
+ ")"
]
},
{
@@ -172,112 +175,18 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 4,
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "4.93MB [00:02, 2.36MB/s] \n"
- ]
- },
- {
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " UserId | \n",
- " MovieId | \n",
- " Rating | \n",
- " Timestamp | \n",
- " Title | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 196 | \n",
- " 242 | \n",
- " 3.0 | \n",
- " 881250949 | \n",
- " Kolya (1996) | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 63 | \n",
- " 242 | \n",
- " 3.0 | \n",
- " 875747190 | \n",
- " Kolya (1996) | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 226 | \n",
- " 242 | \n",
- " 5.0 | \n",
- " 883888671 | \n",
- " Kolya (1996) | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 154 | \n",
- " 242 | \n",
- " 3.0 | \n",
- " 879138235 | \n",
- " Kolya (1996) | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 306 | \n",
- " 242 | \n",
- " 5.0 | \n",
- " 876503793 | \n",
- " Kolya (1996) | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " UserId MovieId Rating Timestamp Title\n",
- "0 196 242 3.0 881250949 Kolya (1996)\n",
- "1 63 242 3.0 875747190 Kolya (1996)\n",
- "2 226 242 5.0 883888671 Kolya (1996)\n",
- "3 154 242 3.0 879138235 Kolya (1996)\n",
- "4 306 242 5.0 876503793 Kolya (1996)"
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"data = movielens.load_pandas_df(\n",
" size=MOVIELENS_DATA_SIZE,\n",
- " header=['UserId', 'MovieId', 'Rating', 'Timestamp'],\n",
- " title_col='Title'\n",
+ " header=[\"UserId\", \"MovieId\", \"Rating\", \"Timestamp\"],\n",
+ " title_col=\"Title\",\n",
")\n",
"\n",
- "# Convert the float precision to 32-bit in order to reduce memory consumption \n",
- "data.loc[:, 'Rating'] = data['Rating'].astype(np.float32)\n",
+ "# Convert the float precision to 32-bit in order to reduce memory consumption\n",
+ "data[\"Rating\"] = data[\"Rating\"].astype(np.float32)\n",
"\n",
"data.head()"
]
@@ -293,7 +202,7 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
@@ -308,11 +217,13 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
- "train, test = python_stratified_split(data, ratio=0.75, col_user=header[\"col_user\"], col_item=header[\"col_item\"], seed=42)"
+ "train, test = python_stratified_split(\n",
+ " data, ratio=0.75, col_user=header[\"col_user\"], col_item=header[\"col_item\"], seed=42\n",
+ ")\n"
]
},
{
@@ -331,14 +242,10 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
- "# set log level to INFO\n",
- "logging.basicConfig(level=logging.DEBUG, \n",
- " format='%(asctime)s %(levelname)-8s %(message)s')\n",
- "\n",
"model = SAR(\n",
" similarity_type=\"jaccard\", \n",
" time_decay_coefficient=30, \n",
@@ -350,21 +257,21 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
- "2019-05-28 22:40:09,133 INFO Collecting user affinity matrix\n",
- "2019-05-28 22:40:09,137 INFO Calculating time-decayed affinities\n",
- "2019-05-28 22:40:09,178 INFO Creating index columns\n",
- "2019-05-28 22:40:09,188 INFO Building user affinity sparse matrix\n",
- "2019-05-28 22:40:09,194 INFO Calculating item co-occurrence\n",
- "2019-05-28 22:40:09,412 INFO Calculating item similarity\n",
- "2019-05-28 22:40:09,413 INFO Using jaccard based similarity\n",
- "2019-05-28 22:40:09,534 INFO Done training\n"
+ "2023-07-04 09:49:54 INFO Collecting user affinity matrix\n",
+ "2023-07-04 09:49:54 INFO Calculating time-decayed affinities\n",
+ "2023-07-04 09:49:54 INFO Creating index columns\n",
+ "2023-07-04 09:49:54 INFO Building user affinity sparse matrix\n",
+ "2023-07-04 09:49:54 INFO Calculating item co-occurrence\n",
+ "2023-07-04 09:49:55 INFO Calculating item similarity\n",
+ "2023-07-04 09:49:55 INFO Using jaccard based similarity\n",
+ "2023-07-04 09:49:55 INFO Done training\n"
]
}
],
@@ -374,7 +281,7 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 18,
"metadata": {
"scrolled": true
},
@@ -383,8 +290,8 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "2019-05-28 22:40:09,546 INFO Calculating recommendation scores\n",
- "2019-05-28 22:40:09,641 INFO Removing seen items\n"
+ "2023-07-04 09:49:57 INFO Calculating recommendation scores\n",
+ "2023-07-04 09:49:57 INFO Removing seen items\n"
]
}
],
@@ -401,7 +308,7 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 19,
"metadata": {
"scrolled": true
},
@@ -435,74 +342,74 @@
" \n",
" \n",
" \n",
- " 9424 | \n",
+ " 9420 | \n",
" 943 | \n",
- " 82 | \n",
- " 21.313228 | \n",
- " Jurassic Park (1993) | \n",
+ " 176 | \n",
+ " 21.325644 | \n",
+ " Aliens (1986) | \n",
"
\n",
" \n",
- " 9425 | \n",
+ " 9421 | \n",
" 943 | \n",
- " 403 | \n",
- " 21.158839 | \n",
- " Batman (1989) | \n",
+ " 89 | \n",
+ " 20.901408 | \n",
+ " Blade Runner (1982) | \n",
"
\n",
" \n",
- " 9426 | \n",
+ " 9422 | \n",
" 943 | \n",
- " 568 | \n",
- " 20.962922 | \n",
- " Speed (1994) | \n",
+ " 82 | \n",
+ " 20.688100 | \n",
+ " Jurassic Park (1993) | \n",
"
\n",
" \n",
- " 9428 | \n",
+ " 9423 | \n",
" 943 | \n",
- " 423 | \n",
- " 20.162170 | \n",
- " E.T. the Extra-Terrestrial (1982) | \n",
+ " 172 | \n",
+ " 20.287318 | \n",
+ " Empire Strikes Back, The (1980) | \n",
"
\n",
" \n",
- " 9427 | \n",
+ " 9424 | \n",
" 943 | \n",
- " 89 | \n",
- " 19.890513 | \n",
- " Blade Runner (1982) | \n",
+ " 423 | \n",
+ " 20.256682 | \n",
+ " E.T. the Extra-Terrestrial (1982) | \n",
"
\n",
" \n",
- " 9429 | \n",
+ " 9425 | \n",
" 943 | \n",
- " 393 | \n",
- " 19.832944 | \n",
- " Mrs. Doubtfire (1993) | \n",
+ " 195 | \n",
+ " 20.250996 | \n",
+ " Terminator, The (1984) | \n",
"
\n",
" \n",
- " 9423 | \n",
+ " 9426 | \n",
" 943 | \n",
- " 11 | \n",
- " 19.570244 | \n",
- " Seven (Se7en) (1995) | \n",
+ " 202 | \n",
+ " 20.145059 | \n",
+ " Groundhog Day (1993) | \n",
"
\n",
" \n",
- " 9422 | \n",
+ " 9427 | \n",
" 943 | \n",
- " 71 | \n",
- " 19.553877 | \n",
- " Lion King, The (1994) | \n",
+ " 68 | \n",
+ " 19.983884 | \n",
+ " Crow, The (1994) | \n",
"
\n",
" \n",
- " 9421 | \n",
+ " 9428 | \n",
" 943 | \n",
- " 202 | \n",
- " 19.422129 | \n",
- " Groundhog Day (1993) | \n",
+ " 566 | \n",
+ " 19.820856 | \n",
+ " Clear and Present Danger (1994) | \n",
"
\n",
" \n",
- " 9420 | \n",
+ " 9429 | \n",
" 943 | \n",
- " 238 | \n",
- " 19.115604 | \n",
- " Raising Arizona (1987) | \n",
+ " 550 | \n",
+ " 19.804157 | \n",
+ " Die Hard: With a Vengeance (1995) | \n",
"
\n",
" \n",
"\n",
@@ -510,27 +417,31 @@
],
"text/plain": [
" UserId MovieId Prediction Title\n",
- "9424 943 82 21.313228 Jurassic Park (1993)\n",
- "9425 943 403 21.158839 Batman (1989)\n",
- "9426 943 568 20.962922 Speed (1994)\n",
- "9428 943 423 20.162170 E.T. the Extra-Terrestrial (1982)\n",
- "9427 943 89 19.890513 Blade Runner (1982)\n",
- "9429 943 393 19.832944 Mrs. Doubtfire (1993)\n",
- "9423 943 11 19.570244 Seven (Se7en) (1995)\n",
- "9422 943 71 19.553877 Lion King, The (1994)\n",
- "9421 943 202 19.422129 Groundhog Day (1993)\n",
- "9420 943 238 19.115604 Raising Arizona (1987)"
+ "9420 943 176 21.325644 Aliens (1986)\n",
+ "9421 943 89 20.901408 Blade Runner (1982)\n",
+ "9422 943 82 20.688100 Jurassic Park (1993)\n",
+ "9423 943 172 20.287318 Empire Strikes Back, The (1980)\n",
+ "9424 943 423 20.256682 E.T. the Extra-Terrestrial (1982)\n",
+ "9425 943 195 20.250996 Terminator, The (1984)\n",
+ "9426 943 202 20.145059 Groundhog Day (1993)\n",
+ "9427 943 68 19.983884 Crow, The (1994)\n",
+ "9428 943 566 19.820856 Clear and Present Danger (1994)\n",
+ "9429 943 550 19.804157 Die Hard: With a Vengeance (1995)"
]
},
+ "execution_count": 19,
"metadata": {},
- "output_type": "display_data"
+ "output_type": "execute_result"
}
],
"source": [
- "top_k_with_titles = (top_k.join(data[['MovieId', 'Title']].drop_duplicates().set_index('MovieId'), \n",
- " on='MovieId', \n",
- " how='inner').sort_values(by=['UserId', 'Prediction'], ascending=False))\n",
- "display(top_k_with_titles.head(10))"
+ "top_k_with_titles = top_k.join(\n",
+ " data[[\"MovieId\", \"Title\"]].drop_duplicates().set_index(\"MovieId\"),\n",
+ " on=\"MovieId\",\n",
+ " how=\"inner\",\n",
+ ").sort_values(by=[\"UserId\", \"Prediction\"], ascending=False)\n",
+ "\n",
+ "top_k_with_titles.head(10)"
]
},
{
@@ -541,23 +452,25 @@
"\n",
"It should be known that the recommendation scores generated by multiplying the item similarity matrix $S$ and the user affinity matrix $A$ **DOES NOT** have the same scale with the original explicit ratings in the movielens dataset. That is to say, SAR algorithm is meant for the task of *recommending relevent items to users* rather than *predicting explicit ratings for user-item pairs*. \n",
"\n",
- "To this end, ranking metrics like precision@k, recall@k, etc., are more applicable to evaluate SAR algorithm. The following illustrates how to evaluate SAR model by using the evaluation functions provided in the `recommenders`."
+ "To this end, ranking metrics like precision@k, recall@k, etc., are more applicable for evaluating the SAR algorithm. The following illustrates how to evaluate the SAR model by using the evaluation functions provided in the Recommenders library."
]
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"# all ranking metrics have the same arguments\n",
"args = [test, top_k]\n",
- "kwargs = dict(col_user='UserId', \n",
- " col_item='MovieId', \n",
- " col_rating='Rating', \n",
- " col_prediction='Prediction', \n",
- " relevancy_method='top_k', \n",
- " k=TOP_K)\n",
+ "kwargs = dict(\n",
+ " col_user=\"UserId\",\n",
+ " col_item=\"MovieId\",\n",
+ " col_rating=\"Rating\",\n",
+ " col_prediction=\"Prediction\",\n",
+ " relevancy_method=\"top_k\",\n",
+ " k=TOP_K,\n",
+ ")\n",
"\n",
"eval_map = map_at_k(*args, **kwargs)\n",
"eval_ndcg = ndcg_at_k(*args, **kwargs)\n",
@@ -567,7 +480,7 @@
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": 21,
"metadata": {},
"outputs": [
{
@@ -576,10 +489,10 @@
"text": [
"Model:\n",
"Top K:\t\t 10\n",
- "MAP:\t\t 0.095544\n",
- "NDCG:\t\t 0.350232\n",
- "Precision@K:\t 0.305726\n",
- "Recall@K:\t 0.164690\n"
+ "MAP:\t\t 0.113796\n",
+ "NDCG:\t\t 0.384809\n",
+ "Precision@K:\t 0.331707\n",
+ "Recall@K:\t 0.182571\n"
]
}
],
@@ -592,6 +505,19 @@
" f\"Recall@K:\\t {eval_recall:f}\", sep='\\n')"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Record results for tests - ignore this cell\n",
+ "sb.glue(\"map\", eval_map)\n",
+ "sb.glue(\"ndcg\", eval_ndcg)\n",
+ "sb.glue(\"precision\", eval_precision)\n",
+ "sb.glue(\"recall\", eval_recall)"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {},
@@ -606,10 +532,11 @@
}
],
"metadata": {
+ "celltoolbar": "Tags",
"kernelspec": {
- "display_name": "Python (reco_base)",
+ "display_name": "Python 3 (ipykernel)",
"language": "python",
- "name": "reco_base"
+ "name": "python3"
},
"language_info": {
"codemirror_mode": {
@@ -621,7 +548,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.6.8"
+ "version": "3.9.16"
}
},
"nbformat": 4,
diff --git a/examples/02_model_collaborative_filtering/surprise_svd_deep_dive.ipynb b/examples/02_model_collaborative_filtering/surprise_svd_deep_dive.ipynb
index c5e72c7e26..69310d1bb7 100644
--- a/examples/02_model_collaborative_filtering/surprise_svd_deep_dive.ipynb
+++ b/examples/02_model_collaborative_filtering/surprise_svd_deep_dive.ipynb
@@ -91,17 +91,16 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "System version: 3.6.8 |Anaconda, Inc.| (default, Feb 11 2019, 15:03:47) [MSC v.1915 64 bit (AMD64)]\n",
- "Surprise version: 1.0.6\n"
+ "System version: 3.9.16 (main, May 15 2023, 23:46:34) \n",
+ "[GCC 11.2.0]\n",
+ "Surprise version: 1.1.3\n"
]
}
],
"source": [
- "import sys\n",
-
"import os\n",
+ "import sys\n",
"import surprise\n",
- "import papermill as pm\n",
"import scrapbook as sb\n",
"import pandas as pd\n",
"\n",
@@ -112,8 +111,8 @@
" recall_at_k, get_top_k_items)\n",
"from recommenders.models.surprise.surprise_utils import predict, compute_ranking_predictions\n",
"\n",
- "print(\"System version: {}\".format(sys.version))\n",
- "print(\"Surprise version: {}\".format(surprise.__version__))"
+ "print(f\"System version: {sys.version}\")\n",
+ "print(f\"Surprise version: {surprise.__version__}\")"
]
},
{
@@ -126,6 +125,9 @@
},
"outputs": [],
"source": [
+ "# Top k items to recommend\n",
+ "TOP_K = 10\n",
+ "\n",
"# Select MovieLens data size: 100k, 1m, 10m, or 20m\n",
"MOVIELENS_DATA_SIZE = '100k'"
]
@@ -142,6 +144,13 @@
"execution_count": 3,
"metadata": {},
"outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|████████████████████████████████████████████████████████████████████████████| 4.81k/4.81k [00:07<00:00, 646KB/s]\n"
+ ]
+ },
{
"data": {
"text/html": [
@@ -263,7 +272,7 @@
{
"data": {
"text/plain": [
- ""
+ ""
]
},
"execution_count": 5,
@@ -332,7 +341,7 @@
"Processing epoch 27\n",
"Processing epoch 28\n",
"Processing epoch 29\n",
- "Took 19.879321813583374 seconds for training.\n"
+ "Took 2.276676100009354 seconds for training.\n"
]
}
],
@@ -342,7 +351,7 @@
"with Timer() as train_time:\n",
" svd.fit(train_set)\n",
"\n",
- "print(\"Took {} seconds for training.\".format(train_time.interval))"
+ "print(f\"Took {train_time.interval} seconds for training.\")"
]
},
{
@@ -356,7 +365,7 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 7,
"metadata": {},
"outputs": [
{
@@ -388,33 +397,33 @@
" \n",
" \n",
" 0 | \n",
- " 600.0 | \n",
- " 651.0 | \n",
- " 4.119 | \n",
+ " 877 | \n",
+ " 381 | \n",
+ " 3.698217 | \n",
"
\n",
" \n",
" 1 | \n",
- " 607.0 | \n",
- " 494.0 | \n",
- " 3.728 | \n",
+ " 815 | \n",
+ " 602 | \n",
+ " 3.590957 | \n",
"
\n",
" \n",
" 2 | \n",
- " 875.0 | \n",
- " 1103.0 | \n",
- " 4.225 | \n",
+ " 94 | \n",
+ " 431 | \n",
+ " 3.841149 | \n",
"
\n",
" \n",
" 3 | \n",
- " 648.0 | \n",
- " 238.0 | \n",
- " 4.225 | \n",
+ " 416 | \n",
+ " 875 | \n",
+ " 2.642248 | \n",
"
\n",
" \n",
" 4 | \n",
- " 113.0 | \n",
- " 273.0 | \n",
- " 4.043 | \n",
+ " 500 | \n",
+ " 182 | \n",
+ " 4.384139 | \n",
"
\n",
" \n",
"\n",
@@ -422,14 +431,14 @@
],
"text/plain": [
" userID itemID prediction\n",
- "0 600.0 651.0 4.119\n",
- "1 607.0 494.0 3.728\n",
- "2 875.0 1103.0 4.225\n",
- "3 648.0 238.0 4.225\n",
- "4 113.0 273.0 4.043"
+ "0 877 381 3.698217\n",
+ "1 815 602 3.590957\n",
+ "2 94 431 3.841149\n",
+ "3 416 875 2.642248\n",
+ "4 500 182 4.384139"
]
},
- "execution_count": 8,
+ "execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@@ -450,14 +459,14 @@
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Took 28.51998782157898 seconds for prediction.\n"
+ "Took 16.501801499980502 seconds for prediction.\n"
]
}
],
@@ -465,12 +474,12 @@
"with Timer() as test_time:\n",
" all_predictions = compute_ranking_predictions(svd, train, usercol='userID', itemcol='itemID', remove_seen=True)\n",
" \n",
- "print(\"Took {} seconds for prediction.\".format(test_time.interval))"
+ "print(f\"Took {test_time.interval} seconds for prediction.\")"
]
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": 9,
"metadata": {},
"outputs": [
{
@@ -502,33 +511,33 @@
" \n",
" \n",
" 75000 | \n",
- " 496 | \n",
- " 101 | \n",
- " 2.981 | \n",
+ " 811 | \n",
+ " 755 | \n",
+ " 4.090273 | \n",
"
\n",
" \n",
" 75001 | \n",
- " 496 | \n",
- " 471 | \n",
- " 3.196 | \n",
+ " 811 | \n",
+ " 287 | \n",
+ " 4.557071 | \n",
"
\n",
" \n",
" 75002 | \n",
- " 496 | \n",
- " 121 | \n",
- " 3.282 | \n",
+ " 811 | \n",
+ " 181 | \n",
+ " 4.571596 | \n",
"
\n",
" \n",
" 75003 | \n",
- " 496 | \n",
- " 238 | \n",
- " 3.577 | \n",
+ " 811 | \n",
+ " 96 | \n",
+ " 4.458827 | \n",
"
\n",
" \n",
" 75004 | \n",
- " 496 | \n",
- " 243 | \n",
- " 1.930 | \n",
+ " 811 | \n",
+ " 83 | \n",
+ " 4.559237 | \n",
"
\n",
" \n",
"\n",
@@ -536,14 +545,14 @@
],
"text/plain": [
" userID itemID prediction\n",
- "75000 496 101 2.981\n",
- "75001 496 471 3.196\n",
- "75002 496 121 3.282\n",
- "75003 496 238 3.577\n",
- "75004 496 243 1.930"
+ "75000 811 755 4.090273\n",
+ "75001 811 287 4.557071\n",
+ "75002 811 181 4.571596\n",
+ "75003 811 96 4.458827\n",
+ "75004 811 83 4.559237"
]
},
- "execution_count": 12,
+ "execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@@ -569,22 +578,22 @@
},
{
"cell_type": "code",
- "execution_count": 13,
+ "execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "RMSE:\t\t0.957953\n",
- "MAE:\t\t0.754764\n",
- "rsquared:\t0.286992\n",
- "exp var:\t0.287030\n",
+ "RMSE:\t\t0.948771\n",
+ "MAE:\t\t0.747003\n",
+ "rsquared:\t0.288045\n",
+ "exp var:\t0.288157\n",
"----\n",
- "MAP:\t0.013018\n",
- "NDCG:\t0.099960\n",
- "Precision@K:\t0.095122\n",
- "Recall@K:\t0.032043\n"
+ "MAP:\t\t0.015624\n",
+ "NDCG:\t\t0.110465\n",
+ "Precision@K:\t0.100425\n",
+ "Recall@K:\t0.035267\n"
]
}
],
@@ -594,11 +603,10 @@
"eval_rsquared = rsquared(test, predictions)\n",
"eval_exp_var = exp_var(test, predictions)\n",
"\n",
- "k = 10\n",
- "eval_map = map_at_k(test, all_predictions, col_prediction='prediction', k=k)\n",
- "eval_ndcg = ndcg_at_k(test, all_predictions, col_prediction='prediction', k=k)\n",
- "eval_precision = precision_at_k(test, all_predictions, col_prediction='prediction', k=k)\n",
- "eval_recall = recall_at_k(test, all_predictions, col_prediction='prediction', k=k)\n",
+ "eval_map = map_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)\n",
+ "eval_ndcg = ndcg_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)\n",
+ "eval_precision = precision_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)\n",
+ "eval_recall = recall_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)\n",
"\n",
"\n",
"print(\"RMSE:\\t\\t%f\" % eval_rmse,\n",
@@ -608,65 +616,195 @@
"\n",
"print('----')\n",
"\n",
- "print(\"MAP:\\t%f\" % eval_map,\n",
- " \"NDCG:\\t%f\" % eval_ndcg,\n",
+ "print(\"MAP:\\t\\t%f\" % eval_map,\n",
+ " \"NDCG:\\t\\t%f\" % eval_ndcg,\n",
" \"Precision@K:\\t%f\" % eval_precision,\n",
" \"Recall@K:\\t%f\" % eval_recall, sep='\\n')"
]
},
{
"cell_type": "code",
- "execution_count": 14,
+ "execution_count": 11,
"metadata": {},
"outputs": [
{
- "data": {},
- "metadata": {},
+ "data": {
+ "application/scrapbook.scrap.json+json": {
+ "data": 0.9487710439747563,
+ "encoder": "json",
+ "name": "rmse",
+ "version": 1
+ }
+ },
+ "metadata": {
+ "scrapbook": {
+ "data": true,
+ "display": false,
+ "name": "rmse"
+ }
+ },
"output_type": "display_data"
},
{
- "data": {},
- "metadata": {},
+ "data": {
+ "application/scrapbook.scrap.json+json": {
+ "data": 0.7470034925349859,
+ "encoder": "json",
+ "name": "mae",
+ "version": 1
+ }
+ },
+ "metadata": {
+ "scrapbook": {
+ "data": true,
+ "display": false,
+ "name": "mae"
+ }
+ },
"output_type": "display_data"
},
{
- "data": {},
- "metadata": {},
+ "data": {
+ "application/scrapbook.scrap.json+json": {
+ "data": 0.28804512193443,
+ "encoder": "json",
+ "name": "rsquared",
+ "version": 1
+ }
+ },
+ "metadata": {
+ "scrapbook": {
+ "data": true,
+ "display": false,
+ "name": "rsquared"
+ }
+ },
"output_type": "display_data"
},
{
- "data": {},
- "metadata": {},
+ "data": {
+ "application/scrapbook.scrap.json+json": {
+ "data": 0.28815720397413125,
+ "encoder": "json",
+ "name": "exp_var",
+ "version": 1
+ }
+ },
+ "metadata": {
+ "scrapbook": {
+ "data": true,
+ "display": false,
+ "name": "exp_var"
+ }
+ },
"output_type": "display_data"
},
{
- "data": {},
- "metadata": {},
+ "data": {
+ "application/scrapbook.scrap.json+json": {
+ "data": 0.015624359303961253,
+ "encoder": "json",
+ "name": "map",
+ "version": 1
+ }
+ },
+ "metadata": {
+ "scrapbook": {
+ "data": true,
+ "display": false,
+ "name": "map"
+ }
+ },
"output_type": "display_data"
},
{
- "data": {},
- "metadata": {},
+ "data": {
+ "application/scrapbook.scrap.json+json": {
+ "data": 0.1104645586650869,
+ "encoder": "json",
+ "name": "ndcg",
+ "version": 1
+ }
+ },
+ "metadata": {
+ "scrapbook": {
+ "data": true,
+ "display": false,
+ "name": "ndcg"
+ }
+ },
"output_type": "display_data"
},
{
- "data": {},
- "metadata": {},
+ "data": {
+ "application/scrapbook.scrap.json+json": {
+ "data": 0.10042462845010618,
+ "encoder": "json",
+ "name": "precision",
+ "version": 1
+ }
+ },
+ "metadata": {
+ "scrapbook": {
+ "data": true,
+ "display": false,
+ "name": "precision"
+ }
+ },
"output_type": "display_data"
},
{
- "data": {},
- "metadata": {},
+ "data": {
+ "application/scrapbook.scrap.json+json": {
+ "data": 0.03526739062158758,
+ "encoder": "json",
+ "name": "recall",
+ "version": 1
+ }
+ },
+ "metadata": {
+ "scrapbook": {
+ "data": true,
+ "display": false,
+ "name": "recall"
+ }
+ },
"output_type": "display_data"
},
{
- "data": {},
- "metadata": {},
+ "data": {
+ "application/scrapbook.scrap.json+json": {
+ "data": 2.276676100009354,
+ "encoder": "json",
+ "name": "train_time",
+ "version": 1
+ }
+ },
+ "metadata": {
+ "scrapbook": {
+ "data": true,
+ "display": false,
+ "name": "train_time"
+ }
+ },
"output_type": "display_data"
},
{
- "data": {},
- "metadata": {},
+ "data": {
+ "application/scrapbook.scrap.json+json": {
+ "data": 16.501801499980502,
+ "encoder": "json",
+ "name": "test_time",
+ "version": 1
+ }
+ },
+ "metadata": {
+ "scrapbook": {
+ "data": true,
+ "display": false,
+ "name": "test_time"
+ }
+ },
"output_type": "display_data"
}
],
@@ -699,7 +837,7 @@
"metadata": {
"celltoolbar": "Tags",
"kernelspec": {
- "display_name": "Python 3",
+ "display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@@ -713,9 +851,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.5.5"
+ "version": "3.9.16"
}
},
"nbformat": 4,
"nbformat_minor": 2
-}
\ No newline at end of file
+}