diff --git a/examples/00_quick_start/als_movielens.ipynb b/examples/00_quick_start/als_movielens.ipynb index 059d7f0ead..1f1b1a0bf0 100644 --- a/examples/00_quick_start/als_movielens.ipynb +++ b/examples/00_quick_start/als_movielens.ipynb @@ -2,46 +2,34 @@ "cells": [ { "cell_type": "markdown", - "metadata": {}, "source": [ "Copyright (c) Microsoft Corporation. All rights reserved.\n", "\n", "Licensed under the MIT License." - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "# Running ALS on MovieLens (PySpark)\n", "\n", "Matrix factorization by [ALS](https://spark.apache.org/docs/latest/api/python/_modules/pyspark/ml/recommendation.html#ALS) (Alternating Least Squares) is a well known collaborative filtering algorithm.\n", "\n", "This notebook provides an example of how to utilize and evaluate ALS PySpark ML (DataFrame-based API) implementation, meant for large-scale distributed datasets. We use a smaller dataset in this example to run ALS efficiently on multiple cores of a [Data Science Virtual Machine](https://azure.microsoft.com/en-gb/services/virtual-machines/data-science-virtual-machines/)." - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "**Note**: This notebook requires a PySpark environment to run properly. Please follow the steps in [SETUP.md](../../SETUP.md) to install the PySpark environment." - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "System version: 3.6.8 |Anaconda, Inc.| (default, Dec 30 2018, 01:22:34) \n", - "[GCC 7.3.0]\n", - "Spark version: 2.3.1\n" - ] - } - ], "source": [ "# set the environment path to find Recommenders\n", "import sys\n", @@ -61,74 +49,105 @@ "\n", "print(\"System version: {}\".format(sys.version))\n", "print(\"Spark version: {}\".format(pyspark.__version__))\n" - ] + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "System version: 3.6.8 |Anaconda, Inc.| (default, Dec 30 2018, 01:22:34) \n", + "[GCC 7.3.0]\n", + "Spark version: 2.3.1\n" + ] + } + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "Set the default parameters." - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 2, - "metadata": { - "tags": [ - "parameters" - ] - }, - "outputs": [], "source": [ "# top k items to recommend\n", "TOP_K = 10\n", "\n", "# Select MovieLens data size: 100k, 1m, 10m, or 20m\n", - "MOVIELENS_DATA_SIZE = '100k'" - ] + "MOVIELENS_DATA_SIZE = '100k'\n", + "\n", + "# Column names for the dataset\n", + "COL_USER = \"UserId\"\n", + "COL_ITEM = \"MovieId\"\n", + "COL_RATING = \"Rating\"\n", + "COL_TIMESTAMP = \"Timestamp\"" + ], + "outputs": [], + "metadata": { + "tags": [ + "parameters" + ] + } }, { "cell_type": "markdown", - "metadata": {}, "source": [ "### 0. Set up Spark context\n", "\n", "The following settings work well for debugging locally on VM - change when running on a cluster. We set up a giant single executor with many threads and specify memory cap. 
" - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 4, - "metadata": {}, - "outputs": [], "source": [ "# the following settings work well for debugging locally on VM - change when running on a cluster\n", "# set up a giant single executor with many threads and specify memory cap\n", "spark = start_or_get_spark(\"ALS PySpark\", memory=\"16g\")" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "### 1. Download the MovieLens dataset" - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 5, - "metadata": {}, + "source": [ + "# Note: The DataFrame-based API for ALS currently only supports integers for user and item ids.\n", + "schema = StructType(\n", + " (\n", + " StructField(COL_USER, IntegerType()),\n", + " StructField(COL_ITEM, IntegerType()),\n", + " StructField(COL_RATING, FloatType()),\n", + " StructField(COL_TIMESTAMP, LongType()),\n", + " )\n", + ")\n", + "\n", + "data = movielens.load_spark_df(spark, size=MOVIELENS_DATA_SIZE, schema=schema)\n", + "data.show()" + ], "outputs": [ { - "name": "stderr", "output_type": "stream", + "name": "stderr", "text": [ "100%|██████████| 4.81k/4.81k [00:00<00:00, 19.9kKB/s]\n" ] }, { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ "+------+-------+------+---------+\n", "|UserId|MovieId|Rating|Timestamp|\n", @@ -159,68 +178,53 @@ ] } ], - "source": [ - "# Note: The DataFrame-based API for ALS currently only supports integers for user and item ids.\n", - "schema = StructType(\n", - " (\n", - " StructField(\"UserId\", IntegerType()),\n", - " StructField(\"MovieId\", IntegerType()),\n", - " StructField(\"Rating\", FloatType()),\n", - " StructField(\"Timestamp\", LongType()),\n", - " )\n", - ")\n", - "\n", - "data = movielens.load_spark_df(spark, size=MOVIELENS_DATA_SIZE, schema=schema)\n", - "data.show()" - ] + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "### 2. Split the data using the Spark random splitter provided in utilities" - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 6, - "metadata": {}, + "source": [ + "train, test = spark_random_split(data, ratio=0.75, seed=123)\n", + "print (\"N train\", train.cache().count())\n", + "print (\"N test\", test.cache().count())" + ], "outputs": [ { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ "N train 75193\n", "N test 24807\n" ] } ], - "source": [ - "train, test = spark_random_split(data, ratio=0.75, seed=123)\n", - "print (\"N train\", train.cache().count())\n", - "print (\"N test\", test.cache().count())" - ] + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "### 3. Train the ALS model on the training data, and get the top-k recommendations for our testing data\n", "\n", "To predict movie ratings, we use the rating data in the training set as users' explicit feedback. The hyperparameters used in building the model are referenced from [here](http://mymedialite.net/examples/datasets.html). We do not constrain the latent factors (`nonnegative = False`) in order to allow for both positive and negative preferences towards movies.\n", "Timing will vary depending on the machine being used to train." 
- ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 7, - "metadata": {}, - "outputs": [], "source": [ "header = {\n", - " \"userCol\": \"UserId\",\n", - " \"itemCol\": \"MovieId\",\n", - " \"ratingCol\": \"Rating\",\n", + " \"userCol\": COL_USER,\n", + " \"itemCol\": COL_ITEM,\n", + " \"ratingCol\": COL_RATING,\n", "}\n", "\n", "\n", @@ -234,84 +238,88 @@ " seed=42,\n", " **header\n", ")" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "code", "execution_count": 8, - "metadata": {}, + "source": [ + "with Timer() as train_time:\n", + " model = als.fit(train)\n", + "\n", + "print(\"Took {} seconds for training.\".format(train_time.interval))" + ], "outputs": [ { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ "Took 3.2410509269684553 seconds for training.\n" ] } ], - "source": [ - "with Timer() as train_time:\n", - " model = als.fit(train)\n", - "\n", - "print(\"Took {} seconds for training.\".format(train_time.interval))" - ] + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "In the movie recommendation use case, recommending movies that have been rated by the users do not make sense. Therefore, the rated movies are removed from the recommended items.\n", "\n", "In order to achieve this, we recommend all movies to all users, and then remove the user-movie pairs that exist in the training dataset." - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Took 10.559875106438994 seconds for prediction.\n" - ] - } - ], "source": [ "with Timer() as test_time:\n", "\n", " # Get the cross join of all user-item pairs and score them.\n", - " users = train.select('UserId').distinct()\n", - " items = train.select('MovieId').distinct()\n", + " users = train.select(COL_USER).distinct()\n", + " items = train.select(COL_ITEM).distinct()\n", " user_item = users.crossJoin(items)\n", " dfs_pred = model.transform(user_item)\n", "\n", " # Remove seen items.\n", " dfs_pred_exclude_train = dfs_pred.alias(\"pred\").join(\n", " train.alias(\"train\"),\n", - " (dfs_pred['UserId'] == train['UserId']) & (dfs_pred['MovieId'] == train['MovieId']),\n", + " (dfs_pred[COL_USER] == train[COL_USER]) & (dfs_pred[COL_ITEM] == train[COL_ITEM]),\n", " how='outer'\n", " )\n", "\n", - " top_all = dfs_pred_exclude_train.filter(dfs_pred_exclude_train[\"train.Rating\"].isNull()) \\\n", - " .select('pred.' + 'UserId', 'pred.' + 'MovieId', 'pred.' + \"prediction\")\n", + " top_all = dfs_pred_exclude_train.filter(dfs_pred_exclude_train[f\"train.{COL_RATING}\"].isNull()) \\\n", + " .select('pred.' + COL_USER, 'pred.' + COL_ITEM, 'pred.' 
+ \"prediction\")\n", "\n", " # In Spark, transformations are lazy evaluation\n", " # Use an action to force execute and measure the test time \n", " top_all.cache().count()\n", "\n", "print(\"Took {} seconds for prediction.\".format(test_time.interval))" - ] + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Took 10.559875106438994 seconds for prediction.\n" + ] + } + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 10, - "metadata": {}, + "source": [ + "top_all.show()" + ], "outputs": [ { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ "+------+-------+----------+\n", "|UserId|MovieId|prediction|\n", @@ -342,36 +350,41 @@ ] } ], - "source": [ - "top_all.show()" - ] + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "### 4. Evaluate how well ALS performs" - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 11, - "metadata": {}, - "outputs": [], "source": [ - "rank_eval = SparkRankingEvaluation(test, top_all, k = TOP_K, col_user=\"UserId\", col_item=\"MovieId\", \n", - " col_rating=\"Rating\", col_prediction=\"prediction\", \n", + "rank_eval = SparkRankingEvaluation(test, top_all, k = TOP_K, col_user=COL_USER, col_item=COL_ITEM, \n", + " col_rating=COL_RATING, col_prediction=\"prediction\", \n", " relevancy_method=\"top_k\")" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "code", "execution_count": 12, - "metadata": {}, + "source": [ + "print(\"Model:\\tALS\",\n", + " \"Top K:\\t%d\" % rank_eval.k,\n", + " \"MAP:\\t%f\" % rank_eval.map_at_k(),\n", + " \"NDCG:\\t%f\" % rank_eval.ndcg_at_k(),\n", + " \"Precision@K:\\t%f\" % rank_eval.precision_at_k(),\n", + " \"Recall@K:\\t%f\" % rank_eval.recall_at_k(), sep='\\n')" + ], "outputs": [ { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ "Model:\tALS\n", "Top K:\t10\n", @@ -382,30 +395,27 @@ ] } ], - "source": [ - "print(\"Model:\\tALS\",\n", - " \"Top K:\\t%d\" % rank_eval.k,\n", - " \"MAP:\\t%f\" % rank_eval.map_at_k(),\n", - " \"NDCG:\\t%f\" % rank_eval.ndcg_at_k(),\n", - " \"Precision@K:\\t%f\" % rank_eval.precision_at_k(),\n", - " \"Recall@K:\\t%f\" % rank_eval.recall_at_k(), sep='\\n')" - ] + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "### 5. 
Evaluate rating prediction" - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 13, - "metadata": {}, + "source": [ + "# Generate predicted ratings.\n", + "prediction = model.transform(test)\n", + "prediction.cache().show()\n" + ], "outputs": [ { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ "+------+-------+------+---------+----------+\n", "|UserId|MovieId|Rating|Timestamp|prediction|\n", @@ -436,20 +446,25 @@ ] } ], - "source": [ - "# Generate predicted ratings.\n", - "prediction = model.transform(test)\n", - "prediction.cache().show()\n" - ] + "metadata": {} }, { "cell_type": "code", "execution_count": 14, - "metadata": {}, + "source": [ + "rating_eval = SparkRatingEvaluation(test, prediction, col_user=COL_USER, col_item=COL_ITEM, \n", + " col_rating=COL_RATING, col_prediction=\"prediction\")\n", + "\n", + "print(\"Model:\\tALS rating prediction\",\n", + " \"RMSE:\\t%f\" % rating_eval.rmse(),\n", + " \"MAE:\\t%f\" % rating_eval.mae(),\n", + " \"Explained variance:\\t%f\" % rating_eval.exp_var(),\n", + " \"R squared:\\t%f\" % rating_eval.rsquared(), sep='\\n')" + ], "outputs": [ { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ "Model:\tALS rating prediction\n", "RMSE:\t0.967296\n", @@ -459,22 +474,11 @@ ] } ], - "source": [ - "rating_eval = SparkRatingEvaluation(test, prediction, col_user=\"UserId\", col_item=\"MovieId\", \n", - " col_rating=\"Rating\", col_prediction=\"prediction\")\n", - "\n", - "print(\"Model:\\tALS rating prediction\",\n", - " \"RMSE:\\t%f\" % rating_eval.rmse(),\n", - " \"MAE:\\t%f\" % rating_eval.mae(),\n", - " \"Explained variance:\\t%f\" % rating_eval.exp_var(),\n", - " \"R squared:\\t%f\" % rating_eval.rsquared(), sep='\\n')" - ] + "metadata": {} }, { "cell_type": "code", "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "if is_jupyter():\n", " # Record results with papermill for tests\n", @@ -490,17 +494,19 @@ " sb.glue(\"rsquared\", rating_eval.rsquared())\n", " sb.glue(\"train_time\", train_time.interval)\n", " sb.glue(\"test_time\", test_time.interval)" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "code", "execution_count": 17, - "metadata": {}, - "outputs": [], "source": [ "# cleanup spark instance\n", "spark.stop()" - ] + ], + "outputs": [], + "metadata": {} } ], "metadata": { diff --git a/examples/02_model_collaborative_filtering/als_deep_dive.ipynb b/examples/02_model_collaborative_filtering/als_deep_dive.ipynb index b633257bff..0d90bb65d4 100644 --- a/examples/02_model_collaborative_filtering/als_deep_dive.ipynb +++ b/examples/02_model_collaborative_filtering/als_deep_dive.ipynb @@ -2,32 +2,31 @@ "cells": [ { "cell_type": "markdown", - "metadata": {}, "source": [ "Copyright (c) Microsoft Corporation. All rights reserved.\n", "\n", "Licensed under the MIT License." - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "# Spark Collaborative Filtering (ALS) Deep Dive" - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "Spark MLlib provides a collaborative filtering algorithm that can be used for training a matrix factorization model, which predicts explicit or implicit ratings of users on items for recommendations.\n", "\n", "This notebook presents a deep dive into the Spark collaborative filtering algorithm." 
- ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "## 1 Matrix factorization algorithm\n", "\n", @@ -54,11 +53,11 @@ "Owing to the term of $q_{i}^{T}p_{u}$ the loss function is non-convex. Gradient descent method can be applied but this will incur expensive computations. An Alternating Least Square (ALS) algorithm was therefore developed to overcome this issue. \n", "\n", "The basic idea of ALS is to learn one of $q$ and $p$ at a time for optimization while keeping the other as constant. This makes the objective at each iteration convex and solvable. The alternating between $q$ and $p$ stops when there is convergence to the optimal. It is worth noting that this iterative computation can be parallelised and/or distributed, which makes the algorithm desirable for use cases where the dataset is large and thus the user-item rating matrix is super sparse (as is typical in recommendation scenarios). A comprehensive discussion of ALS and its distributed computation can be found [here](http://stanford.edu/~rezab/classes/cme323/S15/notes/lec14.pdf)." - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "## 2 Spark Mllib implementation\n", "\n", @@ -67,40 +66,28 @@ "* The uniqueness of ALS implementation is that it distributes the matrix factorization model training by using \"Alternating Least Square\" method. \n", "* In the training method, there are parameters that can be selected to control the model performance.\n", "* Both explicit and implicit ratings are supported by Spark ALS model." - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "## 3 Spark ALS based MovieLens recommender\n", "\n", "In the following code, the MovieLens-100K dataset is used to illustrate the ALS algorithm in Spark." - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "**Note**: This notebook requires a PySpark environment to run properly. Please follow the steps in [SETUP.md](https://github.com/Microsoft/Recommenders/blob/master/SETUP.md#dependencies-setup) to install the PySpark environment." 
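+    "\n",
+    "Before turning to Spark, the alternating updates described in Section 1 can be made concrete with a small NumPy sketch (an editorial illustration on a toy dense matrix, not part of this repository): with the item factors held fixed, each user vector is the solution of a regularized least-squares problem, and vice versa.\n",
+    "\n",
+    "```python\n",
+    "import numpy as np\n",
+    "\n",
+    "# Toy data: 4 users x 5 items; 0 marks a missing rating.\n",
+    "R = np.array([[5., 3., 0., 1., 0.],\n",
+    "              [4., 0., 0., 1., 1.],\n",
+    "              [1., 1., 0., 5., 4.],\n",
+    "              [0., 1., 5., 4., 0.]])\n",
+    "mask = R > 0\n",
+    "k, lam = 2, 0.1\n",
+    "rng = np.random.default_rng(42)\n",
+    "P = rng.normal(size=(R.shape[0], k))  # user factors\n",
+    "Q = rng.normal(size=(R.shape[1], k))  # item factors\n",
+    "\n",
+    "for _ in range(15):\n",
+    "    # Solve for each user's factors with the item factors held fixed.\n",
+    "    for u in range(R.shape[0]):\n",
+    "        Qu = Q[mask[u]]\n",
+    "        P[u] = np.linalg.solve(Qu.T @ Qu + lam * np.eye(k), Qu.T @ R[u, mask[u]])\n",
+    "    # Solve for each item's factors with the user factors held fixed.\n",
+    "    for i in range(R.shape[1]):\n",
+    "        Pi = P[mask[:, i]]\n",
+    "        Q[i] = np.linalg.solve(Pi.T @ Pi + lam * np.eye(k), Pi.T @ R[mask[:, i], i])\n",
+    "\n",
+    "print(np.round(P @ Q.T, 1))  # reconstructed rating matrix\n",
+    "```"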
- ] + ], + "metadata": {} }, { "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "System version: 3.5.5 |Anaconda custom (64-bit)| (default, May 13 2018, 21:12:35) \n", - "[GCC 7.2.0]\n", - "Pandas version: 0.23.0\n", - "PySpark version: 2.3.1\n" - ] - } - ], + "execution_count": 1, "source": [ "# set the environment path to find Recommenders\n", "import sys\n", @@ -129,27 +116,51 @@ "print(\"System version: {}\".format(sys.version))\n", "print(\"Pandas version: {}\".format(pd.__version__))\n", "print(\"PySpark version: {}\".format(pyspark.__version__))" - ] + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "System version: 3.6.9 (default, Jan 26 2021, 15:33:00) \n", + "[GCC 8.4.0]\n", + "Pandas version: 1.1.5\n", + "PySpark version: 2.4.8\n" + ] + } + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "Data column names" - ] + ], + "metadata": {} }, { "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [], + "execution_count": 2, "source": [ + "MOVIELENS_DATA_SIZE = \"100k\"\n", + "\n", "COL_USER = \"UserId\"\n", "COL_ITEM = \"MovieId\"\n", "COL_RATING = \"Rating\"\n", "COL_PREDICTION = \"prediction\"\n", - "COL_TIMESTAMP = \"Timestamp\"\n", - "\n", + "COL_TIMESTAMP = \"Timestamp\"" + ], + "outputs": [], + "metadata": { + "tags": [ + "parameters" + ] + } + }, + { + "cell_type": "code", + "execution_count": 3, + "source": [ "schema = StructType(\n", " (\n", " StructField(COL_USER, IntegerType()),\n", @@ -158,102 +169,101 @@ " StructField(COL_TIMESTAMP, LongType()),\n", " )\n", ")" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "Model hyper parameters - these parameters are selected with reference to the benchmarking results [here](http://mymedialite.net/examples/datasets.html)." - ] + ], + "metadata": {} }, { "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [], + "execution_count": 4, "source": [ "RANK = 10\n", "MAX_ITER = 15\n", "REG_PARAM = 0.05" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "Number of recommended items" - ] + ], + "metadata": {} }, { "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [], + "execution_count": 5, "source": [ "K = 10" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "Initialize a Spark session." - ] + ], + "metadata": {} }, { "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [], + "execution_count": 6, "source": [ "spark = start_or_get_spark(\"ALS Deep Dive\", memory=\"16g\")" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "### 3.1 Load and prepare data" - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "Data is read from csv into a Spark DataFrame." 
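+    "\n",
+    "`movielens.load_spark_df` below handles download and parsing. As a rough stand-in (assuming a local copy at the hypothetical path `ml-100k/u.data`; the 100k ratings file is tab-separated with no header), plain Spark could read it directly:\n",
+    "\n",
+    "```python\n",
+    "# Hypothetical local path; schema is the StructType defined above.\n",
+    "dfs = spark.read.csv('ml-100k/u.data', sep='\\t', schema=schema)\n",
+    "```"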
- ] + ], + "metadata": {} }, { "cell_type": "code", - "execution_count": 24, - "metadata": {}, + "execution_count": 7, + "source": [ + "dfs = movielens.load_spark_df(spark=spark, size=MOVIELENS_DATA_SIZE, schema=schema)" + ], "outputs": [ { - "name": "stderr", "output_type": "stream", + "name": "stderr", "text": [ - "../../recommenders/dataset/movielens.py:471: UserWarning: Both schema and header are provided.\n", - " The header argument will be ignored.\n", - " warnings.warn(WARNING_HAVE_SCHEMA_AND_HEADER)\n", - "100%|██████████| 4.81k/4.81k [00:01<00:00, 2.50kKB/s]\n" + "100%|██████████| 4.81k/4.81k [00:00<00:00, 20.5kKB/s]\n" ] } ], - "source": [ - "dfs = movielens.load_spark_df(spark=spark, size=\"100k\", schema=schema)" - ] + "metadata": {} }, { "cell_type": "code", - "execution_count": 25, - "metadata": { - "scrolled": true - }, + "execution_count": 8, + "source": [ + "dfs.show(5)" + ], "outputs": [ { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ "+------+-------+------+---------+\n", "|UserId|MovieId|Rating|Timestamp|\n", @@ -269,45 +279,43 @@ ] } ], - "source": [ - "dfs.show(5)" - ] + "metadata": { + "scrolled": true + } }, { "cell_type": "markdown", - "metadata": {}, "source": [ "Data is then randomly split by 80-20 ratio for training and testing." - ] + ], + "metadata": {} }, { "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [], + "execution_count": 9, "source": [ "dfs_train, dfs_test = spark_random_split(dfs, ratio=0.75, seed=42)" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "### 3.2 Train a movielens model " - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "It is worth noting that Spark ALS model allows dropping cold users to favor a robust evaluation with the testing data. In case there are cold users, Spark ALS implementation allows users to drop cold users in order to make sure evaluations on the prediction results are sound." - ] + ], + "metadata": {} }, { "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [], + "execution_count": 10, "source": [ "als = ALS(\n", " maxIter=MAX_ITER, \n", @@ -320,49 +328,38 @@ ")\n", "\n", "model = als.fit(dfs_train)" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "### 3.3 Prediction with the model\n", "\n", "The trained model can be used to predict ratings with a given test data." - ] + ], + "metadata": {} }, { "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [], + "execution_count": 11, "source": [ "dfs_pred = model.transform(dfs_test).drop(COL_RATING)" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "With the prediction results, the model performance can be evaluated." 
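+    "\n",
+    "The next cells use this repository's `SparkRatingEvaluation` helper. For a single metric, Spark's built-in evaluator is a close equivalent (a sketch reusing the column constants defined earlier):\n",
+    "\n",
+    "```python\n",
+    "from pyspark.ml.evaluation import RegressionEvaluator\n",
+    "\n",
+    "evaluator = RegressionEvaluator(\n",
+    "    metricName='rmse',\n",
+    "    labelCol=COL_RATING,\n",
+    "    predictionCol=COL_PREDICTION,\n",
+    ")\n",
+    "print(evaluator.evaluate(model.transform(dfs_test)))\n",
+    "```"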
- ] + ], + "metadata": {} }, { "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "RMSE score = 0.9697095550242029\n", - "MAE score = 0.7554838330206419\n", - "R2 score = 0.24874053010909036\n", - "Explained variance score = 0.2547961843833687\n" - ] - } - ], + "execution_count": 12, "source": [ "evaluations = SparkRatingEvaluation(\n", " dfs_test, \n", @@ -380,89 +377,89 @@ " \"Explained variance score = {}\".format(evaluations.exp_var()),\n", " sep=\"\\n\"\n", ")" - ] + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "RMSE score = 0.9726930349322086\n", + "MAE score = 0.7565710909806911\n", + "R2 score = 0.24411065820407096\n", + "Explained variance score = 0.249700271662727\n" + ] + } + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "Oftentimes ranking metrics are also of interest to data scientists. Note usually ranking metrics apply to the scenario of recommending a list of items. In our case, the recommended items should be different from those that have been rated by the users. " - ] + ], + "metadata": {} }, { "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+------+-------+----------+\n", - "|UserId|MovieId|prediction|\n", - "+------+-------+----------+\n", - "| 1| 587| 2.9286714|\n", - "| 1| 869| 2.0478792|\n", - "| 1| 1208| 2.349619|\n", - "| 1| 1677| 3.1982298|\n", - "| 2| 80| 2.2628117|\n", - "| 2| 303| 2.9711432|\n", - "| 2| 472| 3.0840402|\n", - "| 2| 582| 4.65145|\n", - "| 2| 838| 1.8449162|\n", - "| 2| 975| 3.177288|\n", - "| 2| 1260| 3.466885|\n", - "| 2| 1325| 1.1348095|\n", - "| 2| 1381| 4.0551796|\n", - "| 2| 1530| 2.1732688|\n", - "| 3| 22| 3.0636034|\n", - "| 3| 57| 2.8428345|\n", - "| 3| 89| 3.459687|\n", - "| 3| 367| 2.3071244|\n", - "| 3| 1091| 1.9453487|\n", - "| 3| 1167| 2.0511415|\n", - "+------+-------+----------+\n", - "only showing top 20 rows\n", - "\n" - ] - } - ], + "execution_count": 13, "source": [ "# Get the cross join of all user-item pairs and score them.\n", - "users = dfs_train.select('UserId').distinct()\n", - "items = dfs_train.select('MovieId').distinct()\n", + "users = dfs_train.select(COL_USER).distinct()\n", + "items = dfs_train.select(COL_ITEM).distinct()\n", "user_item = users.crossJoin(items)\n", "dfs_pred = model.transform(user_item)\n", "\n", "# Remove seen items.\n", "dfs_pred_exclude_train = dfs_pred.alias(\"pred\").join(\n", " dfs_train.alias(\"train\"),\n", - " (dfs_pred['UserId'] == dfs_train['UserId']) & (dfs_pred['MovieId'] == dfs_train['MovieId']),\n", + " (dfs_pred[COL_USER] == dfs_train[COL_USER]) & (dfs_pred[COL_ITEM] == dfs_train[COL_ITEM]),\n", " how='outer'\n", ")\n", "\n", "dfs_pred_final = dfs_pred_exclude_train.filter(dfs_pred_exclude_train[\"train.Rating\"].isNull()) \\\n", - " .select('pred.' + 'UserId', 'pred.' + 'MovieId', 'pred.' + \"prediction\")\n", + " .select('pred.' + COL_USER, 'pred.' + COL_ITEM, 'pred.' 
+ \"prediction\")\n", "\n", "dfs_pred_final.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, + ], "outputs": [ { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ - "Precision@k = 0.04061505832449631\n", - "Recall@k = 0.013571438145917577\n", - "NDCG@k = 0.03699684800440573\n", - "Mean average precision = 0.003702411260039904\n" + "+------+-------+----------+\n", + "|UserId|MovieId|prediction|\n", + "+------+-------+----------+\n", + "| 1| 587| 3.2763875|\n", + "| 1| 869| 1.996331|\n", + "| 1| 1208| 3.0924819|\n", + "| 1| 1677| 3.0549564|\n", + "| 2| 80| 2.2266486|\n", + "| 2| 303| 3.5071766|\n", + "| 2| 472| 2.4076686|\n", + "| 2| 582| 4.137449|\n", + "| 2| 838| 1.6214753|\n", + "| 2| 975| 2.7880914|\n", + "| 2| 1260| 3.155648|\n", + "| 2| 1325| 1.2494813|\n", + "| 2| 1381| 3.712147|\n", + "| 2| 1530| 2.04168|\n", + "| 3| 22| 2.5458775|\n", + "| 3| 57| 1.7472819|\n", + "| 3| 89| 3.85607|\n", + "| 3| 367| 3.2235723|\n", + "| 3| 1091| 1.5452085|\n", + "| 3| 1167| 3.5050836|\n", + "+------+-------+----------+\n", + "only showing top 20 rows\n", + "\n" ] } ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 14, "source": [ "evaluations = SparkRankingEvaluation(\n", " dfs_test, \n", @@ -481,11 +478,23 @@ " \"Mean average precision = {}\".format(evaluations.map_at_k()),\n", " sep=\"\\n\"\n", ")" - ] + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Precision@k = 0.03170731707317073\n", + "Recall@k = 0.012679519170565132\n", + "NDCG@k = 0.02914424248125332\n", + "Mean average precision = 0.0033674440032626088\n" + ] + } + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "### 3.4 Fine tune the model\n", "\n", @@ -498,48 +507,47 @@ "|`maxIters`|Maximum number of iterations|10|The more iterations the better the model converges to the optimal point.|\n", "\n", "It is always a good practice to start model building with default parameter values and then sweep the parameter in a range to find the optimal combination of parameters. The following parameter set is used for training ALS models for comparison study purposes." - ] + ], + "metadata": {} }, { "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [], + "execution_count": 15, "source": [ "param_dict = {\n", " \"rank\": [10, 15, 20],\n", " \"regParam\": [0.001, 0.1, 1.0]\n", "}" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "Generate a dictionary for each parameter combination which can then be fed into model training." - ] + ], + "metadata": {} }, { "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [], + "execution_count": 16, "source": [ "param_grid = generate_param_grid(param_dict)" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "Train models with parameters specified in the parameter grid. Evaluate the model with, for example, the RMSE metric, and then record the metrics for visualization." 
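+    "\n",
+    "(`generate_param_grid`, used above, is a utility from this repository; conceptually it expands `param_dict` into the cartesian product of settings, roughly as sketched here.)\n",
+    "\n",
+    "```python\n",
+    "from itertools import product\n",
+    "\n",
+    "# Rough equivalent of generate_param_grid(param_dict):\n",
+    "param_grid = [dict(zip(param_dict, values)) for values in product(*param_dict.values())]\n",
+    "# -> [{'rank': 10, 'regParam': 0.001}, {'rank': 10, 'regParam': 0.1}, ...]\n",
+    "```"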
- ] + ], + "metadata": {} }, { "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [], + "execution_count": 17, "source": [ "rmse_score = []\n", "\n", @@ -569,163 +577,166 @@ "\n", "rmse_score = [float('%.4f' % x) for x in rmse_score]\n", "rmse_score_array = np.reshape(rmse_score, (len(param_dict[\"rank\"]), len(param_dict[\"regParam\"]))) " - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "code", - "execution_count": 35, - "metadata": {}, - "outputs": [], + "execution_count": 18, "source": [ "rmse_df = pd.DataFrame(data=rmse_score_array, index=pd.Index(param_dict[\"rank\"], name=\"rank\"), \n", " columns=pd.Index(param_dict[\"regParam\"], name=\"reg. parameter\"))" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "code", - "execution_count": 36, - "metadata": {}, + "execution_count": 19, + "source": [ + "fig, ax = plt.subplots()\n", + "sns.heatmap(rmse_df, cbar=False, annot=True, fmt=\".4g\")" + ], "outputs": [ { + "output_type": "execute_result", "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 36, "metadata": {}, - "output_type": "execute_result" + "execution_count": 19 }, { + "output_type": "display_data", "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX8AAAEKCAYAAAD6q1UVAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJzt3XeYU1X+x/H3yUwywzAFZqjSUVSKCD8BCwsMyoLiosgKLBaKAhYWbCC4soC6riIKu6KsIChFBUR2F8Te2MUyUpQiTao4OCK9OS3J+f2REBhghqySBOZ+Xs+Th+Sec5Pvmct8cnNykjHWWkRExFlcsS5ARESiT+EvIuJACn8REQdS+IuIOJDCX0TEgRT+IiIOpPAXEXEghb+IiAMp/EVEHCg+1gUUZ/e1bfTR47NU5Q82xroE+RX2D8+MdQnyK5Qd8aoJp5/O/EVEHEjhLyLiQAp/EREHUviLiDiQwl9ExIEU/iIiDqTwFxFxIIW/iIgDKfxFRBxI4S8i4kAKfxERB1L4i4g4kMJfRMSBFP4iIg6k8BcRcSCFv4iIAyn8RUQcSOEvIuJACn8REQdS+IuIOJDCX0TEgRT+IiIOpPAXEXEghb+IiAMp/EVEHEjhLyLiQAp/EREHUviLiDiQwl9ExIEU/iIiDqTwFxFxoPhYF1AalL1nKJ4Wl+Pft5f9A/qc0O6+rCVJt9wO1g8+H4cnPYd3zSoAEq7qQJnuPQHInT2d/I/eA8DTqi1lut8KLheFS7L4+eUXojcgB+jQPpOxYx8lzuXipZdn8tSY54u016xZjcmTxlKhYjp79+yjZ+9BbN+ew8UXN+T58U+QkpqMz+fjiSfHM2fOfADaZrZk9Og/4/G4+eqrVfTr/wA+n49OndrzyKgh+P0Wr9fLAw+M5LPPl8Ri2KWOp1M/4s9vij18gNwXhp3QHnf+JXja3oi1Fvw+Ct6bgf/7b3HVboCn/S2hfq4KVcmf+xy+9ctw1WmIp10PMC4oyCN/3kTs3h3RHFZUGGttrGs4qd3XtjkzCzuJ+IaNsXm5JN//p5OGP4llIC8XgLjadUkZNop9d/bEJKeQ9vdJ7L+nP2BJ+/uL7L+nHxgXac9OZv89/bAH9lP2vofI//g9vCu+iu7AfqHKH2yMdQklcrlcrF29iKs79iA7O4esL97mllvvZu3aDaE+s2ZO5K23P2TGjDm0zWxJr17d6d1nEPXq1cVay8aNW6hatTKLs96hUeNMDhw4yOaNi2l/dXc2bNjMqJGD+e67bF6eOouyZZM4fPhnAC66qD4zX3uBRhe1idXwT2n/8MxYlxA2V80LoSCPhM53njT8cSdAYT4AplINEm8cRO6EIUX7JJYlaeBYfh43ELwFlBnwNHmzx2J3/UB8s3a4zjmXgvkTozCa06PsiFdNOP007XMaeFevxB48WHyHYPADmMQyHHlWc1/SgsKvl2IPHcQeOkTh10txX3Iprirn4Pvhe+yB/QAULl9GQsszNyzONi2aN2XTpq1s2bKNwsJCXn99Htd16lCkT/369fj4408B+GThZ1zXqT0AGzZsZuPGLQDk5Ozgp527qVgxg4yM8uTn57Nhw2YAPvzwv3S5oSNAKPgByiYlcaaecJ2N/NvWYXMPFd8hGPwAxpMAJ/nZxzdogW/jCvAWBDZYi0koE9gnIQl7aO9prflMoWmfKPFc3oqkXv0w5cpzcFTgDMWVUQH/zp9Cffy7duLKqEDhsi+Jq14TV6Uq+HftxHP5bzDx7liVXuqcU60K32f/ELqdvT2HFs2bFumzcuUautzQkfHPTaFz52tITU0hPb08e/YcDYLmzZrg8bjZtGkr1lrcbjeX/F9jln21ki5drqV6jXNCfa+//moe/8tDVKqYwXXX94r8ICUk7oJmeK7qjimbSt7MMSe0xze8nMKsd0K38xdMJrHHEKy3EPJzyZ0yMprlRo3O/KOk4ItF7LuzJwcfe5gyt94W3HqSV2fWYg8d4vDz40geNpLUp8bj3/Ej+HxRrbc0M+bEn/vxZ+MPDn2M1q0vY8ni92jd6jKys3Pwer2h9ipVKjF16rP07Xt/aN+bb7mbZ54exRefLeDQocN4vUeP2bx579Loojb8/sbbeWTUcdMOElG+9UvJnTCEvNnj8GR2LdJmksvhqlQD36aVoW3uS68hb+YYcv82EO/y/+Bpf3O0S46KiJz5G2PSgIeAzkDF4OafgHnAk9bafcXs1x/oD/BMo3r0qlk1EuXFlHf1SuKqVMOkpuHfvRP3RU1Cba4KFSlctRyAwsWfU7j4cwASru4Efn9M6i2NtmfnUKP60bPy6tW
qkpNT9A29nJwddO3WD4CyZZPocsO1HDgQmNpLSUlm/rzpjBj5FF8uPvo+TNaXy8i8sgsAv23Xmnr16p7w2Is+/ZK6dWuRkVGe3btL53TCmcq/bR2mfCUokwzBqaK4BpfiXbcU/MEn6qQUXJVr4t++CQDv6iwSbx4aq5IjKlJn/q8De4FMa22GtTYDaBvcNqe4nay1k6y1zay1zUpT8LuqVgtdjzu3HiY+HntgP4XLFuNu2hyTnIxJTsbdtDmFyxYDYNLKBf5NTibx2uvJe29BTGovjZYsXc5559Whdu0auN1uunW7njcXvF+kT0ZG+dArhGFDBzJ12iwA3G43c+dM4ZVX3mDu3KLHpGLFDAA8Hg9DBg9g0qQZAJx7bu1Qn6ZNGuHxuBX8UWLKVw5dd1WpDXHxoeAHiG90Bd7VXxzdIfcwJjEJk14FgLi6jfDv2h6tcqMqUnP+ta21o4/dYK39ERhtjLmtmH3OWskPjsB9URNMahrlps0h99WXA//JgPx35uNp2ZqEKzuAz4vNL+Dg6EcAsIcOkjtrOmnjAisJcmdOwx4KnF2WvWMQcXXODW33/5Adg5GVTj6fj3vuHc7bb71GnMvF1GmzWbPmW0aNHMzSZStYsOAD2rS5gscfewiLZdGiLAYOehiArl070arVpaRnlKdnz24A3N73PlasWM3g+++i47XtcLlcTJw4nU8WfgZAlxs6csstN1JY6CUvN4+bbr4rZmMvbRK6DMBVqz4mKYUy946ncOEbod8977KPiK/fnPjGrbB+H3gLyJ87PrSvSauASU3Hv3Xt0Tu0fvLfnExi13ux1g95h8mfPynaw4qKiCz1NMa8D3wITLPW7ghuqwz0Bn5rrW13qvs4m5Z6SlFn+lJPKdnZtNRTThTrpZ7dgQzgP8aYPcaYPcBCIB3oWtKOIiISeRGZ9rHW7gWGBi9FGGP6AC9H4nFFRCQ8sVjq+UgMHlNERI4RqaWeK4trAioX0yYiIlESqdU+lYEOBJZ2HssAn0foMUVEJEyRCv8FQLK1dvnxDcaYhRF6TBERCVOk3vC9vYS2myLxmCIiEj59t4+IiAMp/EVEHEjhLyLiQAp/EREHUviLiDiQwl9ExIEU/iIiDqTwFxFxIIW/iIgDKfxFRBxI4S8i4kAKfxERB1L4i4g4kMJfRMSBFP4iIg6k8BcRcSCFv4iIAyn8RUQcSOEvIuJACn8REQdS+IuIOJDCX0TEgRT+IiIOpPAXEXGg+FgXUJzUaS/HugT5pc5pFesKROQUdOYvIuJACn8REQdS+IuIOJDCX0TEgRT+IiIOpPAXEXEghb+IiAMp/EVEHEjhLyLiQAp/EREHUviLiDiQwl9ExIEU/iIiDqTwFxFxIIW/iIgDKfxFRBxI4S8i4kAKfxERB1L4i4g4kMJfRMSBFP4iIg6k8BcRcSCFv4iIAyn8RUQcSOEvIuJACn8REQdS+IuIOFBY4W+MSTjJtvTTX46IiERDuGf+/zTGuI/cMMZUBT6ITEkiIhJp4Yb/v4E5xpg4Y0xt4D3goUgVJSIikRUfTidr7YvGGA+BJ4HawB3W2s8jWZiIiEROieFvjLn/2JtADWA5cJkx5jJr7dhIFne2GP7Xsfz3s8Wkly/Hv1954YT2xV+tZNCwR6hWtQoA7dpcwV233Ux+fgG9BgyhoLAQn9fHb9v+hj/2vRWArKVf88zzU/D7LUlJiTz+8APUrH5OVMdVmnVon8nYsY8S53Lx0sszeWrM80Xaa9asxuRJY6lQMZ29e/bRs/cgtm/P4eKLG/L8+CdISU3G5/PxxJPjmTNnPgBtM1syevSf8XjcfPXVKvr1fwCfz0e5cmlMfvEZ6tatRX5ePn37P8Dq1etjMexSx9OpH/HnN8UePkDuC8NOaI87/xI8bW/EWgt+HwXvzcD//be4ajfA0/6WUD9Xharkz30O3/pluOo0xNOuBxgXFOSRP28idu+OaA4rKoy1tvhGY0aWtLO19pHTXlFQ4a7NxRd2hlm6fBVJZcrwp8eeLjb8p86cy4QxRX9c1lpyc/NISipDoddLz7sGM+yeO7i4UX2u/UNfnn1yBOfWrsmsfy5g1Zr1PD78gWgN6Vcpc06rWJdQIpfLxdrVi7i6Yw+ys3PI+uJtbrn1btau3RDqM2vmRN56+0NmzJhD28yW9OrVnd59BlGvXl2stWzcuIWqVSuzOOsdGjXO5MCBg2zeuJj2V3dnw4bNjBo5mO++y+blqbMY/cRwDh0+zGN/GccFF5zL+L//lfZXd4/hT6Bk+4dnxrqEsLlqXggFeSR0vvOk4Y87AQrzATCVapB44yByJwwp2iexLEkDx/LzuIHgLaDMgKfJmz0Wu+sH4pu1w3XOuRTMnxiF0ZweZUe8asLpV+KZfyTDvTRp1uQituf872cGxhiSksoA4PV68Xq9GBM4bgY4fPhnAA4eOkzFChmnrV6na9G8KZs2bWXLlm0AvP76PK7r1KFI+NevX48HBo8C4JOFnzH3jSkAbNiwOdQnJ2cHP+3cTcWKGbjd8eTn54faP/zwvwx98I+8PHUW9eufz+inxgOwfv0matWqTqVKFfjpp13RGG6p5t+2DpNWofgOweAHMJ4EOMnJbnyDFvg2rgBvQWCDtZiEMljAJCRhD+09zVWfGcKa8zfGnA8MJjDfH9rHWntlZMoqfVZ8s5Yuve6mUoUMBg/oy3l1awHg8/nodtsgtm3/gR5dfkfjhhcC8Miwe7lr8AgSEzyULZvEa5PGxbL8UuWcalX4PvuH0O3s7Tm0aN60SJ+VK9fQ5YaOjH9uCp07X0Nqagrp6eXZs+doEDRv1gSPx82mTVux1uJ2u7nk/xqz7KuVdOlyLdVrBKbpVq5aww2dO/LZ50to3qwJtWpVp3q1qgr/KIm7oBmeq7pjyqaSN3PMCe3xDS+nMOud0O38BZNJ7DEE6y2E/Fxyp5Q4AXLWCne1zxzga2A4MOSYi4ShwQXn8sHcafxz2gRu+n0nBj30aKgtLi6OudOe56N/zWDVmm/ZsHkrANNn/4t/PP0oH/37FTp3bM9Tz74Yo+pLnyOvro51/PTng0Mfo3Xry1iy+D1at7qM7OwcvF5vqL1KlUpMnfosffveH9r35lvu5pmnR/HFZws4dOgwXq8PgNFPPUe58mksXfI+AwbcxtfLv8Hr80VwhHIs3/ql5E4YQt7scXgyuxZpM8nlcFWqgW/TytA296XXkDdzDLl/G4h3+X/wtL852iVHRbjh77XW/sNau9hau+zIpbjOxpirj7meZoyZYoxZaYx5zRhTuYT9+htjlhpjlk6ePvN/GMaZLbls2dD0TusrWuD1etm7b3+RPqkpyTT/v8Z8mrWUPXv3sX7j5tCrgGuuas3yb9ZEve7Sant2DjWOefO8erWq5Bw3bZeTs4Ou3frRvEUH/jxiNAAHDhwEICUlmfnzpjNi5FN8ufir0D5ZXy4j88ouXN7ydyxalMXGjVsAOHjwEH373U+z5u3p3WcQFStkhKacJHr829ZhyleCMsmhbXENLsW7bin4g0/GSSm4KtfEv30TAN7VWc
TVOD8W5UZcuOH/pjHmbmNMVWNM+pFLCf3/esz1Z4AcoBOwBCj2nRNr7SRrbTNrbbO+PXuEWdqZb9fuPaGzw1Vr1uO3lnJpqezZu48DBw8BkJefT9aSr6lTqwapKSkcOvwzW7dlA/D5kq+pW6tmzOovbZYsXc5559Whdu0auN1uunW7njcXvF+kT0ZG+dArhGFDBzJ12iwA3G43c+dM4ZVX3mDu3AVF9qlYMfC+jMfjYcjgAUyaNAOAtLRU3O7AZyRvv+0mFn36JQeDx10iy5Q/eq7pqlIb4uIh9+jPPr7RFXhXf3F0h9zDmMQkTHpgZV5c3Ub4d22PVrlRFdacP9Ar+O+xUz0WqBvGvs2stU2C18cZY3qV2PssNGTkkyz5eiX79h3gqs63cPftt4amCLrfcC3vf/Ips//1FnHxcSR6PIx5ZBjGGHbu3svDf3kan9+P9Vs6XNmKzJaXAjBq6CDue/hxjMuQmpLMYw/dF8shlio+n4977h3O22+9RpzLxdRps1mz5ltGjRzM0mUrWLDgA9q0uYLHH3sIi2XRoiwGDnoYgK5dO9Gq1aWkZ5SnZ89uANze9z5WrFjN4PvvouO17XC5XEycOJ1PFn4GQP0L6/HyS3/H5/exdu239Os/OGZjL20SugzAVas+JimFMveOp3DhG4GAB7zLPiK+fnPiG7fC+n3gLSB/7vjQviatAiY1Hf/WtUfv0PrJf3MyiV3vxVo/5B0mf/6kaA8rKkpc6vmL79SYbGAsgUUrA4BzbfCBjDErrbWNT3UfZ9NSTynqTF/qKSU7m5Z6yolOy1LPYxljGgENgMQj26y104vp/iKQErw+DagA7DTGVCHwITEREYmhcJd6jgQyCYT/28A1wKfAScO/uM8HWGt/NMZ88osqFRGR0ybcN3xvBK4CfrTW9gEuBk74mucw6YNjIiIxFu60T5611m+M8RpjUoGfKOHNXmPMyuKagGKXeoqISHScMvxNYL3bSmNMOQJz+cuAQ8DiEnarDHQAjv9ctAH0baAiIjF2yvC31lpjTBNr7T7gBWPMu0Cqtba4s3uABUCytfaEN3eNMQt/cbUiInJahDvtk2WMaW6tXWKt3Xqqztba20touync4kREJDLCDf+2wB3GmO+AwwSmb2w46/VFROTME274XxPRKkREJKrC/TOO30W6EBERiZ5w1/mLiEgpovAXEXEghb+IiAMp/EVEHEjhLyLiQAp/EREHUviLiDiQwl9ExIEU/iIiDqTwFxFxIIW/iIgDKfxFRBxI4S8i4kAKfxERB1L4i4g4kMJfRMSBFP4iIg6k8BcRcSCFv4iIAyn8RUQcSOEvIuJACn8REQeKj3UBxfL7Yl2BiEippTN/EREHUviLiDiQwl9ExIEU/iIiDqTwFxFxIIW/iIgDKfxFRBxI4S8i4kAKfxERB1L4i4g4kMJfRMSBFP4iIg6k8BcRcSCFv4iIAyn8RUQcSOEvIuJACn8REQdS+IuIOJDCX0TEgRT+IiIOpPAXEXEghb+IiAMp/EVEHEjhLyLiQAp/EREHUviLiDiQwl9ExIEU/iIiDqTwFxFxIIW/iIgDKfxFRBwoPtYFlAbDn/gb//18Cenl0/j39AkntC/+eiWDHvoL1apWBqBd6yu4q08P8vML6DVwKAUFhfh8fn6b2ZI/3n4zAA8/Po6lK74huWwSAI//6T4urFc3eoMq5Tq0z2Ts2EeJc7l46eWZPDXm+SLtNWtWY/KksVSomM7ePfvo2XsQ27fncPHFDXl+/BOkpCbj8/l44snxzJkzH4Ar2/6GJ58cjsvl4vChw9zW9z42bdrKvff057bbeuD1etm1cw99+9/Ptm3bYzHsUsfTqR/x5zfFHj5A7gvDTmiPO/8SPG1vxFoLfh8F783A//23uGo3wNP+llA/V4Wq5M99Dt/6ZbjqNMTTrgcYFxTkkT9vInbvjmgOKyqMtTbWNZxU4U8bzszCTmLp8m9IKpPInx4fW2z4T535LyY8NbLIdmstubl5JCWVodDrpefdDzLsnv5c3PBCHn58HG2uaE77tr+J1jBOmzLVM2NdQolcLhdrVy/i6o49yM7OIeuLt7nl1rtZu3ZDqM+smRN56+0PmTFjDm0zW9KrV3d69xlEvXp1sdayceMWqlatzOKsd2jUOJP9+w+wZvUiuvy+D+vWbeTOO3rRvHkTbu97H5ltruDLxV+Rm5vHHf170qbN5dx0810x/AmUbP/wzFiXEDZXzQuhII+EzneeNPxxJ0BhPgCmUg0SbxxE7oQhRfskliVp4Fh+HjcQvAWUGfA0ebPHYnf9QHyzdrjOOZeC+ROjMJrTo+yIV004/TTtcxo0a9KItNSU/3k/YwxJSWUA8Hq9eL0+DGEdN/kVWjRvyqZNW9myZRuFhYW8/vo8ruvUoUif+vXr8fHHnwLwycLPuK5TewA2bNjMxo1bAMjJ2cFPO3dTsWIGEHgyT00J/D9IS0shJydwtrjwP5+Tm5sHwJeLl1G9WtXID9Ih/NvWYXMPFd8hGPwAxpMAJznZjW/QAt/GFeAtCGywFpMQ+L00CUnYQ3tPa81nCk37RMmK1evo0vuPVKqQweABt3FenVoA+Hw+uvW9l23bc+hxw7U0bnhBaJ9nX5zBP6bO4rJLLua+O3vj8bhjVX6pck61Knyf/UPodvb2HFo0b1qkz8qVa+hyQ0fGPzeFzp2vITU1hfT08uzZczQImjdrgsfjZtOmrQDcccdg3pw/g9zcPA4cPEjL33Q64bH79O7Bu+99EpmByUnFXdAMz1XdMWVTyZs55oT2+IaXU5j1Tuh2/oLJJPYYgvUWQn4uuVNGnrBPaRCRM39jTJox5kljzDpjzO7gZW1wW7lIPOaZrMH55/HBnJf459TnuOn3v2PQn/4SaouLi2Puy+P5aO5UVq39lg2btwJw7x29ePPVF5j94jj2HzzIlFffiFH1pY8xJ766On7688Ghj9G69WUsWfwerVtdRnZ2Dl6vN9RepUolpk59lr597w/te889/eh03a3UrtuMadNm8/SYoqFx001daHbJxTz9zD8iMCopjm/9UnInDCFv9jg8mV2LtJnkcrgq1cC3aWVom/vSa8ibOYbcvw3Eu/w/eNrfHO2SoyJS0z6vA3uBTGtthrU2A2gb3DanuJ2MMf2NMUuNMUsnT58VodKiL7lsUmh6p/XlzfF6fezdt79In9SUZJo3vYhPv/wKgIoV0jHG4PG46dyxHavWfhv1ukur7dk51Kh+Tuh29WpVQ1M0R+Tk7KBrt340b9GBP48YDcCBAwcBSElJZv686YwY+RRfLg4crwoV0ml8UQMWL/kagNfnzOfyy5uF7u+qK1vx0LBBdO7Sm4KCgoiOT07Ov20dpnwlKJMc2hbX4FK865aC3xfYkJSCq3JN/Ns3AeBdnUVcjfNjUW7ERSr8a1trR1trfzyywVr7o7V2NFCzuJ2stZOstc2stc369vxDhEqLvl2794bODletWY/fbymXlsqevfs5cDAwX5mXn0/W0uXUqVkdgJ279gCBM9KPF2VRr26t2BRfCi1Zupzzz
qtD7do1cLvddOt2PW8ueL9In4yM8qFXCMOGDmTqtMDJiNvtZu6cKbzyyhvMnbsg1H/v3v2kpaVSL7giq91VrVm3LvAGcpMmDZnw/JPc0KUPO3fujsYQJciUrxy67qpSG+Li4Zj3COIbXYF39RdHd8g9jElMwqRXASCubiP8u0rnyqxIzfl/Z4x5EJhmrd0BYIypDPQGvo/QY8bMkFFPseTrVezbf4CruvTi7ttuDk0RdO/ckfcXfsrsf79DXJyLxIQExox6EGMMO3fv4eG/jsPn82Otnw5tW5HZsgUAQx97mr379mOt5YLz6jJy8IBYDrFU8fl83HPvcN5+6zXiXC6mTpvNmjXfMmrkYJYuW8GCBR/Qps0VPP7YQ1gsixZlMXDQwwB07dqJVq0uJT2jPD17dgPg9r73sWLFau64awivz56E32/Zt3cfffs/AMDoJ/5McnJZZs0MrBj5/vvt3NClT2wGX8okdBmAq1Z9TFIKZe4dT+HCNwIBD3iXfUR8/ebEN26F9fvAW0D+3PGhfU1aBUxqOv6ta4/eofWT/+ZkErvei7V+yDtM/vxJ0R5WVERkqacxpjwwDLgeqAxYYAcwHxhtrd1zqvs4m5Z6SlFn+lJPKdnZtNRTThTuUs+InPlba/caY14GPgCyrLWh11nGmKuBdyPxuCIiEp5IrfYZBMwD/gh8Y4y5/pjmv0biMUVEJHyRmvPvB1xirT1kjKkNvGGMqW2t/TvoU0wiIrEWqfCPOzLVY63daozJJPAEUAuFv4hIzEVqqeePxpgmR24Enwh+B1QALorQY4qISJgiFf49gR+P3WCt9VprewKtI/SYIiISpkit9skuoe2zSDymiIiET9/qKSLiQAp/EREHUviLiDiQwl9ExIEU/iIiDqTwFxFxIIW/iIgDKfxFRBxI4S8i4kAKfxERB1L4i4g4kMJfRMSBFP4iIg6k8BcRcSCFv4iIAyn8RUQcSOEvIuJACn8REQdS+IuIOJDCX0TEgRT+IiIOpPAXEXEghb+IiAMp/EVEHEjhLyLiQMZaG+saHMkY099aOynWdcgvo+N39tKxC9CZf+z0j3UB8qvo+J29dOxQ+IuIOJLCX0TEgRT+seP4OceznI7f2UvHDr3hKyLiSDrzFxFxIIX/aWCMudoYs94Ys9EYM+wk7QnGmNnB9i+NMbWPaXsouH29MabDMdtfMsb8ZIz5JjqjkOOFcVxbG2O+MsZ4jTE3xqJGOblT/f6YgGeDx3alMeb/ol1jrCn8fyVjTBzwPHAN0ADoYYxpcFy324G91trzgHHA6OC+DYA/AA2Bq4EJwfsDmBrcJjEQ5nHdBvQGXotudRKGqZT8+3MNUC946Q/8Iwo1nVEU/r9eC2CjtXaztbYAmAVcf1yf64FpwetvAFcZY0xw+yxrbb61dguwMXh/WGv/C+yJxgDkpE55XK21W621KwF/LAqU4oXx+3M9MN0GZAHljDFVo1PdmUHh/+tVA74/5nZ2cNtJ+1hrvcB+ICPMfSU2dGxKN8cfX4X/r2dOsu34JVTF9QlnX4kNHZvSzfHHV+H/62UDNY65XR34obg+xph4II3AS9Jw9pXY0LEp3Rx/fBX+v94SoJ4xpo4xxkPgDdz5x/WZD/QKXr8R+NgGPmAxH/hDcDVQHQJvPi2OUt1SsnCOq5xhr+LzAAADzklEQVS95gM9g6t+LgP2W2tzYl1UNMXHuoCznbXWa4z5I/AeEAe8ZK1dbYx5FFhqrZ0PTAFmGGM2Ejjj/0Nw39XGmNeBNYAXGGCt9QEYY2YCmUAFY0w2MNJaOyXKw3OscI6rMaY58C+gPNDJGPOItbZhDMuWoJP9/gBuAGvtC8DbQEcCiyx+BvrEptLY0Sd8RUQcSNM+IiIOpPAXEXEghb+IiAMp/EVEHEjhLyLiQAp/kTOQMaa3MeacWNchpZfCX84awQ/knDH/Z4Of1o6U3sD/FP4RrkdKGa3zlzNa8G8fvAN8AlwOdAYuAB4BEoBNQB9r7SFjTEdgLLAL+Aqoa6393Snu+13gS6Ap8C3Q01r7szFmBNAJKAN8DtxhrbXGmIXB2y0JfEr0W2A44AF2Azdba3cYY0YBdYCqwPnA/cBlBL5KeDvQyVpbaIy5JFhzcrDu3sH7nhrslxscd4Pj+1lrc46vx1r7zP/y8xUHs9bqossZewFqE/jK5MuCtysA/wXKBm8PBUYAiQS+pbFOcPtMYEEY922BlsHbLwGDg9fTj+k3g0BYAywEJhzTVp6jJ1F9gWeC10cBnxL4VOnFBD5Fek2w7V8EnsTcBIK7YnB7dwKfJD7yOM2C10/Vb0JJ49RFl5Nd9DJRzgbf2cB3rkPg7LkB8FngTyLgAb4ALgQ228DfRYBA+PcP476/t9Z+Frz+CjAIeBpoa4x5EEgC0oHVwJvBfrOP2b86MDv4XfAeYMsxbe/YwNn9KgJfEfFucPsqAk88FwCNgA+CY4kDTvb9MqfqN/sk+4iUSOEvZ4PDx1w3wAfW2h7HdjDGNP2F9338vKc1xiQCEwiceX8fnMJJLKae8cBYG/iun0wCZ/xH5ANYa/3GmEJr7ZHH8hP43TPAamvt5aeo8VT9DhezXaRYZ8ybZyJhygJaGmPOAzDGJBljzgfWAXWP+fvI3cO8v5rGmCOh2oPAVM2RoN9ljEkm8E2sxUkjMDcPR7+5NVzrgYpHHt8Y4zbGHPliuINAShj9RH4Rhb+cVay1Owm8KTrTGLOSwJPBhdbaXOBu4F1jzKfADgJ/MQ1jTDNjzORi7nIt0Ct4X+nAP6y1+4AXCUzP/JvA1zsXZxQwxxiziMAbsf/LWAoIPLGMNsasAJYDVwSbpwIvGGOWE5jmKa6fyC+i1T5Sahhjkm1g1Y8h8MfXN1hrx5XQvzaBN4UbRalEkTOGzvylNOkXPFNeTWA6ZmKM6xE5Y+nMX0TEgXTmLyLiQAp/EREHUviLiDiQwl9ExIEU/iIiDqTwFxFxoP8HkqdScClQ374AAAAASUVORK5CYII=\n", + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAXwAAAEGCAYAAABmXi5tAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAAitElEQVR4nO3deXxU1d3H8c9vMlkhLAnIjmjBRxDRyiKIIFoFRFGkAloQcK2i4AIIWhXQtooIVn1wwaosPoJg2cSFpYWKCBIEAVksyKKBgLILJCSZnOePGQKBJKTKZEju9/16zYuZe86993dzX/nOzblnBnPOISIipZ8v0gWIiEjxUOCLiHiEAl9ExCMU+CIiHqHAFxHxCH+kCyjI3i5tNH2ohKo8Y0OkS5BfYf/gVpEuQX6FMk9PsoLadIUvIuIRCnwREY9Q4IuIeIQCX0TEIxT4IiIeocAXEfEIBb6IiEco8EVEPEKBLyLiEQp8ERGPUOCLiHiEAl9ExCMU+CIiHqHAFxHxCAW+iIhHKPBFRDxCgS8i4hEKfBERj1Dgi4h4hAJfRMQjFPgiIh6hwBcR8QgFvoiIRyjwRUQ8QoEvIuIRCnwREY9Q4IuIeIQCX0TEIxT4IiIeocAXEfEIBb6IiEf4I11AaZBw36NEN26B27+PA/1vP6k9uklL4m65A5yDQIDDY/+XwPrVRNWpS8LdD2PxCbicHDKmvkvWF/MBKPv0y1h8AgC+chXI3rieQyOeKNbjKs3atW3DqFFPE+Xz8fY7E3l+xOg87bVr1+DvY0ZRqXISe/fso2fvfmzblsZFF13A6FeeJbFcWQKBAM8+9wpTpswE4Mo2LRk+/EliYqJZvnw1d9/Tn0AgAMAVrVswcuQwoqP97N61h6uuvrnYj7k0iun0R/znXYI7dID00QNPao86vzExV3XFOQc5ATI/GU/O99/iO6cBMe175vbzVarOkSkvE1i/DN+5DYlp2x3MIDODI9New+3ZWZyHFTbmnIt0Dfna26XNmVlYPvz1G+Ey0inzwOP5Bj5x8ZCRDkBU7XMp88hQDjzUE1+1muAcOTu2YRWTKTd8DAce6oU7fDDP6mX6DyMrZRGZn80phqP59SrP2BDpEgrl8/lYt2Yh7TvcSmpqGksWf0yP2/qwbt2xuidNfIOPPp7HhAlTuLJNS3r16kbv2/tRr965OOfYuHEz1apVYemST2jYqA0HDvzMpo1Ladu+Gxs2bGLokAFs3ZrKO2MnUb58ORZ+NoPrru/ODz9sp3LlZH76aXcEfwKF2z+4VaRLKDLf2edDZgaxne/PN/CJiYXMIwBYldrEdX2Q9Ff65+0TX4aEB1/i8Mg+kJVJfL8XyXhvBG7XdvxNr8FXsy6Z014rhqM5Pco8PckKatOQzmmQvW4V7uDPBXcIhT0AcXHBK30gJy2VnB3bAHB7d5Ozfy9WrnzedeMT8De8hMyUz0932Z7VrOlv+e67LWze/D1ZWVlMnjyDGzq2y9Onfv16zJ+/CID5CxZxQ8e2AGzYsImNGzcDkJa2kx9/2k3lyskkJ1ckMzOTDRs2ATBv3md0vqkDALfechPTp3/CDz9sBzijw76kydm6Hpd+qOAOobAHsJjYfLv4GzQnsOFryMoMLXFYXPCva4tLwP289zRVG3kK/GIS3exyyv1tPGUfe45Drw0/qT2q7vmYP5qcndvzLI9pejnZ3yyH9MPFVWqpV71GVX5IPfZzTt2WRvXqVfP0WbVqLTd1uhaATp2upVy5RJKSKubp07TJxcTERPPdd1vYtWsPfr+fxpc0AqBz5+uoWas6APXqnUuFCuX559wpfLnkE3r00HBOcYqq35T4viOJ6z6II9NfP6ndf2ELsld/kfv6yIwxxPUYRHz/0fgvakXWwhnFWW5YKfCLSdbSzznwUE8OPf8E8d3uzNNmFZIo0/dxDr06PPfq/6iYy39H5uf/LM5SBXh00DO0bt2clKWzad2qOampabnj8QBVq57F2LEvc9ddj3B0WLR7jz6MfGEoixfN4uDBQwQCOQD4/VE0vqQRHW/sSYfr/sCfHnuIevXOjchxeVFgXQrpr/QnY+ILxFzVNU+bla2Ar0ptAhtX5i6LbtGBjHeHkz7yfrJXLCCm/W3FXXLYhCXwzay8mT1nZuvNbI+Z7TazdaFlFQpZ7x4zW2Zmy8Zu2l5QtxIte90qfFWqYYmhoZv4BMo+9hzpE98isGFtnr6WWJ6ouueTtXxJBCotvbZv20GtmtVzX9esUY3t23fk6ZOWtpMuXe+mabN2PPlU8C+y/fsPAJCYWJaZM8bz5FPD+XLp8tx1lnz5FW2u6kyLltezcOGS3OGdbdvSmDN3AYcPp7N7914Wfr6ERo0ahPsw5QQ5W9djFc+ChMTcZVENW5C9LgVyQm/mCYn4qp5NTupGALK/WUxUrfMiUW5YhOsKfzKwF2jjnEtyziUDV4aWTS5oJefcGOdcE+dck97nVi+oW4njq1oj93nUOfWw6Gjcz/vB76fswGfI/Pccspb8+6T1optfQdZXi48bW5TTIWXZ19Stew516tQiOjqarl1v5MNZeW+IJydXxCx472vwoL6MHTcJgOjoaP4x5S3effcDpk79KM86lSsnAxATE8PAAfczZswEAGZ+OJuWlzUjKiqK+Pg4mjX7LevXn9k3tksLS6qS+9xXrQ74o+Hwsftt/gsvI3v1omMrZBzCYuOx5GoARP2mETk/bSuucsMuXNMy6zjn8gxUO+d2AMPN7I4w7TNiyjz4JP4LLsYSy1P+9SmkT34HooI/2sy5M4m+tDWxV7TFBQKQeYSDLz4NQEyLK/HXvwhLLE/Mle0BODz6OQJbglcXMS2vImP6e5E5qFIsEAjw4ENP8PFH7xHl8zF23PusXfsfhg4ZwLKvVjJr1lyuuOIy/vLMYzgcCxcuoW+/PwHQpUtHWrW6lKTkivTsGRweuPOuh1m5cg0DHrmPDtddjc/n4403xjN/QTBI1q/fyOw581mxfB45OTm8/fZE1qz5NmLHX5rE3twX3zkNsIRE4vuPJmv+B+CLAiB72Tz8DS7Ff3Gr4O9ediZHJr+Uu65VqIyVTyZny7pjG8zJ4cjMN4m75eHgUF36oXzH/UuqsEzLNLM5wDxgnHNuZ2hZFaA3cI1z7upTbaMkTcuUvM70aZlSuJI0LVNOFolpmd2AZODfoTH8PcACIAnoEqZ9iohIIcIypOOc2wsMCj3yMLPbgXfCsV8RESlYJKZlDovAPkVEPC8sV/hmtqqgJqBKAW0iIhJG4ZqlUwVoR3Aa5vEM+OLk7iIiEm7hCvxZQFnn3NcnNpjZgjDtU0REChGum7Z3FtL2h3DsU0RECqfv0hER8QgFvoiIRyjwRUQ8QoEvIuIRCnwREY9Q4IuIeIQCX0TEIxT4IiIeocAXEfEIBb6IiEco8EVEPEKBLyLiEQp8ERGPUOCLiHiEAl9ExCMU+CIiHqHAFxHxCAW+iIhHKPBFRDxCgS8i4hEKfBERj1Dgi4h4hAJfRMQjFPgiIh7hj3QBBSk7+s1IlyC/1Iw2ka5ARPKhK3wREY
9Q4IuIeIQCX0TEIxT4IiIeocAXEfEIBb6IiEco8EVEPEKBLyLiEQp8ERGPUOCLiHiEAl9ExCMU+CIiHqHAFxHxCAW+iIhHKPBFRDxCgS8i4hEKfBERj1Dgi4h4hAJfRMQjFPgiIh6hwBcR8QgFvoiIRyjwRUQ8QoEvIuIRCnwREY9Q4IuIeIQCX0TEI4oU+GYWm8+ypNNfjoiIhEtRr/Cnmln00RdmVg2YG56SREQkHIoa+NOByWYWZWZ1gNnAY+EqSkRETj9/UTo55940sxiCwV8H+KNz7osw1iUiIqdZoYFvZo8c/xKoDXwNNDez5s65UWGsTURETqNTXeEnnvB6agHLPe2JZ//GZ1+kkFSxPNPHv3pS+9IVq+j32J+pUa0KAFe3voz7br81tz0QCNDt7oc5q1Iyrz4/BIBBT49gzfqN+P1RNKx/HkMGPkC0v0h/kEkRtGvbhlGjnibK5+Ptdyby/IjRedpr167B38eMolLlJPbu2UfP3v3Yti2N2rVr8MGUt/D5fERH+xk9+h3GvDkBgEt+eyFvvfUi8XFxfPLpv3j4kacAGP7sE1x3/TVkZmayadNW7rzrEfbvP1Dsx1waxXT6I/7zLsEdOkD66IEntUed35iYq7rinIOcAJmfjCfn+2/xndOAmPY9c/v5KlXnyJSXCaxfhu/chsS07Q5mkJnBkWmv4fbsLM7DChtzzkW6hnxl/bjhzCwsH8u+/oaE+Dge/8uoAgN/7MRpuWF+onGTprHm240cPHQ4t89ni1No1bwJAI8OG0Hjixpyy00dwncQp1F8zTaRLqFQPp+PdWsW0r7DraSmprFk8cf0uK0P69ZtyO0zaeIbfPTxPCZMmMKVbVrSq1c3et/ej+joaMyMzMxMypRJYOWKf9HqihtJS9vJ4kWzeOjhp/hy6XJmzZzA/45+m09nz+eaq1vzr/mLCAQCPPvXxwF47PG/RurwT2n/4FaRLqHIfGefD5kZxHa+P9/AJyYWMo8AYFVqE9f1QdJf6Z+3T3wZEh58icMj+0BWJvH9XiTjvRG4XdvxN70GX826ZE57rRiO5vQo8/QkK6itqNMyzzOzMWY2x8z+dfRx+kos2Zpc3JDy5X7ZHz07ftzFZ4tT+P31bfMsb92iKWaGmXFh/fPY+dOu01GqAM2a/pbvvtvC5s3fk5WVxeTJM7ihY7s8ferXr8f8+YsAmL9gETd0DJ6frKwsMjMzAYiNjcXnC/4KVa16FonlEvly6XIAJvzfB9xwQ3sA5s77jEAgAMCSL5dTo0a18B+kR+RsXY9LP1Rwh1DYA1jMSbPLAfA3aE5gw9eQlRla4rC4hOA6cQm4n/eepmojr6izdKYAK4AngIHHPaSIVq5ZT+feD3DvgCFs3Lw1d/nwl8fwSJ87MF/+b8pZ2dl8OHs+l196SXGVWupVr1GVH1K3575O3ZZG9epV8/RZtWotN3W6FoBOna6lXLlEkpIqAlCzZnWWfzWXLZtSGPHCaNLSdlKjelW2pablrr8tNY0aJ2wT4Pbet/Dp7PnhOCwpQFT9psT3HUlc90Ecmf76Se3+C1uQvfrYHJQjM8YQ12MQ8f1H47+oFVkLZxRnuWFV1MDPds695pxb6pz76ujjdBdjZveY2TIzW/b38ZNO9+YjpsF5dZk75W2mjv1f/vD76+n3+J8BWLBoKUkVK3DB/9QtcN0/j3yVxhdfQOOLGhZXuQI8OugZWrduTsrS2bRu1ZzU1LTcq/TU1O1c0vga/qd+S3re1oWzzqpUpG0+Nrgf2dnZvPfe1FN3ltMmsC6F9Ff6kzHxBWKu6pqnzcpWwFelNoGNK3OXRbfoQMa7w0kfeT/ZKxYQ0/624i45bIoa+B+aWR8zq2ZmSUcfBXU2s/bHPS9vZm+Z2Soze8/MqhS0nnNujHOuiXOuyV09b/kvDuPMVrZMAgkJ8UBwqCY7O8DefftZsXotCxZ9SdsudzBw6PMsXb6KQU+/kLveq++8x959B3j0gbsiVXqptH3bDmrVrJ77umaNamzfviNPn7S0nXTpejdNm7XjyaeGA5x0ozUtbSffrPmWyy+/lG3bd1Cj5rGhmho1q7HtuG32vK0r13W4mtt6PhCOQ5IiyNm6Hqt4FiQcG36NatiC7HUpkBN8MychEV/Vs8lJ3QhA9jeLiap1XiTKDYuiBn4vgkM4XwBfhR7LCul//B2pkUAa0BFIAd7478ss2Xbt3svRm+Or135LTo6jQvlyPHxvb/45dRxzprzNiKGP0uySRgx/agAAH3w4m0VLl/P80IG548RyeqQs+5q6dc+hTp1aREdH07XrjXw4a06ePsnJFTELDrMNHtSXseOCf3HWqFGNuLg4ACpUKE/Lls34z3++Y8eOH/n5wM9c2iw49HZb95v58MPZQHBG0IAB99Gpc2/S0zOK6zAFsKRj15e+anXAHw2Hf85d5r/wMrJXLzq2QsYhLDYeSw6+eUf9phE5P20rrnLDrqgfvDrnV+yjiXPu4tDzF82s16/Y1hlp4NDnSVmxmn37D/C7zr3oc0d3srOzAejWqQNzFnzO+9M/ISrKR1xsLCOGPpobJgV5ZuRoqlU5i+73Bt8ATpzKKb9cIBDgwYee4OOP3iPK52PsuPdZu/Y/DB0ygGVfrWTWrLlcccVl/OWZx3A4Fi5cQt9+fwKg/vl1ef75p3AuOGtv1KjX+eab9QA80Pfx3GmZn86ezyefBuc1vPS3PxMbG8unnwTfNL78cjn3PzA4MgdfysTe3BffOQ2whETi+48ma/4H4IsCIHvZPPwNLsV/cStcIADZmRyZ/FLuulahMlY+mZwt645tMCeHIzPfJO6Wh4MXaemH8h33L6mKPC3TzBoCDYC4o8ucc+ML6JsKjCL4Ya37gd+40I7MbJVzrtGp9leSpmVKXmf6tEwpXEmaliknK2xaZpGu8M1sCNCGYOB/DFwLfA7kG/jAmxz7cNY4oBLwk5lVJfhJXRERKWZF/ejmzcBFwArn3O2hG6/vFtTZOTesgOU7zExz0kREIqCodwMznHM5QLaZlQN+BGr9wn3m+2YgIiLhdcorfAveXVxlZhUIDtV8BRwEFheyzqqCmoACp2WKiEj4nDLwnXPOzJo55/YBr5vZp0A551xBoQ7BUG8HnPiZZCM4tVNERIpZUcfwl5tZU+dcinNuSxH6zwLKOue+PrHBzBYUvTwRETldihr4lwLdzWwrcIjglboraHqlc+7OgjbknPvDf12liIj8akUN/Han7iIiImeyon7Sduupe4mIyJlMX9IiIuIRCnwREY9Q4IuIeIQCX0TEIxT4IiIeocAXEfEIBb6IiEco8EVEPEKBLyLiEQp8ERGPUOCLiHiEAl9ExCMU+CIiHqHAFxHxCAW+iIhHKPBFRDxCgS8i4hEKfBERj1Dgi4h4hAJfRMQjFPgiIh6hwBcR8Qh/pAsokC8q0hWIiJQqusIXEfEIBb6IiEco8EVEPEKBLyLiEQp8ERGPUOCLiHiEAl9ExCMU+CIiHqHAFxHxCAW+iIhHKPBFRDxCgS8i4hEKfBERj1Dgi4h4hAJfRMQjF
PgiIh6hwBcR8QgFvoiIRyjwRUQ8QoEvIuIRCnwREY9Q4IuIeIQCX0TEIxT4IiIeocAXEfEIBb6IiEco8EVEPEKBLyLiEQp8ERGPUOCLiHiEAl9ExCP8kS6gNHjir6P4bNFSkipWYPq7r5/UvnT5KvoNHkaNalUBuPqKy7jvju657YFAgG539uOsypV4dcQwAJxzvDxmHHPmf47P56PbTdfRo8uNxXNAHtCubRtGjXqaKJ+Pt9+ZyPMjRudpr127Bn8fM4pKlZPYu2cfPXv3Y9u2NC666AJGv/IsieXKEggEePa5V5gyZSYAV7ZpyfDhTxITE83y5au5+57+BAIB+j9yL7fe2hkAvz+K+ufXo2r1Ruzdu6+4D7vUien0R/znXYI7dID00QNPao86vzExV3XFOQc5ATI/GU/O99/iO6cBMe175vbzVarOkSkvE1i/DN+5DYlp2x3MIDODI9New+3ZWZyHFTbmnIt0DfnK2rXpzCwsH8u+Xk1CfDyPP/NCgYE/duI/csP8ROMmTWXN+g0cPHQ4t8+0j+awdPkq/vKnR/D5fOzeu4/kihXCeRinTXz1VpEuoVA+n491axbSvsOtpKamsWTxx/S4rQ/r1m3I7TNp4ht89PE8JkyYwpVtWtKrVzd6396PevXOxTnHxo2bqVatCkuXfELDRm04cOBnNm1cStv23diwYRNDhwxg69ZU3hk7Kc++r7/uGh7sdzfXtOta3IddZPsHn9nn73i+s8+HzAxiO9+fb+ATEwuZRwCwKrWJ6/og6a/0z9snvgwJD77E4ZF9ICuT+H4vkvHeCNyu7fibXoOvZl0yp71WDEdzepR5epIV1KYhndOgycUXUr5c4i9ad8ePP/HZF0v5fcd2eZa/P+0j7rv9D/h8wVNUUsK+JGjW9Ld8990WNm/+nqysLCZPnsENJ/z869evx/z5iwCYv2ARN3RsC8CGDZvYuHEzAGlpO/nxp91UrpxMcnJFMjMz2bBhEwDz5n1G55s6nLTvbt1uZNL708N4dN6Ss3U9Lv1QwR1CYQ9gMbH5dvE3aE5gw9eQlRla4rC4hOA6cQm4n/eepmojT4FfTFZ+s47Ovfpwb/8n2bhpa+7y4S+9wSN97sQs76n4YVsan/zz33S9ox/39n+SrT9sK+6SS63qNaryQ+r23Nep29KoXr1qnj6rVq3lpk7XAtCp07WUK5dIUlLFPH2aNrmYmJhovvtuC7t27cHv99P4kkYAdO58HTVrVc/TPz4+jnZt2zB12sfhOCwpQFT9psT3HUlc90EcmX7yX+D+C1uQvfqL3NdHZowhrscg4vuPxn9RK7IWzijOcsMqLIFvZuXN7DkzW29me8xst5mtCy2rEI59nska/M9vmPuPcUwd9yp/+H1H+j32NAALFn1JUsUKXHB+vZPWyczKIjYmhslvv8zvO7bnyb++WNxle9qjg56hdevmpCydTetWzUlNTSMQCOS2V616FmPHvsxddz3C0WHR7j36MPKFoSxeNIuDBw8RCOTk2eb117fli8XLNHZfzALrUkh/pT8ZE18g5qq8Q2lWtgK+KrUJbFyZuyy6RQcy3h1O+sj7yV6xgJj2txV3yWETriv8ycBeoI1zLsk5lwxcGVo2uaCVzOweM1tmZsv+Pn5imEorfmXLlCEhIR6A1pc1Izs7m7379rNi1VoWfL6Etr/vxcAhz7H0q5UMGvY8AFUrV+LqK1oCwZu8//luc8TqL222b9tBrZrHrr5r1qjG9u078vRJS9tJl65307RZO558ajgA+/cfACAxsSwzZ4znyaeG8+XS5bnrLPnyK9pc1ZkWLa9n4cIlucM7R3XreoOGcyIoZ+t6rOJZkHBs+DWqYQuy16VATujNPCERX9WzyUndCED2N4uJqnVeJMoNi3AFfh3n3HDnXO5vkXNuh3NuOHB2QSs558Y455o455rc1fPWMJVW/Hbt3pN7Fbh67bfkOEeF8uV4+L7b+ef0d5nzj3GMGDaYZo0vYviQRwG4qnULli4PXnWkrFjN2bVqRKz+0iZl2dfUrXsOderUIjo6mq5db+TDWXPy9ElOrohZ8N7X4EF9GTsuePM1Ojqaf0x5i3ff/YCpUz/Ks07lyskAxMTEMHDA/YwZMyG3rVy5RFq3as7MmbPDeWhyAkuqkvvcV60O+KPh8M+5y/wXXkb26kXHVsg4hMXGY8nVAIj6TSNyfio9w6nhmpa51cweBcY553YCmFkVoDfwQ5j2GTEDhzxHyopV7Nt3gN916kGfO28jOzsbgG43Xcec+Z/z/rSPiPJHERcTw4hhg3PDpCB39ujKoGHPM+H96STExzFs8EPFcCTeEAgEePChJ/j4o/eI8vkYO+591q79D0OHDGDZVyuZNWsuV1xxGX955jEcjoULl9C3358A6NKlI61aXUpSckV69gwOD9x518OsXLmGAY/cR4frrsbn8/HGG+OZv+BYkHS68VrmzvuMw4fTI3LMpVXszX3xndMAS0gkvv9osuZ/AL4oALKXzcPf4FL8F7fCBQKQncmRyS/lrmsVKmPlk8nZsu7YBnNyODLzTeJueTh4kZZ+KN9x/5IqLNMyzawiMBi4EagCOGAnMBMY7pzbc6ptlKRpmZLXmT4tUwpXkqZlyskKm5YZlit859xeM3sHmAsscc4dPNpmZu2BT8OxXxERKVi4Zun0A2YADwDfmNnxHxH9azj2KSIihQvXGP7dQGPn3EEzqwN8YGZ1nHMvAYUPXouISFiEK/B9R4dxnHNbzKwNwdA/GwW+iEhEhGta5k4zu/joi1D4Xw9UAi4M0z5FRKQQ4Qr8nkCeT7I457Kdcz2B1mHap4iIFCJcs3RSC2lbVFCbiIiEj748TUTEIxT4IiIeocAXEfEIBb6IiEco8EVEPEKBLyLiEQp8ERGPUOCLiHiEAl9ExCMU+CIiHqHAFxHxCAW+iIhHKPBFRDxCgS8i4hEKfBERj1Dgi4h4hAJfRMQjFPgiIh6hwBcR8QgFvoiIRyjwRUQ8QoEvIuIRCnwREY9Q4IuIeIQCX0TEI8w5F+kaPMnM7nHOjYl0HfLL6PyVXF4+d7rCj5x7Il2A/Co6fyWXZ8+dAl9ExCMU+CIiHqHAjxxPjiGWIjp/JZdnz51u2oqIeISu8EVEPEKBLyLiEQr808DM2pvZt2a20cwG59Mea2bvh9q/NLM6x7U9Flr+rZm1O27522b2o5l9U0yHIScownltbWbLzSzbzG6ORI2Sv1P9/ljQy6Fzu8rMLinuGiNBgf8rmVkUMBq4FmgA3GpmDU7odiew1zlXF3gRGB5atwFwC3AB0B54NbQ9gLGhZRIBRTyv3wO9gfeKtzopgrEU/vtzLVAv9LgHeK0Yaoo4Bf6v1wzY6Jzb5JzLBCYBN57Q50ZgXOj5B8DvzMxCyyc554445zYDG0Pbwzn3GbCnOA5A8nXK8+qc2+KcWwXkRKJAKVgRfn9uBMa7oCVABTOrVjzVRY4C/9erAfxw3OvU0LJ8+zjnsoH9QHIR15XI0Lkp3Tx5fhX4IiIeocD/9bYBtY57XTO0LN8+ZuYHygO7
i7iuRIbOTenmyfOrwP/1UoB6ZnaOmcUQvAk784Q+M4Feoec3A/9ywU+8zQRuCc3iOYfgDaSlxVS3FK4o51VKrplAz9BsnebAfudcWqSLCjd/pAso6Zxz2Wb2ADAbiALeds6tMbOngWXOuZnAW8AEM9tI8EbSLaF115jZZGAtkA3c75wLAJjZRKANUMnMUoEhzrm3ivnwPKso59XMmgLTgIpARzMb5py7IIJlS0h+vz9ANIBz7nXgY6ADwYkSh4HbI1Np8dJXK4iIeISGdEREPEKBLyLiEQp8ERGPUOCLiHiEAl9ExCMU+CJnIDN7PNI1SOmjaZlSYoS+cM6cc2fEl5WZWdTRz02EYdsHnXNlz5R6pHTQFb6c0cysTug76ccD3wC1zGygmaWEvsd82HF9nwz1/dzMJprZgFNsu7eZzTCzBWa2wcyGHNc23cy+MrM1ZnbPccsPmtlIM1sJtDCzp0K1fGNmY0JvSoS2+aKZLTOzdWbW1Mymhvbz5+O218PMlprZ12b2hplFmdlzQHxo2f8V1C+/ek7LD11KL+ecHnqcsQ+gDsGvH24eet2W4H9CbQQvWGYBrYGmwNdAHJAIbAAGnGLbvYE0gt9cGk/wDaVJqC0p9O/R5cmh1w7oetw2ko57PgHoGHq+ABgeev4gsB2oBsQS/GbGZKA+8CEQHer3KtAz9PzgcdstrF+eevTQo7CHvlpBSoKtLvid5RAM/LbAitDrsgS/gygRmOGcywAyzOzDIm57rnNuN4CZTQUuB5YB/czsplCfWqF97AYCwD+OW/9KM3sUSACSgDUEwxmOfffOamCNC31Xi5ltCm3zcqAxkBL6wyAe+DGfGn9XSL8T6xEpkAJfSoJDxz034Fnn3BvHdzCzh37htk+8ieXMrA1wNdDCOXfYzBYQ/MsBIMMd+76jOIJX202ccz+Y2dDj+gEcCf2bc9zzo6/9oWMZ55x77BQ1FtYvtx6RU9EYvpQ0s4E7zKwsgJnVMLOzgEUEv8AsLtR2fRG3d42ZJZlZPNAptJ3yBP9LysNmdj7QvIB1j4b7rtA+/9v/1/afwM2h+gnVcXaoLcvMoovQT6TIdIUvJYpzbo6Z1QcWh4Y3DgI9nHMpZjYTWAXsJDiMsh/AzO4Nrft6PptcSnBIpCbwrnNumZmtBu41s3XAt8CSfNbDObfPzN4kOMa/g+BXKv83x7LWzJ4A5piZD8gC7ge2ErxPscrMljvnuhfST6TINC1TSg0zK+ucO2hmCcBnwD3OueWF9O9NcDjmgeKqUSSSdIUvpckYM2tAcKhlXGFhL+JFusIXEfEI3bQVEfEIBb6IiEco8EVEPEKBLyLiEQp8ERGP+H8bcalIQKGLvQAAAABJRU5ErkJggg==", "text/plain": [ "
" ] }, - "metadata": {}, - "output_type": "display_data" + "metadata": { + "needs_background": "light" + } } ], - "source": [ - "fig, ax = plt.subplots()\n", - "sns.heatmap(rmse_df, cbar=False, annot=True, fmt=\".4g\")" - ] + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "The calculated RMSE scores can be visualized to comparatively study how model performance is affected by different parameters." - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "It can be seen from this visualization that RMSE first decreases and then increases as rank increases, due to overfitting. When the rank equals 20 and the regularization parameter equals 0.1, the model achieves the lowest RMSE score." - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "### 3.5 Top K recommendation" - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "#### 3.5.1 Top k for all users (items)" - ] + ], + "metadata": {} }, { "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [], + "execution_count": 20, "source": [ "dfs_rec = model.recommendForAllUsers(10)" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "code", - "execution_count": 38, - "metadata": {}, + "execution_count": 21, + "source": [ + "dfs_rec.show(10)" + ], "outputs": [ { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ "+------+--------------------+\n", "|UserId| recommendations|\n", "+------+--------------------+\n", - "| 471|[[814, 3.7504902]...|\n", - "| 463|[[814, 3.1264882]...|\n", - "| 833|[[814, 3.3154674]...|\n", - "| 496|[[814, 3.0553887]...|\n", - "| 148|[[814, 4.030121],...|\n", - "| 540|[[814, 3.866104],...|\n", - "| 392|[[814, 4.1199512]...|\n", - "| 243|[[814, 3.7487845]...|\n", + "| 471|[[814, 3.7504895]...|\n", + "| 463|[[814, 3.1264873]...|\n", + "| 833|[[814, 3.3154662]...|\n", + "| 496|[[814, 3.055388],...|\n", + "| 148|[[814, 4.03012], ...|\n", + "| 540|[[814, 3.8661027]...|\n", + "| 392|[[814, 4.119951],...|\n", + "| 243|[[814, 3.748784],...|\n", "| 623|[[814, 3.9018161]...|\n", - "| 737|[[814, 3.85075], ...|\n", + "| 737|[[814, 3.8507497]...|\n", "+------+--------------------+\n", "only showing top 10 rows\n", "\n" ] } ], - "source": [ - "dfs_rec.show(10)" - ] + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "#### 3.5.2 Top k for a selected set of users (items)" - ] + ], + "metadata": {} }, { "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [], + "execution_count": 22, "source": [ "users = dfs_train.select(als.getUserCol()).distinct().limit(3)\n", "\n", "dfs_rec_subset = model.recommendForUserSubset(users, 10)" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "code", - "execution_count": 40, - "metadata": {}, + "execution_count": 23, + "source": [ + "dfs_rec_subset.show(10)" + ], "outputs": [ { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ "+------+--------------------+\n", "|UserId| recommendations|\n", "+------+--------------------+\n", - "| 471|[[814, 3.7504902]...|\n", - "| 463|[[814, 3.1264882]...|\n", - "| 148|[[814, 4.030121],...|\n", + "| 471|[[814, 3.7504895]...|\n", + "| 463|[[814, 3.1264873]...|\n", + "| 148|[[814, 4.03012], ...|\n", "+------+--------------------+\n", "\n" ] } ], - "source": [ - "dfs_rec_subset.show(10)" - ] + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "#### 3.5.3 Run-time considerations for top-k 
recommendations\n", "\n", @@ -734,28 +745,28 @@ "* Inner products of user-item pairs are calculated individually instead of leveraging matrix block multiplication features which are available in certain contemporary computing acceleration libraries (e.g., BLAS).\n", "\n", "More details about possible optimizations of the top k recommendations in Spark can be found [here](https://engineeringblog.yelp.com/2018/05/scaling-collaborative-filtering-with-pyspark.html)." - ] + ], + "metadata": {} }, { "cell_type": "code", - "execution_count": 41, - "metadata": {}, - "outputs": [], + "execution_count": 24, "source": [ "# cleanup spark instance\n", "spark.stop()" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "## References" - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "1. Yehuda Koren, Robert Bell, and Chris Volinsky, \"Matrix Factorization Techniques for Recommender Systems\n", "\", ACM Computer, Vol. 42, Issue 8, pp 30-37, Aug., 2009.\n", @@ -765,14 +776,18 @@ "4. Seaborn. url: https://seaborn.pydata.org/\n", "5. Scaling collaborative filtering with PySpark. url: https://engineeringblog.yelp.com/2018/05/scaling-collaborative-filtering-with-pyspark.html\n", "6. Matrix Completion via Alternating Least Square (ALS). url: http://stanford.edu/~rezab/classes/cme323/S15/notes/lec14.pdf" - ] + ], + "metadata": {} } ], "metadata": { + "celltoolbar": "Tags", + "interpreter": { + "hash": "7ec2189bea0434770dca7423a25e631e1cca9c4e2b4ff137a82f4dff32ac9607" + }, "kernelspec": { - "display_name": "Python 3 Spark - local", - "language": "python", - "name": "spark-3-python" + "name": "python3", + "display_name": "Python 3.6.9 64-bit ('.env': venv)" }, "language_info": { "codemirror_mode": { @@ -784,7 +799,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.5" + "version": "3.6.9" } }, "nbformat": 4, diff --git a/examples/03_evaluate/als_movielens_diversity_metrics.ipynb b/examples/03_evaluate/als_movielens_diversity_metrics.ipynb index 4b6fa73ac9..289d5b93fd 100644 --- a/examples/03_evaluate/als_movielens_diversity_metrics.ipynb +++ b/examples/03_evaluate/als_movielens_diversity_metrics.ipynb @@ -177,8 +177,8 @@ "output_type": "stream", "name": "stdout", "text": [ - "System version: 3.6.13 |Anaconda, Inc.| (default, Jun 4 2021, 14:25:59) \n", - "[GCC 7.5.0]\n", + "System version: 3.6.9 (default, Jan 26 2021, 15:33:00) \n", + "[GCC 8.4.0]\n", "Spark version: 2.4.8\n" ] } @@ -206,7 +206,9 @@ "# user, item column names\n", "COL_USER=\"UserId\"\n", "COL_ITEM=\"MovieId\"\n", - "COL_RATING=\"Rating\"" + "COL_RATING=\"Rating\"\n", + "COL_TITLE=\"Title\"\n", + "COL_GENRE=\"Genre\"" ], "outputs": [], "metadata": { @@ -259,7 +261,7 @@ " )\n", ")\n", "\n", - "data = movielens.load_spark_df(spark, size=MOVIELENS_DATA_SIZE, schema=schema, title_col=\"title\", genres_col=\"genres\")\n", + "data = movielens.load_spark_df(spark, size=MOVIELENS_DATA_SIZE, schema=schema, title_col=COL_TITLE, genres_col=COL_GENRE)\n", "data.show()" ], "outputs": [ @@ -267,7 +269,7 @@ "output_type": "stream", "name": "stderr", "text": [ - "100%|██████████| 4.81k/4.81k [00:00<00:00, 17.1kKB/s]\n" + "100%|██████████| 4.81k/4.81k [00:00<00:00, 20.1kKB/s]\n" ] }, { @@ -275,7 +277,7 @@ "name": "stdout", "text": [ "+-------+------+------+---------+--------------------+------+\n", - "|MovieId|UserId|Rating|Timestamp| title|genres|\n", + "|MovieId|UserId|Rating|Timestamp| Title| Genre|\n", 
"+-------+------+------+---------+--------------------+------+\n", "| 26| 138| 5.0|879024232|Brothers McMullen...|Comedy|\n", "| 26| 224| 3.0|888104153|Brothers McMullen...|Comedy|\n", @@ -406,7 +408,7 @@ "output_type": "stream", "name": "stdout", "text": [ - "Took 4.012367556002573 seconds for training.\n" + "Took 4.189040212018881 seconds for training.\n" ] } ], @@ -563,9 +565,9 @@ " test_df, \n", " top_all, \n", " k = TOP_K, \n", - " col_user=\"UserId\", \n", - " col_item=\"MovieId\",\n", - " col_rating=\"Rating\", \n", + " col_user=COL_USER, \n", + " col_item=COL_ITEM,\n", + " col_rating=COL_RATING, \n", " col_prediction=\"prediction\",\n", " relevancy_method=\"top_k\"\n", ")\n", @@ -735,15 +737,15 @@ " 100k\n", " random\n", " 10\n", - " 0.016755\n", - " 0.005883\n", - " 0.017849\n", - " 0.001890\n", - " 0.996326\n", - " 10.540834\n", - " 12.133664\n", - " 0.922288\n", - " 0.893001\n", + " 0.016543\n", + " 0.005566\n", + " 0.016373\n", + " 0.001441\n", + " 0.994489\n", + " 10.541850\n", + " 12.136439\n", + " 0.922613\n", + " 0.892511\n", " \n", " \n", "\n", @@ -752,15 +754,15 @@ "text/plain": [ " Data Algo K Precision@k Recall@k NDCG@k Mean average precision \\\n", "1 100k als 10 0.047296 0.016015 0.043097 0.004579 \n", - "2 100k random 10 0.016755 0.005883 0.017849 0.001890 \n", + "2 100k random 10 0.016543 0.005566 0.016373 0.001441 \n", "\n", " catalog_coverage distributional_coverage novelty diversity \\\n", "1 0.385793 7.967257 11.659776 0.892277 \n", - "2 0.996326 10.540834 12.133664 0.922288 \n", + "2 0.994489 10.541850 12.136439 0.922613 \n", "\n", " serendipity \n", "1 0.878733 \n", - "2 0.893001 " + "2 0.892511 " ] }, "metadata": {}, @@ -791,10 +793,10 @@ "source": [ "# Get movie features \"title\" and \"genres\"\n", "movies = (\n", - " data.groupBy(\"MovieId\", \"title\", \"genres\").count()\n", + " data.groupBy(COL_ITEM, COL_TITLE, COL_GENRE).count()\n", " .na.drop() # remove rows with null values\n", - " .withColumn(\"genres\", F.split(F.col(\"genres\"), \"\\|\")) # convert to array of genres\n", - " .withColumn(\"title\", F.regexp_replace(F.col(\"title\"), \"[\\(),:^0-9]\", \"\")) # remove year from title\n", + " .withColumn(COL_GENRE, F.split(F.col(COL_GENRE), \"\\|\")) # convert to array of genres\n", + " .withColumn(COL_TITLE, F.regexp_replace(F.col(COL_TITLE), \"[\\(),:^0-9]\", \"\")) # remove year from title\n", " .drop(\"count\") # remove unused columns\n", ")" ], @@ -806,12 +808,12 @@ "execution_count": 25, "source": [ "# tokenize \"title\" column\n", - "title_tokenizer = Tokenizer(inputCol=\"title\", outputCol=\"title_words\")\n", + "title_tokenizer = Tokenizer(inputCol=COL_TITLE, outputCol=\"title_words\")\n", "tokenized_data = title_tokenizer.transform(movies)\n", "\n", "# remove stop words\n", "remover = StopWordsRemover(inputCol=\"title_words\", outputCol=\"text\")\n", - "clean_data = remover.transform(tokenized_data).drop(\"title\", \"title_words\")" + "clean_data = remover.transform(tokenized_data).drop(COL_TITLE, \"title_words\")" ], "outputs": [], "metadata": {} @@ -827,7 +829,7 @@ "hashed_data = text_hasher.transform(clean_data)\n", "\n", "# step 2: fit a CountVectorizerModel from column \"genres\".\n", - "count_vectorizer = CountVectorizer(inputCol=\"genres\", outputCol=\"genres_features\")\n", + "count_vectorizer = CountVectorizer(inputCol=COL_GENRE, outputCol=\"genres_features\")\n", "count_vectorizer_model = count_vectorizer.fit(hashed_data)\n", "vectorized_data = count_vectorizer_model.transform(hashed_data)\n", "\n", @@ -836,7 +838,7 @@ " 
inputCols=[\"text_features\", \"genres_features\"],\n", " outputCol=\"features\",\n", ")\n", - "feature_data = assembler.transform(vectorized_data).select(\"MovieId\", \"features\")\n", + "feature_data = assembler.transform(vectorized_data).select(COL_ITEM, \"features\")\n", "\n", "feature_data.show(10, False)" ], @@ -845,20 +847,20 @@ "output_type": "stream", "name": "stdout", "text": [ - "+-------+---------------------------------------------+\n", - "|MovieId|features |\n", - "+-------+---------------------------------------------+\n", - "|167 |(1043,[128,544,1025],[1.0,1.0,1.0]) |\n", - "|1343 |(1043,[38,300,1024],[1.0,1.0,1.0]) |\n", - "|1607 |(1043,[592,821,1024],[1.0,1.0,1.0]) |\n", - "|966 |(1043,[389,502,1028],[1.0,1.0,1.0]) |\n", - "|9 |(1043,[11,342,1014,1024],[1.0,1.0,1.0,1.0]) |\n", - "|1230 |(1043,[597,740,902,1025],[1.0,1.0,1.0,1.0]) |\n", - "|1118 |(1043,[702,1025],[1.0,1.0]) |\n", - "|673 |(1043,[169,690,1027,1040],[1.0,1.0,1.0,1.0]) |\n", - "|879 |(1043,[909,1026,1027,1034],[1.0,1.0,1.0,1.0])|\n", - "|66 |(1043,[256,1025,1028],[1.0,1.0,1.0]) |\n", - "+-------+---------------------------------------------+\n", + "+------+---------------------------------------------+\n", + "|ItemId|features |\n", + "+------+---------------------------------------------+\n", + "|167 |(1043,[128,544,1025],[1.0,1.0,1.0]) |\n", + "|1343 |(1043,[38,300,1024],[1.0,1.0,1.0]) |\n", + "|1607 |(1043,[592,821,1024],[1.0,1.0,1.0]) |\n", + "|966 |(1043,[389,502,1028],[1.0,1.0,1.0]) |\n", + "|9 |(1043,[11,342,1014,1024],[1.0,1.0,1.0,1.0]) |\n", + "|1230 |(1043,[597,740,902,1025],[1.0,1.0,1.0,1.0]) |\n", + "|1118 |(1043,[702,1025],[1.0,1.0]) |\n", + "|673 |(1043,[169,690,1027,1040],[1.0,1.0,1.0,1.0]) |\n", + "|879 |(1043,[909,1026,1027,1034],[1.0,1.0,1.0,1.0])|\n", + "|66 |(1043,[256,1025,1028],[1.0,1.0,1.0]) |\n", + "+------+---------------------------------------------+\n", "only showing top 10 rows\n", "\n" ] @@ -926,8 +928,8 @@ "output_type": "stream", "name": "stdout", "text": [ - "0.8978120851519519\n", - "0.8937850286817351\n" + "0.8982144953920664\n", + "0.8941807579293202\n" ] } ], @@ -965,9 +967,8 @@ ], "metadata": { "kernelspec": { - "display_name": "Python (reco_pyspark)", - "language": "python", - "name": "reco_pyspark" + "name": "python3", + "display_name": "Python 3.6.9 64-bit ('.env': venv)" }, "language_info": { "codemirror_mode": { @@ -979,7 +980,10 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.13" + "version": "3.6.9" + }, + "interpreter": { + "hash": "7ec2189bea0434770dca7423a25e631e1cca9c4e2b4ff137a82f4dff32ac9607" } }, "nbformat": 4, diff --git a/examples/04_model_select_and_optimize/tuning_spark_als.ipynb b/examples/04_model_select_and_optimize/tuning_spark_als.ipynb index 0d8cf261ea..e0d839412c 100644 --- a/examples/04_model_select_and_optimize/tuning_spark_als.ipynb +++ b/examples/04_model_select_and_optimize/tuning_spark_als.ipynb @@ -2,23 +2,22 @@ "cells": [ { "cell_type": "markdown", - "metadata": {}, "source": [ "Copyright (c) Microsoft Corporation. All rights reserved.\n", "\n", "Licensed under the MIT License." - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "# Hyperparameter tuning (Spark based recommender)" - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "Hyperparameter tuning for Spark based recommender algorithm is important to select a model with the optimal performance. 
This notebook introduces good practices in performing hyperparameter tuning for building recommender models with the utility functions provided in the [Microsoft/Recommenders](https://github.com/Microsoft/Recommenders.git) repository.\n",
 "\n",
 @@ -26,31 +25,19 @@
 "* Spark native/custom constructs (`ParamGridBuilder`, `TrainValidationSplit`).\n",
 "* `hyperopt` package with the Tree of Parzen Estimators algorithm. \n",
 "* Brute-force random search of parameter values sampled from a pre-defined space. "
- ]
+ ],
+ "metadata": {}
 },
 {
 "cell_type": "markdown",
- "metadata": {},
 "source": [
 "## 0 Global settings and import"
- ]
+ ],
+ "metadata": {}
 },
 {
 "cell_type": "code",
 "execution_count": 1,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "System version: 3.5.5 |Anaconda custom (64-bit)| (default, May 13 2018, 21:12:35) \n",
- "[GCC 7.2.0]\n",
- "Pandas version: 0.23.0\n",
- "PySpark version: 2.3.1\n"
- ]
- }
- ],
 "source": [
 "# set the environment path to find Recommenders\n",
 "%matplotlib notebook\n",
 "\n",
 "import matplotlib\n",
 "import matplotlib.pyplot as plt\n",
 "import sys\n",
 "import pandas as pd\n",
 "import numpy as np\n",
 "\n",
 @@ -90,18 +76,27 @@
 "print(\"System version: {}\".format(sys.version))\n",
 "print(\"Pandas version: {}\".format(pd.__version__))\n",
 "print(\"PySpark version: {}\".format(pyspark.__version__))"
- ]
+ ],
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "System version: 3.5.5 |Anaconda custom (64-bit)| (default, May 13 2018, 21:12:35) \n",
+ "[GCC 7.2.0]\n",
+ "Pandas version: 0.23.0\n",
+ "PySpark version: 2.3.1\n"
+ ]
+ }
+ ],
+ "metadata": {}
 },
 {
 "cell_type": "code",
 "execution_count": 2,
- "metadata": {
- "tags": [
- "parameters"
- ]
- },
- "outputs": [],
 "source": [
+ "MOVIELENS_DATA_SIZE = \"100k\"\n",
+ "\n",
 "NUMBER_CORES = 1\n",
 "NUMBER_ITERATIONS = 25\n",
 "\n",
 @@ -128,138 +123,142 @@
 "\n",
 "RANK = [10, 15, 20, 30, 40]\n",
 "REG = [0.1, 0.01, 0.001, 0.0001, 0.00001]"
- ]
+ ],
+ "outputs": [],
+ "metadata": {
+ "tags": [
+ "parameters"
+ ]
+ }
 },
 {
 "cell_type": "markdown",
- "metadata": {},
 "source": [
 "## 1 Data preparation"
- ]
+ ],
+ "metadata": {}
 },
 {
 "cell_type": "markdown",
- "metadata": {},
 "source": [
 "A Spark session is created. Note that, to study the running time of the different approaches, the Spark session runs in local mode on a single core; this eliminates the impact of parallelization on the parameter tuning comparison. "
- ]
+ ],
+ "metadata": {}
 },
 {
 "cell_type": "code",
 "execution_count": 3,
- "metadata": {},
- "outputs": [],
 "source": [
 "spark = start_or_get_spark(url=\"local[{}]\".format(NUMBER_CORES))"
- ]
+ ],
+ "outputs": [],
+ "metadata": {}
 },
 {
 "cell_type": "markdown",
- "metadata": {},
 "source": [
 "The MovieLens 100k dataset is used for the demonstration."
- ]
+ ],
+ "metadata": {}
 },
 {
 "cell_type": "code",
 "execution_count": 4,
- "metadata": {},
+ "source": [
+ "data = load_spark_df(spark, size=MOVIELENS_DATA_SIZE, header=(COL_USER, COL_ITEM, COL_RATING))"
+ ],
 "outputs": [
 {
- "name": "stderr",
 "output_type": "stream",
+ "name": "stderr",
 "text": [
 "100%|██████████| 4.81k/4.81k [00:01<00:00, 2.47kKB/s]\n"
 ]
 }
 ],
- "source": [
- "data = load_spark_df(spark, size='100k', header=(COL_USER, COL_ITEM, COL_RATING))"
- ]
+ "metadata": {}
 },
 {
 "cell_type": "markdown",
- "metadata": {},
 "source": [
 "To reduce the time spent on the comparative study, 50% of the data is used for the experimentation below."
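One budget detail is worth making explicit before the comparison starts: the grid searched by the Spark constructs and the evaluation budget given to the sampling-based methods are the same size. A minimal sanity check, assuming only the `RANK`, `REG`, and `NUMBER_ITERATIONS` values from the parameters cell above:

```python
# The exhaustive grid has 5 ranks x 5 regularization values = 25 candidates,
# which matches the NUMBER_ITERATIONS budget given to hyperopt and to random
# search, so the timing comparison later is iteration-for-iteration fair.
assert len(RANK) * len(REG) == NUMBER_ITERATIONS
```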
- ]
+ ],
+ "metadata": {}
 },
 {
 "cell_type": "code",
 "execution_count": 5,
- "metadata": {},
- "outputs": [],
 "source": [
 "data, _ = spark_random_split(data, ratio=SUBSET_RATIO)"
- ]
+ ],
+ "outputs": [],
+ "metadata": {}
 },
 {
 "cell_type": "markdown",
- "metadata": {},
 "source": [
 "The dataset is randomly split into 3 subsets with a given split ratio. The hyperparameter tuning is performed on the training and validating data, and the selected optimal recommender is then evaluated on the testing dataset."
- ]
+ ],
+ "metadata": {}
 },
 {
 "cell_type": "code",
 "execution_count": 6,
- "metadata": {},
- "outputs": [],
 "source": [
 "train, valid, test = spark_random_split(data, ratio=[3, 1, 1])"
- ]
+ ],
+ "outputs": [],
+ "metadata": {}
 },
 {
 "cell_type": "markdown",
- "metadata": {},
 "source": [
 "## 2 Hyperparameter tuning with Azure Machine Learning Services"
- ]
+ ],
+ "metadata": {}
 },
 {
 "cell_type": "markdown",
- "metadata": {},
 "source": [
 "The `hyperdrive` module in the [Azure Machine Learning Services](https://azure.microsoft.com/en-us/services/machine-learning-service/) runs [hyperparameter tuning and optimizing for machine learning model selection](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-tune-hyperparameters). At the moment, the service supports running hyperparameter tuning on heterogeneous computing targets such as clusters of commodity compute nodes with or without GPU devices (see the detailed documentation [here](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-set-up-training-targets)). It is feasible to run parameter tuning on a cluster of VM nodes. In this case, the service containerizes individual, independent Spark sessions on each node of the cluster to run the parameter tuning job in parallel, instead of running inside a single Spark session where the training is executed in a distributed manner. \n",
 "\n",
 "Detailed instructions for tuning hyperparameters of non-Spark workloads by using Azure Machine Learning Services can be found in [this](./hypertune_aml_wide_and_deep_quickstart.ipynb) notebook. "
- ]
+ ],
+ "metadata": {}
 },
 {
 "cell_type": "markdown",
- "metadata": {},
 "source": [
 "## 3 Hyperparameter tuning with Spark ML constructs"
- ]
+ ],
+ "metadata": {}
 },
 {
 "cell_type": "markdown",
- "metadata": {},
 "source": [
 "### 3.1 Spark native construct"
- ]
+ ],
+ "metadata": {}
 },
 {
 "cell_type": "markdown",
- "metadata": {},
 "source": [
 "Spark ML lib implements modules such as `CrossValidator` and `TrainValidationSplit` for tuning hyperparameters (see [here](https://spark.apache.org/docs/2.2.0/ml-tuning.html)). However, by default, it does not support custom machine learning algorithms, data splitting methods, and evaluation metrics, like those offered as utility functions in the Recommenders repository. \n",
 "\n",
 "For example, the Spark native construct can be used for tuning a recommender against the `rmse` metric, which is one of the available regression metrics in Spark."
- ]
+ ],
+ "metadata": {}
 },
 {
 "cell_type": "markdown",
- "metadata": {},
 "source": [
 "Firstly, a Spark ALS object needs to be created. In this case, for illustration purposes, it is an ALS model object."
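Before the code, it helps to pin down what the two tuned hyperparameters control. In the standard matrix factorization objective that ALS minimizes (the textbook formulation; Spark may scale the penalty term internally), `rank` is the latent dimension $k$ and `regParam` is the regularization weight $\lambda$:

$$
\min_{X,Y} \sum_{(u,i) \in \mathcal{K}} \left( r_{ui} - \mathbf{x}_u^\top \mathbf{y}_i \right)^2 + \lambda \left( \sum_u \lVert \mathbf{x}_u \rVert^2 + \sum_i \lVert \mathbf{y}_i \rVert^2 \right), \qquad \mathbf{x}_u, \mathbf{y}_i \in \mathbb{R}^k
$$

A larger $k$ makes the factorization more expressive but more prone to overfitting, while a larger $\lambda$ shrinks the factors, which is why these two form the natural axes of the search grid.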
- ]
+ ],
+ "metadata": {}
 },
 {
 "cell_type": "code",
 "execution_count": 7,
- "metadata": {},
- "outputs": [],
 "source": [
 "# NOTE the parameters of interest, rank and regParam, are left unset, \n",
 "# because their values will be assigned in the parameter grid builder.\n",
 "als = ALS(\n",
 @@ -271,41 +270,41 @@
 " nonnegative=False,\n",
 " **HEADER_ALS\n",
 ")"
- ]
+ ],
+ "outputs": [],
+ "metadata": {}
 },
 {
 "cell_type": "markdown",
- "metadata": {},
 "source": [
 "Then, a parameter grid can be defined as follows. Without loss of generality, only `rank` and `regParam` are considered."
- ]
+ ],
+ "metadata": {}
 },
 {
 "cell_type": "code",
 "execution_count": 8,
- "metadata": {},
- "outputs": [],
 "source": [
 "paramGrid = ParamGridBuilder() \\\n",
 " .addGrid(als.rank, RANK) \\\n",
 " .addGrid(als.regParam, REG) \\\n",
 " .build()"
- ]
+ ],
+ "outputs": [],
+ "metadata": {}
 },
 {
 "cell_type": "markdown",
- "metadata": {},
 "source": [
 "Given the settings above, a `TrainValidationSplit` object can be created for fitting the best model in the given parameter range. In this case, the `RegressionEvaluator` uses `RMSE` (its default) as the evaluation metric. \n",
 "\n",
 "Since the data splitter is embedded in the `TrainValidationSplit` object, to make sure the splitting ratio is consistent across the different approaches, the split ratio is set to 0.75, and the training and validating datasets are combined for model fitting. "
- ]
+ ],
+ "metadata": {}
 },
 {
 "cell_type": "code",
 "execution_count": 9,
- "metadata": {},
- "outputs": [],
 "source": [
 "tvs = TrainValidationSplit(\n",
 " estimator=als,\n",
 @@ -317,36 +316,44 @@
 " # are therefore not available here. \n",
 " trainRatio=0.75\n",
 ")"
- ]
+ ],
+ "outputs": [],
+ "metadata": {}
 },
 {
 "cell_type": "code",
 "execution_count": 10,
- "metadata": {},
- "outputs": [],
 "source": [
 "with Timer() as time_spark:\n",
 " # Run TrainValidationSplit, and choose the best set of parameters.\n",
 " # NOTE train and valid are unioned because TrainValidationSplit does the splitting by itself.\n",
 " model = tvs.fit(train.union(valid))\n",
 "\n"
- ]
+ ],
+ "outputs": [],
+ "metadata": {}
 },
 {
 "cell_type": "markdown",
- "metadata": {},
 "source": [
 "The model parameters in the grid and the best metrics can then be returned. 
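As an aside: `CrossValidator`, the other native construct mentioned above, drops in almost unchanged if k-fold validation is preferred over a single split. A hedged sketch, not run in this notebook; it reuses `als` and `paramGrid` from the cells above, and the evaluator settings are assumptions:

```python
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator

# Same estimator and grid as TrainValidationSplit, but each candidate is
# fit numFolds times, so expect roughly numFolds times the work.
cv = CrossValidator(
    estimator=als,
    estimatorParamMaps=paramGrid,
    evaluator=RegressionEvaluator(
        metricName="rmse",        # RMSE, matching the rest of this section
        labelCol=COL_RATING,      # assumes the rating column name used above
        predictionCol="prediction",
    ),
    numFolds=3,                   # an illustrative choice
)
# cv_model = cv.fit(train.union(valid))
```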
" - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 11, - "metadata": {}, + "source": [ + "for idx, item in enumerate(model.getEstimatorParamMaps()):\n", + " print('Run {}:'.format(idx))\n", + " print('\\tValidation Metric: {}'.format(model.validationMetrics[idx]))\n", + " for key, value in item.items():\n", + " print('\\t{0}: {1}'.format(repr(key), value))" + ], "outputs": [ { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ "Run 0:\n", "\tValidation Metric: 1.0505385750367227\n", @@ -451,20 +458,17 @@ ] } ], - "source": [ - "for idx, item in enumerate(model.getEstimatorParamMaps()):\n", - " print('Run {}:'.format(idx))\n", - " print('\\tValidation Metric: {}'.format(model.validationMetrics[idx]))\n", - " for key, value in item.items():\n", - " print('\\t{0}: {1}'.format(repr(key), value))" - ] + "metadata": {} }, { "cell_type": "code", "execution_count": 12, - "metadata": {}, + "source": [ + "model.validationMetrics" + ], "outputs": [ { + "output_type": "execute_result", "data": { "text/plain": [ "[1.0505385750367227,\n", @@ -494,54 +498,49 @@ " 4.426604995574413]" ] }, - "execution_count": 12, "metadata": {}, - "output_type": "execute_result" + "execution_count": 12 } ], - "source": [ - "model.validationMetrics" - ] + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "To get the best model, just do" - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 13, - "metadata": {}, - "outputs": [], "source": [ "model_best_spark = model.bestModel" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "### 3.2 Custom `Estimator`, `Transformer`, and `Evaluator` for Spark ALS\n", "\n", "One can also customize Spark modules to allow tuning hyperparameters for a desired model and evaluation metric, given that the native Spark ALS does not allow tuning hyperparameters for ranking metrics such as precision@k, recall@k, etc. This can be done by creating custom `Estimator`, `Transformer` and `Evaluator`. The benefit is that, after the customization, the tuning process can make use of `trainValidSplit` directly, which distributes the tuning in a Spark session." - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "#### Customized `Estimator` and `Transformer` for top k recommender based on Spark ALS\n", "\n", "The following shows how to implement a PySpark `Estimator` and `Transfomer` for recommending top k items from ALS model. The latter generates top k recommendations from the model object. Both of the two are designed by following the protocol of Spark APIs, to make sure that they can be run with the hyperparameter tuning constructs in Spark." - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 14, - "metadata": {}, - "outputs": [], "source": [ "class ALSTopK(\n", " ALS,\n", @@ -696,22 +695,22 @@ " )\n", " \n", " return topk_recommendation_all.select(self.userCol, labelCol, predictionCol)" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "#### Customized precision@k evaluation metric\n", "\n", "In addition to the custom `Estimator` and `Transformer`, it may also be desired to customize an `Evaluator` to allow \"beyond-rating\" metrics. The codes as following illustrates a precision@k evaluator. Other types of evaluators can be developed in a similar way." 
- ]
+ ],
+ "metadata": {}
 },
 {
 "cell_type": "code",
 "execution_count": 15,
- "metadata": {},
- "outputs": [],
 "source": [
 "# Define a custom Evaluator. Here precision@k is used.\n",
 "class PrecisionAtKEvaluator(Evaluator):\n",
 @@ -733,20 +732,20 @@
 "\n",
 " def isLargerBetter(self):\n",
 " return True"
- ]
+ ],
+ "outputs": [],
+ "metadata": {}
 },
 {
 "cell_type": "markdown",
- "metadata": {},
 "source": [
 "Then a new ALS top-k recommender can be created, and the Spark native construct, the `TrainValidationSplit` module, can be used to find the optimal model w.r.t. the precision@k metric."
- ]
+ ],
+ "metadata": {}
 },
 {
 "cell_type": "code",
 "execution_count": 16,
- "metadata": {},
- "outputs": [],
 "source": [
 "alstopk = ALSTopK(\n",
 " userCol=COL_USER,\n",
 @@ -771,14 +770,23 @@
 " # are therefore not available here. \n",
 " trainRatio=0.75\n",
 ")"
- ]
+ ],
+ "outputs": [],
+ "metadata": {}
 },
 {
 "cell_type": "code",
 "execution_count": 17,
- "metadata": {},
+ "source": [
+ "# Run TrainValidationSplit, and choose the best set of parameters.\n",
+ "# NOTE train and valid are unioned because TrainValidationSplit does the splitting by itself.\n",
+ "model_precision = tvs.fit(train.union(valid))\n",
+ "\n",
+ "model_precision.getEstimatorParamMaps()"
+ ],
 "outputs": [
 {
+ "output_type": "execute_result",
 "data": {
 "text/plain": [
 "[{Param(parent='ALSTopK_4f48b7cc6cf2badfcea7', name='rank', doc='rank of the factorization'): 10,\n",
 @@ -791,24 +799,15 @@
 " Param(parent='ALSTopK_4f48b7cc6cf2badfcea7', name='regParam', doc='regularization parameter (>= 0).'): 0.01}]"
 ]
 },
- "execution_count": 17,
 "metadata": {},
- "output_type": "execute_result" + "execution_count": 17 } ], + "metadata": {} },
 {
 "cell_type": "code",
 "execution_count": 18,
- "metadata": {},
- "outputs": [],
 "source": [
 "def best_param(model, is_larger_better=True):\n",
 " if is_larger_better:\n",
 @@ -819,25 +818,35 @@
 " parameters = model.getEstimatorParamMaps()[model.validationMetrics.index(best_metric)]\n",
 " \n",
 " return list(parameters.values())"
- ]
+ ],
+ "outputs": [],
+ "metadata": {}
 },
 {
 "cell_type": "code",
 "execution_count": 19,
- "metadata": {},
- "outputs": [],
 "source": [
 "params = best_param(model_precision)"
- ]
+ ],
+ "outputs": [],
+ "metadata": {}
 },
 {
 "cell_type": "code",
 "execution_count": 20,
- "metadata": {},
+ "source": [
+ "model_precision.bestModel.transform(valid).limit(5).show()\n",
+ "\n",
+ "for idx, item in enumerate(model_precision.getEstimatorParamMaps()):\n",
+ " print('Run {}:'.format(idx))\n",
+ " print('\\tValidation Metric: {}'.format(model_precision.validationMetrics[idx]))\n",
+ " for key, value in item.items():\n",
+ " 
print('\\t{0}: {1}'.format(repr(key), value))" - ] + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "## 4 Hyperparameter tuning with `hyperopt`" - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "`hyperopt` is an open source Python package that is designed for tuning parameters for generic function with any pre-defined loss. More information about `hyperopt` can be found [here](https://github.com/hyperopt/hyperopt). `hyperopt` supports parallelization on MongoDB but not Spark. In our case, the tuning is performed in a sequential mode on a local computer.\n", "\n", "In `hyperopt`, an *objective* function is defined for optimizing the hyper parameters. In this case, the objective is similar to that in the Spark native construct situation, which is *to the RMSE metric for an ALS recommender*. Parameters of `rank` and `regParam` are used as hyperparameters. \n", "\n", "The objective function shown below demonstrates a RMSE loss for an ALS recommender. " - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 21, - "metadata": {}, - "outputs": [], "source": [ "# Customize an objective function\n", "def objective(params):\n", @@ -946,11 +945,12 @@ " 'status': STATUS_OK,\n", " 'eval_time': time_run_start.interval\n", " }" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "A search space is usually defined for hyperparameter exploration. Design of search space is empirical, and depends on the understanding of how distribution of parameter of interest affects the model performance measured by the loss function. \n", "\n", @@ -959,13 +959,12 @@ "* The reg parameter prevents overfitting in certain way. \n", "\n", "Therefore, in this case, a uniform distribution and a lognormal distribution sampling spaces are used for rank and reg, respectively. A narrow search space is used for illustration purpose, that is, the range of rank is from 10 to 20, while that of reg is from $e^{-5}$ to $e^{-1}$. Together with the randomly sampled hyper parameters, other parameters use for building / evaluating the recommender, like `k`, column names, data, etc., are kept as constants." - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 22, - "metadata": {}, - "outputs": [], "source": [ "# define a search space\n", "space = {\n", @@ -980,31 +979,31 @@ " 'k': 10,\n", " 'relevancy_method': \"top_k\"\n", "}" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "### 4.1 Hyperparameter tuning with TPE" - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "`fmin` of `hyperopt` is used for running the trials for searching optimal hyper parameters. In `hyperopt`, there are different strategies for intelligently optimize hyper parameters. For example, `hyperopt` avails [Tree of Parzen Estimators (TPE) method](https://papers.nips.cc/paper/4443-algorithms-for-hyper-parameter-optimization.pdf) for searching optimal parameters. \n", "\n", "The TPE method models a surface response of $p(x|y)$ by transforming a generative process, replacing the distributions of the configuration prior with non-parametric densities, where $p$ is the probability of configuration space $x$ given the loss $y$. For different configuration space, the TPE method does different replacements. 
That is, uniform $\\to$ truncated Gaussian mixture, log-uniform $\\to$ exponentiated truncated Gaussian mixture, categorical $\\to$ re-weighted categorical, etc. Using different observations ${x(1), ..., x(k)}$ in the non-parametric densities, these substitutions represent a learning algorithm that can produce a variety of densities over the configuration space $X$. By maintaining sorted lists of observed variables in $H$, the runtime of each iteration of the TPE algorithm can scale linearly in $|H|$ and linearly in the number of variables (dimensions) being optimized. In a nutshell, the algorithm recognizes the irrelevant variables in the configuration space, and thus reduces iterations in searching for the optimal ones. Details of the TPE algorithm can be found in the reference paper.\n", "\n", "The following runs the trials with the pre-defined objective function and search space. TPE is used as the optimization method. Totally there will be 10 evaluations run for searching the best parameters." - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 23, - "metadata": {}, - "outputs": [], "source": [ "with Timer() as time_hyperopt:\n", " # Trials for recording each iteration of the hyperparameter searching.\n", @@ -1018,14 +1017,19 @@ " max_evals=NUMBER_ITERATIONS\n", " )\n", " \n" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "code", "execution_count": 24, - "metadata": {}, + "source": [ + "trials.best_trial" + ], "outputs": [ { + "output_type": "execute_result", "data": { "text/plain": [ "{'book_time': datetime.datetime(2019, 7, 17, 12, 28, 19, 108000),\n", @@ -1046,801 +1050,41 @@ " 'version': 0}" ] }, - "execution_count": 24, "metadata": {}, - "output_type": "execute_result" + "execution_count": 24 } ], - "source": [ - "trials.best_trial" - ] + "metadata": {} }, { "cell_type": "code", "execution_count": 25, - "metadata": {}, + "source": [ + "parameters = ['rank', 'reg']\n", + "cols = len(parameters)\n", + "f, axes = plt.subplots(nrows=1, ncols=cols, figsize=(15,5))\n", + "cmap = plt.cm.jet\n", + "for i, val in enumerate(parameters):\n", + " xs = np.array([t['misc']['vals'][val] for t in trials.trials]).ravel()\n", + " ys = [t['result']['loss'] for t in trials.trials]\n", + " xs, ys = zip(*sorted(zip(xs, ys)))\n", + " ys = np.array(ys)\n", + " axes[i].scatter(xs, ys, s=20, linewidth=0.01, alpha=0.75, c=cmap(float(i)/len(parameters)))\n", + " axes[i].set_title(val)" + ], "outputs": [ { + "output_type": "display_data", "data": { - "application/javascript": [ - "/* Put everything inside the global mpl namespace */\n", - "window.mpl = {};\n", - "\n", - "\n", - "mpl.get_websocket_type = function() {\n", - " if (typeof(WebSocket) !== 'undefined') {\n", - " return WebSocket;\n", - " } else if (typeof(MozWebSocket) !== 'undefined') {\n", - " return MozWebSocket;\n", - " } else {\n", - " alert('Your browser does not have WebSocket support.' +\n", - " 'Please try Chrome, Safari or Firefox ≥ 6. 
' +\n", - " 'Firefox 4 and 5 are also supported but you ' +\n", - " 'have to enable WebSockets in about:config.');\n", - " };\n", - "}\n", - "\n", - "mpl.figure = function(figure_id, websocket, ondownload, parent_element) {\n", - " this.id = figure_id;\n", - "\n", - " this.ws = websocket;\n", - "\n", - " this.supports_binary = (this.ws.binaryType != undefined);\n", - "\n", - " if (!this.supports_binary) {\n", - " var warnings = document.getElementById(\"mpl-warnings\");\n", - " if (warnings) {\n", - " warnings.style.display = 'block';\n", - " warnings.textContent = (\n", - " \"This browser does not support binary websocket messages. \" +\n", - " \"Performance may be slow.\");\n", - " }\n", - " }\n", - "\n", - " this.imageObj = new Image();\n", - "\n", - " this.context = undefined;\n", - " this.message = undefined;\n", - " this.canvas = undefined;\n", - " this.rubberband_canvas = undefined;\n", - " this.rubberband_context = undefined;\n", - " this.format_dropdown = undefined;\n", - "\n", - " this.image_mode = 'full';\n", - "\n", - " this.root = $('
');\n", - " this._root_extra_style(this.root)\n", - " this.root.attr('style', 'display: inline-block');\n", - "\n", - " $(parent_element).append(this.root);\n", - "\n", - " this._init_header(this);\n", - " this._init_canvas(this);\n", - " this._init_toolbar(this);\n", - "\n", - " var fig = this;\n", - "\n", - " this.waiting = false;\n", - "\n", - " this.ws.onopen = function () {\n", - " fig.send_message(\"supports_binary\", {value: fig.supports_binary});\n", - " fig.send_message(\"send_image_mode\", {});\n", - " if (mpl.ratio != 1) {\n", - " fig.send_message(\"set_dpi_ratio\", {'dpi_ratio': mpl.ratio});\n", - " }\n", - " fig.send_message(\"refresh\", {});\n", - " }\n", - "\n", - " this.imageObj.onload = function() {\n", - " if (fig.image_mode == 'full') {\n", - " // Full images could contain transparency (where diff images\n", - " // almost always do), so we need to clear the canvas so that\n", - " // there is no ghosting.\n", - " fig.context.clearRect(0, 0, fig.canvas.width, fig.canvas.height);\n", - " }\n", - " fig.context.drawImage(fig.imageObj, 0, 0);\n", - " };\n", - "\n", - " this.imageObj.onunload = function() {\n", - " fig.ws.close();\n", - " }\n", - "\n", - " this.ws.onmessage = this._make_on_message_function(this);\n", - "\n", - " this.ondownload = ondownload;\n", - "}\n", - "\n", - "mpl.figure.prototype._init_header = function() {\n", - " var titlebar = $(\n", - " '
');\n", - " var titletext = $(\n", - " '
');\n", - " titlebar.append(titletext)\n", - " this.root.append(titlebar);\n", - " this.header = titletext[0];\n", - "}\n", - "\n", - "\n", - "\n", - "mpl.figure.prototype._canvas_extra_style = function(canvas_div) {\n", - "\n", - "}\n", - "\n", - "\n", - "mpl.figure.prototype._root_extra_style = function(canvas_div) {\n", - "\n", - "}\n", - "\n", - "mpl.figure.prototype._init_canvas = function() {\n", - " var fig = this;\n", - "\n", - " var canvas_div = $('
');\n", - "\n", - " canvas_div.attr('style', 'position: relative; clear: both; outline: 0');\n", - "\n", - " function canvas_keyboard_event(event) {\n", - " return fig.key_event(event, event['data']);\n", - " }\n", - "\n", - " canvas_div.keydown('key_press', canvas_keyboard_event);\n", - " canvas_div.keyup('key_release', canvas_keyboard_event);\n", - " this.canvas_div = canvas_div\n", - " this._canvas_extra_style(canvas_div)\n", - " this.root.append(canvas_div);\n", - "\n", - " var canvas = $('');\n", - " canvas.addClass('mpl-canvas');\n", - " canvas.attr('style', \"left: 0; top: 0; z-index: 0; outline: 0\")\n", - "\n", - " this.canvas = canvas[0];\n", - " this.context = canvas[0].getContext(\"2d\");\n", - "\n", - " var backingStore = this.context.backingStorePixelRatio ||\n", - "\tthis.context.webkitBackingStorePixelRatio ||\n", - "\tthis.context.mozBackingStorePixelRatio ||\n", - "\tthis.context.msBackingStorePixelRatio ||\n", - "\tthis.context.oBackingStorePixelRatio ||\n", - "\tthis.context.backingStorePixelRatio || 1;\n", - "\n", - " mpl.ratio = (window.devicePixelRatio || 1) / backingStore;\n", - "\n", - " var rubberband = $('');\n", - " rubberband.attr('style', \"position: absolute; left: 0; top: 0; z-index: 1;\")\n", - "\n", - " var pass_mouse_events = true;\n", - "\n", - " canvas_div.resizable({\n", - " start: function(event, ui) {\n", - " pass_mouse_events = false;\n", - " },\n", - " resize: function(event, ui) {\n", - " fig.request_resize(ui.size.width, ui.size.height);\n", - " },\n", - " stop: function(event, ui) {\n", - " pass_mouse_events = true;\n", - " fig.request_resize(ui.size.width, ui.size.height);\n", - " },\n", - " });\n", - "\n", - " function mouse_event_fn(event) {\n", - " if (pass_mouse_events)\n", - " return fig.mouse_event(event, event['data']);\n", - " }\n", - "\n", - " rubberband.mousedown('button_press', mouse_event_fn);\n", - " rubberband.mouseup('button_release', mouse_event_fn);\n", - " // Throttle sequential mouse events to 1 every 20ms.\n", - " rubberband.mousemove('motion_notify', mouse_event_fn);\n", - "\n", - " rubberband.mouseenter('figure_enter', mouse_event_fn);\n", - " rubberband.mouseleave('figure_leave', mouse_event_fn);\n", - "\n", - " canvas_div.on(\"wheel\", function (event) {\n", - " event = event.originalEvent;\n", - " event['data'] = 'scroll'\n", - " if (event.deltaY < 0) {\n", - " event.step = 1;\n", - " } else {\n", - " event.step = -1;\n", - " }\n", - " mouse_event_fn(event);\n", - " });\n", - "\n", - " canvas_div.append(canvas);\n", - " canvas_div.append(rubberband);\n", - "\n", - " this.rubberband = rubberband;\n", - " this.rubberband_canvas = rubberband[0];\n", - " this.rubberband_context = rubberband[0].getContext(\"2d\");\n", - " this.rubberband_context.strokeStyle = \"#000000\";\n", - "\n", - " this._resize_canvas = function(width, height) {\n", - " // Keep the size of the canvas, canvas container, and rubber band\n", - " // canvas in synch.\n", - " canvas_div.css('width', width)\n", - " canvas_div.css('height', height)\n", - "\n", - " canvas.attr('width', width * mpl.ratio);\n", - " canvas.attr('height', height * mpl.ratio);\n", - " canvas.attr('style', 'width: ' + width + 'px; height: ' + height + 'px;');\n", - "\n", - " rubberband.attr('width', width);\n", - " rubberband.attr('height', height);\n", - " }\n", - "\n", - " // Set the figure to an initial 600x600px, this will subsequently be updated\n", - " // upon first draw.\n", - " this._resize_canvas(600, 600);\n", - "\n", - " // Disable right mouse context menu.\n", - " 
$(this.rubberband_canvas).bind(\"contextmenu\",function(e){\n", - " return false;\n", - " });\n", - "\n", - " function set_focus () {\n", - " canvas.focus();\n", - " canvas_div.focus();\n", - " }\n", - "\n", - " window.setTimeout(set_focus, 100);\n", - "}\n", - "\n", - "mpl.figure.prototype._init_toolbar = function() {\n", - " var fig = this;\n", - "\n", - " var nav_element = $('
')\n", - " nav_element.attr('style', 'width: 100%');\n", - " this.root.append(nav_element);\n", - "\n", - " // Define a callback function for later on.\n", - " function toolbar_event(event) {\n", - " return fig.toolbar_button_onclick(event['data']);\n", - " }\n", - " function toolbar_mouse_event(event) {\n", - " return fig.toolbar_button_onmouseover(event['data']);\n", - " }\n", - "\n", - " for(var toolbar_ind in mpl.toolbar_items) {\n", - " var name = mpl.toolbar_items[toolbar_ind][0];\n", - " var tooltip = mpl.toolbar_items[toolbar_ind][1];\n", - " var image = mpl.toolbar_items[toolbar_ind][2];\n", - " var method_name = mpl.toolbar_items[toolbar_ind][3];\n", - "\n", - " if (!name) {\n", - " // put a spacer in here.\n", - " continue;\n", - " }\n", - " var button = $('');\n button.click(method_name, toolbar_event);\n button.mouseover(tooltip, toolbar_mouse_event);\n nav_element.append(button);\n }\n\n // Add the status bar.\n var status_bar = $('');\n nav_element.append(status_bar);\n this.message = status_bar[0];\n\n // Add the close button to the window.\n var buttongrp = $('
');\n var button = $('');\n button.click(function (evt) { fig.handle_close(fig, {}); } );\n button.mouseover('Stop Interaction', toolbar_mouse_event);\n buttongrp.append(button);\n var titlebar = this.root.find($('.ui-dialog-titlebar'));\n titlebar.prepend(buttongrp);\n}\n\nmpl.figure.prototype._root_extra_style = function(el){\n var fig = this\n el.on(\"remove\", function(){\n\tfig.close_ws(fig, {});\n });\n}\n\nmpl.figure.prototype._canvas_extra_style = function(el){\n // this is important to make the div 'focusable\n el.attr('tabindex', 0)\n // reach out to IPython and tell the keyboard manager to turn it's self\n // off when our div gets focus\n\n // location in version 3\n if (IPython.notebook.keyboard_manager) {\n IPython.notebook.keyboard_manager.register_events(el);\n }\n else {\n // location in version 2\n IPython.keyboard_manager.register_events(el);\n }\n\n}\n\nmpl.figure.prototype._key_event_extra = function(event, name) {\n var manager = IPython.notebook.keyboard_manager;\n if (!manager)\n manager = IPython.keyboard_manager;\n\n // Check for shift+enter\n if (event.shiftKey && event.which == 13) {\n this.canvas_div.blur();\n event.shiftKey = false;\n // Send a \"J\" for go to next cell\n event.which = 74;\n event.keyCode = 74;\n manager.command_mode();\n manager.handle_keydown(event);\n }\n}\n\nmpl.figure.prototype.handle_save = function(fig, msg) {\n fig.ondownload(fig, null);\n}\n\n\nmpl.find_output_cell = function(html_output) {\n // Return the cell and output element which can be found *uniquely* in the notebook.\n // Note - this is a bit hacky, but it is done because the \"notebook_saving.Notebook\"\n // IPython event is triggered only after the cells have been serialised, which for\n // our purposes (turning an active figure into a static one), is too late.\n var cells = IPython.notebook.get_cells();\n var ncells = cells.length;\n for (var i=0; i= 3 moved mimebundle to data attribute of output\n data = data.data;\n }\n if (data['text/html'] == html_output) {\n return [cell, data, j];\n }\n }\n }\n }\n}\n\n// Register the function which deals with the matplotlib target/channel.\n// The kernel may be null if the page has been refreshed.\nif (IPython.notebook.kernel != null) {\n IPython.notebook.kernel.comm_manager.register_target('matplotlib', mpl.mpl_figure_comm);\n}\n", "text/plain": [ "" ] }, - "metadata": {}, - "output_type": "display_data" + "metadata": {} }, { + "output_type": "display_data", "data": { "text/html": [ "" @@ -1849,45 +1093,30 @@ "" ] }, - "metadata": {}, - "output_type": "display_data" + "metadata": {} } ], - "source": [ - "parameters = ['rank', 'reg']\n", - "cols = len(parameters)\n", - "f, axes = plt.subplots(nrows=1, ncols=cols, figsize=(15,5))\n", - "cmap = plt.cm.jet\n", - "for i, val in enumerate(parameters):\n", - " xs = np.array([t['misc']['vals'][val] for t in trials.trials]).ravel()\n", - " ys = [t['result']['loss'] for t in trials.trials]\n", - " xs, ys = zip(*sorted(zip(xs, ys)))\n", - " ys = np.array(ys)\n", - " axes[i].scatter(xs, ys, s=20, linewidth=0.01, alpha=0.75, c=cmap(float(i)/len(parameters)))\n", - " axes[i].set_title(val)" - ] + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "It can be seen from the above plot that\n", "* The actual impact of rank is in line with the intuition - the smaller the value the better the result.\n", "* It is interesting to see that the optimal value of reg is around 0.1 to 0.15. 
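A practical note on reading off the winner: `fmin` returns `best` as a plain dictionary of the sampled values. A hedged convenience sketch using `space_eval`, a standard `hyperopt` helper that this notebook does not itself use (it resolves a sampled point back into the terms of the original `space`, passing constants through):

```python
from hyperopt import space_eval

# Resolve the raw values returned by fmin() against the search space.
resolved = space_eval(space, best)
print("best rank: {}, best reg: {}".format(resolved["rank"], resolved["reg"]))

# 'rank' is drawn from a continuous hp.uniform space, so it arrives as a
# float; cast explicitly before handing it to an integer-valued parameter.
best_rank = int(resolved["rank"])
```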
" - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "Get the best model." - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 26, - "metadata": {}, - "outputs": [], "source": [ "als = ALS(\n", " rank=best[\"rank\"],\n", @@ -1902,20 +1131,20 @@ ")\n", " \n", "model_best_hyperopt = als.fit(train)" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "Tuning prameters against other metrics can be simply done by modifying the `objective` function. The following shows an objective function of how to tune \"precision@k\". Since `fmin` in `hyperopt` only supports minimization while the actual objective of the loss is to maximize \"precision@k\", `-precision` instead of `precision` is used in the returned value of the `objective` function." - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 27, - "metadata": {}, - "outputs": [], "source": [ "# Customize an objective function\n", "def objective_precision(params):\n", @@ -1988,29 +1217,29 @@ " 'status': STATUS_OK,\n", " 'eval_time': time_run_start.interval\n", " }" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "### 4.2 Hyperparameter tuning with `hyperopt` sampling methods" - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "Though `hyperopt` works well in a single node machine, its features (e.g., `Trials` module) do not support Spark environment, which makes it hard to perform the tuning tasks in a distributed/parallel manner. It is useful to use `hyperopt` for sampling parameter values from the pre-defined sampling space, and then parallelize the model training onto Spark cluster with the sampled parameter combinations.\n", "\n", "The downside of this method is that the intelligent searching algorithm (i.e., TPE) of `hyperopt` cannot be used. The approach introduced here is therefore equivalent to random search." - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 28, - "metadata": {}, - "outputs": [], "source": [ "with Timer() as time_sample:\n", " # Sample the parameters used for model building from the pre-defined space. \n", @@ -2018,14 +1247,19 @@ " \n", " # The following runs model building on the sampled parameter values with the pre-defined objective function.\n", " results_map = list(map(lambda x: objective(x), sample_params))\n" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "code", "execution_count": 30, - "metadata": {}, + "source": [ + "results_map" + ], "outputs": [ { + "output_type": "execute_result", "data": { "text/plain": [ "[{'eval_time': 9.468051671981812, 'loss': 1.027085217204854, 'status': 'ok'},\n", @@ -2055,46 +1289,41 @@ " {'eval_time': 9.08506464958191, 'loss': 1.254533287299843, 'status': 'ok'}]" ] }, - "execution_count": 30, "metadata": {}, - "output_type": "execute_result" + "execution_count": 30 } ], - "source": [ - "results_map" - ] + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "Get the best model." 
- ]
+ ],
+ "metadata": {}
 },
 {
 "cell_type": "code",
 "execution_count": 31,
- "metadata": {},
- "outputs": [],
 "source": [
 "loss_metrics = np.array([x['loss'] for x in results_map])\n",
 "best_loss = np.where(loss_metrics == min(loss_metrics))"
- ]
+ ],
+ "outputs": [],
+ "metadata": {}
 },
 {
 "cell_type": "code",
 "execution_count": 32,
- "metadata": {},
- "outputs": [],
 "source": [
 "best_param = sample_params[best_loss[0].item()]"
- ]
+ ],
+ "outputs": [],
+ "metadata": {}
 },
 {
 "cell_type": "code",
 "execution_count": 33,
- "metadata": {},
- "outputs": [],
 "source": [
 "als = ALS(\n",
 " rank=best_param[\"rank\"],\n",
 @@ -2109,29 +1338,29 @@
 ")\n",
 " \n",
 "model_best_sample = als.fit(train)"
- ]
+ ],
+ "outputs": [],
+ "metadata": {}
 },
 {
 "cell_type": "markdown",
- "metadata": {},
 "source": [
 "## 5 Evaluation on testing data"
- ]
+ ],
+ "metadata": {}
 },
 {
 "cell_type": "markdown",
- "metadata": {},
 "source": [
 "The optimal parameters can then be used for building a recommender, which is then evaluated on the testing data.\n",
 "\n",
 "The following code generates the evaluation results on the testing dataset with the optimal model selected against the pre-defined loss. Without loss of generality, the optimal model that performs best w.r.t. the regression loss (i.e., the RMSE metric) is used here. One can equally use other metrics, like precision@k as illustrated in the sections above, to evaluate the optimal model on the testing dataset."
- ]
+ ],
+ "metadata": {}
 },
 {
 "cell_type": "code",
 "execution_count": 34,
- "metadata": {},
- "outputs": [],
 "source": [
 "# Get prediction results with the optimal models from the different approaches.\n",
 "prediction_spark = model_best_spark.transform(test)\n",
 @@ -2160,14 +1389,19 @@
 " }, index=[0])\n",
 " \n",
 " test_evaluations = test_evaluations.append(result)"
- ]
+ ],
+ "outputs": [],
+ "metadata": {}
 },
 {
 "cell_type": "code",
 "execution_count": 35,
- "metadata": {},
+ "source": [
+ "test_evaluations"
+ ],
 "outputs": [
 {
+ "output_type": "execute_result",
 "data": {
 "text/html": [
 "
\n", @@ -2235,62 +1469,58 @@ "0 sample 230.902271 0.287638 0.791199 0.232688 0.988922" ] }, - "execution_count": 35, "metadata": {}, - "output_type": "execute_result" + "execution_count": 35 } ], - "source": [ - "test_evaluations" - ] + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "From the results, it can be seen that, *with the same number of iterations*, Spark native construct based approach takes the least amount of time, even if there is no parallel computing. This is simply because Spark native constructs leverage the underlying Java codes for running the actual analytics with high performance efficiency. Interestingly, the run time for `hyperopt` with TPE algorithm and random search methods are almost the same. Possible reasons for this are that, the TPE algorithm searches optimal parameters intelligently but runs the tuning iterations sequentially. Also, the advantage of TPE may become obvious when there is a higher dimensionality of hyperparameters. \n", "\n", "The three approaches use the same RMSE loss. In this measure, the native Spark construct performs the best. The `hyperopt` based approach performs the second best, but the advantage is very subtle. It should be noted that these differences may be owing to many factors like characteristics of datasets, dimensionality of hyperparameter space, sampling size in the searching, etc. Note the differences in the RMSE metrics may also come from the randomness of the intermediate steps in parameter tuning process. In practice, multiple runs are required for generating statistically robust comparison results. We have tried 5 times for running the same comparison codes above. The results aligned well with each other in terms of objective metric values and elapsed time. " - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "# Conclusions" - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "In summary, there are mainly three different approaches for running hyperparameter tuning for Spark based recommendation algorithm. The three different approaches are compared as follows." - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "|Approach|Distributed (on Spark)|Param sampling|Advanced hyperparam searching algo|Custom evaluation metrics|Custom data split|\n", "|---------|-------------|--------------|--------------------------|--------------|------------|\n", "|AzureML Services|Parallelizing Spark sessions on multi-node cluster or single Spark session on one VM node.)|Random, Grid, Bayesian sampling for discrete and continuous variables.|Bandit policy, Median stopping policy, and truncation selection policy.|Yes|Yes|\n", "|Spark native construct|Distributed in single-node standalone Spark environment or multi-node Spark cluster.|No|No|Need to re-engineer Spark modules|Need to re-engineer Spark modules.|\n", "|`hyperopt`|No (only support parallelization on MongoDB)|Random sampling for discrete and continuous variables.|Tree Parzen Estimator|Yes|Yes|" - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 36, - "metadata": {}, - "outputs": [], "source": [ "# cleanup spark instance\n", "spark.stop()" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "# References\n", "\n", @@ -2300,7 +1530,8 @@ "* `hyperopt`, url: http://hyperopt.github.io/hyperopt/.\n", "* Bergstra, J., Yamins, D., Cox, D. D. 
(2013) Making a Science of Model Search: Hyperparameter Optimization in Hundreds of Dimensions for Vision Architectures. Proc. of the 30th International Conference on Machine Learning (ICML 2013).\n", "* Kris Wright, \"Hyper parameter tuning with hyperopt\", url:https://districtdatalabs.silvrback.com/parameter-tuning-with-hyperopt" - ] + ], + "metadata": {} } ], "metadata": { @@ -2325,4 +1556,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} +} \ No newline at end of file diff --git a/recommenders/datasets/movielens.py b/recommenders/datasets/movielens.py index 73d7a58f1c..d054bc64fb 100644 --- a/recommenders/datasets/movielens.py +++ b/recommenders/datasets/movielens.py @@ -3,33 +3,44 @@ import os import re +import random import shutil import warnings import pandas as pd +from typing import Optional from zipfile import ZipFile from recommenders.datasets.download_utils import maybe_download, download_path from recommenders.utils.notebook_utils import is_databricks from recommenders.utils.constants import ( - DEFAULT_USER_COL, + DEFAULT_HEADER, DEFAULT_ITEM_COL, + DEFAULT_USER_COL, DEFAULT_RATING_COL, DEFAULT_TIMESTAMP_COL, + DEFAULT_TITLE_COL, + DEFAULT_GENRE_COL, ) try: from pyspark.sql.types import ( StructType, StructField, + StringType, IntegerType, FloatType, - DoubleType, - LongType, - StringType, + LongType ) from pyspark.sql.functions import concat_ws, col except ImportError: pass # so the environment without spark doesn't break +try: + import pandera as pa + from pandera import Field + from pandera.typing import Series +except ImportError: + pass # so the environment without recommender['dev'] doesn't break + class _DataFormat: def __init__( @@ -100,6 +111,11 @@ def item_has_header(self): "20m": _DataFormat(",", "ml-20m/ratings.csv", True, ",", "ml-20m/movies.csv", True), } +# Fake data for testing only +MOCK_DATA_FORMAT = { + "mock100": {"size": 100, "seed": 6}, +} + # 100K data genres index to string mapper. For 1m, 10m, and 20m, the genres labels are already in the dataset. GENRES = ( "unknown", @@ -123,12 +139,6 @@ def item_has_header(self): "Western", ) -DEFAULT_HEADER = ( - DEFAULT_USER_COL, - DEFAULT_ITEM_COL, - DEFAULT_RATING_COL, - DEFAULT_TIMESTAMP_COL, -) # Warning and error messages WARNING_MOVIE_LENS_HEADER = """MovieLens rating dataset has four columns @@ -136,7 +146,7 @@ def item_has_header(self): Will only use the first four column names.""" WARNING_HAVE_SCHEMA_AND_HEADER = """Both schema and header are provided. The header argument will be ignored.""" -ERROR_MOVIE_LENS_SIZE = "Invalid data size. Should be one of {100k, 1m, 10m, or 20m}" +ERROR_MOVIE_LENS_SIZE = "Invalid data size. Should be one of {100k, 1m, 10m, or 20m, or mock100}" ERROR_HEADER = "Header error. At least user and movie column names should be provided" @@ -154,14 +164,17 @@ def load_pandas_df( To load movie information only, you can use load_item_df function. Args: - size (str): Size of the data to load. One of ("100k", "1m", "10m", "20m"). + size (str): Size of the data to load. One of ("100k", "1m", "10m", "20m", "mock100"). header (list or tuple or None): Rating dataset header. + If `size` is set to any of 'MOCK_DATA_FORMAT', this parameter is ignored and data is rendered using the 'DEFAULT_HEADER' instead. local_cache_path (str): Path (directory or a zip file) to cache the downloaded zip file. If None, all the intermediate files will be stored in a temporary directory and removed after use. + If `size` is set to any of 'MOCK_DATA_FORMAT', this parameter is ignored. 
title_col (str): Movie title column name. If None, the column will not be loaded.
 genres_col (str): Genres column name. Genres are a '|'-separated string. If None, the column will not be loaded.
 year_col (str): Movie release year column name. If None, the column will not be loaded.
+ If `size` is set to a key of 'MOCK_DATA_FORMAT', this parameter is ignored.
 
 Returns:
 pandas.DataFrame: Movie rating dataset.
@@ -185,7 +198,7 @@
 )
 """
 size = size.lower()
- if size not in DATA_FORMAT:
+ if size not in DATA_FORMAT and size not in MOCK_DATA_FORMAT:
 raise ValueError(ERROR_MOVIE_LENS_SIZE)
 
 if header is None:
@@ -196,6 +209,15 @@
 warnings.warn(WARNING_MOVIE_LENS_HEADER)
 header = header[:4]
 
+ if size in MOCK_DATA_FORMAT:
+ # generate fake data
+ return MockMovielensSchema.get_df(
+ keep_first_n_cols=len(header),
+ keep_title_col=(title_col is not None),
+ keep_genre_col=(genres_col is not None),
+ **MOCK_DATA_FORMAT[size] # supply the remaining kwargs from the dictionary
+ )
+
 movie_col = header[1]
 
 with download_path(local_cache_path) as path:
@@ -349,17 +371,20 @@
 
 Args:
 spark (pyspark.SparkSession): Spark session.
- size (str): Size of the data to load. One of ("100k", "1m", "10m", "20m").
+ size (str): Size of the data to load. One of ("100k", "1m", "10m", "20m", "mock100").
 header (list or tuple): Rating dataset header.
- If schema is provided, this argument is ignored.
+ If `schema` is provided or `size` is set to a key of 'MOCK_DATA_FORMAT', this argument is ignored.
 schema (pyspark.StructType): Dataset schema.
+ If `size` is set to a key of 'MOCK_DATA_FORMAT', data is generated by 'MockMovielensSchema' instead.
 local_cache_path (str): Path (directory or a zip file) to cache the downloaded zip file.
 If None, all the intermediate files will be stored in a temporary directory and removed after use.
 dbutils (Databricks.dbutils): Databricks utility object.
+ If `size` is set to a key of 'MOCK_DATA_FORMAT', this parameter is ignored.
 title_col (str): Title column name. If None, the column will not be loaded.
 genres_col (str): Genres column name. Genres are a '|'-separated string. If None, the column will not be loaded.
 year_col (str): Movie release year column name. If None, the column will not be loaded.
+ If `size` is set to a key of 'MOCK_DATA_FORMAT', this parameter is ignored.
 
 Returns:
 pyspark.sql.DataFrame: Movie rating dataset.
@@ -394,9 +419,18 @@
 spark_df = load_spark_df(spark, dbutils=dbutils)
 """
 size = size.lower()
- if size not in DATA_FORMAT:
+ if size not in DATA_FORMAT and size not in MOCK_DATA_FORMAT:
 raise ValueError(ERROR_MOVIE_LENS_SIZE)
 
+ if size in MOCK_DATA_FORMAT:
+ # generate fake data
+ return MockMovielensSchema.get_spark_df(
+ spark,
+ keep_title_col=(title_col is not None),
+ keep_genre_col=(genres_col is not None),
+ **MOCK_DATA_FORMAT[size] # supply the remaining kwargs from the dictionary
+ )
+
 schema = _get_schema(header, schema)
 if len(schema) < 2:
 raise ValueError(ERROR_HEADER)
@@ -537,3 +571,109 @@
 shutil.copyfileobj(zf, f)
 with z.open(DATA_FORMAT[size].item_path) as zf, open(item_path, "wb") as f:
 shutil.copyfileobj(zf, f)
+
+
+class MockMovielensSchema(pa.SchemaModel):
+ """
+ Mock dataset schema to generate fake data for testing purposes.
+ This schema is configured to mimic the MovieLens dataset
+
+ http://files.grouplens.org/datasets/movielens/ml-100k/
+
+ Dataset schema and generation are configured using pandera. 
+    """
+    # Some notebooks will do a cross join with userID and itemID;
+    # a sparse range for these IDs can slow down the notebook tests
+    userID: Series[int] = Field(in_range={"min_value": 1, "max_value": 10})
+    itemID: Series[int] = Field(in_range={"min_value": 1, "max_value": 10})
+    rating: Series[float] = Field(in_range={"min_value": 1, "max_value": 5})
+    timestamp: Series[int]
+    title: Series[str] = Field(eq="foo")
+    genre: Series[str] = Field(eq="genreA|0")
+
+    @classmethod
+    def get_df(
+        cls,
+        size: int = 3, seed: int = 100,
+        keep_first_n_cols: Optional[int] = None,
+        keep_title_col: bool = False, keep_genre_col: bool = False,
+    ) -> pd.DataFrame:
+        """Return a fake MovieLens dataset as a pandas DataFrame with the specified number of rows.
+
+        Args:
+            size (int): number of rows to generate
+            seed (int, optional): seed for the pseudo-random number generation. Defaults to 100.
+            keep_first_n_cols (int, optional): keep the first n default movielens columns.
+            keep_title_col (bool): remove the title column if False. Defaults to False.
+            keep_genre_col (bool): remove the genre column if False. Defaults to False.
+
+        Returns:
+            pandas.DataFrame: a mock dataset
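+
+        Examples:
+            A minimal sketch (hypothetical call; values are synthesized):
+
+            .. code-block:: python
+
+                df = MockMovielensSchema.get_df(size=3, keep_first_n_cols=2)
+                assert len(df.columns) == 2  # only userID and itemID remain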
+        """
+        schema = cls.to_schema()
+        if keep_first_n_cols is not None:
+            if keep_first_n_cols < 1 or keep_first_n_cols > len(DEFAULT_HEADER):
+                raise ValueError(f"Invalid value for 'keep_first_n_cols': {keep_first_n_cols}. Valid range: [1-{len(DEFAULT_HEADER)}]")
+            schema = schema.remove_columns(DEFAULT_HEADER[keep_first_n_cols:])
+        if not keep_title_col:
+            schema = schema.remove_columns([DEFAULT_TITLE_COL])
+        if not keep_genre_col:
+            schema = schema.remove_columns([DEFAULT_GENRE_COL])
+
+        random.seed(seed)
+        # For more information on data synthesis, see https://pandera.readthedocs.io/en/latest/data_synthesis_strategies.html
+        return schema.example(size=size)
+
+    @classmethod
+    def get_spark_df(
+        cls,
+        spark,
+        size: int = 3, seed: int = 100,
+        keep_title_col: bool = False, keep_genre_col: bool = False,
+        tmp_path: Optional[str] = None,
+    ):
+        """Return a fake MovieLens dataset as a Spark DataFrame with the specified number of rows.
+
+        Args:
+            spark (SparkSession): spark session to load the dataframe into
+            size (int): number of rows to generate
+            seed (int): seed for the pseudo-random number generation. Defaults to 100.
+            keep_title_col (bool): remove the title column if False. Defaults to False.
+            keep_genre_col (bool): remove the genre column if False. Defaults to False.
+            tmp_path (str, optional): path to store files for serialization purposes
+                when transferring data from Python to Java.
+                If None, a temporary path is used instead.
+
+        Returns:
+            pyspark.sql.DataFrame: a mock dataset
+        """
+        pandas_df = cls.get_df(size=size, seed=seed, keep_title_col=True, keep_genre_col=True)
+
+        # generate a temp folder so Spark can read the data back from disk
+        with download_path(tmp_path) as tmp_folder:
+            filepath = os.path.join(tmp_folder, f"mock_movielens_{size}.csv")
+            # serialize the pandas DataFrame as a CSV to avoid expensive Java <-> Python communication
+            pandas_df.to_csv(filepath, header=False, index=False)
+            spark_df = spark.read.csv(filepath, schema=cls._get_spark_deserialization_schema())
+            # Cache and force an action now, since the data file is removed once the temp folder is cleaned up.
+            spark_df.cache()
+            spark_df.count()
+
+        if not keep_title_col:
+            spark_df = spark_df.drop(DEFAULT_TITLE_COL)
+        if not keep_genre_col:
+            spark_df = spark_df.drop(DEFAULT_GENRE_COL)
+        return spark_df
+
+    @classmethod
+    def _get_spark_deserialization_schema(cls):
+        return StructType([
+            StructField(DEFAULT_USER_COL, IntegerType()),
+            StructField(DEFAULT_ITEM_COL, IntegerType()),
+            StructField(DEFAULT_RATING_COL, FloatType()),
+            StructField(DEFAULT_TIMESTAMP_COL, StringType()),
+            StructField(DEFAULT_TITLE_COL, StringType()),
+            StructField(DEFAULT_GENRE_COL, StringType()),
+        ])
diff --git a/recommenders/evaluation/spark_evaluation.py b/recommenders/evaluation/spark_evaluation.py
index 37a73778ea..e5112965b2 100644
--- a/recommenders/evaluation/spark_evaluation.py
+++ b/recommenders/evaluation/spark_evaluation.py
@@ -1,9 +1,6 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # Licensed under the MIT License.
 
-
-import numpy as np
-
 try:
     from pyspark.mllib.evaluation import RegressionMetrics, RankingMetrics
     from pyspark.sql import Window, DataFrame
diff --git a/recommenders/utils/constants.py b/recommenders/utils/constants.py
index 0e7ed34a9e..e24a58d725 100644
--- a/recommenders/utils/constants.py
+++ b/recommenders/utils/constants.py
@@ -6,6 +6,8 @@
 DEFAULT_ITEM_COL = "itemID"
 DEFAULT_RATING_COL = "rating"
 DEFAULT_LABEL_COL = "label"
+DEFAULT_TITLE_COL = "title"
+DEFAULT_GENRE_COL = "genre"
 DEFAULT_RELEVANCE_COL = "relevance"
 DEFAULT_TIMESTAMP_COL = "timestamp"
 DEFAULT_PREDICTION_COL = "prediction"
@@ -13,6 +15,13 @@
 DEFAULT_ITEM_FEATURES_COL = "features"
 DEFAULT_ITEM_SIM_MEASURE = "item_cooccurrence_count"
 
+DEFAULT_HEADER = (
+    DEFAULT_USER_COL,
+    DEFAULT_ITEM_COL,
+    DEFAULT_RATING_COL,
+    DEFAULT_TIMESTAMP_COL,
+)
+
 COL_DICT = {
     "col_user": DEFAULT_USER_COL,
     "col_item": DEFAULT_ITEM_COL,
diff --git a/setup.py b/setup.py
index 4e37f407bc..9cca36aa38 100644
--- a/setup.py
+++ b/setup.py
@@ -81,8 +81,10 @@
     ],
     "dev": [
         "black>=18.6b4,<21",
+        "pandera[strategies]>=0.6.5",  # For generating fake datasets
         "pytest>=3.6.4",
         "pytest-cov>=2.12.1",
+        "pytest-mock>=3.6.1",  # For access to mock fixtures in pytest
     ],
 }
 # for the brave of heart
diff --git a/tests/ci/azure_pipeline_test/dsvm_nightly_linux_cpu.yml b/tests/ci/azure_pipeline_test/dsvm_nightly_linux_cpu.yml
index 2c5a698243..15b237650a 100644
--- a/tests/ci/azure_pipeline_test/dsvm_nightly_linux_cpu.yml
+++ b/tests/ci/azure_pipeline_test/dsvm_nightly_linux_cpu.yml
@@ -33,6 +33,6 @@ extends:
     timeout: 180
     conda_env: "nightly_linux_cpu"
     conda_opts: "python=3.6"
-    pip_opts: "[examples]"
+    pip_opts: "[examples,dev]"
    pytest_markers: "not spark and not gpu"
    pytest_params: "-x"
diff --git a/tests/ci/azure_pipeline_test/dsvm_nightly_linux_gpu.yml b/tests/ci/azure_pipeline_test/dsvm_nightly_linux_gpu.yml
index b1182c34c9..c43e8ec981 100644
--- a/tests/ci/azure_pipeline_test/dsvm_nightly_linux_gpu.yml
+++ b/tests/ci/azure_pipeline_test/dsvm_nightly_linux_gpu.yml
@@ -32,6 +32,6 @@ extends:
     timeout: 240
     conda_env: "nightly_linux_gpu"
     conda_opts: "python=3.6 cudatoolkit=10.0 \"cudnn>=7.6\""
-    pip_opts: "[gpu,examples] -f https://download.pytorch.org/whl/cu100/torch_stable.html"
+    pip_opts: "[gpu,examples,dev] -f https://download.pytorch.org/whl/cu100/torch_stable.html"
    pytest_markers: "not spark and gpu"
    pytest_params: "-x"
diff --git a/tests/ci/azure_pipeline_test/dsvm_nightly_linux_pyspark.yml b/tests/ci/azure_pipeline_test/dsvm_nightly_linux_pyspark.yml
index 6fd4e526ea..f542f059ff 100644
--- a/tests/ci/azure_pipeline_test/dsvm_nightly_linux_pyspark.yml
+++ b/tests/ci/azure_pipeline_test/dsvm_nightly_linux_pyspark.yml
@@ -33,6 +33,6 @@ extends:
     timeout: 180
     conda_env: "nightly_linux_spark"
     conda_opts: "python=3.6"
-    pip_opts: "[spark,examples]"
+    pip_opts: "[spark,examples,dev]"
    pytest_markers: "spark and not gpu"
    pytest_params: "-x"
diff --git a/tests/ci/azure_pipeline_test/dsvm_notebook_linux_cpu.yml b/tests/ci/azure_pipeline_test/dsvm_notebook_linux_cpu.yml
index b75cc0c3f5..93eaeacc84 100644
--- a/tests/ci/azure_pipeline_test/dsvm_notebook_linux_cpu.yml
+++ b/tests/ci/azure_pipeline_test/dsvm_notebook_linux_cpu.yml
@@ -60,5 +60,5 @@ extends:
     task_name: "Test - Unit Notebook Linux CPU"
     conda_env: "unit_notebook_linux_cpu"
     conda_opts: "python=3.6"
-    pip_opts: "[examples]"
+    pip_opts: "[examples,dev]"
    pytest_markers: "notebooks and not spark and not gpu"
diff --git a/tests/ci/azure_pipeline_test/dsvm_notebook_linux_gpu.yml b/tests/ci/azure_pipeline_test/dsvm_notebook_linux_gpu.yml
index 9cb44639e0..6d7594a143 100644
--- a/tests/ci/azure_pipeline_test/dsvm_notebook_linux_gpu.yml
+++ b/tests/ci/azure_pipeline_test/dsvm_notebook_linux_gpu.yml
@@ -60,5 +60,5 @@ extends:
     task_name: "Test - Unit Notebook Linux GPU"
     conda_env: "unit_notebook_linux_gpu"
     conda_opts: "python=3.6 cudatoolkit=10.0 \"cudnn>=7.6\""
-    pip_opts: "[gpu,examples] -f https://download.pytorch.org/whl/cu100/torch_stable.html"
+    pip_opts: "[gpu,examples,dev] -f https://download.pytorch.org/whl/cu100/torch_stable.html"
    pytest_markers: "notebooks and not spark and gpu"
diff --git a/tests/ci/azure_pipeline_test/dsvm_notebook_linux_pyspark.yml b/tests/ci/azure_pipeline_test/dsvm_notebook_linux_pyspark.yml
index 535f6936a7..31d699588d 100644
--- a/tests/ci/azure_pipeline_test/dsvm_notebook_linux_pyspark.yml
+++ b/tests/ci/azure_pipeline_test/dsvm_notebook_linux_pyspark.yml
@@ -60,5 +60,5 @@ extends:
     task_name: "Test - Unit Notebook Linux Spark"
     conda_env: "unit_notebook_linux_spark"
     conda_opts: "python=3.6"
-    pip_opts: "[spark,examples]"
+    pip_opts: "[spark,examples,dev]"
    pytest_markers: "notebooks and spark and not gpu"
diff --git a/tests/ci/azure_pipeline_test/dsvm_unit_linux_cpu.yml b/tests/ci/azure_pipeline_test/dsvm_unit_linux_cpu.yml
index be3b95c587..26ed5bdf2f 100644
--- a/tests/ci/azure_pipeline_test/dsvm_unit_linux_cpu.yml
+++ b/tests/ci/azure_pipeline_test/dsvm_unit_linux_cpu.yml
@@ -60,5 +60,5 @@ extends:
     task_name: "Test - Unit Linux CPU"
     conda_env: "unit_linux_cpu"
     conda_opts: "python=3.6"
-    pip_opts: ""
+    pip_opts: "[dev]"
    pytest_markers: "not notebooks and not spark and not gpu"
diff --git a/tests/ci/azure_pipeline_test/dsvm_unit_linux_gpu.yml b/tests/ci/azure_pipeline_test/dsvm_unit_linux_gpu.yml
index b9a76211d9..9aa46047e6 100644
--- a/tests/ci/azure_pipeline_test/dsvm_unit_linux_gpu.yml
+++ b/tests/ci/azure_pipeline_test/dsvm_unit_linux_gpu.yml
@@ -60,5 +60,5 @@ extends:
     task_name: "Test - Unit Linux GPU"
     conda_env: "unit_linux_gpu"
     conda_opts: "python=3.6 cudatoolkit=10.0 \"cudnn>=7.6\""
-    pip_opts: "[gpu] -f https://download.pytorch.org/whl/cu100/torch_stable.html"
+    pip_opts: "[gpu,dev] -f https://download.pytorch.org/whl/cu100/torch_stable.html"
    pytest_markers: "not notebooks and not spark and gpu"
diff --git a/tests/ci/azure_pipeline_test/dsvm_unit_linux_pyspark.yml b/tests/ci/azure_pipeline_test/dsvm_unit_linux_pyspark.yml
index f99b151cad..1f3006a05e 100644
--- a/tests/ci/azure_pipeline_test/dsvm_unit_linux_pyspark.yml
+++ b/tests/ci/azure_pipeline_test/dsvm_unit_linux_pyspark.yml
@@ -60,5 +60,5 @@ extends:
     task_name: "Test - Unit Linux Spark"
     conda_env: "unit_linux_spark"
     conda_opts: "python=3.6"
-    pip_opts: "[spark]"
+    pip_opts: "[spark,dev]"
    pytest_markers: "not notebooks and spark and not gpu"
diff --git a/tests/unit/examples/test_notebooks_pyspark.py b/tests/unit/examples/test_notebooks_pyspark.py
index e4ae1d9464..6ccd970492 100644
--- a/tests/unit/examples/test_notebooks_pyspark.py
+++ b/tests/unit/examples/test_notebooks_pyspark.py
@@ -8,6 +8,8 @@ except ImportError:
     pass  # disable error while collecting tests for non-notebook environments
 
+from recommenders.utils.constants import DEFAULT_RATING_COL, DEFAULT_USER_COL, DEFAULT_ITEM_COL
+
 
 @pytest.mark.notebooks
 @pytest.mark.spark
@@ -16,7 +18,13 @@
 )
 def test_als_pyspark_runs(notebooks, output_notebook, kernel_name):
     notebook_path = notebooks["als_pyspark"]
-    pm.execute_notebook(notebook_path, output_notebook, kernel_name=kernel_name)
+    pm.execute_notebook(notebook_path, output_notebook, kernel_name=kernel_name,
+        parameters=dict(
+            MOVIELENS_DATA_SIZE="mock100",
+            COL_USER=DEFAULT_USER_COL,
+            COL_ITEM=DEFAULT_ITEM_COL,
+            COL_RATING=DEFAULT_RATING_COL,
+        ))
 
 
 @pytest.mark.notebooks
@@ -33,7 +41,13 @@ def test_data_split_runs(notebooks, output_notebook, kernel_name):
 )
 def test_als_deep_dive_runs(notebooks, output_notebook, kernel_name):
     notebook_path = notebooks["als_deep_dive"]
-    pm.execute_notebook(notebook_path, output_notebook, kernel_name=kernel_name)
+    pm.execute_notebook(notebook_path, output_notebook, kernel_name=kernel_name,
+        parameters=dict(
+            MOVIELENS_DATA_SIZE="mock100",
+            COL_USER=DEFAULT_USER_COL,
+            COL_ITEM=DEFAULT_ITEM_COL,
+            COL_RATING=DEFAULT_RATING_COL,
+        ))
 
 
 @pytest.mark.notebooks
@@ -50,7 +64,14 @@ def test_evaluation_runs(notebooks, output_notebook, kernel_name):
 @pytest.mark.spark
 def test_evaluation_diversity_runs(notebooks, output_notebook, kernel_name):
     notebook_path = notebooks["evaluation_diversity"]
-    pm.execute_notebook(notebook_path, output_notebook, kernel_name=kernel_name)
+    pm.execute_notebook(notebook_path, output_notebook, kernel_name=kernel_name,
+        parameters=dict(
+            TOP_K=10,
+            MOVIELENS_DATA_SIZE="mock100",
+            COL_USER=DEFAULT_USER_COL,
+            COL_ITEM=DEFAULT_ITEM_COL,
+            COL_RATING=DEFAULT_RATING_COL,
+        ))
 
 
 @pytest.mark.notebooks
@@ -65,6 +86,7 @@ def test_spark_tuning(notebooks, output_notebook, kernel_name):
         output_notebook,
         kernel_name=kernel_name,
         parameters=dict(
+            MOVIELENS_DATA_SIZE="mock100",
             NUMBER_CORES="*",
             NUMBER_ITERATIONS=3,
             SUBSET_RATIO=0.5,
diff --git a/tests/unit/examples/test_notebooks_python.py b/tests/unit/examples/test_notebooks_python.py
index 76cd854d28..e9cda6810e 100644
--- a/tests/unit/examples/test_notebooks_python.py
+++ b/tests/unit/examples/test_notebooks_python.py
@@ -52,7 +52,8 @@ def test_baseline_deep_dive_runs(notebooks, output_notebook, kernel_name):
 @pytest.mark.notebooks
 def test_surprise_deep_dive_runs(notebooks, output_notebook, kernel_name):
     notebook_path = notebooks["surprise_svd_deep_dive"]
-    pm.execute_notebook(notebook_path, output_notebook, kernel_name=kernel_name)
+    pm.execute_notebook(notebook_path, output_notebook, kernel_name=kernel_name,
+        parameters=dict(MOVIELENS_DATA_SIZE="mock100"))
 
 
 @pytest.mark.notebooks
@@ -100,7 +101,8 @@ def test_wikidata_runs(notebooks, output_notebook, kernel_name, tmp):
 @pytest.mark.notebooks
 def test_rlrmc_quickstart_runs(notebooks, output_notebook, kernel_name):
     notebook_path = notebooks["rlrmc_quickstart"]
-    pm.execute_notebook(notebook_path, output_notebook, kernel_name=kernel_name)
+    pm.execute_notebook(notebook_path, output_notebook, kernel_name=kernel_name,
+        parameters=dict(rank_parameter=2, MOVIELENS_DATA_SIZE="mock100"))
 
 
 @pytest.mark.notebooks
diff --git a/tests/unit/recommenders/datasets/test_movielens.py b/tests/unit/recommenders/datasets/test_movielens.py
new file mode 100644
index 0000000000..d8f12771f9
--- /dev/null
+++ b/tests/unit/recommenders/datasets/test_movielens.py
@@ -0,0 +1,125 @@
+import os
+import pandas
+import pytest
+
+from recommenders.datasets.movielens import MockMovielensSchema
+from recommenders.datasets.movielens import load_pandas_df, load_spark_df
+from recommenders.datasets.movielens import DATA_FORMAT, MOCK_DATA_FORMAT, DEFAULT_HEADER
+from recommenders.utils.constants import DEFAULT_GENRE_COL, DEFAULT_TITLE_COL
+
+from pandas.core.series import Series
+from pytest_mock import MockerFixture
+
+
+@pytest.mark.parametrize("size", [10, 100])
+def test_mock_movielens_schema__has_default_col_names(size):
+    df = MockMovielensSchema.example(size=size)
+    for col_name in DEFAULT_HEADER:
+        assert col_name in df.columns
+
+
+@pytest.mark.parametrize("keep_first_n_cols", [1, 2, 3, 4])
+def test_mock_movielens_schema__get_df_remove_default_col__return_success(keep_first_n_cols):
+    df = MockMovielensSchema.get_df(size=3, keep_first_n_cols=keep_first_n_cols)
+    assert len(df) > 0
+    assert len(df.columns) == keep_first_n_cols
+
+
+@pytest.mark.parametrize("keep_first_n_cols", [-1, 0, 100])
+def test_mock_movielens_schema__get_df_invalid_param__return_failure(keep_first_n_cols):
+    with pytest.raises(ValueError, match=r"Invalid value.*"):
+        MockMovielensSchema.get_df(size=3, keep_first_n_cols=keep_first_n_cols)
+
+
+@pytest.mark.parametrize("keep_genre_col", [True, False])
+@pytest.mark.parametrize("keep_title_col", [True, False])
+@pytest.mark.parametrize("keep_first_n_cols", [None, 2])
+@pytest.mark.parametrize("seed", [-1])  # seed for pseudo-random generation
+@pytest.mark.parametrize("size", [0, 3, 10])
+def test_mock_movielens_schema__get_df__return_success(size, seed, keep_first_n_cols, keep_title_col, keep_genre_col):
+    df = MockMovielensSchema.get_df(
+        size=size, seed=seed,
+        keep_first_n_cols=keep_first_n_cols,
+        keep_title_col=keep_title_col, keep_genre_col=keep_genre_col
+    )
+    assert type(df) == pandas.DataFrame
+    assert len(df) == size
+
+    if keep_title_col:
+        assert len(df[DEFAULT_TITLE_COL]) == size
+    if keep_genre_col:
+        assert len(df[DEFAULT_GENRE_COL]) == size
+
+
+@pytest.mark.spark
+@pytest.mark.parametrize("keep_genre_col", [True, False])
+@pytest.mark.parametrize("keep_title_col", [True, False])
+@pytest.mark.parametrize("seed", [101])  # seed for pseudo-random generation
+@pytest.mark.parametrize("size", [0, 3, 10])
+def test_mock_movielens_schema__get_spark_df__return_success(spark, size, seed, keep_title_col, keep_genre_col):
+    df = MockMovielensSchema.get_spark_df(spark, size=size, seed=seed, keep_title_col=keep_title_col, keep_genre_col=keep_genre_col)
+    assert df.count() == size
+
+    if keep_title_col:
+        assert df.schema[DEFAULT_TITLE_COL]
+    if keep_genre_col:
+        assert df.schema[DEFAULT_GENRE_COL]
+
+
+@pytest.mark.spark
+def test_mock_movielens_schema__get_spark_df__store_tmp_file(spark, tmp_path):
+    data_size = 3
+    MockMovielensSchema.get_spark_df(spark, size=data_size, tmp_path=tmp_path)
+    assert os.path.exists(os.path.join(tmp_path, f"mock_movielens_{data_size}.csv"))
+
+
+@pytest.mark.spark
+def test_mock_movielens_schema__get_spark_df__data_serialization_default_param(spark, mocker: MockerFixture):
+    data_size = 3
+    to_csv_spy = mocker.spy(pandas.DataFrame, "to_csv")
+
+    df = MockMovielensSchema.get_spark_df(spark, size=data_size)
+    # assertions
+    to_csv_spy.assert_called_once()
+    assert df.count() == data_size
+
+
+def test_mock_movielens_data__no_name_collision():
+    """
+    Making sure that no common names are shared between the mock and real dataset sizes
+    """
+    dataset_name = set(DATA_FORMAT.keys())
+    dataset_name_mock = set(MOCK_DATA_FORMAT.keys())
+    collision = dataset_name.intersection(dataset_name_mock)
+    assert not collision
+
+
+@pytest.mark.spark
+def test_load_spark_df_mock_100__with_default_param__succeed(spark):
+    df = load_spark_df(spark, "mock100")
+    assert df.count() == 100
+
+
+def test_load_pandas_df_mock_100__with_default_param__succeed():
+    df = load_pandas_df("mock100")
+    assert type(df) == pandas.DataFrame
+    assert len(df) == 100
+
+
+@pytest.mark.spark
+def test_load_spark_df_mock_100__with_custom_param__succeed(spark):
+    df = load_spark_df(spark, "mock100", title_col=DEFAULT_TITLE_COL, genres_col=DEFAULT_GENRE_COL)
+    assert df.schema[DEFAULT_TITLE_COL]
+    assert df.schema[DEFAULT_GENRE_COL]
+    assert df.count() == 100
+    assert '|' in df.take(1)[0][DEFAULT_GENRE_COL]
+    assert df.take(1)[0][DEFAULT_TITLE_COL] == 'foo'
+
+
+def test_load_pandas_df_mock_100__with_custom_param__succeed():
+    df = load_pandas_df("mock100", title_col=DEFAULT_TITLE_COL, genres_col=DEFAULT_GENRE_COL)
+    assert type(df[DEFAULT_TITLE_COL]) == Series
+    assert type(df[DEFAULT_GENRE_COL]) == Series
+    assert len(df) == 100
+    assert '|' in df.loc[0, DEFAULT_GENRE_COL]
+    assert df.loc[0, DEFAULT_TITLE_COL] == 'foo'
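
For reference, a minimal usage sketch (not part of the patch itself; it only exercises behavior asserted by the tests above). Once the diff is applied, the mock size can be requested like any real MovieLens size:

    from recommenders.datasets.movielens import load_pandas_df

    # "mock100" synthesizes 100 rows locally instead of downloading MovieLens
    df = load_pandas_df(size="mock100")
    assert len(df) == 100
    assert set(df.columns) == {"userID", "itemID", "rating", "timestamp"}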