diff --git a/.github/actions/azureml-test/action.yml b/.github/actions/azureml-test/action.yml index 72b7e7dea1..67f23b6aa2 100644 --- a/.github/actions/azureml-test/action.yml +++ b/.github/actions/azureml-test/action.yml @@ -1,5 +1,5 @@ # --------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. +# Copyright (c) Recommenders contributors. # Licensed under the MIT License. # --------------------------------------------------------- diff --git a/.github/actions/get-test-groups/action.yml b/.github/actions/get-test-groups/action.yml index 849f57e8f2..3e803c800e 100644 --- a/.github/actions/get-test-groups/action.yml +++ b/.github/actions/get-test-groups/action.yml @@ -1,5 +1,5 @@ # --------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. +# Copyright (c) Recommenders contributors. # Licensed under the MIT License. # --------------------------------------------------------- diff --git a/.github/workflows/azureml-cpu-nightly.yml b/.github/workflows/azureml-cpu-nightly.yml index 36e3335512..ec4c809bd0 100644 --- a/.github/workflows/azureml-cpu-nightly.yml +++ b/.github/workflows/azureml-cpu-nightly.yml @@ -1,5 +1,5 @@ # --------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. +# Copyright (c) Recommenders contributors. # Licensed under the MIT License. # --------------------------------------------------------- diff --git a/.github/workflows/azureml-gpu-nightly.yml b/.github/workflows/azureml-gpu-nightly.yml index dc4601f106..08a7f00ccb 100644 --- a/.github/workflows/azureml-gpu-nightly.yml +++ b/.github/workflows/azureml-gpu-nightly.yml @@ -1,5 +1,5 @@ # --------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. +# Copyright (c) Recommenders contributors. # Licensed under the MIT License. # --------------------------------------------------------- diff --git a/.github/workflows/azureml-release-pipeline.yml b/.github/workflows/azureml-release-pipeline.yml index 6c1dd6438d..8475a9a2f6 100644 --- a/.github/workflows/azureml-release-pipeline.yml +++ b/.github/workflows/azureml-release-pipeline.yml @@ -1,4 +1,4 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. +# Copyright (c) Recommenders contributors. # Licensed under the MIT License. on: diff --git a/.github/workflows/azureml-spark-nightly.yml b/.github/workflows/azureml-spark-nightly.yml index 74695b1ce0..6079a5032f 100644 --- a/.github/workflows/azureml-spark-nightly.yml +++ b/.github/workflows/azureml-spark-nightly.yml @@ -1,5 +1,5 @@ # --------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. +# Copyright (c) Recommenders contributors. # Licensed under the MIT License. # --------------------------------------------------------- diff --git a/.github/workflows/azureml-unit-tests.yml b/.github/workflows/azureml-unit-tests.yml index 64761d52cf..f068f46f18 100644 --- a/.github/workflows/azureml-unit-tests.yml +++ b/.github/workflows/azureml-unit-tests.yml @@ -1,5 +1,5 @@ # --------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. +# Copyright (c) Recommenders contributors. # Licensed under the MIT License. 
# --------------------------------------------------------- diff --git a/.github/workflows/sarplus.yml b/.github/workflows/sarplus.yml index 3639735fa1..e5a25fa14e 100644 --- a/.github/workflows/sarplus.yml +++ b/.github/workflows/sarplus.yml @@ -1,5 +1,5 @@ # --------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. +# Copyright (c) Recommenders contributors. # Licensed under the MIT License. # --------------------------------------------------------- # This workflow will run tests and do packaging for contrib/sarplus. diff --git a/LICENSE b/LICENSE index 21071075c2..e74b0d177a 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,7 @@ MIT License - Copyright (c) Microsoft Corporation. All rights reserved. + Copyright (c) 2018-present Microsoft Corporation. + Copyright (c) 2023-present Recommenders contributors. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/contrib/sarplus/python/pysarplus/SARModel.py b/contrib/sarplus/python/pysarplus/SARModel.py index 3117fcd05d..86b331a5bd 100644 --- a/contrib/sarplus/python/pysarplus/SARModel.py +++ b/contrib/sarplus/python/pysarplus/SARModel.py @@ -1,4 +1,4 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. +# Copyright (c) Recommenders contributors. # Licensed under the MIT License. import pysarplus_cpp @@ -21,7 +21,9 @@ def __init__(self, path): sar_files = list(Path(path).glob("*" + SARModel.__extension)) sar_files.sort(key=os.path.getmtime, reverse=True) if len(sar_files) < 1: - raise ValueError(f"Directory '{path}' must contain at least 1 file ending in '{SARModel.__extension}'") + raise ValueError( + f"Directory '{path}' must contain at least 1 file ending in '{SARModel.__extension}'" + ) # instantiate C++ backend SARModel.__model = self.model = pysarplus_cpp.SARModelCpp(str(sar_files[0])) diff --git a/contrib/sarplus/python/pysarplus/SARPlus.py b/contrib/sarplus/python/pysarplus/SARPlus.py index e8fa561a78..16a0e73036 100644 --- a/contrib/sarplus/python/pysarplus/SARPlus.py +++ b/contrib/sarplus/python/pysarplus/SARPlus.py @@ -1,4 +1,4 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. +# Copyright (c) Recommenders contributors. # Licensed under the MIT License. """This is the implementation of SAR.""" diff --git a/contrib/sarplus/python/pysarplus/__init__.py b/contrib/sarplus/python/pysarplus/__init__.py index 0d922d7df1..c7afd83736 100644 --- a/contrib/sarplus/python/pysarplus/__init__.py +++ b/contrib/sarplus/python/pysarplus/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. +# Copyright (c) Recommenders contributors. # Licensed under the MIT License. from pathlib import Path @@ -10,7 +10,7 @@ __version__ = (Path(__file__).resolve().parent / "VERSION").read_text().strip() __author__ = "RecoDev Team at Microsoft" __license__ = "MIT" -__copyright__ = "Copyright 2018-present Microsoft Corporation" +__copyright__ = "Copyright 2018-present Recommenders contributors." # Synonyms TITLE = __title__ diff --git a/contrib/sarplus/python/setup.py b/contrib/sarplus/python/setup.py index a6204ec641..f24431775c 100644 --- a/contrib/sarplus/python/setup.py +++ b/contrib/sarplus/python/setup.py @@ -1,4 +1,4 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. +# Copyright (c) Recommenders contributors. # Licensed under the MIT License. 
from pathlib import Path @@ -26,10 +26,12 @@ def __str__(self): setup( name="pysarplus", - version=(Path(__file__).resolve().parent / "pysarplus" / "VERSION").read_text().strip(), + version=(Path(__file__).resolve().parent / "pysarplus" / "VERSION") + .read_text() + .strip(), description="SAR prediction for use with PySpark", long_description=(Path(__file__).resolve().parent / "README.md").read_text(), - long_description_content_type='text/markdown', + long_description_content_type="text/markdown", url="https://github.com/microsoft/recommenders/tree/main/contrib/sarplus", author="RecoDev Team at Microsoft", author_email="recodevteam@service.microsoft.com", diff --git a/contrib/sarplus/python/src/pysarplus.cpp b/contrib/sarplus/python/src/pysarplus.cpp index 0b06912740..18b45d3a59 100644 --- a/contrib/sarplus/python/src/pysarplus.cpp +++ b/contrib/sarplus/python/src/pysarplus.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) Microsoft Corporation. All rights reserved. + * Copyright (c) Recommenders contributors. * Licensed under the MIT License. */ diff --git a/contrib/sarplus/python/tests/conftest.py b/contrib/sarplus/python/tests/conftest.py index 44efbde4a7..6245024f79 100644 --- a/contrib/sarplus/python/tests/conftest.py +++ b/contrib/sarplus/python/tests/conftest.py @@ -1,4 +1,4 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. +# Copyright (c) Recommenders contributors. # Licensed under the MIT License. import calendar diff --git a/contrib/sarplus/python/tests/test_pyspark_sar.py b/contrib/sarplus/python/tests/test_pyspark_sar.py index cbcdc8b547..9848f561e8 100644 --- a/contrib/sarplus/python/tests/test_pyspark_sar.py +++ b/contrib/sarplus/python/tests/test_pyspark_sar.py @@ -1,4 +1,4 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. +# Copyright (c) Recommenders contributors. # Licensed under the MIT License. import math @@ -381,7 +381,13 @@ def test_userpred( df = spark.createDataFrame(demo_usage_data) model.fit(df) - url = sar_settings["FILE_DIR"] + "userpred_" + file + str(threshold) + "_userid_only.csv" + url = ( + sar_settings["FILE_DIR"] + + "userpred_" + + file + + str(threshold) + + "_userid_only.csv" + ) pred_ref = pd.read_csv(url) pred_ref = ( diff --git a/contrib/sarplus/scala/build.sbt b/contrib/sarplus/scala/build.sbt index 7883c972da..f4a6d49251 100644 --- a/contrib/sarplus/scala/build.sbt +++ b/contrib/sarplus/scala/build.sbt @@ -1,5 +1,5 @@ /* - * Copyright (c) Microsoft Corporation. All rights reserved. + * Copyright (c) Recommenders contributors. * Licensed under the MIT License. */ diff --git a/contrib/sarplus/scala/compat/src/main/scala/com/microsoft/sarplus/compat/spark/since3p2defvisible.scala b/contrib/sarplus/scala/compat/src/main/scala/com/microsoft/sarplus/compat/spark/since3p2defvisible.scala index 16f89922f1..1613c838a6 100644 --- a/contrib/sarplus/scala/compat/src/main/scala/com/microsoft/sarplus/compat/spark/since3p2defvisible.scala +++ b/contrib/sarplus/scala/compat/src/main/scala/com/microsoft/sarplus/compat/spark/since3p2defvisible.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) Microsoft Corporation. All rights reserved. + * Copyright (c) Recommenders contributors. * Licensed under the MIT License. 
*/ diff --git a/contrib/sarplus/scala/python/pysarplus_dummy/__init__.py b/contrib/sarplus/scala/python/pysarplus_dummy/__init__.py index 0720a92163..ee755b37b9 100644 --- a/contrib/sarplus/scala/python/pysarplus_dummy/__init__.py +++ b/contrib/sarplus/scala/python/pysarplus_dummy/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. +# Copyright (c) Recommenders contributors. # Licensed under the MIT License. installed = 1 diff --git a/contrib/sarplus/scala/python/setup.py b/contrib/sarplus/scala/python/setup.py index 183db208f6..0b821a3dd2 100644 --- a/contrib/sarplus/scala/python/setup.py +++ b/contrib/sarplus/scala/python/setup.py @@ -1,4 +1,4 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. +# Copyright (c) Recommenders contributors. # Licensed under the MIT License. from distutils.core import setup @@ -6,7 +6,9 @@ setup( name="pysarplus_dummy", - version=(Path(__file__).resolve().parent.parent.parent / "VERSION").read_text().strip(), + version=(Path(__file__).resolve().parent.parent.parent / "VERSION") + .read_text() + .strip(), description="pysarplus dummy package to trigger spark packaging", author="RecoDev Team at Microsoft", author_email="recodevteam@service.microsoft.com", diff --git a/contrib/sarplus/scala/src/main/scala/com/microsoft/sarplus/DefaultSource.scala b/contrib/sarplus/scala/src/main/scala/com/microsoft/sarplus/DefaultSource.scala index f7a1da5376..57148e3ac2 100644 --- a/contrib/sarplus/scala/src/main/scala/com/microsoft/sarplus/DefaultSource.scala +++ b/contrib/sarplus/scala/src/main/scala/com/microsoft/sarplus/DefaultSource.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) Microsoft Corporation. All rights reserved. + * Copyright (c) Recommenders contributors. * Licensed under the MIT License. */ diff --git a/contrib/sarplus/scala/src/main/scala/com/microsoft/sarplus/SARCacheOutputWriter.scala b/contrib/sarplus/scala/src/main/scala/com/microsoft/sarplus/SARCacheOutputWriter.scala index 49c924c6c7..e2450e1671 100644 --- a/contrib/sarplus/scala/src/main/scala/com/microsoft/sarplus/SARCacheOutputWriter.scala +++ b/contrib/sarplus/scala/src/main/scala/com/microsoft/sarplus/SARCacheOutputWriter.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) Microsoft Corporation. All rights reserved. + * Copyright (c) Recommenders contributors. * Licensed under the MIT License. */ diff --git a/contrib/sarplus/scala/src/main/scala/com/microsoft/sarplus/SARCacheOutputWriterFactory.scala b/contrib/sarplus/scala/src/main/scala/com/microsoft/sarplus/SARCacheOutputWriterFactory.scala index 2e41effa3a..e447e94d2e 100644 --- a/contrib/sarplus/scala/src/main/scala/com/microsoft/sarplus/SARCacheOutputWriterFactory.scala +++ b/contrib/sarplus/scala/src/main/scala/com/microsoft/sarplus/SARCacheOutputWriterFactory.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) Microsoft Corporation. All rights reserved. + * Copyright (c) Recommenders contributors. * Licensed under the MIT License. */ diff --git a/contrib/sarplus/scala/src/test/scala/com/microsoft/sarplus/SARCacheOutputWriterSpec.scala b/contrib/sarplus/scala/src/test/scala/com/microsoft/sarplus/SARCacheOutputWriterSpec.scala index 7565965e80..8173f78245 100644 --- a/contrib/sarplus/scala/src/test/scala/com/microsoft/sarplus/SARCacheOutputWriterSpec.scala +++ b/contrib/sarplus/scala/src/test/scala/com/microsoft/sarplus/SARCacheOutputWriterSpec.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) Microsoft Corporation. All rights reserved. + * Copyright (c) Recommenders contributors. * Licensed under the MIT License. 
*/ diff --git a/docs/Makefile b/docs/Makefile index cabf815b4b..2fe93422bd 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -1,4 +1,4 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. +# Copyright (c) Recommenders contributors. # Licensed under the MIT License. # You can set these variables from the command line. diff --git a/docs/source/conf.py b/docs/source/conf.py index 5038c2a6e2..caed50ab7b 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -1,4 +1,4 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. +# Copyright (c) Recommenders contributors. # Licensed under the MIT License. # -*- coding: utf-8 -*- diff --git a/examples/00_quick_start/als_movielens.ipynb b/examples/00_quick_start/als_movielens.ipynb index d184ee5109..4e1c21a095 100644 --- a/examples/00_quick_start/als_movielens.ipynb +++ b/examples/00_quick_start/als_movielens.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Copyright (c) Microsoft Corporation. All rights reserved.\n", + "Copyright (c) Recommenders contributors.\n", "\n", "Licensed under the MIT License." ] @@ -298,7 +298,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "\r", + "\r\n", " \r" ] } @@ -425,7 +425,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "\r", + "\r\n", " \r" ] } @@ -494,9 +494,9 @@ "name": "stderr", "output_type": "stream", "text": [ - "\r", - "[Stage 500:=================================================> (186 + 3) / 200]\r", - "\r", + "\r\n", + "[Stage 500:=================================================> (186 + 3) / 200]\r\n", + "\r\n", " \r" ] } @@ -534,7 +534,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "\r", + "\r\n", " \r" ] } @@ -627,9 +627,9 @@ "name": "stderr", "output_type": "stream", "text": [ - "\r", - "[Stage 904:> (0 + 2) / 2]\r", - "\r", + "\r\n", + "[Stage 904:> (0 + 2) / 2]\r\n", + "\r\n", " \r" ] }, diff --git a/examples/00_quick_start/dkn_MIND.ipynb b/examples/00_quick_start/dkn_MIND.ipynb index efe179682d..d37bb2cd32 100644 --- a/examples/00_quick_start/dkn_MIND.ipynb +++ b/examples/00_quick_start/dkn_MIND.ipynb @@ -1,398 +1,398 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Copyright (c) Microsoft Corporation. All rights reserved.\n", - "\n", - "Licensed under the MIT License." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# DKN : Deep Knowledge-Aware Network for News Recommendation\n", - "\n", - "DKN \\[1\\] is a deep learning model which incorporates information from knowledge graph for better news recommendation. Specifically, DKN uses TransX \\[2\\] method for knowledge graph representation learning, then applies a CNN framework, named KCNN, to combine entity embedding with word embedding and generate a final embedding vector for a news article. CTR prediction is made via an attention-based neural scorer. \n", - "\n", - "## Properties of DKN:\n", - "\n", - "- DKN is a content-based deep model for CTR prediction rather than traditional ID-based collaborative filtering. \n", - "- It makes use of knowledge entities and common sense in news content via joint learning from semantic-level and knowledge-level representations of news articles.\n", - "- DKN uses an attention module to dynamically calculate a user's aggregated historical representaition.\n", - "\n", - "\n", - "## Data format:\n", - "\n", - "### DKN takes several files as input as follows:\n", - "\n", - "- **training / validation / test files**: each line in these files represents one instance. 
Impressionid is used to evaluate performance within an impression session, so it is only used when evaluating, you can set it to 0 for training data. The format is :
\n", - "`[label] [userid] [CandidateNews]%[impressionid] `
\n", - "e.g., `1 train_U1 N1%0`
\n", - "\n", - "- **user history file**: each line in this file represents a users' click history. You need to set `history_size` parameter in the config file, which is the max number of user's click history we use. We will automatically keep the last `history_size` number of user click history, if user's click history is more than `history_size`, and we will automatically pad with 0 if user's click history is less than `history_size`. the format is :
\n", - "`[Userid] [newsid1,newsid2...]`
\n", - "e.g., `train_U1 N1,N2`
\n", - "\n", - "- **document feature file**: It contains the word and entity features for news articles. News articles are represented by aligned title words and title entities. To take a quick example, a news title may be: \"Trump to deliver State of the Union address next week\", then the title words value may be `CandidateNews:34,45,334,23,12,987,3456,111,456,432` and the title entitie value may be: `entity:45,0,0,0,0,0,0,0,0,0`. Only the first value of entity vector is non-zero due to the word \"Trump\". The title value and entity value is hashed from 1 to `n` (where `n` is the number of distinct words or entities). Each feature length should be fixed at k (`doc_size` parameter), if the number of words in document is more than k, you should truncate the document to k words, and if the number of words in document is less than k, you should pad 0 to the end. \n", - "the format is like:
\n", - "`[Newsid] [w1,w2,w3...wk] [e1,e2,e3...ek]`\n", - "\n", - "- **word embedding/entity embedding/ context embedding files**: These are `*.npy` files of pretrained embeddings. After loading, each file is a `[n+1,k]` two-dimensional matrix, n is the number of words(or entities) of their hash dictionary, k is dimension of the embedding, note that we keep embedding 0 for zero padding. \n", - "\n", - "In this experiment, we used GloVe \\[4\\] vectors to initialize the word embedding. We trained entity embedding using TransE \\[2\\] on knowledge graph and context embedding is the average of the entity's neighbors in the knowledge graph.
\n", - "\n", - "## MIND dataset\n", - "\n", - "MIND dataset\\[3\\] is a large-scale English news dataset. It was collected from anonymized behavior logs of Microsoft News website. MIND contains 1,000,000 users, 161,013 news articles and 15,777,377 impression logs. Every news article contains rich textual content including title, abstract, body, category and entities. Each impression log contains the click events, non-clicked events and historical news click behaviors of this user before this impression.\n", - "\n", - "In this notebook we are going to use a subset of MIND dataset, **MIND demo**. MIND demo contains 500 users, 9,432 news articles and 6,134 impression logs. \n", - "\n", - "For this quick start notebook, we are providing directly all the necessary word embeddings, entity embeddings and context embedding files." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Global settings and imports" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "pycharm": { - "is_executing": false - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "System version: 3.9.16 (main, May 15 2023, 23:46:34) \n", - "[GCC 11.2.0]\n", - "Tensorflow version: 2.7.4\n" - ] - } - ], - "source": [ - "import warnings\n", - "warnings.filterwarnings(\"ignore\")\n", - "\n", - "import os\n", - "import sys\n", - "from tempfile import TemporaryDirectory\n", - "import scrapbook as sb\n", - "import tensorflow as tf\n", - "tf.get_logger().setLevel(\"ERROR\") # only show error messages\n", - "tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)\n", - "\n", - "from recommenders.models.deeprec.deeprec_utils import download_deeprec_resources, prepare_hparams\n", - "from recommenders.models.deeprec.models.dkn import DKN\n", - "from recommenders.models.deeprec.io.dkn_iterator import DKNTextIterator\n", - "\n", - "print(f\"System version: {sys.version}\")\n", - "print(f\"Tensorflow version: {tf.__version__}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Download and load data" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "pycharm": { - "is_executing": false - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|███████████████████████████████████████████████████████████████████████████████| 11.3k/11.3k [01:39<00:00, 113KB/s]\n" - ] - } - ], - "source": [ - "tmpdir = TemporaryDirectory()\n", - "data_path = os.path.join(tmpdir.name, \"mind-demo-dkn\")\n", - "\n", - "yaml_file = os.path.join(data_path, \"dkn.yaml\")\n", - "train_file = os.path.join(data_path, \"train_mind_demo.txt\")\n", - "valid_file = os.path.join(data_path, \"valid_mind_demo.txt\")\n", - "test_file = os.path.join(data_path, \"test_mind_demo.txt\")\n", - "news_feature_file = os.path.join(data_path, \"doc_feature.txt\")\n", - "user_history_file = os.path.join(data_path, \"user_history.txt\")\n", - "wordEmb_file = os.path.join(data_path, \"word_embeddings_100.npy\")\n", - "entityEmb_file = os.path.join(data_path, \"TransE_entity2vec_100.npy\")\n", - "contextEmb_file = os.path.join(data_path, \"TransE_context2vec_100.npy\")\n", - "if not os.path.exists(yaml_file):\n", - " download_deeprec_resources(\"https://recodatasets.z20.web.core.windows.net/deeprec/\", tmpdir.name, \"mind-demo-dkn.zip\")\n", - " " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create hyper-parameters" - ] - }, - { - "cell_type": "code", - "execution_count": 3, 
- "metadata": { - "pycharm": { - "is_executing": false - }, - "tags": [ - "parameters" - ] - }, - "outputs": [], - "source": [ - "EPOCHS = 10\n", - "HISTORY_SIZE = 50\n", - "BATCH_SIZE = 500" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "pycharm": { - "is_executing": false - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "HParams object with values {'use_entity': True, 'use_context': True, 'cross_activation': 'identity', 'user_dropout': False, 'dropout': [0.0], 'attention_dropout': 0.0, 'load_saved_model': False, 'fast_CIN_d': 0, 'use_Linear_part': False, 'use_FM_part': False, 'use_CIN_part': False, 'use_DNN_part': False, 'init_method': 'uniform', 'init_value': 0.1, 'embed_l2': 1e-06, 'embed_l1': 0.0, 'layer_l2': 1e-06, 'layer_l1': 0.0, 'cross_l2': 0.0, 'cross_l1': 0.0, 'reg_kg': 0.0, 'learning_rate': 0.0005, 'lr_rs': 1, 'lr_kg': 0.5, 'kg_training_interval': 5, 'max_grad_norm': 2, 'is_clip_norm': 0, 'dtype': 32, 'optimizer': 'adam', 'epochs': 10, 'batch_size': 500, 'enable_BN': True, 'show_step': 10000, 'save_model': False, 'save_epoch': 2, 'write_tfevents': False, 'train_num_ngs': 4, 'need_sample': True, 'embedding_dropout': 0.0, 'EARLY_STOP': 100, 'min_seq_length': 1, 'slots': 5, 'cell': 'SUM', 'doc_size': 10, 'history_size': 50, 'word_size': 12600, 'entity_size': 3987, 'data_format': 'dkn', 'metrics': ['auc'], 'pairwise_metrics': ['group_auc', 'mean_mrr', 'ndcg@5;10'], 'method': 'classification', 'activation': ['sigmoid'], 'attention_activation': 'relu', 'attention_layer_sizes': 100, 'dim': 100, 'entity_dim': 100, 'transform': True, 'filter_sizes': [1, 2, 3], 'layer_sizes': [300], 'model_type': 'dkn', 'num_filters': 100, 'loss': 'log_loss', 'news_feature_file': '/tmp/tmpgy77utho/mind-demo-dkn/doc_feature.txt', 'user_history_file': '/tmp/tmpgy77utho/mind-demo-dkn/user_history.txt', 'wordEmb_file': '/tmp/tmpgy77utho/mind-demo-dkn/word_embeddings_100.npy', 'entityEmb_file': '/tmp/tmpgy77utho/mind-demo-dkn/TransE_entity2vec_100.npy', 'contextEmb_file': '/tmp/tmpgy77utho/mind-demo-dkn/TransE_context2vec_100.npy'}\n" - ] - } - ], - "source": [ - "hparams = prepare_hparams(yaml_file,\n", - " news_feature_file = news_feature_file,\n", - " user_history_file = user_history_file,\n", - " wordEmb_file=wordEmb_file,\n", - " entityEmb_file=entityEmb_file,\n", - " contextEmb_file=contextEmb_file,\n", - " epochs=EPOCHS,\n", - " history_size=HISTORY_SIZE,\n", - " batch_size=BATCH_SIZE)\n", - "print(hparams)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Train the DKN model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "is_executing": false - } - }, - "outputs": [], - "source": [ - "model = DKN(hparams, DKNTextIterator)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "pycharm": { - "is_executing": false - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'auc': 0.5218, 'group_auc': 0.5071, 'mean_mrr': 0.1494, 'ndcg@5': 0.1539, 'ndcg@10': 0.2125}\n" - ] - } - ], - "source": [ - "print(model.run_eval(valid_file))" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "pycharm": { - "is_executing": false - }, - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "at epoch 1\n", - "train info: logloss loss:0.6945172200600306\n", - "eval info: auc:0.5929, group_auc:0.5633, mean_mrr:0.1834, ndcg@10:0.2511, 
ndcg@5:0.1939\n", - "at epoch 1 , train time: 39.8 eval time: 8.8\n", - "at epoch 2\n", - "train info: logloss loss:0.6527644917368889\n", - "eval info: auc:0.5877, group_auc:0.5499, mean_mrr:0.1891, ndcg@10:0.2542, ndcg@5:0.2013\n", - "at epoch 2 , train time: 36.0 eval time: 9.0\n", - "at epoch 3\n", - "train info: logloss loss:0.6361906168361505\n", - "eval info: auc:0.6013, group_auc:0.5799, mean_mrr:0.1999, ndcg@10:0.2703, ndcg@5:0.2078\n", - "at epoch 3 , train time: 36.0 eval time: 9.0\n", - "at epoch 4\n", - "train info: logloss loss:0.6205979473888874\n", - "eval info: auc:0.611, group_auc:0.5862, mean_mrr:0.1851, ndcg@10:0.2624, ndcg@5:0.1853\n", - "at epoch 4 , train time: 36.1 eval time: 8.9\n", - "at epoch 5\n", - "train info: logloss loss:0.6062351117531458\n", - "eval info: auc:0.6148, group_auc:0.5931, mean_mrr:0.1947, ndcg@10:0.2715, ndcg@5:0.1951\n", - "at epoch 5 , train time: 36.2 eval time: 9.0\n", - "at epoch 6\n", - "train info: logloss loss:0.5931083386143049\n", - "eval info: auc:0.6153, group_auc:0.5942, mean_mrr:0.2015, ndcg@10:0.2737, ndcg@5:0.2084\n", - "at epoch 6 , train time: 36.3 eval time: 9.3\n", - "at epoch 7\n", - "train info: logloss loss:0.582433108240366\n", - "eval info: auc:0.6268, group_auc:0.5981, mean_mrr:0.2011, ndcg@10:0.2765, ndcg@5:0.2085\n", - "at epoch 7 , train time: 36.4 eval time: 10.3\n", - "at epoch 8\n", - "train info: logloss loss:0.5735978713879982\n", - "eval info: auc:0.6263, group_auc:0.6052, mean_mrr:0.2034, ndcg@10:0.279, ndcg@5:0.217\n", - "at epoch 8 , train time: 36.8 eval time: 9.2\n", - "at epoch 9\n", - "train info: logloss loss:0.5567030770083269\n", - "eval info: auc:0.62, group_auc:0.5958, mean_mrr:0.1942, ndcg@10:0.2688, ndcg@5:0.2019\n", - "at epoch 9 , train time: 39.3 eval time: 11.0\n", - "at epoch 10\n", - "train info: logloss loss:0.5417348792155584\n", - "eval info: auc:0.6198, group_auc:0.6035, mean_mrr:0.1929, ndcg@10:0.2692, ndcg@5:0.201\n", - "at epoch 10 , train time: 46.3 eval time: 13.2\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "model.fit(train_file, valid_file)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Evaluate the DKN model\n", - "\n", - "Now we can check the performance on the test set:" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "pycharm": { - "is_executing": false - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'auc': 0.6227, 'group_auc': 0.5963, 'mean_mrr': 0.2014, 'ndcg@5': 0.2066, 'ndcg@10': 0.28}\n" - ] - } - ], - "source": [ - "res = model.run_eval(test_file)\n", - "print(res)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sb.glue(\"res\", res)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## References\n", - "\n", - "\\[1\\] Wang, Hongwei, et al. \"DKN: Deep Knowledge-Aware Network for News Recommendation.\" Proceedings of the 2018 World Wide Web Conference on World Wide Web. International World Wide Web Conferences Steering Committee, 2018.
\n", - "\\[2\\] Knowledge Graph Embeddings including TransE, TransH, TransR and PTransE. https://github.com/thunlp/KB2E
\n", - "\\[3\\] Wu, Fangzhao, et al. \"MIND: A Large-scale Dataset for News Recommendation\" Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics. https://msnews.github.io/competition.html
\n", - "\\[4\\] GloVe: Global Vectors for Word Representation. https://nlp.stanford.edu/projects/glove/" - ] - } - ], - "metadata": { - "celltoolbar": "Tags", - "interpreter": { - "hash": "3a9a0c422ff9f08d62211b9648017c63b0a26d2c935edc37ebb8453675d13bb5" - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.16" - }, - "pycharm": { - "stem_cell": { - "cell_type": "raw", + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Copyright (c) Recommenders contributors.\n", + "\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# DKN : Deep Knowledge-Aware Network for News Recommendation\n", + "\n", + "DKN \\[1\\] is a deep learning model which incorporates information from knowledge graph for better news recommendation. Specifically, DKN uses TransX \\[2\\] method for knowledge graph representation learning, then applies a CNN framework, named KCNN, to combine entity embedding with word embedding and generate a final embedding vector for a news article. CTR prediction is made via an attention-based neural scorer. \n", + "\n", + "## Properties of DKN:\n", + "\n", + "- DKN is a content-based deep model for CTR prediction rather than traditional ID-based collaborative filtering. \n", + "- It makes use of knowledge entities and common sense in news content via joint learning from semantic-level and knowledge-level representations of news articles.\n", + "- DKN uses an attention module to dynamically calculate a user's aggregated historical representaition.\n", + "\n", + "\n", + "## Data format:\n", + "\n", + "### DKN takes several files as input as follows:\n", + "\n", + "- **training / validation / test files**: each line in these files represents one instance. Impressionid is used to evaluate performance within an impression session, so it is only used when evaluating, you can set it to 0 for training data. The format is :
\n", + "`[label] [userid] [CandidateNews]%[impressionid] `
\n", + "e.g., `1 train_U1 N1%0`
\n", + "\n", + "- **user history file**: each line in this file represents a users' click history. You need to set `history_size` parameter in the config file, which is the max number of user's click history we use. We will automatically keep the last `history_size` number of user click history, if user's click history is more than `history_size`, and we will automatically pad with 0 if user's click history is less than `history_size`. the format is :
\n", + "`[Userid] [newsid1,newsid2...]`
\n", + "e.g., `train_U1 N1,N2`
\n", + "\n", + "- **document feature file**: It contains the word and entity features for news articles. News articles are represented by aligned title words and title entities. To take a quick example, a news title may be: \"Trump to deliver State of the Union address next week\", then the title words value may be `CandidateNews:34,45,334,23,12,987,3456,111,456,432` and the title entitie value may be: `entity:45,0,0,0,0,0,0,0,0,0`. Only the first value of entity vector is non-zero due to the word \"Trump\". The title value and entity value is hashed from 1 to `n` (where `n` is the number of distinct words or entities). Each feature length should be fixed at k (`doc_size` parameter), if the number of words in document is more than k, you should truncate the document to k words, and if the number of words in document is less than k, you should pad 0 to the end. \n", + "the format is like:
\n", + "`[Newsid] [w1,w2,w3...wk] [e1,e2,e3...ek]`\n", + "\n", + "- **word embedding/entity embedding/ context embedding files**: These are `*.npy` files of pretrained embeddings. After loading, each file is a `[n+1,k]` two-dimensional matrix, n is the number of words(or entities) of their hash dictionary, k is dimension of the embedding, note that we keep embedding 0 for zero padding. \n", + "\n", + "In this experiment, we used GloVe \\[4\\] vectors to initialize the word embedding. We trained entity embedding using TransE \\[2\\] on knowledge graph and context embedding is the average of the entity's neighbors in the knowledge graph.
\n", + "\n", + "## MIND dataset\n", + "\n", + "MIND dataset\\[3\\] is a large-scale English news dataset. It was collected from anonymized behavior logs of Microsoft News website. MIND contains 1,000,000 users, 161,013 news articles and 15,777,377 impression logs. Every news article contains rich textual content including title, abstract, body, category and entities. Each impression log contains the click events, non-clicked events and historical news click behaviors of this user before this impression.\n", + "\n", + "In this notebook we are going to use a subset of MIND dataset, **MIND demo**. MIND demo contains 500 users, 9,432 news articles and 6,134 impression logs. \n", + "\n", + "For this quick start notebook, we are providing directly all the necessary word embeddings, entity embeddings and context embedding files." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Global settings and imports" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "System version: 3.9.16 (main, May 15 2023, 23:46:34) \n", + "[GCC 11.2.0]\n", + "Tensorflow version: 2.7.4\n" + ] + } + ], + "source": [ + "import warnings\n", + "warnings.filterwarnings(\"ignore\")\n", + "\n", + "import os\n", + "import sys\n", + "from tempfile import TemporaryDirectory\n", + "import scrapbook as sb\n", + "import tensorflow as tf\n", + "tf.get_logger().setLevel(\"ERROR\") # only show error messages\n", + "tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)\n", + "\n", + "from recommenders.models.deeprec.deeprec_utils import download_deeprec_resources, prepare_hparams\n", + "from recommenders.models.deeprec.models.dkn import DKN\n", + "from recommenders.models.deeprec.io.dkn_iterator import DKNTextIterator\n", + "\n", + "print(f\"System version: {sys.version}\")\n", + "print(f\"Tensorflow version: {tf.__version__}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download and load data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|███████████████████████████████████████████████████████████████████████████████| 11.3k/11.3k [01:39<00:00, 113KB/s]\n" + ] + } + ], + "source": [ + "tmpdir = TemporaryDirectory()\n", + "data_path = os.path.join(tmpdir.name, \"mind-demo-dkn\")\n", + "\n", + "yaml_file = os.path.join(data_path, \"dkn.yaml\")\n", + "train_file = os.path.join(data_path, \"train_mind_demo.txt\")\n", + "valid_file = os.path.join(data_path, \"valid_mind_demo.txt\")\n", + "test_file = os.path.join(data_path, \"test_mind_demo.txt\")\n", + "news_feature_file = os.path.join(data_path, \"doc_feature.txt\")\n", + "user_history_file = os.path.join(data_path, \"user_history.txt\")\n", + "wordEmb_file = os.path.join(data_path, \"word_embeddings_100.npy\")\n", + "entityEmb_file = os.path.join(data_path, \"TransE_entity2vec_100.npy\")\n", + "contextEmb_file = os.path.join(data_path, \"TransE_context2vec_100.npy\")\n", + "if not os.path.exists(yaml_file):\n", + " download_deeprec_resources(\"https://recodatasets.z20.web.core.windows.net/deeprec/\", tmpdir.name, \"mind-demo-dkn.zip\")\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create hyper-parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 3, 
+ "metadata": { + "pycharm": { + "is_executing": false + }, + "tags": [ + "parameters" + ] + }, + "outputs": [], + "source": [ + "EPOCHS = 10\n", + "HISTORY_SIZE = 50\n", + "BATCH_SIZE = 500" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "HParams object with values {'use_entity': True, 'use_context': True, 'cross_activation': 'identity', 'user_dropout': False, 'dropout': [0.0], 'attention_dropout': 0.0, 'load_saved_model': False, 'fast_CIN_d': 0, 'use_Linear_part': False, 'use_FM_part': False, 'use_CIN_part': False, 'use_DNN_part': False, 'init_method': 'uniform', 'init_value': 0.1, 'embed_l2': 1e-06, 'embed_l1': 0.0, 'layer_l2': 1e-06, 'layer_l1': 0.0, 'cross_l2': 0.0, 'cross_l1': 0.0, 'reg_kg': 0.0, 'learning_rate': 0.0005, 'lr_rs': 1, 'lr_kg': 0.5, 'kg_training_interval': 5, 'max_grad_norm': 2, 'is_clip_norm': 0, 'dtype': 32, 'optimizer': 'adam', 'epochs': 10, 'batch_size': 500, 'enable_BN': True, 'show_step': 10000, 'save_model': False, 'save_epoch': 2, 'write_tfevents': False, 'train_num_ngs': 4, 'need_sample': True, 'embedding_dropout': 0.0, 'EARLY_STOP': 100, 'min_seq_length': 1, 'slots': 5, 'cell': 'SUM', 'doc_size': 10, 'history_size': 50, 'word_size': 12600, 'entity_size': 3987, 'data_format': 'dkn', 'metrics': ['auc'], 'pairwise_metrics': ['group_auc', 'mean_mrr', 'ndcg@5;10'], 'method': 'classification', 'activation': ['sigmoid'], 'attention_activation': 'relu', 'attention_layer_sizes': 100, 'dim': 100, 'entity_dim': 100, 'transform': True, 'filter_sizes': [1, 2, 3], 'layer_sizes': [300], 'model_type': 'dkn', 'num_filters': 100, 'loss': 'log_loss', 'news_feature_file': '/tmp/tmpgy77utho/mind-demo-dkn/doc_feature.txt', 'user_history_file': '/tmp/tmpgy77utho/mind-demo-dkn/user_history.txt', 'wordEmb_file': '/tmp/tmpgy77utho/mind-demo-dkn/word_embeddings_100.npy', 'entityEmb_file': '/tmp/tmpgy77utho/mind-demo-dkn/TransE_entity2vec_100.npy', 'contextEmb_file': '/tmp/tmpgy77utho/mind-demo-dkn/TransE_context2vec_100.npy'}\n" + ] + } + ], + "source": [ + "hparams = prepare_hparams(yaml_file,\n", + " news_feature_file = news_feature_file,\n", + " user_history_file = user_history_file,\n", + " wordEmb_file=wordEmb_file,\n", + " entityEmb_file=entityEmb_file,\n", + " contextEmb_file=contextEmb_file,\n", + " epochs=EPOCHS,\n", + " history_size=HISTORY_SIZE,\n", + " batch_size=BATCH_SIZE)\n", + "print(hparams)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Train the DKN model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "is_executing": false + } + }, + "outputs": [], + "source": [ + "model = DKN(hparams, DKNTextIterator)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'auc': 0.5218, 'group_auc': 0.5071, 'mean_mrr': 0.1494, 'ndcg@5': 0.1539, 'ndcg@10': 0.2125}\n" + ] + } + ], + "source": [ + "print(model.run_eval(valid_file))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "pycharm": { + "is_executing": false + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "at epoch 1\n", + "train info: logloss loss:0.6945172200600306\n", + "eval info: auc:0.5929, group_auc:0.5633, mean_mrr:0.1834, ndcg@10:0.2511, 
ndcg@5:0.1939\n", + "at epoch 1 , train time: 39.8 eval time: 8.8\n", + "at epoch 2\n", + "train info: logloss loss:0.6527644917368889\n", + "eval info: auc:0.5877, group_auc:0.5499, mean_mrr:0.1891, ndcg@10:0.2542, ndcg@5:0.2013\n", + "at epoch 2 , train time: 36.0 eval time: 9.0\n", + "at epoch 3\n", + "train info: logloss loss:0.6361906168361505\n", + "eval info: auc:0.6013, group_auc:0.5799, mean_mrr:0.1999, ndcg@10:0.2703, ndcg@5:0.2078\n", + "at epoch 3 , train time: 36.0 eval time: 9.0\n", + "at epoch 4\n", + "train info: logloss loss:0.6205979473888874\n", + "eval info: auc:0.611, group_auc:0.5862, mean_mrr:0.1851, ndcg@10:0.2624, ndcg@5:0.1853\n", + "at epoch 4 , train time: 36.1 eval time: 8.9\n", + "at epoch 5\n", + "train info: logloss loss:0.6062351117531458\n", + "eval info: auc:0.6148, group_auc:0.5931, mean_mrr:0.1947, ndcg@10:0.2715, ndcg@5:0.1951\n", + "at epoch 5 , train time: 36.2 eval time: 9.0\n", + "at epoch 6\n", + "train info: logloss loss:0.5931083386143049\n", + "eval info: auc:0.6153, group_auc:0.5942, mean_mrr:0.2015, ndcg@10:0.2737, ndcg@5:0.2084\n", + "at epoch 6 , train time: 36.3 eval time: 9.3\n", + "at epoch 7\n", + "train info: logloss loss:0.582433108240366\n", + "eval info: auc:0.6268, group_auc:0.5981, mean_mrr:0.2011, ndcg@10:0.2765, ndcg@5:0.2085\n", + "at epoch 7 , train time: 36.4 eval time: 10.3\n", + "at epoch 8\n", + "train info: logloss loss:0.5735978713879982\n", + "eval info: auc:0.6263, group_auc:0.6052, mean_mrr:0.2034, ndcg@10:0.279, ndcg@5:0.217\n", + "at epoch 8 , train time: 36.8 eval time: 9.2\n", + "at epoch 9\n", + "train info: logloss loss:0.5567030770083269\n", + "eval info: auc:0.62, group_auc:0.5958, mean_mrr:0.1942, ndcg@10:0.2688, ndcg@5:0.2019\n", + "at epoch 9 , train time: 39.3 eval time: 11.0\n", + "at epoch 10\n", + "train info: logloss loss:0.5417348792155584\n", + "eval info: auc:0.6198, group_auc:0.6035, mean_mrr:0.1929, ndcg@10:0.2692, ndcg@5:0.201\n", + "at epoch 10 , train time: 46.3 eval time: 13.2\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.fit(train_file, valid_file)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluate the DKN model\n", + "\n", + "Now we can check the performance on the test set:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'auc': 0.6227, 'group_auc': 0.5963, 'mean_mrr': 0.2014, 'ndcg@5': 0.2066, 'ndcg@10': 0.28}\n" + ] + } + ], + "source": [ + "res = model.run_eval(test_file)\n", + "print(res)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sb.glue(\"res\", res)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## References\n", + "\n", + "\\[1\\] Wang, Hongwei, et al. \"DKN: Deep Knowledge-Aware Network for News Recommendation.\" Proceedings of the 2018 World Wide Web Conference on World Wide Web. International World Wide Web Conferences Steering Committee, 2018.
\n", + "\\[2\\] Knowledge Graph Embeddings including TransE, TransH, TransR and PTransE. https://github.com/thunlp/KB2E
\n", + "\\[3\\] Wu, Fangzhao, et al. \"MIND: A Large-scale Dataset for News Recommendation\" Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics. https://msnews.github.io/competition.html
\n", + "\\[4\\] GloVe: Global Vectors for Word Representation. https://nlp.stanford.edu/projects/glove/" + ] + } + ], "metadata": { - "collapsed": false + "celltoolbar": "Tags", + "interpreter": { + "hash": "3a9a0c422ff9f08d62211b9648017c63b0a26d2c935edc37ebb8453675d13bb5" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + }, + "pycharm": { + "stem_cell": { + "cell_type": "raw", + "metadata": { + "collapsed": false + }, + "source": [] + } + } }, - "source": [] - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/examples/00_quick_start/fastai_movielens.ipynb b/examples/00_quick_start/fastai_movielens.ipynb index f7b00cf0ba..973906d51c 100644 --- a/examples/00_quick_start/fastai_movielens.ipynb +++ b/examples/00_quick_start/fastai_movielens.ipynb @@ -1,967 +1,967 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Copyright (c) Microsoft Corporation. All rights reserved.\n", - "\n", - "Licensed under the MIT License." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## FastAI Recommender\n", - "\n", - "This notebook shows how to use the [FastAI](https://fast.ai) recommender which is using [Pytorch](https://pytorch.org/) under the hood. " - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "System version: 3.6.11 | packaged by conda-forge | (default, Aug 5 2020, 20:09:42) \n", - "[GCC 7.5.0]\n", - "Pandas version: 0.25.3\n", - "Fast AI version: 1.0.46\n", - "Torch version: 1.4.0\n", - "Cuda Available: False\n", - "CuDNN Enabled: True\n" - ] - } - ], - "source": [ - "# set the environment path to find Recommenders\n", - "from tempfile import TemporaryDirectory\n", - "import sys\n", - "import os\n", - "import pandas as pd\n", - "import numpy as np\n", - "import scrapbook as sb\n", - "import torch, fastai\n", - "from fastai.collab import collab_learner, CollabDataBunch, load_learner\n", - "\n", - "from recommenders.utils.constants import (\n", - " DEFAULT_USER_COL as USER, \n", - " DEFAULT_ITEM_COL as ITEM, \n", - " DEFAULT_RATING_COL as RATING, \n", - " DEFAULT_TIMESTAMP_COL as TIMESTAMP, \n", - " DEFAULT_PREDICTION_COL as PREDICTION\n", - ") \n", - "from recommenders.utils.timer import Timer\n", - "from recommenders.datasets import movielens\n", - "from recommenders.datasets.python_splitters import python_stratified_split\n", - "from recommenders.models.fastai.fastai_utils import cartesian_product, score\n", - "from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k\n", - "from recommenders.evaluation.python_evaluation import rmse, mae, rsquared, exp_var\n", - "\n", - "print(\"System version: {}\".format(sys.version))\n", - "print(\"Pandas version: {}\".format(pd.__version__))\n", - "print(\"Fast AI version: {}\".format(fastai.__version__))\n", - "print(\"Torch version: {}\".format(torch.__version__))\n", - "print(\"Cuda Available: {}\".format(torch.cuda.is_available()))\n", - "print(\"CuDNN Enabled: {}\".format(torch.backends.cudnn.enabled))" - ] - }, - { - "cell_type": "markdown", - 
"metadata": {}, - "source": [ - "Defining some constants to refer to the different columns of our dataset." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "tags": [ - "parameters" - ] - }, - "outputs": [], - "source": [ - "# top k items to recommend\n", - "TOP_K = 10\n", - "\n", - "# Select MovieLens data size: 100k, 1m, 10m, or 20m\n", - "MOVIELENS_DATA_SIZE = '100k'\n", - "\n", - "# Model parameters\n", - "N_FACTORS = 40\n", - "EPOCHS = 5" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 4.81k/4.81k [00:01<00:00, 4.49kKB/s]\n" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Copyright (c) Recommenders contributors.\n", + "\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## FastAI Recommender\n", + "\n", + "This notebook shows how to use the [FastAI](https://fast.ai) recommender which is using [Pytorch](https://pytorch.org/) under the hood. " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "System version: 3.6.11 | packaged by conda-forge | (default, Aug 5 2020, 20:09:42) \n", + "[GCC 7.5.0]\n", + "Pandas version: 0.25.3\n", + "Fast AI version: 1.0.46\n", + "Torch version: 1.4.0\n", + "Cuda Available: False\n", + "CuDNN Enabled: True\n" + ] + } + ], + "source": [ + "# set the environment path to find Recommenders\n", + "from tempfile import TemporaryDirectory\n", + "import sys\n", + "import os\n", + "import pandas as pd\n", + "import numpy as np\n", + "import scrapbook as sb\n", + "import torch, fastai\n", + "from fastai.collab import collab_learner, CollabDataBunch, load_learner\n", + "\n", + "from recommenders.utils.constants import (\n", + " DEFAULT_USER_COL as USER, \n", + " DEFAULT_ITEM_COL as ITEM, \n", + " DEFAULT_RATING_COL as RATING, \n", + " DEFAULT_TIMESTAMP_COL as TIMESTAMP, \n", + " DEFAULT_PREDICTION_COL as PREDICTION\n", + ") \n", + "from recommenders.utils.timer import Timer\n", + "from recommenders.datasets import movielens\n", + "from recommenders.datasets.python_splitters import python_stratified_split\n", + "from recommenders.models.fastai.fastai_utils import cartesian_product, score\n", + "from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k\n", + "from recommenders.evaluation.python_evaluation import rmse, mae, rsquared, exp_var\n", + "\n", + "print(\"System version: {}\".format(sys.version))\n", + "print(\"Pandas version: {}\".format(pd.__version__))\n", + "print(\"Fast AI version: {}\".format(fastai.__version__))\n", + "print(\"Torch version: {}\".format(torch.__version__))\n", + "print(\"Cuda Available: {}\".format(torch.cuda.is_available()))\n", + "print(\"CuDNN Enabled: {}\".format(torch.backends.cudnn.enabled))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Defining some constants to refer to the different columns of our dataset." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "tags": [ + "parameters" + ] + }, + "outputs": [], + "source": [ + "# top k items to recommend\n", + "TOP_K = 10\n", + "\n", + "# Select MovieLens data size: 100k, 1m, 10m, or 20m\n", + "MOVIELENS_DATA_SIZE = '100k'\n", + "\n", + "# Model parameters\n", + "N_FACTORS = 40\n", + "EPOCHS = 5" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 4.81k/4.81k [00:01<00:00, 4.49kKB/s]\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
UserIdMovieIdRatingTimestamp
01962423.0881250949
11863023.0891717742
2223771.0878887116
3244512.0880606923
41663461.0886397596
\n", + "
" + ], + "text/plain": [ + " UserId MovieId Rating Timestamp\n", + "0 196 242 3.0 881250949\n", + "1 186 302 3.0 891717742\n", + "2 22 377 1.0 878887116\n", + "3 244 51 2.0 880606923\n", + "4 166 346 1.0 886397596" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ratings_df = movielens.load_pandas_df(\n", + " size=MOVIELENS_DATA_SIZE,\n", + " header=[USER,ITEM,RATING,TIMESTAMP]\n", + ")\n", + "\n", + "# make sure the IDs are loaded as strings to better prevent confusion with embedding ids\n", + "ratings_df[USER] = ratings_df[USER].astype('str')\n", + "ratings_df[ITEM] = ratings_df[ITEM].astype('str')\n", + "\n", + "ratings_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Split the dataset\n", + "train_valid_df, test_df = python_stratified_split(\n", + " ratings_df, \n", + " ratio=0.75, \n", + " min_rating=1, \n", + " filter_by=\"item\", \n", + " col_user=USER, \n", + " col_item=ITEM\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Remove \"cold\" users from test set \n", + "test_df = test_df[test_df.userID.isin(train_valid_df.userID)]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Training" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# fix random seeds to make sure our runs are reproducible\n", + "np.random.seed(101)\n", + "torch.manual_seed(101)\n", + "torch.cuda.manual_seed_all(101)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "with Timer() as preprocess_time:\n", + " data = CollabDataBunch.from_df(train_valid_df, \n", + " user_name=USER, \n", + " item_name=ITEM, \n", + " rating_name=RATING, \n", + " valid_pct=0)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
UserIdMovieIdtarget
54315553.0
909455.0
2925154.0
30310921.0
4979464.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "data.show_batch()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we will create a `collab_learner` for the data, which by default uses the [EmbeddingDotBias](https://docs.fast.ai/collab.html#EmbeddingDotBias) model. We will be using 40 latent factors. This will create an embedding for the users and the items that will map each of these to 40 floats as can be seen below. Note that the embedding parameters are not predefined, but are learned by the model.\n", + "\n", + "Although ratings can only range from 1-5, we are setting the range of possible ratings to a range from 0 to 5.5 -- that will allow the model to predict values around 1 and 5, which improves accuracy. Lastly, we set a value for weight-decay for regularization." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "EmbeddingDotBias(\n", + " (u_weight): Embedding(944, 40)\n", + " (i_weight): Embedding(1683, 40)\n", + " (u_bias): Embedding(944, 1)\n", + " (i_bias): Embedding(1683, 1)\n", + ")" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "learn = collab_learner(data, n_factors=N_FACTORS, y_range=[0,5.5], wd=1e-1)\n", + "learn.model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now train the model for 5 epochs setting the maximal learning rate. The learner will reduce the learning rate with each epoch using cosine annealing." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
epochtrain_lossvalid_losstime
10.98599300:05
20.88549600:05
30.77763700:05
40.62897100:05
50.53232800:06
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Took 29.5549 seconds for training.\n" + ] + } + ], + "source": [ + "with Timer() as train_time:\n", + " learn.fit_one_cycle(EPOCHS, max_lr=5e-3)\n", + "\n", + "print(\"Took {} seconds for training.\".format(train_time))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Save the learner so it can be loaded back later for inferencing / generating recommendations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tmp = TemporaryDirectory()\n", + "model_path = os.path.join(tmp.name, \"movielens_model.pkl\")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "learn.export(model_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Generating Recommendations\n", + "\n", + "Load the learner from disk." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "learner = load_learner(tmp.name, \"movielens_model.pkl\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get all users and items that the model knows" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "total_users, total_items = learner.data.train_ds.x.classes.values()\n", + "total_items = total_items[1:]\n", + "total_users = total_users[1:]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get all users from the test set and remove any users that were know in the training set" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "test_users = test_df[USER].unique()\n", + "test_users = np.intersect1d(test_users, total_users)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Build the cartesian product of test set users and all items known to the model" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "users_items = cartesian_product(np.array(test_users),np.array(total_items))\n", + "users_items = pd.DataFrame(users_items, columns=[USER,ITEM])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "scrolled": false + }, + "source": [ + "\n", + "Lastly, remove the user/items combinations that are in the training set -- we don't want to propose a movie that the user has already watched." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "training_removed = pd.merge(users_items, train_valid_df.astype(str), on=[USER, ITEM], how='left')\n", + "training_removed = training_removed[training_removed[RATING].isna()][[USER, ITEM]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Score the model to find the top K recommendation" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Took 1.9734 seconds for 1511060 predictions.\n" + ] + } + ], + "source": [ + "with Timer() as test_time:\n", + " top_k_scores = score(learner, \n", + " test_df=training_removed,\n", + " user_col=USER, \n", + " item_col=ITEM, \n", + " prediction_col=PREDICTION)\n", + "\n", + "print(\"Took {} seconds for {} predictions.\".format(test_time, len(training_removed)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Calculate some metrics for our model" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "eval_map = map_at_k(test_df, top_k_scores, col_user=USER, col_item=ITEM, \n", + " col_rating=RATING, col_prediction=PREDICTION, \n", + " relevancy_method=\"top_k\", k=TOP_K)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "eval_ndcg = ndcg_at_k(test_df, top_k_scores, col_user=USER, col_item=ITEM, \n", + " col_rating=RATING, col_prediction=PREDICTION, \n", + " relevancy_method=\"top_k\", k=TOP_K)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "eval_precision = precision_at_k(test_df, top_k_scores, col_user=USER, col_item=ITEM, \n", + " col_rating=RATING, col_prediction=PREDICTION, \n", + " relevancy_method=\"top_k\", k=TOP_K)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "eval_recall = recall_at_k(test_df, top_k_scores, col_user=USER, col_item=ITEM, \n", + " col_rating=RATING, col_prediction=PREDICTION, \n", + " relevancy_method=\"top_k\", k=TOP_K)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model:\tCollabLearner\n", + "Top K:\t10\n", + "MAP:\t0.026115\n", + "NDCG:\t0.155065\n", + "Precision@K:\t0.136691\n", + "Recall@K:\t0.054940\n" + ] + } + ], + "source": [ + "print(\"Model:\\t\" + learn.__class__.__name__,\n", + " \"Top K:\\t%d\" % TOP_K,\n", + " \"MAP:\\t%f\" % eval_map,\n", + " \"NDCG:\\t%f\" % eval_ndcg,\n", + " \"Precision@K:\\t%f\" % eval_precision,\n", + " \"Recall@K:\\t%f\" % eval_recall, sep='\\n')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The above numbers are lower than [SAR](../sar_single_node_movielens.ipynb), but expected, since the model is explicitly trying to generalize the users and items to the latent factors. Next look at how well the model predicts how the user would rate the movie. Need to score `test_df` user-items only. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "scores = score(learner, \n", + " test_df=test_df.copy(), \n", + " user_col=USER, \n", + " item_col=ITEM, \n", + " prediction_col=PREDICTION)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now calculate some regression metrics" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model:\tCollabLearner\n", + "RMSE:\t0.902379\n", + "MAE:\t0.712163\n", + "Explained variance:\t0.346523\n", + "R squared:\t0.345672\n" + ] + } + ], + "source": [ + "eval_r2 = rsquared(test_df, scores, col_user=USER, col_item=ITEM, col_rating=RATING, col_prediction=PREDICTION)\n", + "eval_rmse = rmse(test_df, scores, col_user=USER, col_item=ITEM, col_rating=RATING, col_prediction=PREDICTION)\n", + "eval_mae = mae(test_df, scores, col_user=USER, col_item=ITEM, col_rating=RATING, col_prediction=PREDICTION)\n", + "eval_exp_var = exp_var(test_df, scores, col_user=USER, col_item=ITEM, col_rating=RATING, col_prediction=PREDICTION)\n", + "\n", + "print(\"Model:\\t\" + learn.__class__.__name__,\n", + " \"RMSE:\\t%f\" % eval_rmse,\n", + " \"MAE:\\t%f\" % eval_mae,\n", + " \"Explained variance:\\t%f\" % eval_exp_var,\n", + " \"R squared:\\t%f\" % eval_r2, sep='\\n')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That RMSE is actually quite good when compared to these benchmarks: https://www.librec.net/release/v1.3/example.html" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "application/scrapbook.scrap.json+json": { + "data": 0.02611475567509659, + "encoder": "json", + "name": "map", + "version": 1 + } + }, + "metadata": { + "scrapbook": { + "data": true, + "display": false, + "name": "map" + } + }, + "output_type": "display_data" + }, + { + "data": { + "application/scrapbook.scrap.json+json": { + "data": 0.15506533130248687, + "encoder": "json", + "name": "ndcg", + "version": 1 + } + }, + "metadata": { + "scrapbook": { + "data": true, + "display": false, + "name": "ndcg" + } + }, + "output_type": "display_data" + }, + { + "data": { + "application/scrapbook.scrap.json+json": { + "data": 0.13669141039236482, + "encoder": "json", + "name": "precision", + "version": 1 + } + }, + "metadata": { + "scrapbook": { + "data": true, + "display": false, + "name": "precision" + } + }, + "output_type": "display_data" + }, + { + "data": { + "application/scrapbook.scrap.json+json": { + "data": 0.05493986799753499, + "encoder": "json", + "name": "recall", + "version": 1 + } + }, + "metadata": { + "scrapbook": { + "data": true, + "display": false, + "name": "recall" + } + }, + "output_type": "display_data" + }, + { + "data": { + "application/scrapbook.scrap.json+json": { + "data": 0.9023793356156464, + "encoder": "json", + "name": "rmse", + "version": 1 + } + }, + "metadata": { + "scrapbook": { + "data": true, + "display": false, + "name": "rmse" + } + }, + "output_type": "display_data" + }, + { + "data": { + "application/scrapbook.scrap.json+json": { + "data": 0.7121634655740025, + "encoder": "json", + "name": "mae", + "version": 1 + } + }, + "metadata": { + "scrapbook": { + "data": true, + "display": false, + "name": "mae" + } + }, + "output_type": "display_data" + }, + { + "data": { + "application/scrapbook.scrap.json+json": { + "data": 0.34652281723228295, + "encoder": "json", + "name": "exp_var", + 
"version": 1 + } + }, + "metadata": { + "scrapbook": { + "data": true, + "display": false, + "name": "exp_var" + } + }, + "output_type": "display_data" + }, + { + "data": { + "application/scrapbook.scrap.json+json": { + "data": 0.3456716162958503, + "encoder": "json", + "name": "rsquared", + "version": 1 + } + }, + "metadata": { + "scrapbook": { + "data": true, + "display": false, + "name": "rsquared" + } + }, + "output_type": "display_data" + }, + { + "data": { + "application/scrapbook.scrap.json+json": { + "data": 29.554921820759773, + "encoder": "json", + "name": "train_time", + "version": 1 + } + }, + "metadata": { + "scrapbook": { + "data": true, + "display": false, + "name": "train_time" + } + }, + "output_type": "display_data" + }, + { + "data": { + "application/scrapbook.scrap.json+json": { + "data": 1.973397959023714, + "encoder": "json", + "name": "test_time", + "version": 1 + } + }, + "metadata": { + "scrapbook": { + "data": true, + "display": false, + "name": "test_time" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "# Record results with papermill for tests\n", + "sb.glue(\"map\", eval_map)\n", + "sb.glue(\"ndcg\", eval_ndcg)\n", + "sb.glue(\"precision\", eval_precision)\n", + "sb.glue(\"recall\", eval_recall)\n", + "sb.glue(\"rmse\", eval_rmse)\n", + "sb.glue(\"mae\", eval_mae)\n", + "sb.glue(\"exp_var\", eval_exp_var)\n", + "sb.glue(\"rsquared\", eval_r2)\n", + "sb.glue(\"train_time\", train_time.interval)\n", + "sb.glue(\"test_time\", test_time.interval)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tmp.cleanup()" + ] + }, + { + "cell_type": "raw", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "celltoolbar": "Tags", + "kernelspec": { + "display_name": "Python (reco_gpu)", + "language": "python", + "name": "reco_gpu" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.11" + } }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
UserIdMovieIdRatingTimestamp
01962423.0881250949
11863023.0891717742
2223771.0878887116
3244512.0880606923
41663461.0886397596
\n", - "
" - ], - "text/plain": [ - " UserId MovieId Rating Timestamp\n", - "0 196 242 3.0 881250949\n", - "1 186 302 3.0 891717742\n", - "2 22 377 1.0 878887116\n", - "3 244 51 2.0 880606923\n", - "4 166 346 1.0 886397596" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ratings_df = movielens.load_pandas_df(\n", - " size=MOVIELENS_DATA_SIZE,\n", - " header=[USER,ITEM,RATING,TIMESTAMP]\n", - ")\n", - "\n", - "# make sure the IDs are loaded as strings to better prevent confusion with embedding ids\n", - "ratings_df[USER] = ratings_df[USER].astype('str')\n", - "ratings_df[ITEM] = ratings_df[ITEM].astype('str')\n", - "\n", - "ratings_df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "# Split the dataset\n", - "train_valid_df, test_df = python_stratified_split(\n", - " ratings_df, \n", - " ratio=0.75, \n", - " min_rating=1, \n", - " filter_by=\"item\", \n", - " col_user=USER, \n", - " col_item=ITEM\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Remove \"cold\" users from test set \n", - "test_df = test_df[test_df.userID.isin(train_valid_df.userID)]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Training" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "# fix random seeds to make sure our runs are reproducible\n", - "np.random.seed(101)\n", - "torch.manual_seed(101)\n", - "torch.cuda.manual_seed_all(101)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "with Timer() as preprocess_time:\n", - " data = CollabDataBunch.from_df(train_valid_df, \n", - " user_name=USER, \n", - " item_name=ITEM, \n", - " rating_name=RATING, \n", - " valid_pct=0)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
UserIdMovieIdtarget
54315553.0
909455.0
2925154.0
30310921.0
4979464.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "data.show_batch()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we will create a `collab_learner` for the data, which by default uses the [EmbeddingDotBias](https://docs.fast.ai/collab.html#EmbeddingDotBias) model. We will be using 40 latent factors. This will create an embedding for the users and the items that will map each of these to 40 floats as can be seen below. Note that the embedding parameters are not predefined, but are learned by the model.\n", - "\n", - "Although ratings can only range from 1-5, we are setting the range of possible ratings to a range from 0 to 5.5 -- that will allow the model to predict values around 1 and 5, which improves accuracy. Lastly, we set a value for weight-decay for regularization." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "EmbeddingDotBias(\n", - " (u_weight): Embedding(944, 40)\n", - " (i_weight): Embedding(1683, 40)\n", - " (u_bias): Embedding(944, 1)\n", - " (i_bias): Embedding(1683, 1)\n", - ")" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "learn = collab_learner(data, n_factors=N_FACTORS, y_range=[0,5.5], wd=1e-1)\n", - "learn.model" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now train the model for 5 epochs setting the maximal learning rate. The learner will reduce the learning rate with each epoch using cosine annealing." - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
epochtrain_lossvalid_losstime
10.98599300:05
20.88549600:05
30.77763700:05
40.62897100:05
50.53232800:06
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Took 29.5549 seconds for training.\n" - ] - } - ], - "source": [ - "with Timer() as train_time:\n", - " learn.fit_one_cycle(EPOCHS, max_lr=5e-3)\n", - "\n", - "print(\"Took {} seconds for training.\".format(train_time))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Save the learner so it can be loaded back later for inferencing / generating recommendations" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tmp = TemporaryDirectory()\n", - "model_path = os.path.join(tmp.name, \"movielens_model.pkl\")" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "learn.export(model_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Generating Recommendations\n", - "\n", - "Load the learner from disk." - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "learner = load_learner(tmp.name, \"movielens_model.pkl\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Get all users and items that the model knows" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "total_users, total_items = learner.data.train_ds.x.classes.values()\n", - "total_items = total_items[1:]\n", - "total_users = total_users[1:]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Get all users from the test set and remove any users that were know in the training set" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "test_users = test_df[USER].unique()\n", - "test_users = np.intersect1d(test_users, total_users)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Build the cartesian product of test set users and all items known to the model" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "users_items = cartesian_product(np.array(test_users),np.array(total_items))\n", - "users_items = pd.DataFrame(users_items, columns=[USER,ITEM])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "scrolled": false - }, - "source": [ - "\n", - "Lastly, remove the user/items combinations that are in the training set -- we don't want to propose a movie that the user has already watched." 
- ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "scrolled": false - }, - "outputs": [], - "source": [ - "training_removed = pd.merge(users_items, train_valid_df.astype(str), on=[USER, ITEM], how='left')\n", - "training_removed = training_removed[training_removed[RATING].isna()][[USER, ITEM]]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Score the model to find the top K recommendation" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Took 1.9734 seconds for 1511060 predictions.\n" - ] - } - ], - "source": [ - "with Timer() as test_time:\n", - " top_k_scores = score(learner, \n", - " test_df=training_removed,\n", - " user_col=USER, \n", - " item_col=ITEM, \n", - " prediction_col=PREDICTION)\n", - "\n", - "print(\"Took {} seconds for {} predictions.\".format(test_time, len(training_removed)))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Calculate some metrics for our model" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [], - "source": [ - "eval_map = map_at_k(test_df, top_k_scores, col_user=USER, col_item=ITEM, \n", - " col_rating=RATING, col_prediction=PREDICTION, \n", - " relevancy_method=\"top_k\", k=TOP_K)" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [], - "source": [ - "eval_ndcg = ndcg_at_k(test_df, top_k_scores, col_user=USER, col_item=ITEM, \n", - " col_rating=RATING, col_prediction=PREDICTION, \n", - " relevancy_method=\"top_k\", k=TOP_K)" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [], - "source": [ - "eval_precision = precision_at_k(test_df, top_k_scores, col_user=USER, col_item=ITEM, \n", - " col_rating=RATING, col_prediction=PREDICTION, \n", - " relevancy_method=\"top_k\", k=TOP_K)" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [], - "source": [ - "eval_recall = recall_at_k(test_df, top_k_scores, col_user=USER, col_item=ITEM, \n", - " col_rating=RATING, col_prediction=PREDICTION, \n", - " relevancy_method=\"top_k\", k=TOP_K)" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Model:\tCollabLearner\n", - "Top K:\t10\n", - "MAP:\t0.026115\n", - "NDCG:\t0.155065\n", - "Precision@K:\t0.136691\n", - "Recall@K:\t0.054940\n" - ] - } - ], - "source": [ - "print(\"Model:\\t\" + learn.__class__.__name__,\n", - " \"Top K:\\t%d\" % TOP_K,\n", - " \"MAP:\\t%f\" % eval_map,\n", - " \"NDCG:\\t%f\" % eval_ndcg,\n", - " \"Precision@K:\\t%f\" % eval_precision,\n", - " \"Recall@K:\\t%f\" % eval_recall, sep='\\n')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The above numbers are lower than [SAR](../sar_single_node_movielens.ipynb), but expected, since the model is explicitly trying to generalize the users and items to the latent factors. Next look at how well the model predicts how the user would rate the movie. Need to score `test_df` user-items only. 
" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [], - "source": [ - "scores = score(learner, \n", - " test_df=test_df.copy(), \n", - " user_col=USER, \n", - " item_col=ITEM, \n", - " prediction_col=PREDICTION)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now calculate some regression metrics" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Model:\tCollabLearner\n", - "RMSE:\t0.902379\n", - "MAE:\t0.712163\n", - "Explained variance:\t0.346523\n", - "R squared:\t0.345672\n" - ] - } - ], - "source": [ - "eval_r2 = rsquared(test_df, scores, col_user=USER, col_item=ITEM, col_rating=RATING, col_prediction=PREDICTION)\n", - "eval_rmse = rmse(test_df, scores, col_user=USER, col_item=ITEM, col_rating=RATING, col_prediction=PREDICTION)\n", - "eval_mae = mae(test_df, scores, col_user=USER, col_item=ITEM, col_rating=RATING, col_prediction=PREDICTION)\n", - "eval_exp_var = exp_var(test_df, scores, col_user=USER, col_item=ITEM, col_rating=RATING, col_prediction=PREDICTION)\n", - "\n", - "print(\"Model:\\t\" + learn.__class__.__name__,\n", - " \"RMSE:\\t%f\" % eval_rmse,\n", - " \"MAE:\\t%f\" % eval_mae,\n", - " \"Explained variance:\\t%f\" % eval_exp_var,\n", - " \"R squared:\\t%f\" % eval_r2, sep='\\n')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "That RMSE is actually quite good when compared to these benchmarks: https://www.librec.net/release/v1.3/example.html" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "data": { - "application/scrapbook.scrap.json+json": { - "data": 0.02611475567509659, - "encoder": "json", - "name": "map", - "version": 1 - } - }, - "metadata": { - "scrapbook": { - "data": true, - "display": false, - "name": "map" - } - }, - "output_type": "display_data" - }, - { - "data": { - "application/scrapbook.scrap.json+json": { - "data": 0.15506533130248687, - "encoder": "json", - "name": "ndcg", - "version": 1 - } - }, - "metadata": { - "scrapbook": { - "data": true, - "display": false, - "name": "ndcg" - } - }, - "output_type": "display_data" - }, - { - "data": { - "application/scrapbook.scrap.json+json": { - "data": 0.13669141039236482, - "encoder": "json", - "name": "precision", - "version": 1 - } - }, - "metadata": { - "scrapbook": { - "data": true, - "display": false, - "name": "precision" - } - }, - "output_type": "display_data" - }, - { - "data": { - "application/scrapbook.scrap.json+json": { - "data": 0.05493986799753499, - "encoder": "json", - "name": "recall", - "version": 1 - } - }, - "metadata": { - "scrapbook": { - "data": true, - "display": false, - "name": "recall" - } - }, - "output_type": "display_data" - }, - { - "data": { - "application/scrapbook.scrap.json+json": { - "data": 0.9023793356156464, - "encoder": "json", - "name": "rmse", - "version": 1 - } - }, - "metadata": { - "scrapbook": { - "data": true, - "display": false, - "name": "rmse" - } - }, - "output_type": "display_data" - }, - { - "data": { - "application/scrapbook.scrap.json+json": { - "data": 0.7121634655740025, - "encoder": "json", - "name": "mae", - "version": 1 - } - }, - "metadata": { - "scrapbook": { - "data": true, - "display": false, - "name": "mae" - } - }, - "output_type": "display_data" - }, - { - "data": { - "application/scrapbook.scrap.json+json": { - "data": 0.34652281723228295, - "encoder": "json", - "name": "exp_var", - 
"version": 1 - } - }, - "metadata": { - "scrapbook": { - "data": true, - "display": false, - "name": "exp_var" - } - }, - "output_type": "display_data" - }, - { - "data": { - "application/scrapbook.scrap.json+json": { - "data": 0.3456716162958503, - "encoder": "json", - "name": "rsquared", - "version": 1 - } - }, - "metadata": { - "scrapbook": { - "data": true, - "display": false, - "name": "rsquared" - } - }, - "output_type": "display_data" - }, - { - "data": { - "application/scrapbook.scrap.json+json": { - "data": 29.554921820759773, - "encoder": "json", - "name": "train_time", - "version": 1 - } - }, - "metadata": { - "scrapbook": { - "data": true, - "display": false, - "name": "train_time" - } - }, - "output_type": "display_data" - }, - { - "data": { - "application/scrapbook.scrap.json+json": { - "data": 1.973397959023714, - "encoder": "json", - "name": "test_time", - "version": 1 - } - }, - "metadata": { - "scrapbook": { - "data": true, - "display": false, - "name": "test_time" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "# Record results with papermill for tests\n", - "sb.glue(\"map\", eval_map)\n", - "sb.glue(\"ndcg\", eval_ndcg)\n", - "sb.glue(\"precision\", eval_precision)\n", - "sb.glue(\"recall\", eval_recall)\n", - "sb.glue(\"rmse\", eval_rmse)\n", - "sb.glue(\"mae\", eval_mae)\n", - "sb.glue(\"exp_var\", eval_exp_var)\n", - "sb.glue(\"rsquared\", eval_r2)\n", - "sb.glue(\"train_time\", train_time.interval)\n", - "sb.glue(\"test_time\", test_time.interval)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tmp.cleanup()" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [] - } - ], - "metadata": { - "celltoolbar": "Tags", - "kernelspec": { - "display_name": "Python (reco_gpu)", - "language": "python", - "name": "reco_gpu" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.11" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/examples/00_quick_start/lightgbm_tinycriteo.ipynb b/examples/00_quick_start/lightgbm_tinycriteo.ipynb index 9e2a897d01..a62e7538ac 100644 --- a/examples/00_quick_start/lightgbm_tinycriteo.ipynb +++ b/examples/00_quick_start/lightgbm_tinycriteo.ipynb @@ -1,982 +1,982 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Copyright (c) Microsoft Corporation. All rights reserved.\n", - "\n", - "Licensed under the MIT License." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# LightGBM: A Highly Efficient Gradient Boosting Decision Tree\n", - "This notebook will give you an example of how to train a LightGBM model to estimate click-through rates on an e-commerce advertisement. We will train a LightGBM based model on the Criteo dataset.\n", - "\n", - "[LightGBM](https://github.com/Microsoft/LightGBM) is a gradient boosting framework that uses tree-based learning algorithms. It is designed to be distributed and efficient with the following advantages:\n", - "* Fast training speed and high efficiency.\n", - "* Low memory usage.\n", - "* Great accuracy.\n", - "* Support of parallel and GPU learning.\n", - "* Capable of handling large-scale data." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Global Settings and Imports" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "System version: 3.9.16 (main, May 15 2023, 23:46:34) \n", - "[GCC 11.2.0]\n", - "LightGBM version: 3.3.5\n" - ] - } - ], - "source": [ - "import os\n", - "import sys\n", - "import numpy as np\n", - "import lightgbm as lgb\n", - "import scrapbook as sb\n", - "import pandas as pd\n", - "import category_encoders as ce\n", - "from tempfile import TemporaryDirectory\n", - "from sklearn.metrics import roc_auc_score, log_loss\n", - "\n", - "import recommenders.datasets.criteo as criteo\n", - "import recommenders.models.lightgbm.lightgbm_utils as lgb_utils\n", - "\n", - "print(\"System version: {}\".format(sys.version))\n", - "print(\"LightGBM version: {}\".format(lgb.__version__))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Parameter Setting\n", - "Let's set the main related parameters for LightGBM now. Basically, the task is a binary classification (predicting click or no click), so the objective function is set to binary logloss, and 'AUC' metric, is used as a metric which is less effected by imbalance in the classes of the dataset.\n", - "\n", - "Generally, we can adjust the number of leaves (MAX_LEAF), the minimum number of data in each leaf (MIN_DATA), maximum number of trees (NUM_OF_TREES), the learning rate of trees (TREE_LEARNING_RATE) and EARLY_STOPPING_ROUNDS (to avoid overfitting) in the model to get better performance.\n", - "\n", - "Besides, we can also adjust some other listed parameters to optimize the results. [In this link](https://github.com/Microsoft/LightGBM/blob/master/docs/Parameters.rst), a list of all the parameters is shown. Also, some advice on how to tune these parameters can be found [in this url](https://github.com/Microsoft/LightGBM/blob/master/docs/Parameters-Tuning.rst). " - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "tags": [ - "parameters" - ] - }, - "outputs": [], - "source": [ - "MAX_LEAF = 64\n", - "MIN_DATA = 20\n", - "NUM_OF_TREES = 100\n", - "TREE_LEARNING_RATE = 0.15\n", - "EARLY_STOPPING_ROUNDS = 20\n", - "METRIC = \"auc\"\n", - "SIZE = \"sample\"" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "params = {\n", - " \"task\": \"train\",\n", - " \"boosting_type\": \"gbdt\",\n", - " \"num_class\": 1,\n", - " \"objective\": \"binary\",\n", - " \"metric\": METRIC,\n", - " \"num_leaves\": MAX_LEAF,\n", - " \"min_data\": MIN_DATA,\n", - " \"boost_from_average\": True,\n", - " # set it according to your cpu cores.\n", - " \"num_threads\": 20,\n", - " \"feature_fraction\": 0.8,\n", - " \"learning_rate\": TREE_LEARNING_RATE,\n", - "}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Data Preparation\n", - "Here we use CSV format as the example data input. Our example data is a sample (about 100 thousand samples) from [Criteo dataset](https://www.kaggle.com/c/criteo-display-ad-challenge). The Criteo dataset is a well-known industry benchmarking dataset for developing CTR prediction models, and it's frequently adopted as evaluation dataset by research papers. 
The original dataset is too large for a lightweight demo, so we sample a small portion from it as a demo dataset.\n", - "\n", - "Specifically, there are 39 columns of features in Criteo, where 13 columns are numerical features (I1-I13) and the other 26 columns are categorical features (C1-C26)." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|█████████████████████████████████████████████████████████████████████████████| 8.58k/8.58k [00:07<00:00, 1.15kKB/s]\n" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Copyright (c) Recommenders contributors.\n", + "\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# LightGBM: A Highly Efficient Gradient Boosting Decision Tree\n", + "This notebook will give you an example of how to train a LightGBM model to estimate click-through rates on an e-commerce advertisement. We will train a LightGBM based model on the Criteo dataset.\n", + "\n", + "[LightGBM](https://github.com/Microsoft/LightGBM) is a gradient boosting framework that uses tree-based learning algorithms. It is designed to be distributed and efficient with the following advantages:\n", + "* Fast training speed and high efficiency.\n", + "* Low memory usage.\n", + "* Great accuracy.\n", + "* Support of parallel and GPU learning.\n", + "* Capable of handling large-scale data." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Global Settings and Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "System version: 3.9.16 (main, May 15 2023, 23:46:34) \n", + "[GCC 11.2.0]\n", + "LightGBM version: 3.3.5\n" + ] + } + ], + "source": [ + "import os\n", + "import sys\n", + "import numpy as np\n", + "import lightgbm as lgb\n", + "import scrapbook as sb\n", + "import pandas as pd\n", + "import category_encoders as ce\n", + "from tempfile import TemporaryDirectory\n", + "from sklearn.metrics import roc_auc_score, log_loss\n", + "\n", + "import recommenders.datasets.criteo as criteo\n", + "import recommenders.models.lightgbm.lightgbm_utils as lgb_utils\n", + "\n", + "print(\"System version: {}\".format(sys.version))\n", + "print(\"LightGBM version: {}\".format(lgb.__version__))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Parameter Setting\n", + "Let's set the main related parameters for LightGBM now. Basically, the task is a binary classification (predicting click or no click), so the objective function is set to binary logloss, and 'AUC' metric, is used as a metric which is less effected by imbalance in the classes of the dataset.\n", + "\n", + "Generally, we can adjust the number of leaves (MAX_LEAF), the minimum number of data in each leaf (MIN_DATA), maximum number of trees (NUM_OF_TREES), the learning rate of trees (TREE_LEARNING_RATE) and EARLY_STOPPING_ROUNDS (to avoid overfitting) in the model to get better performance.\n", + "\n", + "Besides, we can also adjust some other listed parameters to optimize the results. [In this link](https://github.com/Microsoft/LightGBM/blob/master/docs/Parameters.rst), a list of all the parameters is shown. Also, some advice on how to tune these parameters can be found [in this url](https://github.com/Microsoft/LightGBM/blob/master/docs/Parameters-Tuning.rst). 
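As an optional illustration of such tuning (not part of the original notebook), one could compare a few candidate values with LightGBM's built-in cross-validation. The sketch below assumes the `params` dictionary defined in the next cell and an `lgb.Dataset` named `lgb_train` built as in the Basic Usage section further down:

```python
# Hypothetical tuning sketch: compare a few num_leaves candidates with lgb.cv.
for candidate_leaves in (32, 64, 128):
    cv_params = dict(params, num_leaves=candidate_leaves)
    cv_results = lgb.cv(
        cv_params,
        lgb_train,
        num_boost_round=NUM_OF_TREES,
        nfold=3,
    )
    # lgb.cv returns a dict of per-iteration metric lists, e.g. "auc-mean"
    # (the exact key name can vary slightly across LightGBM versions).
    best_by_metric = {k: max(v) for k, v in cv_results.items() if k.endswith("-mean")}
    print(candidate_leaves, best_by_metric)
```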
" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "tags": [ + "parameters" + ] + }, + "outputs": [], + "source": [ + "MAX_LEAF = 64\n", + "MIN_DATA = 20\n", + "NUM_OF_TREES = 100\n", + "TREE_LEARNING_RATE = 0.15\n", + "EARLY_STOPPING_ROUNDS = 20\n", + "METRIC = \"auc\"\n", + "SIZE = \"sample\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "params = {\n", + " \"task\": \"train\",\n", + " \"boosting_type\": \"gbdt\",\n", + " \"num_class\": 1,\n", + " \"objective\": \"binary\",\n", + " \"metric\": METRIC,\n", + " \"num_leaves\": MAX_LEAF,\n", + " \"min_data\": MIN_DATA,\n", + " \"boost_from_average\": True,\n", + " # set it according to your cpu cores.\n", + " \"num_threads\": 20,\n", + " \"feature_fraction\": 0.8,\n", + " \"learning_rate\": TREE_LEARNING_RATE,\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Preparation\n", + "Here we use CSV format as the example data input. Our example data is a sample (about 100 thousand samples) from [Criteo dataset](https://www.kaggle.com/c/criteo-display-ad-challenge). The Criteo dataset is a well-known industry benchmarking dataset for developing CTR prediction models, and it's frequently adopted as evaluation dataset by research papers. The original dataset is too large for a lightweight demo, so we sample a small portion from it as a demo dataset.\n", + "\n", + "Specifically, there are 39 columns of features in Criteo, where 13 columns are numerical features (I1-I13) and the other 26 columns are categorical features (C1-C26)." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|█████████████████████████████████████████████████████████████████████████████| 8.58k/8.58k [00:07<00:00, 1.15kKB/s]\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
LabelI1I2I3I4I5I6I7I8I9...C17C18C19C20C21C22C23C24C25C26
001.015.00.01382.04.015.02.0181.0...e5ba7672f54016b921ddcdc9b1252a9d07b5194cNaN3a171ecbc5c50484e8b834079727dd16
102.0044.01.0102.08.02.02.04.0...07c540c4b04e467021ddcdc95840adea60f6221eNaN3a171ecb43f13e8be8b83407731c3655
202.001.014.0767.089.04.02.0245.0...8efede7f3412118dNaNNaNe587c466ad3062eb3a171ecb3b183c5cNaNNaN
30NaN893NaNNaN4392.0NaN0.00.00.0...1e88c74f74ef3502NaNNaN6b3a5ca6NaN3a171ecb9117a34aNaNNaN
403.0-1NaN0.02.00.03.00.00.0...1e88c74f26b3c7a7NaNNaN21c9516aNaN32c7478eb34f3128NaNNaN
\n", + "

5 rows × 40 columns

\n", + "
" + ], + "text/plain": [ + " Label I1 I2 I3 I4 I5 I6 I7 I8 I9 ... C17 \\\n", + "0 0 1.0 1 5.0 0.0 1382.0 4.0 15.0 2.0 181.0 ... e5ba7672 \n", + "1 0 2.0 0 44.0 1.0 102.0 8.0 2.0 2.0 4.0 ... 07c540c4 \n", + "2 0 2.0 0 1.0 14.0 767.0 89.0 4.0 2.0 245.0 ... 8efede7f \n", + "3 0 NaN 893 NaN NaN 4392.0 NaN 0.0 0.0 0.0 ... 1e88c74f \n", + "4 0 3.0 -1 NaN 0.0 2.0 0.0 3.0 0.0 0.0 ... 1e88c74f \n", + "\n", + " C18 C19 C20 C21 C22 C23 C24 \\\n", + "0 f54016b9 21ddcdc9 b1252a9d 07b5194c NaN 3a171ecb c5c50484 \n", + "1 b04e4670 21ddcdc9 5840adea 60f6221e NaN 3a171ecb 43f13e8b \n", + "2 3412118d NaN NaN e587c466 ad3062eb 3a171ecb 3b183c5c \n", + "3 74ef3502 NaN NaN 6b3a5ca6 NaN 3a171ecb 9117a34a \n", + "4 26b3c7a7 NaN NaN 21c9516a NaN 32c7478e b34f3128 \n", + "\n", + " C25 C26 \n", + "0 e8b83407 9727dd16 \n", + "1 e8b83407 731c3655 \n", + "2 NaN NaN \n", + "3 NaN NaN \n", + "4 NaN NaN \n", + "\n", + "[5 rows x 40 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "nume_cols = [\"I\" + str(i) for i in range(1, 14)]\n", + "cate_cols = [\"C\" + str(i) for i in range(1, 27)]\n", + "label_col = \"Label\"\n", + "\n", + "header = [label_col] + nume_cols + cate_cols\n", + "with TemporaryDirectory() as tmp:\n", + " all_data = criteo.load_pandas_df(size=SIZE, local_cache_path=tmp, header=header)\n", + "display(all_data.head())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, we cut three sets (train_data (first 80%), valid_data (middle 10%) and test_data (last 10%)), cut from the original all data.
\n", + "Notably, considering the Criteo is a kind of time-series streaming data, which is also very common in recommendation scenario, we split the data by its order." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# split data to 3 sets \n", + "length = len(all_data)\n", + "train_data = all_data.loc[:0.8*length-1]\n", + "valid_data = all_data.loc[0.8*length:0.9*length-1]\n", + "test_data = all_data.loc[0.9*length:]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Basic Usage\n", + "### Ordinal Encoding\n", + "Considering LightGBM could handle the low-frequency features and missing value by itself, for basic usage, we only encode the string-like categorical features by an ordinal encoder." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train Data Shape: X: (80000, 39); Y: (80000,).\n", + "Valid Data Shape: X: (10000, 39); Y: (10000,).\n", + "Test Data Shape: X: (10000, 39); Y: (10000,).\n", + "\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
I1I2I3I4I5I6I7I8I9I10...C17C18C19C20C21C22C23C24C25C26
01.015.00.01382.04.015.02.0181.01.0...1111111111
12.0044.01.0102.08.02.02.04.01.0...2212211212
22.001.014.0767.089.04.02.0245.01.0...3323321323
3NaN893NaNNaN4392.0NaN0.00.00.0NaN...4423411423
43.0-1NaN0.02.00.03.00.00.01.0...4523512523
\n", + "

5 rows × 39 columns

\n", + "
" + ], + "text/plain": [ + " I1 I2 I3 I4 I5 I6 I7 I8 I9 I10 ... C17 C18 \\\n", + "0 1.0 1 5.0 0.0 1382.0 4.0 15.0 2.0 181.0 1.0 ... 1 1 \n", + "1 2.0 0 44.0 1.0 102.0 8.0 2.0 2.0 4.0 1.0 ... 2 2 \n", + "2 2.0 0 1.0 14.0 767.0 89.0 4.0 2.0 245.0 1.0 ... 3 3 \n", + "3 NaN 893 NaN NaN 4392.0 NaN 0.0 0.0 0.0 NaN ... 4 4 \n", + "4 3.0 -1 NaN 0.0 2.0 0.0 3.0 0.0 0.0 1.0 ... 4 5 \n", + "\n", + " C19 C20 C21 C22 C23 C24 C25 C26 \n", + "0 1 1 1 1 1 1 1 1 \n", + "1 1 2 2 1 1 2 1 2 \n", + "2 2 3 3 2 1 3 2 3 \n", + "3 2 3 4 1 1 4 2 3 \n", + "4 2 3 5 1 2 5 2 3 \n", + "\n", + "[5 rows x 39 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ord_encoder = ce.ordinal.OrdinalEncoder(cols=cate_cols)\n", + "\n", + "def encode_csv(df, encoder, label_col, typ=\"fit\"):\n", + " if typ == \"fit\":\n", + " df = encoder.fit_transform(df)\n", + " else:\n", + " df = encoder.transform(df)\n", + " y = df[label_col].values\n", + " del df[label_col]\n", + " return df, y\n", + "\n", + "train_x, train_y = encode_csv(train_data, ord_encoder, label_col)\n", + "valid_x, valid_y = encode_csv(valid_data, ord_encoder, label_col, \"transform\")\n", + "test_x, test_y = encode_csv(test_data, ord_encoder, label_col, \"transform\")\n", + "\n", + "print(\"Train Data Shape: X: {trn_x_shape}; Y: {trn_y_shape}.\\nValid Data Shape: X: {vld_x_shape}; Y: {vld_y_shape}.\\nTest Data Shape: X: {tst_x_shape}; Y: {tst_y_shape}.\\n\"\n", + " .format(trn_x_shape=train_x.shape,\n", + " trn_y_shape=train_y.shape,\n", + " vld_x_shape=valid_x.shape,\n", + " vld_y_shape=valid_y.shape,\n", + " tst_x_shape=test_x.shape,\n", + " tst_y_shape=test_y.shape,))\n", + "train_x.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create model\n", + "When both hyper-parameters and data are ready, we can create a model:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[LightGBM] [Info] Number of positive: 17958, number of negative: 62042\n", + "[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.023124 seconds.\n", + "You can set `force_col_wise=true` to remove the overhead.\n", + "[LightGBM] [Info] Total Bins 38971\n", + "[LightGBM] [Info] Number of data points in the train set: 80000, number of used features: 39\n", + "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.224475 -> initscore=-1.239776\n", + "[LightGBM] [Info] Start training from score -1.239776\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/miguel/anaconda/envs/recommenders/lib/python3.9/site-packages/lightgbm/basic.py:1780: UserWarning: Overriding the parameters from Reference Dataset.\n", + " _log_warning('Overriding the parameters from Reference Dataset.')\n", + "/home/miguel/anaconda/envs/recommenders/lib/python3.9/site-packages/lightgbm/basic.py:1513: UserWarning: categorical_column in param dict is overridden.\n", + " _log_warning(f'{cat_alias} in param dict is overridden.')\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training until validation scores don't improve for 20 rounds\n", + "Early stopping, best iteration is:\n", + "[18]\tvalid_0's auc: 0.759658\n" + ] + } + ], + "source": [ + "lgb_train = lgb.Dataset(train_x, train_y.reshape(-1), params=params, categorical_feature=cate_cols)\n", + "lgb_valid = lgb.Dataset(valid_x, valid_y.reshape(-1), reference=lgb_train, 
categorical_feature=cate_cols)\n", + "lgb_test = lgb.Dataset(test_x, test_y.reshape(-1), reference=lgb_train, categorical_feature=cate_cols)\n", + "lgb_model = lgb.train(params,\n", + " lgb_train,\n", + " num_boost_round=NUM_OF_TREES,\n", + " valid_sets=lgb_valid,\n", + " categorical_feature=cate_cols,\n", + " callbacks=[lgb.early_stopping(EARLY_STOPPING_ROUNDS)])\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's see what is the model's performance:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'auc': 0.7655408801711783, 'logloss': 0.46825831788359984}\n" + ] + }, + { + "data": { + "application/scrapbook.scrap.json+json": { + "data": { + "auc": 0.7655408801711783, + "logloss": 0.46825831788359984 + }, + "encoder": "json", + "name": "res_basic", + "version": 1 + } + }, + "metadata": { + "scrapbook": { + "data": true, + "display": false, + "name": "res_basic" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "test_preds = lgb_model.predict(test_x)\n", + "auc = roc_auc_score(np.asarray(test_y.reshape(-1)), np.asarray(test_preds))\n", + "logloss = log_loss(np.asarray(test_y.reshape(-1)), np.asarray(test_preds), eps=1e-12)\n", + "res_basic = {\"auc\": auc, \"logloss\": logloss}\n", + "print(res_basic)\n", + "sb.glue(\"res_basic\", res_basic)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "## Optimized Usage\n", + "### Label-encoding and Binary-encoding\n", + "Next, since LightGBM has a better capability in handling dense numerical features effectively, we try to convert all the categorical features in original data into numerical ones, by label-encoding [3] and binary-encoding [4]. Also due to the sequence property of Criteo, the label-encoding we adopted is executed one-by-one, which means we encode the samples in order, by the information of the previous samples before each sample (sequential label-encoding and sequential count-encoding). Besides, we also filter the low-frequency categorical features and fill the missing values by the mean of corresponding columns for the numerical features. (consulting `lgb_utils.NumEncoder`)\n", + "\n", + "Specifically, in `lgb_utils.NumEncoder`, the main steps are as follows.\n", + "* Firstly, we convert the low-frequency categorical features to `\"LESS\"` and the missing categorical features to `\"UNK\"`. \n", + "* Secondly, we convert the missing numerical features into the mean of corresponding columns. \n", + "* Thirdly, the string-like categorical features are ordinal encoded like the example shown in basic usage. \n", + "* And then, we target encode the categorical features in the samples order one-by-one. For each sample, we add the label and count information of its former samples into the data and produce new features. Formally, for $i=1,2,...,n$, we add $\\frac{\\sum\\nolimits_{j=1}^{i-1} I(x_j=c) \\cdot y}{\\sum\\nolimits_{j=1}^{i-1} I(x_j=c)}$ as a new label feature for current sample $x_i$, where $c$ is a category to encode in current sample, so $(i-1)$ is the number of former samples, and $I(\\cdot)$ is the indicator function that check the former samples contain $c$ (whether $x_j=c$) or not. At the meantime, we also add the count frequency of $c$, which is $\\frac{\\sum\\nolimits_{j=1}^{i-1} I(x_j=c)}{i-1}$, as a new count feature. 
\n", + "* Finally, based on the results of ordinal encoding, we add the binary encoding results as new columns into the data.\n", + "\n", + "Note that the statistics used in the above process only updates when fitting the training set, while maintaining static when transforming the testing set because the label of test data should be considered as unknown." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-08-14 17:43:41,680 [INFO] Filtering and fillna features\n", + "100%|███████████████████████████████████████████████████████████████████████████████████| 26/26 [00:02<00:00, 11.68it/s]\n", + "100%|██████████████████████████████████████████████████████████████████████████████████| 13/13 [00:00<00:00, 416.90it/s]\n", + "2023-08-14 17:43:43,946 [INFO] Ordinal encoding cate features\n", + "2023-08-14 17:43:44,781 [INFO] Target encoding cate features\n", + "100%|███████████████████████████████████████████████████████████████████████████████████| 26/26 [00:02<00:00, 9.89it/s]\n", + "2023-08-14 17:43:47,414 [INFO] Start manual binary encoding\n", + "100%|███████████████████████████████████████████████████████████████████████████████████| 65/65 [00:04<00:00, 14.63it/s]\n", + "100%|███████████████████████████████████████████████████████████████████████████████████| 26/26 [00:02<00:00, 11.34it/s]\n", + "2023-08-14 17:43:54,278 [INFO] Filtering and fillna features\n", + "100%|██████████████████████████████████████████████████████████████████████████████████| 26/26 [00:00<00:00, 314.21it/s]\n", + "100%|█████████████████████████████████████████████████████████████████████████████████| 13/13 [00:00<00:00, 3019.32it/s]\n", + "2023-08-14 17:43:54,372 [INFO] Ordinal encoding cate features\n", + "2023-08-14 17:43:54,473 [INFO] Target encoding cate features\n", + "100%|██████████████████████████████████████████████████████████████████████████████████| 26/26 [00:00<00:00, 149.84it/s]\n", + "2023-08-14 17:43:54,651 [INFO] Start manual binary encoding\n", + "100%|███████████████████████████████████████████████████████████████████████████████████| 65/65 [00:04<00:00, 15.60it/s]\n", + "100%|███████████████████████████████████████████████████████████████████████████████████| 26/26 [00:01<00:00, 15.70it/s]\n", + "2023-08-14 17:44:00,559 [INFO] Filtering and fillna features\n", + "100%|██████████████████████████████████████████████████████████████████████████████████| 26/26 [00:00<00:00, 243.57it/s]\n", + "100%|█████████████████████████████████████████████████████████████████████████████████| 13/13 [00:00<00:00, 2571.74it/s]\n", + "2023-08-14 17:44:00,677 [INFO] Ordinal encoding cate features\n", + "2023-08-14 17:44:00,781 [INFO] Target encoding cate features\n", + "100%|██████████████████████████████████████████████████████████████████████████████████| 26/26 [00:00<00:00, 124.60it/s]\n", + "2023-08-14 17:44:00,994 [INFO] Start manual binary encoding\n", + "100%|███████████████████████████████████████████████████████████████████████████████████| 65/65 [00:04<00:00, 14.66it/s]\n", + "100%|███████████████████████████████████████████████████████████████████████████████████| 26/26 [00:01<00:00, 14.88it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train Data Shape: X: (80000, 268); Y: (80000, 1).\n", + "Valid Data Shape: X: (10000, 268); Y: (10000, 1).\n", + "Test Data Shape: X: (10000, 268); Y: (10000, 1).\n", + "\n" + ] + } + ], + "source": [ + 
"label_col = \"Label\"\n", + "num_encoder = lgb_utils.NumEncoder(cate_cols, nume_cols, label_col)\n", + "train_x, train_y = num_encoder.fit_transform(train_data)\n", + "valid_x, valid_y = num_encoder.transform(valid_data)\n", + "test_x, test_y = num_encoder.transform(test_data)\n", + "del num_encoder\n", + "print(\"Train Data Shape: X: {trn_x_shape}; Y: {trn_y_shape}.\\nValid Data Shape: X: {vld_x_shape}; Y: {vld_y_shape}.\\nTest Data Shape: X: {tst_x_shape}; Y: {tst_y_shape}.\\n\"\n", + " .format(trn_x_shape=train_x.shape,\n", + " trn_y_shape=train_y.shape,\n", + " vld_x_shape=valid_x.shape,\n", + " vld_y_shape=valid_y.shape,\n", + " tst_x_shape=test_x.shape,\n", + " tst_y_shape=test_y.shape,))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Training and Evaluation" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[LightGBM] [Info] Number of positive: 17958, number of negative: 62042\n", + "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.165299 seconds.\n", + "You can set `force_row_wise=true` to remove the overhead.\n", + "And if memory is not enough, you can set `force_col_wise=true`.\n", + "[LightGBM] [Info] Total Bins 15787\n", + "[LightGBM] [Info] Number of data points in the train set: 80000, number of used features: 267\n", + "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.224475 -> initscore=-1.239776\n", + "[LightGBM] [Info] Start training from score -1.239776\n", + "Training until validation scores don't improve for 20 rounds\n", + "Early stopping, best iteration is:\n", + "[43]\tvalid_0's auc: 0.77085\n" + ] + } + ], + "source": [ + "lgb_train = lgb.Dataset(train_x, train_y.reshape(-1), params=params)\n", + "lgb_valid = lgb.Dataset(valid_x, valid_y.reshape(-1), reference=lgb_train)\n", + "lgb_model = lgb.train(params,\n", + " lgb_train,\n", + " num_boost_round=NUM_OF_TREES,\n", + " valid_sets=lgb_valid,\n", + " callbacks=[lgb.early_stopping(EARLY_STOPPING_ROUNDS)])" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "test_preds = lgb_model.predict(test_x)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'auc': 0.7758548016657666, 'logloss': 0.46030887404896165}\n" + ] + }, + { + "data": { + "application/scrapbook.scrap.json+json": { + "data": { + "auc": 0.7758548016657666, + "logloss": 0.46030887404896165 + }, + "encoder": "json", + "name": "res_optim", + "version": 1 + } + }, + "metadata": { + "scrapbook": { + "data": true, + "display": false, + "name": "res_optim" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "auc = roc_auc_score(np.asarray(test_y.reshape(-1)), np.asarray(test_preds))\n", + "logloss = log_loss(np.asarray(test_y.reshape(-1)), np.asarray(test_preds), eps=1e-12)\n", + "res_optim = {\"auc\": auc, \"logloss\": logloss}\n", + "\n", + "print(res_optim)\n", + "sb.glue(\"res_optim\", res_optim)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Model saving and loading\n", + "Now we finish the basic training and testing for LightGBM, next let's try to save and reload the model, and then evaluate it again." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "with TemporaryDirectory() as tmp:\n", + " save_file = os.path.join(tmp, \"finished.model\")\n", + " lgb_model.save_model(save_file)\n", + " loaded_model = lgb.Booster(model_file=save_file)\n", + "\n", + "# eval the performance again\n", + "test_preds = loaded_model.predict(test_x)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'auc': 0.7758548016657666, 'logloss': 0.46030887404896165}\n" + ] + } + ], + "source": [ + "auc = roc_auc_score(np.asarray(test_y.reshape(-1)), np.asarray(test_preds))\n", + "logloss = log_loss(np.asarray(test_y.reshape(-1)), np.asarray(test_preds), eps=1e-12)\n", + "\n", + "print({\"auc\": auc, \"logloss\": logloss})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Additional Reading\n", + "\n", + "\\[1\\] Guolin Ke, Qi Meng, Thomas Finley, Taifeng Wang, Wei Chen, Weidong Ma, Qiwei Ye, and Tie-Yan Liu. 2017. LightGBM: A highly efficient gradient boosting decision tree. In Advances in Neural Information Processing Systems. 3146–3154.
\n", + "\\[2\\] The parameters of LightGBM: https://github.com/Microsoft/LightGBM/blob/master/docs/Parameters.rst
\n", + "\\[3\\] Anna Veronika Dorogush, Vasily Ershov, and Andrey Gulin. 2018. CatBoost: gradient boosting with categorical features support. arXiv preprint arXiv:1810.11363 (2018).
\n", + "\\[4\\] Scikit-learn. 2018. categorical_encoding. https://github.com/scikit-learn-contrib/categorical-encoding
\n" + ] + } + ], + "metadata": { + "celltoolbar": "Tags", + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
LabelI1I2I3I4I5I6I7I8I9...C17C18C19C20C21C22C23C24C25C26
001.015.00.01382.04.015.02.0181.0...e5ba7672f54016b921ddcdc9b1252a9d07b5194cNaN3a171ecbc5c50484e8b834079727dd16
102.0044.01.0102.08.02.02.04.0...07c540c4b04e467021ddcdc95840adea60f6221eNaN3a171ecb43f13e8be8b83407731c3655
202.001.014.0767.089.04.02.0245.0...8efede7f3412118dNaNNaNe587c466ad3062eb3a171ecb3b183c5cNaNNaN
30NaN893NaNNaN4392.0NaN0.00.00.0...1e88c74f74ef3502NaNNaN6b3a5ca6NaN3a171ecb9117a34aNaNNaN
403.0-1NaN0.02.00.03.00.00.0...1e88c74f26b3c7a7NaNNaN21c9516aNaN32c7478eb34f3128NaNNaN
\n", - "

5 rows × 40 columns

\n", - "
" - ], - "text/plain": [ - " Label I1 I2 I3 I4 I5 I6 I7 I8 I9 ... C17 \\\n", - "0 0 1.0 1 5.0 0.0 1382.0 4.0 15.0 2.0 181.0 ... e5ba7672 \n", - "1 0 2.0 0 44.0 1.0 102.0 8.0 2.0 2.0 4.0 ... 07c540c4 \n", - "2 0 2.0 0 1.0 14.0 767.0 89.0 4.0 2.0 245.0 ... 8efede7f \n", - "3 0 NaN 893 NaN NaN 4392.0 NaN 0.0 0.0 0.0 ... 1e88c74f \n", - "4 0 3.0 -1 NaN 0.0 2.0 0.0 3.0 0.0 0.0 ... 1e88c74f \n", - "\n", - " C18 C19 C20 C21 C22 C23 C24 \\\n", - "0 f54016b9 21ddcdc9 b1252a9d 07b5194c NaN 3a171ecb c5c50484 \n", - "1 b04e4670 21ddcdc9 5840adea 60f6221e NaN 3a171ecb 43f13e8b \n", - "2 3412118d NaN NaN e587c466 ad3062eb 3a171ecb 3b183c5c \n", - "3 74ef3502 NaN NaN 6b3a5ca6 NaN 3a171ecb 9117a34a \n", - "4 26b3c7a7 NaN NaN 21c9516a NaN 32c7478e b34f3128 \n", - "\n", - " C25 C26 \n", - "0 e8b83407 9727dd16 \n", - "1 e8b83407 731c3655 \n", - "2 NaN NaN \n", - "3 NaN NaN \n", - "4 NaN NaN \n", - "\n", - "[5 rows x 40 columns]" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "nume_cols = [\"I\" + str(i) for i in range(1, 14)]\n", - "cate_cols = [\"C\" + str(i) for i in range(1, 27)]\n", - "label_col = \"Label\"\n", - "\n", - "header = [label_col] + nume_cols + cate_cols\n", - "with TemporaryDirectory() as tmp:\n", - " all_data = criteo.load_pandas_df(size=SIZE, local_cache_path=tmp, header=header)\n", - "display(all_data.head())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First, we cut three sets (train_data (first 80%), valid_data (middle 10%) and test_data (last 10%)), cut from the original all data.
\n", - "Notably, considering the Criteo is a kind of time-series streaming data, which is also very common in recommendation scenario, we split the data by its order." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "# split data to 3 sets \n", - "length = len(all_data)\n", - "train_data = all_data.loc[:0.8*length-1]\n", - "valid_data = all_data.loc[0.8*length:0.9*length-1]\n", - "test_data = all_data.loc[0.9*length:]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Basic Usage\n", - "### Ordinal Encoding\n", - "Considering LightGBM could handle the low-frequency features and missing value by itself, for basic usage, we only encode the string-like categorical features by an ordinal encoder." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Train Data Shape: X: (80000, 39); Y: (80000,).\n", - "Valid Data Shape: X: (10000, 39); Y: (10000,).\n", - "Test Data Shape: X: (10000, 39); Y: (10000,).\n", - "\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
I1I2I3I4I5I6I7I8I9I10...C17C18C19C20C21C22C23C24C25C26
01.015.00.01382.04.015.02.0181.01.0...1111111111
12.0044.01.0102.08.02.02.04.01.0...2212211212
22.001.014.0767.089.04.02.0245.01.0...3323321323
3NaN893NaNNaN4392.0NaN0.00.00.0NaN...4423411423
43.0-1NaN0.02.00.03.00.00.01.0...4523512523
\n", - "

5 rows × 39 columns

\n", - "
" - ], - "text/plain": [ - " I1 I2 I3 I4 I5 I6 I7 I8 I9 I10 ... C17 C18 \\\n", - "0 1.0 1 5.0 0.0 1382.0 4.0 15.0 2.0 181.0 1.0 ... 1 1 \n", - "1 2.0 0 44.0 1.0 102.0 8.0 2.0 2.0 4.0 1.0 ... 2 2 \n", - "2 2.0 0 1.0 14.0 767.0 89.0 4.0 2.0 245.0 1.0 ... 3 3 \n", - "3 NaN 893 NaN NaN 4392.0 NaN 0.0 0.0 0.0 NaN ... 4 4 \n", - "4 3.0 -1 NaN 0.0 2.0 0.0 3.0 0.0 0.0 1.0 ... 4 5 \n", - "\n", - " C19 C20 C21 C22 C23 C24 C25 C26 \n", - "0 1 1 1 1 1 1 1 1 \n", - "1 1 2 2 1 1 2 1 2 \n", - "2 2 3 3 2 1 3 2 3 \n", - "3 2 3 4 1 1 4 2 3 \n", - "4 2 3 5 1 2 5 2 3 \n", - "\n", - "[5 rows x 39 columns]" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ord_encoder = ce.ordinal.OrdinalEncoder(cols=cate_cols)\n", - "\n", - "def encode_csv(df, encoder, label_col, typ=\"fit\"):\n", - " if typ == \"fit\":\n", - " df = encoder.fit_transform(df)\n", - " else:\n", - " df = encoder.transform(df)\n", - " y = df[label_col].values\n", - " del df[label_col]\n", - " return df, y\n", - "\n", - "train_x, train_y = encode_csv(train_data, ord_encoder, label_col)\n", - "valid_x, valid_y = encode_csv(valid_data, ord_encoder, label_col, \"transform\")\n", - "test_x, test_y = encode_csv(test_data, ord_encoder, label_col, \"transform\")\n", - "\n", - "print(\"Train Data Shape: X: {trn_x_shape}; Y: {trn_y_shape}.\\nValid Data Shape: X: {vld_x_shape}; Y: {vld_y_shape}.\\nTest Data Shape: X: {tst_x_shape}; Y: {tst_y_shape}.\\n\"\n", - " .format(trn_x_shape=train_x.shape,\n", - " trn_y_shape=train_y.shape,\n", - " vld_x_shape=valid_x.shape,\n", - " vld_y_shape=valid_y.shape,\n", - " tst_x_shape=test_x.shape,\n", - " tst_y_shape=test_y.shape,))\n", - "train_x.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create model\n", - "When both hyper-parameters and data are ready, we can create a model:" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[LightGBM] [Info] Number of positive: 17958, number of negative: 62042\n", - "[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.023124 seconds.\n", - "You can set `force_col_wise=true` to remove the overhead.\n", - "[LightGBM] [Info] Total Bins 38971\n", - "[LightGBM] [Info] Number of data points in the train set: 80000, number of used features: 39\n", - "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.224475 -> initscore=-1.239776\n", - "[LightGBM] [Info] Start training from score -1.239776\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/miguel/anaconda/envs/recommenders/lib/python3.9/site-packages/lightgbm/basic.py:1780: UserWarning: Overriding the parameters from Reference Dataset.\n", - " _log_warning('Overriding the parameters from Reference Dataset.')\n", - "/home/miguel/anaconda/envs/recommenders/lib/python3.9/site-packages/lightgbm/basic.py:1513: UserWarning: categorical_column in param dict is overridden.\n", - " _log_warning(f'{cat_alias} in param dict is overridden.')\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Training until validation scores don't improve for 20 rounds\n", - "Early stopping, best iteration is:\n", - "[18]\tvalid_0's auc: 0.759658\n" - ] - } - ], - "source": [ - "lgb_train = lgb.Dataset(train_x, train_y.reshape(-1), params=params, categorical_feature=cate_cols)\n", - "lgb_valid = lgb.Dataset(valid_x, valid_y.reshape(-1), reference=lgb_train, 
categorical_feature=cate_cols)\n", - "lgb_test = lgb.Dataset(test_x, test_y.reshape(-1), reference=lgb_train, categorical_feature=cate_cols)\n", - "lgb_model = lgb.train(params,\n", - " lgb_train,\n", - " num_boost_round=NUM_OF_TREES,\n", - " valid_sets=lgb_valid,\n", - " categorical_feature=cate_cols,\n", - " callbacks=[lgb.early_stopping(EARLY_STOPPING_ROUNDS)])\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now let's see what is the model's performance:" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'auc': 0.7655408801711783, 'logloss': 0.46825831788359984}\n" - ] - }, - { - "data": { - "application/scrapbook.scrap.json+json": { - "data": { - "auc": 0.7655408801711783, - "logloss": 0.46825831788359984 - }, - "encoder": "json", - "name": "res_basic", - "version": 1 - } - }, - "metadata": { - "scrapbook": { - "data": true, - "display": false, - "name": "res_basic" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "test_preds = lgb_model.predict(test_x)\n", - "auc = roc_auc_score(np.asarray(test_y.reshape(-1)), np.asarray(test_preds))\n", - "logloss = log_loss(np.asarray(test_y.reshape(-1)), np.asarray(test_preds), eps=1e-12)\n", - "res_basic = {\"auc\": auc, \"logloss\": logloss}\n", - "print(res_basic)\n", - "sb.glue(\"res_basic\", res_basic)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "## Optimized Usage\n", - "### Label-encoding and Binary-encoding\n", - "Next, since LightGBM has a better capability in handling dense numerical features effectively, we try to convert all the categorical features in original data into numerical ones, by label-encoding [3] and binary-encoding [4]. Also due to the sequence property of Criteo, the label-encoding we adopted is executed one-by-one, which means we encode the samples in order, by the information of the previous samples before each sample (sequential label-encoding and sequential count-encoding). Besides, we also filter the low-frequency categorical features and fill the missing values by the mean of corresponding columns for the numerical features. (consulting `lgb_utils.NumEncoder`)\n", - "\n", - "Specifically, in `lgb_utils.NumEncoder`, the main steps are as follows.\n", - "* Firstly, we convert the low-frequency categorical features to `\"LESS\"` and the missing categorical features to `\"UNK\"`. \n", - "* Secondly, we convert the missing numerical features into the mean of corresponding columns. \n", - "* Thirdly, the string-like categorical features are ordinal encoded like the example shown in basic usage. \n", - "* And then, we target encode the categorical features in the samples order one-by-one. For each sample, we add the label and count information of its former samples into the data and produce new features. Formally, for $i=1,2,...,n$, we add $\\frac{\\sum\\nolimits_{j=1}^{i-1} I(x_j=c) \\cdot y}{\\sum\\nolimits_{j=1}^{i-1} I(x_j=c)}$ as a new label feature for current sample $x_i$, where $c$ is a category to encode in current sample, so $(i-1)$ is the number of former samples, and $I(\\cdot)$ is the indicator function that check the former samples contain $c$ (whether $x_j=c$) or not. At the meantime, we also add the count frequency of $c$, which is $\\frac{\\sum\\nolimits_{j=1}^{i-1} I(x_j=c)}{i-1}$, as a new count feature. 
\n", - "* Finally, based on the results of ordinal encoding, we add the binary encoding results as new columns into the data.\n", - "\n", - "Note that the statistics used in the above process only updates when fitting the training set, while maintaining static when transforming the testing set because the label of test data should be considered as unknown." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2023-08-14 17:43:41,680 [INFO] Filtering and fillna features\n", - "100%|███████████████████████████████████████████████████████████████████████████████████| 26/26 [00:02<00:00, 11.68it/s]\n", - "100%|██████████████████████████████████████████████████████████████████████████████████| 13/13 [00:00<00:00, 416.90it/s]\n", - "2023-08-14 17:43:43,946 [INFO] Ordinal encoding cate features\n", - "2023-08-14 17:43:44,781 [INFO] Target encoding cate features\n", - "100%|███████████████████████████████████████████████████████████████████████████████████| 26/26 [00:02<00:00, 9.89it/s]\n", - "2023-08-14 17:43:47,414 [INFO] Start manual binary encoding\n", - "100%|███████████████████████████████████████████████████████████████████████████████████| 65/65 [00:04<00:00, 14.63it/s]\n", - "100%|███████████████████████████████████████████████████████████████████████████████████| 26/26 [00:02<00:00, 11.34it/s]\n", - "2023-08-14 17:43:54,278 [INFO] Filtering and fillna features\n", - "100%|██████████████████████████████████████████████████████████████████████████████████| 26/26 [00:00<00:00, 314.21it/s]\n", - "100%|█████████████████████████████████████████████████████████████████████████████████| 13/13 [00:00<00:00, 3019.32it/s]\n", - "2023-08-14 17:43:54,372 [INFO] Ordinal encoding cate features\n", - "2023-08-14 17:43:54,473 [INFO] Target encoding cate features\n", - "100%|██████████████████████████████████████████████████████████████████████████████████| 26/26 [00:00<00:00, 149.84it/s]\n", - "2023-08-14 17:43:54,651 [INFO] Start manual binary encoding\n", - "100%|███████████████████████████████████████████████████████████████████████████████████| 65/65 [00:04<00:00, 15.60it/s]\n", - "100%|███████████████████████████████████████████████████████████████████████████████████| 26/26 [00:01<00:00, 15.70it/s]\n", - "2023-08-14 17:44:00,559 [INFO] Filtering and fillna features\n", - "100%|██████████████████████████████████████████████████████████████████████████████████| 26/26 [00:00<00:00, 243.57it/s]\n", - "100%|█████████████████████████████████████████████████████████████████████████████████| 13/13 [00:00<00:00, 2571.74it/s]\n", - "2023-08-14 17:44:00,677 [INFO] Ordinal encoding cate features\n", - "2023-08-14 17:44:00,781 [INFO] Target encoding cate features\n", - "100%|██████████████████████████████████████████████████████████████████████████████████| 26/26 [00:00<00:00, 124.60it/s]\n", - "2023-08-14 17:44:00,994 [INFO] Start manual binary encoding\n", - "100%|███████████████████████████████████████████████████████████████████████████████████| 65/65 [00:04<00:00, 14.66it/s]\n", - "100%|███████████████████████████████████████████████████████████████████████████████████| 26/26 [00:01<00:00, 14.88it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Train Data Shape: X: (80000, 268); Y: (80000, 1).\n", - "Valid Data Shape: X: (10000, 268); Y: (10000, 1).\n", - "Test Data Shape: X: (10000, 268); Y: (10000, 1).\n", - "\n" - ] - } - ], - "source": [ - 
"label_col = \"Label\"\n", - "num_encoder = lgb_utils.NumEncoder(cate_cols, nume_cols, label_col)\n", - "train_x, train_y = num_encoder.fit_transform(train_data)\n", - "valid_x, valid_y = num_encoder.transform(valid_data)\n", - "test_x, test_y = num_encoder.transform(test_data)\n", - "del num_encoder\n", - "print(\"Train Data Shape: X: {trn_x_shape}; Y: {trn_y_shape}.\\nValid Data Shape: X: {vld_x_shape}; Y: {vld_y_shape}.\\nTest Data Shape: X: {tst_x_shape}; Y: {tst_y_shape}.\\n\"\n", - " .format(trn_x_shape=train_x.shape,\n", - " trn_y_shape=train_y.shape,\n", - " vld_x_shape=valid_x.shape,\n", - " vld_y_shape=valid_y.shape,\n", - " tst_x_shape=test_x.shape,\n", - " tst_y_shape=test_y.shape,))\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Training and Evaluation" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[LightGBM] [Info] Number of positive: 17958, number of negative: 62042\n", - "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.165299 seconds.\n", - "You can set `force_row_wise=true` to remove the overhead.\n", - "And if memory is not enough, you can set `force_col_wise=true`.\n", - "[LightGBM] [Info] Total Bins 15787\n", - "[LightGBM] [Info] Number of data points in the train set: 80000, number of used features: 267\n", - "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.224475 -> initscore=-1.239776\n", - "[LightGBM] [Info] Start training from score -1.239776\n", - "Training until validation scores don't improve for 20 rounds\n", - "Early stopping, best iteration is:\n", - "[43]\tvalid_0's auc: 0.77085\n" - ] - } - ], - "source": [ - "lgb_train = lgb.Dataset(train_x, train_y.reshape(-1), params=params)\n", - "lgb_valid = lgb.Dataset(valid_x, valid_y.reshape(-1), reference=lgb_train)\n", - "lgb_model = lgb.train(params,\n", - " lgb_train,\n", - " num_boost_round=NUM_OF_TREES,\n", - " valid_sets=lgb_valid,\n", - " callbacks=[lgb.early_stopping(EARLY_STOPPING_ROUNDS)])" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "test_preds = lgb_model.predict(test_x)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'auc': 0.7758548016657666, 'logloss': 0.46030887404896165}\n" - ] - }, - { - "data": { - "application/scrapbook.scrap.json+json": { - "data": { - "auc": 0.7758548016657666, - "logloss": 0.46030887404896165 - }, - "encoder": "json", - "name": "res_optim", - "version": 1 - } - }, - "metadata": { - "scrapbook": { - "data": true, - "display": false, - "name": "res_optim" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "auc = roc_auc_score(np.asarray(test_y.reshape(-1)), np.asarray(test_preds))\n", - "logloss = log_loss(np.asarray(test_y.reshape(-1)), np.asarray(test_preds), eps=1e-12)\n", - "res_optim = {\"auc\": auc, \"logloss\": logloss}\n", - "\n", - "print(res_optim)\n", - "sb.glue(\"res_optim\", res_optim)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Model saving and loading\n", - "Now we finish the basic training and testing for LightGBM, next let's try to save and reload the model, and then evaluate it again." 
- ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "with TemporaryDirectory() as tmp:\n", - " save_file = os.path.join(tmp, \"finished.model\")\n", - " lgb_model.save_model(save_file)\n", - " loaded_model = lgb.Booster(model_file=save_file)\n", - "\n", - "# eval the performance again\n", - "test_preds = loaded_model.predict(test_x)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'auc': 0.7758548016657666, 'logloss': 0.46030887404896165}\n" - ] - } - ], - "source": [ - "auc = roc_auc_score(np.asarray(test_y.reshape(-1)), np.asarray(test_preds))\n", - "logloss = log_loss(np.asarray(test_y.reshape(-1)), np.asarray(test_preds), eps=1e-12)\n", - "\n", - "print({\"auc\": auc, \"logloss\": logloss})" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Additional Reading\n", - "\n", - "\\[1\\] Guolin Ke, Qi Meng, Thomas Finley, Taifeng Wang, Wei Chen, Weidong Ma, Qiwei Ye, and Tie-Yan Liu. 2017. LightGBM: A highly efficient gradient boosting decision tree. In Advances in Neural Information Processing Systems. 3146–3154.
\n", - "\\[2\\] The parameters of LightGBM: https://github.com/Microsoft/LightGBM/blob/master/docs/Parameters.rst
\n", - "\\[3\\] Anna Veronika Dorogush, Vasily Ershov, and Andrey Gulin. 2018. CatBoost: gradient boosting with categorical features support. arXiv preprint arXiv:1810.11363 (2018).
\n", - "\\[4\\] Scikit-learn. 2018. categorical_encoding. https://github.com/scikit-learn-contrib/categorical-encoding
\n" - ] - } - ], - "metadata": { - "celltoolbar": "Tags", - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.16" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/examples/00_quick_start/lstur_MIND.ipynb b/examples/00_quick_start/lstur_MIND.ipynb index 783ff31111..daf17b9eef 100644 --- a/examples/00_quick_start/lstur_MIND.ipynb +++ b/examples/00_quick_start/lstur_MIND.ipynb @@ -1,562 +1,562 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Copyright (c) Microsoft Corporation. All rights reserved.\n", - "\n", - "Licensed under the MIT License." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# LSTUR: Neural News Recommendation with Long- and Short-term User Representations\n", - "LSTUR \\[1\\] is a news recommendation approach capturing users' both long-term preferences and short-term interests. The core of LSTUR is a news encoder and a user encoder. In the news encoder, we learn representations of news from their titles. In user encoder, we propose to learn long-term\n", - "user representations from the embeddings of their IDs. In addition, we propose to learn short-term user representations from their recently browsed news via GRU network. Besides, we propose two methods to combine\n", - "long-term and short-term user representations. The first one is using the long-term user representation to initialize the hidden state of the GRU network in short-term user representation. The second one is concatenating both\n", - "long- and short-term user representations as a unified user vector.\n", - "\n", - "## Properties of LSTUR:\n", - "- LSTUR captures users' both long-term and short term preference.\n", - "- It uses embeddings of users' IDs to learn long-term user representations.\n", - "- It uses users' recently browsed news via GRU network to learn short-term user representations.\n", - "\n", - "## Data format:\n", - "For quicker training and evaluaiton, we sample MINDdemo dataset of 5k users from [MIND small dataset](https://msnews.github.io/). The MINDdemo dataset has the same file format as MINDsmall and MINDlarge. If you want to try experiments on MINDsmall and MINDlarge, please change the dowload source. Select the MIND_type parameter from ['large', 'small', 'demo'] to choose dataset.\n", - " \n", - "**MINDdemo_train** is used for training, and **MINDdemo_dev** is used for evaluation. Training data and evaluation data are composed of a news file and a behaviors file. You can find more detailed data description in [MIND repo](https://github.com/msnews/msnews.github.io/blob/master/assets/doc/introduction.md)\n", - "\n", - "### news data\n", - "This file contains news information including newsid, category, subcatgory, news title, news abstarct, news url and entities in news title, entities in news abstarct.\n", - "One simple example:
\n", - "\n", - "`N46466\tlifestyle\tlifestyleroyals\tThe Brands Queen Elizabeth, Prince Charles, and Prince Philip Swear By\tShop the notebooks, jackets, and more that the royals can't live without.\thttps://www.msn.com/en-us/lifestyle/lifestyleroyals/the-brands-queen-elizabeth,-prince-charles,-and-prince-philip-swear-by/ss-AAGH0ET?ocid=chopendata\t[{\"Label\": \"Prince Philip, Duke of Edinburgh\", \"Type\": \"P\", \"WikidataId\": \"Q80976\", \"Confidence\": 1.0, \"OccurrenceOffsets\": [48], \"SurfaceForms\": [\"Prince Philip\"]}, {\"Label\": \"Charles, Prince of Wales\", \"Type\": \"P\", \"WikidataId\": \"Q43274\", \"Confidence\": 1.0, \"OccurrenceOffsets\": [28], \"SurfaceForms\": [\"Prince Charles\"]}, {\"Label\": \"Elizabeth II\", \"Type\": \"P\", \"WikidataId\": \"Q9682\", \"Confidence\": 0.97, \"OccurrenceOffsets\": [11], \"SurfaceForms\": [\"Queen Elizabeth\"]}]\t[]`\n", - "
\n", - "\n", - "In general, each line in data file represents information of one piece of news:
\n", - "\n", - "`[News ID] [Category] [Subcategory] [News Title] [News Abstrct] [News Url] [Entities in News Title] [Entities in News Abstract] ...`\n", - "\n", - "
\n", - "\n", - "We generate a word_dict file to tranform words in news title to word indexes, and a embedding matrix is initted from pretrained glove embeddings.\n", - "\n", - "### behaviors data\n", - "One simple example:
\n", - "`1\tU82271\t11/11/2019 3:28:58 PM\tN3130 N11621 N12917 N4574 N12140 N9748\tN13390-0 N7180-0 N20785-0 N6937-0 N15776-0 N25810-0 N20820-0 N6885-0 N27294-0 N18835-0 N16945-0 N7410-0 N23967-0 N22679-0 N20532-0 N26651-0 N22078-0 N4098-0 N16473-0 N13841-0 N15660-0 N25787-0 N2315-0 N1615-0 N9087-0 N23880-0 N3600-0 N24479-0 N22882-0 N26308-0 N13594-0 N2220-0 N28356-0 N17083-0 N21415-0 N18671-0 N9440-0 N17759-0 N10861-0 N21830-0 N8064-0 N5675-0 N15037-0 N26154-0 N15368-1 N481-0 N3256-0 N20663-0 N23940-0 N7654-0 N10729-0 N7090-0 N23596-0 N15901-0 N16348-0 N13645-0 N8124-0 N20094-0 N27774-0 N23011-0 N14832-0 N15971-0 N27729-0 N2167-0 N11186-0 N18390-0 N21328-0 N10992-0 N20122-0 N1958-0 N2004-0 N26156-0 N17632-0 N26146-0 N17322-0 N18403-0 N17397-0 N18215-0 N14475-0 N9781-0 N17958-0 N3370-0 N1127-0 N15525-0 N12657-0 N10537-0 N18224-0`\n", - "
\n", - "\n", - "In general, each line in data file represents one instance of an impression. The format is like:
\n", - "\n", - "`[Impression ID] [User ID] [Impression Time] [User Click History] [Impression News]`\n", - "\n", - "
\n", - "\n", - "User Click History is the user historical clicked news before Impression Time. Impression News is the displayed news in an impression, which format is:
\n", - "\n", - "`[News ID 1]-[label1] ... [News ID n]-[labeln]`\n", - "\n", - "
\n", - "Label represents whether the news is clicked by the user. All information of news in User Click History and Impression News can be found in news data file." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Global settings and imports" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/anaconda/envs/tf2/lib/python3.7/site-packages/papermill/iorw.py:50: FutureWarning: pyarrow.HadoopFileSystem is deprecated as of 2.0.0, please use pyarrow.fs.HadoopFileSystem instead.\n", - " from pyarrow import HadoopFileSystem\n" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Copyright (c) Recommenders contributors.\n", + "\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# LSTUR: Neural News Recommendation with Long- and Short-term User Representations\n", + "LSTUR \\[1\\] is a news recommendation approach capturing users' both long-term preferences and short-term interests. The core of LSTUR is a news encoder and a user encoder. In the news encoder, we learn representations of news from their titles. In user encoder, we propose to learn long-term\n", + "user representations from the embeddings of their IDs. In addition, we propose to learn short-term user representations from their recently browsed news via GRU network. Besides, we propose two methods to combine\n", + "long-term and short-term user representations. The first one is using the long-term user representation to initialize the hidden state of the GRU network in short-term user representation. The second one is concatenating both\n", + "long- and short-term user representations as a unified user vector.\n", + "\n", + "## Properties of LSTUR:\n", + "- LSTUR captures users' both long-term and short term preference.\n", + "- It uses embeddings of users' IDs to learn long-term user representations.\n", + "- It uses users' recently browsed news via GRU network to learn short-term user representations.\n", + "\n", + "## Data format:\n", + "For quicker training and evaluaiton, we sample MINDdemo dataset of 5k users from [MIND small dataset](https://msnews.github.io/). The MINDdemo dataset has the same file format as MINDsmall and MINDlarge. If you want to try experiments on MINDsmall and MINDlarge, please change the dowload source. Select the MIND_type parameter from ['large', 'small', 'demo'] to choose dataset.\n", + " \n", + "**MINDdemo_train** is used for training, and **MINDdemo_dev** is used for evaluation. Training data and evaluation data are composed of a news file and a behaviors file. You can find more detailed data description in [MIND repo](https://github.com/msnews/msnews.github.io/blob/master/assets/doc/introduction.md)\n", + "\n", + "### news data\n", + "This file contains news information including newsid, category, subcatgory, news title, news abstarct, news url and entities in news title, entities in news abstarct.\n", + "One simple example:
\n", + "\n", + "`N46466\tlifestyle\tlifestyleroyals\tThe Brands Queen Elizabeth, Prince Charles, and Prince Philip Swear By\tShop the notebooks, jackets, and more that the royals can't live without.\thttps://www.msn.com/en-us/lifestyle/lifestyleroyals/the-brands-queen-elizabeth,-prince-charles,-and-prince-philip-swear-by/ss-AAGH0ET?ocid=chopendata\t[{\"Label\": \"Prince Philip, Duke of Edinburgh\", \"Type\": \"P\", \"WikidataId\": \"Q80976\", \"Confidence\": 1.0, \"OccurrenceOffsets\": [48], \"SurfaceForms\": [\"Prince Philip\"]}, {\"Label\": \"Charles, Prince of Wales\", \"Type\": \"P\", \"WikidataId\": \"Q43274\", \"Confidence\": 1.0, \"OccurrenceOffsets\": [28], \"SurfaceForms\": [\"Prince Charles\"]}, {\"Label\": \"Elizabeth II\", \"Type\": \"P\", \"WikidataId\": \"Q9682\", \"Confidence\": 0.97, \"OccurrenceOffsets\": [11], \"SurfaceForms\": [\"Queen Elizabeth\"]}]\t[]`\n", + "
\n", + "\n", + "In general, each line in data file represents information of one piece of news:
\n", + "\n", + "`[News ID] [Category] [Subcategory] [News Title] [News Abstrct] [News Url] [Entities in News Title] [Entities in News Abstract] ...`\n", + "\n", + "
\n", + "\n", + "We generate a word_dict file to tranform words in news title to word indexes, and a embedding matrix is initted from pretrained glove embeddings.\n", + "\n", + "### behaviors data\n", + "One simple example:
\n", + "`1\tU82271\t11/11/2019 3:28:58 PM\tN3130 N11621 N12917 N4574 N12140 N9748\tN13390-0 N7180-0 N20785-0 N6937-0 N15776-0 N25810-0 N20820-0 N6885-0 N27294-0 N18835-0 N16945-0 N7410-0 N23967-0 N22679-0 N20532-0 N26651-0 N22078-0 N4098-0 N16473-0 N13841-0 N15660-0 N25787-0 N2315-0 N1615-0 N9087-0 N23880-0 N3600-0 N24479-0 N22882-0 N26308-0 N13594-0 N2220-0 N28356-0 N17083-0 N21415-0 N18671-0 N9440-0 N17759-0 N10861-0 N21830-0 N8064-0 N5675-0 N15037-0 N26154-0 N15368-1 N481-0 N3256-0 N20663-0 N23940-0 N7654-0 N10729-0 N7090-0 N23596-0 N15901-0 N16348-0 N13645-0 N8124-0 N20094-0 N27774-0 N23011-0 N14832-0 N15971-0 N27729-0 N2167-0 N11186-0 N18390-0 N21328-0 N10992-0 N20122-0 N1958-0 N2004-0 N26156-0 N17632-0 N26146-0 N17322-0 N18403-0 N17397-0 N18215-0 N14475-0 N9781-0 N17958-0 N3370-0 N1127-0 N15525-0 N12657-0 N10537-0 N18224-0`\n", + "
\n", + "\n", + "In general, each line in data file represents one instance of an impression. The format is like:
\n", + "\n", + "`[Impression ID] [User ID] [Impression Time] [User Click History] [Impression News]`\n", + "\n", + "
\n", + "\n", + "User Click History is the user historical clicked news before Impression Time. Impression News is the displayed news in an impression, which format is:
\n", + "\n", + "`[News ID 1]-[label1] ... [News ID n]-[labeln]`\n", + "\n", + "
\n", + "Label represents whether the news is clicked by the user. All information of news in User Click History and Impression News can be found in news data file." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Global settings and imports" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/anaconda/envs/tf2/lib/python3.7/site-packages/papermill/iorw.py:50: FutureWarning: pyarrow.HadoopFileSystem is deprecated as of 2.0.0, please use pyarrow.fs.HadoopFileSystem instead.\n", + " from pyarrow import HadoopFileSystem\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "System version: 3.7.11 (default, Jul 27 2021, 14:32:16) \n", + "[GCC 7.5.0]\n", + "Tensorflow version: 2.6.1\n" + ] + } + ], + "source": [ + "import sys\n", + "import os\n", + "import numpy as np\n", + "import zipfile\n", + "from tqdm import tqdm\n", + "import scrapbook as sb\n", + "from tempfile import TemporaryDirectory\n", + "import tensorflow as tf\n", + "tf.get_logger().setLevel('ERROR') # only show error messages\n", + "\n", + "from recommenders.models.deeprec.deeprec_utils import download_deeprec_resources \n", + "from recommenders.models.newsrec.newsrec_utils import prepare_hparams\n", + "from recommenders.models.newsrec.models.lstur import LSTURModel\n", + "from recommenders.models.newsrec.io.mind_iterator import MINDIterator\n", + "from recommenders.models.newsrec.newsrec_utils import get_mind_data_set\n", + "\n", + "print(\"System version: {}\".format(sys.version))\n", + "print(\"Tensorflow version: {}\".format(tf.__version__))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Prepare Parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "tags": [ + "parameters" + ] + }, + "outputs": [], + "source": [ + "epochs = 5\n", + "seed = 40\n", + "batch_size = 32\n", + "\n", + "# Options: demo, small, large\n", + "MIND_type = 'demo'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download and load data" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 17.0k/17.0k [00:01<00:00, 9.67kKB/s]\n", + "100%|██████████| 9.84k/9.84k [00:01<00:00, 8.34kKB/s]\n", + "100%|██████████| 95.0k/95.0k [00:08<00:00, 11.4kKB/s]\n" + ] + } + ], + "source": [ + "tmpdir = TemporaryDirectory()\n", + "data_path = tmpdir.name\n", + "\n", + "train_news_file = os.path.join(data_path, 'train', r'news.tsv')\n", + "train_behaviors_file = os.path.join(data_path, 'train', r'behaviors.tsv')\n", + "valid_news_file = os.path.join(data_path, 'valid', r'news.tsv')\n", + "valid_behaviors_file = os.path.join(data_path, 'valid', r'behaviors.tsv')\n", + "wordEmb_file = os.path.join(data_path, \"utils\", \"embedding.npy\")\n", + "userDict_file = os.path.join(data_path, \"utils\", \"uid2index.pkl\")\n", + "wordDict_file = os.path.join(data_path, \"utils\", \"word_dict.pkl\")\n", + "yaml_file = os.path.join(data_path, \"utils\", r'lstur.yaml')\n", + "\n", + "mind_url, mind_train_dataset, mind_dev_dataset, mind_utils = get_mind_data_set(MIND_type)\n", + "\n", + "if not os.path.exists(train_news_file):\n", + " download_deeprec_resources(mind_url, os.path.join(data_path, 'train'), mind_train_dataset)\n", + " \n", + "if not os.path.exists(valid_news_file):\n", + " 
download_deeprec_resources(mind_url, \\\n", + " os.path.join(data_path, 'valid'), mind_dev_dataset)\n", + "if not os.path.exists(yaml_file):\n", + " download_deeprec_resources(r'https://recodatasets.z20.web.core.windows.net/newsrec/', \\\n", + " os.path.join(data_path, 'utils'), mind_utils)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create hyper-parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "data_format=news,iterator_type=None,support_quick_scoring=True,wordEmb_file=/tmp/tmpcpgw9veg/utils/embedding.npy,wordDict_file=/tmp/tmpcpgw9veg/utils/word_dict.pkl,userDict_file=/tmp/tmpcpgw9veg/utils/uid2index.pkl,vertDict_file=None,subvertDict_file=None,title_size=30,body_size=None,word_emb_dim=300,word_size=None,user_num=None,vert_num=None,subvert_num=None,his_size=50,npratio=4,dropout=0.2,attention_hidden_dim=200,head_num=4,head_dim=100,cnn_activation=relu,dense_activation=None,filter_num=400,window_size=3,vert_emb_dim=100,subvert_emb_dim=100,gru_unit=400,type=ini,user_emb_dim=50,learning_rate=0.0001,loss=cross_entropy_loss,optimizer=adam,epochs=5,batch_size=32,show_step=100000,metrics=['group_auc', 'mean_mrr', 'ndcg@5;10']\n" + ] + } + ], + "source": [ + "hparams = prepare_hparams(yaml_file, \n", + " wordEmb_file=wordEmb_file,\n", + " wordDict_file=wordDict_file, \n", + " userDict_file=userDict_file,\n", + " batch_size=batch_size,\n", + " epochs=epochs)\n", + "print(hparams)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "iterator = MINDIterator" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Train the LSTUR model" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tensor(\"conv1d/Relu:0\", shape=(?, 30, 400), dtype=float32)\n", + "Tensor(\"att_layer2/Sum_1:0\", shape=(?, 400), dtype=float32)\n" + ] + } + ], + "source": [ + "model = LSTURModel(hparams, iterator, seed=seed)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "586it [00:03, 155.76it/s]\n", + "236it [00:09, 26.08it/s]\n", + "7538it [00:00, 7590.51it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'group_auc': 0.5201, 'mean_mrr': 0.2214, 'ndcg@5': 0.2292, 'ndcg@10': 0.2912}\n" + ] + } + ], + "source": [ + "print(model.run_eval(valid_news_file, valid_behaviors_file))" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "1086it [02:23, 7.55it/s]\n", + "586it [00:01, 430.29it/s]\n", + "236it [00:08, 28.16it/s]\n", + "7538it [00:01, 6738.86it/s]\n", + "1it [00:00, 7.26it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "at epoch 1\n", + "train info: logloss loss:1.4868141242592814\n", + "eval info: group_auc:0.5973, mean_mrr:0.2622, ndcg@10:0.3501, ndcg@5:0.2861\n", + "at epoch 1 , train time: 143.8 eval time: 18.5\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "1086it [02:18, 7.85it/s]\n", + "586it [00:01, 455.05it/s]\n", + "236it [00:08, 28.32it/s]\n", + "7538it [00:01, 6669.92it/s]\n", + "1it [00:00, 8.64it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + 
"text": [ + "at epoch 2\n", + "train info: logloss loss:1.3999453176011916\n", + "eval info: group_auc:0.6219, mean_mrr:0.2803, ndcg@10:0.3726, ndcg@5:0.3099\n", + "at epoch 2 , train time: 138.3 eval time: 19.2\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "1086it [02:18, 7.83it/s]\n", + "586it [00:01, 448.54it/s]\n", + "236it [00:08, 28.40it/s]\n", + "7538it [00:00, 8089.03it/s]\n", + "1it [00:00, 8.04it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "at epoch 3\n", + "train info: logloss loss:1.3563778104044455\n", + "eval info: group_auc:0.6281, mean_mrr:0.285, ndcg@10:0.3785, ndcg@5:0.3159\n", + "at epoch 3 , train time: 138.7 eval time: 18.2\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "1086it [02:18, 7.84it/s]\n", + "586it [00:01, 431.78it/s]\n", + "236it [00:08, 28.00it/s]\n", + "7538it [00:01, 7187.47it/s]\n", + "1it [00:00, 8.33it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "at epoch 4\n", + "train info: logloss loss:1.3173029956786892\n", + "eval info: group_auc:0.6369, mean_mrr:0.2913, ndcg@10:0.3851, ndcg@5:0.3225\n", + "at epoch 4 , train time: 138.5 eval time: 18.5\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "1086it [02:18, 7.84it/s]\n", + "586it [00:01, 416.18it/s]\n", + "236it [00:08, 28.36it/s]\n", + "7538it [00:01, 7087.70it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "at epoch 5\n", + "train info: logloss loss:1.2810899292017655\n", + "eval info: group_auc:0.6462, mean_mrr:0.3031, ndcg@10:0.3983, ndcg@5:0.3349\n", + "at epoch 5 , train time: 138.5 eval time: 18.4\n", + "CPU times: user 25min 40s, sys: 2min 21s, total: 28min 2s\n", + "Wall time: 13min 10s\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%time\n", + "model.fit(train_news_file, train_behaviors_file, valid_news_file, valid_behaviors_file)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "586it [00:01, 440.26it/s]\n", + "236it [00:08, 28.51it/s]\n", + "7538it [00:00, 9166.73it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'group_auc': 0.6462, 'mean_mrr': 0.3031, 'ndcg@5': 0.3349, 'ndcg@10': 0.3983}\n", + "CPU times: user 37.1 s, sys: 2.69 s, total: 39.8 s\n", + "Wall time: 18.1 s\n" + ] + } + ], + "source": [ + "%%time\n", + "res_syn = model.run_eval(valid_news_file, valid_behaviors_file)\n", + "print(res_syn)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sb.glue(\"res_syn\", res_syn)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Save the model" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "model_path = os.path.join(data_path, \"model\")\n", + "os.makedirs(model_path, exist_ok=True)\n", + "\n", + "model.model.save_weights(os.path.join(model_path, \"lstur_ckpt\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Output Prediction File\n", + "This code segment is used to generate the prediction.zip file, which is in the same format in [MIND Competition Submission 
Tutorial](https://competitions.codalab.org/competitions/24122#learn_the_details-submission-guidelines).\n", + "\n", + "Please change the `MIND_type` parameter to `large` if you want to submit your prediction to [MIND Competition](https://msnews.github.io/competition.html)." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "586it [00:01, 438.04it/s]\n", + "236it [00:08, 28.26it/s]\n", + "7538it [00:00, 8876.72it/s]\n" + ] + } + ], + "source": [ + "group_impr_indexes, group_labels, group_preds = model.run_fast_eval(valid_news_file, valid_behaviors_file)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "7538it [00:00, 44730.54it/s]\n" + ] + } + ], + "source": [ + "with open(os.path.join(data_path, 'prediction.txt'), 'w') as f:\n", + " for impr_index, preds in tqdm(zip(group_impr_indexes, group_preds)):\n", + " impr_index += 1\n", + " pred_rank = (np.argsort(np.argsort(preds)[::-1]) + 1).tolist()\n", + " pred_rank = '[' + ','.join([str(i) for i in pred_rank]) + ']'\n", + " f.write(' '.join([str(impr_index), pred_rank])+ '\\n')" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "f = zipfile.ZipFile(os.path.join(data_path, 'prediction.zip'), 'w', zipfile.ZIP_DEFLATED)\n", + "f.write(os.path.join(data_path, 'prediction.txt'), arcname='prediction.txt')\n", + "f.close()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Reference\n", + "\\[1\\] Mingxiao An, Fangzhao Wu, Chuhan Wu, Kun Zhang, Zheng Liu and Xing Xie: Neural News Recommendation with Long- and Short-term User Representations, ACL 2019
\n", + "\\[2\\] Wu, Fangzhao, et al. \"MIND: A Large-scale Dataset for News Recommendation\" Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics. https://msnews.github.io/competition.html
\n", + "\\[3\\] GloVe: Global Vectors for Word Representation. https://nlp.stanford.edu/projects/glove/" + ] + } + ], + "metadata": { + "celltoolbar": "Tags", + "interpreter": { + "hash": "3a9a0c422ff9f08d62211b9648017c63b0a26d2c935edc37ebb8453675d13bb5" + }, + "kernelspec": { + "display_name": "Python 3.7.11 64-bit ('tf2': conda)", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.11" + } }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "System version: 3.7.11 (default, Jul 27 2021, 14:32:16) \n", - "[GCC 7.5.0]\n", - "Tensorflow version: 2.6.1\n" - ] - } - ], - "source": [ - "import sys\n", - "import os\n", - "import numpy as np\n", - "import zipfile\n", - "from tqdm import tqdm\n", - "import scrapbook as sb\n", - "from tempfile import TemporaryDirectory\n", - "import tensorflow as tf\n", - "tf.get_logger().setLevel('ERROR') # only show error messages\n", - "\n", - "from recommenders.models.deeprec.deeprec_utils import download_deeprec_resources \n", - "from recommenders.models.newsrec.newsrec_utils import prepare_hparams\n", - "from recommenders.models.newsrec.models.lstur import LSTURModel\n", - "from recommenders.models.newsrec.io.mind_iterator import MINDIterator\n", - "from recommenders.models.newsrec.newsrec_utils import get_mind_data_set\n", - "\n", - "print(\"System version: {}\".format(sys.version))\n", - "print(\"Tensorflow version: {}\".format(tf.__version__))\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Prepare Parameters" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "tags": [ - "parameters" - ] - }, - "outputs": [], - "source": [ - "epochs = 5\n", - "seed = 40\n", - "batch_size = 32\n", - "\n", - "# Options: demo, small, large\n", - "MIND_type = 'demo'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Download and load data" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 17.0k/17.0k [00:01<00:00, 9.67kKB/s]\n", - "100%|██████████| 9.84k/9.84k [00:01<00:00, 8.34kKB/s]\n", - "100%|██████████| 95.0k/95.0k [00:08<00:00, 11.4kKB/s]\n" - ] - } - ], - "source": [ - "tmpdir = TemporaryDirectory()\n", - "data_path = tmpdir.name\n", - "\n", - "train_news_file = os.path.join(data_path, 'train', r'news.tsv')\n", - "train_behaviors_file = os.path.join(data_path, 'train', r'behaviors.tsv')\n", - "valid_news_file = os.path.join(data_path, 'valid', r'news.tsv')\n", - "valid_behaviors_file = os.path.join(data_path, 'valid', r'behaviors.tsv')\n", - "wordEmb_file = os.path.join(data_path, \"utils\", \"embedding.npy\")\n", - "userDict_file = os.path.join(data_path, \"utils\", \"uid2index.pkl\")\n", - "wordDict_file = os.path.join(data_path, \"utils\", \"word_dict.pkl\")\n", - "yaml_file = os.path.join(data_path, \"utils\", r'lstur.yaml')\n", - "\n", - "mind_url, mind_train_dataset, mind_dev_dataset, mind_utils = get_mind_data_set(MIND_type)\n", - "\n", - "if not os.path.exists(train_news_file):\n", - " download_deeprec_resources(mind_url, os.path.join(data_path, 'train'), mind_train_dataset)\n", - " \n", - "if not os.path.exists(valid_news_file):\n", - " download_deeprec_resources(mind_url, \\\n", - " os.path.join(data_path, 'valid'), 
mind_dev_dataset)\n", - "if not os.path.exists(yaml_file):\n", - " download_deeprec_resources(r'https://recodatasets.z20.web.core.windows.net/newsrec/', \\\n", - " os.path.join(data_path, 'utils'), mind_utils)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create hyper-parameters" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "data_format=news,iterator_type=None,support_quick_scoring=True,wordEmb_file=/tmp/tmpcpgw9veg/utils/embedding.npy,wordDict_file=/tmp/tmpcpgw9veg/utils/word_dict.pkl,userDict_file=/tmp/tmpcpgw9veg/utils/uid2index.pkl,vertDict_file=None,subvertDict_file=None,title_size=30,body_size=None,word_emb_dim=300,word_size=None,user_num=None,vert_num=None,subvert_num=None,his_size=50,npratio=4,dropout=0.2,attention_hidden_dim=200,head_num=4,head_dim=100,cnn_activation=relu,dense_activation=None,filter_num=400,window_size=3,vert_emb_dim=100,subvert_emb_dim=100,gru_unit=400,type=ini,user_emb_dim=50,learning_rate=0.0001,loss=cross_entropy_loss,optimizer=adam,epochs=5,batch_size=32,show_step=100000,metrics=['group_auc', 'mean_mrr', 'ndcg@5;10']\n" - ] - } - ], - "source": [ - "hparams = prepare_hparams(yaml_file, \n", - " wordEmb_file=wordEmb_file,\n", - " wordDict_file=wordDict_file, \n", - " userDict_file=userDict_file,\n", - " batch_size=batch_size,\n", - " epochs=epochs)\n", - "print(hparams)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "iterator = MINDIterator" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Train the LSTUR model" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Tensor(\"conv1d/Relu:0\", shape=(?, 30, 400), dtype=float32)\n", - "Tensor(\"att_layer2/Sum_1:0\", shape=(?, 400), dtype=float32)\n" - ] - } - ], - "source": [ - "model = LSTURModel(hparams, iterator, seed=seed)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "586it [00:03, 155.76it/s]\n", - "236it [00:09, 26.08it/s]\n", - "7538it [00:00, 7590.51it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'group_auc': 0.5201, 'mean_mrr': 0.2214, 'ndcg@5': 0.2292, 'ndcg@10': 0.2912}\n" - ] - } - ], - "source": [ - "print(model.run_eval(valid_news_file, valid_behaviors_file))" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "1086it [02:23, 7.55it/s]\n", - "586it [00:01, 430.29it/s]\n", - "236it [00:08, 28.16it/s]\n", - "7538it [00:01, 6738.86it/s]\n", - "1it [00:00, 7.26it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "at epoch 1\n", - "train info: logloss loss:1.4868141242592814\n", - "eval info: group_auc:0.5973, mean_mrr:0.2622, ndcg@10:0.3501, ndcg@5:0.2861\n", - "at epoch 1 , train time: 143.8 eval time: 18.5\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "1086it [02:18, 7.85it/s]\n", - "586it [00:01, 455.05it/s]\n", - "236it [00:08, 28.32it/s]\n", - "7538it [00:01, 6669.92it/s]\n", - "1it [00:00, 8.64it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "at epoch 2\n", - "train info: logloss loss:1.3999453176011916\n", - "eval 
info: group_auc:0.6219, mean_mrr:0.2803, ndcg@10:0.3726, ndcg@5:0.3099\n", - "at epoch 2 , train time: 138.3 eval time: 19.2\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "1086it [02:18, 7.83it/s]\n", - "586it [00:01, 448.54it/s]\n", - "236it [00:08, 28.40it/s]\n", - "7538it [00:00, 8089.03it/s]\n", - "1it [00:00, 8.04it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "at epoch 3\n", - "train info: logloss loss:1.3563778104044455\n", - "eval info: group_auc:0.6281, mean_mrr:0.285, ndcg@10:0.3785, ndcg@5:0.3159\n", - "at epoch 3 , train time: 138.7 eval time: 18.2\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "1086it [02:18, 7.84it/s]\n", - "586it [00:01, 431.78it/s]\n", - "236it [00:08, 28.00it/s]\n", - "7538it [00:01, 7187.47it/s]\n", - "1it [00:00, 8.33it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "at epoch 4\n", - "train info: logloss loss:1.3173029956786892\n", - "eval info: group_auc:0.6369, mean_mrr:0.2913, ndcg@10:0.3851, ndcg@5:0.3225\n", - "at epoch 4 , train time: 138.5 eval time: 18.5\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "1086it [02:18, 7.84it/s]\n", - "586it [00:01, 416.18it/s]\n", - "236it [00:08, 28.36it/s]\n", - "7538it [00:01, 7087.70it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "at epoch 5\n", - "train info: logloss loss:1.2810899292017655\n", - "eval info: group_auc:0.6462, mean_mrr:0.3031, ndcg@10:0.3983, ndcg@5:0.3349\n", - "at epoch 5 , train time: 138.5 eval time: 18.4\n", - "CPU times: user 25min 40s, sys: 2min 21s, total: 28min 2s\n", - "Wall time: 13min 10s\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "%%time\n", - "model.fit(train_news_file, train_behaviors_file, valid_news_file, valid_behaviors_file)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "586it [00:01, 440.26it/s]\n", - "236it [00:08, 28.51it/s]\n", - "7538it [00:00, 9166.73it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'group_auc': 0.6462, 'mean_mrr': 0.3031, 'ndcg@5': 0.3349, 'ndcg@10': 0.3983}\n", - "CPU times: user 37.1 s, sys: 2.69 s, total: 39.8 s\n", - "Wall time: 18.1 s\n" - ] - } - ], - "source": [ - "%%time\n", - "res_syn = model.run_eval(valid_news_file, valid_behaviors_file)\n", - "print(res_syn)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sb.glue(\"res_syn\", res_syn)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Save the model" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "model_path = os.path.join(data_path, \"model\")\n", - "os.makedirs(model_path, exist_ok=True)\n", - "\n", - "model.model.save_weights(os.path.join(model_path, \"lstur_ckpt\"))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Output Prediction File\n", - "This code segment is used to generate the prediction.zip file, which is in the same format in [MIND Competition Submission Tutorial](https://competitions.codalab.org/competitions/24122#learn_the_details-submission-guidelines).\n", - "\n", - "Please change the `MIND_type` parameter to `large` if you want to submit 
your prediction to [MIND Competition](https://msnews.github.io/competition.html)." - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "586it [00:01, 438.04it/s]\n", - "236it [00:08, 28.26it/s]\n", - "7538it [00:00, 8876.72it/s]\n" - ] - } - ], - "source": [ - "group_impr_indexes, group_labels, group_preds = model.run_fast_eval(valid_news_file, valid_behaviors_file)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "7538it [00:00, 44730.54it/s]\n" - ] - } - ], - "source": [ - "with open(os.path.join(data_path, 'prediction.txt'), 'w') as f:\n", - " for impr_index, preds in tqdm(zip(group_impr_indexes, group_preds)):\n", - " impr_index += 1\n", - " pred_rank = (np.argsort(np.argsort(preds)[::-1]) + 1).tolist()\n", - " pred_rank = '[' + ','.join([str(i) for i in pred_rank]) + ']'\n", - " f.write(' '.join([str(impr_index), pred_rank])+ '\\n')" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "f = zipfile.ZipFile(os.path.join(data_path, 'prediction.zip'), 'w', zipfile.ZIP_DEFLATED)\n", - "f.write(os.path.join(data_path, 'prediction.txt'), arcname='prediction.txt')\n", - "f.close()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Reference\n", - "\\[1\\] Mingxiao An, Fangzhao Wu, Chuhan Wu, Kun Zhang, Zheng Liu and Xing Xie: Neural News Recommendation with Long- and Short-term User Representations, ACL 2019
\n", - "\\[2\\] Wu, Fangzhao, et al. \"MIND: A Large-scale Dataset for News Recommendation\" Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics. https://msnews.github.io/competition.html
\n", - "\\[3\\] GloVe: Global Vectors for Word Representation. https://nlp.stanford.edu/projects/glove/" - ] - } - ], - "metadata": { - "celltoolbar": "Tags", - "interpreter": { - "hash": "3a9a0c422ff9f08d62211b9648017c63b0a26d2c935edc37ebb8453675d13bb5" - }, - "kernelspec": { - "display_name": "Python 3.7.11 64-bit ('tf2': conda)", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.11" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/examples/00_quick_start/naml_MIND.ipynb b/examples/00_quick_start/naml_MIND.ipynb index 0c1ef3a158..741803a683 100644 --- a/examples/00_quick_start/naml_MIND.ipynb +++ b/examples/00_quick_start/naml_MIND.ipynb @@ -1,559 +1,559 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Copyright (c) Microsoft Corporation. All rights reserved.\n", - "\n", - "Licensed under the MIT License." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# NAML: Neural News Recommendation with Attentive Multi-View Learning\n", - "NAML \\[1\\] is a multi-view news recommendation approach. The core of NAML is a news encoder and a user encoder. The newsencoder is composed of a title encoder, a body encoder, a vert encoder and a subvert encoder. The CNN-based title encoder and body encoder learn title and body representations by capturing words semantic information. After getting news title, body, vert and subvert representations, an attention network is used to aggregate those vectors. In the user encoder, we learn representations of users from their browsed news. Besides, we apply additive attention to learn more informative news and user representations by selecting important words and news.\n", - "\n", - "## Properties of NAML:\n", - "- NAML is a multi-view neural news recommendation approach.\n", - "- It uses news title, news body, news vert and news subvert to get news repersentations. And it uses user historical behaviors to learn user representations.\n", - "- NAML uses additive attention to learn informative news and user representations by selecting important words and news.\n", - "- Due to some legal issue, MIND dataset does not release news body. Therefore, we use news abstract instead.\n", - "\n", - "## Data format:\n", - "For quicker training and evaluaiton, we sample MINDdemo dataset of 5k users from [MIND small dataset](https://msnews.github.io/). The MINDdemo dataset has the same file format as MINDsmall and MINDlarge. If you want to try experiments on MINDsmall\n", - " and MINDlarge, please change the dowload source.\n", - " Select the MIND_type parameter from ['large', 'small', 'demo'] to choose dataset.\n", - " \n", - "**MINDdemo_train** is used for training, and **MINDdemo_dev** is used for evaluation. Training data and evaluation data are composed of a news file and a behaviors file. You can find more detailed data description in [MIND repo](https://github.com/msnews/msnews.github.io/blob/master/assets/doc/introduction.md)\n", - "\n", - "### news data\n", - "This file contains news information including newsid, category, subcatgory, news title, news abstarct, news url and entities in news title, entities in news abstarct.\n", - "One simple example:
\n", - "\n", - "`N46466\tlifestyle\tlifestyleroyals\tThe Brands Queen Elizabeth, Prince Charles, and Prince Philip Swear By\tShop the notebooks, jackets, and more that the royals can't live without.\thttps://www.msn.com/en-us/lifestyle/lifestyleroyals/the-brands-queen-elizabeth,-prince-charles,-and-prince-philip-swear-by/ss-AAGH0ET?ocid=chopendata\t[{\"Label\": \"Prince Philip, Duke of Edinburgh\", \"Type\": \"P\", \"WikidataId\": \"Q80976\", \"Confidence\": 1.0, \"OccurrenceOffsets\": [48], \"SurfaceForms\": [\"Prince Philip\"]}, {\"Label\": \"Charles, Prince of Wales\", \"Type\": \"P\", \"WikidataId\": \"Q43274\", \"Confidence\": 1.0, \"OccurrenceOffsets\": [28], \"SurfaceForms\": [\"Prince Charles\"]}, {\"Label\": \"Elizabeth II\", \"Type\": \"P\", \"WikidataId\": \"Q9682\", \"Confidence\": 0.97, \"OccurrenceOffsets\": [11], \"SurfaceForms\": [\"Queen Elizabeth\"]}]\t[]`\n", - "
\n", - "\n", - "In general, each line in data file represents information of one piece of news:
\n", - "\n", - "`[News ID] [Category] [Subcategory] [News Title] [News Abstrct] [News Url] [Entities in News Title] [Entities in News Abstract] ...`\n", - "\n", - "
\n", - "\n", - "We generate a word_dict file to tranform words in news title and news abstract to word indexes, and a embedding matrix is initted from pretrained glove embeddings.\n", - "\n", - "### behaviors data\n", - "One simple example:
\n", - "`1\tU82271\t11/11/2019 3:28:58 PM\tN3130 N11621 N12917 N4574 N12140 N9748\tN13390-0 N7180-0 N20785-0 N6937-0 N15776-0 N25810-0 N20820-0 N6885-0 N27294-0 N18835-0 N16945-0 N7410-0 N23967-0 N22679-0 N20532-0 N26651-0 N22078-0 N4098-0 N16473-0 N13841-0 N15660-0 N25787-0 N2315-0 N1615-0 N9087-0 N23880-0 N3600-0 N24479-0 N22882-0 N26308-0 N13594-0 N2220-0 N28356-0 N17083-0 N21415-0 N18671-0 N9440-0 N17759-0 N10861-0 N21830-0 N8064-0 N5675-0 N15037-0 N26154-0 N15368-1 N481-0 N3256-0 N20663-0 N23940-0 N7654-0 N10729-0 N7090-0 N23596-0 N15901-0 N16348-0 N13645-0 N8124-0 N20094-0 N27774-0 N23011-0 N14832-0 N15971-0 N27729-0 N2167-0 N11186-0 N18390-0 N21328-0 N10992-0 N20122-0 N1958-0 N2004-0 N26156-0 N17632-0 N26146-0 N17322-0 N18403-0 N17397-0 N18215-0 N14475-0 N9781-0 N17958-0 N3370-0 N1127-0 N15525-0 N12657-0 N10537-0 N18224-0`\n", - "
\n", - "\n", - "In general, each line in data file represents one instance of an impression. The format is like:
\n", - "\n", - "`[Impression ID] [User ID] [Impression Time] [User Click History] [Impression News]`\n", - "\n", - "
\n", - "\n", - "User Click History is the user historical clicked news before Impression Time. Impression News is the displayed news in an impression, which format is:
\n", - "\n", - "`[News ID 1]-[label1] ... [News ID n]-[labeln]`\n", - "\n", - "
\n", - "Label represents whether the news is clicked by the user. All information of news in User Click History and Impression News can be found in news data file." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Global settings and imports" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/anaconda/envs/tf2/lib/python3.7/site-packages/papermill/iorw.py:50: FutureWarning: pyarrow.HadoopFileSystem is deprecated as of 2.0.0, please use pyarrow.fs.HadoopFileSystem instead.\n", - " from pyarrow import HadoopFileSystem\n" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Copyright (c) Recommenders contributors.\n", + "\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# NAML: Neural News Recommendation with Attentive Multi-View Learning\n", + "NAML \\[1\\] is a multi-view news recommendation approach. The core of NAML is a news encoder and a user encoder. The newsencoder is composed of a title encoder, a body encoder, a vert encoder and a subvert encoder. The CNN-based title encoder and body encoder learn title and body representations by capturing words semantic information. After getting news title, body, vert and subvert representations, an attention network is used to aggregate those vectors. In the user encoder, we learn representations of users from their browsed news. Besides, we apply additive attention to learn more informative news and user representations by selecting important words and news.\n", + "\n", + "## Properties of NAML:\n", + "- NAML is a multi-view neural news recommendation approach.\n", + "- It uses news title, news body, news vert and news subvert to get news repersentations. And it uses user historical behaviors to learn user representations.\n", + "- NAML uses additive attention to learn informative news and user representations by selecting important words and news.\n", + "- Due to some legal issue, MIND dataset does not release news body. Therefore, we use news abstract instead.\n", + "\n", + "## Data format:\n", + "For quicker training and evaluaiton, we sample MINDdemo dataset of 5k users from [MIND small dataset](https://msnews.github.io/). The MINDdemo dataset has the same file format as MINDsmall and MINDlarge. If you want to try experiments on MINDsmall\n", + " and MINDlarge, please change the dowload source.\n", + " Select the MIND_type parameter from ['large', 'small', 'demo'] to choose dataset.\n", + " \n", + "**MINDdemo_train** is used for training, and **MINDdemo_dev** is used for evaluation. Training data and evaluation data are composed of a news file and a behaviors file. You can find more detailed data description in [MIND repo](https://github.com/msnews/msnews.github.io/blob/master/assets/doc/introduction.md)\n", + "\n", + "### news data\n", + "This file contains news information including newsid, category, subcatgory, news title, news abstarct, news url and entities in news title, entities in news abstarct.\n", + "One simple example:
\n", + "\n", + "`N46466\tlifestyle\tlifestyleroyals\tThe Brands Queen Elizabeth, Prince Charles, and Prince Philip Swear By\tShop the notebooks, jackets, and more that the royals can't live without.\thttps://www.msn.com/en-us/lifestyle/lifestyleroyals/the-brands-queen-elizabeth,-prince-charles,-and-prince-philip-swear-by/ss-AAGH0ET?ocid=chopendata\t[{\"Label\": \"Prince Philip, Duke of Edinburgh\", \"Type\": \"P\", \"WikidataId\": \"Q80976\", \"Confidence\": 1.0, \"OccurrenceOffsets\": [48], \"SurfaceForms\": [\"Prince Philip\"]}, {\"Label\": \"Charles, Prince of Wales\", \"Type\": \"P\", \"WikidataId\": \"Q43274\", \"Confidence\": 1.0, \"OccurrenceOffsets\": [28], \"SurfaceForms\": [\"Prince Charles\"]}, {\"Label\": \"Elizabeth II\", \"Type\": \"P\", \"WikidataId\": \"Q9682\", \"Confidence\": 0.97, \"OccurrenceOffsets\": [11], \"SurfaceForms\": [\"Queen Elizabeth\"]}]\t[]`\n", + "
\n", + "\n", + "In general, each line in the data file represents information about one piece of news:<br>
\n", + "\n", + "`[News ID] [Category] [Subcategory] [News Title] [News Abstract] [News Url] [Entities in News Title] [Entities in News Abstract] ...`\n", + "\n", + "<br>
\n", + "\n", + "We generate a word_dict file to transform words in the news title and news abstract into word indexes, and an embedding matrix is initialized from pretrained GloVe embeddings.\n", + "\n", + "### behaviors data\n", + "One simple example:<br>
\n", + "`1\tU82271\t11/11/2019 3:28:58 PM\tN3130 N11621 N12917 N4574 N12140 N9748\tN13390-0 N7180-0 N20785-0 N6937-0 N15776-0 N25810-0 N20820-0 N6885-0 N27294-0 N18835-0 N16945-0 N7410-0 N23967-0 N22679-0 N20532-0 N26651-0 N22078-0 N4098-0 N16473-0 N13841-0 N15660-0 N25787-0 N2315-0 N1615-0 N9087-0 N23880-0 N3600-0 N24479-0 N22882-0 N26308-0 N13594-0 N2220-0 N28356-0 N17083-0 N21415-0 N18671-0 N9440-0 N17759-0 N10861-0 N21830-0 N8064-0 N5675-0 N15037-0 N26154-0 N15368-1 N481-0 N3256-0 N20663-0 N23940-0 N7654-0 N10729-0 N7090-0 N23596-0 N15901-0 N16348-0 N13645-0 N8124-0 N20094-0 N27774-0 N23011-0 N14832-0 N15971-0 N27729-0 N2167-0 N11186-0 N18390-0 N21328-0 N10992-0 N20122-0 N1958-0 N2004-0 N26156-0 N17632-0 N26146-0 N17322-0 N18403-0 N17397-0 N18215-0 N14475-0 N9781-0 N17958-0 N3370-0 N1127-0 N15525-0 N12657-0 N10537-0 N18224-0`\n", + "
\n", + "\n", + "In general, each line in the data file represents one instance of an impression. The format is as follows:<br>
\n", + "\n", + "`[Impression ID] [User ID] [Impression Time] [User Click History] [Impression News]`\n", + "\n", + "
\n", + "\n", + "User Click History is the user's historically clicked news before Impression Time. Impression News is the news displayed in an impression, whose format is:<br>
\n", + "\n", + "`[News ID 1]-[label1] ... [News ID n]-[labeln]`\n", + "\n", + "
\n", + "Label represents whether the news is clicked by the user. All information of news in User Click History and Impression News can be found in news data file." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Global settings and imports" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/anaconda/envs/tf2/lib/python3.7/site-packages/papermill/iorw.py:50: FutureWarning: pyarrow.HadoopFileSystem is deprecated as of 2.0.0, please use pyarrow.fs.HadoopFileSystem instead.\n", + " from pyarrow import HadoopFileSystem\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "System version: 3.7.11 (default, Jul 27 2021, 14:32:16) \n", + "[GCC 7.5.0]\n", + "Tensorflow version: 2.6.1\n" + ] + } + ], + "source": [ + "import sys\n", + "import os\n", + "import numpy as np\n", + "import zipfile\n", + "from tqdm import tqdm\n", + "import scrapbook as sb\n", + "from tempfile import TemporaryDirectory\n", + "import tensorflow as tf\n", + "tf.get_logger().setLevel('ERROR') # only show error messages\n", + "\n", + "from recommenders.models.deeprec.deeprec_utils import download_deeprec_resources \n", + "from recommenders.models.newsrec.newsrec_utils import prepare_hparams\n", + "from recommenders.models.newsrec.models.naml import NAMLModel\n", + "from recommenders.models.newsrec.io.mind_all_iterator import MINDAllIterator\n", + "from recommenders.models.newsrec.newsrec_utils import get_mind_data_set\n", + "\n", + "print(\"System version: {}\".format(sys.version))\n", + "print(\"Tensorflow version: {}\".format(tf.__version__))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prepare Parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "tags": [ + "parameters" + ] + }, + "outputs": [], + "source": [ + "epochs = 5\n", + "seed = 42\n", + "batch_size = 32\n", + "\n", + "# Options: demo, small, large\n", + "MIND_type = 'demo'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download and load data" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 17.0k/17.0k [00:01<00:00, 10.0kKB/s]\n", + "100%|██████████| 9.84k/9.84k [00:01<00:00, 8.42kKB/s]\n", + "100%|██████████| 95.0k/95.0k [00:06<00:00, 14.5kKB/s]\n" + ] + } + ], + "source": [ + "tmpdir = TemporaryDirectory()\n", + "data_path = tmpdir.name\n", + "\n", + "train_news_file = os.path.join(data_path, 'train', r'news.tsv')\n", + "train_behaviors_file = os.path.join(data_path, 'train', r'behaviors.tsv')\n", + "valid_news_file = os.path.join(data_path, 'valid', r'news.tsv')\n", + "valid_behaviors_file = os.path.join(data_path, 'valid', r'behaviors.tsv')\n", + "wordEmb_file = os.path.join(data_path, \"utils\", \"embedding_all.npy\")\n", + "userDict_file = os.path.join(data_path, \"utils\", \"uid2index.pkl\")\n", + "wordDict_file = os.path.join(data_path, \"utils\", \"word_dict_all.pkl\")\n", + "vertDict_file = os.path.join(data_path, \"utils\", \"vert_dict.pkl\")\n", + "subvertDict_file = os.path.join(data_path, \"utils\", \"subvert_dict.pkl\")\n", + "yaml_file = os.path.join(data_path, \"utils\", r'naml.yaml')\n", + "\n", + "mind_url, mind_train_dataset, mind_dev_dataset, mind_utils = get_mind_data_set(MIND_type)\n", + "\n", + "if not os.path.exists(train_news_file):\n", + " 
download_deeprec_resources(mind_url, os.path.join(data_path, 'train'), mind_train_dataset)\n", + " \n", + "if not os.path.exists(valid_news_file):\n", + " download_deeprec_resources(mind_url, \\\n", + " os.path.join(data_path, 'valid'), mind_dev_dataset)\n", + "if not os.path.exists(yaml_file):\n", + " download_deeprec_resources(r'https://recodatasets.z20.web.core.windows.net/newsrec/', \\\n", + " os.path.join(data_path, 'utils'), mind_utils)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create hyper-parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "data_format=naml,iterator_type=None,support_quick_scoring=True,wordEmb_file=/tmp/tmp1_pb4727/utils/embedding_all.npy,wordDict_file=/tmp/tmp1_pb4727/utils/word_dict_all.pkl,userDict_file=/tmp/tmp1_pb4727/utils/uid2index.pkl,vertDict_file=/tmp/tmp1_pb4727/utils/vert_dict.pkl,subvertDict_file=/tmp/tmp1_pb4727/utils/subvert_dict.pkl,title_size=30,body_size=50,word_emb_dim=300,word_size=None,user_num=None,vert_num=17,subvert_num=249,his_size=50,npratio=4,dropout=0.2,attention_hidden_dim=200,head_num=4,head_dim=100,cnn_activation=relu,dense_activation=relu,filter_num=400,window_size=3,vert_emb_dim=100,subvert_emb_dim=100,gru_unit=400,type=ini,user_emb_dim=50,learning_rate=0.0001,loss=cross_entropy_loss,optimizer=adam,epochs=5,batch_size=32,show_step=100000,metrics=['group_auc', 'mean_mrr', 'ndcg@5;10']\n" + ] + } + ], + "source": [ + "hparams = prepare_hparams(yaml_file, \n", + " wordEmb_file=wordEmb_file,\n", + " wordDict_file=wordDict_file, \n", + " userDict_file=userDict_file,\n", + " vertDict_file=vertDict_file, \n", + " subvertDict_file=subvertDict_file,\n", + " batch_size=batch_size,\n", + " epochs=epochs)\n", + "print(hparams)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "iterator = MINDAllIterator" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Train the NAML model" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "model = NAMLModel(hparams, iterator, seed=seed)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "18693it [01:18, 239.62it/s]\n", + "7507it [00:30, 249.74it/s]\n", + "7538it [00:01, 6423.03it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'group_auc': 0.4807, 'mean_mrr': 0.2104, 'ndcg@5': 0.2141, 'ndcg@10': 0.2766}\n" + ] + } + ], + "source": [ + "print(model.run_eval(valid_news_file, valid_behaviors_file))" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "1085it [01:49, 9.91it/s]\n", + "18693it [01:16, 242.85it/s]\n", + "7507it [00:30, 244.22it/s]\n", + "7538it [00:01, 6540.79it/s]\n", + "2it [00:00, 10.26it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "at epoch 1\n", + "train info: logloss loss:1.4915421532046411\n", + "eval info: group_auc:0.5789, mean_mrr:0.2473, ndcg@10:0.3371, ndcg@5:0.2699\n", + "at epoch 1 , train time: 109.5 eval time: 116.2\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "1085it [01:43, 10.48it/s]\n", + "18693it [01:16, 243.31it/s]\n", + "7507it [00:30, 
247.71it/s]\n", + "7538it [00:01, 6653.75it/s]\n", + "1it [00:00, 9.91it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "at epoch 2\n", + "train info: logloss loss:1.4155106401663222\n", + "eval info: group_auc:0.607, mean_mrr:0.2659, ndcg@10:0.3586, ndcg@5:0.2934\n", + "at epoch 2 , train time: 103.5 eval time: 115.7\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "1085it [01:43, 10.45it/s]\n", + "18693it [01:16, 244.17it/s]\n", + "7507it [00:30, 246.78it/s]\n", + "7538it [00:01, 5570.62it/s]\n", + "1it [00:00, 9.92it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "at epoch 3\n", + "train info: logloss loss:1.3669039504319291\n", + "eval info: group_auc:0.6199, mean_mrr:0.2719, ndcg@10:0.3675, ndcg@5:0.3038\n", + "at epoch 3 , train time: 103.8 eval time: 115.7\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "1085it [01:43, 10.46it/s]\n", + "18693it [01:16, 244.82it/s]\n", + "7507it [00:30, 245.09it/s]\n", + "7538it [00:00, 8483.47it/s]\n", + "1it [00:00, 9.64it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "at epoch 4\n", + "train info: logloss loss:1.3437963971344558\n", + "eval info: group_auc:0.6329, mean_mrr:0.2883, ndcg@10:0.3828, ndcg@5:0.3193\n", + "at epoch 4 , train time: 103.7 eval time: 115.3\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "1085it [01:43, 10.46it/s]\n", + "18693it [01:16, 244.65it/s]\n", + "7507it [00:30, 247.97it/s]\n", + "7538it [00:01, 6144.20it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "at epoch 5\n", + "train info: logloss loss:1.3198621605398468\n", + "eval info: group_auc:0.6272, mean_mrr:0.2846, ndcg@10:0.376, ndcg@5:0.31\n", + "at epoch 5 , train time: 103.8 eval time: 115.3\n", + "CPU times: user 30min 1s, sys: 2min 34s, total: 32min 35s\n", + "Wall time: 18min 22s\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%time\n", + "model.fit(train_news_file, train_behaviors_file,valid_news_file, valid_behaviors_file)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "18693it [01:16, 245.66it/s]\n", + "7507it [00:30, 249.77it/s]\n", + "7538it [00:01, 6922.17it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'group_auc': 0.6272, 'mean_mrr': 0.2846, 'ndcg@5': 0.31, 'ndcg@10': 0.376}\n", + "CPU times: user 4min 2s, sys: 25.8 s, total: 4min 28s\n", + "Wall time: 1min 54s\n" + ] + } + ], + "source": [ + "%%time\n", + "res_syn = model.run_eval(valid_news_file, valid_behaviors_file)\n", + "print(res_syn)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sb.glue(\"res_syn\", res_syn)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Save the model" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "model_path = os.path.join(data_path, \"model\")\n", + "os.makedirs(model_path, exist_ok=True)\n", + "\n", + "model.model.save_weights(os.path.join(model_path, \"naml_ckpt\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Output Prediction File\n", + "This code segment is used to generate the prediction.zip file, which 
is in the same format in [MIND Competition Submission Tutorial](https://competitions.codalab.org/competitions/24122#learn_the_details-submission-guidelines).\n", + "\n", + "Please change the `MIND_type` parameter to `large` if you want to submit your prediction to [MIND Competition](https://msnews.github.io/competition.html)." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "18693it [01:16, 244.49it/s]\n", + "7507it [00:30, 249.89it/s]\n", + "7538it [00:01, 7159.06it/s]\n" + ] + } + ], + "source": [ + "group_impr_indexes, group_labels, group_preds = model.run_fast_eval(valid_news_file, valid_behaviors_file)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "7538it [00:00, 40033.61it/s]\n" + ] + } + ], + "source": [ + "with open(os.path.join(data_path, 'prediction.txt'), 'w') as f:\n", + " for impr_index, preds in tqdm(zip(group_impr_indexes, group_preds)):\n", + " impr_index += 1\n", + " pred_rank = (np.argsort(np.argsort(preds)[::-1]) + 1).tolist()\n", + " pred_rank = '[' + ','.join([str(i) for i in pred_rank]) + ']'\n", + " f.write(' '.join([str(impr_index), pred_rank])+ '\\n')" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "f = zipfile.ZipFile(os.path.join(data_path, 'prediction.zip'), 'w', zipfile.ZIP_DEFLATED)\n", + "f.write(os.path.join(data_path, 'prediction.txt'), arcname='prediction.txt')\n", + "f.close()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Reference\n", + "\\[1\\] Chuhan Wu, Fangzhao Wu, Mingxiao An, Jianqiang Huang, Yongfeng Huang and Xing Xie: Neural News Recommendation with Attentive Multi-View Learning, IJCAI 2019
\n", + "\\[2\\] Wu, Fangzhao, et al. \"MIND: A Large-scale Dataset for News Recommendation\" Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics. https://msnews.github.io/competition.html
\n", + "\\[3\\] GloVe: Global Vectors for Word Representation. https://nlp.stanford.edu/projects/glove/" + ] + } + ], + "metadata": { + "celltoolbar": "Tags", + "interpreter": { + "hash": "3a9a0c422ff9f08d62211b9648017c63b0a26d2c935edc37ebb8453675d13bb5" + }, + "kernelspec": { + "display_name": "Python 3.7.11 64-bit ('tf2': conda)", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.11" + } }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "System version: 3.7.11 (default, Jul 27 2021, 14:32:16) \n", - "[GCC 7.5.0]\n", - "Tensorflow version: 2.6.1\n" - ] - } - ], - "source": [ - "import sys\n", - "import os\n", - "import numpy as np\n", - "import zipfile\n", - "from tqdm import tqdm\n", - "import scrapbook as sb\n", - "from tempfile import TemporaryDirectory\n", - "import tensorflow as tf\n", - "tf.get_logger().setLevel('ERROR') # only show error messages\n", - "\n", - "from recommenders.models.deeprec.deeprec_utils import download_deeprec_resources \n", - "from recommenders.models.newsrec.newsrec_utils import prepare_hparams\n", - "from recommenders.models.newsrec.models.naml import NAMLModel\n", - "from recommenders.models.newsrec.io.mind_all_iterator import MINDAllIterator\n", - "from recommenders.models.newsrec.newsrec_utils import get_mind_data_set\n", - "\n", - "print(\"System version: {}\".format(sys.version))\n", - "print(\"Tensorflow version: {}\".format(tf.__version__))\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Prepare Parameters" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "tags": [ - "parameters" - ] - }, - "outputs": [], - "source": [ - "epochs = 5\n", - "seed = 42\n", - "batch_size = 32\n", - "\n", - "# Options: demo, small, large\n", - "MIND_type = 'demo'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Download and load data" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 17.0k/17.0k [00:01<00:00, 10.0kKB/s]\n", - "100%|██████████| 9.84k/9.84k [00:01<00:00, 8.42kKB/s]\n", - "100%|██████████| 95.0k/95.0k [00:06<00:00, 14.5kKB/s]\n" - ] - } - ], - "source": [ - "tmpdir = TemporaryDirectory()\n", - "data_path = tmpdir.name\n", - "\n", - "train_news_file = os.path.join(data_path, 'train', r'news.tsv')\n", - "train_behaviors_file = os.path.join(data_path, 'train', r'behaviors.tsv')\n", - "valid_news_file = os.path.join(data_path, 'valid', r'news.tsv')\n", - "valid_behaviors_file = os.path.join(data_path, 'valid', r'behaviors.tsv')\n", - "wordEmb_file = os.path.join(data_path, \"utils\", \"embedding_all.npy\")\n", - "userDict_file = os.path.join(data_path, \"utils\", \"uid2index.pkl\")\n", - "wordDict_file = os.path.join(data_path, \"utils\", \"word_dict_all.pkl\")\n", - "vertDict_file = os.path.join(data_path, \"utils\", \"vert_dict.pkl\")\n", - "subvertDict_file = os.path.join(data_path, \"utils\", \"subvert_dict.pkl\")\n", - "yaml_file = os.path.join(data_path, \"utils\", r'naml.yaml')\n", - "\n", - "mind_url, mind_train_dataset, mind_dev_dataset, mind_utils = get_mind_data_set(MIND_type)\n", - "\n", - "if not os.path.exists(train_news_file):\n", - " download_deeprec_resources(mind_url, os.path.join(data_path, 
'train'), mind_train_dataset)\n", - " \n", - "if not os.path.exists(valid_news_file):\n", - " download_deeprec_resources(mind_url, \\\n", - " os.path.join(data_path, 'valid'), mind_dev_dataset)\n", - "if not os.path.exists(yaml_file):\n", - " download_deeprec_resources(r'https://recodatasets.z20.web.core.windows.net/newsrec/', \\\n", - " os.path.join(data_path, 'utils'), mind_utils)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create hyper-parameters" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "data_format=naml,iterator_type=None,support_quick_scoring=True,wordEmb_file=/tmp/tmp1_pb4727/utils/embedding_all.npy,wordDict_file=/tmp/tmp1_pb4727/utils/word_dict_all.pkl,userDict_file=/tmp/tmp1_pb4727/utils/uid2index.pkl,vertDict_file=/tmp/tmp1_pb4727/utils/vert_dict.pkl,subvertDict_file=/tmp/tmp1_pb4727/utils/subvert_dict.pkl,title_size=30,body_size=50,word_emb_dim=300,word_size=None,user_num=None,vert_num=17,subvert_num=249,his_size=50,npratio=4,dropout=0.2,attention_hidden_dim=200,head_num=4,head_dim=100,cnn_activation=relu,dense_activation=relu,filter_num=400,window_size=3,vert_emb_dim=100,subvert_emb_dim=100,gru_unit=400,type=ini,user_emb_dim=50,learning_rate=0.0001,loss=cross_entropy_loss,optimizer=adam,epochs=5,batch_size=32,show_step=100000,metrics=['group_auc', 'mean_mrr', 'ndcg@5;10']\n" - ] - } - ], - "source": [ - "hparams = prepare_hparams(yaml_file, \n", - " wordEmb_file=wordEmb_file,\n", - " wordDict_file=wordDict_file, \n", - " userDict_file=userDict_file,\n", - " vertDict_file=vertDict_file, \n", - " subvertDict_file=subvertDict_file,\n", - " batch_size=batch_size,\n", - " epochs=epochs)\n", - "print(hparams)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "iterator = MINDAllIterator" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Train the NAML model" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "model = NAMLModel(hparams, iterator, seed=seed)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "18693it [01:18, 239.62it/s]\n", - "7507it [00:30, 249.74it/s]\n", - "7538it [00:01, 6423.03it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'group_auc': 0.4807, 'mean_mrr': 0.2104, 'ndcg@5': 0.2141, 'ndcg@10': 0.2766}\n" - ] - } - ], - "source": [ - "print(model.run_eval(valid_news_file, valid_behaviors_file))" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "1085it [01:49, 9.91it/s]\n", - "18693it [01:16, 242.85it/s]\n", - "7507it [00:30, 244.22it/s]\n", - "7538it [00:01, 6540.79it/s]\n", - "2it [00:00, 10.26it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "at epoch 1\n", - "train info: logloss loss:1.4915421532046411\n", - "eval info: group_auc:0.5789, mean_mrr:0.2473, ndcg@10:0.3371, ndcg@5:0.2699\n", - "at epoch 1 , train time: 109.5 eval time: 116.2\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "1085it [01:43, 10.48it/s]\n", - "18693it [01:16, 243.31it/s]\n", - "7507it [00:30, 247.71it/s]\n", - "7538it [00:01, 6653.75it/s]\n", - "1it [00:00, 
9.91it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "at epoch 2\n", - "train info: logloss loss:1.4155106401663222\n", - "eval info: group_auc:0.607, mean_mrr:0.2659, ndcg@10:0.3586, ndcg@5:0.2934\n", - "at epoch 2 , train time: 103.5 eval time: 115.7\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "1085it [01:43, 10.45it/s]\n", - "18693it [01:16, 244.17it/s]\n", - "7507it [00:30, 246.78it/s]\n", - "7538it [00:01, 5570.62it/s]\n", - "1it [00:00, 9.92it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "at epoch 3\n", - "train info: logloss loss:1.3669039504319291\n", - "eval info: group_auc:0.6199, mean_mrr:0.2719, ndcg@10:0.3675, ndcg@5:0.3038\n", - "at epoch 3 , train time: 103.8 eval time: 115.7\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "1085it [01:43, 10.46it/s]\n", - "18693it [01:16, 244.82it/s]\n", - "7507it [00:30, 245.09it/s]\n", - "7538it [00:00, 8483.47it/s]\n", - "1it [00:00, 9.64it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "at epoch 4\n", - "train info: logloss loss:1.3437963971344558\n", - "eval info: group_auc:0.6329, mean_mrr:0.2883, ndcg@10:0.3828, ndcg@5:0.3193\n", - "at epoch 4 , train time: 103.7 eval time: 115.3\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "1085it [01:43, 10.46it/s]\n", - "18693it [01:16, 244.65it/s]\n", - "7507it [00:30, 247.97it/s]\n", - "7538it [00:01, 6144.20it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "at epoch 5\n", - "train info: logloss loss:1.3198621605398468\n", - "eval info: group_auc:0.6272, mean_mrr:0.2846, ndcg@10:0.376, ndcg@5:0.31\n", - "at epoch 5 , train time: 103.8 eval time: 115.3\n", - "CPU times: user 30min 1s, sys: 2min 34s, total: 32min 35s\n", - "Wall time: 18min 22s\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "%%time\n", - "model.fit(train_news_file, train_behaviors_file,valid_news_file, valid_behaviors_file)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "18693it [01:16, 245.66it/s]\n", - "7507it [00:30, 249.77it/s]\n", - "7538it [00:01, 6922.17it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'group_auc': 0.6272, 'mean_mrr': 0.2846, 'ndcg@5': 0.31, 'ndcg@10': 0.376}\n", - "CPU times: user 4min 2s, sys: 25.8 s, total: 4min 28s\n", - "Wall time: 1min 54s\n" - ] - } - ], - "source": [ - "%%time\n", - "res_syn = model.run_eval(valid_news_file, valid_behaviors_file)\n", - "print(res_syn)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sb.glue(\"res_syn\", res_syn)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Save the model" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "model_path = os.path.join(data_path, \"model\")\n", - "os.makedirs(model_path, exist_ok=True)\n", - "\n", - "model.model.save_weights(os.path.join(model_path, \"naml_ckpt\"))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Output Prediction File\n", - "This code segment is used to generate the prediction.zip file, which is in the same format in [MIND Competition Submission 
Tutorial](https://competitions.codalab.org/competitions/24122#learn_the_details-submission-guidelines).\n", - "\n", - "Please change the `MIND_type` parameter to `large` if you want to submit your prediction to [MIND Competition](https://msnews.github.io/competition.html)." - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "18693it [01:16, 244.49it/s]\n", - "7507it [00:30, 249.89it/s]\n", - "7538it [00:01, 7159.06it/s]\n" - ] - } - ], - "source": [ - "group_impr_indexes, group_labels, group_preds = model.run_fast_eval(valid_news_file, valid_behaviors_file)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "7538it [00:00, 40033.61it/s]\n" - ] - } - ], - "source": [ - "with open(os.path.join(data_path, 'prediction.txt'), 'w') as f:\n", - " for impr_index, preds in tqdm(zip(group_impr_indexes, group_preds)):\n", - " impr_index += 1\n", - " pred_rank = (np.argsort(np.argsort(preds)[::-1]) + 1).tolist()\n", - " pred_rank = '[' + ','.join([str(i) for i in pred_rank]) + ']'\n", - " f.write(' '.join([str(impr_index), pred_rank])+ '\\n')" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "f = zipfile.ZipFile(os.path.join(data_path, 'prediction.zip'), 'w', zipfile.ZIP_DEFLATED)\n", - "f.write(os.path.join(data_path, 'prediction.txt'), arcname='prediction.txt')\n", - "f.close()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Reference\n", - "\\[1\\] Chuhan Wu, Fangzhao Wu, Mingxiao An, Jianqiang Huang, Yongfeng Huang and Xing Xie: Neural News Recommendation with Attentive Multi-View Learning, IJCAI 2019
\n", - "\\[2\\] Wu, Fangzhao, et al. \"MIND: A Large-scale Dataset for News Recommendation\" Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics. https://msnews.github.io/competition.html
\n", - "\\[3\\] GloVe: Global Vectors for Word Representation. https://nlp.stanford.edu/projects/glove/" - ] - } - ], - "metadata": { - "celltoolbar": "Tags", - "interpreter": { - "hash": "3a9a0c422ff9f08d62211b9648017c63b0a26d2c935edc37ebb8453675d13bb5" - }, - "kernelspec": { - "display_name": "Python 3.7.11 64-bit ('tf2': conda)", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.11" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/examples/00_quick_start/ncf_movielens.ipynb b/examples/00_quick_start/ncf_movielens.ipynb index bcaf20ea14..4d669e92bd 100644 --- a/examples/00_quick_start/ncf_movielens.ipynb +++ b/examples/00_quick_start/ncf_movielens.ipynb @@ -1,395 +1,395 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Copyright (c) Microsoft Corporation. All rights reserved.\n", - "\n", - "Licensed under the MIT License." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Neural Collaborative Filtering on MovieLens dataset.\n", - "\n", - "Neural Collaborative Filtering (NCF) is a well known recommendation algorithm that generalizes the matrix factorization problem with multi-layer perceptron. \n", - "\n", - "This notebook provides an example of how to utilize and evaluate NCF implementation in the `recommenders`. We use a smaller dataset in this example to run NCF efficiently with GPU acceleration on a [Data Science Virtual Machine](https://azure.microsoft.com/en-gb/services/virtual-machines/data-science-virtual-machines/)." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "System version: 3.7.5 (default, Dec 9 2021, 17:04:37) \n", - "[GCC 8.4.0]\n", - "Pandas version: 1.3.5\n", - "Tensorflow version: 2.7.0\n" - ] - } - ], - "source": [ - "import sys\n", - "import pandas as pd\n", - "import tensorflow as tf\n", - "tf.get_logger().setLevel('ERROR') # only show error messages\n", - "\n", - "from recommenders.utils.timer import Timer\n", - "from recommenders.models.ncf.ncf_singlenode import NCF\n", - "from recommenders.models.ncf.dataset import Dataset as NCFDataset\n", - "from recommenders.datasets import movielens\n", - "from recommenders.utils.notebook_utils import is_jupyter\n", - "from recommenders.datasets.python_splitters import python_chrono_split\n", - "from recommenders.evaluation.python_evaluation import (rmse, mae, rsquared, exp_var, map_at_k, ndcg_at_k, precision_at_k, \n", - " recall_at_k, get_top_k_items)\n", - "\n", - "print(\"System version: {}\".format(sys.version))\n", - "print(\"Pandas version: {}\".format(pd.__version__))\n", - "print(\"Tensorflow version: {}\".format(tf.__version__))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Set the default parameters." 
- ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "tags": [ - "parameters" - ] - }, - "outputs": [], - "source": [ - "# top k items to recommend\n", - "TOP_K = 10\n", - "\n", - "# Select MovieLens data size: 100k, 1m, 10m, or 20m\n", - "MOVIELENS_DATA_SIZE = '100k'\n", - "\n", - "# Model parameters\n", - "EPOCHS = 50\n", - "BATCH_SIZE = 256\n", - "\n", - "SEED = 42" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 1. Download the MovieLens dataset" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:recommenders.datasets.download_utils:Downloading https://files.grouplens.org/datasets/movielens/ml-100k.zip\n", - "100%|██████████| 4.81k/4.81k [00:00<00:00, 16.9kKB/s]\n" - ] - } - ], - "source": [ - "df = movielens.load_pandas_df(\n", - " size=MOVIELENS_DATA_SIZE,\n", - " header=[\"userID\", \"itemID\", \"rating\", \"timestamp\"]\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2. Split the data using the Spark chronological splitter provided in utilities" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "train, test = python_chrono_split(df, 0.75)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Filter out any users or items in the test set that do not appear in the training set." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "test = test[test[\"userID\"].isin(train[\"userID\"].unique())]\n", - "test = test[test[\"itemID\"].isin(train[\"itemID\"].unique())]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Write datasets to csv files." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "train_file = \"./train.csv\"\n", - "test_file = \"./test.csv\"\n", - "train.to_csv(train_file, index=False)\n", - "test.to_csv(test_file, index=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Generate an NCF dataset object from the data subsets." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:recommenders.models.ncf.dataset:Indexing ./train.csv ...\n", - "INFO:recommenders.models.ncf.dataset:Indexing ./test.csv ...\n", - "INFO:recommenders.models.ncf.dataset:Creating full leave-one-out test file ./test_full.csv ...\n", - "100%|██████████| 943/943 [00:28<00:00, 33.06it/s]\n", - "INFO:recommenders.models.ncf.dataset:Indexing ./test_full.csv ...\n" - ] - } - ], - "source": [ - "data = NCFDataset(train_file=train_file, test_file=test_file, seed=SEED)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 3. Train the NCF model on the training data, and get the top-k recommendations for our testing data\n", - "\n", - "NCF accepts implicit feedback and generates prospensity of items to be recommended to users in the scale of 0 to 1. A recommended item list can then be generated based on the scores. Note that this quickstart notebook is using a smaller number of epochs to reduce time for training. As a consequence, the model performance will be slighlty deteriorated. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "model = NCF (\n", - " n_users=data.n_users, \n", - " n_items=data.n_items,\n", - " model_type=\"NeuMF\",\n", - " n_factors=4,\n", - " layer_sizes=[16,8,4],\n", - " n_epochs=EPOCHS,\n", - " batch_size=BATCH_SIZE,\n", - " learning_rate=1e-3,\n", - " verbose=10,\n", - " seed=SEED\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:recommenders.models.ncf.ncf_singlenode:Epoch 10 [6.31s]: train_loss = 0.259318 \n", - "INFO:recommenders.models.ncf.ncf_singlenode:Epoch 20 [6.28s]: train_loss = 0.246134 \n", - "INFO:recommenders.models.ncf.ncf_singlenode:Epoch 30 [6.21s]: train_loss = 0.240125 \n", - "INFO:recommenders.models.ncf.ncf_singlenode:Epoch 40 [6.23s]: train_loss = 0.235913 \n", - "INFO:recommenders.models.ncf.ncf_singlenode:Epoch 50 [6.31s]: train_loss = 0.232268 \n" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Copyright (c) Recommenders contributors.\n", + "\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Neural Collaborative Filtering on MovieLens dataset.\n", + "\n", + "Neural Collaborative Filtering (NCF) is a well known recommendation algorithm that generalizes the matrix factorization problem with multi-layer perceptron. \n", + "\n", + "This notebook provides an example of how to utilize and evaluate NCF implementation in the `recommenders`. We use a smaller dataset in this example to run NCF efficiently with GPU acceleration on a [Data Science Virtual Machine](https://azure.microsoft.com/en-gb/services/virtual-machines/data-science-virtual-machines/)." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "System version: 3.7.5 (default, Dec 9 2021, 17:04:37) \n", + "[GCC 8.4.0]\n", + "Pandas version: 1.3.5\n", + "Tensorflow version: 2.7.0\n" + ] + } + ], + "source": [ + "import sys\n", + "import pandas as pd\n", + "import tensorflow as tf\n", + "tf.get_logger().setLevel('ERROR') # only show error messages\n", + "\n", + "from recommenders.utils.timer import Timer\n", + "from recommenders.models.ncf.ncf_singlenode import NCF\n", + "from recommenders.models.ncf.dataset import Dataset as NCFDataset\n", + "from recommenders.datasets import movielens\n", + "from recommenders.utils.notebook_utils import is_jupyter\n", + "from recommenders.datasets.python_splitters import python_chrono_split\n", + "from recommenders.evaluation.python_evaluation import (rmse, mae, rsquared, exp_var, map_at_k, ndcg_at_k, precision_at_k, \n", + " recall_at_k, get_top_k_items)\n", + "\n", + "print(\"System version: {}\".format(sys.version))\n", + "print(\"Pandas version: {}\".format(pd.__version__))\n", + "print(\"Tensorflow version: {}\".format(tf.__version__))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set the default parameters." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "tags": [ + "parameters" + ] + }, + "outputs": [], + "source": [ + "# top k items to recommend\n", + "TOP_K = 10\n", + "\n", + "# Select MovieLens data size: 100k, 1m, 10m, or 20m\n", + "MOVIELENS_DATA_SIZE = '100k'\n", + "\n", + "# Model parameters\n", + "EPOCHS = 50\n", + "BATCH_SIZE = 256\n", + "\n", + "SEED = 42" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. Download the MovieLens dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:recommenders.datasets.download_utils:Downloading https://files.grouplens.org/datasets/movielens/ml-100k.zip\n", + "100%|██████████| 4.81k/4.81k [00:00<00:00, 16.9kKB/s]\n" + ] + } + ], + "source": [ + "df = movielens.load_pandas_df(\n", + " size=MOVIELENS_DATA_SIZE,\n", + " header=[\"userID\", \"itemID\", \"rating\", \"timestamp\"]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. Split the data using the Spark chronological splitter provided in utilities" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "train, test = python_chrono_split(df, 0.75)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Filter out any users or items in the test set that do not appear in the training set." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "test = test[test[\"userID\"].isin(train[\"userID\"].unique())]\n", + "test = test[test[\"itemID\"].isin(train[\"itemID\"].unique())]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Write datasets to csv files." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "train_file = \"./train.csv\"\n", + "test_file = \"./test.csv\"\n", + "train.to_csv(train_file, index=False)\n", + "test.to_csv(test_file, index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Generate an NCF dataset object from the data subsets." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:recommenders.models.ncf.dataset:Indexing ./train.csv ...\n", + "INFO:recommenders.models.ncf.dataset:Indexing ./test.csv ...\n", + "INFO:recommenders.models.ncf.dataset:Creating full leave-one-out test file ./test_full.csv ...\n", + "100%|██████████| 943/943 [00:28<00:00, 33.06it/s]\n", + "INFO:recommenders.models.ncf.dataset:Indexing ./test_full.csv ...\n" + ] + } + ], + "source": [ + "data = NCFDataset(train_file=train_file, test_file=test_file, seed=SEED)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. Train the NCF model on the training data, and get the top-k recommendations for our testing data\n", + "\n", + "NCF accepts implicit feedback and generates prospensity of items to be recommended to users in the scale of 0 to 1. A recommended item list can then be generated based on the scores. Note that this quickstart notebook is using a smaller number of epochs to reduce time for training. As a consequence, the model performance will be slighlty deteriorated. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = NCF (\n", + " n_users=data.n_users, \n", + " n_items=data.n_items,\n", + " model_type=\"NeuMF\",\n", + " n_factors=4,\n", + " layer_sizes=[16,8,4],\n", + " n_epochs=EPOCHS,\n", + " batch_size=BATCH_SIZE,\n", + " learning_rate=1e-3,\n", + " verbose=10,\n", + " seed=SEED\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:recommenders.models.ncf.ncf_singlenode:Epoch 10 [6.31s]: train_loss = 0.259318 \n", + "INFO:recommenders.models.ncf.ncf_singlenode:Epoch 20 [6.28s]: train_loss = 0.246134 \n", + "INFO:recommenders.models.ncf.ncf_singlenode:Epoch 30 [6.21s]: train_loss = 0.240125 \n", + "INFO:recommenders.models.ncf.ncf_singlenode:Epoch 40 [6.23s]: train_loss = 0.235913 \n", + "INFO:recommenders.models.ncf.ncf_singlenode:Epoch 50 [6.31s]: train_loss = 0.232268 \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Took 317.7864 seconds for training.\n" + ] + } + ], + "source": [ + "with Timer() as train_time:\n", + " model.fit(data)\n", + "\n", + "print(\"Took {} seconds for training.\".format(train_time))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the movie recommendation use case scenario, seen movies are not recommended to the users." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Took 2.7835 seconds for prediction.\n" + ] + } + ], + "source": [ + "with Timer() as test_time:\n", + " users, items, preds = [], [], []\n", + " item = list(train.itemID.unique())\n", + " for user in train.userID.unique():\n", + " user = [user] * len(item) \n", + " users.extend(user)\n", + " items.extend(item)\n", + " preds.extend(list(model.predict(user, item, is_list=True)))\n", + "\n", + " all_predictions = pd.DataFrame(data={\"userID\": users, \"itemID\":items, \"prediction\":preds})\n", + "\n", + " merged = pd.merge(train, all_predictions, on=[\"userID\", \"itemID\"], how=\"outer\")\n", + " all_predictions = merged[merged.rating.isnull()].drop('rating', axis=1)\n", + "\n", + "print(\"Took {} seconds for prediction.\".format(test_time))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4. Evaluate how well NCF performs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The ranking metrics are used for evaluation." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MAP:\t0.049650\n", + "NDCG:\t0.200524\n", + "Precision@K:\t0.183033\n", + "Recall@K:\t0.102721\n" + ] + } + ], + "source": [ + "eval_map = map_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)\n", + "eval_ndcg = ndcg_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)\n", + "eval_precision = precision_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)\n", + "eval_recall = recall_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)\n", + "\n", + "print(\"MAP:\\t%f\" % eval_map,\n", + " \"NDCG:\\t%f\" % eval_ndcg,\n", + " \"Precision@K:\\t%f\" % eval_precision,\n", + " \"Recall@K:\\t%f\" % eval_recall, sep='\\n')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if is_jupyter():\n", + " # Record results with papermill for tests\n", + " import papermill as pm\n", + " import scrapbook as sb\n", + " sb.glue(\"map\", eval_map)\n", + " sb.glue(\"ndcg\", eval_ndcg)\n", + " sb.glue(\"precision\", eval_precision)\n", + " sb.glue(\"recall\", eval_recall)\n", + " sb.glue(\"train_time\", train_time.interval)\n", + " sb.glue(\"test_time\", test_time.interval)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "celltoolbar": "Tags", + "kernelspec": { + "display_name": "reco_gpu", + "language": "python", + "name": "conda-env-reco_gpu-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.11" + } }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Took 317.7864 seconds for training.\n" - ] - } - ], - "source": [ - "with Timer() as train_time:\n", - " model.fit(data)\n", - "\n", - "print(\"Took {} seconds for training.\".format(train_time))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In the movie recommendation use case scenario, seen movies are not recommended to the users." - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Took 2.7835 seconds for prediction.\n" - ] - } - ], - "source": [ - "with Timer() as test_time:\n", - " users, items, preds = [], [], []\n", - " item = list(train.itemID.unique())\n", - " for user in train.userID.unique():\n", - " user = [user] * len(item) \n", - " users.extend(user)\n", - " items.extend(item)\n", - " preds.extend(list(model.predict(user, item, is_list=True)))\n", - "\n", - " all_predictions = pd.DataFrame(data={\"userID\": users, \"itemID\":items, \"prediction\":preds})\n", - "\n", - " merged = pd.merge(train, all_predictions, on=[\"userID\", \"itemID\"], how=\"outer\")\n", - " all_predictions = merged[merged.rating.isnull()].drop('rating', axis=1)\n", - "\n", - "print(\"Took {} seconds for prediction.\".format(test_time))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 4. Evaluate how well NCF performs" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The ranking metrics are used for evaluation." 
- ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "MAP:\t0.049650\n", - "NDCG:\t0.200524\n", - "Precision@K:\t0.183033\n", - "Recall@K:\t0.102721\n" - ] - } - ], - "source": [ - "eval_map = map_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)\n", - "eval_ndcg = ndcg_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)\n", - "eval_precision = precision_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)\n", - "eval_recall = recall_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)\n", - "\n", - "print(\"MAP:\\t%f\" % eval_map,\n", - " \"NDCG:\\t%f\" % eval_ndcg,\n", - " \"Precision@K:\\t%f\" % eval_precision,\n", - " \"Recall@K:\\t%f\" % eval_recall, sep='\\n')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if is_jupyter():\n", - " # Record results with papermill for tests\n", - " import papermill as pm\n", - " import scrapbook as sb\n", - " sb.glue(\"map\", eval_map)\n", - " sb.glue(\"ndcg\", eval_ndcg)\n", - " sb.glue(\"precision\", eval_precision)\n", - " sb.glue(\"recall\", eval_recall)\n", - " sb.glue(\"train_time\", train_time.interval)\n", - " sb.glue(\"test_time\", test_time.interval)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "celltoolbar": "Tags", - "kernelspec": { - "display_name": "reco_gpu", - "language": "python", - "name": "conda-env-reco_gpu-py" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.11" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/examples/00_quick_start/npa_MIND.ipynb b/examples/00_quick_start/npa_MIND.ipynb index a7724ed53c..cd68bcdd07 100644 --- a/examples/00_quick_start/npa_MIND.ipynb +++ b/examples/00_quick_start/npa_MIND.ipynb @@ -1,537 +1,537 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Copyright (c) Microsoft Corporation. All rights reserved.\n", - "\n", - "Licensed under the MIT License." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# NPA: Neural News Recommendation with Personalized Attention\n", - "NPA \\[1\\] is a news recommendation model with personalized attention. The core of NPA is a news representation model and a user representation model. In the news representation model we use a CNN network to learn hidden representations of news articles based on their titles. In the user representation model we learn the representations of users based on the representations of their clicked news articles. In addition, a word-level and a news-level personalized attention are used to capture different informativeness for different users.\n", - "\n", - "## Properties of NPA:\n", - "- NPA is a content-based news recommendation method.\n", - "- It uses a CNN network to learn news representation. 
And it learns user representations from their clicked news articles.\n", - "- A word-level personalized attention is used to help NPA attend to important words for different users.\n", - "- A news-level personalized attention is used to help NPA attend to important historical clicked news for different users.\n", - "\n", - "## Data format:\n", - "For quicker training and evaluaiton, we sample MINDdemo dataset of 5k users from [MIND small dataset](https://msnews.github.io/). The MINDdemo dataset has the same file format as MINDsmall and MINDlarge. If you want to try experiments on MINDsmall\n", - " and MINDlarge, please change the dowload source.\n", - " Select the MIND_type parameter from ['large', 'small', 'demo'] to choose dataset.\n", - " \n", - "**MINDdemo_train** is used for training, and **MINDdemo_dev** is used for evaluation. Training data and evaluation data are composed of a news file and a behaviors file. You can find more detailed data description in [MIND repo](https://github.com/msnews/msnews.github.io/blob/master/assets/doc/introduction.md)\n", - "\n", - "### news data\n", - "This file contains news information including newsid, category, subcatgory, news title, news abstarct, news url and entities in news title, entities in news abstarct.\n", - "One simple example:
\n", - "\n", - "`N46466\tlifestyle\tlifestyleroyals\tThe Brands Queen Elizabeth, Prince Charles, and Prince Philip Swear By\tShop the notebooks, jackets, and more that the royals can't live without.\thttps://www.msn.com/en-us/lifestyle/lifestyleroyals/the-brands-queen-elizabeth,-prince-charles,-and-prince-philip-swear-by/ss-AAGH0ET?ocid=chopendata\t[{\"Label\": \"Prince Philip, Duke of Edinburgh\", \"Type\": \"P\", \"WikidataId\": \"Q80976\", \"Confidence\": 1.0, \"OccurrenceOffsets\": [48], \"SurfaceForms\": [\"Prince Philip\"]}, {\"Label\": \"Charles, Prince of Wales\", \"Type\": \"P\", \"WikidataId\": \"Q43274\", \"Confidence\": 1.0, \"OccurrenceOffsets\": [28], \"SurfaceForms\": [\"Prince Charles\"]}, {\"Label\": \"Elizabeth II\", \"Type\": \"P\", \"WikidataId\": \"Q9682\", \"Confidence\": 0.97, \"OccurrenceOffsets\": [11], \"SurfaceForms\": [\"Queen Elizabeth\"]}]\t[]`\n", - "
\n", - "\n", - "In general, each line in data file represents information of one piece of news:
\n", - "\n", - "`[News ID] [Category] [Subcategory] [News Title] [News Abstrct] [News Url] [Entities in News Title] [Entities in News Abstract] ...`\n", - "\n", - "
\n", - "\n", - "We generate a word_dict file to tranform words in news title to word indexes, and a embedding matrix is initted from pretrained glove embeddings.\n", - "\n", - "### behaviors data\n", - "One simple example:
\n", - "`1\tU82271\t11/11/2019 3:28:58 PM\tN3130 N11621 N12917 N4574 N12140 N9748\tN13390-0 N7180-0 N20785-0 N6937-0 N15776-0 N25810-0 N20820-0 N6885-0 N27294-0 N18835-0 N16945-0 N7410-0 N23967-0 N22679-0 N20532-0 N26651-0 N22078-0 N4098-0 N16473-0 N13841-0 N15660-0 N25787-0 N2315-0 N1615-0 N9087-0 N23880-0 N3600-0 N24479-0 N22882-0 N26308-0 N13594-0 N2220-0 N28356-0 N17083-0 N21415-0 N18671-0 N9440-0 N17759-0 N10861-0 N21830-0 N8064-0 N5675-0 N15037-0 N26154-0 N15368-1 N481-0 N3256-0 N20663-0 N23940-0 N7654-0 N10729-0 N7090-0 N23596-0 N15901-0 N16348-0 N13645-0 N8124-0 N20094-0 N27774-0 N23011-0 N14832-0 N15971-0 N27729-0 N2167-0 N11186-0 N18390-0 N21328-0 N10992-0 N20122-0 N1958-0 N2004-0 N26156-0 N17632-0 N26146-0 N17322-0 N18403-0 N17397-0 N18215-0 N14475-0 N9781-0 N17958-0 N3370-0 N1127-0 N15525-0 N12657-0 N10537-0 N18224-0`\n", - "
\n", - "\n", - "In general, each line in data file represents one instance of an impression. The format is like:
\n", - "\n", - "`[Impression ID] [User ID] [Impression Time] [User Click History] [Impression News]`\n", - "\n", - "
\n", - "\n", - "User Click History is the user historical clicked news before Impression Time. Impression News is the displayed news in an impression, which format is:
\n", - "\n", - "`[News ID 1]-[label1] ... [News ID n]-[labeln]`\n", - "\n", - "
\n", - "Label represents whether the news is clicked by the user. All information of news in User Click History and Impression News can be found in news data file." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Global settings and imports" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/anaconda/envs/tf2/lib/python3.7/site-packages/papermill/iorw.py:50: FutureWarning: pyarrow.HadoopFileSystem is deprecated as of 2.0.0, please use pyarrow.fs.HadoopFileSystem instead.\n", - " from pyarrow import HadoopFileSystem\n" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Copyright (c) Recommenders contributors.\n", + "\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# NPA: Neural News Recommendation with Personalized Attention\n", + "NPA \\[1\\] is a news recommendation model with personalized attention. The core of NPA is a news representation model and a user representation model. In the news representation model we use a CNN network to learn hidden representations of news articles based on their titles. In the user representation model we learn the representations of users based on the representations of their clicked news articles. In addition, a word-level and a news-level personalized attention are used to capture different informativeness for different users.\n", + "\n", + "## Properties of NPA:\n", + "- NPA is a content-based news recommendation method.\n", + "- It uses a CNN network to learn news representation. And it learns user representations from their clicked news articles.\n", + "- A word-level personalized attention is used to help NPA attend to important words for different users.\n", + "- A news-level personalized attention is used to help NPA attend to important historical clicked news for different users.\n", + "\n", + "## Data format:\n", + "For quicker training and evaluaiton, we sample MINDdemo dataset of 5k users from [MIND small dataset](https://msnews.github.io/). The MINDdemo dataset has the same file format as MINDsmall and MINDlarge. If you want to try experiments on MINDsmall\n", + " and MINDlarge, please change the dowload source.\n", + " Select the MIND_type parameter from ['large', 'small', 'demo'] to choose dataset.\n", + " \n", + "**MINDdemo_train** is used for training, and **MINDdemo_dev** is used for evaluation. Training data and evaluation data are composed of a news file and a behaviors file. You can find more detailed data description in [MIND repo](https://github.com/msnews/msnews.github.io/blob/master/assets/doc/introduction.md)\n", + "\n", + "### news data\n", + "This file contains news information including newsid, category, subcatgory, news title, news abstarct, news url and entities in news title, entities in news abstarct.\n", + "One simple example:
\n", + "\n", + "`N46466\tlifestyle\tlifestyleroyals\tThe Brands Queen Elizabeth, Prince Charles, and Prince Philip Swear By\tShop the notebooks, jackets, and more that the royals can't live without.\thttps://www.msn.com/en-us/lifestyle/lifestyleroyals/the-brands-queen-elizabeth,-prince-charles,-and-prince-philip-swear-by/ss-AAGH0ET?ocid=chopendata\t[{\"Label\": \"Prince Philip, Duke of Edinburgh\", \"Type\": \"P\", \"WikidataId\": \"Q80976\", \"Confidence\": 1.0, \"OccurrenceOffsets\": [48], \"SurfaceForms\": [\"Prince Philip\"]}, {\"Label\": \"Charles, Prince of Wales\", \"Type\": \"P\", \"WikidataId\": \"Q43274\", \"Confidence\": 1.0, \"OccurrenceOffsets\": [28], \"SurfaceForms\": [\"Prince Charles\"]}, {\"Label\": \"Elizabeth II\", \"Type\": \"P\", \"WikidataId\": \"Q9682\", \"Confidence\": 0.97, \"OccurrenceOffsets\": [11], \"SurfaceForms\": [\"Queen Elizabeth\"]}]\t[]`\n", + "
\n", + "\n", + "In general, each line in data file represents information of one piece of news:
\n", + "\n", + "`[News ID] [Category] [Subcategory] [News Title] [News Abstrct] [News Url] [Entities in News Title] [Entities in News Abstract] ...`\n", + "\n", + "
\n", + "\n", + "We generate a word_dict file to tranform words in news title to word indexes, and a embedding matrix is initted from pretrained glove embeddings.\n", + "\n", + "### behaviors data\n", + "One simple example:
\n", + "`1\tU82271\t11/11/2019 3:28:58 PM\tN3130 N11621 N12917 N4574 N12140 N9748\tN13390-0 N7180-0 N20785-0 N6937-0 N15776-0 N25810-0 N20820-0 N6885-0 N27294-0 N18835-0 N16945-0 N7410-0 N23967-0 N22679-0 N20532-0 N26651-0 N22078-0 N4098-0 N16473-0 N13841-0 N15660-0 N25787-0 N2315-0 N1615-0 N9087-0 N23880-0 N3600-0 N24479-0 N22882-0 N26308-0 N13594-0 N2220-0 N28356-0 N17083-0 N21415-0 N18671-0 N9440-0 N17759-0 N10861-0 N21830-0 N8064-0 N5675-0 N15037-0 N26154-0 N15368-1 N481-0 N3256-0 N20663-0 N23940-0 N7654-0 N10729-0 N7090-0 N23596-0 N15901-0 N16348-0 N13645-0 N8124-0 N20094-0 N27774-0 N23011-0 N14832-0 N15971-0 N27729-0 N2167-0 N11186-0 N18390-0 N21328-0 N10992-0 N20122-0 N1958-0 N2004-0 N26156-0 N17632-0 N26146-0 N17322-0 N18403-0 N17397-0 N18215-0 N14475-0 N9781-0 N17958-0 N3370-0 N1127-0 N15525-0 N12657-0 N10537-0 N18224-0`\n", + "
\n", + "\n", + "In general, each line in data file represents one instance of an impression. The format is like:
\n", + "\n", + "`[Impression ID] [User ID] [Impression Time] [User Click History] [Impression News]`\n", + "\n", + "
\n", + "\n", + "User Click History is the user historical clicked news before Impression Time. Impression News is the displayed news in an impression, which format is:
\n", + "\n", + "`[News ID 1]-[label1] ... [News ID n]-[labeln]`\n", + "\n", + "
\n", + "Label represents whether the news is clicked by the user. All information of news in User Click History and Impression News can be found in news data file." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Global settings and imports" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/anaconda/envs/tf2/lib/python3.7/site-packages/papermill/iorw.py:50: FutureWarning: pyarrow.HadoopFileSystem is deprecated as of 2.0.0, please use pyarrow.fs.HadoopFileSystem instead.\n", + " from pyarrow import HadoopFileSystem\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "System version: 3.7.11 (default, Jul 27 2021, 14:32:16) \n", + "[GCC 7.5.0]\n", + "Tensorflow version: 2.6.1\n" + ] + } + ], + "source": [ + "import sys\n", + "import os\n", + "import numpy as np\n", + "import zipfile\n", + "from tqdm import tqdm\n", + "import scrapbook as sb\n", + "from tempfile import TemporaryDirectory\n", + "import tensorflow as tf\n", + "tf.get_logger().setLevel('ERROR') # only show error messages\n", + "\n", + "from recommenders.models.deeprec.deeprec_utils import download_deeprec_resources \n", + "from recommenders.models.newsrec.newsrec_utils import prepare_hparams\n", + "from recommenders.models.newsrec.models.npa import NPAModel\n", + "from recommenders.models.newsrec.io.mind_iterator import MINDIterator\n", + "from recommenders.models.newsrec.newsrec_utils import get_mind_data_set\n", + "\n", + "print(\"System version: {}\".format(sys.version))\n", + "print(\"Tensorflow version: {}\".format(tf.__version__))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prepare Parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "tags": [ + "parameters" + ] + }, + "outputs": [], + "source": [ + "epochs = 5\n", + "seed = 42\n", + "batch_size = 32\n", + "\n", + "# Options: demo, small, large\n", + "MIND_type = 'demo'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download and load data" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 17.0k/17.0k [00:01<00:00, 9.66kKB/s]\n", + "100%|██████████| 9.84k/9.84k [00:01<00:00, 9.01kKB/s]\n", + "100%|██████████| 95.0k/95.0k [00:09<00:00, 10.0kKB/s]\n" + ] + } + ], + "source": [ + "tmpdir = TemporaryDirectory()\n", + "data_path = tmpdir.name\n", + "\n", + "train_news_file = os.path.join(data_path, 'train', r'news.tsv')\n", + "train_behaviors_file = os.path.join(data_path, 'train', r'behaviors.tsv')\n", + "valid_news_file = os.path.join(data_path, 'valid', r'news.tsv')\n", + "valid_behaviors_file = os.path.join(data_path, 'valid', r'behaviors.tsv')\n", + "wordEmb_file = os.path.join(data_path, \"utils\", \"embedding.npy\")\n", + "userDict_file = os.path.join(data_path, \"utils\", \"uid2index.pkl\")\n", + "wordDict_file = os.path.join(data_path, \"utils\", \"word_dict.pkl\")\n", + "yaml_file = os.path.join(data_path, \"utils\", r'npa.yaml')\n", + "\n", + "mind_url, mind_train_dataset, mind_dev_dataset, mind_utils = get_mind_data_set(MIND_type)\n", + "\n", + "if not os.path.exists(train_news_file):\n", + " download_deeprec_resources(mind_url, os.path.join(data_path, 'train'), mind_train_dataset)\n", + " \n", + "if not os.path.exists(valid_news_file):\n", + " download_deeprec_resources(mind_url, 
\\\n", + " os.path.join(data_path, 'valid'), mind_dev_dataset)\n", + "if not os.path.exists(yaml_file):\n", + " download_deeprec_resources(r'https://recodatasets.z20.web.core.windows.net/newsrec/', \\\n", + " os.path.join(data_path, 'utils'), mind_utils)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create hyper-parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "data_format=news,iterator_type=None,support_quick_scoring=False,wordEmb_file=/tmp/tmpump0ai7m/utils/embedding.npy,wordDict_file=/tmp/tmpump0ai7m/utils/word_dict.pkl,userDict_file=/tmp/tmpump0ai7m/utils/uid2index.pkl,vertDict_file=None,subvertDict_file=None,title_size=10,body_size=None,word_emb_dim=300,word_size=None,user_num=None,vert_num=None,subvert_num=None,his_size=50,npratio=4,dropout=0.2,attention_hidden_dim=200,head_num=4,head_dim=100,cnn_activation=relu,dense_activation=None,filter_num=400,window_size=3,vert_emb_dim=100,subvert_emb_dim=100,gru_unit=400,type=ini,user_emb_dim=100,learning_rate=0.0001,loss=cross_entropy_loss,optimizer=adam,epochs=5,batch_size=32,show_step=100000,metrics=['group_auc', 'mean_mrr', 'ndcg@5;10']\n" + ] + } + ], + "source": [ + "hparams = prepare_hparams(yaml_file, \n", + " wordEmb_file=wordEmb_file,\n", + " wordDict_file=wordDict_file, \n", + " userDict_file=userDict_file,\n", + " batch_size=batch_size,\n", + " epochs=epochs)\n", + "print(hparams)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "iterator = MINDIterator" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Train the NPA model" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "model = NPAModel(hparams, iterator, seed=seed)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "8874it [01:15, 117.03it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'group_auc': 0.5228, 'mean_mrr': 0.2328, 'ndcg@5': 0.2377, 'ndcg@10': 0.303}\n" + ] + } + ], + "source": [ + "print(model.run_eval(valid_news_file, valid_behaviors_file))" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "1086it [00:34, 31.34it/s]\n", + "8874it [01:13, 119.95it/s]\n", + "4it [00:00, 35.00it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "at epoch 1\n", + "train info: logloss loss:1.5035385669485202\n", + "eval info: group_auc:0.5867, mean_mrr:0.2555, ndcg@10:0.3432, ndcg@5:0.2778\n", + "at epoch 1 , train time: 34.7 eval time: 84.3\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "1086it [00:30, 35.30it/s]\n", + "8874it [01:13, 120.34it/s]\n", + "4it [00:00, 35.45it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "at epoch 2\n", + "train info: logloss loss:1.4050482461026579\n", + "eval info: group_auc:0.5996, mean_mrr:0.2706, ndcg@10:0.3589, ndcg@5:0.2967\n", + "at epoch 2 , train time: 30.8 eval time: 83.1\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "1086it [00:31, 34.98it/s]\n", + "8874it [01:13, 119.93it/s]\n", + "4it [00:00, 35.09it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ 
+ "at epoch 3\n", + "train info: logloss loss:1.3529314152333838\n", + "eval info: group_auc:0.5992, mean_mrr:0.275, ndcg@10:0.3646, ndcg@5:0.3005\n", + "at epoch 3 , train time: 31.0 eval time: 83.4\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "1086it [00:30, 35.45it/s]\n", + "8874it [01:14, 119.89it/s]\n", + "4it [00:00, 35.11it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "at epoch 4\n", + "train info: logloss loss:1.3024913867314656\n", + "eval info: group_auc:0.5942, mean_mrr:0.2695, ndcg@10:0.3563, ndcg@5:0.2924\n", + "at epoch 4 , train time: 30.6 eval time: 83.4\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "1086it [00:30, 35.38it/s]\n", + "8874it [01:14, 119.70it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "at epoch 5\n", + "train info: logloss loss:1.2650439398394104\n", + "eval info: group_auc:0.6005, mean_mrr:0.2711, ndcg@10:0.3586, ndcg@5:0.2936\n", + "at epoch 5 , train time: 30.7 eval time: 83.7\n", + "CPU times: user 13min 25s, sys: 59.5 s, total: 14min 25s\n", + "Wall time: 9min 35s\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%time\n", + "model.fit(train_news_file, train_behaviors_file, valid_news_file, valid_behaviors_file)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "8874it [01:14, 119.59it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'group_auc': 0.6005, 'mean_mrr': 0.2711, 'ndcg@5': 0.2936, 'ndcg@10': 0.3586}\n", + "CPU times: user 2min 5s, sys: 9.68 s, total: 2min 14s\n", + "Wall time: 1min 24s\n" + ] + } + ], + "source": [ + "%%time\n", + "res_syn = model.run_eval(valid_news_file, valid_behaviors_file)\n", + "print(res_syn)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sb.glue(\"res_syn\", res_syn)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Save the model" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "model_path = os.path.join(data_path, \"model\")\n", + "os.makedirs(model_path, exist_ok=True)\n", + "\n", + "model.model.save_weights(os.path.join(model_path, \"npa_ckpt\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Output Predcition File\n", + "This code segment is used to generate the prediction.zip file, which is in the same format in [MIND Competition Submission Tutorial](https://competitions.codalab.org/competitions/24122#learn_the_details-submission-guidelines).\n", + "\n", + "Please change the `MIND_type` parameter to `large` if you want to submit your prediction to [MIND Competition](https://msnews.github.io/competition.html)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "8874it [01:14, 119.45it/s]\n" + ] + } + ], + "source": [ + "group_impr_indexes, group_labels, group_preds = model.run_slow_eval(valid_news_file, valid_behaviors_file)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "7538it [00:00, 23050.36it/s]\n" + ] + } + ], + "source": [ + "with open(os.path.join(data_path, 'prediction.txt'), 'w') as f:\n", + " for impr_index, preds in tqdm(zip(group_impr_indexes, group_preds)):\n", + " impr_index += 1\n", + " pred_rank = (np.argsort(np.argsort(preds)[::-1]) + 1).tolist()\n", + " pred_rank = '[' + ','.join([str(i) for i in pred_rank]) + ']'\n", + " f.write(' '.join([str(impr_index), pred_rank])+ '\\n')" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "f = zipfile.ZipFile(os.path.join(data_path, 'prediction.zip'), 'w', zipfile.ZIP_DEFLATED)\n", + "f.write(os.path.join(data_path, 'prediction.txt'), arcname='prediction.txt')\n", + "f.close()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Reference\n", + "\\[1\\] Chuhan Wu, Fangzhao Wu, Mingxiao An, Jianqiang Huang, Yongfeng Huang and Xing Xie: NPA: Neural News Recommendation with Personalized Attention, KDD 2019, ADS track.
\n", + "\\[2\\] Wu, Fangzhao, et al. \"MIND: A Large-scale Dataset for News Recommendation\" Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics. https://msnews.github.io/competition.html
\n", + "\\[3\\] GloVe: Global Vectors for Word Representation. https://nlp.stanford.edu/projects/glove/" + ] + } + ], + "metadata": { + "celltoolbar": "Tags", + "interpreter": { + "hash": "3a9a0c422ff9f08d62211b9648017c63b0a26d2c935edc37ebb8453675d13bb5" + }, + "kernelspec": { + "display_name": "Python 3.7.11 64-bit ('tf2': conda)", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.11" + } }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "System version: 3.7.11 (default, Jul 27 2021, 14:32:16) \n", - "[GCC 7.5.0]\n", - "Tensorflow version: 2.6.1\n" - ] - } - ], - "source": [ - "import sys\n", - "import os\n", - "import numpy as np\n", - "import zipfile\n", - "from tqdm import tqdm\n", - "import scrapbook as sb\n", - "from tempfile import TemporaryDirectory\n", - "import tensorflow as tf\n", - "tf.get_logger().setLevel('ERROR') # only show error messages\n", - "\n", - "from recommenders.models.deeprec.deeprec_utils import download_deeprec_resources \n", - "from recommenders.models.newsrec.newsrec_utils import prepare_hparams\n", - "from recommenders.models.newsrec.models.npa import NPAModel\n", - "from recommenders.models.newsrec.io.mind_iterator import MINDIterator\n", - "from recommenders.models.newsrec.newsrec_utils import get_mind_data_set\n", - "\n", - "print(\"System version: {}\".format(sys.version))\n", - "print(\"Tensorflow version: {}\".format(tf.__version__))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Prepare Parameters" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "tags": [ - "parameters" - ] - }, - "outputs": [], - "source": [ - "epochs = 5\n", - "seed = 42\n", - "batch_size = 32\n", - "\n", - "# Options: demo, small, large\n", - "MIND_type = 'demo'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Download and load data" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 17.0k/17.0k [00:01<00:00, 9.66kKB/s]\n", - "100%|██████████| 9.84k/9.84k [00:01<00:00, 9.01kKB/s]\n", - "100%|██████████| 95.0k/95.0k [00:09<00:00, 10.0kKB/s]\n" - ] - } - ], - "source": [ - "tmpdir = TemporaryDirectory()\n", - "data_path = tmpdir.name\n", - "\n", - "train_news_file = os.path.join(data_path, 'train', r'news.tsv')\n", - "train_behaviors_file = os.path.join(data_path, 'train', r'behaviors.tsv')\n", - "valid_news_file = os.path.join(data_path, 'valid', r'news.tsv')\n", - "valid_behaviors_file = os.path.join(data_path, 'valid', r'behaviors.tsv')\n", - "wordEmb_file = os.path.join(data_path, \"utils\", \"embedding.npy\")\n", - "userDict_file = os.path.join(data_path, \"utils\", \"uid2index.pkl\")\n", - "wordDict_file = os.path.join(data_path, \"utils\", \"word_dict.pkl\")\n", - "yaml_file = os.path.join(data_path, \"utils\", r'npa.yaml')\n", - "\n", - "mind_url, mind_train_dataset, mind_dev_dataset, mind_utils = get_mind_data_set(MIND_type)\n", - "\n", - "if not os.path.exists(train_news_file):\n", - " download_deeprec_resources(mind_url, os.path.join(data_path, 'train'), mind_train_dataset)\n", - " \n", - "if not os.path.exists(valid_news_file):\n", - " download_deeprec_resources(mind_url, \\\n", - " os.path.join(data_path, 'valid'), 
mind_dev_dataset)\n", - "if not os.path.exists(yaml_file):\n", - " download_deeprec_resources(r'https://recodatasets.z20.web.core.windows.net/newsrec/', \\\n", - " os.path.join(data_path, 'utils'), mind_utils)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create hyper-parameters" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "data_format=news,iterator_type=None,support_quick_scoring=False,wordEmb_file=/tmp/tmpump0ai7m/utils/embedding.npy,wordDict_file=/tmp/tmpump0ai7m/utils/word_dict.pkl,userDict_file=/tmp/tmpump0ai7m/utils/uid2index.pkl,vertDict_file=None,subvertDict_file=None,title_size=10,body_size=None,word_emb_dim=300,word_size=None,user_num=None,vert_num=None,subvert_num=None,his_size=50,npratio=4,dropout=0.2,attention_hidden_dim=200,head_num=4,head_dim=100,cnn_activation=relu,dense_activation=None,filter_num=400,window_size=3,vert_emb_dim=100,subvert_emb_dim=100,gru_unit=400,type=ini,user_emb_dim=100,learning_rate=0.0001,loss=cross_entropy_loss,optimizer=adam,epochs=5,batch_size=32,show_step=100000,metrics=['group_auc', 'mean_mrr', 'ndcg@5;10']\n" - ] - } - ], - "source": [ - "hparams = prepare_hparams(yaml_file, \n", - " wordEmb_file=wordEmb_file,\n", - " wordDict_file=wordDict_file, \n", - " userDict_file=userDict_file,\n", - " batch_size=batch_size,\n", - " epochs=epochs)\n", - "print(hparams)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "iterator = MINDIterator" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Train the NPA model" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "model = NPAModel(hparams, iterator, seed=seed)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "8874it [01:15, 117.03it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'group_auc': 0.5228, 'mean_mrr': 0.2328, 'ndcg@5': 0.2377, 'ndcg@10': 0.303}\n" - ] - } - ], - "source": [ - "print(model.run_eval(valid_news_file, valid_behaviors_file))" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "1086it [00:34, 31.34it/s]\n", - "8874it [01:13, 119.95it/s]\n", - "4it [00:00, 35.00it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "at epoch 1\n", - "train info: logloss loss:1.5035385669485202\n", - "eval info: group_auc:0.5867, mean_mrr:0.2555, ndcg@10:0.3432, ndcg@5:0.2778\n", - "at epoch 1 , train time: 34.7 eval time: 84.3\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "1086it [00:30, 35.30it/s]\n", - "8874it [01:13, 120.34it/s]\n", - "4it [00:00, 35.45it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "at epoch 2\n", - "train info: logloss loss:1.4050482461026579\n", - "eval info: group_auc:0.5996, mean_mrr:0.2706, ndcg@10:0.3589, ndcg@5:0.2967\n", - "at epoch 2 , train time: 30.8 eval time: 83.1\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "1086it [00:31, 34.98it/s]\n", - "8874it [01:13, 119.93it/s]\n", - "4it [00:00, 35.09it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "at epoch 3\n", - "train info: logloss 
loss:1.3529314152333838\n", - "eval info: group_auc:0.5992, mean_mrr:0.275, ndcg@10:0.3646, ndcg@5:0.3005\n", - "at epoch 3 , train time: 31.0 eval time: 83.4\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "1086it [00:30, 35.45it/s]\n", - "8874it [01:14, 119.89it/s]\n", - "4it [00:00, 35.11it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "at epoch 4\n", - "train info: logloss loss:1.3024913867314656\n", - "eval info: group_auc:0.5942, mean_mrr:0.2695, ndcg@10:0.3563, ndcg@5:0.2924\n", - "at epoch 4 , train time: 30.6 eval time: 83.4\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "1086it [00:30, 35.38it/s]\n", - "8874it [01:14, 119.70it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "at epoch 5\n", - "train info: logloss loss:1.2650439398394104\n", - "eval info: group_auc:0.6005, mean_mrr:0.2711, ndcg@10:0.3586, ndcg@5:0.2936\n", - "at epoch 5 , train time: 30.7 eval time: 83.7\n", - "CPU times: user 13min 25s, sys: 59.5 s, total: 14min 25s\n", - "Wall time: 9min 35s\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "%%time\n", - "model.fit(train_news_file, train_behaviors_file, valid_news_file, valid_behaviors_file)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "8874it [01:14, 119.59it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'group_auc': 0.6005, 'mean_mrr': 0.2711, 'ndcg@5': 0.2936, 'ndcg@10': 0.3586}\n", - "CPU times: user 2min 5s, sys: 9.68 s, total: 2min 14s\n", - "Wall time: 1min 24s\n" - ] - } - ], - "source": [ - "%%time\n", - "res_syn = model.run_eval(valid_news_file, valid_behaviors_file)\n", - "print(res_syn)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sb.glue(\"res_syn\", res_syn)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Save the model" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "model_path = os.path.join(data_path, \"model\")\n", - "os.makedirs(model_path, exist_ok=True)\n", - "\n", - "model.model.save_weights(os.path.join(model_path, \"npa_ckpt\"))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Output Predcition File\n", - "This code segment is used to generate the prediction.zip file, which is in the same format in [MIND Competition Submission Tutorial](https://competitions.codalab.org/competitions/24122#learn_the_details-submission-guidelines).\n", - "\n", - "Please change the `MIND_type` parameter to `large` if you want to submit your prediction to [MIND Competition](https://msnews.github.io/competition.html)." 
- ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "8874it [01:14, 119.45it/s]\n" - ] - } - ], - "source": [ - "group_impr_indexes, group_labels, group_preds = model.run_slow_eval(valid_news_file, valid_behaviors_file)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "7538it [00:00, 23050.36it/s]\n" - ] - } - ], - "source": [ - "with open(os.path.join(data_path, 'prediction.txt'), 'w') as f:\n", - " for impr_index, preds in tqdm(zip(group_impr_indexes, group_preds)):\n", - " impr_index += 1\n", - " pred_rank = (np.argsort(np.argsort(preds)[::-1]) + 1).tolist()\n", - " pred_rank = '[' + ','.join([str(i) for i in pred_rank]) + ']'\n", - " f.write(' '.join([str(impr_index), pred_rank])+ '\\n')" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "f = zipfile.ZipFile(os.path.join(data_path, 'prediction.zip'), 'w', zipfile.ZIP_DEFLATED)\n", - "f.write(os.path.join(data_path, 'prediction.txt'), arcname='prediction.txt')\n", - "f.close()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Reference\n", - "\\[1\\] Chuhan Wu, Fangzhao Wu, Mingxiao An, Jianqiang Huang, Yongfeng Huang and Xing Xie: NPA: Neural News Recommendation with Personalized Attention, KDD 2019, ADS track.
\n", - "\\[2\\] Wu, Fangzhao, et al. \"MIND: A Large-scale Dataset for News Recommendation\" Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics. https://msnews.github.io/competition.html
\n", - "\\[3\\] GloVe: Global Vectors for Word Representation. https://nlp.stanford.edu/projects/glove/" - ] - } - ], - "metadata": { - "celltoolbar": "Tags", - "interpreter": { - "hash": "3a9a0c422ff9f08d62211b9648017c63b0a26d2c935edc37ebb8453675d13bb5" - }, - "kernelspec": { - "display_name": "Python 3.7.11 64-bit ('tf2': conda)", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.11" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/examples/00_quick_start/nrms_MIND.ipynb b/examples/00_quick_start/nrms_MIND.ipynb index c8c4f5d822..9ca948a717 100644 --- a/examples/00_quick_start/nrms_MIND.ipynb +++ b/examples/00_quick_start/nrms_MIND.ipynb @@ -1,556 +1,556 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Copyright (c) Microsoft Corporation. All rights reserved.\n", - "\n", - "Licensed under the MIT License." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# NRMS: Neural News Recommendation with Multi-Head Self-Attention\n", - "NRMS \\[1\\] is a neural news recommendation approach with multi-head selfattention. The core of NRMS is a news encoder and a user encoder. In the newsencoder, a multi-head self-attentions is used to learn news representations from news titles by modeling the interactions between words. In the user encoder, we learn representations of users from their browsed news and use multihead self-attention to capture the relatedness between the news. Besides, we apply additive\n", - "attention to learn more informative news and user representations by selecting important words and news.\n", - "\n", - "## Properties of NRMS:\n", - "- NRMS is a content-based neural news recommendation approach.\n", - "- It uses multi-self attention to learn news representations by modeling the iteractions between words and learn user representations by capturing the relationship between user browsed news.\n", - "- NRMS uses additive attentions to learn informative news and user representations by selecting important words and news.\n", - "\n", - "## Data format:\n", - "For quicker training and evaluaiton, we sample MINDdemo dataset of 5k users from [MIND small dataset](https://msnews.github.io/). The MINDdemo dataset has the same file format as MINDsmall and MINDlarge. If you want to try experiments on MINDsmall and MINDlarge, please change the dowload source. Select the MIND_type parameter from ['large', 'small', 'demo'] to choose dataset.\n", - " \n", - "**MINDdemo_train** is used for training, and **MINDdemo_dev** is used for evaluation. Training data and evaluation data are composed of a news file and a behaviors file. You can find more detailed data description in [MIND repo](https://github.com/msnews/msnews.github.io/blob/master/assets/doc/introduction.md)\n", - "\n", - "### news data\n", - "This file contains news information including newsid, category, subcatgory, news title, news abstarct, news url and entities in news title, entities in news abstarct.\n", - "One simple example:
\n", - "\n", - "`N46466\tlifestyle\tlifestyleroyals\tThe Brands Queen Elizabeth, Prince Charles, and Prince Philip Swear By\tShop the notebooks, jackets, and more that the royals can't live without.\thttps://www.msn.com/en-us/lifestyle/lifestyleroyals/the-brands-queen-elizabeth,-prince-charles,-and-prince-philip-swear-by/ss-AAGH0ET?ocid=chopendata\t[{\"Label\": \"Prince Philip, Duke of Edinburgh\", \"Type\": \"P\", \"WikidataId\": \"Q80976\", \"Confidence\": 1.0, \"OccurrenceOffsets\": [48], \"SurfaceForms\": [\"Prince Philip\"]}, {\"Label\": \"Charles, Prince of Wales\", \"Type\": \"P\", \"WikidataId\": \"Q43274\", \"Confidence\": 1.0, \"OccurrenceOffsets\": [28], \"SurfaceForms\": [\"Prince Charles\"]}, {\"Label\": \"Elizabeth II\", \"Type\": \"P\", \"WikidataId\": \"Q9682\", \"Confidence\": 0.97, \"OccurrenceOffsets\": [11], \"SurfaceForms\": [\"Queen Elizabeth\"]}]\t[]`\n", - "
\n", - "\n", - "In general, each line in data file represents information of one piece of news:
\n", - "\n", - "`[News ID] [Category] [Subcategory] [News Title] [News Abstrct] [News Url] [Entities in News Title] [Entities in News Abstract] ...`\n", - "\n", - "
\n", - "\n", - "We generate a word_dict file to transform words in news title to word indexes, and a embedding matrix is initted from pretrained glove embeddings.\n", - "\n", - "### behaviors data\n", - "One simple example:
\n", - "`1\tU82271\t11/11/2019 3:28:58 PM\tN3130 N11621 N12917 N4574 N12140 N9748\tN13390-0 N7180-0 N20785-0 N6937-0 N15776-0 N25810-0 N20820-0 N6885-0 N27294-0 N18835-0 N16945-0 N7410-0 N23967-0 N22679-0 N20532-0 N26651-0 N22078-0 N4098-0 N16473-0 N13841-0 N15660-0 N25787-0 N2315-0 N1615-0 N9087-0 N23880-0 N3600-0 N24479-0 N22882-0 N26308-0 N13594-0 N2220-0 N28356-0 N17083-0 N21415-0 N18671-0 N9440-0 N17759-0 N10861-0 N21830-0 N8064-0 N5675-0 N15037-0 N26154-0 N15368-1 N481-0 N3256-0 N20663-0 N23940-0 N7654-0 N10729-0 N7090-0 N23596-0 N15901-0 N16348-0 N13645-0 N8124-0 N20094-0 N27774-0 N23011-0 N14832-0 N15971-0 N27729-0 N2167-0 N11186-0 N18390-0 N21328-0 N10992-0 N20122-0 N1958-0 N2004-0 N26156-0 N17632-0 N26146-0 N17322-0 N18403-0 N17397-0 N18215-0 N14475-0 N9781-0 N17958-0 N3370-0 N1127-0 N15525-0 N12657-0 N10537-0 N18224-0`\n", - "
\n", - "\n", - "In general, each line in data file represents one instance of an impression. The format is like:
\n", - "\n", - "`[Impression ID] [User ID] [Impression Time] [User Click History] [Impression News]`\n", - "\n", - "
\n", - "\n", - "User Click History is the user historical clicked news before Impression Time. Impression News is the displayed news in an impression, which format is:
\n", - "\n", - "`[News ID 1]-[label1] ... [News ID n]-[labeln]`\n", - "\n", - "
\n", - "Label represents whether the news is clicked by the user. All information of news in User Click History and Impression News can be found in news data file." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Global settings and imports" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "scrolled": false - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/anaconda/envs/tf2/lib/python3.7/site-packages/papermill/iorw.py:50: FutureWarning: pyarrow.HadoopFileSystem is deprecated as of 2.0.0, please use pyarrow.fs.HadoopFileSystem instead.\n", - " from pyarrow import HadoopFileSystem\n" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Copyright (c) Recommenders contributors.\n", + "\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# NRMS: Neural News Recommendation with Multi-Head Self-Attention\n", + "NRMS \\[1\\] is a neural news recommendation approach with multi-head selfattention. The core of NRMS is a news encoder and a user encoder. In the newsencoder, a multi-head self-attentions is used to learn news representations from news titles by modeling the interactions between words. In the user encoder, we learn representations of users from their browsed news and use multihead self-attention to capture the relatedness between the news. Besides, we apply additive\n", + "attention to learn more informative news and user representations by selecting important words and news.\n", + "\n", + "## Properties of NRMS:\n", + "- NRMS is a content-based neural news recommendation approach.\n", + "- It uses multi-self attention to learn news representations by modeling the iteractions between words and learn user representations by capturing the relationship between user browsed news.\n", + "- NRMS uses additive attentions to learn informative news and user representations by selecting important words and news.\n", + "\n", + "## Data format:\n", + "For quicker training and evaluaiton, we sample MINDdemo dataset of 5k users from [MIND small dataset](https://msnews.github.io/). The MINDdemo dataset has the same file format as MINDsmall and MINDlarge. If you want to try experiments on MINDsmall and MINDlarge, please change the dowload source. Select the MIND_type parameter from ['large', 'small', 'demo'] to choose dataset.\n", + " \n", + "**MINDdemo_train** is used for training, and **MINDdemo_dev** is used for evaluation. Training data and evaluation data are composed of a news file and a behaviors file. You can find more detailed data description in [MIND repo](https://github.com/msnews/msnews.github.io/blob/master/assets/doc/introduction.md)\n", + "\n", + "### news data\n", + "This file contains news information including newsid, category, subcatgory, news title, news abstarct, news url and entities in news title, entities in news abstarct.\n", + "One simple example:
\n", + "\n", + "`N46466\tlifestyle\tlifestyleroyals\tThe Brands Queen Elizabeth, Prince Charles, and Prince Philip Swear By\tShop the notebooks, jackets, and more that the royals can't live without.\thttps://www.msn.com/en-us/lifestyle/lifestyleroyals/the-brands-queen-elizabeth,-prince-charles,-and-prince-philip-swear-by/ss-AAGH0ET?ocid=chopendata\t[{\"Label\": \"Prince Philip, Duke of Edinburgh\", \"Type\": \"P\", \"WikidataId\": \"Q80976\", \"Confidence\": 1.0, \"OccurrenceOffsets\": [48], \"SurfaceForms\": [\"Prince Philip\"]}, {\"Label\": \"Charles, Prince of Wales\", \"Type\": \"P\", \"WikidataId\": \"Q43274\", \"Confidence\": 1.0, \"OccurrenceOffsets\": [28], \"SurfaceForms\": [\"Prince Charles\"]}, {\"Label\": \"Elizabeth II\", \"Type\": \"P\", \"WikidataId\": \"Q9682\", \"Confidence\": 0.97, \"OccurrenceOffsets\": [11], \"SurfaceForms\": [\"Queen Elizabeth\"]}]\t[]`\n", + "
\n", + "\n", + "In general, each line in data file represents information of one piece of news:
\n", + "\n", + "`[News ID] [Category] [Subcategory] [News Title] [News Abstrct] [News Url] [Entities in News Title] [Entities in News Abstract] ...`\n", + "\n", + "
\n", + "\n", + "We generate a word_dict file to transform words in news title to word indexes, and a embedding matrix is initted from pretrained glove embeddings.\n", + "\n", + "### behaviors data\n", + "One simple example:
\n", + "`1\tU82271\t11/11/2019 3:28:58 PM\tN3130 N11621 N12917 N4574 N12140 N9748\tN13390-0 N7180-0 N20785-0 N6937-0 N15776-0 N25810-0 N20820-0 N6885-0 N27294-0 N18835-0 N16945-0 N7410-0 N23967-0 N22679-0 N20532-0 N26651-0 N22078-0 N4098-0 N16473-0 N13841-0 N15660-0 N25787-0 N2315-0 N1615-0 N9087-0 N23880-0 N3600-0 N24479-0 N22882-0 N26308-0 N13594-0 N2220-0 N28356-0 N17083-0 N21415-0 N18671-0 N9440-0 N17759-0 N10861-0 N21830-0 N8064-0 N5675-0 N15037-0 N26154-0 N15368-1 N481-0 N3256-0 N20663-0 N23940-0 N7654-0 N10729-0 N7090-0 N23596-0 N15901-0 N16348-0 N13645-0 N8124-0 N20094-0 N27774-0 N23011-0 N14832-0 N15971-0 N27729-0 N2167-0 N11186-0 N18390-0 N21328-0 N10992-0 N20122-0 N1958-0 N2004-0 N26156-0 N17632-0 N26146-0 N17322-0 N18403-0 N17397-0 N18215-0 N14475-0 N9781-0 N17958-0 N3370-0 N1127-0 N15525-0 N12657-0 N10537-0 N18224-0`\n", + "
\n", + "\n", + "In general, each line in data file represents one instance of an impression. The format is like:
\n", + "\n", + "`[Impression ID] [User ID] [Impression Time] [User Click History] [Impression News]`\n", + "\n", + "
\n", + "\n", + "User Click History is the user historical clicked news before Impression Time. Impression News is the displayed news in an impression, which format is:
\n", + "\n", + "`[News ID 1]-[label1] ... [News ID n]-[labeln]`\n", + "\n", + "
\n", + "Label represents whether the news is clicked by the user. All information of news in User Click History and Impression News can be found in news data file." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Global settings and imports" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/anaconda/envs/tf2/lib/python3.7/site-packages/papermill/iorw.py:50: FutureWarning: pyarrow.HadoopFileSystem is deprecated as of 2.0.0, please use pyarrow.fs.HadoopFileSystem instead.\n", + " from pyarrow import HadoopFileSystem\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "System version: 3.7.11 (default, Jul 27 2021, 14:32:16) \n", + "[GCC 7.5.0]\n", + "Tensorflow version: 2.6.1\n" + ] + } + ], + "source": [ + "import sys\n", + "import os\n", + "import numpy as np\n", + "import zipfile\n", + "from tqdm import tqdm\n", + "import scrapbook as sb\n", + "from tempfile import TemporaryDirectory\n", + "import tensorflow as tf\n", + "tf.get_logger().setLevel('ERROR') # only show error messages\n", + "\n", + "from recommenders.models.deeprec.deeprec_utils import download_deeprec_resources \n", + "from recommenders.models.newsrec.newsrec_utils import prepare_hparams\n", + "from recommenders.models.newsrec.models.nrms import NRMSModel\n", + "from recommenders.models.newsrec.io.mind_iterator import MINDIterator\n", + "from recommenders.models.newsrec.newsrec_utils import get_mind_data_set\n", + "\n", + "print(\"System version: {}\".format(sys.version))\n", + "print(\"Tensorflow version: {}\".format(tf.__version__))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prepare parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "tags": [ + "parameters" + ] + }, + "outputs": [], + "source": [ + "epochs = 5\n", + "seed = 42\n", + "batch_size = 32\n", + "\n", + "# Options: demo, small, large\n", + "MIND_type = 'demo'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download and load data" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 17.0k/17.0k [00:01<00:00, 9.65kKB/s]\n", + "100%|██████████| 9.84k/9.84k [00:01<00:00, 8.93kKB/s]\n", + "100%|██████████| 95.0k/95.0k [00:08<00:00, 11.0kKB/s]\n" + ] + } + ], + "source": [ + "tmpdir = TemporaryDirectory()\n", + "data_path = tmpdir.name\n", + "\n", + "train_news_file = os.path.join(data_path, 'train', r'news.tsv')\n", + "train_behaviors_file = os.path.join(data_path, 'train', r'behaviors.tsv')\n", + "valid_news_file = os.path.join(data_path, 'valid', r'news.tsv')\n", + "valid_behaviors_file = os.path.join(data_path, 'valid', r'behaviors.tsv')\n", + "wordEmb_file = os.path.join(data_path, \"utils\", \"embedding.npy\")\n", + "userDict_file = os.path.join(data_path, \"utils\", \"uid2index.pkl\")\n", + "wordDict_file = os.path.join(data_path, \"utils\", \"word_dict.pkl\")\n", + "yaml_file = os.path.join(data_path, \"utils\", r'nrms.yaml')\n", + "\n", + "mind_url, mind_train_dataset, mind_dev_dataset, mind_utils = get_mind_data_set(MIND_type)\n", + "\n", + "if not os.path.exists(train_news_file):\n", + " download_deeprec_resources(mind_url, os.path.join(data_path, 'train'), mind_train_dataset)\n", + " \n", + "if not os.path.exists(valid_news_file):\n", + " 
download_deeprec_resources(mind_url, \\\n", + " os.path.join(data_path, 'valid'), mind_dev_dataset)\n", + "if not os.path.exists(yaml_file):\n", + " download_deeprec_resources(r'https://recodatasets.z20.web.core.windows.net/newsrec/', \\\n", + " os.path.join(data_path, 'utils'), mind_utils)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create hyper-parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "data_format=news,iterator_type=None,support_quick_scoring=True,wordEmb_file=/tmp/tmp1z4450vf/utils/embedding.npy,wordDict_file=/tmp/tmp1z4450vf/utils/word_dict.pkl,userDict_file=/tmp/tmp1z4450vf/utils/uid2index.pkl,vertDict_file=None,subvertDict_file=None,title_size=30,body_size=None,word_emb_dim=300,word_size=None,user_num=None,vert_num=None,subvert_num=None,his_size=50,npratio=4,dropout=0.2,attention_hidden_dim=200,head_num=20,head_dim=20,cnn_activation=None,dense_activation=None,filter_num=200,window_size=3,vert_emb_dim=100,subvert_emb_dim=100,gru_unit=400,type=ini,user_emb_dim=50,learning_rate=0.0001,loss=cross_entropy_loss,optimizer=adam,epochs=5,batch_size=32,show_step=10,metrics=['group_auc', 'mean_mrr', 'ndcg@5;10']\n" + ] + } + ], + "source": [ + "hparams = prepare_hparams(yaml_file, \n", + " wordEmb_file=wordEmb_file,\n", + " wordDict_file=wordDict_file, \n", + " userDict_file=userDict_file,\n", + " batch_size=batch_size,\n", + " epochs=epochs,\n", + " show_step=10)\n", + "print(hparams)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Train the NRMS model" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "iterator = MINDIterator" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "model = NRMSModel(hparams, iterator, seed=seed)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "586it [00:02, 238.86it/s]\n", + "236it [00:04, 57.82it/s]\n", + "7538it [00:01, 6381.66it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'group_auc': 0.4792, 'mean_mrr': 0.2059, 'ndcg@5': 0.2045, 'ndcg@10': 0.2701}\n" + ] + } + ], + "source": [ + "print(model.run_eval(valid_news_file, valid_behaviors_file))" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "step 1080 , total_loss: 1.5155, data_loss: 1.4078: : 1086it [01:07, 16.10it/s]\n", + "586it [00:01, 388.70it/s]\n", + "236it [00:03, 68.28it/s]\n", + "7538it [00:00, 7543.81it/s]\n", + "2it [00:00, 16.78it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "at epoch 1\n", + "train info: logloss loss:1.5149870059327746\n", + "eval info: group_auc:0.5755, mean_mrr:0.2453, ndcg@10:0.3313, ndcg@5:0.2587\n", + "at epoch 1 , train time: 67.4 eval time: 13.3\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "step 1080 , total_loss: 1.4203, data_loss: 1.3752: : 1086it [01:04, 16.93it/s]\n", + "586it [00:01, 412.04it/s]\n", + "236it [00:03, 67.25it/s]\n", + "7538it [00:00, 9040.56it/s]\n", + "2it [00:00, 16.90it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "at epoch 2\n", + "train info: logloss 
loss:1.4203101933331779\n", + "eval info: group_auc:0.5995, mean_mrr:0.2572, ndcg@10:0.3482, ndcg@5:0.273\n", + "at epoch 2 , train time: 64.2 eval time: 13.0\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "step 1080 , total_loss: 1.3770, data_loss: 1.2186: : 1086it [01:05, 16.49it/s]\n", + "586it [00:01, 401.41it/s]\n", + "236it [00:03, 65.66it/s]\n", + "7538it [00:00, 7954.16it/s]\n", + "2it [00:00, 16.66it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "at epoch 3\n", + "train info: logloss loss:1.3768525854658686\n", + "eval info: group_auc:0.6032, mean_mrr:0.2632, ndcg@10:0.3535, ndcg@5:0.2817\n", + "at epoch 3 , train time: 65.9 eval time: 13.3\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "step 1080 , total_loss: 1.3516, data_loss: 1.2423: : 1086it [01:06, 16.39it/s]\n", + "586it [00:01, 390.04it/s]\n", + "236it [00:03, 64.53it/s]\n", + "7538it [00:01, 5913.76it/s]\n", + "2it [00:00, 16.68it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "at epoch 4\n", + "train info: logloss loss:1.3515781479755598\n", + "eval info: group_auc:0.6107, mean_mrr:0.2662, ndcg@10:0.3577, ndcg@5:0.2857\n", + "at epoch 4 , train time: 66.2 eval time: 13.8\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "step 1080 , total_loss: 1.3297, data_loss: 1.2343: : 1086it [01:06, 16.37it/s]\n", + "586it [00:01, 391.49it/s]\n", + "236it [00:03, 64.32it/s]\n", + "7538it [00:00, 7717.24it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "at epoch 5\n", + "train info: logloss loss:1.330019418157047\n", + "eval info: group_auc:0.6127, mean_mrr:0.2697, ndcg@10:0.3625, ndcg@5:0.2912\n", + "at epoch 5 , train time: 66.3 eval time: 14.2\n", + "CPU times: user 8min 12s, sys: 15.5 s, total: 8min 27s\n", + "Wall time: 6min 37s\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%time\n", + "model.fit(train_news_file, train_behaviors_file, valid_news_file, valid_behaviors_file)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "586it [00:01, 396.35it/s]\n", + "236it [00:03, 67.56it/s]\n", + "7538it [00:01, 6017.89it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'group_auc': 0.6127, 'mean_mrr': 0.2697, 'ndcg@5': 0.2912, 'ndcg@10': 0.3625}\n", + "CPU times: user 29.8 s, sys: 1.27 s, total: 31.1 s\n", + "Wall time: 14.3 s\n" + ] + } + ], + "source": [ + "%%time\n", + "res_syn = model.run_eval(valid_news_file, valid_behaviors_file)\n", + "print(res_syn)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sb.glue(\"res_syn\", res_syn)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Save the model" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "model_path = os.path.join(data_path, \"model\")\n", + "os.makedirs(model_path, exist_ok=True)\n", + "\n", + "model.model.save_weights(os.path.join(model_path, \"nrms_ckpt\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Output Predcition File\n", + "This code segment is used to generate the prediction.zip file, which is in the same format in [MIND Competition Submission 
Tutorial](https://competitions.codalab.org/competitions/24122#learn_the_details-submission-guidelines).\n", + "\n", + "Please change the `MIND_type` parameter to `large` if you want to submit your prediction to [MIND Competition](https://msnews.github.io/competition.html)." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "586it [00:01, 399.64it/s]\n", + "236it [00:03, 67.94it/s]\n", + "7538it [00:00, 8052.34it/s]\n" + ] + } + ], + "source": [ + "group_impr_indexes, group_labels, group_preds = model.run_fast_eval(valid_news_file, valid_behaviors_file)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "7538it [00:00, 35200.73it/s]\n" + ] + } + ], + "source": [ + "with open(os.path.join(data_path, 'prediction.txt'), 'w') as f:\n", + " for impr_index, preds in tqdm(zip(group_impr_indexes, group_preds)):\n", + " impr_index += 1\n", + " pred_rank = (np.argsort(np.argsort(preds)[::-1]) + 1).tolist()\n", + " pred_rank = '[' + ','.join([str(i) for i in pred_rank]) + ']'\n", + " f.write(' '.join([str(impr_index), pred_rank])+ '\\n')" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "f = zipfile.ZipFile(os.path.join(data_path, 'prediction.zip'), 'w', zipfile.ZIP_DEFLATED)\n", + "f.write(os.path.join(data_path, 'prediction.txt'), arcname='prediction.txt')\n", + "f.close()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Reference\n", + "\\[1\\] Wu et al. \"Neural News Recommendation with Multi-Head Self-Attention.\" in Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)
\n", + "\\[2\\] Wu, Fangzhao, et al. \"MIND: A Large-scale Dataset for News Recommendation\" Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics. https://msnews.github.io/competition.html
\n", + "\\[3\\] GloVe: Global Vectors for Word Representation. https://nlp.stanford.edu/projects/glove/" + ] + } + ], + "metadata": { + "celltoolbar": "Tags", + "interpreter": { + "hash": "3a9a0c422ff9f08d62211b9648017c63b0a26d2c935edc37ebb8453675d13bb5" + }, + "kernelspec": { + "display_name": "Python 3.7.11 64-bit ('tf2': conda)", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.11" + } }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "System version: 3.7.11 (default, Jul 27 2021, 14:32:16) \n", - "[GCC 7.5.0]\n", - "Tensorflow version: 2.6.1\n" - ] - } - ], - "source": [ - "import sys\n", - "import os\n", - "import numpy as np\n", - "import zipfile\n", - "from tqdm import tqdm\n", - "import scrapbook as sb\n", - "from tempfile import TemporaryDirectory\n", - "import tensorflow as tf\n", - "tf.get_logger().setLevel('ERROR') # only show error messages\n", - "\n", - "from recommenders.models.deeprec.deeprec_utils import download_deeprec_resources \n", - "from recommenders.models.newsrec.newsrec_utils import prepare_hparams\n", - "from recommenders.models.newsrec.models.nrms import NRMSModel\n", - "from recommenders.models.newsrec.io.mind_iterator import MINDIterator\n", - "from recommenders.models.newsrec.newsrec_utils import get_mind_data_set\n", - "\n", - "print(\"System version: {}\".format(sys.version))\n", - "print(\"Tensorflow version: {}\".format(tf.__version__))\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Prepare parameters" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "tags": [ - "parameters" - ] - }, - "outputs": [], - "source": [ - "epochs = 5\n", - "seed = 42\n", - "batch_size = 32\n", - "\n", - "# Options: demo, small, large\n", - "MIND_type = 'demo'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Download and load data" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 17.0k/17.0k [00:01<00:00, 9.65kKB/s]\n", - "100%|██████████| 9.84k/9.84k [00:01<00:00, 8.93kKB/s]\n", - "100%|██████████| 95.0k/95.0k [00:08<00:00, 11.0kKB/s]\n" - ] - } - ], - "source": [ - "tmpdir = TemporaryDirectory()\n", - "data_path = tmpdir.name\n", - "\n", - "train_news_file = os.path.join(data_path, 'train', r'news.tsv')\n", - "train_behaviors_file = os.path.join(data_path, 'train', r'behaviors.tsv')\n", - "valid_news_file = os.path.join(data_path, 'valid', r'news.tsv')\n", - "valid_behaviors_file = os.path.join(data_path, 'valid', r'behaviors.tsv')\n", - "wordEmb_file = os.path.join(data_path, \"utils\", \"embedding.npy\")\n", - "userDict_file = os.path.join(data_path, \"utils\", \"uid2index.pkl\")\n", - "wordDict_file = os.path.join(data_path, \"utils\", \"word_dict.pkl\")\n", - "yaml_file = os.path.join(data_path, \"utils\", r'nrms.yaml')\n", - "\n", - "mind_url, mind_train_dataset, mind_dev_dataset, mind_utils = get_mind_data_set(MIND_type)\n", - "\n", - "if not os.path.exists(train_news_file):\n", - " download_deeprec_resources(mind_url, os.path.join(data_path, 'train'), mind_train_dataset)\n", - " \n", - "if not os.path.exists(valid_news_file):\n", - " download_deeprec_resources(mind_url, \\\n", - " os.path.join(data_path, 'valid'), 
mind_dev_dataset)\n", - "if not os.path.exists(yaml_file):\n", - " download_deeprec_resources(r'https://recodatasets.z20.web.core.windows.net/newsrec/', \\\n", - " os.path.join(data_path, 'utils'), mind_utils)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create hyper-parameters" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "data_format=news,iterator_type=None,support_quick_scoring=True,wordEmb_file=/tmp/tmp1z4450vf/utils/embedding.npy,wordDict_file=/tmp/tmp1z4450vf/utils/word_dict.pkl,userDict_file=/tmp/tmp1z4450vf/utils/uid2index.pkl,vertDict_file=None,subvertDict_file=None,title_size=30,body_size=None,word_emb_dim=300,word_size=None,user_num=None,vert_num=None,subvert_num=None,his_size=50,npratio=4,dropout=0.2,attention_hidden_dim=200,head_num=20,head_dim=20,cnn_activation=None,dense_activation=None,filter_num=200,window_size=3,vert_emb_dim=100,subvert_emb_dim=100,gru_unit=400,type=ini,user_emb_dim=50,learning_rate=0.0001,loss=cross_entropy_loss,optimizer=adam,epochs=5,batch_size=32,show_step=10,metrics=['group_auc', 'mean_mrr', 'ndcg@5;10']\n" - ] - } - ], - "source": [ - "hparams = prepare_hparams(yaml_file, \n", - " wordEmb_file=wordEmb_file,\n", - " wordDict_file=wordDict_file, \n", - " userDict_file=userDict_file,\n", - " batch_size=batch_size,\n", - " epochs=epochs,\n", - " show_step=10)\n", - "print(hparams)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Train the NRMS model" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "iterator = MINDIterator" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "model = NRMSModel(hparams, iterator, seed=seed)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "586it [00:02, 238.86it/s]\n", - "236it [00:04, 57.82it/s]\n", - "7538it [00:01, 6381.66it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'group_auc': 0.4792, 'mean_mrr': 0.2059, 'ndcg@5': 0.2045, 'ndcg@10': 0.2701}\n" - ] - } - ], - "source": [ - "print(model.run_eval(valid_news_file, valid_behaviors_file))" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "step 1080 , total_loss: 1.5155, data_loss: 1.4078: : 1086it [01:07, 16.10it/s]\n", - "586it [00:01, 388.70it/s]\n", - "236it [00:03, 68.28it/s]\n", - "7538it [00:00, 7543.81it/s]\n", - "2it [00:00, 16.78it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "at epoch 1\n", - "train info: logloss loss:1.5149870059327746\n", - "eval info: group_auc:0.5755, mean_mrr:0.2453, ndcg@10:0.3313, ndcg@5:0.2587\n", - "at epoch 1 , train time: 67.4 eval time: 13.3\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "step 1080 , total_loss: 1.4203, data_loss: 1.3752: : 1086it [01:04, 16.93it/s]\n", - "586it [00:01, 412.04it/s]\n", - "236it [00:03, 67.25it/s]\n", - "7538it [00:00, 9040.56it/s]\n", - "2it [00:00, 16.90it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "at epoch 2\n", - "train info: logloss loss:1.4203101933331779\n", - "eval info: group_auc:0.5995, mean_mrr:0.2572, ndcg@10:0.3482, 
ndcg@5:0.273\n", - "at epoch 2 , train time: 64.2 eval time: 13.0\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "step 1080 , total_loss: 1.3770, data_loss: 1.2186: : 1086it [01:05, 16.49it/s]\n", - "586it [00:01, 401.41it/s]\n", - "236it [00:03, 65.66it/s]\n", - "7538it [00:00, 7954.16it/s]\n", - "2it [00:00, 16.66it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "at epoch 3\n", - "train info: logloss loss:1.3768525854658686\n", - "eval info: group_auc:0.6032, mean_mrr:0.2632, ndcg@10:0.3535, ndcg@5:0.2817\n", - "at epoch 3 , train time: 65.9 eval time: 13.3\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "step 1080 , total_loss: 1.3516, data_loss: 1.2423: : 1086it [01:06, 16.39it/s]\n", - "586it [00:01, 390.04it/s]\n", - "236it [00:03, 64.53it/s]\n", - "7538it [00:01, 5913.76it/s]\n", - "2it [00:00, 16.68it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "at epoch 4\n", - "train info: logloss loss:1.3515781479755598\n", - "eval info: group_auc:0.6107, mean_mrr:0.2662, ndcg@10:0.3577, ndcg@5:0.2857\n", - "at epoch 4 , train time: 66.2 eval time: 13.8\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "step 1080 , total_loss: 1.3297, data_loss: 1.2343: : 1086it [01:06, 16.37it/s]\n", - "586it [00:01, 391.49it/s]\n", - "236it [00:03, 64.32it/s]\n", - "7538it [00:00, 7717.24it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "at epoch 5\n", - "train info: logloss loss:1.330019418157047\n", - "eval info: group_auc:0.6127, mean_mrr:0.2697, ndcg@10:0.3625, ndcg@5:0.2912\n", - "at epoch 5 , train time: 66.3 eval time: 14.2\n", - "CPU times: user 8min 12s, sys: 15.5 s, total: 8min 27s\n", - "Wall time: 6min 37s\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "%%time\n", - "model.fit(train_news_file, train_behaviors_file, valid_news_file, valid_behaviors_file)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "586it [00:01, 396.35it/s]\n", - "236it [00:03, 67.56it/s]\n", - "7538it [00:01, 6017.89it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'group_auc': 0.6127, 'mean_mrr': 0.2697, 'ndcg@5': 0.2912, 'ndcg@10': 0.3625}\n", - "CPU times: user 29.8 s, sys: 1.27 s, total: 31.1 s\n", - "Wall time: 14.3 s\n" - ] - } - ], - "source": [ - "%%time\n", - "res_syn = model.run_eval(valid_news_file, valid_behaviors_file)\n", - "print(res_syn)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sb.glue(\"res_syn\", res_syn)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Save the model" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "model_path = os.path.join(data_path, \"model\")\n", - "os.makedirs(model_path, exist_ok=True)\n", - "\n", - "model.model.save_weights(os.path.join(model_path, \"nrms_ckpt\"))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Output Predcition File\n", - "This code segment is used to generate the prediction.zip file, which is in the same format in [MIND Competition Submission 
Tutorial](https://competitions.codalab.org/competitions/24122#learn_the_details-submission-guidelines).\n", - "\n", - "Please change the `MIND_type` parameter to `large` if you want to submit your prediction to [MIND Competition](https://msnews.github.io/competition.html)." - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "586it [00:01, 399.64it/s]\n", - "236it [00:03, 67.94it/s]\n", - "7538it [00:00, 8052.34it/s]\n" - ] - } - ], - "source": [ - "group_impr_indexes, group_labels, group_preds = model.run_fast_eval(valid_news_file, valid_behaviors_file)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "7538it [00:00, 35200.73it/s]\n" - ] - } - ], - "source": [ - "with open(os.path.join(data_path, 'prediction.txt'), 'w') as f:\n", - " for impr_index, preds in tqdm(zip(group_impr_indexes, group_preds)):\n", - " impr_index += 1\n", - " pred_rank = (np.argsort(np.argsort(preds)[::-1]) + 1).tolist()\n", - " pred_rank = '[' + ','.join([str(i) for i in pred_rank]) + ']'\n", - " f.write(' '.join([str(impr_index), pred_rank])+ '\\n')" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "f = zipfile.ZipFile(os.path.join(data_path, 'prediction.zip'), 'w', zipfile.ZIP_DEFLATED)\n", - "f.write(os.path.join(data_path, 'prediction.txt'), arcname='prediction.txt')\n", - "f.close()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Reference\n", - "\\[1\\] Wu et al. \"Neural News Recommendation with Multi-Head Self-Attention.\" in Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)
\n", - "\\[2\\] Wu, Fangzhao, et al. \"MIND: A Large-scale Dataset for News Recommendation\" Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics. https://msnews.github.io/competition.html
\n", - "\\[3\\] GloVe: Global Vectors for Word Representation. https://nlp.stanford.edu/projects/glove/" - ] - } - ], - "metadata": { - "celltoolbar": "Tags", - "interpreter": { - "hash": "3a9a0c422ff9f08d62211b9648017c63b0a26d2c935edc37ebb8453675d13bb5" - }, - "kernelspec": { - "display_name": "Python 3.7.11 64-bit ('tf2': conda)", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.11" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/examples/00_quick_start/rbm_movielens.ipynb b/examples/00_quick_start/rbm_movielens.ipynb index 8223c05b63..d76fa764c0 100644 --- a/examples/00_quick_start/rbm_movielens.ipynb +++ b/examples/00_quick_start/rbm_movielens.ipynb @@ -1,872 +1,872 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Copyright (c) Microsoft Corporation. All rights reserved.\n", - "\n", - "Licensed under the MIT License." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "inputHidden": false, - "outputHidden": false - }, - "source": [ - "# Movie recommender with multinomial RBM (Tensorflow, GPU)\n", - "\n", - "A Restricted Boltzmann Machine (RBM) is a generative neural network model typically used to perform unsupervised learning. The main task of an RBM is to learn the joint probability distribution $P(v,h)$, where $v$ are the visible units and $h$ the hidden ones. The hidden units represent latent variables while the visible units are clamped on the input data. Once the joint distribution is learnt, new examples are generated by sampling from it. \n", - "\n", - "In this notebook, we provide an example of how to utilize the RBM to perform user/item recommendations. In particular, we use as a case study the [movielens dataset](https://movielens.org), comprising user's ranking of movies on a scale of 1 to 5. \n", - "\n", - "This notebook provides a quick start, showing the basic steps needed to use and evaluate the algorithm. A detailed discussion of the RBM model together with a deeper analysis of the recommendation task is provided in the [RBM Deep Dive section](../02_model/rbm_deep_dive.ipynb). The RBM implementation presented here is based on the article by Ruslan Salakhutdinov, Andriy Mnih and Geoffrey Hinton [Restricted Boltzmann Machines for Collaborative Filtering](https://www.cs.toronto.edu/~rsalakhu/papers/rbmcf.pdf) with the exception that here we use multinomial units instead of the one-hot encoded used in the paper. \n", - "\n", - "### Advantages of RBM: \n", - "\n", - "The model generates ratings for a user/movie pair using a collaborative filtering based approach. While matrix factorization methods learn how to reproduce an instance of the user/item affinity matrix, the RBM learns the underlying probability distribution. This has several advantages: \n", - "\n", - "- Generalizability : the model generalize well to new examples.\n", - "- Stability in time: if the recommendation task is time-stationary, the model does not need to be trained often to accomodate new ratings/users. 
\n", - "- The tensorflow implementation presented here allows fast training on GPU " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 0 Global Settings and Import" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/pradjoshi/anaconda3/envs/reco-env/lib/python3.7/site-packages/papermill/iorw.py:50: FutureWarning: pyarrow.HadoopFileSystem is deprecated as of 2.0.0, please use pyarrow.fs.HadoopFileSystem instead.\n", - " from pyarrow import HadoopFileSystem\n" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Copyright (c) Recommenders contributors.\n", + "\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "inputHidden": false, + "outputHidden": false + }, + "source": [ + "# Movie recommender with multinomial RBM (Tensorflow, GPU)\n", + "\n", + "A Restricted Boltzmann Machine (RBM) is a generative neural network model typically used to perform unsupervised learning. The main task of an RBM is to learn the joint probability distribution $P(v,h)$, where $v$ are the visible units and $h$ the hidden ones. The hidden units represent latent variables while the visible units are clamped on the input data. Once the joint distribution is learnt, new examples are generated by sampling from it. \n", + "\n", + "In this notebook, we provide an example of how to utilize the RBM to perform user/item recommendations. In particular, we use as a case study the [movielens dataset](https://movielens.org), comprising user's ranking of movies on a scale of 1 to 5. \n", + "\n", + "This notebook provides a quick start, showing the basic steps needed to use and evaluate the algorithm. A detailed discussion of the RBM model together with a deeper analysis of the recommendation task is provided in the [RBM Deep Dive section](../02_model/rbm_deep_dive.ipynb). The RBM implementation presented here is based on the article by Ruslan Salakhutdinov, Andriy Mnih and Geoffrey Hinton [Restricted Boltzmann Machines for Collaborative Filtering](https://www.cs.toronto.edu/~rsalakhu/papers/rbmcf.pdf) with the exception that here we use multinomial units instead of the one-hot encoded used in the paper. \n", + "\n", + "### Advantages of RBM: \n", + "\n", + "The model generates ratings for a user/movie pair using a collaborative filtering based approach. While matrix factorization methods learn how to reproduce an instance of the user/item affinity matrix, the RBM learns the underlying probability distribution. This has several advantages: \n", + "\n", + "- Generalizability : the model generalize well to new examples.\n", + "- Stability in time: if the recommendation task is time-stationary, the model does not need to be trained often to accomodate new ratings/users. 
\n", + "- The tensorflow implementation presented here allows fast training on GPU " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 0 Global Settings and Import" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/pradjoshi/anaconda3/envs/reco-env/lib/python3.7/site-packages/papermill/iorw.py:50: FutureWarning: pyarrow.HadoopFileSystem is deprecated as of 2.0.0, please use pyarrow.fs.HadoopFileSystem instead.\n", + " from pyarrow import HadoopFileSystem\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "System version: 3.7.12 | packaged by conda-forge | (default, Oct 26 2021, 06:08:21) \n", + "[GCC 9.4.0]\n", + "Pandas version: 1.3.5\n", + "Tensorflow version: 2.7.0\n" + ] + } + ], + "source": [ + "# set the environment path to find Recommenders\n", + "import sys\n", + "\n", + "import pandas as pd\n", + "import numpy as np\n", + "import scrapbook as sb\n", + "import tensorflow as tf\n", + "tf.get_logger().setLevel('ERROR') # only show error messages\n", + "\n", + "from recommenders.models.rbm.rbm import RBM\n", + "from recommenders.datasets.python_splitters import numpy_stratified_split\n", + "from recommenders.datasets.sparse import AffinityMatrix\n", + "from recommenders.datasets import movielens\n", + "from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k\n", + "from recommenders.utils.timer import Timer\n", + "from recommenders.utils.plot import line_graph\n", + "\n", + "#For interactive mode only\n", + "%load_ext autoreload\n", + "%autoreload 2\n", + "%matplotlib inline\n", + "\n", + "print(\"System version: {}\".format(sys.version))\n", + "print(\"Pandas version: {}\".format(pd.__version__))\n", + "print(\"Tensorflow version: {}\".format(tf.__version__))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 1 Load Data \n", + "\n", + "Here we select the size of the movielens dataset. In this example we consider the 100k ratings datasets, provided by 943 users on 1682 movies. The data are imported in a pandas dataframe including the user ID, the item ID, the ratings and a timestamp denoting when a particular user rated a particular item. " + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "tags": [ + "parameters" + ] + }, + "outputs": [], + "source": [ + "# Select MovieLens data size: 100k, 1m, 10m, or 20m\n", + "MOVIELENS_DATA_SIZE = '100k'" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 4.81k/4.81k [00:00<00:00, 30.9kKB/s]\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
userIDmovieIDratingtimestamp
01962423.0881250949
11863023.0891717742
2223771.0878887116
3244512.0880606923
41663461.0886397596
\n", + "
" + ], + "text/plain": [ + " userID movieID rating timestamp\n", + "0 196 242 3.0 881250949\n", + "1 186 302 3.0 891717742\n", + "2 22 377 1.0 878887116\n", + "3 244 51 2.0 880606923\n", + "4 166 346 1.0 886397596" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = movielens.load_pandas_df(\n", + " size=MOVIELENS_DATA_SIZE,\n", + " header=['userID','movieID','rating','timestamp']\n", + ")\n", + "\n", + "data.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1.2 Split the data using the stratified splitter \n", + "\n", + "As a second step we generate the user/item affiity matrix and then split the data into train and test set. If you are familiar with training supervised learning model, here you will notice the first difference. In the former case, we cut off a certain proportion of training examples from dataset (e.g. images), here corresponding to users (or items), ending up with two matrices (train and test) having different row dimensions. Here we need to mantain the same matrix size for the train and test set, but the two will contain different amounts of ratings, see the [deep dive notebook](../02_model/rbm_deep_dive.ipynb) for more details. The affinity matrix reads " + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "inputHidden": false, + "outputHidden": false, + "tags": [ + "sparse_matrix" + ] + }, + "outputs": [], + "source": [ + "#to use standard names across the analysis \n", + "header = {\n", + " \"col_user\": \"userID\",\n", + " \"col_item\": \"movieID\",\n", + " \"col_rating\": \"rating\",\n", + " }\n", + "\n", + "#instantiate the sparse matrix generation \n", + "am = AffinityMatrix(df = data, **header)\n", + "\n", + "#obtain the sparse matrix \n", + "X, _, _ = am.gen_affinity_matrix()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The method also returns informations on the sparsness of the dataset and the size of the user/affinity matrix. The former is given by the ratio between the unrated elements and the total number of matrix elements. This is what makes a recommendation task hard: we try to predict 93% of the missing data with only 7% of information!\n", + "\n", + "We split the matrix using the default ration of 0.75, i.e. 75% of the ratings will constitute the train set." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "tags": [ + "split" + ] + }, + "outputs": [], + "source": [ + "Xtr, Xtst = numpy_stratified_split(X)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The splitter returns:\n", + "\n", + "- Xtr: a matrix containing the train set ratings \n", + "- Xtst: a matrix containing the test elements \n", + "\n", + "Note that the train/test matrices have exactly the same dimension, but different entries as it can be explicitly verified:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "train matrix size (943, 1682)\n", + "test matrix size (943, 1682)\n" + ] + } + ], + "source": [ + "print('train matrix size', Xtr.shape)\n", + "print('test matrix size', Xtst.shape)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [ + "model", + "train" + ] + }, + "source": [ + "## 2 Train the RBM model\n", + "\n", + "The model has been implemented as a Tensorflow (TF) class. 
TF does not support probabilistic models natively, so the implementation of the algorithm has a different structure than the one you may be used to seeing in popular supervised models. The class has been implemented in such a way that the TF session is hidden inside the `fit()` method and no explicit call is needed. The algorithm operates in three different steps: \n", + "\n", + "- Model initialization: This is where we tell TF how to build the computational graph. The main parameters to specify are the number of hidden units, the number of training epochs and the minibatch size. Other parameters can be optionally tweaked for experimentation and to achieve better performance, as explained in the [RBM Deep Dive section](../02_model/rbm_deep_dive.ipynb).\n", + "\n", + "- Model fit: This is where we train the model on the data. The method takes the training set matrix as its argument. Note that the model is trained **only** on the training set; the test set is used afterwards to assess the generalization accuracy of the trained model, which is useful to get an idea of how to set the hyperparameters. \n", + "\n", + "- Model prediction: This is where we generate ratings for the unseen items. Once the model has been trained and we are satisfied with its overall accuracy, we sample new ratings from the learned distribution. In particular, we extract the top_k (e.g. 10) most relevant recommendations according to some predefined score. The prediction is then returned in a dataframe format ready to be analysed and deployed. " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "inputHidden": false, + "outputHidden": false, + "tags": [ + "initialization" + ] + }, + "outputs": [], + "source": [ + "#First we initialize the model class\n", + "model = RBM(\n", + " possible_ratings=np.setdiff1d(np.unique(Xtr), np.array([0])),\n", + " visible_units=Xtr.shape[1],\n", + " hidden_units=600,\n", + " training_epoch=30,\n", + " minibatch_size=60,\n", + " keep_prob=0.9,\n", + " with_metrics=True\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that the first time the fit method is called it may take longer to return the result. This is due to the fact that TF needs to initialize the GPU session. You will notice that this is not the case when training the algorithm a second or subsequent time. 
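To make the three steps just described easier to follow while reading this patch, here is a condensed sketch that simply strings together the calls used in this notebook's own cells; the hyper-parameter values mirror the initialization cell above rather than being tuned recommendations, and `Xtr`, `Xtst` and `am` are the train/test matrices and the `AffinityMatrix` instance created earlier in the notebook.

```python
# Condensed sketch of the init / fit / predict flow described above,
# reusing this notebook's own objects (Xtr, Xtst from numpy_stratified_split,
# am from AffinityMatrix) and the hyper-parameters of the cell above.
import numpy as np
from recommenders.models.rbm.rbm import RBM

model = RBM(
    possible_ratings=np.setdiff1d(np.unique(Xtr), np.array([0])),  # rating values observed in the data
    visible_units=Xtr.shape[1],   # one visible unit per item
    hidden_units=600,
    training_epoch=30,
    minibatch_size=60,
    keep_prob=0.9,
    with_metrics=True,
)

model.fit(Xtr)                           # training uses only the train matrix
top_k = model.recommend_k_items(Xtst)    # sample ratings and keep the top-scored unseen items
top_k_df = am.map_back_sparse(top_k, kind="prediction")  # back to a userID/movieID/prediction dataframe
```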
" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "inputHidden": false, + "outputHidden": false, + "tags": [ + "training" + ] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Took 2.49 seconds for training.\n" + ] + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAVEAAAE9CAYAAACyQFFjAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAArsUlEQVR4nO3deXxU9b3/8dcnCwkQSAIkQNh3ZA0aRZEqbrggoFa94vKz2mptq7a29Vbba2lre9tb297ettZqW0RbxbaCFbV1LYIsimHf9y2sYd8JhM/vjxlshGyTzORMMu/n4zEPknPOzHwykDfne77f8/2auyMiIjWTFHQBIiL1mUJURKQWFKIiIrWgEBURqQWFqIhILShERURqISXoAqKpVatW3rlz56DLEJEGZs6cOTvdPae8fQ0qRDt37kxhYWHQZYhIA2NmGyrap+a8iEgtxDREzWycme0ws8UV7M80s9fMbIGZLTGzu8rsu9PMVoUfd8ayThGRmor1meh44KpK9n8FWOruA4FhwM/NrJGZtQDGAoOB84CxZpYd41pFRCIW0xB192nA7soOAZqZmQEZ4WNPAFcC77j7bnffA7xD5WEsIhKIoDuWfgNMBrYAzYD/cPeTZtYO2FTmuCKgXQD1iYhUKuiOpSuB+UAekA/8xsyaR/ICZnavmRWaWWFxcXH0KxQRqUTQIXoXMMlDVgPrgN7AZqBDmePah7edwd2fcfcCdy/IySl3GJeISMwEHaIbgcsAzKw10AtYC7wFDDez7HCH0vDwNhGRuBLTa6JmNoFQr3srMysi1OOeCuDuvwMeB8ab2SLAgG+5+87wcx8HPg6/1A/cvbIOKhGRQMQ0RN19TBX7txA6yyxv3zhgXCzqOuXYiVJ2HyqhbWbjWL6NiDRgQTfnAzVleTHfmrgo6DJEpB5L6BC9oFtL5qzfzdHjpUGXIiL1VEKHaGbjVHq0bsbcDXuCLkVE6qmEDlGAz/RoxQerdwZdhojUUwkfokO7t2KGQlREaijhQ3RQx2zWFR9iz6GSoEsRkXoo4UO0UUoSBZ2zmblmV9CliEg9lPAhCjC0Rw7TV+u+exGJnEKUcOfSqp24e9CliEg9oxAFeuRmcLz0JBt3Hw66FBGpZxSigJlxYffQ2aiISCQUomFDu7diukJURCKkEA0b2r0Vs9buovSkrouKSPUpRMNym6fTpnk6C4v2Bl2KiNQjCtEyhvZQk15EIqMQLWNoj1ZM1y2gIhIBhWgZg7u0YPHmfRw6diLoUkSknlCIltGkUQr92mUye51WIhGR6lGInubU3UsiItWhED2N7qMXkUgoRE/Tv10m2/cfY/v+o0GXIiL1gEL0NMlJxgVdW2qiZhGpFoVoOTReVESqSyFajs+Ex4tqajwRqYpCtBydWjYlLTWJldsPBl2KiMQ5hWgFhnbP4YNV6qUXkcopRCugVUBFpDoUohUY0q0lH6/fQ8mJk0GXIiJxTCFageymjeia05S5G/cEXYqIxDGFaCU0272IVEUhWomhPVrxga6LikglFKKVOKdTNqu3H2Dv4ZKgSxGROKUQrURaSjKf6ZHDW0u2BV2KiMQphWgVRuXnMXnBlqDLEJE4FdMQNbNxZrbDzBZXsP9hM5sffiw2s1IzaxHe95CZLQlvn2Bm6bGstSKX9s5lUdE+dhzQrE4icqZYn4mOB66qaKe7P+Hu+e6eDzwKTHX33WbWDngQKHD3fkAycEuMay1Xemoyl5/VmjcWbg3i7UUkzsU0RN19GlDdtTbGABPKfJ8CNDazFKAJEFibeqSa9CJSgbi4JmpmTQidsU4EcPfNwM+AjcBWYJ+7vx1UfUO7t2LDrsNs2n04qBJEJE7FRYgCI4EZ7r4bwMyygdFAFyAPaGpmt5f3RDO718wKzaywuDg2E4akJidxdb82OhsVkTPES4jewqeb8pcD69y92N2PA5OAIeU90d2fcfcCdy/IycmJWYGjBubxmkJURE4TeIiaWSZwMfBqmc0bgfPNrImZGXAZsCyI+k45t3ML9h05zsrtB4IsQ0TiTKyHOE0AZgG9zKzIzD5vZveZ2X1lDrseeNvdD53a4O4fAS8Dc4FF4TqfiWWtVUlKMq4d0JbJ83U2KiL/Zg1pCYyCggIvLCyM2esvKtrHV16cy9SHhxE6QRaRRGBmc9y9oLx9gTfn65N+7ZqTnGQsKNoXdCkiEicUohEwM0YOzFOTXkQ+oRCN0KiBeby+cAulJxvOZRARqTmFaIS652bQKiONj9btCroUEYkDCtEaGJWvMaMiEqIQrYGRA/N4c/E2LWInIgrRmmiX1ZhuORlal15EFKI1pcmaRQQUojV2Tf+2/Gv5Do6UlAZdiogESCFaQ60y0sjvkMV7y7cHXYqIBEghWgujNPBeJOEpRGvhyn5tmLVmF/uOHA+6FBEJiEK0FpqnpzKke0stqSySwBSitTS8TxumrtRQJ5FEpRCtpYEdslhYtDfoMkQkIArRWuraqil7Dx1n96GSoEsRkQAoRGspKcno1y6TBTobFUlICtEoGNghi4WbNFGzSCJSiEbBwPaZui4qkqAUolEwoEMWC4r20ZDWqxKR6lGIRkFeZjrgbN13NOhSRKSOKUSjwMwY2D6LBZv2Bl2KiNQxhWiUDGifpVVARRKQQjRKBnRQ55JIIlKIRsnA9lks2ryPk1oFVCShKESjpEXTRmQ2TmXtzkNBlyIidUghGkW6j14k8ShEoyg06F6dSyKJRCEaRaEe+r1BlyEidUghGkX92mWyfOsBjpdqPXqRRKEQjaKMtBQ6tGjMim0Hgi5FROqIQjTK1KQXSSwK0Sgb2D5T0+KJJBCFaJTpTFQkscQ0RM1snJntMLPFFex/2Mzmhx+LzazUzFqE92WZ2ctmttzMlpnZBbGsNVp6t23Ghl2HOVxyIuhSRKQOxPpMdDxwVUU73f0Jd89393zgUWCqu+8O7/4/4E137w0MBJbFuNaoSEtJpmfrDJZs2R90KSJSB2Iaou4+Ddhd5YEhY4AJAGaWCVwE/DH8OiXuvjcWNcbCAE2LJ5Iw4uKaqJk1IXTGOjG8qQtQDDxrZvPM7A9m1jSwAiM0QHcuiSSMuAhRYCQwo0xTPgU4G3jK3QcBh4BHynuimd1rZoVmVlhcXFw31VYhv4M6l0QSRbyE6C2Em/JhRUCRu38U/v5lQqF6Bnd/xt0L3L0gJycnxmVWT9ecDHYdLGHvYa1FL9LQBR6i4eufFwOvntrm7
tuATWbWK7zpMmBpAOXVSHKS0TevuZr0IgkgJZYvbmYTgGFAKzMrAsYCqQDu/rvwYdcDb7v76RNxPgC8YGaNgLXAXbGsNdpOTYt3Uc/4ODsWkdiIaYi6+5hqHDOe0FCo07fPBwqiXlQdGdA+k1fnbwm6DBGJscCb8w2VVv8USQwK0Rhpn92YEyedbVqLXqRBU4jGiJkxoH2mhjqJNHAK0Rga0F5rLok0dArRGMrvkMkCTYsn0qApRGPo1Jmou9aiF2moFKIx1CojjWbpqazfdTjoUkQkRhSiMRaajGRv0GWISIwoRGNsYIcs5mu8qEiDpRCNMU2LJ9KwKURjrH+7TJZt3c/M1TuDLkVEYkAhGmPN0lP5+U0DeWTSIv7fuNks3qyzUpGGRCFaB67u35Z3v34xl/XO5a7xH/PghHlsVI+9SIOgEK0jjVKSuHNIZ97/5jC65WQw+snpjH11MTsPHgu6NBGpBavuQHAzSwM+C3SmzBR67v6DmFRWAwUFBV5YWBh0GdWy6+AxfjNlNa/M28ydF3Tmixd3pUmjmM5MKCI1ZGZz3L3cqTkjORN9FRgNnCC05tGph9RAy4w0xo7sy2v3D2Vh0V5++uaKoEsSkRqI5NSnvbtXuIa81EyHFk347si+3Pz0LB67tg/JSRZ0SSISgUjORGeaWf+YVZLAurRqSpvm6Xy4dlfQpYhIhCIJ0aHAHDNbYWYLzWyRmS2MVWGJZtTAPF5boOVEROqbSEL0aqAHMJzQOvHXhv+UKBgxoC1vLtlGyYmTQZciIhGoMkTNrHn4ywMVPCQK8rIa0zO3GR+sKg66FBGJQHU6ll4kdNY5B3CgbM+HA11jUFdCGjmwLZMXbOGys1oHXYqIVFOVIeru14b/7BL7chLb1f3b8tO3VnCkpJTGjZKDLkdEqiGiO5bMLNvMzjOzi049YlVYImqVkUZ+hyzeW7496FJEpJqqHaJm9gVgGvAW8P3wn9+LTVmJS730IvVLJGeiXwXOBTa4+yXAIGBvLIpKZMP7tmHm6l3sP3o86FJEpBoiCdGj7n4UQvfRu/tyoFdsykpcmY1TuaBbS95eoia9SH0QSYgWmVkW8HfgHTN7FdgQi6IS3ciBeUxWk16kXqh2iLr79e6+192/BzwG/BG4LkZ1JbTLzspl3sY97NI0eSJxr1ohambJZrb81PfuPtXdJ7t7SexKS1xNGqVwSa9c/rF4W9CliEgVqhWi7l4KrDCzjjGuR8LUSy9SP0RyTTQbWGJm75nZ5FOPWBWW6D7TsxUrtx9g674jQZciIpWIZD7Rx2JWhZwhLSWZK/u04Y2FW/nCZ3RnrUi8iuRM9JrwtdBPHsA1sSpM1EsvUh9EEqJXlLPt6sqeYGbjzGyHmS2uYP/DZjY//FhsZqVm1qLM/mQzm2dmr0dQZ4NxQbeWbNl7lPU7tQqLSLyqzlR4XzKzRUCv8GTMpx7rgKomZR4PVLikiLs/4e757p4PPApMdffdZQ75KrCsqhobquQkY0T/NupgEolj1TkTfZHQ5MuTw3+eepzj7refOsjMsk9/ortPA3afvr0CY4AJZV6vPTAC+EM1n98gjcrP47WFClGReFVliLr7Pndf7+5j3H1Dmcfp4fheTYswsyaEzlgnltn8S+A/gYSe6n1Qh2wOHStl+bb9QZciIuWIaCq8KtRmmcqRwIxTwWxm1wI73H1OlW9qdq+ZFZpZYXFxw5sVPinJuHZgWzXpReJUNEPUa/HcWyjTlAcuBEaZ2XrgJeBSM/tzuW/q/oy7F7h7QU5OTi1KiF8jB+Qxcc5mNu0+HHQpInKaaIZojZhZJnAx8Oqpbe7+qLu3d/fOhAL2X2Wvvyaafu0yueeirlz35Axenb856HJEpIxIBttX5YzmvJlNAIYBrcysCBgLpAK4++/Ch10PvO3uGsdTic8P7cLgLi148KV5TFu5k++P7ktGWjT/+kSkJsy9+q1wMxsK9HD3Z80sB8hw93XhfS3K6WyqUwUFBV5YWBhkCTF3uOQE35+8lI/W7eJXYwYxoH1W0CWJNHhmNsfdC8rbF8nyIGOBbxEazwmhM8pPrlMGHaCJokmjFP7nxgF888pe3PXsxzw9dQ0nT9bmcrSI1EYk10SvB0YBhwDcfQvQLBZFSdWuHZDHq/dfyDtLt3Pns7PZsf9o0CWJJKRIQrTEQ21/BzCzprEpSaqrfXYTXrr3fM7umM2IX0/n2RnrKD6giZxF6lIkPRN/NbOngSwzuwe4G/h9bMqS6kpJTuKhK3oyrFcOf5q1gV+8s5L8DlmMzm/HlX1b0yw9NegSRRq0SDuWrgCGE+qJf8vd34lVYTWRCB1LVTlSUsq7y7bz6vzNfLR2Nxf1zGF0fh4X98ohLSU56PJE6qXKOpaqHaLh5vtRdy81s16EVvr8p7vHzdq+CtFP23OohH8u3sbf529m5fYD3HlBZx66omfQZYnUO1HpnQemAWlm1g54E7iD0CxNEqeymzbi1sEd+esXL+D1B4byx+nr2Hckbv7PE2kQIglRc/fDwA3AU+5+E9A3NmVJtLXPbsK5nbOZvmpn0KWINCgRhaiZXQDcBrwR3qaLbPXIJb1zmbJiR9BliDQokYTo1wgNtH/F3ZeYWVdgSkyqkpgY1jOXqSuLNThfJIqqPcQpvKbS1DLfrwUejEVREhsdWzaheXoKS7fup1+7zKDLEWkQIrnts8DMJpnZ3LLLhMSyOIm+Yb1ymbJcTXqRaImkOf8Cod74z/LpZUKkHrmkl66LikRTJHcsFbv75JhVInXi3C7ZrNp+kN2HSmjRtFHQ5YjUe5GE6Fgz+wOhtZQ+uUHb3SdFvSqJmbSUZM7v1pIPVhUzOr9d0OWI1HuRhOhdQG9CU+CdWjzOAYVoPXNJ+LqoQlSk9iIJ0XPdvVfMKpE6M6xXDj97ewWlJ53kpNqsLygikXQszTSzPjGrROpMXlZjcpulsbBob9CliNR71QpRMzNCi8nNN7MV4eFNizTEqf66uFcOU1Y0vCWmRepatUI0PBlzLtCD0FR4I4Fr0RCneuuSXrm8r6FOIrUWyTXRiUCuu38cq2Kk7pzTKZv1Ow9RfOAYOc3Sgi5HpN6K5JroYGCWma1Rc77+S01OYmiPVkxdqSa9SG1EciZ6ZcyqkEAMC9+9dOM57YMuRaTeimQCkg2xLETq3rCeOfzojWWcKD1JSnIkjRIROUW/OQkst3k6HVo0Zu7GvUGXIlJvKUQTnHrpRWpHIZrghmm8qEitKEQTXH6HbLbuO8K2fUeDLkWkXlKIJrjkJOOiHjlq0ovUkEJUuKR3jiZqFqkhhahwUY8cZq7ZRcmJk1UfLCKfohAVWmak0S0ng8L1u4MuRaTeUYgKoLWXRGpKISpA6Lro+xrqJBKxmIaomY0zsx1mtriC/Q+b2fzwY7GZlZpZCzPrYGZTzGypmS0xs6/Gsk6BfnmZ7DlcoomaRSIUyQQkNTEe+A3wfHk73f0J4AkAMxsJPOTuu80sDfiGu881s2bAHDN7x92XxrjehJWUZDxwaQ8+/1whzdNTuPys1lzaO5dzOmXrvnqR
SsQ0RN19mpl1rubhY4AJ4edtBbaGvz5gZsuAdoBCNIbuHNKZO87vxKLN+3hv2XZ+8PpSNu89wrCeOVx6Vmsu7plDZuPUoMsUiSsWmrQ+hm8QCtHX3b1fJcc0AYqA7u6++7R9nYFpQD9331/ZexUUFHhhYWGta5Z/27rvCO8t28F7y7bz8fo9XNi9JT++YYDWrJeEYmZz3L2gvH3x0k4bCcwoJ0AzCM2o/7WKAtTM7jWzQjMrLC5Wx0i0tc1szO3nd+LZu85j9ncuo0urDK791QfM3bgn6NJE4kK8hOgthJvyp5hZKqEAfcHdK1zb3t2fcfcCdy/IycmJcZmJrUmjFB65ujffH92Pe54rZPyMdcS6JSMS7wIPUTPLJLSS6KtlthnwR2CZu/8iqNqkfFf0ac2kLw/hr4VFPDBhHgePnQi6JJHAxHqI0wRgFtDLzIrM7PNmdp+Z3VfmsOuBt939UJltFwJ3AJeWGQJ1TSxrlch0atmUSV8eQkZaCqN/M52V2w8EXZJIIGLesVSX1LEUjL8VbuLH/1zO2JF9GJ3fLuhyRKKuso6lWI8TlQRwU0EH+uZl8uUX5lC4fg+PXduHRimBXykSqRP6ly5R0SevOZMfGMrWfUe5c9xs9h05HnRJInVCISpR0zw9lafvOIdebZpx41MzKdpzOOiSRGJOISpRlZxkfG9UX8ac15HPPjWTRUX7gi5JJKYUohITdw/twvdH9ePOZ2fzr+Xbgy5HJGbUsSQxc1W/NuQ2T+OLf5rDg5cd5Y7zOwVdkkjU6UxUYursjtm8fN8FPDt9HT/+xzJOnmw4Q+pEQCEqdaBTy6ZM/NIQ5mzYwwMvzePo8dKgSxKJGoWo1Inspo348xcGA3D/i/N0z700GApRqTPpqcn87835bNp9mNcWbg26HJGoUIhKnWqUksRPPtufx19fyp5DJUGXI1JrClGpc4M6ZjOif1t+9I9lQZciUmsKUQnEN6/sxaw1u5ixemfQpYjUikJUApGRlsLj1/Xl268siri3/khJaVzcm3+45ASlGrKV8BSiEphLe7emf7tMfvnuqmo/Z/v+o1z/2xnc9LuZHApwMmh3585xs3n8da2dmOgUohKosSP78rfCTSzZUvU99qt3HOCG385kVH4eA9tn8Z8TFwY2VOr9lcXsOljCawu2sHxbpesnSgOnEJVA5TRL41tX9eaRiYs4UXqywuMK1+/mlmc+5OtX9OTLw7rz+HX92LjrMH/4YF0dVhvi7vz87RV888pefO3yHnxv8hKNe01gClEJ3E0F7WmWnsL4mevL3f/Wkm188U9z+PnN+Xz2nPZAaMzpU7efzdPT1jJzTd12Tr21ZBvucFXfNow5ryN7Dx/nH4u21WkNEj8UohI4M+O/r+/Pk1NWs2n3p+cg/dOHG3js74sZf9d5XNzz06u5ts9uwi//I5+vvjSfLXuP1EmtpSedX7yzkm8M70lSkpGSnMT3RvXlv/+xjCMlup01ESlEJS50btWUey7qynf+vhh3x9154q3ljJu+jpfvG0L/9pnlPm9oj1bcfWEXvvTCXI6diH2IvbZgCxlpKVzSK/eTbed3bcmgjlk8NXVNzN9f4o9CVOLGPZ/pSvGBY0ycu5mHX17I9NW7ePm+C+jYskmlz7vv4q7kZabzvcmx7Sk/XnqSX767km8O70VoVe9/+/Y1Z/H8rPVnnElLw6cQlbiRmpzET27oz7cmLmT3oRIm3DOYlhlpVT7PzHjipoHMXreLl2ZvjFl9E+cUkZfVmCHdW52xLy+rMZ+/sAs/fENDnhKNQlTiysAOWfz9yxfyzB3n0KRR9ecMz0hL4ek7CvjpWytYsGlv1Os6dqKUX723im8M71XhMfdc1JVlWw/wwariqL+/xC+FqMSd/u0zSUmO/J9m99wM/vv6/nz5hbnsOngsqjW9NHsTvds255xO2RUek56azH+NOIvvv7aU45UM15KGRSEqDcpV/dowOj+P+1+cF7Xe8iMlpTw5ZTVfv6Jnlcde0ac1bTPTeX7Whqi8t8Q/hag0ON8Y3ou2Welc/9sZrNt5qNav9/ys9RR0zqZfu/JHCJRlZowd2Ycnp6xmZ5TPhiU+KUSlwUlOMn5+00BuP78TNz41kzcX13wC6ANHj/PMtLU8dHnVZ6GndM9txg2D2vHEmytq/L5SfyhEpUEyM24/vxPjPncuj7++jB+9UbPrlOOmr+finjn0aN0souc9eHkPpqzYEZNOLokvClFp0AZ2yOL1B4ayasdBbv39h2zff7Taz917uITxM9fx1ct7RPy+zdNTefjKXnzlxbmMfXUxL3y0gcL1u+NiCj+JLmtIEycUFBR4YWFh0GVIHDp50vnNlNX8+cMN/N8tg7igW8sqn/M/by5n7+Hj/PiG/jV6T3dn1tpdLN2yn5XbD7By+0FWbT9As/RUerZpRq/WGfRs3YwRA9pGNJxL6p6ZzXH3gnL3KUQlkXywqpiH/rKAu4d25u4Lu1B84Bjb9x9l+/5jbNt/lB37j7Jt/1G27z/K0i37efNrF5GX1Thq73/ypLN57xFW7TjAim0HmbpyBy0z0vjNmEFn3AUVLUV7DmNmtIviz5FoFKIiZWzZe4T7X5zLwqJ95DZLo3VmOq2bpdMmM53c5mm0aZ5O6+bpdM1pStvM2AbP0eOlXP/bmdw6uCN3nN8pKq9ZfOAYs9buYtaancxYvYu9h0tol92Efzw4NGZB3dBVFqJqQ0jCyctqzMQvDcEdkpKCDZX01GR+e9vZ3PjUTAZ1yKrWMKrTHTh6nFlrdjFzzS5mrdnF1n1HOK9LSy7s3pLPDelC99wMLv35+8zftJdBHSu+WUBqRiEqCcnMiJeTsi6tmjJ2VF/uf3Eurz0wlGbpqdV+7uLN+7hr/Mf0btOMC7q15Kc3DqBvXvMz7vi69byOvPDRRoVoDMS0d97MxpnZDjNbXMH+h81sfvix2MxKzaxFeN9VZrbCzFab2SOxrFMkaKMG5jGkeysenbSo2rPkz9mwh889O5vHR/flT58fzJeHdWdgh6xyb5m98Zz2vLVkG/sOa3RAtMV6iNN44KqKdrr7E+6e7+75wKPAVHffbWbJwJPA1UAfYIyZ9YlxrSKB+u61fVi94yAvVmMmqplrdnLP84U8cdNArurXtsrjW2akcUmvXCbNK4pGqVJGTEPU3acBu6t5+BhgQvjr84DV7r7W3UuAl4DRMShRJG6kpybz5G1n8/O3V7J0S8WL301ZvoMHXpzHk7ee/anJoaty2+BQk74hdSbHg7gYbG9mTQidsU4Mb2oHbCpzSFF4m0iD1i0ng7Ej+/CVF+dysJwlof+5aCsPv7yA399ZUK2xrmWd16UFALPXVfe8RqojLkIUGAnMcPeI/3bN7F4zKzSzwuJizeMo9d/o/Hac37UF3z7t+ujEOUV8d/ISnrv7PM6uQQeRmX3SwSTREy8hegv/bsoDbAY6lPm+fXjbGdz9GXcvcPeCnJyc8g4RqXfGjuzLyu0HeOnjUIPszx9u4Gdvr2DCPYPpmxf5MKhTPnt2e95fsSPq860mssBD1Mw
ygYuBV8ts/hjoYWZdzKwRoZCdHER9IkE4dX30ibdW8P3XlvD0tDX85d4L6J4b2UQop8tsksrwvm14eY46mKIl1kOcJgCzgF5mVmRmnzez+8zsvjKHXQ+87e6fTPzo7ieA+4G3gGXAX919SSxrFYk3p66Pzl63m79+seoF+6rr1sEdeXH2Rk6eVAdTNOi2T5EE4+5c86vpfPua3nymhy6BVUdlt30G3pwXkbplZtw2uCMvqoMpKhSiIglodH4eM1bvjGh+1do6eryUkhMNbwE/hahIAmqWnsqIAXn89eNNVR8cJQ9MmMed42Y3uJVQFaIiCeq2wR2ZMHsjpVV0MB07Ucqv3lvF+Bnravxe01YWs2LbAdJSk/jh60tr/DrxSCEqkqD6tcskp1kaU1fuqPCY2et2c83/fcDCor38+l+rWb6t4ttRK3K89CSPv76U74w4i1+NGcT01TtrdD32eOlJvvPKIt5fUXG9QVCIiiSw2wZ34oUPzwy0fUeO8+ikRTw4YR4PX9mLP9x5Lt8Y3otHJi6q8sz1dC98uIHc5mkM79Oa5ump/OHOc/nFOysiuv306PFS7vvTHN5fUczfCuNrjKtCVCSBXTuwLXM27qFoz2EgNPzpjYVbueIXU0lOgre/ftEns0Tdcm4HGqUk8fys9dV+/T2HSvj1v1bz3Wv7fjKrfpdWTfnFzfnc/+LcT963MoeOneDu8R/TuFEyE780hGmrijl2ojTyHzZGFKIiCaxJoxSuy2/HXz7exJa9R/jCc4X877sr+e1tZ/PD6/rTvMwE0UlJxo9v6M+v/7WazXuPVOv1//fdlYwY0JZebT59p9VFPXP44sXduOf5ORwuOXOilVP2HT7O7X/8iI4tmvB/twyiTWY6PVs346O18TOJikJUJMHdOrgjz81cz4hffcCA9lm88eBQCjq3KPfYbjkZ3DWkM4/9fXGVU+ot37afNxZu5aHLe5a7/+4LO9Mvrznf/NuCcl9r58Fj3PL7Dzm7YzY/vqE/yeGlXC4/qzXvLtse4U/5b9v3H434kkRlFKIiCa5n62Z87fKe/O2+IXz18h6kpSRXevwXL+7G5j1HeH3h1gqPcXcef30pD1zaneymjco9xsz44fX92LbvKL/+1+pP7duy9wg3Pz2L4X1a818jzvrUAntX9Mnl3aXbazwv6n1/nsP01Ttr9NzyKERFhLuHhha0q45GKUn85LP9+cHrS9l7uKTcY95Zup0d+49xWxUrmKalJPO7O87hpdkbeXPxNgDW7zzEzU/PYsy5HXnoip5nrFDaLSeDRilJLN0a+UiBtcUHKdpzhAsjnIu1MgpREYnYoI7ZjOjflh+9seyMfcdOlPLDN5bx3ZF9SC1nvafT5TZL5+k7CvjOK4uYvGALtzzzIV8e1p17Lupa7vFmFmrSL418qNMr8zYzemBeuetQ1ZRCVERq5JtX9mLmml3MOK1pPG76enq2zohocpP+7TMZO6ovX//LfB69pje3Du5Y6fGX94n8uujJk86kuZu54ez2ET2vKgpREamRjLQUHr+uL99+ZRFHj4eGHO3Yf5Rnpq3hOyMiX1dy1MA85jx2BaPzq14JqKBTNpv2HGbrvuqNEgCYvX43zdJT6JPXPOLaKqMQFZEau7R3a/q3y+SX764C4Im3VnBzQQe6tGpao9fLbJxa9UFASnISw3rm8N6y6jfpJ80t4oazo79Um0JURGpl7Mi+vDxnEy/N3sj7K4u5/9LudfK+kTTpjx4v5a0l26t1lhsphaiI1EpOszT+86rePDJpEd8c3pNm6dU7m6yti3rmULh+D4fKWRX1dG8v3c7ADlm0bp4e9TpSov6KIpJwbjqnPempyYzo37bO3rN5eiqDOmbxwaqdXNWvTaXHTppbxGdj0JQHnYmKSBSYGaMG5n1yV1Fdqc7dSzsOHGXuhj0M71N50NaUQlRE6q3LzsplyvIdld7GOXn+Fob3bUPjRpXfiVVTClERqbfaZzcht3k68zbuqfCYiXM3x6RX/hSFqIjUa1eclcs7FTTpl23dz77DJZzfJXq3eZ5OISoi9drlfVrz7tLyQ/SVeZu5/ux2JMXwWq1CVETqtX55mRw4eoK1xQc/tf1E6Un+Pm8z1w+K7m2ep1OIiki9lpRkXHZW6zPuXpqxZhdtsxpXe3aqGr9/TF9dRKQOXNHnzOuisRwbWpZCVETqvSHdWrF0y372HArNb3rw2An+tXwH1w7Ii/l7K0RFpN5LT01mSLeWTAkvp/zPRVs5v2tLWlQwq340KURFpEEoOyHJpLmb66QpDwpREWkgLu2dywerdrJu5yGWb9vPJb1z6+R9FaIi0iC0ykijZ+tmPDJxISMGtK1ywb1oUYiKSINx+Vmt+Wjd7qgvAVIZTYUnIg3G1f3a8OHaXQzqkFVn76kQFZEGo3Orpjx393l1+p4xbc6b2Tgz22Fmiys5ZpiZzTezJWY2tcz2h8LbFpvZBDOL/pTUIiK1FOtrouOBqyraaWZZwG+BUe7eF7gpvL0d8CBQ4O79gGTglhjXKiISsZiGqLtPA3ZXcsitwCR33xg+vuzNrylAYzNLAZoAW2JWqIhIDQXdO98TyDaz981sjpn9PwB33wz8DNgIbAX2ufvbAdYpIlKuoEM0BTgHGAFcCTxmZj3NLBsYDXQB8oCmZnZ7eS9gZveaWaGZFRYXF9dV3SIiQPAhWgS85e6H3H0nMA0YCFwOrHP3Ync/DkwChpT3Au7+jLsXuHtBTk5OnRUuIgLBh+irwFAzSzGzJsBgYBmhZvz5ZtbEzAy4LLxdRCSuxHScqJlNAIYBrcysCBgLpAK4++/cfZmZvQksBE4Cf3D3xeHnvgzMBU4A84BnYlmriEhNmHvFS43WNwUFBV5YWBh0GSLSwJjZHHcvKG9f0M15EZF6TSEqIlILDao5b2bFwIYIn9YK2BmDcqKtvtQJ9adW1Rl99aXWSOvs5O7lDv9pUCFaE2ZWWNG1jnhSX+qE+lOr6oy++lJrNOtUc15EpBYUoiIitaAQrT/jT+tLnVB/alWd0Vdfao1anQl/TVREpDZ0JioiUgsJHaJmdpWZrTCz1Wb2SND1VMTM1pvZovAKAHF1S1Z5qxeYWQsze8fMVoX/zA6yxnBN5dX5PTPbHP5c55vZNUHWGK6pg5lNMbOl4ZUdvhreHlefaSV1xtVnambpZjbbzBaE6/x+eHsXM/so/Lv/FzNrVOP3SNTmvJklAyuBKwjNJvUxMMbdlwZaWDnMbD2hWf7jbvydmV0EHASeD69CgJn9FNjt7j8J/+eU7e7fisM6vwccdPefBVlbWWbWFmjr7nPNrBkwB7gO+Bxx9JlWUufNxNFnGp7AqKm7HzSzVGA68FXg64QmhH/JzH4HLHD3p2ryHol8JnoesNrd17p7CfASoTlMJQIVrF4wGngu/PVzhH65AlWNVRbigrtvdfe54a8PEJq9rB1x9plWUmdc8ZCD4W9Tww8HLgVeDm+v1eeZyCHaDthU5vsi4vAfQZgDb4dn/7836GKqobW7bw1/vQ1oHWQxVbjfzBaGm/uBX3Yoy8
w6A4OAj4jjz/S0OiHOPlMzSzaz+cAO4B1gDbDX3U+ED6nV734ih2h9MtTdzwauBr4SbprWCx66XhSv14yeAroB+YSWofl5oNWUYWYZwETga+6+v+y+ePpMy6kz7j5Tdy9193ygPaEWaO9ovn4ih+hmoEOZ79uHt8Wd8JpTpxbye4XQP4R4tj18zezUtbMdVRwfCHffHv4FOwn8njj5XMPX7iYCL7j7pPDmuPtMy6szXj9TAHffC0wBLgCywotgQi1/9xM5RD8GeoR76RoRWpJ5csA1ncHMmoYv3GNmTYHhwOLKnxW4ycCd4a/vJLSCQdw5FUph1xMHn2u4I+SPwDJ3/0WZXXH1mVZUZ7x9pmaWY6Gl2TGzxoQ6kpcRCtMbw4fV6vNM2N55gPDwi18SWtd+nLv/KNiKzmRmXQmdfUJoJYIX46nOsqsXANsJrV7wd+CvQEdCs2rd7O6BdupUUOcwQs1OB9YDXyxz3TEQZjYU+ABYRGi1B4BvE7reGDefaSV1jiGOPlMzG0Co4yiZ0EnjX939B+Hfq5eAFoRWzrjd3Y/V6D0SOURFRGorkZvzIiK1phAVEakFhaiISC0oREVEakEhKiJSCwpRkQqY2TAzez3oOiS+KURFRGpBISr1npndHp4zcr6ZPR2ecOKgmf1veA7J98wsJ3xsvpl9GJ4g45VTE2SYWXczezc87+RcM+sWfvkMM3vZzJab2QvhO3VEPqEQlXrNzM4C/gO4MDzJRClwG9AUKHT3vsBUQncoATwPfMvdBxC62+bU9heAJ919IDCE0OQZEJqd6GtAH6ArcGGMfySpZ1KqPkQkrl0GnAN8HD5JbExoco6TwF/Cx/wZmGRmmUCWu08Nb38O+Ft4boJ27v4KgLsfBQi/3mx3Lwp/Px/oTGhiXxFAISr1nwHPufujn9po9thpx9X0/uay91OXot8ZOY2a81LfvQfcaGa58MlaRJ0I/ds+NUvPrcB0d98H7DGzz4S33wFMDc/MXmRm14VfI83MmtTlDyH1l/5XlXrN3Zea2X8Rmvk/CTgOfAU4BJwX3reD0HVTCE179rtwSK4F7gpvvwN42sx+EH6Nm+rwx5B6TLM4SYNkZgfdPSPoOqThU3NeRKQWdCYqIlILOhMVEakFhaiISC0oREVEakEhKiJSCwpREZFaUIiKiNTC/wdn2YE5l/9kaQAAAABJRU5ErkJggg==", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# Model Fit\n", + "with Timer() as train_time:\n", + " model.fit(Xtr)\n", + "\n", + "print(\"Took {:.2f} seconds for training.\".format(train_time.interval))\n", + "\n", + "# Plot the train RMSE as a function of the epochs\n", + "line_graph(values=model.rmse_train, labels='train', x_name='epoch', y_name='rmse_train')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "During training, we can optionlly evauate the root mean squared error to have an idea of how the learning is proceeding. We would generally like to see this quantity decreasing as a function of the learning epochs. To visualise this choose `with_metrics = True` in the `RBM()` model function. \n", + "\n", + "Once the model has been trained, we can predict new ratings on the test set." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "tags": [ + "top_k" + ] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Took 0.23 seconds for prediction.\n" + ] + } + ], + "source": [ + "# number of top score elements to be recommended \n", + "K = 10\n", + "\n", + "# Model prediction on the test set Xtst.\n", + "with Timer() as prediction_time:\n", + " top_k = model.recommend_k_items(Xtst)\n", + "\n", + "print(\"Took {:.2f} seconds for prediction.\".format(prediction_time.interval))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`top_k` returns the first K elements having the highest recommendation score. Here the recommendation score is evaluated by multiplying the predicted rating by its probability, i.e. the confidence the algorithm has about its output. So if we have two items both with predicted ratings 5, but one with probability 0.5 and the other 0.9, the latter will be considered more relevant. In order to inspect the prediction and use the evaluation metrics in this repository, we convert both top_k and Xtst to pandas dataframe format:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "top_k_df = am.map_back_sparse(top_k, kind = 'prediction')\n", + "test_df = am.map_back_sparse(Xtst, kind = 'ratings')" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
userIDmovieIDprediction
011004.881824
11654.822650
211294.672100
3111044.898961
4111234.664860
5114184.611925
6114274.722356
7115214.738353
8115834.569103
9115464.890738
\n", + "
" + ], + "text/plain": [ + " userID movieID prediction\n", + "0 1 100 4.881824\n", + "1 1 65 4.822650\n", + "2 1 129 4.672100\n", + "3 1 1104 4.898961\n", + "4 1 1123 4.664860\n", + "5 1 1418 4.611925\n", + "6 1 1427 4.722356\n", + "7 1 1521 4.738353\n", + "8 1 1583 4.569103\n", + "9 1 1546 4.890738" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "top_k_df.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4 Evaluation metrics \n", + "\n", + "Here we evaluate the performance of the algorithm using the metrics provided in the `PythonRankingEvaluation` class. Note that the following metrics take into account only the first K elements, therefore their value may be different from the one displayed from the `model.fit()` method. " + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "tags": [ + "ranking" + ] + }, + "outputs": [], + "source": [ + "def ranking_metrics(\n", + " data_size,\n", + " data_true,\n", + " data_pred,\n", + " K\n", + "):\n", + "\n", + " eval_map = map_at_k(data_true, data_pred, col_user=\"userID\", col_item=\"movieID\", \n", + " col_rating=\"rating\", col_prediction=\"prediction\", \n", + " relevancy_method=\"top_k\", k= K)\n", + "\n", + " eval_ndcg = ndcg_at_k(data_true, data_pred, col_user=\"userID\", col_item=\"movieID\", \n", + " col_rating=\"rating\", col_prediction=\"prediction\", \n", + " relevancy_method=\"top_k\", k= K)\n", + "\n", + " eval_precision = precision_at_k(data_true, data_pred, col_user=\"userID\", col_item=\"movieID\", \n", + " col_rating=\"rating\", col_prediction=\"prediction\", \n", + " relevancy_method=\"top_k\", k= K)\n", + "\n", + " eval_recall = recall_at_k(data_true, data_pred, col_user=\"userID\", col_item=\"movieID\", \n", + " col_rating=\"rating\", col_prediction=\"prediction\", \n", + " relevancy_method=\"top_k\", k= K)\n", + "\n", + " \n", + " df_result = pd.DataFrame(\n", + " { \"Dataset\": data_size,\n", + " \"K\": K,\n", + " \"MAP\": eval_map,\n", + " \"nDCG@k\": eval_ndcg,\n", + " \"Precision@k\": eval_precision,\n", + " \"Recall@k\": eval_recall,\n", + " }, \n", + " index=[0]\n", + " )\n", + " \n", + " return df_result" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DatasetKMAPnDCG@kPrecision@kRecall@k
0mv 100k100.1408280.4111240.3362670.212256
\n", + "
" + ], + "text/plain": [ + " Dataset K MAP nDCG@k Precision@k Recall@k\n", + "0 mv 100k 10 0.140828 0.411124 0.336267 0.212256" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "eval_100k = ranking_metrics(\n", + " data_size=\"mv 100k\",\n", + " data_true=test_df,\n", + " data_pred=top_k_df,\n", + " K=10\n", + ")\n", + "\n", + "eval_100k" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "application/scrapbook.scrap.json+json": { + "data": 0.14082811192026132, + "encoder": "json", + "name": "map", + "version": 1 + } + }, + "metadata": { + "scrapbook": { + "data": true, + "display": false, + "name": "map" + } + }, + "output_type": "display_data" + }, + { + "data": { + "application/scrapbook.scrap.json+json": { + "data": 0.41112362614927883, + "encoder": "json", + "name": "ndcg", + "version": 1 + } + }, + "metadata": { + "scrapbook": { + "data": true, + "display": false, + "name": "ndcg" + } + }, + "output_type": "display_data" + }, + { + "data": { + "application/scrapbook.scrap.json+json": { + "data": 0.3362672322375398, + "encoder": "json", + "name": "precision", + "version": 1 + } + }, + "metadata": { + "scrapbook": { + "data": true, + "display": false, + "name": "precision" + } + }, + "output_type": "display_data" + }, + { + "data": { + "application/scrapbook.scrap.json+json": { + "data": 0.2122560190189148, + "encoder": "json", + "name": "recall", + "version": 1 + } + }, + "metadata": { + "scrapbook": { + "data": true, + "display": false, + "name": "recall" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "# Record results with papermill for tests\n", + "sb.glue(\"map\", eval_100k['MAP'][0])\n", + "sb.glue(\"ndcg\", eval_100k['nDCG@k'][0])\n", + "sb.glue(\"precision\", eval_100k['Precision@k'][0])\n", + "sb.glue(\"recall\", eval_100k['Recall@k'][0])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5 Saving the model and Loading a pre-trained model\n", + "Trained model checkpoint can be saved to a specified directory using the `save` function." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "model.save(file_path='./models/rbm_model.ckpt')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Pre-trained RBM model can be loaded using the `load` function, which can be used to resume the training." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize the model class\n", + "model = RBM(\n", + " possible_ratings=np.setdiff1d(np.unique(Xtr), np.array([0])),\n", + " visible_units=Xtr.shape[1],\n", + " hidden_units=600,\n", + " training_epoch=30,\n", + " minibatch_size=60,\n", + " keep_prob=0.9,\n", + " with_metrics=True\n", + ")\n", + "\n", + "# Load the model checkpoint\n", + "model.load(file_path='./models/rbm_model.ckpt')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "celltoolbar": "Tags", + "interpreter": { + "hash": "67434505f7f08e5031eee7757e853265d2f43dd6b5963eb755a27835ec0e1503" + }, + "kernel_info": { + "name": "python3" + }, + "kernelspec": { + "display_name": "tf37", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.12" + }, + "nteract": { + "version": "0.12.3" + } }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "System version: 3.7.12 | packaged by conda-forge | (default, Oct 26 2021, 06:08:21) \n", - "[GCC 9.4.0]\n", - "Pandas version: 1.3.5\n", - "Tensorflow version: 2.7.0\n" - ] - } - ], - "source": [ - "# set the environment path to find Recommenders\n", - "import sys\n", - "\n", - "import pandas as pd\n", - "import numpy as np\n", - "import scrapbook as sb\n", - "import tensorflow as tf\n", - "tf.get_logger().setLevel('ERROR') # only show error messages\n", - "\n", - "from recommenders.models.rbm.rbm import RBM\n", - "from recommenders.datasets.python_splitters import numpy_stratified_split\n", - "from recommenders.datasets.sparse import AffinityMatrix\n", - "from recommenders.datasets import movielens\n", - "from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k\n", - "from recommenders.utils.timer import Timer\n", - "from recommenders.utils.plot import line_graph\n", - "\n", - "#For interactive mode only\n", - "%load_ext autoreload\n", - "%autoreload 2\n", - "%matplotlib inline\n", - "\n", - "print(\"System version: {}\".format(sys.version))\n", - "print(\"Pandas version: {}\".format(pd.__version__))\n", - "print(\"Tensorflow version: {}\".format(tf.__version__))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 1 Load Data \n", - "\n", - "Here we select the size of the movielens dataset. In this example we consider the 100k ratings datasets, provided by 943 users on 1682 movies. The data are imported in a pandas dataframe including the user ID, the item ID, the ratings and a timestamp denoting when a particular user rated a particular item. " - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "tags": [ - "parameters" - ] - }, - "outputs": [], - "source": [ - "# Select MovieLens data size: 100k, 1m, 10m, or 20m\n", - "MOVIELENS_DATA_SIZE = '100k'" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 4.81k/4.81k [00:00<00:00, 30.9kKB/s]\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
userIDmovieIDratingtimestamp
01962423.0881250949
11863023.0891717742
2223771.0878887116
3244512.0880606923
41663461.0886397596
\n", - "
" - ], - "text/plain": [ - " userID movieID rating timestamp\n", - "0 196 242 3.0 881250949\n", - "1 186 302 3.0 891717742\n", - "2 22 377 1.0 878887116\n", - "3 244 51 2.0 880606923\n", - "4 166 346 1.0 886397596" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data = movielens.load_pandas_df(\n", - " size=MOVIELENS_DATA_SIZE,\n", - " header=['userID','movieID','rating','timestamp']\n", - ")\n", - "\n", - "data.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 1.2 Split the data using the stratified splitter \n", - "\n", - "As a second step we generate the user/item affiity matrix and then split the data into train and test set. If you are familiar with training supervised learning model, here you will notice the first difference. In the former case, we cut off a certain proportion of training examples from dataset (e.g. images), here corresponding to users (or items), ending up with two matrices (train and test) having different row dimensions. Here we need to mantain the same matrix size for the train and test set, but the two will contain different amounts of ratings, see the [deep dive notebook](../02_model/rbm_deep_dive.ipynb) for more details. The affinity matrix reads " - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "inputHidden": false, - "outputHidden": false, - "tags": [ - "sparse_matrix" - ] - }, - "outputs": [], - "source": [ - "#to use standard names across the analysis \n", - "header = {\n", - " \"col_user\": \"userID\",\n", - " \"col_item\": \"movieID\",\n", - " \"col_rating\": \"rating\",\n", - " }\n", - "\n", - "#instantiate the sparse matrix generation \n", - "am = AffinityMatrix(df = data, **header)\n", - "\n", - "#obtain the sparse matrix \n", - "X, _, _ = am.gen_affinity_matrix()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The method also returns informations on the sparsness of the dataset and the size of the user/affinity matrix. The former is given by the ratio between the unrated elements and the total number of matrix elements. This is what makes a recommendation task hard: we try to predict 93% of the missing data with only 7% of information!\n", - "\n", - "We split the matrix using the default ration of 0.75, i.e. 75% of the ratings will constitute the train set." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "tags": [ - "split" - ] - }, - "outputs": [], - "source": [ - "Xtr, Xtst = numpy_stratified_split(X)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The splitter returns:\n", - "\n", - "- Xtr: a matrix containing the train set ratings \n", - "- Xtst: a matrix containing the test elements \n", - "\n", - "Note that the train/test matrices have exactly the same dimension, but different entries as it can be explicitly verified:" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "train matrix size (943, 1682)\n", - "test matrix size (943, 1682)\n" - ] - } - ], - "source": [ - "print('train matrix size', Xtr.shape)\n", - "print('test matrix size', Xtst.shape)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "tags": [ - "model", - "train" - ] - }, - "source": [ - "## 2 Train the RBM model\n", - "\n", - "The model has been implemented as a Tensorflow (TF) class. 
TF does not support probabilistic models natively, so the implementation of the algorithm has a different structure than the one you may be used to see in popular supervised models. The class has been implemented in such a way that the TF session is hidden inside the `fit()` method and no explicit call is needed. The algorithm operates in three different steps: \n", - "\n", - "- Model initialization: This is where we tell TF how to build the computational graph. The main parameters to specify are the number of hidden units, the number of training epochs and the minibatch size. Other parameters can be optionally tweaked for experimentation and to achieve better performance, as explained in the [RBM Deep Dive section](../02_model/rbm_deep_dive.ipynb).\n", - "\n", - "- Model fit: This is where we train the model on the data. The method takes two arguments: the training and test set matrices. Note that the model is trained **only** on the training set, the test set is used to display the generalization accuracy of the trained model, useful to have an idea of how to fix the hyper parameters. \n", - "\n", - "- Model prediction: This is where we generate ratings for the unseen items. Once the model has been trained and we are satisfied with its overall accuracy, we sample new ratings from the learned distribution. In particular, we extract the top_k (e.g. 10) most relevant recommendations according to some predefined score. The prediction is then returned in a dataframe format ready to be analysed and deployed. " - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "inputHidden": false, - "outputHidden": false, - "tags": [ - "initialization" - ] - }, - "outputs": [], - "source": [ - "#First we initialize the model class\n", - "model = RBM(\n", - " possible_ratings=np.setdiff1d(np.unique(Xtr), np.array([0])),\n", - " visible_units=Xtr.shape[1],\n", - " hidden_units=600,\n", - " training_epoch=30,\n", - " minibatch_size=60,\n", - " keep_prob=0.9,\n", - " with_metrics=True\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note that the first time the fit method is called it may take longer to return the result. This is due to the fact that TF needs to initialized the GPU session. You will notice that this is not the case when training the algorithm the second or more times. 
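For reference, the three steps just described map onto the calls that the next cells execute one at a time. The block below is only a compact recap of that flow, reusing the `Xtr`, `Xtst` and `am` objects created earlier; it introduces nothing beyond the calls shown in this notebook.

```python
# 1) Initialization: build the computational graph (hidden units, epochs, minibatch size).
model = RBM(
    possible_ratings=np.setdiff1d(np.unique(Xtr), np.array([0])),
    visible_units=Xtr.shape[1],
    hidden_units=600,
    training_epoch=30,
    minibatch_size=60,
    keep_prob=0.9,
    with_metrics=True,
)

# 2) Fit: the model is trained on the train matrix only.
model.fit(Xtr)

# 3) Prediction: sample the top-k items for the users in the test matrix and
#    map the sparse result back to a (userID, movieID, prediction) dataframe.
top_k = model.recommend_k_items(Xtst)
top_k_df = am.map_back_sparse(top_k, kind="prediction")
```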
" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "inputHidden": false, - "outputHidden": false, - "tags": [ - "training" - ] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Took 2.49 seconds for training.\n" - ] - }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAVEAAAE9CAYAAACyQFFjAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAArsUlEQVR4nO3deXxU9b3/8dcnCwkQSAIkQNh3ZA0aRZEqbrggoFa94vKz2mptq7a29Vbba2lre9tb297ettZqW0RbxbaCFbV1LYIsimHf9y2sYd8JhM/vjxlshGyTzORMMu/n4zEPknPOzHwykDfne77f8/2auyMiIjWTFHQBIiL1mUJURKQWFKIiIrWgEBURqQWFqIhILShERURqISXoAqKpVatW3rlz56DLEJEGZs6cOTvdPae8fQ0qRDt37kxhYWHQZYhIA2NmGyrap+a8iEgtxDREzWycme0ws8UV7M80s9fMbIGZLTGzu8rsu9PMVoUfd8ayThGRmor1meh44KpK9n8FWOruA4FhwM/NrJGZtQDGAoOB84CxZpYd41pFRCIW0xB192nA7soOAZqZmQEZ4WNPAFcC77j7bnffA7xD5WEsIhKIoDuWfgNMBrYAzYD/cPeTZtYO2FTmuCKgXQD1iYhUKuiOpSuB+UAekA/8xsyaR/ICZnavmRWaWWFxcXH0KxQRqUTQIXoXMMlDVgPrgN7AZqBDmePah7edwd2fcfcCdy/IySl3GJeISMwEHaIbgcsAzKw10AtYC7wFDDez7HCH0vDwNhGRuBLTa6JmNoFQr3srMysi1OOeCuDuvwMeB8ab2SLAgG+5+87wcx8HPg6/1A/cvbIOKhGRQMQ0RN19TBX7txA6yyxv3zhgXCzqOuXYiVJ2HyqhbWbjWL6NiDRgQTfnAzVleTHfmrgo6DJEpB5L6BC9oFtL5qzfzdHjpUGXIiL1VEKHaGbjVHq0bsbcDXuCLkVE6qmEDlGAz/RoxQerdwZdhojUUwkfokO7t2KGQlREaijhQ3RQx2zWFR9iz6GSoEsRkXoo4UO0UUoSBZ2zmblmV9CliEg9lPAhCjC0Rw7TV+u+exGJnEKUcOfSqp24e9CliEg9oxAFeuRmcLz0JBt3Hw66FBGpZxSigJlxYffQ2aiISCQUomFDu7diukJURCKkEA0b2r0Vs9buovSkrouKSPUpRMNym6fTpnk6C4v2Bl2KiNQjCtEyhvZQk15EIqMQLWNoj1ZM1y2gIhIBhWgZg7u0YPHmfRw6diLoUkSknlCIltGkUQr92mUye51WIhGR6lGInubU3UsiItWhED2N7qMXkUgoRE/Tv10m2/cfY/v+o0GXIiL1gEL0NMlJxgVdW2qiZhGpFoVoOTReVESqSyFajs+Ex4tqajwRqYpCtBydWjYlLTWJldsPBl2KiMQ5hWgFhnbP4YNV6qUXkcopRCugVUBFpDoUohUY0q0lH6/fQ8mJk0GXIiJxTCFageymjeia05S5G/cEXYqIxDGFaCU0272IVEUhWomhPVrxga6LikglFKKVOKdTNqu3H2Dv4ZKgSxGROKUQrURaSjKf6ZHDW0u2BV2KiMQphWgVRuXnMXnBlqDLEJE4FdMQNbNxZrbDzBZXsP9hM5sffiw2s1IzaxHe95CZLQlvn2Bm6bGstSKX9s5lUdE+dhzQrE4icqZYn4mOB66qaKe7P+Hu+e6eDzwKTHX33WbWDngQKHD3fkAycEuMay1Xemoyl5/VmjcWbg3i7UUkzsU0RN19GlDdtTbGABPKfJ8CNDazFKAJEFibeqSa9CJSgbi4JmpmTQidsU4EcPfNwM+AjcBWYJ+7vx1UfUO7t2LDrsNs2n04qBJEJE7FRYgCI4EZ7r4bwMyygdFAFyAPaGpmt5f3RDO718wKzaywuDg2E4akJidxdb82OhsVkTPES4jewqeb8pcD69y92N2PA5OAIeU90d2fcfcCdy/IycmJWYGjBubxmkJURE4TeIiaWSZwMfBqmc0bgfPNrImZGXAZsCyI+k45t3ML9h05zsrtB4IsQ0TiTKyHOE0AZgG9zKzIzD5vZveZ2X1lDrseeNvdD53a4O4fAS8Dc4FF4TqfiWWtVUlKMq4d0JbJ83U2KiL/Zg1pCYyCggIvLCyM2esvKtrHV16cy9SHhxE6QRaRRGBmc9y9oLx9gTfn65N+7ZqTnGQsKNoXdCkiEicUohEwM0YOzFOTXkQ+oRCN0KiBeby+cAulJxvOZRARqTmFaIS652bQKiONj9btCroUEYkDCtEaGJWvMaMiEqIQrYGRA/N4c/E2LWInIgrRmmiX1ZhuORlal15EFKI1pcmaRQQUojV2Tf+2/Gv5Do6UlAZdiogESCFaQ60y0sjvkMV7y7cHXYqIBEghWgujNPBeJOEpRGvhyn5tmLVmF/uOHA+6FBEJiEK0FpqnpzKke0stqSySwBSitTS8TxumrtRQJ5FEpRCtpYEdslhYtDfoMkQkIArRWuraqil7Dx1n96GSoEsRkQAoRGspKcno1y6TBTobFUlICtEoGNghi4WbNFGzSCJSiEbBwPaZui4qkqAUolEwoEMWC4r20ZDWqxKR6lGIRkFeZjrgbN13NOhSRKSOKUSjwMwY2D6LBZv2Bl2KiNQxhWiUDGifpVVARRKQQjRKBnRQ55JIIlKIRsnA9lks2ryPk1oFVCShKESjpEXTRmQ2TmXtzkNBlyIidUghGkW6j14k8ShEoyg06F6dSyKJRCEaRaEe+r1BlyEidUghGkX92mWyfOsBjpdqPXqRRKEQjaKMtBQ6tGjMim0Hgi5FROqIQjTK1KQXSSwK0Sgb2D5T0+KJJBCFaJTpTFQkscQ0RM1snJntMLPFFex/2Mzmhx+LzazUzFqE92WZ2ctmttzMlpnZBbGsNVp6t23Ghl2HOVxyIuhSRKQOxPpMdDxwVUU73f0Jd89393zgUWCqu+8O7/4/4E137w0MBJbFuNaoSEtJpmfrDJZs2R90KSJSB2Iaou4+Ddhd5YEhY4AJAGaWCVwE/DH8OiXuvjcWNcbCAE2LJ5Iw4uKaqJk1IXTGOjG8qQtQDDxrZvPM7A9m1jSwAiM0QHcuiSSMuAhRYCQwo0xTPgU4G3jK3QcBh4BHynuimd1rZoVmVlhcXFw31VYhv4M6l0QSRbyE6C2Em/JhRUCRu38U/v5lQqF6Bnd/xt0L3L0gJycnxmVWT9ecDHYdLGHvYa1FL9LQBR6i4eufFwOvntrm7
tuATWbWK7zpMmBpAOXVSHKS0TevuZr0IgkgJZYvbmYTgGFAKzMrAsYCqQDu/rvwYdcDb7v76RNxPgC8YGaNgLXAXbGsNdpOTYt3Uc/4ODsWkdiIaYi6+5hqHDOe0FCo07fPBwqiXlQdGdA+k1fnbwm6DBGJscCb8w2VVv8USQwK0Rhpn92YEyedbVqLXqRBU4jGiJkxoH2mhjqJNHAK0Rga0F5rLok0dArRGMrvkMkCTYsn0qApRGPo1Jmou9aiF2moFKIx1CojjWbpqazfdTjoUkQkRhSiMRaajGRv0GWISIwoRGNsYIcs5mu8qEiDpRCNMU2LJ9KwKURjrH+7TJZt3c/M1TuDLkVEYkAhGmPN0lP5+U0DeWTSIv7fuNks3qyzUpGGRCFaB67u35Z3v34xl/XO5a7xH/PghHlsVI+9SIOgEK0jjVKSuHNIZ97/5jC65WQw+snpjH11MTsPHgu6NBGpBavuQHAzSwM+C3SmzBR67v6DmFRWAwUFBV5YWBh0GdWy6+AxfjNlNa/M28ydF3Tmixd3pUmjmM5MKCI1ZGZz3L3cqTkjORN9FRgNnCC05tGph9RAy4w0xo7sy2v3D2Vh0V5++uaKoEsSkRqI5NSnvbtXuIa81EyHFk347si+3Pz0LB67tg/JSRZ0SSISgUjORGeaWf+YVZLAurRqSpvm6Xy4dlfQpYhIhCIJ0aHAHDNbYWYLzWyRmS2MVWGJZtTAPF5boOVEROqbSEL0aqAHMJzQOvHXhv+UKBgxoC1vLtlGyYmTQZciIhGoMkTNrHn4ywMVPCQK8rIa0zO3GR+sKg66FBGJQHU6ll4kdNY5B3CgbM+HA11jUFdCGjmwLZMXbOGys1oHXYqIVFOVIeru14b/7BL7chLb1f3b8tO3VnCkpJTGjZKDLkdEqiGiO5bMLNvMzjOzi049YlVYImqVkUZ+hyzeW7496FJEpJqqHaJm9gVgGvAW8P3wn9+LTVmJS730IvVLJGeiXwXOBTa4+yXAIGBvLIpKZMP7tmHm6l3sP3o86FJEpBoiCdGj7n4UQvfRu/tyoFdsykpcmY1TuaBbS95eoia9SH0QSYgWmVkW8HfgHTN7FdgQi6IS3ciBeUxWk16kXqh2iLr79e6+192/BzwG/BG4LkZ1JbTLzspl3sY97NI0eSJxr1ohambJZrb81PfuPtXdJ7t7SexKS1xNGqVwSa9c/rF4W9CliEgVqhWi7l4KrDCzjjGuR8LUSy9SP0RyTTQbWGJm75nZ5FOPWBWW6D7TsxUrtx9g674jQZciIpWIZD7Rx2JWhZwhLSWZK/u04Y2FW/nCZ3RnrUi8iuRM9JrwtdBPHsA1sSpM1EsvUh9EEqJXlLPt6sqeYGbjzGyHmS2uYP/DZjY//FhsZqVm1qLM/mQzm2dmr0dQZ4NxQbeWbNl7lPU7tQqLSLyqzlR4XzKzRUCv8GTMpx7rgKomZR4PVLikiLs/4e757p4PPApMdffdZQ75KrCsqhobquQkY0T/NupgEolj1TkTfZHQ5MuTw3+eepzj7refOsjMsk9/ortPA3afvr0CY4AJZV6vPTAC+EM1n98gjcrP47WFClGReFVliLr7Pndf7+5j3H1Dmcfp4fheTYswsyaEzlgnltn8S+A/gYSe6n1Qh2wOHStl+bb9QZciIuWIaCq8KtRmmcqRwIxTwWxm1wI73H1OlW9qdq+ZFZpZYXFxw5sVPinJuHZgWzXpReJUNEPUa/HcWyjTlAcuBEaZ2XrgJeBSM/tzuW/q/oy7F7h7QU5OTi1KiF8jB+Qxcc5mNu0+HHQpInKaaIZojZhZJnAx8Oqpbe7+qLu3d/fOhAL2X2Wvvyaafu0yueeirlz35Axenb856HJEpIxIBttX5YzmvJlNAIYBrcysCBgLpAK4++/Ch10PvO3uGsdTic8P7cLgLi148KV5TFu5k++P7ktGWjT/+kSkJsy9+q1wMxsK9HD3Z80sB8hw93XhfS3K6WyqUwUFBV5YWBhkCTF3uOQE35+8lI/W7eJXYwYxoH1W0CWJNHhmNsfdC8rbF8nyIGOBbxEazwmhM8pPrlMGHaCJokmjFP7nxgF888pe3PXsxzw9dQ0nT9bmcrSI1EYk10SvB0YBhwDcfQvQLBZFSdWuHZDHq/dfyDtLt3Pns7PZsf9o0CWJJKRIQrTEQ21/BzCzprEpSaqrfXYTXrr3fM7umM2IX0/n2RnrKD6giZxF6lIkPRN/NbOngSwzuwe4G/h9bMqS6kpJTuKhK3oyrFcOf5q1gV+8s5L8DlmMzm/HlX1b0yw9NegSRRq0SDuWrgCGE+qJf8vd34lVYTWRCB1LVTlSUsq7y7bz6vzNfLR2Nxf1zGF0fh4X98ohLSU56PJE6qXKOpaqHaLh5vtRdy81s16EVvr8p7vHzdq+CtFP23OohH8u3sbf529m5fYD3HlBZx66omfQZYnUO1HpnQemAWlm1g54E7iD0CxNEqeymzbi1sEd+esXL+D1B4byx+nr2Hckbv7PE2kQIglRc/fDwA3AU+5+E9A3NmVJtLXPbsK5nbOZvmpn0KWINCgRhaiZXQDcBrwR3qaLbPXIJb1zmbJiR9BliDQokYTo1wgNtH/F3ZeYWVdgSkyqkpgY1jOXqSuLNThfJIqqPcQpvKbS1DLfrwUejEVREhsdWzaheXoKS7fup1+7zKDLEWkQIrnts8DMJpnZ3LLLhMSyOIm+Yb1ymbJcTXqRaImkOf8Cod74z/LpZUKkHrmkl66LikRTJHcsFbv75JhVInXi3C7ZrNp+kN2HSmjRtFHQ5YjUe5GE6Fgz+wOhtZQ+uUHb3SdFvSqJmbSUZM7v1pIPVhUzOr9d0OWI1HuRhOhdQG9CU+CdWjzOAYVoPXNJ+LqoQlSk9iIJ0XPdvVfMKpE6M6xXDj97ewWlJ53kpNqsLygikXQszTSzPjGrROpMXlZjcpulsbBob9CliNR71QpRMzNCi8nNN7MV4eFNizTEqf66uFcOU1Y0vCWmRepatUI0PBlzLtCD0FR4I4Fr0RCneuuSXrm8r6FOIrUWyTXRiUCuu38cq2Kk7pzTKZv1Ow9RfOAYOc3Sgi5HpN6K5JroYGCWma1Rc77+S01OYmiPVkxdqSa9SG1EciZ6ZcyqkEAMC9+9dOM57YMuRaTeimQCkg2xLETq3rCeOfzojWWcKD1JSnIkjRIROUW/OQkst3k6HVo0Zu7GvUGXIlJvKUQTnHrpRWpHIZrghmm8qEitKEQTXH6HbLbuO8K2fUeDLkWkXlKIJrjkJOOiHjlq0ovUkEJUuKR3jiZqFqkhhahwUY8cZq7ZRcmJk1UfLCKfohAVWmak0S0ng8L1u4MuRaTeUYgKoLWXRGpKISpA6Lro+xrqJBKxmIaomY0zsx1mtriC/Q+b2fzwY7GZlZpZCzPrYGZTzGypmS0xs6/Gsk6BfnmZ7DlcoomaRSIUyQQkNTEe+A3wfHk73f0J4AkAMxsJPOTuu80sDfiGu881s2bAHDN7x92XxrjehJWUZDxwaQ8+/1whzdNTuPys1lzaO5dzOmXrvnqR
SsQ0RN19mpl1rubhY4AJ4edtBbaGvz5gZsuAdoBCNIbuHNKZO87vxKLN+3hv2XZ+8PpSNu89wrCeOVx6Vmsu7plDZuPUoMsUiSsWmrQ+hm8QCtHX3b1fJcc0AYqA7u6++7R9nYFpQD9331/ZexUUFHhhYWGta5Z/27rvCO8t28F7y7bz8fo9XNi9JT++YYDWrJeEYmZz3L2gvH3x0k4bCcwoJ0AzCM2o/7WKAtTM7jWzQjMrLC5Wx0i0tc1szO3nd+LZu85j9ncuo0urDK791QfM3bgn6NJE4kK8hOgthJvyp5hZKqEAfcHdK1zb3t2fcfcCdy/IycmJcZmJrUmjFB65ujffH92Pe54rZPyMdcS6JSMS7wIPUTPLJLSS6KtlthnwR2CZu/8iqNqkfFf0ac2kLw/hr4VFPDBhHgePnQi6JJHAxHqI0wRgFtDLzIrM7PNmdp+Z3VfmsOuBt939UJltFwJ3AJeWGQJ1TSxrlch0atmUSV8eQkZaCqN/M52V2w8EXZJIIGLesVSX1LEUjL8VbuLH/1zO2JF9GJ3fLuhyRKKuso6lWI8TlQRwU0EH+uZl8uUX5lC4fg+PXduHRimBXykSqRP6ly5R0SevOZMfGMrWfUe5c9xs9h05HnRJInVCISpR0zw9lafvOIdebZpx41MzKdpzOOiSRGJOISpRlZxkfG9UX8ac15HPPjWTRUX7gi5JJKYUohITdw/twvdH9ePOZ2fzr+Xbgy5HJGbUsSQxc1W/NuQ2T+OLf5rDg5cd5Y7zOwVdkkjU6UxUYursjtm8fN8FPDt9HT/+xzJOnmw4Q+pEQCEqdaBTy6ZM/NIQ5mzYwwMvzePo8dKgSxKJGoWo1Inspo348xcGA3D/i/N0z700GApRqTPpqcn87835bNp9mNcWbg26HJGoUIhKnWqUksRPPtufx19fyp5DJUGXI1JrClGpc4M6ZjOif1t+9I9lQZciUmsKUQnEN6/sxaw1u5ixemfQpYjUikJUApGRlsLj1/Xl268siri3/khJaVzcm3+45ASlGrKV8BSiEphLe7emf7tMfvnuqmo/Z/v+o1z/2xnc9LuZHApwMmh3585xs3n8da2dmOgUohKosSP78rfCTSzZUvU99qt3HOCG385kVH4eA9tn8Z8TFwY2VOr9lcXsOljCawu2sHxbpesnSgOnEJVA5TRL41tX9eaRiYs4UXqywuMK1+/mlmc+5OtX9OTLw7rz+HX92LjrMH/4YF0dVhvi7vz87RV888pefO3yHnxv8hKNe01gClEJ3E0F7WmWnsL4mevL3f/Wkm188U9z+PnN+Xz2nPZAaMzpU7efzdPT1jJzTd12Tr21ZBvucFXfNow5ryN7Dx/nH4u21WkNEj8UohI4M+O/r+/Pk1NWs2n3p+cg/dOHG3js74sZf9d5XNzz06u5ts9uwi//I5+vvjSfLXuP1EmtpSedX7yzkm8M70lSkpGSnMT3RvXlv/+xjCMlup01ESlEJS50btWUey7qynf+vhh3x9154q3ljJu+jpfvG0L/9pnlPm9oj1bcfWEXvvTCXI6diH2IvbZgCxlpKVzSK/eTbed3bcmgjlk8NXVNzN9f4o9CVOLGPZ/pSvGBY0ycu5mHX17I9NW7ePm+C+jYskmlz7vv4q7kZabzvcmx7Sk/XnqSX767km8O70VoVe9/+/Y1Z/H8rPVnnElLw6cQlbiRmpzET27oz7cmLmT3oRIm3DOYlhlpVT7PzHjipoHMXreLl2ZvjFl9E+cUkZfVmCHdW52xLy+rMZ+/sAs/fENDnhKNQlTiysAOWfz9yxfyzB3n0KRR9ecMz0hL4ek7CvjpWytYsGlv1Os6dqKUX723im8M71XhMfdc1JVlWw/wwariqL+/xC+FqMSd/u0zSUmO/J9m99wM/vv6/nz5hbnsOngsqjW9NHsTvds255xO2RUek56azH+NOIvvv7aU45UM15KGRSEqDcpV/dowOj+P+1+cF7Xe8iMlpTw5ZTVfv6Jnlcde0ac1bTPTeX7Whqi8t8Q/hag0ON8Y3ou2Welc/9sZrNt5qNav9/ys9RR0zqZfu/JHCJRlZowd2Ycnp6xmZ5TPhiU+KUSlwUlOMn5+00BuP78TNz41kzcX13wC6ANHj/PMtLU8dHnVZ6GndM9txg2D2vHEmytq/L5SfyhEpUEyM24/vxPjPncuj7++jB+9UbPrlOOmr+finjn0aN0souc9eHkPpqzYEZNOLokvClFp0AZ2yOL1B4ayasdBbv39h2zff7Taz917uITxM9fx1ct7RPy+zdNTefjKXnzlxbmMfXUxL3y0gcL1u+NiCj+JLmtIEycUFBR4YWFh0GVIHDp50vnNlNX8+cMN/N8tg7igW8sqn/M/by5n7+Hj/PiG/jV6T3dn1tpdLN2yn5XbD7By+0FWbT9As/RUerZpRq/WGfRs3YwRA9pGNJxL6p6ZzXH3gnL3KUQlkXywqpiH/rKAu4d25u4Lu1B84Bjb9x9l+/5jbNt/lB37j7Jt/1G27z/K0i37efNrF5GX1Thq73/ypLN57xFW7TjAim0HmbpyBy0z0vjNmEFn3AUVLUV7DmNmtIviz5FoFKIiZWzZe4T7X5zLwqJ95DZLo3VmOq2bpdMmM53c5mm0aZ5O6+bpdM1pStvM2AbP0eOlXP/bmdw6uCN3nN8pKq9ZfOAYs9buYtaancxYvYu9h0tol92Efzw4NGZB3dBVFqJqQ0jCyctqzMQvDcEdkpKCDZX01GR+e9vZ3PjUTAZ1yKrWMKrTHTh6nFlrdjFzzS5mrdnF1n1HOK9LSy7s3pLPDelC99wMLv35+8zftJdBHSu+WUBqRiEqCcnMiJeTsi6tmjJ2VF/uf3Eurz0wlGbpqdV+7uLN+7hr/Mf0btOMC7q15Kc3DqBvXvMz7vi69byOvPDRRoVoDMS0d97MxpnZDjNbXMH+h81sfvix2MxKzaxFeN9VZrbCzFab2SOxrFMkaKMG5jGkeysenbSo2rPkz9mwh889O5vHR/flT58fzJeHdWdgh6xyb5m98Zz2vLVkG/sOa3RAtMV6iNN44KqKdrr7E+6e7+75wKPAVHffbWbJwJPA1UAfYIyZ9YlxrSKB+u61fVi94yAvVmMmqplrdnLP84U8cdNArurXtsrjW2akcUmvXCbNK4pGqVJGTEPU3acBu6t5+BhgQvjr84DV7r7W3UuAl4DRMShRJG6kpybz5G1n8/O3V7J0S8WL301ZvoMHXpzHk7ee/anJoaty2+BQk74hdSbHg7gYbG9mTQidsU4Mb2oHbCpzSFF4m0iD1i0ng7Ej+/CVF+dysJwlof+5aCsPv7yA399ZUK2xrmWd16UFALPXVfe8RqojLkIUGAnMcPeI/3bN7F4zKzSzwuJizeMo9d/o/Hac37UF3z7t+ujEOUV8d/ISnrv7PM6uQQeRmX3SwSTREy8hegv/bsoDbAY6lPm+fXjbGdz9GXcvcPeCnJyc8g4RqXfGjuzLyu0HeOnjUIPszx9u4Gdvr2DCPYPpmxf5MKhTPnt2e95fsSPq860mssBD1Mw
ygYuBV8ts/hjoYWZdzKwRoZCdHER9IkE4dX30ibdW8P3XlvD0tDX85d4L6J4b2UQop8tsksrwvm14eY46mKIl1kOcJgCzgF5mVmRmnzez+8zsvjKHXQ+87e6fTPzo7ieA+4G3gGXAX919SSxrFYk3p66Pzl63m79+seoF+6rr1sEdeXH2Rk6eVAdTNOi2T5EE4+5c86vpfPua3nymhy6BVUdlt30G3pwXkbplZtw2uCMvqoMpKhSiIglodH4eM1bvjGh+1do6eryUkhMNbwE/hahIAmqWnsqIAXn89eNNVR8cJQ9MmMed42Y3uJVQFaIiCeq2wR2ZMHsjpVV0MB07Ucqv3lvF+Bnravxe01YWs2LbAdJSk/jh60tr/DrxSCEqkqD6tcskp1kaU1fuqPCY2et2c83/fcDCor38+l+rWb6t4ttRK3K89CSPv76U74w4i1+NGcT01TtrdD32eOlJvvPKIt5fUXG9QVCIiiSw2wZ34oUPzwy0fUeO8+ikRTw4YR4PX9mLP9x5Lt8Y3otHJi6q8sz1dC98uIHc5mkM79Oa5ump/OHOc/nFOysiuv306PFS7vvTHN5fUczfCuNrjKtCVCSBXTuwLXM27qFoz2EgNPzpjYVbueIXU0lOgre/ftEns0Tdcm4HGqUk8fys9dV+/T2HSvj1v1bz3Wv7fjKrfpdWTfnFzfnc/+LcT963MoeOneDu8R/TuFEyE780hGmrijl2ojTyHzZGFKIiCaxJoxSuy2/HXz7exJa9R/jCc4X877sr+e1tZ/PD6/rTvMwE0UlJxo9v6M+v/7WazXuPVOv1//fdlYwY0JZebT59p9VFPXP44sXduOf5ORwuOXOilVP2HT7O7X/8iI4tmvB/twyiTWY6PVs346O18TOJikJUJMHdOrgjz81cz4hffcCA9lm88eBQCjq3KPfYbjkZ3DWkM4/9fXGVU+ot37afNxZu5aHLe5a7/+4LO9Mvrznf/NuCcl9r58Fj3PL7Dzm7YzY/vqE/yeGlXC4/qzXvLtse4U/5b9v3H434kkRlFKIiCa5n62Z87fKe/O2+IXz18h6kpSRXevwXL+7G5j1HeH3h1gqPcXcef30pD1zaneymjco9xsz44fX92LbvKL/+1+pP7duy9wg3Pz2L4X1a818jzvrUAntX9Mnl3aXbazwv6n1/nsP01Ttr9NzyKERFhLuHhha0q45GKUn85LP9+cHrS9l7uKTcY95Zup0d+49xWxUrmKalJPO7O87hpdkbeXPxNgDW7zzEzU/PYsy5HXnoip5nrFDaLSeDRilJLN0a+UiBtcUHKdpzhAsjnIu1MgpREYnYoI7ZjOjflh+9seyMfcdOlPLDN5bx3ZF9SC1nvafT5TZL5+k7CvjOK4uYvGALtzzzIV8e1p17Lupa7vFmFmrSL418qNMr8zYzemBeuetQ1ZRCVERq5JtX9mLmml3MOK1pPG76enq2zohocpP+7TMZO6ovX//LfB69pje3Du5Y6fGX94n8uujJk86kuZu54ez2ET2vKgpREamRjLQUHr+uL99+ZRFHj4eGHO3Yf5Rnpq3hOyMiX1dy1MA85jx2BaPzq14JqKBTNpv2HGbrvuqNEgCYvX43zdJT6JPXPOLaKqMQFZEau7R3a/q3y+SX764C4Im3VnBzQQe6tGpao9fLbJxa9UFASnISw3rm8N6y6jfpJ80t4oazo79Um0JURGpl7Mi+vDxnEy/N3sj7K4u5/9LudfK+kTTpjx4v5a0l26t1lhsphaiI1EpOszT+86rePDJpEd8c3pNm6dU7m6yti3rmULh+D4fKWRX1dG8v3c7ADlm0bp4e9TpSov6KIpJwbjqnPempyYzo37bO3rN5eiqDOmbxwaqdXNWvTaXHTppbxGdj0JQHnYmKSBSYGaMG5n1yV1Fdqc7dSzsOHGXuhj0M71N50NaUQlRE6q3LzsplyvIdld7GOXn+Fob3bUPjRpXfiVVTClERqbfaZzcht3k68zbuqfCYiXM3x6RX/hSFqIjUa1eclcs7FTTpl23dz77DJZzfJXq3eZ5OISoi9drlfVrz7tLyQ/SVeZu5/ux2JMXwWq1CVETqtX55mRw4eoK1xQc/tf1E6Un+Pm8z1w+K7m2ep1OIiki9lpRkXHZW6zPuXpqxZhdtsxpXe3aqGr9/TF9dRKQOXNHnzOuisRwbWpZCVETqvSHdWrF0y372HArNb3rw2An+tXwH1w7Ii/l7K0RFpN5LT01mSLeWTAkvp/zPRVs5v2tLWlQwq340KURFpEEoOyHJpLmb66QpDwpREWkgLu2dywerdrJu5yGWb9vPJb1z6+R9FaIi0iC0ykijZ+tmPDJxISMGtK1ywb1oUYiKSINx+Vmt+Wjd7qgvAVIZTYUnIg3G1f3a8OHaXQzqkFVn76kQFZEGo3Orpjx393l1+p4xbc6b2Tgz22Fmiys5ZpiZzTezJWY2tcz2h8LbFpvZBDOL/pTUIiK1FOtrouOBqyraaWZZwG+BUe7eF7gpvL0d8CBQ4O79gGTglhjXKiISsZiGqLtPA3ZXcsitwCR33xg+vuzNrylAYzNLAZoAW2JWqIhIDQXdO98TyDaz981sjpn9PwB33wz8DNgIbAX2ufvbAdYpIlKuoEM0BTgHGAFcCTxmZj3NLBsYDXQB8oCmZnZ7eS9gZveaWaGZFRYXF9dV3SIiQPAhWgS85e6H3H0nMA0YCFwOrHP3Ync/DkwChpT3Au7+jLsXuHtBTk5OnRUuIgLBh+irwFAzSzGzJsBgYBmhZvz5ZtbEzAy4LLxdRCSuxHScqJlNAIYBrcysCBgLpAK4++/cfZmZvQksBE4Cf3D3xeHnvgzMBU4A84BnYlmriEhNmHvFS43WNwUFBV5YWBh0GSLSwJjZHHcvKG9f0M15EZF6TSEqIlILDao5b2bFwIYIn9YK2BmDcqKtvtQJ9adW1Rl99aXWSOvs5O7lDv9pUCFaE2ZWWNG1jnhSX+qE+lOr6oy++lJrNOtUc15EpBYUoiIitaAQrT/jT+tLnVB/alWd0Vdfao1anQl/TVREpDZ0JioiUgsJHaJmdpWZrTCz1Wb2SND1VMTM1pvZovAKAHF1S1Z5qxeYWQsze8fMVoX/zA6yxnBN5dX5PTPbHP5c55vZNUHWGK6pg5lNMbOl4ZUdvhreHlefaSV1xtVnambpZjbbzBaE6/x+eHsXM/so/Lv/FzNrVOP3SNTmvJklAyuBKwjNJvUxMMbdlwZaWDnMbD2hWf7jbvydmV0EHASeD69CgJn9FNjt7j8J/+eU7e7fisM6vwccdPefBVlbWWbWFmjr7nPNrBkwB7gO+Bxx9JlWUufNxNFnGp7AqKm7HzSzVGA68FXg64QmhH/JzH4HLHD3p2ryHol8JnoesNrd17p7CfASoTlMJQIVrF4wGngu/PVzhH65AlWNVRbigrtvdfe54a8PEJq9rB1x9plWUmdc8ZCD4W9Tww8HLgVeDm+v1eeZyCHaDthU5vsi4vAfQZgDb4dn/7836GKqobW7bw1/vQ1oHWQxVbjfzBaGm/uBX3Yoy8
w6A4OAj4jjz/S0OiHOPlMzSzaz+cAO4B1gDbDX3U+ED6nV734ih2h9MtTdzwauBr4SbprWCx66XhSv14yeAroB+YSWofl5oNWUYWYZwETga+6+v+y+ePpMy6kz7j5Tdy9193ygPaEWaO9ovn4ih+hmoEOZ79uHt8Wd8JpTpxbye4XQP4R4tj18zezUtbMdVRwfCHffHv4FOwn8njj5XMPX7iYCL7j7pPDmuPtMy6szXj9TAHffC0wBLgCywotgQi1/9xM5RD8GeoR76RoRWpJ5csA1ncHMmoYv3GNmTYHhwOLKnxW4ycCd4a/vJLSCQdw5FUph1xMHn2u4I+SPwDJ3/0WZXXH1mVZUZ7x9pmaWY6Gl2TGzxoQ6kpcRCtMbw4fV6vNM2N55gPDwi18SWtd+nLv/KNiKzmRmXQmdfUJoJYIX46nOsqsXANsJrV7wd+CvQEdCs2rd7O6BdupUUOcwQs1OB9YDXyxz3TEQZjYU+ABYRGi1B4BvE7reGDefaSV1jiGOPlMzG0Co4yiZ0EnjX939B+Hfq5eAFoRWzrjd3Y/V6D0SOURFRGorkZvzIiK1phAVEakFhaiISC0oREVEakEhKiJSCwpRkQqY2TAzez3oOiS+KURFRGpBISr1npndHp4zcr6ZPR2ecOKgmf1veA7J98wsJ3xsvpl9GJ4g45VTE2SYWXczezc87+RcM+sWfvkMM3vZzJab2QvhO3VEPqEQlXrNzM4C/gO4MDzJRClwG9AUKHT3vsBUQncoATwPfMvdBxC62+bU9heAJ919IDCE0OQZEJqd6GtAH6ArcGGMfySpZ1KqPkQkrl0GnAN8HD5JbExoco6TwF/Cx/wZmGRmmUCWu08Nb38O+Ft4boJ27v4KgLsfBQi/3mx3Lwp/Px/oTGhiXxFAISr1nwHPufujn9po9thpx9X0/uay91OXot8ZOY2a81LfvQfcaGa58MlaRJ0I/ds+NUvPrcB0d98H7DGzz4S33wFMDc/MXmRm14VfI83MmtTlDyH1l/5XlXrN3Zea2X8Rmvk/CTgOfAU4BJwX3reD0HVTCE179rtwSK4F7gpvvwN42sx+EH6Nm+rwx5B6TLM4SYNkZgfdPSPoOqThU3NeRKQWdCYqIlILOhMVEakFhaiISC0oREVEakEhKiJSCwpREZFaUIiKiNTC/wdn2YE5l/9kaQAAAABJRU5ErkJggg==", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "# Model Fit\n", - "with Timer() as train_time:\n", - " model.fit(Xtr)\n", - "\n", - "print(\"Took {:.2f} seconds for training.\".format(train_time.interval))\n", - "\n", - "# Plot the train RMSE as a function of the epochs\n", - "line_graph(values=model.rmse_train, labels='train', x_name='epoch', y_name='rmse_train')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "During training, we can optionlly evauate the root mean squared error to have an idea of how the learning is proceeding. We would generally like to see this quantity decreasing as a function of the learning epochs. To visualise this choose `with_metrics = True` in the `RBM()` model function. \n", - "\n", - "Once the model has been trained, we can predict new ratings on the test set." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "tags": [ - "top_k" - ] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Took 0.23 seconds for prediction.\n" - ] - } - ], - "source": [ - "# number of top score elements to be recommended \n", - "K = 10\n", - "\n", - "# Model prediction on the test set Xtst.\n", - "with Timer() as prediction_time:\n", - " top_k = model.recommend_k_items(Xtst)\n", - "\n", - "print(\"Took {:.2f} seconds for prediction.\".format(prediction_time.interval))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "`top_k` returns the first K elements having the highest recommendation score. Here the recommendation score is evaluated by multiplying the predicted rating by its probability, i.e. the confidence the algorithm has about its output. So if we have two items both with predicted ratings 5, but one with probability 0.5 and the other 0.9, the latter will be considered more relevant. In order to inspect the prediction and use the evaluation metrics in this repository, we convert both top_k and Xtst to pandas dataframe format:" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "top_k_df = am.map_back_sparse(top_k, kind = 'prediction')\n", - "test_df = am.map_back_sparse(Xtst, kind = 'ratings')" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
userIDmovieIDprediction
011004.881824
11654.822650
211294.672100
3111044.898961
4111234.664860
5114184.611925
6114274.722356
7115214.738353
8115834.569103
9115464.890738
\n", - "
" - ], - "text/plain": [ - " userID movieID prediction\n", - "0 1 100 4.881824\n", - "1 1 65 4.822650\n", - "2 1 129 4.672100\n", - "3 1 1104 4.898961\n", - "4 1 1123 4.664860\n", - "5 1 1418 4.611925\n", - "6 1 1427 4.722356\n", - "7 1 1521 4.738353\n", - "8 1 1583 4.569103\n", - "9 1 1546 4.890738" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "top_k_df.head(10)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 4 Evaluation metrics \n", - "\n", - "Here we evaluate the performance of the algorithm using the metrics provided in the `PythonRankingEvaluation` class. Note that the following metrics take into account only the first K elements, therefore their value may be different from the one displayed from the `model.fit()` method. " - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "tags": [ - "ranking" - ] - }, - "outputs": [], - "source": [ - "def ranking_metrics(\n", - " data_size,\n", - " data_true,\n", - " data_pred,\n", - " K\n", - "):\n", - "\n", - " eval_map = map_at_k(data_true, data_pred, col_user=\"userID\", col_item=\"movieID\", \n", - " col_rating=\"rating\", col_prediction=\"prediction\", \n", - " relevancy_method=\"top_k\", k= K)\n", - "\n", - " eval_ndcg = ndcg_at_k(data_true, data_pred, col_user=\"userID\", col_item=\"movieID\", \n", - " col_rating=\"rating\", col_prediction=\"prediction\", \n", - " relevancy_method=\"top_k\", k= K)\n", - "\n", - " eval_precision = precision_at_k(data_true, data_pred, col_user=\"userID\", col_item=\"movieID\", \n", - " col_rating=\"rating\", col_prediction=\"prediction\", \n", - " relevancy_method=\"top_k\", k= K)\n", - "\n", - " eval_recall = recall_at_k(data_true, data_pred, col_user=\"userID\", col_item=\"movieID\", \n", - " col_rating=\"rating\", col_prediction=\"prediction\", \n", - " relevancy_method=\"top_k\", k= K)\n", - "\n", - " \n", - " df_result = pd.DataFrame(\n", - " { \"Dataset\": data_size,\n", - " \"K\": K,\n", - " \"MAP\": eval_map,\n", - " \"nDCG@k\": eval_ndcg,\n", - " \"Precision@k\": eval_precision,\n", - " \"Recall@k\": eval_recall,\n", - " }, \n", - " index=[0]\n", - " )\n", - " \n", - " return df_result" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
DatasetKMAPnDCG@kPrecision@kRecall@k
0mv 100k100.1408280.4111240.3362670.212256
\n", - "
" - ], - "text/plain": [ - " Dataset K MAP nDCG@k Precision@k Recall@k\n", - "0 mv 100k 10 0.140828 0.411124 0.336267 0.212256" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "eval_100k = ranking_metrics(\n", - " data_size=\"mv 100k\",\n", - " data_true=test_df,\n", - " data_pred=top_k_df,\n", - " K=10\n", - ")\n", - "\n", - "eval_100k" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "application/scrapbook.scrap.json+json": { - "data": 0.14082811192026132, - "encoder": "json", - "name": "map", - "version": 1 - } - }, - "metadata": { - "scrapbook": { - "data": true, - "display": false, - "name": "map" - } - }, - "output_type": "display_data" - }, - { - "data": { - "application/scrapbook.scrap.json+json": { - "data": 0.41112362614927883, - "encoder": "json", - "name": "ndcg", - "version": 1 - } - }, - "metadata": { - "scrapbook": { - "data": true, - "display": false, - "name": "ndcg" - } - }, - "output_type": "display_data" - }, - { - "data": { - "application/scrapbook.scrap.json+json": { - "data": 0.3362672322375398, - "encoder": "json", - "name": "precision", - "version": 1 - } - }, - "metadata": { - "scrapbook": { - "data": true, - "display": false, - "name": "precision" - } - }, - "output_type": "display_data" - }, - { - "data": { - "application/scrapbook.scrap.json+json": { - "data": 0.2122560190189148, - "encoder": "json", - "name": "recall", - "version": 1 - } - }, - "metadata": { - "scrapbook": { - "data": true, - "display": false, - "name": "recall" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "# Record results with papermill for tests\n", - "sb.glue(\"map\", eval_100k['MAP'][0])\n", - "sb.glue(\"ndcg\", eval_100k['nDCG@k'][0])\n", - "sb.glue(\"precision\", eval_100k['Precision@k'][0])\n", - "sb.glue(\"recall\", eval_100k['Recall@k'][0])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 5 Saving the model and Loading a pre-trained model\n", - "Trained model checkpoint can be saved to a specified directory using the `save` function." - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "model.save(file_path='./models/rbm_model.ckpt')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Pre-trained RBM model can be loaded using the `load` function, which can be used to resume the training." 
- ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "# Initialize the model class\n", - "model = RBM(\n", - " possible_ratings=np.setdiff1d(np.unique(Xtr), np.array([0])),\n", - " visible_units=Xtr.shape[1],\n", - " hidden_units=600,\n", - " training_epoch=30,\n", - " minibatch_size=60,\n", - " keep_prob=0.9,\n", - " with_metrics=True\n", - ")\n", - "\n", - "# Load the model checkpoint\n", - "model.load(file_path='./models/rbm_model.ckpt')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "celltoolbar": "Tags", - "interpreter": { - "hash": "67434505f7f08e5031eee7757e853265d2f43dd6b5963eb755a27835ec0e1503" - }, - "kernel_info": { - "name": "python3" - }, - "kernelspec": { - "display_name": "tf37", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.12" - }, - "nteract": { - "version": "0.12.3" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/examples/00_quick_start/rlrmc_movielens.ipynb b/examples/00_quick_start/rlrmc_movielens.ipynb index 6d14eebd82..ed78974d8f 100644 --- a/examples/00_quick_start/rlrmc_movielens.ipynb +++ b/examples/00_quick_start/rlrmc_movielens.ipynb @@ -1,334 +1,334 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "*Copyright (c) Microsoft Corporation. All rights reserved.*\n", - "\n", - "*Licensed under the MIT License.*" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Riemannian Low-rank Matrix Completion algorithm on Movielens dataset\n", - "\n", - "Riemannian Low-rank Matrix Completion (RLRMC) is a matrix factorization based (vanilla) matrix completion algorithm that solves the optimization problem using Riemannian conjugate gradients algorithm (Absil et al., 2008). RLRMC is based on the works by Jawanpuria and Mishra (2018) and Mishra et al. (2013). \n", - "\n", - "The ratings matrix of movies (items) and users is modeled as a low-rank matrix. Let the number of movies be $d$ and the number of users be $T$. RLRMC algorithm assumes that the ratings matrix $M$ (of size $d\\times T$) is partially known. The entry at $M(i,j)$ represents the rating given by the $j$-th user to the $i$-th movie. RLRMC learns matrix $M$ as $M=LR^\\top$, where $L$ is a $d\\times r$ matrix and $R$ is a $T\\times r$ matrix. Here, $r$ is the rank hyper-parameter which needs to be provided to the RLRMC algorithm. Typically, it is assumed that $r\\ll d,T$. The optimization problem is solved iteratively using the the Riemannian conjugate gradients algorithm. The Riemannian optimization framework generalizes a range of Euclidean first- and second-order algorithms such as conjugate gradients, trust-regions, among others, to Riemannian manifolds. A detailed exposition of the Riemannian optimization framework can be found in Absil et al. (2008). \n", - "\n", - "This notebook provides an example of how to utilize and evaluate RLRMC implementation in **recommenders**." 
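The factor model in the introduction above, M = LR^T with a small rank r, can be illustrated with plain numpy. The block below is a synthetic sketch of what the learned factors represent, not the RLRMC solver itself: every entry of the reconstructed matrix, including entries that were never observed, is just the inner product of a movie factor and a user factor.

```python
import numpy as np

rng = np.random.default_rng(42)
d, T, r = 6, 4, 2              # tiny sizes: 6 movies, 4 users, rank 2
L = rng.normal(size=(d, r))    # movie factors, d x r
R = rng.normal(size=(T, r))    # user factors, T x r

M_hat = L @ R.T                # full d x T matrix of predicted ratings

# The prediction for movie i and user j is the inner product of their factors.
i, j = 2, 1
assert np.isclose(M_hat[i, j], L[i] @ R[j])
print(f"Predicted rating for movie {i}, user {j}: {M_hat[i, j]:.3f}")
```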
- ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import sys\n", - "import time\n", - "import pandas as pd\n", - "\n", - "from recommenders.datasets.python_splitters import python_random_split\n", - "from recommenders.datasets.python_splitters import python_stratified_split\n", - "from recommenders.datasets import movielens\n", - "from recommenders.models.rlrmc.RLRMCdataset import RLRMCdataset \n", - "from recommenders.models.rlrmc.RLRMCalgorithm import RLRMCalgorithm \n", - "# Pymanopt installation is required via\n", - "# pip install pymanopt \n", - "from recommenders.evaluation.python_evaluation import (\n", - " rmse, mae\n", - ")\n", - "\n", - "# import logging\n", - "\n", - "# %load_ext autoreload\n", - "# %autoreload 2" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Pandas version: 0.23.4\n", - "System version: 3.7.1 (default, Dec 14 2018, 13:28:58) \n", - "[Clang 4.0.1 (tags/RELEASE_401/final)]\n" - ] - } - ], - "source": [ - "print(\"Pandas version: {}\".format(pd.__version__))\n", - "print(\"System version: {}\".format(sys.version))\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Set the default parameters.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "tags": [ - "parameters" - ] - }, - "outputs": [], - "source": [ - "# Select Movielens data size: 100k, 1m, 10m, or 20m\n", - "MOVIELENS_DATA_SIZE = '10m'\n", - "\n", - "# Model parameters\n", - "\n", - "# rank of the model, a positive integer (usually small), required parameter\n", - "rank_parameter = 10\n", - "# regularization parameter multiplied to loss function, a positive number (usually small), required parameter\n", - "regularization_parameter = 0.001\n", - "# initialization option for the model, 'svd' employs singular value decomposition, optional parameter\n", - "initialization_flag = 'svd' #default is 'random'\n", - "# maximum number of iterations for the solver, a positive integer, optional parameter\n", - "maximum_iteration = 100 #optional, default is 100\n", - "# maximum time in seconds for the solver, a positive integer, optional parameter\n", - "maximum_time = 300#optional, default is 1000\n", - "\n", - "# Verbosity of the intermediate results\n", - "verbosity=0 #optional parameter, valid values are 0,1,2, default is 0\n", - "# Whether to compute per iteration train RMSE (and test RMSE, if test data is given)\n", - "compute_iter_rmse=True #optional parameter, boolean value, default is False" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "## Logging utilities. Please import 'logging' in order to use the following command. \n", - "# logging.basicConfig(level=logging.INFO)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 1. Download the MovieLens dataset\n" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "65.6MB [00:25, 2.57MB/s] \n" - ] - } - ], - "source": [ - "\n", - "df = movielens.load_pandas_df(\n", - " size=MOVIELENS_DATA_SIZE,\n", - " header=[\"userID\", \"itemID\", \"rating\", \"timestamp\"]\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2. 
Split the data using the Spark chronological splitter provided in utilities" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "## If both validation and test sets are required\n", - "# train, validation, test = python_random_split(df,[0.6, 0.2, 0.2])\n", - "\n", - "## If validation set is not required\n", - "train, test = python_random_split(df,[0.8, 0.2])\n", - "\n", - "## If test set is not required\n", - "# train, validation = python_random_split(df,[0.8, 0.2])\n", - "\n", - "## If both validation and test sets are not required (i.e., the complete dataset is for training the model)\n", - "# train = df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Generate an RLRMCdataset object from the data subsets." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "# data = RLRMCdataset(train=train, validation=validation, test=test)\n", - "data = RLRMCdataset(train=train, test=test) # No validation set\n", - "# data = RLRMCdataset(train=train, validation=validation) # No test set\n", - "# data = RLRMCdataset(train=train) # No validation or test set" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 3. Train the RLRMC model on the training data" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "model = RLRMCalgorithm(rank = rank_parameter,\n", - " C = regularization_parameter,\n", - " model_param = data.model_param,\n", - " initialize_flag = initialization_flag,\n", - " maxiter=maximum_iteration,\n", - " max_time=maximum_time)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Took 44.991251945495605 seconds for training.\n" - ] - } - ], - "source": [ - "start_time = time.time()\n", - "\n", - "model.fit(data,verbosity=verbosity)\n", - "\n", - "# fit_and_evaluate will compute RMSE on the validation set (if given) at every iteration\n", - "# model.fit_and_evaluate(data,verbosity=verbosity)\n", - "\n", - "train_time = time.time() - start_time # train_time includes both model initialization and model training time. \n", - "\n", - "print(\"Took {} seconds for training.\".format(train_time))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 4. Obtain predictions from the RLRMC model on the test data" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "## Obtain predictions on (userID,itemID) pairs (60586,54775) and (52681,36519) in Movielens 10m dataset\n", - "# output = model.predict([60586,52681],[54775,36519]) # Movielens 10m dataset\n", - "\n", - "# Obtain prediction on the full test set\n", - "predictions_ndarr = model.predict(test['userID'].values,test['itemID'].values)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 5. 
Evaluate how well RLRMC performs" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "RMSE:\t0.809386\n", - "MAE:\t0.620971\n" - ] - } - ], - "source": [ - "predictions_df = pd.DataFrame(data={\"userID\": test['userID'].values, \"itemID\":test['itemID'].values, \"prediction\":predictions_ndarr})\n", - "\n", - "## Compute test RMSE \n", - "eval_rmse = rmse(test, predictions_df)\n", - "## Compute test MAE \n", - "eval_mae = mae(test, predictions_df)\n", - "\n", - "print(\"RMSE:\\t%f\" % eval_rmse,\n", - " \"MAE:\\t%f\" % eval_mae, sep='\\n')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Reference\n", - "[1] Pratik Jawanpuria and Bamdev Mishra. *A unified framework for structured low-rank matrix learning*. In International Conference on Machine Learning, 2018.\n", - "\n", - "[2] Bamdev Mishra, Gilles Meyer, Francis Bach, and Rodolphe Sepulchre. *Low-rank optimization with trace norm penalty*. In SIAM Journal on Optimization 23(4):2124-2149, 2013.\n", - "\n", - "[3] James Townsend, Niklas Koep, and Sebastian Weichwald. *Pymanopt: A Python Toolbox for Optimization on Manifolds using Automatic Differentiation*. In Journal of Machine Learning Research 17(137):1-5, 2016.\n", - "\n", - "[4] P.-A. Absil, R. Mahony, and R. Sepulchre. *Optimization Algorithms on Matrix Manifolds*. Princeton University Press, Princeton, NJ, 2008.\n", - "\n", - "[5] A. Edelman, T. Arias, and S. Smith. *The geometry of algo- rithms with orthogonality constraints*. SIAM Journal on Matrix Analysis and Applications, 20(2):303–353, 1998." - ] - } - ], - "metadata": { - "celltoolbar": "Tags", - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.1" - } - }, - "nbformat": 4, - "nbformat_minor": 2 + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "*Copyright (c) Recommenders contributors.*\n", + "\n", + "*Licensed under the MIT License.*" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Riemannian Low-rank Matrix Completion algorithm on Movielens dataset\n", + "\n", + "Riemannian Low-rank Matrix Completion (RLRMC) is a matrix factorization based (vanilla) matrix completion algorithm that solves the optimization problem using Riemannian conjugate gradients algorithm (Absil et al., 2008). RLRMC is based on the works by Jawanpuria and Mishra (2018) and Mishra et al. (2013). \n", + "\n", + "The ratings matrix of movies (items) and users is modeled as a low-rank matrix. Let the number of movies be $d$ and the number of users be $T$. RLRMC algorithm assumes that the ratings matrix $M$ (of size $d\\times T$) is partially known. The entry at $M(i,j)$ represents the rating given by the $j$-th user to the $i$-th movie. RLRMC learns matrix $M$ as $M=LR^\\top$, where $L$ is a $d\\times r$ matrix and $R$ is a $T\\times r$ matrix. Here, $r$ is the rank hyper-parameter which needs to be provided to the RLRMC algorithm. Typically, it is assumed that $r\\ll d,T$. The optimization problem is solved iteratively using the the Riemannian conjugate gradients algorithm. 
The Riemannian optimization framework generalizes a range of Euclidean first- and second-order algorithms such as conjugate gradients, trust-regions, among others, to Riemannian manifolds. A detailed exposition of the Riemannian optimization framework can be found in Absil et al. (2008). \n", + "\n", + "This notebook provides an example of how to utilize and evaluate RLRMC implementation in **recommenders**." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import sys\n", + "import time\n", + "import pandas as pd\n", + "\n", + "from recommenders.datasets.python_splitters import python_random_split\n", + "from recommenders.datasets.python_splitters import python_stratified_split\n", + "from recommenders.datasets import movielens\n", + "from recommenders.models.rlrmc.RLRMCdataset import RLRMCdataset \n", + "from recommenders.models.rlrmc.RLRMCalgorithm import RLRMCalgorithm \n", + "# Pymanopt installation is required via\n", + "# pip install pymanopt \n", + "from recommenders.evaluation.python_evaluation import (\n", + " rmse, mae\n", + ")\n", + "\n", + "# import logging\n", + "\n", + "# %load_ext autoreload\n", + "# %autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Pandas version: 0.23.4\n", + "System version: 3.7.1 (default, Dec 14 2018, 13:28:58) \n", + "[Clang 4.0.1 (tags/RELEASE_401/final)]\n" + ] + } + ], + "source": [ + "print(\"Pandas version: {}\".format(pd.__version__))\n", + "print(\"System version: {}\".format(sys.version))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set the default parameters.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "tags": [ + "parameters" + ] + }, + "outputs": [], + "source": [ + "# Select Movielens data size: 100k, 1m, 10m, or 20m\n", + "MOVIELENS_DATA_SIZE = '10m'\n", + "\n", + "# Model parameters\n", + "\n", + "# rank of the model, a positive integer (usually small), required parameter\n", + "rank_parameter = 10\n", + "# regularization parameter multiplied to loss function, a positive number (usually small), required parameter\n", + "regularization_parameter = 0.001\n", + "# initialization option for the model, 'svd' employs singular value decomposition, optional parameter\n", + "initialization_flag = 'svd' #default is 'random'\n", + "# maximum number of iterations for the solver, a positive integer, optional parameter\n", + "maximum_iteration = 100 #optional, default is 100\n", + "# maximum time in seconds for the solver, a positive integer, optional parameter\n", + "maximum_time = 300#optional, default is 1000\n", + "\n", + "# Verbosity of the intermediate results\n", + "verbosity=0 #optional parameter, valid values are 0,1,2, default is 0\n", + "# Whether to compute per iteration train RMSE (and test RMSE, if test data is given)\n", + "compute_iter_rmse=True #optional parameter, boolean value, default is False" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "## Logging utilities. Please import 'logging' in order to use the following command. \n", + "# logging.basicConfig(level=logging.INFO)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. 
Download the MovieLens dataset\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "65.6MB [00:25, 2.57MB/s] \n" + ] + } + ], + "source": [ + "\n", + "df = movielens.load_pandas_df(\n", + " size=MOVIELENS_DATA_SIZE,\n", + " header=[\"userID\", \"itemID\", \"rating\", \"timestamp\"]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. Split the data using the Spark chronological splitter provided in utilities" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "## If both validation and test sets are required\n", + "# train, validation, test = python_random_split(df,[0.6, 0.2, 0.2])\n", + "\n", + "## If validation set is not required\n", + "train, test = python_random_split(df,[0.8, 0.2])\n", + "\n", + "## If test set is not required\n", + "# train, validation = python_random_split(df,[0.8, 0.2])\n", + "\n", + "## If both validation and test sets are not required (i.e., the complete dataset is for training the model)\n", + "# train = df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Generate an RLRMCdataset object from the data subsets." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# data = RLRMCdataset(train=train, validation=validation, test=test)\n", + "data = RLRMCdataset(train=train, test=test) # No validation set\n", + "# data = RLRMCdataset(train=train, validation=validation) # No test set\n", + "# data = RLRMCdataset(train=train) # No validation or test set" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. Train the RLRMC model on the training data" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "model = RLRMCalgorithm(rank = rank_parameter,\n", + " C = regularization_parameter,\n", + " model_param = data.model_param,\n", + " initialize_flag = initialization_flag,\n", + " maxiter=maximum_iteration,\n", + " max_time=maximum_time)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Took 44.991251945495605 seconds for training.\n" + ] + } + ], + "source": [ + "start_time = time.time()\n", + "\n", + "model.fit(data,verbosity=verbosity)\n", + "\n", + "# fit_and_evaluate will compute RMSE on the validation set (if given) at every iteration\n", + "# model.fit_and_evaluate(data,verbosity=verbosity)\n", + "\n", + "train_time = time.time() - start_time # train_time includes both model initialization and model training time. \n", + "\n", + "print(\"Took {} seconds for training.\".format(train_time))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4. Obtain predictions from the RLRMC model on the test data" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "## Obtain predictions on (userID,itemID) pairs (60586,54775) and (52681,36519) in Movielens 10m dataset\n", + "# output = model.predict([60586,52681],[54775,36519]) # Movielens 10m dataset\n", + "\n", + "# Obtain prediction on the full test set\n", + "predictions_ndarr = model.predict(test['userID'].values,test['itemID'].values)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 5. 
Evaluate how well RLRMC performs" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "RMSE:\t0.809386\n", + "MAE:\t0.620971\n" + ] + } + ], + "source": [ + "predictions_df = pd.DataFrame(data={\"userID\": test['userID'].values, \"itemID\":test['itemID'].values, \"prediction\":predictions_ndarr})\n", + "\n", + "## Compute test RMSE \n", + "eval_rmse = rmse(test, predictions_df)\n", + "## Compute test MAE \n", + "eval_mae = mae(test, predictions_df)\n", + "\n", + "print(\"RMSE:\\t%f\" % eval_rmse,\n", + " \"MAE:\\t%f\" % eval_mae, sep='\\n')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Reference\n", + "[1] Pratik Jawanpuria and Bamdev Mishra. *A unified framework for structured low-rank matrix learning*. In International Conference on Machine Learning, 2018.\n", + "\n", + "[2] Bamdev Mishra, Gilles Meyer, Francis Bach, and Rodolphe Sepulchre. *Low-rank optimization with trace norm penalty*. In SIAM Journal on Optimization 23(4):2124-2149, 2013.\n", + "\n", + "[3] James Townsend, Niklas Koep, and Sebastian Weichwald. *Pymanopt: A Python Toolbox for Optimization on Manifolds using Automatic Differentiation*. In Journal of Machine Learning Research 17(137):1-5, 2016.\n", + "\n", + "[4] P.-A. Absil, R. Mahony, and R. Sepulchre. *Optimization Algorithms on Matrix Manifolds*. Princeton University Press, Princeton, NJ, 2008.\n", + "\n", + "[5] A. Edelman, T. Arias, and S. Smith. *The geometry of algo- rithms with orthogonality constraints*. SIAM Journal on Matrix Analysis and Applications, 20(2):303–353, 1998." + ] + } + ], + "metadata": { + "celltoolbar": "Tags", + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 } \ No newline at end of file diff --git a/examples/00_quick_start/sar_movielens.ipynb b/examples/00_quick_start/sar_movielens.ipynb index feefcf9352..f5922a8611 100644 --- a/examples/00_quick_start/sar_movielens.ipynb +++ b/examples/00_quick_start/sar_movielens.ipynb @@ -1,807 +1,807 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Copyright (c) Microsoft Corporation. All rights reserved.\n", - "\n", - "Licensed under the MIT License." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# SAR Single Node on MovieLens (Python, CPU)\n", - "\n", - "Simple Algorithm for Recommendation (SAR) is a fast and scalable algorithm for personalized recommendations based on user transaction history. It produces easily explainable and interpretable recommendations and handles \"cold item\" and \"semi-cold user\" scenarios. SAR is a kind of neighborhood based algorithm (as discussed in [Recommender Systems by Aggarwal](https://dl.acm.org/citation.cfm?id=2931100)) which is intended for ranking top items for each user. More details about SAR can be found in the [deep dive notebook](../02_model_collaborative_filtering/sar_deep_dive.ipynb). \n", - "\n", - "SAR recommends items that are most ***similar*** to the ones that the user already has an existing ***affinity*** for. 
Two items are ***similar*** if the users that interacted with one item are also likely to have interacted with the other. A user has an ***affinity*** to an item if they have interacted with it in the past.\n", - "\n", - "### Advantages of SAR:\n", - "- High accuracy for an easy to train and deploy algorithm\n", - "- Fast training, only requiring simple counting to construct matrices used at prediction time. \n", - "- Fast scoring, only involving multiplication of the similarity matrix with an affinity vector\n", - "\n", - "### Notes to use SAR properly:\n", - "- Since it does not use item or user features, it can be at a disadvantage against algorithms that do.\n", - "- It's memory-hungry, requiring the creation of an $mxm$ sparse square matrix (where $m$ is the number of items). This can also be a problem for many matrix factorization algorithms.\n", - "- SAR favors an implicit rating scenario and it does not predict ratings.\n", - "\n", - "This notebook provides an example of how to utilize and evaluate SAR in Python on a CPU." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 0 Global Settings and Imports" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "System version: 3.9.16 (main, May 15 2023, 23:46:34) \n", - "[GCC 11.2.0]\n", - "NumPy version: 1.24.3\n", - "Pandas version: 1.5.3\n" - ] - } - ], - "source": [ - "import sys\n", - "import logging\n", - "import numpy as np\n", - "import pandas as pd\n", - "import scrapbook as sb\n", - "from sklearn.preprocessing import minmax_scale\n", - "\n", - "from recommenders.utils.timer import Timer\n", - "from recommenders.datasets import movielens\n", - "from recommenders.utils.python_utils import binarize\n", - "from recommenders.datasets.python_splitters import python_stratified_split\n", - "from recommenders.models.sar import SAR\n", - "from recommenders.evaluation.python_evaluation import (\n", - " map_at_k,\n", - " ndcg_at_k,\n", - " precision_at_k,\n", - " recall_at_k,\n", - " rmse,\n", - " mae,\n", - " logloss,\n", - " rsquared,\n", - " exp_var\n", - ")\n", - "\n", - "%load_ext autoreload\n", - "%autoreload 2\n", - "\n", - "print(f\"System version: {sys.version}\")\n", - "print(f\"NumPy version: {np.__version__}\")\n", - "print(f\"Pandas version: {pd.__version__}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 1 Load Data\n", - "\n", - "SAR is intended to be used on interactions with the following schema:\n", - "`, ,
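The item-to-item ***similarity*** described above comes down to co-occurrence counting, which is easy to see on toy data. The sketch below uses made-up interactions rather than the `SAR` class: it builds the m x m co-occurrence matrix the notes refer to (the memory-hungry part) and derives a Jaccard similarity from it, one of the similarity measures commonly used with SAR.

```python
import numpy as np
import pandas as pd

# Toy interactions: users 1 and 2 touched items A and B; user 3 touched A, B and C.
toy = pd.DataFrame({
    "userID": [1, 1, 2, 2, 3, 3, 3],
    "itemID": ["A", "B", "A", "B", "A", "C", "B"],
})

# User-item incidence matrix (1 = the user interacted with the item).
incidence = pd.crosstab(toy["userID"], toy["itemID"]).clip(upper=1)

# Item-item co-occurrence: C[i, j] = number of users who interacted with both i and j.
cooccurrence = incidence.T @ incidence
print(cooccurrence)

# Jaccard similarity: sim(i, j) = C[i, j] / (C[i, i] + C[j, j] - C[i, j]).
diag = np.diag(cooccurrence.values)
jaccard = cooccurrence.values / (diag[:, None] + diag[None, :] - cooccurrence.values)
print(pd.DataFrame(jaccard, index=cooccurrence.index, columns=cooccurrence.columns))
```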