Skip to content

Commit

Permalink
movielens WIP
Browse files Browse the repository at this point in the history
Signed-off-by: miguelgfierro <[email protected]>
  • Loading branch information
miguelgfierro committed Sep 16, 2023
1 parent abaa152 commit da9dffc
Show file tree
Hide file tree
Showing 4 changed files with 74 additions and 87 deletions.
16 changes: 9 additions & 7 deletions tests/ci/azureml_tests/test_groups.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,12 @@

nightly_test_groups = {
"group_cpu_001": [ # Total group time: 1883s
"tests/smoke/recommenders/dataset/test_movielens.py::test_download_and_extract_movielens", # 0.45s
"tests/data_validation/recommenders/datasets/test_movielens.py::test_download_and_extract_movielens", # 0.45s + 0.61s + 3.47s + 8.28s
#
"tests/smoke/recommenders/dataset/test_movielens.py::test_load_item_df", # 0.47s
"tests/smoke/recommenders/dataset/test_movielens.py::test_load_pandas_df", # 2.45s
#
"tests/integration/recommenders/datasets/test_movielens.py::test_load_pandas_df", # 16.87s
"tests/integration/recommenders/datasets/test_movielens.py::test_download_and_extract_movielens", # 0.61s + 3.47s + 8.28s
"tests/integration/recommenders/datasets/test_movielens.py::test_load_item_df", # 0.59s + 3.59s + 8.44s
"tests/integration/recommenders/datasets/test_movielens.py::test_load_pandas_df", # 37.33s + 352.99s + 673.61s
#
Expand Down Expand Up @@ -156,11 +156,13 @@

pr_gate_test_groups = {
"group_cpu_001": [ # Total group time: 525.96s
"tests/data_validation/recommenders/datasets/test_movielens.py::test_load_pandas_df_mock_100__with_default_param__succeed",
"tests/data_validation/recommenders/datasets/test_movielens.py::test_load_pandas_df_mock_100__with_custom_param__succeed",
"tests/data_validation/recommenders/datasets/test_movielens.py::test_mock_movielens_schema__has_default_col_names",
"tests/data_validation/recommenders/datasets/test_movielens.py::test_mock_movielens_schema__get_df__return_success",
"tests/data_validation/recommenders/datasets/test_movielens.py::test_mock_movielens_schema__get_df_remove_default_col__return_success",
"tests/data_validation/recommenders/datasets/test_movielens.py::test_mock_movielens_schema__get_df_invalid_param__return_failure",
"tests/data_validation/recommenders/datasets/test_movielens.py::test_mock_movielens_schema__get_df__return_success",
"tests/data_validation/recommenders/datasets/test_movielens.py::test_mock_movielens_data__no_name_collision",
"tests/data_validation/recommenders/datasets/test_movielens.py::test_load_pandas_df_mock_100__with_default_param__succeed",
"tests/data_validation/recommenders/datasets/test_movielens.py::test_load_pandas_df_mock_100__with_custom_param__succeed",
"tests/data_validation/recommenders/datasets/test_wikidata.py::test_find_wikidata_id",
"tests/unit/recommenders/datasets/test_download_utils.py::test_maybe_download_wrong_bytes",
"tests/unit/recommenders/datasets/test_download_utils.py::test_maybe_download_maybe",
Expand Down Expand Up @@ -205,11 +207,11 @@
"tests/unit/examples/test_notebooks_python.py::test_sar_single_node_runs",
],
"group_spark_001": [ # Total group time: 270.41s
"tests/data_validation/recommenders/datasets/test_movielens.py::test_load_spark_df_mock_100__with_custom_param__succeed",
"tests/data_validation/recommenders/datasets/test_movielens.py::test_mock_movielens_schema__get_spark_df__return_success",
"tests/data_validation/recommenders/datasets/test_movielens.py::test_mock_movielens_schema__get_spark_df__data_serialization_default_param",
"tests/data_validation/recommenders/datasets/test_movielens.py::test_mock_movielens_schema__get_spark_df__store_tmp_file",
"tests/data_validation/recommenders/datasets/test_movielens.py::test_mock_movielens_schema__get_spark_df__data_serialization_default_param",
"tests/data_validation/recommenders/datasets/test_movielens.py::test_load_spark_df_mock_100__with_default_param__succeed",
"tests/data_validation/recommenders/datasets/test_movielens.py::test_load_spark_df_mock_100__with_custom_param__succeed",
"tests/unit/recommenders/datasets/test_spark_splitter.py::test_stratified_splitter",
"tests/unit/recommenders/datasets/test_spark_splitter.py::test_chrono_splitter",
"tests/unit/recommenders/evaluation/test_spark_evaluation.py::test_user_diversity_item_feature_vector",
Expand Down
95 changes: 65 additions & 30 deletions tests/data_validation/recommenders/datasets/test_movielens.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,27 @@
import os
import pandas
from pandas.core.series import Series
from pytest_mock import MockerFixture
import pytest
from pytest_mock import MockerFixture

try:
from pyspark.sql.types import (
StructType,
StructField,
IntegerType,
)
from pyspark.sql.functions import col
except ImportError:
pass # skip this import if we are in pure python environment

from recommenders.datasets.movielens import MockMovielensSchema
from recommenders.datasets.movielens import load_pandas_df, load_spark_df
from recommenders.datasets.movielens import (
load_pandas_df,
load_spark_df,
load_item_df,
download_movielens,
extract_movielens,
)
from recommenders.datasets.movielens import (
DATA_FORMAT,
MOCK_DATA_FORMAT,
Expand Down Expand Up @@ -66,6 +82,53 @@ def test_mock_movielens_schema__get_df__return_success(
assert len(df[DEFAULT_GENRE_COL]) == size


def test_mock_movielens_data__no_name_collision():
    """Verify the mock dataset size keys never overlap with the real ones.

    A shared key would make the movielens loaders ambiguous about whether
    to download real data or generate mock data.
    """
    real_sizes = set(DATA_FORMAT)
    mock_sizes = set(MOCK_DATA_FORMAT)
    assert real_sizes.isdisjoint(mock_sizes)


def test_load_pandas_df_mock_100__with_default_param__succeed():
    """mock100 with default parameters returns a 100-row pandas DataFrame."""
    mock_df = load_pandas_df("mock100")
    # The generated frame must be a real pandas DataFrame of exactly 100 rows.
    assert type(mock_df) == pandas.DataFrame
    assert len(mock_df) == 100
    # Every (user, item) interaction in the mock data must be unique.
    duplicated = mock_df[[DEFAULT_USER_COL, DEFAULT_ITEM_COL]].duplicated()
    assert not duplicated.any()


def test_load_pandas_df_mock_100__with_custom_param__succeed():
    """mock100 with explicit title/genres columns exposes both as Series."""
    mock_df = load_pandas_df(
        "mock100", title_col=DEFAULT_TITLE_COL, genres_col=DEFAULT_GENRE_COL
    )
    # Both optional columns should come back as pandas Series.
    assert type(mock_df[DEFAULT_TITLE_COL]) == Series
    assert type(mock_df[DEFAULT_GENRE_COL]) == Series
    assert len(mock_df) == 100
    # Mock genres are pipe-separated; mock titles are the constant "foo".
    assert "|" in mock_df.loc[0, DEFAULT_GENRE_COL]
    assert mock_df.loc[0, DEFAULT_TITLE_COL] == "foo"


@pytest.mark.parametrize("size", ["100k", "1m", "10m", "20m"])
def test_download_and_extract_movielens(size, tmp):
    """Download a MovieLens archive and extract its rating/item files.

    Checks that the zip is the only artifact after download, and that
    extraction leaves the rating and item files cached next to it.
    """
    archive = os.path.join(tmp, "ml.zip")
    download_movielens(size, dest_path=archive)
    # Only the downloaded archive should exist at this point.
    assert os.path.exists(archive)
    assert len(os.listdir(tmp)) == 1

    ratings_file = os.path.join(tmp, "rating.dat")
    items_file = os.path.join(tmp, "item.dat")
    extract_movielens(
        size, rating_path=ratings_file, item_path=items_file, zip_path=archive
    )
    # The archive plus the two extracted files should now be cached.
    assert os.path.exists(ratings_file)
    assert os.path.exists(items_file)
    assert len(os.listdir(tmp)) == 3


@pytest.mark.spark
@pytest.mark.parametrize("keep_genre_col", [True, False])
@pytest.mark.parametrize("keep_title_col", [True, False])
Expand Down Expand Up @@ -109,29 +172,12 @@ def test_mock_movielens_schema__get_spark_df__data_serialization_default_param(
assert df.count() == data_size


def test_mock_movielens_data__no_name_collision():
    """Ensure mock and real dataset size identifiers are fully distinct.

    The loaders dispatch on the size string, so any key present in both
    DATA_FORMAT and MOCK_DATA_FORMAT would be ambiguous.
    """
    shared_keys = set(DATA_FORMAT) & set(MOCK_DATA_FORMAT)
    assert not shared_keys


@pytest.mark.spark
def test_load_spark_df_mock_100__with_default_param__succeed(spark):
    """mock100 with default parameters yields a 100-row Spark DataFrame."""
    mock_df = load_spark_df(spark, "mock100")
    # Spark materializes lazily; count() forces generation of all rows.
    assert mock_df.count() == 100


def test_load_pandas_df_mock_100__with_default_param__succeed():
    """Loading mock100 with defaults gives a deduplicated 100-row DataFrame."""
    result = load_pandas_df("mock100")
    assert type(result) == pandas.DataFrame
    assert len(result) == 100
    # No (user, item) pair may appear more than once in the mock ratings.
    pairs = result[[DEFAULT_USER_COL, DEFAULT_ITEM_COL]]
    assert not pairs.duplicated().any()


@pytest.mark.spark
def test_load_spark_df_mock_100__with_custom_param__succeed(spark):
df = load_spark_df(
Expand All @@ -142,14 +188,3 @@ def test_load_spark_df_mock_100__with_custom_param__succeed(spark):
assert df.count() == 100
assert "|" in df.take(1)[0][DEFAULT_GENRE_COL]
assert df.take(1)[0][DEFAULT_TITLE_COL] == "foo"


def test_load_pandas_df_mock_100__with_custom_param__succeed():
    """Requesting title and genres columns on mock100 populates both."""
    result = load_pandas_df(
        "mock100", title_col=DEFAULT_TITLE_COL, genres_col=DEFAULT_GENRE_COL
    )
    assert len(result) == 100
    # Optional columns must be pandas Series once requested.
    assert type(result[DEFAULT_TITLE_COL]) == Series
    assert type(result[DEFAULT_GENRE_COL]) == Series
    # First row spot-check: genres are pipe-joined, titles are "foo".
    assert result.loc[0, DEFAULT_TITLE_COL] == "foo"
    assert "|" in result.loc[0, DEFAULT_GENRE_COL]
20 changes: 0 additions & 20 deletions tests/integration/recommenders/datasets/test_movielens.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,23 +271,3 @@ def test_load_spark_df(
assert len(df.columns) == 4
del df
gc.collect()


@pytest.mark.integration
@pytest.mark.parametrize("size", ["1m", "10m", "20m"])
def test_download_and_extract_movielens(size, tmp):
    """Integration check of MovieLens download plus extraction.

    After download only the archive exists; after extraction the rating
    and item files are cached alongside it.
    """
    archive = os.path.join(tmp, "ml.zip")
    download_movielens(size, dest_path=archive)
    assert os.path.exists(archive)
    # The temp dir should hold exactly the downloaded archive.
    assert len(os.listdir(tmp)) == 1

    ratings_file = os.path.join(tmp, "rating.dat")
    items_file = os.path.join(tmp, "item.dat")
    extract_movielens(
        size, rating_path=ratings_file, item_path=items_file, zip_path=archive
    )
    # Archive + rating file + item file = three cached artifacts.
    assert os.path.exists(ratings_file)
    assert os.path.exists(items_file)
    assert len(os.listdir(tmp)) == 3
30 changes: 0 additions & 30 deletions tests/smoke/recommenders/dataset/test_movielens.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,16 +11,6 @@
extract_movielens,
)

try:
from pyspark.sql.types import (
StructType,
StructField,
IntegerType,
)
from pyspark.sql.functions import col
except ImportError:
pass # skip this import if we are in pure python environment


@pytest.mark.smoke
@pytest.mark.parametrize(
Expand Down Expand Up @@ -210,23 +200,3 @@ def test_load_spark_df(
assert df.count() == num_samples
# user, item, rating and timestamp
assert len(df.columns) == 4


@pytest.mark.smoke
@pytest.mark.parametrize("size", ["100k"])
def test_download_and_extract_movielens(size, tmp):
    """Smoke check of MovieLens download plus extraction (100k only).

    Verifies the archive is the sole download artifact, then that
    extraction produces the rating and item files next to it.
    """
    archive = os.path.join(tmp, "ml.zip")
    download_movielens(size, dest_path=archive)
    assert os.path.exists(archive)
    # Only the zip should be present right after download.
    assert len(os.listdir(tmp)) == 1

    ratings_file = os.path.join(tmp, "rating.dat")
    items_file = os.path.join(tmp, "item.dat")
    extract_movielens(
        size, rating_path=ratings_file, item_path=items_file, zip_path=archive
    )
    # Zip, rating file and item file should all be cached now.
    assert os.path.exists(ratings_file)
    assert os.path.exists(items_file)
    assert len(os.listdir(tmp)) == 3

0 comments on commit da9dffc

Please sign in to comment.