From 0d5fa3df2034c71f372b5fe45ddc0b65bbef36db Mon Sep 17 00:00:00 2001 From: Simon Zhao <43029286+simonzhaoms@users.noreply.github.com> Date: Mon, 22 Nov 2021 14:32:31 +0800 Subject: [PATCH 01/33] Upgrade sarplus to support Spark 3.x --- contrib/sarplus/python/pysarplus/SARModel.py | 5 +- contrib/sarplus/python/pysarplus/SARPlus.py | 99 ++++-------- contrib/sarplus/python/setup.py | 15 +- .../sarplus/python/tests/test_pyspark_sar.py | 144 +++++------------- contrib/sarplus/scala/build.sbt | 102 ++++++++++--- .../sarplus/scala/project/build.properties | 2 +- contrib/sarplus/scala/project/plugins.sbt | 6 +- contrib/sarplus/scala/python/setup.py | 4 +- .../sarplus/SARCacheOutputWriter.scala | 19 ++- 9 files changed, 181 insertions(+), 215 deletions(-) diff --git a/contrib/sarplus/python/pysarplus/SARModel.py b/contrib/sarplus/python/pysarplus/SARModel.py index bd18c2c88f..427bdccd0a 100644 --- a/contrib/sarplus/python/pysarplus/SARModel.py +++ b/contrib/sarplus/python/pysarplus/SARModel.py @@ -17,10 +17,7 @@ def __init__(self, path): def find_or_raise(extension): files = [f for f in all_files if f.endswith(extension)] if len(files) != 1: - raise ValueError( - "Directory '%s' must contain exactly 1 file ending in '%s'" - % (path, extension) - ) + raise ValueError("Directory '%s' must contain exactly 1 file ending in '%s'" % (path, extension)) return path + "/" + files[0] # instantiate C++ backend diff --git a/contrib/sarplus/python/pysarplus/SARPlus.py b/contrib/sarplus/python/pysarplus/SARPlus.py index c372bf82bd..e560002760 100644 --- a/contrib/sarplus/python/pysarplus/SARPlus.py +++ b/contrib/sarplus/python/pysarplus/SARPlus.py @@ -1,13 +1,8 @@ -""" -This is the one and only (to rule them all) implementation of SAR. -""" +"""This is the implementation of SAR.""" import logging -import pyspark.sql.functions as F import pandas as pd from pyspark.sql.types import ( - StringType, - DoubleType, StructType, StructField, IntegerType, @@ -16,6 +11,7 @@ from pyspark.sql.functions import pandas_udf, PandasUDFType from pysarplus import SARModel + SIM_COOCCUR = "cooccurrence" SIM_JACCARD = "jaccard" SIM_LIFT = "lift" @@ -25,7 +21,7 @@ class SARPlus: - """SAR implementation for PySpark""" + """SAR implementation for PySpark.""" def __init__( self, @@ -66,13 +62,15 @@ def f(self, str, **kwargs): # current time for time decay calculation # cooccurrence matrix threshold def fit(self, df): - """Main fit method for SAR. Expects the dataframes to have row_id, col_id columns which are indexes, + """Main fit method for SAR. + + Expects the dataframes to have row_id, col_id columns which are indexes, i.e. contain the sequential integer index of the original alphanumeric user and item IDs. Dataframe also contains rating and timestamp as floats; timestamp is in seconds since Epoch by default. Arguments: - df (pySpark.DataFrame): input dataframe which contains the index of users and items. """ - + df (pySpark.DataFrame): input dataframe which contains the index of users and items. 
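For reference while reading the Spark SQL that follows, the exponential time-decay weighting that `fit()` applies can be written out in plain pandas. This is a hedged sketch only, not part of the patch; the column names `userID`/`itemID`/`rating`/`timestamp` are assumed to match the defaults used in the test fixtures further down.

```python
# Reference sketch of the SAR time-decay affinity (assumed column names).
import numpy as np
import pandas as pd


def time_decayed_affinity(events, half_life_days=30):
    """Sum ratings per (user, item), discounting each event by
    2 ** (-age / half_life); age is measured against the newest
    timestamp in the data, in seconds since the Epoch."""
    latest = events["timestamp"].max()
    decay = np.exp(
        -np.log(2) * (latest - events["timestamp"]) / (half_life_days * 24 * 3600)
    )
    return (
        events.assign(rating=events["rating"] * decay)
        .groupby(["userID", "itemID"], as_index=False)["rating"]
        .sum()
    )


events = pd.DataFrame(
    {
        "userID": [1, 1, 2],
        "itemID": [10, 10, 20],
        "rating": [1.0, 1.0, 1.0],
        "timestamp": [0.0, 30 * 24 * 3600.0, 30 * 24 * 3600.0],
    }
)
# User 1's older event is exactly one half-life old, so it contributes 0.5
# and the (1, 10) affinity comes out as 1.5.
print(time_decayed_affinity(events))
```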
+ """ # threshold - items below this number get set to zero in coocurrence counts df.createOrReplaceTempView(self.f("{prefix}df_train_input")) @@ -93,12 +91,12 @@ def fit(self, df): query = self.f( """ SELECT - {col_user}, {col_item}, + {col_user}, {col_item}, SUM({col_rating} * EXP(-log(2) * (latest_timestamp - CAST({col_timestamp} AS long)) / ({time_decay_coefficient} * 3600 * 24))) as {col_rating} FROM {prefix}df_train_input, (SELECT CAST(MAX({col_timestamp}) AS long) latest_timestamp FROM {prefix}df_train_input) - GROUP BY {col_user}, {col_item} - CLUSTER BY {col_user} + GROUP BY {col_user}, {col_item} + CLUSTER BY {col_user} """ ) @@ -106,9 +104,7 @@ def fit(self, df): df = self.spark.sql(query) else: # since SQL is case insensitive, this check needs to be performed similar - if self.header["col_timestamp"].lower() in [ - s.name.lower() for s in df.schema - ]: + if self.header["col_timestamp"].lower() in [s.name.lower() for s in df.schema]: # we need to de-duplicate items by using the latest item query = self.f( """ @@ -143,16 +139,12 @@ def fit(self, df): ) item_cooccurrence = self.spark.sql(query) - item_cooccurrence.write.mode("overwrite").saveAsTable( - self.f("{prefix}item_cooccurrence") - ) + item_cooccurrence.write.mode("overwrite").saveAsTable(self.f("{prefix}item_cooccurrence")) # compute the diagonal used later for Jaccard and Lift if self.similarity_type == SIM_LIFT or self.similarity_type == SIM_JACCARD: item_marginal = self.spark.sql( - self.f( - "SELECT i1 i, value AS margin FROM {prefix}item_cooccurrence WHERE i1 = i2" - ) + self.f("SELECT i1 i, value AS margin FROM {prefix}item_cooccurrence WHERE i1 = i2") ) item_marginal.createOrReplaceTempView(self.f("{prefix}item_marginal")) @@ -181,17 +173,11 @@ def fit(self, df): ) self.item_similarity = self.spark.sql(query) else: - raise ValueError( - "Unknown similarity type: {0}".format(self.similarity_type) - ) + raise ValueError("Unknown similarity type: {0}".format(self.similarity_type)) # store upper triangular - log.info( - "sarplus.fit 2/2: compute similiarity metric %s..." % self.similarity_type - ) - self.item_similarity.write.mode("overwrite").saveAsTable( - self.f("{prefix}item_similarity_upper") - ) + log.info("sarplus.fit 2/2: compute similiarity metric %s..." 
% self.similarity_type) + self.item_similarity.write.mode("overwrite").saveAsTable(self.f("{prefix}item_similarity_upper")) # expand upper triangular to full matrix @@ -209,9 +195,7 @@ def fit(self, df): ) self.item_similarity = self.spark.sql(query) - self.item_similarity.write.mode("overwrite").saveAsTable( - self.f("{prefix}item_similarity") - ) + self.item_similarity.write.mode("overwrite").saveAsTable(self.f("{prefix}item_similarity")) # free space self.spark.sql(self.f("DROP TABLE {prefix}item_cooccurrence")) @@ -228,14 +212,10 @@ def get_user_affinity(self, test): """ test.createOrReplaceTempView(self.f("{prefix}df_test")) - query = self.f( - "SELECT DISTINCT {col_user} FROM {prefix}df_test CLUSTER BY {col_user}" - ) + query = self.f("SELECT DISTINCT {col_user} FROM {prefix}df_test CLUSTER BY {col_user}") df_test_users = self.spark.sql(query) - df_test_users.write.mode("overwrite").saveAsTable( - self.f("{prefix}df_test_users") - ) + df_test_users.write.mode("overwrite").saveAsTable(self.f("{prefix}df_test_users")) query = self.f( """ @@ -249,12 +229,7 @@ def get_user_affinity(self, test): return self.spark.sql(query) def recommend_k_items( - self, - test, - cache_path, - top_k=10, - remove_seen=True, - n_user_prediction_partitions=200, + self, test, cache_path, top_k=10, remove_seen=True, n_user_prediction_partitions=200, ): # create item id to continuous index mapping @@ -286,16 +261,10 @@ def recommend_k_items( log.info("sarplus.recommend_k_items 2/3: prepare similarity matrix") self.spark.sql( - self.f( - "SELECT i1, i2, CAST(value AS DOUBLE) value FROM {prefix}item_similarity_mapped ORDER BY i1, i2" - ) - ).coalesce(1).write.format("com.microsoft.sarplus").mode("overwrite").save( - cache_path_output - ) + self.f("SELECT i1, i2, CAST(value AS DOUBLE) value FROM {prefix}item_similarity_mapped ORDER BY i1, i2") + ).coalesce(1).write.format("com.microsoft.sarplus").mode("overwrite").save(cache_path_output) - self.get_user_affinity(test).createOrReplaceTempView( - self.f("{prefix}user_affinity") - ) + self.get_user_affinity(test).createOrReplaceTempView(self.f("{prefix}user_affinity")) # map item ids to index space pred_input = self.spark.sql( @@ -314,9 +283,7 @@ def recommend_k_items( schema = StructType( [ - StructField( - "userID", pred_input.schema[self.header["col_user"]].dataType, True - ), + StructField("userID", pred_input.schema[self.header["col_user"]].dataType, True), StructField("itemID", IntegerType(), True), StructField("score", FloatType(), True), ] @@ -334,24 +301,18 @@ def sar_predict_udf(df): # memory mapped, the memory consumption only happens ones per worker # for all python processes model = SARModel(cache_path_input) - preds = model.predict( - df["idx"].values, df["rating"].values, top_k, remove_seen - ) + preds = model.predict(df["idx"].values, df["rating"].values, top_k, remove_seen) user = df[local_header["col_user"]].iloc[0] - preds_ret = pd.DataFrame( - [(user, x.id, x.score) for x in preds], columns=range(3) - ) + preds_ret = pd.DataFrame([(user, x.id, x.score) for x in preds], columns=range(3)) return preds_ret log.info("sarplus.recommend_k_items 3/3: compute recommendations") df_preds = ( - pred_input.repartition( - n_user_prediction_partitions, self.header["col_user"] - ) + pred_input.repartition(n_user_prediction_partitions, self.header["col_user"]) .groupby(self.header["col_user"]) .apply(sar_predict_udf) ) @@ -381,9 +342,7 @@ def recommend_k_items_slow(self, test, top_k=10, remove_seen=True): if remove_seen: raise ValueError("Not implemented") - 
self.get_user_affinity(test).write.mode("overwrite").saveAsTable( - self.f("{prefix}user_affinity") - ) + self.get_user_affinity(test).write.mode("overwrite").saveAsTable(self.f("{prefix}user_affinity")) # user_affinity * item_similarity # filter top-k diff --git a/contrib/sarplus/python/setup.py b/contrib/sarplus/python/setup.py index bb072fe324..4b8fff292f 100644 --- a/contrib/sarplus/python/setup.py +++ b/contrib/sarplus/python/setup.py @@ -1,3 +1,4 @@ +import os import sysconfig from setuptools import setup @@ -13,10 +14,17 @@ def __str__(self): return pybind11.get_include(self.user) +DEPENDENCIES = [ + "numpy", + "pandas", + # "pyarrow==0.13.0", + "pybind11>=2.2", + "pyspark>=3.0.0" +] setup( name="pysarplus", - version="0.2.6", + version=os.environ["VERSION"], description="SAR prediction for use with PySpark", url="https://github.com/Microsoft/Recommenders/contrib/sarplus", author="Markus Cozowicz", @@ -33,7 +41,7 @@ def __str__(self): "Topic :: Scientific/Engineering :: Mathematics", ], setup_requires=["pytest-runner"], - install_requires=["pybind11>=2.2"], + install_requires=DEPENDENCIES, tests_require=["pytest"], packages=["pysarplus"], ext_modules=[ @@ -41,8 +49,7 @@ def __str__(self): "pysarplus_cpp", ["src/pysarplus.cpp"], include_dirs=[get_pybind_include(), get_pybind_include(user=True)], - extra_compile_args=sysconfig.get_config_var("CFLAGS").split() - + ["-std=c++11", "-Wall", "-Wextra"], + extra_compile_args=sysconfig.get_config_var("CFLAGS").split() + ["-std=c++11", "-Wall", "-Wextra"], libraries=["stdc++"], language="c++11", ) diff --git a/contrib/sarplus/python/tests/test_pyspark_sar.py b/contrib/sarplus/python/tests/test_pyspark_sar.py index f2b85b5e2c..35df0ee0a0 100644 --- a/contrib/sarplus/python/tests/test_pyspark_sar.py +++ b/contrib/sarplus/python/tests/test_pyspark_sar.py @@ -1,26 +1,26 @@ import calendar import datetime import math +import os +from pathlib import Path + import numpy as np import pandas as pd +from pyspark.sql import SparkSession import pytest -import os from sklearn.model_selection import train_test_split -from pyspark.sql import SparkSession from pysarplus import SARPlus, SARModel def assert_compare(expected_id, expected_score, actual_prediction): assert expected_id == actual_prediction.id - assert math.isclose( - expected_score, actual_prediction.score, rel_tol=1e-3, abs_tol=1e-3 - ) + assert math.isclose(expected_score, actual_prediction.score, rel_tol=1e-3, abs_tol=1e-3) @pytest.fixture(scope="module") -def spark(app_name="Sample", url="local[*]", memory="1G"): +def spark(tmp_path_factory, app_name="Sample", url="local[*]", memory="1G"): """Start Spark if not started Args: app_name (str): sets name of the application @@ -28,19 +28,25 @@ def spark(app_name="Sample", url="local[*]", memory="1G"): memory (str): size of memory for spark driver """ + try: + sarplus_jar_path = next( + Path(__file__) + .parents[2] + .joinpath("scala", "target") + .glob(f"**/*{os.environ.get('VERSION', '')}.jar")).absolute() + except StopIteration: + raise Exception("Could not find Sarplus JAR file") + spark = ( SparkSession.builder.appName(app_name) .master(url) - .config( - "spark.jars", - os.path.dirname(__file__) - + "/../../scala/target/scala-2.11/sarplus_2.11-0.2.6.jar", - ) + .config("spark.jars", sarplus_jar_path) .config("spark.driver.memory", memory) .config("spark.sql.shuffle.partitions", "1") .config("spark.default.parallelism", "1") .config("spark.sql.crossJoin.enabled", True) .config("spark.ui.enabled", False) + .config("spark.sql.warehouse.dir", 
str(tmp_path_factory.mktemp("spark"))) # .config("spark.eventLog.enabled", True) # only for local debugging, breaks on build server .getOrCreate() ) @@ -130,12 +136,7 @@ def test_pandas(spark, sample_cache): item_scores = pd.DataFrame([(0, 2.3), (1, 3.1)], columns=["itemID", "score"]) model = SARModel(sample_cache) - y = model.predict( - item_scores["itemID"].values, - item_scores["score"].values, - top_k=10, - remove_seen=False, - ) + y = model.predict(item_scores["itemID"].values, item_scores["score"].values, top_k=10, remove_seen=False,) assert_compare(0, 0.85, y[0]) assert_compare(1, 6.9699, y[1]) @@ -149,9 +150,7 @@ def test_e2e(spark, pandas_dummy_dataset, header): df = spark.createDataFrame(pandas_dummy_dataset) sar.fit(df) - test_df = spark.createDataFrame( - pd.DataFrame({header["col_user"]: [3], header["col_item"]: [2]}) - ) + test_df = spark.createDataFrame(pd.DataFrame({header["col_user"]: [3], header["col_item"]: [2]})) r1 = ( sar.recommend_k_items_slow(test_df, top_k=3, remove_seen=False) @@ -162,11 +161,7 @@ def test_e2e(spark, pandas_dummy_dataset, header): r2 = ( sar.recommend_k_items( - test_df, - "tests/test_e2e_cache", - top_k=3, - n_user_prediction_partitions=2, - remove_seen=False, + test_df, "tests/test_e2e_cache", top_k=3, n_user_prediction_partitions=2, remove_seen=False, ) .toPandas() .sort_values([header["col_user"], header["col_item"]]) @@ -218,11 +213,7 @@ def demo_usage_data(header, sar_settings): # convert timestamp data[header["col_timestamp"]] = data[header["col_timestamp"]].apply( - lambda s: float( - calendar.timegm( - datetime.datetime.strptime(s, "%Y/%m/%dT%H:%M:%S").timetuple() - ) - ) + lambda s: float(calendar.timegm(datetime.datetime.strptime(s, "%Y/%m/%dT%H:%M:%S").timetuple())) ) return data @@ -249,18 +240,9 @@ def sar_settings(): } -@pytest.mark.parametrize( - "similarity_type, timedecay_formula", [("jaccard", False), ("lift", True)] -) -def test_fit( - spark, similarity_type, timedecay_formula, train_test_dummy_timestamp, header -): - model = SARPlus( - spark, - **header, - timedecay_formula=timedecay_formula, - similarity_type=similarity_type - ) +@pytest.mark.parametrize("similarity_type, timedecay_formula", [("jaccard", False), ("lift", True)]) +def test_fit(spark, similarity_type, timedecay_formula, train_test_dummy_timestamp, header): + model = SARPlus(spark, **header, timedecay_formula=timedecay_formula, similarity_type=similarity_type) trainset, testset = train_test_dummy_timestamp @@ -276,6 +258,7 @@ def test_fit( Main SAR tests are below - load test files which are used for both Scala SAR and Python reference implementations """ + # Tests 1-6 @pytest.mark.parametrize( "threshold,similarity_type,file", @@ -288,9 +271,7 @@ def test_fit( (3, "lift", "lift"), ], ) -def test_sar_item_similarity( - spark, threshold, similarity_type, file, demo_usage_data, sar_settings, header -): +def test_sar_item_similarity(spark, threshold, similarity_type, file, demo_usage_data, sar_settings, header): model = SARPlus( spark, @@ -306,41 +287,25 @@ def test_sar_item_similarity( model.fit(df) # reference - item_similarity_ref = pd.read_csv( - sar_settings["FILE_DIR"] + "sim_" + file + str(threshold) + ".csv" - ) + item_similarity_ref = pd.read_csv(sar_settings["FILE_DIR"] + "sim_" + file + str(threshold) + ".csv") item_similarity_ref = pd.melt( - item_similarity_ref, - item_similarity_ref.columns[0], - item_similarity_ref.columns[1:], - "i2", - "value", + item_similarity_ref, item_similarity_ref.columns[0], item_similarity_ref.columns[1:], "i2", "value", ) 
item_similarity_ref.columns = ["i1", "i2", "value"] item_similarity_ref = ( - item_similarity_ref[item_similarity_ref.value > 0] - .sort_values(["i1", "i2"]) - .reset_index(drop=True) + item_similarity_ref[item_similarity_ref.value > 0].sort_values(["i1", "i2"]).reset_index(drop=True) ) # actual - item_similarity = ( - model.item_similarity.toPandas() - .sort_values(["i1", "i2"]) - .reset_index(drop=True) - ) + item_similarity = model.item_similarity.toPandas().sort_values(["i1", "i2"]).reset_index(drop=True) if similarity_type == "cooccurrence": assert (item_similarity_ref == item_similarity).all().all() else: - assert ( - (item_similarity.iloc[:, :1] == item_similarity_ref.iloc[:, :1]).all().all() - ) + assert (item_similarity.iloc[:, :1] == item_similarity_ref.iloc[:, :1]).all().all() - assert np.allclose( - item_similarity.value.values, item_similarity_ref.value.values - ) + assert np.allclose(item_similarity.value.values, item_similarity_ref.value.values) # Test 7 @@ -361,40 +326,27 @@ def test_user_affinity(spark, demo_usage_data, sar_settings, header): user_affinity_ref = pd.read_csv(sar_settings["FILE_DIR"] + "user_aff.csv") user_affinity_ref = pd.melt( - user_affinity_ref, - user_affinity_ref.columns[0], - user_affinity_ref.columns[1:], - "ItemId", - "Rating", - ) - user_affinity_ref = user_affinity_ref[user_affinity_ref.Rating > 0].reset_index( - drop=True + user_affinity_ref, user_affinity_ref.columns[0], user_affinity_ref.columns[1:], "ItemId", "Rating", ) + user_affinity_ref = user_affinity_ref[user_affinity_ref.Rating > 0].reset_index(drop=True) # construct dataframe with test user id we'd like to get the affinity for - df_test = spark.createDataFrame( - pd.DataFrame({header["col_user"]: [sar_settings["TEST_USER_ID"]]}) - ) + df_test = spark.createDataFrame(pd.DataFrame({header["col_user"]: [sar_settings["TEST_USER_ID"]]})) user_affinity = model.get_user_affinity(df_test).toPandas().reset_index(drop=True) # verify the that item ids are the same assert (user_affinity[header["col_item"]] == user_affinity_ref.ItemId).all() assert np.allclose( - user_affinity_ref[header["col_rating"]].values, - user_affinity["Rating"].values, - atol=sar_settings["ATOL"], + user_affinity_ref[header["col_rating"]].values, user_affinity["Rating"].values, atol=sar_settings["ATOL"], ) # Tests 8-10 @pytest.mark.parametrize( - "threshold,similarity_type,file", - [(3, "cooccurrence", "count"), (3, "jaccard", "jac"), (3, "lift", "lift")], + "threshold,similarity_type,file", [(3, "cooccurrence", "count"), (3, "jaccard", "jac"), (3, "lift", "lift")], ) -def test_userpred( - spark, threshold, similarity_type, file, header, sar_settings, demo_usage_data -): +def test_userpred(spark, tmp_path, threshold, similarity_type, file, header, sar_settings, demo_usage_data): time_now = demo_usage_data[header["col_timestamp"]].max() test_id = "{0}_{1}_{2}".format(threshold, similarity_type, file) @@ -413,13 +365,7 @@ def test_userpred( df = spark.createDataFrame(demo_usage_data) model.fit(df) - url = ( - sar_settings["FILE_DIR"] - + "userpred_" - + file - + str(threshold) - + "_userid_only.csv" - ) + url = sar_settings["FILE_DIR"] + "userpred_" + file + str(threshold) + "_userid_only.csv" pred_ref = pd.read_csv(url) pred_ref = ( @@ -430,12 +376,8 @@ def test_userpred( # Note: it's important to have a separate cache_path for each run as they're interferring with each other pred = model.recommend_k_items( - spark.createDataFrame( - demo_usage_data[ - demo_usage_data[header["col_user"]] == sar_settings["TEST_USER_ID"] - ] 
- ), - cache_path="test_userpred-" + test_id, + spark.createDataFrame(demo_usage_data[demo_usage_data[header["col_user"]] == sar_settings["TEST_USER_ID"]]), + cache_path=str(tmp_path.joinpath("test_userpred-" + test_id)), top_k=10, n_user_prediction_partitions=1, ) @@ -443,6 +385,4 @@ def test_userpred( pred = pred.toPandas().sort_values("score", ascending=False).reset_index(drop=True) assert (pred.MovieId.values == pred_ref.rec.values).all() - assert np.allclose( - pred.score.values, pred_ref.score.values, atol=sar_settings["ATOL"] - ) + assert np.allclose(pred.score.values, pred_ref.score.values, atol=sar_settings["ATOL"]) diff --git a/contrib/sarplus/scala/build.sbt b/contrib/sarplus/scala/build.sbt index f79cb49a43..83364a1645 100644 --- a/contrib/sarplus/scala/build.sbt +++ b/contrib/sarplus/scala/build.sbt @@ -1,30 +1,88 @@ -scalaVersion := "2.11.8" - -sparkVersion := sys.env.get("sparkversion").getOrElse("2.3.0") - -spName := "microsoft/sarplus" - -organization := "microsoft" name := "sarplus" +licenses := Seq("MIT" -> url("http://opensource.org/licenses/MIT")) +// credentials += Credentials(Path.userHome / ".m2" / ".sbtcredentials") +// publishTo := { +// val org = sys.env.getOrElse("ORG", "") +// val project = sys.env.getOrElse("PROJECT", "") +// val feed = sys.env.getOrElse("FEED", "") +// Some("releases" at "https://pkgs.dev.azure.com/%s/%s/_packaging/%s/Maven/v1".format(org, project, feed)) +// } -version := "0.2.6" +lazy val sparkVer = settingKey[String]("spark version") +lazy val hadoopVer = settingKey[String]("hadoop version") +lazy val commonsIoVer = settingKey[String]("commons-io version") +lazy val jacksonDatabindVer = settingKey[String]("jackson-databind version") -sparkComponents ++= Seq("core", "sql", "mllib") +lazy val commonSettings = Seq( + organization := "sarplus.microsoft", + version := sys.env.getOrElse("VERSION", "0.5.0"), + crossScalaVersions := Seq("2.11.12", "2.12.10", "2.12.14"), + resolvers ++= Seq( + Resolver.sonatypeRepo("snapshots"), + Resolver.sonatypeRepo("releases"), + ), + addCompilerPlugin("org.scalamacros" % "paradise" % "2.1.1" cross CrossVersion.full), + sparkVer := { + scalaVersion.value match { + case "2.11.12" => "2.4.5" + case "2.12.10" => "3.1.2" + case "2.12.14" => "3.2.0" + case _ => "3.2.0" + } + }, -libraryDependencies ++= Seq( - "commons-io" % "commons-io" % "2.6", - "com.google.guava" % "guava" % "25.0-jre", - "org.scalatest" %% "scalatest" % "3.0.5" % "test", - "org.scalamock" %% "scalamock" % "4.1.0" % "test" -) + hadoopVer := { + scalaVersion.value match { + case "2.11.12" => "2.7.3" + case "2.12.10" => "2.7.4" + case "2.12.14" => "3.3.1" + case _ => "3.3.1" + } + }, -// All Spark Packages need a license -licenses := Seq("MIT" -> url("http://opensource.org/licenses/MIT")) + commonsIoVer := { + scalaVersion.value match { + case "2.11.12" => "2.4" + case "2.12.10" => "2.4" + case "2.12.14" => "2.8.0" + case _ => "2.8.0" + } + }, -// doesn't work anyway... 
-credentials += Credentials(Path.userHome / ".ivy2" / ".sbtcredentials") // A file containing credentials + jacksonDatabindVer := { + scalaVersion.value match { + case "2.11.12" => "2.6.7.1" + case "2.12.10" => "2.10.0" + case "2.12.14" => "2.12.2" + case _ => "2.12.2" + } + }, + libraryDependencies ++= Seq( + "com.fasterxml.jackson.core" % "jackson-databind" % jacksonDatabindVer.value, + "commons-io" % "commons-io" % commonsIoVer.value, + "org.apache.hadoop" % "hadoop-common" % hadoopVer.value, + "org.apache.hadoop" % "hadoop-hdfs" % hadoopVer.value, + "org.apache.spark" %% "spark-core" % sparkVer.value, + "org.apache.spark" %% "spark-mllib" % sparkVer.value, + "org.apache.spark" %% "spark-sql" % sparkVer.value, + "org.scala-lang" % "scala-reflect" % scalaVersion.value, + "com.google.guava" % "guava" % "15.0", + "org.scalamock" %% "scalamock" % "4.1.0" % "test", + "org.scalatest" %% "scalatest" % "3.0.8" % "test", + "xerces" % "xercesImpl" % "2.12.1", + ), + artifactName := { + (sv: ScalaVersion, module: ModuleID, artifact: Artifact) => + artifact.name + "_" + sv.full + "_" + sparkVer.value + "-" + module.revision + "." + artifact.extension + }, +) -spHomepage := "http://github.com/Microsoft/Recommenders/contrib/sarplus" +lazy val compat = project.settings(commonSettings) +lazy val root = (project in file(".")) + .dependsOn(compat) + .settings( + name := "sarplus", + commonSettings, + ) -// If you published your package to Maven Central for this release (must be done prior to spPublish) -spIncludeMaven := true +// aetherPublishBothSettings diff --git a/contrib/sarplus/scala/project/build.properties b/contrib/sarplus/scala/project/build.properties index 133a8f197e..10fd9eee04 100644 --- a/contrib/sarplus/scala/project/build.properties +++ b/contrib/sarplus/scala/project/build.properties @@ -1 +1 @@ -sbt.version=0.13.17 +sbt.version=1.5.5 diff --git a/contrib/sarplus/scala/project/plugins.sbt b/contrib/sarplus/scala/project/plugins.sbt index f1495154b8..2bd037f6a1 100644 --- a/contrib/sarplus/scala/project/plugins.sbt +++ b/contrib/sarplus/scala/project/plugins.sbt @@ -1,4 +1,2 @@ -// You may use this file to add plugin dependencies for sbt. 
-resolvers += "Spark Package Main Repo" at "https://dl.bintray.com/spark-packages/maven" - -addSbtPlugin("org.spark-packages" %% "sbt-spark-package" % "0.2.6") +addSbtPlugin("no.arktekk.sbt" % "aether-deploy" % "0.27.0") +addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.9.1") diff --git a/contrib/sarplus/scala/python/setup.py b/contrib/sarplus/scala/python/setup.py index 49a0dc8b59..fedf198952 100644 --- a/contrib/sarplus/scala/python/setup.py +++ b/contrib/sarplus/scala/python/setup.py @@ -1,8 +1,10 @@ from distutils.core import setup +import os + setup( name="pysarplus_dummy", - version="0.2", + version=os.environ["VERSION"], description="pysarplus dummy package to trigger spark packaging", author="Markus Cozowicz", author_email="marcozo@microsoft.com", diff --git a/contrib/sarplus/scala/src/main/scala/com/microsoft/sarplus/SARCacheOutputWriter.scala b/contrib/sarplus/scala/src/main/scala/com/microsoft/sarplus/SARCacheOutputWriter.scala index 7f8f8d446f..c4d83d13fe 100644 --- a/contrib/sarplus/scala/src/main/scala/com/microsoft/sarplus/SARCacheOutputWriter.scala +++ b/contrib/sarplus/scala/src/main/scala/com/microsoft/sarplus/SARCacheOutputWriter.scala @@ -11,8 +11,10 @@ import org.apache.spark.sql.types._ import org.apache.commons.io.IOUtils import com.google.common.io.LittleEndianDataOutputStream +import com.microsoft.sarplus.spark.since3p2defvisible + class SARCacheOutputWriter( - path: String, + filePath: String, outputStream: OutputStream, schema: StructType) extends OutputWriter { @@ -20,8 +22,8 @@ class SARCacheOutputWriter( if (schema.length < 3) throw new IllegalArgumentException("Schema must have at least 3 fields") - val pathOffset = path + ".offsets" - val pathRelated = path + ".related" + val pathOffset = filePath + ".offsets" + val pathRelated = filePath + ".related" // temporary output files val tempOutputOffset = new LittleEndianDataOutputStream(new BufferedOutputStream(new FileOutputStream(pathOffset), 8*1024)) @@ -44,7 +46,7 @@ class SARCacheOutputWriter( if(lastId != i1) { - tempOutputOffset.writeLong(rowNumber) + tempOutputOffset.writeLong(rowNumber) offsetCount += 1 lastId = i1 } @@ -64,7 +66,7 @@ class SARCacheOutputWriter( if(lastId != i1) { - tempOutputOffset.writeLong(rowNumber) + tempOutputOffset.writeLong(rowNumber) offsetCount += 1 lastId = i1 } @@ -75,7 +77,7 @@ class SARCacheOutputWriter( rowNumber += 1 } - override def close(): Unit = + override def close(): Unit = { tempOutputOffset.writeLong(rowNumber) offsetCount += 1 @@ -94,5 +96,8 @@ class SARCacheOutputWriter( input.close outputFinal.close - } + } + + @since3p2defvisible + override def path(): String = filePath } From 66b76d9bc0ce1c85503f864a8546b47e04163480 Mon Sep 17 00:00:00 2001 From: Simon Zhao <43029286+simonzhaoms@users.noreply.github.com> Date: Mon, 22 Nov 2021 15:54:18 +0800 Subject: [PATCH 02/33] Add corresponding docs and simplify version specification --- contrib/sarplus/DEVELOPMENT.md | 30 +++++++++++++++++ contrib/sarplus/scala/build.sbt | 46 +++------------------------ contrib/sarplus/scala/python/setup.py | 3 ++ 3 files changed, 38 insertions(+), 41 deletions(-) diff --git a/contrib/sarplus/DEVELOPMENT.md b/contrib/sarplus/DEVELOPMENT.md index 8d1557cbba..40cac04a0b 100644 --- a/contrib/sarplus/DEVELOPMENT.md +++ b/contrib/sarplus/DEVELOPMENT.md @@ -39,3 +39,33 @@ sbt test ``` (use ~test and it will automatically check for changes in source files, but not build.sbt) + + +## Notes for Spark 3.x ## + +The code now has been modified to support Spark 3.x, and has been +tested under 
different versions of Databricks Runtime (including 6.4 +Extended Support, 7.3 LTS, 9.1 LTS, 10.0 and 10.1) on Azure Databricks +Service. But now manual packaging is needed: + + +```bash +export VERSION=0.5.0 +cd python +python setup.py bdist_wheel # => dist/pysarplus-0.5.0-cp38-cp38-linux_x86_64.whl + +export SPARK_VERSION=3.2.0 +export HADOOP_VERSION=3.3.1 +export SCALA_VERSION=2.12.14 +cd scala +sbt ++${SCALA_VERSION} package # => target/scala-2.12/sarplus_2.12.14_s3.2.0_h3.3.1-0.5.0.jar +``` + +where `VERSION`, `SPARK_VERSION`, `HADOOP_VERSION`, `SCALA_VERSION` +should be customized as needed. When running on Spark 3.x, extra +configurations are also required: + +``` +spark.sql.sources.default parquet +spark.sql.legacy.createHiveTableByDefault true +``` diff --git a/contrib/sarplus/scala/build.sbt b/contrib/sarplus/scala/build.sbt index 83364a1645..ebf61615d8 100644 --- a/contrib/sarplus/scala/build.sbt +++ b/contrib/sarplus/scala/build.sbt @@ -10,56 +10,20 @@ licenses := Seq("MIT" -> url("http://opensource.org/licenses/MIT")) lazy val sparkVer = settingKey[String]("spark version") lazy val hadoopVer = settingKey[String]("hadoop version") -lazy val commonsIoVer = settingKey[String]("commons-io version") -lazy val jacksonDatabindVer = settingKey[String]("jackson-databind version") lazy val commonSettings = Seq( organization := "sarplus.microsoft", version := sys.env.getOrElse("VERSION", "0.5.0"), - crossScalaVersions := Seq("2.11.12", "2.12.10", "2.12.14"), resolvers ++= Seq( Resolver.sonatypeRepo("snapshots"), Resolver.sonatypeRepo("releases"), ), addCompilerPlugin("org.scalamacros" % "paradise" % "2.1.1" cross CrossVersion.full), - sparkVer := { - scalaVersion.value match { - case "2.11.12" => "2.4.5" - case "2.12.10" => "3.1.2" - case "2.12.14" => "3.2.0" - case _ => "3.2.0" - } - }, - - hadoopVer := { - scalaVersion.value match { - case "2.11.12" => "2.7.3" - case "2.12.10" => "2.7.4" - case "2.12.14" => "3.3.1" - case _ => "3.3.1" - } - }, - - commonsIoVer := { - scalaVersion.value match { - case "2.11.12" => "2.4" - case "2.12.10" => "2.4" - case "2.12.14" => "2.8.0" - case _ => "2.8.0" - } - }, - - jacksonDatabindVer := { - scalaVersion.value match { - case "2.11.12" => "2.6.7.1" - case "2.12.10" => "2.10.0" - case "2.12.14" => "2.12.2" - case _ => "2.12.2" - } - }, + sparkVer := sys.env.getOrElse("SPARK_VERSION", "3.2.0"), + hadoopVer := sys.env.getOrElse("HADOOP_VERSION", "3.3.1"), libraryDependencies ++= Seq( - "com.fasterxml.jackson.core" % "jackson-databind" % jacksonDatabindVer.value, - "commons-io" % "commons-io" % commonsIoVer.value, + "com.fasterxml.jackson.core" % "jackson-databind" % "2.12.2", + "commons-io" % "commons-io" % "2.8.0", "org.apache.hadoop" % "hadoop-common" % hadoopVer.value, "org.apache.hadoop" % "hadoop-hdfs" % hadoopVer.value, "org.apache.spark" %% "spark-core" % sparkVer.value, @@ -73,7 +37,7 @@ lazy val commonSettings = Seq( ), artifactName := { (sv: ScalaVersion, module: ModuleID, artifact: Artifact) => - artifact.name + "_" + sv.full + "_" + sparkVer.value + "-" + module.revision + "." + artifact.extension + artifact.name + "_" + sv.full + "_s" + sparkVer.value + "_h" + hadoopVer.value + "-" + module.revision + "." 
+ artifact.extension }, ) diff --git a/contrib/sarplus/scala/python/setup.py b/contrib/sarplus/scala/python/setup.py index fedf198952..39a96b237e 100644 --- a/contrib/sarplus/scala/python/setup.py +++ b/contrib/sarplus/scala/python/setup.py @@ -1,6 +1,9 @@ from distutils.core import setup import os +version = os.getenv("VERSION") +if version is None: + version = "0.5.0" setup( name="pysarplus_dummy", From b5d3b55355a88b8e7b847d92eb0f15d31a76b665 Mon Sep 17 00:00:00 2001 From: Simon Zhao <43029286+simonzhaoms@users.noreply.github.com> Date: Tue, 23 Nov 2021 15:29:45 +0800 Subject: [PATCH 03/33] Update python package url --- contrib/sarplus/python/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/sarplus/python/setup.py b/contrib/sarplus/python/setup.py index 4b8fff292f..ad17589af9 100644 --- a/contrib/sarplus/python/setup.py +++ b/contrib/sarplus/python/setup.py @@ -26,7 +26,7 @@ def __str__(self): name="pysarplus", version=os.environ["VERSION"], description="SAR prediction for use with PySpark", - url="https://github.com/Microsoft/Recommenders/contrib/sarplus", + url="https://github.com/microsoft/recommenders/tree/main/contrib/sarplus", author="Markus Cozowicz", author_email="marcozo@microsoft.com", license="MIT", From dd1ceebfea0b08e1dfe50e3e1ca155afcb1ccc79 Mon Sep 17 00:00:00 2001 From: Simon Zhao <43029286+simonzhaoms@users.noreply.github.com> Date: Fri, 26 Nov 2021 14:52:04 +0800 Subject: [PATCH 04/33] Add macros for Spark 3.2.x --- .../compat/spark/since3p2defvisible.scala | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 contrib/sarplus/scala/compat/src/main/scala/com/microsoft/sarplus/compat/spark/since3p2defvisible.scala diff --git a/contrib/sarplus/scala/compat/src/main/scala/com/microsoft/sarplus/compat/spark/since3p2defvisible.scala b/contrib/sarplus/scala/compat/src/main/scala/com/microsoft/sarplus/compat/spark/since3p2defvisible.scala new file mode 100644 index 0000000000..153c2bc344 --- /dev/null +++ b/contrib/sarplus/scala/compat/src/main/scala/com/microsoft/sarplus/compat/spark/since3p2defvisible.scala @@ -0,0 +1,31 @@ +package com.microsoft.sarplus.spark + +import scala.annotation.{StaticAnnotation, compileTimeOnly} +import scala.language.experimental.macros +import scala.reflect.macros.Context + +import util.Properties.versionNumberString + +@compileTimeOnly("enable macro paradise to expand macro annotations") +class since3p2defvisible extends StaticAnnotation { + def macroTransform(annottees: Any*): Any = macro since3p2defvisibleMacro.impl +} + +object since3p2defvisibleMacro { + def impl(c: Context)(annottees: c.Tree*) = { + import c.universe._ + annottees match { + case q"$mods def $name[..$tparams](...$paramss): $tpt = $body" :: tail => + // NOTE: There seems no way to find out the Spark version. 
+ if (versionNumberString.startsWith("2.12.14")) { + q""" + $mods def $name[..$tparams](...$paramss): $tpt = + $body + """ + } else { + q"" + } + case _ => throw new IllegalArgumentException("Please annotate a method") + } + } +} From 24b4d4832dbfa4592f1acffed053b3a52f6c9076 Mon Sep 17 00:00:00 2001 From: Simon Zhao <43029286+simonzhaoms@users.noreply.github.com> Date: Thu, 2 Dec 2021 18:20:25 +0800 Subject: [PATCH 05/33] Add sarplus testing and packaging workflow --- .github/workflows/sarplus.yml | 128 ++++++++++++++++++ contrib/sarplus/python/.flake8 | 4 + contrib/sarplus/python/setup.py | 3 +- contrib/sarplus/python/tests/conftest.py | 4 + .../sarplus/python/tests/test_pyspark_sar.py | 26 ++-- .../compat/spark/since3p2defvisible.scala | 5 +- 6 files changed, 159 insertions(+), 11 deletions(-) create mode 100644 .github/workflows/sarplus.yml create mode 100644 contrib/sarplus/python/.flake8 create mode 100644 contrib/sarplus/python/tests/conftest.py diff --git a/.github/workflows/sarplus.yml b/.github/workflows/sarplus.yml new file mode 100644 index 0000000000..07167f016c --- /dev/null +++ b/.github/workflows/sarplus.yml @@ -0,0 +1,128 @@ +# This workflow will run tests and do packaging for contrib/sarplus. +# +# Refenreces: +# * [GitHub Actions doc](https://docs.github.com/en/actions) +# * GitHub Actions workflow templates +# + [python package](https://github.com/actions/starter-workflows/blob/main/ci/python-package.yml) +# + [python publish](https://github.com/actions/starter-workflows/blob/main/ci/python-publish.yml) +# + [scala](https://github.com/actions/starter-workflows/blob/main/ci/scala.yml) +# * [GitHub hosted runner - Ubuntu 20.04 LTS](https://github.com/actions/virtual-environments/blob/main/images/linux/Ubuntu2004-README.md) +# * [Azure Databirkcs runtime releases](https://docs.microsoft.com/en-us/azure/databricks/release-notes/runtime/releases) + +name: sarplus package + +on: + push: + paths: + - contrib/sarplus/python/** + - contrib/sarplus/scala/** + - .github/workflows/sarplus.yml + +env: + PYTHON_ROOT: ${{ github.workspace }}/contrib/sarplus/python + SCALA_ROOT: ${{ github.workspace }}/contrib/sarplus/scala + SARPLUS_VERSION: 0.5.0 + +jobs: + python: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.8", "3.9", "3.10"] + steps: + - uses: actions/checkout@v2 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install flake8 pybind11 pytest pytest-cov scikit-learn wheel + + - name: Lint with flake8 + run: | + cd "${PYTHON_ROOT}" + # See https://flake8.pycqa.org/en/latest/user/index.html + flake8 . 
+ + - name: Package + run: | + cd "${PYTHON_ROOT}" + VERSION="${SARPLUS_VERSION}" python setup.py bdist_wheel + + - name: Testing + env: + ACCESS_TOKEN: ${{ secrets.SARPLUS_TESTDATA_ACCESS_TOKEN }} + run: | + cd "${PYTHON_ROOT}" + python -m pip install --use-feature=2020-resolver dist/*.whl + + cd "${SCALA_ROOT}" + export SPARK_VERSION=$(python -m pip show pyspark | grep -i version | cut -d ' ' -f 2) + SPARK_JAR_DIR=$(python -m pip show pyspark | grep -i location | cut -d ' ' -f2)/pyspark/jars + SCALA_JAR=$(ls ${SPARK_JAR_DIR}/scala-library*) + HADOOP_JAR=$(ls ${SPARK_JAR_DIR}/hadoop-client-api*) + SCALA_VERSION=${SCALA_JAR##*-} + export SCALA_VERSION=${SCALA_VERSION%.*} + HADOOP_VERSION=${HADOOP_JAR##*-} + export HADOOP_VERSION=${HADOOP_VERSION%.*} + export VERSION="${SARPLUS_VERSION}" + sbt ++"${SCALA_VERSION}"! package + + cd "${PYTHON_ROOT}" + pytest --token "${ACCESS_TOKEN}" ./tests + + - name: Upload Python package + uses: actions/upload-artifact@v2 + with: + name: pysarplus-${{ env.SARPLUS_VERSION }}-cp${{ matrix.python-version }} + path: ${{ env.PYTHON_ROOT }}/dist/pysarplus-*.whl +# - name: Publish Python package +# if: github.ref == 'refs/heads/main' +# run: + + scala: + runs-on: ubuntu-latest + strategy: + matrix: + include: + - scala-version: "2.12.10" + spark-version: "3.0.1" + hadoop-version: "2.7.4" + databricks-runtime: "ADB 7.3 LTS" + + - scala-version: "2.12.10" + spark-version: "3.1.2" + hadoop-version: "2.7.4" + databricks-runtime: "ADB 9.1 LTS" + + - scala-version: "2.12.14" + spark-version: "3.2.0" + hadoop-version: "3.3.1" + databricks-runtime: "ADB 10.0" + + steps: + - uses: actions/checkout@v2 + + # TODO: Add testing + + - name: Package + run: | + cd "${SCALA_ROOT}" + export VERSION="${SARPLUS_VERSION}" + export SPARK_VERSION="${{ matrix.spark-version }}" + export HADOOP_VERSION="${{ matrix.hadoop-version }}" + sbt ++${{ matrix.scala-version }}! 
package + SCALA_VERSION=${{ matrix.scala-version }} + echo "scala_binary_version=${SCALA_VERSION%.*}" >> $GITHUB_ENV + - name: Upload Scala package + uses: actions/upload-artifact@v2 + with: + name: scala_${{ matrix.scala-version }}_s${{ matrix.spark-version }}_h${{ matrix.hadoop-version }}-${{ env.SARPLUS_VERSION }} (${{ matrix.databricks-runtime }}) + path: ${{ env.SCALA_ROOT }}/target/scala-${{ env.scala_binary_version }}/*.jar +# - name: Publish Scala package +# if: github.ref == 'refs/heads/main' +# run: diff --git a/contrib/sarplus/python/.flake8 b/contrib/sarplus/python/.flake8 new file mode 100644 index 0000000000..4c042c65b0 --- /dev/null +++ b/contrib/sarplus/python/.flake8 @@ -0,0 +1,4 @@ +[flake8] +max-line-length = 120 +ignore = W291 +per-file-ignores = pysarplus/SARPlus.py: E501, pysarplus/__init__.py: F401 \ No newline at end of file diff --git a/contrib/sarplus/python/setup.py b/contrib/sarplus/python/setup.py index ad17589af9..48e39468b6 100644 --- a/contrib/sarplus/python/setup.py +++ b/contrib/sarplus/python/setup.py @@ -14,10 +14,11 @@ def __str__(self): return pybind11.get_include(self.user) + DEPENDENCIES = [ "numpy", "pandas", - # "pyarrow==0.13.0", + "pyarrow>=1.0.0", "pybind11>=2.2", "pyspark>=3.0.0" ] diff --git a/contrib/sarplus/python/tests/conftest.py b/contrib/sarplus/python/tests/conftest.py new file mode 100644 index 0000000000..f66b33b92f --- /dev/null +++ b/contrib/sarplus/python/tests/conftest.py @@ -0,0 +1,4 @@ +def pytest_addoption(parser): + parser.addoption( + "--token", action="store", default="", help="Access token of the test data" + ) diff --git a/contrib/sarplus/python/tests/test_pyspark_sar.py b/contrib/sarplus/python/tests/test_pyspark_sar.py index 35df0ee0a0..a09d4c2a39 100644 --- a/contrib/sarplus/python/tests/test_pyspark_sar.py +++ b/contrib/sarplus/python/tests/test_pyspark_sar.py @@ -19,6 +19,14 @@ def assert_compare(expected_id, expected_score, actual_prediction): assert math.isclose(expected_score, actual_prediction.score, rel_tol=1e-3, abs_tol=1e-3) +@pytest.fixture(scope="module") +def token(request): + if request.config.getoption("--token") == "": + return "" + else: + return "?" 
+ request.config.getoption("--token") + + @pytest.fixture(scope="module") def spark(tmp_path_factory, app_name="Sample", url="local[*]", memory="1G"): """Start Spark if not started @@ -198,9 +206,9 @@ def train_test_dummy_timestamp(pandas_dummy_timestamp): @pytest.fixture(scope="module") -def demo_usage_data(header, sar_settings): +def demo_usage_data(header, sar_settings, token): # load the data - data = pd.read_csv(sar_settings["FILE_DIR"] + "demoUsage.csv") + data = pd.read_csv(sar_settings["FILE_DIR"] + "demoUsage.csv" + token) data["rating"] = pd.Series([1] * data.shape[0]) data = data.rename( columns={ @@ -234,7 +242,7 @@ def sar_settings(): # absolute tolerance parameter for matrix equivalence in SAR tests "ATOL": 1e-8, # directory of the current file - used to link unit test data - "FILE_DIR": "http://recodatasets.blob.core.windows.net/sarunittest/", + "FILE_DIR": "https://recodatasets.blob.core.windows.net/sarunittest/", # user ID used in the test files (they are designed for this user ID, this is part of the test) "TEST_USER_ID": "0003000098E85347", } @@ -271,7 +279,7 @@ def test_fit(spark, similarity_type, timedecay_formula, train_test_dummy_timesta (3, "lift", "lift"), ], ) -def test_sar_item_similarity(spark, threshold, similarity_type, file, demo_usage_data, sar_settings, header): +def test_sar_item_similarity(spark, threshold, similarity_type, file, demo_usage_data, sar_settings, header, token): model = SARPlus( spark, @@ -287,7 +295,7 @@ def test_sar_item_similarity(spark, threshold, similarity_type, file, demo_usage model.fit(df) # reference - item_similarity_ref = pd.read_csv(sar_settings["FILE_DIR"] + "sim_" + file + str(threshold) + ".csv") + item_similarity_ref = pd.read_csv(sar_settings["FILE_DIR"] + "sim_" + file + str(threshold) + ".csv" + token) item_similarity_ref = pd.melt( item_similarity_ref, item_similarity_ref.columns[0], item_similarity_ref.columns[1:], "i2", "value", @@ -309,7 +317,7 @@ def test_sar_item_similarity(spark, threshold, similarity_type, file, demo_usage # Test 7 -def test_user_affinity(spark, demo_usage_data, sar_settings, header): +def test_user_affinity(spark, demo_usage_data, sar_settings, header, token): time_now = demo_usage_data[header["col_timestamp"]].max() model = SARPlus( @@ -324,7 +332,7 @@ def test_user_affinity(spark, demo_usage_data, sar_settings, header): df = spark.createDataFrame(demo_usage_data) model.fit(df) - user_affinity_ref = pd.read_csv(sar_settings["FILE_DIR"] + "user_aff.csv") + user_affinity_ref = pd.read_csv(sar_settings["FILE_DIR"] + "user_aff.csv" + token) user_affinity_ref = pd.melt( user_affinity_ref, user_affinity_ref.columns[0], user_affinity_ref.columns[1:], "ItemId", "Rating", ) @@ -346,7 +354,7 @@ def test_user_affinity(spark, demo_usage_data, sar_settings, header): @pytest.mark.parametrize( "threshold,similarity_type,file", [(3, "cooccurrence", "count"), (3, "jaccard", "jac"), (3, "lift", "lift")], ) -def test_userpred(spark, tmp_path, threshold, similarity_type, file, header, sar_settings, demo_usage_data): +def test_userpred(spark, tmp_path, threshold, similarity_type, file, header, sar_settings, demo_usage_data, token): time_now = demo_usage_data[header["col_timestamp"]].max() test_id = "{0}_{1}_{2}".format(threshold, similarity_type, file) @@ -365,7 +373,7 @@ def test_userpred(spark, tmp_path, threshold, similarity_type, file, header, sar df = spark.createDataFrame(demo_usage_data) model.fit(df) - url = sar_settings["FILE_DIR"] + "userpred_" + file + str(threshold) + "_userid_only.csv" + url = 
sar_settings["FILE_DIR"] + "userpred_" + file + str(threshold) + "_userid_only.csv" + token pred_ref = pd.read_csv(url) pred_ref = ( diff --git a/contrib/sarplus/scala/compat/src/main/scala/com/microsoft/sarplus/compat/spark/since3p2defvisible.scala b/contrib/sarplus/scala/compat/src/main/scala/com/microsoft/sarplus/compat/spark/since3p2defvisible.scala index 153c2bc344..6867a6ea6a 100644 --- a/contrib/sarplus/scala/compat/src/main/scala/com/microsoft/sarplus/compat/spark/since3p2defvisible.scala +++ b/contrib/sarplus/scala/compat/src/main/scala/com/microsoft/sarplus/compat/spark/since3p2defvisible.scala @@ -17,7 +17,10 @@ object since3p2defvisibleMacro { annottees match { case q"$mods def $name[..$tparams](...$paramss): $tpt = $body" :: tail => // NOTE: There seems no way to find out the Spark version. - if (versionNumberString.startsWith("2.12.14")) { + val major = versionNumberString.split('.')(0).toInt + val minor = versionNumberString.split('.')(1).toInt + val patch = versionNumberString.split('.')(2).toInt + if (major >= 2 && minor >= 12 && patch >= 14) { q""" $mods def $name[..$tparams](...$paramss): $tpt = $body From 5af0c803c24959d8a8e67d3c7f6a193bc3bc7054 Mon Sep 17 00:00:00 2001 From: Simon Zhao <43029286+simonzhaoms@users.noreply.github.com> Date: Wed, 8 Dec 2021 13:51:45 +0800 Subject: [PATCH 06/33] Add steps to publish python package --- .github/workflows/sarplus.yml | 42 +++++++++++++++------------ contrib/sarplus/python/pyproject.toml | 7 +++++ contrib/sarplus/python/setup.py | 4 +++ 3 files changed, 35 insertions(+), 18 deletions(-) create mode 100644 contrib/sarplus/python/pyproject.toml diff --git a/.github/workflows/sarplus.yml b/.github/workflows/sarplus.yml index 07167f016c..998d6bdd45 100644 --- a/.github/workflows/sarplus.yml +++ b/.github/workflows/sarplus.yml @@ -3,11 +3,15 @@ # Refenreces: # * [GitHub Actions doc](https://docs.github.com/en/actions) # * GitHub Actions workflow templates -# + [python package](https://github.com/actions/starter-workflows/blob/main/ci/python-package.yml) -# + [python publish](https://github.com/actions/starter-workflows/blob/main/ci/python-publish.yml) -# + [scala](https://github.com/actions/starter-workflows/blob/main/ci/scala.yml) +# + [python package](https://github.com/actions/starter-workflows/blob/main/ci/python-package.yml) +# + [python publish](https://github.com/actions/starter-workflows/blob/main/ci/python-publish.yml) +# + [scala](https://github.com/actions/starter-workflows/blob/main/ci/scala.yml) # * [GitHub hosted runner - Ubuntu 20.04 LTS](https://github.com/actions/virtual-environments/blob/main/images/linux/Ubuntu2004-README.md) # * [Azure Databirkcs runtime releases](https://docs.microsoft.com/en-us/azure/databricks/release-notes/runtime/releases) +# * Package publish +# + [Publishing package distribution releases using GitHub Actions CI/CD workflows](https://packaging.python.org/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/) +# + [pypa/gh-action-pypy-publish](https://github.com/pypa/gh-action-pypi-publish) + name: sarplus package @@ -39,8 +43,8 @@ jobs: - name: Install dependencies run: | - python -m pip install --upgrade pip - python -m pip install flake8 pybind11 pytest pytest-cov scikit-learn wheel + python -m pip install -U build pip twine + python -m pip install -U flake8 pytest pytest-cov scikit-learn - name: Lint with flake8 run: | @@ -48,17 +52,19 @@ jobs: # See https://flake8.pycqa.org/en/latest/user/index.html flake8 . 
- - name: Package + - name: Package and check run: | cd "${PYTHON_ROOT}" - VERSION="${SARPLUS_VERSION}" python setup.py bdist_wheel + sed -i -E "s/version[[:space:]]*=[[:space:]]*.*,/version=\"${SARPLUS_VERSION}\",/" setup.py + python -m build --sdist + python -m twine check dist/* - name: Testing env: ACCESS_TOKEN: ${{ secrets.SARPLUS_TESTDATA_ACCESS_TOKEN }} run: | cd "${PYTHON_ROOT}" - python -m pip install --use-feature=2020-resolver dist/*.whl + python -m pip install dist/*.gz cd "${SCALA_ROOT}" export SPARK_VERSION=$(python -m pip show pyspark | grep -i version | cut -d ' ' -f 2) @@ -75,14 +81,15 @@ jobs: cd "${PYTHON_ROOT}" pytest --token "${ACCESS_TOKEN}" ./tests - - name: Upload Python package - uses: actions/upload-artifact@v2 + - name: Publish package to PyPI + if: github.ref == 'refs/heads/main' && matrix.python-version == '3.10' + uses: pypa/gh-action-pypi-publish@release/v1 with: - name: pysarplus-${{ env.SARPLUS_VERSION }}-cp${{ matrix.python-version }} - path: ${{ env.PYTHON_ROOT }}/dist/pysarplus-*.whl -# - name: Publish Python package -# if: github.ref == 'refs/heads/main' -# run: + user: __token__ + password: ${{ secrets.SARPLUS_TEST_PYPI_API_TOKEN }} + repository_url: https://test.pypi.org/legacy/ + packages_dir: ${{ env.PYTHON_ROOT }}/dist/ + skip_existing: true scala: runs-on: ubuntu-latest @@ -107,14 +114,13 @@ jobs: steps: - uses: actions/checkout@v2 - # TODO: Add testing - - - name: Package + - name: Testing and Packaging run: | cd "${SCALA_ROOT}" export VERSION="${SARPLUS_VERSION}" export SPARK_VERSION="${{ matrix.spark-version }}" export HADOOP_VERSION="${{ matrix.hadoop-version }}" + sbt ++${{ matrix.scala-version }}! test sbt ++${{ matrix.scala-version }}! package SCALA_VERSION=${{ matrix.scala-version }} echo "scala_binary_version=${SCALA_VERSION%.*}" >> $GITHUB_ENV diff --git a/contrib/sarplus/python/pyproject.toml b/contrib/sarplus/python/pyproject.toml new file mode 100644 index 0000000000..415ac9499d --- /dev/null +++ b/contrib/sarplus/python/pyproject.toml @@ -0,0 +1,7 @@ +[build-system] +requires = [ + "pybind11", + "setuptools>=42", + "wheel", +] +build-backend = "setuptools.build_meta" diff --git a/contrib/sarplus/python/setup.py b/contrib/sarplus/python/setup.py index 48e39468b6..771782a9b6 100644 --- a/contrib/sarplus/python/setup.py +++ b/contrib/sarplus/python/setup.py @@ -37,6 +37,10 @@ def __str__(self): "Programming Language :: Python :: 3.4", "Programming Language :: Python :: 3.5", "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", "Intended Audience :: Developers", "Intended Audience :: Science/Research", "Topic :: Scientific/Engineering :: Mathematics", From 52999e30edfc32272bda2a00ba30656e060628cc Mon Sep 17 00:00:00 2001 From: Simon Zhao <43029286+simonzhaoms@users.noreply.github.com> Date: Wed, 8 Dec 2021 13:52:22 +0800 Subject: [PATCH 07/33] Add configs for scala package publish --- contrib/sarplus/scala/build.sbt | 53 +++++++++++++++++++---- contrib/sarplus/scala/project/plugins.sbt | 1 + 2 files changed, 45 insertions(+), 9 deletions(-) diff --git a/contrib/sarplus/scala/build.sbt b/contrib/sarplus/scala/build.sbt index ebf61615d8..528c02d49c 100644 --- a/contrib/sarplus/scala/build.sbt +++ b/contrib/sarplus/scala/build.sbt @@ -1,12 +1,7 @@ name := "sarplus" -licenses := Seq("MIT" -> url("http://opensource.org/licenses/MIT")) -// credentials += Credentials(Path.userHome / 
".m2" / ".sbtcredentials") -// publishTo := { -// val org = sys.env.getOrElse("ORG", "") -// val project = sys.env.getOrElse("PROJECT", "") -// val feed = sys.env.getOrElse("FEED", "") -// Some("releases" at "https://pkgs.dev.azure.com/%s/%s/_packaging/%s/Maven/v1".format(org, project, feed)) -// } + + +// Denpendency configuration lazy val sparkVer = settingKey[String]("spark version") lazy val hadoopVer = settingKey[String]("hadoop version") @@ -49,4 +44,44 @@ lazy val root = (project in file(".")) commonSettings, ) -// aetherPublishBothSettings + +// POM metadata configuration. See https://www.scala-sbt.org/release/docs/Using-Sonatype.html + +organization := "com.microsoft.sarplus" +organizationName := "microsoft" +organizationHomepage := Some(url("https://microsoft.com")) + +scmInfo := Some( + ScmInfo( + url("https://github.com/microsoft/recommenders/tree/main/contrib/sarplus"), + "scm:git@github.com:microsoft/recommenders.git" + ) +) + +developers := List( + Developer( + name = "Markus Cozowicz", + email = "marcozo@microsoft.com" + ) +) + +description := "sarplus" +licenses := Seq("MIT" -> url("http://opensource.org/licenses/MIT")) +homepage := Some(url("https://github.com/microsoft/recommenders/tree/main/contrib/sarplus")) +pomIncludeRepository := { _ => false } +publishTo := { + val nexus = "https://oss.sonatype.org/" + if (isSnapshot.value) Some("snapshots" at nexus + "content/repositories/snapshots") + else Some("releases" at nexus + "service/local/staging/deploy/maven2") +} +publishMavenStyle := true + + +// PGP configuration + +credentials += Credentials( + "GnuPG Key ID", + "gpg", + "C72E596B384EC14CFA65D80A36CB250AF1C18ECE", + "ignored" +) diff --git a/contrib/sarplus/scala/project/plugins.sbt b/contrib/sarplus/scala/project/plugins.sbt index 2bd037f6a1..0a8aeeaba2 100644 --- a/contrib/sarplus/scala/project/plugins.sbt +++ b/contrib/sarplus/scala/project/plugins.sbt @@ -1,2 +1,3 @@ addSbtPlugin("no.arktekk.sbt" % "aether-deploy" % "0.27.0") addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.9.1") +addSbtPlugin("com.github.sbt" % "sbt-pgp" % "2.1.2") From b50b94adcb54e8a303621222f720e4b07a56e45e Mon Sep 17 00:00:00 2001 From: Simon Zhao <43029286+simonzhaoms@users.noreply.github.com> Date: Wed, 8 Dec 2021 14:25:14 +0800 Subject: [PATCH 08/33] Add python 3.6 and 3.7 --- .github/workflows/sarplus.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/sarplus.yml b/.github/workflows/sarplus.yml index 998d6bdd45..9dc877e767 100644 --- a/.github/workflows/sarplus.yml +++ b/.github/workflows/sarplus.yml @@ -32,7 +32,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.8", "3.9", "3.10"] + python-version: ["3.6", "3.7", "3.8", "3.9", "3.10"] steps: - uses: actions/checkout@v2 From 74298ce1a2ef440d7afc90ec44c9fa5b6f1de3e6 Mon Sep 17 00:00:00 2001 From: Simon Zhao <43029286+simonzhaoms@users.noreply.github.com> Date: Thu, 9 Dec 2021 16:10:46 +0800 Subject: [PATCH 09/33] Add steps for Scala packaging --- .github/workflows/sarplus.yml | 56 +++++++++++++++---- .../sarplus/python/tests/test_pyspark_sar.py | 2 +- contrib/sarplus/scala/build.sbt | 24 ++++++-- contrib/sarplus/scala/project/plugins.sbt | 1 + 4 files changed, 66 insertions(+), 17 deletions(-) diff --git a/.github/workflows/sarplus.yml b/.github/workflows/sarplus.yml index 9dc877e767..f2be6ad281 100644 --- a/.github/workflows/sarplus.yml +++ b/.github/workflows/sarplus.yml @@ -81,6 +81,13 @@ jobs: cd "${PYTHON_ROOT}" pytest --token "${ACCESS_TOKEN}" ./tests + - name: Upload 
Python package + if: github.ref == 'refs/heads/main' && matrix.python-version == '3.10' + uses: actions/upload-artifact@v2 + with: + name: pysarplus-${{ env.SARPLUS_VERSION }} + path: ${{ env.PYTHON_ROOT }}/dist/*.gz + - name: Publish package to PyPI if: github.ref == 'refs/heads/main' && matrix.python-version == '3.10' uses: pypa/gh-action-pypi-publish@release/v1 @@ -91,7 +98,7 @@ jobs: packages_dir: ${{ env.PYTHON_ROOT }}/dist/ skip_existing: true - scala: + scala-test: runs-on: ubuntu-latest strategy: matrix: @@ -114,21 +121,48 @@ jobs: steps: - uses: actions/checkout@v2 - - name: Testing and Packaging + - name: Test run: | cd "${SCALA_ROOT}" export VERSION="${SARPLUS_VERSION}" export SPARK_VERSION="${{ matrix.spark-version }}" export HADOOP_VERSION="${{ matrix.hadoop-version }}" sbt ++${{ matrix.scala-version }}! test - sbt ++${{ matrix.scala-version }}! package - SCALA_VERSION=${{ matrix.scala-version }} - echo "scala_binary_version=${SCALA_VERSION%.*}" >> $GITHUB_ENV - - name: Upload Scala package + + scala-package: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + + - name: Package + env: + GPG_KEY: ${{ secrets.SARPLUS_GPG_PRI_KEY_ASC }} + run: | + # generate artifacts + cd "${SCALA_ROOT}" + export VERSION="${SARPLUS_VERSION}" + export SPARK_VERSION="3.1.2" + export HADOOP_VERSION="2.7.4" + export SCALA_VERSION=2.12.10 + sbt ++${SCALA_VERSION}! package + sbt ++${SCALA_VERSION}! packageDoc + sbt ++${SCALA_VERSION}! packageSrc + sbt ++${SCALA_VERSION}! makePom + export SPARK_VERSION="3.2.0" + export HADOOP_VERSION="3.3.1" + export SCALA_VERSION=2.12.14 + sbt ++${SCALA_VERSION}! package + + # sign with GPG + cd target/scala-2.12 + gpg --import <(cat <<< "${GPG_KEY}") + for file in {*.jar,*.pom}; do gpg -ab "${file}"; done + + # bundle + jar cvf sarplus-bundle_2.12-${SARPLUS_VERSION}.jar *.jar *.pom *.asc + + - name: Upload Scala bundle uses: actions/upload-artifact@v2 with: - name: scala_${{ matrix.scala-version }}_s${{ matrix.spark-version }}_h${{ matrix.hadoop-version }}-${{ env.SARPLUS_VERSION }} (${{ matrix.databricks-runtime }}) - path: ${{ env.SCALA_ROOT }}/target/scala-${{ env.scala_binary_version }}/*.jar -# - name: Publish Scala package -# if: github.ref == 'refs/heads/main' -# run: + name: scala-bundle_2.12-${{ env.SARPLUS_VERSION }} + path: ${{ env.SCALA_ROOT }}/target/scala-2.12/sarplus-bundle_2.12-${{ env.SARPLUS_VERSION }}.jar diff --git a/contrib/sarplus/python/tests/test_pyspark_sar.py b/contrib/sarplus/python/tests/test_pyspark_sar.py index a09d4c2a39..3390d8d34a 100644 --- a/contrib/sarplus/python/tests/test_pyspark_sar.py +++ b/contrib/sarplus/python/tests/test_pyspark_sar.py @@ -41,7 +41,7 @@ def spark(tmp_path_factory, app_name="Sample", url="local[*]", memory="1G"): Path(__file__) .parents[2] .joinpath("scala", "target") - .glob(f"**/*{os.environ.get('VERSION', '')}.jar")).absolute() + .glob(f"**/sarplus*{os.environ.get('VERSION', '')}*.jar")).absolute() except StopIteration: raise Exception("Could not find Sarplus JAR file") diff --git a/contrib/sarplus/scala/build.sbt b/contrib/sarplus/scala/build.sbt index 528c02d49c..7952f31496 100644 --- a/contrib/sarplus/scala/build.sbt +++ b/contrib/sarplus/scala/build.sbt @@ -30,9 +30,20 @@ lazy val commonSettings = Seq( "org.scalatest" %% "scalatest" % "3.0.8" % "test", "xerces" % "xercesImpl" % "2.12.1", ), - artifactName := { - (sv: ScalaVersion, module: ModuleID, artifact: Artifact) => - artifact.name + "_" + sv.full + "_s" + sparkVer.value + "_h" + hadoopVer.value + "-" + module.revision + "." 
+ artifact.extension + Compile / packageBin / artifact := { + val prev: Artifact = (Compile / packageBin / artifact).value + prev.withClassifier( + prev.classifier match { + case None => { + val splitVer = sparkVer.value.split('.') + val major = splitVer(0).toInt + val minor = splitVer(1).toInt + if (major >=3 && minor >= 2) Some("spark32") else None + } + case Some(s: String) => Some(s) + + } + ) }, ) @@ -60,8 +71,10 @@ scmInfo := Some( developers := List( Developer( + id = "eisber", name = "Markus Cozowicz", - email = "marcozo@microsoft.com" + email = "marcozo@microsoft.com", + url = url("https://github.com/eisber") ) ) @@ -77,7 +90,7 @@ publishTo := { publishMavenStyle := true -// PGP configuration +// PGP key configuration credentials += Credentials( "GnuPG Key ID", @@ -85,3 +98,4 @@ credentials += Credentials( "C72E596B384EC14CFA65D80A36CB250AF1C18ECE", "ignored" ) +// credentials += Credentials(Path.userHome / ".sbt" / "sonatype_credentials") diff --git a/contrib/sarplus/scala/project/plugins.sbt b/contrib/sarplus/scala/project/plugins.sbt index 0a8aeeaba2..e29553cb77 100644 --- a/contrib/sarplus/scala/project/plugins.sbt +++ b/contrib/sarplus/scala/project/plugins.sbt @@ -1,3 +1,4 @@ addSbtPlugin("no.arktekk.sbt" % "aether-deploy" % "0.27.0") addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.9.1") addSbtPlugin("com.github.sbt" % "sbt-pgp" % "2.1.2") +// addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "3.9.10") \ No newline at end of file From 92923ad90867e46a3dbfb53a59ff97f0bcd93889 Mon Sep 17 00:00:00 2001 From: Simon Zhao <43029286+simonzhaoms@users.noreply.github.com> Date: Thu, 9 Dec 2021 16:23:59 +0800 Subject: [PATCH 10/33] Rename scala bundle --- .github/workflows/sarplus.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/sarplus.yml b/.github/workflows/sarplus.yml index f2be6ad281..39b701cee1 100644 --- a/.github/workflows/sarplus.yml +++ b/.github/workflows/sarplus.yml @@ -164,5 +164,5 @@ jobs: - name: Upload Scala bundle uses: actions/upload-artifact@v2 with: - name: scala-bundle_2.12-${{ env.SARPLUS_VERSION }} + name: sarplus-bundle_2.12-${{ env.SARPLUS_VERSION }} path: ${{ env.SCALA_ROOT }}/target/scala-2.12/sarplus-bundle_2.12-${{ env.SARPLUS_VERSION }}.jar From d34c3e2cf42b3f5bbaa6c2a4d04d1f55fc9caded Mon Sep 17 00:00:00 2001 From: Simon Zhao <43029286+simonzhaoms@users.noreply.github.com> Date: Fri, 10 Dec 2021 14:47:30 +0800 Subject: [PATCH 11/33] Add license hader --- contrib/sarplus/python/pysarplus/SARModel.py | 3 +++ contrib/sarplus/python/pysarplus/SARPlus.py | 2 ++ contrib/sarplus/python/pysarplus/__init__.py | 3 +++ contrib/sarplus/python/setup.py | 3 +++ contrib/sarplus/python/src/pysarplus.cpp | 5 +++++ contrib/sarplus/python/tests/conftest.py | 3 +++ contrib/sarplus/python/tests/test_pyspark_sar.py | 3 +++ contrib/sarplus/scala/build.sbt | 6 +++++- .../microsoft/sarplus/compat/spark/since3p2defvisible.scala | 5 +++++ contrib/sarplus/scala/python/pysarplus_dummy/__init__.py | 3 +++ contrib/sarplus/scala/python/setup.py | 3 +++ .../main/scala/com/microsoft/sarplus/DefaultSource.scala | 5 +++++ .../scala/com/microsoft/sarplus/SARCacheOutputWriter.scala | 5 +++++ .../com/microsoft/sarplus/SARCacheOutputWriterFactory.scala | 5 +++++ .../com/microsoft/sarplus/SARCacheOutputWriterSpec.scala | 5 +++++ 15 files changed, 58 insertions(+), 1 deletion(-) diff --git a/contrib/sarplus/python/pysarplus/SARModel.py b/contrib/sarplus/python/pysarplus/SARModel.py index 427bdccd0a..09bb289502 100644 --- 
a/contrib/sarplus/python/pysarplus/SARModel.py +++ b/contrib/sarplus/python/pysarplus/SARModel.py @@ -1,3 +1,6 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + import pysarplus_cpp import os diff --git a/contrib/sarplus/python/pysarplus/SARPlus.py b/contrib/sarplus/python/pysarplus/SARPlus.py index e560002760..b96ad91605 100644 --- a/contrib/sarplus/python/pysarplus/SARPlus.py +++ b/contrib/sarplus/python/pysarplus/SARPlus.py @@ -1,3 +1,5 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. """This is the implementation of SAR.""" import logging diff --git a/contrib/sarplus/python/pysarplus/__init__.py b/contrib/sarplus/python/pysarplus/__init__.py index 4e44ba7fe9..8ae388e483 100644 --- a/contrib/sarplus/python/pysarplus/__init__.py +++ b/contrib/sarplus/python/pysarplus/__init__.py @@ -1,2 +1,5 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + from .SARModel import SARModel from .SARPlus import SARPlus diff --git a/contrib/sarplus/python/setup.py b/contrib/sarplus/python/setup.py index 771782a9b6..53745d453c 100644 --- a/contrib/sarplus/python/setup.py +++ b/contrib/sarplus/python/setup.py @@ -1,3 +1,6 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + import os import sysconfig diff --git a/contrib/sarplus/python/src/pysarplus.cpp b/contrib/sarplus/python/src/pysarplus.cpp index 7a5a2739f9..0b06912740 100644 --- a/contrib/sarplus/python/src/pysarplus.cpp +++ b/contrib/sarplus/python/src/pysarplus.cpp @@ -1,3 +1,8 @@ +/* + * Copyright (c) Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. + */ + #include #include diff --git a/contrib/sarplus/python/tests/conftest.py b/contrib/sarplus/python/tests/conftest.py index f66b33b92f..02b14a9c78 100644 --- a/contrib/sarplus/python/tests/conftest.py +++ b/contrib/sarplus/python/tests/conftest.py @@ -1,3 +1,6 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + def pytest_addoption(parser): parser.addoption( "--token", action="store", default="", help="Access token of the test data" diff --git a/contrib/sarplus/python/tests/test_pyspark_sar.py b/contrib/sarplus/python/tests/test_pyspark_sar.py index 3390d8d34a..9b03862ba6 100644 --- a/contrib/sarplus/python/tests/test_pyspark_sar.py +++ b/contrib/sarplus/python/tests/test_pyspark_sar.py @@ -1,3 +1,6 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + import calendar import datetime import math diff --git a/contrib/sarplus/scala/build.sbt b/contrib/sarplus/scala/build.sbt index 7952f31496..d79ebab9cf 100644 --- a/contrib/sarplus/scala/build.sbt +++ b/contrib/sarplus/scala/build.sbt @@ -1,5 +1,9 @@ -name := "sarplus" +/* + * Copyright (c) Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. 
+ */ +name := "sarplus" // Denpendency configuration diff --git a/contrib/sarplus/scala/compat/src/main/scala/com/microsoft/sarplus/compat/spark/since3p2defvisible.scala b/contrib/sarplus/scala/compat/src/main/scala/com/microsoft/sarplus/compat/spark/since3p2defvisible.scala index 6867a6ea6a..780581d4e5 100644 --- a/contrib/sarplus/scala/compat/src/main/scala/com/microsoft/sarplus/compat/spark/since3p2defvisible.scala +++ b/contrib/sarplus/scala/compat/src/main/scala/com/microsoft/sarplus/compat/spark/since3p2defvisible.scala @@ -1,3 +1,8 @@ +/* + * Copyright (c) Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. + */ + package com.microsoft.sarplus.spark import scala.annotation.{StaticAnnotation, compileTimeOnly} diff --git a/contrib/sarplus/scala/python/pysarplus_dummy/__init__.py b/contrib/sarplus/scala/python/pysarplus_dummy/__init__.py index aa0be35f48..0720a92163 100644 --- a/contrib/sarplus/scala/python/pysarplus_dummy/__init__.py +++ b/contrib/sarplus/scala/python/pysarplus_dummy/__init__.py @@ -1 +1,4 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + installed = 1 diff --git a/contrib/sarplus/scala/python/setup.py b/contrib/sarplus/scala/python/setup.py index 39a96b237e..de096d333f 100644 --- a/contrib/sarplus/scala/python/setup.py +++ b/contrib/sarplus/scala/python/setup.py @@ -1,3 +1,6 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + from distutils.core import setup import os diff --git a/contrib/sarplus/scala/src/main/scala/com/microsoft/sarplus/DefaultSource.scala b/contrib/sarplus/scala/src/main/scala/com/microsoft/sarplus/DefaultSource.scala index c693e870ad..f7a1da5376 100644 --- a/contrib/sarplus/scala/src/main/scala/com/microsoft/sarplus/DefaultSource.scala +++ b/contrib/sarplus/scala/src/main/scala/com/microsoft/sarplus/DefaultSource.scala @@ -1,3 +1,8 @@ +/* + * Copyright (c) Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. + */ + package com.microsoft.sarplus import org.apache.spark.sql.sources.DataSourceRegister diff --git a/contrib/sarplus/scala/src/main/scala/com/microsoft/sarplus/SARCacheOutputWriter.scala b/contrib/sarplus/scala/src/main/scala/com/microsoft/sarplus/SARCacheOutputWriter.scala index c4d83d13fe..49c924c6c7 100644 --- a/contrib/sarplus/scala/src/main/scala/com/microsoft/sarplus/SARCacheOutputWriter.scala +++ b/contrib/sarplus/scala/src/main/scala/com/microsoft/sarplus/SARCacheOutputWriter.scala @@ -1,3 +1,8 @@ +/* + * Copyright (c) Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. + */ + package com.microsoft.sarplus import java.io.{DataOutputStream, FileInputStream, FileOutputStream, BufferedOutputStream, OutputStream} diff --git a/contrib/sarplus/scala/src/main/scala/com/microsoft/sarplus/SARCacheOutputWriterFactory.scala b/contrib/sarplus/scala/src/main/scala/com/microsoft/sarplus/SARCacheOutputWriterFactory.scala index 71d1e44f37..2e41effa3a 100644 --- a/contrib/sarplus/scala/src/main/scala/com/microsoft/sarplus/SARCacheOutputWriterFactory.scala +++ b/contrib/sarplus/scala/src/main/scala/com/microsoft/sarplus/SARCacheOutputWriterFactory.scala @@ -1,3 +1,8 @@ +/* + * Copyright (c) Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. 
+ */ + package com.microsoft.sarplus import org.apache.hadoop.mapreduce.TaskAttemptContext diff --git a/contrib/sarplus/scala/src/test/scala/com/microsoft/sarplus/SARCacheOutputWriterSpec.scala b/contrib/sarplus/scala/src/test/scala/com/microsoft/sarplus/SARCacheOutputWriterSpec.scala index 5eadfd8012..7565965e80 100644 --- a/contrib/sarplus/scala/src/test/scala/com/microsoft/sarplus/SARCacheOutputWriterSpec.scala +++ b/contrib/sarplus/scala/src/test/scala/com/microsoft/sarplus/SARCacheOutputWriterSpec.scala @@ -1,3 +1,8 @@ +/* + * Copyright (c) Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. + */ + package com.microsoft.sarplus import org.scalatest._ From 466ebc83db626110c533cf3e54b9cec319e8e508 Mon Sep 17 00:00:00 2001 From: Simon Zhao <43029286+simonzhaoms@users.noreply.github.com> Date: Sun, 12 Dec 2021 16:24:49 +0800 Subject: [PATCH 12/33] Format Python code with black --- contrib/sarplus/python/pysarplus/SARModel.py | 5 +- contrib/sarplus/python/pysarplus/SARPlus.py | 77 +++++++--- contrib/sarplus/python/tests/conftest.py | 1 + .../sarplus/python/tests/test_pyspark_sar.py | 142 ++++++++++++++---- 4 files changed, 177 insertions(+), 48 deletions(-) diff --git a/contrib/sarplus/python/pysarplus/SARModel.py b/contrib/sarplus/python/pysarplus/SARModel.py index 09bb289502..afd90cd0f9 100644 --- a/contrib/sarplus/python/pysarplus/SARModel.py +++ b/contrib/sarplus/python/pysarplus/SARModel.py @@ -20,7 +20,10 @@ def __init__(self, path): def find_or_raise(extension): files = [f for f in all_files if f.endswith(extension)] if len(files) != 1: - raise ValueError("Directory '%s' must contain exactly 1 file ending in '%s'" % (path, extension)) + raise ValueError( + "Directory '%s' must contain exactly 1 file ending in '%s'" + % (path, extension) + ) return path + "/" + files[0] # instantiate C++ backend diff --git a/contrib/sarplus/python/pysarplus/SARPlus.py b/contrib/sarplus/python/pysarplus/SARPlus.py index b96ad91605..91620ede1e 100644 --- a/contrib/sarplus/python/pysarplus/SARPlus.py +++ b/contrib/sarplus/python/pysarplus/SARPlus.py @@ -64,7 +64,7 @@ def f(self, str, **kwargs): # current time for time decay calculation # cooccurrence matrix threshold def fit(self, df): - """Main fit method for SAR. + """Main fit method for SAR. Expects the dataframes to have row_id, col_id columns which are indexes, i.e. contain the sequential integer index of the original alphanumeric user and item IDs. 
@@ -106,7 +106,9 @@ def fit(self, df): df = self.spark.sql(query) else: # since SQL is case insensitive, this check needs to be performed similar - if self.header["col_timestamp"].lower() in [s.name.lower() for s in df.schema]: + if self.header["col_timestamp"].lower() in [ + s.name.lower() for s in df.schema + ]: # we need to de-duplicate items by using the latest item query = self.f( """ @@ -141,12 +143,16 @@ def fit(self, df): ) item_cooccurrence = self.spark.sql(query) - item_cooccurrence.write.mode("overwrite").saveAsTable(self.f("{prefix}item_cooccurrence")) + item_cooccurrence.write.mode("overwrite").saveAsTable( + self.f("{prefix}item_cooccurrence") + ) # compute the diagonal used later for Jaccard and Lift if self.similarity_type == SIM_LIFT or self.similarity_type == SIM_JACCARD: item_marginal = self.spark.sql( - self.f("SELECT i1 i, value AS margin FROM {prefix}item_cooccurrence WHERE i1 = i2") + self.f( + "SELECT i1 i, value AS margin FROM {prefix}item_cooccurrence WHERE i1 = i2" + ) ) item_marginal.createOrReplaceTempView(self.f("{prefix}item_marginal")) @@ -175,11 +181,17 @@ def fit(self, df): ) self.item_similarity = self.spark.sql(query) else: - raise ValueError("Unknown similarity type: {0}".format(self.similarity_type)) + raise ValueError( + "Unknown similarity type: {0}".format(self.similarity_type) + ) # store upper triangular - log.info("sarplus.fit 2/2: compute similiarity metric %s..." % self.similarity_type) - self.item_similarity.write.mode("overwrite").saveAsTable(self.f("{prefix}item_similarity_upper")) + log.info( + "sarplus.fit 2/2: compute similiarity metric %s..." % self.similarity_type + ) + self.item_similarity.write.mode("overwrite").saveAsTable( + self.f("{prefix}item_similarity_upper") + ) # expand upper triangular to full matrix @@ -197,7 +209,9 @@ def fit(self, df): ) self.item_similarity = self.spark.sql(query) - self.item_similarity.write.mode("overwrite").saveAsTable(self.f("{prefix}item_similarity")) + self.item_similarity.write.mode("overwrite").saveAsTable( + self.f("{prefix}item_similarity") + ) # free space self.spark.sql(self.f("DROP TABLE {prefix}item_cooccurrence")) @@ -214,10 +228,14 @@ def get_user_affinity(self, test): """ test.createOrReplaceTempView(self.f("{prefix}df_test")) - query = self.f("SELECT DISTINCT {col_user} FROM {prefix}df_test CLUSTER BY {col_user}") + query = self.f( + "SELECT DISTINCT {col_user} FROM {prefix}df_test CLUSTER BY {col_user}" + ) df_test_users = self.spark.sql(query) - df_test_users.write.mode("overwrite").saveAsTable(self.f("{prefix}df_test_users")) + df_test_users.write.mode("overwrite").saveAsTable( + self.f("{prefix}df_test_users") + ) query = self.f( """ @@ -231,7 +249,12 @@ def get_user_affinity(self, test): return self.spark.sql(query) def recommend_k_items( - self, test, cache_path, top_k=10, remove_seen=True, n_user_prediction_partitions=200, + self, + test, + cache_path, + top_k=10, + remove_seen=True, + n_user_prediction_partitions=200, ): # create item id to continuous index mapping @@ -263,10 +286,16 @@ def recommend_k_items( log.info("sarplus.recommend_k_items 2/3: prepare similarity matrix") self.spark.sql( - self.f("SELECT i1, i2, CAST(value AS DOUBLE) value FROM {prefix}item_similarity_mapped ORDER BY i1, i2") - ).coalesce(1).write.format("com.microsoft.sarplus").mode("overwrite").save(cache_path_output) + self.f( + "SELECT i1, i2, CAST(value AS DOUBLE) value FROM {prefix}item_similarity_mapped ORDER BY i1, i2" + ) + ).coalesce(1).write.format("com.microsoft.sarplus").mode("overwrite").save( 
+ cache_path_output + ) - self.get_user_affinity(test).createOrReplaceTempView(self.f("{prefix}user_affinity")) + self.get_user_affinity(test).createOrReplaceTempView( + self.f("{prefix}user_affinity") + ) # map item ids to index space pred_input = self.spark.sql( @@ -285,7 +314,9 @@ def recommend_k_items( schema = StructType( [ - StructField("userID", pred_input.schema[self.header["col_user"]].dataType, True), + StructField( + "userID", pred_input.schema[self.header["col_user"]].dataType, True + ), StructField("itemID", IntegerType(), True), StructField("score", FloatType(), True), ] @@ -303,18 +334,24 @@ def sar_predict_udf(df): # memory mapped, the memory consumption only happens ones per worker # for all python processes model = SARModel(cache_path_input) - preds = model.predict(df["idx"].values, df["rating"].values, top_k, remove_seen) + preds = model.predict( + df["idx"].values, df["rating"].values, top_k, remove_seen + ) user = df[local_header["col_user"]].iloc[0] - preds_ret = pd.DataFrame([(user, x.id, x.score) for x in preds], columns=range(3)) + preds_ret = pd.DataFrame( + [(user, x.id, x.score) for x in preds], columns=range(3) + ) return preds_ret log.info("sarplus.recommend_k_items 3/3: compute recommendations") df_preds = ( - pred_input.repartition(n_user_prediction_partitions, self.header["col_user"]) + pred_input.repartition( + n_user_prediction_partitions, self.header["col_user"] + ) .groupby(self.header["col_user"]) .apply(sar_predict_udf) ) @@ -344,7 +381,9 @@ def recommend_k_items_slow(self, test, top_k=10, remove_seen=True): if remove_seen: raise ValueError("Not implemented") - self.get_user_affinity(test).write.mode("overwrite").saveAsTable(self.f("{prefix}user_affinity")) + self.get_user_affinity(test).write.mode("overwrite").saveAsTable( + self.f("{prefix}user_affinity") + ) # user_affinity * item_similarity # filter top-k diff --git a/contrib/sarplus/python/tests/conftest.py b/contrib/sarplus/python/tests/conftest.py index 02b14a9c78..9a36620d7b 100644 --- a/contrib/sarplus/python/tests/conftest.py +++ b/contrib/sarplus/python/tests/conftest.py @@ -1,6 +1,7 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
+ def pytest_addoption(parser): parser.addoption( "--token", action="store", default="", help="Access token of the test data" diff --git a/contrib/sarplus/python/tests/test_pyspark_sar.py b/contrib/sarplus/python/tests/test_pyspark_sar.py index 9b03862ba6..995a03ebf2 100644 --- a/contrib/sarplus/python/tests/test_pyspark_sar.py +++ b/contrib/sarplus/python/tests/test_pyspark_sar.py @@ -19,7 +19,9 @@ def assert_compare(expected_id, expected_score, actual_prediction): assert expected_id == actual_prediction.id - assert math.isclose(expected_score, actual_prediction.score, rel_tol=1e-3, abs_tol=1e-3) + assert math.isclose( + expected_score, actual_prediction.score, rel_tol=1e-3, abs_tol=1e-3 + ) @pytest.fixture(scope="module") @@ -44,7 +46,8 @@ def spark(tmp_path_factory, app_name="Sample", url="local[*]", memory="1G"): Path(__file__) .parents[2] .joinpath("scala", "target") - .glob(f"**/sarplus*{os.environ.get('VERSION', '')}*.jar")).absolute() + .glob(f"**/sarplus*{os.environ.get('VERSION', '')}*.jar") + ).absolute() except StopIteration: raise Exception("Could not find Sarplus JAR file") @@ -147,7 +150,12 @@ def test_pandas(spark, sample_cache): item_scores = pd.DataFrame([(0, 2.3), (1, 3.1)], columns=["itemID", "score"]) model = SARModel(sample_cache) - y = model.predict(item_scores["itemID"].values, item_scores["score"].values, top_k=10, remove_seen=False,) + y = model.predict( + item_scores["itemID"].values, + item_scores["score"].values, + top_k=10, + remove_seen=False, + ) assert_compare(0, 0.85, y[0]) assert_compare(1, 6.9699, y[1]) @@ -161,7 +169,9 @@ def test_e2e(spark, pandas_dummy_dataset, header): df = spark.createDataFrame(pandas_dummy_dataset) sar.fit(df) - test_df = spark.createDataFrame(pd.DataFrame({header["col_user"]: [3], header["col_item"]: [2]})) + test_df = spark.createDataFrame( + pd.DataFrame({header["col_user"]: [3], header["col_item"]: [2]}) + ) r1 = ( sar.recommend_k_items_slow(test_df, top_k=3, remove_seen=False) @@ -172,7 +182,11 @@ def test_e2e(spark, pandas_dummy_dataset, header): r2 = ( sar.recommend_k_items( - test_df, "tests/test_e2e_cache", top_k=3, n_user_prediction_partitions=2, remove_seen=False, + test_df, + "tests/test_e2e_cache", + top_k=3, + n_user_prediction_partitions=2, + remove_seen=False, ) .toPandas() .sort_values([header["col_user"], header["col_item"]]) @@ -224,7 +238,11 @@ def demo_usage_data(header, sar_settings, token): # convert timestamp data[header["col_timestamp"]] = data[header["col_timestamp"]].apply( - lambda s: float(calendar.timegm(datetime.datetime.strptime(s, "%Y/%m/%dT%H:%M:%S").timetuple())) + lambda s: float( + calendar.timegm( + datetime.datetime.strptime(s, "%Y/%m/%dT%H:%M:%S").timetuple() + ) + ) ) return data @@ -251,9 +269,18 @@ def sar_settings(): } -@pytest.mark.parametrize("similarity_type, timedecay_formula", [("jaccard", False), ("lift", True)]) -def test_fit(spark, similarity_type, timedecay_formula, train_test_dummy_timestamp, header): - model = SARPlus(spark, **header, timedecay_formula=timedecay_formula, similarity_type=similarity_type) +@pytest.mark.parametrize( + "similarity_type, timedecay_formula", [("jaccard", False), ("lift", True)] +) +def test_fit( + spark, similarity_type, timedecay_formula, train_test_dummy_timestamp, header +): + model = SARPlus( + spark, + **header, + timedecay_formula=timedecay_formula, + similarity_type=similarity_type, + ) trainset, testset = train_test_dummy_timestamp @@ -282,7 +309,16 @@ def test_fit(spark, similarity_type, timedecay_formula, train_test_dummy_timesta (3, 
"lift", "lift"), ], ) -def test_sar_item_similarity(spark, threshold, similarity_type, file, demo_usage_data, sar_settings, header, token): +def test_sar_item_similarity( + spark, + threshold, + similarity_type, + file, + demo_usage_data, + sar_settings, + header, + token, +): model = SARPlus( spark, @@ -291,32 +327,48 @@ def test_sar_item_similarity(spark, threshold, similarity_type, file, demo_usage time_decay_coefficient=30, time_now=None, threshold=threshold, - similarity_type=similarity_type + similarity_type=similarity_type, ) df = spark.createDataFrame(demo_usage_data) model.fit(df) # reference - item_similarity_ref = pd.read_csv(sar_settings["FILE_DIR"] + "sim_" + file + str(threshold) + ".csv" + token) + item_similarity_ref = pd.read_csv( + sar_settings["FILE_DIR"] + "sim_" + file + str(threshold) + ".csv" + token + ) item_similarity_ref = pd.melt( - item_similarity_ref, item_similarity_ref.columns[0], item_similarity_ref.columns[1:], "i2", "value", + item_similarity_ref, + item_similarity_ref.columns[0], + item_similarity_ref.columns[1:], + "i2", + "value", ) item_similarity_ref.columns = ["i1", "i2", "value"] item_similarity_ref = ( - item_similarity_ref[item_similarity_ref.value > 0].sort_values(["i1", "i2"]).reset_index(drop=True) + item_similarity_ref[item_similarity_ref.value > 0] + .sort_values(["i1", "i2"]) + .reset_index(drop=True) ) # actual - item_similarity = model.item_similarity.toPandas().sort_values(["i1", "i2"]).reset_index(drop=True) + item_similarity = ( + model.item_similarity.toPandas() + .sort_values(["i1", "i2"]) + .reset_index(drop=True) + ) if similarity_type == "cooccurrence": assert (item_similarity_ref == item_similarity).all().all() else: - assert (item_similarity.iloc[:, :1] == item_similarity_ref.iloc[:, :1]).all().all() + assert ( + (item_similarity.iloc[:, :1] == item_similarity_ref.iloc[:, :1]).all().all() + ) - assert np.allclose(item_similarity.value.values, item_similarity_ref.value.values) + assert np.allclose( + item_similarity.value.values, item_similarity_ref.value.values + ) # Test 7 @@ -329,7 +381,7 @@ def test_user_affinity(spark, demo_usage_data, sar_settings, header, token): timedecay_formula=True, time_decay_coefficient=30, time_now=time_now, - similarity_type="cooccurrence" + similarity_type="cooccurrence", ) df = spark.createDataFrame(demo_usage_data) @@ -337,27 +389,48 @@ def test_user_affinity(spark, demo_usage_data, sar_settings, header, token): user_affinity_ref = pd.read_csv(sar_settings["FILE_DIR"] + "user_aff.csv" + token) user_affinity_ref = pd.melt( - user_affinity_ref, user_affinity_ref.columns[0], user_affinity_ref.columns[1:], "ItemId", "Rating", + user_affinity_ref, + user_affinity_ref.columns[0], + user_affinity_ref.columns[1:], + "ItemId", + "Rating", + ) + user_affinity_ref = user_affinity_ref[user_affinity_ref.Rating > 0].reset_index( + drop=True ) - user_affinity_ref = user_affinity_ref[user_affinity_ref.Rating > 0].reset_index(drop=True) # construct dataframe with test user id we'd like to get the affinity for - df_test = spark.createDataFrame(pd.DataFrame({header["col_user"]: [sar_settings["TEST_USER_ID"]]})) + df_test = spark.createDataFrame( + pd.DataFrame({header["col_user"]: [sar_settings["TEST_USER_ID"]]}) + ) user_affinity = model.get_user_affinity(df_test).toPandas().reset_index(drop=True) # verify the that item ids are the same assert (user_affinity[header["col_item"]] == user_affinity_ref.ItemId).all() assert np.allclose( - user_affinity_ref[header["col_rating"]].values, user_affinity["Rating"].values, 
atol=sar_settings["ATOL"], + user_affinity_ref[header["col_rating"]].values, + user_affinity["Rating"].values, + atol=sar_settings["ATOL"], ) # Tests 8-10 @pytest.mark.parametrize( - "threshold,similarity_type,file", [(3, "cooccurrence", "count"), (3, "jaccard", "jac"), (3, "lift", "lift")], + "threshold,similarity_type,file", + [(3, "cooccurrence", "count"), (3, "jaccard", "jac"), (3, "lift", "lift")], ) -def test_userpred(spark, tmp_path, threshold, similarity_type, file, header, sar_settings, demo_usage_data, token): +def test_userpred( + spark, + tmp_path, + threshold, + similarity_type, + file, + header, + sar_settings, + demo_usage_data, + token, +): time_now = demo_usage_data[header["col_timestamp"]].max() test_id = "{0}_{1}_{2}".format(threshold, similarity_type, file) @@ -370,13 +443,20 @@ def test_userpred(spark, tmp_path, threshold, similarity_type, file, header, sar time_decay_coefficient=30, time_now=time_now, threshold=threshold, - similarity_type=similarity_type + similarity_type=similarity_type, ) df = spark.createDataFrame(demo_usage_data) model.fit(df) - url = sar_settings["FILE_DIR"] + "userpred_" + file + str(threshold) + "_userid_only.csv" + token + url = ( + sar_settings["FILE_DIR"] + + "userpred_" + + file + + str(threshold) + + "_userid_only.csv" + + token + ) pred_ref = pd.read_csv(url) pred_ref = ( @@ -387,7 +467,11 @@ def test_userpred(spark, tmp_path, threshold, similarity_type, file, header, sar # Note: it's important to have a separate cache_path for each run as they're interferring with each other pred = model.recommend_k_items( - spark.createDataFrame(demo_usage_data[demo_usage_data[header["col_user"]] == sar_settings["TEST_USER_ID"]]), + spark.createDataFrame( + demo_usage_data[ + demo_usage_data[header["col_user"]] == sar_settings["TEST_USER_ID"] + ] + ), cache_path=str(tmp_path.joinpath("test_userpred-" + test_id)), top_k=10, n_user_prediction_partitions=1, @@ -396,4 +480,6 @@ def test_userpred(spark, tmp_path, threshold, similarity_type, file, header, sar pred = pred.toPandas().sort_values("score", ascending=False).reset_index(drop=True) assert (pred.MovieId.values == pred_ref.rec.values).all() - assert np.allclose(pred.score.values, pred_ref.score.values, atol=sar_settings["ATOL"]) + assert np.allclose( + pred.score.values, pred_ref.score.values, atol=sar_settings["ATOL"] + ) From bdb1892796d389fe718b1e8e3fc237013e104964 Mon Sep 17 00:00:00 2001 From: Simon Zhao <43029286+simonzhaoms@users.noreply.github.com> Date: Mon, 13 Dec 2021 08:44:55 +0800 Subject: [PATCH 13/33] Remove trailing whitespaces --- contrib/sarplus/scala/build.sbt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/sarplus/scala/build.sbt b/contrib/sarplus/scala/build.sbt index d79ebab9cf..8299864f21 100644 --- a/contrib/sarplus/scala/build.sbt +++ b/contrib/sarplus/scala/build.sbt @@ -45,7 +45,7 @@ lazy val commonSettings = Seq( if (major >=3 && minor >= 2) Some("spark32") else None } case Some(s: String) => Some(s) - + } ) }, From 2006b584dad84484c1f5e336810324f3ed8ddad8 Mon Sep 17 00:00:00 2001 From: Simon Zhao <43029286+simonzhaoms@users.noreply.github.com> Date: Mon, 13 Dec 2021 08:45:18 +0800 Subject: [PATCH 14/33] Add Python README --- contrib/sarplus/python/README.md | 13 +++++++++++++ contrib/sarplus/python/setup.py | 3 +++ 2 files changed, 16 insertions(+) create mode 100644 contrib/sarplus/python/README.md diff --git a/contrib/sarplus/python/README.md b/contrib/sarplus/python/README.md new file mode 100644 index 0000000000..a280bc7614 --- /dev/null +++ 
b/contrib/sarplus/python/README.md @@ -0,0 +1,13 @@ +# SARplus + +Simple Algorithm for Recommendation (SAR) is a neighborhood based +algorithm for personalized recommendations based on user transaction +history. SAR recommends items that are most **similar** to the ones +that the user already has an existing **affinity** for. Two items are +**similar** if the users that interacted with one item are also likely +to have interacted with the other. A user has an **affinity** to an +item if they have interacted with it in the past. + +SARplus is an efficient implementation of this algorithm for Spark. +More details can be found at +[sarplus@microsoft/recommenders](https://github.com/microsoft/recommenders/tree/main/contrib/sarplus). diff --git a/contrib/sarplus/python/setup.py b/contrib/sarplus/python/setup.py index 53745d453c..cd70c53fe3 100644 --- a/contrib/sarplus/python/setup.py +++ b/contrib/sarplus/python/setup.py @@ -4,6 +4,7 @@ import os import sysconfig +from pathlib import Path from setuptools import setup from setuptools.extension import Extension @@ -30,6 +31,8 @@ def __str__(self): name="pysarplus", version=os.environ["VERSION"], description="SAR prediction for use with PySpark", + long_description=(Path(__file__).parent / "README.md").read_text(), + long_description_content_type='text/markdown', url="https://github.com/microsoft/recommenders/tree/main/contrib/sarplus", author="Markus Cozowicz", author_email="marcozo@microsoft.com", From 46d3ae3859ecfa16ea33215c83538cb6984a575d Mon Sep 17 00:00:00 2001 From: Simon Zhao <43029286+simonzhaoms@users.noreply.github.com> Date: Mon, 13 Dec 2021 14:47:03 +0800 Subject: [PATCH 15/33] Use VERSION as the only place for version update --- .github/workflows/sarplus.yml | 16 +++++++--------- contrib/sarplus/VERSION | 1 + contrib/sarplus/python/MANIFEST.in | 1 + contrib/sarplus/python/pysarplus/__init__.py | 15 +++++++++++++++ contrib/sarplus/python/setup.py | 4 ++-- contrib/sarplus/scala/build.sbt | 2 +- 6 files changed, 27 insertions(+), 12 deletions(-) create mode 100644 contrib/sarplus/VERSION create mode 100644 contrib/sarplus/python/MANIFEST.in diff --git a/.github/workflows/sarplus.yml b/.github/workflows/sarplus.yml index 39b701cee1..88c33afd2f 100644 --- a/.github/workflows/sarplus.yml +++ b/.github/workflows/sarplus.yml @@ -25,7 +25,6 @@ on: env: PYTHON_ROOT: ${{ github.workspace }}/contrib/sarplus/python SCALA_ROOT: ${{ github.workspace }}/contrib/sarplus/scala - SARPLUS_VERSION: 0.5.0 jobs: python: @@ -55,7 +54,7 @@ jobs: - name: Package and check run: | cd "${PYTHON_ROOT}" - sed -i -E "s/version[[:space:]]*=[[:space:]]*.*,/version=\"${SARPLUS_VERSION}\",/" setup.py + cp ../VERSION ./ python -m build --sdist python -m twine check dist/* @@ -75,17 +74,17 @@ jobs: export SCALA_VERSION=${SCALA_VERSION%.*} HADOOP_VERSION=${HADOOP_JAR##*-} export HADOOP_VERSION=${HADOOP_VERSION%.*} - export VERSION="${SARPLUS_VERSION}" sbt ++"${SCALA_VERSION}"! 
package cd "${PYTHON_ROOT}" pytest --token "${ACCESS_TOKEN}" ./tests + echo "sarplus_version=$(cat ../VERSION)" >> $GITHUB_ENV - name: Upload Python package if: github.ref == 'refs/heads/main' && matrix.python-version == '3.10' uses: actions/upload-artifact@v2 with: - name: pysarplus-${{ env.SARPLUS_VERSION }} + name: pysarplus-${{ env.sarplus_version }} path: ${{ env.PYTHON_ROOT }}/dist/*.gz - name: Publish package to PyPI @@ -124,7 +123,6 @@ jobs: - name: Test run: | cd "${SCALA_ROOT}" - export VERSION="${SARPLUS_VERSION}" export SPARK_VERSION="${{ matrix.spark-version }}" export HADOOP_VERSION="${{ matrix.hadoop-version }}" sbt ++${{ matrix.scala-version }}! test @@ -140,7 +138,6 @@ jobs: run: | # generate artifacts cd "${SCALA_ROOT}" - export VERSION="${SARPLUS_VERSION}" export SPARK_VERSION="3.1.2" export HADOOP_VERSION="2.7.4" export SCALA_VERSION=2.12.10 @@ -159,10 +156,11 @@ jobs: for file in {*.jar,*.pom}; do gpg -ab "${file}"; done # bundle - jar cvf sarplus-bundle_2.12-${SARPLUS_VERSION}.jar *.jar *.pom *.asc + jar cvf sarplus-bundle_2.12-$(cat ../VERSION).jar *.jar *.pom *.asc + echo "sarplus_version=$(cat ../VERSION)" >> $GITHUB_ENV - name: Upload Scala bundle uses: actions/upload-artifact@v2 with: - name: sarplus-bundle_2.12-${{ env.SARPLUS_VERSION }} - path: ${{ env.SCALA_ROOT }}/target/scala-2.12/sarplus-bundle_2.12-${{ env.SARPLUS_VERSION }}.jar + name: sarplus-bundle_2.12-${{ env.sarplus_version }} + path: ${{ env.SCALA_ROOT }}/target/scala-2.12/sarplus-bundle_2.12-${{ env.sarplus_version }}.jar diff --git a/contrib/sarplus/VERSION b/contrib/sarplus/VERSION new file mode 100644 index 0000000000..79a2734bbf --- /dev/null +++ b/contrib/sarplus/VERSION @@ -0,0 +1 @@ +0.5.0 \ No newline at end of file diff --git a/contrib/sarplus/python/MANIFEST.in b/contrib/sarplus/python/MANIFEST.in new file mode 100644 index 0000000000..b1fc69e0ab --- /dev/null +++ b/contrib/sarplus/python/MANIFEST.in @@ -0,0 +1 @@ +include VERSION \ No newline at end of file diff --git a/contrib/sarplus/python/pysarplus/__init__.py b/contrib/sarplus/python/pysarplus/__init__.py index 8ae388e483..f49b02aa91 100644 --- a/contrib/sarplus/python/pysarplus/__init__.py +++ b/contrib/sarplus/python/pysarplus/__init__.py @@ -1,5 +1,20 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
+from pathlib import Path + from .SARModel import SARModel from .SARPlus import SARPlus + +__title__ = "pysarplus" +__version__ = (Path(__file__).resolve().parent.parent / "VERSION").read_text().strip() +__author__ = "Markus Cozowicz" +__license__ = "MIT" +__copyright__ = "Copyright 2018-present Microsoft Corporation" + +# Synonyms +TITLE = __title__ +VERSION = __version__ +AUTHOR = __author__ +LICENSE = __license__ +COPYRIGHT = __copyright__ diff --git a/contrib/sarplus/python/setup.py b/contrib/sarplus/python/setup.py index cd70c53fe3..5b85e94dae 100644 --- a/contrib/sarplus/python/setup.py +++ b/contrib/sarplus/python/setup.py @@ -29,9 +29,9 @@ def __str__(self): setup( name="pysarplus", - version=os.environ["VERSION"], + version=(Path(__file__).resolve().parent / "VERSION").read_text().strip(), description="SAR prediction for use with PySpark", - long_description=(Path(__file__).parent / "README.md").read_text(), + long_description=(Path(__file__).resolve().parent / "README.md").read_text(), long_description_content_type='text/markdown', url="https://github.com/microsoft/recommenders/tree/main/contrib/sarplus", author="Markus Cozowicz", diff --git a/contrib/sarplus/scala/build.sbt b/contrib/sarplus/scala/build.sbt index 8299864f21..c06165a1bf 100644 --- a/contrib/sarplus/scala/build.sbt +++ b/contrib/sarplus/scala/build.sbt @@ -12,7 +12,7 @@ lazy val hadoopVer = settingKey[String]("hadoop version") lazy val commonSettings = Seq( organization := "sarplus.microsoft", - version := sys.env.getOrElse("VERSION", "0.5.0"), + version := IO.read(new File("../VERSION")), resolvers ++= Seq( Resolver.sonatypeRepo("snapshots"), Resolver.sonatypeRepo("releases"), From d2dbe11f93b5a3b9db63ff25b822d969a0fd9d29 Mon Sep 17 00:00:00 2001 From: Simon Zhao <43029286+simonzhaoms@users.noreply.github.com> Date: Mon, 13 Dec 2021 15:34:11 +0800 Subject: [PATCH 16/33] Update workflow --- .github/workflows/sarplus.yml | 38 +++++++++++++++-------------------- 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/.github/workflows/sarplus.yml b/.github/workflows/sarplus.yml index 88c33afd2f..cf8abeab43 100644 --- a/.github/workflows/sarplus.yml +++ b/.github/workflows/sarplus.yml @@ -1,26 +1,21 @@ # This workflow will run tests and do packaging for contrib/sarplus. 
# -# Refenreces: -# * [GitHub Actions doc](https://docs.github.com/en/actions) +# References: # * GitHub Actions workflow templates # + [python package](https://github.com/actions/starter-workflows/blob/main/ci/python-package.yml) -# + [python publish](https://github.com/actions/starter-workflows/blob/main/ci/python-publish.yml) # + [scala](https://github.com/actions/starter-workflows/blob/main/ci/scala.yml) # * [GitHub hosted runner - Ubuntu 20.04 LTS](https://github.com/actions/virtual-environments/blob/main/images/linux/Ubuntu2004-README.md) -# * [Azure Databirkcs runtime releases](https://docs.microsoft.com/en-us/azure/databricks/release-notes/runtime/releases) -# * Package publish -# + [Publishing package distribution releases using GitHub Actions CI/CD workflows](https://packaging.python.org/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/) -# + [pypa/gh-action-pypy-publish](https://github.com/pypa/gh-action-pypi-publish) +# * [Azure Databricks runtime releases](https://docs.microsoft.com/en-us/azure/databricks/release-notes/runtime/releases) -name: sarplus package +name: sarplus test and package on: push: paths: - contrib/sarplus/python/** - contrib/sarplus/scala/** - - .github/workflows/sarplus.yml + - contrib/sarplus/VERSION env: PYTHON_ROOT: ${{ github.workspace }}/contrib/sarplus/python @@ -28,6 +23,9 @@ env: jobs: python: + # Test pysarplus with different versions of Python. + # Package pysarplus and upload as GitHub workflow artifact when merged into + # the main branch. runs-on: ubuntu-latest strategy: matrix: @@ -58,7 +56,7 @@ jobs: python -m build --sdist python -m twine check dist/* - - name: Testing + - name: Test env: ACCESS_TOKEN: ${{ secrets.SARPLUS_TESTDATA_ACCESS_TOKEN }} run: | @@ -80,24 +78,16 @@ jobs: pytest --token "${ACCESS_TOKEN}" ./tests echo "sarplus_version=$(cat ../VERSION)" >> $GITHUB_ENV - - name: Upload Python package + - name: Upload Python package as GitHub artifact if: github.ref == 'refs/heads/main' && matrix.python-version == '3.10' uses: actions/upload-artifact@v2 with: name: pysarplus-${{ env.sarplus_version }} path: ${{ env.PYTHON_ROOT }}/dist/*.gz - - name: Publish package to PyPI - if: github.ref == 'refs/heads/main' && matrix.python-version == '3.10' - uses: pypa/gh-action-pypi-publish@release/v1 - with: - user: __token__ - password: ${{ secrets.SARPLUS_TEST_PYPI_API_TOKEN }} - repository_url: https://test.pypi.org/legacy/ - packages_dir: ${{ env.PYTHON_ROOT }}/dist/ - skip_existing: true - scala-test: + # Test sarplus with different versions of Databricks runtime, 2 LTSs and 1 + # latest. runs-on: ubuntu-latest strategy: matrix: @@ -128,6 +118,10 @@ jobs: sbt ++${{ matrix.scala-version }}! test scala-package: + # Package sarplus and upload as GitHub workflow artifact when merged into + # the main branch. 
+ needs: scala-test + if: github.ref == 'refs/heads/main' runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 @@ -159,7 +153,7 @@ jobs: jar cvf sarplus-bundle_2.12-$(cat ../VERSION).jar *.jar *.pom *.asc echo "sarplus_version=$(cat ../VERSION)" >> $GITHUB_ENV - - name: Upload Scala bundle + - name: Upload Scala bundle as GitHub artifact uses: actions/upload-artifact@v2 with: name: sarplus-bundle_2.12-${{ env.sarplus_version }} From 87d78d82082718636eafec82707d062d183292ca Mon Sep 17 00:00:00 2001 From: Simon Zhao <43029286+simonzhaoms@users.noreply.github.com> Date: Mon, 13 Dec 2021 15:57:12 +0800 Subject: [PATCH 17/33] Remove unused code --- contrib/sarplus/scala/build.sbt | 2 -- contrib/sarplus/scala/project/plugins.sbt | 1 - 2 files changed, 3 deletions(-) diff --git a/contrib/sarplus/scala/build.sbt b/contrib/sarplus/scala/build.sbt index c06165a1bf..0ce41936ef 100644 --- a/contrib/sarplus/scala/build.sbt +++ b/contrib/sarplus/scala/build.sbt @@ -45,7 +45,6 @@ lazy val commonSettings = Seq( if (major >=3 && minor >= 2) Some("spark32") else None } case Some(s: String) => Some(s) - } ) }, @@ -102,4 +101,3 @@ credentials += Credentials( "C72E596B384EC14CFA65D80A36CB250AF1C18ECE", "ignored" ) -// credentials += Credentials(Path.userHome / ".sbt" / "sonatype_credentials") diff --git a/contrib/sarplus/scala/project/plugins.sbt b/contrib/sarplus/scala/project/plugins.sbt index e29553cb77..0a8aeeaba2 100644 --- a/contrib/sarplus/scala/project/plugins.sbt +++ b/contrib/sarplus/scala/project/plugins.sbt @@ -1,4 +1,3 @@ addSbtPlugin("no.arktekk.sbt" % "aether-deploy" % "0.27.0") addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.9.1") addSbtPlugin("com.github.sbt" % "sbt-pgp" % "2.1.2") -// addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "3.9.10") \ No newline at end of file From 52376d1ede3caa6b536f79d2db3483ec16e2f539 Mon Sep 17 00:00:00 2001 From: Simon Zhao <43029286+simonzhaoms@users.noreply.github.com> Date: Mon, 13 Dec 2021 16:09:53 +0800 Subject: [PATCH 18/33] Remove azure-pipelines.yml --- contrib/sarplus/azure-pipelines.yml | 93 ----------------------------- 1 file changed, 93 deletions(-) delete mode 100644 contrib/sarplus/azure-pipelines.yml diff --git a/contrib/sarplus/azure-pipelines.yml b/contrib/sarplus/azure-pipelines.yml deleted file mode 100644 index ae56707d70..0000000000 --- a/contrib/sarplus/azure-pipelines.yml +++ /dev/null @@ -1,93 +0,0 @@ -# Python package -# Create and test a Python package on multiple Python versions. 
-# Add steps that analyze code, save the dist with the build record, publish to a PyPI-compatible index, and more: -# https://docs.microsoft.com/azure/devops/pipelines/languages/python - -pr: - branches: - include: - - staging - - master - paths: - include: - - contrib/sarplus/* - -# no CI trigger -trigger: none - -jobs: -- job: 'Test' - pool: - vmImage: 'Ubuntu 16.04' - strategy: - matrix: - Python35-Spark2.3: - python.version: '3.5' - spark.version: '2.3.0' - Python36-Spark2.3: - python.version: '3.6' - spark.version: '2.3.0' - Python35-Spark2.4.1: - python.version: '3.5' - spark.version: '2.4.1' - Python36-Spark2.4.1: - python.version: '3.6' - spark.version: '2.4.1' - Python36-Spark2.4.3: - python.version: '3.6' - spark.version: '2.4.3' - Python37-Spark2.4.3: - python.version: '3.7' - spark.version: '2.4.3' - maxParallel: 4 - - steps: - - task: ComponentGovernanceComponentDetection@0 - inputs: - scanType: 'Register' - verbosity: 'Verbose' - alertWarningLevel: 'High' - sourceScanPath: contrib/sarplus - - - task: UsePythonVersion@0 - inputs: - versionSpec: '$(python.version)' - architecture: 'x64' - - # pyarrow version: https://issues.apache.org/jira/projects/SPARK/issues/SPARK-29367 - - script: python -m pip install --upgrade pip && pip install pyspark==$(spark.version) pytest pandas pybind11 pyarrow==0.14.1 sklearn - displayName: 'Install dependencies' - - - script: | - cd contrib/sarplus/scala - sparkversion=$(spark.version) sbt package - cd ../python - python setup.py install - pytest tests --doctest-modules --junitxml=junit/test-results.xml - displayName: 'pytest' - - - script: | - cd contrib/sarplus/scala - sparkversion=$(spark.version) sbt test - displayName: 'scala test' - - - - task: PublishTestResults@2 - inputs: - testResultsFiles: '**/test-results.xml' - testRunTitle: 'Python $(python.version)' - condition: succeededOrFailed() - -- job: 'Publish' - dependsOn: 'Test' - pool: - vmImage: 'Ubuntu 16.04' - - steps: - - task: UsePythonVersion@0 - inputs: - versionSpec: '3.x' - architecture: 'x64' - - - script: cd contrib/sarplus/python && python setup.py sdist - displayName: 'Build sdist' From 5c786fb74bc13d62c3dc91819bdcb0edc001c938 Mon Sep 17 00:00:00 2001 From: Simon Zhao <43029286+simonzhaoms@users.noreply.github.com> Date: Mon, 13 Dec 2021 16:52:12 +0800 Subject: [PATCH 19/33] Update DEVELOPMENT.md --- contrib/sarplus/DEVELOPMENT.md | 106 +++++++++++++++++++++++---------- 1 file changed, 73 insertions(+), 33 deletions(-) diff --git a/contrib/sarplus/DEVELOPMENT.md b/contrib/sarplus/DEVELOPMENT.md index 40cac04a0b..600d149427 100644 --- a/contrib/sarplus/DEVELOPMENT.md +++ b/contrib/sarplus/DEVELOPMENT.md @@ -1,69 +1,109 @@ # Packaging -For [databricks](https://databricks.com/) to properly install a [C++ extension](https://docs.python.org/3/extending/building.html), one must take a detour through [pypi](https://pypi.org/). -Use [twine](https://github.com/pypa/twine) to upload the package to [pypi](https://pypi.org/). +For [databricks](https://databricks.com/) to properly install a [C++ +extension](https://docs.python.org/3/extending/building.html), one +must take a detour through [pypi](https://pypi.org/). Use +[twine](https://github.com/pypa/twine) to upload the package to +[pypi](https://pypi.org/). 
```bash -cd python - -python setup.py sdist +# build dependencies +python -m pip install -U build pip twine -twine upload dist/pysarplus-*.tar.gz +cd python +cp ../VERSION ./ # version file +python -m build --sdist +python -m twine upload dist/* ``` -On [Spark](https://spark.apache.org/) one can install all 3 components (C++, Python, Scala) in one pass by creating a [Spark Package](https://spark-packages.org/). Documentation is rather sparse. Steps to install +On [Spark](https://spark.apache.org/) one can install all 3 components +(C++, Python, Scala) in one pass by creating a [Spark +Package](https://spark-packages.org/). Steps to install 1. Package and publish the [pip package](python/setup.py) (see above) -2. Package the [Spark package](scala/build.sbt), which includes the [Scala formatter](scala/src/main/scala/microsoft/sarplus) and references the [pip package](scala/python/requirements.txt) (see below) -3. Upload the zipped Scala package to [Spark Package](https://spark-packages.org/) through a browser. [sbt spPublish](https://github.com/databricks/sbt-spark-package) has a few [issues](https://github.com/databricks/sbt-spark-package/issues/31) so it always fails for me. Don't use spPublishLocal as the packages are not created properly (names don't match up, [issue](https://github.com/databricks/sbt-spark-package/issues/17)) and furthermore fail to install if published to [Spark-Packages.org](https://spark-packages.org/). +2. Package the [Spark package](scala/build.sbt), which includes the + [Scala formatter](scala/src/main/scala/microsoft/sarplus) and + references the pip package (see below) +3. Upload the zipped Scala package bundle to [Nexus Repository + Manager](https://oss.sonatype.org/) through a browser. ```bash +export SPARK_VERSION="3.1.2" +export HADOOP_VERSION="2.7.4" +export SCALA_VERSION="2.12.10" +GPG_KEY="" + +# generate artifacts cd scala -sbt spPublish +sbt ++${SCALA_VERSION}! package +sbt ++${SCALA_VERSION}! packageDoc +sbt ++${SCALA_VERSION}! packageSrc +sbt ++${SCALA_VERSION}! makePom + +# generate the artifact (sarplus-*-spark32.jar) for Spark 3.2+ +export SPARK_VERSION="3.2.0" +export HADOOP_VERSION="3.3.1" +export SCALA_VERSION="2.12.14" +sbt ++${SCALA_VERSION}! package + +# sign with GPG +cd target/scala-${SCALA_VERSION%.*} +gpg --import <(cat <<< "${GPG_KEY}") +for file in {*.jar,*.pom}; do gpg -ab "${file}"; done + +# bundle +jar cvf sarplus-bundle_2.12-$(cat ../VERSION).jar *.jar *.pom *.asc ``` +where `SPARK_VERSION`, `HADOOP_VERSION`, `SCALA_VERSION` should be +customized as needed. + + ## Testing To test the python UDF + C++ backend ```bash -cd python -python setup.py install && pytest -s tests/ +# access token for https://recodatasets.blob.core.windows.net/sarunittest/ +ACCESS_TOKEN="" + +# build dependencies +python -m pip install -U build pip twine + +# build +cd python +cp ../VERSION ./ # version file +python -m build --sdist + +# test +pytest --token "${ACCESS_TOKEN}" ./tests ``` To test the Scala formatter ```bash +export SPARK_VERSION=3.2.0 +export HADOOP_VERSION=3.3.1 +export SCALA_VERSION=2.12.14 + cd scala -sbt test +sbt ++${SCALA_VERSION}! test ``` -(use ~test and it will automatically check for changes in source files, but not build.sbt) - ## Notes for Spark 3.x ## The code now has been modified to support Spark 3.x, and has been tested under different versions of Databricks Runtime (including 6.4 Extended Support, 7.3 LTS, 9.1 LTS, 10.0 and 10.1) on Azure Databricks -Service. 
But now manual packaging is needed: - - -```bash -export VERSION=0.5.0 -cd python -python setup.py bdist_wheel # => dist/pysarplus-0.5.0-cp38-cp38-linux_x86_64.whl - -export SPARK_VERSION=3.2.0 -export HADOOP_VERSION=3.3.1 -export SCALA_VERSION=2.12.14 -cd scala -sbt ++${SCALA_VERSION} package # => target/scala-2.12/sarplus_2.12.14_s3.2.0_h3.3.1-0.5.0.jar -``` - -where `VERSION`, `SPARK_VERSION`, `HADOOP_VERSION`, `SCALA_VERSION` -should be customized as needed. When running on Spark 3.x, extra -configurations are also required: +Service. However, there is a breaking change of +[org/apache.spark.sql.execution.datasources.OutputWriter](https://github.com/apache/spark/blob/dc0fa1eef74238d745dabfdc86705b59d95b07e1/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/OutputWriter.scala#L74) +on **Spark 3.2**, which adds an extra function `path()`, so an +additional JAR file with the classifier `spark32` will be needed if +running on Spark 3.2 (See above for packaging). + +Also, extra configurations are also required when running on Spark +3.x: ``` spark.sql.sources.default parquet From f1f8cf0c27e661e028534b831e003e2f5cec7971 Mon Sep 17 00:00:00 2001 From: Simon Zhao <43029286+simonzhaoms@users.noreply.github.com> Date: Mon, 13 Dec 2021 16:56:59 +0800 Subject: [PATCH 20/33] Update README.md --- contrib/sarplus/README.md | 53 +++++++++++++++++++++++++++------------ 1 file changed, 37 insertions(+), 16 deletions(-) diff --git a/contrib/sarplus/README.md b/contrib/sarplus/README.md index d898e2648b..bf27610b19 100644 --- a/contrib/sarplus/README.md +++ b/contrib/sarplus/README.md @@ -5,7 +5,13 @@ Pronounced surplus as it's simply better if not best! [![Build Status](https://dev.azure.com/best-practices/recommenders/_apis/build/status/contrib%20sarplus?branchName=master)](https://dev.azure.com/best-practices/recommenders/_build/latest?definitionId=107&branchName=master) [![PyPI version](https://badge.fury.io/py/pysarplus.svg)](https://badge.fury.io/py/pysarplus) -Simple Algorithm for Recommendation (SAR) is a neighborhood based algorithm for personalized recommendations based on user transaction history. SAR recommends items that are most **similar** to the ones that the user already has an existing **affinity** for. Two items are **similar** if the users that interacted with one item are also likely to have interacted with the other. A user has an **affinity** to an item if they have interacted with it in the past. +Simple Algorithm for Recommendation (SAR) is a neighborhood based +algorithm for personalized recommendations based on user transaction +history. SAR recommends items that are most **similar** to the ones +that the user already has an existing **affinity** for. Two items are +**similar** if the users that interacted with one item are also likely +to have interacted with the other. A user has an **affinity** to an +item if they have interacted with it in the past. SARplus is an efficient implementation of this algorithm for Spark. @@ -13,7 +19,8 @@ Features: * Scalable PySpark based [implementation](python/pysarplus/SARPlus.py) * Fast C++ based [predictions](python/src/pysarplus.cpp) -* Reduced memory consumption: similarity matrix cached in-memory once per worker, shared accross python executors +* Reduced memory consumption: similarity matrix cached in-memory once + per worker, shared accross python executors ## Benchmarks @@ -25,15 +32,23 @@ Features: There are a couple of key optimizations: -* map item ids (e.g. 
strings) to a continuous set of indexes to optmize storage and simplify access -* convert similarity matrix to exactly the representation the C++ component needs, thus enabling simple shared, memory mapping of the cache file and avoid parsing. This requires a customer formatter, written in Scala -* shared read-only memory mapping allows us to re-use the same memory from multiple python executors on the same worker node -* partition the input test users and past seen items by users, allowing for scale out +* map item ids (e.g. strings) to a continuous set of indexes to + optmize storage and simplify access +* convert similarity matrix to exactly the representation the C++ + component needs, thus enabling simple shared, memory mapping of the + cache file and avoid parsing. This requires a customer formatter, + written in Scala +* shared read-only memory mapping allows us to re-use the same memory + from multiple python executors on the same worker node +* partition the input test users and past seen items by users, + allowing for scale out * perform as much of the work as possible in PySpark (way simpler) * top-k computation -** reverse the join by summing reverse joining the users past seen items with any related items -** make sure to always just keep top-k items in-memory -** use standard join using binary search between users past seen items and the related items + + reverse the join by summing reverse joining the users past seen + items with any related items + + make sure to always just keep top-k items in-memory + + use standard join using binary search between users past seen + items and the related items ![Image of sarplus top-k recommendation optimization](https://recodatasets.z20.web.core.windows.net/images/sarplus_udf.svg) @@ -76,7 +91,7 @@ Insert this cell prior to the code above. ```python import os -SUBMIT_ARGS = "--packages eisber:sarplus:0.2.6 pyspark-shell" +SUBMIT_ARGS = "--packages com.microsoft.sarplus:sarplus:0.5.0 pyspark-shell" os.environ["PYSPARK_SUBMIT_ARGS"] = SUBMIT_ARGS from pyspark.sql import SparkSession @@ -96,21 +111,26 @@ spark = ( ```bash pip install pysarplus -pyspark --packages eisber:sarplus:0.2.6 --conf spark.sql.crossJoin.enabled=true +pyspark --packages com.microsoft.sarplus:sarplus:0.5.0 --conf spark.sql.crossJoin.enabled=true ``` ### Databricks -One must set the crossJoin property to enable calculation of the similarity matrix (Clusters / < Cluster > / Configuration / Spark Config) +One must set the crossJoin property to enable calculation of the +similarity matrix (Clusters / < Cluster > / Configuration / +Spark Config) ``` spark.sql.crossJoin.enabled true +spark.sql.sources.default parquet +spark.sql.legacy.createHiveTableByDefault true ``` 1. Navigate to your workspace 2. Create library 3. Under 'Source' select 'Maven Coordinate' -4. Enter 'eisber:sarplus:0.2.5' or 'eisber:sarplus:0.2.6' if you're on Spark 2.4.1 +4. Enter com.microsoft:sarplus:0.5.0' or + microsoft:sarplus:0.5.0:spark32' if you're on Spark 3.2+ 5. Hit 'Create Library' 6. Attach to your cluster 7. Create 2nd library @@ -130,10 +150,10 @@ You'll also have to mount shared storage 2. Generate new token: enter 'sarplus' 3. Use databricks shell (installation here) 4. databricks configure --token -4.1. Host: e.g. https://westus.azuredatabricks.net + 1. Host: e.g. https://westus.azuredatabricks.net 5. databricks secrets create-scope --scope all --initial-manage-principal users 6. databricks secrets put --scope all --key sarpluscache -6.1. 
enter Azure Storage Blob key of Azure Storage created before + 1. enter Azure Storage Blob key of Azure Storage created before 7. Run mount code @@ -153,4 +173,5 @@ logging.getLogger("py4j").setLevel(logging.ERROR) ## Development -See [DEVELOPMENT.md](DEVELOPMENT.md) for implementation details and development information. +See [DEVELOPMENT.md](DEVELOPMENT.md) for implementation details and +development information. From da7473548801f46e3002b8e5e3f31334583371ec Mon Sep 17 00:00:00 2001 From: Simon Zhao <43029286+simonzhaoms@users.noreply.github.com> Date: Mon, 13 Dec 2021 16:57:20 +0800 Subject: [PATCH 21/33] Update sarplus.yml --- .github/workflows/sarplus.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/sarplus.yml b/.github/workflows/sarplus.yml index cf8abeab43..bf05baeed4 100644 --- a/.github/workflows/sarplus.yml +++ b/.github/workflows/sarplus.yml @@ -134,14 +134,14 @@ jobs: cd "${SCALA_ROOT}" export SPARK_VERSION="3.1.2" export HADOOP_VERSION="2.7.4" - export SCALA_VERSION=2.12.10 + export SCALA_VERSION="2.12.10" sbt ++${SCALA_VERSION}! package sbt ++${SCALA_VERSION}! packageDoc sbt ++${SCALA_VERSION}! packageSrc sbt ++${SCALA_VERSION}! makePom export SPARK_VERSION="3.2.0" export HADOOP_VERSION="3.3.1" - export SCALA_VERSION=2.12.14 + export SCALA_VERSION="2.12.14" sbt ++${SCALA_VERSION}! package # sign with GPG From 8c696e4b2f41eade9a990531b2bc8de13b6d195b Mon Sep 17 00:00:00 2001 From: Simon Zhao <43029286+simonzhaoms@users.noreply.github.com> Date: Mon, 13 Dec 2021 17:09:16 +0800 Subject: [PATCH 22/33] Add link to publish scala package manually to central repository --- contrib/sarplus/DEVELOPMENT.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/contrib/sarplus/DEVELOPMENT.md b/contrib/sarplus/DEVELOPMENT.md index 600d149427..3b77e65735 100644 --- a/contrib/sarplus/DEVELOPMENT.md +++ b/contrib/sarplus/DEVELOPMENT.md @@ -25,7 +25,8 @@ Package](https://spark-packages.org/). Steps to install [Scala formatter](scala/src/main/scala/microsoft/sarplus) and references the pip package (see below) 3. Upload the zipped Scala package bundle to [Nexus Repository - Manager](https://oss.sonatype.org/) through a browser. + Manager](https://oss.sonatype.org/) through a browser (See [publish + manul](https://central.sonatype.org/publish/publish-manual/)). 
```bash export SPARK_VERSION="3.1.2" From 1094e2a467ccb84bb838a7f7a4d503bbd14e33e6 Mon Sep 17 00:00:00 2001 From: Simon Zhao <43029286+simonzhaoms@users.noreply.github.com> Date: Mon, 13 Dec 2021 18:40:18 +0800 Subject: [PATCH 23/33] Add docstring for SARPlus init function --- contrib/sarplus/python/pysarplus/SARPlus.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/contrib/sarplus/python/pysarplus/SARPlus.py b/contrib/sarplus/python/pysarplus/SARPlus.py index 91620ede1e..7bb619bf67 100644 --- a/contrib/sarplus/python/pysarplus/SARPlus.py +++ b/contrib/sarplus/python/pysarplus/SARPlus.py @@ -39,6 +39,21 @@ def __init__( timedecay_formula=False, threshold=1, ): + + """Initialize model parameters + Args: + spark (pyspark.sql.SparkSession): Spark session + col_user (str): user column name + col_item (str): item column name + col_rating (str): rating column name + col_timestamp (str): timestamp column name + table_prefix (str): name prefix of the generated tables + similarity_type (str): ['cooccurrence', 'jaccard', 'lift'] option for computing item-item similarity + time_decay_coefficient (float): number of days till ratings are decayed by 1/2 + time_now (int | None): current time for time decay calculation + timedecay_formula (bool): flag to apply time decay + threshold (int): item-item co-occurrences below this threshold will be removed + """ assert threshold > 0 self.spark = spark From fc7cab2135b003f6e1d51642575796369cc87127 Mon Sep 17 00:00:00 2001 From: Simon Zhao <43029286+simonzhaoms@users.noreply.github.com> Date: Mon, 13 Dec 2021 20:53:34 +0800 Subject: [PATCH 24/33] Use VERSION --- contrib/sarplus/python/pysarplus/__init__.py | 2 +- contrib/sarplus/python/setup.py | 4 ++-- contrib/sarplus/scala/build.sbt | 8 ++++---- contrib/sarplus/scala/python/setup.py | 10 +++------- 4 files changed, 10 insertions(+), 14 deletions(-) diff --git a/contrib/sarplus/python/pysarplus/__init__.py b/contrib/sarplus/python/pysarplus/__init__.py index f49b02aa91..e889425413 100644 --- a/contrib/sarplus/python/pysarplus/__init__.py +++ b/contrib/sarplus/python/pysarplus/__init__.py @@ -8,7 +8,7 @@ __title__ = "pysarplus" __version__ = (Path(__file__).resolve().parent.parent / "VERSION").read_text().strip() -__author__ = "Markus Cozowicz" +__author__ = "RecoDev Team at Microsoft" __license__ = "MIT" __copyright__ = "Copyright 2018-present Microsoft Corporation" diff --git a/contrib/sarplus/python/setup.py b/contrib/sarplus/python/setup.py index 5b85e94dae..81a3b9ebdd 100644 --- a/contrib/sarplus/python/setup.py +++ b/contrib/sarplus/python/setup.py @@ -34,8 +34,8 @@ def __str__(self): long_description=(Path(__file__).resolve().parent / "README.md").read_text(), long_description_content_type='text/markdown', url="https://github.com/microsoft/recommenders/tree/main/contrib/sarplus", - author="Markus Cozowicz", - author_email="marcozo@microsoft.com", + author="RecoDev Team at Microsoft", + author_email="recodevteam@service.microsoft.com", license="MIT", classifiers=[ "Development Status :: 4 - Beta", diff --git a/contrib/sarplus/scala/build.sbt b/contrib/sarplus/scala/build.sbt index 0ce41936ef..b4891188fc 100644 --- a/contrib/sarplus/scala/build.sbt +++ b/contrib/sarplus/scala/build.sbt @@ -74,10 +74,10 @@ scmInfo := Some( developers := List( Developer( - id = "eisber", - name = "Markus Cozowicz", - email = "marcozo@microsoft.com", - url = url("https://github.com/eisber") + id = "recodev", + name = "RecoDev Team at Microsoft", + email = "recodevteam@service.microsoft.com", + url = 
url("https://github.com/microsoft/recommenders/") ) ) diff --git a/contrib/sarplus/scala/python/setup.py b/contrib/sarplus/scala/python/setup.py index de096d333f..9ceedd4a52 100644 --- a/contrib/sarplus/scala/python/setup.py +++ b/contrib/sarplus/scala/python/setup.py @@ -4,16 +4,12 @@ from distutils.core import setup import os -version = os.getenv("VERSION") -if version is None: - version = "0.5.0" - setup( name="pysarplus_dummy", - version=os.environ["VERSION"], + version=(Path(__file__).resolve().parent / "VERSION").read_text().strip(), description="pysarplus dummy package to trigger spark packaging", - author="Markus Cozowicz", - author_email="marcozo@microsoft.com", + author="RecoDev Team at Microsoft", + author_email="recodevteam@service.microsoft.com", url="https://github.com/Microsoft/Recommenders/contrib/sarplus", packages=["pysarplus_dummy"], ) From 34c72f73934474c7d4f8d4cb7be0e054350e1f55 Mon Sep 17 00:00:00 2001 From: Simon Zhao <43029286+simonzhaoms@users.noreply.github.com> Date: Mon, 13 Dec 2021 20:59:08 +0800 Subject: [PATCH 25/33] Add simon in AUTHORS.md --- AUTHORS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/AUTHORS.md b/AUTHORS.md index 3ded6466f9..7cbac2591c 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -103,6 +103,8 @@ To contributors: please add your name to the list when you submit a patch to the * Windows test pipelines * **[Satyadev Ntv](https://github.com/satyadevntv)** * GeoIMC algorithm +* **[Simon Zhao](https://github.com/simonzhaoms)** + * SARplus algorithm upgrade * **[Yan Zhang](https://github.com/YanZhangADS)** * Diversity metrics including coverage, novelty, diversity, and serendipity * Diversity metrics evaluation sample notebook From fa9ece8a572931d2c46a5927d6f7a1fab9c93ea6 Mon Sep 17 00:00:00 2001 From: Simon Zhao <43029286+simonzhaoms@users.noreply.github.com> Date: Mon, 13 Dec 2021 23:39:49 +0800 Subject: [PATCH 26/33] Remove GPG key --- contrib/sarplus/scala/build.sbt | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/contrib/sarplus/scala/build.sbt b/contrib/sarplus/scala/build.sbt index b4891188fc..7c573270d6 100644 --- a/contrib/sarplus/scala/build.sbt +++ b/contrib/sarplus/scala/build.sbt @@ -91,13 +91,3 @@ publishTo := { else Some("releases" at nexus + "service/local/staging/deploy/maven2") } publishMavenStyle := true - - -// PGP key configuration - -credentials += Credentials( - "GnuPG Key ID", - "gpg", - "C72E596B384EC14CFA65D80A36CB250AF1C18ECE", - "ignored" -) From d01637024f132c92a31e8c424c5410b0c3d3d2e5 Mon Sep 17 00:00:00 2001 From: Simon Zhao <43029286+simonzhaoms@users.noreply.github.com> Date: Mon, 13 Dec 2021 23:40:27 +0800 Subject: [PATCH 27/33] Update sarplus.yml --- .github/workflows/sarplus.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/sarplus.yml b/.github/workflows/sarplus.yml index bf05baeed4..00549b6781 100644 --- a/.github/workflows/sarplus.yml +++ b/.github/workflows/sarplus.yml @@ -16,6 +16,7 @@ on: - contrib/sarplus/python/** - contrib/sarplus/scala/** - contrib/sarplus/VERSION + - .github/workflows/sarplus.yml env: PYTHON_ROOT: ${{ github.workspace }}/contrib/sarplus/python From b8f31f22bfbf9f1ba2d7d42c29d8bd60be114600 Mon Sep 17 00:00:00 2001 From: Simon Zhao <43029286+simonzhaoms@users.noreply.github.com> Date: Mon, 13 Dec 2021 23:41:03 +0800 Subject: [PATCH 28/33] Resolve flake8 errors --- contrib/sarplus/python/tests/test_pyspark_sar.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/contrib/sarplus/python/tests/test_pyspark_sar.py 
b/contrib/sarplus/python/tests/test_pyspark_sar.py index 995a03ebf2..b3ace34f0a 100644 --- a/contrib/sarplus/python/tests/test_pyspark_sar.py +++ b/contrib/sarplus/python/tests/test_pyspark_sar.py @@ -449,14 +449,7 @@ def test_userpred( df = spark.createDataFrame(demo_usage_data) model.fit(df) - url = ( - sar_settings["FILE_DIR"] - + "userpred_" - + file - + str(threshold) - + "_userid_only.csv" - + token - ) + url = sar_settings["FILE_DIR"] + "userpred_" + file + str(threshold) + "_userid_only.csv" + token pred_ref = pd.read_csv(url) pred_ref = ( From 7ef200d6820cd560cc4122f3df48f88564bef4ea Mon Sep 17 00:00:00 2001 From: Simon Zhao <43029286+simonzhaoms@users.noreply.github.com> Date: Mon, 13 Dec 2021 23:52:45 +0800 Subject: [PATCH 29/33] Update setup.py --- contrib/sarplus/python/pysarplus/__init__.py | 2 +- contrib/sarplus/python/setup.py | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/contrib/sarplus/python/pysarplus/__init__.py b/contrib/sarplus/python/pysarplus/__init__.py index e889425413..0d922d7df1 100644 --- a/contrib/sarplus/python/pysarplus/__init__.py +++ b/contrib/sarplus/python/pysarplus/__init__.py @@ -7,7 +7,7 @@ from .SARPlus import SARPlus __title__ = "pysarplus" -__version__ = (Path(__file__).resolve().parent.parent / "VERSION").read_text().strip() +__version__ = (Path(__file__).resolve().parent / "VERSION").read_text().strip() __author__ = "RecoDev Team at Microsoft" __license__ = "MIT" __copyright__ = "Copyright 2018-present Microsoft Corporation" diff --git a/contrib/sarplus/python/setup.py b/contrib/sarplus/python/setup.py index 81a3b9ebdd..f0055aad94 100644 --- a/contrib/sarplus/python/setup.py +++ b/contrib/sarplus/python/setup.py @@ -1,7 +1,6 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
-import os import sysconfig from pathlib import Path @@ -38,10 +37,8 @@ def __str__(self): author_email="recodevteam@service.microsoft.com", license="MIT", classifiers=[ - "Development Status :: 4 - Beta", + "Development Status :: 5 - Production/Stable", "License :: OSI Approved :: MIT License", - "Programming Language :: Python :: 3.4", - "Programming Language :: Python :: 3.5", "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", @@ -55,6 +52,7 @@ def __str__(self): install_requires=DEPENDENCIES, tests_require=["pytest"], packages=["pysarplus"], + include_package_data=True, ext_modules=[ Extension( "pysarplus_cpp", From 75028d5295c000314d97b3cdf9a42c0b348ccb95 Mon Sep 17 00:00:00 2001 From: Simon Zhao <43029286+simonzhaoms@users.noreply.github.com> Date: Tue, 14 Dec 2021 10:40:15 +0800 Subject: [PATCH 30/33] Move VERSION as package data file of pysarplus --- .github/workflows/sarplus.yml | 2 +- contrib/sarplus/DEVELOPMENT.md | 4 ++-- contrib/sarplus/python/MANIFEST.in | 1 - contrib/sarplus/python/setup.py | 4 ++-- contrib/sarplus/python/tests/test_pyspark_sar.py | 3 +-- 5 files changed, 6 insertions(+), 8 deletions(-) delete mode 100644 contrib/sarplus/python/MANIFEST.in diff --git a/.github/workflows/sarplus.yml b/.github/workflows/sarplus.yml index 00549b6781..6f4e37a330 100644 --- a/.github/workflows/sarplus.yml +++ b/.github/workflows/sarplus.yml @@ -53,7 +53,7 @@ jobs: - name: Package and check run: | cd "${PYTHON_ROOT}" - cp ../VERSION ./ + cp ../VERSION ./pysarplus/ python -m build --sdist python -m twine check dist/* diff --git a/contrib/sarplus/DEVELOPMENT.md b/contrib/sarplus/DEVELOPMENT.md index 3b77e65735..3600e2d6d3 100644 --- a/contrib/sarplus/DEVELOPMENT.md +++ b/contrib/sarplus/DEVELOPMENT.md @@ -11,7 +11,7 @@ must take a detour through [pypi](https://pypi.org/). 
Use python -m pip install -U build pip twine cd python -cp ../VERSION ./ # version file +cp ../VERSION ./pysarplus/ # version file python -m build --sdist python -m twine upload dist/* ``` @@ -73,7 +73,7 @@ python -m pip install -U build pip twine # build cd python -cp ../VERSION ./ # version file +cp ../VERSION ./pysarplus/ # version file python -m build --sdist # test diff --git a/contrib/sarplus/python/MANIFEST.in b/contrib/sarplus/python/MANIFEST.in deleted file mode 100644 index b1fc69e0ab..0000000000 --- a/contrib/sarplus/python/MANIFEST.in +++ /dev/null @@ -1 +0,0 @@ -include VERSION \ No newline at end of file diff --git a/contrib/sarplus/python/setup.py b/contrib/sarplus/python/setup.py index f0055aad94..fc1c866189 100644 --- a/contrib/sarplus/python/setup.py +++ b/contrib/sarplus/python/setup.py @@ -28,7 +28,7 @@ def __str__(self): setup( name="pysarplus", - version=(Path(__file__).resolve().parent / "VERSION").read_text().strip(), + version=(Path(__file__).resolve().parent / "pysarplus" / "VERSION").read_text().strip(), description="SAR prediction for use with PySpark", long_description=(Path(__file__).resolve().parent / "README.md").read_text(), long_description_content_type='text/markdown', @@ -52,7 +52,7 @@ def __str__(self): install_requires=DEPENDENCIES, tests_require=["pytest"], packages=["pysarplus"], - include_package_data=True, + package_data={"": ["VERSION"]}, ext_modules=[ Extension( "pysarplus_cpp", diff --git a/contrib/sarplus/python/tests/test_pyspark_sar.py b/contrib/sarplus/python/tests/test_pyspark_sar.py index b3ace34f0a..9d642d688f 100644 --- a/contrib/sarplus/python/tests/test_pyspark_sar.py +++ b/contrib/sarplus/python/tests/test_pyspark_sar.py @@ -4,7 +4,6 @@ import calendar import datetime import math -import os from pathlib import Path import numpy as np @@ -46,7 +45,7 @@ def spark(tmp_path_factory, app_name="Sample", url="local[*]", memory="1G"): Path(__file__) .parents[2] .joinpath("scala", "target") - .glob(f"**/sarplus*{os.environ.get('VERSION', '')}*.jar") + .glob("**/sarplus*.jar") ).absolute() except StopIteration: raise Exception("Could not find Sarplus JAR file") From 8c44d14e4402e0f06f4fa1838f94b9592b159848 Mon Sep 17 00:00:00 2001 From: Simon Zhao <43029286+simonzhaoms@users.noreply.github.com> Date: Tue, 14 Dec 2021 15:50:01 +0800 Subject: [PATCH 31/33] Remove test data access token and move fixtures into conftest.py --- .github/workflows/sarplus.yml | 4 +- contrib/sarplus/DEVELOPMENT.md | 5 +- contrib/sarplus/python/tests/conftest.py | 73 +++++++++++- .../sarplus/python/tests/test_pyspark_sar.py | 107 ++---------------- 4 files changed, 80 insertions(+), 109 deletions(-) diff --git a/.github/workflows/sarplus.yml b/.github/workflows/sarplus.yml index 6f4e37a330..e2b3fc6103 100644 --- a/.github/workflows/sarplus.yml +++ b/.github/workflows/sarplus.yml @@ -58,8 +58,6 @@ jobs: python -m twine check dist/* - name: Test - env: - ACCESS_TOKEN: ${{ secrets.SARPLUS_TESTDATA_ACCESS_TOKEN }} run: | cd "${PYTHON_ROOT}" python -m pip install dist/*.gz @@ -76,7 +74,7 @@ jobs: sbt ++"${SCALA_VERSION}"! package cd "${PYTHON_ROOT}" - pytest --token "${ACCESS_TOKEN}" ./tests + pytest ./tests echo "sarplus_version=$(cat ../VERSION)" >> $GITHUB_ENV - name: Upload Python package as GitHub artifact diff --git a/contrib/sarplus/DEVELOPMENT.md b/contrib/sarplus/DEVELOPMENT.md index 3600e2d6d3..d601be582f 100644 --- a/contrib/sarplus/DEVELOPMENT.md +++ b/contrib/sarplus/DEVELOPMENT.md @@ -65,9 +65,6 @@ customized as needed. 
To test the python UDF + C++ backend ```bash -# access token for https://recodatasets.blob.core.windows.net/sarunittest/ -ACCESS_TOKEN="" - # build dependencies python -m pip install -U build pip twine @@ -77,7 +74,7 @@ cp ../VERSION ./pysarplus/ # version file python -m build --sdist # test -pytest --token "${ACCESS_TOKEN}" ./tests +pytest ./tests ``` To test the Scala formatter diff --git a/contrib/sarplus/python/tests/conftest.py b/contrib/sarplus/python/tests/conftest.py index 9a36620d7b..9d209fe223 100644 --- a/contrib/sarplus/python/tests/conftest.py +++ b/contrib/sarplus/python/tests/conftest.py @@ -1,8 +1,75 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. +@pytest.fixture(scope="module") +def demo_usage_data(header, sar_settings): + # load the data + data = pd.read_csv(sar_settings["FILE_DIR"] + "demoUsage.csv") + data["rating"] = pd.Series([1] * data.shape[0]) + data = data.rename( + columns={ + "userId": header["col_user"], + "productId": header["col_item"], + "rating": header["col_rating"], + "timestamp": header["col_timestamp"], + } + ) -def pytest_addoption(parser): - parser.addoption( - "--token", action="store", default="", help="Access token of the test data" + # convert timestamp + data[header["col_timestamp"]] = data[header["col_timestamp"]].apply( + lambda s: float( + calendar.timegm( + datetime.datetime.strptime(s, "%Y/%m/%dT%H:%M:%S").timetuple() + ) + ) ) + + return data + + +@pytest.fixture(scope="module") +def header(): + header = { + "col_user": "UserId", + "col_item": "MovieId", + "col_rating": "Rating", + "col_timestamp": "Timestamp", + } + return header + + +@pytest.fixture(scope="module") +def pandas_dummy(header): + ratings_dict = { + header["col_user"]: [1, 1, 1, 1, 2, 2, 2, 2, 2, 2], + header["col_item"]: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + header["col_rating"]: [1, 2, 3, 4, 5, 1, 2, 3, 4, 5], + } + df = pd.DataFrame(ratings_dict) + return df + + +@pytest.fixture(scope="module") +def pandas_dummy_timestamp(pandas_dummy, header): + time = 1535133442 + time_series = [time + 20 * i for i in range(10)] + df = pandas_dummy + df[header["col_timestamp"]] = time_series + return df + + +@pytest.fixture(scope="module") +def sar_settings(): + return { + # absolute tolerance parameter for matrix equivalence in SAR tests + "ATOL": 1e-8, + # directory of the current file - used to link unit test data + "FILE_DIR": "https://recodatasets.z20.web.core.windows.net/sarunittest/", + # user ID used in the test files (they are designed for this user ID, this is part of the test) + "TEST_USER_ID": "0003000098E85347", + } + + +@pytest.fixture(scope="module") +def train_test_dummy_timestamp(pandas_dummy_timestamp): + return train_test_split(pandas_dummy_timestamp, test_size=0.2, random_state=0) diff --git a/contrib/sarplus/python/tests/test_pyspark_sar.py b/contrib/sarplus/python/tests/test_pyspark_sar.py index 9d642d688f..648578a432 100644 --- a/contrib/sarplus/python/tests/test_pyspark_sar.py +++ b/contrib/sarplus/python/tests/test_pyspark_sar.py @@ -23,14 +23,6 @@ def assert_compare(expected_id, expected_score, actual_prediction): ) -@pytest.fixture(scope="module") -def token(request): - if request.config.getoption("--token") == "": - return "" - else: - return "?" 
+ request.config.getoption("--token") - - @pytest.fixture(scope="module") def spark(tmp_path_factory, app_name="Sample", url="local[*]", memory="1G"): """Start Spark if not started @@ -78,17 +70,6 @@ def sample_cache(spark): return path -@pytest.fixture(scope="module") -def header(): - header = { - "col_user": "UserId", - "col_item": "MovieId", - "col_rating": "Rating", - "col_timestamp": "Timestamp", - } - return header - - @pytest.fixture(scope="module") def pandas_dummy_dataset(header): """Load sample dataset in pandas for testing; can be used to create a Spark dataframe @@ -196,78 +177,6 @@ def test_e2e(spark, pandas_dummy_dataset, header): assert np.allclose(r1.score.values, r2.score.values, 1e-3) -@pytest.fixture(scope="module") -def pandas_dummy(header): - ratings_dict = { - header["col_user"]: [1, 1, 1, 1, 2, 2, 2, 2, 2, 2], - header["col_item"]: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], - header["col_rating"]: [1, 2, 3, 4, 5, 1, 2, 3, 4, 5], - } - df = pd.DataFrame(ratings_dict) - return df - - -@pytest.fixture(scope="module") -def pandas_dummy_timestamp(pandas_dummy, header): - time = 1535133442 - time_series = [time + 20 * i for i in range(10)] - df = pandas_dummy - df[header["col_timestamp"]] = time_series - return df - - -@pytest.fixture(scope="module") -def train_test_dummy_timestamp(pandas_dummy_timestamp): - return train_test_split(pandas_dummy_timestamp, test_size=0.2, random_state=0) - - -@pytest.fixture(scope="module") -def demo_usage_data(header, sar_settings, token): - # load the data - data = pd.read_csv(sar_settings["FILE_DIR"] + "demoUsage.csv" + token) - data["rating"] = pd.Series([1] * data.shape[0]) - data = data.rename( - columns={ - "userId": header["col_user"], - "productId": header["col_item"], - "rating": header["col_rating"], - "timestamp": header["col_timestamp"], - } - ) - - # convert timestamp - data[header["col_timestamp"]] = data[header["col_timestamp"]].apply( - lambda s: float( - calendar.timegm( - datetime.datetime.strptime(s, "%Y/%m/%dT%H:%M:%S").timetuple() - ) - ) - ) - - return data - - -@pytest.fixture(scope="module") -def demo_usage_data_spark(spark, demo_usage_data, header): - data_local = demo_usage_data[[x[1] for x in header.items()]] - # TODO: install pyArrow in DS VM - # spark.conf.set("spark.sql.execution.arrow.enabled", "true") - data = spark.createDataFrame(data_local) - return data - - -@pytest.fixture(scope="module") -def sar_settings(): - return { - # absolute tolerance parameter for matrix equivalence in SAR tests - "ATOL": 1e-8, - # directory of the current file - used to link unit test data - "FILE_DIR": "https://recodatasets.blob.core.windows.net/sarunittest/", - # user ID used in the test files (they are designed for this user ID, this is part of the test) - "TEST_USER_ID": "0003000098E85347", - } - - @pytest.mark.parametrize( "similarity_type, timedecay_formula", [("jaccard", False), ("lift", True)] ) @@ -316,7 +225,6 @@ def test_sar_item_similarity( demo_usage_data, sar_settings, header, - token, ): model = SARPlus( @@ -334,7 +242,7 @@ def test_sar_item_similarity( # reference item_similarity_ref = pd.read_csv( - sar_settings["FILE_DIR"] + "sim_" + file + str(threshold) + ".csv" + token + sar_settings["FILE_DIR"] + "sim_" + file + str(threshold) + ".csv" ) item_similarity_ref = pd.melt( @@ -366,12 +274,14 @@ def test_sar_item_similarity( ) assert np.allclose( - item_similarity.value.values, item_similarity_ref.value.values + item_similarity.value.values, + item_similarity_ref.value.values, + atol=sar_settings["ATOL"], ) # Test 7 -def 
test_user_affinity(spark, demo_usage_data, sar_settings, header, token): +def test_user_affinity(spark, demo_usage_data, sar_settings, header): time_now = demo_usage_data[header["col_timestamp"]].max() model = SARPlus( @@ -386,7 +296,7 @@ def test_user_affinity(spark, demo_usage_data, sar_settings, header, token): df = spark.createDataFrame(demo_usage_data) model.fit(df) - user_affinity_ref = pd.read_csv(sar_settings["FILE_DIR"] + "user_aff.csv" + token) + user_affinity_ref = pd.read_csv(sar_settings["FILE_DIR"] + "user_aff.csv") user_affinity_ref = pd.melt( user_affinity_ref, user_affinity_ref.columns[0], @@ -428,7 +338,6 @@ def test_userpred( header, sar_settings, demo_usage_data, - token, ): time_now = demo_usage_data[header["col_timestamp"]].max() @@ -448,7 +357,7 @@ def test_userpred( df = spark.createDataFrame(demo_usage_data) model.fit(df) - url = sar_settings["FILE_DIR"] + "userpred_" + file + str(threshold) + "_userid_only.csv" + token + url = sar_settings["FILE_DIR"] + "userpred_" + file + str(threshold) + "_userid_only.csv" pred_ref = pd.read_csv(url) pred_ref = ( @@ -457,7 +366,7 @@ def test_userpred( .reset_index(drop=True) ) - # Note: it's important to have a separate cache_path for each run as they're interferring with each other + # Note: it's important to have a separate cache_path for each run as they're interfering with each other pred = model.recommend_k_items( spark.createDataFrame( demo_usage_data[ From 4ec139a3bde4149974c239221e9647124ac3a506 Mon Sep 17 00:00:00 2001 From: Simon Zhao <43029286+simonzhaoms@users.noreply.github.com> Date: Tue, 14 Dec 2021 16:18:07 +0800 Subject: [PATCH 32/33] Corrent VERSION path --- contrib/sarplus/scala/python/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/sarplus/scala/python/setup.py b/contrib/sarplus/scala/python/setup.py index 9ceedd4a52..9821b294c9 100644 --- a/contrib/sarplus/scala/python/setup.py +++ b/contrib/sarplus/scala/python/setup.py @@ -6,7 +6,7 @@ setup( name="pysarplus_dummy", - version=(Path(__file__).resolve().parent / "VERSION").read_text().strip(), + version=(Path(__file__).resolve().parent.parent.parent / "VERSION").read_text().strip(), description="pysarplus dummy package to trigger spark packaging", author="RecoDev Team at Microsoft", author_email="recodevteam@service.microsoft.com", From 594674c9c2eed2732430c54979f1852b9fe3bdfb Mon Sep 17 00:00:00 2001 From: Simon Zhao <43029286+simonzhaoms@users.noreply.github.com> Date: Tue, 14 Dec 2021 16:40:26 +0800 Subject: [PATCH 33/33] Fix flake8 issues --- contrib/sarplus/python/tests/conftest.py | 8 ++++++++ contrib/sarplus/python/tests/test_pyspark_sar.py | 4 ---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/contrib/sarplus/python/tests/conftest.py b/contrib/sarplus/python/tests/conftest.py index 9d209fe223..44efbde4a7 100644 --- a/contrib/sarplus/python/tests/conftest.py +++ b/contrib/sarplus/python/tests/conftest.py @@ -1,6 +1,14 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
+import calendar +import datetime +import pandas as pd +import pytest + +from sklearn.model_selection import train_test_split + + @pytest.fixture(scope="module") def demo_usage_data(header, sar_settings): # load the data diff --git a/contrib/sarplus/python/tests/test_pyspark_sar.py b/contrib/sarplus/python/tests/test_pyspark_sar.py index 648578a432..110e469f36 100644 --- a/contrib/sarplus/python/tests/test_pyspark_sar.py +++ b/contrib/sarplus/python/tests/test_pyspark_sar.py @@ -1,8 +1,6 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. -import calendar -import datetime import math from pathlib import Path @@ -10,8 +8,6 @@ import pandas as pd from pyspark.sql import SparkSession import pytest -from sklearn.model_selection import train_test_split - from pysarplus import SARPlus, SARModel
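
With the shared fixtures now defined in `conftest.py` (patch 31) and its imports cleaned up (patch 33), test modules no longer need their own copies of the test-data helpers: pytest discovers `conftest.py` automatically and injects fixtures by parameter name. As a minimal sketch (the test function below is illustrative only and is not part of these patches), a new test could consume the `header` and `sar_settings` fixtures like this:

```python
# Hypothetical addition under contrib/sarplus/python/tests/ (not part of the
# patch series). No import of conftest.py is needed; pytest resolves the
# fixture arguments from the conftest.py shown above.
def test_conftest_fixtures(header, sar_settings):
    # the header fixture maps logical column roles to the column names used in tests
    assert header["col_user"] == "UserId"
    assert header["col_item"] == "MovieId"

    # test data is now served from a public endpoint, so no access token
    # (and hence no --token pytest option) is required
    assert sar_settings["FILE_DIR"].startswith("https://")
```

Running `pytest ./tests` from `contrib/sarplus/python`, as in the updated workflow and DEVELOPMENT.md, would pick such a test up together with the existing ones.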