"""
Trains a decision tree regressor on the California housing dataset from scikit-learn.

Measures the training time when splitting the learning dataset process using Parfun.
"""

import argparse
import json
import timeit

from typing import List

import numpy as np
import pandas as pd

from sklearn.datasets import fetch_california_housing
from sklearn.base import RegressorMixin
from sklearn.tree import DecisionTreeRegressor

from parfun.decorators import parfun
from parfun.entry_point import BACKEND_REGISTRY, set_parallel_backend_context
from parfun.partition.api import per_argument
from parfun.partition.dataframe import df_by_row


class MeanRegressor(RegressorMixin):
    """Ensemble regressor that averages the predictions of its sub-regressors.

    Used as the `combine_with` step: each parallel partition trains one
    regressor, and the combined model predicts the per-sample mean.
    """

    def __init__(self, regressors: List[RegressorMixin]) -> None:
        super().__init__()
        self._regressors = regressors

    def predict(self, X):
        # axis=0 averages across regressors, keeping one prediction per sample.
        # Without it, np.mean collapses the whole (n_regressors, n_samples)
        # array to a single scalar, which breaks the regressor contract.
        return np.mean(
            [regressor.predict(X) for regressor in self._regressors],
            axis=0,
        )


@parfun(
    split=per_argument(dataframe=df_by_row),
    combine_with=lambda regressors: MeanRegressor(list(regressors)),
)
def train_regressor(
    dataframe: pd.DataFrame, feature_names: List[str], target_name: str
) -> RegressorMixin:
    """Fits a decision tree on `feature_names` columns, targeting `target_name`.

    When parallelized by Parfun, `dataframe` is split by rows and the
    per-partition trees are combined into a single `MeanRegressor`.
    """
    regressor = DecisionTreeRegressor()
    regressor.fit(dataframe[feature_names], dataframe[[target_name]])

    return regressor


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("n_workers", action="store", type=int)
    parser.add_argument(
        "--backend",
        type=str,
        choices=BACKEND_REGISTRY.keys(),
        default="local_multiprocessing",
    )
    parser.add_argument(
        "--backend_args",
        type=str,
        default="{}",  # JSON-encoded kwargs forwarded to the backend
    )

    args = parser.parse_args()

    dataset = fetch_california_housing(download_if_missing=True)

    feature_names = dataset["feature_names"]
    target_name = dataset["target_names"][0]

    dataframe = pd.DataFrame(dataset["data"], columns=feature_names)
    dataframe[target_name] = dataset["target"]

    N_MEASURES = 5

    # Sequential baseline: run once to warm up, then average N_MEASURES runs.
    with set_parallel_backend_context("local_single_process"):
        regressor = train_regressor(dataframe, feature_names, target_name)

        duration = timeit.timeit(
            lambda: train_regressor(dataframe, feature_names, target_name),
            number=N_MEASURES
        ) / N_MEASURES

        print("Duration sequential:", duration)

    backend_args = {"max_workers": args.n_workers, **json.loads(args.backend_args)}

    # Parallel run on the requested backend, measured the same way.
    with set_parallel_backend_context(args.backend, **backend_args):
        regressor = train_regressor(dataframe, feature_names, target_name)

        duration = timeit.timeit(
            lambda: train_regressor(dataframe, feature_names, target_name),
            number=N_MEASURES
        ) / N_MEASURES

        print("Duration parallel:", duration)