"""
Trains a decision tree regressor on the California housing dataset from scikit-learn.

Measures the training time when splitting the learning dataset process using Parfun.
"""

import argparse
import json
import timeit

from typing import List

import numpy as np
import pandas as pd

from sklearn.datasets import fetch_california_housing
from sklearn.base import RegressorMixin
from sklearn.tree import DecisionTreeRegressor

from parfun.decorators import parfun
from parfun.entry_point import BACKEND_REGISTRY, set_parallel_backend_context
from parfun.partition.api import per_argument
from parfun.partition.dataframe import df_by_row


class MeanRegressor(RegressorMixin):
    """Ensemble regressor that averages the predictions of its sub-regressors.

    Used as the `combine_with` step: each parallel partition trains one
    regressor, and the combined model predicts the per-sample mean.
    """

    def __init__(self, regressors: List[RegressorMixin]) -> None:
        super().__init__()
        self._regressors = regressors

    def predict(self, X):
        # axis=0 averages across regressors, keeping one prediction per sample.
        # Without it, np.mean collapses the whole (n_regressors, n_samples)
        # array to a single scalar, which breaks the regressor contract.
        return np.mean(
            [regressor.predict(X) for regressor in self._regressors],
            axis=0,
        )


@parfun(
    split=per_argument(dataframe=df_by_row),
    combine_with=lambda regressors: MeanRegressor(list(regressors)),
)
def train_regressor(
    dataframe: pd.DataFrame, feature_names: List[str], target_name: str
) -> RegressorMixin:
    """Fits a decision tree on `feature_names` columns, targeting `target_name`.

    When parallelized by Parfun, `dataframe` is split by rows and the
    per-partition trees are combined into a single `MeanRegressor`.
    """
    regressor = DecisionTreeRegressor()
    regressor.fit(dataframe[feature_names], dataframe[[target_name]])

    return regressor


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("n_workers", action="store", type=int)
    parser.add_argument(
        "--backend",
        type=str,
        choices=BACKEND_REGISTRY.keys(),
        default="local_multiprocessing",
    )
    parser.add_argument(
        "--backend_args",
        type=str,
        default="{}",  # JSON-encoded kwargs forwarded to the backend
    )

    args = parser.parse_args()

    dataset = fetch_california_housing(download_if_missing=True)

    feature_names = dataset["feature_names"]
    target_name = dataset["target_names"][0]

    dataframe = pd.DataFrame(dataset["data"], columns=feature_names)
    dataframe[target_name] = dataset["target"]

    N_MEASURES = 5

    # Sequential baseline: run once to warm up, then average N_MEASURES runs.
    with set_parallel_backend_context("local_single_process"):
        regressor = train_regressor(dataframe, feature_names, target_name)

        duration = timeit.timeit(
            lambda: train_regressor(dataframe, feature_names, target_name),
            number=N_MEASURES
        ) / N_MEASURES

        print("Duration sequential:", duration)

    backend_args = {"max_workers": args.n_workers, **json.loads(args.backend_args)}

    # Parallel run on the requested backend, measured the same way.
    with set_parallel_backend_context(args.backend, **backend_args):
        regressor = train_regressor(dataframe, feature_names, target_name)

        duration = timeit.timeit(
            lambda: train_regressor(dataframe, feature_names, target_name),
            number=N_MEASURES
        ) / N_MEASURES

        print("Duration parallel:", duration)