diff --git a/.github/workflows/run_singularity_versions.yml b/.github/workflows/run_singularity_versions.yml index fe576a30..c7862636 100644 --- a/.github/workflows/run_singularity_versions.yml +++ b/.github/workflows/run_singularity_versions.yml @@ -1,6 +1,16 @@ name: Test Support for different Singularity Versions -on: [push] +on: + pull_request: + types: [ready_for_review] + + pull_request_review: + types: [submitted] + + push: + branches: + - 'main' + - 'development' jobs: Tests: @@ -10,25 +20,25 @@ jobs: matrix: include: - python-version: 3.7 - DISPLAY_NAME: "Singularity Container Examples with S3.5" + DISPLAY_NAME: "Singularity Container Examples with S3.7" RUN_CONTAINER_EXAMPLES: true USE_SINGULARITY: false - SINGULARITY_VERSION: "3.5" + SINGULARITY_VERSION: "3.7" - python-version: 3.7 - DISPLAY_NAME: "Singularity Container Examples with S3.6" + DISPLAY_NAME: "Singularity Container Examples with S3.8" RUN_CONTAINER_EXAMPLES: true USE_SINGULARITY: false - SINGULARITY_VERSION: "3.6" + SINGULARITY_VERSION: "3.8" - python-version: 3.7 - DISPLAY_NAME: "Singularity Container Examples with S3.7" + DISPLAY_NAME: "Singularity Container Examples with S3.9" RUN_CONTAINER_EXAMPLES: true USE_SINGULARITY: false - SINGULARITY_VERSION: "3.7" + SINGULARITY_VERSION: "3.9" - python-version: 3.7 - DISPLAY_NAME: "Singularity Container Examples with S3.8" + DISPLAY_NAME: "Singularity Container Examples with S3.10" RUN_CONTAINER_EXAMPLES: true USE_SINGULARITY: false - SINGULARITY_VERSION: "3.8" + SINGULARITY_VERSION: "3.10" fail-fast: false diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 322812b2..3d52b250 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -2,7 +2,7 @@ name: Test Pull Requests -on: [push, pull_request] +on: [push] jobs: Tests: @@ -11,34 +11,46 @@ jobs: strategy: matrix: include: - - python-version: 3.6 - DISPLAY_NAME: "Singularity Tests" - RUN_TESTS: true - USE_SINGULARITY: true - - python-version: 3.7 + - python-version: "3.7" DISPLAY_NAME: "Singularity Tests + CODECOV" RUN_TESTS: true USE_SINGULARITY: true + SINGULARITY_VERSION: "3.8" RUN_CODECOV: true - - python-version: 3.7 + + - python-version: "3.7" DISPLAY_NAME: "Codestyle" RUN_CODESTYLE: true - - python-version: 3.7 + USE_SINGULARITY: false + + - python-version: "3.7" DISPLAY_NAME: "Singularity Container Examples" RUN_CONTAINER_EXAMPLES: true USE_SINGULARITY: true - - python-version: 3.7 + SINGULARITY_VERSION: "3.8" + + - python-version: "3.7" DISPLAY_NAME: "Local Examples" RUN_LOCAL_EXAMPLES: true USE_SINGULARITY: false - - python-version: 3.8 + + - python-version: "3.8" DISPLAY_NAME: "Singularity Tests" RUN_TESTS: true USE_SINGULARITY: true - - python-version: 3.9 + SINGULARITY_VERSION: "3.8" + + - python-version: "3.9" DISPLAY_NAME: "Singularity Tests" RUN_TESTS: true USE_SINGULARITY: true + SINGULARITY_VERSION: "3.8" + + - python-version: "3.10" + DISPLAY_NAME: "Singularity Tests" + RUN_TESTS: true + USE_SINGULARITY: true + SINGULARITY_VERSION: "3.8" fail-fast: false name: Tests ${{ matrix.python-version }} ${{ matrix.DISPLAY_NAME }} @@ -46,6 +58,7 @@ jobs: env: RUN_TESTS: ${{ matrix.RUN_TESTS }} USE_SINGULARITY: ${{ matrix.USE_SINGULARITY }} + SINGULARITY_VERSION: ${{ matrix.SINGULARITY_VERSION }} RUN_CODECOV: ${{ matrix.RUN_CODECOV }} RUN_CODESTYLE: ${{ matrix.RUN_CODESTYLE }} RUN_CONTAINER_EXAMPLES: ${{ matrix.RUN_CONTAINER_EXAMPLES }} @@ -56,15 +69,19 @@ jobs: - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v2 with: - 
python-version: ${{ matrix.python-version }} + python-version: "${{ matrix.python-version }}" - name: Set up Go for Singularity if: matrix.USE_SINGULARITY == true uses: actions/setup-go@v2 with: go-version: '1.14.15' # The Go version to download (if necessary) and use. + - name: Set up Singularity + if: matrix.USE_SINGULARITY == true + run: | + chmod +x ci_scripts/install_singularity.sh && source ./ci_scripts/install_singularity.sh - name: Install dependencies run: | python -m pip install --upgrade pip chmod +x ci_scripts/install.sh && source ./ci_scripts/install.sh - name: Run Tests - run: chmod +x ci_scripts/script.sh && source ./ci_scripts/script.sh \ No newline at end of file + run: chmod +x ci_scripts/script.sh && source ./ci_scripts/script.sh diff --git a/README.md b/README.md index 5ef43638..96dd406d 100644 --- a/README.md +++ b/README.md @@ -2,9 +2,7 @@ HPOBench is a library for providing benchmarks for (multi-fidelity) hyperparameter optimization and with a focus on reproducibility. -Further info: - * list of [benchmarks](https://github.com/automl/HPOBench/wiki/Available-Containerized-Benchmarks) - * [howto](https://github.com/automl/HPOBench/wiki/How-to-add-a-new-benchmark-step-by-step) contribute benchmarks +For further info on [existing benchmarks](https://github.com/automl/HPOBench/wiki/Available-Containerized-Benchmarks) and [howto](https://github.com/automl/HPOBench/wiki/How-to-add-a-new-benchmark-step-by-step) contribute new benchmarks, see the [wiki](https://github.com/automl/HPOBench/wiki). ## Status @@ -56,14 +54,14 @@ cd HPOBench pip install . ``` -**Note:** This does not install *singularity (version 3.6)*. Please follow the steps described here: [user-guide](https://sylabs.io/guides/3.6/user-guide/quick_start.html#quick-installation-steps). +**Note:** This does not install *singularity (version 3.8)*. Please follow the steps described here: [user-guide](https://sylabs.io/guides/3.8/user-guide/quick_start.html#quick-installation-steps). If you run into problems, using the most recent singularity version might help: [here](https://singularity.hpcng.org/admin-docs/master/installation.html) ## Containerized Benchmarks -We provide all benchmarks as containerized versions to (i) isolate their dependencies and (ii) keep them reproducible. Our containerized benchmarks do not rely on external dependencies and thus do not change over time. For this, we rely on [Singularity (version 3.6)](https://sylabs.io/guides/3.6/user-guide/) and for now upload all containers to a [gitlab registry](https://gitlab.tf.uni-freiburg.de/muelleph/hpobench-registry/container_registry) +We provide all benchmarks as containerized versions to (i) isolate their dependencies and (ii) keep them reproducible. Our containerized benchmarks do not rely on external dependencies and thus do not change over time. For this, we rely on [Singularity (version 3.8)](https://sylabs.io/guides/3.8/user-guide/) and for now upload all containers to a [gitlab registry](https://gitlab.tf.uni-freiburg.de/muelleph/hpobench-registry/container_registry) -The only other requirements are: [ConfigSpace](https://github.com/automl/ConfigSpace), *scipy* and *numpy* +The only other requirements are: [ConfigSpace](https://github.com/automl/ConfigSpace), *numpy*, *oslo* and *Pyro4* ### Run a Benchmark Locally @@ -141,10 +139,9 @@ If you use a benchmark in your experiments, please specify the version number of the used container to ensure reproducibility. 
When starting an experiment, HPOBench writes automatically these two version numbers to the log. ### Troubleshooting and Further Notes - - **Singularity throws an 'Invalid Image format' exception** - Use a singularity version > 3. For users of the Meta-Cluster in Freiburg, you have to set the following path: - ```export PATH=/usr/local/kislurm/singularity-3.5/bin/:$PATH``` + Use a singularity version >= 3.8. If you have multiple singularity installations, you have to add the correct singularity version to your $PATH, e.g. + ```export PATH=/usr/local/kislurm/singularity-3.8/bin/:$PATH``` - **A Benchmark fails with `SystemError: Could not start an instance of the benchmark. Retried 5 times` but the container can be started locally with `singularity instance start test`** @@ -152,3 +149,19 @@ See whether in `~/.singularity/instances/sing/$HOSTNAME/*/` there is a file that **Note:** If you are looking for a different or older version of our benchmarking library, you might be looking for [HPOlib1.5](https://github.com/automl/HPOlib1.5) + +## Reference + +If you use HPOBench, please cite the following paper: + +```bibtex +@inproceedings{ + eggensperger2021hpobench, + title={{HPOB}ench: A Collection of Reproducible Multi-Fidelity Benchmark Problems for {HPO}}, + author={Katharina Eggensperger and Philipp M{\"u}ller and Neeratyoy Mallik and Matthias Feurer and Rene Sass and Aaron Klein and Noor Awad and Marius Lindauer and Frank Hutter}, + booktitle={Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 2)}, + year={2021}, + url={https://openreview.net/forum?id=1k4rJYEwda-} +} +``` + diff --git a/changelog.md b/changelog.md index 18b3b9fd..818e978b 100644 --- a/changelog.md +++ b/changelog.md @@ -1,3 +1,10 @@ +# 0.0.11 + * Drop Support for 3.6: + Although most of the functionality should still work, we drop the official support for 3.6. + * Add an interface for Multi-Objective Benchmarks. + * Add a check for the return values of the objective_functions + The returned dictionary of the objective functions have to fulfill now some criteria. + # 0.0.10 * Cartpole Benchmark Version 0.0.4: Fix: Pass the hp `entropy_regularization` to the PPO Agent. diff --git a/ci_scripts/install.sh b/ci_scripts/install.sh index b68a1b88..d361600d 100644 --- a/ci_scripts/install.sh +++ b/ci_scripts/install.sh @@ -4,14 +4,24 @@ install_packages="" if [[ "$RUN_TESTS" == "true" ]]; then echo "Install tools for testing" - install_packages="${install_packages}xgboost,pytest,test_paramnet,test_tabular_datamanager," + install_packages="${install_packages}pytest,test_tabular_datamanager," pip install codecov - # The param net benchmark does not work with a scikit-learn version != 0.23.2. (See notes in the benchmark) - # To make sure that no newer version is installed, we install it before the other requirements. - # Since we are not using a "--upgrade" option later on, pip skips to install another scikit-learn version. - echo "Install the right scikit-learn function for the param net tests." - pip install --upgrade scikit-learn==0.23.2 + PYVERSION=$(python -V 2>&1 | sed 's/.* \([0-9]\).\([0-9]*\).*/\1\2/') + if [[ "${PYVERSION}" != "310" ]]; then + # The param net benchmark does not work with a scikit-learn version != 0.23.2. (See notes in the benchmark) + # To make sure that no newer version is installed, we install it before the other requirements. + # Since we are not using a "--upgrade" option later on, pip skips to install another scikit-learn version. 
+ echo "Install the right scikit-learn function for the param net tests." + pip install --upgrade scikit-learn==0.23.2 + install_packages="${install_packages}xgboost,test_paramnet," + else + echo "Skip installing the extra paramnet tests." + # For 3.10, we need a different pandas version - this comes as a requirement for the old xgboost benchmark. + # building pandas<=1.5.0 does not work with 3.10 anymore. -> install a different version. + install_packages="${install_packages}xgboost_310," + fi + else echo "Skip installing tools for testing" fi @@ -35,41 +45,24 @@ if [[ "$RUN_LOCAL_EXAMPLES" == "true" ]]; then echo "Install packages for local examples" echo "Install swig" sudo apt-get update && sudo apt-get install -y build-essential swig - install_packages="${install_packages}xgboost," -else - echo "Skip installing packages for local examples" -fi - -if [[ "$USE_SINGULARITY" == "true" ]]; then - echo "Install Singularity" - sudo apt-get update && sudo apt-get install -y \ - build-essential \ - libssl-dev \ - uuid-dev \ - libgpgme11-dev \ - squashfs-tools \ - libseccomp-dev \ - wget \ - pkg-config \ - git \ - cryptsetup + PYVERSION=$(python -V 2>&1 | sed 's/.* \([0-9]\).\([0-9]*\).*/\1\2/') + if [[ "${PYVERSION}" != "310" ]]; then + # For 3.10, we need a different pandas version - this comes as a requirement for the old xgboost benchmark. + # building pandas<=1.5.0 does not work with 3.10 anymore. -> install a different version. + install_packages="${install_packages}xgboost," + else + install_packages="${install_packages}xgboost_310," + fi - export VERSION=3.5.3 && # adjust this as necessary \ - wget https://github.com/sylabs/singularity/archive/refs/tags/v${VERSION}.tar.gz && \ - tar -xzf v${VERSION}.tar.gz && \ - cd singularity-${VERSION} - - ./mconfig && \ - make -C builddir && \ - sudo make -C builddir install - - cd .. - install_packages="${install_packages}placeholder," else - echo "Skip installing Singularity" + echo "Skip installing packages for local examples" fi +# We add a placeholder / No-OP operator. When running the container examples, we don't install any +# additional packages. That causes an error, since `pip install .[]` does not work. 
+install_packages="${install_packages}NOP," + # remove the trailing comma install_packages="$(echo ${install_packages} | sed 's/,*\r*$//')" echo "Install HPOBench with options: ${install_packages}" diff --git a/ci_scripts/install_singularity.sh b/ci_scripts/install_singularity.sh index 292df85b..9a89e4a3 100644 --- a/ci_scripts/install_singularity.sh +++ b/ci_scripts/install_singularity.sh @@ -1,6 +1,6 @@ #!/usr/bin/env sh -echo "Install Singularity" +echo "Inside Singularity Installation Script" sudo apt-get update && sudo apt-get install -y \ build-essential \ @@ -14,21 +14,33 @@ sudo apt-get update && sudo apt-get install -y \ git \ cryptsetup -if [[ "$SINGULARITY_VERSION" == "3.5" ]]; then - export VERSION=3.5.3 -elif [[ "$SINGULARITY_VERSION" == "3.6" ]]; then - export VERSION=3.6.4 -elif [[ "$SINGULARITY_VERSION" == "3.7" ]]; then +if [[ "$SINGULARITY_VERSION" == "3.7" ]]; then export VERSION=3.7.3 + export FILENAME=singularity-"${VERSION}" + export EXTRACTED_FILENAME=singularity + elif [[ "$SINGULARITY_VERSION" == "3.8" ]]; then - export VERSION=3.8.0 + export VERSION=3.8.4 + export FILENAME=singularity-ce-"${VERSION}" + export EXTRACTED_FILENAME=singularity-ce-"${VERSION}" + +elif [[ "$SINGULARITY_VERSION" == "3.9" ]]; then + export VERSION=3.9.3 + export FILENAME=singularity-ce-"${VERSION}" + export EXTRACTED_FILENAME=singularity-ce-"${VERSION}" + +elif [[ "$SINGULARITY_VERSION" == "3.10" ]]; then + export VERSION=3.10.0 + export FILENAME=singularity-ce-"${VERSION}" + export EXTRACTED_FILENAME=singularity-ce-"${VERSION}" + else echo "Skip installing Singularity" fi -wget https://github.com/sylabs/singularity/archive/refs/tags/v${VERSION}.tar.gz && \ -tar -xzf v${VERSION}.tar.gz && \ -cd singularity-${VERSION} && \ +wget https://github.com/sylabs/singularity/releases/download/v"${VERSION}"/"${FILENAME}".tar.gz && \ +tar -xzf "${FILENAME}".tar.gz && \ +cd "${EXTRACTED_FILENAME}" && \ ./mconfig && \ make -C builddir && \ sudo make -C builddir install diff --git a/extra_requirements/mo_cnn.json b/extra_requirements/mo_cnn.json new file mode 100644 index 00000000..35914e3e --- /dev/null +++ b/extra_requirements/mo_cnn.json @@ -0,0 +1,7 @@ +{ + "mo_cnn": [ + "tqdm>=3.0.0", + "torch==1.9.0", + "pandas==1.2.4" + ] +} diff --git a/extra_requirements/multi_objective.json b/extra_requirements/multi_objective.json new file mode 100644 index 00000000..146c06a7 --- /dev/null +++ b/extra_requirements/multi_objective.json @@ -0,0 +1,3 @@ +{ + "mo_adult": ["pandas==1.2.4","scikit-learn==0.24.2","tqdm>=3.1.4"] +} \ No newline at end of file diff --git a/extra_requirements/nasbench_1shot1.json b/extra_requirements/nasbench_1shot1.json index 7523d0f2..b008c789 100644 --- a/extra_requirements/nasbench_1shot1.json +++ b/extra_requirements/nasbench_1shot1.json @@ -1,3 +1,3 @@ { - "nasbench_1shot1": ["tensorflow==1.15.0","matplotlib","seaborn", "networkx", "tqdm"] + "nasbench_1shot1": ["protobuf==3.20.1", "tensorflow==1.15.0", "matplotlib", "seaborn", "networkx", "tqdm"] } \ No newline at end of file diff --git a/extra_requirements/tests.json b/extra_requirements/tests.json index 6c27be97..b25d6755 100644 --- a/extra_requirements/tests.json +++ b/extra_requirements/tests.json @@ -2,5 +2,5 @@ "codestyle": ["pycodestyle","flake8","pylint"], "pytest": ["pytest>=4.6","pytest-cov"], "test_paramnet": ["tqdm", "scikit-learn==0.23.2"], - "test_tabular_datamanager": ["pyarrow", "fastparquet"] + "test_tabular_datamanager": ["tqdm","pyarrow", "fastparquet"] } \ No newline at end of file diff --git 
a/extra_requirements/xgboost.json b/extra_requirements/xgboost.json index 2789d2ef..eefc920c 100644 --- a/extra_requirements/xgboost.json +++ b/extra_requirements/xgboost.json @@ -1,3 +1,4 @@ { - "xgboost": ["xgboost==0.90","pandas>=1.0.0,<1.1.5","openml==0.10.2","scikit-learn>=0.18.1"] + "xgboost": ["xgboost==0.90","pandas>=1.0.0,<1.1.5","openml==0.10.2","scikit-learn>=0.18.1"], + "xgboost_310": ["xgboost","pandas","openml==0.10.2","scikit-learn>=0.18.1"] } \ No newline at end of file diff --git a/extra_requirements/yahpo_gym.json b/extra_requirements/yahpo_gym.json new file mode 100644 index 00000000..10f4e390 --- /dev/null +++ b/extra_requirements/yahpo_gym.json @@ -0,0 +1,4 @@ +{ + "yahpo_gym": ["yahpo_gym@git+https://github.com/pfistfl/yahpo_gym#egg=yahpo_gym&subdirectory=yahpo_gym"], + "yahpo_gym_raw": ["yahpo_gym@git+https://github.com/pfistfl/yahpo_gym#egg=yahpo_gym&subdirectory=yahpo_gym", "rpy2>=3.5.0", "openml==0.10.2", "gitpython>=3.1"] +} diff --git a/hpobench/__version__.py b/hpobench/__version__.py index 6820f36a..7f116511 100644 --- a/hpobench/__version__.py +++ b/hpobench/__version__.py @@ -1 +1 @@ -__version__ = '0.0.10' +__version__ = '0.0.11dev' diff --git a/hpobench/abstract_benchmark.py b/hpobench/abstract_benchmark.py index c9db4216..6a2942af 100644 --- a/hpobench/abstract_benchmark.py +++ b/hpobench/abstract_benchmark.py @@ -1,20 +1,20 @@ """ Base-class of all benchmarks """ import abc -from typing import Union, Dict import functools - import logging +from typing import Union, Dict, List, Tuple + import ConfigSpace import numpy as np - from ConfigSpace.util import deactivate_inactive_hyperparameters + from hpobench.util import rng_helper logger = logging.getLogger('AbstractBenchmark') -class AbstractBenchmark(abc.ABC, metaclass=abc.ABCMeta): +class _BaseAbstractBenchmark(abc.ABC, metaclass=abc.ABCMeta): def __init__(self, rng: Union[int, np.random.RandomState, None] = None, **kwargs): """ @@ -34,7 +34,7 @@ def __init__(self, rng: Union[int, np.random.RandomState, None] = None, **kwargs np.random.RandomState with seed `rng` is created. If type is None, create a new random state. """ - + super(_BaseAbstractBenchmark, self).__init__(**kwargs) self.rng = rng_helper.get_rng(rng=rng) self.configuration_space = self.get_configuration_space(self.rng.randint(0, 10000)) self.fidelity_space = self.get_fidelity_space(self.rng.randint(0, 10000)) @@ -124,7 +124,13 @@ def wrapper(self, configuration: Union[ConfigSpace.Configuration, Dict], fidelity = AbstractBenchmark._check_and_cast_fidelity(fidelity, self.fidelity_space, **kwargs) # All benchmarks should work on dictionaries. Cast the both objects to dictionaries. - return wrapped_function(self, configuration.get_dictionary(), fidelity.get_dictionary(), **kwargs) + return_values = wrapped_function(self, configuration.get_dictionary(), fidelity.get_dictionary(), **kwargs) + + # Make sure that every benchmark returns a well-shaped return object. + # Every benchmark have to have the fields 'function_value' and 'cost'. + # Multi-Objective benchmarks have to return collections of values for the 'function_value' field. 
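+            # For example (illustrative values only): a single-objective benchmark returns something like
+            # {'function_value': 0.42, 'cost': 3.1}, while a multi-objective benchmark returns e.g.
+            # {'function_value': {'accuracy': 0.91, 'model_size': 3200}, 'cost': 3.1}.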
+ return_values = type(self)._check_return_values(return_values) + return return_values return wrapper @staticmethod @@ -208,6 +214,10 @@ def __call__(self, configuration: Dict, **kwargs) -> float: """ Provides interface to use, e.g., SciPy optimizers """ return self.objective_function(configuration, **kwargs)['function_value'] + @staticmethod + def _check_return_values(return_values: Dict) -> Dict: + raise NotImplementedError() + @staticmethod @abc.abstractmethod def get_configuration_space(seed: Union[int, None] = None) -> ConfigSpace.ConfigurationSpace: @@ -251,3 +261,57 @@ def get_meta_information() -> Dict: """ raise NotImplementedError() + + +class AbstractSingleObjectiveBenchmark(_BaseAbstractBenchmark): + """ + Abstract Benchmark class for single-objective benchmarks. + This corresponds to the old AbstractBenchmark class. + + The only purpose of this class is to point out to users that this benchmark returns only a single + objective function value. + + When writing a benchmark, please make sure to inherit from the correct abstract class. + """ + + @staticmethod + def _check_return_values(return_values: Dict) -> Dict: + """ + The return values should contain the fields `function_value` and `cost`. + """ + assert 'function_value' in return_values.keys() + assert 'cost' in return_values.keys() + return return_values + + +# Ensure compatibility with older versions of the HPOBench +AbstractBenchmark = AbstractSingleObjectiveBenchmark + + +class AbstractMultiObjectiveBenchmark(_BaseAbstractBenchmark): + """ + Abstract Benchmark class for multi-objective benchmarks. + The only purpose of this class is to point out to users that this benchmark returns multiple + objective function values. + + When writing a benchmark, please make sure to inherit from the correct abstract class. + """ + + @staticmethod + def _check_return_values(return_values: Dict) -> Dict: + """ + The return values should contain the fields `function_value` and `cost`. + The field `function_value` has to be a collection of multiple objective targets. + """ + return_values = AbstractBenchmark._check_return_values(return_values) + assert isinstance(return_values['function_value'], (List, Dict, Tuple)), \ + 'Every MO benchmark has to return multiple objectives.' 
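+        # Note: which objectives are reported is benchmark-specific; the keys typically
+        # correspond to the names returned by get_objective_names() (defined below).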
+ return return_values + + @staticmethod + @abc.abstractmethod + def get_objective_names(): + """ + Return the names of supported targets + """ + raise NotImplementedError() diff --git a/hpobench/benchmarks/ml/__init__.py b/hpobench/benchmarks/ml/__init__.py index 64e399cd..e69de29b 100644 --- a/hpobench/benchmarks/ml/__init__.py +++ b/hpobench/benchmarks/ml/__init__.py @@ -1,22 +0,0 @@ -from hpobench.benchmarks.ml.histgb_benchmark import HistGBBenchmark, HistGBBenchmarkBB, HistGBBenchmarkMF -from hpobench.benchmarks.ml.lr_benchmark import LRBenchmark, LRBenchmarkBB, LRBenchmarkMF -from hpobench.benchmarks.ml.nn_benchmark import NNBenchmark, NNBenchmarkBB, NNBenchmarkMF -from hpobench.benchmarks.ml.rf_benchmark import RandomForestBenchmark, RandomForestBenchmarkBB, \ - RandomForestBenchmarkMF -from hpobench.benchmarks.ml.svm_benchmark import SVMBenchmark, SVMBenchmarkBB, SVMBenchmarkMF -from hpobench.benchmarks.ml.tabular_benchmark import TabularBenchmark - -try: - from hpobench.benchmarks.ml.xgboost_benchmark import XGBoostBenchmark, XGBoostBenchmarkBB, XGBoostBenchmarkMF -except ImportError: - pass - - -__all__ = ['HistGBBenchmark', 'HistGBBenchmarkBB', 'HistGBBenchmarkMF', - 'LRBenchmark', 'LRBenchmarkBB', 'LRBenchmarkMF', - 'NNBenchmark', 'NNBenchmarkBB', 'NNBenchmarkMF', - 'RandomForestBenchmark', 'RandomForestBenchmarkBB', 'RandomForestBenchmarkMF', - 'SVMBenchmark', 'SVMBenchmarkBB', 'SVMBenchmarkMF', - 'TabularBenchmark', - 'XGBoostBenchmark', 'XGBoostBenchmarkBB', 'XGBoostBenchmarkMF', - ] diff --git a/hpobench/benchmarks/ml/lr_benchmark.py b/hpobench/benchmarks/ml/lr_benchmark.py index 8c317111..aa7aa162 100644 --- a/hpobench/benchmarks/ml/lr_benchmark.py +++ b/hpobench/benchmarks/ml/lr_benchmark.py @@ -4,30 +4,38 @@ 0.0.1: * First implementation of the LR Benchmarks. +0.0.2: +* Restructuring for consistency and to match ML Benchmark Template updates. +0.0.3: +* Adding Learning Curve support. 
""" - +import time from typing import Union, Tuple, Dict import ConfigSpace as CS import numpy as np +import pandas as pd from ConfigSpace.hyperparameters import Hyperparameter from sklearn.linear_model import SGDClassifier +from hpobench.util.rng_helper import get_rng from hpobench.dependencies.ml.ml_benchmark_template import MLBenchmark -__version__ = '0.0.1' +__version__ = '0.0.3' class LRBenchmark(MLBenchmark): - def __init__(self, - task_id: int, - rng: Union[np.random.RandomState, int, None] = None, - valid_size: float = 0.33, - data_path: Union[str, None] = None): - - super(LRBenchmark, self).__init__(task_id, rng, valid_size, data_path) - self.cache_size = 500 + """ Multi-multi-fidelity Logisitic Regression Benchmark + """ + def __init__( + self, + task_id: int, + valid_size: float = 0.33, + rng: Union[np.random.RandomState, int, None] = None, + data_path: Union[str, None] = None + ): + super(LRBenchmark, self).__init__(task_id, valid_size, rng, data_path) @staticmethod def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: @@ -44,7 +52,8 @@ def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSp ]) return cs - def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + @staticmethod + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( # gray-box setting (multi-multi-fidelity) - iterations + data subsample @@ -53,17 +62,11 @@ def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationS return fidelity_space @staticmethod - def _get_fidelity_choices(iter_choice: str, subsample_choice: str) -> Tuple[Hyperparameter, Hyperparameter]: + def _get_fidelity_choices( + iter_choice: str, subsample_choice: str + ) -> Tuple[Hyperparameter, Hyperparameter]: """Fidelity space available --- specifies the fidelity dimensions - - For SVM, only a single fidelity exists, i.e., subsample fraction. - if fidelity_choice == 0 - uses the entire data (subsample=1), reflecting the black-box setup - else - parameterizes the fraction of data to subsample - """ - assert iter_choice in ['fixed', 'variable'] assert subsample_choice in ['fixed', 'variable'] @@ -79,14 +82,16 @@ def _get_fidelity_choices(iter_choice: str, subsample_choice: str) -> Tuple[Hype 'subsample', lower=0.1, upper=1.0, default_value=1.0, log=False ) ) - iter = fidelity1[iter_choice] subsample = fidelity2[subsample_choice] return iter, subsample - def init_model(self, config: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - rng: Union[int, np.random.RandomState, None] = None): + def init_model( + self, + config: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[int, np.random.RandomState, None] = None + ): # initializing model rng = self.rng if rng is None else rng @@ -103,13 +108,185 @@ def init_model(self, config: Union[CS.Configuration, Dict], learning_rate="adaptive", tol=None, random_state=rng, - ) return model + def get_model_size(self, model: SGDClassifier = None) -> float: + """ Returns the dimensionality as a proxy for the number of model parameters + + Logistic Regression models have a fixed number of parameters given a dataset. Model size is + being approximated as the number of beta parameters required as the model support plus the + intercept. This depends on the dataset and not on the trained model. 
+ + Parameters + ---------- + model : SGDClassifier + Trained LR model. This parameter is required to maintain function signature. + + Returns + ------- + float + """ + ndims = self.train_X.shape[1] + # accounting for the intercept + ndims += 1 + return ndims + + def _train_objective( + self, + config: Dict, + fidelity: Dict, + shuffle: bool, + rng: Union[np.random.RandomState, int, None] = None, + evaluation: Union[str, None] = "valid", + record_stats: bool = False, + get_learning_curve: bool = False, + lc_every_k: int = 1, + **kwargs + ): + """Function that instantiates a 'config' on a 'fidelity' and trains it + + The ML model is instantiated and trained on the training split. Optionally, the model is + evaluated on the training set. Optionally, the learning curves are collected. + + Parameters + ---------- + config : CS.Configuration, Dict + The hyperparameter configuration. + fidelity : CS.Configuration, Dict + The fidelity configuration. + shuffle : bool (optional) + If True, shuffles the training split before fitting the ML model. + rng : np.random.RandomState, int (optional) + The random seed passed to the ML model and if applicable, used for shuffling the data + and subsampling the dataset fraction. + evaluation : str (optional) + If "valid", the ML model is trained on the training set alone. + If "test", the ML model is trained on the training + validation sets. + record_stats : bool (optional) + If True, records the evaluation metrics of the trained ML model on the training set. + This is set to False by default to reduce overall compute time. + get_learning_curve : bool (optional) + If True, records the learning curve using partial_fit or warm starting, if applicable. + This is set to False by default to reduce overall compute time. + Enabling True, implies that the for each iteration, the model will be evaluated on both + the validation and test sets, optionally on the training set also. + lc_every_k : int (optional) + If True, records the learning curve after every k iterations. 
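+
+        Returns
+        -------
+        tuple
+            (model, model_fit_time, train_loss, scores, score_cost, learning_curves, lc_time);
+            learning_curves and lc_time are None when get_learning_curve is False.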
+ """ + if rng is not None: + rng = get_rng(rng, self.rng) + + # initializing model + model = self.init_model(config, fidelity, rng) + + # preparing data + if evaluation == "valid": + train_X = self.train_X + train_y = self.train_y + elif evaluation == "test": + train_X = np.vstack((self.train_X, self.valid_X)) + train_y = pd.concat((self.train_y, self.valid_y)) + else: + raise ValueError("{} not in ['valid', 'test']".format(evaluation)) + train_idx = np.arange(len(train_X)) if self.train_idx is None else self.train_idx + + # shuffling data + if shuffle: + train_idx = self.shuffle_data_idx(train_idx, rng) + if isinstance(train_idx, np.ndarray): + train_X = train_X[train_idx] + else: + train_X = train_X.iloc[train_idx] + train_y = train_y.iloc[train_idx] + + # subsample here: + # application of the other fidelity to the dataset that the model interfaces + # carried over from previous HPOBench code that borrowed from FABOLAS' SVM + lower_bound_lim = 1.0 / 512.0 + if self.lower_bound_train_size is None: + self.lower_bound_train_size = (10 * self.n_classes) / self.train_X.shape[0] + self.lower_bound_train_size = np.max((lower_bound_lim, self.lower_bound_train_size)) + subsample = np.max((fidelity['subsample'], self.lower_bound_train_size)) + train_idx = self.rng.choice( + np.arange(len(train_X)), size=int( + subsample * len(train_X) + ) + ) + # fitting the model with subsampled data + if get_learning_curve: + # IMPORTANT to allow partial_fit + model.warm_start = True + lc_time = 0.0 + model_fit_time = 0.0 + learning_curves = dict(train=[], valid=[], test=[]) + lc_spacings = self._get_lc_spacing(model.max_iter, lc_every_k) + iter_start = 0 + for i in range(len(lc_spacings)): + iter_end = lc_spacings[i] + start = time.time() + # trains model for k steps + for j in range(iter_end - iter_start): + model.partial_fit( + train_X[train_idx], + train_y.iloc[train_idx], + np.unique(train_y.iloc[train_idx]) + ) + # adding all partial fit times + model_fit_time += time.time() - start + iter_start = iter_end + lc_start = time.time() + if record_stats: + train_pred = model.predict(train_X) + train_loss = 1 - self.scorers['acc']( + train_y, train_pred, **self.scorer_args['acc'] + ) + learning_curves['train'].append(train_loss) + val_pred = model.predict(self.valid_X) + val_loss = 1 - self.scorers['acc']( + self.valid_y, val_pred, **self.scorer_args['acc'] + ) + learning_curves['valid'].append(val_loss) + test_pred = model.predict(self.test_X) + test_loss = 1 - self.scorers['acc']( + self.test_y, test_pred, **self.scorer_args['acc'] + ) + learning_curves['test'].append(test_loss) + # sums the time taken to evaluate and collect data for the learning curves + lc_time += time.time() - lc_start + else: + # default training as per the base benchmark template + learning_curves = None + lc_time = None + start = time.time() + model.fit(train_X[train_idx], train_y.iloc[train_idx]) + model_fit_time = time.time() - start + # model inference + inference_time = 0.0 + # can optionally not record evaluation metrics on training set to save compute + if record_stats: + start = time.time() + pred_train = model.predict(train_X) + inference_time = time.time() - start + # computing statistics on training data + scores = dict() + score_cost = dict() + for k, v in self.scorers.items(): + scores[k] = 0.0 + score_cost[k] = 0.0 + _start = time.time() + if record_stats: + scores[k] = v(train_y, pred_train, **self.scorer_args[k]) + score_cost[k] = time.time() - _start + inference_time + train_loss = 1 - scores["acc"] + return model, 
model_fit_time, train_loss, scores, score_cost, learning_curves, lc_time + class LRBenchmarkBB(LRBenchmark): - def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ Black-box version of the LRBenchmark + """ + @staticmethod + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( # black-box setting (full fidelity) @@ -119,7 +296,10 @@ def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationS class LRBenchmarkMF(LRBenchmark): - def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ Multi-fidelity version of the LRBenchmark + """ + @staticmethod + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( # gray-box setting (multi-fidelity) - iterations diff --git a/hpobench/benchmarks/ml/nn_benchmark.py b/hpobench/benchmarks/ml/nn_benchmark.py index 06634661..4263278f 100644 --- a/hpobench/benchmarks/ml/nn_benchmark.py +++ b/hpobench/benchmarks/ml/nn_benchmark.py @@ -4,28 +4,39 @@ 0.0.1: * First implementation of the NN Benchmarks. +0.0.2: +* Restructuring for consistency and to match ML Benchmark Template updates. +0.0.3: +* Adding Learning Curve support. """ +import time from copy import deepcopy from typing import Union, Tuple, Dict import ConfigSpace as CS import numpy as np +import pandas as pd from ConfigSpace.hyperparameters import Hyperparameter from sklearn.neural_network import MLPClassifier +from hpobench.util.rng_helper import get_rng from hpobench.dependencies.ml.ml_benchmark_template import MLBenchmark -__version__ = '0.0.1' +__version__ = '0.0.3' class NNBenchmark(MLBenchmark): - def __init__(self, - task_id: int, - rng: Union[np.random.RandomState, int, None] = None, - valid_size: float = 0.33, - data_path: Union[str, None] = None): - super(NNBenchmark, self).__init__(task_id, rng, valid_size, data_path) + """ Multi-multi-fidelity Multi-Layer Perceptron Benchmark + """ + def __init__( + self, + task_id: int, + valid_size: float = 0.33, + rng: Union[np.random.RandomState, int, None] = None, + data_path: Union[str, None] = None + ): + super(NNBenchmark, self).__init__(task_id, valid_size, rng, data_path) @staticmethod def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: @@ -63,8 +74,11 @@ def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: return fidelity_space @staticmethod - def _get_fidelity_choices(iter_choice: str, subsample_choice: str) -> Tuple[Hyperparameter, Hyperparameter]: - + def _get_fidelity_choices( + iter_choice: str, subsample_choice: str + ) -> Tuple[Hyperparameter, Hyperparameter]: + """Fidelity space available --- specifies the fidelity dimensions + """ fidelity1 = dict( fixed=CS.Constant('iter', value=243), variable=CS.UniformIntegerHyperparameter( @@ -81,11 +95,13 @@ def _get_fidelity_choices(iter_choice: str, subsample_choice: str) -> Tuple[Hype subsample = fidelity2[subsample_choice] return iter, subsample - def init_model(self, config: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - rng: Union[int, np.random.RandomState, None] = None): - """ Function that returns the model initialized based on the configuration and fidelity - """ + def init_model( + self, + config: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] 
= None, + rng: Union[int, np.random.RandomState, None] = None + ): + # initializing model rng = self.rng if rng is None else rng if isinstance(config, CS.Configuration): @@ -99,6 +115,7 @@ def init_model(self, config: Union[CS.Configuration, Dict], config.pop("depth") config.pop("width") hidden_layers = [width] * depth + # TODO: check for iteration length and edit n_iter_no_change maybe model = MLPClassifier( **config, hidden_layer_sizes=hidden_layers, @@ -109,9 +126,175 @@ def init_model(self, config: Union[CS.Configuration, Dict], ) return model + def get_model_size(self, model: MLPClassifier) -> float: + """ Returns the total number of trained parameters in the MLP model + + Parameters + ---------- + model : MLPClassifier + Trained MLP model. + + Returns + ------- + float + """ + nparams = 0 + for layer in model.coefs_: + nparams += layer.shape[0] * layer.shape[1] + for layer in model.intercepts_: + nparams += layer.shape[0] + return nparams + + def _train_objective( + self, + config: Dict, + fidelity: Dict, + shuffle: bool, + rng: Union[np.random.RandomState, int, None] = None, + evaluation: Union[str, None] = "valid", + record_stats: bool = False, + get_learning_curve: bool = False, + lc_every_k: int = 1, + **kwargs + ): + """Function that instantiates a 'config' on a 'fidelity' and trains it + + The ML model is instantiated and trained on the training split. Optionally, the model is + evaluated on the training set. Optionally, the learning curves are collected. + + Parameters + ---------- + config : CS.Configuration, Dict + The hyperparameter configuration. + fidelity : CS.Configuration, Dict + The fidelity configuration. + shuffle : bool (optional) + If True, shuffles the training split before fitting the ML model. + rng : np.random.RandomState, int (optional) + The random seed passed to the ML model and if applicable, used for shuffling the data + and subsampling the dataset fraction. + evaluation : str (optional) + If "valid", the ML model is trained on the training set alone. + If "test", the ML model is trained on the training + validation sets. + record_stats : bool (optional) + If True, records the evaluation metrics of the trained ML model on the training set. + This is set to False by default to reduce overall compute time. + get_learning_curve : bool (optional) + If True, records the learning curve using partial_fit or warm starting, if applicable. + This is set to False by default to reduce overall compute time. + Enabling True, implies that the for each iteration, the model will be evaluated on both + the validation and test sets, optionally on the training set also. + lc_every_k : int (optional) + If True, records the learning curve after every k iterations. 
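+
+        Returns
+        -------
+        tuple
+            (model, model_fit_time, train_loss, scores, score_cost, learning_curves, lc_time);
+            learning_curves and lc_time are None when get_learning_curve is False.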
+ """ + if rng is not None: + rng = get_rng(rng, self.rng) + + # initializing model + model = self.init_model(config, fidelity, rng) + + # preparing data + if evaluation == "valid": + train_X = self.train_X + train_y = self.train_y + elif evaluation == "test": + train_X = np.vstack((self.train_X, self.valid_X)) + train_y = pd.concat((self.train_y, self.valid_y)) + else: + raise ValueError("{} not in ['valid', 'test']".format(evaluation)) + train_idx = np.arange(len(train_X)) if self.train_idx is None else self.train_idx + + # shuffling data + if shuffle: + train_idx = self.shuffle_data_idx(train_idx, rng) + if isinstance(train_idx, np.ndarray): + train_X = train_X[train_idx] + else: + train_X = train_X.iloc[train_idx] + train_y = train_y.iloc[train_idx] + + # subsample here: + # application of the other fidelity to the dataset that the model interfaces + if self.lower_bound_train_size is None: + self.lower_bound_train_size = (10 * self.n_classes) / self.train_X.shape[0] + self.lower_bound_train_size = np.max((1 / 512, self.lower_bound_train_size)) + subsample = np.max((fidelity['subsample'], self.lower_bound_train_size)) + train_idx = self.rng.choice( + np.arange(len(train_X)), size=int( + subsample * len(train_X) + ) + ) + # fitting the model with subsampled data + if get_learning_curve: + # IMPORTANT to allow partial_fit + model.warm_start = True + lc_time = 0.0 + model_fit_time = 0.0 + learning_curves = dict(train=[], valid=[], test=[]) + lc_spacings = self._get_lc_spacing(model.max_iter, lc_every_k) + iter_start = 0 + for i in range(len(lc_spacings)): + iter_end = lc_spacings[i] + start = time.time() + # trains model for k steps + for j in range(iter_end - iter_start): + model.partial_fit( + train_X[train_idx], + train_y.iloc[train_idx], + np.unique(train_y.iloc[train_idx]) + ) + # adding all partial fit times + model_fit_time += time.time() - start + lc_start = time.time() + if record_stats: + train_pred = model.predict(train_X) + train_loss = 1 - self.scorers['acc']( + train_y, train_pred, **self.scorer_args['acc'] + ) + learning_curves['train'].append(train_loss) + val_pred = model.predict(self.valid_X) + val_loss = 1 - self.scorers['acc']( + self.valid_y, val_pred, **self.scorer_args['acc'] + ) + learning_curves['valid'].append(val_loss) + test_pred = model.predict(self.test_X) + test_loss = 1 - self.scorers['acc']( + self.test_y, test_pred, **self.scorer_args['acc'] + ) + learning_curves['test'].append(test_loss) + lc_time += time.time() - lc_start + else: + learning_curves = None + lc_time = None + start = time.time() + model.fit(train_X[train_idx], train_y.iloc[train_idx]) + model_fit_time = time.time() - start + # model inference + inference_time = 0.0 + # can optionally not record evaluation metrics on training set to save compute + if record_stats: + start = time.time() + pred_train = model.predict(train_X) + inference_time = time.time() - start + # computing statistics on training data + scores = dict() + score_cost = dict() + for k, v in self.scorers.items(): + scores[k] = 0.0 + score_cost[k] = 0.0 + _start = time.time() + if record_stats: + scores[k] = v(train_y, pred_train, **self.scorer_args[k]) + score_cost[k] = time.time() - _start + inference_time + train_loss = 1 - scores["acc"] + return model, model_fit_time, train_loss, scores, score_cost, learning_curves, lc_time + class NNBenchmarkBB(NNBenchmark): - def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ Black-box version of the NNBenchmark + """ + @staticmethod + def 
get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( # black-box setting (full fidelity) @@ -121,7 +304,10 @@ def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationS class NNBenchmarkMF(NNBenchmark): - def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ Multi-fidelity version of the NNBenchmark + """ + @staticmethod + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( # gray-box setting (multi-fidelity) - iterations diff --git a/hpobench/benchmarks/ml/rbv2_benchmark.py b/hpobench/benchmarks/ml/rbv2_benchmark.py new file mode 100644 index 00000000..e69de29b diff --git a/hpobench/benchmarks/ml/rf_benchmark.py b/hpobench/benchmarks/ml/rf_benchmark.py index 596f03b6..b6874788 100644 --- a/hpobench/benchmarks/ml/rf_benchmark.py +++ b/hpobench/benchmarks/ml/rf_benchmark.py @@ -4,28 +4,39 @@ 0.0.1: * First implementation of the RF Benchmarks. +0.0.2: +* Restructuring for consistency and to match ML Benchmark Template updates. +0.0.3: +* Adding Learning Curve support. """ +import time from copy import deepcopy from typing import Union, Tuple, Dict import ConfigSpace as CS import numpy as np +import pandas as pd from ConfigSpace.hyperparameters import Hyperparameter from sklearn.ensemble import RandomForestClassifier +from hpobench.util.rng_helper import get_rng from hpobench.dependencies.ml.ml_benchmark_template import MLBenchmark -__version__ = '0.0.1' +__version__ = '0.0.3' class RandomForestBenchmark(MLBenchmark): - def __init__(self, - task_id: int, - rng: Union[np.random.RandomState, int, None] = None, - valid_size: float = 0.33, - data_path: Union[str, None] = None): - super(RandomForestBenchmark, self).__init__(task_id, rng, valid_size, data_path) + """ Multi-multi-fidelity Random Forest Benchmark + """ + def __init__( + self, + task_id: int, + valid_size: float = 0.33, + rng: Union[np.random.RandomState, int, None] = None, + data_path: Union[str, None] = None + ): + super(RandomForestBenchmark, self).__init__(task_id, valid_size, rng, data_path) @staticmethod def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: @@ -54,12 +65,16 @@ def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( # gray-box setting (multi-multi-fidelity) - ntrees + data subsample - RandomForestBenchmark._get_fidelity_choices(n_estimators_choice='variable', subsample_choice='variable') + RandomForestBenchmark._get_fidelity_choices( + n_estimators_choice='variable', subsample_choice='variable' + ) ) return fidelity_space @staticmethod - def _get_fidelity_choices(n_estimators_choice: str, subsample_choice: str) -> Tuple[Hyperparameter, Hyperparameter]: + def _get_fidelity_choices( + n_estimators_choice: str, subsample_choice: str + ) -> Tuple[Hyperparameter, Hyperparameter]: assert n_estimators_choice in ['fixed', 'variable'] assert subsample_choice in ['fixed', 'variable'] @@ -70,7 +85,6 @@ def _get_fidelity_choices(n_estimators_choice: str, subsample_choice: str) -> Tu 'n_estimators', lower=16, upper=512, default_value=512, log=False ) ) - fidelity2 = dict( fixed=CS.Constant('subsample', value=1), variable=CS.UniformFloatHyperparameter( @@ -81,11 +95,13 @@ def 
_get_fidelity_choices(n_estimators_choice: str, subsample_choice: str) -> Tu subsample = fidelity2[subsample_choice] return n_estimators, subsample - def init_model(self, config: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - rng: Union[int, np.random.RandomState, None] = None): - """ Function that returns the model initialized based on the configuration and fidelity - """ + def init_model( + self, + config: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[int, np.random.RandomState, None] = None + ): + # initializing model rng = self.rng if rng is None else rng if isinstance(config, CS.Configuration): config = config.get_dictionary() @@ -103,23 +119,194 @@ def init_model(self, config: Union[CS.Configuration, Dict], ) return model + def get_model_size(self, model: RandomForestClassifier) -> float: + """ Returns the total number of decision nodes in the entire Random Forest model + + Parameters + ---------- + model : RandomForestClassifier + Trained RF model. + + Returns + ------- + float + """ + nodes = 0 + for tree in model.estimators_: + # total number of nodes in the tree (internal + leaf) + nodes += tree.tree_.node_count + return nodes + + def _train_objective( + self, + config: Dict, + fidelity: Dict, + shuffle: bool, + rng: Union[np.random.RandomState, int, None] = None, + evaluation: Union[str, None] = "valid", + record_stats: bool = False, + get_learning_curve: bool = False, + lc_every_k: int = 1, + **kwargs + ): + """Function that instantiates a 'config' on a 'fidelity' and trains it + + The ML model is instantiated and trained on the training split. Optionally, the model is + evaluated on the training set. Optionally, the learning curves are collected. + + Parameters + ---------- + config : CS.Configuration, Dict + The hyperparameter configuration. + fidelity : CS.Configuration, Dict + The fidelity configuration. + shuffle : bool (optional) + If True, shuffles the training split before fitting the ML model. + rng : np.random.RandomState, int (optional) + The random seed passed to the ML model and if applicable, used for shuffling the data + and subsampling the dataset fraction. + evaluation : str (optional) + If "valid", the ML model is trained on the training set alone. + If "test", the ML model is trained on the training + validation sets. + record_stats : bool (optional) + If True, records the evaluation metrics of the trained ML model on the training set. + This is set to False by default to reduce overall compute time. + get_learning_curve : bool (optional) + If True, records the learning curve using partial_fit or warm starting, if applicable. + This is set to False by default to reduce overall compute time. + Enabling True, implies that the for each iteration, the model will be evaluated on both + the validation and test sets, optionally on the training set also. + lc_every_k : int (optional) + If True, records the learning curve after every k iterations. 
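+
+        Returns
+        -------
+        tuple
+            (model, model_fit_time, train_loss, scores, score_cost, learning_curves, lc_time);
+            learning_curves and lc_time are None when get_learning_curve is False.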
+ """ + if rng is not None: + rng = get_rng(rng, self.rng) + + # initializing model + model = self.init_model(config, fidelity, rng) + + # preparing data + if evaluation == "valid": + train_X = self.train_X + train_y = self.train_y + elif evaluation == "test": + train_X = np.vstack((self.train_X, self.valid_X)) + train_y = pd.concat((self.train_y, self.valid_y)) + else: + raise ValueError("{} not in ['valid', 'test']".format(evaluation)) + train_idx = np.arange(len(train_X)) if self.train_idx is None else self.train_idx + + # shuffling data + if shuffle: + train_idx = self.shuffle_data_idx(train_idx, rng) + if isinstance(train_idx, np.ndarray): + train_X = train_X[train_idx] + else: + train_X = train_X.iloc[train_idx] + train_y = train_y.iloc[train_idx] + + # subsample here: + # application of the other fidelity to the dataset that the model interfaces + # carried over from previous HPOBench code that borrowed from FABOLAS' SVM + lower_bound_lim = 1.0 / 512.0 + if self.lower_bound_train_size is None: + self.lower_bound_train_size = (10 * self.n_classes) / self.train_X.shape[0] + self.lower_bound_train_size = np.max((lower_bound_lim, self.lower_bound_train_size)) + subsample = np.max((fidelity['subsample'], self.lower_bound_train_size)) + train_idx = self.rng.choice( + np.arange(len(train_X)), size=int( + subsample * len(train_X) + ) + ) + # fitting the model with subsampled data + if get_learning_curve: + lc_spacings = self._get_lc_spacing(model.n_estimators, lc_every_k) + # IMPORTANT to allow refitting with more estimators + model.warm_start = True + model.n_estimators = 0 + lc_time = 0.0 + model_fit_time = 0.0 + learning_curves = dict(train=[], valid=[], test=[]) + iter_start = 0 + # for i in range(fidelity['n_estimators']): + for i in range(len(lc_spacings)): + iter_end = lc_spacings[i] + start = time.time() + # adds k new estimators to the model for training + model.n_estimators += iter_end - iter_start + model.fit(train_X[train_idx], train_y.iloc[train_idx]) + model_fit_time += time.time() - start + lc_start = time.time() + if record_stats: + train_pred = model.predict(train_X) + train_loss = 1 - self.scorers['acc']( + train_y, train_pred, **self.scorer_args['acc'] + ) + learning_curves['train'].append(train_loss) + val_pred = model.predict(self.valid_X) + val_loss = 1 - self.scorers['acc']( + self.valid_y, val_pred, **self.scorer_args['acc'] + ) + learning_curves['valid'].append(val_loss) + test_pred = model.predict(self.test_X) + test_loss = 1 - self.scorers['acc']( + self.test_y, test_pred, **self.scorer_args['acc'] + ) + learning_curves['test'].append(test_loss) + lc_time += time.time() - lc_start + else: + learning_curves = None + lc_time = None + start = time.time() + model.fit(train_X[train_idx], train_y.iloc[train_idx]) + model_fit_time = time.time() - start + # model inference + inference_time = 0.0 + # can optionally not record evaluation metrics on training set to save compute + if record_stats: + start = time.time() + pred_train = model.predict(train_X) + inference_time = time.time() - start + # computing statistics on training data + scores = dict() + score_cost = dict() + for k, v in self.scorers.items(): + scores[k] = 0.0 + score_cost[k] = 0.0 + _start = time.time() + if record_stats: + scores[k] = v(train_y, pred_train, **self.scorer_args[k]) + score_cost[k] = time.time() - _start + inference_time + train_loss = 1 - scores["acc"] + return model, model_fit_time, train_loss, scores, score_cost, learning_curves, lc_time + class 
RandomForestBenchmarkBB(RandomForestBenchmark): - def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ Black-box version of the RandomForestBenchmark + """ + @staticmethod + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( # black-box setting (full fidelity) - RandomForestBenchmark._get_fidelity_choices(n_estimators_choice='fixed', subsample_choice='fixed') + RandomForestBenchmark._get_fidelity_choices( + n_estimators_choice='fixed', subsample_choice='fixed' + ) ) return fidelity_space class RandomForestBenchmarkMF(RandomForestBenchmark): - def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ Multi-fidelity version of the RandomForestBenchmark + """ + @staticmethod + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( # gray-box setting (multi-fidelity) - ntrees - RandomForestBenchmark._get_fidelity_choices(n_estimators_choice='variable', subsample_choice='fixed') + RandomForestBenchmark._get_fidelity_choices( + n_estimators_choice='variable', subsample_choice='fixed' + ) ) return fidelity_space diff --git a/hpobench/benchmarks/ml/svm_benchmark.py b/hpobench/benchmarks/ml/svm_benchmark.py index 9462442f..c7b6a816 100644 --- a/hpobench/benchmarks/ml/svm_benchmark.py +++ b/hpobench/benchmarks/ml/svm_benchmark.py @@ -4,6 +4,10 @@ 0.0.1: * First implementation of the new SVM Benchmarks. +0.0.2: +* Restructuring for consistency and to match ML Benchmark Template updates. +0.0.3: +* Adding Learning Curve support. """ from typing import Union, Dict @@ -15,18 +19,21 @@ from hpobench.dependencies.ml.ml_benchmark_template import MLBenchmark -__version__ = '0.0.1' +__version__ = '0.0.3' class SVMBenchmark(MLBenchmark): - def __init__(self, - task_id: int, - rng: Union[np.random.RandomState, int, None] = None, - valid_size: float = 0.33, - data_path: Union[str, None] = None): - super(SVMBenchmark, self).__init__(task_id, rng, valid_size, data_path) - - self.cache_size = 200 + """ Multi-multi-fidelity SVM Benchmark + """ + def __init__( + self, + task_id: int, + valid_size: float = 0.33, + rng: Union[np.random.RandomState, int, None] = None, + data_path: Union[str, None] = None + ): + super(SVMBenchmark, self).__init__(task_id, valid_size, rng, data_path) + self.cache_size = 1024 # in MB @staticmethod def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: @@ -54,7 +61,8 @@ def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: @staticmethod def _get_fidelity_choices(subsample_choice: str) -> Hyperparameter: - + """Fidelity space available --- specifies the fidelity dimensions + """ assert subsample_choice in ['fixed', 'variable'] fidelity = dict( @@ -64,12 +72,14 @@ def _get_fidelity_choices(subsample_choice: str) -> Hyperparameter: ) ) subsample = fidelity[subsample_choice] - return subsample - def init_model(self, config: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - rng: Union[int, np.random.RandomState, None] = None): + def init_model( + self, + config: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[int, np.random.RandomState, None] = None + ): # initializing model rng = self.rng if rng is None else rng if isinstance(config, CS.Configuration): @@ 
-81,9 +91,27 @@ def init_model(self, config: Union[CS.Configuration, Dict], ) return model + def get_model_size(self, model: SVC) -> float: + """ Returns the number of support vectors in the SVM model + + Parameters + ---------- + model : SVC + Trained SVM model. + + Returns + ------- + float + """ + nsupport = model.support_.shape[0] + return nsupport + class SVMBenchmarkBB(SVMBenchmark): - def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ Black-box version of the SVMBenchmark + """ + @staticmethod + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameter( # uses the entire data (subsample=1), reflecting the black-box setup diff --git a/hpobench/benchmarks/ml/svm_benchmark_old.py b/hpobench/benchmarks/ml/svm_benchmark_old.py deleted file mode 100644 index 9aad5e44..00000000 --- a/hpobench/benchmarks/ml/svm_benchmark_old.py +++ /dev/null @@ -1,354 +0,0 @@ -""" - -Changelog: -========== -0.0.3 -* New container release due to a general change in the communication between container and HPOBench. - Works with HPOBench >= v0.0.8 - -0.0.2: -* Standardize the structure of the meta information - -0.0.1: -* First implementation - -""" - -import logging -import time -from typing import Union, Tuple, Dict, List - -import ConfigSpace as CS -import numpy as np -from scipy import sparse -from sklearn import pipeline -from sklearn import svm -from sklearn.compose import ColumnTransformer -from sklearn.impute import SimpleImputer -from sklearn.metrics import accuracy_score, make_scorer -from sklearn.preprocessing import OneHotEncoder, MinMaxScaler - -import hpobench.util.rng_helper as rng_helper -from hpobench.abstract_benchmark import AbstractBenchmark -from hpobench.util.openml_data_manager import OpenMLHoldoutDataManager - -__version__ = '0.0.3' - -logger = logging.getLogger('SVMBenchmark') - - -class SupportVectorMachine(AbstractBenchmark): - """ - Hyperparameter optimization task to optimize the regularization - parameter C and the kernel parameter gamma of a support vector machine. - Both hyperparameters are optimized on a log scale in [-10, 10]. - The X_test data set is only used for a final offline evaluation of - a configuration. For that the validation and training data is - concatenated to form the whole training data set. - """ - - def __init__(self, task_id: Union[int, None] = None, - rng: Union[np.random.RandomState, int, None] = None): - """ - Parameters - ---------- - task_id : int, None - rng : np.random.RandomState, int, None - """ - super(SupportVectorMachine, self).__init__(rng=rng) - - self.task_id = task_id - self.cache_size = 200 # Cache for the SVC in MB - self.accuracy_scorer = make_scorer(accuracy_score) - - self.x_train, self.y_train, self.x_valid, self.y_valid, self.x_test, self.y_test, variable_types = \ - self.get_data() - self.categorical_data = np.array([var_type == 'categorical' for var_type in variable_types]) - - # Sort data (Categorical + numerical) so that categorical and continous are not mixed. 
- categorical_idx = np.argwhere(self.categorical_data) - continuous_idx = np.argwhere(~self.categorical_data) - sorting = np.concatenate([categorical_idx, continuous_idx]).squeeze() - self.categorical_data = self.categorical_data[sorting] - self.x_train = self.x_train[:, sorting] - self.x_valid = self.x_valid[:, sorting] - self.x_test = self.x_test[:, sorting] - - nan_columns = np.all(np.isnan(self.x_train), axis=0) - self.categorical_data = self.categorical_data[~nan_columns] - self.x_train, self.x_valid, self.x_test, self.categories = \ - OpenMLHoldoutDataManager.replace_nans_in_cat_columns(self.x_train, self.x_valid, self.x_test, - is_categorical=self.categorical_data) - - self.train_idx = self.rng.choice(a=np.arange(len(self.x_train)), - size=len(self.x_train), - replace=False) - - # Similar to [Fast Bayesian Optimization of Machine Learning Hyperparameters on Large Datasets] - # (https://arxiv.org/pdf/1605.07079.pdf), - # use 10 time the number of classes as lower bound for the dataset fraction - n_classes = np.unique(self.y_train).shape[0] - self.lower_bound_train_size = (10 * n_classes) / self.x_train.shape[0] - - def get_data(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, List]: - """ Loads the data given a task or another source. """ - - assert self.task_id is not None, NotImplementedError('No task-id given. Please either specify a task-id or ' - 'overwrite the get_data method.') - - data_manager = OpenMLHoldoutDataManager(openml_task_id=self.task_id, rng=self.rng) - x_train, y_train, x_val, y_val, x_test, y_test = data_manager.load() - - return x_train, y_train, x_val, y_val, x_test, y_test, data_manager.variable_types - - def shuffle_data(self, rng=None): - """ Reshuffle the training data. If 'rng' is None, the training idx are shuffled according to the - class-random-state""" - random_state = rng_helper.get_rng(rng, self.rng) - random_state.shuffle(self.train_idx) - - # pylint: disable=arguments-differ - @AbstractBenchmark.check_parameters - def objective_function(self, configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - shuffle: bool = False, - rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: - """ - Trains a SVM model given a hyperparameter configuration and - evaluates the model on the validation set. - - Parameters - ---------- - configuration : Dict, CS.Configuration - Configuration for the SVM model - fidelity: Dict, None - Fidelity parameters for the SVM model, check get_fidelity_space(). Uses default (max) value if None. - shuffle : bool - If ``True``, shuffle the training idx. If no parameter ``rng`` is given, use the class random state. - Defaults to ``False``. - rng : np.random.RandomState, int, None, - Random seed for benchmark. By default the class level random seed. - - To prevent overfitting on a single seed, it is possible to pass a - parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. - If this parameter is not given, the default random state is used. 
- kwargs - - Returns - ------- - Dict - - function_value : validation loss - cost : time to train and evaluate the model - info : Dict - train_loss : training loss - fidelity : used fidelities in this evaluation - """ - start_time = time.time() - - self.rng = rng_helper.get_rng(rng=rng, self_rng=self.rng) - - if shuffle: - self.shuffle_data(self.rng) - - # Split of dataset subset - if self.lower_bound_train_size > fidelity['dataset_fraction']: - train_size = self.lower_bound_train_size - logger.warning(f'The given data set fraction is lower than the lower bound (10 * number of classes.) ' - f'Increase the fidelity from {fidelity["dataset_fraction"]:.8f} to ' - f'{self.lower_bound_train_size:.8f}') - else: - train_size = fidelity['dataset_fraction'] - - train_size = int(train_size * len(self.train_idx)) - train_idx = self.train_idx[:train_size] - - # Transform hyperparameters to linear scale - hp_c = np.exp(float(configuration['C'])) - hp_gamma = np.exp(float(configuration['gamma'])) - - # Train support vector machine - model = self.get_pipeline(hp_c, hp_gamma) - model.fit(self.x_train[train_idx], self.y_train[train_idx]) - - # Compute validation error - train_loss = 1 - self.accuracy_scorer(model, self.x_train[train_idx], self.y_train[train_idx]) - val_loss = 1 - self.accuracy_scorer(model, self.x_valid, self.y_valid) - - cost = time.time() - start_time - - return {'function_value': float(val_loss), - "cost": cost, - 'info': {'train_loss': float(train_loss), - 'fidelity': fidelity}} - - # pylint: disable=arguments-differ - @AbstractBenchmark.check_parameters - def objective_function_test(self, configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - shuffle: bool = False, - rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: - """ - Trains a SVM model with a given configuration on both the X_train - and validation data set and evaluates the model on the X_test data set. - - Parameters - ---------- - configuration : Dict, CS.Configuration - Configuration for the SVM Model - fidelity: Dict, None - Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. - shuffle : bool - If ``True``, shuffle the training idx. If no parameter ``rng`` is given, use the class random state. - Defaults to ``False``. - rng : np.random.RandomState, int, None, - Random seed for benchmark. By default the class level random seed. - To prevent overfitting on a single seed, it is possible to pass a - parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. - If this parameter is not given, the default random state is used. 
- kwargs - - Returns - ------- - Dict - - function_value : X_test loss - cost : time to X_train and evaluate the model - info : Dict - train_valid_loss: Loss on the train+valid data set - fidelity : used fidelities in this evaluation - """ - assert np.isclose(fidelity['dataset_fraction'], 1), \ - f'Data set fraction must be 1 but was {fidelity["dataset_fraction"]}' - - self.rng = rng_helper.get_rng(rng=rng, self_rng=self.rng) - - if shuffle: - self.shuffle_data(self.rng) - - start_time = time.time() - - # Concatenate training and validation dataset - if isinstance(self.x_train, sparse.csr.csr_matrix) or isinstance(self.x_valid, sparse.csr.csr_matrix): - data = sparse.vstack((self.x_train, self.x_valid)) - else: - data = np.concatenate((self.x_train, self.x_valid)) - targets = np.concatenate((self.y_train, self.y_valid)) - - # Transform hyperparameters to linear scale - hp_c = np.exp(float(configuration['C'])) - hp_gamma = np.exp(float(configuration['gamma'])) - - model = self.get_pipeline(hp_c, hp_gamma) - model.fit(data, targets) - - # Compute validation error - train_valid_loss = 1 - self.accuracy_scorer(model, data, targets) - - # Compute test error - test_loss = 1 - self.accuracy_scorer(model, self.x_test, self.y_test) - - cost = time.time() - start_time - - return {'function_value': float(test_loss), - "cost": cost, - 'info': {'train_valid_loss': float(train_valid_loss), - 'fidelity': fidelity}} - - def get_pipeline(self, C: float, gamma: float) -> pipeline.Pipeline: - """ Create the scikit-learn (training-)pipeline """ - - model = pipeline.Pipeline([ - ('preprocess_impute', - ColumnTransformer([ - ("categorical", "passthrough", self.categorical_data), - ("continuous", SimpleImputer(strategy="mean"), ~self.categorical_data)])), - ('preprocess_one_hot', - ColumnTransformer([ - ("categorical", OneHotEncoder(categories=self.categories, sparse=False), self.categorical_data), - ("continuous", MinMaxScaler(feature_range=(0, 1)), ~self.categorical_data)])), - ('svm', - svm.SVC(gamma=gamma, C=C, random_state=self.rng, cache_size=self.cache_size)) - ]) - return model - - @staticmethod - def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: - """ - Creates a ConfigSpace.ConfigurationSpace containing all parameters for - the SVM Model - - For a detailed explanation of the hyperparameters: - https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html - - Parameters - ---------- - seed : int, None - Fixing the seed for the ConfigSpace.ConfigurationSpace - - Returns - ------- - ConfigSpace.ConfigurationSpace - """ - - seed = seed if seed is not None else np.random.randint(1, 100000) - cs = CS.ConfigurationSpace(seed=seed) - - cs.add_hyperparameters([ - CS.UniformFloatHyperparameter('C', lower=-10., upper=10., default_value=0., log=False), - CS.UniformFloatHyperparameter('gamma', lower=-10., upper=10., default_value=1., log=False), - ]) - # cs.generate_all_continuous_from_bounds(SupportVectorMachine.get_meta_information()['bounds']) - return cs - - @staticmethod - def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: - """ - Creates a ConfigSpace.ConfigurationSpace containing all fidelity parameters for - the SupportVector Benchmark - - Fidelities - ---------- - dataset_fraction: float - [0.1, 1] - fraction of training data set to use - - Parameters - ---------- - seed : int, None - Fixing the seed for the ConfigSpace.ConfigurationSpace - - Returns - ------- - ConfigSpace.ConfigurationSpace - """ - seed = seed if seed is not None 
else np.random.randint(1, 100000)
-        fidel_space = CS.ConfigurationSpace(seed=seed)
-
-        fidel_space.add_hyperparameters([
-            CS.UniformFloatHyperparameter("dataset_fraction", lower=0.0, upper=1.0, default_value=1.0, log=False),
-        ])
-        return fidel_space
-
-    def get_meta_information(self):
-        """ Returns the meta information for the benchmark """
-        return {'name': 'Support Vector Machine',
-                'references': ["@InProceedings{pmlr-v54-klein17a",
-                               "author = {Aaron Klein and Stefan Falkner and Simon Bartels and Philipp Hennig and "
-                               "Frank Hutter}, "
-                               "title = {{Fast Bayesian Optimization of Machine Learning Hyperparameters on "
-                               "Large Datasets}}"
-                               "pages = {528--536}, year = {2017},"
-                               "editor = {Aarti Singh and Jerry Zhu},"
-                               "volume = {54},"
-                               "series = {Proceedings of Machine Learning Research},"
-                               "address = {Fort Lauderdale, FL, USA},"
-                               "month = {20--22 Apr},"
-                               "publisher = {PMLR},"
-                               "pdf = {http://proceedings.mlr.press/v54/klein17a/klein17a.pdf}, "
-                               "url = {http://proceedings.mlr.press/v54/klein17a.html}, "
-                               ],
-                'code': 'https://github.com/automl/HPOlib1.5/blob/container/hpolib/benchmarks/ml/svm_benchmark.py',
-                'shape of train data': self.x_train.shape,
-                'shape of test data': self.x_test.shape,
-                'shape of valid data': self.x_valid.shape,
-                'initial random seed': self.rng,
-                'task_id': self.task_id
-                }
diff --git a/hpobench/benchmarks/ml/tabular_benchmark.py b/hpobench/benchmarks/ml/tabular_benchmark.py
index 72e5fb31..342766b4 100644
--- a/hpobench/benchmarks/ml/tabular_benchmark.py
+++ b/hpobench/benchmarks/ml/tabular_benchmark.py
@@ -4,6 +4,10 @@
 0.0.1:
 * First implementation of the Tabular Benchmark.
+0.0.2:
+* Restructuring for consistency and to match ML Benchmark Template updates.
+0.0.3:
+* Adding Learning Curve support.
 """
 from pathlib import Path
@@ -17,7 +21,7 @@
 from hpobench.dependencies.ml.ml_benchmark_template import metrics
 from hpobench.util.data_manager import TabularDataManager
-__version__ = '0.0.1'
+__version__ = '0.0.3'
 class TabularBenchmark(AbstractBenchmark):
@@ -145,8 +149,8 @@ def _search_dataframe(self, row_dict, df):
         for i, param in enumerate(df.drop("result", axis=1).columns):
             mask *= df[param].values == row_dict[param]
         idx = np.where(mask)
-        assert len(idx) == 1, 'The query has resulted into mulitple matches. This should not happen. ' \
-                              f'The Query was {row_dict}'
+        assert len(idx) == 1, 'The query has resulted in multiple matches. ' \
+                              f'This should not happen. The query was {row_dict}'
         idx = idx[0][0]
         result = df.iloc[idx]["result"]
         return result
@@ -163,7 +167,7 @@ def _objective(
         metric_str = ', '.join(list(metrics.keys()))
         assert metric in list(metrics.keys()), f"metric not found among: {metric_str}"
         score_key = f"{evaluation}_scores"
-        cost_key = f"{evaluation}_scores"
+        cost_key = f"{evaluation}_costs"
         key_path = dict()
         for name in self.configuration_space.get_hyperparameter_names():
diff --git a/hpobench/benchmarks/ml/xgboost_benchmark.py b/hpobench/benchmarks/ml/xgboost_benchmark.py
index ae554628..234c2cee 100644
--- a/hpobench/benchmarks/ml/xgboost_benchmark.py
+++ b/hpobench/benchmarks/ml/xgboost_benchmark.py
@@ -4,7 +4,12 @@
 0.0.1:
 * First implementation of the new XGB Benchmarks.
+0.0.2:
+* Restructuring for consistency and to match ML Benchmark Template updates.
+0.0.3:
+* Adding Learning Curve support.
""" + from typing import Union, Tuple, Dict import ConfigSpace as CS @@ -12,18 +17,23 @@ import xgboost as xgb from ConfigSpace.hyperparameters import Hyperparameter +from hpobench.util.rng_helper import get_rng from hpobench.dependencies.ml.ml_benchmark_template import MLBenchmark -__version__ = '0.0.1' +__version__ = '0.0.3' class XGBoostBenchmark(MLBenchmark): - def __init__(self, - task_id: int, - rng: Union[np.random.RandomState, int, None] = None, - valid_size: float = 0.33, - data_path: Union[str, None] = None): - super(XGBoostBenchmark, self).__init__(task_id, rng, valid_size, data_path) + """ Multi-multi-fidelity XGBoost Benchmark + """ + def __init__( + self, + task_id: int, + valid_size: float = 0.33, + rng: Union[np.random.RandomState, int, None] = None, + data_path: Union[str, None] = None + ): + super(XGBoostBenchmark, self).__init__(task_id, valid_size, rng, data_path) @staticmethod def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: @@ -52,12 +62,16 @@ def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( # gray-box setting (multi-multi-fidelity) - ntrees + data subsample - XGBoostBenchmark._get_fidelity_choices(n_estimators_choice='variable', subsample_choice='variable') + XGBoostBenchmark._get_fidelity_choices( + n_estimators_choice='variable', subsample_choice='variable' + ) ) return fidelity_space @staticmethod - def _get_fidelity_choices(n_estimators_choice: str, subsample_choice: str) -> Tuple[Hyperparameter, Hyperparameter]: + def _get_fidelity_choices( + n_estimators_choice: str, subsample_choice: str + ) -> Tuple[Hyperparameter, Hyperparameter]: assert n_estimators_choice in ['fixed', 'variable'] assert subsample_choice in ['fixed', 'variable'] @@ -74,28 +88,31 @@ def _get_fidelity_choices(n_estimators_choice: str, subsample_choice: str) -> Tu 'subsample', lower=0.1, upper=1, default_value=1, log=False ) ) - n_estimators = fidelity1[n_estimators_choice] subsample = fidelity2[subsample_choice] return n_estimators, subsample - def init_model(self, - config: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - rng: Union[int, np.random.RandomState, None] = None): - """ Function that returns the model initialized based on the configuration and fidelity - """ + def init_model( + self, + config: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[int, np.random.RandomState, None] = None + ): + # initializing model + rng = self.rng if rng is None else get_rng(rng) + # xgb.XGBClassifier when trainied using the scikit-learn API of `fit`, requires + # random_state to be an integer and doesn't accept a RandomState + seed = rng.randint(1, 10**6) + if isinstance(config, CS.Configuration): config = config.get_dictionary() if isinstance(fidelity, CS.Configuration): fidelity = fidelity.get_dictionary() - - rng = rng if (rng is None or isinstance(rng, int)) else self.seed extra_args = dict( booster="gbtree", n_estimators=fidelity['n_estimators'], objective="binary:logistic", - random_state=rng, + random_state=seed, subsample=1 ) if self.n_classes > 2: @@ -108,23 +125,48 @@ def init_model(self, ) return model + def get_model_size(self, model: xgb.XGBClassifier) -> float: + """ Returns the total number of decision nodes in the sequence of Gradient Boosted trees + + Parameters + ---------- + model : xgb.XGBClassifier + Trained XGB model. 
+ + Returns + ------- + float + """ + nodes = model.get_booster().trees_to_dataframe().shape[0] + return nodes + class XGBoostBenchmarkBB(XGBoostBenchmark): - def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ Black-box version of the XGBoostBenchmark + """ + @staticmethod + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( # black-box setting (full fidelity) - XGBoostBenchmark._get_fidelity_choices(n_estimators_choice='fixed', subsample_choice='fixed') + XGBoostBenchmark._get_fidelity_choices( + n_estimators_choice='fixed', subsample_choice='fixed' + ) ) return fidelity_space class XGBoostBenchmarkMF(XGBoostBenchmark): - def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ Multi-fidelity version of the XGBoostBenchmark + """ + @staticmethod + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( # gray-box setting (multi-fidelity) - ntrees - XGBoostBenchmark._get_fidelity_choices(n_estimators_choice='variable', subsample_choice='fixed') + XGBoostBenchmark._get_fidelity_choices( + n_estimators_choice='variable', subsample_choice='fixed' + ) ) return fidelity_space diff --git a/hpobench/benchmarks/ml/xgboost_benchmark_old.py b/hpobench/benchmarks/ml/xgboost_benchmark_old.py deleted file mode 100644 index f8730f52..00000000 --- a/hpobench/benchmarks/ml/xgboost_benchmark_old.py +++ /dev/null @@ -1,430 +0,0 @@ -""" - -Changelog: -========== -0.0.3 -* New container release due to a general change in the communication between container and HPOBench. - Works with HPOBench >= v0.0.8 - -0.0.2: -* Change the search space definiton to match the paper: (https://arxiv.org/pdf/1802.09596.pdf) - eta: [1e-5, 1] (def: 0.3) -> [2**-10, 1] (def: 0.3) - min_child_weight: [0,05, 10] (def: 1) -> [1, 2**7] (def: 1) - colsample_bytree: [0,05, 1] (def: 1) -> [0.01, 1] (def: 1) - colsample_bylevel: [0,05, 1] (def: 1) -> [0.01, 1] (def: 1) - reg_lambda: [1e-5, 2] (def: 1) -> [2**-10, 2**10] (def: 1) - reg_alpha: [1e-5, 2] (def: 1e-5) -> [2**-10, 2**10] (def: 1) - max_depth: - -> [1, 15] (def: 6) - subsample_per_it: - -> [0.01, 1] (def: 1) - [booster: - -> [gbtree, gblinear, dart] (def: gbtree)] *) - - *) This parameter is only in the XGBoostExtendedBenchmark. Not in the XGBoostBenchmark class. - -* Increase the fidelity `n_estimators` - n_estimators [2, 128] (def: 128) -> [1, 256] (def: 256) - -* Add class to optimize also the used booster method: (gbtree, gblinear or dart) - We have introduced a new class, which adds the used booster as parameter to the configuration space. To read more - about booster, please take a look in the official XGBoost-documentation (https://xgboost.readthedocs.io/en/latest). - - -0.0.1: -* First implementation of a XGBoost Benchmark. 
- - -""" - -import logging -import time -from typing import Union, Tuple, Dict, List - -import ConfigSpace as CS -import numpy as np -import xgboost as xgb -from sklearn import pipeline -from sklearn.compose import ColumnTransformer -from sklearn.impute import SimpleImputer -from sklearn.metrics import accuracy_score, make_scorer -from sklearn.preprocessing import OneHotEncoder - -import hpobench.util.rng_helper as rng_helper -from hpobench.abstract_benchmark import AbstractBenchmark -from hpobench.util.openml_data_manager import OpenMLHoldoutDataManager - -__version__ = '0.0.3' - -logger = logging.getLogger('XGBBenchmark') - - -class XGBoostBenchmark(AbstractBenchmark): - - def __init__(self, task_id: Union[int, None] = None, n_threads: int = 1, - rng: Union[np.random.RandomState, int, None] = None): - """ - - Parameters - ---------- - task_id : int, None - n_threads : int, None - rng : np.random.RandomState, int, None - """ - - super(XGBoostBenchmark, self).__init__(rng=rng) - self.n_threads = n_threads - self.task_id = task_id - self.accuracy_scorer = make_scorer(accuracy_score) - - self.x_train, self.y_train, self.x_valid, self.y_valid, self.x_test, self.y_test, variable_types = \ - self.get_data() - self.categorical_data = np.array([var_type == 'categorical' for var_type in variable_types]) - - # XGB needs sorted data. Data should be (Categorical + numerical) not mixed. - categorical_idx = np.argwhere(self.categorical_data) - continuous_idx = np.argwhere(~self.categorical_data) - sorting = np.concatenate([categorical_idx, continuous_idx]).squeeze() - self.categorical_data = self.categorical_data[sorting] - self.x_train = self.x_train[:, sorting] - self.x_valid = self.x_valid[:, sorting] - self.x_test = self.x_test[:, sorting] - - nan_columns = np.all(np.isnan(self.x_train), axis=0) - self.categorical_data = self.categorical_data[~nan_columns] - - self.x_train, self.x_valid, self.x_test, self.categories = \ - OpenMLHoldoutDataManager.replace_nans_in_cat_columns(self.x_train, self.x_valid, self.x_test, - is_categorical=self.categorical_data) - - # Determine the number of categories in the labels. - # In case of binary classification ``self.num_class`` has to be 1 for xgboost. - self.num_class = len(np.unique(np.concatenate([self.y_train, self.y_test, self.y_valid]))) - self.num_class = 1 if self.num_class == 2 else self.num_class - - self.train_idx = self.rng.choice(a=np.arange(len(self.x_train)), - size=len(self.x_train), - replace=False) - - # Similar to [Fast Bayesian Optimization of Machine Learning Hyperparameters on Large Datasets] - # (https://arxiv.org/pdf/1605.07079.pdf), - # use 10 time the number of classes as lower bound for the dataset fraction - n_classes = np.unique(self.y_train).shape[0] - self.lower_bound_train_size = (10 * n_classes) / self.x_train.shape[0] - - def get_data(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, List]: - """ Loads the data given a task or another source. """ - - assert self.task_id is not None, NotImplementedError('No task-id given. Please either specify a task-id or ' - 'overwrite the get_data method.') - - data_manager = OpenMLHoldoutDataManager(openml_task_id=self.task_id, rng=self.rng) - x_train, y_train, x_val, y_val, x_test, y_test = data_manager.load() - - return x_train, y_train, x_val, y_val, x_test, y_test, data_manager.variable_types - - def shuffle_data(self, rng=None): - """ Reshuffle the training data. 
If 'rng' is None, the training idx are shuffled according to the - class-random-state""" - random_state = rng_helper.get_rng(rng, self.rng) - random_state.shuffle(self.train_idx) - - # pylint: disable=arguments-differ - @AbstractBenchmark.check_parameters - def objective_function(self, configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - shuffle: bool = False, - rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: - """ - Trains a XGBoost model given a hyperparameter configuration and - evaluates the model on the validation set. - - Parameters - ---------- - configuration : Dict, CS.Configuration - Configuration for the XGBoost model - fidelity: Dict, None - Fidelity parameters for the XGBoost model, check get_fidelity_space(). Uses default (max) value if None. - shuffle : bool - If ``True``, shuffle the training idx. If no parameter ``rng`` is given, use the class random state. - Defaults to ``False``. - rng : np.random.RandomState, int, None, - Random seed for benchmark. By default the class level random seed. - - To prevent overfitting on a single seed, it is possible to pass a - parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. - If this parameter is not given, the default random state is used. - kwargs - - Returns - ------- - Dict - - function_value : validation loss - cost : time to train and evaluate the model - info : Dict - train_loss : trainings loss - fidelity : used fidelities in this evaluation - """ - self.rng = rng_helper.get_rng(rng=rng, self_rng=self.rng) - - if shuffle: - self.shuffle_data(self.rng) - - start = time.time() - - if self.lower_bound_train_size > fidelity['dataset_fraction']: - train_data_fraction = self.lower_bound_train_size - logger.warning(f'The given data set fraction is lower than the lower bound (10 * number of classes.) ' - f'Increase the fidelity from {fidelity["dataset_fraction"]:.8f} to ' - f'{self.lower_bound_train_size:.8f}') - else: - train_data_fraction = fidelity['dataset_fraction'] - - train_idx = self.train_idx[:int(len(self.train_idx) * train_data_fraction)] - - model = self._get_pipeline(n_estimators=fidelity["n_estimators"], **configuration) - model.fit(X=self.x_train[train_idx], y=self.y_train[train_idx]) - - train_loss = 1 - self.accuracy_scorer(model, self.x_train[train_idx], self.y_train[train_idx]) - val_loss = 1 - self.accuracy_scorer(model, self.x_valid, self.y_valid) - cost = time.time() - start - - return {'function_value': float(val_loss), - 'cost': cost, - 'info': {'train_loss': float(train_loss), - 'fidelity': fidelity} - } - - # pylint: disable=arguments-differ - @AbstractBenchmark.check_parameters - def objective_function_test(self, configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - shuffle: bool = False, - rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: - """ - Trains a XGBoost model with a given configuration on both the train - and validation data set and evaluates the model on the test data set. - - Parameters - ---------- - configuration : Dict, CS.Configuration - Configuration for the XGBoost Model - fidelity: Dict, None - Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. - shuffle : bool - If ``True``, shuffle the training idx. If no parameter ``rng`` is given, use the class random state. - Defaults to ``False``. - rng : np.random.RandomState, int, None, - Random seed for benchmark. 
By default the class level random seed. - To prevent overfitting on a single seed, it is possible to pass a - parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. - If this parameter is not given, the default random state is used. - kwargs - - Returns - ------- - Dict - - function_value : test loss - cost : time to train and evaluate the model - info : Dict - fidelity : used fidelities in this evaluation - """ - default_dataset_fraction = self.get_fidelity_space().get_hyperparameter('dataset_fraction').default_value - if fidelity['dataset_fraction'] != default_dataset_fraction: - raise NotImplementedError(f'Test error can not be computed for dataset_fraction <= ' - f'{default_dataset_fraction}') - - self.rng = rng_helper.get_rng(rng=rng, self_rng=self.rng) - - if shuffle: - self.shuffle_data(self.rng) - - start = time.time() - - # Impute potential nan values with the feature- - data = np.concatenate((self.x_train, self.x_valid)) - targets = np.concatenate((self.y_train, self.y_valid)) - - model = self._get_pipeline(n_estimators=fidelity['n_estimators'], **configuration) - model.fit(X=data, y=targets) - - test_loss = 1 - self.accuracy_scorer(model, self.x_test, self.y_test) - cost = time.time() - start - - return {'function_value': float(test_loss), - 'cost': cost, - 'info': {'fidelity': fidelity}} - - @staticmethod - def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: - """ - Creates a ConfigSpace.ConfigurationSpace containing all parameters for - the XGBoost Model - - Parameters - ---------- - seed : int, None - Fixing the seed for the ConfigSpace.ConfigurationSpace - - Returns - ------- - ConfigSpace.ConfigurationSpace - """ - seed = seed if seed is not None else np.random.randint(1, 100000) - cs = CS.ConfigurationSpace(seed=seed) - - cs.add_hyperparameters([ - CS.UniformFloatHyperparameter('eta', lower=2**-10, upper=1., default_value=0.3, log=True), - CS.UniformIntegerHyperparameter('max_depth', lower=1, upper=15, default_value=6, log=False), - CS.UniformFloatHyperparameter('min_child_weight', lower=1., upper=2**7., default_value=1., log=True), - CS.UniformFloatHyperparameter('colsample_bytree', lower=0.01, upper=1., default_value=1.), - CS.UniformFloatHyperparameter('colsample_bylevel', lower=0.01, upper=1., default_value=1.), - CS.UniformFloatHyperparameter('reg_lambda', lower=2**-10, upper=2**10, default_value=1, log=True), - CS.UniformFloatHyperparameter('reg_alpha', lower=2**-10, upper=2**10, default_value=1, log=True), - CS.UniformFloatHyperparameter('subsample_per_it', lower=0.1, upper=1, default_value=1, log=False) - ]) - - return cs - - @staticmethod - def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: - """ - Creates a ConfigSpace.ConfigurationSpace containing all fidelity parameters for - the XGBoost Benchmark - - Parameters - ---------- - seed : int, None - Fixing the seed for the ConfigSpace.ConfigurationSpace - - Returns - ------- - ConfigSpace.ConfigurationSpace - """ - seed = seed if seed is not None else np.random.randint(1, 100000) - fidel_space = CS.ConfigurationSpace(seed=seed) - - fidel_space.add_hyperparameters([ - CS.UniformFloatHyperparameter("dataset_fraction", lower=0.0, upper=1.0, default_value=1.0, log=False), - CS.UniformIntegerHyperparameter("n_estimators", lower=1, upper=256, default_value=256, log=False) - ]) - - return fidel_space - - def get_meta_information(self) -> Dict: - """ Returns the meta information for the benchmark """ - return {'name': 'XGBoost', - 'references': 
['@article{probst2019tunability,' - 'title={Tunability: Importance of hyperparameters of machine learning algorithms.},' - 'author={Probst, Philipp and Boulesteix, Anne-Laure and Bischl, Bernd},' - 'journal={J. Mach. Learn. Res.},' - 'volume={20},' - 'number={53},' - 'pages={1--32},' - 'year={2019}' - '}'], - 'code': 'https://github.com/automl/HPOlib1.5/blob/development/hpolib/benchmarks/ml/' - 'xgboost_benchmark_old.py', - 'shape of train data': self.x_train.shape, - 'shape of test data': self.x_test.shape, - 'shape of valid data': self.x_valid.shape, - 'initial random seed': self.rng, - 'task_id': self.task_id - } - - def _get_pipeline(self, max_depth: int, eta: float, min_child_weight: int, - colsample_bytree: float, colsample_bylevel: float, reg_lambda: int, reg_alpha: int, - n_estimators: int, subsample_per_it: float) \ - -> pipeline.Pipeline: - """ Create the scikit-learn (training-)pipeline """ - objective = 'binary:logistic' if self.num_class <= 2 else 'multi:softmax' - - clf = pipeline.Pipeline([ - ('preprocess_impute', - ColumnTransformer([ - ("categorical", "passthrough", self.categorical_data), - ("continuous", SimpleImputer(strategy="mean"), ~self.categorical_data)])), - ('preprocess_one_hot', - ColumnTransformer([ - ("categorical", OneHotEncoder(categories=self.categories, sparse=False), self.categorical_data), - ("continuous", "passthrough", ~self.categorical_data)])), - ('xgb', - xgb.XGBClassifier( - max_depth=max_depth, - learning_rate=eta, - min_child_weight=min_child_weight, - colsample_bytree=colsample_bytree, - colsample_bylevel=colsample_bylevel, - reg_alpha=reg_alpha, - reg_lambda=reg_lambda, - n_estimators=n_estimators, - objective=objective, - n_jobs=self.n_threads, - random_state=self.rng.randint(1, 100000), - num_class=self.num_class, - subsample=subsample_per_it)) - ]) - return clf - - -class XGBoostExtendedBenchmark(XGBoostBenchmark): - """ - Similar to XGBoostBenchmark but enables also the optimization of the used booster. - """ - - def __init__(self, task_id: Union[int, None] = None, n_threads: int = 1, - rng: Union[np.random.RandomState, int, None] = None): - super(XGBoostExtendedBenchmark, self).__init__(task_id=task_id, n_threads=n_threads, rng=rng) - - @staticmethod - def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: - cs = XGBoostBenchmark.get_configuration_space(seed) - hp_booster = CS.CategoricalHyperparameter('booster', choices=['gbtree', 'gblinear', 'dart'], - default_value='gbtree') - cs.add_hyperparameter(hp_booster) - - # XGBoost with 'gblinear' can not use some - # parameters. Exclude them from the configuration space by introducing a condition. - hps = ['colsample_bylevel', 'colsample_bytree', 'max_depth', 'min_child_weight', 'subsample_per_it'] - - # The NotEqualsCondition means: "Make parameter X active if hp_booster is not equal to gblinear." 
- conditions = [CS.NotEqualsCondition(cs.get_hyperparameter(hp), hp_booster, 'gblinear') for hp in hps] - cs.add_conditions(conditions) - return cs - - # noinspection PyMethodOverriding - # pylint: disable=arguments-differ - def _get_pipeline(self, n_estimators: int, booster: str, reg_lambda: int, reg_alpha: int, eta: float, - min_child_weight: int = None, max_depth: int = None, colsample_bytree: float = None, - colsample_bylevel: float = None, subsample_per_it: float = None) \ - -> pipeline.Pipeline: - """ Create the scikit-learn (training-)pipeline """ - objective = 'binary:logistic' if self.num_class <= 2 else 'multi:softmax' - - configuration = dict(booster=booster, - max_depth=max_depth, - learning_rate=eta, - min_child_weight=min_child_weight, - colsample_bytree=colsample_bytree, - colsample_bylevel=colsample_bylevel, - reg_alpha=reg_alpha, - reg_lambda=reg_lambda, - n_estimators=n_estimators, - objective=objective, - n_jobs=self.n_threads, - random_state=self.rng.randint(1, 100000), - num_class=self.num_class, - subsample=subsample_per_it) - - configuration = {k: v for k, v in configuration.items() if v is not None} - - clf = pipeline.Pipeline([ - ('preprocess_impute', - ColumnTransformer([ - ("categorical", "passthrough", self.categorical_data), - ("continuous", SimpleImputer(strategy="mean"), ~self.categorical_data)])), - ('preprocess_one_hot', - ColumnTransformer([ - ("categorical", OneHotEncoder(categories=self.categories, sparse=False), self.categorical_data), - ("continuous", "passthrough", ~self.categorical_data)])), - ('xgb', - xgb.XGBClassifier(**configuration)) - ]) - return clf diff --git a/hpobench/benchmarks/ml/yahpo_benchmark.py b/hpobench/benchmarks/ml/yahpo_benchmark.py new file mode 100644 index 00000000..d06d23fc --- /dev/null +++ b/hpobench/benchmarks/ml/yahpo_benchmark.py @@ -0,0 +1,317 @@ +""" +How to use this benchmark: +-------------------------- + +We recommend using the containerized version of this benchmark. +If you want to use this benchmark locally (without running it via the corresponding container), +you need to perform the following steps. + +Prerequisites: 1) Install Conda +=============================== +Conda environment in which the HPOBench is installed (pip install .). Activate your environment. +``` +conda activate +``` + +Prerequisites: 2) Install R +=========================== + +Install R (4.0.5 - IMPORTANT!) and the required dependencies: # works also with higher R versions(?) 
+
+``` bash
+Rscript -e 'install.packages("remotes", repos = "http://cran.r-project.org")'
+
+# Install OpenML dependencies
+Rscript -e 'install.packages("curl", repos = "http://cran.r-project.org")' \
+&& Rscript -e 'install.packages("httr", repos = "http://cran.r-project.org")' \
+&& Rscript -e 'install.packages("farff", repos = "http://cran.r-project.org")' \
+&& Rscript -e 'install.packages("OpenML", repos = "http://cran.r-project.org")' \
+
+# Install rbv2 dependencies
+Rscript -e 'remotes::install_version("BBmisc", version = "1.11", upgrade = "never", repos = "http://cran.r-project.org")' \
+&& Rscript -e 'remotes::install_version("glmnet", version = "2.0-16", upgrade = "never", repos = "http://cran.r-project.org")' \
+&& Rscript -e 'remotes::install_version("rpart", version = "4.1-13", upgrade = "never", repos = "http://cran.r-project.org")' \
+&& Rscript -e 'remotes::install_version("e1071", version = "1.7-0.1", upgrade = "never", repos = "http://cran.r-project.org")' \
+&& Rscript -e 'remotes::install_version("xgboost", version = "0.82.1", upgrade = "never", repos = "http://cran.r-project.org")' \
+&& Rscript -e 'remotes::install_version("ranger", version = "0.11.2", upgrade = "never", repos = "http://cran.r-project.org")' \
+&& Rscript -e 'remotes::install_version("RcppHNSW", version = "0.1.0", upgrade = "never", repos = "http://cran.r-project.org")' \
+&& Rscript -e 'remotes::install_version("mlr", version = "2.14", upgrade = "never", repos = "http://cran.r-project.org")' \
+&& Rscript -e 'remotes::install_github("mlr-org/mlr3misc", upgrade = "never", repos = "http://cran.r-project.org")' \
+&& Rscript -e 'remotes::install_version("mlrCPO", version = "0.3.6", upgrade = "never", repos = "http://cran.r-project.org")' \
+&& Rscript -e 'remotes::install_github("pfistfl/rbv2", upgrade = "never")' \
+&& Rscript -e 'remotes::install_version("testthat", version = "3.1.4", upgrade = "never", repos = "http://cran.r-project.org")' \
+&& Rscript -e 'remotes::install_github("sumny/iaml", upgrade = "never")'
+```
+Prerequisites: 3) Install rpy2
+==============================
+Installing the connector between R and Python can be a bit tricky.
+Official installation guide: https://rpy2.github.io/doc/latest/html/introduction.html
+
+In some cases we received the error: "/opt/R/4.0.5/lib/R/library/methods/libs/methods.so: undefined symbol".
+To solve this error, we had to execute the following command:
+```
+export LD_LIBRARY_PATH=$(python -m rpy2.situation LD_LIBRARY_PATH):${LD_LIBRARY_PATH}
+```
+
+1. Download data:
+=================
+Normally, the data will be downloaded automatically.
+
+If you want to download the data yourself, use the following command:
+
+``` bash
+git clone --depth 1 -b main https://github.com/pfistfl/yahpo_data.git
+```
+
+Later, you have to point yahpo to the data:
+
+```python
+from yahpo_gym import local_config
+local_config.init_config()
+local_config.set_data_path("path-to-data")
+```
+
+The data consist of surrogates for different data sets. Each surrogate is a compressed ONNX neural network.
+
+
+2.
Install HPOBench: +==================== +``` +git clone HPOBench +cd /path/to/HPOBench +pip install .[yahpo_gym_raw] +``` + +Changelog: +========== +0.0.1: +* First implementation +""" # noqa: E501 + +import logging +from pathlib import Path +from typing import Union, Dict, List + +import pandas as pd +import ConfigSpace as CS +import numpy as np +import rpy2.robjects as robjects +from rpy2.robjects.packages import importr +from yahpo_gym.benchmark_set import BenchmarkSet + +from hpobench.abstract_benchmark import AbstractBenchmark, AbstractMultiObjectiveBenchmark + +__version__ = '0.0.1' + +logger = logging.getLogger('YAHPO-Raw') + + +class YAHPOGymMORawBenchmark(AbstractMultiObjectiveBenchmark): + def __init__(self, scenario: str, instance: str, + rng: Union[np.random.RandomState, int, None] = None, + data_dir: Union[Path, str, None] = None): + """ + Parameters + ---------- + scenario : str + Name for the learner. Must be one of [ + "rbv2_ranger", "rbv2_rpart", "rbv2_glmnet", "rbv2_xgboost", "rbv2_svm", "rbv2_aknn", "rbv2_super", + "iaml_ranger", "iaml_rpart", "iaml_glmnet", "iaml_xgboost" + ] + instance : str + A valid instance for the scenario. See `self.benchset.instances`. + https://slds-lmu.github.io/yahpo_gym/scenarios.html#instances + rng : np.random.RandomState, int, None + """ + + assert scenario.startswith('rbv2_') or scenario.startswith('iaml_'), \ + 'Currently, we only support the experiments with rbv2_ and iaml from yahpo. ' \ + f'The scenario has to start with either rbv2_ or iaml_, but was {scenario}' + + from hpobench.util.data_manager import YAHPODataManager + self.data_manager = YAHPODataManager(data_dir=data_dir) + self.data_manager.load() + + self.scenario = scenario + self.instance = instance + self.benchset = BenchmarkSet(scenario, active_session=True) + self.benchset.set_instance(instance) + + logger.info(f'Start Benchmark for scenario {scenario} and instance {instance}') + super(YAHPOGymMORawBenchmark, self).__init__(rng=rng) + + # pylint: disable=arguments-differ + def get_configuration_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + return self.benchset.get_opt_space(drop_fidelity_params=True, seed=seed) + + # pylint: disable=arguments-differ + def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + return self.benchset.get_fidelity_space(seed=seed) + + @AbstractMultiObjectiveBenchmark.check_parameters + def objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: + + # Cast python dict to R list: + parameters = {**configuration, **fidelity} + r_list = YAHPOGymMORawBenchmark._cast_dict_to_rlist(parameters) + + # Call the random bot evaluation method + if self.scenario.startswith('rbv2_'): + + # Establish a connection to the R package + rbv2pkg = importr('rbv2') + + learner = self.scenario.replace('rbv2_', 'classif.') + r_out = rbv2pkg.eval_config( + learner=learner, task_id=int(configuration['task_id']), configuration=r_list + ) + # Extract the run data frame via replications and cast the R list (result) back to a python dictionary + result_r_df = r_out[0][0][0][4] + result_dict = YAHPOGymMORawBenchmark._cast_to_dict(result_r_df) + result_df = pd.DataFrame(result_dict) + result = result_df.mean(axis=0) + result = result.to_dict() + time_cols = [col for col in result_df.columns if 'time' in col] + times = {col: result_df.loc[:, col].sum() for col in time_cols} + 
result.update(times) + + elif self.scenario.startswith('iaml_'): + + iaml = importr('iaml') + out = iaml.eval_yahpo(scenario=robjects.StrVector([self.scenario]), configuration=r_list) + result = YAHPOGymMORawBenchmark._cast_to_dict(out) + + elif self.scenario.startswith('fair_'): + + fair_pkg = importr('fair') + out = fair_pkg.eval_yahpo(scenario=robjects.StrVector([self.scenario]), configuration=r_list) + result = YAHPOGymMORawBenchmark._cast_to_dict(out) + + else: + raise NotImplementedError() + + objectives = {target: value for target, value in result.items() if target in self.benchset.config.y_names} + additional = {target: value for target, value in result.items() if target not in self.benchset.config.y_names} + + return { + 'function_value': objectives, + 'cost': result['timetrain'], + 'info': {'fidelity': fidelity, 'additional_info': additional} + } + + @AbstractMultiObjectiveBenchmark.check_parameters + def objective_function_test(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: + return self.objective_function(configuration, fidelity=fidelity, rng=rng) + + @staticmethod + def get_meta_information(): + """ Returns the meta information for the benchmark """ + return {'name': 'YAHPO Gym', + 'references': ['@misc{pfisterer2021yahpo,', + 'title={YAHPO Gym -- Design Criteria and a new Multifidelity Benchmark ' + ' for Hyperparameter Optimization},', + 'author={Florian Pfisterer and Lennart Schneider and Julia Moosbauer ' + ' and Martin Binder and Bernd Bischl},', + 'eprint={2109.03670},', + 'archivePrefix={arXiv},', + 'year={2021}}'], + 'code': ['https://github.com/pfistfl/yahpo_gym/yahpo_gym', + 'https://github.com/pfistfl/rbv2/', + 'https://github.com/sumny/iaml', + 'https://github.com/sumny/fair'] + } + + # pylint: disable=arguments-differ + def get_objective_names(self) -> List[str]: + return self.benchset.config.y_names + + @staticmethod + def _cast_dict_to_rlist(py_dict): + """ Convert a python dictionary to a RPy2 ListVector""" + pairs = [f'{key} = {value}' if not isinstance(value, str) else f'{key} = \"{value}\"' + for key, value in py_dict.items()] + pairs = ",".join(pairs) + str_list = f"list({pairs})" + r_list = robjects.r(str_list) + return r_list + + @staticmethod + def _cast_to_dict(r_list_object) -> Dict: + """ + Convert an RPy2 ListVector to a Python dict. + Source: https://ogeek.cn/qa/?qa=815151/ + """ + result = {} + for i, name in enumerate(r_list_object.names): + if isinstance(r_list_object[i], robjects.ListVector): + result[name] = YAHPOGymMORawBenchmark._cast_to_dict(r_list_object[i]) + elif len(r_list_object[i]) == 1: + result[name] = r_list_object[i][0] + else: + result[name] = r_list_object[i] + return result + + +class YAHPOGymRawBenchmark(AbstractBenchmark): + def __init__(self, scenario: str, instance: str, objective: str = None, + rng: Union[np.random.RandomState, int, None] = None): + """ + Parameters + ---------- + scenario : str + Name for the surrogate data. Must be one of ["lcbench", "fcnet", "nb301", "rbv2_svm", + "rbv2_ranger", "rbv2_rpart", "rbv2_glmnet", "rbv2_aknn", "rbv2_xgboost", "rbv2_super"] + instance : str + A valid instance for the scenario. See `self.benchset.instances`. + https://slds-lmu.github.io/yahpo_gym/scenarios.html#instances + objective : str + Name of the (single-crit) objective. See `self.benchset.config.y_names`. + Initialized to None, picks the first element in y_names. 
+ rng : np.random.RandomState, int, None + """ + self.backbone = YAHPOGymMORawBenchmark(scenario=scenario, instance=instance, rng=rng) + self.objective = objective + super(YAHPOGymRawBenchmark, self).__init__(rng=rng) + + @AbstractBenchmark.check_parameters + def objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, CS.Configuration, None] = None, + rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: + + mo_results = self.backbone.objective_function(configuration=configuration, + fidelity=fidelity, + **kwargs) + + # If not objective is set, we just grab the first returned entry. + if self.objective is None: + self.objective = self.backbone.benchset.config.y_names[0] + + obj_value = mo_results['function_value'][self.objective] + + return {'function_value': obj_value, + "cost": mo_results['cost'], + 'info': {'fidelity': fidelity, + 'additional_info': mo_results['info']['additional_info'], + 'objectives': mo_results['function_value']}} + + @AbstractBenchmark.check_parameters + def objective_function_test(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, CS.Configuration, None] = None, + rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: + return self.objective_function(configuration, fidelity=fidelity, rng=rng) + + # pylint: disable=arguments-differ + def get_configuration_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + return self.backbone.get_configuration_space(seed=seed) + + # pylint: disable=arguments-differ + def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + return self.backbone.get_fidelity_space(seed=seed) + + @staticmethod + def get_meta_information() -> Dict: + return YAHPOGymMORawBenchmark.get_meta_information() diff --git a/hpobench/benchmarks/mo/__init__.py b/hpobench/benchmarks/mo/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/hpobench/benchmarks/mo/adult_benchmark.py b/hpobench/benchmarks/mo/adult_benchmark.py new file mode 100644 index 00000000..30631cae --- /dev/null +++ b/hpobench/benchmarks/mo/adult_benchmark.py @@ -0,0 +1,447 @@ +""" +Changelog: +========== +0.0.2: +* Change the objective value from accuracy to misclassification rate. (1 - accuracy) + +0.0.1: +* First implementation of the Multi-Objective Fair Adult Benchmark. +""" +import logging +import time +from typing import Union, Dict, List, Any, Tuple + +import ConfigSpace as CS +import numpy as np +from ConfigSpace.conditions import GreaterThanCondition +from sklearn.metrics import accuracy_score +from sklearn.neural_network import MLPClassifier + +import hpobench.util.rng_helper as rng_helper +from hpobench.abstract_benchmark import AbstractMultiObjectiveBenchmark +from hpobench.dependencies.mo.fairness_metrics import fairness_risk, STATISTICAL_DISPARITY, UNEQUALIZED_ODDS, \ + UNEQUAL_OPPORTUNITY +from hpobench.dependencies.mo.scalar import get_fitted_scaler +from hpobench.util.data_manager import AdultDataManager + +__version__ = '0.0.1' + +logger = logging.getLogger('ADULT_FAIR') + + +class AdultBenchmark(AbstractMultiObjectiveBenchmark): + def __init__(self, + rng: Union[np.random.RandomState, int, None] = None, **kwargs): + """ + Multi-objective fairness HPO task. Optimize the HP of a NN on the adult data set. + + Parameters + ---------- + rng : np.random.RandomState, int, None + Random seed for the benchmark's random state. 
+ """ + super(AdultBenchmark, self).__init__(rng=rng, **kwargs) + + data_manager = AdultDataManager() + self.X_train, self.y_train, self.X_valid, self.y_valid, self.X_test, self.y_test = data_manager.load() + self.output_class = np.unique(self.y_train) + self.feature_names = data_manager.feature_names + self.sensitive_feature = data_manager.sensitive_names + + @staticmethod + def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ + Creates a ConfigSpace.ConfigurationSpace containing all parameters for the MLP. + + Parameters + ---------- + seed : int, None + Fixing the seed for the ConfigSpace.ConfigurationSpace + + Returns + ------- + ConfigSpace.ConfigurationSpace + """ + cs = CS.ConfigurationSpace(seed=seed) + + cs.add_hyperparameters([ + CS.UniformIntegerHyperparameter('n_fc_layers', default_value=3, lower=1, upper=4, log=False), + CS.UniformIntegerHyperparameter('fc_layer_0', default_value=16, lower=2, upper=32, log=True), + CS.UniformIntegerHyperparameter('fc_layer_1', default_value=16, lower=2, upper=32, log=True), + CS.UniformIntegerHyperparameter('fc_layer_2', default_value=16, lower=2, upper=32, log=True), + CS.UniformIntegerHyperparameter('fc_layer_3', default_value=16, lower=2, upper=32, log=True), + CS.UniformFloatHyperparameter('alpha', lower=10**-5, upper=10**-1, default_value=10**-2, log=True), + CS.UniformFloatHyperparameter('learning_rate_init', lower=10**-5, upper=1, default_value=10**-3, log=True), + CS.UniformFloatHyperparameter('beta_1', lower=10**-3, upper=0.99, default_value=10**-3, log=True), + CS.UniformFloatHyperparameter('beta_2', lower=10**-3, upper=0.99, default_value=10**-3, log=True), + CS.UniformFloatHyperparameter('tol', lower=10**-5, upper=10**-2, default_value=10**-3, log=True), + ]) + + cs.add_conditions([ + # Add the fc_layer_1 (2nd layer) if we allow more than 1 `n_fc_layers`, and so on... + GreaterThanCondition(cs.get_hyperparameter('fc_layer_1'), cs.get_hyperparameter('n_fc_layers'), 1), + GreaterThanCondition(cs.get_hyperparameter('fc_layer_2'), cs.get_hyperparameter('n_fc_layers'), 2), + GreaterThanCondition(cs.get_hyperparameter('fc_layer_3'), cs.get_hyperparameter('n_fc_layers'), 3), + ]) + return cs + + @staticmethod + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ + Creates a ConfigSpace.ConfigurationSpace containing all fidelity parameters. + + Fidelities + ---------- + budget: int - Values: [1, 200] + Number of epochs an architecture was trained. + Note: the number of epoch is 1 indexed! 
(Results after the first epoch: epoch = 1) + + Parameters + ---------- + seed : int, None + Fixing the seed for the ConfigSpace.ConfigurationSpace + + Returns + ------- + ConfigSpace.ConfigurationSpace + """ + fidelity_space = CS.ConfigurationSpace(seed=seed) + fidelity_space.add_hyperparameter( + CS.UniformIntegerHyperparameter( + 'budget', lower=1, upper=200, default_value=200, log=False + ) + ) + return fidelity_space + + @staticmethod + def get_meta_information() -> Dict: + """ Returns the meta information for the benchmark """ + return { + 'name': 'Multi-objective Asynchronous Successive Halving', + 'references': + ['@article{schmucker2021multi,' + 'title={Multi-objective Asynchronous Successive Halving},' + 'author={Schmucker, Robin and Donini, Michele and Zafar, Muhammad Bilal and Salinas,' + ' David and Archambeau, C{\'e}dric},' + 'journal={arXiv preprint arXiv:2106.12639},' + 'year={2021}']} + + @staticmethod + def get_objective_names() -> List[str]: + """Get a list of objectives evaluated in the objective_function. """ + return ['misclassification_rate', 'DSP', 'DEO', 'DFP'] + + @AbstractMultiObjectiveBenchmark.check_parameters + def objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, CS.Configuration, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + shuffle: bool = False, + **kwargs) -> Dict: + """ + Objective function for the multi-objective adult benchmark. + + We train a NN and evaluate its performance using fairness metrics. + This function returns the performance on the validation set. + However, we report also train and test performance. + + Parameters + ---------- + configuration: Dict, CS.Configuration + Configuration for the MLP model. + fidelity: Dict, None + budget: int - Values: [1, 200] + Number of epochs an architecture was trained. + Note: the number of epoch is 1 indexed! (Results after the first epoch: epoch = 1) + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. + To prevent overfitting on a single seed, it is possible to pass a + parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. + If this parameter is not given, the default random state is used. + shuffle: bool, None + If ``True``, shuffle the training idx. If no parameter ``rng`` is given, use the class random state. + Defaults to ``False``. + kwargs + + Returns + ------- + Dict - + function_value : Dict - validation metrics after training on train + misclassification_rate: float: 1 - validation accuracy + DSO: float + DEO: float + DFP: float + cost : time to train the network + info : Dict + train_accuracy : float + valid_accuracy : float + test_accuracy : float + training_cost : float - time to train the network. 
see `training_cost` + total_cost : float - elapsed time for the entire obj_func call, + eval_train_cost : float - time to compute metrics on training split + eval_valid_cost : float - time to compute metrics on validation split + eval_test_cost : float - time to compute metrics on test split + train_DSO : float + train_DEO : float + train_DFP : float + valid_DSO : float + valid_DEO : float + valid_DFP : float + test_DSO : float + test_DEO : float + test_DFP : float + fidelity : int + """ + self.rng = rng_helper.get_rng(rng=rng, self_rng=self.rng) + if shuffle: + self._shuffle_data(rng=self.rng, shuffle_valid=False) + + ts_start = time.time() + + budget = fidelity['budget'] + logger.debug(f"budget for evaluation of config:{budget}") + logger.debug(f"config for evaluation:{configuration}") + + sensitive_rows_train = self.X_train[:, self.feature_names.index(self.sensitive_feature)] + sensitive_rows_val = self.X_valid[:, self.feature_names.index(self.sensitive_feature)] + sensitive_rows_test = self.X_test[:, self.feature_names.index(self.sensitive_feature)] + + X_train, X_valid, X_test = self.X_train.copy(), self.X_valid.copy(), self.X_test.copy() + + # Normalize data + scaler = get_fitted_scaler(X_train, "Standard") + if scaler is not None: + X_train = scaler(X_train) + X_valid = scaler(X_valid) + X_test = scaler(X_test) + + # Create model. The parameters fc_layer_1-3 might not be included in the search space. + hidden = [configuration['fc_layer_0'], + configuration.get('fc_layer_1', None), + configuration.get('fc_layer_2', None), + configuration.get('fc_layer_3', None)][:configuration['n_fc_layers']] + + for item in ['fc_layer_0', 'fc_layer_1', 'fc_layer_2', 'fc_layer_3', 'n_fc_layers']: + if item in configuration: + configuration.pop(item) + + # We deviate here from the original implementation. They have called `budget`-times mlp.partial_fit(). + # We call `.fit()` due to efficiency aspects. 
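        # Illustrative sketch (an assumption for orientation, not code from this change): the
        # per-epoch variant mentioned above would look roughly like the following, trading the
        # single efficient `.fit()` call for epoch-wise control. All names are taken from the
        # surrounding code; only the loop structure is assumed.
        #
        #     mlp = MLPClassifier(**configuration, hidden_layer_sizes=hidden, shuffle=shuffle,
        #                         random_state=self.rng)
        #     for _ in range(budget):
        #         mlp.partial_fit(X_train, self.y_train, classes=self.output_class)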
+ mlp = MLPClassifier(**configuration, hidden_layer_sizes=hidden, shuffle=shuffle, + random_state=self.rng, max_iter=budget) + + mlp.fit(X_train, self.y_train) + training_cost = time.time() - ts_start + + train_accuracy, train_statistical_disparity, train_unequal_opportunity, train_unequalized_odds, \ + eval_train_runtime = \ + AdultBenchmark._compute_metrics_on_split(X_train, self.y_train, sensitive_rows_train, mlp) + + val_accuracy, val_statistical_disparity, val_unequal_opportunity, val_unequalized_odds, eval_valid_runtime = \ + AdultBenchmark._compute_metrics_on_split(X_valid, self.y_valid, sensitive_rows_val, mlp) + + test_accuracy, test_statistical_disparity, test_unequal_opportunity, test_unequalized_odds, eval_test_runtime =\ + AdultBenchmark._compute_metrics_on_split(X_test, self.y_test, sensitive_rows_test, mlp) + + logger.debug(f"config: {configuration}, val_acc: {val_accuracy}, test_score: {test_accuracy}, " + f"train score: {train_accuracy}, dsp: {val_statistical_disparity}, " + f"deo :{val_unequal_opportunity}, dfp :{val_unequalized_odds}") + + elapsed_time = time.time() - ts_start + + return {'function_value': {'misclassification_rate': 1 - float(val_accuracy), + 'DSO': float(val_statistical_disparity), + 'DEO': float(val_unequal_opportunity), + 'DFP': float(val_unequalized_odds) + }, + 'cost': training_cost, + 'info': {'train_accuracy': float(train_accuracy), + 'valid_accuracy': float(val_accuracy), + 'test_accuracy': float(test_accuracy), + 'training_cost': training_cost, + 'total_cost': elapsed_time, + 'eval_train_cost': eval_train_runtime, + 'eval_valid_cost': eval_valid_runtime, + 'eval_test_cost': eval_test_runtime, + 'train_DSO': float(train_statistical_disparity), + 'train_DEO': float(train_unequal_opportunity), + 'train_DFP': float(train_unequalized_odds), + 'valid_DSO': float(val_statistical_disparity), + 'valid_DEO': float(val_unequal_opportunity), + 'valid_DFP': float(val_unequalized_odds), + 'test_DSO': float(test_statistical_disparity), + 'test_DEO': float(test_unequal_opportunity), + 'test_DFP': float(test_unequalized_odds), + 'fidelity': budget + } + } + + @AbstractMultiObjectiveBenchmark.check_parameters + def objective_function_test(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, CS.Configuration, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + shuffle: Union[bool, None] = False, + **kwargs) -> Dict: + """ + Objective function for the multi-objective adult benchmark. + + We train a NN and evaluate its performance using fairness metrics. + This function returns the performance on the test set. + + Parameters + ---------- + configuration: Dict, CS.Configuration + Configuration for the MLP model. + Use default configuration if None. + fidelity: Dict, CS.Configuration, None + epoch: int - Values: [1, 200] + Number of epochs an architecture was trained. + Note: the number of epoch is 1 indexed! (Results after the first epoch: epoch = 1) + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. + To prevent overfitting on a single seed, it is possible to pass a + parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. + If this parameter is not given, the default random state is used. + shuffle: bool, None + If ``True``, shuffle the training idx. If no parameter ``rng`` is given, use the class random state. + Defaults to ``False``. 
+ kwargs + + Returns + ------- + Dict - + function_value : Dict - test metrics reported after training on (train+valid) + misclassification_rate: float: 1 - test accuracy + DSO: float + DEO: float + DFP: float + cost : float - time to train the network. see `training_cost` + info : Dict + train_accuracy : float + test_accuracy : float + training_cost : float + total_cost : float - elapsed time for the entire obj_func_test call, + eval_train_cost : float - time to compute metrics on training split + eval_test_cost : float - time to compute metrics on test split + train_DSO : float + train_DEO : float + train_DFP : float + test_DSO : float + test_DEO : float + test_DFP : float + fidelity : int + """ + self.rng = rng_helper.get_rng(rng=rng, self_rng=self.rng) + + if shuffle: + self._shuffle_data(self.rng, shuffle_valid=True) + + ts_start = time.time() + + budget = fidelity['budget'] + + X_train, X_valid, X_test = self.X_train.copy(), self.X_valid.copy(), self.X_test.copy() + X_train = np.vstack((X_train, X_valid)) + y_train = np.vstack((self.y_train[:, np.newaxis], self.y_valid[:, np.newaxis])).ravel() + + sensitive_rows_train = X_train[:, self.feature_names.index(self.sensitive_feature)] + sensitive_rows_test = X_test[:, self.feature_names.index(self.sensitive_feature)] + + # Normalize data + scaler = get_fitted_scaler(X_train, "Standard") + if scaler is not None: + X_train = scaler(X_train) + X_test = scaler(X_test) + + # Create model. The parameters fc_layer_1-3 might not be included in the search space. + hidden = [configuration['fc_layer_0'], + configuration.get('fc_layer_1', None), + configuration.get('fc_layer_2', None), + configuration.get('fc_layer_3', None)][:configuration['n_fc_layers']] + + for item in ['fc_layer_0', 'fc_layer_1', 'fc_layer_2', 'fc_layer_3', 'n_fc_layers']: + if item in configuration: + configuration.pop(item) + + # We deviate here from the original implementation. They have called `budget`-times mlp.partial_fit(). + # We call `.fit()` due to efficiency aspects. 
+ mlp = MLPClassifier(**configuration, hidden_layer_sizes=hidden, shuffle=shuffle, + random_state=rng, max_iter=budget) + mlp.fit(X_train, y_train) + training_cost = time.time() - ts_start + + train_accuracy, train_statistical_disparity, train_unequal_opportunity, train_unequalized_odds, \ + eval_train_runtime = \ + AdultBenchmark._compute_metrics_on_split(X_train, y_train, sensitive_rows_train, mlp) + + test_accuracy, test_statistical_disparity, test_unequal_opportunity, test_unequalized_odds, eval_test_runtime =\ + AdultBenchmark._compute_metrics_on_split(X_test, self.y_test, sensitive_rows_test, mlp) + + elapsed_time = time.time() - ts_start + + logger.debug(f"config:{configuration}, test_score: {test_accuracy}, train score:{train_accuracy}," + f"dsp:{test_statistical_disparity}, deo :{test_unequal_opportunity}, dfp :{test_unequalized_odds}") + + return {'function_value': {'misclassification_rate': 1 - float(test_accuracy), + 'DSO': float(test_statistical_disparity), + 'DEO': float(test_unequal_opportunity), + 'DFP': float(test_unequalized_odds) + }, + 'cost': training_cost, + 'info': {'train_accuracy': float(train_accuracy), + 'test_accuracy': float(test_accuracy), + 'training_cost': training_cost, + 'total_cost': elapsed_time, + 'eval_train_cost': eval_train_runtime, + 'eval_test_cost': eval_test_runtime, + 'train_DSO': float(train_statistical_disparity), + 'train_DEO': float(train_unequal_opportunity), + 'train_DFP': float(train_unequalized_odds), + 'test_DSO': float(test_statistical_disparity), + 'test_DEO': float(test_unequal_opportunity), + 'test_DFP': float(test_unequalized_odds), + 'fidelity': budget + } + } + + @staticmethod + def _compute_metrics_on_split( + x_split: np.ndarray, y_split: np.ndarray, sensitive_rows: Any, mlp: Any + ) -> Tuple: + + start = time.time() + _y_pred = mlp.predict(x_split) + accuracy = accuracy_score(y_split, _y_pred) + statistical_disparity = fairness_risk(x_split, y_split, sensitive_rows, mlp, STATISTICAL_DISPARITY) + unequal_opportunity = fairness_risk(x_split, y_split, sensitive_rows, mlp, UNEQUAL_OPPORTUNITY) + unequalized_odds = fairness_risk(x_split, y_split, sensitive_rows, mlp, UNEQUALIZED_ODDS) + runtime = time.time() - start + return accuracy, statistical_disparity, unequal_opportunity, unequalized_odds, runtime + + def _shuffle_data(self, rng=None, shuffle_valid=False) -> None: + """ + Reshuffle the training data. + + Parameters + ---------- + rng + If 'rng' is None, the training idx are shuffled according to the class-random-state + shuffle_valid: bool, None + If true, shuffle the validation data. Defaults to False. + """ + random_state = rng_helper.get_rng(rng, self.rng) + + train_idx = np.arange(len(self.X_train)) + random_state.shuffle(train_idx) + self.X_train = self.X_train[train_idx] + self.y_train = self.y_train[train_idx] + + if shuffle_valid: + valid_idx = np.arange(len(self.X_valid)) + random_state.shuffle(valid_idx) + self.X_valid = self.X_valid[valid_idx] + self.y_valid = self.y_valid[valid_idx] + + +__all__ = ['AdultBenchmark'] diff --git a/hpobench/benchmarks/mo/cnn_benchmark.py b/hpobench/benchmarks/mo/cnn_benchmark.py new file mode 100644 index 00000000..516b459a --- /dev/null +++ b/hpobench/benchmarks/mo/cnn_benchmark.py @@ -0,0 +1,578 @@ +""" +Changelog: +========== +0.0.2: +* Rename the returned function value + 'negative_accuracy' -> 'misclassification_rate' + +0.0.1: +* First implementation of the Multi-Objective CNN Benchmark. 
+""" +import logging +import random +import time +from typing import Union, Dict, List, Tuple, Any + +import ConfigSpace as CS +import numpy as np +import torch +import torch.nn as nn +import tqdm +from ConfigSpace.conditions import GreaterThanCondition +from torch.utils.data import TensorDataset, DataLoader + +import hpobench.util.rng_helper as rng_helper +from hpobench.abstract_benchmark import AbstractMultiObjectiveBenchmark +from hpobench.util.data_manager import CNNDataManager + +__version__ = '0.0.2' + +logger = logging.getLogger('MO_CNN') + + +class AccuracyTop1: + + def __init__(self): + self.reset() + + self.sum = 0 + self.cnt = 0 + + def reset(self): + self.sum = 0 + self.cnt = 0 + + def __call__(self, y_true: torch.Tensor, y_pred: torch.Tensor) -> float: + self.sum += y_pred.topk(1)[1].eq(y_true.argmax(-1).reshape(-1, 1).expand(-1, 1)).float().sum().to('cpu').numpy() + self.cnt += y_pred.size(0) + return self.sum / self.cnt + + +class Net(nn.Module): + """ + The model to optimize + """ + + def __init__(self, config: Dict, input_shape: Tuple = (3, 28, 28), + num_classes: Union[int, None] = 10): + super(Net, self).__init__() + inp_ch = input_shape[0] + layers = [] + for i in range(config['n_conv_layers']): + out_ch = config['conv_layer_{}'.format(i)] + ks = config['kernel_size'] + layers.append(nn.Conv2d(inp_ch, out_ch, kernel_size=ks, padding=(ks - 1) // 2)) + layers.append(nn.ReLU()) + if config['batch_norm']: + layers.append(nn.BatchNorm2d(out_ch)) + layers.append(nn.MaxPool2d(kernel_size=2, stride=2)) + inp_ch = out_ch + + self.conv_layers = nn.Sequential(*layers) + self.pooling = nn.AdaptiveAvgPool2d(1) if config['global_avg_pooling'] else nn.Identity() + self.output_size = num_classes + + self.fc_layers = nn.ModuleList() + + inp_n = self._get_conv_output(input_shape) + + layers = [nn.Flatten()] + for i in range(config['n_fc_layers']): + out_n = config['fc_layer_{}'.format(i)] + + layers.append(nn.Linear(inp_n, out_n)) + layers.append(nn.ReLU()) + + inp_n = out_n + + layers.append(nn.Linear(inp_n, num_classes)) + self.fc_layers = nn.Sequential(*layers) + + # generate input sample and forward to get shape + def _get_conv_output(self, shape: Tuple) -> int: + bs = 1 + input = torch.autograd.Variable(torch.rand(bs, *shape)) + output_feat = self.conv_layers(input) + output_feat = self.pooling(output_feat) + n_size = output_feat.data.view(bs, -1).size(1) + return n_size + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.conv_layers(x) + x = self.pooling(x) + x = self.fc_layers(x) + return x + + def train_fn(self, optimizer: torch.optim.Optimizer, criterion: Any, loader: DataLoader, device: torch.device): + """ + Training method + + Parameters + ---------- + optimizer + optimization algorithm + criterion + loss function + loader + data loader for either training or testing set + device + Either CPU or GPU + Returns + ------- + accuracy on the data + """ + accuracy = AccuracyTop1() + self.train() + + acc = 0 + for images, labels in loader: + images = images.to(device) + labels = labels.to(device) + + optimizer.zero_grad() + logits = self(images) + + loss = criterion(logits, labels.argmax(-1)) + loss.backward() + optimizer.step() + + acc = accuracy(labels, logits) + + return acc + + def eval_fn(self, loader: DataLoader, device: torch.device): + """ + Evaluation method + + Parameters + ---------- + loader: + data loader for either training or testing set + device: + torch device + + Returns + ------- + accuracy on the data + """ + accuracy = AccuracyTop1() + self.eval() 
+ + acc = 0 + with torch.no_grad(): # no gradient needed + for images, labels in loader: + images = images.to(device) + labels = labels.to(device) + + outputs = self(images) + acc = accuracy(labels, outputs) + + return acc + + +class CNNBenchmark(AbstractMultiObjectiveBenchmark): + def __init__(self, dataset: str, + rng: Union[np.random.RandomState, int, None] = None, **kwargs): + """ + Parameters + ---------- + dataset : str + One of fashion, flower. + rng : np.random.RandomState, int, None + Random seed for the benchmark's random state. + """ + + super(CNNBenchmark, self).__init__(rng=rng) + allowed_datasets = ["fashion", "flower"] + assert dataset in allowed_datasets, f'Requested data set is not supported. Must be one of ' \ + f'{", ".join(allowed_datasets)}, but was {dataset}' + logger.info(f'Start Benchmark on dataset {dataset}') + + self.dataset = dataset + self.__seed_everything() + + # Dataset loading + data_manager = CNNDataManager(dataset=self.dataset) + self.X_train, self.y_train, self.X_valid, self.y_valid, self.X_test, self.y_test = data_manager.load() + + self.output_classes = self.y_train.shape[1] + self.input_shape = self.X_train.shape[1:4] + + @staticmethod + def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ + Creates a ConfigSpace.ConfigurationSpace containing all parameters for + the CNN model. + + Parameters + ---------- + seed : int, None + Fixing the seed for the ConfigSpace.ConfigurationSpace + + Returns + ------- + ConfigSpace.ConfigurationSpace + """ + cs = CS.ConfigurationSpace(seed=seed) + cs.add_hyperparameters([ + CS.UniformIntegerHyperparameter('n_conv_layers', default_value=3, lower=1, upper=3, log=False), + CS.UniformIntegerHyperparameter('n_fc_layers', default_value=3, lower=1, upper=3, log=False), + CS.UniformIntegerHyperparameter('conv_layer_0', default_value=128, lower=16, upper=1024, log=True), + CS.UniformIntegerHyperparameter('conv_layer_1', default_value=128, lower=16, upper=1024, log=True), + CS.UniformIntegerHyperparameter('conv_layer_2', default_value=128, lower=16, upper=1024, log=True), + CS.UniformIntegerHyperparameter('fc_layer_0', default_value=32, lower=2, upper=512, log=True), + CS.UniformIntegerHyperparameter('fc_layer_1', default_value=32, lower=2, upper=512, log=True), + CS.UniformIntegerHyperparameter('fc_layer_2', default_value=32, lower=2, upper=512, log=True), + + CS.UniformIntegerHyperparameter('batch_size', lower=1, upper=512, default_value=128, log=True), + CS.UniformFloatHyperparameter('learning_rate_init', lower=10**-5, upper=1, default_value=10**-3, log=True), + CS.CategoricalHyperparameter('batch_norm', default_value=False, choices=[False, True]), + CS.CategoricalHyperparameter('global_avg_pooling', default_value=True, choices=[False, True]), + CS.CategoricalHyperparameter('kernel_size', default_value=5, choices=[7, 5, 3]) + ]) + + cs.add_conditions([ + # Add the conv_layer_1 (2nd layer) if we allow more than 1 (>1) `n_conv_layers`, and so on... 
+ GreaterThanCondition(cs.get_hyperparameter('conv_layer_1'), cs.get_hyperparameter('n_conv_layers'), 1), + GreaterThanCondition(cs.get_hyperparameter('conv_layer_2'), cs.get_hyperparameter('n_conv_layers'), 2), + GreaterThanCondition(cs.get_hyperparameter('fc_layer_1'), cs.get_hyperparameter('n_fc_layers'), 1), + GreaterThanCondition(cs.get_hyperparameter('fc_layer_2'), cs.get_hyperparameter('n_fc_layers'), 2), + ]) + + return cs + + @staticmethod + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ + Creates a ConfigSpace.ConfigurationSpace containing all fidelity parameters + + Fidelities + ---------- + budget: int - [1, 25] + Number of epochs to train + + Parameters + ---------- + seed : int, None + Fixing the seed for the ConfigSpace.ConfigurationSpace + + Returns + ------- + ConfigSpace.ConfigurationSpace + """ + fidelity_space = CS.ConfigurationSpace(seed=seed) + fidelity_space.add_hyperparameters([ + CS.UniformIntegerHyperparameter('budget', lower=1, upper=25, default_value=25, log=False) + ]) + return fidelity_space + + @staticmethod + def get_meta_information() -> Dict: + """ Returns the meta information for the benchmark """ + return { + 'name': 'Bag of baselines for multi-objective joint neural architecture search and ' + 'hyperparameter optimization', + 'references': ['@article{guerrero2021bag,' + 'title = {Bag of baselines for multi - objective joint neural architecture search and ' + 'hyperparameter optimization},' + 'author = {Guerrero-Viu, Julia and Hauns, Sven and Izquierdo, Sergio and Miotto, ' + 'Guilherme and Schrodi, Simon and Biedenkapp, Andre and Elsken, Thomas and Deng, ' + 'Difan and Lindauer, Marius and Hutter, Frank},},' + 'journal = {arXiv preprint arXiv:2105.01015},' + 'year = {2021}}', + ], + 'code': 'https://github.com/automl/multi-obj-baselines', + } + + @staticmethod + def get_objective_names() -> List[str]: + """Get the names of the objectives reported in the objective function.""" + return ['misclassification_rate', 'model_size'] + + def init_model(self, config: Union[CS.Configuration, Dict]) -> Net: + """ + Function that returns the model initialized based on the configuration and fidelity + """ + if isinstance(config, CS.Configuration): + config = config.get_dictionary() + return Net(config, self.input_shape, self.output_classes) + + def __seed_everything(self): + """Helperfunction: Make the benchmark deterministic by setting the correct seeds""" + seed = self.rng.randint(0, 100000) + logger.debug(f'Generate seed: {seed}') + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + torch.backends.cudnn.deterministic = True + + def _shuffle_data(self, rng=None, shuffle_valid=False) -> None: + """ + Reshuffle the training data. + + Parameters + ---------- + rng + If 'rng' is None, the training idx are shuffled according to the class-random-state + shuffle_valid: bool, None + If true, shuffle the validation data. Defaults to False. 
+ """ + random_state = rng_helper.get_rng(rng, self.rng) + + train_idx = np.arange(len(self.X_train)) + random_state.shuffle(train_idx) + self.X_train = self.X_train[train_idx] + self.y_train = self.y_train[train_idx] + + if shuffle_valid: + valid_idx = np.arange(len(self.X_valid)) + random_state.shuffle(valid_idx) + self.X_valid = self.X_valid[valid_idx] + self.y_valid = self.y_valid[valid_idx] + + @AbstractMultiObjectiveBenchmark.check_parameters + def objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, CS.Configuration, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + shuffle: bool = False, + **kwargs) -> Dict: + """ + Train a CNN on either the flower or the fashion data set and return the performance on the validation + data split. + + Parameters + ---------- + configuration : Dict, CS.Configuration + Configuration for the CNN Model + fidelity: Dict, CS.Configuration, None + epoch: int - Values: [1, 50] + Number of epochs an architecture was trained. + Note: the number of epoch is 1 indexed! (Results after the first epoch: epoch = 1) + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. + To prevent overfitting on a single seed, it is possible to pass a + parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. + If this parameter is not given, the default random state is used. + shuffle: bool, None + If ``True``, shuffle the training idx. If no parameter ``rng`` is given, use the class random state. + Defaults to ``False``. + kwargs + + Returns + ------- + Dict - + function_value : Dict + misclassification_rate: float + 1 - validation accuracy + log_model_size: float + log10 of the number of parameters + cost : time to train the network + info : Dict + train_accuracy : float, + training_cost : float, + valid_accuracy : float, + valid_cost : float, + test_accuracy : float, + test_cost : float, + model_size : int, + fidelity : Dict + used fidelities in this evaluation + """ + self.rng = rng_helper.get_rng(rng=rng, self_rng=self.rng) + self.__seed_everything() + + if shuffle: + self._shuffle_data(rng=self.rng, shuffle_valid=False) + + time_in = time.time() + + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + logger.info(f'We use the device: {device}') + + # initializing model + model = self.init_model(configuration).to(device) + epochs = fidelity['budget'] + + optimizer = torch.optim.Adam(model.parameters(), lr=configuration['learning_rate_init']) + criterion = torch.nn.CrossEntropyLoss() + + ds_train = TensorDataset(self.X_train, self.y_train) + ds_train = DataLoader(ds_train, batch_size=configuration['batch_size'], shuffle=True) + + ds_val = TensorDataset(self.X_valid, self.y_valid) + ds_val = DataLoader(ds_val, batch_size=configuration['batch_size'], shuffle=True) + + ds_test = TensorDataset(self.X_test, self.y_test) + ds_test = DataLoader(ds_test, batch_size=configuration['batch_size'], shuffle=True) + + start = time.time() + t = tqdm.tqdm(total=epochs) + + train_accuracy = 0 + for epoch in range(epochs): + train_accuracy = model.train_fn(optimizer, criterion, ds_train, device).item() + t.set_postfix(train_accuracy=train_accuracy) + t.update() + training_runtime = time.time() - start + + num_params = np.sum([p.numel() for p in model.parameters()]).item() + start = time.time() + val_accuracy = model.eval_fn(ds_val, device).item() + eval_valid_runtime = 
time.time() - start + start = time.time() + test_accuracy = model.eval_fn(ds_test, device).item() + eval_test_runtime = time.time() - start + + t.set_postfix( + train_acc=train_accuracy, + val_acc=val_accuracy, + tst_acc=test_accuracy, + len=np.log10(num_params), + train_runtime=training_runtime, + eval_valid_runtime=eval_valid_runtime, + eval_test_runtime=eval_test_runtime, + ) + t.close() + + elapsed_time = time.time() - time_in + + return {'function_value': {'misclassification_rate': 1 - val_accuracy, + 'log_model_size': float(np.log10(num_params))}, + 'cost': float(training_runtime), + 'info': {'train_accuracy': train_accuracy, + 'training_cost': training_runtime, + 'valid_accuracy': val_accuracy, + 'valid_cost': eval_valid_runtime, + 'test_accuracy': test_accuracy, + 'test_cost': eval_test_runtime, + 'total_time': elapsed_time, + 'model_size': num_params, + 'fidelity': fidelity} + } + + @AbstractMultiObjectiveBenchmark.check_parameters + def objective_function_test(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, CS.Configuration, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + shuffle: bool = False, + **kwargs) -> Dict: + """ + Train a CNN on both the train adn validation split of either the flower or the fashion data set and + get the test results. + Parameters + ---------- + configuration : Dict, CS.Configuration + Configuration for the CNN Model + fidelity: Dict, CS.Configuration, None + epoch: int - Values: [1, 50] + Number of epochs an architecture was trained. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. + To prevent overfitting on a single seed, it is possible to pass a + parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. + If this parameter is not given, the default random state is used. + shuffle: bool, None + If ``True``, shuffle the training idx. If no parameter ``rng`` is given, use the class random state. + Defaults to ``False``. 
+ kwargs + + Returns + ------- + Dict - + function_value : Dict + misclassification_rate: float + 1 - test accuracy + log_model_size: float + log10 of the number of parameters + cost : time to train the network + info : Dict + train_accuracy : float, + training_cost : float, + test_accuracy : float, + test_cost : float, + model_size : int, + fidelity : Dict + used fidelities in this evaluation + """ + + time_in = time.time() + + self.rng = rng_helper.get_rng(rng=rng, self_rng=self.rng) + self.__seed_everything() + + if shuffle: + self._shuffle_data(rng=self.rng, shuffle_valid=False) + + train_X = torch.vstack((self.X_train, self.X_valid)) + y_train = torch.cat((self.y_train, self.y_valid)) + + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + # initializing model + model = self.init_model(configuration).to(device) + epochs = fidelity['budget'] + + optimizer = torch.optim.Adam(model.parameters(), lr=configuration['learning_rate_init']) + criterion = torch.nn.CrossEntropyLoss() + + ds_train = TensorDataset(train_X, y_train) + ds_train = DataLoader(ds_train, batch_size=configuration['batch_size'], shuffle=True) + + ds_test = TensorDataset(self.X_test, self.y_test) + ds_test = DataLoader(ds_test, batch_size=configuration['batch_size'], shuffle=True) + + start = time.time() + t = tqdm.tqdm(total=epochs) + + train_accuracy = 0 + for epoch in range(epochs): + train_accuracy = model.train_fn(optimizer, criterion, ds_train, device).item() + t.set_postfix(train_accuracy=train_accuracy) + t.update() + training_runtime = time.time() - start + + num_params = np.sum([p.numel() for p in model.parameters()]) + start = time.time() + test_accuracy = model.eval_fn(ds_test, device).item() + eval_test_runtime = time.time() - start + + t.set_postfix( + train_acc=train_accuracy, + tst_acc=test_accuracy, + len=np.log10(num_params), + eval_train_runtime=training_runtime, + eval_test_runtime=eval_test_runtime, + + ) + t.close() + + elapsed_time = time.time() - time_in + + return {'function_value': {'misclassification_rate': 1 - test_accuracy, + 'log_model_size': float(np.log10(num_params))}, + 'cost': training_runtime, + 'info': {'train_accuracy': train_accuracy, + 'training_cost': training_runtime, + 'test_accuracy': test_accuracy, + 'test_cost': eval_test_runtime, + 'total_time': elapsed_time, + 'model_size': num_params, + 'fidelity': fidelity} + } + + +class FashionCNNBenchmark(CNNBenchmark): + + def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): + super(FashionCNNBenchmark, self).__init__(dataset='fashion', rng=rng, **kwargs) + + +class FlowerCNNBenchmark(CNNBenchmark): + + def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): + super(FlowerCNNBenchmark, self).__init__(dataset='flower', rng=rng, **kwargs) + + +__all__ = ["FashionCNNBenchmark", + "FlowerCNNBenchmark"] diff --git a/hpobench/benchmarks/nas/nasbench_101.py b/hpobench/benchmarks/nas/nasbench_101.py index f7ee1b20..c0f80737 100644 --- a/hpobench/benchmarks/nas/nasbench_101.py +++ b/hpobench/benchmarks/nas/nasbench_101.py @@ -42,6 +42,11 @@ Changelog: ========== +0.0.5 +* ADD Multi Objective version. Introduce objectives: + - misclassification_rate (0, 1) - lower is better + - trainable_parameters (0, 10**8) - lower is better + 0.0.4 * New container release due to a general change in the communication between container and HPOBench. 
Works with HPOBench >= v0.0.8 @@ -61,23 +66,22 @@ """ import logging - from pathlib import Path -from typing import Union, Dict, Any, Tuple, List +from typing import Union, Dict, Any, Tuple, List, Type import ConfigSpace as CS import numpy as np -from tabular_benchmarks.nas_cifar10 import NASCifar10 from nasbench import api from nasbench.api import OutOfDomainError from nasbench.lib import graph_util +from tabular_benchmarks.nas_cifar10 import NASCifar10 -from hpobench import config_file import hpobench.util.rng_helper as rng_helper -from hpobench.abstract_benchmark import AbstractBenchmark +from hpobench import config_file +from hpobench.abstract_benchmark import AbstractBenchmark, AbstractMultiObjectiveBenchmark from hpobench.util.data_manager import NASBench_101DataManager -__version__ = '0.0.4' +__version__ = '0.0.5' logger = logging.getLogger('NasBench101') MAX_EDGES = 9 @@ -85,17 +89,19 @@ DEFAULT_API_FILE = config_file.data_dir / "nasbench_101" -class NASCifar10BaseBenchmark(AbstractBenchmark): - def __init__(self, benchmark: NASCifar10, +class _NAS101BaseBenchmark: + def __init__(self, + benchmark_type: Type[NASCifar10], data_path: Union[Path, str, None] = None, - rng: Union[np.random.RandomState, int, None] = None, **kwargs): + rng: Union[np.random.RandomState, int, None] = None, + **kwargs): """ Baseclass for the tabular benchmarks https://github.com/automl/nas_benchmarks/tree/master/tabular_benchmarks. Please install the benchmark first. Place the data under ``data_path``. Parameters ---------- - benchmark : NASCifar10 + benchmark_type : Type[NASCifar10] Type of the benchmark to use. Don't call this class directly. Instantiate via subclasses (see below). data_path : str, Path, None Path to the folder, which contains the downloaded file nasbench_full.tfrecord. 
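The changelog entry above pins down the two objectives that the new multi-objective variants in this file report: `misclassification_rate`, bounded in [0, 1], and `trainable_parameters`, bounded in [0, 10**8], both to be minimized. As a minimal, hedged illustration of how a caller might compare two such `function_value` dictionaries, the sketch below implements a plain Pareto-dominance check; the helper `dominates` and the example numbers are made up for illustration and are not part of this patch.

```python
# Illustrative only: a Pareto-dominance check for the two minimized
# objectives named in the changelog above; not part of this patch.
from typing import Dict

OBJECTIVES = ('misclassification_rate', 'trainable_parameters')  # both: lower is better


def dominates(a: Dict[str, float], b: Dict[str, float]) -> bool:
    """True if `a` is no worse than `b` on every objective and strictly better on at least one."""
    no_worse = all(a[k] <= b[k] for k in OBJECTIVES)
    strictly_better = any(a[k] < b[k] for k in OBJECTIVES)
    return no_worse and strictly_better


# Hypothetical `function_value` dicts as returned by the MO benchmarks below.
small_net = {'misclassification_rate': 0.12, 'trainable_parameters': 2_000_000}
large_net = {'misclassification_rate': 0.12, 'trainable_parameters': 30_000_000}
print(dominates(small_net, large_net))  # True: same error, fewer parameters
```

Because invalid architectures are mapped to the upper bound of 10**8 parameters (see the failure handling further down in this file), they are dominated by any valid configuration with a lower error.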
@@ -103,21 +109,76 @@ def __init__(self, benchmark: NASCifar10, Random seed for the benchmarks """ - super(NASCifar10BaseBenchmark, self).__init__(rng=rng) - - self.benchmark = benchmark + data_path = self._try_download_api_file(data_path) self.data_path = data_path + self.rng = rng + self.benchmark: NASCifar10 = benchmark_type(data_dir=str(data_path), multi_fidelity=True) + super(_NAS101BaseBenchmark, self).__init__(rng=rng, **kwargs) def _query_benchmark(self, config: Dict, run_index: int, budget: int = 108) -> Dict: raise NotImplementedError - # pylint: disable=arguments-differ - @AbstractBenchmark.check_parameters - def objective_function(self, configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - run_index: Union[int, Tuple, None] = (0, 1, 2), - rng: Union[np.random.RandomState, int, None] = None, - **kwargs) -> Dict: + @staticmethod + def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + raise NotImplementedError + + @staticmethod + def get_meta_information() -> Dict: + """ Returns the meta information for the benchmark """ + return {'name': 'Tabular Benchmarks for Hyperparameter Optimization and Neural Architecture Search', + 'references': ['@article{klein2019tabular,' + 'title = {Tabular benchmarks for joint architecture and hyperparameter optimization},' + 'author = {Klein, Aaron and Hutter, Frank},' + 'journal = {arXiv preprint arXiv:1905.04970},' + 'year = {2019}}', + 'https://arxiv.org/abs/1905.04970', + ], + 'code': 'https://github.com/automl/nas_benchmarks', + } + + @staticmethod + def _get_configuration_space(benchmark: Any, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ Helper function to pass a seed to the configuration space """ + seed = seed if seed is not None else np.random.randint(1, 100000) + cs = benchmark.get_configuration_space() + cs.seed(seed) + return cs + + @staticmethod + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ + Creates a ConfigSpace.ConfigurationSpace containing all fidelity parameters for + the NAS Benchmark 101. + + Parameters + ---------- + seed : int, None + Fixing the seed for the ConfigSpace.ConfigurationSpace + + Returns + ------- + ConfigSpace.ConfigurationSpace + """ + seed = seed if seed is not None else np.random.randint(1, 100000) + fidel_space = CS.ConfigurationSpace(seed=seed) + + fidel_space.add_hyperparameters([ + CS.OrdinalHyperparameter('budget', sequence=[4, 12, 36, 108], default_value=108) + ]) + + return fidel_space + + @staticmethod + def _try_download_api_file(save_to: Union[Path, str, None]): + data_manager = NASBench_101DataManager(save_to) + data_manager.download() + return data_manager.save_dir + + def _mo_objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + run_index: Union[int, Tuple, None] = (0, 1, 2), + rng: Union[np.random.RandomState, int, None] = None, + **kwargs) -> Dict: """ Query the NAS-benchmark using a given configuration and a epoch (=budget). 
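The `get_fidelity_space` definition above restricts `budget` to the four epoch counts for which NAS-Bench-101 tabulates results, with 108 as the default. The stand-alone sketch below merely rebuilds that ordinal space with ConfigSpace to show what sampling from it yields; it mirrors the hunk above and is illustrative only.

```python
# Rebuild the fidelity space from the hunk above outside the benchmark class,
# only to show how it samples; illustrative, not part of this patch.
import ConfigSpace as CS

fidel_space = CS.ConfigurationSpace(seed=0)
fidel_space.add_hyperparameters([
    CS.OrdinalHyperparameter('budget', sequence=[4, 12, 36, 108], default_value=108)
])

print(fidel_space.get_default_configuration()['budget'])  # 108, the maximum budget
print([fidel_space.sample_configuration()['budget'] for _ in range(5)])
# every sampled value is one of 4, 12, 36, 108
```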
@@ -144,7 +205,12 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], Returns ------- Dict - - function_value : validation error + function_value : + misclassification_rate: float [0,1] (lower is better) + 1-accuracy on validation set + trainable_parameters: int [0, 10**8] (lower is better) + Number of trainable parameters in the network + cost : runtime info : Dict fidelity : used fidelities in this evaluation @@ -176,6 +242,7 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], test_accuracies = [] training_times = [] additional = {} + failure = False for run_id in run_index: data = self._query_benchmark(config=configuration, budget=fidelity['budget'], run_index=run_id) @@ -186,25 +253,31 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], training_times.append(data['training_time']) # Since those information are the same for all run ids, just store one of them. - additional = {'trainable_parameters': data['trainable_parameters'], + # Also, if the configuration is invalid, set the number of parameters to its upper limit. + trainable_parameters = data['trainable_parameters'] + failure = trainable_parameters == 0 + trainable_parameters = 10**8 if trainable_parameters == 0 else trainable_parameters + + additional = {'trainable_parameters': trainable_parameters, 'module_operations': data['module_operations']} - return {'function_value': float(1 - np.mean(valid_accuracies)), + return {'function_value': {'misclassification_rate': float(1 - np.mean(valid_accuracies)), + 'trainable_parameters': additional['trainable_parameters']}, 'cost': float(np.sum(training_times)), 'info': {'fidelity': fidelity, 'train_accuracies': train_accuracies, 'valid_accuracies': valid_accuracies, 'test_accuracies': test_accuracies, 'training_times': training_times, + 'failure': 1 if failure else 0, 'data': additional } } - @AbstractBenchmark.check_parameters - def objective_function_test(self, configuration: Union[Dict, CS.Configuration], - fidelity: Union[CS.Configuration, Dict, None] = None, - rng: Union[np.random.RandomState, int, None] = None, - **kwargs) -> Dict: + def _mo_objective_function_test(self, configuration: Union[Dict, CS.Configuration], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs) -> Dict: """ Validate a configuration on the maximum available budget. 
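With the restructured return value above, `function_value` is now a dictionary keyed by objective name instead of a single float. A hedged usage sketch follows, assuming the NAS-Bench-101 data file and the `nasbench`/`tabular_benchmarks` dependencies are installed locally; `NASCifar10AMOBenchmark` is one of the public classes registered at the end of this file.

```python
# Hedged usage sketch; assumes the NAS-Bench-101 data and its dependencies
# are available. `NASCifar10AMOBenchmark` is defined further down in this patch.
from hpobench.benchmarks.nas.nasbench_101 import NASCifar10AMOBenchmark

benchmark = NASCifar10AMOBenchmark()
config = benchmark.get_configuration_space(seed=1).sample_configuration()

# With no fidelity given, the default (maximum) budget of 108 epochs is used.
result = benchmark.objective_function(configuration=config)

objectives = result['function_value']          # both objectives are minimized
error = objectives['misclassification_rate']   # 1 - mean validation accuracy over the run_index seeds
n_params = objectives['trainable_parameters']  # 10**8 if the sampled architecture is invalid
cost = result['cost']                          # summed training time across the queried runs
failed = result['info']['failure']             # 1 for invalid architectures, 0 otherwise
```

The single-objective wrappers defined below return the same structure but flatten `function_value` back to the bare `misclassification_rate` float.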
@@ -222,83 +295,29 @@ def objective_function_test(self, configuration: Union[Dict, CS.Configuration], Returns ------- Dict - - function_value : test error + function_value : + misclassification_rate: float [0,1] (lower is better) + 1-accuracy on test set + trainable_parameters: int [0, 10**8] (lower is better) + Number of trainable parameters in the network + cost : runtime info : Dict fidelity : used fidelities in this evaluation """ - result = self.objective_function(configuration=configuration, fidelity=fidelity, run_index=(0, 1, 2), rng=rng) - result['function_value'] = float(1 - np.mean(result['info']['test_accuracies'])) + result = self._mo_objective_function( + configuration=configuration, fidelity=fidelity, run_index=(0, 1, 2), rng=rng + ) + result['function_value']['misclassification_rate'] = float(1 - np.mean(result['info']['test_accuracies'])) return result - @staticmethod - def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: - raise NotImplementedError - - @staticmethod - def get_meta_information() -> Dict: - """ Returns the meta information for the benchmark """ - return {'name': 'Tabular Benchmarks for Hyperparameter Optimization and Neural Architecture Search', - 'references': ['@article{klein2019tabular,' - 'title = {Tabular benchmarks for joint architecture and hyperparameter optimization},' - 'author = {Klein, Aaron and Hutter, Frank},' - 'journal = {arXiv preprint arXiv:1905.04970},' - 'year = {2019}}', - 'https://arxiv.org/abs/1905.04970', - ], - 'code': 'https://github.com/automl/nas_benchmarks', - } - - @staticmethod - def _get_configuration_space(benchmark: Any, seed: Union[int, None] = None) -> CS.ConfigurationSpace: - """ Helper function to pass a seed to the configuration space """ - seed = seed if seed is not None else np.random.randint(1, 100000) - cs = benchmark.get_configuration_space() - cs.seed(seed) - return cs - - @staticmethod - def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: - """ - Creates a ConfigSpace.ConfigurationSpace containing all fidelity parameters for - the NAS Benchmark 101. 
- - Parameters - ---------- - seed : int, None - Fixing the seed for the ConfigSpace.ConfigurationSpace - - Returns - ------- - ConfigSpace.ConfigurationSpace - """ - seed = seed if seed is not None else np.random.randint(1, 100000) - fidel_space = CS.ConfigurationSpace(seed=seed) - - fidel_space.add_hyperparameters([ - CS.OrdinalHyperparameter('budget', sequence=[4, 12, 36, 108], default_value=108) - ]) - - return fidel_space - - @staticmethod - def _try_download_api_file(save_to: Union[Path, str, None]): - data_manager = NASBench_101DataManager(save_to) - data_manager.download() - return data_manager.save_dir - - -class NASCifar10ABenchmark(NASCifar10BaseBenchmark): - def __init__(self, data_path: Union[Path, str, None] = None, - rng: Union[np.random.RandomState, int, None] = None, **kwargs): - - data_path = self._try_download_api_file(data_path) +class _QueryA(_NAS101BaseBenchmark): + def __init__(self, **kwargs): from tabular_benchmarks.nas_cifar10 import NASCifar10A - benchmark = NASCifar10A(data_dir=str(data_path), multi_fidelity=True) - super(NASCifar10ABenchmark, self).__init__(benchmark=benchmark, data_path=data_path, rng=rng, **kwargs) + super(_QueryA, self).__init__(benchmark_type=NASCifar10A) @staticmethod def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: @@ -315,7 +334,7 @@ def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSp """ from tabular_benchmarks.nas_cifar10 import NASCifar10A - return NASCifar10BBenchmark._get_configuration_space(NASCifar10A, seed) + return _NAS101BaseBenchmark._get_configuration_space(NASCifar10A, seed) def _query_benchmark(self, config: Dict, run_index: int, budget: int = 108) -> Dict: """ @@ -372,15 +391,10 @@ def _query_benchmark(self, config: Dict, run_index: int, budget: int = 108) -> D return data -class NASCifar10BBenchmark(NASCifar10BaseBenchmark): - def __init__(self, data_path: Union[Path, str, None] = None, - rng: Union[np.random.RandomState, int, None] = None, **kwargs): - - data_path = self._try_download_api_file(data_path) - +class _QueryB(_NAS101BaseBenchmark): + def __init__(self, **kwargs): from tabular_benchmarks.nas_cifar10 import NASCifar10B - benchmark = NASCifar10B(data_dir=str(data_path), multi_fidelity=True) - super(NASCifar10BBenchmark, self).__init__(benchmark=benchmark, data_path=data_path, rng=rng, **kwargs) + super(_QueryB, self).__init__(benchmark_type=NASCifar10B, **kwargs) @staticmethod def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: @@ -397,9 +411,10 @@ def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSp """ from tabular_benchmarks.nas_cifar10 import NASCifar10B - return NASCifar10BBenchmark._get_configuration_space(NASCifar10B, seed) + return _NAS101BaseBenchmark._get_configuration_space(NASCifar10B, seed) def _query_benchmark(self, config: Dict, run_index: int, budget: int = 108) -> Dict: + """ Copied from the 'objective_function' from nas_cifar10.py We adapted the file in such a way, that the complete result is returned. The original implementation returns @@ -408,6 +423,8 @@ def _query_benchmark(self, config: Dict, run_index: int, budget: int = 108) -> D Parameters ---------- config : Dict + run_index : int + Specifies the seed to use. Can be one of 0, 1, 2. budget : int The number of epochs. Must be one of: 4 12 36 108. Otherwise a accuracy of 0 is returned. 
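The `_Query*` classes above now cover only one axis of variation, namely which NAS-Bench-101 search-space encoding is queried; the hunks that follow add the second axis, the single- versus multi-objective interface, and the public benchmark classes combine both through cooperative multiple inheritance. The toy sketch below illustrates that pattern with made-up stand-in classes (none of these names exist in HPOBench) and shows why each `__init__` in the chain forwards `**kwargs` to `super().__init__`.

```python
# Toy illustration of the cooperative-inheritance layout used below;
# every class name here is an illustrative stand-in, not a real HPOBench class.

class BaseBenchmark:                       # stands in for _NAS101BaseBenchmark
    def __init__(self, benchmark_type, **kwargs):
        self.benchmark_type = benchmark_type
        super().__init__(**kwargs)         # keep walking the MRO

class QueryA(BaseBenchmark):               # stands in for _QueryA
    def __init__(self, **kwargs):
        super().__init__(benchmark_type='NASCifar10A', **kwargs)

class MOInterface:                         # stands in for the MO base class added below
    def __init__(self, rng=None, **kwargs):
        self.rng = rng
        super().__init__(**kwargs)

class CombinedMOBenchmark(QueryA, MOInterface):   # stands in for NASCifar10AMOBenchmark
    pass

b = CombinedMOBenchmark(rng=0)
print([cls.__name__ for cls in type(b).__mro__])
# ['CombinedMOBenchmark', 'QueryA', 'BaseBenchmark', 'MOInterface', 'object']
print(b.benchmark_type, b.rng)             # NASCifar10A 0
```

If any `__init__` along the chain dropped `**kwargs`, arguments such as `rng` would silently never reach the bases further down the MRO, which is what the consistent forwarding in this layout is meant to avoid.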
@@ -415,6 +432,7 @@ def _query_benchmark(self, config: Dict, run_index: int, budget: int = 108) -> D ------- Dict """ + failure = {"test_accuracy": 0, "train_accuracy": 0, "validation_accuracy": 0, "training_time": 0, "info": "failure", "trainable_parameters": 0, "module_operations": 0} @@ -439,6 +457,7 @@ def _query_benchmark(self, config: Dict, run_index: int, budget: int = 108) -> D labeling = [config["op_node_%d" % i] for i in range(5)] labeling = ['input'] + list(labeling) + ['output'] model_spec = api.ModelSpec(matrix, labeling) + try: data = modified_query(self.benchmark, run_index=run_index, model_spec=model_spec, epochs=budget) except api.OutOfDomainError: @@ -453,15 +472,10 @@ def _query_benchmark(self, config: Dict, run_index: int, budget: int = 108) -> D return data -class NASCifar10CBenchmark(NASCifar10BaseBenchmark): - def __init__(self, data_path: Union[Path, str, None] = None, - rng: Union[np.random.RandomState, int, None] = None, **kwargs): - - data_path = self._try_download_api_file(data_path) - +class _QueryC(_NAS101BaseBenchmark): + def __init__(self, **kwargs): from tabular_benchmarks.nas_cifar10 import NASCifar10C - benchmark = NASCifar10C(data_dir=str(data_path), multi_fidelity=True) - super(NASCifar10CBenchmark, self).__init__(benchmark=benchmark, data_path=data_path, rng=rng, **kwargs) + super(_QueryC, self).__init__(benchmark_type=NASCifar10C, **kwargs) @staticmethod def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: @@ -478,7 +492,7 @@ def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSp """ from tabular_benchmarks.nas_cifar10 import NASCifar10C - return NASCifar10BBenchmark._get_configuration_space(NASCifar10C, seed) + return _NAS101BaseBenchmark._get_configuration_space(NASCifar10C, seed) def _query_benchmark(self, config: Dict, run_index: int, budget: int = 108) -> Dict: """ @@ -538,6 +552,221 @@ def _query_benchmark(self, config: Dict, run_index: int, budget: int = 108) -> D return data +class _NASCifar10BaseMOBenchmark(_NAS101BaseBenchmark, AbstractMultiObjectiveBenchmark): + + # pylint: disable=arguments-differ + @AbstractMultiObjectiveBenchmark.check_parameters + def objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + run_index: Union[int, Tuple, None] = (0, 1, 2), + rng: Union[np.random.RandomState, int, None] = None, + **kwargs) -> Dict: + """ + Query the NAS-benchmark using a given configuration and a epoch (=budget). + + Parameters + ---------- + configuration : Dict, CS.Configuration + fidelity: Dict, None + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + run_index : int, Tuple, None + The nas benchmark has for each configuration-budget-pair results from 3 different runs. + - If multiple `run_id`s are given as Tuple, the benchmark returns the mean over the given runs. + - By default (no parameter is specified) all runs are used. A specific run can be chosen by setting the + `run_id` to a value from [0, 3]. While the performance is averaged across the `run_index`, the costs are + the sum of the runtime per `run_index`. + - When this value is explicitly set to `None`, the function will use a random seed. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. + + To prevent overfitting on a single seed, it is possible to pass a + parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. 
+ If this parameter is not given, the default random state is used. + kwargs + + Returns + ------- + Dict - + function_value : + misclassification_rate: float [0,1] (lower is better) + 1-accuracy on validation set + trainable_parameters: int [0, 10**8] (lower is better) + Number of trainable parameters in the network + + cost : runtime + info : Dict + fidelity : used fidelities in this evaluation + """ + return self._mo_objective_function( + configuration=configuration, fidelity=fidelity, run_index=run_index, rng=rng, **kwargs + ) + + @AbstractMultiObjectiveBenchmark.check_parameters + def objective_function_test(self, configuration: Union[Dict, CS.Configuration], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs) -> Dict: + """ + Validate a configuration on the maximum available budget. + + Parameters + ---------- + configuration : Dict, CS.Configuration + fidelity: Dict, None + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. To prevent overfitting on a single seed, it is + possible to pass a parameter ``rng`` as 'int' or 'np.random.RandomState' to this + function. If this parameter is not given, the default random state is used. + kwargs + + Returns + ------- + Dict - + function_value : + misclassification_rate: float [0,1] (lower is better) + 1-accuracy on test set + trainable_parameters: int [0, 10**8] (lower is better) + Number of trainable parameters in the network + + cost : runtime + info : Dict + fidelity : used fidelities in this evaluation + """ + + return self._mo_objective_function_test( + configuration=configuration, fidelity=fidelity, rng=rng, **kwargs + ) + + @staticmethod + def get_objective_names() -> List[str]: + return ['misclassification_rate', 'trainable_parameters'] + + +class _NASCifar10BaseSOBenchmark(_NAS101BaseBenchmark, AbstractBenchmark): + + # pylint: disable=arguments-differ + @AbstractBenchmark.check_parameters + def objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + run_index: Union[int, Tuple, None] = (0, 1, 2), + rng: Union[np.random.RandomState, int, None] = None, + **kwargs) -> Dict: + """ + Query the NAS-benchmark using a given configuration and a epoch (=budget). + + Parameters + ---------- + configuration : Dict, CS.Configuration + fidelity: Dict, None + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + run_index : int, Tuple, None + The nas benchmark has for each configuration-budget-pair results from 3 different runs. + - If multiple `run_id`s are given as Tuple, the benchmark returns the mean over the given runs. + - By default (no parameter is specified) all runs are used. A specific run can be chosen by setting the + `run_id` to a value from [0, 3]. While the performance is averaged across the `run_index`, the costs are + the sum of the runtime per `run_index`. + - When this value is explicitly set to `None`, the function will use a random seed. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. + + To prevent overfitting on a single seed, it is possible to pass a + parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. + If this parameter is not given, the default random state is used. 
+ kwargs + + Returns + ------- + Dict - + function_value : + misclassification_rate: float [0,1] (lower is better) + 1-accuracy on validation set + cost : runtime + info : Dict + fidelity : used fidelities in this evaluation + """ + result_dict = self._mo_objective_function( + configuration=configuration, fidelity=fidelity, run_index=run_index, rng=rng, **kwargs + ) + + # swap function_value dict to value + result_dict['function_value'] = result_dict['function_value']['misclassification_rate'] + return result_dict + + @AbstractBenchmark.check_parameters + def objective_function_test(self, configuration: Union[Dict, CS.Configuration], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs) -> Dict: + """ + Validate a configuration on the maximum available budget. + + Parameters + ---------- + configuration : Dict, CS.Configuration + fidelity: Dict, None + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. To prevent overfitting on a single seed, it is + possible to pass a parameter ``rng`` as 'int' or 'np.random.RandomState' to this + function. If this parameter is not given, the default random state is used. + kwargs + + Returns + ------- + Dict - + function_value : + equals misclassification_rate: float [0,1] (lower is better) + 1-accuracy on test set + cost : runtime + info : Dict + fidelity : used fidelities in this evaluation + """ + result_dict = self._mo_objective_function_test( + configuration=configuration, fidelity=fidelity, rng=rng, **kwargs + ) + + # swap function_value dict to value + result_dict['function_value'] = result_dict['function_value']['misclassification_rate'] + return result_dict + + +class NASCifar10ABenchmark(_QueryA, _NASCifar10BaseSOBenchmark): + def __init__(self, data_path: Union[Path, str, None] = None, + rng: Union[np.random.RandomState, int, None] = None, **kwargs): + super(NASCifar10ABenchmark, self).__init__(data_path=data_path, rng=rng, **kwargs) + + +class NASCifar10AMOBenchmark(_QueryA, _NASCifar10BaseMOBenchmark): + def __init__(self, data_path: Union[Path, str, None] = None, + rng: Union[np.random.RandomState, int, None] = None, **kwargs): + super(NASCifar10AMOBenchmark, self).__init__(data_path=data_path, rng=rng, **kwargs) + + +class NASCifar10BBenchmark(_QueryB, _NASCifar10BaseSOBenchmark): + def __init__(self, data_path: Union[Path, str, None] = None, + rng: Union[np.random.RandomState, int, None] = None, **kwargs): + super(NASCifar10BBenchmark, self).__init__(data_path=data_path, rng=rng, **kwargs) + + +class NASCifar10BMOBenchmark(_QueryB, _NASCifar10BaseMOBenchmark): + def __init__(self, data_path: Union[Path, str, None] = None, + rng: Union[np.random.RandomState, int, None] = None, **kwargs): + super(NASCifar10BMOBenchmark, self).__init__(data_path=data_path, rng=rng, **kwargs) + + +class NASCifar10CBenchmark(_QueryC, _NASCifar10BaseSOBenchmark): + def __init__(self, data_path: Union[Path, str, None] = None, + rng: Union[np.random.RandomState, int, None] = None, **kwargs): + super(NASCifar10CBenchmark, self).__init__(data_path=data_path, rng=rng, **kwargs) + + +class NASCifar10CMOBenchmark(_QueryC, _NASCifar10BaseMOBenchmark): + def __init__(self, data_path: Union[Path, str, None] = None, + rng: Union[np.random.RandomState, int, None] = None, **kwargs): + super(NASCifar10CMOBenchmark, self).__init__(data_path=data_path, rng=rng, **kwargs) + + def 
modified_query(benchmark, model_spec, run_index: int, epochs=108, stop_halfway=False): """ NOTE: @@ -607,3 +836,11 @@ def modified_query(benchmark, model_spec, run_index: int, epochs=108, stop_halfw benchmark.dataset.total_epochs_spent += epochs return data + + +__all__ = ["NASCifar10ABenchmark", + "NASCifar10AMOBenchmark", + "NASCifar10BBenchmark", + "NASCifar10BMOBenchmark", + "NASCifar10CBenchmark", + "NASCifar10CMOBenchmark"] diff --git a/hpobench/benchmarks/nas/nasbench_1shot1.py b/hpobench/benchmarks/nas/nasbench_1shot1.py index 4d8231a0..5d94631e 100644 --- a/hpobench/benchmarks/nas/nasbench_1shot1.py +++ b/hpobench/benchmarks/nas/nasbench_1shot1.py @@ -34,7 +34,7 @@ pip install .[nasbench_1shot1] pip install git+https://github.com/google-research/nasbench.git@master -git clone https://github.com/automl/nasbench-1shot1/tree/master/nasbench_analysis/ +git clone https://github.com/automl/nasbench-1shot1 3. Environment setup ==================== @@ -46,6 +46,9 @@ Changelog: ========== +0.0.5 +* Add MO Version + 0.0.4 * New container release due to a general change in the communication between container and HPOBench. Works with HPOBench >= v0.0.8 @@ -62,34 +65,33 @@ """ import logging - +from ast import literal_eval from pathlib import Path from typing import Union, Dict, Any, Tuple, List -from ast import literal_eval import ConfigSpace as CS import numpy as np from nasbench import api from nasbench.api import OutOfDomainError - -from hpobench.abstract_benchmark import AbstractBenchmark -from hpobench.util.data_manager import NASBench_101DataManager -from hpobench.util import rng_helper - from nasbench_analysis.search_spaces.search_space_1 import SearchSpace1 # noqa from nasbench_analysis.search_spaces.search_space_2 import SearchSpace2 # noqa from nasbench_analysis.search_spaces.search_space_3 import SearchSpace3 # noqa from nasbench_analysis.utils import INPUT, OUTPUT, CONV1X1, CONV3X3, MAXPOOL3X3 # noqa -__version__ = '0.0.4' +from hpobench.abstract_benchmark import AbstractSingleObjectiveBenchmark, AbstractMultiObjectiveBenchmark +from hpobench.util import rng_helper +from hpobench.util.data_manager import NASBench_101DataManager + +__version__ = '0.0.5' logger = logging.getLogger('NasBench1shot1') -class NASBench1shot1BaseBenchmark(AbstractBenchmark): +class _NASBench1shot1BaseBenchmark: + def __init__(self, data_path: Union[Path, str, None] = None, rng: Union[np.random.RandomState, int, None] = None): """ - Baseclass for the nasbench 1shot1 benchmarks. + Baseclass for the all nasbench 1shot1 benchmarks. Please install the benchmark first. Place the data under ``data_path``. 
Parameters @@ -99,18 +101,18 @@ def __init__(self, data_path: Union[Path, str, None] = None, rng : np.random.RandomState, int, None Random seed for the benchmarks """ - super(NASBench1shot1BaseBenchmark, self).__init__(rng=rng) + data_manager = NASBench_101DataManager(data_path) self.api = data_manager.load() self.search_space = None - - # pylint: disable=arguments-differ - @AbstractBenchmark.check_parameters - def objective_function(self, configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - run_index: Union[int, Tuple, List, None] = (0, 1, 2), - rng: Union[np.random.RandomState, int, None] = None, - **kwargs) -> Dict: + self.rng = rng + super(_NASBench1shot1BaseBenchmark, self).__init__(rng=rng) + + def _mo_objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + run_index: Union[int, Tuple, List, None] = (0, 1, 2), + rng: Union[np.random.RandomState, int, None] = None, + **kwargs) -> Dict: """ Query the NAS1shot1-benchmark using a given configuration and an epoch (=budget). Only data for the budgets 4, 12, 36, 108 are available. @@ -171,7 +173,8 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], 'module_operations': data['module_operations']} failure = failure or ('info' in data and data['info'] == 'failure') - return {'function_value': float(1 - np.mean(valid_accuracies)), + return {'function_value': {'misclassification_rate': float(1 - np.mean(valid_accuracies)), + 'trainable_parameters': additional['trainable_parameters']}, 'cost': float(np.sum(training_times)), 'info': {'fidelity': fidelity, 'train_accuracies': train_accuracies, @@ -179,50 +182,24 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], 'test_accuracies': test_accuracies, 'training_times': training_times, 'data': additional, - 'failure': 'False' if not failure else 'True' + 'failure': 0 if not failure else 1 } } - @AbstractBenchmark.check_parameters - def objective_function_test(self, configuration: Union[Dict, CS.Configuration], - fidelity: Union[CS.Configuration, Dict, None] = None, - rng: Union[np.random.RandomState, int, None] = None, - **kwargs) -> Dict: - """ - Validate a configuration on the maximum available budget (108) and on all three seeds. - - Parameters - ---------- - configuration : Dict, CS.Configuration - fidelity: Dict, None - Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. - rng : np.random.RandomState, int, None - Random seed to use in the benchmark. To prevent overfitting on a single seed, it is - possible to pass a parameter ``rng`` as 'int' or 'np.random.RandomState' to this - function. If this parameter is not given, the default random state is used. - kwargs + def _mo_objective_function_test(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs) -> Dict: - Returns - ------- - Dict - - function_value : test error on largest fidelity. - cost : runtime - info : Dict - train_accuracies - test_accuracies - valid_accuracies - training_times - fidelity : used fidelities in this evaluation - data : additional data such as trainable parameters and used operations - """ assert fidelity['budget'] == 108, 'Only test data for the 108th epoch is available.' 
- result = self.objective_function(configuration=configuration, fidelity=fidelity, run_index=(0, 1, 2), rng=rng) - result['function_value'] = float(1 - np.mean(result['info']['test_accuracies'])) + result = self._mo_objective_function(configuration=configuration, fidelity=fidelity, + run_index=(0, 1, 2), rng=rng, **kwargs) + result['function_value']['misclassification_rate'] = float(1 - np.mean(result['info']['test_accuracies'])) return result @staticmethod def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: - raise NotImplementedError + raise NotImplementedError() @staticmethod def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: @@ -264,7 +241,6 @@ def get_meta_information() -> Dict: } def _check_run_index(self, run_index): - if isinstance(run_index, int): assert 0 <= run_index <= 2, f'run_index must be in [0, 2], not {run_index}' run_index = (run_index, ) @@ -426,7 +402,223 @@ def _get_configuration_space(search_space: Any, seed: Union[int, None] = None) - return cs -class NASBench1shot1SearchSpace1Benchmark(NASBench1shot1BaseBenchmark): +class NASBench1shot1BaseMOBenchmark(_NASBench1shot1BaseBenchmark, AbstractMultiObjectiveBenchmark): + + # pylint: disable=arguments-differ + @AbstractMultiObjectiveBenchmark.check_parameters + def objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + run_index: Union[int, Tuple, List, None] = (0, 1, 2), + rng: Union[np.random.RandomState, int, None] = None, + **kwargs) -> Dict: + """ + Query the NAS1shot1-benchmark using a given configuration and an epoch (=budget). + Only data for the budgets 4, 12, 36, 108 are available. + + Parameters + ---------- + configuration : Dict, CS.Configuration + fidelity: Dict, None + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + run_index : int, Tuple, None + The nas benchmark has for each configuration-budget-pair results from 3 different runs. + - If multiple `run_id`s are given as Tuple/List, the benchmark returns the mean over the given runs. + - By default (no parameter is specified) all runs are used. A specific run can be chosen by setting the + `run_id` to a value from [0, 3]. While the performance is averaged across the `run_index`, the costs are + the sum of the runtime per `run_index`. + - When this value is explicitly set to `None`, the function will use a random seed. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. + + To prevent overfitting on a single seed, it is possible to pass a + parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. + If this parameter is not given, the default random state is used. + kwargs + + Returns + ------- + Dict - + function_value : validation error + cost : runtime + info : Dict + train_accuracies + test_accuracies + valid_accuracies + training_times + fidelity : used fidelities in this evaluation + data : additional data such as trainable parameters and used operations + """ + return self._mo_objective_function(configuration=configuration, fidelity=fidelity, + run_index=run_index, rng=rng, **kwargs) + + @AbstractMultiObjectiveBenchmark.check_parameters + def objective_function_test(self, configuration: Union[Dict, CS.Configuration], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs) -> Dict: + """ + Validate a configuration on the maximum available budget (108) and on all three seeds. 
+ + Parameters + ---------- + configuration : Dict, CS.Configuration + fidelity: Dict, None + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. To prevent overfitting on a single seed, it is + possible to pass a parameter ``rng`` as 'int' or 'np.random.RandomState' to this + function. If this parameter is not given, the default random state is used. + kwargs + + Returns + ------- + Dict - + function_value : test error on largest fidelity. + cost : runtime + info : Dict + train_accuracies + test_accuracies + valid_accuracies + training_times + fidelity : used fidelities in this evaluation + data : additional data such as trainable parameters and used operations + """ + return self._mo_objective_function_test(configuration=configuration, fidelity=fidelity, + rng=rng, **kwargs) + + @staticmethod + def get_objective_names() -> List[str]: + return ['misclassification_rate', 'trainable_parameters'] + + +class NASBench1shot1BaseSOBenchmark(_NASBench1shot1BaseBenchmark, AbstractSingleObjectiveBenchmark): + + # pylint: disable=arguments-differ + @AbstractSingleObjectiveBenchmark.check_parameters + def objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + run_index: Union[int, Tuple, List, None] = (0, 1, 2), + rng: Union[np.random.RandomState, int, None] = None, + **kwargs) -> Dict: + """ + Query the NAS1shot1-benchmark using a given configuration and an epoch (=budget). + Only data for the budgets 4, 12, 36, 108 are available. + + Parameters + ---------- + configuration : Dict, CS.Configuration + fidelity: Dict, None + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + run_index : int, Tuple, None + The nas benchmark has for each configuration-budget-pair results from 3 different runs. + - If multiple `run_id`s are given as Tuple/List, the benchmark returns the mean over the given runs. + - By default (no parameter is specified) all runs are used. A specific run can be chosen by setting the + `run_id` to a value from [0, 3]. While the performance is averaged across the `run_index`, the costs are + the sum of the runtime per `run_index`. + - When this value is explicitly set to `None`, the function will use a random seed. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. + + To prevent overfitting on a single seed, it is possible to pass a + parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. + If this parameter is not given, the default random state is used. 
+ kwargs + + Returns + ------- + Dict - + function_value : validation error + cost : runtime + info : Dict + train_accuracies + test_accuracies + valid_accuracies + training_times + fidelity : used fidelities in this evaluation + data : additional data such as trainable parameters and used operations + """ + result = self._mo_objective_function( + configuration=configuration, fidelity=fidelity, rng=rng, run_index=run_index, **kwargs + ) + result['info'].update(result['function_value']) + result['function_value'] = result['function_value']['misclassification_rate'] + return result + + @AbstractSingleObjectiveBenchmark.check_parameters + def objective_function_test(self, configuration: Union[Dict, CS.Configuration], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs) -> Dict: + """ + Validate a configuration on the maximum available budget (108) and on all three seeds. + + Parameters + ---------- + configuration : Dict, CS.Configuration + fidelity: Dict, None + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. To prevent overfitting on a single seed, it is + possible to pass a parameter ``rng`` as 'int' or 'np.random.RandomState' to this + function. If this parameter is not given, the default random state is used. + kwargs + + Returns + ------- + Dict - + function_value : test error on largest fidelity. + cost : runtime + info : Dict + train_accuracies + test_accuracies + valid_accuracies + training_times + fidelity : used fidelities in this evaluation + data : additional data such as trainable parameters and used operations + """ + + result = self._mo_objective_function_test( + configuration=configuration, fidelity=fidelity, rng=rng, **kwargs + ) + result['info'].update(result['function_value']) + result['function_value'] = result['function_value']['misclassification_rate'] + return result + + +class NASBench1shot1SearchSpace1MOBenchmark(NASBench1shot1BaseMOBenchmark): + def __init__(self, data_path: Union[Path, str, None] = None, + rng: Union[np.random.RandomState, int, None] = None): + super(NASBench1shot1SearchSpace1MOBenchmark, self).__init__(data_path=data_path, rng=rng) + self.search_space = SearchSpace1() + + @staticmethod + def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + return _NASBench1shot1BaseBenchmark._get_configuration_space(SearchSpace1(), seed) + + +class NASBench1shot1SearchSpace2MOBenchmark(NASBench1shot1BaseMOBenchmark): + def __init__(self, data_path: Union[Path, str, None] = None, + rng: Union[np.random.RandomState, int, None] = None): + super(NASBench1shot1SearchSpace2MOBenchmark, self).__init__(data_path=data_path, rng=rng) + self.search_space = SearchSpace2() + + @staticmethod + def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + return _NASBench1shot1BaseBenchmark._get_configuration_space(SearchSpace2(), seed) + + +class NASBench1shot1SearchSpace3MOBenchmark(NASBench1shot1BaseMOBenchmark): + def __init__(self, data_path: Union[Path, str, None] = None, + rng: Union[np.random.RandomState, int, None] = None): + super(NASBench1shot1SearchSpace3MOBenchmark, self).__init__(data_path=data_path, rng=rng) + self.search_space = SearchSpace3() + + @staticmethod + def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + return _NASBench1shot1BaseBenchmark._get_configuration_space(SearchSpace3(), 
seed)
+
+
+class NASBench1shot1SearchSpace1Benchmark(NASBench1shot1BaseSOBenchmark):
     def __init__(self, data_path: Union[Path, str, None] = None,
                  rng: Union[np.random.RandomState, int, None] = None):
         super(NASBench1shot1SearchSpace1Benchmark, self).__init__(data_path=data_path, rng=rng)
@@ -434,10 +626,10 @@ def __init__(self, data_path: Union[Path, str, None] = None,
 
     @staticmethod
     def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace:
-        return NASBench1shot1BaseBenchmark._get_configuration_space(SearchSpace1(), seed)
+        return _NASBench1shot1BaseBenchmark._get_configuration_space(SearchSpace1(), seed)
 
 
-class NASBench1shot1SearchSpace2Benchmark(NASBench1shot1BaseBenchmark):
+class NASBench1shot1SearchSpace2Benchmark(NASBench1shot1BaseSOBenchmark):
     def __init__(self, data_path: Union[Path, str, None] = None,
                  rng: Union[np.random.RandomState, int, None] = None):
         super(NASBench1shot1SearchSpace2Benchmark, self).__init__(data_path=data_path, rng=rng)
@@ -445,10 +637,10 @@ def __init__(self, data_path: Union[Path, str, None] = None,
 
     @staticmethod
     def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace:
-        return NASBench1shot1BaseBenchmark._get_configuration_space(SearchSpace2(), seed)
+        return _NASBench1shot1BaseBenchmark._get_configuration_space(SearchSpace2(), seed)
 
 
-class NASBench1shot1SearchSpace3Benchmark(NASBench1shot1BaseBenchmark):
+class NASBench1shot1SearchSpace3Benchmark(NASBench1shot1BaseSOBenchmark):
     def __init__(self, data_path: Union[Path, str, None] = None,
                  rng: Union[np.random.RandomState, int, None] = None):
         super(NASBench1shot1SearchSpace3Benchmark, self).__init__(data_path=data_path, rng=rng)
@@ -456,4 +648,14 @@ def __init__(self, data_path: Union[Path, str, None] = None,
 
     @staticmethod
     def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace:
-        return NASBench1shot1BaseBenchmark._get_configuration_space(SearchSpace3(), seed)
+        return _NASBench1shot1BaseBenchmark._get_configuration_space(SearchSpace3(), seed)
+
+
+__all__ = [
+    "NASBench1shot1SearchSpace1Benchmark",
+    "NASBench1shot1SearchSpace2Benchmark",
+    "NASBench1shot1SearchSpace3Benchmark",
+    "NASBench1shot1SearchSpace1MOBenchmark",
+    "NASBench1shot1SearchSpace2MOBenchmark",
+    "NASBench1shot1SearchSpace3MOBenchmark",
+]
diff --git a/hpobench/benchmarks/nas/nasbench_201.py b/hpobench/benchmarks/nas/nasbench_201.py
index 17bac321..1ca0beb3 100644
--- a/hpobench/benchmarks/nas/nasbench_201.py
+++ b/hpobench/benchmarks/nas/nasbench_201.py
@@ -27,6 +27,12 @@
 Changelog:
 ==========
+0.0.6
+* Add the multi-objective version of this benchmark by returning flops, model size, latency and misclassification rate
+* Integrate #138: Improve the docstrings about the seeds.
+* Scale the returned misclassification rate from range [0, 100] to [0, 1].
+* Improve naming in the result object ("*_precision" -> "*_misclassification_rate")
+
 0.0.5
 * Add for each benchmark a new one with a different fidelity space.
   The new fidelity space corresponds to the fidelity space in the DEHB paper.
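For orientation, a minimal usage sketch (not part of the patch itself) of the multi-objective NAS-Bench-201 classes described in the 0.0.6 changelog entry above and added further down in this diff. It assumes the NAS-Bench-201 data files can be fetched by the benchmark's data manager; the class, fidelity, and objective names are taken from the additions below.

```python
from hpobench.benchmarks.nas.nasbench_201 import Cifar10ValidNasBench201MOBenchmark

# Instantiate the multi-objective variant; the data manager loads the NAS-Bench-201 tables on first use.
benchmark = Cifar10ValidNasBench201MOBenchmark(rng=1)

# Sample an architecture from the cell-based configuration space.
config = benchmark.get_configuration_space(seed=1).sample_configuration()

# Query at a lower fidelity (epoch 100) using two of the three available data seeds.
result = benchmark.objective_function(configuration=config,
                                      fidelity={'epoch': 100},
                                      data_seed=(777, 888))

# 'function_value' is a dict with one entry per objective, cf. get_objective_names().
print(Cifar10ValidNasBench201MOBenchmark.get_objective_names())
print(result['function_value']['misclassification_rate'],
      result['function_value']['num_flops'],
      result['function_value']['model_size'],
      result['function_value']['latency'])
print(result['cost'])  # summed train + validation time in seconds
```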
@@ -47,23 +53,23 @@ * First implementation """ import logging -from typing import Union, Dict, List, Text, Tuple from copy import deepcopy +from typing import Union, Dict, List, Text, Tuple import ConfigSpace as CS import numpy as np import hpobench.util.rng_helper as rng_helper -from hpobench.abstract_benchmark import AbstractBenchmark +from hpobench.abstract_benchmark import AbstractSingleObjectiveBenchmark, AbstractMultiObjectiveBenchmark from hpobench.util.data_manager import NASBench_201Data -__version__ = '0.0.5' +__version__ = '0.0.6' MAX_NODES = 4 logger = logging.getLogger('NASBENCH201') -class NasBench201BaseBenchmark(AbstractBenchmark): +class _NasBench201BaseBenchmark: def __init__(self, dataset: str, rng: Union[np.random.RandomState, int, None] = None, **kwargs): """ @@ -129,6 +135,8 @@ def __init__(self, dataset: str, - In the original data, the training splits are always marked with the key 'train' but they use different identifiers to refer to the available evaluation splits. We report them also in the table below. - We exclude the data set cifar10 from this benchmark. + - In NasBench201, not all architectures have values for the three seeds. To increase robustness, we have patched + missing values with the values from an available seed. Some further remarks: - cifar10-valid is trained on the train split and tested on the validation split. @@ -145,13 +153,12 @@ def __init__(self, dataset: str, Random seed for the benchmark's random state. """ # noqa: E501 - super(NasBench201BaseBenchmark, self).__init__(rng=rng) - data_manager = NASBench_201Data(dataset=dataset) self.dataset = dataset self.data = data_manager.load() - self.config_to_structure = NasBench201BaseBenchmark.config_to_structure_func(max_nodes=MAX_NODES) + self.config_to_structure = _NasBench201BaseMOBenchmark.config_to_structure_func(max_nodes=MAX_NODES) + super(_NasBench201BaseBenchmark, self).__init__(rng=rng, **kwargs) def dataset_mapping(self, dataset): mapping = {'cifar10-valid': ('x-valid', 'ori-test'), @@ -159,8 +166,254 @@ def dataset_mapping(self, dataset): 'cifar100': ('ori-test', 'x-test')} return mapping[dataset] + @staticmethod + def config_to_structure_func(max_nodes: int): + """ + From https://github.com/D-X-Y/AutoDL-Projects/blob/master/exps/algos/BOHB.py + Author: https://github.com/D-X-Y [Xuanyi.Dong@student.uts.edu.au] + """ + + def config_to_structure(config): + genotypes = [] + for i in range(1, max_nodes): + x_list = [] + for j in range(i): + node_str = f'{i}<-{j}' + op_name = config[node_str] + x_list.append((op_name, j)) + genotypes.append(tuple(x_list)) + return _NasBench201BaseMOBenchmark._Structure(genotypes) + + return config_to_structure + + @staticmethod + def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ + Return the CS representation of the search space. + From https://github.com/D-X-Y/AutoDL-Projects/blob/master/exps/algos/BOHB.py + Author: https://github.com/D-X-Y [Xuanyi.Dong@student.uts.edu.au] + + Parameters + ---------- + seed : int, None + Random seed for the configuration space. 
+ + Returns + ------- + CS.ConfigurationSpace - + Containing the benchmark's hyperparameter + """ + seed = seed if seed is not None else np.random.randint(1, 100000) + cs = CS.ConfigurationSpace(seed=seed) + + search_space = _NasBench201BaseMOBenchmark.get_search_spaces('cell', 'nas-bench-201') + hps = [CS.CategoricalHyperparameter(f'{i}<-{j}', search_space) for i in range(1, MAX_NODES) for j in range(i)] + cs.add_hyperparameters(hps) + return cs + + @staticmethod + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ + Creates a ConfigSpace.ConfigurationSpace containing all fidelity parameters for + the NAS Benchmark 201. + + Fidelities + ---------- + epoch: int + The loss / accuracy at `epoch`. Can be from 0 to 199. + + Parameters + ---------- + seed : int, None + Fixing the seed for the ConfigSpace.ConfigurationSpace + + Returns + ------- + ConfigSpace.ConfigurationSpace + """ + seed = seed if seed is not None else np.random.randint(1, 100000) + fidel_space = CS.ConfigurationSpace(seed=seed) + + fidel_space.add_hyperparameters([ + CS.UniformIntegerHyperparameter('epoch', lower=1, upper=200, default_value=200) + ]) + return fidel_space + + @staticmethod + def get_meta_information() -> Dict: + """ Returns the meta information for the benchmark """ + return {'name': 'NAS-Bench-201: Extending the Scope of Reproducible Neural Architecture Search', + 'references': ['@article{dong2020bench,' + 'title = {Nas-bench-201: Extending the scope of reproducible neural ' + ' architecture search},' + 'author = {Dong, Xuanyi and Yang, Yi},' + 'journal = {arXiv preprint arXiv:2001.00326},' + 'year = {2020}}', + 'https://openreview.net/forum?id=HJxyZkBKDr', + ], + 'code': 'https://github.com/D-X-Y/AutoDL-Projects', + } + + @staticmethod + def get_search_spaces(xtype: str, name: str) -> List[Text]: + """ obtain the search space, i.e., a dict mapping the operation name into a python-function for this op + From https://github.com/D-X-Y/AutoDL-Projects/blob/master/lib/models/__init__.py + Author: https://github.com/D-X-Y [Xuanyi.Dong@student.uts.edu.au] + """ + # pylint: disable=no-else-return + if xtype == 'cell': + NAS_BENCH_201 = ['none', 'skip_connect', 'nor_conv_1x1', 'nor_conv_3x3', 'avg_pool_3x3'] + SearchSpaceNames = {'nas-bench-201': NAS_BENCH_201} + assert name in SearchSpaceNames, 'invalid name [{:}] in {:}'.format(name, SearchSpaceNames.keys()) + return SearchSpaceNames[name] + else: + raise ValueError('invalid search-space type is {:}'.format(xtype)) + + def _mo_objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, CS.Configuration, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + data_seed: Union[List, Tuple, int, None] = (777, 888, 999), + **kwargs) -> Dict: + + self.rng = rng_helper.get_rng(rng) + + if isinstance(data_seed, (List, Tuple)): + assert len(data_seed) != 0, 'data_seed must not be empty' + if len(set(data_seed)) != len(data_seed): + logger.debug('There are some values more than once in the run_index. We remove the redundant entries.') + data_seed = tuple(set(data_seed)) + elif isinstance(data_seed, int): + data_seed = (data_seed,) + elif data_seed is None: + logger.debug('The data seed is explicitly set to None! 
A random seed will be selected.') + data_seed = tuple(self.rng.choice((777, 888, 999), size=1)) + # Check if the data set seeds are valid + else: + raise ValueError(f'data seed has unknown data type {type(data_seed)}, ' + f'but should be tuple or int (777,888,999)') + + assert len(set(data_seed) - {777, 888, 999}) == 0, \ + f'data seed can only contain the elements 777, 888, 999, but was {data_seed}' + + structure = self.config_to_structure(configuration) + structure_str = structure.tostr() + + epoch = fidelity['epoch'] - 1 + data_seed = [str(seed) for seed in data_seed] + valid_key, test_key = self.dataset_mapping(self.dataset) + + train_accuracies = [self.data[seed][structure_str]['train_acc1es'][f'{epoch}'] for seed in data_seed] + train_losses = [self.data[seed][structure_str]['train_losses'][f'{epoch}'] for seed in data_seed] + train_times = [np.sum((self.data[seed][structure_str]['train_times'][f'{e}']) for e in range(1, epoch + 1)) + for seed in data_seed] + + valid_accuracies = [self.data[seed][structure_str]['eval_acc1es'][f'{valid_key}@{epoch}'] for seed in data_seed] + valid_losses = [self.data[seed][structure_str]['eval_losses'][f'{valid_key}@{epoch}'] for seed in data_seed] + valid_times = [np.sum((self.data[seed][structure_str]['eval_times'][f'{valid_key}@{e}']) + for e in range(1, epoch + 1)) for seed in data_seed] + + # There is a single value for the eval data per seed. (only epoch 200) + test_accuracies = [self.data[seed][structure_str]['eval_acc1es'][f'{valid_key}@{199}'] for seed in data_seed] + test_losses = [self.data[seed][structure_str]['eval_losses'][f'{valid_key}@{199}'] for seed in data_seed] + test_times = [np.sum((self.data[seed][structure_str]['eval_times'][f'{test_key}@{199}']) + for e in range(1, epoch + 1)) for seed in data_seed] + + # Number of floating point operations in million + num_flops = [self.data[seed][structure_str]['flop'] for seed in data_seed] + + # Number of trainable model parameters in MB + model_size = [self.data[seed][structure_str]['params'] for seed in data_seed] + + # Time to evaluate in seconds + latency = [self.data[seed][structure_str]['latency'] for seed in data_seed] + + return { + 'function_value': { + # The original benchmark returned the accuracy with range [0, 100]. + # We cast it to a minimization problem with range [0-1] to have a more standardized return value. 
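+                # Example of the scaling: a mean validation accuracy of 91.5 (on the 0-100 scale)
+                # corresponds to a misclassification_rate of 0.01 * (100 - 91.5) = 0.085 below.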
+                'misclassification_rate': 0.01 * float(100 - np.mean(valid_accuracies)),
+                'num_flops': float(np.mean(num_flops)),
+                'model_size': float(np.mean(model_size)),
+                'latency': float(np.mean(latency)),
+            },
+            'cost': float(np.sum(valid_times) + np.sum(train_times)),
+            'info': {
+                'train_misclassification_rate': 0.01 * float(100 - np.mean(train_accuracies)),
+                'train_losses': float(np.mean(train_losses)),
+                'train_cost': float(np.sum(train_times)),
+                'valid_misclassification_rate': 0.01 * float(100 - np.mean(valid_accuracies)),
+                'valid_losses': float(np.mean(valid_losses)),
+                'valid_cost': float(np.sum(valid_times) + np.sum(train_times)),
+                'test_misclassification_rate': 0.01 * float(100 - np.mean(test_accuracies)),
+                'test_losses': float(np.mean(test_losses)),
+                'test_cost': float(np.sum(train_times)) + float(np.sum(test_times)),
+                'fidelity': fidelity
+            }
+        }
+
+    def _mo_objective_function_test(self, configuration: Union[CS.Configuration, Dict],
+                                    fidelity: Union[Dict, CS.Configuration, None] = None,
+                                    rng: Union[np.random.RandomState, int, None] = None,
+                                    **kwargs) -> Dict:
+        # The result dict already contains all necessary information -> just swap the function value
+        # from valid to test and use the corresponding time cost.
+        assert fidelity['epoch'] == 200, 'Only test data for the 200th epoch is available.'
+
+        if 'data_seed' in kwargs:
+            all_seeds_available = all([seed in kwargs['data_seed'] for seed in (777, 888, 999)])
+            if not all_seeds_available:
+                logger.warning('You have not specified all available seeds for the '
+                               '`objective_function_test`. The given seeds are ignored, '
+                               'because test values are reported only as the mean across all seeds.'
+                               f' Your given seeds: {kwargs["data_seed"]}')
+            del kwargs['data_seed']
+
+        result = self._mo_objective_function(configuration=configuration, fidelity=fidelity,
+                                             data_seed=(777, 888, 999),
+                                             rng=rng, **kwargs)
+        result['function_value']['misclassification_rate'] = result['info']['test_misclassification_rate']
+        result['cost'] = result['info']['test_cost']
+        return result
+
+    class _Structure:
+        def __init__(self, genotype):
+            assert isinstance(genotype, (list, tuple)), 'invalid class of genotype : {:}'.format(type(genotype))
+            self.node_num = len(genotype) + 1
+            self.nodes = []
+            self.node_N = []
+            for idx, node_info in enumerate(genotype):
+                assert isinstance(node_info, (list, tuple)), 'invalid class of node_info : {:}'.format(type(node_info))
+                assert len(node_info) >= 1, 'invalid length : {:}'.format(len(node_info))
+                for node_in in node_info:
+                    assert isinstance(node_in, (list, tuple)), 'invalid class of in-node : {:}'.format(type(node_in))
+                    assert len(node_in) == 2 and node_in[1] <= idx, 'invalid in-node : {:}'.format(node_in)
+                self.node_N.append(len(node_info))
+                self.nodes.append(tuple(deepcopy(node_info)))
+
+        def tostr(self):
+            """ Helper function: Create a string representation of the configuration """
+            strings = []
+            for node_info in self.nodes:
+                string = '|'.join([x[0] + '~{:}'.format(x[1]) for x in node_info])
+                string = '|{:}|'.format(string)
+                strings.append(string)
+            return '+'.join(strings)
+
+        def __repr__(self):
+            return (
+                '{name}({node_num} nodes with {node_info})'.format(name=self.__class__.__name__, node_info=self.tostr(),
+                                                                   **self.__dict__))
+
+        def __len__(self):
+            return len(self.nodes) + 1
+
+        def __getitem__(self, index):
+            return self.nodes[index]
+
+
+class _NasBench201BaseMOBenchmark(_NasBench201BaseBenchmark, AbstractMultiObjectiveBenchmark):
 
     # pylint: disable=arguments-differ
-
@AbstractBenchmark.check_parameters + @AbstractMultiObjectiveBenchmark.check_parameters def objective_function(self, configuration: Union[CS.Configuration, Dict], fidelity: Union[Dict, CS.Configuration, None] = None, rng: Union[np.random.RandomState, int, None] = None, @@ -205,15 +458,23 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], Returns ------- Dict - - function_value : training precision + function_value : Dict + misclassification_rate : float + 1 - validation accuracy + num_flops : float + Number of floating point operations in M + model_size : float + Model size in MB + latency : float + Time to evaluate a configuration in seconds cost : time to train the network info : Dict - train_precision : float + train_misclassification_rate : float train_losses : float train_cost : float Time needed to train the network for 'epoch' many epochs. If more than one seed is given, this field is the sum of the training time per network - eval_precision : float + eval_misclassification_rate : float eval_losses : float eval_cost : float Time needed to train the network for 'epoch many epochs plus the time to evaluate the network on the @@ -221,65 +482,10 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], fidelity : Dict used fidelities in this evaluation """ - self.rng = rng_helper.get_rng(rng) - - if isinstance(data_seed, (List, Tuple)): - assert len(data_seed) != 0, 'data_seed must not be empty' - if len(set(data_seed)) != len(data_seed): - logger.debug('There are some values more than once in the run_index. We remove the redundant entries.') - data_seed = tuple(set(data_seed)) - elif isinstance(data_seed, int): - data_seed = (data_seed, ) - elif data_seed is None: - logger.debug('The data seed is explicitly set to None! A random seed will be selected.') - data_seed = tuple(self.rng.choice((777, 888, 999), size=1)) - # Check if the data set seeds are valid - else: - raise ValueError(f'data seed has unknown data type {type(data_seed)}, ' - f'but should be tuple or int (777,888,999)') + return self._mo_objective_function(configuration=configuration, fidelity=fidelity, rng=rng, data_seed=data_seed, + **kwargs) - assert len(set(data_seed) - {777, 888, 999}) == 0,\ - f'data seed can only contain the elements 777, 888, 999, but was {data_seed}' - - structure = self.config_to_structure(configuration) - structure_str = structure.tostr() - - epoch = fidelity['epoch'] - 1 - data_seed = [str(seed) for seed in data_seed] - valid_key, test_key = self.dataset_mapping(self.dataset) - - train_accuracies = [self.data[seed][structure_str]['train_acc1es'][f'{epoch}'] for seed in data_seed] - train_losses = [self.data[seed][structure_str]['train_losses'][f'{epoch}'] for seed in data_seed] - train_times = [np.sum((self.data[seed][structure_str]['train_times'][f'{e}']) for e in range(1, epoch + 1)) - for seed in data_seed] - - valid_accuracies = [self.data[seed][structure_str]['eval_acc1es'][f'{valid_key}@{epoch}'] for seed in data_seed] - valid_losses = [self.data[seed][structure_str]['eval_losses'][f'{valid_key}@{epoch}'] for seed in data_seed] - valid_times = [np.sum((self.data[seed][structure_str]['eval_times'][f'{valid_key}@{e}']) - for e in range(1, epoch + 1)) for seed in data_seed] - - # There is a single value for the eval data per seed. 
(only epoch 200) - test_accuracies = [self.data[seed][structure_str]['eval_acc1es'][f'{valid_key}@{199}'] for seed in data_seed] - test_losses = [self.data[seed][structure_str]['eval_losses'][f'{valid_key}@{199}'] for seed in data_seed] - test_times = [np.sum((self.data[seed][structure_str]['eval_times'][f'{test_key}@{199}']) - for e in range(1, epoch + 1)) for seed in data_seed] - - return {'function_value': float(100 - np.mean(valid_accuracies)), - 'cost': float(np.sum(valid_times) + np.sum(train_times)), - 'info': {'train_precision': float(100 - np.mean(train_accuracies)), - 'train_losses': float(np.mean(train_losses)), - 'train_cost': float(np.sum(train_times)), - 'valid_precision': float(100 - np.mean(valid_accuracies)), - 'valid_losses': float(np.mean(valid_losses)), - 'valid_cost': float(np.sum(valid_times) + np.sum(train_times)), - 'test_precision': float(100 - np.mean(test_accuracies)), - 'test_losses': float(np.mean(test_losses)), - 'test_cost': float(np.sum(train_times)) + float(np.sum(test_times)), - 'fidelity': fidelity - } - } - - @AbstractBenchmark.check_parameters + @AbstractMultiObjectiveBenchmark.check_parameters def objective_function_test(self, configuration: Union[CS.Configuration, Dict], fidelity: Union[Dict, None] = None, rng: Union[np.random.RandomState, int, None] = None, @@ -294,10 +500,9 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], ---------- configuration fidelity: Dict, None - epoch: int - Values: [1, 200] + epoch: int - Values: [200] Number of epochs an architecture was trained. - Note: the number of epoch is 1 indexed. (Results after the first epoch: epoch = 1) - + Note: We only have test performance on the last epoch. Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. rng : np.random.RandomState, int, None Random seed to use in the benchmark. @@ -311,185 +516,190 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], Returns ------- Dict - - function_value : evaluation precision + function_value : Dict + misclassification_rate : float + 1 - test accuracy + num_flops : float + Number of floating point operations in M + model_size : float + Model size in MB + latency : float + Time to evaluate a configuration in seconds cost : time to the network + time to validate info : Dict - train_precision + train_misclassification_rate train_losses train_cost - eval_precision + eval_misclassification_rate eval_losses eval_cost fidelity : used fidelities in this evaluation """ + return self._mo_objective_function_test(configuration=configuration, fidelity=fidelity, rng=rng, **kwargs) - # The result dict should contain already all necessary information -> Just swap the function value from valid - # to test and the corresponding time cost - assert fidelity['epoch'] == 200, 'Only test data for the 200. epoch is available. 
' + @staticmethod + def get_objective_names() -> List[str]: + return ['misclassification_rate', 'num_flops', 'model_size', 'latency'] - result = self.objective_function(configuration=configuration, fidelity=fidelity, - data_seed=(777, 888, 999), - rng=rng, **kwargs) - result['function_value'] = result['info']['test_precision'] - result['cost'] = result['info']['test_cost'] - return result - @staticmethod - def config_to_structure_func(max_nodes: int): - """ - From https://github.com/D-X-Y/AutoDL-Projects/blob/master/exps/algos/BOHB.py - Author: https://github.com/D-X-Y [Xuanyi.Dong@student.uts.edu.au] - """ - def config_to_structure(config): - genotypes = [] - for i in range(1, max_nodes): - x_list = [] - for j in range(i): - node_str = f'{i}<-{j}' - op_name = config[node_str] - x_list.append((op_name, j)) - genotypes.append(tuple(x_list)) - return NasBench201BaseBenchmark._Structure(genotypes) - return config_to_structure +class Cifar10ValidNasBench201MOBenchmark(_NasBench201BaseMOBenchmark): - @staticmethod - def get_search_spaces(xtype: str, name: str) -> List[Text]: - """ obtain the search space, i.e., a dict mapping the operation name into a python-function for this op - From https://github.com/D-X-Y/AutoDL-Projects/blob/master/lib/models/__init__.py - Author: https://github.com/D-X-Y [Xuanyi.Dong@student.uts.edu.au] - """ - # pylint: disable=no-else-return - if xtype == 'cell': - NAS_BENCH_201 = ['none', 'skip_connect', 'nor_conv_1x1', 'nor_conv_3x3', 'avg_pool_3x3'] - SearchSpaceNames = {'nas-bench-201': NAS_BENCH_201} - assert name in SearchSpaceNames, 'invalid name [{:}] in {:}'.format(name, SearchSpaceNames.keys()) - return SearchSpaceNames[name] - else: - raise ValueError('invalid search-space type is {:}'.format(xtype)) + def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): + super(Cifar10ValidNasBench201MOBenchmark, self).__init__(dataset='cifar10-valid', rng=rng, **kwargs) - @staticmethod - def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: - """ - Return the CS representation of the search space. - From https://github.com/D-X-Y/AutoDL-Projects/blob/master/exps/algos/BOHB.py - Author: https://github.com/D-X-Y [Xuanyi.Dong@student.uts.edu.au] - Parameters - ---------- - seed : int, None - Random seed for the configuration space. 
+class Cifar100NasBench201MOBenchmark(_NasBench201BaseMOBenchmark): - Returns - ------- - CS.ConfigurationSpace - - Containing the benchmark's hyperparameter - """ - seed = seed if seed is not None else np.random.randint(1, 100000) - cs = CS.ConfigurationSpace(seed=seed) + def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): + super(Cifar100NasBench201MOBenchmark, self).__init__(dataset='cifar100', rng=rng, **kwargs) - search_space = NasBench201BaseBenchmark.get_search_spaces('cell', 'nas-bench-201') - hps = [CS.CategoricalHyperparameter(f'{i}<-{j}', search_space) for i in range(1, MAX_NODES) for j in range(i)] - cs.add_hyperparameters(hps) - return cs - @staticmethod - def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: +class ImageNetNasBench201MOBenchmark(_NasBench201BaseMOBenchmark): + + def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): + super(ImageNetNasBench201MOBenchmark, self).__init__(dataset='ImageNet16-120', rng=rng, **kwargs) + + +class _NasBench201SOBenchmark(_NasBench201BaseBenchmark, AbstractSingleObjectiveBenchmark): + + # pylint: disable=arguments-differ + @AbstractSingleObjectiveBenchmark.check_parameters + def objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, CS.Configuration, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + data_seed: Union[List, Tuple, int, None] = (777, 888, 999), + **kwargs) -> Dict: """ - Creates a ConfigSpace.ConfigurationSpace containing all fidelity parameters for - the NAS Benchmark 201. + Objective function for the NASBench201 benchmark. + This functions sends a query to NASBench201 and evaluates the configuration. + As already explained in the class definition, different data sets are trained on different splits. - Fidelities: - - epoch: int - The loss / accuracy at `epoch`. Can be from 0 to 199. + The table above gives a detailed summary over the available splits, epochs, and which identifier are used per + dataset. Parameters ---------- - seed : int, None - Fixing the seed for the ConfigSpace.ConfigurationSpace + configuration + fidelity: Dict, None + epoch: int - Values: [1, 200] + Number of epochs an architecture was trained. + Note: the number of epoch is 1 indexed! (Results after the first epoch: epoch = 1) + + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. + + To prevent overfitting on a single seed, it is possible to pass a + parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. + If this parameter is not given, the default random state is used. + data_seed : List, Tuple, None, int + The nasbench_201 benchmark include for each run 3 different seeds: 777, 888, 999. + The user can specify which seed to use. If more than one seed is given, the results are averaged + across the seeds but then the training time is the sum of the costs per seed. + When this value is explicitly set to `None`, the function will chose randomly one out of [777, 888, 999]. + + Note: + For some architectures (configurations) no run was available. We've set missing values to an + available value from another seed. Therefore, it is possible that run results are exactly the same for + different seeds. 
+ + kwargs Returns ------- - ConfigSpace.ConfigurationSpace + Dict - + function_value : training misclassification_rate + cost : time to train the network + info : Dict + train_misclassification_rate : float + train_losses : float + train_cost : float + Time needed to train the network for 'epoch' many epochs. If more than one seed is given, + this field is the sum of the training time per network + eval_misclassification_rate : float + eval_losses : float + eval_cost : float + Time needed to train the network for 'epoch many epochs plus the time to evaluate the network on the + evaluation split. If more than one seed is given, this field is the sum of the eval cost per network + fidelity : Dict + used fidelities in this evaluation """ - seed = seed if seed is not None else np.random.randint(1, 100000) - fidel_space = CS.ConfigurationSpace(seed=seed) + results = self._mo_objective_function( + configuration=configuration, fidelity=fidelity, rng=rng, data_seed=data_seed, **kwargs + ) + results['function_value'] = results['function_value']['misclassification_rate'] + return results - fidel_space.add_hyperparameters([ - CS.UniformIntegerHyperparameter('epoch', lower=1, upper=200, default_value=200) - ]) + @AbstractSingleObjectiveBenchmark.check_parameters + def objective_function_test(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs) -> Dict: + """ + Get the validated results from the NASBench201. Runs a given configuration on the largest budget (here: 200). + The test function uses all data set seeds (777, 888, 999). - return fidel_space + See also :py:meth:`~hpobench.benchmarks.nas.nasbench_201.objective_function` - @staticmethod - def get_meta_information() -> Dict: - """ Returns the meta information for the benchmark """ - return {'name': 'NAS-Bench-201: Extending the Scope of Reproducible Neural Architecture Search', - 'references': ['@article{dong2020bench,' - 'title = {Nas-bench-201: Extending the scope of reproducible neural ' - ' architecture search},' - 'author = {Dong, Xuanyi and Yang, Yi},' - 'journal = {arXiv preprint arXiv:2001.00326},' - 'year = {2020}}', - 'https://openreview.net/forum?id=HJxyZkBKDr', - ], - 'code': 'https://github.com/D-X-Y/AutoDL-Projects', - } + Parameters + ---------- + configuration + fidelity: Dict, None + epoch: int - Values: [1, 200] + Number of epochs an architecture was trained. + Note: the number of epoch is 1 indexed. (Results after the first epoch: epoch = 1) - class _Structure: - def __init__(self, genotype): - assert isinstance(genotype, (list, tuple)), 'invalid class of genotype : {:}'.format(type(genotype)) - self.node_num = len(genotype) + 1 - self.nodes = [] - self.node_N = [] - for idx, node_info in enumerate(genotype): - assert isinstance(node_info, (list, tuple)), 'invalid class of node_info : {:}'.format(type(node_info)) - assert len(node_info) >= 1, 'invalid length : {:}'.format(len(node_info)) - for node_in in node_info: - assert isinstance(node_in, (list, tuple)), 'invalid class of in-node : {:}'.format(type(node_in)) - assert len(node_in) == 2 and node_in[1] <= idx, 'invalid in-node : {:}'.format(node_in) - self.node_N.append(len(node_info)) - self.nodes.append(tuple(deepcopy(node_info))) + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. 
- def tostr(self): - """ Helper function: Create a string representation of the configuration """ - strings = [] - for node_info in self.nodes: - string = '|'.join([x[0] + '~{:}'.format(x[1]) for x in node_info]) - string = '|{:}|'.format(string) - strings.append(string) - return '+'.join(strings) + To prevent overfitting on a single seed, it is possible to pass a + parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. + If this parameter is not given, the default random state is used. - def __repr__(self): - return ( - '{name}({node_num} nodes with {node_info})'.format(name=self.__class__.__name__, node_info=self.tostr(), - **self.__dict__)) + kwargs - def __len__(self): - return len(self.nodes) + 1 + Returns + ------- + Dict - + function_value : evaluation misclassification_rate + cost : time to the network + time to validate + info : Dict + train_misclassification_rate + train_losses + train_cost + eval_misclassification_rate + eval_losses + eval_cost + fidelity : used fidelities in this evaluation + """ - def __getitem__(self, index): - return self.nodes[index] + results = self._mo_objective_function_test( + configuration=configuration, fidelity=fidelity, rng=rng, **kwargs + ) + results['function_value'] = results['function_value']['misclassification_rate'] + return results -class Cifar10ValidNasBench201Benchmark(NasBench201BaseBenchmark): +class Cifar10ValidNasBench201Benchmark(_NasBench201SOBenchmark): def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): super(Cifar10ValidNasBench201Benchmark, self).__init__(dataset='cifar10-valid', rng=rng, **kwargs) -class Cifar100NasBench201Benchmark(NasBench201BaseBenchmark): +class Cifar100NasBench201Benchmark(_NasBench201SOBenchmark): def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): super(Cifar100NasBench201Benchmark, self).__init__(dataset='cifar100', rng=rng, **kwargs) -class ImageNetNasBench201Benchmark(NasBench201BaseBenchmark): +class ImageNetNasBench201Benchmark(_NasBench201SOBenchmark): def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): super(ImageNetNasBench201Benchmark, self).__init__(dataset='ImageNet16-120', rng=rng, **kwargs) -class _NasBench201BaseBenchmarkOriginal(NasBench201BaseBenchmark): +class _NasBench201SOBenchmarkOriginal(_NasBench201SOBenchmark): @staticmethod def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: @@ -502,7 +712,8 @@ def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: experiments from DEHB [DEHB](https://github.com/automl/DEHB/tree/937dd5cf48e79f6d587ea2ff408cb5ad9a8dce46/dehb/examples) - Fidelities: + Fidelities + ---------- epoch: int The loss / accuracy at `epoch`. @@ -528,26 +739,26 @@ def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: @staticmethod def get_meta_information() -> Dict: """ Returns the meta information for the benchmark """ - meta_information = NasBench201BaseBenchmark.get_meta_information() + meta_information = _NasBench201SOBenchmark.get_meta_information() meta_information['note'] = \ 'This version of the benchmark implements the fidelity space defined in the DEHB paper.' 
\ 'See [DEHB](https://github.com/automl/DEHB/tree/937dd5cf48e79f6d587ea2ff408cb5ad9a8dce46/dehb/examples)' return meta_information -class Cifar10ValidNasBench201BenchmarkOriginal(_NasBench201BaseBenchmarkOriginal): +class Cifar10ValidNasBench201BenchmarkOriginal(_NasBench201SOBenchmarkOriginal): def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): super(Cifar10ValidNasBench201BenchmarkOriginal, self).__init__(dataset='cifar10-valid', rng=rng, **kwargs) -class Cifar100NasBench201BenchmarkOriginal(_NasBench201BaseBenchmarkOriginal): +class Cifar100NasBench201BenchmarkOriginal(_NasBench201SOBenchmarkOriginal): def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): super(Cifar100NasBench201BenchmarkOriginal, self).__init__(dataset='cifar100', rng=rng, **kwargs) -class ImageNetNasBench201BenchmarkOriginal(_NasBench201BaseBenchmarkOriginal): +class ImageNetNasBench201BenchmarkOriginal(_NasBench201SOBenchmarkOriginal): def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): super(ImageNetNasBench201BenchmarkOriginal, self).__init__(dataset='ImageNet16-120', rng=rng, **kwargs) @@ -558,4 +769,7 @@ def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs "ImageNetNasBench201Benchmark", "Cifar10ValidNasBench201BenchmarkOriginal", "Cifar100NasBench201BenchmarkOriginal", - "ImageNetNasBench201BenchmarkOriginal"] + "ImageNetNasBench201BenchmarkOriginal", + "Cifar10ValidNasBench201MOBenchmark", + "Cifar100NasBench201MOBenchmark", + "ImageNetNasBench201MOBenchmark"] diff --git a/hpobench/benchmarks/nas/tabular_benchmarks.py b/hpobench/benchmarks/nas/tabular_benchmarks.py index fd7404a0..5db34f2f 100644 --- a/hpobench/benchmarks/nas/tabular_benchmarks.py +++ b/hpobench/benchmarks/nas/tabular_benchmarks.py @@ -50,7 +50,6 @@ * First implementation """ import logging - from pathlib import Path from typing import Union, Dict, Tuple, List diff --git a/hpobench/benchmarks/od/od_ae.py b/hpobench/benchmarks/od/od_ae.py index e3beca47..af80b106 100644 --- a/hpobench/benchmarks/od/od_ae.py +++ b/hpobench/benchmarks/od/od_ae.py @@ -407,6 +407,7 @@ def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: return fidel_space + # pylint: disable=arguments-differ def get_meta_information(self): """ Returns the meta information for the benchmark """ X_train, _ = self.datamanager.dataset.get_train_data() diff --git a/hpobench/benchmarks/rl/cartpole.py b/hpobench/benchmarks/rl/cartpole.py index 3bcaeab4..ea9ef053 100644 --- a/hpobench/benchmarks/rl/cartpole.py +++ b/hpobench/benchmarks/rl/cartpole.py @@ -20,12 +20,13 @@ """ import logging +import os import time from typing import Union, Dict import ConfigSpace as CS import numpy as np -import os + os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" import tensorflow as tf # noqa: E402 diff --git a/hpobench/benchmarks/surrogates/paramnet_benchmark.py b/hpobench/benchmarks/surrogates/paramnet_benchmark.py index 2e809b7b..35c7f80d 100644 --- a/hpobench/benchmarks/surrogates/paramnet_benchmark.py +++ b/hpobench/benchmarks/surrogates/paramnet_benchmark.py @@ -61,8 +61,8 @@ 0.0.1: * First implementation """ -import warnings import logging +import warnings from typing import Union, Dict import ConfigSpace as CS diff --git a/hpobench/benchmarks/surrogates/yahpo_gym.py b/hpobench/benchmarks/surrogates/yahpo_gym.py new file mode 100644 index 00000000..ad552acd --- /dev/null +++ b/hpobench/benchmarks/surrogates/yahpo_gym.py @@ -0,0 +1,276 @@ +""" +How to use this 
benchmark: +-------------------------- + +We recommend using the containerized version of this benchmark. +If you want to use this benchmark locally (without running it via the corresponding container), +you need to perform the following steps. + +Prerequisites: +============== +Conda environment in which the HPOBench is installed (pip install .). Activate your environment. +``` +conda activate +``` + +1. Clone from github: +===================== +``` +git clone HPOBench +``` + +2. Clone and install +==================== +``` +cd /path/to/HPOBench +pip install .[yahpo_gym] + +``` + +Changelog: +========== +0.0.2: + +* Add support for multi-objective benchmarks +* Add support for fairness benchmarks and interpretability benchmarks. +For these new benchmarks (fairness and interpretability), we recommend the following benchmarks and objectives: +For the entire list of available benchmarks, please take a look in the yahpo benchmark documentation. + +Benchmark Name | Scenario | Objectives +--------------------|---------------|-------------- +fair_fgrrm | 7592 | mmce, feo + | 14965 | mmce, feo +--------------------|---------------|-------------- +fair_rpart | 317599 | mmce, ffomr + | 7592 | mmce, feo +--------------------|---------------|-------------- +fair_ranger | 317599 | mmce, fpredp + | 14965 | mmce, fpredp +--------------------|---------------|-------------- +fair_xgboost | 317599 | mmce, ffomr + | 7592 | mmce, ffnr +--------------------|---------------|-------------- +fair_super | 14965 | mmce, feo + | 317599 | mmce, ffnr +--------------------|---------------|-------------- + + +Benchmark Name | Scenario | Objectives +--------------------|---------------|-------------- +iaml_glmnet | 1489 | mmce, nf + | 40981 | mmce, nf +--------------------|---------------|-------------- +iaml_rpart | 1489 | mmce, nf + | 41146 | mmce, nf +--------------------|---------------|-------------- +iaml_ranger | 40981 | mmce, nf + | 41146 | mmce, nf +--------------------|---------------|-------------- +iaml_xgboost | 40981 | mmce, nf + | 41146 | mmce, nf +--------------------|---------------|-------------- +iaml_super | 40981 | mmce, nf + | 41146 | mmce, nf +--------------------|---------------|-------------- + +0.0.1: +* First implementation +""" +import logging +from pathlib import Path +from typing import Union, Dict, List + +import ConfigSpace as CS +import numpy as np +from yahpo_gym.benchmark_set import BenchmarkSet + +from hpobench.abstract_benchmark import AbstractMultiObjectiveBenchmark, AbstractSingleObjectiveBenchmark +from hpobench.util.data_manager import YAHPODataManager + +__version__ = '0.0.2' + +logger = logging.getLogger('YAHPOGym') + + +class YAHPOGymBaseBenchmark: + def __init__(self, scenario: str, instance: str, + data_dir: Union[Path, str, None] = None, + multi_thread: bool = True, + rng: Union[np.random.RandomState, int, None] = None): + """ + Base Benchmark for all single and multi objective yahpo surrogate benchmarks. + Parameters + ---------- + scenario : str + Name for the surrogate data. Must be one of + ["lcbench", "fcnet", "nb301", "rbv2_svm", + "rbv2_ranger", "rbv2_rpart", "rbv2_glmnet", "rbv2_aknn", "rbv2_xgboost", "rbv2_super", + "fair_ranger", "fair_rpart", "fair_fgrrm", "fair_xgboost", "fair_super", + "iaml_ranger", "iaml_rpart", "iaml_glmnet", "iaml_xgboost", "iaml_super"] + instance : str + A valid instance for the scenario. See `self.benchset.instances`. + data_dir: Optional, str, Path + Directory, where the yahpo data is stored. 
+ Download automatically from https://github.com/slds-lmu/yahpo_data/tree/fair + multi_thread: bool + Flag to run ONNX runtime with a single thread. Might be important on compute clusters. + Defaults to True + rng : np.random.RandomState, int, None + """ + self.data_manager = YAHPODataManager(data_dir=data_dir) + self.data_manager.load() + + self.scenario = scenario + self.instance = instance + self.benchset = BenchmarkSet(scenario, active_session=True, multithread=multi_thread) + self.benchset.set_instance(instance) + + logger.info(f'Start Benchmark for scenario {scenario} and instance {instance}') + super(YAHPOGymBaseBenchmark, self).__init__(rng=rng) + + # pylint: disable=arguments-differ + def get_configuration_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + return self.benchset.get_opt_space(drop_fidelity_params=True, seed=seed) + + # pylint: disable=arguments-differ + def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + return self.benchset.get_fidelity_space(seed=seed) + + def _mo_objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: + + # No batch predicts, so we can grab the first item + out = self.benchset.objective_function({**configuration, **fidelity})[0] + # Convert to float for serialization + out = {k: float(v) for k, v in out.items()} + + # Get runtime name + cost = out[self.benchset.config.runtime_name] + + return {'function_value': out, + "cost": cost, + 'info': {'fidelity': fidelity}} + + @staticmethod + def get_meta_information(): + """ Returns the meta information for the benchmark """ + return {'name': 'YAHPO Gym', + 'references': ['@misc{pfisterer2021yahpo,', + 'title={YAHPO Gym -- Design Criteria and a new Multifidelity ' + ' Benchmark for Hyperparameter Optimization},', + 'author = {Florian Pfisterer and Lennart Schneider and' + ' Julia Moosbauer and Martin Binder' + ' and Bernd Bischl},', + 'eprint={2109.03670},', + 'archivePrefix={arXiv},', + 'year = {2021}}'], + 'code': 'https://github.com/pfistfl/yahpo_gym/yahpo_gym'} + + +class YAHPOGymMOBenchmark(YAHPOGymBaseBenchmark, AbstractMultiObjectiveBenchmark): + + def __init__(self, scenario: str, instance: str, objective: str = None, + data_dir: Union[Path, str, None] = None, + multi_thread: bool = True, + rng: Union[np.random.RandomState, int, None] = None): + """ + For a list of available scenarios and instances see + 'https://slds-lmu.github.io/yahpo_gym/scenarios.html' + Parameters + ---------- + scenario : str + Name for the surrogate data. Must be one of + ["lcbench", "fcnet", "nb301", "rbv2_svm", + "rbv2_ranger", "rbv2_rpart", "rbv2_glmnet", "rbv2_aknn", "rbv2_xgboost", "rbv2_super", + "fair_ranger", "fair_rpart", "fair_fgrrm", "fair_xgboost", "fair_super", + "iaml_ranger", "iaml_rpart", "iaml_glmnet", "iaml_xgboost", "iaml_super"] + instance : str + A valid instance for the scenario. See `self.benchset.instances`. + data_dir: Optional, str, Path + Directory, where the yahpo data is stored. + Download automatically from https://github.com/slds-lmu/yahpo_data/tree/fair + multi_thread: bool + Flag to run ONNX runtime with a single thread. Might be important on compute clusters. 
+ Defaults to True + rng : np.random.RandomState, int, None + """ + self.objective = objective + super(YAHPOGymMOBenchmark, self).__init__(scenario=scenario, instance=instance, rng=rng, data_dir=data_dir, multi_thread=multi_thread) + + @AbstractMultiObjectiveBenchmark.check_parameters + def objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: + + return self._mo_objective_function(configuration, fidelity, rng, **kwargs) + + @AbstractMultiObjectiveBenchmark.check_parameters + def objective_function_test(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[np.random.RandomState, int, None] = None, **kwargs) \ + -> Dict: + return self.objective_function(configuration, fidelity=fidelity, rng=rng) + + # pylint: disable=arguments-differ + def get_objective_names(self) -> List[str]: + return self.benchset.config.y_names + + +class YAHPOGymBenchmark(YAHPOGymBaseBenchmark, AbstractSingleObjectiveBenchmark): + + def __init__(self, scenario: str, instance: str, objective: str = None, + data_dir: Union[Path, str, None] = None, + multi_thread: bool = True, + rng: Union[np.random.RandomState, int, None] = None): + """ + For a list of available scenarios and instances see + 'https://slds-lmu.github.io/yahpo_gym/scenarios.html' + Parameters + ---------- + scenario : str + Name for the surrogate data. Must be one of + ["lcbench", "fcnet", "nb301", "rbv2_svm", + "rbv2_ranger", "rbv2_rpart", "rbv2_glmnet", "rbv2_aknn", "rbv2_xgboost", "rbv2_super", + "fair_ranger", "fair_rpart", "fair_fgrrm", "fair_xgboost", "fair_super", + "iaml_ranger", "iaml_rpart", "iaml_glmnet", "iaml_xgboost", "iaml_super"] + instance : str + A valid instance for the scenario. See `self.benchset.instances`. + objective : str + Name of the (single-crit) objective. See `self.benchset.config.y_names`. + Initialized to None, picks the first element in y_names. + data_dir: Optional, str, Path + Directory, where the yahpo data is stored. + Download automatically from https://github.com/slds-lmu/yahpo_data/tree/fair + multi_thread: bool + Flag to run ONNX runtime with a single thread. Might be important on compute clusters. + Defaults to True + rng : np.random.RandomState, int, None + """ + self.objective = objective + super(YAHPOGymBenchmark, self).__init__(scenario=scenario, instance=instance, rng=rng, data_dir=data_dir, multi_thread=multi_thread) + + @AbstractSingleObjectiveBenchmark.check_parameters + def objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, CS.Configuration, None] = None, + rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: + + mo_results = self._mo_objective_function(configuration=configuration, + fidelity=fidelity, + **kwargs) + + # If not objective is set, we just grab the first returned entry. 
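+        # The fallback is the first objective reported by the surrogate, i.e. self.benchset.config.y_names[0].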
+ if self.objective is None: + self.objective = self.benchset.config.y_names[0] + + obj_value = mo_results['function_value'][self.objective] + + return {'function_value': obj_value, + "cost": mo_results['cost'], + 'info': {'fidelity': fidelity, 'objectives': mo_results['function_value']}} + + @AbstractSingleObjectiveBenchmark.check_parameters + def objective_function_test(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, CS.Configuration, None] = None, + rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: + return self.objective_function(configuration, fidelity=fidelity, rng=rng) diff --git a/hpobench/config.py b/hpobench/config.py index 9d7964e0..cd46c6e5 100644 --- a/hpobench/config.py +++ b/hpobench/config.py @@ -64,7 +64,16 @@ def __init__(self): # Options for the singularity container self.socket_dir = Path(self.socket_dir).expanduser().absolute() - self.container_dir = self.cache_dir / f'hpobench-{os.getuid()}' + + # os.getuid is only for posix os. Make it compatible with windows + # https://stackoverflow.com/questions/842059/is-there-a-portable-way-to-get-the-current-username-in-python + if os.name == 'nt': + import getpass + user_name = getpass.getuser() + else: + user_name = os.getuid() + + self.container_dir = self.cache_dir / f'hpobench-{user_name}' self.container_source = 'oras://gitlab.tf.uni-freiburg.de:5050/muelleph/hpobench-registry' self.pyro_connect_max_wait = 400 diff --git a/hpobench/container/benchmarks/ml/__init__.py b/hpobench/container/benchmarks/ml/__init__.py index ed2ce40f..f342f5f8 100644 --- a/hpobench/container/benchmarks/ml/__init__.py +++ b/hpobench/container/benchmarks/ml/__init__.py @@ -6,7 +6,7 @@ from hpobench.container.benchmarks.ml.svm_benchmark import SVMBenchmark, SVMBenchmarkBB, SVMBenchmarkMF from hpobench.container.benchmarks.ml.tabular_benchmark import TabularBenchmark from hpobench.container.benchmarks.ml.xgboost_benchmark import XGBoostBenchmark, XGBoostBenchmarkBB, XGBoostBenchmarkMF - +from hpobench.container.benchmarks.ml.yahpo_benchmark import YAHPOGymRawBenchmark, YAHPOGymMORawBenchmark __all__ = ['HistGBBenchmark', 'HistGBBenchmarkBB', 'HistGBBenchmarkMF', 'LRBenchmark', 'LRBenchmarkBB', 'LRBenchmarkMF', @@ -14,4 +14,5 @@ 'RandomForestBenchmark', 'RandomForestBenchmarkBB', 'RandomForestBenchmarkMF', 'SVMBenchmark', 'SVMBenchmarkBB', 'SVMBenchmarkMF', 'TabularBenchmark', - 'XGBoostBenchmark', 'XGBoostBenchmarkBB', 'XGBoostBenchmarkMF'] + 'XGBoostBenchmark', 'XGBoostBenchmarkBB', 'XGBoostBenchmarkMF', + 'YAHPOGymRawBenchmark', 'YAHPOGymMORawBenchmark'] diff --git a/hpobench/container/benchmarks/ml/lr_benchmark.py b/hpobench/container/benchmarks/ml/lr_benchmark.py index 979cda3e..61b80a13 100644 --- a/hpobench/container/benchmarks/ml/lr_benchmark.py +++ b/hpobench/container/benchmarks/ml/lr_benchmark.py @@ -6,27 +6,31 @@ from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient +container_name = "ml_mmfb" +container_version = "0.0.4" + + class LRBenchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'LRBenchmark') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['container_name'] = kwargs.get('container_name', container_name) + kwargs['latest'] = kwargs.get('container_tag', container_version) super(LRBenchmark, self).__init__(**kwargs) class LRBenchmarkBB(AbstractBenchmarkClient): def __init__(self, **kwargs): 
kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'LRBenchmarkBB') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['container_name'] = kwargs.get('container_name', container_name) + kwargs['latest'] = kwargs.get('container_tag', container_version) super(LRBenchmarkBB, self).__init__(**kwargs) class LRBenchmarkMF(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'LRBenchmarkMF') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['container_name'] = kwargs.get('container_name', container_name) + kwargs['latest'] = kwargs.get('container_tag', container_version) super(LRBenchmarkMF, self).__init__(**kwargs) diff --git a/hpobench/container/benchmarks/ml/nn_benchmark.py b/hpobench/container/benchmarks/ml/nn_benchmark.py index 04955e82..d4b0f52a 100644 --- a/hpobench/container/benchmarks/ml/nn_benchmark.py +++ b/hpobench/container/benchmarks/ml/nn_benchmark.py @@ -6,27 +6,31 @@ from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient +container_name = "ml_mmfb" +container_version = "0.0.4" + + class NNBenchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'NNBenchmark') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['container_name'] = kwargs.get('container_name', container_name) + kwargs['latest'] = kwargs.get('container_tag', container_version) super(NNBenchmark, self).__init__(**kwargs) class NNBenchmarkBB(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'NNBenchmarkBB') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['container_name'] = kwargs.get('container_name', container_name) + kwargs['latest'] = kwargs.get('container_tag', container_version) super(NNBenchmarkBB, self).__init__(**kwargs) class NNBenchmarkMF(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'NNBenchmarkMF') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['container_name'] = kwargs.get('container_name', container_name) + kwargs['latest'] = kwargs.get('container_tag', container_version) super(NNBenchmarkMF, self).__init__(**kwargs) diff --git a/hpobench/container/benchmarks/ml/rf_benchmark.py b/hpobench/container/benchmarks/ml/rf_benchmark.py index a414349d..13e9bb47 100644 --- a/hpobench/container/benchmarks/ml/rf_benchmark.py +++ b/hpobench/container/benchmarks/ml/rf_benchmark.py @@ -6,27 +6,31 @@ from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient +container_name = "ml_mmfb" +container_version = "0.0.4" + + class RandomForestBenchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'RandomForestBenchmark') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['container_name'] = kwargs.get('container_name', container_name) + kwargs['latest'] = kwargs.get('container_tag', container_version) super(RandomForestBenchmark, 
self).__init__(**kwargs) class RandomForestBenchmarkBB(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'RandomForestBenchmarkBB') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['container_name'] = kwargs.get('container_name', container_name) + kwargs['latest'] = kwargs.get('container_tag', container_version) super(RandomForestBenchmarkBB, self).__init__(**kwargs) class RandomForestBenchmarkMF(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'RandomForestBenchmarkMF') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['container_name'] = kwargs.get('container_name', container_name) + kwargs['latest'] = kwargs.get('container_tag', container_version) super(RandomForestBenchmarkMF, self).__init__(**kwargs) diff --git a/hpobench/container/benchmarks/ml/svm_benchmark.py b/hpobench/container/benchmarks/ml/svm_benchmark.py index 7547a81a..7a20f40b 100644 --- a/hpobench/container/benchmarks/ml/svm_benchmark.py +++ b/hpobench/container/benchmarks/ml/svm_benchmark.py @@ -6,27 +6,31 @@ from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient +container_name = "ml_mmfb" +container_version = "0.0.4" + + class SVMBenchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'SVMBenchmark') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['container_name'] = kwargs.get('container_name', container_name) + kwargs['latest'] = kwargs.get('container_tag', container_version) super(SVMBenchmark, self).__init__(**kwargs) class SVMBenchmarkMF(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'SVMBenchmarkMF') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['container_name'] = kwargs.get('container_name', container_name) + kwargs['latest'] = kwargs.get('container_tag', container_version) super(SVMBenchmarkMF, self).__init__(**kwargs) class SVMBenchmarkBB(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'SVMBenchmarkBB') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['container_name'] = kwargs.get('container_name', container_name) + kwargs['latest'] = kwargs.get('container_tag', container_version) super(SVMBenchmarkBB, self).__init__(**kwargs) diff --git a/hpobench/container/benchmarks/ml/svm_benchmark_old.py b/hpobench/container/benchmarks/ml/svm_benchmark_old.py deleted file mode 100644 index 4955f057..00000000 --- a/hpobench/container/benchmarks/ml/svm_benchmark_old.py +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/bin/python3 -# -*- coding: utf-8 -*- - -""" Benchmark for the XGBoost Benchmark from hpobench/benchmarks/ml/xgboost_benchmark """ - -from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient - - -class SupportVectorMachine(AbstractBenchmarkClient): - def __init__(self, task_id: int, **kwargs): - kwargs['task_id'] = task_id - kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'SupportVectorMachine') - 
kwargs['container_name'] = kwargs.get('container_name', 'svm_benchmark') - kwargs['latest'] = kwargs.get('container_tag', '0.0.3') - super(SupportVectorMachine, self).__init__(**kwargs) diff --git a/hpobench/container/benchmarks/ml/tabular_benchmark.py b/hpobench/container/benchmarks/ml/tabular_benchmark.py index 6d19953b..5c8a22ef 100644 --- a/hpobench/container/benchmarks/ml/tabular_benchmark.py +++ b/hpobench/container/benchmarks/ml/tabular_benchmark.py @@ -6,11 +6,15 @@ from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient +container_name = "ml_tabular_benchmarks" +container_version = "0.0.4" + + class TabularBenchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'TabularBenchmark') - kwargs['container_name'] = kwargs.get('container_name', 'ml_tabular_benchmarks') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['container_name'] = kwargs.get('container_name', container_name) + kwargs['latest'] = kwargs.get('container_tag', container_version) super(TabularBenchmark, self).__init__(**kwargs) diff --git a/hpobench/container/benchmarks/ml/xgboost_benchmark.py b/hpobench/container/benchmarks/ml/xgboost_benchmark.py index c82ea606..726d6f45 100644 --- a/hpobench/container/benchmarks/ml/xgboost_benchmark.py +++ b/hpobench/container/benchmarks/ml/xgboost_benchmark.py @@ -6,36 +6,42 @@ from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient +container_name = "ml_mmfb" +container_version = "0.0.4" + + class XGBoostBenchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'XGBoostBenchmark') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['container_name'] = kwargs.get('container_name', container_name) + kwargs['latest'] = kwargs.get('container_tag', container_version) super(XGBoostBenchmark, self).__init__(**kwargs) class XGBoostBenchmarkBB(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'XGBoostBenchmarkBB') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['container_name'] = kwargs.get('container_name', container_name) + kwargs['latest'] = kwargs.get('container_tag', container_version) super(XGBoostBenchmarkBB, self).__init__(**kwargs) class XGBoostBenchmarkMF(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'XGBoostBenchmarkMF') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['container_name'] = kwargs.get('container_name', container_name) + kwargs['latest'] = kwargs.get('container_tag', container_version) super(XGBoostBenchmarkMF, self).__init__(**kwargs) class XGBoostSearchSpace3Benchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'XGBoostSearchSpace3Benchmark') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['container_name'] = kwargs.get('container_name', container_name) + kwargs['latest'] = kwargs.get('container_tag', container_version) super(XGBoostSearchSpace3Benchmark, self).__init__(**kwargs) -__all__ = ['XGBoostBenchmark', 'XGBoostBenchmarkBB', 
'XGBoostBenchmarkMF'] +__all__ = [ + 'XGBoostBenchmark', 'XGBoostBenchmarkBB', 'XGBoostBenchmarkMF', 'XGBoostSearchSpace3Benchmark' +] diff --git a/hpobench/container/benchmarks/ml/xgboost_benchmark_old.py b/hpobench/container/benchmarks/ml/xgboost_benchmark_old.py deleted file mode 100644 index df475748..00000000 --- a/hpobench/container/benchmarks/ml/xgboost_benchmark_old.py +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/python3 -# -*- coding: utf-8 -*- - -""" Benchmark for the XGBoost Benchmark from hpobench/benchmarks/ml/xgboost_benchmark """ - -from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient - - -class XGBoostBenchmark(AbstractBenchmarkClient): - def __init__(self, task_id: int, **kwargs): - kwargs['task_id'] = task_id - kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'XGBoostBenchmark') - kwargs['container_name'] = kwargs.get('container_name', 'xgboost_benchmark') - kwargs['latest'] = kwargs.get('container_tag', '0.0.3') - super(XGBoostBenchmark, self).__init__(**kwargs) - - -class XGBoostExtendedBenchmark(AbstractBenchmarkClient): - def __init__(self, task_id: int, **kwargs): - kwargs['task_id'] = task_id - kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'XGBoostExtendedBenchmark') - kwargs['container_name'] = kwargs.get('container_name', 'xgboost_benchmark') - kwargs['latest'] = kwargs.get('container_tag', '0.0.3') - super(XGBoostExtendedBenchmark, self).__init__(**kwargs) diff --git a/hpobench/container/benchmarks/ml/yahpo_benchmark.py b/hpobench/container/benchmarks/ml/yahpo_benchmark.py new file mode 100644 index 00000000..e4d9cf0c --- /dev/null +++ b/hpobench/container/benchmarks/ml/yahpo_benchmark.py @@ -0,0 +1,21 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- + +from hpobench.container.client_abstract_benchmark import AbstractMOBenchmarkClient, \ + AbstractBenchmarkClient + + +class YAHPOGymMORawBenchmark(AbstractMOBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'YAHPOGymMORawBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'yahpo_raw') + kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + super(YAHPOGymMORawBenchmark, self).__init__(**kwargs) + + +class YAHPOGymRawBenchmark(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'YAHPOGymRawBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'yahpo_raw') + kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + super(YAHPOGymRawBenchmark, self).__init__(**kwargs) diff --git a/hpobench/container/benchmarks/mo/__init__.py b/hpobench/container/benchmarks/mo/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/hpobench/container/benchmarks/mo/adult_benchmark.py b/hpobench/container/benchmarks/mo/adult_benchmark.py new file mode 100644 index 00000000..34baf1b9 --- /dev/null +++ b/hpobench/container/benchmarks/mo/adult_benchmark.py @@ -0,0 +1,12 @@ +""" Benchmark for the Multi-Objective Adult Benchmark from hpobench/benchmarks/mo/adult_benchmark.py +""" + +from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient + + +class AdultBenchmark(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'AdultBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'fair_adult') + kwargs['latest'] = kwargs.get('container_tag', '0.0.2') + super(AdultBenchmark, self).__init__(**kwargs) diff --git 
a/hpobench/container/benchmarks/mo/cnn_benchmark.py b/hpobench/container/benchmarks/mo/cnn_benchmark.py new file mode 100644 index 00000000..9e5cfe6f --- /dev/null +++ b/hpobench/container/benchmarks/mo/cnn_benchmark.py @@ -0,0 +1,22 @@ +""" Benchmark for the Multi-Objective CNN Benchmark from hpobench/benchmarks/mo/cnn_benchmark.py +""" + +from hpobench.container.client_abstract_benchmark import AbstractMOBenchmarkClient + + +class FlowerCNNBenchmark(AbstractMOBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'FlowerCNNBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'mo_cnn') + kwargs['latest'] = kwargs.get('container_tag', '0.0.2') + kwargs['gpu'] = kwargs.get('gpu', True) + super(FlowerCNNBenchmark, self).__init__(**kwargs) + + +class FashionCNNBenchmark(AbstractMOBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'FashionCNNBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'mo_cnn') + kwargs['latest'] = kwargs.get('container_tag', '0.0.2') + kwargs['gpu'] = kwargs.get('gpu', True) + super(FashionCNNBenchmark, self).__init__(**kwargs) diff --git a/hpobench/container/benchmarks/nas/nasbench_101.py b/hpobench/container/benchmarks/nas/nasbench_101.py index 7984d786..a47e96a2 100644 --- a/hpobench/container/benchmarks/nas/nasbench_101.py +++ b/hpobench/container/benchmarks/nas/nasbench_101.py @@ -3,14 +3,14 @@ """ Benchmark for the Tabular Benchmark from hpobench/benchmarks/nas/nasbench_101.py """ -from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient +from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient, AbstractMOBenchmarkClient class NASCifar10ABenchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'NASCifar10ABenchmark') kwargs['container_name'] = kwargs.get('container_name', 'nasbench_101') - kwargs['latest'] = kwargs.get('container_tag', '0.0.4') + kwargs['latest'] = kwargs.get('container_tag', '0.0.5') super(NASCifar10ABenchmark, self).__init__(**kwargs) @@ -18,7 +18,7 @@ class NASCifar10BBenchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'NASCifar10BBenchmark') kwargs['container_name'] = kwargs.get('container_name', 'nasbench_101') - kwargs['latest'] = kwargs.get('container_tag', '0.0.4') + kwargs['latest'] = kwargs.get('container_tag', '0.0.5') super(NASCifar10BBenchmark, self).__init__(**kwargs) @@ -26,5 +26,29 @@ class NASCifar10CBenchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'NASCifar10CBenchmark') kwargs['container_name'] = kwargs.get('container_name', 'nasbench_101') - kwargs['latest'] = kwargs.get('container_tag', '0.0.4') + kwargs['latest'] = kwargs.get('container_tag', '0.0.5') super(NASCifar10CBenchmark, self).__init__(**kwargs) + + +class NASCifar10AMOBenchmark(AbstractMOBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'NASCifar10AMOBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'nasbench_101') + kwargs['latest'] = kwargs.get('container_tag', '0.0.5') + super(NASCifar10AMOBenchmark, self).__init__(**kwargs) + + +class NASCifar10BMOBenchmark(AbstractMOBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 
'NASCifar10BMOBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'nasbench_101') + kwargs['latest'] = kwargs.get('container_tag', '0.0.5') + super(NASCifar10BMOBenchmark, self).__init__(**kwargs) + + +class NASCifar10CMOBenchmark(AbstractMOBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'NASCifar10CMOBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'nasbench_101') + kwargs['latest'] = kwargs.get('container_tag', '0.0.5') + super(NASCifar10CMOBenchmark, self).__init__(**kwargs) diff --git a/hpobench/container/benchmarks/nas/nasbench_1shot1.py b/hpobench/container/benchmarks/nas/nasbench_1shot1.py index a88dcf9a..bef0bf16 100644 --- a/hpobench/container/benchmarks/nas/nasbench_1shot1.py +++ b/hpobench/container/benchmarks/nas/nasbench_1shot1.py @@ -3,14 +3,14 @@ """ Benchmark for the nasbench 1shot1 benchmarks from hpobench/benchmarks/nas/nasbench_1shot1.py """ -from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient +from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient, AbstractMOBenchmarkClient class NASBench1shot1SearchSpace1Benchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'NASBench1shot1SearchSpace1Benchmark') kwargs['container_name'] = kwargs.get('container_name', 'nasbench_1shot1') - kwargs['latest'] = kwargs.get('container_tag', '0.0.4') + kwargs['latest'] = kwargs.get('container_tag', '0.0.5') super(NASBench1shot1SearchSpace1Benchmark, self).__init__(**kwargs) @@ -18,7 +18,7 @@ class NASBench1shot1SearchSpace2Benchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'NASBench1shot1SearchSpace2Benchmark') kwargs['container_name'] = kwargs.get('container_name', 'nasbench_1shot1') - kwargs['latest'] = kwargs.get('container_tag', '0.0.4') + kwargs['latest'] = kwargs.get('container_tag', '0.0.5') super(NASBench1shot1SearchSpace2Benchmark, self).__init__(**kwargs) @@ -26,5 +26,29 @@ class NASBench1shot1SearchSpace3Benchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'NASBench1shot1SearchSpace3Benchmark') kwargs['container_name'] = kwargs.get('container_name', 'nasbench_1shot1') - kwargs['latest'] = kwargs.get('container_tag', '0.0.4') + kwargs['latest'] = kwargs.get('container_tag', '0.0.5') super(NASBench1shot1SearchSpace3Benchmark, self).__init__(**kwargs) + + +class NASBench1shot1SearchSpace1MOBenchmark(AbstractMOBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'NASBench1shot1SearchSpace1MOBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'nasbench_1shot1') + kwargs['latest'] = kwargs.get('container_tag', '0.0.5') + super(NASBench1shot1SearchSpace1MOBenchmark, self).__init__(**kwargs) + + +class NASBench1shot1SearchSpace2MOBenchmark(AbstractMOBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'NASBench1shot1SearchSpace2MOBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'nasbench_1shot1') + kwargs['latest'] = kwargs.get('container_tag', '0.0.5') + super(NASBench1shot1SearchSpace2MOBenchmark, self).__init__(**kwargs) + + +class NASBench1shot1SearchSpace3MOBenchmark(AbstractMOBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 
'NASBench1shot1SearchSpace3MOBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'nasbench_1shot1') + kwargs['latest'] = kwargs.get('container_tag', '0.0.5') + super(NASBench1shot1SearchSpace3MOBenchmark, self).__init__(**kwargs) diff --git a/hpobench/container/benchmarks/nas/nasbench_201.py b/hpobench/container/benchmarks/nas/nasbench_201.py index 5eb9c68f..83b6f488 100644 --- a/hpobench/container/benchmarks/nas/nasbench_201.py +++ b/hpobench/container/benchmarks/nas/nasbench_201.py @@ -10,7 +10,7 @@ class Cifar10ValidNasBench201Benchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'Cifar10ValidNasBench201Benchmark') kwargs['container_name'] = kwargs.get('container_name', 'nasbench_201') - kwargs['latest'] = kwargs.get('container_tag', '0.0.5') + kwargs['latest'] = kwargs.get('container_tag', '0.0.6') super(Cifar10ValidNasBench201Benchmark, self).__init__(**kwargs) @@ -18,7 +18,7 @@ class Cifar100NasBench201Benchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'Cifar100NasBench201Benchmark') kwargs['container_name'] = kwargs.get('container_name', 'nasbench_201') - kwargs['latest'] = kwargs.get('container_tag', '0.0.5') + kwargs['latest'] = kwargs.get('container_tag', '0.0.6') super(Cifar100NasBench201Benchmark, self).__init__(**kwargs) @@ -26,7 +26,7 @@ class ImageNetNasBench201Benchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'ImageNetNasBench201Benchmark') kwargs['container_name'] = kwargs.get('container_name', 'nasbench_201') - kwargs['latest'] = kwargs.get('container_tag', '0.0.5') + kwargs['latest'] = kwargs.get('container_tag', '0.0.6') super(ImageNetNasBench201Benchmark, self).__init__(**kwargs) @@ -34,7 +34,7 @@ class Cifar10ValidNasBench201BenchmarkOriginal(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'Cifar10ValidNasBench201BenchmarkOriginal') kwargs['container_name'] = kwargs.get('container_name', 'nasbench_201') - kwargs['latest'] = kwargs.get('container_tag', '0.0.5') + kwargs['latest'] = kwargs.get('container_tag', '0.0.6') super(Cifar10ValidNasBench201BenchmarkOriginal, self).__init__(**kwargs) @@ -42,7 +42,7 @@ class Cifar100NasBench201BenchmarkOriginal(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'Cifar100NasBench201BenchmarkOriginal') kwargs['container_name'] = kwargs.get('container_name', 'nasbench_201') - kwargs['latest'] = kwargs.get('container_tag', '0.0.5') + kwargs['latest'] = kwargs.get('container_tag', '0.0.6') super(Cifar100NasBench201BenchmarkOriginal, self).__init__(**kwargs) @@ -50,13 +50,40 @@ class ImageNetNasBench201BenchmarkOriginal(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'ImageNetNasBench201BenchmarkOriginal') kwargs['container_name'] = kwargs.get('container_name', 'nasbench_201') - kwargs['latest'] = kwargs.get('container_tag', '0.0.5') + kwargs['latest'] = kwargs.get('container_tag', '0.0.6') super(ImageNetNasBench201BenchmarkOriginal, self).__init__(**kwargs) +class Cifar10ValidNasBench201MOBenchmark(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'Cifar10ValidNasBench201MOBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'nasbench_201') + 
kwargs['latest'] = kwargs.get('container_tag', '0.0.6') + super(Cifar10ValidNasBench201MOBenchmark, self).__init__(**kwargs) + + +class Cifar100NasBench201MOBenchmark(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'Cifar100NasBench201MOBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'nasbench_201') + kwargs['latest'] = kwargs.get('container_tag', '0.0.6') + super(Cifar100NasBench201MOBenchmark, self).__init__(**kwargs) + + +class ImageNetNasBench201MOBenchmark(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'ImageNetNasBench201MOBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'nasbench_201') + kwargs['latest'] = kwargs.get('container_tag', '0.0.6') + super(ImageNetNasBench201MOBenchmark, self).__init__(**kwargs) + + __all__ = ["Cifar10ValidNasBench201Benchmark", "Cifar100NasBench201Benchmark", "ImageNetNasBench201Benchmark", "Cifar10ValidNasBench201BenchmarkOriginal", "Cifar100NasBench201BenchmarkOriginal", - "ImageNetNasBench201BenchmarkOriginal"] + "ImageNetNasBench201BenchmarkOriginal", + "Cifar10ValidNasBench201MOBenchmark", + "Cifar100NasBench201MOBenchmark", + "ImageNetNasBench201MOBenchmark"] diff --git a/hpobench/container/benchmarks/od/__init__.py b/hpobench/container/benchmarks/od/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/hpobench/container/benchmarks/surrogates/yahpo_gym.py b/hpobench/container/benchmarks/surrogates/yahpo_gym.py new file mode 100644 index 00000000..64cee463 --- /dev/null +++ b/hpobench/container/benchmarks/surrogates/yahpo_gym.py @@ -0,0 +1,20 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- + +from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient, AbstractMOBenchmarkClient + + +class YAHPOGymBenchmark(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'YAHPOGymBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'yahpo_gym') + kwargs['latest'] = kwargs.get('container_tag', '0.0.2') + super(YAHPOGymBenchmark, self).__init__(**kwargs) + + +class YAHPOGymMOBenchmark(AbstractMOBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'YAHPOGymMOBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'yahpo_gym') + kwargs['latest'] = kwargs.get('container_tag', '0.0.2') + super(YAHPOGymMOBenchmark, self).__init__(**kwargs) diff --git a/hpobench/container/client_abstract_benchmark.py b/hpobench/container/client_abstract_benchmark.py index 4a9eb96c..d2963c00 100644 --- a/hpobench/container/client_abstract_benchmark.py +++ b/hpobench/container/client_abstract_benchmark.py @@ -14,22 +14,21 @@ The name of the container (``container_name``) is defined either in its belonging container-benchmark definition. (hpobench/container// or via ``container_name``. 
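# Illustrative sketch for the containerized YAHPO Gym clients declared above.
# YAHPOGymMOBenchmark inherits from AbstractMOBenchmarkClient (added further down
# in this diff), whose get_objective_names() decodes the JSON list served by the
# benchmark container. Scenario/instance are example values; a working Singularity
# installation is assumed.
from hpobench.container.benchmarks.surrogates.yahpo_gym import YAHPOGymMOBenchmark

bench = YAHPOGymMOBenchmark(scenario="lcbench", instance="3945", rng=0)
print(bench.get_objective_names())        # objective names decoded from the server's JSON reply

config = bench.get_configuration_space(seed=0).sample_configuration()
result = bench.objective_function(configuration=config)
print(result["function_value"])           # one value per objective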
""" -import os import abc -import sys import json import logging +import os import subprocess +import sys import time from pathlib import Path -from typing import Optional -from typing import Union, Dict +from typing import Optional, Union, Dict, List, Tuple from uuid import uuid1 import ConfigSpace as CS import Pyro4 -import Pyro4.util import Pyro4.errors +import Pyro4.util import numpy as np from ConfigSpace.read_and_write import json as csjson from oslo_concurrency import lockutils @@ -512,3 +511,9 @@ def __del__(self): def _id_generator() -> str: """ Helper function: Creates unique socket ids for the benchmark server """ return str(uuid1()) + + +class AbstractMOBenchmarkClient(AbstractBenchmarkClient): + def get_objective_names(self) -> Union[Tuple, List, Dict]: + json_str = self.benchmark.get_objective_names() + return json.loads(json_str, cls=BenchmarkDecoder) diff --git a/hpobench/container/recipes/ml/Singularity.YahpoRawBenchmark b/hpobench/container/recipes/ml/Singularity.YahpoRawBenchmark new file mode 100644 index 00000000..e79dab4b --- /dev/null +++ b/hpobench/container/recipes/ml/Singularity.YahpoRawBenchmark @@ -0,0 +1,82 @@ +Bootstrap: docker +From: rpy2/rpy2:latest + + +%labels +MAINTAINER pfistererf@googlemail.com +VERSION v0.0.1 + +%help + This is the recipe for the Raw YAHPO Benchmarks. + + +%post + cd /home + + ####################### INSTALL THE R + BASE DEPENDENCIES ################# + FILE="libssl1.1_1.1.1f-1ubuntu2_amd64.deb" + wget http://archive.ubuntu.com/ubuntu/pool/main/o/openssl/${FILE} + sudo dpkg -i ${FILE} + + FILE="libssl-dev_1.1.1f-1ubuntu2_amd64.deb" + wget http://archive.ubuntu.com/ubuntu/pool/main/o/openssl/${FILE} + sudo dpkg -i ${FILE} + + sudo apt-get install openssl + sudo apt-get install libcurl4-openssl-dev git + + # Instal R-Packages + cd /home \ + && Rscript -e 'install.packages("remotes", repos = "http://cran.r-project.org")' + + # Install OpenML dependencies + Rscript -e 'install.packages("curl", repos = "http://cran.r-project.org")' \ + && Rscript -e 'install.packages("openssl", repos = "http://cran.r-project.org")' \ + && Rscript -e 'install.packages("httr", repos = "http://cran.r-project.org")' \ + && Rscript -e 'install.packages("farff", repos = "http://cran.r-project.org")' \ + && Rscript -e 'install.packages("OpenML", repos = "http://cran.r-project.org")' + + # Install rbv2 dependencies + Rscript -e 'remotes::install_version("BBmisc", version = "1.11", upgrade = "never", repos = "http://cran.r-project.org")' \ + && Rscript -e 'remotes::install_version("rpart", version = "4.1-13", upgrade = "never", repos = "http://cran.r-project.org")' \ + && Rscript -e 'remotes::install_version("e1071", version = "1.7-0.1", upgrade = "never", repos = "http://cran.r-project.org")' \ + && Rscript -e 'remotes::install_version("xgboost", version = "0.82.1", upgrade = "never", repos = "http://cran.r-project.org")' \ + && Rscript -e 'remotes::install_version("ranger", version = "0.11.2", upgrade = "never", repos = "http://cran.r-project.org")' \ + && Rscript -e 'remotes::install_version("RcppHNSW", version = "0.1.0", upgrade = "never", repos = "http://cran.r-project.org")' \ + && Rscript -e 'remotes::install_version("mlr", version = "2.14", upgrade = "never", repos = "http://cran.r-project.org")' \ + && Rscript -e 'remotes::install_github("mlr-org/mlr3misc", upgrade = "never", repos = "http://cran.r-project.org")' \ + && Rscript -e 'remotes::install_version("mlrCPO", version = "0.3.6", upgrade = "never", repos = "http://cran.r-project.org")' \ + && Rscript 
-e 'remotes::install_version("testthat", version = "3.1.4", upgrade = "never", repos = "http://cran.r-project.org")' \ + && Rscript -e 'remotes::install_version("glmnet", version = "4.1-3", upgrade = "never", repos = "http://cran.r-project.org")' + # ################################ BASE DEPENDENCIES ################################ + + Rscript -e 'remotes::install_github("pfistfl/rbv2", upgrade = "never", dependencies = True)' \ + && Rscript -e 'remotes::install_github("sumny/iaml", upgrade = "never", dependencies = True)' \ + && Rscript -e 'remotes::install_github("sumny/fair", upgrade = "never", dependencies = True)' + + cd /home \ + && mkdir data && cd data \ + && git clone --depth 1 -b main https://github.com/pfistfl/yahpo_data.git \ + + # Upgrade pip + python3 -m pip install --upgrade pip + + # Install HPOBench + cd /home \ + && git clone https://github.com/automl/HPOBench.git \ + && cd HPOBench \ + && git checkout development \ + && echo "Please never push a recipe that checks out any other branch than development or master" \ + && pip uninstall -y rpy2 \ + && pip install .[yahpo_gym_raw] + # && git checkout development \ + + # Clean Up. + echo "Please don't touch the following lines" \ + && mkdir /var/lib/hpobench/ \ + && chmod -R 777 /var/lib/hpobench/ \ + && rm -rf /var/lib/apt/lists/* \ + && pip cache purge \ + +%runscript + python3 -s /home/HPOBench/hpobench/container/server_abstract_benchmark.py ml.yahpo_benchmark $@ \ No newline at end of file diff --git a/hpobench/container/recipes/ml/Singularity.ml_tabular_benchmark b/hpobench/container/recipes/ml/Singularity.ml_tabular_benchmarks similarity index 100% rename from hpobench/container/recipes/ml/Singularity.ml_tabular_benchmark rename to hpobench/container/recipes/ml/Singularity.ml_tabular_benchmarks diff --git a/hpobench/container/recipes/ml/Singularity.rbv2Benchmark b/hpobench/container/recipes/ml/Singularity.rbv2Benchmark new file mode 100644 index 00000000..e69de29b diff --git a/hpobench/container/recipes/mo/Singularity.AdultBenchmark b/hpobench/container/recipes/mo/Singularity.AdultBenchmark new file mode 100644 index 00000000..d373caa2 --- /dev/null +++ b/hpobench/container/recipes/mo/Singularity.AdultBenchmark @@ -0,0 +1,25 @@ +Bootstrap: docker +From: python:3.7-slim + +%labels +MAINTAINER sharmaa@informatik.uni-freiburg.de +VERSION v0.0.1 + +%post + apt update -y + apt install build-essential git wget -y + + cd /home \ + && git clone https://github.com/automl/HPOBench.git \ + && cd HPOBench \ + && git checkout master \ + && pip install .[mo_adult] \ + && cd / \ + && mkdir /var/lib/hpobench/ \ + && chmod -R 777 /var/lib/hpobench/ \ + && rm -rf /var/lib/apt/lists/* \ + && pip cache purge + + +%runscript + python -s /home/HPOBench/hpobench/container/server_abstract_benchmark.py mo.adult_benchmark $@ \ No newline at end of file diff --git a/hpobench/container/recipes/mo/Singularity.CNNBenchmark b/hpobench/container/recipes/mo/Singularity.CNNBenchmark new file mode 100644 index 00000000..c9870968 --- /dev/null +++ b/hpobench/container/recipes/mo/Singularity.CNNBenchmark @@ -0,0 +1,26 @@ +Bootstrap: docker +From: python:3.7-slim + +%labels +MAINTAINER sharmaa@informatik.uni-freiburg.de +VERSION v0.0.1 + +%post + apt update -y + apt install build-essential git wget -y + + cd /home \ + && cd /home \ + && git clone https://github.com/automl/HPOBench.git \ + && cd HPOBench \ + && git checkout master \ + && pip install .[mo_cnn] \ + && cd / \ + && mkdir /var/lib/hpobench/ \ + && chmod -R 777 /var/lib/hpobench/ \ + && rm -rf 
/var/lib/apt/lists/* \ + && pip cache purge + + +%runscript + python -s /home/HPOBench/hpobench/container/server_abstract_benchmark.py mo.cnn_benchmark $@ \ No newline at end of file diff --git a/hpobench/container/recipes/surrogates/Singularity.YAHPOGymBenchmark b/hpobench/container/recipes/surrogates/Singularity.YAHPOGymBenchmark new file mode 100644 index 00000000..98914ed1 --- /dev/null +++ b/hpobench/container/recipes/surrogates/Singularity.YAHPOGymBenchmark @@ -0,0 +1,39 @@ +Bootstrap: docker +From: python:3.7-slim + +%labels +MAINTAINER pfistererf@googlemail.com +VERSION v0.0.2 + +%help + This is a template for a Singularity recipe + +%environment + YAHPO_CONTAINER=1 + export YAHPO_CONTAINER + +%post + apt update -y + apt install build-essential git wget -y + + /usr/local/bin/python -m pip install --upgrade pip + + cd /home \ + && mkdir data && cd data \ + && git clone --depth 1 -b fair https://github.com/slds-lmu/yahpo_data.git + + cd /home \ + && git clone https://github.com/automl/HPOBench.git \ + && cd HPOBench \ + && echo "Please never push a recipe that checks out any other branch than development or master" \ + && git checkout master \ + && pip install .[yahpo_gym] \ + && echo "Please don't touch the following lines" \ + && cd / \ + && mkdir /var/lib/hpobench/ \ + && chmod -R 777 /var/lib/hpobench/ \ + && rm -rf /var/lib/apt/lists/* \ + && pip cache purge \ + +%runscript + python -s /home/HPOBench/hpobench/container/server_abstract_benchmark.py surrogates.yahpo_gym $@ diff --git a/hpobench/container/server_abstract_benchmark.py b/hpobench/container/server_abstract_benchmark.py index cad09792..de828300 100644 --- a/hpobench/container/server_abstract_benchmark.py +++ b/hpobench/container/server_abstract_benchmark.py @@ -104,6 +104,14 @@ def get_meta_information(self): logger.debug('Server: get_meta_info called') return json.dumps(self.benchmark.get_meta_information(), indent=None, cls=BenchmarkEncoder) + def get_objective_names(self): + logger.debug('Server: get_objective_names called') + if hasattr(self.benchmark, 'get_objective_names'): + return json.dumps(self.benchmark.get_objective_names(), indent=None, cls=BenchmarkEncoder) + else: + logger.warning('Server: This is not a MO Benchmark. 
The `get_objective_names` function is not implemented.') + return '' + @Pyro4.oneway # in case call returns much later than daemon.shutdown def shutdown(self): logger.debug('Server: Shutting down...') diff --git a/hpobench/dependencies/ml/data_manager.py b/hpobench/dependencies/ml/data_manager.py index 526c6756..ebc48c95 100644 --- a/hpobench/dependencies/ml/data_manager.py +++ b/hpobench/dependencies/ml/data_manager.py @@ -1,20 +1,20 @@ -import openml -import numpy as np -import pandas as pd -from typing import Union from pathlib import Path +from typing import Union +import numpy as np +import openml +import pandas as pd +from oslo_concurrency import lockutils +from sklearn.compose import ColumnTransformer from sklearn.impute import SimpleImputer +from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline -from sklearn.utils import check_random_state -from sklearn.compose import ColumnTransformer from sklearn.preprocessing import OneHotEncoder from sklearn.preprocessing import StandardScaler -from sklearn.model_selection import train_test_split -from oslo_concurrency import lockutils +from sklearn.utils import check_random_state -from hpobench.util.data_manager import DataManager from hpobench import config_file +from hpobench.util.data_manager import DataManager class OpenMLDataManager(DataManager): diff --git a/hpobench/dependencies/ml/ml_benchmark_template.py b/hpobench/dependencies/ml/ml_benchmark_template.py index 3c6fcdaf..1ffe7b9e 100644 --- a/hpobench/dependencies/ml/ml_benchmark_template.py +++ b/hpobench/dependencies/ml/ml_benchmark_template.py @@ -33,25 +33,34 @@ class MLBenchmark(AbstractBenchmark): def __init__( self, task_id: int, - rng: Union[np.random.RandomState, int, None] = None, valid_size: float = 0.33, + rng: Union[np.random.RandomState, int, None] = None, data_path: Union[str, Path, None] = None, global_seed: int = 1 ): + """ Base template for the ML multi-fidelity benchmarks. + + Parameters + ---------- + task_id : int + A valid OpenML Task ID. + valid_size : float + The fraction of training set to be used as validation split. + rng : np.random.RandomState, int (optional) + The random seed that will be passed to the ML model if not explicitly passed. + data_path : str, Path (optional) + The path from where the training-validation-testing splits may be loaded. + global_seed : int + The fixed global seed that is used for creating validation splits if not available. 
+ """ super(MLBenchmark, self).__init__(rng=rng) - if isinstance(rng, int): - self.seed = rng - else: - self.seed = self.rng.randint(1, 10**6) - self.global_seed = global_seed # used for fixed training-validation splits self.task_id = task_id self.valid_size = valid_size - self.scorers = dict() - for k, v in metrics.items(): - self.scorers[k] = make_scorer(v, **metrics_kwargs[k]) + self.scorers = metrics + self.scorer_args = metrics_kwargs if data_path is None: from hpobench import config_file @@ -59,7 +68,7 @@ def __init__( self.data_path = Path(data_path) - dm = OpenMLDataManager(task_id, valid_size, data_path, global_seed) + dm = OpenMLDataManager(self.task_id, self.valid_size, self.data_path, self.global_seed) dm.load() # Data variables @@ -77,10 +86,6 @@ def __init__( self.lower_bound_train_size = dm.lower_bound_train_size self.n_classes = dm.n_classes - # Observation and fidelity spaces - self.fidelity_space = self.get_fidelity_space(self.seed) - self.configuration_space = self.get_configuration_space(self.seed) - @staticmethod def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: """Parameter space to be optimized --- contains the hyperparameters @@ -90,32 +95,33 @@ def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSp @staticmethod def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: """Fidelity space available --- specifies the fidelity dimensions - - If fidelity_choice is 0 - Fidelity space is the maximal fidelity, akin to a black-box function - If fidelity_choice is 1 - Fidelity space is a single fidelity, in this case the number of trees (n_estimators) - If fidelity_choice is 2 - Fidelity space is a single fidelity, in this case the fraction of dataset (subsample) - If fidelity_choice is >2 - Fidelity space is multi-multi fidelity, all possible fidelities """ raise NotImplementedError() + # pylint: disable=arguments-differ def get_meta_information(self): - """ Returns the meta information for the benchmark """ + """ Returns the meta information for the benchmark + """ return { 'name': 'Support Vector Machine', 'shape of train data': self.train_X.shape, 'shape of test data': self.test_X.shape, 'shape of valid data': self.valid_X.shape, - 'initial random seed': self.seed, + 'initial random seed': self.rng, 'task_id': self.task_id } - def init_model(self, config: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - rng: Union[int, np.random.RandomState, None] = None): + def get_model_size(self, model): + """ Returns a custom model size specific to the ML model, if applicable + """ + raise NotImplementedError + + def init_model( + self, + config: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[int, np.random.RandomState, None] = None + ): """ Function that returns the model initialized based on the configuration and fidelity """ raise NotImplementedError() @@ -134,18 +140,74 @@ def get_fidelity(self, size: Union[int, None] = None): return self.fidelity_space.sample_configuration() return [self.fidelity_space.sample_configuration() for i in range(size)] - def shuffle_data_idx(self, train_idx: Iterable = None, rng: Union[np.random.RandomState, None] = None) -> Iterable: + def shuffle_data_idx( + self, train_idx: Iterable = None, rng: Union[np.random.RandomState, None] = None + ) -> Iterable: rng = self.rng if rng is None else rng train_idx = self.train_idx if train_idx is None else train_idx rng.shuffle(train_idx) 
return train_idx - def _train_objective(self, - config: Dict, - fidelity: Dict, - shuffle: bool, - rng: Union[np.random.RandomState, int, None] = None, - evaluation: Union[str, None] = "valid"): + def _get_lc_spacing(self, max_iter, k): + """ Creates an integer sequence to record Learning Curves for every k iteration. + + Designed to include the maximum iteration. A k-spaced iteration sequence may not include + the endpoint implicitly. + """ + assert k > 0, "Spacing needs to be at >=1" + assert k < max_iter, "Spacing should be in {1, 2, ..., max_iter-1}" + spacing = np.arange(0, max_iter + 1, step=k).tolist() + spacing = spacing[1:] # eliminating 0 + if spacing[-1] != max_iter: + spacing.append(max_iter) + return spacing + + def _train_objective( + self, + config: Dict, + fidelity: Dict, + shuffle: bool, + rng: Union[np.random.RandomState, int, None] = None, + evaluation: Union[str, None] = "valid", + record_stats: bool = False, + get_learning_curve: bool = False, + lc_every_k: int = 1, + **kwargs + ): + """Function that instantiates a 'config' on a 'fidelity' and trains it + + The ML model is instantiated and trained on the training split. Optionally, the model is + evaluated on the training set. + + Parameters + ---------- + config : CS.Configuration, Dict + The hyperparameter configuration. + fidelity : CS.Configuration, Dict + The fidelity configuration. + shuffle : bool (optional) + If True, shuffles the training split before fitting the ML model. + rng : np.random.RandomState, int (optional) + The random seed passed to the ML model and if applicable, used for shuffling the data + and subsampling the dataset fraction. + evaluation : str (optional) + If "valid", the ML model is trained on the training set alone. + If "test", the ML model is trained on the training + validation sets. + record_stats : bool (optional) + If True, records the evaluation metrics of the trained ML model on the training set. + This is set to False by default to reduce overall compute time. + get_learning_curve : bool (optional) + If True, records the learning curve using partial_fit or warm starting, if applicable. + This is set to False by default to reduce overall compute time. + lc_every_k : int (optional) + If True, records the learning curve after every k iterations. 
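# Worked example of the learning-curve spacing helper above, re-implemented as a
# standalone function for illustration: with max_iter=10 and k=3 the recorded
# iterations are [3, 6, 9, 10] -- k-spaced, the leading 0 dropped, and the final
# iteration always included.
import numpy as np


def lc_spacing(max_iter: int, k: int) -> list:
    assert 0 < k < max_iter
    spacing = np.arange(0, max_iter + 1, step=k).tolist()[1:]  # drop the leading 0
    if spacing[-1] != max_iter:
        spacing.append(max_iter)                               # force-include the endpoint
    return spacing


assert lc_spacing(10, 3) == [3, 6, 9, 10]
assert lc_spacing(10, 5) == [5, 10]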
+ """ + if get_learning_curve: + raise NotImplementedError( + "Need to implement partial or intermediate training to record Learning curves" + ) + learning_curves = None + lc_time = None if rng is not None: rng = get_rng(rng, self.rng) @@ -154,26 +216,32 @@ def _train_objective(self, model = self.init_model(config, fidelity, rng) # preparing data - if eval == "valid": + if evaluation == "valid": train_X = self.train_X train_y = self.train_y - train_idx = self.train_idx - else: + elif evaluation == "test": train_X = np.vstack((self.train_X, self.valid_X)) train_y = pd.concat((self.train_y, self.valid_y)) - train_idx = np.arange(len(train_X)) + else: + raise ValueError("{} not in ['valid', 'test']".format(evaluation)) + train_idx = np.arange(len(train_X)) if self.train_idx is None else self.train_idx # shuffling data if shuffle: train_idx = self.shuffle_data_idx(train_idx, rng) - train_X = train_X.iloc[train_idx] + if isinstance(train_idx, np.ndarray): + train_X = train_X[train_idx] + else: + train_X = train_X.iloc[train_idx] train_y = train_y.iloc[train_idx] # subsample here: # application of the other fidelity to the dataset that the model interfaces + # carried over from previous HPOBench code that borrowed from FABOLAS' SVM + lower_bound_lim = 1.0 / 512.0 if self.lower_bound_train_size is None: self.lower_bound_train_size = (10 * self.n_classes) / self.train_X.shape[0] - self.lower_bound_train_size = np.max((1 / 512, self.lower_bound_train_size)) + self.lower_bound_train_size = np.max((lower_bound_lim, self.lower_bound_train_size)) subsample = np.max((fidelity['subsample'], self.lower_bound_train_size)) train_idx = self.rng.choice( np.arange(len(train_X)), size=int( @@ -184,102 +252,209 @@ def _train_objective(self, start = time.time() model.fit(train_X[train_idx], train_y.iloc[train_idx]) model_fit_time = time.time() - start + # model inference + inference_time = 0.0 + # can optionally not record evaluation metrics on training set to save compute + if record_stats: + start = time.time() + pred_train = model.predict(train_X) + inference_time = time.time() - start # computing statistics on training data scores = dict() score_cost = dict() for k, v in self.scorers.items(): scores[k] = 0.0 score_cost[k] = 0.0 - if evaluation == "test": - _start = time.time() - scores[k] = v(model, train_X, train_y) - score_cost[k] = time.time() - _start + _start = time.time() + if record_stats: + scores[k] = v(train_y, pred_train, **self.scorer_args[k]) + score_cost[k] = time.time() - _start + inference_time train_loss = 1 - scores["acc"] - return model, model_fit_time, train_loss, scores, score_cost + return model, model_fit_time, train_loss, scores, score_cost, learning_curves, lc_time # pylint: disable=arguments-differ @AbstractBenchmark.check_parameters - def objective_function(self, - configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - shuffle: bool = False, - rng: Union[np.random.RandomState, int, None] = None, - **kwargs) -> Dict: + def objective_function( + self, + configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + shuffle: bool = False, + rng: Union[np.random.RandomState, int, None] = None, + record_train: bool = False, + get_learning_curve: bool = False, + lc_every_k: int = 1, + **kwargs + ) -> Dict: """Function that evaluates a 'config' on a 'fidelity' on the validation set + + The ML model is trained on the training split, and evaluated on the valid and test splits. 
+ + Parameters + ---------- + configuration : CS.Configuration, Dict + The hyperparameter configuration. + fidelity : CS.Configuration, Dict + The fidelity configuration. + shuffle : bool (optional) + If True, shuffles the training split before fitting the ML model. + rng : np.random.RandomState, int (optional) + The random seed passed to the ML model and if applicable, used for shuffling the data + and subsampling the dataset fraction. + record_train : bool (optional) + If True, records the evaluation metrics of the trained ML model on the training set. + This is set to False by default to reduce overall compute time. + get_learning_curve : bool (optional) + If True, records the learning curve using partial_fit or warm starting, if applicable. + This is set to False by default to reduce overall compute time. + lc_every_k : int (optional) + If True, records the learning curve after every k iterations. """ - model, model_fit_time, train_loss, train_scores, train_score_cost = self._train_objective( - configuration, fidelity, shuffle, rng, evaluation="val" - ) + # obtaining model and training statistics + model, model_fit_time, train_loss, train_scores, train_score_cost, lcs, lc_time = \ + self._train_objective( + configuration, fidelity, shuffle, rng, + evaluation="valid", record_stats=record_train, + get_learning_curve=get_learning_curve, lc_every_k=lc_every_k + ) + model_size = self.get_model_size(model) + + # model inference on validation set + start = time.time() + pred_val = model.predict(self.valid_X) + val_inference_time = time.time() - start val_scores = dict() val_score_cost = dict() for k, v in self.scorers.items(): + val_scores[k] = 0.0 + val_score_cost[k] = 0.0 _start = time.time() - val_scores[k] = v(model, self.valid_X, self.valid_y) - val_score_cost[k] = time.time() - _start + val_scores[k] = v(self.valid_y, pred_val, **self.scorer_args[k]) + val_score_cost[k] = time.time() - _start + val_inference_time val_loss = 1 - val_scores["acc"] + # model inference on test set + start = time.time() + pred_test = model.predict(self.test_X) + test_inference_time = time.time() - start test_scores = dict() test_score_cost = dict() for k, v in self.scorers.items(): + test_scores[k] = 0.0 + test_score_cost[k] = 0.0 _start = time.time() - test_scores[k] = v(model, self.test_X, self.test_y) - test_score_cost[k] = time.time() - _start + test_scores[k] = v(self.test_y, pred_test, **self.scorer_args[k]) + test_score_cost[k] = time.time() - _start + test_inference_time test_loss = 1 - test_scores["acc"] + fidelity = fidelity.get_dictionary() if isinstance(fidelity, CS.Configuration) else fidelity + configuration = configuration.get_dictionary() \ + if isinstance(configuration, CS.Configuration) else configuration + info = { 'train_loss': train_loss, 'val_loss': val_loss, 'test_loss': test_loss, 'model_cost': model_fit_time, + 'model_size': model_size, 'train_scores': train_scores, 'train_costs': train_score_cost, 'val_scores': val_scores, 'val_costs': val_score_cost, 'test_scores': test_scores, 'test_costs': test_score_cost, + 'learning_curves': lcs, + 'learning_curves_cost': lc_time, + 'learning_curves_spacing': lc_every_k, # storing as dictionary and not ConfigSpace saves tremendous memory 'fidelity': fidelity, 'config': configuration, } return { - 'function_value': info['val_loss'], - 'cost': model_fit_time + info['val_costs']['acc'], + 'function_value': float(info['val_loss']), + 'cost': float(model_fit_time + info['val_costs']['acc']), 'info': info } # pylint: disable=arguments-differ 
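# Sketch of consuming the result dictionary assembled above. The local import path
# and the OpenML task id are assumptions for the example; any MLBenchmark subclass
# returns the same keys ('function_value', 'cost', and the 'info' payload).
from hpobench.benchmarks.ml.rf_benchmark import RandomForestBenchmark  # assumed local benchmark path

bench = RandomForestBenchmark(task_id=167119, rng=1)
config = bench.get_configuration_space(seed=1).sample_configuration()
fidelity = bench.get_fidelity_space(seed=1).sample_configuration()

result = bench.objective_function(configuration=config, fidelity=fidelity, record_train=True)
print(result["function_value"])            # validation loss (1 - accuracy)
print(result["cost"])                      # model fit time + accuracy-scoring cost on the validation set
print(result["info"]["model_size"])        # new field added in this diff
print(result["info"]["val_scores"])        # all validation metrics
print(result["info"]["learning_curves"])   # None until learning-curve recording is implemented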
@AbstractBenchmark.check_parameters - def objective_function_test(self, - configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - shuffle: bool = False, - rng: Union[np.random.RandomState, int, None] = None, - **kwargs) -> Dict: + def objective_function_test( + self, + configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + shuffle: bool = False, + rng: Union[np.random.RandomState, int, None] = None, + record_train: bool = False, + get_learning_curve: bool = False, + lc_every_k: int = 1, + **kwargs + ) -> Dict: """Function that evaluates a 'config' on a 'fidelity' on the test set + + The ML model is trained on the training+valid split, and evaluated on the test split. + + Parameters + ---------- + configuration : CS.Configuration, Dict + The hyperparameter configuration. + fidelity : CS.Configuration, Dict + The fidelity configuration. + shuffle : bool (optional) + If True, shuffles the training split before fitting the ML model. + rng : np.random.RandomState, int (optional) + The random seed passed to the ML model and if applicable, used for shuffling the data + and subsampling the dataset fraction. + record_train : bool (optional) + If True, records the evaluation metrics of the trained ML model on the training set. + This is set to False by default to reduce overall compute time. + get_learning_curve : bool (optional) + If True, records the learning curve using partial_fit or warm starting, if applicable. + This is set to False by default to reduce overall compute time. + lc_every_k : int (optional) + If True, records the learning curve after every k iterations. """ - model, model_fit_time, train_loss, train_scores, train_score_cost = self._train_objective( - configuration, fidelity, shuffle, rng, evaluation="test" - ) + # obtaining model and training statistics + model, model_fit_time, train_loss, train_scores, train_score_cost, lcs, lc_time = \ + self._train_objective( + configuration, fidelity, shuffle, rng, + evaluation="test", record_stats=record_train, + get_learning_curve=get_learning_curve, lc_every_k=lc_every_k + ) + model_size = self.get_model_size(model) + + # model inference on test set + start = time.time() + pred_test = model.predict(self.test_X) + test_inference_time = time.time() - start test_scores = dict() test_score_cost = dict() for k, v in self.scorers.items(): + test_scores[k] = 0.0 + test_score_cost[k] = 0.0 _start = time.time() - test_scores[k] = v(model, self.test_X, self.test_y) - test_score_cost[k] = time.time() - _start + test_scores[k] = v(self.test_y, pred_test, **self.scorer_args[k]) + test_score_cost[k] = time.time() - _start + test_inference_time test_loss = 1 - test_scores["acc"] + fidelity = fidelity.get_dictionary() if isinstance(fidelity, CS.Configuration) else fidelity + configuration = configuration.get_dictionary() \ + if isinstance(configuration, CS.Configuration) else configuration + info = { 'train_loss': train_loss, 'val_loss': None, 'test_loss': test_loss, 'model_cost': model_fit_time, + 'model_size': model_size, 'train_scores': train_scores, 'train_costs': train_score_cost, - 'val_scores': dict(), - 'val_costs': dict(), + 'val_scores': None, + 'val_costs': None, 'test_scores': test_scores, 'test_costs': test_score_cost, + 'learning_curves': lcs, + 'learning_curves_cost': lc_time, + 'learning_curves_spacing': lc_every_k, # storing as dictionary and not ConfigSpace saves tremendous memory 'fidelity': fidelity, 'config': configuration, diff --git 
a/hpobench/dependencies/mo/__init__.py b/hpobench/dependencies/mo/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/hpobench/dependencies/mo/fairness_metrics.py b/hpobench/dependencies/mo/fairness_metrics.py new file mode 100644 index 00000000..7776fbd9 --- /dev/null +++ b/hpobench/dependencies/mo/fairness_metrics.py @@ -0,0 +1,110 @@ +""" +This file contains functionality to compute various fairness related risk scores. +""" + +import numpy as np + +STATISTICAL_DISPARITY = 'statistical_disparity' # P(1 | group A) - P(1 | group B) +UNEQUAL_OPPORTUNITY = 'unequal_opportunity' # P(1 | group A, 0) - P(1 | group B, 0) +UNEQUALIZED_ODDS = 'unequalized_odds' # P(1 | group A, 1) - P(1 | group B, 1) + +TPR0 = 'tpr0' +TPR1 = 'tpr1' +TPR_DIF = 'tpr_dif' +TPR_MIN = 'tpr_min' + +FAIRNESS_METRICS = [STATISTICAL_DISPARITY, UNEQUAL_OPPORTUNITY, UNEQUALIZED_ODDS, TPR0, TPR1, TPR_DIF, TPR_MIN] + +PRED_THRESHOLD = 0.5 + + +def fairness_risk(x, y, sensitive_rows, model, unfairness_metric): + """ + Returns the fairness_risk based on the definition of the unfairness_metric, currently supporting: + statistical_disparity: P(positive prediction | group A) = P(positive prediction | group B) + + Parameters + ---------- + x: np.ndarray + inputs + y: np.ndarray + labels in {0, 1} such that 0 is a "positive" label, 1 "negative" + sensitive_rows: np.ndarray + binary array indicating which rows correspond to the protected group + model: + trained sklearn model + unfairness_metric: str + string with unfairness condition + + Returns + ------- + float + """ + predicted_probs = model.predict_proba(x) + if unfairness_metric == STATISTICAL_DISPARITY: + prob_not_recidive_a = np.mean( + predicted_probs[sensitive_rows == 1][:, 0]) + prob_not_recidive_b = np.mean( + predicted_probs[sensitive_rows == 0][:, 0]) + return abs(prob_not_recidive_a - prob_not_recidive_b) + elif unfairness_metric == UNEQUAL_OPPORTUNITY: + prob_not_recidive_a = np.mean( + predicted_probs[sensitive_rows == 1 & (y == 0)][:, 0]) + prob_not_recidive_b = np.mean( + predicted_probs[sensitive_rows == 0 & (y == 0)][:, 0]) + return abs(prob_not_recidive_a - prob_not_recidive_b) + elif unfairness_metric == UNEQUALIZED_ODDS: + prob_not_recidive_a = np.mean( + predicted_probs[sensitive_rows == 1 & (y == 1)][:, 0]) + prob_not_recidive_b = np.mean( + predicted_probs[sensitive_rows == 0 & (y == 1)][:, 0]) + return abs(prob_not_recidive_a - prob_not_recidive_b) + elif unfairness_metric == TPR0: + y_pred = np.where(predicted_probs[:, 0] >= PRED_THRESHOLD, 1, 0) + y0 = y[sensitive_rows == 0] + y0_pred = y_pred[sensitive_rows == 0] + tp0 = np.sum((y0_pred == 1) & (y0 == 1)) + fn0 = np.sum((y0_pred == 0) & (y0 == 1)) + tpr0 = tp0 / (tp0 + fn0) + return tpr0 + elif unfairness_metric == TPR1: + y_pred = np.where(predicted_probs[:, 0] >= PRED_THRESHOLD, 1, 0) + y1 = y[sensitive_rows == 1] + y1_pred = y_pred[sensitive_rows == 1] + tp1 = np.sum((y1_pred == 1) & (y1 == 1)) + fn1 = np.sum((y1_pred == 0) & (y1 == 1)) + tpr1 = tp1 / (tp1 + fn1) + return tpr1 + elif unfairness_metric == TPR_DIF: + y_pred = np.where(predicted_probs[:, 0] >= PRED_THRESHOLD, 1, 0) + y0 = y[sensitive_rows == 0] + y0_pred = y_pred[sensitive_rows == 0] + tp0 = np.sum((y0_pred == 1) & (y0 == 1)) + fn0 = np.sum((y0_pred == 0) & (y0 == 1)) + tpr0 = tp0 / (tp0 + fn0) + + y1 = y[sensitive_rows == 1] + y1_pred = y_pred[sensitive_rows == 1] + tp1 = np.sum((y1_pred == 1) & (y1 == 1)) + fn1 = np.sum((y1_pred == 0) & (y1 == 1)) + tpr1 = tp1 / (tp1 + fn1) + return abs(tpr0 - tpr1) + elif 
unfairness_metric == TPR_MIN: + y_pred = np.where(predicted_probs[:, 0] >= PRED_THRESHOLD, 1, 0) + y0 = y[sensitive_rows == 0] + y0_pred = y_pred[sensitive_rows == 0] + tp0 = np.sum((y0_pred == 1) & (y0 == 1)) + fn0 = np.sum((y0_pred == 0) & (y0 == 1)) + tpr0 = tp0 / (tp0 + fn0) + + y1 = y[sensitive_rows == 1] + y1_pred = y_pred[sensitive_rows == 1] + tp1 = np.sum((y1_pred == 1) & (y1 == 1)) + fn1 = np.sum((y1_pred == 0) & (y1 == 1)) + tpr1 = tp1 / (tp1 + fn1) + return min(tpr0, tpr1) + else: + raise ValueError( + f'{unfairness_metric} is not a valid unfairness condition. ' + f'Please specify one among {FAIRNESS_METRICS}.' + ) diff --git a/hpobench/dependencies/mo/scalar.py b/hpobench/dependencies/mo/scalar.py new file mode 100644 index 00000000..185c2730 --- /dev/null +++ b/hpobench/dependencies/mo/scalar.py @@ -0,0 +1,37 @@ +from typing import Union + +import numpy as np + +try: + from sklearn.preprocessing import MinMaxScaler, StandardScaler +except ImportError: + print("scikit-learn not installed") + + +def get_fitted_scaler(x_train: np.ndarray, name: Union[None, str] = None): + """ + Instantiates a scaler by a given name and fits the scaler with x_train. + Parameters + ---------- + x_train: np.ndarray + Train data + + name: str, None + Name of the scaling method. Defaults to no scaling. + + Returns + ------- + Callable or None. Applies the fitted scaler to given data; None if no scaling is requested. + """ + + if name == "MinMax": + scaler = MinMaxScaler(feature_range=(0, 1), copy=True) + elif name == "Standard": + scaler = StandardScaler(copy=True) + elif name is None or name == "None": + return None + else: + raise NotImplementedError() + + scaler.fit(x_train) + return lambda x: scaler.transform(x) diff --git a/hpobench/dependencies/od/traditional_benchmark.py b/hpobench/dependencies/od/traditional_benchmark.py index 68cef2e5..1d82dfe6 100644 --- a/hpobench/dependencies/od/traditional_benchmark.py +++ b/hpobench/dependencies/od/traditional_benchmark.py @@ -214,6 +214,7 @@ def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidel_space = CS.ConfigurationSpace(seed=seed) return fidel_space + # pylint: disable=arguments-differ def get_meta_information(self): """ Returns the meta information for the benchmark """ X_train, y_train = self.datamanager.dataset.get_train_data() diff --git a/hpobench/util/clean_up_script.py b/hpobench/util/clean_up_script.py index 5fe9fd0c..771ab80f 100644 --- a/hpobench/util/clean_up_script.py +++ b/hpobench/util/clean_up_script.py @@ -1,7 +1,8 @@ +import logging +import shutil + from hpobench import config_file -import shutil -import logging logger = logging.getLogger('Clean-up') logger.setLevel(logging.INFO) diff --git a/hpobench/util/container_utils.py b/hpobench/util/container_utils.py index 7fee19e9..bb7221c3 100644 --- a/hpobench/util/container_utils.py +++ b/hpobench/util/container_utils.py @@ -1,11 +1,11 @@ -import os +import enum import importlib import json -import numpy as np -import enum - +import os from typing import Any, Union + +import numpy as np + from hpobench.util.rng_helper import serialize_random_state, deserialize_random_state diff --git a/hpobench/util/data_manager.py b/hpobench/util/data_manager.py index a2e33121..914d651c 100644 --- a/hpobench/util/data_manager.py +++ b/hpobench/util/data_manager.py @@ -15,6 +15,7 @@ import gzip import json import logging +import os import pickle import tarfile from io import BytesIO @@ -37,10 +38,18 @@ except ImportError: print("pandas is not installed, can't download datasets for the ml.tabular_benchmarks (not
needed for containers)") - import hpobench + +tabular_multi_fidelity_urls = dict( + xgb="https://figshare.com/ndownloader/files/35414756", + svm="https://figshare.com/ndownloader/files/35414447", + lr="https://figshare.com/ndownloader/files/35412425", + rf="https://figshare.com/ndownloader/files/35414801", + nn="https://figshare.com/ndownloader/files/35414996" +) + class DataManager(abc.ABC, metaclass=abc.ABCMeta): """ Base Class for loading and managing the data. @@ -845,6 +854,93 @@ def _load(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndar X_train, y_train = data[:n_train, 1:], data[:n_train, 0] X_val, y_val = data[n_train:n_train + n_val, 1:], data[n_train:n_train + n_val, 0] X_test, y_test = data[n_train + n_val:, 1:], data[n_train + n_val:, 0] + return X_train, y_train, X_val, y_val, X_test, y_test + + +class CNNDataManager(HoldoutDataManager): + + def __init__(self, dataset: str): + + super(CNNDataManager, self).__init__() + self.logger.debug('CNNDataManager: Starting to load data') + + allowed_datasets = ["fashion", "flower"] + assert dataset in allowed_datasets, f'Requested data set is not supported. Must be one of ' \ + f'{", ".join(allowed_datasets)}, but was {dataset}' + + self.url_source = f'https://github.com/ayushi-3536/DatasetHost/blob/main/{dataset}.tar.gz?raw=true' + self.dataset = dataset + self.save_dir = hpobench.config_file.data_dir / "CNN" / f'{dataset}' + self.compressed_data = self.save_dir / f'{dataset}.tar.gz' + self.create_save_directory(self.save_dir) + + def load(self): + """ + Loads CNN Benchmark from data directory as defined in hpobenchrc.data_directory. + Downloads data if necessary. + + Returns + ------- + X_train: np.ndarray + y_train: np.ndarray + X_val: np.ndarray + y_val: np.ndarray + X_test: np.ndarray + y_test: np.ndarray + """ + + t = time() + self._download() + X_trn, y_trn, X_val, y_val, X_tst, y_tst = self._load() + self.logger.info(f'CNNDataManager: Data successfully loaded after {time() - t:.2f}') + + return X_trn, y_trn, X_val, y_val, X_tst, y_tst + + def _download(self): + + # Check if data is already downloaded. + # Use a file lock to ensure that no two processes try to download the same files at the same time. + if self.compressed_data.exists(): + self.logger.debug('CNNDataManager: Data already downloaded') + else: + + self.logger.info(f'CNNDataManager: Start downloading data from {self.url_source} ' + f'to {self.save_dir}') + self._download_file_with_progressbar(data_url=self.url_source, data_file=self.compressed_data) + self._untar_data(compressed_file=self.compressed_data, save_dir=self.save_dir) + + def _load(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + """ + Load the data from file and split it into train, test and validation split. 
+ + Returns + ------- + X_train: np.ndarray + y_train: np.ndarray + X_val: np.ndarray + y_val: np.ndarray + X_test: np.ndarray + y_test: np.ndarray + """ + + data_extract_path = self.save_dir / "data" + X_train = np.load(data_extract_path / 'x_train.npy') + y_train = np.load(data_extract_path / 'y_train.npy') + + X_val = np.load(data_extract_path / 'x_val.npy') + y_val = np.load(data_extract_path / 'y_val.npy') + + # Read Test datasets + X_test = np.load(data_extract_path / 'x_test.npy') + y_test = np.load(data_extract_path / 'y_test.npy') + + def __cast_x_y(x, y) -> Tuple: + import torch + return torch.tensor(x).float().permute(0, 3, 1, 2), torch.tensor(y).long() + + X_train, y_train = __cast_x_y(X_train, y_train) + X_val, y_val = __cast_x_y(X_val, y_val) + X_test, y_test = __cast_x_y(X_test, y_test) return X_train, y_train, X_val, y_val, X_test, y_test @@ -926,24 +1022,176 @@ def _load(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndar return X_trn, y_trn, X_val, y_val, X_tst, y_tst +class AdultDataManager(HoldoutDataManager): + + def __init__(self): + super(AdultDataManager, self).__init__() + self.logger.debug('AdultDataManager: Starting to load data') + self.urls = {"data": "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", + "test_data": "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test"} + + self.feature_names = ['age', 'fnlwgt', 'education-num', 'marital-status', 'relationship', 'race', + 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'country', + 'employment_type'] + self.sensitive_names = 'sex' + + self._save_dir = hpobench.config_file.data_dir / "adult" + + self._data_extract_path = self._save_dir / "processed_data" + + self.create_save_directory(self._data_extract_path) + + def load(self): + """ + Loads Adult Fair Datasets from data directory as defined in hpobenchrc.data_directory. + Downloads data if necessary. + + Returns + ------- + X_train: np.ndarray + y_train: np.ndarray + X_val: np.ndarray + y_val: np.ndarray + X_test: np.ndarray + y_test: np.ndarray + """ + + t = time() + self._download() + X_trn, y_trn, X_val, y_val, X_tst, y_tst = self._load() + self.logger.info(f'AdultDataManager: Data successfully loaded after {time() - t:.2f}') + + return X_trn, y_trn, X_val, y_val, X_tst, y_tst + + def _download(self): + + if not (self._save_dir / "adult.data").exists(): + self._download_file_with_progressbar(self.urls["data"], self._save_dir / "adult.data") + + if not (self._save_dir / "adult.test").exists(): + self._download_file_with_progressbar(self.urls["test_data"], self._save_dir / "adult.test") + + def _load(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + """ + Load the data from file and split it into train, test and validation split. 
+ + Returns + ------- + X_train: np.ndarray + y_train: np.ndarray + X_val: np.ndarray + y_val: np.ndarray + X_test: np.ndarray + y_test: np.ndarray + """ + processed_files = ['x_train', 'x_valid', 'x_test', 'y_train', 'y_valid', 'y_test'] + file_is_missing = not all([(self._data_extract_path / f'{file}.npy').exists() for file in processed_files]) + + if file_is_missing: + columns = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", + "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", + "hours-per-week", "country", "salary"] + train_data = pd.read_csv(self._save_dir / 'adult.data', names=columns, sep=',', na_values='?') + test_data = pd.read_csv(self._save_dir / 'adult.test', names=columns, sep=',', skiprows=1, na_values='?') + + X, y = self._process_adult_data(train_data) + X_test, y_test = self._process_adult_data(test_data) + + n_trn = int(X.shape[0] * 0.7) + # splitting the processed data into train and validation sets + X_train, y_train = X[:n_trn], y[:n_trn] + X_valid, y_valid = X[n_trn:], y[n_trn:] + + np.save(self._data_extract_path / 'x_train.npy', X_train) + np.save(self._data_extract_path / 'x_valid.npy', X_valid) + np.save(self._data_extract_path / 'x_test.npy', X_test) + + np.save(self._data_extract_path / 'y_train.npy', y_train) + np.save(self._data_extract_path / 'y_valid.npy', y_valid) + np.save(self._data_extract_path / 'y_test.npy', y_test) + + else: + X_train = np.load(self._data_extract_path / 'x_train.npy') + X_valid = np.load(self._data_extract_path / 'x_valid.npy') + X_test = np.load(self._data_extract_path / 'x_test.npy') + + y_train = np.load(self._data_extract_path / 'y_train.npy') + y_valid = np.load(self._data_extract_path / 'y_valid.npy') + y_test = np.load(self._data_extract_path / 'y_test.npy') + + return X_train, y_train, X_valid, y_valid, X_test, y_test + + def _process_adult_data(self, df) -> Tuple[np.ndarray, np.ndarray]: + # mapping all categories of marital status to Single(1) or Couple(0) + df['marital-status'] = df['marital-status'].replace( + [' Divorced', ' Married-spouse-absent', ' Never-married', ' Separated', ' Widowed'], 'Single') + df['marital-status'] = df['marital-status'].replace([' Married-AF-spouse', ' Married-civ-spouse'], 'Couple') + df['marital-status'] = df['marital-status'].map({'Couple': 0, 'Single': 1}) + + # mapping race + race_map = {' White': 0, ' Amer-Indian-Eskimo': 1, ' Asian-Pac-Islander': 2, ' Black': 3, ' Other': 4} + df['race'] = df['race'].map(race_map) + + # categorizing all work classes into 4 major categories + def get_workclass(x): + if x['workclass'] == ' Federal-gov' or x['workclass'] == ' Local-gov' or x['workclass'] == ' State-gov': + return 'govt' + elif x['workclass'] == ' Private': + return 'private' + elif x['workclass'] == ' Self-emp-inc' or x['workclass'] == ' Self-emp-not-inc': + return 'self_employed' + else: + return 'without_pay' + + df['employment_type'] = df.apply(get_workclass, axis=1) + employment_map = {'govt': 0, 'private': 1, 'self_employed': 2, 'without_pay': 3} + df['employment_type'] = df['employment_type'].map(employment_map) + + # mapping relationship categories + rel_map = {' Unmarried': 0, ' Wife': 1, ' Husband': 2, ' Not-in-family': 3, ' Own-child': 4, + ' Other-relative': 5} + df['relationship'] = df['relationship'].map(rel_map) + + # mapping capital gain/loss to binary values + df.loc[(df['capital-gain'] > 0), 'capital-gain'] = 1 + df.loc[(df['capital-gain'] == 0, 'capital-gain')] = 0 + df.loc[(df['capital-loss'] > 0), 'capital-loss'] = 1 + df.loc[(df['capital-loss']
== 0, 'capital-loss')] = 0 + + # defining salary map + salary_map = {' <=50K': 1, ' >50K': 0, ' <=50K.': 1, ' >50K.': 0, } + df['salary'] = df['salary'].map(salary_map).astype(int) + + df['sex'] = df['sex'].map({' Male': 1, ' Female': 0}).astype(int) + + # replacing all missing values with np.nan + df['country'] = df['country'].replace(' ?', np.nan) + df['workclass'] = df['workclass'].replace(' ?', np.nan) + df['occupation'] = df['occupation'].replace(' ?', np.nan) + + # categorizing countries into "Non-US" and "US" + df.loc[df['country'] != ' United-States', 'country'] = 'Non-US' + df.loc[df['country'] == ' United-States', 'country'] = 'US' + df['country'] = df['country'].map({'US': 1, 'Non-US': 0}).astype(int) + + df.drop(labels=['workclass', 'education', 'occupation'], axis=1, inplace=True) + X = df.drop(['salary'], axis=1) + y = df['salary'] + + return X.to_numpy(), y.to_numpy() + + class TabularDataManager(DataManager): def __init__(self, model: str, task_id: [int, str], data_dir: [str, Path, None] = None): super(TabularDataManager, self).__init__() + + self.model = model + self.task_id = str(task_id) - url_dict = dict( - xgb="https://ndownloader.figshare.com/files/30469920", - svm="https://ndownloader.figshare.com/files/30379359", - lr="https://ndownloader.figshare.com/files/30379038", - rf="https://ndownloader.figshare.com/files/30469089", - nn="https://ndownloader.figshare.com/files/30379005" - ) - + url_dict = tabular_multi_fidelity_urls assert model in url_dict.keys(), \ f'Model has to be one of {list(url_dict.keys())} but was {model}' - self.model = model - self.task_id = str(task_id) - self.url_to_use = url_dict.get(model) if data_dir is None: @@ -980,3 +1228,42 @@ def _load_json(path): with open(path, "r") as f: data = json.load(f) return data + + +class YAHPODataManager(DataManager): + def __init__(self, data_dir: Union[Path, str, None]): + super(YAHPODataManager, self).__init__() + + if data_dir is None: + data_dir = hpobench.config_file.data_dir / "yahpo_data" + self.data_dir = Path(data_dir) + self.logger.info(f'Read data from data directory: {data_dir}') + + @lockutils.synchronized('not_thread_process_safe', external=True, + lock_path=f'{hpobench.config_file.cache_dir}/lock_yahpo_raw', delay=0.5) + def _try_download(self): + """Clone the data repository.""" + if not self.data_dir.exists(): + self.logger.info( + 'Try to download data from https://github.com/slds-lmu/yahpo_data/tree/fair' + ) + # Create the data directory if not existing + self.create_save_directory(self.data_dir.parent) + + import git + git.Repo.clone_from(url='https://github.com/slds-lmu/yahpo_data.git', + to_path=str(self.data_dir), + branch='fair', + multi_options=['--depth 1']) + self.logger.info(f'Successfully cloned data from repo to {self.data_dir}') + + def load(self): + from yahpo_gym.local_config import LocalConfiguration + local_config = LocalConfiguration() + + # When in the containerized version, redirect to the data inside the container. 
+ if 'YAHPO_CONTAINER' in os.environ: + local_config.init_config(data_path='/home/data/yahpo_data') + else: + self._try_download() + local_config.init_config(data_path=str(self.data_dir)) diff --git a/hpobench/util/test_utils.py b/hpobench/util/test_utils.py new file mode 100644 index 00000000..b2683135 --- /dev/null +++ b/hpobench/util/test_utils.py @@ -0,0 +1,24 @@ +import os + +CONST_RUN_ALL_TESTS_ENV_VAR = 'HPOBENCH_RUN_EXPENSIVE_TESTS' +DEFAULT_SKIP_MSG = 'Skip this test due to time limitations' + + +def check_run_all_tests(): + """ Helper function: Check if all tests should run. """ + return os.environ.get(CONST_RUN_ALL_TESTS_ENV_VAR, 'false').lower() == 'true' + + +def enable_all_tests(): + """ + Some tests are quite expensive. We control if all runs should be executed by this + environment variable. + """ + os.environ[CONST_RUN_ALL_TESTS_ENV_VAR] = 'true' + + +def disable_all_tests(): + """ + This function disables the evaluation of all test functions. + """ + os.environ[CONST_RUN_ALL_TESTS_ENV_VAR] = 'false' diff --git a/requirements.txt b/requirements.txt index 73ae9818..b5db0198 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,8 @@ -scipy>=1.4.1 numpy>=1.18.1 ConfigSpace>=0.4.12 Pyro4==4.80 -oslo.concurrency>=4.2.0 \ No newline at end of file +oslo.concurrency>=4.2.0 +pandas>=1.2.4 +scikit-learn>=0.24.1 +openml>=0.12.2 +tqdm>=4.64.0 \ No newline at end of file diff --git a/setup.py b/setup.py index 4c53ecb0..ef1f292c 100644 --- a/setup.py +++ b/setup.py @@ -36,7 +36,7 @@ def read_file(file_name): version=read_file('hpobench/__version__.py').split()[-1].strip('\''), packages=setuptools.find_packages(exclude=['*.tests', '*.tests.*', 'tests.*', 'tests'],), - python_requires='>=3.6, <=3.10', + python_requires='>=3.6', install_requires=read_file('./requirements.txt').split('\n'), extras_require=get_extra_requirements(), test_suite='pytest', diff --git a/tests/test_abstract_benchmark.py b/tests/test_abstract_benchmark.py index 22a26790..5c98e613 100644 --- a/tests/test_abstract_benchmark.py +++ b/tests/test_abstract_benchmark.py @@ -1,12 +1,17 @@ import pytest -from hpobench.abstract_benchmark import AbstractBenchmark +from hpobench.abstract_benchmark import AbstractBenchmark, AbstractMultiObjectiveBenchmark -with pytest.raises(NotImplementedError): - AbstractBenchmark.get_configuration_space() -with pytest.raises(NotImplementedError): - AbstractBenchmark.get_fidelity_space() +def test_abstract_benchmark(): + with pytest.raises(NotImplementedError): + AbstractBenchmark.get_configuration_space() -with pytest.raises(NotImplementedError): - AbstractBenchmark.get_meta_information() + with pytest.raises(NotImplementedError): + AbstractBenchmark.get_fidelity_space() + + with pytest.raises(NotImplementedError): + AbstractBenchmark.get_meta_information() + + with pytest.raises(NotImplementedError): + AbstractMultiObjectiveBenchmark.get_objective_names() diff --git a/tests/test_adult.py b/tests/test_adult.py new file mode 100644 index 00000000..b52c37ed --- /dev/null +++ b/tests/test_adult.py @@ -0,0 +1,37 @@ +import logging +import pytest + +logging.basicConfig(level=logging.DEBUG) + + +def test_adult_benchmark(): + from hpobench.container.benchmarks.mo.adult_benchmark import AdultBenchmark + + # Check Seeding + benchmark = AdultBenchmark(rng=0) + cs = benchmark.get_configuration_space(seed=0) + cfg_1 = cs.sample_configuration() + + cs = benchmark.get_configuration_space(seed=0) + cfg_2 = cs.sample_configuration() + + assert cfg_1 == cfg_2 + + test_config = { + 'alpha': 
0.00046568046379195655, 'beta_1': 0.14382335124614148, 'beta_2': 0.0010007892350251595, + 'fc_layer_0': 4, 'fc_layer_1': 2, 'fc_layer_2': 2, 'fc_layer_3': 3,'n_fc_layers': 4, + 'learning_rate_init': 0.0005343227125594117, + 'tol': 0.0004134759007834719 + } + + result_1 = benchmark.objective_function(test_config, rng=1, fidelity={'budget': 3}) + result_2 = benchmark.objective_function(test_config, rng=1, fidelity={'budget': 3}) + + assert result_1['info']['valid_accuracy'] == pytest.approx(0.7539, rel=0.001) + assert 1 - result_1['info']['valid_accuracy'] == result_1['function_value']['misclassification_rate'] + assert result_1['info']['train_accuracy'] == pytest.approx(0.76145, rel=0.001) + assert result_1['info']['train_accuracy'] == result_2['info']['train_accuracy'] + + result_1 = benchmark.objective_function_test(test_config, rng=1, fidelity={'budget': 3}) + assert 1 - result_1['function_value']['misclassification_rate'] == pytest.approx(0.76377, rel=0.001) + assert 1 - result_1['function_value']['misclassification_rate'] == result_1['info']['test_accuracy'] diff --git a/tests/test_check_configuration.py b/tests/test_check_configuration.py index 8d3db58f..09322025 100644 --- a/tests/test_check_configuration.py +++ b/tests/test_check_configuration.py @@ -32,20 +32,23 @@ def get_fidelity_space(self): _check_and_cast_configuration = AbstractBenchmark._check_and_cast_configuration _check_and_cast_fidelity = AbstractBenchmark._check_and_cast_fidelity + _check_return_values = AbstractBenchmark._check_return_values self.foo = Dummy() def test_config_decorator(self): @AbstractBenchmark.check_parameters def tmp(_, configuration: Dict, fidelity: Dict, **kwargs): - return configuration + return {'function_value': 0, 'cost': 0, 'info': {'config': configuration}} ret = tmp(self=self.foo, configuration=self.foo.configuration_space.sample_configuration()) self.assertIsInstance(ret, Dict) + self.assertIsInstance(ret['info'], Dict) + self.assertIsInstance(ret['info']['config'], Dict) @AbstractBenchmark.check_parameters def tmp(_, configuration: Dict, fidelity: Dict, **kwargs): - return configuration + return {'function_value': 0, 'cost': 0, 'info': {'config': configuration}} tmp(self=self.foo, configuration={"flt": 0.2, "cat": 1, "itg": 1}) tmp(self=self.foo, configuration=self.foo.configuration_space.sample_configuration()) @@ -57,23 +60,27 @@ def tmp(_, configuration: Dict, fidelity: Dict, **kwargs): def test_fidel_decorator(self): @AbstractBenchmark.check_parameters def tmp(_, configuration: Dict, fidelity: Dict, **kwargs): - return configuration, fidelity, kwargs + return {'function_value': 0, 'cost': 0, 'info': {'config': configuration, 'fidel': fidelity, 'kwargs': kwargs}} + # return configuration, fidelity, kwargs sample_fidel = dict(self.foo.get_fidelity_space().get_default_configuration()) - _, ret, _ = tmp(self=self.foo, - configuration=self.foo.configuration_space.sample_configuration(), - fidelity=sample_fidel) + result_dict = tmp(self=self.foo, + configuration=self.foo.configuration_space.sample_configuration(), + fidelity=sample_fidel) + ret = result_dict['info']['fidel'] self.assertEqual(ret, sample_fidel) less_fidel = {"f_cat": 1} - _, ret, _ = tmp(self=self.foo, - configuration=self.foo.configuration_space.sample_configuration(), - fidelity=less_fidel) + result_dict = tmp(self=self.foo, + configuration=self.foo.configuration_space.sample_configuration(), + fidelity=less_fidel) + ret = result_dict['info']['fidel'] self.assertEqual(ret, sample_fidel) - _, ret, _ = tmp(self=self.foo, - 
configuration=self.foo.configuration_space.sample_configuration()) + result_dict = tmp(self=self.foo, + configuration=self.foo.configuration_space.sample_configuration()) + ret = result_dict['info']['fidel'] self.assertEqual(ret, sample_fidel) with pytest.raises(ValueError): @@ -87,6 +94,7 @@ def tmp(_, configuration: Dict, fidelity: Dict, **kwargs): "configuration": self.foo.configuration_space.sample_configuration(), "fidelity": [0.1]}) + class TestCheckUnittest2(unittest.TestCase): def setUp(self): @@ -100,6 +108,7 @@ class Dummy(): _check_and_cast_configuration = AbstractBenchmark._check_and_cast_configuration _check_and_cast_fidelity = AbstractBenchmark._check_and_cast_fidelity + _check_return_values = AbstractBenchmark._check_return_values fidelity_space = ConfigurationSpace(seed=1) fidelity_space.add_hyperparameter(UniformFloatHyperparameter('fidelity1', lower=0., upper=1., default_value=1.)) @@ -108,11 +117,14 @@ class Dummy(): def test_config_decorator(self): @AbstractBenchmark.check_parameters def tmp(_, configuration: Union[Dict, np.ndarray], fidelity: Dict, **kwargs): - return configuration, fidelity + return {'function_value': 0, 'cost': 0, + 'info': {'config': configuration, 'fidel': fidelity, 'kwargs': kwargs}} hps = dict(hp1=0.25, hp2=1.25, hp3=2.25) configuration = Configuration(self.foo.configuration_space, hps) - config, fidel = tmp(self=self.foo, configuration=configuration, fidelity=None) + + return_dict = tmp(self=self.foo, configuration=configuration, fidelity=None) + config, fidel = return_dict['info']['config'], return_dict['info']['fidel'] assert isinstance(config, Dict) assert isinstance(fidel, Dict) @@ -153,3 +165,20 @@ def test_remove_inactive_parameter(): # Remove inactive: - case: config is dict transformed = AbstractBenchmark._check_and_cast_configuration(not_allowed, configuration_space) assert transformed.get_dictionary() == {'hp1': 0, 'hp3': 5} + + +def test_check_return_values(): + return_values = {'function_value': 0, 'cost': 0} + AbstractBenchmark._check_return_values(return_values) + + with pytest.raises(AssertionError): + AbstractBenchmark._check_return_values({'function_value': 0}) + + with pytest.raises(AssertionError): + AbstractBenchmark._check_return_values({'cost': 0}) + + +def test_check_return_values_mo(): + return_values = {'function_value': {'obj1': 0, 'obj2': 0}, 'cost': 0} + from hpobench.abstract_benchmark import AbstractMultiObjectiveBenchmark + AbstractMultiObjectiveBenchmark._check_return_values(return_values) diff --git a/tests/test_data_manager.py b/tests/test_data_manager.py index 7e32ce84..cee56ccc 100644 --- a/tests/test_data_manager.py +++ b/tests/test_data_manager.py @@ -1,14 +1,13 @@ import shutil -from multiprocessing import Pool - import pytest +from multiprocessing import Pool import hpobench from hpobench.util.data_manager import NASBench_201Data, YearPredictionMSDData, ProteinStructureData, BostonHousingData -skip_message = 'We currently skip this test because it takes too much time.' 
+from hpobench.util.test_utils import DEFAULT_SKIP_MSG, check_run_all_tests -@pytest.mark.skip(reason=skip_message) +@pytest.mark.skipif(not check_run_all_tests(), reason=DEFAULT_SKIP_MSG) def test_nasbench_201_load_thread_safe(): shutil.rmtree(hpobench.config_file.data_dir / "nasbench_201", ignore_errors=True) function = lambda: NASBench_201Data(dataset='cifar100').load() @@ -16,7 +15,7 @@ def test_nasbench_201_load_thread_safe(): pool.map(function, []) -@pytest.mark.skip(reason=skip_message) +@pytest.mark.skipif(not check_run_all_tests(), reason=DEFAULT_SKIP_MSG) def test_nasbench_201_init(): data_manager = NASBench_201Data(dataset='cifar100') @@ -30,7 +29,7 @@ def test_nasbench_201_init(): assert data_manager._save_dir.exists() -@pytest.mark.skip(reason=skip_message) +@pytest.mark.skipif(not check_run_all_tests(), reason=DEFAULT_SKIP_MSG) def test_nasbench_201_load(): shutil.rmtree(hpobench.config_file.data_dir / "nasbench_201", ignore_errors=True) diff --git a/tests/test_mo_cnn.py b/tests/test_mo_cnn.py new file mode 100644 index 00000000..cded9444 --- /dev/null +++ b/tests/test_mo_cnn.py @@ -0,0 +1,51 @@ +import pytest +from hpobench.util.test_utils import DEFAULT_SKIP_MSG, check_run_all_tests + + +@pytest.mark.skipif(not check_run_all_tests(), reason=DEFAULT_SKIP_MSG) +def test_mo_cnn_seeding(): + from hpobench.container.benchmarks.mo.cnn_benchmark import FlowerCNNBenchmark + b1 = FlowerCNNBenchmark(rng=0) + b2 = FlowerCNNBenchmark(rng=0) + test_config = { + 'batch_norm': True, 'batch_size': 71, 'conv_layer_0': 194, 'conv_layer_1': 152, + 'conv_layer_2': 92, 'fc_layer_0': 65, 'fc_layer_1': 19, 'fc_layer_2': 273, + 'global_avg_pooling': True, 'kernel_size': 5, 'learning_rate_init': 0.09091283280651452, + 'n_conv_layers': 2, 'n_fc_layers': 2 + } + + result_1 = b1.objective_function(test_config, rng=1, fidelity={'budget': 3}) + result_2 = b2.objective_function(test_config, rng=1, fidelity={'budget': 3}) + for metric in result_1['function_value'].keys(): + assert result_1['function_value'][metric] == pytest.approx(result_2['function_value'][metric], abs=0.001) + + +@pytest.mark.skipif(not check_run_all_tests(), reason=DEFAULT_SKIP_MSG) +def test_mo_cnn_benchmark(): + from hpobench.container.benchmarks.mo.cnn_benchmark import FlowerCNNBenchmark + + # Check Seeding + benchmark = FlowerCNNBenchmark(rng=0) + cs = benchmark.get_configuration_space(seed=0) + cfg_1 = cs.sample_configuration() + + cs = benchmark.get_configuration_space(seed=0) + cfg_2 = cs.sample_configuration() + + assert cfg_1 == cfg_2 + + test_config = { + 'batch_norm': True, 'batch_size': 71, 'conv_layer_0': 194, 'conv_layer_1': 152, + 'conv_layer_2': 92, 'fc_layer_0': 65, 'fc_layer_1': 19, 'fc_layer_2': 273, + 'global_avg_pooling': True, 'kernel_size': 5, 'learning_rate_init': 0.09091283280651452, + 'n_conv_layers': 2, 'n_fc_layers': 2 + } + + result_1 = benchmark.objective_function(test_config, rng=1, fidelity={'budget': 3}) + result_2 = benchmark.objective_function(test_config, rng=1, fidelity={'budget': 3}) + print(f'MO CNN: Valid Accuracy = {result_1["info"]["valid_accuracy"]}') + print(f'MO CNN: Train Accuracy = {result_1["info"]["train_accuracy"]}') + # assert result_1['info']['train_accuracy'] == pytest.approx(0.1044, rel=0.001) + # assert result_1['info']['valid_accuracy'] == pytest.approx(0.1029, rel=0.001) + assert result_1['info']['valid_accuracy'] == pytest.approx(1 - result_1['function_value']['negative_accuracy'], abs=0.001) + assert result_1['info']['train_accuracy'] == result_2['info']['train_accuracy'] diff 
--git a/tests/test_nasbench_101.py b/tests/test_nasbench_101.py new file mode 100644 index 00000000..67ac7f65 --- /dev/null +++ b/tests/test_nasbench_101.py @@ -0,0 +1,82 @@ +import pytest +import numpy as np + +from hpobench.container.benchmarks.nas.nasbench_101 import ( + NASCifar10ABenchmark, NASCifar10BBenchmark, NASCifar10CBenchmark, + NASCifar10AMOBenchmark, NASCifar10BMOBenchmark, NASCifar10CMOBenchmark, +) + +from hpobench.util.container_utils import disable_container_debug, enable_container_debug +from hpobench.util.test_utils import DEFAULT_SKIP_MSG, check_run_all_tests + +# from hpobench.util.test_utils import enable_all_tests +# enable_all_tests() + + +@pytest.fixture(scope='module') +def enable_debug(): + enable_container_debug() + yield + disable_container_debug() + + +@pytest.mark.skipif(not check_run_all_tests(), reason=DEFAULT_SKIP_MSG) +def test_nasbench101_A_SO(enable_debug): + + b = NASCifar10ABenchmark(rng=0) + cs_1 = b.get_configuration_space(seed=0) + config_1 = cs_1.sample_configuration() + cs_2 = b.get_configuration_space(seed=0) + config_2 = cs_2.sample_configuration() + assert config_1 == config_2 + + assert len(b.get_fidelity_space()) == 1 + + config = { + 'edge_0': 0, 'edge_1': 0, 'edge_10': 0, 'edge_11': 1, 'edge_12': 1, 'edge_13': 0, 'edge_14': 1, 'edge_15': 0, + 'edge_16': 0, 'edge_17': 1, 'edge_18': 1, 'edge_19': 0, 'edge_2': 0, 'edge_20': 1, 'edge_3': 0, 'edge_4': 0, + 'edge_5': 1, 'edge_6': 1, 'edge_7': 0, 'edge_8': 0, 'edge_9': 0, 'op_node_0': 'maxpool3x3', + 'op_node_1': 'conv1x1-bn-relu', 'op_node_2': 'conv3x3-bn-relu', 'op_node_3': 'conv3x3-bn-relu', + 'op_node_4': 'conv3x3-bn-relu' + } + + result = b.objective_function(configuration=config, fidelity={'budget': 108}, run_index=(0, 1, 2)) + assert result['function_value'] == pytest.approx(0.1659655372301737, abs=0.1) + assert result['cost'] == pytest.approx(853.5010070800781, abs=0.1) + assert 1 - np.mean(result['info']['valid_accuracies']) == result['function_value'] + + with pytest.raises(AssertionError): + result = b.objective_function_test(configuration=config, fidelity={'epoch': 109}) + + +@pytest.mark.skipif(not check_run_all_tests(), reason=DEFAULT_SKIP_MSG) +def test_nasbench101_C_MO(enable_debug): + b = NASCifar10CMOBenchmark(rng=0) + cs_1 = b.get_configuration_space(seed=0) + config_1 = cs_1.sample_configuration() + cs_2 = b.get_configuration_space(seed=0) + config_2 = cs_2.sample_configuration() + assert config_1 == config_2 + + assert len(b.get_fidelity_space()) == 1 + + config = { + 'edge_0': 0.9446689170495839, 'edge_1': 0.1289262976548533, 'edge_10': 0.09710127579306127, + 'edge_11': 0.09394051075844168, 'edge_12': 0.5722519057908734, 'edge_13': 0.30157481667454933, + 'edge_14': 0.9194826137446735, 'edge_15': 0.3599780644783639, 'edge_16': 0.589909976354571, + 'edge_17': 0.4536968445560453, 'edge_18': 0.21550767711355845, 'edge_19': 0.18327983621407862, + 'edge_2': 0.5864101661863267, 'edge_20': 0.47837030703998806, 'edge_3': 0.05342718178682526, + 'edge_4': 0.6956254456388572, 'edge_5': 0.3068100995451961, 'edge_6': 0.399025321703102, + 'edge_7': 0.15941446344895593, 'edge_8': 0.23274412927905685, 'edge_9': 0.0653042071517802, 'num_edges': 9, + 'op_node_0': 'conv1x1-bn-relu', 'op_node_1': 'maxpool3x3', 'op_node_2': 'conv1x1-bn-relu', + 'op_node_3': 'maxpool3x3', 'op_node_4': 'maxpool3x3' + } + + result = b.objective_function(configuration=config, fidelity={'budget': 108}, run_index=(0, 1, 2)) + assert result['function_value']['misclassification_rate'] == 
pytest.approx(0.11985842386881507, abs=0.1) + assert result['function_value']['trainable_parameters'] == 1115277 + assert result['cost'] == pytest.approx(3175.9591064453125, abs=0.1) + assert 1 - np.mean(result['info']['valid_accuracies']) == result['function_value']['misclassification_rate'] + + with pytest.raises(AssertionError): + result = b.objective_function_test(configuration=config, fidelity={'epoch': 109}) diff --git a/tests/test_nasbench_201.py b/tests/test_nasbench_201.py index 925ac911..29ef18ec 100644 --- a/tests/test_nasbench_201.py +++ b/tests/test_nasbench_201.py @@ -1,14 +1,11 @@ -import logging -logging.basicConfig(level=logging.DEBUG) - import pytest -from hpobench.benchmarks.nas.nasbench_201 import ImageNetNasBench201Benchmark, Cifar100NasBench201Benchmark, \ +from hpobench.container.benchmarks.nas.nasbench_201 import ImageNetNasBench201Benchmark, Cifar100NasBench201Benchmark, \ Cifar10ValidNasBench201Benchmark - +from hpobench.benchmarks.nas.nasbench_201 import \ + Cifar10ValidNasBench201MOBenchmark as LocalCifar10ValidNasBench201MOBenchmark from hpobench.util.container_utils import disable_container_debug, enable_container_debug - -skip_message = 'We currently skip this test because it takes too much time.' +from hpobench.util.test_utils import DEFAULT_SKIP_MSG, check_run_all_tests @pytest.fixture(scope='module') @@ -18,76 +15,97 @@ def enable_debug(): disable_container_debug() -@pytest.mark.skip(reason=skip_message) +@pytest.mark.skipif(not check_run_all_tests(), reason=DEFAULT_SKIP_MSG) def test_nasbench201_cifar10valid(enable_debug): b = Cifar10ValidNasBench201Benchmark(rng=0) - cs = b.get_configuration_space(seed=0) - config = cs.sample_configuration() - fidelity = {'epoch': 199} - - result = b.objective_function(configuration=config, fidelity=fidelity, data_seed=(777, 888, 999)) - - assert result['function_value'] == pytest.approx(0.411, abs=0.1) - assert result['cost'] == pytest.approx(6650.88, abs=0.1) - assert result['info']['train_precision'] == result['function_value'] - assert result['info']['train_cost'] == result['cost'] - - result = b.objective_function_test(configuration=config, fidelity=fidelity, data_seed=(777, 888, 999)) + cs_1 = b.get_configuration_space(seed=0) + config_1 = cs_1.sample_configuration() + cs_2 = b.get_configuration_space(seed=0) + config_2 = cs_2.sample_configuration() + assert config_1 == config_2 + + config = { + '1<-0': 'nor_conv_1x1', + '2<-0': 'nor_conv_3x3', + '2<-1': 'nor_conv_3x3', + '3<-0': 'nor_conv_1x1', + '3<-1': 'nor_conv_1x1', + '3<-2': 'nor_conv_3x3' + } + result = b.objective_function(configuration=config, fidelity={'epoch': 199}, data_seed=(777, 888, 999)) + assert result['function_value'] == pytest.approx(0.0978, abs=0.1) + assert result['cost'] == pytest.approx(11973.20, abs=0.1) + assert result['info']['valid_misclassification_rate'] == result['function_value'] + assert result['info']['valid_cost'] == result['cost'] + + result = b.objective_function_test(configuration=config, fidelity={'epoch': 200}) + assert result['function_value'] == pytest.approx(0.0970, abs=0.1) + assert result['cost'] == pytest.approx(10426.33, abs=0.2) + assert result['info']['test_misclassification_rate'] == result['function_value'] + assert result['info']['test_cost'] == result['cost'] with pytest.raises(AssertionError): result = b.objective_function_test(configuration=config, fidelity={'epoch': 10}) -@pytest.mark.skip(reason=skip_message) + +@pytest.mark.skipif(not check_run_all_tests(), reason=DEFAULT_SKIP_MSG) def 
test_nasbench201_cifar100(enable_debug): b = Cifar100NasBench201Benchmark(rng=0) - cs = b.get_configuration_space(seed=0) - config = cs.sample_configuration() + config = {'1<-0': 'nor_conv_1x1', + '2<-0': 'nor_conv_3x3', + '2<-1': 'nor_conv_3x3', + '3<-0': 'nor_conv_1x1', + '3<-1': 'nor_conv_1x1', + '3<-2': 'nor_conv_3x3'} fidelity = {'epoch': 199} result = b.objective_function(configuration=config, fidelity=fidelity, data_seed=(777, 888, 999)) - assert result is not None - assert result['function_value'] == pytest.approx(7.8259, abs=0.1) - assert result['cost'] == pytest.approx(13301.76, abs=0.1) - assert result['info']['train_precision'] == result['function_value'] - assert result['info']['train_cost'] == result['cost'] + assert result['function_value'] == pytest.approx(0.295233, abs=0.1) + assert result['cost'] == pytest.approx(19681.70, abs=0.1) + assert result['info']['valid_misclassification_rate'] == result['function_value'] + assert result['info']['valid_cost'] == result['cost'] -@pytest.mark.skip(reason=skip_message) +@pytest.mark.skipif(not check_run_all_tests(), reason=DEFAULT_SKIP_MSG) def test_nasbench201_Image(enable_debug): b = ImageNetNasBench201Benchmark(rng=0) - - cs = b.get_configuration_space(seed=0) - config = cs.sample_configuration() + config = {'1<-0': 'nor_conv_1x1', + '2<-0': 'nor_conv_3x3', + '2<-1': 'nor_conv_3x3', + '3<-0': 'nor_conv_1x1', + '3<-1': 'nor_conv_1x1', + '3<-2': 'nor_conv_3x3'} fidelity = {'epoch': 199} result = b.objective_function(configuration=config, fidelity=fidelity, data_seed=(777, 888, 999)) - assert result is not None - assert result['function_value'] == pytest.approx(62.858, abs=0.1) - assert result['cost'] == pytest.approx(40357.56, abs=0.1) - assert result['info']['train_precision'] == result['function_value'] - assert result['info']['train_cost'] == result['cost'] + assert result['function_value'] == pytest.approx(0.552167, abs=0.1) + assert result['cost'] == pytest.approx(57119.22, abs=0.1) + assert result['info']['valid_misclassification_rate'] == result['function_value'] + assert result['info']['valid_cost'] == result['cost'] def test_nasbench201_fidelity_space(): - fs = Cifar10ValidNasBench201Benchmark.get_fidelity_space() + fs = LocalCifar10ValidNasBench201MOBenchmark.get_fidelity_space() assert len(fs.get_hyperparameters()) == 1 def test_nasbench201_config(): - cs = Cifar10ValidNasBench201Benchmark.get_configuration_space(seed=0) + + cs = LocalCifar10ValidNasBench201MOBenchmark.get_configuration_space(seed=0) c = cs.sample_configuration() - func = Cifar10ValidNasBench201Benchmark.config_to_structure_func(4) - struct = func(c) - assert struct.__repr__() == '_Structure(4 nodes with |avg_pool_3x3~0|+|none~0|nor_conv_3x3~1|+' \ - '|nor_conv_3x3~0|nor_conv_3x3~1|skip_connect~2|)' + func = LocalCifar10ValidNasBench201MOBenchmark.config_to_structure_func(4) + struct = func(c) + assert struct.__repr__() == '_Structure(4 nodes with |nor_conv_1x1~0|+|nor_conv_3x3~0|nor_conv_3x3~1|+' \ + '|nor_conv_1x1~0|nor_conv_1x1~1|nor_conv_3x3~2|)' assert len(struct) == 4 - assert struct[0] == (('avg_pool_3x3', 0),) + assert struct[0] == (('nor_conv_1x1', 0),) struct_str = struct.tostr() - assert struct_str == '|avg_pool_3x3~0|+|none~0|nor_conv_3x3~1|+|nor_conv_3x3~0|nor_conv_3x3~1|skip_connect~2|' + assert struct_str == '|nor_conv_1x1~0|+|nor_conv_3x3~0|nor_conv_3x3~1|+' \ + '|nor_conv_1x1~0|nor_conv_1x1~1|nor_conv_3x3~2|' diff --git a/tests/test_od.py b/tests/test_od.py index f6ca038f..c6e2b36a 100644 --- a/tests/test_od.py +++ b/tests/test_od.py @@ 
-14,17 +14,16 @@ def test_ocsvm(): def test_kde(): - from hpobench.container.benchmarks.od.od_benchmarks import ODKernelDensityEstimation + from hpobench.container.benchmarks.od.od_benchmarks import ODKernelDensityEstimation seed = 6 benchmark = ODKernelDensityEstimation("cardio", rng=seed) config = benchmark.get_configuration_space(seed=seed).sample_configuration() - result = benchmark.objective_function_test(configuration=config, rng=seed) - print(config['kernel'], config['bandwidth'], result['function_value']) + assert config is not None - assert config['kernel'] == "exponential" - assert config['bandwidth'] == pytest.approx(15.2274, abs=0.001) - assert result['function_value'] == pytest.approx(0.14409, abs=0.0001) + test_config = {'bandwidth': 15.227439996058147, 'kernel': 'tophat', 'scaler': 'Standard'} + result = benchmark.objective_function_test(configuration=test_config, rng=seed) + assert result['function_value'] == pytest.approx(0.8675, abs=0.0001) def test_ae(): @@ -33,8 +32,13 @@ def test_ae(): benchmark = ODAutoencoder("cardio", rng=seed) config = benchmark.get_configuration_space(seed=seed).sample_configuration() - result = benchmark.objective_function(configuration=config, rng=seed) - print(config['dropout_rate'], result['function_value']) + assert config is not None + + test_config = {'activation': 'tanh', 'batch_normalization': True, + 'batch_size': 424, 'beta1': 0.8562127972330622, 'beta2': 0.9107549023256032, + 'dropout': False, 'lr': 0.0013160410886450579, 'num_latent_units': 5, + 'num_layers': 1, 'scaler': 'MinMax', 'skip_connection': True, + 'weight_decay': 0.07358821063486902, 'num_units_layer_1': 16} - assert config['dropout_rate'] == pytest.approx(0.69512, abs=0.00001) - assert result['function_value'] == pytest.approx(0.2833, abs=0.0001) + result = benchmark.objective_function(configuration=test_config, rng=seed) + assert result['function_value'] == pytest.approx(0.81378, abs=0.001) diff --git a/tests/test_paramnet.py b/tests/test_paramnet.py index 52d55f94..076f4b38 100644 --- a/tests/test_paramnet.py +++ b/tests/test_paramnet.py @@ -1,11 +1,13 @@ import pytest +import sys -# import logging -# logging.basicConfig(level=logging.DEBUG) -# from hpobench.util.container_utils import enable_container_debug -# enable_container_debug() +MSG = 'Skip this test for new (>3.9) python versions. ' \ + 'The paramnet benchmarks require a specific old scikit-learn version. This version, however, does not work under ' \ + 'python 3.10. Therefore we skip this test. The containerized version does still work under 3.10.' + +@pytest.mark.skipif(sys.version_info > (3, 9), reason=MSG) def test_load_data(): from hpobench.util.data_manager import ParamNetDataManager diff --git a/tests/test_pybnn.py b/tests/test_pybnn.py index 0e749457..f1c6b5fc 100644 --- a/tests/test_pybnn.py +++ b/tests/test_pybnn.py @@ -1,14 +1,19 @@ +import sys import pytest from hpobench.container.benchmarks.ml.pybnn import BNNOnToyFunction, BNNOnBostonHousing, BNNOnProteinStructure, \ BNNOnYearPrediction -import logging -logging.basicConfig(level=logging.DEBUG) from hpobench.util.container_utils import enable_container_debug +from hpobench.util.test_utils import check_run_all_tests, DEFAULT_SKIP_MSG + enable_container_debug() +MSG = 'Skip this test for new (>3.9) python versions. ' \ + 'These benchmarks require a specific old scikit-learn version. This version, however, does not work under ' \ + 'python 3.10. Therefore we skip this test. The containerized version does still work under 3.10.'
+@pytest.mark.skipif(sys.version_info > (3, 9), reason=MSG) def test_bnn_init(): benchmark = BNNOnToyFunction(rng=1) @@ -58,6 +63,7 @@ def test_bnn_boston_housing(): assert test_result['info']['fidelity']['budget'] == 1000 +@pytest.mark.skipif(not check_run_all_tests(), reason=DEFAULT_SKIP_MSG) def test_bnn_protein(): benchmark = BNNOnProteinStructure(rng=1) test_result = simple_call(benchmark) @@ -66,6 +72,7 @@ def test_bnn_protein(): assert test_result['info']['fidelity']['budget'] == 1000 +@pytest.mark.skipif(not check_run_all_tests(), reason=DEFAULT_SKIP_MSG) def test_year_pred(): benchmark = BNNOnYearPrediction(rng=1) test_result = simple_call(benchmark) diff --git a/tests/test_tabular_benchmarks.py b/tests/test_tabular_benchmarks.py index 59d8dd45..573a2822 100644 --- a/tests/test_tabular_benchmarks.py +++ b/tests/test_tabular_benchmarks.py @@ -134,7 +134,7 @@ def test_parkinson_benchmark(self): benchmark.objective_function_test(default_config, fidelity=dict(budget=1, )) result = benchmark.objective_function_test(configuration=default_config, fidelity=dict(budget=100)) - assert pytest.approx(0.15010187, result['function_value'], abs=0.001) + assert result['function_value'] == pytest.approx(0.15010187, abs=0.001) runtime = 62.7268 assert result['cost'] == pytest.approx(runtime, abs=0.0001) diff --git a/tests/test_utils.py b/tests/test_utils.py index 9bc5ff3b..e570dbd7 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -105,3 +105,15 @@ def test_debug_level(): disable_container_debug() assert os.environ['HPOBENCH_DEBUG'] == 'false' + + +def test_test_utils(): + from hpobench.util.test_utils import DEFAULT_SKIP_MSG, enable_all_tests, disable_all_tests, check_run_all_tests + + assert isinstance(DEFAULT_SKIP_MSG, str) + + enable_all_tests() + assert check_run_all_tests() + + disable_all_tests() + assert not check_run_all_tests() \ No newline at end of file diff --git a/tests/test_whitebox.py b/tests/test_whitebox.py index 35a9a940..585f9867 100644 --- a/tests/test_whitebox.py +++ b/tests/test_whitebox.py @@ -63,6 +63,7 @@ def test_whitebox_with_container(): assert np.isclose(test_loss, 0.43636, atol=0.001) +@pytest.mark.skipif(skip_container_test, reason="Requires singularity and flask") def test_cartpole(): from hpobench.container.benchmarks.rl.cartpole import CartpoleReduced as Benchmark b = Benchmark(container_name='cartpole', diff --git a/tests/test_yahpo.py b/tests/test_yahpo.py new file mode 100644 index 00000000..97a7d06d --- /dev/null +++ b/tests/test_yahpo.py @@ -0,0 +1,77 @@ +import sys +from typing import Dict, List + +import pytest + +from hpobench.container.benchmarks.surrogates.yahpo_gym import YAHPOGymBenchmark, YAHPOGymMOBenchmark + + +def test_yahpo_init(): + b = YAHPOGymBenchmark(scenario="lcbench", instance="167152", objective="val_accuracy") + + fs = b.get_fidelity_space(seed=0) + fidelity = fs.sample_configuration().get_dictionary() + assert isinstance(fidelity, Dict) + + cs = b.get_configuration_space(seed=0) + config = cs.sample_configuration().get_dictionary() + + # Some tests are dependent on the python version. 
+ if sys.version.startswith('3.9'): + assert fidelity['epoch'] == pytest.approx(29, abs=0.001) + assert config['OpenML_task_id'] == "167152" + assert config['num_layers'] == pytest.approx(4, abs=0.001) + assert config['max_units'] == pytest.approx(289, abs=0.0001) + assert config['weight_decay'] == pytest.approx(0.04376, abs=0.001) + assert config['learning_rate'] == pytest.approx(0.01398, abs=0.0001) + assert config['batch_size'] == pytest.approx(106, abs=0.001) + + constant_fidelity = {'epoch': 29} + constant_config = { + 'OpenML_task_id': '167152', 'batch_size': 106, 'learning_rate': 0.013981961408994055, + 'max_dropout': 0.6027633760716439, 'max_units': 289, 'momentum': 0.47705277141162516, + 'num_layers': 4, 'weight_decay': 0.04376434525415663 + } + + result = b.objective_function(configuration=constant_config, fidelity=constant_fidelity) + assert result['function_value'] == pytest.approx(61.297, abs=0.1) + assert result['cost'] == pytest.approx(119.4965, abs=0.1) + assert isinstance(result['info'], Dict) + + +def test_yahpo_mo(): + b = YAHPOGymMOBenchmark(scenario="lcbench", instance="167152") + + fs = b.get_fidelity_space(seed=0) + fidelity = fs.sample_configuration().get_dictionary() + assert isinstance(fidelity, Dict) + + cs = b.get_configuration_space(seed=0) + config = cs.sample_configuration().get_dictionary() + + # Some tests are dependent on the python version. + if sys.version.startswith('3.9'): + assert fidelity['epoch'] == pytest.approx(29, abs=0.001) + assert config['OpenML_task_id'] == "167152" + assert config['num_layers'] == pytest.approx(4, abs=0.001) + assert config['max_units'] == pytest.approx(289, abs=0.0001) + assert config['weight_decay'] == pytest.approx(0.04376, abs=0.001) + assert config['learning_rate'] == pytest.approx(0.01398, abs=0.0001) + assert config['batch_size'] == pytest.approx(106, abs=0.001) + + constant_fidelity = {'epoch': 29} + constant_config = { + 'OpenML_task_id': '167152', 'batch_size': 106, 'learning_rate': 0.013981961408994055, + 'max_dropout': 0.6027633760716439, 'max_units': 289, 'momentum': 0.47705277141162516, + 'num_layers': 4, 'weight_decay': 0.04376434525415663 + } + + result = b.objective_function(configuration=constant_config, fidelity=constant_fidelity) + assert isinstance(result['function_value'], Dict) + assert result['function_value']['val_accuracy'] == pytest.approx(61.2971, abs=0.0001) + assert result['cost'] == pytest.approx(119.4965, abs=0.0001) + + names = b.get_objective_names() + assert isinstance(names, List) + assert len(names) == 6 + assert names[2] == 'val_cross_entropy' diff --git a/tests/test_yahpo_raw.py b/tests/test_yahpo_raw.py new file mode 100644 index 00000000..65694603 --- /dev/null +++ b/tests/test_yahpo_raw.py @@ -0,0 +1,12 @@ +from hpobench.container.benchmarks.ml.yahpo_benchmark import YAHPOGymMORawBenchmark + + +def test_mo_benchmark(): + + b = YAHPOGymMORawBenchmark(scenario="iaml_xgboost", instance="40981",) + cfg = b.get_configuration_space().get_default_configuration() + b.objective_function(cfg) + + +if __name__ == '__main__': + test_mo_benchmark() \ No newline at end of file
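
Reviewer note (illustrative addition, not part of the diff above): a minimal usage sketch for the new hpobench.dependencies.mo.fairness_metrics.fairness_risk helper introduced in this patch. It assumes scikit-learn is installed; the synthetic data and the LogisticRegression model are placeholders, not the models used by the benchmarks.

import numpy as np
from sklearn.linear_model import LogisticRegression

from hpobench.dependencies.mo.fairness_metrics import STATISTICAL_DISPARITY, fairness_risk

rng = np.random.RandomState(0)
x = rng.rand(200, 5)                          # inputs
y = rng.randint(0, 2, size=200)               # labels in {0, 1}
sensitive_rows = rng.randint(0, 2, size=200)  # 1 marks rows of the protected group

model = LogisticRegression().fit(x, y)        # any classifier exposing predict_proba works here
score = fairness_risk(x, y, sensitive_rows, model, STATISTICAL_DISPARITY)
print(f'statistical disparity: {score:.4f}')  # |mean P(class 0 | group A) - mean P(class 0 | group B)|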