diff --git a/.github/workflows/dist.yml b/.github/workflows/dist.yml index 82f9aa432..11baa888f 100644 --- a/.github/workflows/dist.yml +++ b/.github/workflows/dist.yml @@ -1,33 +1,61 @@ name: dist-check -on: [push, pull_request] +on: + # Manually triggerable in github + workflow_dispatch: + + # When a push occurs on either of these branches + push: + branches: + - master + - development + + # When a push occurs on a PR that targets these branches + pull_request: + branches: + - master + - development + + schedule: + # Every day at 7AM UTC + - cron: '0 07 * * *' jobs: + dist: runs-on: ubuntu-latest + steps: - - uses: actions/checkout@v2 + - name: Checkout + uses: actions/checkout@v2 with: submodules: recursive - name: Setup Python uses: actions/setup-python@v2 with: python-version: 3.8 + - name: Build dist run: | python setup.py sdist + - name: Twine check run: | pip install twine last_dist=$(ls -t dist/autoPyTorch-*.tar.gz | head -n 1) - twine_output=`twine check "$last_dist"` - if [[ "$twine_output" != "Checking $last_dist: PASSED" ]]; then echo $twine_output && exit 1;fi + twine check "$last_dist" --strict + - name: Install dist run: | last_dist=$(ls -t dist/autoPyTorch-*.tar.gz | head -n 1) pip install $last_dist + - name: PEP 561 Compliance run: | pip install mypy - cd .. # required to use the installed version of autosklearn - if ! python -c "import autoPyTorch"; then exit 1; fi \ No newline at end of file + + cd .. # required to use the installed version of autoPyTorch + + # Note this doesn't perform mypy checks, those are handled in pre-commit.yaml + # This only checks if autoPyTorch exports type information + if ! mypy -c "import autoPyTorch"; then exit 1; fi diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml new file mode 100644 index 000000000..285ae1f9c --- /dev/null +++ b/.github/workflows/docker-publish.yml @@ -0,0 +1,80 @@ +# This workflow uses actions that are not certified by GitHub. +# They are provided by a third-party and are governed by +# separate terms of service, privacy policy, and support +# documentation. + +name: Publish Docker image + +on: + push: + # Push to `master` or `development` + branches: + - master + - development + - fixes_docker + workflow_dispatch: + +jobs: + push_to_registries: + name: Push Docker image to multiple registries + runs-on: ubuntu-latest + permissions: + packages: write + contents: read + steps: + - name: Check out the repo + uses: actions/checkout@v2 + + - name: Extract branch name + shell: bash + run: echo "##[set-output name=branch;]$(echo ${GITHUB_REF#refs/heads/})" + id: extract_branch + + - name: Log in to Docker Hub + uses: docker/login-action@f054a8b539a109f9f41c372932f1ae047eff08c9 + with: + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DOCKER_PASSWORD }} + + - name: Log in to the Container registry + uses: docker/login-action@f054a8b539a109f9f41c372932f1ae047eff08c9 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Extract metadata (tags, labels) for Docker + id: meta + uses: docker/metadata-action@98669ae865ea3cffbcbaa878cf57c20bbf1c6c38 + with: + images: | + automlorg/autopytorch + ghcr.io/${{ github.repository }} + + - name: Build and push Docker images + uses: docker/build-push-action@ad44023a93711e3deb337508980b4b5e9bcdc5dc + with: + context: . 
+ push: true + tags: ${{ steps.extract_branch.outputs.branch }} + + - name: Docker Login + run: docker login ghcr.io -u $GITHUB_ACTOR -p $GITHUB_TOKEN + env: + GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}} + + - name: Pull Docker image + run: docker pull ghcr.io/$GITHUB_REPOSITORY/autoPyTorch:$BRANCH + env: + BRANCH: ${{ steps.extract_branch.outputs.branch }} + + - name: Run image + run: docker run -i -d --name unittester -v $GITHUB_WORKSPACE:/workspace -w /workspace ghcr.io/$GITHUB_REPOSITORY/autoPyTorch:$BRANCH + env: + BRANCH: ${{ steps.extract_branch.outputs.branch }} + + - name: Auto-PyTorch loaded + run: docker exec -i unittester python3 -c 'import autoPyTorch; print(f"Auto-PyTorch imported from {autoPyTorch.__file__}")' + + - name: Run unit testing + run: docker exec -i unittester python3 -m pytest -v test \ No newline at end of file diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index f6a87c91b..480883eaa 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -1,29 +1,51 @@ name: Docs -on: [pull_request, push] + +on: + # Allow to manually trigger through github API + # Wont trigger the push to github pages where the documentation is located + workflow_dispatch: + + # Triggers with push to these branches + push: + branches: + - master + - development + + # Triggers with push to a pr aimed at these branches + pull_request: + branches: + - master + - development jobs: build-and-deploy: runs-on: ubuntu-latest + steps: - - uses: actions/checkout@v2 + - name: Checkout + uses: actions/checkout@v2 with: submodules: recursive - name: Setup Python uses: actions/setup-python@v2 with: python-version: 3.8 + - name: Install dependencies run: | - pip install -e .[docs,examples] + pip install -e .[docs,examples,forecasting] + - name: Make docs run: | cd docs make html + - name: Pull latest gh-pages if: (contains(github.ref, 'develop') || contains(github.ref, 'master')) && github.event_name == 'push' run: | cd .. 
git clone https://github.com/automl/Auto-PyTorch.git --branch gh-pages --single-branch gh-pages + - name: Copy new doc into gh-pages if: (contains(github.ref, 'develop') || contains(github.ref, 'master')) && github.event_name == 'push' run: | @@ -31,6 +53,7 @@ jobs: cd ../gh-pages rm -rf $branch_name cp -r ../Auto-PyTorch/docs/build/html $branch_name + - name: Push to gh-pages if: (contains(github.ref, 'develop') || contains(github.ref, 'master')) && github.event_name == 'push' run: | diff --git a/.github/workflows/long_regression_test.yml b/.github/workflows/long_regression_test.yml index e7ccb5ea0..c36bb5e6d 100644 --- a/.github/workflows/long_regression_test.yml +++ b/.github/workflows/long_regression_test.yml @@ -7,15 +7,15 @@ on: #- cron: '0 07 * * 2' - cron: '0 07 * * *' - jobs: - ubuntu: + ubuntu: runs-on: ubuntu-latest + strategy: + fail-fast: false matrix: python-version: [3.8] - fail-fast: false steps: - uses: actions/checkout@v2 @@ -26,10 +26,12 @@ jobs: uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} + - name: Install test dependencies run: | python -m pip install --upgrade pip - pip install -e .[test] + pip install -e .[forecasting,test] + - name: Run tests run: | python -m pytest --durations=200 cicd/test_preselected_configs.py -vs diff --git a/.github/workflows/pre-commit.yaml b/.github/workflows/pre-commit.yaml index 5e192375a..d9fd438c5 100644 --- a/.github/workflows/pre-commit.yaml +++ b/.github/workflows/pre-commit.yaml @@ -1,22 +1,44 @@ name: pre-commit -on: [push, pull_request] +on: + # Allow to manually trigger through github API + workflow_dispatch: + + # Triggers with push to these branches + push: + branches: + - master + - development + + # Triggers with push to a pr aimed at these branches + pull_request: + branches: + - master + - development jobs: + run-all-files: runs-on: ubuntu-latest + steps: - - uses: actions/checkout@v2 - with: - submodules: recursive + - name: Checkout + uses: actions/checkout@v2 + - name: Setup Python 3.7 uses: actions/setup-python@v2 with: python-version: 3.7 + + - name: Init Submodules + run: | + git submodule update --init --recursive + - name: Install pre-commit run: | pip install pre-commit pre-commit install + - name: Run pre-commit run: | pre-commit run --all-files diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index fed77c484..0ac1d04e1 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -1,42 +1,118 @@ name: Tests -on: [push, pull_request] +on: + # Allow to manually trigger through github API + workflow_dispatch: + + # Triggers with push to these branches + push: + branches: + - master + - development + + # Triggers with push to pr targeting these branches + pull_request: + branches: + - master + - development + + schedule: + # Every day at 7AM UTC + - cron: '0 07 * * *' + +env: + + # Arguments used for pytest + pytest-args: >- + --forked + --durations=20 + --timeout=600 + --timeout-method=signal + -v + + # Arguments used for code-cov which is later used to annotate PR's on github + code-cov-args: >- + --cov=autoPyTorch + --cov-report=xml + --cov-config=.coveragerc jobs: - ubuntu: + tests: + + name: ${{ matrix.os }}-${{ matrix.python-version }}-${{ matrix.kind }} + runs-on: ${{ matrix.os }} - runs-on: ubuntu-latest strategy: + fail-fast: false matrix: - python-version: [3.7, 3.8, 3.9] + os: [windows-latest, macos-latest, ubuntu-latest] + python-version: ['3.7', '3.8', '3.9', '3.10'] + kind: ['source', 'dist'] + + exclude: + # Exclude all 
configurations *-*-dist, include one later + - kind: 'dist' + + # Exclude windows as bash commands wont work in windows runner + - os: windows-latest + + # Exclude macos as there are permission errors using conda as we do + - os: macos-latest + + # Exclude python 3.10 as torch is not support python 3.10 yet + - python-version: '3.10' + include: - - python-version: 3.8 + # Add the tag code-cov to ubuntu-3.7-source + - os: ubuntu-latest + python-version: 3.7 + kind: 'source' code-cov: true - fail-fast: false - max-parallel: 2 + + # Include one config with dist, ubuntu-3.7-dist + - os: ubuntu-latest + python-version: 3.7 + kind: 'dist' steps: - - uses: actions/checkout@v2 - with: - submodules: recursive + - name: Checkout + uses: actions/checkout@v2 + + - name: Setup Python ${{ matrix.python-version }} uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - - name: Install test dependencies + + - name: Source install + if: matrix.kind == 'source' run: | + git submodule update --init --recursive python -m pip install --upgrade pip - pip install -e .[test] + pip install -e .[forecasting,test] + + - name: Dist install + if: matrix.kind == 'dist' + run: | + git submodule update --init --recursive + + python setup.py sdist + last_dist=$(ls -t dist/autoPyTorch-*.tar.gz | head -n 1) + pip install $last_dist[forecasting,test] + - name: Store repository status id: status-before run: | echo "::set-output name=BEFORE::$(git status --porcelain -b)" + - name: Run tests run: | if [ ${{ matrix.code-cov }} ]; then - codecov='--cov=autoPyTorch --cov-report=xml --cov-config=.coveragerc'; + python -m pytest ${{ env.pytest-args }} ${{ env.code-cov-args }} test + else + python -m pytest ${{ env.pytest-args }} test fi - python -m pytest --forked --durations=20 --timeout=600 --timeout-method=signal -v $codecov test + - name: Check for files left behind by test if: ${{ always() }} run: | @@ -48,6 +124,7 @@ jobs: echo "Not all generated files have been deleted!" 
exit 1 fi + - name: Upload coverage if: matrix.code-cov && always() uses: codecov/codecov-action@v1 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index d76014c44..c9b2e7615 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -8,7 +8,7 @@ on: workflow_dispatch: jobs: - build-n-publish: + publish: runs-on: "ubuntu-latest" steps: @@ -50,4 +50,4 @@ jobs: uses: pypa/gh-action-pypi-publish@master with: user: __token__ - password: ${{ secrets.PYPI_TOKEN }} + password: ${{ secrets.pypi_token }} diff --git a/.github/workflows/scheduled_test.yml b/.github/workflows/scheduled_test.yml deleted file mode 100644 index ce9615b0c..000000000 --- a/.github/workflows/scheduled_test.yml +++ /dev/null @@ -1,35 +0,0 @@ -name: Tests - -on: - schedule: - # Every Monday at 7AM UTC - - cron: '0 07 * * 1' - - -jobs: - ubuntu: - - runs-on: ubuntu-latest - strategy: - matrix: - python-version: [3.8] - fail-fast: false - max-parallel: 2 - - steps: - - uses: actions/checkout@v2 - with: - ref: master - submodules: recursive - - name: Setup Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Install test dependencies - run: | - git submodule update --init --recursive - python -m pip install --upgrade pip - pip install -e .[test] - - name: Run tests - run: | - python -m pytest --forked --durations=20 --timeout=600 --timeout-method=signal -v test \ No newline at end of file diff --git a/MANIFEST.in b/MANIFEST.in index 2f6b9ae8b..4096cc1b6 100755 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,5 @@ include requirements.txt +include autoPyTorch/py.typed include autoPyTorch/utils/logging.yaml include autoPyTorch/configs/default_pipeline_options.json include autoPyTorch/configs/greedy_portfolio.json diff --git a/README.md b/README.md index 92f63c387..d64eff179 100755 --- a/README.md +++ b/README.md @@ -4,10 +4,13 @@ Copyright (C) 2021 [AutoML Groups Freiburg and Hannover](http://www.automl.org/ While early AutoML frameworks focused on optimizing traditional ML pipelines and their hyperparameters, another trend in AutoML is to focus on neural architecture search. To bring the best of these two worlds together, we developed **Auto-PyTorch**, which jointly and robustly optimizes the network architecture and the training hyperparameters to enable fully automated deep learning (AutoDL). -Auto-PyTorch is mainly developed to support tabular data (classification, regression). +Auto-PyTorch is mainly developed to support tabular data (classification, regression) and time series data (forecasting). The newest features in Auto-PyTorch for tabular data are described in the paper ["Auto-PyTorch Tabular: Multi-Fidelity MetaLearning for Efficient and Robust AutoDL"](https://arxiv.org/abs/2006.13799) (see below for bibtex ref). +Details about Auto-PyTorch for multi-horizontal time series forecasting tasks can be found in the paper ["Efficient Automated Deep Learning for Time Series Forecasting"](https://arxiv.org/abs/2205.05511) (also see below for bibtex ref). + Also, find the documentation [here](https://automl.github.io/Auto-PyTorch/master). + ***From v0.1.0, AutoPyTorch has been updated to further improve usability, robustness and efficiency by using SMAC as the underlying optimization package as well as changing the code structure. Therefore, moving from v0.0.2 to v0.1.0 will break compatibility. 
In case you would like to use the old API, you can find it at [`master_old`](https://github.com/automl/Auto-PyTorch/tree/master-old).*** @@ -23,10 +26,11 @@ The current version only supports the *greedy portfolio* as described in the pap This portfolio is used to warm-start the optimization of SMAC. In other words, we evaluate the portfolio on a provided data as initial configurations. Then API starts the following procedures: - 1. **Validate input data**: Process each data type, e.g. encoding categorical data, so that Auto-Pytorch can handled. 2. **Create dataset**: Create a dataset that can be handled in this API with a choice of cross validation or holdout splits. -3. **Evaluate baselines** *1: Train each algorithm in the predefined pool with a fixed hyperparameter configuration and dummy model from `sklearn.dummy` that represents the worst possible performance. +3. **Evaluate baselines** + * ***Tabular dataset*** *1: Train each algorithm in the predefined pool with a fixed hyperparameter configuration and dummy model from `sklearn.dummy` that represents the worst possible performance. + * ***Time Series Forecasting dataset*** : Train a dummy predictor that repeats the last observed value in each series 4. **Search by [SMAC](https://github.com/automl/SMAC3)**:\ a. Determine budget and cut-off rules by [Hyperband](https://jmlr.org/papers/volume18/16-558/16-558.pdf)\ b. Sample a pipeline hyperparameter configuration *2 by SMAC\ @@ -49,6 +53,14 @@ pip install autoPyTorch ``` +Auto-PyTorch for Time Series Forecasting requires additional dependencies + +```sh + +pip install autoPyTorch[forecasting] + +``` + ### Manual Installation We recommend using Anaconda for developing as follows: @@ -69,6 +81,20 @@ python setup.py install ``` +Similarly, to install all the dependencies for Auto-PyTorch-TimeSeriesForecasting: + + +```sh + +git submodule update --init --recursive + +conda create -n auto-pytorch python=3.8 +conda activate auto-pytorch +conda install swig +pip install -e[forecasting] + +``` + ## Examples In a nutshell: @@ -104,6 +130,66 @@ score = api.score(y_pred, y_test) print("Accuracy score", score) ``` +For Time Series Forecasting Tasks +```py + +from autoPyTorch.api.time_series_forecasting import TimeSeriesForecastingTask + +# data and metric imports +from sktime.datasets import load_longley +targets, features = load_longley() + +# define the forecasting horizon +forecasting_horizon = 3 + +# Dataset optimized by APT-TS can be a list of np.ndarray/ pd.DataFrame where each series represents an element in the +# list, or a single pd.DataFrame that records the series +# index information: to which series the timestep belongs? This id can be stored as the DataFrame's index or a separate +# column +# Within each series, we take the last forecasting_horizon as test targets. The items before that as training targets +# Normally the value to be forecasted should follow the training sets +y_train = [targets[: -forecasting_horizon]] +y_test = [targets[-forecasting_horizon:]] + +# same for features. For uni-variant models, X_train, X_test can be omitted and set as None +X_train = [features[: -forecasting_horizon]] +# Here x_test indicates the 'known future features': they are the features known previously, features that are unknown +# could be replaced with NAN or zeros (which will not be used by our networks). 
If no feature is known beforehand, +# we could also omit X_test +known_future_features = list(features.columns) +X_test = [features[-forecasting_horizon:]] + +start_times = [targets.index.to_timestamp()[0]] +freq = '1Y' + +# initialise Auto-PyTorch api +api = TimeSeriesForecastingTask() + +# Search for an ensemble of machine learning algorithms +api.search( + X_train=X_train, + y_train=y_train, + X_test=X_test, + optimize_metric='mean_MAPE_forecasting', + n_prediction_steps=forecasting_horizon, + memory_limit=16 * 1024, # Currently, forecasting models use much more memories + freq=freq, + start_times=start_times, + func_eval_time_limit_secs=50, + total_walltime_limit=60, + min_num_test_instances=1000, # proxy validation sets. This only works for the tasks with more than 1000 series + known_future_features=known_future_features, +) + +# our dataset could directly generate sequences for new datasets +test_sets = api.dataset.generate_test_seqs() + +# Calculate test accuracy +y_pred = api.predict(test_sets) +score = api.score(y_pred, y_test) +print("Forecasting score", score) +``` + For more examples including customising the search space, parellising the code, etc, checkout the `examples` folder ```sh @@ -162,6 +248,17 @@ Please refer to the branch `TPAMI.2021.3067763` to reproduce the paper *Auto-PyT } ``` +```bibtex +@article{deng-ecml22, + author = {Difan Deng and Florian Karl and Frank Hutter and Bernd Bischl and Marius Lindauer}, + title = {Efficient Automated Deep Learning for Time Series Forecasting}, + year = {2022}, + booktitle = {Machine Learning and Knowledge Discovery in Databases. Research Track + - European Conference, {ECML} {PKDD} 2022}, + url = {https://doi.org/10.48550/arXiv.2205.05511}, +} +``` + ## Contact -Auto-PyTorch is developed by the [AutoML Group of the University of Freiburg](http://www.automl.org/). +Auto-PyTorch is developed by the [AutoML Groups of the University of Freiburg and Hannover](http://www.automl.org/). 
diff --git a/autoPyTorch/__version__.py b/autoPyTorch/__version__.py index 10b95664f..94b9a71f5 100644 --- a/autoPyTorch/__version__.py +++ b/autoPyTorch/__version__.py @@ -1,4 +1,4 @@ """Version information.""" # The following line *must* be the last in the module, exactly as formatted: -__version__ = "0.1.1" +__version__ = "0.2" diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index a997c505b..c5468eae7 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -11,7 +11,7 @@ import typing import unittest.mock import warnings -from abc import abstractmethod +from abc import ABC, abstractmethod from typing import Any, Callable, Dict, List, Optional, Tuple, Union from ConfigSpace.configuration_space import Configuration, ConfigurationSpace @@ -21,28 +21,40 @@ import joblib +import matplotlib.pyplot as plt + import numpy as np import pandas as pd -from smac.runhistory.runhistory import DataOrigin, RunHistory +from smac.runhistory.runhistory import DataOrigin, RunHistory, RunInfo, RunValue from smac.stats.stats import Stats from smac.tae import StatusType -from autoPyTorch.api.results_manager import ResultsManager, SearchResults +from autoPyTorch import metrics from autoPyTorch.automl_common.common.utils.backend import Backend, create from autoPyTorch.constants import ( + FORECASTING_BUDGET_TYPE, + FORECASTING_TASKS, REGRESSION_TASKS, STRING_TO_OUTPUT_TYPES, STRING_TO_TASK_TYPES, + TIMESERIES_FORECASTING, ) from autoPyTorch.data.base_validator import BaseInputValidator +from autoPyTorch.data.utils import DatasetCompressionSpec from autoPyTorch.datasets.base_dataset import BaseDataset, BaseDatasetPropertiesType -from autoPyTorch.datasets.resampling_strategy import CrossValTypes, HoldoutValTypes +from autoPyTorch.datasets.resampling_strategy import ( + CrossValTypes, + HoldoutValTypes, + NoResamplingStrategyTypes, + ResamplingStrategies, +) from autoPyTorch.ensemble.ensemble_builder import EnsembleBuilderManager from autoPyTorch.ensemble.singlebest_ensemble import SingleBest from autoPyTorch.evaluation.abstract_evaluator import fit_and_suppress_warnings from autoPyTorch.evaluation.tae import ExecuteTaFuncWithQueue, get_cost_of_crash +from autoPyTorch.evaluation.utils import DisableFileOutputParameters from autoPyTorch.optimizer.smbo import AutoMLSMBO from autoPyTorch.pipeline.base_pipeline import BasePipeline from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner import get_available_traditional_learners @@ -58,6 +70,8 @@ ) from autoPyTorch.utils.parallel import preload_modules from autoPyTorch.utils.pipeline import get_configuration_space, get_dataset_requirements +from autoPyTorch.utils.results_manager import MetricResults, ResultsManager, SearchResults +from autoPyTorch.utils.results_visualizer import ColorLabelSettings, PlotSettingParams, ResultsVisualizer from autoPyTorch.utils.single_thread_client import SingleThreadedClient from autoPyTorch.utils.stopwatch import StopWatch @@ -66,7 +80,8 @@ def _pipeline_predict(pipeline: BasePipeline, X: Union[np.ndarray, pd.DataFrame], batch_size: int, logger: PicklableClientLogger, - task: int) -> np.ndarray: + task: int, + task_type: str = "") -> np.ndarray: @typing.no_type_check def send_warnings_to_log( message, category, filename, lineno, file=None, line=None): @@ -76,7 +91,7 @@ def send_warnings_to_log( X_ = X.copy() with warnings.catch_warnings(): warnings.showwarning = send_warnings_to_log - if task in REGRESSION_TASKS: + if task in REGRESSION_TASKS or task in 
FORECASTING_TASKS: # Voting regressor does not support batch size prediction = pipeline.predict(X_) else: @@ -90,17 +105,17 @@ def send_warnings_to_log( prediction, np.sum(prediction, axis=1) )) - - if len(prediction.shape) < 1 or len(X_.shape) < 1 or \ - X_.shape[0] < 1 or prediction.shape[0] != X_.shape[0]: - logger.warning( - "Prediction shape for model %s is %s while X_.shape is %s", - pipeline, str(prediction.shape), str(X_.shape) - ) + if STRING_TO_TASK_TYPES.get(task_type, -1) != TIMESERIES_FORECASTING: + if len(prediction.shape) < 1 or len(X_.shape) < 1 or \ + X_.shape[0] < 1 or prediction.shape[0] != X_.shape[0]: + logger.warning( + "Prediction shape for model %s is %s while X_.shape is %s", + pipeline, str(prediction.shape), str(X_.shape) + ) return prediction -class BaseTask: +class BaseTask(ABC): """ Base class for the tasks that serve as API to the pipelines. @@ -130,13 +145,23 @@ class BaseTask: delete_tmp_folder_after_terminate (bool): Determines whether to delete the temporary directory, when finished - include_components (Optional[Dict]): - If None, all possible components are used. - Otherwise specifies set of components to use. - exclude_components (Optional[Dict]): - If None, all possible components are used. - Otherwise specifies set of components not to use. - Incompatible with include components + include_components (Optional[Dict[str, Any]]): + Dictionary containing components to include. Key is the node + name and Value is an Iterable of the names of the components + to include. Only these components will be present in the + search space. + exclude_components (Optional[Dict[str, Any]]): + Dictionary containing components to exclude. Key is the node + name and Value is an Iterable of the names of the components + to exclude. All except these components will be present in + the search space. + resampling_strategy resampling_strategy (RESAMPLING_STRATEGIES), + (default=HoldoutValTypes.holdout_validation): + strategy to split the training data. + resampling_strategy_args (Optional[Dict[str, Any]]): arguments + required for the chosen resampling strategy. If None, uses + the default values provided in DEFAULT_RESAMPLING_PARAMETERS + in ```datasets/resampling_strategy.py```. 
search_space_updates (Optional[HyperparameterSearchSpaceUpdates]): Search space updates that can be used to modify the search space of particular components or choice modules of the pipeline @@ -155,14 +180,18 @@ def __init__( output_directory: Optional[str] = None, delete_tmp_folder_after_terminate: bool = True, delete_output_folder_after_terminate: bool = True, - include_components: Optional[Dict] = None, - exclude_components: Optional[Dict] = None, + include_components: Optional[Dict[str, Any]] = None, + exclude_components: Optional[Dict[str, Any]] = None, backend: Optional[Backend] = None, - resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation, + resampling_strategy: ResamplingStrategies = HoldoutValTypes.holdout_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, task_type: Optional[str] = None ) -> None: + + if isinstance(resampling_strategy, NoResamplingStrategyTypes) and ensemble_size != 0: + raise ValueError("`NoResamplingStrategy` cannot be used for ensemble construction") + self.seed = seed self.n_jobs = n_jobs self.n_threads = n_threads @@ -193,6 +222,8 @@ def __init__( self.search_space: Optional[ConfigurationSpace] = None self._dataset_requirements: Optional[List[FitRequirement]] = None self._metric: Optional[autoPyTorchMetric] = None + self._metrics_kwargs: Dict = {} + self._scoring_functions: Optional[List[autoPyTorchMetric]] = None self._logger: Optional[PicklableClientLogger] = None self.dataset_name: Optional[str] = None @@ -219,7 +250,7 @@ def __init__( if self.n_jobs == 1: self._multiprocessing_context = 'fork' - self.InputValidator: Optional[BaseInputValidator] = None + self.input_validator: Optional[BaseInputValidator] = None self.search_space_updates = search_space_updates if search_space_updates is not None: @@ -229,19 +260,177 @@ def __init__( " HyperparameterSearchSpaceUpdates got {}".format(type(self.search_space_updates))) @abstractmethod - def build_pipeline(self, dataset_properties: Dict[str, Any]) -> BasePipeline: + def build_pipeline( + self, + dataset_properties: Dict[str, BaseDatasetPropertiesType], + include_components: Optional[Dict[str, Any]] = None, + exclude_components: Optional[Dict[str, Any]] = None, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None + ) -> BasePipeline: """ Build pipeline according to current task and for the passed dataset properties Args: - dataset_properties (Dict[str,Any]) + dataset_properties (Dict[str, Any]): + Characteristics of the dataset to guide the pipeline + choices of components + include_components (Optional[Dict[str, Any]]): + Dictionary containing components to include. Key is the node + name and Value is an Iterable of the names of the components + to include. Only these components will be present in the + search space. + exclude_components (Optional[Dict[str, Any]]): + Dictionary containing components to exclude. Key is the node + name and Value is an Iterable of the names of the components + to exclude. All except these components will be present in + the search space. 
+ search_space_updates (Optional[HyperparameterSearchSpaceUpdates]): + Search space updates that can be used to modify the search + space of particular components or choice modules of the pipeline Returns: + BasePipeline + + """ + raise NotImplementedError("Function called on BaseTask, this can only be called by " + "specific task which is a child of the BaseTask") + @abstractmethod + def _get_dataset_input_validator( + self, + X_train: Union[List, pd.DataFrame, np.ndarray], + y_train: Union[List, pd.DataFrame, np.ndarray], + X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + resampling_strategy: Optional[ResamplingStrategies] = None, + resampling_strategy_args: Optional[Dict[str, Any]] = None, + dataset_name: Optional[str] = None, + dataset_compression: Optional[DatasetCompressionSpec] = None, + **kwargs: Any + ) -> Tuple[BaseDataset, BaseInputValidator]: + """ + Returns an object of a child class of `BaseDataset` and + an object of a child class of `BaseInputValidator` according + to the current task. + + Args: + X_train (Union[List, pd.DataFrame, np.ndarray]): + Training feature set. + y_train (Union[List, pd.DataFrame, np.ndarray]): + Training target set. + X_test (Optional[Union[List, pd.DataFrame, np.ndarray]]): + Testing feature set + y_test (Optional[Union[List, pd.DataFrame, np.ndarray]]): + Testing target set + resampling_strategy (Optional[RESAMPLING_STRATEGIES]): + Strategy to split the training data. if None, uses + HoldoutValTypes.holdout_validation. + resampling_strategy_args (Optional[Dict[str, Any]]): + arguments required for the chosen resampling strategy. If None, uses + the default values provided in DEFAULT_RESAMPLING_PARAMETERS + in ```datasets/resampling_strategy.py```. + dataset_name (Optional[str]): + name of the dataset, used as experiment name. + dataset_compression (Optional[DatasetCompressionSpec]): + specifications for dataset compression. For more info check + documentation for `BaseTask.get_dataset`. + + Returns: + BaseDataset: + the dataset object + BaseInputValidator: + fitted input validator """ raise NotImplementedError + def get_dataset( + self, + X_train: Union[List, pd.DataFrame, np.ndarray], + y_train: Union[List, pd.DataFrame, np.ndarray], + X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + resampling_strategy: Optional[ResamplingStrategies] = None, + resampling_strategy_args: Optional[Dict[str, Any]] = None, + dataset_name: Optional[str] = None, + dataset_compression: Optional[DatasetCompressionSpec] = None, + **kwargs: Any + ) -> BaseDataset: + """ + Returns an object of a child class of `BaseDataset` according to the current task. + + Args: + X_train (Union[List, pd.DataFrame, np.ndarray]): + Training feature set. + y_train (Union[List, pd.DataFrame, np.ndarray]): + Training target set. + X_test (Optional[Union[List, pd.DataFrame, np.ndarray]]): + Testing feature set + y_test (Optional[Union[List, pd.DataFrame, np.ndarray]]): + Testing target set + resampling_strategy (Optional[RESAMPLING_STRATEGIES]): + Strategy to split the training data. if None, uses + HoldoutValTypes.holdout_validation. + resampling_strategy_args (Optional[Dict[str, Any]]): + arguments required for the chosen resampling strategy. If None, uses + the default values provided in DEFAULT_RESAMPLING_PARAMETERS + in ```datasets/resampling_strategy.py```. 
+ dataset_name (Optional[str]): + name of the dataset, used as experiment name. + dataset_compression (Optional[DatasetCompressionSpec]): + We compress datasets so that they fit into some predefined amount of memory. + **NOTE** + + You can also pass your own configuration with the same keys and choosing + from the available ``"methods"``. + The available options are described here: + **memory_allocation** + Absolute memory in MB, e.g. 10MB is ``"memory_allocation": 10``. + The memory used by the dataset is checked after each reduction method is + performed. If the dataset fits into the allocated memory, any further methods + listed in ``"methods"`` will not be performed. + It can be either float or int. + + **methods** + We currently provide the following methods for reducing the dataset size. + These can be provided in a list and are performed in the order as given. + * ``"precision"`` - + We reduce floating point precision as follows: + * ``np.float128 -> np.float64`` + * ``np.float96 -> np.float64`` + * ``np.float64 -> np.float32`` + * pandas dataframes are reduced using the downcast option of `pd.to_numeric` + to the lowest possible precision. + * ``subsample`` - + We subsample data such that it **fits directly into + the memory allocation** ``memory_allocation * memory_limit``. + Therefore, this should likely be the last method listed in + ``"methods"``. + Subsampling takes into account classification labels and stratifies + accordingly. We guarantee that at least one occurrence of each + label is included in the sampled set. + kwargs (Any): + can be used to pass task specific dataset arguments. Currently supports + passing `feat_types` for tabular tasks which specifies whether a feature is + 'numerical' or 'categorical'. + + Returns: + BaseDataset: + the dataset object + """ + dataset, _ = self._get_dataset_input_validator( + X_train=X_train, + y_train=y_train, + X_test=X_test, + y_test=y_test, + resampling_strategy=resampling_strategy, + resampling_strategy_args=resampling_strategy_args, + dataset_name=dataset_name, + dataset_compression=dataset_compression, + **kwargs) + + return dataset + @property def run_history(self) -> RunHistory: return self._results_manager.run_history @@ -553,14 +742,15 @@ def _do_dummy_prediction(self) -> None: backend=self._backend, seed=self.seed, metric=self._metric, + multi_objectives=["cost"], logger_port=self._logger_port, cost_for_crash=get_cost_of_crash(self._metric), abort_on_first_run_crash=False, initial_num_run=num_run, stats=stats, memory_limit=memory_limit, - disable_file_output=True if len(self._disable_file_output) > 0 else False, - all_supported_metrics=self._all_supported_metrics + disable_file_output=self._disable_file_output, + all_supported_metrics=self._all_supported_metrics, ) status, _, _, additional_info = ta.run(num_run, cutoff=self._time_for_task) @@ -636,6 +826,7 @@ def _do_traditional_prediction(self, time_left: int, func_eval_time_limit_secs: pynisher_context=self._multiprocessing_context, backend=self._backend, seed=self.seed, + multi_objectives=["cost"], metric=self._metric, logger_port=self._logger_port, cost_for_crash=get_cost_of_crash(self._metric), @@ -643,8 +834,8 @@ def _do_traditional_prediction(self, time_left: int, func_eval_time_limit_secs: initial_num_run=self._backend.get_next_num_run(), stats=stats, memory_limit=memory_limit, - disable_file_output=True if len(self._disable_file_output) > 0 else False, - all_supported_metrics=self._all_supported_metrics + disable_file_output=self._disable_file_output, + 
all_supported_metrics=self._all_supported_metrics, ) dask_futures.append([ classifier, @@ -728,8 +919,8 @@ def _search( optimize_metric: str, dataset: BaseDataset, budget_type: str = 'epochs', - min_budget: int = 5, - max_budget: int = 50, + min_budget: Union[int, float] = 5, + max_budget: Union[int, float] = 50, total_walltime_limit: int = 100, func_eval_time_limit_secs: Optional[int] = None, enable_traditional_pipeline: bool = True, @@ -739,10 +930,11 @@ def _search( tae_func: Optional[Callable] = None, all_supported_metrics: bool = True, precision: int = 32, - disable_file_output: List = [], + disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, load_models: bool = True, portfolio_selection: Optional[str] = None, - dask_client: Optional[dask.distributed.Client] = None + dask_client: Optional[dask.distributed.Client] = None, + **kwargs: Any ) -> 'BaseTask': """ Search for the best pipeline configuration for the given dataset. @@ -840,10 +1032,10 @@ def _search( precision (int: default=32): Numeric precision used when loading ensemble data. Can be either '16', '32' or '64'. - disable_file_output (Union[bool, List]): - If True, disable model and prediction output. - Can also be used as a list to pass more fine-grained - information on what to save. Allowed elements in the list are: + disable_file_output (Optional[List[Union[str, DisableFileOutputParameters]]]): + Used as a list to pass more fine-grained + information on what to save. Must be a member of `DisableFileOutputParameters`. + Allowed elements in the list are: + `y_optimization`: do not save the predictions for the optimization set, @@ -856,6 +1048,9 @@ def _search( pipelines fit on each fold. + `y_test`: do not save the predictions for the test set. + + `all`: + do not save any of the above. + For more information check `autoPyTorch.evaluation.utils.DisableFileOutputParameters`. load_models (bool: default=True): Whether to load the models after fitting AutoPyTorch. portfolio_selection (Optional[str]): @@ -867,7 +1062,14 @@ def _search( Additionally, the keyword 'greedy' is supported, which would use the default portfolio from `AutoPyTorch Tabular `_ - + kwargs: Any + additional arguments that are customed by some specific task. + For instance, forecasting tasks require: + min_num_test_instances (int): minimal number of instances used to initialize a proxy validation set + suggested_init_models (List[str]): A set of initial models suggested by the users. 
Their + hyperparameters are determined by the default configurations + custom_init_setting_path (str): The path to the initial hyperparameter configurations set by + the users Returns: self @@ -897,7 +1099,14 @@ def _search( self._backend.setup_logger(port=self._logger_port) self._all_supported_metrics = all_supported_metrics - self._disable_file_output = disable_file_output + self._disable_file_output = disable_file_output if disable_file_output is not None else [] + if ( + DisableFileOutputParameters.y_optimization in self._disable_file_output + and self.ensemble_size > 1 + ): + self._logger.warning(f"No ensemble will be created when {DisableFileOutputParameters.y_optimization}" + f" is in disable_file_output") + self._memory_limit = memory_limit self._time_for_task = total_walltime_limit # Save start time to backend @@ -922,7 +1131,10 @@ def _search( self.search_space = self.get_search_space(dataset) # Incorporate budget to pipeline config - if budget_type not in ('epochs', 'runtime'): + if budget_type not in ('epochs', 'runtime') and ( + budget_type in FORECASTING_BUDGET_TYPE + and STRING_TO_TASK_TYPES[self.task_type] != TIMESERIES_FORECASTING + ): raise ValueError("Budget type must be one ('epochs', 'runtime')" f" yet {budget_type} was provided") self.pipeline_options['budget_type'] = budget_type @@ -1028,6 +1240,7 @@ def _search( precision=precision, logger_port=self._logger_port, pynisher_context=self._multiprocessing_context, + metrics_kwargs=self._metrics_kwargs, ) self._stopwatch.stop_task(ensemble_task_name) @@ -1041,7 +1254,6 @@ def _search( if time_left_for_smac <= 0: self._logger.warning(" Not starting SMAC because there is no time left") else: - _proc_smac = AutoMLSMBO( config_space=self.search_space, dataset_name=str(dataset.dataset_name), @@ -1071,6 +1283,8 @@ def _search( search_space_updates=self.search_space_updates, portfolio_selection=portfolio_selection, pynisher_context=self._multiprocessing_context, + task_type=self.task_type, + **kwargs, ) try: run_history, self._results_manager.trajectory, budget_type = \ @@ -1135,12 +1349,19 @@ def _get_fit_dictionary( dataset: BaseDataset, split_id: int = 0 ) -> Dict[str, Any]: - X_test = dataset.test_tensors[0].copy() if dataset.test_tensors is not None else None - y_test = dataset.test_tensors[1].copy() if dataset.test_tensors is not None else None + if dataset.test_tensors is not None: + X_test = dataset.test_tensors[0].copy() if dataset.test_tensors[0] is not None else None + y_test = dataset.test_tensors[1].copy() if dataset.test_tensors[1] is not None else None + else: + X_test = None + y_test = None + + X_train = dataset.train_tensors[0].copy() if dataset.train_tensors[0] is not None else None + y_train = dataset.train_tensors[1].copy() X: Dict[str, Any] = dict({'dataset_properties': dataset_properties, 'backend': self._backend, - 'X_train': dataset.train_tensors[0].copy(), - 'y_train': dataset.train_tensors[1].copy(), + 'X_train': X_train, + 'y_train': y_train, 'X_test': X_test, 'y_test': y_test, 'train_indices': dataset.splits[split_id][0], @@ -1148,6 +1369,10 @@ def _get_fit_dictionary( 'split_id': split_id, 'num_run': self._backend.get_next_num_run(), }) + if STRING_TO_TASK_TYPES[self.task_type] == TIMESERIES_FORECASTING: + warnings.warn("Currently Time Series Forecasting tasks do not allow computing metrics " + "during training. 
It will be automatically set as False") + self.pipeline_options["metrics_during_training"] = False X.update(self.pipeline_options) return X @@ -1210,7 +1435,7 @@ def refit( # could alleviate the problem in algorithms that depend on # the ordering of the data. X = self._get_fit_dictionary( - dataset_properties=dataset_properties, + dataset_properties=copy.copy(dataset_properties), dataset=dataset, split_id=split_id) fit_and_suppress_warnings(self._logger, model, X, y=None) @@ -1219,10 +1444,30 @@ def refit( return self - def fit(self, - dataset: BaseDataset, - pipeline_config: Optional[Configuration] = None, - split_id: int = 0) -> BasePipeline: + def fit_pipeline( + self, + configuration: Configuration, + *, + dataset: Optional[BaseDataset] = None, + X_train: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + y_train: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + dataset_name: Optional[str] = None, + resampling_strategy: Optional[Union[HoldoutValTypes, CrossValTypes, NoResamplingStrategyTypes]] = None, + resampling_strategy_args: Optional[Dict[str, Any]] = None, + run_time_limit_secs: int = 60, + memory_limit: Optional[int] = None, + eval_metric: Optional[str] = None, + all_supported_metrics: bool = False, + budget_type: Optional[str] = None, + include_components: Optional[Dict[str, Any]] = None, + exclude_components: Optional[Dict[str, Any]] = None, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, + budget: Optional[float] = None, + pipeline_options: Optional[Dict] = None, + disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, + ) -> Tuple[Optional[BasePipeline], RunInfo, RunValue, BaseDataset]: """ Fit a pipeline on the given task for the budget. A pipeline configuration can be specified if None, @@ -1233,24 +1478,130 @@ def fit(self, methods. Args: - dataset (Dataset): - The argument that will provide the dataset splits. It can either - be a dictionary with the splits, or the dataset object which can - generate the splits based on different restrictions. - split_id (int: default=0): - split id to fit on. - pipeline_config (Optional[Configuration]): - configuration to fit the pipeline with. If None, - uses default + configuration (Configuration): + configuration to fit the pipeline with. + dataset (BaseDataset): + An object of the appropriate child class of `BaseDataset`, + that will be used to fit the pipeline + X_train, y_train, X_test, y_test: Union[np.ndarray, List, pd.DataFrame] + A pair of features (X_train) and targets (y_train) used to fit a + pipeline. Additionally, a holdout of this pairs (X_test, y_test) can + be provided to track the generalization performance of each stage. + dataset_name (Optional[str]): + Name of the dataset, if None, random value is used. + resampling_strategy (Optional[RESAMPLING_STRATEGIES]): + Strategy to split the training data. if None, uses + HoldoutValTypes.holdout_validation. + resampling_strategy_args (Optional[Dict[str, Any]]): + Arguments required for the chosen resampling strategy. If None, uses + the default values provided in DEFAULT_RESAMPLING_PARAMETERS + in ```datasets/resampling_strategy.py```. + dataset_name (Optional[str]): + name of the dataset, used as experiment name. + run_time_limit_secs (int: default=60): + Time limit for a single call to the machine learning model. 
+ Model fitting will be terminated if the machine learning algorithm + runs over the time limit. Set this value high enough so that + typical machine learning algorithms can be fit on the training + data. + memory_limit (Optional[int]): + Memory limit in MB for the machine learning algorithm. autopytorch + will stop fitting the machine learning algorithm if it tries + to allocate more than memory_limit MB. If None is provided, + no memory limit is set. In case of multi-processing, memory_limit + will be per job. This memory limit also applies to the ensemble + creation process. + eval_metric (Optional[str]): + Name of the metric that is used to evaluate a pipeline. + all_supported_metrics (bool: default=True): + if True, all metrics supporting current task will be calculated + for each pipeline and results will be available via cv_results + budget_type (str): + Type of budget to be used when fitting the pipeline. + It can be one of: + + + `epochs`: The training of each pipeline will be terminated after + a number of epochs have passed. This number of epochs is determined by the + budget argument of this method. + + `runtime`: The training of each pipeline will be terminated after + a number of seconds have passed. This number of seconds is determined by the + budget argument of this method. The overall fitting time of a pipeline is + controlled by func_eval_time_limit_secs. 'runtime' only controls the allocated + time to train a pipeline, but it does not consider the overall time it takes + to create a pipeline (data loading and preprocessing, other i/o operations, etc.). + include_components (Optional[Dict[str, Any]]): + Dictionary containing components to include. Key is the node + name and Value is an Iterable of the names of the components + to include. Only these components will be present in the + search space. + exclude_components (Optional[Dict[str, Any]]): + Dictionary containing components to exclude. Key is the node + name and Value is an Iterable of the names of the components + to exclude. All except these components will be present in + the search space. + search_space_updates(Optional[HyperparameterSearchSpaceUpdates]): + Updates to be made to the hyperparameter search space of the pipeline + budget (Optional[float]): + Budget to fit a single run of the pipeline. If not + provided, uses the default in the pipeline config + pipeline_options (Optional[Dict]): + Valid config options include "device", + "torch_num_threads", "early_stopping", "use_tensorboard_logger", + "metrics_during_training" + disable_file_output (Optional[List[Union[str, DisableFileOutputParameters]]]): + Used as a list to pass more fine-grained + information on what to save. Must be a member of `DisableFileOutputParameters`. + Allowed elements in the list are: + + + `y_optimization`: + do not save the predictions for the optimization set, + which would later on be used to build an ensemble. Note that SMAC + optimizes a metric evaluated on the optimization set. + + `pipeline`: + do not save any individual pipeline files + + `pipelines`: + In case of cross validation, disables saving the joint model of the + pipelines fit on each fold. + + `y_test`: + do not save the predictions for the test set. + + `all`: + do not save any of the above. + For more information check `autoPyTorch.evaluation.utils.DisableFileOutputParameters`. 
Returns: - BasePipeline: + (BasePipeline): fitted pipeline + (RunInfo): + Run information + (RunValue): + Result of fitting the pipeline + (BaseDataset): + Dataset created from the given tensors """ - self.dataset_name = dataset.dataset_name - if self._logger is None: - self._logger = self._get_logger(str(self.dataset_name)) + if dataset is None: + if ( + X_train is not None + and y_train is not None + ): + raise ValueError("No dataset provided, must provide X_train, y_train tensors") + dataset = self.get_dataset(X_train=X_train, + y_train=y_train, + X_test=X_test, + y_test=y_test, + resampling_strategy=resampling_strategy, + resampling_strategy_args=resampling_strategy_args, + dataset_name=dataset_name + ) + + # dataset_name is created inside the constructor of BaseDataset + # we expect it to be not None. This is for mypy + assert dataset.dataset_name is not None + + # TAE expects each configuration to have a config_id. + # For fitting a pipeline as it is not part of the + # search process, it makes sense to set it to 0 + configuration.__setattr__('config_id', 0) # get dataset properties dataset_requirements = get_dataset_requirements( @@ -1261,21 +1612,116 @@ def fit(self, dataset_properties = dataset.get_dataset_properties(dataset_requirements) self._backend.save_datamanager(dataset) - # build pipeline - pipeline = self.build_pipeline(dataset_properties) - if pipeline_config is not None: - pipeline.set_hyperparameters(pipeline_config) + if self._logger is None: + self._logger = self._get_logger(dataset.dataset_name) - # initialise fit dictionary - X = self._get_fit_dictionary( - dataset_properties=dataset_properties, - dataset=dataset, - split_id=split_id) + include_components = self.include_components if include_components is None else include_components + exclude_components = self.exclude_components if exclude_components is None else exclude_components + search_space_updates = self.search_space_updates if search_space_updates is None else search_space_updates - fit_and_suppress_warnings(self._logger, pipeline, X, y=None) + scenario_mock = unittest.mock.Mock() + scenario_mock.wallclock_limit = run_time_limit_secs + # This stats object is a hack - maybe the SMAC stats object should + # already be generated here! 
+ stats = Stats(scenario_mock) + + if memory_limit is None and getattr(self, '_memory_limit', None) is not None: + memory_limit = self._memory_limit + + metric = get_metrics(dataset_properties=dataset_properties, + names=[eval_metric] if eval_metric is not None else None, + all_supported_metrics=False).pop() + + pipeline_options = self.pipeline_options.copy().update(pipeline_options) if pipeline_options is not None \ + else self.pipeline_options.copy() + + assert pipeline_options is not None + + if budget_type is not None: + pipeline_options.update({'budget_type': budget_type}) + else: + budget_type = pipeline_options['budget_type'] + + budget = budget if budget is not None else pipeline_options[budget_type] + + if disable_file_output is None: + disable_file_output = getattr(self, '_disable_file_output', []) + + stats.start_timing() + + tae = ExecuteTaFuncWithQueue( + backend=self._backend, + seed=self.seed, + metric=metric, + multi_objectives=["cost"], + logger_port=self._logger_port, + cost_for_crash=get_cost_of_crash(metric), + abort_on_first_run_crash=False, + initial_num_run=self._backend.get_next_num_run(), + stats=stats, + memory_limit=memory_limit, + disable_file_output=disable_file_output, + all_supported_metrics=all_supported_metrics, + budget_type=budget_type, + include=include_components, + exclude=exclude_components, + search_space_updates=search_space_updates, + pipeline_config=pipeline_options, + pynisher_context=self._multiprocessing_context, + ) + + run_info, run_value = tae.run_wrapper( + RunInfo(config=configuration, + budget=budget, + seed=self.seed, + cutoff=run_time_limit_secs, + capped=False, + instance_specific=None, + instance=None) + ) + + fitted_pipeline = self._get_fitted_pipeline( + dataset_name=dataset.dataset_name, + pipeline_idx=run_info.config.config_id + tae.initial_num_run, + run_info=run_info, + run_value=run_value, + disable_file_output=disable_file_output + ) self._clean_logger() - return pipeline + + return fitted_pipeline, run_info, run_value, dataset + + def _get_fitted_pipeline( + self, + dataset_name: str, + pipeline_idx: int, + run_info: RunInfo, + run_value: RunValue, + disable_file_output: List[Union[str, DisableFileOutputParameters]] + ) -> Optional[BasePipeline]: + + if self._logger is None: + self._logger = self._get_logger(str(dataset_name)) + + if run_value.status != StatusType.SUCCESS: + warnings.warn(f"Fitting pipeline failed with status: {run_value.status}" + f", additional_info: {run_value.additional_info}") + return None + elif any(disable_file_output for c in ['all', 'pipeline']): + self._logger.warning("File output is disabled. 
No pipeline can returned") + return None + + if self.resampling_strategy in CrossValTypes: + load_function = self._backend.load_cv_model_by_seed_and_id_and_budget + else: + load_function = self._backend.load_model_by_seed_and_id_and_budget + + return load_function( # type: ignore[no-any-return] + seed=self.seed, + idx=pipeline_idx, + budget=float(run_info.budget), + ) def predict( self, @@ -1306,14 +1752,15 @@ def predict( # Mypy assert assert self.ensemble_ is not None, "Load models should error out if no ensemble" - if isinstance(self.resampling_strategy, HoldoutValTypes): + if isinstance(self.resampling_strategy, (HoldoutValTypes, NoResamplingStrategyTypes)): models = self.models_ elif isinstance(self.resampling_strategy, CrossValTypes): models = self.cv_models_ all_predictions = joblib.Parallel(n_jobs=n_jobs)( joblib.delayed(_pipeline_predict)( - models[identifier], X_test, batch_size, self._logger, STRING_TO_TASK_TYPES[self.task_type] + models[identifier], X_test, batch_size, self._logger, STRING_TO_TASK_TYPES[self.task_type], + self.task_type ) for identifier in self.ensemble_.get_selected_model_identifiers() ) @@ -1479,3 +1926,59 @@ def sprint_statistics(self) -> str: scoring_functions=self._scoring_functions, metric=self._metric ) + + def plot_perf_over_time( + self, + metric_name: str, + ax: Optional[plt.Axes] = None, + plot_setting_params: PlotSettingParams = PlotSettingParams(), + color_label_settings: ColorLabelSettings = ColorLabelSettings(), + *args: Any, + **kwargs: Any + ) -> None: + """ + Visualize the performance over time using matplotlib. + The plot related arguments are based on matplotlib. + Please refer to the matplotlib documentation for more details. + + Args: + metric_name (str): + The name of metric to visualize. + The names are available in + * autoPyTorch.metrics.CLASSIFICATION_METRICS + * autoPyTorch.metrics.REGRESSION_METRICS + ax (Optional[plt.Axes]): + axis to plot (subplots of matplotlib). + If None, it will be created automatically. + plot_setting_params (PlotSettingParams): + Parameters for the plot. + color_label_settings (ColorLabelSettings): + The settings of a pair of color and label for each plot. + args, kwargs (Any): + Arguments for the ax.plot. + + Note: + You might need to run `export DISPLAY=:0.0` if you are using non-GUI based environment. 
+ """ + + if not hasattr(metrics, metric_name): + raise ValueError( + f'metric_name must be in {list(metrics.CLASSIFICATION_METRICS.keys())} ' + f'or {list(metrics.REGRESSION_METRICS.keys())}, but got {metric_name}' + ) + if len(self.ensemble_performance_history) == 0: + raise RuntimeError('Visualization is available only after ensembles are evaluated.') + + results = MetricResults( + metric=getattr(metrics, metric_name), + run_history=self.run_history, + ensemble_performance_history=self.ensemble_performance_history + ) + + colors, labels = color_label_settings.extract_dicts(results) + + ResultsVisualizer().plot_perf_over_time( # type: ignore + results=results, plot_setting_params=plot_setting_params, + colors=colors, labels=labels, ax=ax, + *args, **kwargs + ) diff --git a/autoPyTorch/api/results_manager.py b/autoPyTorch/api/results_manager.py deleted file mode 100644 index e52d21613..000000000 --- a/autoPyTorch/api/results_manager.py +++ /dev/null @@ -1,326 +0,0 @@ -import io -from typing import Any, Dict, List, Optional, Tuple, Union - -from ConfigSpace.configuration_space import Configuration - -import numpy as np - -import scipy - -from smac.runhistory.runhistory import RunHistory, RunValue -from smac.tae import StatusType -from smac.utils.io.traj_logging import TrajEntry - -from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric - - -# TODO remove StatusType.RUNNING at some point in the future when the new SMAC 0.13.2 -# is the new minimum required version! -STATUS2MSG = { - StatusType.SUCCESS: 'Success', - StatusType.DONOTADVANCE: 'Success (but did not advance to higher budget)', - StatusType.TIMEOUT: 'Timeout', - StatusType.CRASHED: 'Crash', - StatusType.ABORT: 'Abort', - StatusType.MEMOUT: 'Memory out' -} - - -def cost2metric(cost: float, metric: autoPyTorchMetric) -> float: - """ - Revert cost metric evaluated in SMAC to the original metric. - - The conversion is defined in: - autoPyTorch/pipeline/components/training/metrics/utils.py::calculate_loss - cost = metric._optimum - metric._sign * original_metric_value - ==> original_metric_value = metric._sign * (metric._optimum - cost) - """ - return metric._sign * (metric._optimum - cost) - - -def _extract_metrics_info( - run_value: RunValue, - scoring_functions: List[autoPyTorchMetric] -) -> Dict[str, float]: - """ - Extract the metric information given a run_value - and a list of metrics of interest. - - Args: - run_value (RunValue): - The information for each config evaluation. - scoring_functions (List[autoPyTorchMetric]): - The list of metrics to retrieve the info. - """ - - if run_value.status not in (StatusType.SUCCESS, StatusType.DONOTADVANCE): - # Additional info for metrics is not available in this case. 
- return {metric.name: np.nan for metric in scoring_functions} - - cost_info = run_value.additional_info['opt_loss'] - avail_metrics = cost_info.keys() - - return { - metric.name: cost2metric(cost=cost_info[metric.name], metric=metric) - if metric.name in avail_metrics else np.nan - for metric in scoring_functions - } - - -class SearchResults: - def __init__( - self, - metric: autoPyTorchMetric, - scoring_functions: List[autoPyTorchMetric], - run_history: RunHistory - ): - self.metric_dict: Dict[str, List[float]] = { - metric.name: [] - for metric in scoring_functions - } - self._opt_scores: List[float] = [] - self._fit_times: List[float] = [] - self.configs: List[Configuration] = [] - self.status_types: List[str] = [] - self.budgets: List[float] = [] - self.config_ids: List[int] = [] - self.is_traditionals: List[bool] = [] - self.additional_infos: List[Optional[Dict[str, Any]]] = [] - self.rank_test_scores: np.ndarray = np.array([]) - self._scoring_functions = scoring_functions - self._metric = metric - - self._extract_results_from_run_history(run_history) - - @property - def opt_scores(self) -> np.ndarray: - return np.asarray(self._opt_scores) - - @property - def fit_times(self) -> np.ndarray: - return np.asarray(self._fit_times) - - def update( - self, - config: Configuration, - status: str, - budget: float, - fit_time: float, - config_id: int, - is_traditional: bool, - additional_info: Dict[str, Any], - score: float, - metric_info: Dict[str, float] - ) -> None: - - self.status_types.append(status) - self.configs.append(config) - self.budgets.append(budget) - self.config_ids.append(config_id) - self.is_traditionals.append(is_traditional) - self.additional_infos.append(additional_info) - self._fit_times.append(fit_time) - self._opt_scores.append(score) - - for metric_name, val in metric_info.items(): - self.metric_dict[metric_name].append(val) - - def clear(self) -> None: - self._opt_scores = [] - self._fit_times = [] - self.configs = [] - self.status_types = [] - self.budgets = [] - self.config_ids = [] - self.additional_infos = [] - self.is_traditionals = [] - self.rank_test_scores = np.array([]) - - def _extract_results_from_run_history(self, run_history: RunHistory) -> None: - """ - Extract the information to match this class format. - - Args: - run_history (RunHistory): - The history of config evals from SMAC. 
- """ - - self.clear() # Delete cache before the extraction - - for run_key, run_value in run_history.data.items(): - config_id = run_key.config_id - config = run_history.ids_config[config_id] - - status_msg = STATUS2MSG.get(run_value.status, None) - if run_value.status in (StatusType.STOP, StatusType.RUNNING): - continue - elif status_msg is None: - raise ValueError(f'Unexpected run status: {run_value.status}') - - is_traditional = False # If run is not successful, unsure ==> not True ==> False - if run_value.additional_info is not None: - is_traditional = run_value.additional_info['configuration_origin'] == 'traditional' - - self.update( - status=status_msg, - config=config, - budget=run_key.budget, - fit_time=run_value.time, - score=cost2metric(cost=run_value.cost, metric=self._metric), - metric_info=_extract_metrics_info(run_value=run_value, scoring_functions=self._scoring_functions), - is_traditional=is_traditional, - additional_info=run_value.additional_info, - config_id=config_id - ) - - self.rank_test_scores = scipy.stats.rankdata( - -1 * self._metric._sign * self.opt_scores, # rank order - method='min' - ) - - -class ResultsManager: - def __init__(self, *args: Any, **kwargs: Any): - """ - Attributes: - run_history (RunHistory): - A `SMAC Runshistory `_ - object that holds information about the runs of the target algorithm made during search - ensemble_performance_history (List[Dict[str, Any]]): - The list of ensemble performance in the optimization. - The list includes the `timestamp`, `result on train set`, and `result on test set` - trajectory (List[TrajEntry]): - A list of all incumbent configurations during search - """ - self.run_history: RunHistory = RunHistory() - self.ensemble_performance_history: List[Dict[str, Any]] = [] - self.trajectory: List[TrajEntry] = [] - - def _check_run_history(self) -> None: - if self.run_history is None: - raise RuntimeError("No Run History found, search has not been called.") - - if self.run_history.empty(): - raise RuntimeError("Run History is empty. Something went wrong, " - "SMAC was not able to fit any model?") - - def get_incumbent_results( - self, - metric: autoPyTorchMetric, - include_traditional: bool = False - ) -> Tuple[Configuration, Dict[str, Union[int, str, float]]]: - """ - Get Incumbent config and the corresponding results - - Args: - metric (autoPyTorchMetric): - A metric that is evaluated when searching with fit AutoPytorch. - include_traditional (bool): - Whether to include results from tradtional pipelines - - Returns: - Configuration (CS.ConfigurationSpace): - The incumbent configuration - Dict[str, Union[int, str, float]]: - Additional information about the run of the incumbent configuration. 
- """ - self._check_run_history() - - results = SearchResults(metric=metric, scoring_functions=[], run_history=self.run_history) - - if not include_traditional: - non_traditional = ~np.array(results.is_traditionals) - scores = results.opt_scores[non_traditional] - indices = np.arange(len(results.configs))[non_traditional] - else: - scores = results.opt_scores - indices = np.arange(len(results.configs)) - - incumbent_idx = indices[np.nanargmax(metric._sign * scores)] - incumbent_config = results.configs[incumbent_idx] - incumbent_results = results.additional_infos[incumbent_idx] - - assert incumbent_results is not None # mypy check - return incumbent_config, incumbent_results - - def get_search_results( - self, - scoring_functions: List[autoPyTorchMetric], - metric: autoPyTorchMetric - ) -> SearchResults: - """ - This attribute is populated with data from `self.run_history` - and contains information about the configurations, and their - corresponding metric results, status of run, parameters and - the budget - - Args: - scoring_functions (List[autoPyTorchMetric]): - Metrics to show in the results. - metric (autoPyTorchMetric): - A metric that is evaluated when searching with fit AutoPytorch. - - Returns: - SearchResults: - An instance that contains the results from search - """ - self._check_run_history() - return SearchResults(metric=metric, scoring_functions=scoring_functions, run_history=self.run_history) - - def sprint_statistics( - self, - dataset_name: str, - scoring_functions: List[autoPyTorchMetric], - metric: autoPyTorchMetric - ) -> str: - """ - Prints statistics about the SMAC search. - - These statistics include: - - 1. Optimisation Metric - 2. Best Optimisation score achieved by individual pipelines - 3. Total number of target algorithm runs - 4. Total number of successful target algorithm runs - 5. Total number of crashed target algorithm runs - 6. Total number of target algorithm runs that exceeded the time limit - 7. Total number of successful target algorithm runs that exceeded the memory limit - - Args: - dataset_name (str): - The dataset name that was used in the run. - scoring_functions (List[autoPyTorchMetric]): - Metrics to show in the results. - metric (autoPyTorchMetric): - A metric that is evaluated when searching with fit AutoPytorch. 
- - Returns: - (str): - Formatted string with statistics - """ - search_results = self.get_search_results(scoring_functions, metric) - success_msgs = (STATUS2MSG[StatusType.SUCCESS], STATUS2MSG[StatusType.DONOTADVANCE]) - sio = io.StringIO() - sio.write("autoPyTorch results:\n") - sio.write(f"\tDataset name: {dataset_name}\n") - sio.write(f"\tOptimisation Metric: {metric}\n") - - num_runs = len(search_results.status_types) - num_success = sum([s in success_msgs for s in search_results.status_types]) - num_crash = sum([s == STATUS2MSG[StatusType.CRASHED] for s in search_results.status_types]) - num_timeout = sum([s == STATUS2MSG[StatusType.TIMEOUT] for s in search_results.status_types]) - num_memout = sum([s == STATUS2MSG[StatusType.MEMOUT] for s in search_results.status_types]) - - if num_success > 0: - best_score = metric._sign * np.nanmax(metric._sign * search_results.opt_scores) - sio.write(f"\tBest validation score: {best_score}\n") - - sio.write(f"\tNumber of target algorithm runs: {num_runs}\n") - sio.write(f"\tNumber of successful target algorithm runs: {num_success}\n") - sio.write(f"\tNumber of crashed target algorithm runs: {num_crash}\n") - sio.write(f"\tNumber of target algorithms that exceeded the time " - f"limit: {num_timeout}\n") - sio.write(f"\tNumber of target algorithms that exceeded the memory " - f"limit: {num_memout}\n") - - return sio.getvalue() diff --git a/autoPyTorch/api/tabular_classification.py b/autoPyTorch/api/tabular_classification.py index d83f1dc01..facb59f99 100644 --- a/autoPyTorch/api/tabular_classification.py +++ b/autoPyTorch/api/tabular_classification.py @@ -1,6 +1,4 @@ -import os -import uuid -from typing import Any, Callable, Dict, List, Optional, Union +from typing import Any, Callable, Dict, List, Mapping, Optional, Tuple, Union import numpy as np @@ -13,11 +11,17 @@ TASK_TYPES_TO_STRING, ) from autoPyTorch.data.tabular_validator import TabularInputValidator +from autoPyTorch.data.utils import ( + DatasetCompressionSpec, + get_dataset_compression_mapping, +) +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.datasets.resampling_strategy import ( - CrossValTypes, HoldoutValTypes, + ResamplingStrategies, ) from autoPyTorch.datasets.tabular_dataset import TabularDataset +from autoPyTorch.evaluation.utils import DisableFileOutputParameters from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates @@ -54,15 +58,25 @@ class TabularClassificationTask(BaseTask): delete_tmp_folder_after_terminate (bool): Determines whether to delete the temporary directory, when finished - include_components (Optional[Dict]): - If None, all possible components are used. - Otherwise specifies set of components to use. - exclude_components (Optional[Dict]): - If None, all possible components are used. - Otherwise specifies set of components not to use. - Incompatible with include components. + include_components (Optional[Dict[str, Any]]): + Dictionary containing components to include. Key is the node + name and Value is an Iterable of the names of the components + to include. Only these components will be present in the + search space. + exclude_components (Optional[Dict[str, Any]]): + Dictionary containing components to exclude. Key is the node + name and Value is an Iterable of the names of the components + to exclude. All except these components will be present in + the search space. 
+ resampling_strategy resampling_strategy (RESAMPLING_STRATEGIES), + (default=HoldoutValTypes.holdout_validation): + strategy to split the training data. + resampling_strategy_args (Optional[Dict[str, Any]]): arguments + required for the chosen resampling strategy. If None, uses + the default values provided in DEFAULT_RESAMPLING_PARAMETERS + in ```datasets/resampling_strategy.py```. search_space_updates (Optional[HyperparameterSearchSpaceUpdates]): - search space updates that can be used to modify the search + Search space updates that can be used to modify the search space of particular components or choice modules of the pipeline """ def __init__( @@ -78,9 +92,9 @@ def __init__( output_directory: Optional[str] = None, delete_tmp_folder_after_terminate: bool = True, delete_output_folder_after_terminate: bool = True, - include_components: Optional[Dict] = None, - exclude_components: Optional[Dict] = None, - resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation, + include_components: Optional[Dict[str, Any]] = None, + exclude_components: Optional[Dict[str, Any]] = None, + resampling_strategy: ResamplingStrategies = HoldoutValTypes.holdout_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, backend: Optional[Backend] = None, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None @@ -106,18 +120,121 @@ def __init__( task_type=TASK_TYPES_TO_STRING[TABULAR_CLASSIFICATION], ) - def build_pipeline(self, dataset_properties: Dict[str, Any]) -> TabularClassificationPipeline: + def build_pipeline( + self, + dataset_properties: Dict[str, BaseDatasetPropertiesType], + include_components: Optional[Dict[str, Any]] = None, + exclude_components: Optional[Dict[str, Any]] = None, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None + ) -> TabularClassificationPipeline: + """ + Build pipeline according to current task + and for the passed dataset properties + + Args: + dataset_properties (Dict[str, Any]): + Characteristics of the dataset to guide the pipeline + choices of components + include_components (Optional[Dict[str, Any]]): + Dictionary containing components to include. Key is the node + name and Value is an Iterable of the names of the components + to include. Only these components will be present in the + search space. + exclude_components (Optional[Dict[str, Any]]): + Dictionary containing components to exclude. Key is the node + name and Value is an Iterable of the names of the components + to exclude. All except these components will be present in + the search space. 
+ search_space_updates (Optional[HyperparameterSearchSpaceUpdates]): + Search space updates that can be used to modify the search + space of particular components or choice modules of the pipeline + + Returns: + TabularClassificationPipeline + + """ + return TabularClassificationPipeline(dataset_properties=dataset_properties, + include=include_components, + exclude=exclude_components, + search_space_updates=search_space_updates) + + def _get_dataset_input_validator( + self, + X_train: Union[List, pd.DataFrame, np.ndarray], + y_train: Union[List, pd.DataFrame, np.ndarray], + X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + resampling_strategy: Optional[ResamplingStrategies] = None, + resampling_strategy_args: Optional[Dict[str, Any]] = None, + dataset_name: Optional[str] = None, + dataset_compression: Optional[DatasetCompressionSpec] = None, + **kwargs: Any, + ) -> Tuple[TabularDataset, TabularInputValidator]: """ - Build pipeline according to current task and for the passed dataset properties + Returns an object of `TabularDataset` and an object of + `TabularInputValidator` according to the current task. Args: - dataset_properties (Dict[str,Any]) + X_train (Union[List, pd.DataFrame, np.ndarray]): + Training feature set. + y_train (Union[List, pd.DataFrame, np.ndarray]): + Training target set. + X_test (Optional[Union[List, pd.DataFrame, np.ndarray]]): + Testing feature set + y_test (Optional[Union[List, pd.DataFrame, np.ndarray]]): + Testing target set + resampling_strategy (Optional[RESAMPLING_STRATEGIES]): + Strategy to split the training data. if None, uses + HoldoutValTypes.holdout_validation. + resampling_strategy_args (Optional[Dict[str, Any]]): + arguments required for the chosen resampling strategy. If None, uses + the default values provided in DEFAULT_RESAMPLING_PARAMETERS + in ```datasets/resampling_strategy.py```. + dataset_name (Optional[str]): + name of the dataset, used as experiment name. + dataset_compression (Optional[DatasetCompressionSpec]): + specifications for dataset compression. For more info check + documentation for `BaseTask.get_dataset`. + kwargs (Any): + Currently for tabular tasks, expect `feat_types: (Optional[List[str]]` which + specifies whether a feature is 'numerical' or 'categorical'. Returns: - TabularClassificationPipeline: - Pipeline compatible with the given dataset properties. + TabularDataset: + the dataset object. + TabularInputValidator: + the input validator fitted on the data. 
""" - return TabularClassificationPipeline(dataset_properties=dataset_properties) + + resampling_strategy = resampling_strategy if resampling_strategy is not None else self.resampling_strategy + resampling_strategy_args = resampling_strategy_args if resampling_strategy_args is not None else \ + self.resampling_strategy_args + + feat_types = kwargs.pop('feat_types', None) + # Create a validator object to make sure that the data provided by + # the user matches the autopytorch requirements + input_validator = TabularInputValidator( + is_classification=True, + logger_port=self._logger_port, + dataset_compression=dataset_compression, + feat_types=feat_types + ) + + # Fit a input validator to check the provided data + # Also, an encoder is fit to both train and test data, + # to prevent unseen categories during inference + input_validator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test) + + dataset = TabularDataset( + X=X_train, Y=y_train, + X_test=X_test, Y_test=y_test, + validator=input_validator, + resampling_strategy=resampling_strategy, + resampling_strategy_args=resampling_strategy_args, + dataset_name=dataset_name + ) + + return dataset, input_validator def search( self, @@ -127,20 +244,22 @@ def search( X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, dataset_name: Optional[str] = None, + feat_types: Optional[List[str]] = None, budget_type: str = 'epochs', min_budget: int = 5, max_budget: int = 50, total_walltime_limit: int = 100, func_eval_time_limit_secs: Optional[int] = None, enable_traditional_pipeline: bool = True, - memory_limit: Optional[int] = 4096, + memory_limit: int = 4096, smac_scenario_args: Optional[Dict[str, Any]] = None, get_smac_object_callback: Optional[Callable] = None, all_supported_metrics: bool = True, precision: int = 32, - disable_file_output: List = [], + disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, load_models: bool = True, portfolio_selection: Optional[str] = None, + dataset_compression: Union[Mapping[str, Any], bool] = False, ) -> 'BaseTask': """ Search for the best pipeline configuration for the given dataset. @@ -154,6 +273,10 @@ def search( A pair of features (X_train) and targets (y_train) used to fit a pipeline. Additionally, a holdout of this pairs (X_test, y_test) can be provided to track the generalization performance of each stage. + feat_types (Optional[List[str]]): + Description about the feature types of the columns. + Accepts `numerical` for integers, float data and `categorical` + for categories, strings and bool. Defaults to None. optimize_metric (str): name of the metric that is used to evaluate a pipeline. budget_type (str): @@ -209,7 +332,7 @@ def search( feature by turning this flag to False. All machine learning algorithms that are fitted during search() are considered for ensemble building. - memory_limit (Optional[int]: default=4096): + memory_limit (int: default=4096): Memory limit in MB for the machine learning algorithm. Autopytorch will stop fitting the machine learning algorithm if it tries to allocate more than memory_limit MB. If None @@ -237,10 +360,10 @@ def search( precision (int: default=32): Numeric precision used when loading ensemble data. Can be either '16', '32' or '64'. - disable_file_output (Union[bool, List]): - If True, disable model and prediction output. - Can also be used as a list to pass more fine-grained - information on what to save. 
Allowed elements in the list are: + disable_file_output (Optional[List[Union[str, DisableFileOutputParameters]]]): + Used as a list to pass more fine-grained + information on what to save. Must be a member of `DisableFileOutputParameters`. + Allowed elements in the list are: + `y_optimization`: do not save the predictions for the optimization set, @@ -253,6 +376,9 @@ def search( pipelines fit on each fold. + `y_test`: do not save the predictions for the test set. + + `all`: + do not save any of the above. + For more information check `autoPyTorch.evaluation.utils.DisableFileOutputParameters`. load_models (bool: default=True): Whether to load the models after fitting AutoPyTorch. portfolio_selection (Optional[str]): @@ -264,37 +390,62 @@ def search( Additionally, the keyword 'greedy' is supported, which would use the default portfolio from `AutoPyTorch Tabular `_. + dataset_compression: Union[bool, Mapping[str, Any]] = True + We compress datasets so that they fit into some predefined amount of memory. + **NOTE** + + Default configuration when left as ``True``: + .. code-block:: python + { + "memory_allocation": 0.1, + "methods": ["precision"] + } + You can also pass your own configuration with the same keys and choosing + from the available ``"methods"``. + The available options are described here: + **memory_allocation** + By default, we attempt to fit the dataset into ``0.1 * memory_limit``. This + float value can be set with ``"memory_allocation": 0.1``. We also allow for + specifying absolute memory in MB, e.g. 10MB is ``"memory_allocation": 10``. + The memory used by the dataset is checked after each reduction method is + performed. If the dataset fits into the allocated memory, any further methods + listed in ``"methods"`` will not be performed. + + **methods** + We currently provide the following methods for reducing the dataset size. + These can be provided in a list and are performed in the order as given. + * ``"precision"`` - + We reduce floating point precision as follows: + * ``np.float128 -> np.float64`` + * ``np.float96 -> np.float64`` + * ``np.float64 -> np.float32`` + * pandas dataframes are reduced using the downcast option of `pd.to_numeric` + to the lowest possible precision. + * ``subsample`` - + We subsample data such that it **fits directly into + the memory allocation** ``memory_allocation * memory_limit``. + Therefore, this should likely be the last method listed in + ``"methods"``. + Subsampling takes into account classification labels and stratifies + accordingly. We guarantee that at least one occurrence of each + label is included in the sampled set. 
Returns: self """ - if dataset_name is None: - dataset_name = str(uuid.uuid1(clock_seq=os.getpid())) - - # we have to create a logger for at this point for the validator - self._logger = self._get_logger(dataset_name) - - # Create a validator object to make sure that the data provided by - # the user matches the autopytorch requirements - self.InputValidator = TabularInputValidator( - is_classification=True, - logger_port=self._logger_port, - ) + self._dataset_compression = get_dataset_compression_mapping(memory_limit, dataset_compression) - # Fit a input validator to check the provided data - # Also, an encoder is fit to both train and test data, - # to prevent unseen categories during inference - self.InputValidator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test) - - self.dataset = TabularDataset( - X=X_train, Y=y_train, - X_test=X_test, Y_test=y_test, - validator=self.InputValidator, - dataset_name=dataset_name, + self.dataset, self.input_validator = self._get_dataset_input_validator( + X_train=X_train, + y_train=y_train, + X_test=X_test, + y_test=y_test, resampling_strategy=self.resampling_strategy, resampling_strategy_args=self.resampling_strategy_args, - ) + dataset_name=dataset_name, + dataset_compression=self._dataset_compression, + feat_types=feat_types) return self._search( dataset=self.dataset, @@ -331,28 +482,28 @@ def predict( Returns: Array with estimator predictions. """ - if self.InputValidator is None or not self.InputValidator._is_fitted: + if self.input_validator is None or not self.input_validator._is_fitted: raise ValueError("predict() is only supported after calling search. Kindly call first " - "the estimator fit() method.") + "the estimator search() method.") - X_test = self.InputValidator.feature_validator.transform(X_test) + X_test = self.input_validator.feature_validator.transform(X_test) predicted_probabilities = super().predict(X_test, batch_size=batch_size, n_jobs=n_jobs) - if self.InputValidator.target_validator.is_single_column_target(): + if self.input_validator.target_validator.is_single_column_target(): predicted_indexes = np.argmax(predicted_probabilities, axis=1) else: predicted_indexes = (predicted_probabilities > 0.5).astype(int) # Allow to predict in the original domain -- that is, the user is not interested # in our encoded values - return self.InputValidator.target_validator.inverse_transform(predicted_indexes) + return self.input_validator.target_validator.inverse_transform(predicted_indexes) def predict_proba(self, X_test: Union[np.ndarray, pd.DataFrame, List], batch_size: Optional[int] = None, n_jobs: int = 1) -> np.ndarray: - if self.InputValidator is None or not self.InputValidator._is_fitted: + if self.input_validator is None or not self.input_validator._is_fitted: raise ValueError("predict() is only supported after calling search. 
Kindly call first " - "the estimator fit() method.") - X_test = self.InputValidator.feature_validator.transform(X_test) + "the estimator search() method.") + X_test = self.input_validator.feature_validator.transform(X_test) return super().predict(X_test, batch_size=batch_size, n_jobs=n_jobs) diff --git a/autoPyTorch/api/tabular_regression.py b/autoPyTorch/api/tabular_regression.py index a68990732..e0c1e4eac 100644 --- a/autoPyTorch/api/tabular_regression.py +++ b/autoPyTorch/api/tabular_regression.py @@ -1,6 +1,4 @@ -import os -import uuid -from typing import Any, Callable, Dict, List, Optional, Union +from typing import Any, Callable, Dict, List, Mapping, Optional, Tuple, Union import numpy as np @@ -13,11 +11,17 @@ TASK_TYPES_TO_STRING ) from autoPyTorch.data.tabular_validator import TabularInputValidator +from autoPyTorch.data.utils import ( + DatasetCompressionSpec, + get_dataset_compression_mapping, +) +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.datasets.resampling_strategy import ( - CrossValTypes, HoldoutValTypes, + ResamplingStrategies, ) from autoPyTorch.datasets.tabular_dataset import TabularDataset +from autoPyTorch.evaluation.utils import DisableFileOutputParameters from autoPyTorch.pipeline.tabular_regression import TabularRegressionPipeline from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates @@ -54,15 +58,25 @@ class TabularRegressionTask(BaseTask): delete_tmp_folder_after_terminate (bool): Determines whether to delete the temporary directory, when finished - include_components (Optional[Dict]): - If None, all possible components are used. - Otherwise specifies set of components to use. - exclude_components (Optional[Dict]): - If None, all possible components are used. - Otherwise specifies set of components not to use. - Incompatible with include components. + include_components (Optional[Dict[str, Any]]): + Dictionary containing components to include. Key is the node + name and Value is an Iterable of the names of the components + to include. Only these components will be present in the + search space. + exclude_components (Optional[Dict[str, Any]]): + Dictionary containing components to exclude. Key is the node + name and Value is an Iterable of the names of the components + to exclude. All except these components will be present in + the search space. + resampling_strategy resampling_strategy (RESAMPLING_STRATEGIES), + (default=HoldoutValTypes.holdout_validation): + strategy to split the training data. + resampling_strategy_args (Optional[Dict[str, Any]]): arguments + required for the chosen resampling strategy. If None, uses + the default values provided in DEFAULT_RESAMPLING_PARAMETERS + in ```datasets/resampling_strategy.py```. 
search_space_updates (Optional[HyperparameterSearchSpaceUpdates]): - search space updates that can be used to modify the search + Search space updates that can be used to modify the search space of particular components or choice modules of the pipeline """ @@ -79,9 +93,9 @@ def __init__( output_directory: Optional[str] = None, delete_tmp_folder_after_terminate: bool = True, delete_output_folder_after_terminate: bool = True, - include_components: Optional[Dict] = None, - exclude_components: Optional[Dict] = None, - resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation, + include_components: Optional[Dict[str, Any]] = None, + exclude_components: Optional[Dict[str, Any]] = None, + resampling_strategy: ResamplingStrategies = HoldoutValTypes.holdout_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, backend: Optional[Backend] = None, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None @@ -107,18 +121,120 @@ def __init__( task_type=TASK_TYPES_TO_STRING[TABULAR_REGRESSION], ) - def build_pipeline(self, dataset_properties: Dict[str, Any]) -> TabularRegressionPipeline: + def build_pipeline( + self, + dataset_properties: Dict[str, BaseDatasetPropertiesType], + include_components: Optional[Dict[str, Any]] = None, + exclude_components: Optional[Dict[str, Any]] = None, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None + ) -> TabularRegressionPipeline: """ - Build pipeline according to current task and for the passed dataset properties + Build pipeline according to current task + and for the passed dataset properties Args: - dataset_properties (Dict[str,Any]) + dataset_properties (Dict[str, Any]): + Characteristics of the dataset to guide the pipeline + choices of components + include_components (Optional[Dict[str, Any]]): + Dictionary containing components to include. Key is the node + name and Value is an Iterable of the names of the components + to include. Only these components will be present in the + search space. + exclude_components (Optional[Dict[str, Any]]): + Dictionary containing components to exclude. Key is the node + name and Value is an Iterable of the names of the components + to exclude. All except these components will be present in + the search space. + search_space_updates (Optional[HyperparameterSearchSpaceUpdates]): + Search space updates that can be used to modify the search + space of particular components or choice modules of the pipeline Returns: TabularRegressionPipeline: - Pipeline compatible with the given dataset properties. + """ - return TabularRegressionPipeline(dataset_properties=dataset_properties) + return TabularRegressionPipeline(dataset_properties=dataset_properties, + include=include_components, + exclude=exclude_components, + search_space_updates=search_space_updates) + + def _get_dataset_input_validator( + self, + X_train: Union[List, pd.DataFrame, np.ndarray], + y_train: Union[List, pd.DataFrame, np.ndarray], + X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + resampling_strategy: Optional[ResamplingStrategies] = None, + resampling_strategy_args: Optional[Dict[str, Any]] = None, + dataset_name: Optional[str] = None, + dataset_compression: Optional[DatasetCompressionSpec] = None, + **kwargs: Any + ) -> Tuple[TabularDataset, TabularInputValidator]: + """ + Returns an object of `TabularDataset` and an object of + `TabularInputValidator` according to the current task. 
+ + Args: + X_train (Union[List, pd.DataFrame, np.ndarray]): + Training feature set. + y_train (Union[List, pd.DataFrame, np.ndarray]): + Training target set. + X_test (Optional[Union[List, pd.DataFrame, np.ndarray]]): + Testing feature set + y_test (Optional[Union[List, pd.DataFrame, np.ndarray]]): + Testing target set + resampling_strategy (Optional[RESAMPLING_STRATEGIES]): + Strategy to split the training data. if None, uses + HoldoutValTypes.holdout_validation. + resampling_strategy_args (Optional[Dict[str, Any]]): + arguments required for the chosen resampling strategy. If None, uses + the default values provided in DEFAULT_RESAMPLING_PARAMETERS + in ```datasets/resampling_strategy.py```. + dataset_name (Optional[str]): + name of the dataset, used as experiment name. + dataset_compression (Optional[DatasetCompressionSpec]): + specifications for dataset compression. For more info check + documentation for `BaseTask.get_dataset`. + kwargs (Any): + Currently for tabular tasks, expect `feat_types: (Optional[List[str]]` which + specifies whether a feature is 'numerical' or 'categorical'. + Returns: + TabularDataset: + the dataset object. + TabularInputValidator: + the input validator fitted on the data. + """ + + resampling_strategy = resampling_strategy if resampling_strategy is not None else self.resampling_strategy + resampling_strategy_args = resampling_strategy_args if resampling_strategy_args is not None else \ + self.resampling_strategy_args + + feat_types = kwargs.pop('feat_types', None) + # Create a validator object to make sure that the data provided by + # the user matches the autopytorch requirements + input_validator = TabularInputValidator( + is_classification=False, + logger_port=self._logger_port, + dataset_compression=dataset_compression, + feat_types=feat_types + ) + + # Fit a input validator to check the provided data + # Also, an encoder is fit to both train and test data, + # to prevent unseen categories during inference + input_validator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test) + + dataset = TabularDataset( + X=X_train, Y=y_train, + X_test=X_test, Y_test=y_test, + validator=input_validator, + resampling_strategy=resampling_strategy, + resampling_strategy_args=resampling_strategy_args, + dataset_name=dataset_name + ) + + return dataset, input_validator def search( self, @@ -128,20 +244,22 @@ def search( X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, dataset_name: Optional[str] = None, + feat_types: Optional[List[str]] = None, budget_type: str = 'epochs', min_budget: int = 5, max_budget: int = 50, total_walltime_limit: int = 100, func_eval_time_limit_secs: Optional[int] = None, enable_traditional_pipeline: bool = True, - memory_limit: Optional[int] = 4096, + memory_limit: int = 4096, smac_scenario_args: Optional[Dict[str, Any]] = None, get_smac_object_callback: Optional[Callable] = None, all_supported_metrics: bool = True, precision: int = 32, - disable_file_output: List = [], + disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, load_models: bool = True, portfolio_selection: Optional[str] = None, + dataset_compression: Union[Mapping[str, Any], bool] = False, ) -> 'BaseTask': """ Search for the best pipeline configuration for the given dataset. @@ -155,8 +273,12 @@ def search( A pair of features (X_train) and targets (y_train) used to fit a pipeline. 
Additionally, a holdout of this pairs (X_test, y_test) can be provided to track the generalization performance of each stage. - optimize_metric (str): name of the metric that is used to - evaluate a pipeline. + feat_types (Optional[List[str]]): + Description about the feature types of the columns. + Accepts `numerical` for integers, float data and `categorical` + for categories, strings and bool. Defaults to None. + optimize_metric (str): + Name of the metric that is used to evaluate a pipeline. budget_type (str): Type of budget to be used when fitting the pipeline. It can be one of: @@ -210,7 +332,7 @@ def search( feature by turning this flag to False. All machine learning algorithms that are fitted during search() are considered for ensemble building. - memory_limit (Optional[int]: default=4096): + memory_limit (int: default=4096): Memory limit in MB for the machine learning algorithm. Autopytorch will stop fitting the machine learning algorithm if it tries to allocate more than memory_limit MB. If None @@ -238,10 +360,10 @@ def search( precision (int: default=32): Numeric precision used when loading ensemble data. Can be either '16', '32' or '64'. - disable_file_output (Union[bool, List]): - If True, disable model and prediction output. - Can also be used as a list to pass more fine-grained - information on what to save. Allowed elements in the list are: + disable_file_output (Optional[List[Union[str, DisableFileOutputParameters]]]): + Used as a list to pass more fine-grained + information on what to save. Must be a member of `DisableFileOutputParameters`. + Allowed elements in the list are: + `y_optimization`: do not save the predictions for the optimization set, @@ -254,6 +376,9 @@ def search( pipelines fit on each fold. + `y_test`: do not save the predictions for the test set. + + `all`: + do not save any of the above. + For more information check `autoPyTorch.evaluation.utils.DisableFileOutputParameters`. load_models (bool: default=True): Whether to load the models after fitting AutoPyTorch. portfolio_selection (Optional[str]): @@ -265,37 +390,63 @@ def search( Additionally, the keyword 'greedy' is supported, which would use the default portfolio from `AutoPyTorch Tabular `_. + dataset_compression: Union[bool, Mapping[str, Any]] = True + We compress datasets so that they fit into some predefined amount of memory. + **NOTE** + + Default configuration when left as ``True``: + .. code-block:: python + { + "memory_allocation": 0.1, + "methods": ["precision"] + } + You can also pass your own configuration with the same keys and choosing + from the available ``"methods"``. + The available options are described here: + **memory_allocation** + By default, we attempt to fit the dataset into ``0.1 * memory_limit``. This + float value can be set with ``"memory_allocation": 0.1``. We also allow for + specifying absolute memory in MB, e.g. 10MB is ``"memory_allocation": 10``. + The memory used by the dataset is checked after each reduction method is + performed. If the dataset fits into the allocated memory, any further methods + listed in ``"methods"`` will not be performed. + + **methods** + We currently provide the following methods for reducing the dataset size. + These can be provided in a list and are performed in the order as given. 
+ * ``"precision"`` - + We reduce floating point precision as follows: + * ``np.float128 -> np.float64`` + * ``np.float96 -> np.float64`` + * ``np.float64 -> np.float32`` + * pandas dataframes are reduced using the downcast option of `pd.to_numeric` + to the lowest possible precision. + * ``subsample`` - + We subsample data such that it **fits directly into + the memory allocation** ``memory_allocation * memory_limit``. + Therefore, this should likely be the last method listed in + ``"methods"``. + Subsampling takes into account classification labels and stratifies + accordingly. We guarantee that at least one occurrence of each + label is included in the sampled set. Returns: self """ - if dataset_name is None: - dataset_name = str(uuid.uuid1(clock_seq=os.getpid())) - # we have to create a logger for at this point for the validator - self._logger = self._get_logger(dataset_name) + self._dataset_compression = get_dataset_compression_mapping(memory_limit, dataset_compression) - # Create a validator object to make sure that the data provided by - # the user matches the autopytorch requirements - self.InputValidator = TabularInputValidator( - is_classification=False, - logger_port=self._logger_port, - ) - - # Fit a input validator to check the provided data - # Also, an encoder is fit to both train and test data, - # to prevent unseen categories during inference - self.InputValidator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test) - - self.dataset = TabularDataset( - X=X_train, Y=y_train, - X_test=X_test, Y_test=y_test, - validator=self.InputValidator, - dataset_name=dataset_name, + self.dataset, self.input_validator = self._get_dataset_input_validator( + X_train=X_train, + y_train=y_train, + X_test=X_test, + y_test=y_test, resampling_strategy=self.resampling_strategy, resampling_strategy_args=self.resampling_strategy_args, - ) + dataset_name=dataset_name, + dataset_compression=self._dataset_compression, + feat_types=feat_types) return self._search( dataset=self.dataset, @@ -322,14 +473,14 @@ def predict( batch_size: Optional[int] = None, n_jobs: int = 1 ) -> np.ndarray: - if self.InputValidator is None or not self.InputValidator._is_fitted: + if self.input_validator is None or not self.input_validator._is_fitted: raise ValueError("predict() is only supported after calling search. 
Kindly call first " - "the estimator fit() method.") + "the estimator search() method.") - X_test = self.InputValidator.feature_validator.transform(X_test) + X_test = self.input_validator.feature_validator.transform(X_test) predicted_values = super().predict(X_test, batch_size=batch_size, n_jobs=n_jobs) # Allow to predict in the original domain -- that is, the user is not interested # in our encoded values - return self.InputValidator.target_validator.inverse_transform(predicted_values) + return self.input_validator.target_validator.inverse_transform(predicted_values) diff --git a/autoPyTorch/api/time_series_forecasting.py b/autoPyTorch/api/time_series_forecasting.py new file mode 100644 index 000000000..27b923576 --- /dev/null +++ b/autoPyTorch/api/time_series_forecasting.py @@ -0,0 +1,592 @@ +from typing import Any, Callable, Dict, List, Mapping, Optional, Tuple, Union + +import numpy as np + +import pandas as pd + +from autoPyTorch.api.base_task import BaseTask +from autoPyTorch.automl_common.common.utils.backend import Backend +from autoPyTorch.constants import MAX_WINDOW_SIZE_BASE, TASK_TYPES_TO_STRING, TIMESERIES_FORECASTING +from autoPyTorch.data.time_series_forecasting_validator import TimeSeriesForecastingInputValidator +from autoPyTorch.data.utils import ( + DatasetCompressionSpec, + get_dataset_compression_mapping +) +from autoPyTorch.datasets.base_dataset import ( + BaseDataset, + BaseDatasetPropertiesType +) +from autoPyTorch.datasets.resampling_strategy import ( + HoldoutValTypes, + ResamplingStrategies +) +from autoPyTorch.datasets.time_series_dataset import TimeSeriesForecastingDataset, TimeSeriesSequence +from autoPyTorch.pipeline.time_series_forecasting import TimeSeriesForecastingPipeline +from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates + + +class TimeSeriesForecastingTask(BaseTask): + """ + Time Series Forecasting API to the pipelines. + + Args: + seed (int): + seed to be used for reproducibility. + n_jobs (int), (default=1): + number of consecutive processes to spawn. + logging_config (Optional[Dict]): + specifies configuration for logging, if None, it is loaded from the logging.yaml + ensemble_size (int), (default=50): + Number of models added to the ensemble built by Ensemble selection from libraries of models. + Models are drawn with replacement. + ensemble_nbest (int), (default=50): + only consider the ensemble_nbest models to build the ensemble + max_models_on_disc (int), (default=50): + maximum number of models saved to disc. Also, controls the size of the ensemble as any additional models + will be deleted. Must be greater than or equal to 1. + temporary_directory (str): + folder to store configuration output and log file + output_directory (str): + folder to store predictions for optional test set + delete_tmp_folder_after_terminate (bool): + determines whether to delete the temporary directory, when finished + include_components (Optional[Dict]): + If None, all possible components are used. Otherwise specifies set of components to use. + exclude_components (Optional[Dict]): + If None, all possible components are used. Otherwise specifies set of components not to use. 
+ Incompatible with include components + """ + + def __init__( + self, + seed: int = 1, + n_jobs: int = 1, + logging_config: Optional[Dict] = None, + ensemble_size: int = 50, + ensemble_nbest: int = 50, + max_models_on_disc: int = 50, + temporary_directory: Optional[str] = None, + output_directory: Optional[str] = None, + delete_tmp_folder_after_terminate: bool = True, + delete_output_folder_after_terminate: bool = True, + include_components: Optional[Dict] = None, + exclude_components: Optional[Dict] = None, + resampling_strategy: ResamplingStrategies = HoldoutValTypes.time_series_hold_out_validation, + resampling_strategy_args: Optional[Dict[str, Any]] = None, + backend: Optional[Backend] = None, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, + ): + super().__init__( + seed=seed, + n_jobs=n_jobs, + logging_config=logging_config, + ensemble_size=ensemble_size, + ensemble_nbest=ensemble_nbest, + max_models_on_disc=max_models_on_disc, + temporary_directory=temporary_directory, + output_directory=output_directory, + delete_tmp_folder_after_terminate=delete_tmp_folder_after_terminate, + delete_output_folder_after_terminate=delete_output_folder_after_terminate, + include_components=include_components, + exclude_components=exclude_components, + backend=backend, + resampling_strategy=resampling_strategy, + resampling_strategy_args=resampling_strategy_args, + search_space_updates=search_space_updates, + task_type=TASK_TYPES_TO_STRING[TIMESERIES_FORECASTING], + ) + + self.customized_window_size = False + if self.search_space_updates is not None: + for update in self.search_space_updates.updates: + # user has already specified a window_size range + if ( + update.node_name == "data_loader" + and update.hyperparameter == "window_size" + ): + self.customized_window_size = True + + def _get_required_dataset_properties(self, dataset: BaseDataset) -> Dict[str, Any]: + if not isinstance(dataset, TimeSeriesForecastingDataset): + raise ValueError( + "Dataset is incompatible for the given task,: {}".format(type(dataset)) + ) + return dataset.get_required_dataset_info() + + def build_pipeline( + self, + dataset_properties: Dict[str, BaseDatasetPropertiesType], + include_components: Optional[Dict[str, Any]] = None, + exclude_components: Optional[Dict[str, Any]] = None, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, + ) -> TimeSeriesForecastingPipeline: + """ + Build pipeline according to current task + and for the passed dataset properties + + Args: + dataset_properties (Dict[str, Any]): + Characteristics of the dataset to guide the pipeline + choices of components + include_components (Optional[Dict[str, Any]]): + Dictionary containing components to include. Key is the node + name and Value is an Iterable of the names of the components + to include. Only these components will be present in the + search space. + exclude_components (Optional[Dict[str, Any]]): + Dictionary containing components to exclude. Key is the node + name and Value is an Iterable of the names of the components + to exclude. All except these components will be present in + the search space. 
+ search_space_updates (Optional[HyperparameterSearchSpaceUpdates]): + Search space updates that can be used to modify the search + space of particular components or choice modules of the pipeline + + Returns: + TimeSeriesForecastingPipeline: + + """ + return TimeSeriesForecastingPipeline( + dataset_properties=dataset_properties, + include=include_components, + exclude=exclude_components, + search_space_updates=search_space_updates, + ) + + def _get_dataset_input_validator( + self, + X_train: Union[List, pd.DataFrame, np.ndarray], + y_train: Union[List, pd.DataFrame, np.ndarray], + X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + resampling_strategy: Optional[ResamplingStrategies] = None, + resampling_strategy_args: Optional[Dict[str, Any]] = None, + dataset_name: Optional[str] = None, + dataset_compression: Optional[DatasetCompressionSpec] = None, + freq: Optional[Union[str, int, List[int]]] = None, + start_times: Optional[List[pd.DatetimeIndex]] = None, + series_idx: Optional[Union[List[Union[str, int]], str, int]] = None, + n_prediction_steps: int = 1, + known_future_features: Union[Tuple[Union[int, str]], Tuple[()]] = (), + **forecasting_dataset_kwargs: Any, + ) -> Tuple[TimeSeriesForecastingDataset, TimeSeriesForecastingInputValidator]: + """ + Returns an object of `TimeSeriesForecastingDataset` and an object of + `TimeSeriesForecastingInputValidator` according to the current task. + + Args: + X_train (Union[List, pd.DataFrame, np.ndarray]): + Training feature set. + y_train (Union[List, pd.DataFrame, np.ndarray]): + Training target set. + X_test (Optional[Union[List, pd.DataFrame, np.ndarray]]): + Testing feature set + y_test (Optional[Union[List, pd.DataFrame, np.ndarray]]): + Testing target set + resampling_strategy (Optional[RESAMPLING_STRATEGIES]): + Strategy to split the training data. if None, uses + HoldoutValTypes.holdout_validation. + resampling_strategy_args (Optional[Dict[str, Any]]): + arguments required for the chosen resampling strategy. If None, uses + the default values provided in DEFAULT_RESAMPLING_PARAMETERS + in ```datasets/resampling_strategy.py```. + dataset_name (Optional[str]): + name of the dataset, used as experiment name. + dataset_compression (Optional[DatasetCompressionSpec]): + specifications for dataset compression. For more info check + documentation for `BaseTask.get_dataset`. + freq (Optional[Union[str, int, List[int]]]): + frequency information, it determines the configuration space of the window size, if it is not given, + we will use the default configuration + start_times (Optional[List[pd.DatetimeIndex]]): + starting time of each series when they are sampled. If it is not given, we simply start with a fixed + timestamp. + series_idx (Optional[Union[List[Union[str, int]], str, int]]): + (only works if X is stored as pd.DataFrame). This value is applied to identify to which series the data + belongs if the data is presented as a "chunk" dataframe + n_prediction_steps (int): + The number of steps you want to forecast into the future (forecast horizon) + known_future_features (Optional[Union[Tuple[Union[str, int]], Tuple[()]]]): + future features that are known in advance. For instance, holidays. + forecasting_kwargs (Any) + kwargs for forecasting dataset, for more details, please check + ```datasets/time_series_dataset.py``` + Returns: + TimeSeriesForecastingDataset: + the dataset object. 
+ TimeSeriesForecastingInputValidator: + the input validator fitted on the data. + """ + + resampling_strategy = ( + resampling_strategy + if resampling_strategy is not None + else self.resampling_strategy + ) + resampling_strategy_args = ( + resampling_strategy_args + if resampling_strategy_args is not None + else self.resampling_strategy_args + ) + + # Create a validator object to make sure that the data provided by + # the user matches the autopytorch requirements + input_validator = TimeSeriesForecastingInputValidator( + is_classification=False, + logger_port=self._logger_port, + dataset_compression=dataset_compression, + ) + + # Fit an input validator to check the provided data + # Also, an encoder is fit to both train and test data, + # to prevent unseen categories during inference + input_validator.fit( + X_train=X_train, + y_train=y_train, + start_times=start_times, + series_idx=series_idx, + X_test=X_test, + y_test=y_test, + ) + + dataset = TimeSeriesForecastingDataset( + X=X_train, + Y=y_train, + X_test=X_test, + Y_test=y_test, + dataset_name=dataset_name, + freq=freq, + start_times=start_times, + series_idx=series_idx, + validator=input_validator, + resampling_strategy=resampling_strategy, + resampling_strategy_args=resampling_strategy_args, + n_prediction_steps=n_prediction_steps, + known_future_features=known_future_features, + **forecasting_dataset_kwargs, + ) + + return dataset, input_validator + + def search( + self, + optimize_metric: str, + X_train: Optional[Union[List, pd.DataFrame]] = None, + y_train: Optional[Union[List, pd.DataFrame]] = None, + X_test: Optional[Union[List, pd.DataFrame]] = None, + y_test: Optional[Union[List, pd.DataFrame]] = None, + n_prediction_steps: int = 1, + freq: Optional[Union[str, int, List[int]]] = None, + start_times: Optional[List[pd.DatetimeIndex]] = None, + series_idx: Optional[Union[List[Union[str, int]], str, int]] = None, + dataset_name: Optional[str] = None, + budget_type: str = "epochs", + min_budget: Union[int, float] = 5, + max_budget: Union[int, float] = 50, + total_walltime_limit: int = 100, + func_eval_time_limit_secs: Optional[int] = None, + enable_traditional_pipeline: bool = False, + memory_limit: Optional[int] = 4096, + smac_scenario_args: Optional[Dict[str, Any]] = None, + get_smac_object_callback: Optional[Callable] = None, + all_supported_metrics: bool = True, + precision: int = 32, + disable_file_output: List = [], + load_models: bool = True, + portfolio_selection: Optional[str] = None, + suggested_init_models: Optional[List[str]] = None, + custom_init_setting_path: Optional[str] = None, + min_num_test_instances: Optional[int] = None, + dataset_compression: Union[Mapping[str, Any], bool] = False, + **forecasting_dataset_kwargs: Any, + ) -> "BaseTask": + """ + Search for the best pipeline configuration for the given dataset. + + Fit both optimizes the machine learning models and builds an ensemble out of them. + To disable ensembling, set ensemble_size==0. + using the optimizer. + + Args: + optimize_metric (str): + name of the metric that is used to evaluate a pipeline. + X_train: Optional[Union[List, pd.DataFrame]] + A pair of features (X_train) and targets (y_train) used to fit a + pipeline. Additionally, a holdout of this pairs (X_test, y_test) can + be provided to track the generalization performance of each stage. 
+ y_train: Union[List, pd.DataFrame] + training target, must be given + X_test: Optional[Union[List, pd.DataFrame]] + Test Features, Test series need to end at one step before forecasting + y_test: Optional[Union[List, pd.DataFrame]] + Test Targets + n_prediction_steps: int + How many steps in advance we need to predict + freq: Optional[Union[str, int, List[int]]] + frequency information, it determines the configuration space of the window size, if it is not given, + we will use the default configuration + start_times: : List[pd.DatetimeIndex] + A list indicating the start time of each series in the training sets + series_idx: Optional[Union[List[Union[str, int]], str, int]] + variable in X indicating series indices + dataset_name: Optional[str], + dataset name + budget_type (str): + Type of budget to be used when fitting the pipeline. + It can be one of: + + + `epochs`: The training of each pipeline will be terminated after + a number of epochs have passed. This number of epochs is determined by the + budget argument of this method. + + `runtime`: The training of each pipeline will be terminated after + a number of seconds have passed. This number of seconds is determined by the + budget argument of this method. The overall fitting time of a pipeline is + controlled by func_eval_time_limit_secs. 'runtime' only controls the allocated + time to train a pipeline, but it does not consider the overall time it takes + to create a pipeline (data loading and preprocessing, other i/o operations, etc.). + budget_type will determine the units of min_budget/max_budget. If budget_type=='epochs' + is used, min_budget will refer to epochs whereas if budget_type=='runtime' then + min_budget will refer to seconds. + + 'resolution': The sample resolution of time series, for instance, if a time series sequence is + [0, 1, 2, 3, 4] with resolution 0.5, the sequence fed to the network is [0, 2, 4] + min_budget Union[int, float]: + Auto-PyTorch uses `Hyperband `_ to + trade-off resources between running many pipelines at min_budget and + running the top performing pipelines on max_budget. + min_budget states the minimum resource allocation a pipeline should have + so that we can compare and quickly discard bad performing models. + For example, if the budget_type is epochs, and min_budget=5, then we will + run every pipeline to a minimum of 5 epochs before performance comparison. + max_budget Union[int, float]: + Auto-PyTorch uses `Hyperband `_ to + trade-off resources between running many pipelines at min_budget and + running the top performing pipelines on max_budget. + max_budget states the maximum resource allocation a pipeline is going to + be ran. For example, if the budget_type is epochs, and max_budget=50, + then the pipeline training will be terminated after 50 epochs. + + total_walltime_limit (int), (default=100): Time limit + in seconds for the search of appropriate models. + By increasing this value, autopytorch has a higher + chance of finding better models. + func_eval_time_limit (int), (default=60): Time limit + for a single call to the machine learning model. + Model fitting will be terminated if the machine + learning algorithm runs over the time limit. Set + this value high enough so that typical machine + learning algorithms can be fit on the training + data. + traditional_per_total_budget (float), (default=0.1): + Percent of total walltime to be allocated for + running traditional classifiers. 
+ memory_limit (Optional[int]), (default=4096): Memory + limit in MB for the machine learning algorithm. autopytorch + will stop fitting the machine learning algorithm if it tries + to allocate more than memory_limit MB. If None is provided, + no memory limit is set. In case of multi-processing, memory_limit + will be per job. This memory limit also applies to the ensemble + creation process. + smac_scenario_args (Optional[Dict]): Additional arguments inserted + into the scenario of SMAC. See the + [SMAC documentation] (https://automl.github.io/SMAC3/master/options.html?highlight=scenario#scenario) + get_smac_object_callback (Optional[Callable]): Callback function + to create an object of class + [smac.optimizer.smbo.SMBO](https://automl.github.io/SMAC3/master/apidoc/smac.optimizer.smbo.html). + The function must accept the arguments scenario_dict, + instances, num_params, runhistory, seed and ta. This is + an advanced feature. Use only if you are familiar with + [SMAC](https://automl.github.io/SMAC3/master/index.html). + all_supported_metrics (bool), (default=True): if True, all + metrics supporting current task will be calculated + for each pipeline and results will be available via cv_results + precision (int), (default=32): Numeric precision used when loading + ensemble data. Can be either '16', '32' or '64'. + disable_file_output (Optional[List[Union[str, DisableFileOutputParameters]]]): + Used as a list to pass more fine-grained + information on what to save. Must be a member of `DisableFileOutputParameters`. + Allowed elements in the list are: + + + `y_optimization`: + do not save the predictions for the optimization set, + which would later on be used to build an ensemble. Note that SMAC + optimizes a metric evaluated on the optimization set. + + `pipeline`: + do not save any individual pipeline files + + `pipelines`: + In case of cross validation, disables saving the joint model of the + pipelines fit on each fold. + + `y_test`: + do not save the predictions for the test set. + + `all`: + do not save any of the above. + For more information check `autoPyTorch.evaluation.utils.DisableFileOutputParameters`. + load_models (bool), (default=True): Whether to load the + models after fitting AutoPyTorch. + suggested_init_models: Optional[List[str]] + suggested initial models with their default configurations setting + custom_init_setting_path: Optional[str] + path to a json file that contains the initial configuration suggested by the users + min_num_test_instances: Optional[int] + if it is set None, then full validation sets will be evaluated in each fidelity. 
Otherwise, the number + of instances in the test sets should be a value that is at least as great as this value, otherwise, the + number of test instance is proportional to its fidelity + forecasting_dataset_kwargs: Dict[Any] + Forecasting dataset kwargs used to initialize forecasting dataset + Returns: + self + + """ + if memory_limit is not None: + self._dataset_compression = get_dataset_compression_mapping( + memory_limit, dataset_compression + ) + else: + self._dataset_compression = None + + self.dataset, self.input_validator = self._get_dataset_input_validator( + X_train=X_train, + y_train=y_train, + X_test=X_test, + y_test=y_test, + resampling_strategy=self.resampling_strategy, + resampling_strategy_args=self.resampling_strategy_args, + dataset_name=dataset_name, + dataset_compression=self._dataset_compression, + freq=freq, + start_times=start_times, + series_idx=series_idx, + n_prediction_steps=n_prediction_steps, + **forecasting_dataset_kwargs, + ) + + if not self.customized_window_size: + self.update_sliding_window_size(n_prediction_steps=n_prediction_steps) + + self._metrics_kwargs = { + "sp": self.dataset.seasonality, + "n_prediction_steps": n_prediction_steps, + } + + forecasting_kwargs = dict( + suggested_init_models=suggested_init_models, + custom_init_setting_path=custom_init_setting_path, + min_num_test_instances=min_num_test_instances, + ) + + return self._search( + dataset=self.dataset, + optimize_metric=optimize_metric, + budget_type=budget_type, + min_budget=min_budget, + max_budget=max_budget, + total_walltime_limit=total_walltime_limit, + func_eval_time_limit_secs=func_eval_time_limit_secs, + enable_traditional_pipeline=enable_traditional_pipeline, + memory_limit=memory_limit, + smac_scenario_args=smac_scenario_args, + get_smac_object_callback=get_smac_object_callback, + all_supported_metrics=all_supported_metrics, + precision=precision, + disable_file_output=disable_file_output, + load_models=load_models, + portfolio_selection=portfolio_selection, + **forecasting_kwargs, # type: ignore[arg-type] + ) + + def predict( + self, + X_test: List[Union[np.ndarray, pd.DataFrame, TimeSeriesSequence]] = None, + batch_size: Optional[int] = None, + n_jobs: int = 1, + past_targets: Optional[List[np.ndarray]] = None, + future_targets: Optional[List[Union[np.ndarray, pd.DataFrame, TimeSeriesSequence]]] = None, + start_times: List[pd.DatetimeIndex] = [], + ) -> np.ndarray: + """ + Predict the future varaibles + + Args: + X_test (List[Union[np.ndarray, pd.DataFrame, TimeSeriesSequence]]) + if it is a list of TimeSeriesSequence, then it is the series to be forecasted. Otherwise, it is the + known future features + batch_size: Optional[int] + batch size + n_jobs (int): + number of jobs + past_targets (Optional[List[np.ndarray]]) + past observed targets, required when X_test is not a list of TimeSeriesSequence + future_targets (Optional[List[Union[np.ndarray, pd.DataFrame, TimeSeriesSequence]]]): + future targets (test sets) + start_times (List[pd.DatetimeIndex]): + starting time of each series when they are sampled. If it is not given, we simply start with a fixed + timestamp. 
+ + Return: + np.ndarray + predicted value, it needs to be with shape (B, H, N), + B is the number of series, H is forecasting horizon (n_prediction_steps), N is the number of targets + """ + if X_test is None or not isinstance(X_test[0], TimeSeriesSequence): + assert past_targets is not None + # Validate and construct TimeSeriesSequence + X_test, _, _, _ = self.dataset.transform_data_into_time_series_sequence( + X=X_test, + Y=past_targets, + X_test=future_targets, + start_times=start_times, + is_test_set=True, + ) + flattened_res = super(TimeSeriesForecastingTask, self).predict( + X_test, batch_size, n_jobs + ) + # forecasting result from each series is stored as an array + if self.dataset.num_targets == 1: + forecasting = flattened_res.reshape([-1, self.dataset.n_prediction_steps]) + else: + forecasting = flattened_res.reshape( + [-1, self.dataset.n_prediction_steps, self.dataset.num_target] + ) + if self.dataset.normalize_y: + mean = np.repeat( + self.dataset.y_mean.values(), self.dataset.n_prediction_steps + ) + std = np.repeat( + self.dataset.y_std.values(), self.dataset.n_prediction_steps + ) + return forecasting * std + mean + return forecasting + + def update_sliding_window_size(self, n_prediction_steps: int) -> None: + """ + the size of the sliding window is heavily dependent on the dataset, + so we only update them when we get the information from the + + Args: + n_prediction_steps (int): + forecast horizon. Sometimes we could also make our base sliding window size based on the + forecast horizon + """ + base_window_size = int(np.ceil(self.dataset.base_window_size)) + # we don't want base window size to large, which might cause a too long computation time, in which case + # we will use n_prediction_step instead (which is normally smaller than base_window_size) + if base_window_size > MAX_WINDOW_SIZE_BASE: + # TODO considering padding to allow larger upper_window_size !!! 
+ if n_prediction_steps > MAX_WINDOW_SIZE_BASE: + base_window_size = 50 + else: + base_window_size = n_prediction_steps + + if self.search_space_updates is None: + self.search_space_updates = HyperparameterSearchSpaceUpdates() + + window_size_scales = [1, 3] + + self.search_space_updates.append( + node_name="data_loader", + hyperparameter="window_size", + value_range=[ + int(window_size_scales[0] * base_window_size), + int(window_size_scales[1] * base_window_size), + ], + default_value=int(np.ceil(1.25 * base_window_size)), + ) diff --git a/autoPyTorch/configs/forecasting_init_cfgs.json b/autoPyTorch/configs/forecasting_init_cfgs.json new file mode 100644 index 000000000..526fb3cfe --- /dev/null +++ b/autoPyTorch/configs/forecasting_init_cfgs.json @@ -0,0 +1,263 @@ +{ + "trainer": { + "data_loader:batch_size": 32, + "data_loader:backcast": false, + "data_loader:sample_strategy": "SeqUniform", + "data_loader:num_batches_per_epoch": 50, + "data_loader:transform_time_features": false, + "lr_scheduler:__choice__": "ReduceLROnPlateau", + "lr_scheduler:ReduceLROnPlateau:mode": "max", + "lr_scheduler:ReduceLROnPlateau:factor": 0.5, + "lr_scheduler:ReduceLROnPlateau:patience": 10, + "optimizer:__choice__": "AdamOptimizer", + "optimizer:AdamOptimizer:lr": 0.001, + "optimizer:AdamOptimizer:weight_decay": 1e-08, + "optimizer:AdamOptimizer:beta1": 0.9, + "optimizer:AdamOptimizer:beta2": 0.999, + "network_init:__choice__": "XavierInit", + "network_init:XavierInit:bias_strategy": "Normal", + "target_scaler:scaling_mode": "mean_abs", + "trainer:__choice__": "ForecastingStandardTrainer", + "network_embedding:__choice__": "NoEmbedding" + }, + "feature_preprocessing": { + "feature_encoding:__choice__": "OneHotEncoder", + "scaler:scaling_mode": "standard", + "network_embedding:__choice__": "NoEmbedding" + }, + "feature_imputer": { + "imputer:numerical_strategy": "ffill" + }, + "target_imputer": { + "target_imputer:numerical_strategy": "ffill" + }, + "models": { + "MLP": { + "loss:__choice__": "DistributionLoss", + "loss:DistributionLoss:dist_cls": "studentT", + "loss:DistributionLoss:forecast_strategy": "sample", + "loss:DistributionLoss:aggregation": "median", + "loss:DistributionLoss:num_samples": 100, + "network_backbone:__choice__": "flat_encoder", + "network_backbone:flat_encoder:__choice__": "MLPEncoder", + "network_backbone:flat_encoder:MLPEncoder:num_groups": 1, + "network_backbone:flat_encoder:MLPEncoder:num_units_1": 40, + "network_backbone:flat_encoder:MLPEncoder:activation": "relu", + "network_backbone:flat_encoder:MLPEncoder:use_dropout": false, + "network_backbone:flat_encoder:MLPEncoder:normalization": "NoNorm", + "network_backbone:flat_encoder:MLPDecoder:num_layers": 0, + "network_backbone:flat_encoder:MLPDecoder:has_local_layer": true, + "network_backbone:flat_encoder:MLPDecoder:units_local_layer": 40 + }, + "DeepAR": { + "loss:__choice__": "DistributionLoss", + "loss:DistributionLoss:dist_cls": "studentT", + "loss:DistributionLoss:forecast_strategy": "sample", + "loss:DistributionLoss:aggregation": "median", + "loss:DistributionLoss:num_samples": 100, + "network_backbone:__choice__": "seq_encoder", + "network_backbone:seq_encoder:skip_connection": false, + "network_backbone:seq_encoder:num_blocks": 1, + "network_backbone:seq_encoder:use_temporal_fusion": false, + "network_backbone:seq_encoder:variable_selection": false, + "network_backbone:seq_encoder:block_1:__choice__": "RNNEncoder", + "network_backbone:seq_encoder:decoder_auto_regressive": false, + 
"network_backbone:seq_encoder:block_1:RNNEncoder:cell_type": "lstm", + "network_backbone:seq_encoder:block_1:RNNEncoder:num_layers": 2, + "network_backbone:seq_encoder:block_1:RNNEncoder:hidden_size": 40, + "network_backbone:seq_encoder:block_1:RNNEncoder:bidirectional": false, + "network_backbone:seq_encoder:block_1:RNNEncoder:use_dropout": true, + "network_backbone:seq_encoder:block_1:RNNEncoder:dropout": 0.1, + "network_backbone:seq_encoder:block_1:RNNEncoder:decoder_type": "MLPDecoder", + "network_backbone:seq_encoder:block_1:MLPDecoder:num_layers": 0, + "network_backbone:seq_encoder:block_1:MLPDecoder:auto_regressive": true + }, + "Seq2Seq-RNN2MLP": { + "loss:__choice__": "DistributionLoss", + "loss:DistributionLoss:dist_cls": "studentT", + "loss:DistributionLoss:forecast_strategy": "sample", + "loss:DistributionLoss:aggregation": "median", + "loss:DistributionLoss:num_samples": 100, + "network_backbone:__choice__": "seq_encoder", + "network_backbone:seq_encoder:skip_connection": false, + "network_backbone:seq_encoder:num_blocks": 1, + "network_backbone:seq_encoder:use_temporal_fusion": false, + "network_backbone:seq_encoder:variable_selection": false, + "network_backbone:seq_encoder:block_1:__choice__": "RNNEncoder", + "network_backbone:seq_encoder:decoder_auto_regressive": false, + "network_backbone:seq_encoder:block_1:RNNEncoder:cell_type": "gru", + "network_backbone:seq_encoder:block_1:RNNEncoder:num_layers": 1, + "network_backbone:seq_encoder:block_1:RNNEncoder:hidden_size": 50, + "network_backbone:seq_encoder:block_1:RNNEncoder:bidirectional": false, + "network_backbone:seq_encoder:block_1:RNNEncoder:use_dropout": false, + "network_backbone:seq_encoder:block_1:RNNEncoder:decoder_type": "MLPDecoder", + "network_backbone:seq_encoder:block_1:MLPDecoder:num_layers": 0, + "network_backbone:seq_encoder:block_1:MLPDecoder:auto_regressive": false, + "network_backbone:seq_encoder:block_1:MLPDecoder:has_local_layer": true, + "network_backbone:seq_encoder:block_1:MLPDecoder:units_local_layer": 30 + }, + "Seq2Seq-TCN2MLP": { + "loss:__choice__": "DistributionLoss", + "loss:DistributionLoss:dist_cls": "studentT", + "loss:DistributionLoss:forecast_strategy": "sample", + "loss:DistributionLoss:aggregation": "median", + "loss:DistributionLoss:num_samples": 100, + "network_backbone:__choice__": "seq_encoder", + "network_backbone:seq_encoder:skip_connection": false, + "network_backbone:seq_encoder:num_blocks": 1, + "network_backbone:seq_encoder:use_temporal_fusion": false, + "network_backbone:seq_encoder:variable_selection": false, + "network_backbone:seq_encoder:block_1:__choice__": "TCNEncoder", + "network_backbone:seq_encoder:decoder_auto_regressive": false, + "network_backbone:seq_encoder:block_1:TCNEncoder:use_dropout": false, + "network_backbone:seq_encoder:block_1:TCNEncoder:num_blocks": 3, + "network_backbone:seq_encoder:block_1:TCNEncoder:num_filters_1": 30, + "network_backbone:seq_encoder:block_1:TCNEncoder:kernel_size_1": 7, + "network_backbone:seq_encoder:block_1:TCNEncoder:num_filters_2": 30, + "network_backbone:seq_encoder:block_1:TCNEncoder:kernel_size_2": 3, + "network_backbone:seq_encoder:block_1:TCNEncoder:num_filters_3": 30, + "network_backbone:seq_encoder:block_1:TCNEncoder:kernel_size_3": 3, + "network_backbone:seq_encoder:block_1:MLPDecoder:num_layers": 0, + "network_backbone:seq_encoder:block_1:MLPDecoder:auto_regressive": false, + "network_backbone:seq_encoder:block_1:MLPDecoder:has_local_layer": false + }, + "Seq2Seq-RNN2RNN": { + "loss:__choice__": "DistributionLoss", + 
"loss:DistributionLoss:dist_cls": "studentT", + "loss:DistributionLoss:forecast_strategy": "mean", + "network_backbone:__choice__": "seq_encoder", + "network_backbone:seq_encoder:skip_connection": false, + "network_backbone:seq_encoder:num_blocks": 1, + "network_backbone:seq_encoder:use_temporal_fusion": false, + "network_backbone:seq_encoder:variable_selection": false, + "network_backbone:seq_encoder:block_1:__choice__": "RNNEncoder", + "network_backbone:seq_encoder:decoder_auto_regressive": true, + "network_backbone:seq_encoder:block_1:RNNEncoder:cell_type": "gru", + "network_backbone:seq_encoder:block_1:RNNEncoder:num_layers": 3, + "network_backbone:seq_encoder:block_1:RNNEncoder:hidden_size": 32, + "network_backbone:seq_encoder:block_1:RNNEncoder:bidirectional": false, + "network_backbone:seq_encoder:block_1:RNNEncoder:use_dropout": false, + "network_backbone:seq_encoder:block_1:RNNEncoder:decoder_type": "RNNDecoder", + "network_backbone:seq_encoder:block_1:RNNDecoder:decoder_type": "RNNDecoder" + }, + "Seq2Seq-Transformer2Transformer": { + "loss:__choice__": "DistributionLoss", + "data_loader:transform_time_features": true, + "loss:DistributionLoss:dist_cls": "studentT", + "loss:DistributionLoss:forecast_strategy": "sample", + "loss:DistributionLoss:aggregation": "median", + "loss:DistributionLoss:num_samples": 100, + "network_backbone:__choice__": "seq_encoder", + "network_backbone:seq_encoder:skip_connection": false, + "network_backbone:seq_encoder:num_blocks": 1, + "network_backbone:seq_encoder:use_temporal_fusion": false, + "network_backbone:seq_encoder:variable_selection": false, + "network_backbone:seq_encoder:decoder_auto_regressive": true, + "network_backbone:seq_encoder:block_1:__choice__": "TransformerEncoder", + "network_backbone:seq_encoder:block_1:TransformerEncoder:norm_first": true, + "network_backbone:seq_encoder:block_1:TransformerEncoder:d_model_log": 5, + "network_backbone:seq_encoder:block_1:TransformerEncoder:activation": "gelu", + "network_backbone:seq_encoder:block_1:TransformerEncoder:num_layers": 1, + "network_backbone:seq_encoder:block_1:TransformerEncoder:decoder_type": "TransformerDecoder", + "network_backbone:seq_encoder:block_1:TransformerEncoder:use_dropout": true, + "network_backbone:seq_encoder:block_1:TransformerEncoder:use_positional_encoder": true, + "network_backbone:seq_encoder:block_1:TransformerEncoder:dropout_positional_encoder": 0.1, + "network_backbone:seq_encoder:block_1:TransformerEncoder:d_feed_forward_log": 7, + "network_backbone:seq_encoder:block_1:TransformerEncoder:n_head_log": 3, + "network_backbone:seq_encoder:block_1:TransformerEncoder:layer_norm_eps": 1e-05, + "network_backbone:seq_encoder:block_1:TransformerEncoder:dropout": 0.1, + "network_backbone:seq_encoder:block_1:TransformerEncoder:use_layer_norm_output": true, + "network_backbone:seq_encoder:block_1:TransformerEncoder:layer_norm_eps_output": 1e-05, + "network_backbone:seq_encoder:block_1:TransformerDecoder:norm_first": true, + "network_backbone:seq_encoder:block_1:TransformerDecoder:activation": "gelu", + "network_backbone:seq_encoder:block_1:TransformerDecoder:num_layers": 1, + "network_backbone:seq_encoder:block_1:TransformerDecoder:use_dropout": true, + "network_backbone:seq_encoder:block_1:TransformerDecoder:use_positional_decoder": true, + "network_backbone:seq_encoder:block_1:TransformerDecoder:dropout_positional_decoder": 0.1, + "network_backbone:seq_encoder:block_1:TransformerDecoder:d_feed_forward_log": 7, + 
"network_backbone:seq_encoder:block_1:TransformerDecoder:n_head_log": 3, + "network_backbone:seq_encoder:block_1:TransformerDecoder:layer_norm_eps": 1e-05, + "network_backbone:seq_encoder:block_1:TransformerDecoder:dropout": 0.1, + "network_backbone:seq_encoder:block_1:TransformerDecoder:use_layer_norm_output": true, + "network_backbone:seq_encoder:block_1:TransformerDecoder:layer_norm_eps_output": 1e-05 + }, + "NBEATS-I": { + "target_scaler:scaling_mode": "none", + "data_loader:backcast": true, + "data_loader:backcast_period": 2, + "loss:__choice__": "RegressionLoss", + "loss:RegressionLoss:loss_name": "mase", + "network_backbone:__choice__": "flat_encoder", + "network_backbone:flat_encoder:__choice__": "NBEATSEncoder", + "network_backbone:flat_encoder:NBEATSDecoder:backcast_loss_ration": 0.0, + "network_backbone:flat_encoder:NBEATSDecoder:normalization": "NoNorm", + "network_backbone:flat_encoder:NBEATSDecoder:activation": "relu", + "network_backbone:flat_encoder:NBEATSDecoder:n_beats_type": "I", + "network_backbone:flat_encoder:NBEATSDecoder:use_dropout_i": true, + "network_backbone:flat_encoder:NBEATSDecoder:num_stacks_i": 2, + "network_backbone:flat_encoder:NBEATSDecoder:num_blocks_i_1": 3, + "network_backbone:flat_encoder:NBEATSDecoder:num_layers_i_1": 2, + "network_backbone:flat_encoder:NBEATSDecoder:width_i_1": 256, + "network_backbone:flat_encoder:NBEATSDecoder:weight_sharing_i_1": true, + "network_backbone:flat_encoder:NBEATSDecoder:stack_type_i_1": "trend", + "network_backbone:flat_encoder:NBEATSDecoder:expansion_coefficient_length_i_trend_1": 3, + "network_backbone:flat_encoder:NBEATSDecoder:dropout_i_1": 0.1, + "network_backbone:flat_encoder:NBEATSDecoder:num_blocks_i_2": 3, + "network_backbone:flat_encoder:NBEATSDecoder:num_layers_i_2": 2, + "network_backbone:flat_encoder:NBEATSDecoder:width_i_2": 512, + "network_backbone:flat_encoder:NBEATSDecoder:weight_sharing_i_2": true, + "network_backbone:flat_encoder:NBEATSDecoder:stack_type_i_2": "seasonality", + "network_backbone:flat_encoder:NBEATSDecoder:expansion_coefficient_length_i_seasonality_2": 7, + "network_backbone:flat_encoder:NBEATSDecoder:dropout_i_2": 0.1 + }, + "NBEATS-G": { + "loss:__choice__": "RegressionLoss", + "loss:RegressionLoss:loss_name": "mape", + "network_backbone:__choice__": "flat_encoder", + "network_backbone:flat_encoder:__choice__": "NBEATSEncoder", + "network_backbone:flat_encoder:NBEATSDecoder:backcast_loss_ration": 0.0, + "network_backbone:flat_encoder:NBEATSDecoder:normalization": "NoNorm", + "network_backbone:flat_encoder:NBEATSDecoder:activation": "relu", + "network_backbone:flat_encoder:NBEATSDecoder:n_beats_type": "G", + "network_backbone:flat_encoder:NBEATSDecoder:use_dropout_g": true, + "network_backbone:flat_encoder:NBEATSDecoder:num_stacks_g": 30, + "network_backbone:flat_encoder:NBEATSDecoder:num_blocks_g": 1, + "network_backbone:flat_encoder:NBEATSDecoder:num_layers_g": 4, + "network_backbone:flat_encoder:NBEATSDecoder:width_g": 512, + "network_backbone:flat_encoder:NBEATSDecoder:weight_sharing_g": false, + "network_backbone:flat_encoder:NBEATSDecoder:expansion_coefficient_length_g": 32, + "network_backbone:flat_encoder:NBEATSDecoder:dropout_g": 0.1 + }, + "TemoporalFusionTransformer": { + "loss:__choice__": "QuantileLoss", + "target_scaler:scaling_mode": "standard", + "data_loader:transform_time_features": true, + "loss:QuantileLoss:lower_quantile": 0.1, + "loss:QuantileLoss:upper_quantile": 0.9, + "network_backbone:__choice__": "seq_encoder", + 
"network_backbone:seq_encoder:skip_connection": true, + "network_backbone:seq_encoder:num_blocks": 1, + "network_backbone:seq_encoder:variable_selection": true, + "network_backbone:seq_encoder:variable_selection_use_dropout": true, + "network_backbone:seq_encoder:variable_selection_dropout_rate": 0.1, + "network_backbone:seq_encoder:share_single_variable_networks": true, + "network_backbone:seq_encoder:skip_connection_type": "gate_add_norm", + "network_backbone:seq_encoder:grn_use_dropout": true, + "network_backbone:seq_encoder:grn_dropout_rate": 0.1, + "network_backbone:seq_encoder:block_1:__choice__": "RNNEncoder", + "network_backbone:seq_encoder:decoder_auto_regressive": false, + "network_backbone:seq_encoder:block_1:RNNEncoder:cell_type": "lstm", + "network_backbone:seq_encoder:block_1:RNNEncoder:num_layers": 1, + "network_backbone:seq_encoder:block_1:RNNEncoder:hidden_size": 32, + "network_backbone:seq_encoder:block_1:RNNEncoder:bidirectional": false, + "network_backbone:seq_encoder:block_1:RNNEncoder:use_dropout": false, + "network_backbone:seq_encoder:block_1:RNNEncoder:decoder_type": "RNNDecoder", + "network_backbone:seq_encoder:block_1:RNNDecoder:decoder_type": "RNNDecoder", + "network_backbone:seq_encoder:use_temporal_fusion": true, + "network_backbone:seq_encoder:temporal_fusion:attention_d_model_log": 5, + "network_backbone:seq_encoder:temporal_fusion:attention_n_head_log": 2, + "network_backbone:seq_encoder:temporal_fusion:use_dropout": true, + "network_backbone:seq_encoder:temporal_fusion:dropout_rate": 0.1 + } + } +} \ No newline at end of file diff --git a/autoPyTorch/configs/greedy_portfolio.json b/autoPyTorch/configs/greedy_portfolio.json index a8e640a4e..bdcb45401 100644 --- a/autoPyTorch/configs/greedy_portfolio.json +++ b/autoPyTorch/configs/greedy_portfolio.json @@ -1,7 +1,7 @@ [{"data_loader:batch_size": 60, "encoder:__choice__": "OneHotEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "NoFeaturePreprocessor", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedMLPBackbone", @@ -31,8 +31,8 @@ "network_backbone:ShapedMLPBackbone:max_dropout": 0.023271935735825866}, {"data_loader:batch_size": 255, "encoder:__choice__": "OneHotEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "NoFeaturePreprocessor", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedResNetBackbone", @@ -65,8 +65,8 @@ "network_backbone:ShapedResNetBackbone:max_dropout": 0.7662454727603789}, {"data_loader:batch_size": 165, "encoder:__choice__": "OneHotEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "NoFeaturePreprocessor", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedResNetBackbone", @@ -96,8 +96,8 @@ "network_head:fully_connected:units_layer_1": 128}, {"data_loader:batch_size": 299, "encoder:__choice__": "OneHotEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "NoFeaturePreprocessor", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedResNetBackbone", @@ -128,8 +128,8 
@@ "network_head:fully_connected:units_layer_1": 128}, {"data_loader:batch_size": 183, "encoder:__choice__": "OneHotEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "NoFeaturePreprocessor", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedResNetBackbone", @@ -162,8 +162,8 @@ "network_backbone:ShapedResNetBackbone:max_dropout": 0.27204101593048097}, {"data_loader:batch_size": 21, "encoder:__choice__": "OneHotEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "NoFeaturePreprocessor", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedMLPBackbone", @@ -191,8 +191,8 @@ "network_head:fully_connected:units_layer_1": 128}, {"data_loader:batch_size": 159, "encoder:__choice__": "OneHotEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "TruncatedSVD", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedMLPBackbone", @@ -221,8 +221,8 @@ "network_head:fully_connected:units_layer_1": 128}, {"data_loader:batch_size": 442, "encoder:__choice__": "OneHotEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "TruncatedSVD", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedResNetBackbone", @@ -254,8 +254,8 @@ "network_head:fully_connected:units_layer_1": 128}, {"data_loader:batch_size": 140, "encoder:__choice__": "OneHotEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "TruncatedSVD", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedResNetBackbone", @@ -287,8 +287,8 @@ "network_head:fully_connected:units_layer_1": 128}, {"data_loader:batch_size": 48, "encoder:__choice__": "OneHotEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "NoFeaturePreprocessor", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedMLPBackbone", @@ -315,8 +315,8 @@ "network_head:fully_connected:units_layer_1": 128}, {"data_loader:batch_size": 168, "encoder:__choice__": "OneHotEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "NoFeaturePreprocessor", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedResNetBackbone", @@ -348,8 +348,8 @@ "network_backbone:ShapedResNetBackbone:max_dropout": 0.8992826006547855}, {"data_loader:batch_size": 21, "encoder:__choice__": "OneHotEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "NoFeaturePreprocessor", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedMLPBackbone", @@ -377,8 +377,8 @@ "network_head:fully_connected:units_layer_1": 128}, 
{"data_loader:batch_size": 163, "encoder:__choice__": "OneHotEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "NoFeaturePreprocessor", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedResNetBackbone", @@ -410,8 +410,8 @@ "network_backbone:ShapedResNetBackbone:max_dropout": 0.6341848343636569}, {"data_loader:batch_size": 150, "encoder:__choice__": "OneHotEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "NoFeaturePreprocessor", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedResNetBackbone", @@ -444,8 +444,8 @@ "network_backbone:ShapedResNetBackbone:max_dropout": 0.7133813761319248}, {"data_loader:batch_size": 151, "encoder:__choice__": "OneHotEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "TruncatedSVD", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedMLPBackbone", @@ -474,8 +474,8 @@ "network_head:fully_connected:units_layer_1": 128}, {"data_loader:batch_size": 42, "encoder:__choice__": "OneHotEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "TruncatedSVD", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedResNetBackbone", diff --git a/autoPyTorch/constants.py b/autoPyTorch/constants.py index 652a546b9..bfd56d27f 100644 --- a/autoPyTorch/constants.py +++ b/autoPyTorch/constants.py @@ -2,31 +2,30 @@ IMAGE_CLASSIFICATION = 2 TABULAR_REGRESSION = 3 IMAGE_REGRESSION = 4 -TIMESERIES_CLASSIFICATION = 5 -TIMESERIES_REGRESSION = 6 +TIMESERIES_FORECASTING = 5 -REGRESSION_TASKS = [TABULAR_REGRESSION, IMAGE_REGRESSION, TIMESERIES_REGRESSION] -CLASSIFICATION_TASKS = [TABULAR_CLASSIFICATION, IMAGE_CLASSIFICATION, TIMESERIES_CLASSIFICATION] +REGRESSION_TASKS = [TABULAR_REGRESSION, IMAGE_REGRESSION] +CLASSIFICATION_TASKS = [TABULAR_CLASSIFICATION, IMAGE_CLASSIFICATION] +FORECASTING_TASKS = [TIMESERIES_FORECASTING] # TODO extend FORECASTING TASKS to Classification and regression tasks TABULAR_TASKS = [TABULAR_CLASSIFICATION, TABULAR_REGRESSION] IMAGE_TASKS = [IMAGE_CLASSIFICATION, IMAGE_REGRESSION] -TASK_TYPES = REGRESSION_TASKS + CLASSIFICATION_TASKS +TIMESERIES_TASKS = [TIMESERIES_FORECASTING] +TASK_TYPES = REGRESSION_TASKS + CLASSIFICATION_TASKS + FORECASTING_TASKS TASK_TYPES_TO_STRING = \ {TABULAR_CLASSIFICATION: 'tabular_classification', IMAGE_CLASSIFICATION: 'image_classification', TABULAR_REGRESSION: 'tabular_regression', IMAGE_REGRESSION: 'image_regression', - TIMESERIES_CLASSIFICATION: 'time_series_classification', - TIMESERIES_REGRESSION: 'time_series_regression'} + TIMESERIES_FORECASTING: 'time_series_forecasting'} STRING_TO_TASK_TYPES = \ {'tabular_classification': TABULAR_CLASSIFICATION, 'image_classification': IMAGE_CLASSIFICATION, 'tabular_regression': TABULAR_REGRESSION, 'image_regression': IMAGE_REGRESSION, - 'time_series_classification': TIMESERIES_CLASSIFICATION, - 'time_series_regression': TIMESERIES_REGRESSION} + 'time_series_forecasting': TIMESERIES_FORECASTING} # Output types have been defined as in scikit-learn type_of_target # 
(https://scikit-learn.org/stable/modules/generated/sklearn.utils.multiclass.type_of_target.html) @@ -54,3 +53,28 @@ CLASSIFICATION_OUTPUTS = [BINARY, MULTICLASS, MULTICLASSMULTIOUTPUT] REGRESSION_OUTPUTS = [CONTINUOUS, CONTINUOUSMULTIOUTPUT] + +ForecastingDependenciesNotInstalledMSG = "Additional dependencies must be installed to work with time series " \ + "forecasting tasks! Please run \n pip install autoPyTorch[forecasting] \n to "\ + "install the corresponding dependencies!" + + +# The constant values for time series forecasting comes from +# https://github.com/rakshitha123/TSForecasting/blob/master/experiments/deep_learning_experiments.py +# seasonality map, maps a frequency value to a number +FORECASTING_BUDGET_TYPE = ('resolution', 'num_seq', 'num_sample_per_seq') + +SEASONALITY_MAP = { + "1min": [1440, 10080, 525960], + "10min": [144, 1008, 52596], + "30min": [48, 336, 17532], + "1H": [24, 168, 8766], + "1D": 7, + "1W": 365.25 / 7, + "1M": 12, + "1Q": 4, + "1Y": 1 +} + +# To avoid that we get a sequence that is too long to be fed to a network +MAX_WINDOW_SIZE_BASE = 500 diff --git a/autoPyTorch/data/base_feature_validator.py b/autoPyTorch/data/base_feature_validator.py index 6ef7cae6b..2d09c474e 100644 --- a/autoPyTorch/data/base_feature_validator.py +++ b/autoPyTorch/data/base_feature_validator.py @@ -5,25 +5,14 @@ import pandas as pd -import scipy.sparse +from scipy.sparse import spmatrix from sklearn.base import BaseEstimator from autoPyTorch.utils.logging_ import PicklableClientLogger -SUPPORTED_FEAT_TYPES = Union[ - List, - pd.DataFrame, - np.ndarray, - scipy.sparse.bsr_matrix, - scipy.sparse.coo_matrix, - scipy.sparse.csc_matrix, - scipy.sparse.csr_matrix, - scipy.sparse.dia_matrix, - scipy.sparse.dok_matrix, - scipy.sparse.lil_matrix, -] +SupportedFeatTypes = Union[List, pd.DataFrame, np.ndarray, spmatrix] class BaseFeatureValidator(BaseEstimator): @@ -46,7 +35,7 @@ def __init__( logger: Optional[Union[PicklableClientLogger, logging.Logger]] = None, ): # Register types to detect unsupported data format changes - self.feat_type: Optional[List[str]] = None + self.feat_types: Optional[List[str]] = None self.data_type: Optional[type] = None self.dtypes: List[str] = [] self.column_order: List[str] = [] @@ -68,8 +57,8 @@ def __init__( def fit( self, - X_train: SUPPORTED_FEAT_TYPES, - X_test: Optional[SUPPORTED_FEAT_TYPES] = None, + X_train: SupportedFeatTypes, + X_test: Optional[SupportedFeatTypes] = None, ) -> BaseEstimator: """ Validates and fit a categorical encoder (if needed) to the features. 
@@ -77,10 +66,10 @@ def fit( CSR sparse data types are also supported Args: - X_train (SUPPORTED_FEAT_TYPES): + X_train (SupportedFeatTypes): A set of features that are going to be validated (type and dimensionality checks) and a encoder fitted in the case the data needs encoding - X_test (Optional[SUPPORTED_FEAT_TYPES]): + X_test (Optional[SupportedFeatTypes]): A hold out set of data used for checking """ @@ -109,11 +98,11 @@ def fit( def _fit( self, - X: SUPPORTED_FEAT_TYPES, + X: SupportedFeatTypes, ) -> BaseEstimator: """ Args: - X (SUPPORTED_FEAT_TYPES): + X (SupportedFeatTypes): A set of features that are going to be validated (type and dimensionality checks) and a encoder fitted in the case the data needs encoding Returns: @@ -124,11 +113,11 @@ def _fit( def transform( self, - X: SUPPORTED_FEAT_TYPES, + X: SupportedFeatTypes, ) -> np.ndarray: """ Args: - X_train (SUPPORTED_FEAT_TYPES): + X_train (SupportedFeatTypes): A set of features, whose categorical features are going to be transformed diff --git a/autoPyTorch/data/base_target_validator.py b/autoPyTorch/data/base_target_validator.py index 393f3d85b..9943d5c55 100644 --- a/autoPyTorch/data/base_target_validator.py +++ b/autoPyTorch/data/base_target_validator.py @@ -5,26 +5,14 @@ import pandas as pd -import scipy.sparse +from scipy.sparse import spmatrix from sklearn.base import BaseEstimator from autoPyTorch.utils.logging_ import PicklableClientLogger -SUPPORTED_TARGET_TYPES = Union[ - List, - pd.Series, - pd.DataFrame, - np.ndarray, - scipy.sparse.bsr_matrix, - scipy.sparse.coo_matrix, - scipy.sparse.csc_matrix, - scipy.sparse.csr_matrix, - scipy.sparse.dia_matrix, - scipy.sparse.dok_matrix, - scipy.sparse.lil_matrix, -] +SupportedTargetTypes = Union[List, pd.Series, pd.DataFrame, np.ndarray, spmatrix] class BaseTargetValidator(BaseEstimator): @@ -69,17 +57,17 @@ def __init__(self, def fit( self, - y_train: SUPPORTED_TARGET_TYPES, - y_test: Optional[SUPPORTED_TARGET_TYPES] = None, + y_train: SupportedTargetTypes, + y_test: Optional[SupportedTargetTypes] = None, ) -> BaseEstimator: """ Validates and fit a categorical encoder (if needed) to the targets The supported data types are List, numpy arrays and pandas DataFrames. Args: - y_train (SUPPORTED_TARGET_TYPES) + y_train (SupportedTargetTypes) A set of targets set aside for training - y_test (Union[SUPPORTED_TARGET_TYPES]) + y_test (Union[SupportedTargetTypes]) A hold out set of data used of the targets. It is also used to fit the categories of the encoder. """ @@ -128,26 +116,26 @@ def fit( def _fit( self, - y_train: SUPPORTED_TARGET_TYPES, - y_test: Optional[SUPPORTED_TARGET_TYPES] = None, + y_train: SupportedTargetTypes, + y_test: Optional[SupportedTargetTypes] = None, ) -> BaseEstimator: """ Args: - y_train (SUPPORTED_TARGET_TYPES) + y_train (SupportedTargetTypes) The labels of the current task. 
They are going to be encoded in case of classification - y_test (Optional[SUPPORTED_TARGET_TYPES]) + y_test (Optional[SupportedTargetTypes]) A holdout set of labels """ raise NotImplementedError() def transform( self, - y: Union[SUPPORTED_TARGET_TYPES], + y: Union[SupportedTargetTypes], ) -> np.ndarray: """ Args: - y (SUPPORTED_TARGET_TYPES) + y (SupportedTargetTypes) A set of targets that are going to be encoded if the current task is classification Returns: @@ -158,7 +146,7 @@ def transform( def inverse_transform( self, - y: SUPPORTED_TARGET_TYPES, + y: SupportedTargetTypes, ) -> np.ndarray: """ Revert any encoding transformation done on a target array @@ -195,3 +183,7 @@ def is_single_column_target(self) -> bool: Output is encoded with a single column encoding """ return self.out_dimensionality == 1 + + @property + def allow_missing_values(self) -> bool: + return False diff --git a/autoPyTorch/data/base_validator.py b/autoPyTorch/data/base_validator.py index 13bb421c7..bebddff49 100644 --- a/autoPyTorch/data/base_validator.py +++ b/autoPyTorch/data/base_validator.py @@ -7,8 +7,8 @@ from sklearn.base import BaseEstimator from sklearn.exceptions import NotFittedError -from autoPyTorch.data.base_feature_validator import SUPPORTED_FEAT_TYPES -from autoPyTorch.data.base_target_validator import SUPPORTED_TARGET_TYPES +from autoPyTorch.data.base_feature_validator import SupportedFeatTypes +from autoPyTorch.data.base_target_validator import SupportedTargetTypes class BaseInputValidator(BaseEstimator): @@ -40,10 +40,10 @@ def __init__( def fit( self, - X_train: SUPPORTED_FEAT_TYPES, - y_train: SUPPORTED_TARGET_TYPES, - X_test: Optional[SUPPORTED_FEAT_TYPES] = None, - y_test: Optional[SUPPORTED_TARGET_TYPES] = None, + X_train: SupportedFeatTypes, + y_train: SupportedTargetTypes, + X_test: Optional[SupportedFeatTypes] = None, + y_test: Optional[SupportedTargetTypes] = None, ) -> BaseEstimator: """ Validates and fit a categorical encoder (if needed) to the features, and @@ -59,15 +59,15 @@ def fit( + If performing a classification task, the data is going to be encoded Args: - X_train (SUPPORTED_FEAT_TYPES): + X_train (SupportedFeatTypes): A set of features that are going to be validated (type and dimensionality checks). If this data contains categorical columns, an encoder is going to be instantiated and trained with this data. - y_train (SUPPORTED_TARGET_TYPES): + y_train (SupportedTargetTypes): A set of targets that are going to be encoded if the task is for classification - X_test (Optional[SUPPORTED_FEAT_TYPES]): + X_test (Optional[SupportedFeatTypes]): A hold out set of features used for checking - y_test (SUPPORTED_TARGET_TYPES): + y_test (SupportedTargetTypes): A hold out set of targets used for checking. Additionally, if the current task is a classification task, this y_test categories are also going to be used to fit a pre-processing encoding (to prevent errors on unseen classes). 
@@ -96,16 +96,16 @@ def fit( def transform( self, - X: SUPPORTED_FEAT_TYPES, - y: Optional[SUPPORTED_TARGET_TYPES] = None, + X: SupportedFeatTypes, + y: Optional[SupportedTargetTypes] = None, ) -> Tuple[np.ndarray, Optional[np.ndarray]]: """ Transform the given target or features to a numpy array Args: - X (SUPPORTED_FEAT_TYPES): + X (SupportedFeatTypes): A set of features to transform - y (Optional[SUPPORTED_TARGET_TYPES]): + y (Optional[SupportedTargetTypes]): A set of targets to transform Returns: diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index 27ed18cfc..fab2471c4 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -1,12 +1,13 @@ import functools -from typing import Dict, List, Optional, Tuple, cast +from logging import Logger +from typing import Dict, List, Optional, Tuple, Union, cast import numpy as np import pandas as pd from pandas.api.types import is_numeric_dtype -import scipy.sparse +from scipy.sparse import issparse, spmatrix import sklearn.utils from sklearn import preprocessing @@ -16,7 +17,9 @@ from sklearn.impute import SimpleImputer from sklearn.pipeline import make_pipeline -from autoPyTorch.data.base_feature_validator import BaseFeatureValidator, SUPPORTED_FEAT_TYPES +from autoPyTorch.data.base_feature_validator import BaseFeatureValidator, SupportedFeatTypes +from autoPyTorch.utils.common import ispandas +from autoPyTorch.utils.logging_ import PicklableClientLogger def _create_column_transformer( @@ -91,7 +94,19 @@ class TabularFeatureValidator(BaseFeatureValidator): List of indices of numerical columns categorical_columns (List[int]): List of indices of categorical columns + feat_types (List[str]): + Description about the feature types of the columns. + Accepts `numerical` for integers, float data and `categorical` + for categories, strings and bool. 
""" + def __init__( + self, + logger: Optional[Union[PicklableClientLogger, Logger]] = None, + feat_types: Optional[List[str]] = None, + ): + super().__init__(logger) + self.feat_types = feat_types + @staticmethod def _comparator(cmp1: str, cmp2: str) -> int: """Order so that categorical columns come left and numerical columns come right @@ -117,7 +132,7 @@ def _comparator(cmp1: str, cmp2: str) -> int: def _fit( self, - X: SUPPORTED_FEAT_TYPES, + X: SupportedFeatTypes, ) -> BaseEstimator: """ In case input data is a pandas DataFrame, this utility encodes the user provided @@ -125,7 +140,7 @@ def _fit( will be able to use Args: - X (SUPPORTED_FEAT_TYPES): + X (SupportedFeatTypes): A set of features that are going to be validated (type and dimensionality checks) and an encoder fitted in the case the data needs encoding @@ -139,7 +154,7 @@ def _fit( if isinstance(X, np.ndarray): X = self.numpy_array_to_pandas(X) - if hasattr(X, "iloc") and not scipy.sparse.issparse(X): + if ispandas(X) and not issparse(X): X = cast(pd.DataFrame, X) # Treat a column with all instances a NaN as numerical # This will prevent doing encoding to a categorical column made completely @@ -158,9 +173,9 @@ def _fit( if not X.select_dtypes(include='object').empty: X = self.infer_objects(X) - self.transformed_columns, self.feat_type = self._get_columns_to_encode(X) + self.transformed_columns, self.feat_types = self.get_columns_to_encode(X) - assert self.feat_type is not None + assert self.feat_types is not None if len(self.transformed_columns) > 0: @@ -177,8 +192,8 @@ def _fit( # The column transformer reorders the feature types # therefore, we need to change the order of columns as well # This means categorical columns are shifted to the left - self.feat_type = sorted( - self.feat_type, + self.feat_types = sorted( + self.feat_types, key=functools.cmp_to_key(self._comparator) ) @@ -192,7 +207,7 @@ def _fit( for cat in encoded_categories ] - for i, type_ in enumerate(self.feat_type): + for i, type_ in enumerate(self.feat_types): if 'numerical' in type_: self.numerical_columns.append(i) else: @@ -204,14 +219,14 @@ def _fit( def transform( self, - X: SUPPORTED_FEAT_TYPES, - ) -> np.ndarray: + X: SupportedFeatTypes, + ) -> Union[np.ndarray, spmatrix, pd.DataFrame]: """ Validates and fit a categorical encoder (if needed) to the features. The supported data types are List, numpy arrays and pandas DataFrames. 
Args: - X_train (SUPPORTED_FEAT_TYPES): + X_train (SupportedFeatTypes): A set of features, whose categorical features are going to be transformed @@ -229,7 +244,7 @@ def transform( if isinstance(X, np.ndarray): X = self.numpy_array_to_pandas(X) - if hasattr(X, "iloc") and not scipy.sparse.issparse(X): + if ispandas(X) and not issparse(X): if np.any(pd.isnull(X)): for column in X.columns: if X[column].isna().all(): @@ -243,7 +258,7 @@ def transform( self._check_data(X) # Pandas related transformations - if hasattr(X, "iloc") and self.column_transformer is not None: + if ispandas(X) and self.column_transformer is not None: if np.any(pd.isnull(X)): # After above check it means that if there is a NaN # the whole column must be NaN @@ -256,7 +271,7 @@ def transform( # Sparse related transformations # Not all sparse format support index sorting - if scipy.sparse.issparse(X) and hasattr(X, 'sort_indices'): + if issparse(X) and hasattr(X, 'sort_indices'): X.sort_indices() try: @@ -272,22 +287,23 @@ def transform( "Please try to manually cast it to a supported " "numerical or categorical values.") raise e + return X def _check_data( self, - X: SUPPORTED_FEAT_TYPES, + X: SupportedFeatTypes, ) -> None: """ Feature dimensionality and data type checks Args: - X (SUPPORTED_FEAT_TYPES): + X (SupportedFeatTypes): A set of features that are going to be validated (type and dimensionality checks) and an encoder fitted in the case the data needs encoding """ - if not isinstance(X, (np.ndarray, pd.DataFrame)) and not scipy.sparse.issparse(X): + if not isinstance(X, (np.ndarray, pd.DataFrame)) and not issparse(X): raise ValueError("AutoPyTorch only supports Numpy arrays, Pandas DataFrames," " scipy sparse and Python Lists, yet, the provided input is" " of type {}".format(type(X)) @@ -316,7 +332,7 @@ def _check_data( ) # Then for Pandas, we do not support Nan in categorical columns - if hasattr(X, "iloc"): + if ispandas(X): # If entered here, we have a pandas dataframe X = cast(pd.DataFrame, X) @@ -326,7 +342,7 @@ def _check_data( # Define the column to be encoded here as the feature validator is fitted once # per estimator - self.transformed_columns, self.feat_type = self._get_columns_to_encode(X) + self.transformed_columns, self.feat_types = self.get_columns_to_encode(X) column_order = [column for column in X.columns] if len(self.column_order) > 0: @@ -351,12 +367,72 @@ def _check_data( else: self.dtypes = dtypes + def get_columns_to_encode( + self, + X: pd.DataFrame + ) -> Tuple[List[str], List[str]]: + """ + Return the columns to be transformed as well as + the type of feature for each column. + + The returned values are dependent on `feat_types` passed to the `__init__`. 
+ + Args: + X (pd.DataFrame) + A set of features that are going to be validated (type and dimensionality + checks) and an encoder fitted in the case the data needs encoding + + Returns: + transformed_columns (List[str]): + Columns to encode, if any + feat_type: + Type of each column numerical/categorical + """ + transformed_columns, feat_types = self._get_columns_to_encode(X) + if self.feat_types is not None: + self._validate_feat_types(X) + transformed_columns = [X.columns[i] for i, col in enumerate(self.feat_types) + if col.lower() == 'categorical'] + return transformed_columns, self.feat_types + else: + return transformed_columns, feat_types + + def _validate_feat_types(self, X: pd.DataFrame) -> None: + """ + Checks if the passed `feat_types` is compatible with what + AutoPyTorch expects, i.e, it should only contain `numerical` + or `categorical` and the number of feature types is equal to + the number of features. The case does not matter. + + Args: + X (pd.DataFrame): + input features set + + Raises: + ValueError: + if the number of feat_types is not equal to the number of features + if the feature type are not one of "numerical", "categorical" + """ + assert self.feat_types is not None # mypy check + + if len(self.feat_types) != len(X.columns): + raise ValueError(f"Expected number of `feat_types`: {len(self.feat_types)}" + f" to be the same as the number of features {len(X.columns)}") + for feat_type in set(self.feat_types): + if feat_type.lower() not in ['numerical', 'categorical']: + raise ValueError(f"Expected type of features to be in `['numerical', " + f"'categorical']`, but got {feat_type}") + def _get_columns_to_encode( self, X: pd.DataFrame, ) -> Tuple[List[str], List[str]]: """ - Return the columns to be encoded from a pandas dataframe + Return the columns to be transformed as well as + the type of feature for each column from a pandas dataframe. + + If `self.feat_types` is not None, it also validates that the + dataframe dtypes dont disagree with the ones passed in `__init__`. 
Args: X (pd.DataFrame) @@ -370,21 +446,24 @@ def _get_columns_to_encode( Type of each column numerical/categorical """ - if len(self.transformed_columns) > 0 and self.feat_type is not None: - return self.transformed_columns, self.feat_type + if len(self.transformed_columns) > 0 and self.feat_types is not None: + return self.transformed_columns, self.feat_types # Register if a column needs encoding transformed_columns = [] # Also, register the feature types for the estimator - feat_type = [] + feat_types = [] # Make sure each column is a valid type for i, column in enumerate(X.columns): if X[column].dtype.name in ['category', 'bool']: transformed_columns.append(column) - feat_type.append('categorical') + if self.feat_types is not None and self.feat_types[i].lower() == 'numerical': + raise ValueError(f"Passed numerical as the feature type for column: {column} " + f"but the column is categorical") + feat_types.append('categorical') # Move away from np.issubdtype as it causes # TypeError: data type not understood in certain pandas types elif not is_numeric_dtype(X[column]): @@ -424,13 +503,13 @@ def _get_columns_to_encode( ) ) else: - feat_type.append('numerical') - return transformed_columns, feat_type + feat_types.append('numerical') + return transformed_columns, feat_types def list_to_dataframe( self, - X_train: SUPPORTED_FEAT_TYPES, - X_test: Optional[SUPPORTED_FEAT_TYPES] = None, + X_train: SupportedFeatTypes, + X_test: Optional[SupportedFeatTypes] = None, ) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]: """ Converts a list to a pandas DataFrame. In this process, column types are inferred. @@ -438,10 +517,10 @@ def list_to_dataframe( If test data is provided, we proactively match it to train data Args: - X_train (SUPPORTED_FEAT_TYPES): + X_train (SupportedFeatTypes): A set of features that are going to be validated (type and dimensionality checks) and a encoder fitted in the case the data needs encoding - X_test (Optional[SUPPORTED_FEAT_TYPES]): + X_test (Optional[SupportedFeatTypes]): A hold out set of data used for checking Returns: diff --git a/autoPyTorch/data/tabular_target_validator.py b/autoPyTorch/data/tabular_target_validator.py index c37dc81c3..3f1aa2f96 100644 --- a/autoPyTorch/data/tabular_target_validator.py +++ b/autoPyTorch/data/tabular_target_validator.py @@ -1,11 +1,12 @@ from typing import List, Optional, Union, cast import numpy as np +import numpy.ma as ma import pandas as pd from pandas.api.types import is_numeric_dtype -import scipy.sparse +from scipy.sparse import issparse, spmatrix import sklearn.utils from sklearn import preprocessing @@ -13,14 +14,53 @@ from sklearn.exceptions import NotFittedError from sklearn.utils.multiclass import type_of_target -from autoPyTorch.data.base_target_validator import BaseTargetValidator, SUPPORTED_TARGET_TYPES +from autoPyTorch.data.base_target_validator import BaseTargetValidator, SupportedTargetTypes +from autoPyTorch.utils.common import ispandas + + +ArrayType = Union[np.ndarray, spmatrix] + + +def _check_and_to_array(y: SupportedTargetTypes, allow_nan: bool = False) -> ArrayType: + """ sklearn check array will make sure we have the correct numerical features for the array """ + if allow_nan: + return sklearn.utils.check_array(y, force_all_finite=False, accept_sparse='csr', ensure_2d=False) + else: + return sklearn.utils.check_array(y, force_all_finite=True, accept_sparse='csr', ensure_2d=False) + + +def _modify_regression_target(y: ArrayType, allow_nan: bool = False) -> ArrayType: + # Regression targets must have numbers 
after a decimal point. + # Ref: https://github.com/scikit-learn/scikit-learn/issues/8952 + + # For forecasting tasks, missing targets are allowed. Our TimeSeriesTargetValidator is inherent from + # TabularTargetValidator, if this function is called by TimeSeriesTargetValidator, we will allow nan here + if allow_nan: + y = ma.masked_where(np.isnan(y), y, 1e12) + + y_min = np.abs(y).min() + offset = max(y_min, 1e-13) * 1e-13 # Sufficiently small number + if y_min > 1e12: + raise ValueError( + "The minimum value for the target labels of regression tasks must be smaller than " + f"1e12 to avoid errors caused by an overflow, but got {y_min}" + ) + + # Since it is all integer, we can just add a random small number + if isinstance(y, np.ndarray): + y = y.astype(dtype=np.float64) + offset + else: + y.data = y.data.astype(dtype=np.float64) + offset + if allow_nan: + return y.data + return y class TabularTargetValidator(BaseTargetValidator): def _fit( self, - y_train: SUPPORTED_TARGET_TYPES, - y_test: Optional[SUPPORTED_TARGET_TYPES] = None, + y_train: SupportedTargetTypes, + y_test: Optional[SupportedTargetTypes] = None, ) -> BaseEstimator: """ If dealing with classification, this utility encodes the targets. @@ -29,10 +69,10 @@ def _fit( errors Args: - y_train (SUPPORTED_TARGET_TYPES) + y_train (SupportedTargetTypes) The labels of the current task. They are going to be encoded in case of classification - y_test (Optional[SUPPORTED_TARGET_TYPES]) + y_test (Optional[SupportedTargetTypes]) A holdout set of labels """ if not self.is_classification or self.type_of_target == 'multilabel-indicator': @@ -42,7 +82,7 @@ def _fit( return self if y_test is not None: - if hasattr(y_train, "iloc"): + if ispandas(y_train): y_train = pd.concat([y_train, y_test], ignore_index=True, sort=False) elif isinstance(y_train, list): y_train = y_train + y_test @@ -57,12 +97,11 @@ def _fit( unknown_value=-1) else: # We should not reach this if statement as we check for type of targets before - raise ValueError("Multi-dimensional classification is not yet supported. " - "Encoding multidimensional data converts multiple columns " - "to a 1 dimensional encoding. Data involved = {}/{}".format( - np.shape(y_train), - self.type_of_target - )) + raise ValueError(f"Multi-dimensional classification is not yet supported. " + f"Encoding multidimensional data converts multiple columns " + f"to a 1 dimensional encoding. Data involved = " + f"{np.shape(y_train)}/{self.type_of_target}" + ) # Mypy redefinition assert self.encoder is not None @@ -71,7 +110,7 @@ def _fit( if ndim > 1: self.encoder.fit(y_train) else: - if hasattr(y_train, 'iloc'): + if ispandas(y_train): y_train = cast(pd.DataFrame, y_train) self.encoder.fit(y_train.to_numpy().reshape(-1, 1)) else: @@ -94,16 +133,31 @@ def _fit( return self - def transform( - self, - y: Union[SUPPORTED_TARGET_TYPES], - ) -> np.ndarray: + def _transform_by_encoder(self, y: SupportedTargetTypes) -> np.ndarray: + if self.encoder is None: + return _check_and_to_array(y, self.allow_missing_values) + + # remove ravel warning from pandas Series + shape = np.shape(y) + if len(shape) > 1: + y = self.encoder.transform(y) + elif ispandas(y): + # The Ordinal encoder expects a 2 dimensional input. 
+ # The targets are 1 dimensional, so reshape to match the expected shape + y = cast(pd.DataFrame, y) + y = self.encoder.transform(y.to_numpy().reshape(-1, 1)).reshape(-1) + else: + y = self.encoder.transform(np.array(y).reshape(-1, 1)).reshape(-1) + + return _check_and_to_array(y, self.allow_missing_values) + + def transform(self, y: SupportedTargetTypes) -> np.ndarray: """ Validates and fits a categorical encoder (if needed) to the targets. The supported data types are List, numpy arrays and pandas DataFrames. Args: - y (SUPPORTED_TARGET_TYPES) + y (SupportedTargetTypes) A set of targets that are going to be encoded if the current task is classification @@ -116,47 +170,28 @@ def transform( # Check the data here so we catch problems on new test data self._check_data(y) + y = self._transform_by_encoder(y) - if self.encoder is not None: - # remove ravel warning from pandas Series - shape = np.shape(y) - if len(shape) > 1: - y = self.encoder.transform(y) - else: - # The Ordinal encoder expects a 2 dimensional input. - # The targets are 1 dimensional, so reshape to match the expected shape - if hasattr(y, 'iloc'): - y = cast(pd.DataFrame, y) - y = self.encoder.transform(y.to_numpy().reshape(-1, 1)).reshape(-1) - else: - y = self.encoder.transform(np.array(y).reshape(-1, 1)).reshape(-1) - - # sklearn check array will make sure we have the - # correct numerical features for the array - # Also, a numpy array will be created - y = sklearn.utils.check_array( - y, - force_all_finite=True, - accept_sparse='csr', - ensure_2d=False, - ) - - # When translating a dataframe to numpy, make sure we - # honor the ravel requirement + # When translating a dataframe to numpy, make sure we honor the ravel requirement if y.ndim == 2 and y.shape[1] == 1: y = np.ravel(y) + if self.allow_missing_values: + y_filled = np.nan_to_num(y) + else: + y_filled = y + + if not self.is_classification and "continuous" not in type_of_target(y_filled): + y = _modify_regression_target(y, self.allow_missing_values) + return y - def inverse_transform( - self, - y: SUPPORTED_TARGET_TYPES, - ) -> np.ndarray: + def inverse_transform(self, y: SupportedTargetTypes) -> np.ndarray: """ Revert any encoding transformation done on a target array Args: - y (Union[np.ndarray, pd.DataFrame, pd.Series]): + y (SupportedTargetTypes): Target array to be transformed back to original form before encoding Returns: np.ndarray: @@ -172,7 +207,7 @@ def inverse_transform( y = self.encoder.inverse_transform(y) else: # The targets should be a flattened array, hence reshape with -1 - if hasattr(y, 'iloc'): + if ispandas(y): y = cast(pd.DataFrame, y) y = self.encoder.inverse_transform(y.to_numpy().reshape(-1, 1)).reshape(-1) else: @@ -185,31 +220,27 @@ def inverse_transform( y = y.astype(self.dtype) return y - def _check_data( - self, - y: SUPPORTED_TARGET_TYPES, - ) -> None: + def _check_data(self, y: SupportedTargetTypes) -> None: """ Perform dimensionality and data type checks on the targets Args: - y (Union[np.ndarray, pd.DataFrame, pd.Series]): + y (SupportedTargetTypes): A set of targets whose dimensionality and data type are going to be checked """ if not isinstance(y, (np.ndarray, pd.DataFrame, List, pd.Series)) \ - and not scipy.sparse.issparse(y): # type: ignore[misc] - raise ValueError("AutoPyTorch only supports Numpy arrays, Pandas DataFrames," - " pd.Series, sparse data and Python Lists as targets, yet, " - "the provided input is of type {}".format( - type(y) - )) + and not issparse(y): # type: ignore[misc] + raise ValueError(f"AutoPyTorch only supports Numpy arrays, Pandas DataFrames," + f" pd.Series, sparse data and Python Lists as targets, yet, " + f"the provided input is of type {type(y)}" + ) # Sparse data must be numerical # Type ignore on attribute because sparse targets have a dtype - if scipy.sparse.issparse(y) and not np.issubdtype(y.dtype.type, # type: ignore[union-attr] - np.number): + if issparse(y) and not np.issubdtype(y.dtype.type, # type: ignore[union-attr] + np.number): raise ValueError("When providing a sparse matrix as targets, the only supported " "values are numerical. Please consider using a dense" " matrix instead." ) @@ -228,17 +259,25 @@ def _check_data( # No NaN is supported has_nan_values = False - if hasattr(y, 'iloc'): + sparse_has_nan = False + if ispandas(y): has_nan_values = cast(pd.DataFrame, y).isnull().values.any() - if scipy.sparse.issparse(y): - y = cast(scipy.sparse.spmatrix, y) + if has_nan_values and self.allow_missing_values: + # if missing value is allowed, we simply fill the missing values to pass 'type_of_target' + y = cast(pd.DataFrame, y).fillna(method='pad') + if issparse(y): + y = cast(spmatrix, y) has_nan_values = not np.array_equal(y.data, y.data) + if has_nan_values and self.allow_missing_values: + sparse_has_nan = True else: # List and array like values are considered here # np.isnan cannot work on strings, so we have to check every element; # but NaNs are not equal to themselves: has_nan_values = not np.array_equal(y, y) - if has_nan_values: + if has_nan_values and self.allow_missing_values: + y = np.nan_to_num(y) + if sparse_has_nan or has_nan_values and not self.allow_missing_values: raise ValueError("Target values cannot contain missing/NaN values. " "This is not supported by scikit-learn. " ) @@ -262,8 +301,7 @@ def _check_data( # should filter out unsupported types. ) if self.type_of_target not in supported_output_types: - raise ValueError("Provided targets are not supported by AutoPyTorch. " - "Provided type is {} whereas supported types are {}.".format( - self.type_of_target, - supported_output_types - )) + raise ValueError(f"Provided targets are not supported by AutoPyTorch. " + f"Provided type is {self.type_of_target} " + f"whereas supported types are {supported_output_types}."
+ ) diff --git a/autoPyTorch/data/tabular_validator.py b/autoPyTorch/data/tabular_validator.py index 677b55d4b..0f6f89e1c 100644 --- a/autoPyTorch/data/tabular_validator.py +++ b/autoPyTorch/data/tabular_validator.py @@ -1,10 +1,21 @@ # -*- encoding: utf-8 -*- import logging -from typing import Optional, Union +from typing import List, Optional, Tuple, Union + +import numpy as np + +from scipy.sparse import issparse from autoPyTorch.data.base_validator import BaseInputValidator -from autoPyTorch.data.tabular_feature_validator import TabularFeatureValidator -from autoPyTorch.data.tabular_target_validator import TabularTargetValidator +from autoPyTorch.data.tabular_feature_validator import SupportedFeatTypes, TabularFeatureValidator +from autoPyTorch.data.tabular_target_validator import SupportedTargetTypes, TabularTargetValidator +from autoPyTorch.data.utils import ( + DatasetCompressionInputType, + DatasetCompressionSpec, + DatasetDTypeContainerType, + reduce_dataset_size_if_too_large +) +from autoPyTorch.utils.common import ispandas from autoPyTorch.utils.logging_ import PicklableClientLogger, get_named_client_logger @@ -27,14 +38,28 @@ class TabularInputValidator(BaseInputValidator): target_validator (TargetValidator): A TargetValidator instance used to validate and encode (in case of classification) the target values + dataset_compression (Optional[DatasetCompressionSpec]): + specifications for dataset compression. For more info check + documentation for `BaseTask.get_dataset`. + feat_types (List[str]): + Description about the feature types of the columns. + Accepts `numerical` for integers, float data and `categorical` + for categories, strings and bool """ def __init__( self, is_classification: bool = False, logger_port: Optional[int] = None, - ) -> None: + dataset_compression: Optional[DatasetCompressionSpec] = None, + feat_types: Optional[List[str]] = None, + seed: int = 42, + ): + self.dataset_compression = dataset_compression + self._reduced_dtype: Optional[DatasetDTypeContainerType] = None self.is_classification = is_classification self.logger_port = logger_port + self.feat_types = feat_types + self.seed = seed if self.logger_port is not None: self.logger: Union[logging.Logger, PicklableClientLogger] = get_named_client_logger( name='Validation', @@ -43,9 +68,61 @@ def __init__( else: self.logger = logging.getLogger('Validation') - self.feature_validator = TabularFeatureValidator(logger=self.logger) + self.feature_validator = TabularFeatureValidator( + logger=self.logger, + feat_types=self.feat_types) self.target_validator = TabularTargetValidator( is_classification=self.is_classification, logger=self.logger ) self._is_fitted = False + + def _compress_dataset( + self, + X: DatasetCompressionInputType, + y: SupportedTargetTypes, + ) -> DatasetCompressionInputType: + """ + Compress the dataset. This function ensures that + the testing data is converted to the same dtype as + the training data. + See `autoPyTorch.data.utils.reduce_dataset_size_if_too_large` + for more information. + + Args: + X (DatasetCompressionInputType): + features of dataset + y (SupportedTargetTypes): + targets of dataset + Returns: + DatasetCompressionInputType: + Compressed dataset. 
+ """ + is_dataframe = ispandas(X) + is_reducible_type = isinstance(X, np.ndarray) or issparse(X) or is_dataframe + if not is_reducible_type or self.dataset_compression is None: + return X, y + elif self._reduced_dtype is not None: + X = X.astype(self._reduced_dtype) + return X, y + else: + X, y = reduce_dataset_size_if_too_large( + X, + y=y, + is_classification=self.is_classification, + random_state=self.seed, + **self.dataset_compression # type: ignore [arg-type] + ) + self._reduced_dtype = dict(X.dtypes) if is_dataframe else X.dtype + return X, y + + def transform( + self, + X: SupportedFeatTypes, + y: Optional[SupportedTargetTypes] = None, + ) -> Tuple[np.ndarray, Optional[np.ndarray]]: + + X, y = super().transform(X, y) + X_reduced, y_reduced = self._compress_dataset(X, y) + + return X_reduced, y_reduced diff --git a/autoPyTorch/data/time_series_feature_validator.py b/autoPyTorch/data/time_series_feature_validator.py new file mode 100644 index 000000000..962da78a8 --- /dev/null +++ b/autoPyTorch/data/time_series_feature_validator.py @@ -0,0 +1,174 @@ +import logging +from typing import List, Optional, Tuple, Union + + +import numpy as np + +import pandas as pd + +from scipy.sparse import issparse + +from sklearn.base import BaseEstimator +from sklearn.preprocessing import OrdinalEncoder + + +from autoPyTorch.data.tabular_feature_validator import TabularFeatureValidator +from autoPyTorch.utils.logging_ import PicklableClientLogger + + +def df2index(df: Union[pd.DataFrame, pd.Series]) -> np.ndarray: + if isinstance(df, pd.Series): + seq_lengths = df.value_counts().values + else: + seq_lengths = np.unique( + OrdinalEncoder().fit_transform(df), axis=0, return_counts=True + )[1] + return np.arange(len(seq_lengths)).repeat(seq_lengths) + + +class TimeSeriesFeatureValidator(TabularFeatureValidator): + def __init__( + self, + logger: Optional[Union[PicklableClientLogger, logging.Logger]] = None, + ): + super().__init__(logger) + self.only_contain_series_idx = False + self.static_features: Union[Tuple[()], Tuple[Union[int, str]]] = () + self.series_idx: Optional[List[Union[str, int]]] = None + + def get_reordered_columns(self) -> List[str]: + return self.transformed_columns + [ + col for col in self.column_order if col not in set(self.transformed_columns) + ] + + def fit( + self, + X_train: Union[pd.DataFrame, np.ndarray], + X_test: Union[pd.DataFrame, np.ndarray] = None, + series_idx: Optional[Union[List[Union[str, int]]]] = None, + sequence_lengths: Optional[List[int]] = None, + ) -> BaseEstimator: + """ + + Arguments: + X_train (Union[pd.DataFrame, np.ndarray]): + A set of data that are going to be validated (type and dimensionality + checks) and used for fitting + + X_test (Union[pd.DataFrame, np.ndarray]): + An optional set of data that is going to be validated + + series_idx (Optional[List[Union[str, int]]]): + Series Index, to identify each individual series + + sequence_lengths (Optional[List[int]]): + Length of each sequence + + Returns: + self: + The fitted base estimator + """ + if issparse(X_train): + raise NotImplementedError( + "Sparse matrix is currently unsupported for Forecasting tasks" + ) + index = None + + if series_idx is not None: + self.series_idx = series_idx + + # remove series idx as they are not part of features + # TODO consider them as static features? 
+ if isinstance(X_train, pd.DataFrame): + for series_id in series_idx: + if series_id not in X_train.columns: + raise ValueError( + f"All series IDs must be contained in the training columns; however, {series_id} " + f"is not part of {X_train.columns.tolist()}" + ) + if X_train[list(series_idx)].isnull().values.any(): + raise ValueError("NaN should not exist in Series ID!") + index = df2index(df=X_train[series_idx]) + + self.only_contain_series_idx = len(X_train.columns) == len(series_idx) + + if self.only_contain_series_idx: + self._is_fitted = True + + self.num_features = 0 + self.numerical_columns = [] + self.categorical_columns = [] + return self + + X_train = X_train.drop(series_idx, axis=1) + X_train.index = index + + if X_test is not None: + assert isinstance(X_test, pd.DataFrame) + index = df2index(df=X_test[series_idx]) + X_test = X_test.drop(series_idx, axis=1) + X_test.index = index + + super().fit(X_train, X_test) + else: + raise NotImplementedError( + f"series idx only works with pandas.DataFrame but the type of " + f"X_train is {type(X_train)} " + ) + else: + super().fit(X_train, X_test) + + X_train_has_idx = isinstance(X_train, pd.DataFrame) + X_train = pd.DataFrame(X_train) + if index is None: + if sequence_lengths is None: + if not X_train_has_idx: + index = np.zeros(len(X_train)) + else: + index = X_train.index + else: + if np.sum(sequence_lengths) != len(X_train): + raise ValueError( + "The sum of sequence lengths must equal the length of the dataset" + ) + index = np.arange(len(sequence_lengths)).repeat(sequence_lengths) + X_train.index = index + + static_features: pd.Series = ( + X_train.groupby(X_train.index).nunique() <= 1 + ).all() + self.static_features = tuple( # type: ignore[assignment] + idx for idx in static_features.index if static_features[idx] + ) + return self + + def transform( + self, + X: Union[pd.DataFrame, np.ndarray], + index: Optional[Union[pd.Index, np.ndarray]] = None, + ) -> Optional[pd.DataFrame]: + if self.only_contain_series_idx: + return None + if self.series_idx is not None: + if isinstance(X, pd.DataFrame): + X = X.drop(self.series_idx, axis=1) + else: + raise NotImplementedError( + f"series idx only works with pandas.DataFrame but the type of " + f"X_train is {type(X)} " + ) + X_has_idx = isinstance(X, pd.DataFrame) + if X_has_idx and index is None: + index = X.index + X = super(TimeSeriesFeatureValidator, self).transform(X) + if X.ndim == 1: + X = np.expand_dims(X, -1) # type:ignore[no-redef] + X: pd.DataFrame = pd.DataFrame(X, columns=self.get_reordered_columns()) # type:ignore[no-redef] + if index is None: + if not X_has_idx: + index = np.array([0] * len(X)) + else: + if len(index) != X.shape[0]: + raise ValueError("Given index must have the same length as the input features!") + X.index = index + return X diff --git a/autoPyTorch/data/time_series_forecasting_validator.py b/autoPyTorch/data/time_series_forecasting_validator.py new file mode 100644 index 000000000..c19224b70 --- /dev/null +++ b/autoPyTorch/data/time_series_forecasting_validator.py @@ -0,0 +1,358 @@ +# -*- encoding: utf-8 -*- +from typing import Dict, Iterable, List, Optional, Tuple, Union + + +import numpy as np + +import pandas as pd + +from sklearn.base import BaseEstimator +from sklearn.exceptions import NotFittedError + + +from autoPyTorch.data.tabular_validator import TabularInputValidator +from autoPyTorch.data.time_series_feature_validator import TimeSeriesFeatureValidator, df2index +from autoPyTorch.data.time_series_target_validator import TimeSeriesTargetValidator +from autoPyTorch.data.utils import DatasetCompressionSpec
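# [Editor's note] Illustrative sketch, not part of this patch: the static-feature
# detection in TimeSeriesFeatureValidator.fit() above flags columns whose value
# never changes within any single series. Standalone reproduction of the idea,
# with hypothetical toy data:
import pandas as pd

X = pd.DataFrame(
    {"price": [1.0, 1.2, 3.0, 3.1], "region": ["EU", "EU", "US", "US"]},
    index=[0, 0, 1, 1],  # two series: 0 and 1
)
is_static = (X.groupby(X.index).nunique() <= 1).all()
print(tuple(col for col in is_static.index if is_static[col]))  # ('region',)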
+ + +class TimeSeriesForecastingInputValidator(TabularInputValidator): + """ + A validator designed for a time series forecasting dataset. + Since a time series forecasting dataset might contain several time sequences with different lengths, we transform + all the data to a DataFrameGroupBy, where each group represents one series + TODO for multiple output: target names and shapes + TODO check if we can compress time series forecasting datasets + """ + + def __init__( + self, + is_classification: bool = False, + logger_port: Optional[int] = None, + dataset_compression: Optional[DatasetCompressionSpec] = None, + ) -> None: + super(TimeSeriesForecastingInputValidator, self).__init__( + is_classification, logger_port, dataset_compression + ) + self.feature_validator: TimeSeriesFeatureValidator = TimeSeriesFeatureValidator( + logger=self.logger + ) + self.target_validator: TimeSeriesTargetValidator = TimeSeriesTargetValidator( + is_classification=self.is_classification, logger=self.logger + ) + self._is_uni_variant = False + self.start_times: Optional[List[pd.DatetimeIndex]] = None + self.feature_shapes: Dict[str, int] = {} + self.feature_names: List[str] = [] + self.series_idx: Optional[Union[List[Union[str, int]], str, int]] = None + + def fit( # type: ignore[override] + self, + X_train: Optional[Union[List, pd.DataFrame]], + y_train: Union[List, pd.DataFrame], + series_idx: Optional[Union[List[Union[str, int]], str, int]] = None, + X_test: Optional[Union[List, pd.DataFrame]] = None, + y_test: Optional[Union[List, pd.DataFrame]] = None, + start_times: Optional[List[pd.DatetimeIndex]] = None, + ) -> BaseEstimator: + """ + Fit the validator with the training data, (optionally) start times and other information + + Args: + X_train (Optional[Union[List, pd.DataFrame]]): + training features; may be None for univariate forecasting tasks + y_train (Union[List, pd.DataFrame]): + training targets + series_idx (Optional[Union[List[Union[str, int]], str, int]]): + which feature columns are used to identify each series + X_test (Optional[Union[List, pd.DataFrame]]): + test features.
For forecasting tasks, test features indicate known future features + after the forecasting timestep + y_test (Optional[Union[List, pd.DataFrame]]): + targets in the future + start_times (Optional[List[pd.DatetimeIndex]]): + start times at which the first element of each series is sampled + + """ + if series_idx is not None and not isinstance(series_idx, Iterable): + series_idx: Optional[List[Union[str, int]]] = [series_idx] # type: ignore[no-redef] + + self.series_idx = series_idx + + if X_train is None: + self._is_uni_variant = True + + self.feature_validator.num_features = 0 + self.feature_validator.numerical_columns = [] + self.feature_validator.categorical_columns = [] + if isinstance(y_train, List): + n_seqs = len(y_train) + y_train = self.join_series(y_train) + if y_test is not None: + y_test = self.join_series(y_test, return_seq_lengths=False) + else: + y_test = None + elif isinstance(y_train, pd.DataFrame): + n_seqs = len(y_train.index.unique()) + else: + raise NotImplementedError + + self.target_validator.fit(y_train, y_test) + self._is_fitted = True + else: + if isinstance(y_train, List): + # Check that the data is valid + if len(X_train) != len(y_train): + raise ValueError( + "Inconsistent number of sequences for features and targets," + " {} for features and {} for targets".format( + len(X_train), + len(y_train), + ) + ) + n_seqs = len(y_train) + + # X_train and y_train are stored as lists + y_train = self.join_series(y_train) + if y_test is not None: + y_test = self.join_series(y_test, return_seq_lengths=False) + + X_train, sequence_lengths = self.join_series( + X_train, return_seq_lengths=True + ) + X_test = self.join_series(X_test) if X_test is not None else None + if X_test is not None and y_test is not None: + if len(X_test) != len(y_test): + raise ValueError( + "Inconsistent number of test datapoints for features and targets," + " {} for features and {} for targets".format( + len(X_test), + len(y_test), + ) + ) + elif isinstance(y_train, (pd.DataFrame, pd.Series)): + sequence_lengths = None + assert isinstance(X_train, pd.DataFrame) + if series_idx is not None: + n_seqs = len(X_train.groupby(series_idx)) + else: + n_seqs = len(y_train.index.unique()) + else: + raise NotImplementedError + + self.feature_validator.fit( + X_train, + X_test, + series_idx=series_idx, # type: ignore[arg-type] + sequence_lengths=sequence_lengths, + ) + self.target_validator.fit(y_train, y_test) + + if self.feature_validator.only_contain_series_idx: + self._is_uni_variant = True + + self._is_fitted = True + + self.feature_names = self.feature_validator.get_reordered_columns() + self.feature_shapes = { + feature_name: 1 for feature_name in self.feature_names + } + + if start_times is None: + start_times = [pd.Timestamp("1900-01-01")] * n_seqs + else: + assert ( + len(start_times) == n_seqs + ), "start_times must have the same length as y_train!"
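# [Editor's note] Illustrative usage sketch, not part of this patch (class and
# argument names as defined in this file; the data itself is hypothetical):
import numpy as np
import pandas as pd

# three univariate series of different lengths
y_train = [np.arange(10), np.arange(7), np.arange(12)]
validator = TimeSeriesForecastingInputValidator(is_classification=False)
validator.fit(
    X_train=None,  # univariate: no features
    y_train=y_train,
    start_times=[pd.Timestamp("2020-01-01")] * len(y_train),
)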
+ + self.start_times = start_times + + return self + + def transform( # type: ignore[override] + self, + X: Optional[Union[List, pd.DataFrame]], + y: Optional[Union[List, pd.DataFrame]] = None, + validate_for_future_features: bool = False, + ) -> Tuple[Optional[pd.DataFrame], Optional[pd.DataFrame], np.ndarray]: + """ + Transform the data with the fitted validator + + Args: + X: Optional[Union[List, pd.DataFrame]] + time features + y: Optional[Union[List, pd.DataFrame]] + forecasting targets + validate_for_future_features: bool + whether the validator is applied to transform future features (for test sets); in this case we only + validate the features + """ + if not self._is_fitted: + raise NotFittedError( + "Cannot call transform on a validator that is not fitted" + ) + if validate_for_future_features and y is None: + if X is None: + return None, None, np.asarray([]) + if isinstance(X, List): + num_sequences = len(X) + sequence_lengths = [0] * num_sequences + for seq_idx in range(num_sequences): + sequence_lengths[seq_idx] = len(X[seq_idx]) + npa_sequence_lengths = np.asarray(sequence_lengths) + x_transformed, _ = self._transform_X(X, npa_sequence_lengths) + return x_transformed, None, npa_sequence_lengths + elif isinstance(X, pd.DataFrame): + if self.series_idx is not None: + X = X.sort_values(self.series_idx) + x_transformed, _ = self._transform_X(X, None) + return x_transformed, None, X.index.value_counts(sort=False).values + + else: + raise NotImplementedError + else: + if y is None: + raise ValueError("Targets must be given!") + + if isinstance(y, List): + num_sequences = len(y) + sequence_lengths = [0] * num_sequences + if not self._is_uni_variant: + if X is None: + raise ValueError("A multivariate dataset requires X as input!") + assert len(X) == len( + y + ), "The length of features must equal the length of targets!"
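# [Editor's note] Illustrative sketch, not part of this patch: the return
# contract of transform(), continuing the hypothetical validator and y_train
# from the sketch above:
X_t, y_t, seq_lens = validator.transform(X=None, y=y_train)
assert X_t is None                    # univariate: no feature frame
assert list(seq_lens) == [10, 7, 12]  # one length per input sequence
assert y_t.index.nunique() == 3       # targets indexed by series id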
+ if self.series_idx is not None and X is None: + raise ValueError("X must be given when series_idx is specified!") + + for seq_idx in range(num_sequences): + sequence_lengths[seq_idx] = len(y[seq_idx]) + npa_sequence_lengths = np.asarray(sequence_lengths) + + y_stacked = self.join_series(y) + + x_transformed, series_number = self._transform_X( + X, npa_sequence_lengths + ) + y_transformed = self.target_validator.transform( + y_stacked, index=series_number + ) + + if self._is_uni_variant: + return None, y_transformed, npa_sequence_lengths + + return x_transformed, y_transformed, npa_sequence_lengths + elif isinstance(y, (pd.DataFrame, pd.Series)): + if self.series_idx is not None: + if isinstance(y, pd.Series): + y_columns = [y.name] + else: + if isinstance(y.columns, pd.RangeIndex): + y_columns = [f"target_{i}" for i in y.columns] + y.columns = y_columns + y_columns = y.columns + xy = pd.concat([X, y], axis=1) + xy.sort_values(self.feature_validator.series_idx, inplace=True) + + y = xy[y_columns] + X = xy.drop(y_columns, axis=1) + del xy + + x_transformed, series_number = self._transform_X(X, None) + + if self._is_uni_variant: + y_transformed = self.target_validator.transform( + y, series_number + ) + return ( + None, + y_transformed, + y_transformed.index.value_counts(sort=False).values, + ) + + y_transformed = self.target_validator.transform( + y, x_transformed.index + ) + return ( + x_transformed, + y_transformed, + y_transformed.index.value_counts(sort=False).values, + ) + + else: + raise NotImplementedError + + def _transform_X( + self, + X: Optional[Union[List, pd.DataFrame]], + sequence_lengths: Optional[np.ndarray] = None, + ) -> Tuple[pd.DataFrame, Optional[Union[np.ndarray, pd.Index]]]: + if self.series_idx is None: + if self._is_uni_variant: + x_transformed = None + if sequence_lengths is not None: + series_number = np.arange(len(sequence_lengths)).repeat( + sequence_lengths + ) + else: + series_number = None + else: + if isinstance(X, List): + assert sequence_lengths is not None + series_number = np.arange(len(sequence_lengths)).repeat( + sequence_lengths + ) + x_stacked = self.join_series(X) + x_transformed = self.feature_validator.transform( + x_stacked, index=series_number + ) + elif isinstance(X, pd.DataFrame): + series_number = X.index + x_transformed = self.feature_validator.transform(X) + else: + raise NotImplementedError + else: + if isinstance(X, List): + # In this case X can only contain pd.DataFrame, see ```time_series_feature_validator.py``` + x_stacked = pd.concat(X) + elif isinstance(X, pd.DataFrame): + x_stacked = X + else: + raise NotImplementedError + series_number = df2index(x_stacked[self.series_idx]) + + if not self._is_uni_variant: + x_transformed = self.feature_validator.transform( + x_stacked, index=series_number + ) + else: + x_transformed = None + + return x_transformed, series_number + + @staticmethod + def join_series( + X: List[Union[pd.DataFrame, np.ndarray]], return_seq_lengths: bool = False + ) -> Union[pd.DataFrame, Tuple[pd.DataFrame, List[int]]]: + """Join the list of sequences into one single DataFrame""" + num_sequences = len(X) + sequence_lengths = [0] * num_sequences + for seq_idx in range(num_sequences): + sequence_lengths[seq_idx] = len(X[seq_idx]) + series_number = np.arange(len(sequence_lengths)).repeat(sequence_lengths) + if not isinstance(X, List): + raise ValueError(f"Input must be a list, but it is {type(X)}") + if isinstance(X[0], (pd.DataFrame, pd.Series)): + joint_input = pd.concat(X) + elif isinstance(X[0], (List, np.ndarray)): + joint_input = np.concatenate(X) + else: + raise NotImplementedError(f"Unsupported input type: List[{type(X[0])}]") + joint_input = pd.DataFrame(joint_input) + joint_input.index = series_number + + if return_seq_lengths: + return joint_input, sequence_lengths + else: + return joint_input diff --git a/autoPyTorch/data/time_series_target_validator.py b/autoPyTorch/data/time_series_target_validator.py new file mode 100644 index 000000000..1ade4c361 --- /dev/null +++ b/autoPyTorch/data/time_series_target_validator.py @@ -0,0 +1,82 @@ +import logging +from typing import Optional, Union + + +import numpy as np + +import pandas as pd + +from scipy.sparse import issparse + +from sklearn.base import BaseEstimator + + +from autoPyTorch.data.base_target_validator import SupportedTargetTypes +from autoPyTorch.data.tabular_target_validator import ArrayType, TabularTargetValidator +from autoPyTorch.utils.logging_ import PicklableClientLogger + + +class TimeSeriesTargetValidator(TabularTargetValidator): + def __init__( + self, + is_classification: bool = False, + logger: Optional[Union[PicklableClientLogger, logging.Logger]] = None, + ): + if is_classification: + raise NotImplementedError( + "Classification is currently not supported for forecasting tasks!" + ) + super().__init__(is_classification, logger) + + def fit( + self, + y_train: SupportedTargetTypes, + y_test: Optional[SupportedTargetTypes] = None, + ) -> BaseEstimator: + if issparse(y_train): + # TODO fix this + raise NotImplementedError( + "Sparse targets are unsupported for forecasting tasks!" + ) + return super().fit(y_train, y_test)
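# [Editor's note] Illustrative sketch, not part of this patch: unlike its tabular
# parent, the transform() below returns a DataFrame whose index marks the series
# each target row belongs to. Hypothetical usage:
import numpy as np

tv = TimeSeriesTargetValidator(is_classification=False)
tv.fit(np.arange(6, dtype=float))
y_df = tv.transform(np.arange(6, dtype=float), index=np.array([0, 0, 0, 1, 1, 1]))
print(y_df.index.tolist())  # [0, 0, 0, 1, 1, 1]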
+ def transform( + self, + y: SupportedTargetTypes, + index: Optional[Union[pd.Index, np.ndarray]] = None, + ) -> pd.DataFrame: + """ + Validates and fits a categorical encoder (if needed) to the targets. + The supported data types are List, numpy arrays and pandas DataFrames. + + Args: + y (SupportedTargetTypes) + A set of targets that are going to be encoded if the current task + is classification + index (Optional[Union[pd.Index, np.ndarray]]): + index identifying which series the data belongs to + + Returns: + pd.DataFrame: + The transformed array + """ + y_has_idx = isinstance(y, pd.DataFrame) + if y_has_idx and index is None: + index = y.index # type: ignore[union-attr] + y: ArrayType = super().transform(y) # type: ignore[no-redef] + + if index is None: + if not y_has_idx: + index = np.array([0] * y.shape[0]) # type: ignore[union-attr] + else: + if len(index) != y.shape[0]: # type: ignore[union-attr] + raise ValueError("Index must have the same length as the input targets!") + if y.ndim == 1: # type: ignore[union-attr] + y = np.expand_dims(y, -1) + y: pd.DataFrame = pd.DataFrame(y) # type: ignore[no-redef] + y.index = index # type: ignore + return y + + @property + def allow_missing_values(self) -> bool: + return True diff --git a/autoPyTorch/data/utils.py b/autoPyTorch/data/utils.py new file mode 100644 index 000000000..20ad5612e --- /dev/null +++ b/autoPyTorch/data/utils.py @@ -0,0 +1,567 @@ +# Implementation used from https://github.com/automl/auto-sklearn/blob/development/autosklearn/util/data.py +import warnings +from typing import ( + Any, + Dict, + Iterator, + List, + Mapping, + Optional, + Sequence, + Tuple, + Type, + Union, + cast +) + +import numpy as np + +import pandas as pd + +from scipy.sparse import issparse, spmatrix + +from sklearn.model_selection import StratifiedShuffleSplit, train_test_split +from sklearn.model_selection._split import _validate_shuffle_split +from sklearn.utils import _approximate_mode, check_random_state +from sklearn.utils.validation import _num_samples, check_array + +from autoPyTorch.data.base_target_validator import SupportedTargetTypes +from autoPyTorch.utils.common import ispandas + +# TODO: TypedDict with python 3.8 +# +# When upgrading to python 3.8 as minimum version, this should be a TypedDict +# so that mypy can identify the field types +DatasetCompressionSpec = Dict[str, Union[int, float, List[str]]] +DatasetDTypeContainerType = Union[Type, Dict[str, Type]] +DatasetCompressionInputType = Union[np.ndarray, spmatrix, pd.DataFrame] + +# Default specification for arg `dataset_compression` +default_dataset_compression_arg: DatasetCompressionSpec = { + "memory_allocation": 0.1, + "methods": ["precision", "subsample"] +} + + +class CustomStratifiedShuffleSplit(StratifiedShuffleSplit): + """Splitter that deals with classes with too few samples""" + + def _iter_indices(self, X, y, groups=None): # type: ignore + n_samples = _num_samples(X) + y = check_array(y, ensure_2d=False, dtype=None) + n_train, n_test = _validate_shuffle_split( + n_samples, + self.test_size, + self.train_size, + default_test_size=self._default_test_size, + ) + + if y.ndim == 2: + # for multi-label y, map each distinct row to a string repr + # using join because str(row) uses an ellipsis if len(row) > 1000 + y = np.array([" ".join(row.astype("str")) for row in y]) + + classes, y_indices = np.unique(y, return_inverse=True) + n_classes = classes.shape[0] + + class_counts = np.bincount(y_indices) + + if n_train < n_classes: + raise ValueError( + "The train_size = %d should be greater or " + "equal to the number of classes = %d" % (n_train, n_classes) + ) + if n_test < n_classes: + raise ValueError( + "The test_size = %d should be greater or " + "equal to the number of classes = %d" % (n_test, n_classes) + ) + + # Find the sorted list of instances for each class: + # (np.unique above performs a sort, so code is O(n log n) already)
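# [Editor's note] Illustrative sketch, not part of this patch: how the
# argsort + cumsum-split below yields one index array per class. Toy labels:
import numpy as np

y_indices = np.array([1, 0, 1, 2, 0, 1])  # encoded class of each sample
class_counts = np.bincount(y_indices)     # [2, 3, 1]
class_indices = np.split(
    np.argsort(y_indices, kind="mergesort"), np.cumsum(class_counts)[:-1]
)
print([a.tolist() for a in class_indices])  # [[1, 4], [0, 2, 5], [3]]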
+ class_indices = np.split( + np.argsort(y_indices, kind="mergesort"), np.cumsum(class_counts)[:-1] + ) + + rng = check_random_state(self.random_state) + + for _ in range(self.n_splits): + # if there are ties in the class-counts, we want + # to make sure to break them anew in each iteration + n_i = _approximate_mode(class_counts, n_train, rng) + class_counts_remaining = class_counts - n_i + t_i = _approximate_mode(class_counts_remaining, n_test, rng) + train = [] + test = [] + + # NOTE: Adapting for unique instances + # + # Each list n_i, t_i represents the per-class sample counts in the + # training_set and test_set respectively. + # + # n_i = [100, 100, 0, 3] # 100 of class '0', 0 of class '2' + # t_i = [300, 300, 1, 3] # 300 of class '0', 1 of class '2' + # + # To support unique labels such as class '2', which only has one sample + # between both n_i and t_i, we need to make sure that n_i has at least + # one sample of all classes. There is also the extra check to ensure + # that the sizes stay the same. + # + # n_i = [ 99, 100, 1, 3] # 100 of class '0', 0 of class '2' + # | ^ + # v | + # t_i = [301, 300, 0, 3] # 300 of class '0', 1 of class '2' + # + for i, class_count in enumerate(n_i): + if class_count == 0: + t_i[i] -= 1 + n_i[i] += 1 + + j = np.argmax(n_i) + if n_i[j] == 1: + warnings.warn( + "Can't respect size requirements for split." + " The training set must contain all of the unique" + " labels that exist in the dataset." + ) + else: + n_i[j] -= 1 + t_i[j] += 1 + + for i in range(n_classes): + permutation = rng.permutation(class_counts[i]) + perm_indices_class_i = class_indices[i].take(permutation, mode="clip") + + train.extend(perm_indices_class_i[: n_i[i]]) + test.extend(perm_indices_class_i[n_i[i]: n_i[i] + t_i[i]]) + + train = rng.permutation(train) + test = rng.permutation(test) + + yield train, test + + +def get_dataset_compression_mapping( + memory_limit: int, + dataset_compression: Union[bool, Mapping[str, Any]] +) -> Optional[DatasetCompressionSpec]: + """ + Internal function to get value for `BaseTask._dataset_compression` + based on the value of `dataset_compression` passed. + + If True, it returns the default_dataset_compression_arg. In case + of a mapping, it is validated and returned as a `DatasetCompressionSpec`. + + If False, it returns None. + + Args: + memory_limit (int): + memory limit of the current search. + dataset_compression (Union[bool, Mapping[str, Any]]): + mapping passed to the `search` function. + + Returns: + Optional[DatasetCompressionSpec]: + Validated data compression spec or None. + """ + dataset_compression_mapping: Optional[Mapping[str, Any]] = None + + if not isinstance(dataset_compression, bool): + dataset_compression_mapping = dataset_compression + elif dataset_compression: + dataset_compression_mapping = default_dataset_compression_arg + + if dataset_compression_mapping is not None: + dataset_compression_mapping = validate_dataset_compression_arg( + dataset_compression_mapping, memory_limit=memory_limit) + + return dataset_compression_mapping + + +def validate_dataset_compression_arg( + dataset_compression: Mapping[str, Any], + memory_limit: int +) -> DatasetCompressionSpec: + """Validate and return a correct dataset_compression argument + + The returned value can be safely used with `reduce_dataset_size_if_too_large`.
+ + Args: + dataset_compression: Mapping[str, Any] + The arguments to validate + + Returns: + DatasetCompressionSpec + The validated and correct dataset compression spec + """ + if not isinstance(dataset_compression, Mapping): + raise ValueError( + f"Unknown type for `dataset_compression` {type(dataset_compression)}" + f"\ndataset_compression = {dataset_compression}" + ) + + # Fill with defaults if they don't exist + dataset_compression = { + **default_dataset_compression_arg, + **dataset_compression + } + + # Must contain known keys + if set(dataset_compression.keys()) != set(default_dataset_compression_arg.keys()): + raise ValueError( + f"Unknown key in dataset_compression, {list(dataset_compression.keys())}." + f"\nPossible keys are {list(default_dataset_compression_arg.keys())}" + ) + + memory_allocation = dataset_compression["memory_allocation"] + + # "memory_allocation" must be float or int + if not (isinstance(memory_allocation, float) or isinstance(memory_allocation, int)): + raise ValueError( + "key 'memory_allocation' must be an `int` or `float`" + f"\ntype = {memory_allocation}" + f"\ndataset_compression = {dataset_compression}" + ) + + # "memory_allocation" if absolute, should be > 0 and < memory_limit + if isinstance(memory_allocation, int) and not (0 < memory_allocation < memory_limit): + raise ValueError( + f"key 'memory_allocation' if int must be in (0, memory_limit={memory_limit})" + f"\nmemory_allocation = {memory_allocation}" + f"\ndataset_compression = {dataset_compression}" + ) + + # "memory_allocation" must be in (0,1) if float + if isinstance(memory_allocation, float): + if not (0.0 < memory_allocation < 1.0): + raise ValueError( + "key 'memory_allocation' if float must be in (0, 1)" + f"\nmemory_allocation = {memory_allocation}" + f"\ndataset_compression = {dataset_compression}" + ) + # convert to required memory so we can directly use + dataset_compression["memory_allocation"] = memory_allocation * memory_limit + + # "methods" must be non-empty sequence + if ( + not isinstance(dataset_compression["methods"], Sequence) + or len(dataset_compression["methods"]) <= 0 + ): + raise ValueError( + "key 'methods' must be a non-empty list" + f"\nmethods = {dataset_compression['methods']}" + f"\ndataset_compression = {dataset_compression}" + ) + + # "methods" must contain known methods + if any( + method not in cast(Sequence, default_dataset_compression_arg["methods"]) # mypy + for method in dataset_compression["methods"] + ): + raise ValueError( + f"key 'methods' can only contain {default_dataset_compression_arg['methods']}" + f"\nmethods = {dataset_compression['methods']}" + f"\ndataset_compression = {dataset_compression}" + ) + + return cast(DatasetCompressionSpec, dataset_compression)
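# [Editor's note] Illustrative sketch, not part of this patch: a fractional
# "memory_allocation" is rescaled to an absolute MB budget against memory_limit
# (values below are hypothetical):
spec = validate_dataset_compression_arg({"memory_allocation": 0.2}, memory_limit=4096)
print(spec)  # {'memory_allocation': 819.2, 'methods': ['precision', 'subsample']}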
+ + +class _DtypeReductionMapping(Mapping): + """ + Unfortunately, mappings compare by hash(item) and not the __eq__ operator + between the key and the item. + + Hence we wrap the dict in a Mapping class and implement our own __getitem__ + such that we do use __eq__ between keys and query items. + + >>> np.float32 == dtype('float32') # True, they are considered equal + >>> + >>> mydict = { np.float32: 'hello' } + >>> + >>> # Equal by __eq__ but dict operations fail + >>> np.dtype('float32') in mydict # False + >>> mydict[dtype('float32')] # KeyError + + This mapping class fixes that by supporting the `in` operator as well as `__getitem__` + + >>> reduction_mapping = _DtypeReductionMapping() + >>> + >>> reduction_mapping[np.dtype('float64')] # np.float32 + >>> np.dtype('float32') in reduction_mapping # True + """ + + # Information about dtype support + _mapping: Dict[type, type] = { + np.float32: np.float32, + np.float64: np.float32, + np.int32: np.int32, + np.int64: np.int32 + } + + # In spite of the names, np.float96 and np.float128 + # provide only as much precision as np.longdouble, + # that is, 80 bits on most x86 machines and 64 bits + # in standard Windows builds. + _mapping.update({getattr(np, s): np.float64 for s in ['float96', 'float128'] if hasattr(np, s)}) + + @classmethod + def __getitem__(cls, item: type) -> type: + for k, v in cls._mapping.items(): + if k == item: + return v + raise KeyError(item) + + @classmethod + def __iter__(cls) -> Iterator[type]: + return iter(cls._mapping.keys()) + + @classmethod + def __len__(cls) -> int: + return len(cls._mapping) + + +reduction_mapping = _DtypeReductionMapping() +supported_precision_reductions = list(reduction_mapping) + + +def reduce_precision( + X: DatasetCompressionInputType +) -> Tuple[DatasetCompressionInputType, DatasetDTypeContainerType, DatasetDTypeContainerType]: + """ Reduce the precision of a dataset containing floats or ints + + Note: + For dataframe, the column's precision is reduced using pd.to_numeric. + + Args: + X (DatasetCompressionInputType): + The data to reduce precision of. + + Returns: + Tuple[DatasetCompressionInputType, DatasetDTypeContainerType, DatasetDTypeContainerType] + Returns the reduced data X along with its original dtypes and the dtypes it was reduced to. + """ + reduced_dtypes: Optional[DatasetDTypeContainerType] = None + if isinstance(X, np.ndarray) or issparse(X): + dtypes = X.dtype + if X.dtype not in supported_precision_reductions: + raise ValueError(f"X.dtype = {X.dtype} not equal to any supported" + f" {supported_precision_reductions}") + reduced_dtypes = reduction_mapping[X.dtype] + X = X.astype(reduced_dtypes) + + elif ispandas(X): + dtypes = dict(X.dtypes) + + col_names = X.dtypes.index + + float_cols = col_names[[dt.name.startswith("float") for dt in X.dtypes.values]] + int_cols = col_names[[dt.name.startswith("int") for dt in X.dtypes.values]] + X[int_cols] = X[int_cols].apply(lambda column: pd.to_numeric(column, downcast='integer')) + X[float_cols] = X[float_cols].apply(lambda column: pd.to_numeric(column, downcast='float')) + + reduced_dtypes = dict(X.dtypes) + else: + raise ValueError(f"Unrecognised data type of X, expected data type to " + f"be in (np.ndarray, spmatrix, pd.DataFrame), but got: {type(X)}") + + return X, reduced_dtypes, dtypes + + +def subsample( + X: DatasetCompressionInputType, + is_classification: bool, + sample_size: Union[float, int], + y: Optional[SupportedTargetTypes] = None, + random_state: Optional[Union[int, np.random.RandomState]] = None, +) -> Tuple[DatasetCompressionInputType, SupportedTargetTypes]: + """Subsamples data returning the same type as it received. + + If `is_classification`, we split using a stratified shuffle split which + preserves unique labels in the training set. + + NOTE: + It's highly inadvisable to use lists here.
In order to preserve types, + we convert to a numpy array and then back to a list. + + NOTE2: + Interestingly enough, StratifiedShuffleSplit and descendants don't support + sparse `y` in `split(): _check_array` call. Hence, neither do we. + + Args: + X: DatasetCompressionInputType + The X's to subsample + y: SupportedTargetTypes + The Y's to subsample + is_classification: bool + Whether this is classification data or regression data. Required for + knowing how to split. + sample_size: float | int + If float, the fraction of data to take; otherwise if int, an absolute + count of samples to take. + random_state: int | RandomState = None + The random state to pass to the splitter + + Returns: + (DatasetCompressionInputType, SupportedTargetTypes) + The X and y subsampled according to sample_size + """ + + if isinstance(X, List): + X = np.asarray(X) + if isinstance(y, List): + y = np.asarray(y) + + if is_classification and y is not None: + splitter = CustomStratifiedShuffleSplit( + train_size=sample_size, random_state=random_state + ) + indices_to_keep, _ = next(splitter.split(X=X, y=y)) + X, y = _subsample_by_indices(X, y, indices_to_keep) + + elif y is None: + X, _ = train_test_split( # type: ignore + X, + train_size=sample_size, + random_state=random_state, + ) + else: + X, _, y, _ = train_test_split( # type: ignore + X, + y, + train_size=sample_size, + random_state=random_state, + ) + + return X, y + + +def _subsample_by_indices( + X: DatasetCompressionInputType, + y: SupportedTargetTypes, + indices_to_keep: np.ndarray +) -> Tuple[DatasetCompressionInputType, SupportedTargetTypes]: + """ + subsample data by given indices + """ + if ispandas(X): + idxs = X.index[indices_to_keep] + X = X.loc[idxs] + else: + X = X[indices_to_keep] + + if ispandas(y): + # Ignoring types as mypy does not infer y as dataframe. + idxs = y.index[indices_to_keep] # type: ignore [index] + y = y.loc[idxs] # type: ignore [union-attr] + else: + y = y[indices_to_keep] + return X, y + + +def megabytes(arr: DatasetCompressionInputType) -> float: + + if isinstance(arr, np.ndarray): + memory_in_bytes = arr.nbytes + elif issparse(arr): + memory_in_bytes = arr.data.nbytes + elif ispandas(arr): + memory_in_bytes = arr.memory_usage(index=True, deep=True).sum() + else: + raise ValueError(f"Unrecognised data type of X, expected data type to " + f"be in (np.ndarray, spmatrix, pd.DataFrame) but got: {type(arr)}") + + return float(memory_in_bytes / (2**20))
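# [Editor's note] Illustrative sketch, not part of this patch: megabytes() and
# subsample() combined, the same way reduce_dataset_size_if_too_large uses them.
import numpy as np

X = np.zeros((4096, 64), dtype=np.float64)  # 4096 * 64 * 8 bytes = 2.0 MB
print(megabytes(X))                         # 2.0
X_small, _ = subsample(
    X, is_classification=False, sample_size=0.5, y=None, random_state=0
)
print(megabytes(X_small))                   # 1.0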
+ + +def reduce_dataset_size_if_too_large( + X: DatasetCompressionInputType, + memory_allocation: Union[int, float], + is_classification: bool, + random_state: Union[int, np.random.RandomState], + y: Optional[SupportedTargetTypes] = None, + methods: List[str] = ['precision', 'subsample'], +) -> DatasetCompressionInputType: + f""" Reduces the size of the dataset if it's too close to the memory limit. + + Follows the order of the operations passed in and retains the type of its + input. + + Precision reduction will only work on the following data types: + - {supported_precision_reductions} + + Precision reduction will only perform one level of precision reduction. + Technically, you could supply multiple rounds of precision reduction, i.e. + to reduce np.float128 to np.float32 you could use `methods = ['precision'] * 2`. + + However, if that's the use case, it'd be advised to simply use the function + `autoPyTorch.data.utils.reduce_precision`. + + Args: + X: DatasetCompressionInputType + The features of the dataset. + + methods (List[str] = ['precision', 'subsample']): + A list of operations that are permitted to be performed to reduce + the size of the dataset. + + **precision** + + Reduce the precision of float types + + **subsample** + Reduce the number of samples of the dataset such that it fits into the allocated + memory. Ensures stratification and that unique labels are present + + + memory_allocation (Union[int, float]): + The amount of memory to allocate to the dataset. It should specify an + absolute amount. + + Returns: + DatasetCompressionInputType + The reduced X if reductions were needed + """ + + for method in methods: + if megabytes(X) <= memory_allocation: + break + + if method == 'precision': + # If the dataset is too big for the allocated memory, + # we then try to reduce the precision if it's a high precision dataset + X, reduced_dtypes, dtypes = reduce_precision(X) + warnings.warn( + f'Dataset too large for allocated memory {memory_allocation}MB, ' + f'reduced the precision from {dtypes} to {reduced_dtypes}', + ) + elif method == "subsample": + # If the dataset is still too big such that we couldn't fit + # into the allocated memory, we subsample it so that it does + + n_samples_before = X.shape[0] + sample_percentage = memory_allocation / megabytes(X) + + # NOTE: type ignore + # + # Tried the generic `def subsample(X: T) -> T` approach but it was + # failing elsewhere, keeping it simple for now + X, y = subsample( # type: ignore + X, + y=y, + sample_size=sample_percentage, + is_classification=is_classification, + random_state=random_state, + ) + + n_samples_after = X.shape[0] + warnings.warn( + f"Dataset too large for allocated memory {memory_allocation}MB," + f" reduced number of samples from {n_samples_before} to" + f" {n_samples_after}." + ) + + else: + raise ValueError(f"Unknown operation `{method}`") + + return X, y diff --git a/autoPyTorch/datasets/base_dataset.py b/autoPyTorch/datasets/base_dataset.py index a3838007a..bd50cdbd6 100644 --- a/autoPyTorch/datasets/base_dataset.py +++ b/autoPyTorch/datasets/base_dataset.py @@ -21,12 +21,16 @@ DEFAULT_RESAMPLING_PARAMETERS, HoldOutFunc, HoldOutFuncs, - HoldoutValTypes + HoldoutValTypes, + NoResamplingFunc, + NoResamplingFuncs, + NoResamplingStrategyTypes, + ResamplingStrategies ) -from autoPyTorch.utils.common import FitRequirement +from autoPyTorch.utils.common import FitRequirement, ispandas BaseDatasetInputType = Union[Tuple[np.ndarray, np.ndarray], Dataset] -BaseDatasetPropertiesType = Union[int, float, str, List, bool] +BaseDatasetPropertiesType = Union[int, float, str, List, bool, Tuple] def check_valid_data(data: Any) -> None: @@ -45,6 +49,36 @@ def type_check(train_tensors: BaseDatasetInputType, check_valid_data(val_tensors[i]) +def _get_output_properties(train_tensors: BaseDatasetInputType) -> Tuple[int, str]: + """ + Return the output dimension and output type given the training targets. + + Args: + train_tensors (BaseDatasetInputType): + Training data. + + Returns: + output_dim (int): + The dimension of outputs. + output_type (str): + The output type according to sklearn specification.
+ """ + if isinstance(train_tensors, Dataset): + target_labels = np.array([sample[-1] for sample in train_tensors]) + else: + target_labels = np.array(train_tensors[1]) + + output_type: str = type_of_target(target_labels) + if STRING_TO_OUTPUT_TYPES[output_type] in CLASSIFICATION_OUTPUTS: + output_dim = len(np.unique(target_labels)) + elif target_labels.ndim > 1: + output_dim = target_labels.shape[-1] + else: + output_dim = 1 + + return output_dim, output_type + + class TransformSubset(Subset): """Wrapper of BaseDataset for splitted datasets @@ -78,7 +112,7 @@ def __init__( dataset_name: Optional[str] = None, val_tensors: Optional[BaseDatasetInputType] = None, test_tensors: Optional[BaseDatasetInputType] = None, - resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation, + resampling_strategy: ResamplingStrategies = HoldoutValTypes.holdout_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, shuffle: Optional[bool] = True, seed: Optional[int] = 42, @@ -95,8 +129,7 @@ def __init__( validation data test_tensors (An optional tuple of objects that have a __len__ and a __getitem__ attribute): test data - resampling_strategy (Union[CrossValTypes, HoldoutValTypes]), - (default=HoldoutValTypes.holdout_validation): + resampling_strategy (RESAMPLING_STRATEGIES: default=HoldoutValTypes.holdout_validation): strategy to split the training data. resampling_strategy_args (Optional[Dict[str, Any]]): arguments required for the chosen resampling strategy. If None, uses @@ -109,16 +142,18 @@ def __init__( val_transforms (Optional[torchvision.transforms.Compose]): Additional Transforms to be applied to the validation/test data """ - self.dataset_name = dataset_name - if self.dataset_name is None: + if dataset_name is None: self.dataset_name = str(uuid.uuid1(clock_seq=os.getpid())) + else: + self.dataset_name = dataset_name if not hasattr(train_tensors[0], 'shape'): type_check(train_tensors, val_tensors) self.train_tensors, self.val_tensors, self.test_tensors = train_tensors, val_tensors, test_tensors self.cross_validators: Dict[str, CrossValFunc] = {} self.holdout_validators: Dict[str, HoldOutFunc] = {} + self.no_resampling_validators: Dict[str, NoResamplingFunc] = {} self.random_state = np.random.RandomState(seed=seed) self.shuffle = shuffle self.resampling_strategy = resampling_strategy @@ -127,15 +162,7 @@ def __init__( self.issparse: bool = issparse(self.train_tensors[0]) self.input_shape: Tuple[int] = self.train_tensors[0].shape[1:] if len(self.train_tensors) == 2 and self.train_tensors[1] is not None: - self.output_type: str = type_of_target(self.train_tensors[1]) - - if ( - self.output_type in STRING_TO_OUTPUT_TYPES - and STRING_TO_OUTPUT_TYPES[self.output_type] in CLASSIFICATION_OUTPUTS - ): - self.output_shape = len(np.unique(self.train_tensors[1])) - else: - self.output_shape = self.train_tensors[1].shape[-1] if self.train_tensors[1].ndim > 1 else 1 + self.output_shape, self.output_type = _get_output_properties(self.train_tensors) # TODO: Look for a criteria to define small enough to preprocess self.is_small_preprocess = True @@ -143,6 +170,8 @@ def __init__( # Make sure cross validation splits are created once self.cross_validators = CrossValFuncs.get_cross_validators(*CrossValTypes) self.holdout_validators = HoldOutFuncs.get_holdout_validators(*HoldoutValTypes) + self.no_resampling_validators = NoResamplingFuncs.get_no_resampling_validators(*NoResamplingStrategyTypes) + self.splits = self.get_splits_from_resampling_strategy() # We also need 
to be able to transform the data, be it for pre-processing @@ -191,7 +220,7 @@ def __getitem__(self, index: int, train: bool = True) -> Tuple[np.ndarray, ...]: A transformed single point prediction """ - X = self.train_tensors[0].iloc[[index]] if hasattr(self.train_tensors[0], 'loc') \ + X = self.train_tensors[0].iloc[[index]] if ispandas(self.train_tensors[0]) \ else self.train_tensors[0][index] if self.train_transform is not None and train: @@ -210,7 +239,7 @@ def __len__(self) -> int: def _get_indices(self) -> np.ndarray: return self.random_state.permutation(len(self)) if self.shuffle else np.arange(len(self)) - def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], List[int]]]: + def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], Optional[List[int]]]]: """ Creates a set of splits based on a resampling strategy provided @@ -241,6 +270,9 @@ def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], List[int] num_splits=cast(int, num_splits), ) ) + elif isinstance(self.resampling_strategy, NoResamplingStrategyTypes): + splits.append((self.no_resampling_validators[self.resampling_strategy.name](self.random_state, + self._get_indices()), None)) else: raise ValueError(f"Unsupported resampling strategy={self.resampling_strategy}") return splits @@ -312,7 +344,7 @@ def create_holdout_val_split( self.random_state, val_share, self._get_indices(), **kwargs) return train, val - def get_dataset_for_training(self, split_id: int) -> Tuple[Dataset, Dataset]: + def get_dataset(self, split_id: int, train: bool) -> Dataset: """ The above split methods employ the Subset to internally subsample the whole dataset. @@ -320,14 +352,21 @@ def get_dataset_for_training(self, split_id: int) -> Tuple[Dataset, Dataset]: to provide training data to fit a pipeline Args: - split (int): The desired subset of the dataset to split and use + split_id (int): which split id to get from the splits + train (bool): whether the dataset is required for training or evaluating. Returns: Dataset: the reduced dataset to be used for testing """ # Subset creates a dataset. 
Splits is a (train_indices, test_indices) tuple - return (TransformSubset(self, self.splits[split_id][0], train=True), - TransformSubset(self, self.splits[split_id][1], train=False)) + if split_id >= len(self.splits): # old version: split_id > len(self.splits) + raise IndexError(f"self.splits index out of range, got split_id={split_id}" + f" (>= num_splits={len(self.splits)})") + indices = self.splits[split_id][int(not train)] # 0: for training, 1: for evaluation + if indices is None: + raise ValueError("Specified fold (or subset) does not exist") + + return TransformSubset(self, indices, train=train) def replace_data(self, X_train: BaseDatasetInputType, X_test: Optional[BaseDatasetInputType]) -> 'BaseDataset': diff --git a/autoPyTorch/datasets/image_dataset.py b/autoPyTorch/datasets/image_dataset.py index 9da55ebc0..74b79db15 100644 --- a/autoPyTorch/datasets/image_dataset.py +++ b/autoPyTorch/datasets/image_dataset.py @@ -24,6 +24,7 @@ from autoPyTorch.datasets.resampling_strategy import ( CrossValTypes, HoldoutValTypes, + NoResamplingStrategyTypes ) IMAGE_DATASET_INPUT = Union[Dataset, Tuple[Union[np.ndarray, List[str]], np.ndarray]] @@ -39,7 +40,7 @@ class ImageDataset(BaseDataset): validation data test (Union[Dataset, Tuple[Union[np.ndarray, List[str]], np.ndarray]]): testing data - resampling_strategy (Union[CrossValTypes, HoldoutValTypes]), + resampling_strategy (Union[CrossValTypes, HoldoutValTypes, NoResamplingStrategyTypes]), (default=HoldoutValTypes.holdout_validation): strategy to split the training data. resampling_strategy_args (Optional[Dict[str, Any]]): arguments @@ -57,7 +58,9 @@ def __init__(self, train: IMAGE_DATASET_INPUT, val: Optional[IMAGE_DATASET_INPUT] = None, test: Optional[IMAGE_DATASET_INPUT] = None, - resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation, + resampling_strategy: Union[CrossValTypes, + HoldoutValTypes, + NoResamplingStrategyTypes] = HoldoutValTypes.holdout_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, shuffle: Optional[bool] = True, seed: Optional[int] = 42, diff --git a/autoPyTorch/datasets/resampling_strategy.py b/autoPyTorch/datasets/resampling_strategy.py index 86e0ec733..4f373bf24 100644 --- a/autoPyTorch/datasets/resampling_strategy.py +++ b/autoPyTorch/datasets/resampling_strategy.py @@ -16,6 +16,13 @@ # Use callback protocol as workaround, since callable with function fields count 'self' as argument +class NoResamplingFunc(Protocol): + def __call__(self, + random_state: np.random.RandomState, + indices: np.ndarray) -> np.ndarray: + ... + + class CrossValFunc(Protocol): def __call__(self, random_state: np.random.RandomState, @@ -32,6 +39,21 @@ def __call__(self, random_state: np.random.RandomState, val_share: float, ... +def holdout_split_forecasting(holdout: TimeSeriesSplit, indices: np.ndarray, n_prediction_steps: int, + n_repeats: int = 1) -> Tuple[np.ndarray, np.ndarray]: + """ + A function that does a holdout split without raising an error: when the target sequence is too short to be split + into training and validation sets, we simply leave the training set empty and only consider the validation set.
+ """ + try: + train, val = list(holdout.split(indices))[-1] + val = [val[-1 - i * n_prediction_steps] for i in reversed(range(n_repeats))] + except (ValueError, IndexError): + train = np.array([], dtype=indices.dtype) + val = [-1] + return indices[train], indices[val] + + class CrossValTypes(IntEnum): """The type of cross validation @@ -58,6 +80,7 @@ class CrossValTypes(IntEnum): stratified_shuffle_split_cross_validation = 3 shuffle_split_cross_validation = 4 time_series_cross_validation = 5 + time_series_ts_cross_validation = 6 def is_stratified(self) -> bool: stratified = [self.stratified_k_fold_cross_validation, @@ -68,24 +91,38 @@ def is_stratified(self) -> bool: class HoldoutValTypes(IntEnum): """TODO: change to enum using functools.partial""" """The type of hold out validation (refer to CrossValTypes' doc-string)""" - holdout_validation = 6 - stratified_holdout_validation = 7 + holdout_validation = 11 + stratified_holdout_validation = 12 + time_series_hold_out_validation = 13 def is_stratified(self) -> bool: stratified = [self.stratified_holdout_validation] return getattr(self, self.name) in stratified +class NoResamplingStrategyTypes(IntEnum): + no_resampling = 8 + + def is_stratified(self) -> bool: + return False + + # TODO: replace it with another way -RESAMPLING_STRATEGIES = [CrossValTypes, HoldoutValTypes] +ResamplingStrategies = Union[CrossValTypes, HoldoutValTypes, NoResamplingStrategyTypes] -DEFAULT_RESAMPLING_PARAMETERS: Dict[Union[HoldoutValTypes, CrossValTypes], Dict[str, Any]] = { +DEFAULT_RESAMPLING_PARAMETERS: Dict[ + ResamplingStrategies, + Dict[str, Any] +] = { HoldoutValTypes.holdout_validation: { 'val_share': 0.33, }, HoldoutValTypes.stratified_holdout_validation: { 'val_share': 0.33, }, + HoldoutValTypes.time_series_hold_out_validation: { + 'val_share': 0.2 + }, CrossValTypes.k_fold_cross_validation: { 'num_splits': 5, }, @@ -96,8 +133,12 @@ def is_stratified(self) -> bool: 'num_splits': 5, }, CrossValTypes.time_series_cross_validation: { - 'num_splits': 5, + 'num_splits': 3, }, + CrossValTypes.time_series_ts_cross_validation: { + 'num_splits': 2 + }, + NoResamplingStrategyTypes.no_resampling: {} } @@ -125,6 +166,32 @@ def stratified_holdout_validation(random_state: np.random.RandomState, random_state=random_state) return train, val + @staticmethod + def time_series_hold_out_validation(random_state: np.random.RandomState, + val_share: float, indices: np.ndarray, **kwargs: Any) \ + -> Tuple[np.ndarray, np.ndarray]: + """ + Return holdout indices respecting hte temporal ordering of the data + Args: + val_share: + indices: List of all possible indices + **kwargs: + + Returns: + """ + n_prediction_steps = kwargs['n_prediction_steps'] + n_repeats = kwargs['n_repeats'] + # Time Series prediction only requires on set of prediction for each + # This implement needs to be combined with time series forecasting dataloader, where each time an entire + # time series is used for prediction + cv = TimeSeriesSplit(n_splits=2, test_size=1 + n_prediction_steps * (n_repeats - 1), gap=n_prediction_steps - 1) + + train, val = holdout_split_forecasting(holdout=cv, + indices=indices, + n_prediction_steps=n_prediction_steps, + n_repeats=n_repeats) + return train, val + @classmethod def get_holdout_validators(cls, *holdout_val_types: HoldoutValTypes) -> Dict[str, HoldOutFunc]: @@ -202,6 +269,9 @@ def time_series_cross_validation(random_state: np.random.RandomState, Args: indices (np.ndarray): array of indices to be split num_splits (int): number of cross validation splits + 
n_prediction_steps(int): forecsting horizon, to ensure that there is no overlapping between splits + n_repeats (int): number of sequences inside each split, e.g., inside each split, we could ask the model to + predict n_reapet times Returns: splits (List[Tuple[List, List]]): list of tuples of training and validation indices @@ -214,8 +284,68 @@ def time_series_cross_validation(random_state: np.random.RandomState, ([0, 1, 2], [3])] """ - cv = TimeSeriesSplit(n_splits=num_splits, random_state=random_state) - splits = list(cv.split(indices)) + test_size = kwargs['n_prediction_steps'] + n_repeats = kwargs['n_repeats'] + cv = TimeSeriesSplit(n_splits=num_splits, test_size=test_size * n_repeats, gap=0) + splits = [( + indices[split[0]], + indices[split[1][[-1 - n * test_size for n in reversed(range(n_repeats))]]]) for split in cv.split(indices)] + return splits + + @staticmethod + def time_series_ts_cross_validation(random_state: np.random.RandomState, + num_splits: int, + indices: np.ndarray, + **kwargs: Any + ) -> List[Tuple[np.ndarray, np.ndarray]]: + """ + A special sort of Time series cross validator: it could be considered as a mixture of two sorts of holdout set: + The first holdout setting: trend setting, simply consider the tail of the sequence as validation sets and the + part before as training set + The second holdout setting: seasonality setting, ensures that the distance between validation sets and test sets + is a multiple of seasonality period. We could thus ensure that validation and test sets are at the same + position of the period + + Args: + indices (np.ndarray): array of indices to be split + num_splits (int): number of cross validation splits + seasonality_h_value (int): distance between the start of the validation set and the test set, this value + need to be (roughly) a multiple of freq_value + + Returns: + splits (List[Tuple[List, List]]): list of tuples of training and validation indices + """ + n_prediction_steps = kwargs['n_prediction_steps'] + seasonality_h_value = kwargs['seasonality_h_value'] + n_repeats = kwargs["n_repeats"] + + assert seasonality_h_value >= n_prediction_steps + cv = TimeSeriesSplit(n_splits=2, test_size=n_prediction_steps * n_repeats, gap=0) + + train_t, val_t = holdout_split_forecasting(holdout=cv, + indices=indices, + n_prediction_steps=n_prediction_steps, + n_repeats=n_repeats) + + splits = [(train_t, val_t)] + if num_splits > 1: + cv = TimeSeriesSplit(n_splits=2, test_size=seasonality_h_value * n_repeats, gap=0) + n_tail = - n_prediction_steps + for i_split in range(1, num_splits): + n_tail += seasonality_h_value + if n_tail > len(indices): + # normally this should not happen as seasonality_h_value is carefully computed by ForecastingDataset + indices_split = indices + else: + indices_split = indices[:-n_tail] + train_s, val_s = holdout_split_forecasting(cv, indices_split, + n_prediction_steps=seasonality_h_value, + n_repeats=n_repeats) + if len(train_s) > 0: + train_s = np.concatenate( + [train_s, np.arange(seasonality_h_value - n_prediction_steps) + train_s[-1] + 1] + ) + splits.append((train_s, val_s)) return splits @classmethod @@ -225,3 +355,30 @@ def get_cross_validators(cls, *cross_val_types: CrossValTypes) -> Dict[str, Cros for cross_val_type in cross_val_types } return cross_validators + + +class NoResamplingFuncs(): + @classmethod + def get_no_resampling_validators(cls, *no_resampling_types: NoResamplingStrategyTypes + ) -> Dict[str, NoResamplingFunc]: + no_resampling_strategies: Dict[str, NoResamplingFunc] = { + 
no_resampling_type.name: getattr(cls, no_resampling_type.name) + for no_resampling_type in no_resampling_types + } + return no_resampling_strategies + + @staticmethod + def no_resampling(random_state: np.random.RandomState, + indices: np.ndarray) -> np.ndarray: + """ + Returns the indices without performing + any operation on them. To be used for + fitting on the whole dataset. + This strategy is not compatible with + HPO search. + Args: + indices: array of indices + Returns: + np.ndarray: array of indices + """ + return indices diff --git a/autoPyTorch/datasets/tabular_dataset.py b/autoPyTorch/datasets/tabular_dataset.py index c2e229868..6cabfe525 100644 --- a/autoPyTorch/datasets/tabular_dataset.py +++ b/autoPyTorch/datasets/tabular_dataset.py @@ -21,6 +21,7 @@ from autoPyTorch.datasets.resampling_strategy import ( CrossValTypes, HoldoutValTypes, + NoResamplingStrategyTypes ) @@ -32,11 +33,11 @@ class TabularDataset(BaseDataset): Y (Union[np.ndarray, pd.Series]): training data targets. X_test (Optional[Union[np.ndarray, pd.DataFrame]]): input testing data. Y_test (Optional[Union[np.ndarray, pd.DataFrame]]): testing data targets - resampling_strategy (Union[CrossValTypes, HoldoutValTypes]), + resampling_strategy (Union[CrossValTypes, HoldoutValTypes, NoResamplingStrategyTypes]), (default=HoldoutValTypes.holdout_validation): strategy to split the training data. - resampling_strategy_args (Optional[Dict[str, Any]]): arguments - required for the chosen resampling strategy. If None, uses + resampling_strategy_args (Optional[Dict[str, Any]]): + arguments required for the chosen resampling strategy. If None, uses the default values provided in DEFAULT_RESAMPLING_PARAMETERS in ```datasets/resampling_strategy.py```. shuffle: Whether to shuffle the data before performing splits @@ -55,7 +56,9 @@ def __init__(self, Y: Union[np.ndarray, pd.Series], X_test: Optional[Union[np.ndarray, pd.DataFrame]] = None, Y_test: Optional[Union[np.ndarray, pd.DataFrame]] = None, - resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation, + resampling_strategy: Union[CrossValTypes, + HoldoutValTypes, + NoResamplingStrategyTypes] = HoldoutValTypes.holdout_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, shuffle: Optional[bool] = True, seed: Optional[int] = 42, @@ -86,6 +89,7 @@ def __init__(self, seed=seed, train_transforms=train_transforms, dataset_name=dataset_name, val_transforms=val_transforms) + self.issigned = bool(np.any((X.data if self.issparse else X) < 0)) if self.output_type is not None: if STRING_TO_OUTPUT_TYPES[self.output_type] in CLASSIFICATION_OUTPUTS: self.task_type = TASK_TYPES_TO_STRING[TABULAR_CLASSIFICATION] @@ -124,6 +128,7 @@ def get_required_dataset_info(self) -> Dict[str, BaseDatasetPropertiesType]: info.update({ 'numerical_columns': self.numerical_columns, 'categorical_columns': self.categorical_columns, - 'task_type': self.task_type + 'task_type': self.task_type, + 'issigned': self.issigned }) return info diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index edd07a80e..670eb44c9 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -1,174 +1,1356 @@ -from typing import Any, Dict, Optional, Tuple, Union +import bisect +import copy +import os +import uuid +import warnings +from typing import Any, Dict, List, Optional, Tuple, Union, cast + +from gluonts.time_feature import Constant as ConstantTransform +from 
gluonts.time_feature import TimeFeature, time_features_from_frequency_str +from gluonts.time_feature.lag import get_lags_for_frequency import numpy as np +import pandas as pd +from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime + +from scipy.sparse import issparse + +import torch +from torch.utils.data.dataset import ConcatDataset, Dataset + import torchvision.transforms -from autoPyTorch.datasets.base_dataset import BaseDataset +from autoPyTorch.constants import ( + CLASSIFICATION_OUTPUTS, + MAX_WINDOW_SIZE_BASE, + SEASONALITY_MAP, + STRING_TO_OUTPUT_TYPES, + TASK_TYPES_TO_STRING, + TIMESERIES_FORECASTING +) +from autoPyTorch.data.time_series_forecasting_validator import TimeSeriesForecastingInputValidator +from autoPyTorch.datasets.base_dataset import BaseDataset, type_of_target from autoPyTorch.datasets.resampling_strategy import ( CrossValFuncs, CrossValTypes, + DEFAULT_RESAMPLING_PARAMETERS, HoldOutFuncs, - HoldoutValTypes + HoldoutValTypes, + NoResamplingStrategyTypes, + ResamplingStrategies ) +from autoPyTorch.pipeline.components.training.metrics.metrics import compute_mase_coefficient +from autoPyTorch.utils.common import FitRequirement + + +def extract_feature_index(feature_shapes: Dict[str, int], + feature_names: Tuple[str], + queried_features: Union[Tuple[Union[str, int]], Tuple[()]]) -> Tuple[int]: + """ + extract the index of a set of queried_features from the extracted feature_shapes + + Args: + feature_shapes (dict): + feature_shapes recoding the shape of each features + feature_names (List[str]): + names of the features + queried_features (Tuple[str]): + names of the features that we expect their index -TIME_SERIES_FORECASTING_INPUT = Tuple[np.ndarray, np.ndarray] # currently only numpy arrays are supported -TIME_SERIES_REGRESSION_INPUT = Tuple[np.ndarray, np.ndarray] -TIME_SERIES_CLASSIFICATION_INPUT = Tuple[np.ndarray, np.ndarray] + Returns: + feature_index (Tuple[int]): + indices of the corresponding features + """ + df_range = pd.DataFrame(feature_shapes, columns=feature_names, index=[0]) + df_range_end = df_range.cumsum(axis=1) + df_range = pd.concat([df_range_end - df_range, df_range_end]) + value_ranges = df_range[list(queried_features)].T.values + feature_index: List[int] = sum([list(range(*value_r)) for value_r in value_ranges], []) + feature_index.sort() + return tuple(feature_index) # type: ignore[return-value] -class TimeSeriesForecastingDataset(BaseDataset): +def compute_time_features(start_time: pd.DatetimeIndex, + date_period_length: int, + time_feature_length: int, + freq: str, + time_feature_transforms: List[TimeFeature]) -> np.ndarray: + date_info = pd.date_range(start=start_time, + periods=date_period_length, + freq=freq)[-time_feature_length:] + try: + time_features = np.vstack( + [transform(date_info) for transform in time_feature_transforms] + ).T + except OutOfBoundsDatetime: + # This is only a temporal solution TODO consider how to solve this! + time_features = np.zeros([time_feature_length, len(time_feature_transforms)]) + return time_features + + +class TimeSeriesSequence(Dataset): + """ + A dataset representing a time series sequence. 
It returns all the previous observations once it is asked for an item + + Args: + X (Optional[np.ndarray]): + past features + Y (np.ndarray): + past targets + start_time (Optional[pd.DatetimeIndex]): + times of the first timestep of the series + freq (str): + frequency that the data is sampled + time_feature_transform (List[TimeFeature]): + available time features applied to the series + X_test (Optional[np.ndarray]): + known future features + Y_test (Optional[np.ndarray]): + future targets + train_transforms (Optional[torchvision.transforms.Compose]): + training transforms, used to transform training features + val_transforms (Optional[torchvision.transforms.Compose]): + validation transforms, used to transform training features + n_prediction_steps (int): + how many steps need to be predicted in advance + known_future_features_index (int): + indices of the known future index + compute_mase_coefficient_value (bool): + if the mase coefficient for this series is pre-computed + time_features (Optional[np.ndarray]): + pre-computed time features + is_test_set (bool): + if this dataset is test sets. Test sequence will simply make X_test and Y_test as future features and + future targets + """ + _is_test_set = False + is_pre_processed = False + def __init__(self, - target_variables: Tuple[int], - sequence_length: int, - n_steps: int, - train: TIME_SERIES_FORECASTING_INPUT, - val: Optional[TIME_SERIES_FORECASTING_INPUT] = None, - resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation, + X: Optional[np.ndarray], + Y: np.ndarray, + start_time: Optional[pd.DatetimeIndex] = None, + freq: str = '1Y', + time_feature_transform: List[TimeFeature] = [ConstantTransform], + X_test: Optional[np.ndarray] = None, + Y_test: Optional[np.ndarray] = None, + train_transforms: Optional[torchvision.transforms.Compose] = None, + val_transforms: Optional[torchvision.transforms.Compose] = None, + n_prediction_steps: int = 1, + sp: int = 1, + known_future_features_index: Optional[Tuple[int]] = None, + compute_mase_coefficient_value: bool = True, + time_features: Optional[np.ndarray] = None, + is_test_set: bool = False, + ) -> None: + self.n_prediction_steps = n_prediction_steps + + if X is not None and X.ndim == 1: + X = X[:, np.newaxis] + self.X = X + self.Y = Y + + self.observed_target = ~np.isnan(self.Y) + if start_time is None: + start_time = pd.Timestamp('1900-01-01') + self.start_time = start_time + + self.X_val = None + self.Y_val = None + + if X_test is not None and X_test.ndim == 1: + X_test = X_test[:, np.newaxis] + + self.X_test = X_test + self.Y_test = Y_test + + self.time_feature_transform = time_feature_transform + + self.freq = freq + + # We also need to be able to transform the data, be it for pre-processing + # or for augmentation + self.train_transform = train_transforms + self.val_transform = val_transforms + self.sp = sp + + if compute_mase_coefficient_value: + if is_test_set: + self.mase_coefficient = compute_mase_coefficient(self.Y, sp=self.sp) + else: + self.mase_coefficient = compute_mase_coefficient(self.Y[:-n_prediction_steps], sp=self.sp) + + else: + self.mase_coefficient = np.asarray([1.0]) + self.known_future_features_index = known_future_features_index + + self.transform_time_features = False + self._cached_time_features: Optional[np.ndarray] = time_features + + self.future_observed_target = None + self.is_test_set = is_test_set + + @property + def is_test_set(self) -> bool: + return self._is_test_set + + @is_test_set.setter + def is_test_set(self, 
value: bool) -> None: + if value and value != self._is_test_set: + if self.known_future_features_index: + if self.X_test is None: + raise ValueError('When future features are known, X_test ' + 'for Time Series Sequences must be given!') + if self.Y_test is not None: + self.future_observed_target = ~np.isnan(self.Y_test) + self._is_test_set = value + + def __getitem__(self, index: int, train: bool = True) \ + -> Tuple[Dict[str, torch.Tensor], Optional[Dict[str, torch.Tensor]]]: + """ + get a subsequent of time series data, unlike vanilla tabular dataset, we obtain all the previous observations + until the given index + + Args: + index (int): + what element to yield from the series + train (bool): + Whether a train or test transformation is applied + + Returns: + past_information (Dict[str, torch.Tensor]): + a dict contains all the required information required for future forecasting + past_targets (torch.Tensor), past_features(Optional[torch.Tensor]), + future_features(Optional[torch.Tensor]), + mase_coefficient (np.array, cached value to compute MASE scores), + past_observed_targets(torch.BoolTensor), if the past targets are observed. + decoder_lengths(int), length of decoder output + future_information (Optional[Dict[str, torch.Tensor]]): + a dict contains all the future information that are required to predict, including + future_targets: (torch.Tensor) and future_observed_targets (torch.BoolTensor) + """ + if index < 0: + index = self.__len__() + index + + if self.X is not None: + past_features = self.X[:index + 1] + + if self.known_future_features_index: + if not self.is_test_set: + future_features = \ + self.X[index + 1: index + self.n_prediction_steps + 1, self.known_future_features_index] + else: + if index < self.__len__() - 1: + raise ValueError('Test Sequence is only allowed to be accessed with the last index!') + future_features = self.X_test[:, self.known_future_features_index] # type: ignore[index] + else: + future_features = None + else: + past_features = None + future_features = None + + if self.train_transform is not None and train and past_features is not None: + past_features = self.train_transform(past_features) + if future_features is not None: + future_features = self.train_transform(future_features) + elif self.val_transform is not None and not train and past_features is not None: + past_features = self.val_transform(past_features) + if future_features is not None: + future_features = self.val_transform(future_features) + + if self.transform_time_features: + if self.time_feature_transform: + self.cache_time_features() + + if past_features is not None: + past_features = np.hstack( + [past_features, self._cached_time_features[:index + 1]] # type: ignore[index] + ) + else: + past_features = self._cached_time_features[:index + 1] # type: ignore[index] + if future_features is not None: + future_features = np.hstack([ + future_features, + self._cached_time_features[index + 1:index + self.n_prediction_steps + 1] # type: ignore[index] + ]) + else: + future_features = self._cached_time_features[index + 1: # type: ignore[index] + index + self.n_prediction_steps + 1] + + if future_features is not None and future_features.shape[0] == 0: + future_features = None + + # In case of prediction, the targets are not provided + targets = self.Y + if self.is_test_set: + if self.Y_test is not None: + future_targets: Optional[Dict[str, torch.Tensor]] = { + 'future_targets': torch.from_numpy(self.Y_test), + 'future_observed_targets': torch.from_numpy(self.future_observed_target) + } + else: 
+ future_targets = None + else: + future_targets_np = targets[index + 1: index + self.n_prediction_steps + 1] + future_targets_tt = torch.from_numpy(future_targets_np) + future_targets = { + 'future_targets': future_targets_tt, + 'future_observed_targets': torch.from_numpy( + self.observed_target[index + 1: index + self.n_prediction_steps + 1] + ) + } + + if isinstance(past_features, np.ndarray): + past_features = torch.from_numpy(past_features) + + if isinstance(future_features, np.ndarray): + future_features = torch.from_numpy(future_features) + + past_target = targets[:index + 1] + past_target = torch.from_numpy(past_target) + + return {"past_targets": past_target, + "past_features": past_features, + "future_features": future_features, + "mase_coefficient": self.mase_coefficient, + 'past_observed_targets': torch.from_numpy(self.observed_target[:index + 1]), + 'decoder_lengths': 0 if future_targets is None else future_targets['future_targets'].shape[ + 0]}, future_targets + + def __len__(self) -> int: + return int(self.Y.shape[0]) if self.is_test_set else int(self.Y.shape[0]) - self.n_prediction_steps + + def get_target_values(self, index: int) -> np.ndarray: + """ + Get the visible targets in the datasets without generating a tensor. This can be used to create a dummy pipeline + Args: + index (int): + target index + + Returns: + y (np.ndarray): + the last visible target value + """ + if index < 0: + index = self.__len__() + index + return self.Y[index] + + def cache_time_features(self) -> None: + """ + compute time features if it is not cached. For test sets, we also need to compute the time features for future + """ + if self._cached_time_features is None: + periods = self.Y.shape[0] + if self.is_test_set: + periods += self.n_prediction_steps + self._cached_time_features = compute_time_features(self.start_time, periods, + periods, self.freq, self.time_feature_transform) + + else: + if self.is_test_set: + if self._cached_time_features.shape[0] == self.Y.shape[0]: + time_feature_future = compute_time_features(self.start_time, + self.n_prediction_steps + self.Y.shape[0], + self.n_prediction_steps, + self.freq, self.time_feature_transform) + self._cached_time_features = np.concatenate([self._cached_time_features, time_feature_future]) + + def update_transform(self, transform: Optional[torchvision.transforms.Compose], + train: bool = True, + ) -> 'BaseDataset': + """ + During the pipeline execution, the pipeline object might propose transformations + as a product of the current pipeline configuration being tested. + + This utility allows to return a self with the updated transformation, so that + a dataloader can yield this dataset with the desired transformations + + Args: + transform (torchvision.transforms.Compose): + The transformations proposed by the current pipeline + train (bool): + Whether to update the train or validation transform + + Returns: + self: A copy of the update pipeline + """ + if train: + self.train_transform = transform + else: + self.val_transform = transform + return self + + def get_val_seq_set(self, index: int) -> "TimeSeriesSequence": + if self.is_test_set: + raise ValueError("get_val_seq_set is not supported for the test sequences!") + if index < 0: + index = self.__len__() + index + if index >= self.__len__() - 1: + # TODO consider X_test? 
+ val_set = copy.deepcopy(self) + if val_set.X is not None: + val_set.X_test = val_set.X[-self.n_prediction_steps:] + val_set.X = val_set.X[:-self.n_prediction_steps] + val_set.Y_test = val_set.Y[-self.n_prediction_steps:] + val_set.Y = val_set.Y[:-self.n_prediction_steps] + val_set.future_observed_target = val_set.observed_target[-self.n_prediction_steps:] + val_set.observed_target = val_set.observed_target[:-self.n_prediction_steps] + val_set.is_test_set = True + + return val_set + else: + if self.X is not None: + X = self.X[:index + 1] + else: + X = None + if self.known_future_features_index: + X_test = self.X[index + 1: index + 1 + self.n_prediction_steps] # type: ignore[index] + else: + X_test = None + if self._cached_time_features is None: + cached_time_features = None + else: + cached_time_features = self._cached_time_features[:index + 1 + self.n_prediction_steps] + + val_set = TimeSeriesSequence(X=X, + Y=self.Y[:index + 1], + X_test=X_test, + Y_test=self.Y[index + 1: index + 1 + self.n_prediction_steps], + start_time=self.start_time, + freq=self.freq, + time_feature_transform=self.time_feature_transform, + train_transforms=self.train_transform, + val_transforms=self.val_transform, + n_prediction_steps=self.n_prediction_steps, + known_future_features_index=self.known_future_features_index, + sp=self.sp, + compute_mase_coefficient_value=False, + time_features=cached_time_features, + is_test_set=True) + + return val_set + + def get_test_target(self, test_idx: int) -> np.ndarray: + if self.is_test_set: + raise ValueError("get_test_target is not supported for test sequences!") + if test_idx < 0: + test_idx = self.__len__() + test_idx + Y_future = self.Y[test_idx + 1: test_idx + self.n_prediction_steps + 1] + return Y_future + + def update_attribute(self, **kwargs: Any) -> None: + for key, value in kwargs.items(): + if not hasattr(self, key): + raise ValueError('Trying to update invalid attribute for TimeSeriesSequence!') + setattr(self, key, value) + + +class TimeSeriesForecastingDataset(BaseDataset, ConcatDataset): + """ + Dataset class for time series forecasting used in AutoPyTorch. It consists of multiple TimeSeriesSequence. + Train and test tensors are stored as pd.DataFrame whereas their index indicates which series the data belongs to + + Args: + X (Optional[Union[np.ndarray, List[Union[pd.DataFrame, np.ndarray]]]]): + time series features. can be None if we work with a uni-variant forecasting task + Y (Union[np.ndarray, List[Union[pd.DataFrame, np.ndarray]]]): + forecasting targets. Must be given + X_test (Optional[Union[np.ndarray, List[Union[pd.DataFrame, np.ndarray]]]]): + known future features. It is a collection of series that has the same amount of data as X. It + is designed to be at the tail of X. If no feature is known in the future, this value can be omitted. + Y_test (Optional[Union[np.ndarray, List[Union[pd.DataFrame, np.ndarray]]]] = None): + future targets. It is a collection of series that has the same data of series as Y. It is designed to be at + the tail of Y after the timestamps that need to be predicted. + start_times (Optional[List[pd.DatetimeIndex]]): + starting time of each series when they are sampled. If it is not given, we simply start with a fixed + timestamp. + series_idx (Optional[Union[List[Union[str, int]], str, int]]): + (only works if X is stored as pd.DataFrame). 
This value is applied to identify towhich series the data + belongs if the data is presented as a "chunk" dataframe + known_future_features (Optional[Union[Tuple[Union[str, int]], Tuple[()]]]): + future features that are known in advance. For instance, holidays. + time_feature_transform (Optional[List[TimeFeature]]): + A list of time feature transformation methods implemented in gluonts. For more information, please check + gluonts.time_feature + freq (Optional[Union[str, int, List[int]]]): + the frequency that the data is sampled. It needs to keep consistent within one dataset + resampling_strategy (Optional[ResamplingStrategies]) + resampling strategy. We designed several special resampling resampling_strategy for forecasting tasks. + Please refer to autoPyTorch.datasets.resampling_strategy + resampling_strategy_args (Optional[Dict[str, Any]]): + arguments passed to resampling_strategy + seed (int): + random seeds + train_transforms (Optional[torchvision.transforms.Compose]): + Transformation applied to training data before it is fed to the dataloader + val_transforms (Optional[torchvision.transforms.Compose]): + Transformation applied to validation data before it is fed to the dataloader + validator (Optional[TimeSeriesForecastingInputValidator]): + Input Validator + lagged_value (Optional[List[int]]) + We could consider past targets as additional features for the current timestep. This item indicates the + number of timesteps in advanced that we want to apply the targets as our current features + n_prediction_steps (int): + The number of steps you want to forecast into the future (forecast horizon) + dataset_name (Optional[str]): + dataset name + normalize_y(bool): + if targets are normalized within each series + """ + + datasets: List[TimeSeriesSequence] + cumulative_sizes: List[int] + + def __init__(self, + X: Optional[Union[np.ndarray, List[Union[pd.DataFrame, np.ndarray]]]], + Y: Union[np.ndarray, List[Union[pd.DataFrame, np.ndarray]]], + X_test: Optional[Union[np.ndarray, List[Union[pd.DataFrame, np.ndarray]]]] = None, + Y_test: Optional[Union[np.ndarray, List[Union[pd.DataFrame, np.ndarray]]]] = None, + start_times: Optional[List[pd.DatetimeIndex]] = None, + series_idx: Optional[Union[List[Union[str, int]], str, int]] = None, + known_future_features: Optional[Union[Tuple[Union[str, int]], Tuple[()]]] = None, + time_feature_transform: Optional[List[TimeFeature]] = None, + freq: Optional[Union[str, int, List[int]]] = None, + resampling_strategy: Optional[ResamplingStrategies] = HoldoutValTypes.time_series_hold_out_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, - shuffle: Optional[bool] = False, seed: Optional[int] = 42, train_transforms: Optional[torchvision.transforms.Compose] = None, val_transforms: Optional[torchvision.transforms.Compose] = None, + validator: Optional[TimeSeriesForecastingInputValidator] = None, + lagged_value: Optional[List[int]] = None, + n_prediction_steps: int = 1, + dataset_name: Optional[str] = None, + normalize_y: bool = False, ): - """ + # Preprocess time series data information + assert X is not Y, "Training and Test data needs to belong two different object!!!" 
- :param target_variables: The indices of the variables you want to forecast - :param sequence_length: The amount of past data you want to use to forecast future value - :param n_steps: The number of steps you want to forecast into the future - :param train: Tuple with one tensor holding the training data - :param val: Tuple with one tensor holding the validation data - """ - _check_time_series_forecasting_inputs( - target_variables=target_variables, - sequence_length=sequence_length, - n_steps=n_steps, - train=train, - val=val) - train = _prepare_time_series_forecasting_tensor(tensor=train, - target_variables=target_variables, - sequence_length=sequence_length, - n_steps=n_steps) - if val is not None: - val = _prepare_time_series_forecasting_tensor(tensor=val, - target_variables=target_variables, - sequence_length=sequence_length, - n_steps=n_steps) - super().__init__(train_tensors=train, val_tensors=val, shuffle=shuffle, - resampling_strategy=resampling_strategy, resampling_strategy_args=resampling_strategy_args, - seed=seed, - train_transforms=train_transforms, - val_transforms=val_transforms, - ) - self.cross_validators = CrossValFuncs.get_cross_validators(CrossValTypes.time_series_cross_validation) - self.holdout_validators = HoldOutFuncs.get_holdout_validators(HoldoutValTypes.holdout_validation) - - -def _check_time_series_forecasting_inputs(target_variables: Tuple[int], - sequence_length: int, - n_steps: int, - train: TIME_SERIES_FORECASTING_INPUT, - val: Optional[TIME_SERIES_FORECASTING_INPUT] = None) -> None: - if train[0].ndim != 3: - raise ValueError( - "The training data for time series forecasting has to be a three-dimensional tensor of shape PxLxM.") - if val is not None: - if val[0].ndim != 3: - raise ValueError( - "The validation data for time series forecasting " - "has to be a three-dimensional tensor of shape PxLxM.") - _, time_series_length, num_features = train[0].shape - if sequence_length + n_steps > time_series_length: - raise ValueError(f"Invalid sequence length: Cannot create dataset " - f"using sequence_length={sequence_length} and n_steps={n_steps} " - f"when the time series are of length {time_series_length}") - for t in target_variables: - if t < 0 or t >= num_features: - raise ValueError(f"Target variable {t} is out of bounds. 
Number of features is {num_features}, " - f"so each target variable has to be between 0 and {num_features - 1}.") - - -def _prepare_time_series_forecasting_tensor(tensor: TIME_SERIES_FORECASTING_INPUT, - target_variables: Tuple[int], - sequence_length: int, - n_steps: int) -> Tuple[np.ndarray, np.ndarray]: - population_size, time_series_length, num_features = tensor[0].shape - num_targets = len(target_variables) - num_datapoints = time_series_length - sequence_length - n_steps + 1 - x_tensor = np.zeros((num_datapoints, population_size, sequence_length, num_features), dtype=np.float) - y_tensor = np.zeros((num_datapoints, population_size, num_targets), dtype=np.float) - - for p in range(population_size): - for i in range(num_datapoints): - x_tensor[i, p, :, :] = tensor[0][p, i:i + sequence_length, :] - y_tensor[i, p, :] = tensor[0][p, i + sequence_length + n_steps - 1, target_variables] - - # get rid of population dimension by reshaping - x_tensor = x_tensor.reshape((-1, sequence_length, num_features)) - y_tensor = y_tensor.reshape((-1, num_targets)) - return x_tensor, y_tensor - - -class TimeSeriesClassificationDataset(BaseDataset): - def __init__(self, - train: TIME_SERIES_CLASSIFICATION_INPUT, - val: Optional[TIME_SERIES_CLASSIFICATION_INPUT] = None): - _check_time_series_inputs(train=train, - val=val, - task_type="time_series_classification") - super().__init__(train_tensors=train, val_tensors=val, shuffle=True) - self.cross_validators = CrossValFuncs.get_cross_validators( - CrossValTypes.stratified_k_fold_cross_validation, - CrossValTypes.k_fold_cross_validation, - CrossValTypes.shuffle_split_cross_validation, - CrossValTypes.stratified_shuffle_split_cross_validation - ) - self.holdout_validators = HoldOutFuncs.get_holdout_validators( - HoldoutValTypes.holdout_validation, - HoldoutValTypes.stratified_holdout_validation - ) + seasonality, freq, freq_value = self.compute_freq_values(freq, n_prediction_steps) + self.seasonality = int(seasonality) + self.freq: str = freq + self.freq_value: Union[float, int] = freq_value -class TimeSeriesRegressionDataset(BaseDataset): - def __init__(self, train: Tuple[np.ndarray, np.ndarray], val: Optional[Tuple[np.ndarray, np.ndarray]] = None): - _check_time_series_inputs(train=train, - val=val, - task_type="time_series_regression") - super().__init__(train_tensors=train, val_tensors=val, shuffle=True) - self.cross_validators = CrossValFuncs.get_cross_validators( - CrossValTypes.k_fold_cross_validation, - CrossValTypes.shuffle_split_cross_validation - ) - self.holdout_validators = HoldOutFuncs.get_holdout_validators( - HoldoutValTypes.holdout_validation - ) + self.n_prediction_steps = n_prediction_steps + + if dataset_name is None: + dataset_name = str(uuid.uuid1(clock_seq=os.getpid())) + self.dataset_name = dataset_name + + # Data Validation + if validator is None: + validator = TimeSeriesForecastingInputValidator(is_classification=False) + self.validator: TimeSeriesForecastingInputValidator = validator + + if not isinstance(validator, TimeSeriesForecastingInputValidator): + raise ValueError(f"This dataset only support TimeSeriesForecastingInputValidator " + f"but receive {type(validator)}") + + if not self.validator._is_fitted: + self.validator.fit(X_train=X, y_train=Y, X_test=X_test, y_test=Y_test, series_idx=series_idx, + start_times=start_times) + + self.is_uni_variant = self.validator._is_uni_variant + + self.numerical_columns = self.validator.feature_validator.numerical_columns + self.categorical_columns = 
self.validator.feature_validator.categorical_columns + + self.num_features: int = self.validator.feature_validator.num_features # type: ignore[assignment] + self.num_targets: int = self.validator.target_validator.out_dimensionality # type: ignore[assignment] + + self.categories = self.validator.feature_validator.categories + + self.feature_shapes = self.validator.feature_shapes + self.feature_names = tuple(self.validator.feature_names) + assert self.validator.start_times is not None + self.start_times = self.validator.start_times -def _check_time_series_inputs(task_type: str, - train: Union[TIME_SERIES_CLASSIFICATION_INPUT, TIME_SERIES_REGRESSION_INPUT], - val: Optional[ - Union[TIME_SERIES_CLASSIFICATION_INPUT, TIME_SERIES_REGRESSION_INPUT]] = None - ) -> None: - if len(train) != 2: - raise ValueError(f"There must be exactly two training tensors for {task_type}. " - f"The first one containing the data and the second one containing the targets.") - if train[0].ndim != 3: - raise ValueError( - f"The training data for {task_type} has to be a three-dimensional tensor of shape NxSxM.") - if train[1].ndim != 1: - raise ValueError( - f"The training targets for {task_type} have to be of shape N." + self.static_features = self.validator.feature_validator.static_features + + self._transform_time_features = False + if not time_feature_transform: + time_feature_transform = time_features_from_frequency_str(self.freq) + if not time_feature_transform: + # If time features are empty (as for yearly data), we add a + # constant feature of 0 + time_feature_transform = [ConstantTransform()] + + self.time_feature_transform = time_feature_transform + self.time_feature_names = tuple([f'time_feature_{t.__class__.__name__}' for t in self.time_feature_transform]) + + # We also need to be able to transform the data, be it for pre-processing + # or for augmentation + self.train_transform = train_transforms + self.val_transform = val_transforms + + # Construct time series sequences + if known_future_features is None: + known_future_features = tuple() # type: ignore[assignment] + known_future_features_index = extract_feature_index(self.feature_shapes, + self.feature_names, # type: ignore[arg-type] + queried_features=known_future_features) # type: ignore + + self.known_future_features = tuple(known_future_features) # type: ignore[arg-type] + + # initialize datasets + self.sequences_builder_kwargs = {"freq": self.freq, + "time_feature_transform": self.time_feature_transform, + "train_transforms": self.train_transform, + "val_transforms": self.val_transform, + "n_prediction_steps": n_prediction_steps, + "sp": self.seasonality, + "known_future_features_index": known_future_features_index} + + self.normalize_y = normalize_y + + training_sets = self.transform_data_into_time_series_sequence(X, Y, + start_times=self.start_times, + X_test=X_test, + Y_test=Y_test, ) + sequence_datasets, train_tensors, test_tensors, sequence_lengths = training_sets + Y: pd.DataFrame = train_tensors[1] # type: ignore[no-redef] + + ConcatDataset.__init__(self, datasets=sequence_datasets) + + self.num_sequences = len(Y) + self.sequence_lengths_train: np.ndarray = np.asarray(sequence_lengths) - n_prediction_steps + + self.seq_length_min = int(np.min(self.sequence_lengths_train)) + self.seq_length_median = int(np.median(self.sequence_lengths_train)) + self.seq_length_max = int(np.max(self.sequence_lengths_train)) + + if int(freq_value) > self.seq_length_median: + self.base_window_size = self.seq_length_median + else: + self.base_window_size = 
int(freq_value) + + self.train_tensors: Tuple[Optional[pd.DataFrame], pd.DataFrame] = train_tensors + + self.test_tensors: Optional[Tuple[Optional[pd.DataFrame], pd.DataFrame]] = test_tensors + self.val_tensors = None + + self.issparse: bool = issparse(self.train_tensors[0]) + + self.input_shape: Tuple[int, int] = (self.seq_length_min, self.num_features) # type: ignore[assignment] + + # process known future features + if known_future_features is None: + future_feature_shapes: Tuple[int, int] = (self.seq_length_min, 0) + else: + future_feature_shapes = (self.seq_length_min, len(known_future_features)) + self.encoder_can_be_auto_regressive = (self.input_shape[-1] == future_feature_shapes[-1]) + + if len(self.train_tensors) == 2 and self.train_tensors[1] is not None: + self.output_type: str = type_of_target(self.train_tensors[1][0].fillna(method="pad")) + + if self.output_type in ["binary", "multiclass"]: + # TODO in the future we also want forecasting classification task, we need to find a way to distinguish + # TODO these tasks with the integral forecasting tasks! + self.output_type = "continuous" + + if STRING_TO_OUTPUT_TYPES[self.output_type] in CLASSIFICATION_OUTPUTS: + num_targets: int = len(np.unique(Y)) + else: + num_targets = Y.shape[-1] if Y.ndim > 1 else 1 # type: ignore[union-attr] + self.output_shape = [self.n_prediction_steps, num_targets] # type: ignore + else: + raise ValueError('Forecasting dataset must contain target values!') + + # TODO: Look for a criteria to define small enough to preprocess + self.is_small_preprocess = True + + # dataset split + self.task_type: str = TASK_TYPES_TO_STRING[TIMESERIES_FORECASTING] + + self.numerical_features: List[int] = self.numerical_columns + self.categorical_features: List[int] = self.categorical_columns + + self.random_state = np.random.RandomState(seed=seed) + + resampling_strategy_opt, resampling_strategy_args_opt = self.get_split_strategy( + sequence_lengths=sequence_lengths, + n_prediction_steps=n_prediction_steps, + freq_value=self.freq_value, + resampling_strategy=resampling_strategy, # type: ignore[arg-type] + resampling_strategy_args=resampling_strategy_args ) - if val is not None: - if len(val) != 2: - raise ValueError( - f"There must be exactly two validation tensors for{task_type}. 
" - f"The first one containing the data and the second one containing the targets.") - if val[0].ndim != 3: - raise ValueError( - f"The validation data for {task_type} has to be a " - f"three-dimensional tensor of shape NxSxM.") - if val[0].ndim != 1: + + self.resampling_strategy = resampling_strategy_opt # type: ignore[assignment] + self.resampling_strategy_args = resampling_strategy_args_opt + + if isinstance(self.resampling_strategy, CrossValTypes): + self.cross_validators = CrossValFuncs.get_cross_validators(self.resampling_strategy) + else: + self.cross_validators = CrossValFuncs.get_cross_validators(CrossValTypes.time_series_cross_validation) + if isinstance(self.resampling_strategy, HoldoutValTypes): + self.holdout_validators = HoldOutFuncs.get_holdout_validators(self.resampling_strategy) + + else: + self.holdout_validators = HoldOutFuncs.get_holdout_validators( + HoldoutValTypes.time_series_hold_out_validation) + + self.splits = self.get_splits_from_resampling_strategy() # type: ignore[assignment] + + # TODO doing experiments to give the most proper way of defining these two values + if lagged_value is None: + try: + lagged_value = [0] + get_lags_for_frequency(freq) + except Exception: + lagged_value = list(range(8)) + + self.lagged_value = lagged_value + + @staticmethod + def compute_freq_values(freq: Optional[Union[str, int, List[int]]], + n_prediction_steps: int) -> Tuple[Union[int, float], str, Union[int, float]]: + """ + Compute frequency related values + """ + if freq is None: + freq = '1Y' + + if isinstance(freq, str): + if freq not in SEASONALITY_MAP: + Warning("The given freq name is not supported by our dataset, we will use the default " + "configuration space on the hyperparameter window_size, if you want to adapt this value" + "you could pass freq with a numerical value") + freq_value = SEASONALITY_MAP.get(freq, 1) + else: + freq_value = freq + freq = '1Y' + + seasonality = freq_value + if isinstance(freq_value, list): + min_base_size = min(n_prediction_steps, MAX_WINDOW_SIZE_BASE) + if np.max(freq_value) < min_base_size: + tmp_freq = max(freq_value) + else: + tmp_freq = min([freq_value_item for + freq_value_item in freq_value if freq_value_item >= min_base_size]) + freq_value = tmp_freq + + if isinstance(seasonality, list): + seasonality = min(seasonality) # Use to calculate MASE + return seasonality, freq, freq_value # type: ignore[return-value] + + @staticmethod + def compute_time_features(start_times: List[pd.DatetimeIndex], + seq_lengths: List[int], + freq: Union[str, pd.DateOffset], + time_feature_transform: List[TimeFeature]) -> Dict[pd.DatetimeIndex, np.ndarray]: + """ + compute the max series length for each start_time and compute their corresponding time_features. 
As lots of + series in a dataset share the same start time, we could only compute the features for longest possible series + and reuse them + """ + series_lengths_max: Dict[pd.DatetimeIndex, int] = {} + for start_t, seq_l in zip(start_times, seq_lengths): + if start_t not in series_lengths_max or seq_l > series_lengths_max[start_t]: + series_lengths_max[start_t] = seq_l + series_time_features = {} + for start_t, max_l in series_lengths_max.items(): + series_time_features[start_t] = compute_time_features(start_t, max_l, max_l, freq, time_feature_transform) + return series_time_features + + def _get_dataset_indices(self, idx: int, only_dataset_idx: bool = False) -> Union[int, Tuple[int, int]]: + """get which series the data point belongs to""" + if idx < 0: + if -idx > len(self): + raise ValueError("absolute value of index should not exceed dataset length") + idx = len(self) + idx + dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx) + if only_dataset_idx: + return dataset_idx + if dataset_idx == 0: + sample_idx = idx + else: + sample_idx = idx - self.cumulative_sizes[dataset_idx - 1] + return dataset_idx, sample_idx + + def __len__(self) -> int: + return ConcatDataset.__len__(self) # type: ignore[no-any-return] + + def __getitem__(self, idx: int, # type: ignore[override] + train: bool = True) -> Tuple[Dict[str, torch.Tensor], Optional[Dict[str, torch.Tensor]]]: + dataset_idx, sample_idx = self._get_dataset_indices(idx) # type: ignore[misc] + return self.datasets[dataset_idx].__getitem__(sample_idx, train) + + def get_validation_set(self, idx: int) -> TimeSeriesSequence: + """generate validation series given the index. It ends at the position of the index""" + dataset_idx, sample_idx = self._get_dataset_indices(idx) # type: ignore[misc] + return self.datasets[dataset_idx].get_val_seq_set(sample_idx) + + def get_time_series_seq(self, idx: int) -> TimeSeriesSequence: + """get the series that the data point belongs to""" + dataset_idx = self._get_dataset_indices(idx, True) + return self.datasets[dataset_idx] # type: ignore[index] + + def get_test_target(self, test_indices: np.ndarray) -> np.ndarray: + """get the target data only. 
This function simply returns a np.array instead of a dictionary""" + + test_indices = np.where(test_indices < 0, test_indices + len(self), test_indices) + y_test = np.ones([len(test_indices), self.n_prediction_steps, self.num_targets]) + y_test_argsort = np.argsort(test_indices) + dataset_idx: int = self._get_dataset_indices(test_indices[y_test_argsort[0]], # type: ignore[assignment] + only_dataset_idx=True) + + for y_i in y_test_argsort: + test_idx = test_indices[y_i] + while test_idx > self.cumulative_sizes[dataset_idx]: + dataset_idx += 1 + if dataset_idx != 0: + test_idx = test_idx - self.cumulative_sizes[dataset_idx - 1] + y_test[y_i] = self.datasets[dataset_idx].get_test_target(test_idx) + + return y_test.reshape([-1, self.num_targets]) + + def transform_data_into_time_series_sequence(self, + X: Optional[Union[np.ndarray, List[Union[pd.DataFrame, np.ndarray]]]], + Y: Union[np.ndarray, List[Union[pd.DataFrame, np.ndarray]]], + start_times: List[pd.DatetimeIndex], + X_test: Optional[ + Union[np.ndarray, List[Union[pd.DataFrame, np.ndarray]]]] = None, + Y_test: Optional[ + Union[np.ndarray, List[Union[pd.DataFrame, np.ndarray]]]] = None, + is_test_set: bool = False, ) -> Tuple[ + List[TimeSeriesSequence], + Tuple[Optional[pd.DataFrame], pd.DataFrame], + Optional[Tuple[Optional[pd.DataFrame], pd.DataFrame]], + List[int] + ]: + """ + Transform the raw data into a list of TimeSeriesSequence that can be processed by AutoPyTorch Time Series + build a series time sequence datasets + + Args: + X: Optional[Union[np.ndarray, List[Union[pd.DataFrame, np.ndarray]]]] + features, if is_test_set is True, then its length of + Y: pd.DataFrame (N_all, N_target) + flattened train target array with size N_all (the sum of all the series sequences) and number of targets + start_times: List[pd.DatetimeIndex] + start time of each training series + X_test: Optional[np.ndarray (N_all_test, N_feature)] + flattened test feature array with size N_all_test (the sum of all the series sequences) and N_feature, + number of features + Y_test: np.ndarray (N_all_test, N_target) + flattened test target array with size N_all (the sum of all the series sequences) and number of targets + is_test_set: Optional[List[pd.DatetimeIndex]] + if the generated sequence used for test + + Returns: + sequence_datasets : List[TimeSeriesSequence] + a list of datasets + train_tensors: Tuple[Optional[pd.DataFrame], pd.DataFrame] + training tensors + test_tensors: Optional[Tuple[Optional[pd.DataFrame], pd.DataFrame]] + test tensors + + """ + dataset_with_future_features = X is not None and len(self.known_future_features) > 0 + X, Y, sequence_lengths = self.validator.transform(X, Y) + time_features = self.compute_time_features(start_times, + sequence_lengths, + self.freq, + self.time_feature_transform) + + if Y_test is not None or X_test is not None: + X_test, Y_test, _ = self.validator.transform(X_test, Y_test, + validate_for_future_features=dataset_with_future_features) + + y_groups: pd.DataFrameGroupBy = Y.groupby(Y.index) # type: ignore[union-attr] + if self.normalize_y: + mean = y_groups.agg("mean") + std = y_groups.agg("std") + std[std == 0] = 1. + std.fillna(1.) 
+ Y = (Y - mean) / std + self.y_mean = mean + self.y_std = std + if Y_test is not None: + Y_test = (Y_test[mean.columns] - mean) / std + + sequence_datasets, train_tensors, test_tensors = self.make_sequences_datasets( + X=X, Y=Y, + X_test=X_test, Y_test=Y_test, + start_times=start_times, + time_features=time_features, + is_test_set=is_test_set, + **self.sequences_builder_kwargs) + return sequence_datasets, train_tensors, test_tensors, sequence_lengths + + @staticmethod + def make_sequences_datasets(X: Optional[pd.DataFrame], + Y: pd.DataFrame, + start_times: List[pd.DatetimeIndex], + time_features: Optional[Dict[pd.DatetimeIndex, np.ndarray]] = None, + X_test: Optional[pd.DataFrame] = None, + Y_test: Optional[pd.DataFrame] = None, + is_test_set: bool = False, + **sequences_kwargs: Any) -> Tuple[ + List[TimeSeriesSequence], + Tuple[Optional[pd.DataFrame], pd.DataFrame], + Optional[Tuple[Optional[pd.DataFrame], pd.DataFrame]] + ]: + """ + build a series time sequence datasets + + Args: + X: pd.DataFrame (N_all, N_feature) + flattened train feature DataFrame with size N_all (the sum of all the series sequences) and N_feature, + number of features, X's index should contain the information identifying its series number + Y: pd.DataFrame (N_all, N_target) + flattened train target array with size N_all (the sum of all the series sequences) and number of targets + start_times: List[pd.DatetimeIndex] + start time of each training series + time_features: Dict[pd.Timestamp, np.ndarray]: + time features for each possible start training times + X_test: Optional[np.ndarray (N_all_test, N_feature)] + flattened test feature array with size N_all_test (the sum of all the series sequences) and N_feature, + number of features + Y_test: np.ndarray (N_all_test, N_target) + flattened test target array with size N_all (the sum of all the series sequences) and number of targets + is_test_set (bool): + if the generated sequence used for test + sequences_kwargs: Dict + additional arguments for test sets + + Returns: + sequence_datasets : List[TimeSeriesSequence] + a list of datasets + train_tensors: Tuple[pd.DataFrame, pd.DataFrame] + training tensors + train_tensors: Optional[Tuple[pd.DataFrame, pd.DataFrame]] + training tensors + + """ + sequence_datasets = [] + + y_group = Y.groupby(Y.index) + if X is not None: + x_group = X.groupby(X.index) + if Y_test is not None: + y_test_group = Y_test.groupby(Y_test.index) + + if X_test is not None: + x_test_group = X_test.groupby(X_test.index) + + for i_ser, (start_time, y) in enumerate(zip(start_times, y_group)): + ser_id = y[0] + y_ser = y[1].transform(np.array).values + x_ser = x_group.get_group(ser_id).transform(np.array).values if X is not None else None + + y_test_ser = y_test_group.get_group(ser_id).transform(np.array).values if Y_test is not None else None + x_test_ser = x_test_group.get_group(ser_id).transform(np.array).values if X_test is not None else None + + sequence = TimeSeriesSequence( + X=x_ser, + Y=y_ser, + start_time=start_time, + X_test=x_test_ser, + Y_test=y_test_ser, + time_features=time_features[start_time][:len(y_ser)] if time_features is not None else None, + is_test_set=is_test_set, + **sequences_kwargs) + sequence_datasets.append(sequence) + + train_tensors = (X, Y) + # we could guarantee that Y_test has shape [len(seq) * n_prediction_steps, num_targets] + test_tensors = (X_test, Y_test.values) if Y_test is not None else None + + return sequence_datasets, train_tensors, test_tensors + + def replace_data(self, + X_train: pd.DataFrame, + X_test: 
Optional[pd.DataFrame], + known_future_features_index: Optional[Tuple[int]] = None) -> 'BaseDataset': + super(TimeSeriesForecastingDataset, self).replace_data(X_train=X_train, X_test=X_test) + if X_train is None: + return self + if X_test is not None: + X_test_group = X_test.groupby(X_test.index) + for seq, x in zip(self.datasets, X_train.groupby(X_train.index)): + ser_id = x[0] + x_ser = x[1].transform(np.array).values + seq.X = x_ser + + if X_test is not None: + seq.X_test = X_test_group.get_group(ser_id).transform(np.array).values + seq.known_future_features_index = known_future_features_index + seq.is_pre_processed = True + + return self + + def update_transform(self, transform: Optional[torchvision.transforms.Compose], + train: bool = True, + ) -> 'BaseDataset': + """ + During the pipeline execution, the pipeline object might propose transformations + as a product of the current pipeline configuration being tested. + + This utility allows to return a self with the updated transformation, so that + a dataloader can yield this dataset with the desired transformations + + Args: + transform (torchvision.transforms.Compose): + The transformations proposed by the current pipeline + train (bool): + Whether to update the train or validation transform + + Returns: + self: A copy of the update pipeline + """ + if train: + self.train_transform = transform + else: + self.val_transform = transform + for seq in self.datasets: + seq = seq.update_transform(transform, train) + return self + + @property + def transform_time_features(self) -> bool: + return self._transform_time_features + + @transform_time_features.setter + def transform_time_features(self, value: bool) -> None: + self._transform_time_features = value + for seq in self.datasets: + seq.transform_time_features = value + + def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], Optional[List[int]]]]: + """ + Creates a set of splits based on a resampling strategy provided, here each item in test_split represent + n_prediction_steps element in the dataset. 
(The start of timestep that we want to predict) + + Returns + ( List[Tuple[List[int], Optional[List[int]]]]): + splits in the [train_indices, val_indices] format + """ + splits = [] + if isinstance(self.resampling_strategy, HoldoutValTypes): + val_share = DEFAULT_RESAMPLING_PARAMETERS[self.resampling_strategy].get( + 'val_share', None) + if self.resampling_strategy_args is not None: + val_share = self.resampling_strategy_args.get('val_share', val_share) + n_repeats = self.resampling_strategy_args.get("n_repeats", 1) + else: + n_repeats = 1 + splits.append(self.create_holdout_val_split(holdout_val_type=self.resampling_strategy, + val_share=val_share, + n_repeats=n_repeats)) + + elif isinstance(self.resampling_strategy, CrossValTypes): + num_splits = DEFAULT_RESAMPLING_PARAMETERS[self.resampling_strategy].get( + 'num_splits', None) + if self.resampling_strategy_args is not None: + num_splits = self.resampling_strategy_args.get('num_splits', num_splits) + n_repeats = self.resampling_strategy_args.get("n_repeats", 1) + else: + n_repeats = 1 + # Create the split if it was not created before + splits.extend(self.create_cross_val_splits( # type: ignore[arg-type] + cross_val_type=self.resampling_strategy, + num_splits=cast(int, num_splits), + n_repeats=n_repeats + )) + elif isinstance(self.resampling_strategy, NoResamplingStrategyTypes): + splits.append(self.create_refit_split()) + else: + raise ValueError(f"Unsupported resampling strategy={self.resampling_strategy}") + return splits # type: ignore[return-value] + + def get_required_dataset_info(self) -> Dict[str, Any]: + """ + Returns a dictionary containing required dataset properties to instantiate a pipeline, + """ + info = super().get_required_dataset_info() + info.update({ + 'task_type': self.task_type, + 'numerical_features': self.numerical_features, + 'categorical_features': self.categorical_features, + 'numerical_columns': self.numerical_columns, + 'categorical_columns': self.categorical_columns, + 'categories': self.categories, + }) + return info + + def get_dataset_properties(self, dataset_requirements: List[FitRequirement]) -> Dict[str, Any]: + dataset_properties = super().get_dataset_properties(dataset_requirements=dataset_requirements) + dataset_properties.update({'n_prediction_steps': self.n_prediction_steps, + 'sp': self.seasonality, # For metric computation + 'input_shape': self.input_shape, + 'time_feature_transform': self.time_feature_transform, + 'uni_variant': self.is_uni_variant, + 'static_features_shape': len(self.static_features), + 'future_feature_shapes': (self.n_prediction_steps, len(self.known_future_features)), + 'targets_have_missing_values': self.train_tensors[1].isnull().values.any(), + 'encoder_can_be_auto_regressive': self.encoder_can_be_auto_regressive, + 'features_have_missing_values': False if self.train_tensors[0] is None + else self.train_tensors[0].isnull().values.any()}) + return dataset_properties + + @staticmethod + def get_split_strategy(sequence_lengths: List[int], + n_prediction_steps: int, + freq_value: Union[float, int], + resampling_strategy: ResamplingStrategies = HoldoutValTypes.time_series_hold_out_validation, + resampling_strategy_args: Optional[Dict[str, Any]] = None, ) -> \ + Tuple[ResamplingStrategies, Optional[Dict[str, Any]]]: + """ + Determines the most possible sampling strategy for the datasets: the lengths of each sequence might not be long + enough to support cross-validation split, thus we need to carefully compute the number of folds. 
Additionally, + each fold might contain multiple forecasting instances (each with length n_prediction_steps and there is no + overlapping between the test instances). This value is considered as 'n_repeats' + + Args: + sequence_lengths (List[int]): + lengths of each sequence + n_prediction_steps (int): + forecasting horizon + freq_value (Union[float, int]): + period of the dataset, determined by its sampling frequency + resampling_strategy(ResamplingStrategies): + resampling strategy to be checked + resampling_strategy_args (Optional[Dict[str, Any]]): + resampling strategy arguments to be checked + + Returns: + resampling_strategy(ResamplingStrategies): + resampling strategy + resampling_strategy_args (Optional[Dict[str, Any]]): + resampling strategy arguments + """ + # check if dataset could be split with cross validation + minimal_seq_length = np.min(sequence_lengths) - n_prediction_steps + if isinstance(resampling_strategy, CrossValTypes): + num_splits = DEFAULT_RESAMPLING_PARAMETERS[resampling_strategy].get( + 'num_splits', 5) + if resampling_strategy_args is not None: + num_splits = resampling_strategy_args.get('num_splits', num_splits) + + # Check if all the series can be properly split, if not, we reduce the number of split + if resampling_strategy != CrossValTypes.time_series_ts_cross_validation: + while minimal_seq_length - n_prediction_steps * num_splits <= 0: + num_splits -= 1 + + if num_splits >= 2: + resampling_strategy = CrossValTypes.time_series_cross_validation + if resampling_strategy_args is None: + resampling_strategy_args = {'num_splits': num_splits} + else: + resampling_strategy_args.update({'num_splits': num_splits}) + else: + warnings.warn('The dataset is not suitable for cross validation, we will apply holdout instead') + + resampling_strategy = HoldoutValTypes.time_series_hold_out_validation + resampling_strategy_args = None + else: + seasonality_h_value = int( + np.round((n_prediction_steps // int(freq_value) + 1) * freq_value) + ) + + while minimal_seq_length < (num_splits - 1) * seasonality_h_value: + if num_splits <= 2: + break + num_splits -= 1 + if resampling_strategy_args is None: + resampling_strategy_args = {'num_splits': num_splits} + else: + resampling_strategy_args.update({'num_splits': num_splits}) + + num_seqs = len(sequence_lengths) + + if resampling_strategy_args is not None and "n_repeats" in resampling_strategy_args: + n_repeats = resampling_strategy_args["n_repeats"] + else: + # we want to keep the amount of forecasting instances large enough to generalize well or make full use of + # the information from the training set + # if there are not enough series in the dataset or the minimal length of the sequence is large enough + # to support multiple predictions + if (num_seqs < 100 and minimal_seq_length > 10 * n_prediction_steps) or \ + minimal_seq_length > 50 * n_prediction_steps: + if num_seqs < 100: + n_repeats = int(np.ceil(100.0 / num_seqs)) + else: + n_repeats = int(np.round(minimal_seq_length / (50 * n_prediction_steps))) + else: + n_repeats = 1 + + if resampling_strategy == CrossValTypes.time_series_cross_validation: + n_repeats = min(n_repeats, minimal_seq_length // (5 * n_prediction_steps * num_splits)) + elif resampling_strategy == CrossValTypes.time_series_ts_cross_validation: + seasonality_h_value = int(np.round( + (n_prediction_steps // int(freq_value) + 1) * freq_value) + ) + while minimal_seq_length // 5 < (num_splits - 1) * n_repeats * seasonality_h_value - n_prediction_steps: + n_repeats -= 1 + + elif resampling_strategy == 
+    def create_cross_val_splits(
+        self,
+        cross_val_type: CrossValTypes,
+        num_splits: int,
+        n_repeats: int = 1,
+    ) -> List[Tuple[Union[List[int], np.ndarray], Union[List[int], np.ndarray]]]:
+        """
+        This function creates the cross validation split for the given task.
+
+        It is done once per dataset to have comparable results among pipelines
+
+        Args:
+            cross_val_type (CrossValTypes):
+                cross validation type
+            num_splits (int):
+                number of splits to be created
+            n_repeats (int):
+                how many n_prediction_steps to repeat in the validation set
+
+        Returns:
+            (List[Tuple[Union[List[int], np.ndarray], Union[List[int], np.ndarray]]]):
+                list containing 'num_splits' splits.
+        """
+        # Create just the split once
+        # This is going to be called multiple times, because the current dataset
+        # is being used for multiple pipelines. That is, to be efficient with memory
+        # we dump the dataset to memory and read it on a need basis. So this function
+        # should be robust against multiple calls, and it does so by remembering the splits
+
+        if not isinstance(cross_val_type, CrossValTypes):
+            raise NotImplementedError(f'The selected `cross_val_type` "{cross_val_type}" is not implemented.')
+        idx_start = 0
+
+        kwargs = {"n_prediction_steps": self.n_prediction_steps}
+        if cross_val_type == CrossValTypes.time_series_ts_cross_validation:
+            seasonality_h_value = int(np.round((self.n_prediction_steps // int(self.freq_value) + 1) * self.freq_value))
+            kwargs.update({'seasonality_h_value': seasonality_h_value})
+        kwargs["n_repeats"] = n_repeats
+
+        splits: List[List[Tuple]] = [[() for _ in range(len(self.datasets))] for _ in range(num_splits)]
+
+        for idx_seq, dataset in enumerate(self.datasets):
+            split = self.cross_validators[cross_val_type.name](self.random_state,
+                                                               num_splits,
+                                                               indices=idx_start + np.arange(len(dataset)),
+                                                               **kwargs)
+
+            for idx_split in range(num_splits):
+                splits[idx_split][idx_seq] = split[idx_split]
+            idx_start += self.sequence_lengths_train[idx_seq]
+        # in this case, splits is stored as:
+        # [first split, second split, ...]
+        # first_split = [([0], [1]), ([2], [3])] ...
+        splits_merged = []
+        for i in range(num_splits):
+            split = splits[i]  # type: ignore[assignment]
+            train_indices = np.hstack([sp[0] for sp in split])
+            test_indices = np.hstack([sp[1] for sp in split])
+            splits_merged.append((train_indices, test_indices))
+        return splits_merged  # type: ignore[return-value]
+
+    def create_holdout_val_split(
+        self,
+        holdout_val_type: HoldoutValTypes,
+        val_share: float,
+        n_repeats: int = 1,
+    ) -> Tuple[np.ndarray, np.ndarray]:
+        """
+        This function creates the holdout split for the given task.
+
+        It is done once per dataset to have comparable results among pipelines
+
+        Args:
+            holdout_val_type (HoldoutValTypes):
+                holdout type
+            val_share (float):
+                share of the validation data
+            n_repeats (int):
+                how many n_prediction_steps to repeat in the validation set
+
+        Returns:
+            (Tuple[np.ndarray, np.ndarray]): Tuple containing (train_indices, val_indices)
+        """
+        if holdout_val_type is None:
             raise ValueError(
-                f"The validation targets for {task_type} have to be of shape N."
+                '`val_share` specified, but `holdout_val_type` not specified.'
             )
+
+        if val_share < 0 or val_share > 1:
+            raise ValueError(f"`val_share` must be between 0 and 1, got {val_share}.")
+        if not isinstance(holdout_val_type, HoldoutValTypes):
+            raise NotImplementedError(f'The specified `holdout_val_type` "{holdout_val_type}" is not supported.')
+        kwargs = {"n_prediction_steps": self.n_prediction_steps,
+                  "n_repeats": n_repeats}
+
+        splits = [[() for _ in range(len(self.datasets))] for _ in range(2)]
+        idx_start = 0
+        for idx_seq, dataset in enumerate(self.datasets):
+            split = self.holdout_validators[holdout_val_type.name](self.random_state,
+                                                                   val_share,
+                                                                   indices=np.arange(len(dataset)) + idx_start,
+                                                                   **kwargs)
+            for idx_split in range(2):
+                splits[idx_split][idx_seq] = split[idx_split]  # type: ignore[call-overload]
+            idx_start += self.sequence_lengths_train[idx_seq]
+
+        train_indices = np.hstack([sp for sp in splits[0]])
+        test_indices = np.hstack([sp for sp in splits[1]])
+
+        return train_indices, test_indices
+
+    def create_refit_split(self) -> Tuple[np.ndarray, np.ndarray]:
+        """
+        This function creates the refit split for the given task. All the data in the dataset will be considered as
+        the training set
+
+        Returns:
+            (Tuple[np.ndarray, np.ndarray]): Tuple containing (train_indices, val_indices)
+        """
+        splits = [[() for _ in range(len(self.datasets))] for _ in range(2)]
+        idx_start = 0
+        for idx_seq, dataset in enumerate(self.datasets):
+            split = [np.arange(len(dataset)), np.array([len(dataset) - 1])]
+
+            for idx_split in range(2):
+                splits[idx_split][idx_seq] = idx_start + split[idx_split]
+            idx_start += self.sequence_lengths_train[idx_seq]
+
+        train_indices = np.hstack([sp for sp in splits[0]])
+        test_indices = np.hstack([sp for sp in splits[1]])
+
+        return train_indices, test_indices
+
+    def create_refit_set(self) -> "TimeSeriesForecastingDataset":
+        """create a refit set that allows the network to be trained with the entire training-validation sets"""
+        refit_set: TimeSeriesForecastingDataset = copy.deepcopy(self)
+        refit_set.resampling_strategy = NoResamplingStrategyTypes.no_resampling
+        refit_set.splits = refit_set.get_splits_from_resampling_strategy()
+        return refit_set
+
+    def generate_test_seqs(self) -> List[TimeSeriesSequence]:
+        """
+        A function that generates a set of test series from the information available in this dataset. Calling this
+        function makes use of cached information, such as time features, to reduce computation time.
+
+        Returns:
+            test_sets (List[TimeSeriesSequence]):
+                generated test sets
+        """
+        test_sets = copy.deepcopy(self.datasets)
+        for test_seq in test_sets:
+            test_seq.is_test_set = True
+        return test_sets
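As a usage sketch for the two helpers above (illustrative only; `dataset` stands for an already-constructed TimeSeriesForecastingDataset):

# After the search has finished, retrain on the joint train + validation data,
# then forecast on the cached test sequences.
refit_dataset = dataset.create_refit_set()   # resampling switched to no_resampling
test_seqs = dataset.generate_test_seqs()     # reuses cached time features
assert all(seq.is_test_set for seq in test_seqs)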
diff --git a/autoPyTorch/ensemble/ensemble_builder.py b/autoPyTorch/ensemble/ensemble_builder.py
index 35a281235..662718873 100644
--- a/autoPyTorch/ensemble/ensemble_builder.py
+++ b/autoPyTorch/ensemble/ensemble_builder.py
@@ -66,6 +66,7 @@ def __init__(
         random_state: int,
         logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT,
         pynisher_context: str = 'fork',
+        metrics_kwargs: Dict = {},
     ):
         """ SMAC callback to handle ensemble building
         Args:
@@ -115,6 +116,8 @@ def __init__(
                 port in where to publish a msg
             pynisher_context: str
                 The multiprocessing context for pynisher. One of spawn/fork/forkserver.
+            metrics_kwargs: Optional[Dict]
+                Additional arguments for computing metrics

         Returns:
             List[Tuple[int, float, float, float]]:
@@ -128,6 +131,7 @@ def __init__(
         self.task_type = task_type
         self.output_type = output_type
         self.metrics = metrics
+        self.metrics_kwargs = metrics_kwargs
         self.opt_metric = opt_metric
         self.ensemble_size = ensemble_size
         self.ensemble_nbest = ensemble_nbest
@@ -243,6 +247,7 @@ def build_ensemble(
                 pynisher_context=self.pynisher_context,
                 logger_port=self.logger_port,
                 unit_test=unit_test,
+                metric_kwargs=self.metrics_kwargs,
             ))

             logger.info(
@@ -284,6 +289,7 @@ def fit_and_return_ensemble(
     pynisher_context: str,
     logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT,
     unit_test: bool = False,
+    metric_kwargs: Dict = {}
 ) -> Tuple[
     List[Dict[str, float]],
     int,
@@ -344,6 +350,8 @@ def fit_and_return_ensemble(
         Having this is very bad coding style, but I did not find a way to make
         unittest.mock work through the pynisher with all spawn contexts. If you know a
         better solution, please let us know by opening an issue.
+    metric_kwargs: Dict
+        additional arguments for computing metrics; this is used for time series forecasting computation
     Returns
     -------
         List[Tuple[int, float, float, float]]
@@ -367,6 +375,7 @@ def fit_and_return_ensemble(
         random_state=random_state,
         logger_port=logger_port,
         unit_test=unit_test,
+        metric_kwargs=metric_kwargs,
     ).run(
         end_at=end_at,
         iteration=iteration,
@@ -396,6 +405,7 @@ def __init__(
         random_state: Optional[Union[int, np.random.RandomState]] = None,
         logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT,
         unit_test: bool = False,
+        metric_kwargs: Dict = {}
     ):
         """
             Constructor
@@ -449,6 +459,8 @@ def __init__(
                 Having this is very bad coding style, but I did not find a way to make
                 unittest.mock work through the pynisher with all spawn contexts. If you know a
                 better solution, please let us know by opening an issue.
+            metric_kwargs: Dict
+                additional arguments for computing metrics; this is used for time series forecasting computation
         """

         super(EnsembleBuilder, self).__init__()
@@ -458,6 +470,7 @@ def __init__(
         self.task_type = task_type
         self.output_type = output_type
         self.metrics = metrics
+        self.metric_kwargs = metric_kwargs
         self.opt_metric = opt_metric
         self.ensemble_size = ensemble_size
         self.performance_range_threshold = performance_range_threshold
@@ -971,6 +984,7 @@ def compute_loss_per_model(self) -> bool:
                     target=self.y_true_ensemble,
                     prediction=y_ensemble,
                     task_type=self.task_type,
+                    **self.metric_kwargs
                 )

                 if np.isfinite(self.read_losses[y_ens_fn]["ens_loss"]):
@@ -1284,6 +1298,7 @@ def fit_ensemble(self, selected_keys: List[str]) -> Optional[EnsembleSelection]:
             metric=opt_metric,
             random_state=self.random_state,
             task_type=self.task_type,
+            metric_kwargs=self.metric_kwargs
         )

         try:
@@ -1401,6 +1416,7 @@ def _add_ensemble_trajectory(self, train_pred: np.ndarray, test_pred: np.ndarray
             target=self.y_true_ensemble,
             prediction=train_pred,
             task_type=self.task_type,
+            **self.metric_kwargs
         )
         performance_stamp.update({'train_' + str(key): val for key, val in train_scores.items()})
         if self.y_test is not None:
@@ -1409,6 +1425,7 @@ def _add_ensemble_trajectory(self, train_pred: np.ndarray, test_pred: np.ndarray
                 target=self.y_test,
                 prediction=test_pred,
                 task_type=self.task_type,
+                **self.metric_kwargs
             )
             performance_stamp.update(
                 {'test_' + str(key): val for key, val in test_scores.items()})
diff --git a/autoPyTorch/ensemble/ensemble_selection.py b/autoPyTorch/ensemble/ensemble_selection.py
index c296b14f4..6843e28f6 100644
--- a/autoPyTorch/ensemble/ensemble_selection.py
+++ b/autoPyTorch/ensemble/ensemble_selection.py
@@ -16,9 +16,11 @@ def __init__(
         metric: autoPyTorchMetric,
         task_type: int,
         random_state: np.random.RandomState,
+        metric_kwargs: Dict = {},
     ) -> None:
         self.ensemble_size = ensemble_size
         self.metric = metric
+        self.metric_kwargs = metric_kwargs
         self.random_state = random_state
         self.task_type = task_type

@@ -137,6 +139,7 @@ def _fit(
                     target=labels,
                     prediction=fant_ensemble_prediction,
                     task_type=self.task_type,
+                    **self.metric_kwargs
                 )[self.metric.name]

             all_best = np.argwhere(losses == np.nanmin(losses)).flatten()
diff --git a/autoPyTorch/ensemble/singlebest_ensemble.py b/autoPyTorch/ensemble/singlebest_ensemble.py
index 881ae5fd2..9fcbeee82 100644
--- a/autoPyTorch/ensemble/singlebest_ensemble.py
+++ b/autoPyTorch/ensemble/singlebest_ensemble.py
@@ -25,8 +25,11 @@ def __init__(
         run_history: RunHistory,
         seed: int,
         backend: Backend,
+        metric_kwargs: Dict = {},
+
     ):
         self.metric = metric
+        self.metric_kwargs = metric_kwargs
         self.seed = seed
         self.backend = backend

diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py
index 027c7211a..d20a96b75 100644
--- a/autoPyTorch/evaluation/abstract_evaluator.py
+++ b/autoPyTorch/evaluation/abstract_evaluator.py
@@ -19,28 +19,45 @@
 import autoPyTorch.pipeline.image_classification
 import autoPyTorch.pipeline.tabular_classification
 import autoPyTorch.pipeline.tabular_regression
+try:
+    import autoPyTorch.pipeline.time_series_forecasting
+    forecasting_dependencies_installed = True
+except ModuleNotFoundError:
+    forecasting_dependencies_installed = False
 import autoPyTorch.pipeline.traditional_tabular_classification
 import autoPyTorch.pipeline.traditional_tabular_regression
 from autoPyTorch.automl_common.common.utils.backend import Backend
 from autoPyTorch.constants import (
     CLASSIFICATION_TASKS,
+
FORECASTING_BUDGET_TYPE, + FORECASTING_TASKS, + ForecastingDependenciesNotInstalledMSG, IMAGE_TASKS, MULTICLASS, REGRESSION_TASKS, STRING_TO_OUTPUT_TYPES, STRING_TO_TASK_TYPES, - TABULAR_TASKS, + TABULAR_TASKS +) +from autoPyTorch.datasets.base_dataset import ( + BaseDataset, + BaseDatasetPropertiesType ) -from autoPyTorch.datasets.base_dataset import BaseDataset, BaseDatasetPropertiesType from autoPyTorch.evaluation.utils import ( + DisableFileOutputParameters, VotingRegressorWrapper, convert_multioutput_multiclass_to_multilabel ) +try: + from autoPyTorch.evaluation.utils_extra import DummyTimeSeriesForecastingPipeline + forecasting_dependencies_installed = True +except ModuleNotFoundError: + forecasting_dependencies_installed = False from autoPyTorch.pipeline.base_pipeline import BasePipeline from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric from autoPyTorch.pipeline.components.training.metrics.utils import ( calculate_loss, - get_metrics, + get_metrics ) from autoPyTorch.utils.common import dict_repr, subsampler from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates @@ -375,10 +392,25 @@ class AbstractEvaluator(object): An optional dictionary to include components of the pipeline steps. exclude (Optional[Dict[str, Any]]): An optional dictionary to exclude components of the pipeline steps. - disable_file_output (Union[bool, List[str]]): - By default, the model, it's predictions and other metadata is stored on disk - for each finished configuration. This argument allows the user to skip - saving certain file type, for example the model, from being written to disk. + disable_file_output (Optional[List[Union[str, DisableFileOutputParameters]]]): + Used as a list to pass more fine-grained + information on what to save. Must be a member of `DisableFileOutputParameters`. + Allowed elements in the list are: + + + `y_optimization`: + do not save the predictions for the optimization set, + which would later on be used to build an ensemble. Note that SMAC + optimizes a metric evaluated on the optimization set. + + `pipeline`: + do not save any individual pipeline files + + `pipelines`: + In case of cross validation, disables saving the joint model of the + pipelines fit on each fold. + + `y_test`: + do not save the predictions for the test set. + + `all`: + do not save any of the above. + For more information check `autoPyTorch.evaluation.utils.DisableFileOutputParameters`. init_params (Optional[Dict[str, Any]]): Optional argument that is passed to each pipeline step. It is the equivalent of kwargs for the pipeline steps. 
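To illustrate the fine-grained interface documented above (a sketch; it assumes the enum members are exposed under the names listed, as this diff suggests):

from autoPyTorch.evaluation.utils import DisableFileOutputParameters

# keep the ensemble targets, but skip pipeline dumps and test predictions
disable_file_output = [
    DisableFileOutputParameters.pipeline,
    DisableFileOutputParameters.pipelines,
    DisableFileOutputParameters.y_test,
]
# the evaluator calls this on construction; unknown entries are rejected
DisableFileOutputParameters.check_compatibility(disable_file_output)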
@@ -404,7 +436,7 @@ def __init__(self, backend: Backend,
                  num_run: Optional[int] = None,
                  include: Optional[Dict[str, Any]] = None,
                  exclude: Optional[Dict[str, Any]] = None,
-                 disable_file_output: Union[bool, List[str]] = False,
+                 disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None,
                  init_params: Optional[Dict[str, Any]] = None,
                  logger_port: Optional[int] = None,
                  all_supported_metrics: bool = True,
@@ -417,43 +449,24 @@ def __init__(self, backend: Backend,
         self.backend: Backend = backend
         self.queue = queue

-        self.datamanager: BaseDataset = self.backend.load_datamanager()
-
-        assert self.datamanager.task_type is not None, \
-            "Expected dataset {} to have task_type got None".format(self.datamanager.__class__.__name__)
-        self.task_type = STRING_TO_TASK_TYPES[self.datamanager.task_type]
-        self.output_type = STRING_TO_OUTPUT_TYPES[self.datamanager.output_type]
-        self.issparse = self.datamanager.issparse
-
         self.include = include
         self.exclude = exclude
         self.search_space_updates = search_space_updates

-        self.X_train, self.y_train = self.datamanager.train_tensors
-
-        if self.datamanager.val_tensors is not None:
-            self.X_valid, self.y_valid = self.datamanager.val_tensors
-        else:
-            self.X_valid, self.y_valid = None, None
-
-        if self.datamanager.test_tensors is not None:
-            self.X_test, self.y_test = self.datamanager.test_tensors
-        else:
-            self.X_test, self.y_test = None, None
-
         self.metric = metric

         self.seed = seed

+        self._init_datamanager_info()
+
         # Flag to save target for ensemble
         self.output_y_hat_optimization = output_y_hat_optimization

-        if isinstance(disable_file_output, bool):
-            self.disable_file_output: bool = disable_file_output
-        elif isinstance(disable_file_output, List):
-            self.disabled_file_outputs: List[str] = disable_file_output
-        else:
-            raise ValueError('disable_file_output should be either a bool or a list')
+        disable_file_output = disable_file_output if disable_file_output is not None else []
+        # check compatibility of disable file output
+        DisableFileOutputParameters.check_compatibility(disable_file_output)
+
+        self.disable_file_output = disable_file_output

         self.pipeline_class: Optional[Union[BaseEstimator, BasePipeline]] = None
         if self.task_type in REGRESSION_TASKS:
@@ -466,7 +479,7 @@ def __init__(self, backend: Backend,
             else:
                 raise ValueError('task {} not available'.format(self.task_type))
             self.predict_function = self._predict_regression
-        else:
+        elif self.task_type in CLASSIFICATION_TASKS:
             if isinstance(self.configuration, int):
                 self.pipeline_class = DummyClassificationPipeline
             elif isinstance(self.configuration, str):
@@ -482,12 +495,19 @@ def __init__(self, backend: Backend,
             else:
                 raise ValueError('task {} not available'.format(self.task_type))
             self.predict_function = self._predict_proba
-        self.dataset_properties = self.datamanager.get_dataset_properties(
-            get_dataset_requirements(info=self.datamanager.get_required_dataset_info(),
-                                     include=self.include,
-                                     exclude=self.exclude,
-                                     search_space_updates=self.search_space_updates
-                                     ))
+        elif self.task_type in FORECASTING_TASKS:
+            if isinstance(self.configuration, int):
+                if not forecasting_dependencies_installed:
+                    raise ModuleNotFoundError(ForecastingDependenciesNotInstalledMSG)
+                self.pipeline_class = DummyTimeSeriesForecastingPipeline
+            elif isinstance(self.configuration, str):
+                raise ValueError("Only tabular classification tasks "
+                                 "are currently supported with traditional methods")
+            elif isinstance(self.configuration, Configuration):
+                self.pipeline_class = autoPyTorch.pipeline.time_series_forecasting.TimeSeriesForecastingPipeline
+            else:
+                raise ValueError('task {} not available'.format(self.task_type))
+            self.predict_function = self._predict_regression
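In condensed form, the dispatch above now resolves as follows (an editorial summary of the hunk, omitting the dummy and traditional branches):

# task_type in REGRESSION_TASKS     -> tabular_regression pipeline,      predict via _predict_regression
# task_type in CLASSIFICATION_TASKS -> tabular_classification pipeline,  predict via _predict_proba
# task_type in FORECASTING_TASKS    -> time_series_forecasting pipeline, predict via _predict_regression
#                                      (forecasts are real-valued, so the regression predict path is reused)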

         self.additional_metrics: Optional[List[autoPyTorchMetric]] = None
         metrics_dict: Optional[Dict[str, List[str]]] = None
@@ -527,6 +547,53 @@ def __init__(self, backend: Backend,
         self.logger.debug("Fit dictionary in Abstract evaluator: {}".format(dict_repr(self.fit_dictionary)))
         self.logger.debug("Search space updates :{}".format(self.search_space_updates))

+    def _init_datamanager_info(
+        self,
+    ) -> None:
+        """
+        Initialises instance attributes that come from the datamanager.
+        For example,
+        X_train, y_train, etc.
+        """
+
+        datamanager: BaseDataset = self.backend.load_datamanager()
+
+        assert datamanager.task_type is not None, \
+            "Expected dataset {} to have task_type got None".format(datamanager.__class__.__name__)
+        self.task_type = STRING_TO_TASK_TYPES[datamanager.task_type]
+        self.output_type = STRING_TO_OUTPUT_TYPES[datamanager.output_type]
+        self.issparse = datamanager.issparse
+
+        self.X_train, self.y_train = datamanager.train_tensors
+
+        if datamanager.val_tensors is not None:
+            self.X_valid, self.y_valid = datamanager.val_tensors
+        else:
+            self.X_valid, self.y_valid = None, None
+
+        if datamanager.test_tensors is not None:
+            self.X_test, self.y_test = datamanager.test_tensors
+        else:
+            self.X_test, self.y_test = None, None
+
+        self.resampling_strategy = datamanager.resampling_strategy
+
+        self.num_classes: Optional[int] = getattr(datamanager, "num_classes", None)
+
+        self.dataset_properties = datamanager.get_dataset_properties(
+            get_dataset_requirements(info=datamanager.get_required_dataset_info(),
+                                     include=self.include,
+                                     exclude=self.exclude,
+                                     search_space_updates=self.search_space_updates
+                                     ))
+        self.splits = datamanager.splits
+        if self.splits is None:
+            raise AttributeError(f"create_splits on {datamanager.__class__.__name__} must be called "
+                                 f"before the instantiation of {self.__class__.__name__}")
+
+        # delete datamanager from memory
+        del datamanager
+
     def _init_fit_dictionary(
         self,
         logger_port: int,
@@ -575,8 +642,21 @@ def _init_fit_dictionary(
         elif self.budget_type == 'runtime':
             self.fit_dictionary['runtime'] = self.budget
             self.fit_dictionary.pop('epochs', None)
+        elif self.budget_type == 'resolution' and self.task_type in FORECASTING_TASKS:
+            self.fit_dictionary['sample_interval'] = int(np.ceil(1.0 / self.budget))
+            self.fit_dictionary.pop('epochs', None)
+            self.fit_dictionary.pop('runtime', None)
+        elif self.budget_type == 'num_seq':
+            self.fit_dictionary['fraction_seq'] = self.budget
+            self.fit_dictionary.pop('epochs', None)
+            self.fit_dictionary.pop('runtime', None)
+        elif self.budget_type == 'num_sample_per_seq':
+            self.fit_dictionary['fraction_samples_per_seq'] = self.budget
+            self.fit_dictionary.pop('epochs', None)
+            self.fit_dictionary.pop('runtime', None)
         else:
-            raise ValueError(f"budget type must be `epochs` or `runtime`, but got {self.budget_type}")
+            raise ValueError(f"budget type must be `epochs` or `runtime` or {FORECASTING_BUDGET_TYPE} "
+                             f"(only used by forecasting tasks), but got {self.budget_type}")

     def _get_pipeline(self) -> BaseEstimator:
         """
@@ -618,7 +698,7 @@
         raise ValueError("Invalid configuration entered")
         return pipeline

-    def _loss(self, y_true: np.ndarray, y_hat: np.ndarray) -> Dict[str, float]:
+    def _loss(self, y_true: np.ndarray, y_hat: np.ndarray, **metric_kwargs: Any) -> Dict[str, float]:
"""SMAC follows a minimization goal, so the make_scorer sign is used as a guide to obtain the value to reduce. The calculate_loss internally translate a score function to @@ -643,14 +723,13 @@ def _loss(self, y_true: np.ndarray, y_hat: np.ndarray) -> Dict[str, float]: metrics = self.additional_metrics else: metrics = [self.metric] - return calculate_loss( - y_true, y_hat, self.task_type, metrics) + y_true, y_hat, self.task_type, metrics, **metric_kwargs) def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float], opt_pred: np.ndarray, valid_pred: Optional[np.ndarray], test_pred: Optional[np.ndarray], additional_run_info: Optional[Dict], - file_output: bool, status: StatusType + file_output: bool, status: StatusType, **metric_kwargs: Any ) -> Optional[Tuple[float, float, int, Dict]]: """This function does everything necessary after the fitting is done: @@ -680,6 +759,8 @@ def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float], Whether or not this pipeline should output information to disk status (StatusType) The status of the run, following SMAC StatusType syntax. + metric_kwargs (Any) + Additional arguments for computing metrics Returns: duration (float): @@ -703,7 +784,7 @@ def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float], additional_run_info_ = {} validation_loss, test_loss = self.calculate_auxiliary_losses( - valid_pred, test_pred + valid_pred, test_pred, **metric_kwargs ) if loss_ is not None: @@ -735,6 +816,7 @@ def calculate_auxiliary_losses( self, Y_valid_pred: np.ndarray, Y_test_pred: np.ndarray, + **metric_kwargs: Any ) -> Tuple[Optional[Dict[str, float]], Optional[Dict[str, float]]]: """ A helper function to calculate the performance estimate of the @@ -747,6 +829,8 @@ def calculate_auxiliary_losses( Y_test_pred (np.ndarray): predictions on a test set provided by the user, matching self.y_test + metric_kwargs (Any) + additional argument for evaluating the loss metric Returns: validation_loss_dict (Optional[Dict[str, float]]): @@ -759,12 +843,12 @@ def calculate_auxiliary_losses( if Y_valid_pred is not None: if self.y_valid is not None: - validation_loss_dict = self._loss(self.y_valid, Y_valid_pred) + validation_loss_dict = self._loss(self.y_valid, Y_valid_pred, **metric_kwargs) test_loss_dict: Optional[Dict[str, float]] = None if Y_test_pred is not None: if self.y_test is not None: - test_loss_dict = self._loss(self.y_test, Y_test_pred) + test_loss_dict = self._loss(self.y_test, Y_test_pred, **metric_kwargs) return validation_loss_dict, test_loss_dict @@ -834,20 +918,17 @@ def file_output( ) # Abort if we don't want to output anything. 
-        if hasattr(self, 'disable_file_output'):
-            if self.disable_file_output:
-                return None, {}
-            else:
-                self.disabled_file_outputs = []
+        if 'all' in self.disable_file_output:
+            return None, {}

         # This file can be written independently of the others down bellow
-        if 'y_optimization' not in self.disabled_file_outputs:
+        if 'y_optimization' not in self.disable_file_output:
             if self.output_y_hat_optimization:
                 self.backend.save_targets_ensemble(self.Y_optimization)

-        if hasattr(self, 'pipelines') and self.pipelines is not None:
-            if self.pipelines[0] is not None and len(self.pipelines) > 0:
-                if 'pipelines' not in self.disabled_file_outputs:
+        if getattr(self, 'pipelines', None) is not None:
+            if len(self.pipelines) > 0 and self.pipelines[0] is not None:  # type: ignore[index, arg-type]
+                if 'pipelines' not in self.disable_file_output:
                     if self.task_type in CLASSIFICATION_TASKS:
                         pipelines = VotingClassifier(estimators=None, voting='soft', )
                     else:
@@ -860,8 +941,8 @@
         else:
             pipelines = None

-        if hasattr(self, 'pipeline') and self.pipeline is not None:
-            if 'pipeline' not in self.disabled_file_outputs:
+        if getattr(self, 'pipeline', None) is not None:
+            if 'pipeline' not in self.disable_file_output:
                 pipeline = self.pipeline
         else:
             pipeline = None
@@ -877,15 +958,15 @@
             cv_model=pipelines,
             ensemble_predictions=(
                 Y_optimization_pred if 'y_optimization' not in
-                self.disabled_file_outputs else None
+                self.disable_file_output else None
             ),
             valid_predictions=(
                 Y_valid_pred if 'y_valid' not in
-                self.disabled_file_outputs else None
+                self.disable_file_output else None
             ),
             test_predictions=(
                 Y_test_pred if 'y_test' not in
-                self.disabled_file_outputs else None
+                self.disable_file_output else None
             ),
         )

@@ -976,21 +1057,20 @@ def _ensure_prediction_array_sizes(self, prediction: np.ndarray,
             (np.ndarray):
                 The formatted prediction
         """
-        assert self.datamanager.num_classes is not None, "Called function on wrong task"
-        num_classes: int = self.datamanager.num_classes
+        assert self.num_classes is not None, "Called function on wrong task"

         if self.output_type == MULTICLASS and \
-                prediction.shape[1] < num_classes:
+                prediction.shape[1] < self.num_classes:
             if Y_train is None:
                 raise ValueError('Y_train must not be None!')
             classes = list(np.unique(Y_train))

             mapping = dict()
-            for class_number in range(num_classes):
+            for class_number in range(self.num_classes):
                 if class_number in classes:
                     index = classes.index(class_number)
                     mapping[index] = class_number
-            new_predictions = np.zeros((prediction.shape[0], num_classes),
+            new_predictions = np.zeros((prediction.shape[0], self.num_classes),
                                        dtype=np.float32)

             for index in mapping:
diff --git a/autoPyTorch/evaluation/tae.py b/autoPyTorch/evaluation/tae.py
index d99251d3d..b1650113c 100644
--- a/autoPyTorch/evaluation/tae.py
+++ b/autoPyTorch/evaluation/tae.py
@@ -22,9 +22,31 @@
 from smac.tae import StatusType, TAEAbortException
 from smac.tae.execute_func import AbstractTAFunc

-import autoPyTorch.evaluation.train_evaluator
 from autoPyTorch.automl_common.common.utils.backend import Backend
-from autoPyTorch.evaluation.utils import empty_queue, extract_learning_curve, read_queue
+from autoPyTorch.constants import (
+    FORECASTING_BUDGET_TYPE,
+    ForecastingDependenciesNotInstalledMSG,
+    STRING_TO_TASK_TYPES,
+    TIMESERIES_FORECASTING,
+)
+from autoPyTorch.datasets.resampling_strategy import (
+    CrossValTypes,
+    HoldoutValTypes,
+    NoResamplingStrategyTypes
+)
+from autoPyTorch.evaluation.test_evaluator import eval_test_function
+try:
+    from
autoPyTorch.evaluation.time_series_forecasting_train_evaluator import forecasting_eval_train_function + forecasting_dependencies_installed = True +except ModuleNotFoundError: + forecasting_dependencies_installed = False +from autoPyTorch.evaluation.train_evaluator import eval_train_function +from autoPyTorch.evaluation.utils import ( + DisableFileOutputParameters, + empty_queue, + extract_learning_curve, + read_queue +) from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric from autoPyTorch.utils.common import dict_repr, replace_string_bool_to_bool from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates @@ -100,6 +122,7 @@ def __init__( cost_for_crash: float, abort_on_first_run_crash: bool, pynisher_context: str, + multi_objectives: List[str], pipeline_config: Optional[Dict[str, Any]] = None, initial_num_run: int = 1, stats: Optional[Stats] = None, @@ -109,16 +132,45 @@ def __init__( include: Optional[Dict[str, Any]] = None, exclude: Optional[Dict[str, Any]] = None, memory_limit: Optional[int] = None, - disable_file_output: bool = False, + disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, init_params: Dict[str, Any] = None, budget_type: str = None, ta: Optional[Callable] = None, logger_port: int = None, all_supported_metrics: bool = True, - search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, ): - eval_function = autoPyTorch.evaluation.train_evaluator.eval_function + self.backend = backend + + dm = self.backend.load_datamanager() + if dm.val_tensors is not None: + self._get_validation_loss = True + else: + self._get_validation_loss = False + if dm.test_tensors is not None: + self._get_test_loss = True + else: + self._get_test_loss = False + + self.resampling_strategy = dm.resampling_strategy + self.resampling_strategy_args = dm.resampling_strategy_args + + if STRING_TO_TASK_TYPES.get(dm.task_type, -1) == TIMESERIES_FORECASTING: + if not forecasting_dependencies_installed: + raise ModuleNotFoundError(ForecastingDependenciesNotInstalledMSG) + eval_function: Callable = forecasting_eval_train_function + if isinstance(self.resampling_strategy, (HoldoutValTypes, CrossValTypes)): + self.output_y_hat_optimization = output_y_hat_optimization + elif isinstance(self.resampling_strategy, NoResamplingStrategyTypes): + self.output_y_hat_optimization = False + else: + if isinstance(self.resampling_strategy, (HoldoutValTypes, CrossValTypes)): + eval_function = eval_train_function + self.output_y_hat_optimization = output_y_hat_optimization + elif isinstance(self.resampling_strategy, NoResamplingStrategyTypes): + eval_function = eval_test_function + self.output_y_hat_optimization = False self.worst_possible_result = cost_for_crash @@ -137,12 +189,10 @@ def __init__( abort_on_first_run_crash=abort_on_first_run_crash, ) - self.backend = backend self.pynisher_context = pynisher_context self.seed = seed self.initial_num_run = initial_num_run self.metric = metric - self.output_y_hat_optimization = output_y_hat_optimization self.include = include self.exclude = exclude self.disable_file_output = disable_file_output @@ -170,20 +220,28 @@ def __init__( memory_limit = int(math.ceil(memory_limit)) self.memory_limit = memory_limit - dm = self.backend.load_datamanager() - if dm.val_tensors is not None: - self._get_validation_loss = True - else: - self._get_validation_loss = False - if dm.test_tensors is not None: - 
self._get_test_loss = True - else: - self._get_test_loss = False + self.search_space_updates = search_space_updates - self.resampling_strategy = dm.resampling_strategy - self.resampling_strategy_args = dm.resampling_strategy_args + def _check_and_get_default_budget(self) -> float: + budget_type_choices_tabular = ('epochs', 'runtime') + budget_choices = { + budget_type: float(self.pipeline_config.get(budget_type, np.inf)) + for budget_type in budget_type_choices_tabular + } - self.search_space_updates = search_space_updates + budget_choices_forecasting = {budget_type: 1.0 for budget_type in FORECASTING_BUDGET_TYPE} + budget_choices.update(budget_choices_forecasting) + budget_type_choices = budget_type_choices_tabular + FORECASTING_BUDGET_TYPE + + # budget is defined by epochs by default + budget_type = str(self.pipeline_config.get('budget_type', 'epochs')) + if self.budget_type is not None: + budget_type = self.budget_type + + if budget_type not in budget_type_choices: + raise ValueError(f"budget type must be in {budget_type_choices}, but got {budget_type}") + else: + return budget_choices[budget_type] def run_wrapper( self, @@ -202,26 +260,19 @@ def run_wrapper( RunValue: Contains information about the status/performance of config """ - if self.budget_type is None: - if run_info.budget != 0: - raise ValueError( - 'If budget_type is None, budget must be.0, but is %f' % run_info.budget - ) - else: - if run_info.budget == 0: - # SMAC can return budget zero for intensifiers that don't have a concept - # of budget, for example a simple bayesian optimization intensifier. - # Budget determines how our pipeline trains, which can be via runtime or epochs - epochs_budget = self.pipeline_config.get('epochs', np.inf) - runtime_budget = self.pipeline_config.get('runtime', np.inf) - run_info = run_info._replace(budget=min(epochs_budget, runtime_budget)) - elif run_info.budget <= 0: - raise ValueError('Illegal value for budget, must be greater than zero but is %f' % - run_info.budget) - if self.budget_type not in ('epochs', 'runtime'): - raise ValueError("Illegal value for budget type, must be one of " - "('epochs', 'runtime'), but is : %s" % - self.budget_type) + # SMAC returns non-zero budget for intensification + # In other words, SMAC returns budget=0 for a simple intensifier (i.e. 
no intensification)
+        is_intensified = (run_info.budget != 0)
+        default_budget = self._check_and_get_default_budget()
+
+        if self.budget_type is None and is_intensified:
+            raise ValueError(f'budget must be 0 (=no intensification) for budget_type=None, but got {run_info.budget}')
+        if self.budget_type is not None and run_info.budget < 0:
+            raise ValueError(f'budget must be greater than zero but got {run_info.budget}')
+
+        if self.budget_type is not None and not is_intensified:
+            # The budget will be provided in train evaluator when budget_type is None
+            run_info = run_info._replace(budget=default_budget)

         remaining_time = self.stats.get_remaing_time_budget()
@@ -245,6 +296,10 @@
         self.logger.info("Starting to evaluate configuration %s" % run_info.config.config_id)
         run_info, run_value = super().run_wrapper(run_info=run_info)
+
+        if not is_intensified:  # This is required for SMAC compatibility
+            run_info = run_info._replace(budget=0.0)
+
         return run_info, run_value

     def run(
diff --git a/autoPyTorch/evaluation/test_evaluator.py b/autoPyTorch/evaluation/test_evaluator.py
new file mode 100644
index 000000000..4d5b0ae91
--- /dev/null
+++ b/autoPyTorch/evaluation/test_evaluator.py
@@ -0,0 +1,236 @@
+from multiprocessing.queues import Queue
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+from ConfigSpace.configuration_space import Configuration
+
+import numpy as np
+
+from smac.tae import StatusType
+
+from autoPyTorch.automl_common.common.utils.backend import Backend
+from autoPyTorch.datasets.resampling_strategy import NoResamplingStrategyTypes
+from autoPyTorch.evaluation.abstract_evaluator import (
+    AbstractEvaluator,
+    fit_and_suppress_warnings
+)
+from autoPyTorch.evaluation.utils import DisableFileOutputParameters
+from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric
+from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates
+
+
+__all__ = [
+    'eval_test_function',
+    'TestEvaluator'
+]
+
+
+class TestEvaluator(AbstractEvaluator):
+    """
+    This class builds a pipeline using the provided configuration.
+    A pipeline implementing the provided configuration is fitted
+    using the datamanager object retrieved from disc, via the backend.
+    After the pipeline is fitted, it is saved to disc and the performance estimate
+    is communicated to the main process via a Queue. It is only compatible
+    with `NoResamplingStrategyTypes`, i.e., when the training data
+    is not split and the test set is used for SMBO optimisation. It cannot
+    be used for building ensembles, which is ensured by having
+    `output_y_hat_optimization=False`
+
+    Attributes:
+        backend (Backend):
+            An object to interface with the disk storage. In particular, allows to
+            access the train and test datasets
+        queue (Queue):
+            Each worker available will instantiate an evaluator, and after completion,
+            it will return the evaluation result via a multiprocessing queue
+        metric (autoPyTorchMetric):
+            A scorer object that is able to evaluate how good a pipeline was fit. It
+            is a wrapper on top of the actual score method (a wrapper on top of
+            scikit-learn accuracy, for example) that formats the predictions accordingly.
+        budget: (float):
+            The amount of epochs/time a configuration is allowed to run.
+        budget_type (str):
+            The budget type, which can be epochs or time
+        pipeline_config (Optional[Dict[str, Any]]):
+            Defines the content of the pipeline being evaluated. For example, it
For example, it + contains pipeline specific settings like logging name, or whether or not + to use tensorboard. + configuration (Union[int, str, Configuration]): + Determines the pipeline to be constructed. A dummy estimator is created for + integer configurations, a traditional machine learning pipeline is created + for string based configuration, and NAS is performed when a configuration + object is passed. + seed (int): + A integer that allows for reproducibility of results + output_y_hat_optimization (bool): + Whether this worker should output the target predictions, so that they are + stored on disk. Fundamentally, the resampling strategy might shuffle the + Y_train targets, so we store the split in order to re-use them for ensemble + selection. + num_run (Optional[int]): + An identifier of the current configuration being fit. This number is unique per + configuration. + include (Optional[Dict[str, Any]]): + An optional dictionary to include components of the pipeline steps. + exclude (Optional[Dict[str, Any]]): + An optional dictionary to exclude components of the pipeline steps. + disable_file_output (Optional[List[Union[str, DisableFileOutputParameters]]]): + Used as a list to pass more fine-grained + information on what to save. Must be a member of `DisableFileOutputParameters`. + Allowed elements in the list are: + + + `y_optimization`: + do not save the predictions for the optimization set, + which would later on be used to build an ensemble. Note that SMAC + optimizes a metric evaluated on the optimization set. + + `pipeline`: + do not save any individual pipeline files + + `pipelines`: + In case of cross validation, disables saving the joint model of the + pipelines fit on each fold. + + `y_test`: + do not save the predictions for the test set. + + `all`: + do not save any of the above. + For more information check `autoPyTorch.evaluation.utils.DisableFileOutputParameters`. + init_params (Optional[Dict[str, Any]]): + Optional argument that is passed to each pipeline step. It is the equivalent of + kwargs for the pipeline steps. + logger_port (Optional[int]): + Logging is performed using a socket-server scheme to be robust against many + parallel entities that want to write to the same file. This integer states the + socket port for the communication channel. If None is provided, a traditional + logger is used. + all_supported_metrics (bool): + Whether all supported metric should be calculated for every configuration. 
+ search_space_updates (Optional[HyperparameterSearchSpaceUpdates]): + An object used to fine tune the hyperparameter search space of the pipeline + """ + def __init__( + self, + backend: Backend, queue: Queue, + metric: autoPyTorchMetric, + budget: float, + configuration: Union[int, str, Configuration], + budget_type: str = None, + pipeline_config: Optional[Dict[str, Any]] = None, + seed: int = 1, + output_y_hat_optimization: bool = False, + num_run: Optional[int] = None, + include: Optional[Dict[str, Any]] = None, + exclude: Optional[Dict[str, Any]] = None, + disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, + init_params: Optional[Dict[str, Any]] = None, + logger_port: Optional[int] = None, + all_supported_metrics: bool = True, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None + ) -> None: + super().__init__( + backend=backend, + queue=queue, + configuration=configuration, + metric=metric, + seed=seed, + output_y_hat_optimization=output_y_hat_optimization, + num_run=num_run, + include=include, + exclude=exclude, + disable_file_output=disable_file_output, + init_params=init_params, + budget=budget, + budget_type=budget_type, + logger_port=logger_port, + all_supported_metrics=all_supported_metrics, + pipeline_config=pipeline_config, + search_space_updates=search_space_updates + ) + + if not isinstance(self.resampling_strategy, (NoResamplingStrategyTypes)): + raise ValueError( + f'resampling_strategy for TestEvaluator must be in ' + f'NoResamplingStrategyTypes, but got {self.resampling_strategy}' + ) + + def fit_predict_and_loss(self) -> None: + + split_id = 0 + train_indices, test_indices = self.splits[split_id] + + self.pipeline = self._get_pipeline() + X = {'train_indices': train_indices, + 'val_indices': test_indices, + 'split_id': split_id, + 'num_run': self.num_run, + **self.fit_dictionary} # fit dictionary + y = None + fit_and_suppress_warnings(self.logger, self.pipeline, X, y) + train_loss, _ = self.predict_and_loss(train=True) + test_loss, test_pred = self.predict_and_loss() + self.Y_optimization = self.y_test + self.finish_up( + loss=test_loss, + train_loss=train_loss, + opt_pred=test_pred, + valid_pred=None, + test_pred=test_pred, + file_output=True, + additional_run_info=None, + status=StatusType.SUCCESS, + ) + + def predict_and_loss( + self, train: bool = False + ) -> Tuple[Dict[str, float], np.ndarray]: + labels = self.y_train if train else self.y_test + feats = self.X_train if train else self.X_test + preds = self.predict_function( + X=feats, + pipeline=self.pipeline, + Y_train=self.y_train # Need this as we need to know all the classes in train splits + ) + loss_dict = self._loss(labels, preds) + + return loss_dict, preds + + +# create closure for evaluating an algorithm +def eval_test_function( + backend: Backend, + queue: Queue, + metric: autoPyTorchMetric, + budget: float, + config: Optional[Configuration], + seed: int, + output_y_hat_optimization: bool, + num_run: int, + include: Optional[Dict[str, Any]], + exclude: Optional[Dict[str, Any]], + disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, + pipeline_config: Optional[Dict[str, Any]] = None, + budget_type: str = None, + init_params: Optional[Dict[str, Any]] = None, + logger_port: Optional[int] = None, + all_supported_metrics: bool = True, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, + instance: str = None, +) -> None: + evaluator = TestEvaluator( + backend=backend, + queue=queue, + metric=metric, 
+        configuration=config,
+        seed=seed,
+        num_run=num_run,
+        output_y_hat_optimization=output_y_hat_optimization,
+        include=include,
+        exclude=exclude,
+        disable_file_output=disable_file_output,
+        init_params=init_params,
+        budget=budget,
+        budget_type=budget_type,
+        logger_port=logger_port,
+        all_supported_metrics=all_supported_metrics,
+        pipeline_config=pipeline_config,
+        search_space_updates=search_space_updates)
+
+    evaluator.fit_predict_and_loss()
diff --git a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py
new file mode 100644
index 000000000..0940d1e9a
--- /dev/null
+++ b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py
@@ -0,0 +1,558 @@
+import copy
+import warnings
+from multiprocessing.queues import Queue
+from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
+
+from ConfigSpace.configuration_space import Configuration
+
+import numpy as np
+
+from sklearn.base import BaseEstimator
+
+from smac.tae import StatusType
+
+from autoPyTorch.automl_common.common.utils.backend import Backend
+from autoPyTorch.constants import SEASONALITY_MAP
+from autoPyTorch.evaluation.train_evaluator import TrainEvaluator
+from autoPyTorch.evaluation.utils import DisableFileOutputParameters
+from autoPyTorch.evaluation.utils_extra import DummyTimeSeriesForecastingPipeline
+from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric
+from autoPyTorch.pipeline.components.training.metrics.metrics import MASE_LOSSES
+from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates
+
+
+class TimeSeriesForecastingTrainEvaluator(TrainEvaluator):
+    """
+    This class is similar to the TrainEvaluator, except that it is tailored to the specific
+    requirements of time series forecasting tasks.
+
+    Attributes:
+        backend (Backend):
+            An object to interface with the disk storage. In particular, allows to
+            access the train and test datasets
+        queue (Queue):
+            Each worker available will instantiate an evaluator, and after completion,
+            it will return the evaluation result via a multiprocessing queue
+        metric (autoPyTorchMetric):
+            A scorer object that is able to evaluate how good a pipeline was fit. It
+            is a wrapper on top of the actual score method (a wrapper on top of
+            scikit-learn accuracy, for example) that formats the predictions accordingly.
+        budget: (float):
+            The amount of epochs/time a configuration is allowed to run.
+        budget_type (str):
+            The budget type, which can be epochs or time
+        pipeline_config (Optional[Dict[str, Any]]):
+            Defines the content of the pipeline being evaluated. For example, it
+            contains pipeline specific settings like logging name, or whether or not
+            to use tensorboard.
+        configuration (Union[int, str, Configuration]):
+            Determines the pipeline to be constructed. A dummy estimator is created for
+            integer configurations, a traditional machine learning pipeline is created
+            for string based configuration, and NAS is performed when a configuration
+            object is passed.
+        seed (int):
+            An integer that allows for reproducibility of results
+        output_y_hat_optimization (bool):
+            Whether this worker should output the target predictions, so that they are
+            stored on disk. Fundamentally, the resampling strategy might shuffle the
+            Y_train targets, so we store the split in order to re-use them for ensemble
+            selection.
+        num_run (Optional[int]):
+            An identifier of the current configuration being fit. This number is unique per
+            configuration.
+ include (Optional[Dict[str, Any]]): + An optional dictionary to include components of the pipeline steps. + exclude (Optional[Dict[str, Any]]): + An optional dictionary to exclude components of the pipeline steps. + disable_file_output (Optional[List[Union[str, DisableFileOutputParameters]]]): + Used as a list to pass more fine-grained + information on what to save. Must be a member of `DisableFileOutputParameters`. + Allowed elements in the list are: + + + `y_optimization`: + do not save the predictions for the optimization set, + which would later on be used to build an ensemble. Note that SMAC + optimizes a metric evaluated on the optimization set. + + `pipeline`: + do not save any individual pipeline files + + `pipelines`: + In case of cross validation, disables saving the joint model of the + pipelines fit on each fold. + + `y_test`: + do not save the predictions for the test set. + + `all`: + do not save any of the above. + For more information check `autoPyTorch.evaluation.utils.DisableFileOutputParameters`. + init_params (Optional[Dict[str, Any]]): + Optional argument that is passed to each pipeline step. It is the equivalent of + kwargs for the pipeline steps. + logger_port (Optional[int]): + Logging is performed using a socket-server scheme to be robust against many + parallel entities that want to write to the same file. This integer states the + socket port for the communication channel. If None is provided, a traditional + logger is used. + all_supported_metrics (bool): + Whether all supported metric should be calculated for every configuration. + search_space_updates (Optional[HyperparameterSearchSpaceUpdates]): + An object used to fine tune the hyperparameter search space of the pipeline + max_budget (float): + maximal budget value available for the optimizer. This is applied to compute the size of the proxy + validation sets + min_num_test_instances (Optional[int]): + minimal number of instances to be validated. 
We do so to ensure that there are enough instances in + the validation set + + """ + def __init__(self, backend: Backend, queue: Queue, + metric: autoPyTorchMetric, + budget: float, + budget_type: str = None, + pipeline_config: Optional[Dict[str, Any]] = None, + configuration: Optional[Configuration] = None, + seed: int = 1, + output_y_hat_optimization: bool = True, + num_run: Optional[int] = None, + include: Optional[Dict[str, Any]] = None, + exclude: Optional[Dict[str, Any]] = None, + disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, + init_params: Optional[Dict[str, Any]] = None, + logger_port: Optional[int] = None, + keep_models: Optional[bool] = None, + all_supported_metrics: bool = True, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, + max_budget: float = 1.0, + min_num_test_instances: Optional[int] = None) -> None: + super(TimeSeriesForecastingTrainEvaluator, self).__init__( + backend=backend, + queue=queue, + configuration=configuration, + metric=metric, + seed=seed, + output_y_hat_optimization=output_y_hat_optimization, + num_run=num_run, + include=include, + exclude=exclude, + disable_file_output=disable_file_output, + init_params=init_params, + budget=budget, + budget_type=budget_type, + logger_port=logger_port, + keep_models=keep_models, + all_supported_metrics=all_supported_metrics, + pipeline_config=pipeline_config, + search_space_updates=search_space_updates + ) + self.datamanager = backend.load_datamanager() + self.n_prediction_steps = self.datamanager.n_prediction_steps + self.num_sequences = self.datamanager.num_sequences + self.num_targets = self.datamanager.num_targets + self.seq_length_min = np.min(self.num_sequences) + seasonality = SEASONALITY_MAP.get(self.datamanager.freq, 1) + if isinstance(seasonality, list): + seasonality = min(seasonality) # Use to calculate MASE + self.seasonality = int(seasonality) # type: ignore[call-overload] + + self.max_budget = max_budget + self.min_num_test_instances = min_num_test_instances + self.eval_test_tensors = True + + def fit_predict_and_loss(self) -> None: + """Fit, predict and compute the loss for cross-validation and + holdout""" + assert self.splits is not None, "Can't fit pipeline in {} is datamanager.splits is None" \ + .format(self.__class__.__name__) + additional_run_info: Optional[Dict] = None + + if self.num_folds == 1: + split_id = 0 + self.logger.info("Starting fit {}".format(split_id)) + + pipeline = self._get_pipeline() + + train_split, test_split = self.splits[split_id] + + self.Y_optimization = self.datamanager.get_test_target(test_split) + + # self.Y_actual_train = self.y_train[train_split] + y_train_pred, y_opt_pred, y_valid_pred, y_test_pred = self._fit_and_predict(pipeline, split_id, + train_indices=train_split, + test_indices=test_split, + add_pipeline_to_self=True) + + mase_coefficient_val = self.generate_mase_coefficient_for_validation(test_split) # type: ignore[arg-type] + + forecasting_kwargs = {'sp': self.seasonality, + 'n_prediction_steps': self.n_prediction_steps, + } + forecasting_kwargs_val = copy.copy(forecasting_kwargs) + forecasting_kwargs_val['mase_coefficient'] = mase_coefficient_val + if self.y_test is not None: + mase_coefficient_test = self.generate_mase_coefficient_for_test_set() + forecasting_kwargs['mase_coefficient'] = mase_coefficient_test + + train_loss = None + + loss = self._loss(self.Y_optimization, y_opt_pred, **forecasting_kwargs_val) # type: ignore[arg-type] + + additional_run_info = pipeline.get_additional_run_info() 
+                pipeline, 'get_additional_run_info') else {}
+
+            status = StatusType.SUCCESS
+            # self.Y_optimization and y_opt_pred need to be applied to construct ensembles. We simply scale them here
+            self.Y_optimization *= mase_coefficient_val
+
+            self.finish_up(
+                loss=loss,
+                train_loss=train_loss,  # type: ignore[arg-type]
+                opt_pred=y_opt_pred * mase_coefficient_val,
+                valid_pred=y_valid_pred,
+                test_pred=y_test_pred,
+                additional_run_info=additional_run_info,
+                file_output=True,
+                status=status,
+                **forecasting_kwargs
+            )
+
+        else:
+            Y_optimization_pred: List[Optional[np.ndarray]] = [None] * self.num_folds
+            Y_valid_pred: List[Optional[np.ndarray]] = [None] * self.num_folds
+            Y_test_pred: List[Optional[np.ndarray]] = [None] * self.num_folds
+            train_splits: List[Optional[Union[np.ndarray, List]]] = [None] * self.num_folds
+
+            self.pipelines = [self._get_pipeline() for _ in range(self.num_folds)]
+
+            # Train losses are not computed here as that might become too expensive
+
+            # used as weights when averaging train losses.
+            train_fold_weights = [np.NaN] * self.num_folds
+            # stores opt (validation) loss of each fold.
+            opt_losses = [np.NaN] * self.num_folds
+            # weights for opt_losses.
+            opt_fold_weights = [np.NaN] * self.num_folds
+
+            mase_coefficient_val_all = []
+            for train_split, test_split in self.splits:
+                mase_coefficient = self.generate_mase_coefficient_for_validation(test_split)  # type: ignore[arg-type]
+                mase_coefficient_val_all.append(mase_coefficient)
+
+            forecasting_kwargs = {'sp': self.seasonality,
+                                  'n_prediction_steps': self.n_prediction_steps}
+
+            if self.y_test is not None:
+                mase_coefficient_test = self.generate_mase_coefficient_for_test_set()
+                forecasting_kwargs['mase_coefficient'] = mase_coefficient_test
+
+            for i, (train_split, test_split) in enumerate(self.splits):
+                if i > 0:
+                    self.eval_test_tensors = False
+                pipeline = self.pipelines[i]
+
+                train_pred, opt_pred, valid_pred, test_pred = self._fit_and_predict(pipeline, i,
+                                                                                    train_indices=train_split,
+                                                                                    test_indices=test_split,
+                                                                                    add_pipeline_to_self=False)
+                # Y_train_pred[i] = train_pred
+                Y_optimization_pred[i] = opt_pred
+                Y_valid_pred[i] = valid_pred
+                Y_test_pred[i] = test_pred
+                train_splits[i] = train_split
+
+                self.Y_targets[i] = self.datamanager.get_test_target(test_split)
+                # Compute train loss of this fold and store it. train_loss could
+                # either be a scalar or a dict of scalars with metrics as keys.
+
+                # number of training data points for this fold. Used for weighting
+                # the average.
+                train_fold_weights[i] = len(train_split)
+
+                forecasting_kwargs_val = copy.copy(forecasting_kwargs)
+                forecasting_kwargs_val['mase_coefficient'] = mase_coefficient_val_all[i]
+
+                # Compute validation loss of this fold and store it.
+                optimization_loss = self._loss(
+                    self.Y_targets[i],  # type: ignore[arg-type]
+                    opt_pred,
+                    **forecasting_kwargs_val
+                )
+                opt_losses[i] = optimization_loss
+                # number of optimization data points for this fold.
+                # Used for weighting the average.
+                opt_fold_weights[i] = len(train_split)
+
+            # Compute weights of each fold based on the number of samples in each
+            # fold.
+
+            opt_fold_weights = [w / sum(opt_fold_weights)
+                                for w in opt_fold_weights]
+
+            train_loss = None
+
+            opt_loss = {}
+            # self.logger.debug("OPT LOSSES: {}".format(opt_losses if opt_losses is not None else None))
+            for metric in opt_losses[0].keys():
+                opt_loss[metric] = np.average(
+                    [
+                        opt_losses[i][metric]
+                        for i in range(self.num_folds)
+                    ],
+                    weights=opt_fold_weights,
+                )
+            Y_targets = self.Y_targets
+            Y_train_targets = self.Y_train_targets
+
+            Y_optimization_preds = np.concatenate(
+                [Y_optimization_pred[i] * mase_coefficient_val_all[i] for i in range(self.num_folds)
+                 if Y_optimization_pred[i] is not None])
+            Y_targets = np.concatenate([
+                Y_targets[i] * mase_coefficient_val_all[i] for i in range(self.num_folds)
+                if Y_targets[i] is not None
+            ])
+
+            if self.y_valid is not None:
+                warnings.warn('valid_pred is currently unsupported for forecasting tasks!')
+            Y_valid_preds = None
+
+            if self.y_test is not None:
+                Y_test_preds = np.array([Y_test_pred[i] * mase_coefficient_val_all[0]
+                                         for i in range(self.num_folds)
+                                         if Y_test_pred[i] is not None])
+                # Average the predictions of several pipelines
+                if len(Y_test_preds.shape) == 3:
+                    Y_test_preds = np.nanmean(Y_test_preds, axis=0)
+            else:
+                Y_test_preds = None
+
+            self.Y_optimization = Y_targets
+            self.Y_actual_train = Y_train_targets
+
+            self.pipeline = self._get_pipeline()
+
+            status = StatusType.SUCCESS
+            self.logger.debug("In train evaluator fit_predict_and_loss, loss:{}".format(opt_loss))
+            self.finish_up(
+                loss=opt_loss,
+                train_loss=train_loss,  # type: ignore[arg-type]
+                opt_pred=Y_optimization_preds.flatten(),
+                valid_pred=Y_valid_preds,
+                test_pred=Y_test_preds,
+                additional_run_info=additional_run_info,
+                file_output=True,
+                status=status,
+                **forecasting_kwargs,
+            )
+
+    def generate_mase_coefficient_for_validation(self, test_split: Sequence[int]) -> np.ndarray:
+        """
+        Compute the denominator for Mean Absolute Scaled Losses.
+        For details, please check sktime.performance_metrics.forecasting._functions.mean_absolute_scaled_error
+
+        Parameters:
+        ----------
+        test_split (Sequence):
+            test splits, consisting of ints
+        Return:
+        ----------
+        mase_coefficient (np.ndarray(self.num_sequence * self.n_prediction_steps)):
+            inverse of the mase_denominator
+        """
+        mase_coefficient = np.ones([len(test_split), self.num_targets])
+        if self.additional_metrics is not None:
+            if any(mase_loss in self.additional_metrics for mase_loss in MASE_LOSSES) or self.metric in MASE_LOSSES:
+                for seq_idx, test_idx in enumerate(test_split):
+                    mase_coefficient[seq_idx] = self.datamanager.get_time_series_seq(test_idx).mase_coefficient
+
+        mase_coefficient = np.repeat(mase_coefficient, self.n_prediction_steps, axis=0)
+        return mase_coefficient
+
+    def generate_mase_coefficient_for_test_set(self) -> np.ndarray:
+        """
+        Compute the denominator for Mean Absolute Scaled Losses.
+        For details, please check sktime.performance_metrics.forecasting._functions.mean_absolute_scaled_error
+
+        Return:
+        ----------
+        mase_coefficient: np.ndarray(self.num_sequence * self.n_prediction_steps)
+            inverse of the mase_denominator
+        """
+        mase_coefficient = np.ones([len(self.datamanager.datasets), self.num_targets])
+        if self.additional_metrics is not None:
+            if any(mase_loss in self.additional_metrics for mase_loss in MASE_LOSSES) or self.metric in MASE_LOSSES:
+                for seq_idx, test_idx in enumerate(self.datamanager.datasets):
+                    mase_coefficient[seq_idx] = self.datamanager.datasets[seq_idx].mase_coefficient
+        mase_coefficient = np.repeat(mase_coefficient, self.n_prediction_steps, axis=0)
+        return mase_coefficient
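A shape-level sketch of what these MASE helpers return (illustrative values only):

# With 3 sequences, 2 targets and n_prediction_steps = 4:
mase_coefficient = np.ones([3, 2])                         # one row per sequence
mase_coefficient = np.repeat(mase_coefficient, 4, axis=0)  # shape (12, 2)
# this aligns with predictions flattened to (num_sequences * n_prediction_steps, num_targets)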
+        mase_coefficient = np.repeat(mase_coefficient, self.n_prediction_steps, axis=0)
+        return mase_coefficient
+
+    def create_validation_sub_set(self, test_indices: np.ndarray) -> Tuple[np.ndarray, Optional[np.ndarray]]:
+        if self.min_num_test_instances is not None:
+            num_test_instances = len(test_indices)
+
+            if num_test_instances < self.min_num_test_instances or self.budget >= self.max_budget:
+                # if the number of test instances is already smaller than the minimal number of
+                # test instances, or we run on the full budget, evaluate on the full set
+                return test_indices, None
+            num_val_instance = min(num_test_instances,
+                                   max(self.min_num_test_instances,
+                                       int(num_test_instances * self.budget / self.max_budget)
+                                       ))
+            test_subset_indices = np.linspace(0, num_test_instances, num_val_instance, endpoint=False, dtype=int)
+            return test_indices[test_subset_indices], test_subset_indices
+        else:
+            return test_indices, None
+
+    def _predict(self, pipeline: BaseEstimator,
+                 test_indices: Union[np.ndarray, List],
+                 train_indices: Union[np.ndarray, List],
+                 ) -> Tuple[np.ndarray, np.ndarray, Optional[np.ndarray], Optional[np.ndarray]]:
+        test_indices_subset, test_split_subset_idx = self.create_validation_sub_set(test_indices)
+
+        val_sets = []
+
+        for test_idx in test_indices_subset:
+            val_sets.append(self.datamanager.get_validation_set(test_idx))
+        opt_pred = self.predict_function(val_sets, pipeline)
+        opt_pred = opt_pred.reshape(-1, self.num_targets)
+
+        if test_split_subset_idx is not None:
+            dummy_pipeline = DummyTimeSeriesForecastingPipeline(0, n_prediction_steps=self.n_prediction_steps)
+            remaining_indices = np.setdiff1d(np.arange(len(test_indices)), test_indices_subset)
+            val_set_remain = []
+            for remaining_idx in remaining_indices:
+                val_set_remain.append(self.datamanager.get_validation_set(test_indices[remaining_idx]))
+            y_opt_full = np.zeros([len(test_indices), self.n_prediction_steps, self.num_targets])
+            y_opt_full[remaining_indices] = dummy_pipeline.predict(val_set_remain).reshape([-1,
+                                                                                            self.n_prediction_steps,
+                                                                                            self.num_targets])
+            y_opt_full[test_split_subset_idx] = opt_pred.reshape([-1, self.n_prediction_steps, self.num_targets])
+
+            opt_pred = y_opt_full
+
+        opt_pred = opt_pred.reshape(-1, self.num_targets)
+
+        if self.y_valid is not None:
+            warnings.warn('valid_pred is currently unsupported for forecasting tasks!')
+        valid_pred = None
+
+        if self.y_test is not None and self.eval_test_tensors:
+            test_seq = self.datamanager.generate_test_seqs()
+            test_pred = self.predict_function(test_seq, pipeline).reshape(-1, self.num_targets)
+        else:
+            test_pred = None
+
+        return np.empty(1), opt_pred, valid_pred, test_pred
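# A minimal sketch (not from the patch) of the proxy-validation subset
# selection in create_validation_sub_set above, assuming 10 test instances,
# budget=5, max_budget=50 and min_num_test_instances=3:
#
#     import numpy as np
#     num_test, budget, max_budget, min_num = 10, 5.0, 50.0, 3
#     num_val = min(num_test, max(min_num, int(num_test * budget / max_budget)))
#     subset = np.linspace(0, num_test, num_val, endpoint=False, dtype=int)
#     # num_val == 3 -> subset == [0, 3, 6]: evenly spread across the set;
#     # the remaining instances are filled in with a cheap dummy forecast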
+
+
+# create closure for evaluating an algorithm
+def forecasting_eval_train_function(
+    backend: Backend,
+    queue: Queue,
+    metric: autoPyTorchMetric,
+    budget: float,
+    config: Optional[Configuration],
+    seed: int,
+    output_y_hat_optimization: bool,
+    num_run: int,
+    include: Optional[Dict[str, Any]],
+    exclude: Optional[Dict[str, Any]],
+    disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None,
+    pipeline_config: Optional[Dict[str, Any]] = None,
+    budget_type: Optional[str] = None,
+    init_params: Optional[Dict[str, Any]] = None,
+    logger_port: Optional[int] = None,
+    all_supported_metrics: bool = True,
+    search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None,
+    instance: Optional[str] = None,
+    max_budget: float = 1.0,
+    min_num_test_instances: Optional[int] = None
+) -> None:
+    """
+    This closure allows the communication between the ExecuteTaFuncWithQueue and the
+    pipeline trainer (TrainEvaluator).
+
+    Fundamentally, smac calls the ExecuteTaFuncWithQueue.run() method, which internally
+    builds a TrainEvaluator. The TrainEvaluator builds a pipeline, stores the output files
+    to disc via the backend, and puts the performance result of the run in the queue.
+
+    Attributes:
+        backend (Backend):
+            An object to interface with the disk storage. In particular, allows to
+            access the train and test datasets
+        queue (Queue):
+            Each worker available will instantiate an evaluator, and after completion,
+            it will return the evaluation result via a multiprocessing queue
+        metric (autoPyTorchMetric):
+            A scorer object that is able to evaluate how good a pipeline was fit. It
+            is a wrapper on top of the actual score method (a wrapper on top of
+            scikit-learn accuracy, for example) that formats the predictions accordingly.
+        budget (float):
+            The amount of epochs/time a configuration is allowed to run.
+        budget_type (str):
+            The budget type, which can be epochs or time
+        pipeline_config (Optional[Dict[str, Any]]):
+            Defines the content of the pipeline being evaluated. For example, it
+            contains pipeline specific settings like logging name, or whether or not
+            to use tensorboard.
+        config (Union[int, str, Configuration]):
+            Determines the pipeline to be constructed.
+        seed (int):
+            An integer that allows for reproducibility of results
+        output_y_hat_optimization (bool):
+            Whether this worker should output the target predictions, so that they are
+            stored on disk. Fundamentally, the resampling strategy might shuffle the
+            Y_train targets, so we store the split in order to re-use them for ensemble
+            selection.
+        num_run (Optional[int]):
+            An identifier of the current configuration being fit. This number is unique per
+            configuration.
+        include (Optional[Dict[str, Any]]):
+            An optional dictionary to include components of the pipeline steps.
+        exclude (Optional[Dict[str, Any]]):
+            An optional dictionary to exclude components of the pipeline steps.
+        disable_file_output (Optional[List[Union[str, DisableFileOutputParameters]]]):
+            By default, the model, its predictions, and other metadata are stored on disk
+            for each finished configuration. This argument allows the user to skip
+            saving certain file types, for example the model, from being written to disk.
+        init_params (Optional[Dict[str, Any]]):
+            Optional argument that is passed to each pipeline step. It is the equivalent of
+            kwargs for the pipeline steps.
+        logger_port (Optional[int]):
+            Logging is performed using a socket-server scheme to be robust against many
+            parallel entities that want to write to the same file. This integer states the
+            socket port for the communication channel. If None is provided, a traditional
+            logger is used.
+        instance (str):
+            An instance on which to evaluate the current pipeline. By default we work
+            with a single instance, being the provided X_train, y_train of a single dataset.
+            This instance is a compatibility argument for SMAC, which is capable of working
+            with multiple datasets at the same time.
+        max_budget (float):
+            maximum budget available to the optimizer. This is used to compute the size
+            of the proxy validation sets
+        min_num_test_instances (Optional[int]):
+            minimal number of instances to be validated.
We do so to ensure that there are enough instances in + the validation set + """ + evaluator = TimeSeriesForecastingTrainEvaluator( + backend=backend, + queue=queue, + metric=metric, + configuration=config, + seed=seed, + num_run=num_run, + output_y_hat_optimization=output_y_hat_optimization, + include=include, + exclude=exclude, + disable_file_output=disable_file_output, + init_params=init_params, + budget=budget, + budget_type=budget_type, + logger_port=logger_port, + all_supported_metrics=all_supported_metrics, + pipeline_config=pipeline_config, + search_space_updates=search_space_updates, + max_budget=max_budget, + min_num_test_instances=min_num_test_instances, + ) + evaluator.fit_predict_and_loss() diff --git a/autoPyTorch/evaluation/train_evaluator.py b/autoPyTorch/evaluation/train_evaluator.py index 010948b55..142af6bcc 100644 --- a/autoPyTorch/evaluation/train_evaluator.py +++ b/autoPyTorch/evaluation/train_evaluator.py @@ -12,17 +12,19 @@ from autoPyTorch.automl_common.common.utils.backend import Backend from autoPyTorch.constants import ( CLASSIFICATION_TASKS, - MULTICLASSMULTIOUTPUT, + MULTICLASSMULTIOUTPUT ) +from autoPyTorch.datasets.resampling_strategy import CrossValTypes, HoldoutValTypes from autoPyTorch.evaluation.abstract_evaluator import ( AbstractEvaluator, fit_and_suppress_warnings ) +from autoPyTorch.evaluation.utils import DisableFileOutputParameters from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric from autoPyTorch.utils.common import dict_repr, subsampler from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates -__all__ = ['TrainEvaluator', 'eval_function'] +__all__ = ['TrainEvaluator', 'eval_train_function'] def _get_y_array(y: np.ndarray, task_type: int) -> np.ndarray: @@ -39,7 +41,9 @@ class TrainEvaluator(AbstractEvaluator): A pipeline implementing the provided configuration is fitted using the datamanager object retrieved from disc, via the backend. After the pipeline is fitted, it is save to disc and the performance estimate - is communicated to the main process via a Queue. + is communicated to the main process via a Queue. It is only compatible + with `CrossValTypes`, `HoldoutValTypes`, i.e, when the training data + is split and the validation set is used for SMBO optimisation. Attributes: backend (Backend): @@ -79,10 +83,25 @@ class TrainEvaluator(AbstractEvaluator): An optional dictionary to include components of the pipeline steps. exclude (Optional[Dict[str, Any]]): An optional dictionary to exclude components of the pipeline steps. - disable_file_output (Union[bool, List[str]]): - By default, the model, it's predictions and other metadata is stored on disk - for each finished configuration. This argument allows the user to skip - saving certain file type, for example the model, from being written to disk. + disable_file_output (Optional[List[Union[str, DisableFileOutputParameters]]]): + Used as a list to pass more fine-grained + information on what to save. Must be a member of `DisableFileOutputParameters`. + Allowed elements in the list are: + + + `y_optimization`: + do not save the predictions for the optimization set, + which would later on be used to build an ensemble. Note that SMAC + optimizes a metric evaluated on the optimization set. + + `pipeline`: + do not save any individual pipeline files + + `pipelines`: + In case of cross validation, disables saving the joint model of the + pipelines fit on each fold. + + `y_test`: + do not save the predictions for the test set. 
+ + `all`: + do not save any of the above. + For more information check `autoPyTorch.evaluation.utils.DisableFileOutputParameters`. init_params (Optional[Dict[str, Any]]): Optional argument that is passed to each pipeline step. It is the equivalent of kwargs for the pipeline steps. @@ -96,6 +115,7 @@ class TrainEvaluator(AbstractEvaluator): search_space_updates (Optional[HyperparameterSearchSpaceUpdates]): An object used to fine tune the hyperparameter search space of the pipeline """ + def __init__(self, backend: Backend, queue: Queue, metric: autoPyTorchMetric, budget: float, @@ -107,7 +127,7 @@ def __init__(self, backend: Backend, queue: Queue, num_run: Optional[int] = None, include: Optional[Dict[str, Any]] = None, exclude: Optional[Dict[str, Any]] = None, - disable_file_output: Union[bool, List] = False, + disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, init_params: Optional[Dict[str, Any]] = None, logger_port: Optional[int] = None, keep_models: Optional[bool] = None, @@ -133,9 +153,12 @@ def __init__(self, backend: Backend, queue: Queue, search_space_updates=search_space_updates ) - self.splits = self.datamanager.splits - if self.splits is None: - raise AttributeError("Must have called create_splits on {}".format(self.datamanager.__class__.__name__)) + if not isinstance(self.resampling_strategy, (CrossValTypes, HoldoutValTypes)): + raise ValueError( + f'resampling_strategy for TrainEvaluator must be in ' + f'(CrossValTypes, HoldoutValTypes), but got {self.resampling_strategy}' + ) + self.num_folds: int = len(self.splits) self.Y_targets: List[Optional[np.ndarray]] = [None] * self.num_folds self.Y_train_targets: np.ndarray = np.ones(self.y_train.shape) * np.NaN @@ -209,7 +232,6 @@ def fit_predict_and_loss(self) -> None: additional_run_info = {} for i, (train_split, test_split) in enumerate(self.splits): - pipeline = self.pipelines[i] train_pred, opt_pred, valid_pred, test_pred = self._fit_and_predict(pipeline, i, train_indices=train_split, @@ -254,10 +276,15 @@ def fit_predict_and_loss(self) -> None: # train_losses is a list of dicts. It is # computed using the target metric (self.metric). 
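# A small sketch (not from the patch) of the change below: `train_loss`
# becomes a dict keyed by metric name instead of a scalar for self.metric.
# For example, with two folds of equal weight:
#
#     train_losses = [{'accuracy': 0.1, 'balanced_accuracy': 0.2},
#                     {'accuracy': 0.3, 'balanced_accuracy': 0.4}]
#     train_fold_weights = [0.5, 0.5]
#     # -> train_loss == {'accuracy': 0.2, 'balanced_accuracy': 0.3}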
- train_loss = np.average([train_losses[i][str(self.metric)] - for i in range(self.num_folds)], - weights=train_fold_weights, - ) + train_loss = {} + for metric in train_losses[0].keys(): + train_loss[metric] = np.average( + [ + train_losses[i][metric] + for i in range(self.num_folds) + ], + weights=train_fold_weights + ) opt_loss = {} # self.logger.debug("OPT LOSSES: {}".format(opt_losses if opt_losses is not None else None)) @@ -381,25 +408,25 @@ def _predict(self, pipeline: BaseEstimator, # create closure for evaluating an algorithm -def eval_function( - backend: Backend, - queue: Queue, - metric: autoPyTorchMetric, - budget: float, - config: Optional[Configuration], - seed: int, - output_y_hat_optimization: bool, - num_run: int, - include: Optional[Dict[str, Any]], - exclude: Optional[Dict[str, Any]], - disable_file_output: Union[bool, List], - pipeline_config: Optional[Dict[str, Any]] = None, - budget_type: str = None, - init_params: Optional[Dict[str, Any]] = None, - logger_port: Optional[int] = None, - all_supported_metrics: bool = True, - search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, - instance: str = None, +def eval_train_function( + backend: Backend, + queue: Queue, + metric: autoPyTorchMetric, + budget: float, + config: Optional[Configuration], + seed: int, + output_y_hat_optimization: bool, + num_run: int, + include: Optional[Dict[str, Any]], + exclude: Optional[Dict[str, Any]], + disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, + pipeline_config: Optional[Dict[str, Any]] = None, + budget_type: str = None, + init_params: Optional[Dict[str, Any]] = None, + logger_port: Optional[int] = None, + all_supported_metrics: bool = True, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, + instance: str = None, ) -> None: """ This closure allows the communication between the ExecuteTaFuncWithQueue and the @@ -480,6 +507,6 @@ def eval_function( logger_port=logger_port, all_supported_metrics=all_supported_metrics, pipeline_config=pipeline_config, - search_space_updates=search_space_updates + search_space_updates=search_space_updates, ) evaluator.fit_predict_and_loss() diff --git a/autoPyTorch/evaluation/utils.py b/autoPyTorch/evaluation/utils.py index 1bf93fa84..37e5fa36d 100644 --- a/autoPyTorch/evaluation/utils.py +++ b/autoPyTorch/evaluation/utils.py @@ -8,6 +8,9 @@ from smac.runhistory.runhistory import RunValue +from autoPyTorch.utils.common import autoPyTorchEnum + + __all__ = [ 'read_queue', 'convert_multioutput_multiclass_to_multilabel', @@ -102,3 +105,40 @@ def _predict(self, X: np.ndarray) -> np.ndarray: predictions.append(pred.ravel()) return np.asarray(predictions).T + + +class DisableFileOutputParameters(autoPyTorchEnum): + """ + Contains literals that can be passed in to `disable_file_output` list. + These include: + + + `y_optimization`: + do not save the predictions for the optimization set, + which would later on be used to build an ensemble. Note that SMAC + optimizes a metric evaluated on the optimization set. + + `pipeline`: + do not save any individual pipeline files + + `pipelines`: + In case of cross validation, disables saving the joint model of the + pipelines fit on each fold. + + `y_test`: + do not save the predictions for the test set. + + `all`: + do not save any of the above. 
+ """ + pipeline = 'pipeline' + pipelines = 'pipelines' + y_optimization = 'y_optimization' + y_test = 'y_test' + all = 'all' + + @classmethod + def check_compatibility( + cls, + disable_file_output: List[Union[str, 'DisableFileOutputParameters']] + ) -> None: + for item in disable_file_output: + if item not in cls.__members__ and not isinstance(item, cls): + raise ValueError(f"Expected {item} to be in the members (" + f"{list(cls.__members__.keys())}) of {cls.__name__}" + f" or as string value of a member.") diff --git a/autoPyTorch/evaluation/utils_extra.py b/autoPyTorch/evaluation/utils_extra.py new file mode 100644 index 000000000..0201bacee --- /dev/null +++ b/autoPyTorch/evaluation/utils_extra.py @@ -0,0 +1,72 @@ +# The functions and classes implemented in this module all require extra requirements. +# We put them here to make it easier to be wrapped by try-except process +from typing import Any, Dict, List, Optional, Union + +from ConfigSpace import Configuration + +import numpy as np + +import pandas as pd + +from sklearn.dummy import DummyClassifier + +from autoPyTorch.datasets.time_series_dataset import TimeSeriesSequence +from autoPyTorch.utils.common import subsampler + + +class DummyTimeSeriesForecastingPipeline(DummyClassifier): + """ + A wrapper class that holds a pipeline for dummy forecasting. For each series, it simply repeats the last element + in the training series + + + Attributes: + random_state (Optional[Union[int, np.random.RandomState]]): + Object that contains a seed and allows for reproducible results + init_params (Optional[Dict]): + An optional dictionary that is passed to the pipeline's steps. It complies + a similar function as the kwargs + n_prediction_steps (int): + forecasting horizon + """ + def __init__(self, config: Configuration, + random_state: Optional[Union[int, np.random.RandomState]] = None, + init_params: Optional[Dict] = None, + n_prediction_steps: int = 1, + ) -> None: + self.config = config + self.init_params = init_params + self.random_state = random_state + super(DummyTimeSeriesForecastingPipeline, self).__init__(strategy="uniform") + self.n_prediction_steps = n_prediction_steps + + def fit(self, X: Dict[str, Any], y: Any, + sample_weight: Optional[np.ndarray] = None) -> object: + self.n_prediction_steps = X['dataset_properties']['n_prediction_steps'] + y_train = subsampler(X['y_train'], X['train_indices']) + return DummyClassifier.fit(self, np.ones((y_train.shape[0], 1)), y_train, sample_weight) + + def _generate_dummy_forecasting(self, X: List[Union[TimeSeriesSequence, np.ndarray]]) -> List: + if isinstance(X[0], TimeSeriesSequence): + X_tail = [x.get_target_values(-1) for x in X] + else: + X_tail = [x[-1] for x in X] + return X_tail + + def predict_proba(self, X: Union[np.ndarray, pd.DataFrame], + batch_size: int = 1000) -> np.ndarray: + X_tail = self._generate_dummy_forecasting(X) + return np.tile(X_tail, (1, self.n_prediction_steps)).astype(np.float32).flatten() + + def predict(self, X: Union[np.ndarray, pd.DataFrame], + batch_size: int = 1000) -> np.ndarray: + X_tail = np.asarray(self._generate_dummy_forecasting(X)) + if X_tail.ndim == 1: + X_tail = np.expand_dims(X_tail, -1) + return np.tile(X_tail, (1, self.n_prediction_steps)).astype(np.float32).flatten() + + @staticmethod + def get_default_pipeline_options() -> Dict[str, Any]: + return {'budget_type': 'epochs', + 'epochs': 1, + 'runtime': 1} diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py index aa444c782..53eae4696 100644 --- 
diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py
index aa444c782..53eae4696 100644
--- a/autoPyTorch/optimizer/smbo.py
+++ b/autoPyTorch/optimizer/smbo.py
@@ -10,6 +10,7 @@
 from smac.facade.smac_ac_facade import SMAC4AC
 from smac.intensification.hyperband import Hyperband
+from smac.intensification.intensification import Intensifier
 from smac.runhistory.runhistory import RunHistory
 from smac.runhistory.runhistory2epm import RunHistory2EPM4LogCost
 from smac.scenario.scenario import Scenario
@@ -18,15 +19,21 @@
 from smac.utils.io.traj_logging import TrajEntry
 
 from autoPyTorch.automl_common.common.utils.backend import Backend
+from autoPyTorch.constants import (
+    FORECASTING_BUDGET_TYPE,
+    STRING_TO_TASK_TYPES,
+    TIMESERIES_FORECASTING
+)
 from autoPyTorch.datasets.base_dataset import BaseDataset
 from autoPyTorch.datasets.resampling_strategy import (
     CrossValTypes,
     DEFAULT_RESAMPLING_PARAMETERS,
     HoldoutValTypes,
+    NoResamplingStrategyTypes
 )
 from autoPyTorch.ensemble.ensemble_builder import EnsembleBuilderManager
 from autoPyTorch.evaluation.tae import ExecuteTaFuncWithQueue, get_cost_of_crash
-from autoPyTorch.optimizer.utils import read_return_initial_configurations
+from autoPyTorch.optimizer.utils import read_forecasting_init_configurations, read_return_initial_configurations
 from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric
 from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates
 from autoPyTorch.utils.logging_ import get_named_client_logger
@@ -39,8 +46,8 @@ def get_smac_object(
     ta: Callable,
     ta_kwargs: Dict[str, Any],
     n_jobs: int,
-    initial_budget: int,
-    max_budget: int,
+    initial_budget: Union[int, float],
+    max_budget: Union[int, float],
     dask_client: Optional[dask.distributed.Client],
     initial_configurations: Optional[List[Configuration]] = None,
 ) -> SMAC4AC:
@@ -55,17 +62,27 @@ def get_smac_object(
         ta (Callable): the function to be intensifier by smac
         ta_kwargs (Dict[str, Any]): Arguments to the above ta
         n_jobs (int): Amount of cores to use for this task
+        initial_budget (int): the minimal budget to be allocated to the target algorithm
+        max_budget (int): the maximal budget to be allocated to the target algorithm
         dask_client (dask.distributed.Client): User provided scheduler
         initial_configurations (List[Configuration]): List of initial
             configurations which smac will run before starting the search process
     Returns:
         (SMAC4AC): sequential model algorithm configuration object
     """
-    intensifier = Hyperband
-
+    if initial_budget == max_budget:
+        # This allows vanilla BO optimization
+        intensifier = Intensifier
+        intensifier_kwargs: Dict[str, Any] = {'deterministic': True, }
+
+    else:
+        intensifier = Hyperband
+        intensifier_kwargs = {'initial_budget': initial_budget, 'max_budget': max_budget,
+                              'eta': 3, 'min_chall': 1, 'instance_order': 'shuffle_once'}
     rh2EPM = RunHistory2EPM4LogCost
+
     return SMAC4AC(
         scenario=Scenario(scenario_dict),
         rng=seed,
@@ -73,17 +90,16 @@ def get_smac_object(
         tae_runner=ta,
         tae_runner_kwargs=ta_kwargs,
         initial_configurations=initial_configurations,
+        initial_design=None,
         run_id=seed,
         intensifier=intensifier,
-        intensifier_kwargs={'initial_budget': initial_budget, 'max_budget': max_budget,
-                            'eta': 3, 'min_chall': 1, 'instance_order': 'shuffle_once'},
+        intensifier_kwargs=intensifier_kwargs,
         dask_client=dask_client,
         n_jobs=n_jobs,
     )
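# A short sketch (not from the patch) of the intensifier dispatch above:
# with equal budgets every configuration runs once at full budget (vanilla
# BO); otherwise Hyperband races configurations across budgets with eta=3.
#
#     from smac.intensification.hyperband import Hyperband
#     from smac.intensification.intensification import Intensifier
#
#     def pick_intensifier(initial_budget: float, max_budget: float) -> type:
#         # mirrors the branch in get_smac_object above
#         return Intensifier if initial_budget == max_budget else Hyperband
#
#     assert pick_intensifier(50, 50) is Intensifier  # vanilla BO at full budget
#     assert pick_intensifier(5, 50) is Hyperband     # multi-fidelity racing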
 
 
 class AutoMLSMBO(object):
-
     def __init__(self,
                  config_space: ConfigSpace.ConfigurationSpace,
                  dataset_name: str,
@@ -98,7 +114,9 @@ def __init__(self,
                  pipeline_config: Dict[str, Any],
                  start_num_run: int = 1,
                  seed: int = 1,
-                 resampling_strategy: Union[HoldoutValTypes, CrossValTypes] = HoldoutValTypes.holdout_validation,
+                 resampling_strategy: Union[HoldoutValTypes,
+                                            CrossValTypes,
+                                            NoResamplingStrategyTypes] = HoldoutValTypes.holdout_validation,
                  resampling_strategy_args: Optional[Dict[str, Any]] = None,
                  include: Optional[Dict[str, Any]] = None,
                  exclude: Optional[Dict[str, Any]] = None,
@@ -111,8 +129,10 @@ def __init__(self,
                  search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None,
                  portfolio_selection: Optional[str] = None,
                  pynisher_context: str = 'spawn',
-                 min_budget: int = 5,
-                 max_budget: int = 50,
+                 min_budget: Union[int, float] = 5,
+                 max_budget: Union[int, float] = 50,
+                 task_type: str = "",
+                 **kwargs: Dict[str, Any]
                  ):
         """
         Interface to SMAC. This method calls the SMAC optimize method, and allows
@@ -187,13 +207,23 @@ def __init__(self,
                 max_budget states the maximum resource allocation a pipeline is going to be ran.
                 For example, if the budget_type is epochs, and max_budget=50, then the pipeline
                 training will be terminated after 50 epochs.
+            task_type (str):
+                The task type. Forecasting tasks require special handling.
+            kwargs (Any):
+                additional arguments that are customized by specific tasks.
+                For instance, forecasting tasks require:
+                    min_num_test_instances (int): minimal number of instances used to initialize a
+                        proxy validation set
+                    suggested_init_models (List[str]): a set of initial models suggested by the users;
+                        their hyperparameters are determined by the default configurations
+                    custom_init_setting_path (str): the path to the initial hyperparameter
+                        configurations set by the users
+
         """
         super(AutoMLSMBO, self).__init__()
         # data related
         self.dataset_name = dataset_name
-        self.datamanager: Optional[BaseDataset] = None
         self.metric = metric
-        self.task: Optional[str] = None
+        self.backend = backend
 
         self.all_supported_metrics = all_supported_metrics
@@ -232,6 +262,8 @@ def __init__(self,
 
         self.search_space_updates = search_space_updates
 
+        self.task_type = task_type
+
         if logger_port is None:
             self.logger_port = logging.handlers.DEFAULT_TCP_LOGGING_PORT
         else:
@@ -241,26 +273,25 @@ def __init__(self,
                                                 port=self.logger_port)
         self.logger.info("initialised {}".format(self.__class__.__name__))
 
-        self.initial_configurations: Optional[List[Configuration]] = None
-        if portfolio_selection is not None:
-            self.initial_configurations = read_return_initial_configurations(config_space=config_space,
-                                                                             portfolio_selection=portfolio_selection)
+        initial_configurations = []
 
-    def reset_data_manager(self) -> None:
-        if self.datamanager is not None:
-            del self.datamanager
-        self.datamanager = self.backend.load_datamanager()
+        if STRING_TO_TASK_TYPES.get(self.task_type, -1) == TIMESERIES_FORECASTING:
+            initial_configurations = self.get_init_configs_for_forecasting(config_space, kwargs)
+            # proxy-validation sets
+            self.min_num_test_instances: Optional[int] = kwargs.get('min_num_test_instances',  # type:ignore[assignment]
+                                                                    None)
+        else:
+            if portfolio_selection is not None:
+                initial_configurations = read_return_initial_configurations(config_space=config_space,
+                                                                            portfolio_selection=portfolio_selection)
 
-        if self.datamanager is not None and self.datamanager.task_type is not None:
-            self.task = self.datamanager.task_type
+        self.initial_configurations = initial_configurations if len(initial_configurations) > 0 else None
 
     def run_smbo(self, func: Optional[Callable] = None
                  ) -> Tuple[RunHistory, List[TrajEntry], str]:
 
         self.watcher.start_task('SMBO')
         self.logger.info("Started
run of SMBO") - # == first things first: load the datamanager - self.reset_data_manager() # == Initialize non-SMBO stuff # first create a scenario @@ -345,6 +376,15 @@ def run_smbo(self, func: Optional[Callable] = None ) scenario_dict.update(self.smac_scenario_args) + budget_type = self.pipeline_config['budget_type'] + if budget_type in FORECASTING_BUDGET_TYPE: + if STRING_TO_TASK_TYPES.get(self.task_type, -1) != TIMESERIES_FORECASTING: + raise ValueError('Forecasting Budget type is only available for forecasting task!') + if self.min_budget > 1. or self.max_budget > 1.: + self.min_budget = float(self.min_budget) / float(self.max_budget) + self.max_budget = 1.0 + ta_kwargs['min_num_test_instances'] = self.min_num_test_instances + if self.get_smac_object_callback is not None: smac = self.get_smac_object_callback(scenario_dict=scenario_dict, seed=seed, @@ -385,3 +425,23 @@ def run_smbo(self, func: Optional[Callable] = None raise NotImplementedError(type(smac.solver.tae_runner)) return self.runhistory, self.trajectory, self._budget_type + + def get_init_configs_for_forecasting(self, config_space: ConfigSpace, kwargs: Dict) -> List[Configuration]: + """get initial configurations for forecasting tasks""" + suggested_init_models: Optional[List[str]] = kwargs.get('suggested_init_models', # type:ignore[assignment] + None) + custom_init_setting_path: Optional[str] = kwargs.get('custom_init_setting_path', # type:ignore[assignment] + None) + # if suggested_init_models is an empty list, and custom_init_setting_path is not provided, we + # do not provide any initial configurations + if suggested_init_models is None or suggested_init_models or custom_init_setting_path is not None: + datamanager: BaseDataset = self.backend.load_datamanager() + dataset_properties = datamanager.get_dataset_properties([]) + initial_configurations = read_forecasting_init_configurations( + config_space=config_space, + suggested_init_models=suggested_init_models, + custom_init_setting_path=custom_init_setting_path, + dataset_properties=dataset_properties + ) + return initial_configurations + return [] diff --git a/autoPyTorch/optimizer/utils.py b/autoPyTorch/optimizer/utils.py index 6fb9d5024..5f12b0dd1 100644 --- a/autoPyTorch/optimizer/utils.py +++ b/autoPyTorch/optimizer/utils.py @@ -1,7 +1,7 @@ import json import os import warnings -from typing import Any, Dict, List +from typing import Any, Dict, List, Optional, Union from ConfigSpace.configuration_space import Configuration, ConfigurationSpace @@ -10,7 +10,6 @@ def read_return_initial_configurations( config_space: ConfigurationSpace, portfolio_selection: str ) -> List[Configuration]: - # read and validate initial configurations portfolio_path = portfolio_selection if portfolio_selection != "greedy" else \ os.path.join(os.path.dirname(__file__), '../configs/greedy_portfolio.json') @@ -31,3 +30,72 @@ def read_return_initial_configurations( f"Therefore, it can't be used as an initial " f"configuration as it does not match the current config space. 
") return initial_configurations + + +def read_forecasting_init_configurations(config_space: ConfigurationSpace, + suggested_init_models: Optional[List[str]] = None, + custom_init_setting_path: Optional[str] = None, + dataset_properties: Dict = {} + ) -> List[Configuration]: + forecasting_init_path = os.path.join(os.path.dirname(__file__), '../configs/forecasting_init_cfgs.json') + initial_configurations_dict: List[Dict] = list() + initial_configurations = [] + uni_variant = dataset_properties.get('uni_variant', True) + targets_have_missing_values = dataset_properties.get('targets_have_missing_values', False) + features_have_missing_values = dataset_properties.get('features_have_missing_values', False) + + if suggested_init_models or suggested_init_models is None: + with open(forecasting_init_path, 'r') as f: + forecasting_init_dict: Dict[str, Any] = json.load(f) + cfg_trainer: Dict = forecasting_init_dict['trainer'] + models_name_to_cfgs: Dict = forecasting_init_dict['models'] + + window_size = config_space.get_default_configuration()["data_loader:window_size"] + if suggested_init_models is None: + suggested_init_models = list(models_name_to_cfgs.keys()) + + for model_name in suggested_init_models: + cfg_tmp = cfg_trainer.copy() + + model_cfg = models_name_to_cfgs.get(model_name, None) + if model_cfg is None: + warnings.warn(f'Cannot to find the corresponding information of model {model_name} from,' + f' forecasting_init_cfgs, currently only {list(models_name_to_cfgs.keys())} are ' + f'supported') + continue + if not model_cfg.get('data_loader:backcast', False): + cfg_tmp['data_loader:window_size'] = window_size + + cfg_tmp.update(model_cfg) + if not uni_variant: + cfg_tmp.update(forecasting_init_dict['feature_preprocessing']) + if features_have_missing_values: + cfg_tmp.update(forecasting_init_dict['feature_imputer']) + if targets_have_missing_values: + cfg_tmp.update(forecasting_init_dict['target_imputer']) + + initial_configurations_dict.append(cfg_tmp) + + if custom_init_setting_path is not None: + try: + with open(custom_init_setting_path, 'r') as f: + initial_configurations_custom_dict: Union[List[Dict[str, Any]], Dict] = json.load(f) + except FileNotFoundError: + raise FileNotFoundError("The path: {} provided for 'custome_setting_path' for " + "the file containing the custom initial configurations " + "does not exist. Please provide a valid path".format(custom_init_setting_path)) + if isinstance(initial_configurations_custom_dict, list): + initial_configurations_dict.extend(initial_configurations_custom_dict) + else: + initial_configurations_dict.append(initial_configurations_custom_dict) + + for configuration_dict in initial_configurations_dict: + try: + configuration = Configuration(config_space, configuration_dict) + initial_configurations.append(configuration) + except Exception as e: + warnings.warn(f"Failed to convert {configuration_dict} into" + f" a Configuration with error {e}. " + f"Therefore, it can't be used as an initial " + f"configuration as it does not match the current config space. 
") + return initial_configurations diff --git a/autoPyTorch/pipeline/base_pipeline.py b/autoPyTorch/pipeline/base_pipeline.py index 90c0f6362..5c580dbd6 100644 --- a/autoPyTorch/pipeline/base_pipeline.py +++ b/autoPyTorch/pipeline/base_pipeline.py @@ -412,12 +412,26 @@ def _check_search_space_updates(self, include: Optional[Dict[str, Any]], # check if component is not present in include if include is not None and update.node_name in include.keys(): if split_hyperparameter[0] not in include[update.node_name]: - raise ValueError("Not found {} in include".format(split_hyperparameter[0])) + hp_in_component = False + # If the node contains subcomponent that is also an instance of autoPyTorchChoice, + # We need to ensure that include is properly passed to it subcomponent + for include_component in include[update.node_name]: + if include_component.startswith(split_hyperparameter[0]): + hp_in_component = True + break + if not hp_in_component: + raise ValueError("Not found {} in include".format(split_hyperparameter[0])) # check if component is present in exclude if exclude is not None and update.node_name in exclude.keys(): if split_hyperparameter[0] in exclude[update.node_name]: - raise ValueError("Found {} in exclude".format(split_hyperparameter[0])) + hp_in_component = False + for exclude_component in exclude[update.node_name]: + if exclude_component.startswith(split_hyperparameter[0]): + hp_in_component = True + break + if not hp_in_component: + raise ValueError("Found {} in exclude".format(split_hyperparameter[0])) components = node.get_components() # if hyperparameter is __choice__, check if @@ -440,10 +454,23 @@ def _check_search_space_updates(self, include: Optional[Dict[str, Any]], # needs to be updated is in components of the # choice module elif split_hyperparameter[0] not in components.keys(): - raise ValueError("Unknown hyperparameter for choice {}. " - "Expected update hyperparameter " - "to be in {} got {}".format(node.__class__.__name__, - components.keys(), split_hyperparameter[0])) + hp_in_component = False + if hasattr(node, 'additional_components') and node.additional_components: + # This is designed for forecasting network encoder: + # forecasting network backbone is composed of two parts: encoder and decoder whereas the type + # of the decoder is determined by the encoder. However, the type of decoder cannot be any part + # of encoder's choice. To allow the user to update the hyperparameter search space for decoder + # network, we consider decoder as "additional_components" and check if the update can be applied + # to node.additional_components + for component_func in node.additional_components: + if split_hyperparameter[0] in component_func().keys(): + hp_in_component = True + break + if not hp_in_component: + raise ValueError("Unknown hyperparameter for choice {}. 
" + "Expected update hyperparameter " + "to be in {} got {}".format(node.__class__.__name__, + components.keys(), split_hyperparameter[0])) else: # check if hyperparameter is in the search space of the component component = components[split_hyperparameter[0]] diff --git a/autoPyTorch/pipeline/components/preprocessing/base_preprocessing.py b/autoPyTorch/pipeline/components/preprocessing/base_preprocessing.py index cfc1a890b..9072e8542 100644 --- a/autoPyTorch/pipeline/components/preprocessing/base_preprocessing.py +++ b/autoPyTorch/pipeline/components/preprocessing/base_preprocessing.py @@ -6,7 +6,7 @@ import pandas as pd -from scipy.sparse import csr_matrix +from scipy.sparse import spmatrix import torch @@ -24,7 +24,64 @@ def __init__(self) -> None: super().__init__() self.add_fit_requirements([ FitRequirement('X_train', - (np.ndarray, pd.DataFrame, csr_matrix), + (np.ndarray, pd.DataFrame, spmatrix), + user_defined=True, dataset_property=False), + FitRequirement('backend', + (Backend, ), + user_defined=True, dataset_property=False)]) + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + """ + Adds the fitted early_preprocessor into the 'X' dictionary and returns it. + Args: + X (Dict[str, Any]): 'X' dictionary + + Returns: + (Dict[str, Any]): the updated 'X' dictionary + """ + raise NotImplementedError() + + def __call__(self, X: Union[np.ndarray, torch.Tensor]) -> Union[np.ndarray, torch.Tensor]: + """ + Makes the autoPyTorchPreprocessingComponent Callable. Calling the component + calls the transform function of the underlying early_preprocessor and + returns the transformed array. + Args: + X (Union[np.ndarray, torch.Tensor]): input data tensor + + Returns: + Union[np.ndarray, torch.Tensor]: Transformed data tensor + """ + raise NotImplementedError() + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> ConfigurationSpace: + """Return the configuration space of this classification algorithm. + + Args: + dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]): Describes the dataset + to work on + + Returns: + ConfigurationSpace: The configuration space of this algorithm. + """ + return ConfigurationSpace() + + +class autoPyTorchTargetPreprocessingComponent(autoPyTorchComponent): + """ + Provides abstract interface for target preprocessing algorithms in AutoPyTorch. 
Most methods defined in this class + are the same as autoPyTorch.pipeline.components.preprocessing.base_preprocessing.autoPyTorchPreprocessingComponent + However, they are defined as two different classes such that its subclasses will not be identified as feature + preprocessor + """ + def __init__(self) -> None: + super().__init__() + self.add_fit_requirements([ + FitRequirement('y_train', + (pd.DataFrame, ), user_defined=True, dataset_property=False), FitRequirement('backend', (Backend, ), diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py index ea47e33b9..02a3085b0 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py @@ -1,7 +1,8 @@ -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, Tuple, Union import numpy as np +from sklearn.base import BaseEstimator from sklearn.compose import ColumnTransformer from sklearn.pipeline import make_pipeline @@ -48,18 +49,25 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TabularColumnTransformer": "TabularColumnTransformer": an instance of self """ self.check_requirements(X, y) - numerical_pipeline = 'drop' - categorical_pipeline = 'drop' preprocessors = get_tabular_preprocessers(X) - if len(X['dataset_properties']['numerical_columns']): + column_transformers: List[Tuple[str, BaseEstimator, List[int]]] = [] + if len(preprocessors['numerical']) > 0: numerical_pipeline = make_pipeline(*preprocessors['numerical']) - if len(X['dataset_properties']['categorical_columns']): + column_transformers.append( + ('numerical_pipeline', numerical_pipeline, X['dataset_properties']['numerical_columns']) + ) + if len(preprocessors['categorical']) > 0: categorical_pipeline = make_pipeline(*preprocessors['categorical']) - - self.preprocessor = ColumnTransformer([ - ('numerical_pipeline', numerical_pipeline, X['dataset_properties']['numerical_columns']), - ('categorical_pipeline', categorical_pipeline, X['dataset_properties']['categorical_columns'])], + column_transformers.append( + ('categorical_pipeline', categorical_pipeline, X['dataset_properties']['categorical_columns']) + ) + + # in case the preprocessing steps are disabled + # i.e, NoEncoder for categorical, we want to + # let the data in categorical columns pass through + self.preprocessor = ColumnTransformer( + column_transformers, remainder='passthrough' ) @@ -70,7 +78,13 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TabularColumnTransformer": else: X_train = X['backend'].load_datamanager().train_tensors[0] - self.preprocessor.fit(X_train) + if 'y_train' in X: + y_train = subsampler(X['y_train'], X['train_indices']) + else: + y_train = X['backend'].load_datamanager().train_tensors[1] + + self.preprocessor.fit(X_train, y=y_train) + return self def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/MinorityCoalescer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/MinorityCoalescer.py new file mode 100644 index 000000000..69edfcbb6 --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/MinorityCoalescer.py @@ -0,0 +1,44 @@ +from typing import Any, Dict, Optional, Union + +from 
ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import UniformFloatHyperparameter + +import numpy as np + +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer.base_coalescer import BaseCoalescer +from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter +from autoPyTorch.utils.implementations import MinorityCoalesceTransformer + + +class MinorityCoalescer(BaseCoalescer): + """Group together categories whose occurence is less than a specified min_frac """ + def __init__(self, min_frac: float, random_state: np.random.RandomState): + super().__init__() + self.min_frac = min_frac + self.random_state = random_state + + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseCoalescer: + self.check_requirements(X, y) + self.preprocessor['categorical'] = MinorityCoalesceTransformer(min_frac=self.min_frac) + return self + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, Any]] = None, + min_frac: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='min_frac', + value_range=(1e-4, 0.5), + default_value=1e-2, + ), + ) -> ConfigurationSpace: + + cs = ConfigurationSpace() + add_hyperparameter(cs, min_frac, UniformFloatHyperparameter) + return cs + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'MinorityCoalescer', + 'name': 'MinorityCoalescer', + 'handles_sparse': False + } diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/NoCoalescer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/NoCoalescer.py new file mode 100644 index 000000000..fdc13dec6 --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/NoCoalescer.py @@ -0,0 +1,37 @@ +from typing import Any, Dict, Optional, Union + +import numpy as np + +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer.base_coalescer import BaseCoalescer + + +class NoCoalescer(BaseCoalescer): + def __init__(self, random_state: np.random.RandomState): + super().__init__() + self.random_state = random_state + self._processing = False + + def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> BaseCoalescer: + """ + As no coalescing happens, only check the requirements. + + Args: + X (Dict[str, Any]): + fit dictionary + y (Optional[Any]): + Parameter to comply with scikit-learn API. Not used. 
+
+        Returns:
+            instance of self
+        """
+        self.check_requirements(X, y)
+
+        return self
+
+    @staticmethod
+    def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]:
+        return {
+            'shortname': 'NoCoalescer',
+            'name': 'NoCoalescer',
+            'handles_sparse': True
+        }
diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py
new file mode 100644
index 000000000..1139106ce
--- /dev/null
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py
@@ -0,0 +1,254 @@
+import os
+from collections import OrderedDict
+from typing import Dict, List, Optional, Sequence
+
+import ConfigSpace.hyperparameters as CSH
+from ConfigSpace.configuration_space import ConfigurationSpace
+
+from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
+from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice
+from autoPyTorch.pipeline.components.base_component import (
+    ThirdPartyComponents,
+    autoPyTorchComponent,
+    find_components,
+)
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer.base_coalescer import BaseCoalescer
+from autoPyTorch.utils.common import HyperparameterSearchSpace, HyperparameterValueType
+
+
+coalescer_directory = os.path.split(__file__)[0]
+_coalescer = find_components(__package__,
+                             coalescer_directory,
+                             BaseCoalescer)
+_addons = ThirdPartyComponents(BaseCoalescer)
+
+
+def add_coalescer(coalescer: BaseCoalescer) -> None:
+    _addons.add_component(coalescer)
+
+
+class CoalescerChoice(autoPyTorchChoice):
+    """
+    Allows for dynamically choosing the coalescer component at runtime
+    """
+    proc_name = "coalescer"
+
+    def get_components(self) -> Dict[str, autoPyTorchComponent]:
+        """Returns the available coalescer components
+
+        Args:
+            None
+
+        Returns:
+            Dict[str, autoPyTorchComponent]: all BaseCoalescer components available
+                as choices for coalescing the categorical columns
+        """
+        # TODO: Create `@property def components(): ...`.
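# A hypothetical sketch (not from the patch) of how a third-party coalescer
# would be registered through the `add_coalescer` hook defined above and then
# surface in this method via `_addons.components`:
#
#     class MyCoalescer(BaseCoalescer):  # hypothetical component
#         ...
#
#     add_coalescer(MyCoalescer)
#     # get_components() will now also offer 'MyCoalescer' as a choice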
+        components = OrderedDict()
+        components.update(_coalescer)
+        components.update(_addons.components)
+        return components
+
+    @staticmethod
+    def _get_default_choice(
+        avail_components: Dict[str, autoPyTorchComponent],
+        include: List[str],
+        exclude: List[str],
+        defaults: List[str] = ['NoCoalescer', 'MinorityCoalescer'],
+    ) -> str:
+        # TODO: Make it a base method
+        for choice in defaults:
+            if choice in avail_components and choice in include and choice not in exclude:
+                return choice
+        else:
+            raise RuntimeError(
+                f"None of the default components is available: each is either "
+                f"not in `include` {include} or is in `exclude` {exclude}"
+            )
+
+    def _update_config_space(
+        self,
+        component: CSH.Hyperparameter,
+        avail_components: Dict[str, autoPyTorchComponent],
+        dataset_properties: Dict[str, BaseDatasetPropertiesType]
+    ) -> None:
+        # TODO: Make it a base method
+        cs = ConfigurationSpace()
+        cs.add_hyperparameter(component)
+
+        # add only the child hyperparameters of the coalescer choices
+        for name in component.choices:
+            updates = self._get_search_space_updates(prefix=name)
+            func4cs = avail_components[name].get_hyperparameter_search_space
+
+            # search space provides different args, so ignore it
+            component_config_space = func4cs(dataset_properties, **updates)  # type:ignore[call-arg]
+            parent_hyperparameter = {'parent': component, 'value': name}
+            cs.add_configuration_space(
+                name,
+                component_config_space,
+                parent_hyperparameter=parent_hyperparameter
+            )
+
+        self.configuration_space = cs
+
+    def _check_choices_in_update(
+        self,
+        choices_in_update: Sequence[HyperparameterValueType],
+        avail_components: Dict[str, autoPyTorchComponent]
+    ) -> None:
+        # TODO: Make it a base method
+        if not set(choices_in_update).issubset(avail_components):
+            raise ValueError(
+                f"The update for {self.__class__.__name__} is expected to be "
+                f"a subset of {avail_components}, but got {choices_in_update}"
+            )
+
+    def get_hyperparameter_search_space(self,
+                                        dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None,
+                                        default: Optional[str] = None,
+                                        include: Optional[List[str]] = None,
+                                        exclude: Optional[List[str]] = None) -> ConfigurationSpace:
+        # TODO: Make it a base method
+
+        if dataset_properties is None:
+            dataset_properties = dict()
+
+        dataset_properties = {**self.dataset_properties, **dataset_properties}
+
+        avail_cmps = self.get_available_components(
+            dataset_properties=dataset_properties,
+            include=include,
+            exclude=exclude
+        )
+
+        if len(avail_cmps) == 0:
+            raise ValueError(f"No {self.proc_name} found, please add {self.proc_name} to `include` argument")
+
+        include = include if include is not None else list(avail_cmps.keys())
+        exclude = exclude if exclude is not None else []
+        if default is None:
+            default = self._get_default_choice(avail_cmps, include, exclude)
+
+        updates = self._get_search_space_updates()
+        if "__choice__" in updates:
+            component = self._get_component_with_updates(
+                updates=updates,
+                avail_components=avail_cmps,
+                dataset_properties=dataset_properties
+            )
+        else:
+            component = self._get_component_without_updates(
+                default=default,
+                include=include,
+                avail_components=avail_cmps,
+                dataset_properties=dataset_properties
+            )
+
+        self.dataset_properties = dataset_properties
+        self._update_config_space(
+            component=component,
+            avail_components=avail_cmps,
+            dataset_properties=dataset_properties
+        )
+        return self.configuration_space
+
+    def _check_dataset_properties(self, dataset_properties: Dict[str, BaseDatasetPropertiesType]) -> None:
+        """
+        A mechanism to ensure the correctness of the dataset_properties.
+        It recursively makes sure that the child- and parent-level
+        requirements are honored.
+
+        Args:
+            dataset_properties (Dict[str, BaseDatasetPropertiesType]):
+                The properties of the dataset.
+        """
+        # TODO: Make it a base method
+        super()._check_dataset_properties(dataset_properties)
+        if any(key not in dataset_properties for key in ['categorical_columns', 'numerical_columns']):
+            raise ValueError("Dataset properties must contain information about the type of columns")
+
+    def _get_component_with_updates(
+        self,
+        updates: Dict[str, HyperparameterSearchSpace],
+        avail_components: Dict[str, autoPyTorchComponent],
+        dataset_properties: Dict[str, BaseDatasetPropertiesType],
+    ) -> CSH.Hyperparameter:
+        # TODO: Make it a base method
+        choice_key = '__choice__'
+        choices_in_update = updates[choice_key].value_range
+        default_in_update = updates[choice_key].default_value
+        self._check_choices_in_update(
+            choices_in_update=choices_in_update,
+            avail_components=avail_components
+        )
+        self._check_update_compatibility(choices_in_update, dataset_properties)
+        return CSH.CategoricalHyperparameter(choice_key, choices_in_update, default_in_update)
+
+    def _get_component_without_updates(
+        self,
+        avail_components: Dict[str, autoPyTorchComponent],
+        dataset_properties: Dict[str, BaseDatasetPropertiesType],
+        default: str,
+        include: List[str]
+    ) -> CSH.Hyperparameter:
+        """
+        A method to get hyperparameter information for the component.
+        This method is run when we do not get updates from _get_search_space_updates.
+
+        Args:
+            avail_components (Dict[str, autoPyTorchComponent]):
+                Available components for this processing.
+            dataset_properties (Dict[str, BaseDatasetPropertiesType]):
+                The properties of the dataset.
+            default (str):
+                The default component for this processing.
+            include (List[str]):
+                The components to include for the auto-pytorch searching.
+
+        Returns:
+            (CSH.Hyperparameter):
+                The hyperparameter information for this processing.
+        """
+        # TODO: Make an abstract method with NotImplementedError
+        choice_key = '__choice__'
+        no_proc_key = 'NoCoalescer'
+        choices = list(avail_components.keys())
+
+        assert isinstance(dataset_properties['categorical_columns'], list)  # mypy check
+        if len(dataset_properties['categorical_columns']) == 0:
+            # only the no-op coalescer is compatible if the dataset has only numerical columns
+            default, choices = no_proc_key, [no_proc_key]
+            if no_proc_key not in include:
+                raise ValueError("Only no coalescer is compatible for a dataset with no categorical column")
+
+        return CSH.CategoricalHyperparameter(choice_key, choices, default_value=default)
+
+    def _check_update_compatibility(
+        self,
+        choices_in_update: Sequence[HyperparameterValueType],
+        dataset_properties: Dict[str, BaseDatasetPropertiesType]
+    ) -> None:
+        """
+        Check the compatibility of the updates for the components in this
+        processing given the dataset properties. For example, some processing
+        is not compatible with datasets that have no numerical columns.
+        We would like to check such compatibility in this method.
+
+        Args:
+            choices_in_update (Sequence[HyperparameterValueType]):
+                The choices of components in the updates
+            dataset_properties (Dict[str, BaseDatasetPropertiesType]):
+                The properties of the dataset.
+ """ + # TODO: Make an abstract method with NotImplementedError + assert isinstance(dataset_properties['categorical_columns'], list) # mypy check + if len(dataset_properties['categorical_columns']) > 0: + # no restriction for update if dataset has categorical columns + return + + if 'NoCoalescer' not in choices_in_update or len(choices_in_update) != 1: + raise ValueError( + "Only no coalescer is compatible for a dataset with no categorical column, " + f"but got {choices_in_update}" + ) diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/base_coalescer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/base_coalescer.py new file mode 100644 index 000000000..b572f8343 --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/base_coalescer.py @@ -0,0 +1,33 @@ +from typing import Any, Dict, List + +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.base_tabular_preprocessing import ( + autoPyTorchTabularPreprocessingComponent +) +from autoPyTorch.utils.common import FitRequirement + + +class BaseCoalescer(autoPyTorchTabularPreprocessingComponent): + def __init__(self) -> None: + super().__init__() + self._processing = True + self.add_fit_requirements([ + FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True), + FitRequirement('categories', (List,), user_defined=True, dataset_property=True) + ]) + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + """ + Add the preprocessor to the provided fit dictionary `X`. + + Args: + X (Dict[str, Any]): fit dictionary in sklearn + + Returns: + X (Dict[str, Any]): the updated fit dictionary + """ + if self._processing and self.preprocessor['categorical'] is None: + # If we apply minority coalescer, we must have categorical preprocessor! + raise RuntimeError(f"fit() must be called before transform() on {self.__class__.__name__}") + + X.update({'coalescer': self.preprocessor}) + return X diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/ExtraTreesPreprocessorClassification.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/ExtraTreesPreprocessorClassification.py new file mode 100644 index 000000000..274cdc49a --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/ExtraTreesPreprocessorClassification.py @@ -0,0 +1,172 @@ +from typing import Any, Dict, Optional, Union + +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformFloatHyperparameter, + UniformIntegerHyperparameter, +) + +import numpy as np + +from sklearn.base import BaseEstimator +from sklearn.ensemble import ExtraTreesClassifier +from sklearn.feature_selection import SelectFromModel + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \ + base_feature_preprocessor import autoPyTorchFeaturePreprocessingComponent +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. 
\ + utils import NoneType_ +from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, check_none + + +CRITERION_CHOICES = ("gini", "entropy") + + +class ExtraTreesPreprocessorClassification(autoPyTorchFeaturePreprocessingComponent): + """ + Select features based on importance weights calculated using extra trees + """ + def __init__(self, bootstrap: bool = True, n_estimators: int = 10, + criterion: str = "gini", max_features: float = 0.5, + max_depth: Union[int, NoneType_] = 5, min_samples_split: int = 2, + min_samples_leaf: int = 1, min_weight_fraction_leaf: float = 0, + max_leaf_nodes: Union[int, NoneType_] = "none", + min_impurity_decrease: float = 0, oob_score: bool = False, + verbose: int = 0, + random_state: Optional[np.random.RandomState] = None): + self.bootstrap = bootstrap + self.n_estimators = n_estimators + if criterion not in CRITERION_CHOICES: + raise ValueError(f"`criterion` of {self.__class__.__name__} " + f"must be in {CRITERION_CHOICES}, but got: {criterion}") + self.criterion = criterion + self.max_features = max_features + self.min_impurity_decrease = min_impurity_decrease + self.max_depth = max_depth + self.min_samples_split = min_samples_split + self.min_samples_leaf = min_samples_leaf + self.min_weight_fraction_leaf = min_weight_fraction_leaf + self.max_leaf_nodes = max_leaf_nodes + self.oob_score = oob_score + self.verbose = verbose + + super().__init__(random_state=random_state) + + def get_components_kwargs(self) -> Dict[str, Any]: + """ + returns keyword arguments required by the feature preprocessor + + Returns: + Dict[str, Any]: kwargs + """ + return dict( + bootstrap=self.bootstrap, + n_estimators=self.n_estimators, + criterion=self.criterion, + max_features=self.max_features, + min_impurity_decrease=self.min_impurity_decrease, + max_depth=self.max_depth, + min_samples_split=self.min_samples_split, + min_samples_leaf=self.min_samples_leaf, + min_weight_fraction_leaf=self.min_weight_fraction_leaf, + max_leaf_nodes=self.max_leaf_nodes, + oob_score=self.oob_score, + verbose=self.verbose, + random_state=self.random_state, + ) + + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: + + if check_none(self.max_leaf_nodes): + self.max_leaf_nodes = None + elif isinstance(self.max_leaf_nodes, int): + self.max_leaf_nodes = int(self.max_leaf_nodes) + else: + raise ValueError(f"Expected `max_leaf_nodes` to be either " + f"in ('None', 'none', None) or an integer, got {self.max_leaf_nodes}") + + if check_none(self.max_depth): + self.max_depth = None + elif isinstance(self.max_depth, int): + self.max_depth = int(self.max_depth) + else: + raise ValueError(f"Expected `max_depth` to be either " + f"in ('None', 'none', None) or an integer, got {self.max_depth}") + + # TODO: add class_weights + estimator = ExtraTreesClassifier(**self.get_components_kwargs()) + + self.preprocessor['numerical'] = SelectFromModel(estimator=estimator, + threshold='mean', + prefit=False) + return self + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + bootstrap: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='bootstrap', + value_range=(True, False), + default_value=True, + ), + n_estimators: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='n_estimators', + value_range=(10, 100), + default_value=10, + ), + max_depth: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='max_depth', + value_range=("none",), + 
default_value="none", + ), + max_features: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='max_features', + value_range=(0, 1), + default_value=0.5, + ), + min_impurity_decrease: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='min_impurity_decrease', + value_range=(0,), + default_value=0), + criterion: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='criterion', + value_range=CRITERION_CHOICES, + default_value="gini", + ), + min_samples_split: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='min_samples_split', + value_range=(2, 20), + default_value=2, + ), + min_samples_leaf: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='min_samples_leaf', + value_range=(1, 20), + default_value=1, + ), + min_weight_fraction_leaf: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='min_weight_fraction_leaf', + value_range=(0,), + default_value=0), + max_leaf_nodes: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='max_leaf_nodes', + value_range=("none",), + default_value="none", + ), + ) -> ConfigurationSpace: + + cs = ConfigurationSpace() + add_hyperparameter(cs, bootstrap, CategoricalHyperparameter) + add_hyperparameter(cs, n_estimators, UniformIntegerHyperparameter) + add_hyperparameter(cs, max_features, UniformFloatHyperparameter) + add_hyperparameter(cs, min_impurity_decrease, UniformFloatHyperparameter) + add_hyperparameter(cs, criterion, CategoricalHyperparameter) + add_hyperparameter(cs, max_depth, UniformIntegerHyperparameter) + add_hyperparameter(cs, min_samples_split, UniformIntegerHyperparameter) + add_hyperparameter(cs, min_samples_leaf, UniformIntegerHyperparameter) + add_hyperparameter(cs, min_weight_fraction_leaf, UniformFloatHyperparameter) + add_hyperparameter(cs, max_leaf_nodes, UniformIntegerHyperparameter) + + return cs + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Any]: + return {'shortname': 'ETC', + 'name': 'Extra Trees Classifier Preprocessing', + 'handles_sparse': True, + 'handles_regression': False, + 'handles_classification': True + } diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/ExtraTreesPreprocessorRegression.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/ExtraTreesPreprocessorRegression.py new file mode 100644 index 000000000..3c3db31cd --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/ExtraTreesPreprocessorRegression.py @@ -0,0 +1,175 @@ +from typing import Any, Dict, List, Optional, Union + +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformFloatHyperparameter, + UniformIntegerHyperparameter, +) + +import numpy as np + +from sklearn.base import BaseEstimator +from sklearn.ensemble import ExtraTreesRegressor +from sklearn.feature_selection import SelectFromModel + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \ + base_feature_preprocessor import autoPyTorchFeaturePreprocessingComponent +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. 
\ +    utils import NoneType_ +from autoPyTorch.utils.common import FitRequirement, HyperparameterSearchSpace, add_hyperparameter, check_none + + +CRITERION_CHOICES = ('mse', 'friedman_mse', 'mae') + + +class ExtraTreesPreprocessorRegression(autoPyTorchFeaturePreprocessingComponent): +    """ +    Selects features based on importance weights using extra trees +    """ +    def __init__(self, bootstrap: bool = True, n_estimators: int = 10, +                 criterion: str = "mse", max_features: float = 1, +                 max_depth: Union[int, NoneType_] = 5, min_samples_split: int = 2, +                 min_samples_leaf: int = 1, min_weight_fraction_leaf: float = 0, +                 max_leaf_nodes: Union[int, NoneType_] = "none", +                 oob_score: bool = False, verbose: int = 0, +                 random_state: Optional[np.random.RandomState] = None): +        self.bootstrap = bootstrap +        self.n_estimators = n_estimators +        if criterion not in CRITERION_CHOICES: +            raise ValueError(f"`criterion` of {self.__class__.__name__} " +                             f"must be in {CRITERION_CHOICES}, but got: {criterion}") +        self.criterion = criterion +        self.max_features = max_features +        self.max_depth = max_depth +        self.min_samples_split = min_samples_split +        self.min_samples_leaf = min_samples_leaf +        self.min_weight_fraction_leaf = min_weight_fraction_leaf +        self.max_leaf_nodes = max_leaf_nodes +        self.oob_score = oob_score +        self.verbose = verbose + +        super().__init__(random_state=random_state) + +        self.add_fit_requirements([ +            FitRequirement('numerical_columns', (List,), user_defined=True, dataset_property=True)]) + +    def get_components_kwargs(self) -> Dict[str, Any]: +        """ +        returns keyword arguments required by the feature preprocessor + +        Returns: +            Dict[str, Any]: kwargs +        """ +        return dict( +            bootstrap=self.bootstrap, +            n_estimators=self.n_estimators, +            criterion=self.criterion, +            max_features=self.max_features, +            max_depth=self.max_depth, +            min_samples_split=self.min_samples_split, +            min_samples_leaf=self.min_samples_leaf, +            min_weight_fraction_leaf=self.min_weight_fraction_leaf, +            max_leaf_nodes=self.max_leaf_nodes, +            oob_score=self.oob_score, +            verbose=self.verbose, +            random_state=self.random_state, +        ) + +    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: + +        self.check_requirements(X, y) + +        if check_none(self.max_leaf_nodes): +            self.max_leaf_nodes = None +        elif isinstance(self.max_leaf_nodes, int): +            self.max_leaf_nodes = int(self.max_leaf_nodes) +        else: +            raise ValueError(f"Expected `max_leaf_nodes` to be either " +                             f"in ('None', 'none', None) or an integer, got {self.max_leaf_nodes}") + +        if check_none(self.max_depth): +            self.max_depth = None +        elif isinstance(self.max_depth, int): +            self.max_depth = int(self.max_depth) +        else: +            raise ValueError(f"Expected `max_depth` to be either " +                             f"in ('None', 'none', None) or an integer, got {self.max_depth}") + +        num_features = len(X['dataset_properties']['numerical_columns']) +        max_features = int( +            float(self.max_features) * (np.log(num_features) + 1)) +        # Use at most half of the features +        max_features = max(1, min(int(num_features / 2), max_features)) +        # Store the resolved integer budget so that get_components_kwargs() passes it to the estimator +        self.max_features = max_features + +        # TODO: add class_weights +        estimator = ExtraTreesRegressor(**self.get_components_kwargs()) + +        self.preprocessor['numerical'] = SelectFromModel(estimator=estimator, +                                                         threshold='mean', +                                                         prefit=False) +        return self + +    @staticmethod +    def get_hyperparameter_search_space( +        dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, +        bootstrap: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='bootstrap', +                                                                         value_range=(True, False), +                                                                         default_value=True, +                                                                         ), +
n_estimators: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='n_estimators', + value_range=(100,), + default_value=100, + ), + max_depth: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='max_depth', + value_range=("none",), + default_value="none", + ), + max_features: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='max_features', + value_range=(0.1, 1), + default_value=1, + ), + criterion: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='criterion', + value_range=CRITERION_CHOICES, + default_value="mse", + ), + min_samples_split: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='min_samples_split', + value_range=(2, 20), + default_value=2, + ), + min_samples_leaf: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='min_samples_leaf', + value_range=(1, 20), + default_value=1, + ), + min_weight_fraction_leaf: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='min_weight_fraction_leaf', + value_range=(0,), + default_value=0), + max_leaf_nodes: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='max_leaf_nodes', + value_range=("none",), + default_value="none", + ), + ) -> ConfigurationSpace: + + cs = ConfigurationSpace() + add_hyperparameter(cs, bootstrap, CategoricalHyperparameter) + add_hyperparameter(cs, n_estimators, UniformIntegerHyperparameter) + add_hyperparameter(cs, max_features, UniformFloatHyperparameter) + add_hyperparameter(cs, criterion, CategoricalHyperparameter) + add_hyperparameter(cs, max_depth, UniformIntegerHyperparameter) + add_hyperparameter(cs, min_samples_split, UniformIntegerHyperparameter) + add_hyperparameter(cs, min_samples_leaf, UniformIntegerHyperparameter) + add_hyperparameter(cs, min_weight_fraction_leaf, UniformFloatHyperparameter) + add_hyperparameter(cs, max_leaf_nodes, UniformIntegerHyperparameter) + + return cs + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Any]: + return {'shortname': 'ETR', + 'name': 'Extra Trees Regressor Preprocessing', + 'handles_sparse': True, + 'handles_regression': True, + 'handles_classification': False + } diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/FastICA.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/FastICA.py new file mode 100644 index 000000000..bded9e093 --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/FastICA.py @@ -0,0 +1,118 @@ +from typing import Any, Dict, Optional + +from ConfigSpace.conditions import EqualsCondition +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformIntegerHyperparameter, +) + +import numpy as np + +from sklearn.base import BaseEstimator +from sklearn.decomposition import FastICA as SklearnFastICA + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \ + base_feature_preprocessor import autoPyTorchFeaturePreprocessingComponent +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. 
\ + utils import percentage_value_range_to_integer_range +from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter + + +class FastICA(autoPyTorchFeaturePreprocessingComponent): + """ + Reduce number of features by separating a multivariate signal into + additive subcomponents that are maximally independent. + + Args: + n_components (int): + Number of components to use + Note: + This number needs to be less than the total number of + features. To keep the hyperparameter search space general + to different datasets, autoPyTorch defines its value + range as the percentage of the number of features (in float). + This is then used to construct the range of n_components using + n_components = percentage of features * number of features. + Defaults to 100. + algorithm (str): + Apply parallel or deflational algorithm for FastICA. + Defaults to 'parallel'. + whiten (bool): + If whiten is false, the data is already considered to be whitened, + and no whitening is performed. Defaults to False. + fun (str): + The functional form of the G function used in the approximation to neg-entropy. + Defaults to "logcosh". + """ + def __init__(self, n_components: int = 100, + algorithm: str = 'parallel', whiten: bool = False, + fun: str = "logcosh", + random_state: Optional[np.random.RandomState] = None + ): + self.n_components = n_components + self.algorithm = algorithm + self.whiten = whiten + self.fun = fun + + super().__init__(random_state=random_state) + + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: + + self.check_requirements(X, y) + + self.preprocessor['numerical'] = SklearnFastICA( + n_components=self.n_components, algorithm=self.algorithm, + fun=self.fun, whiten=self.whiten, random_state=self.random_state) + + return self + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + n_components: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='n_components', + value_range=(0.5, 0.9), + default_value=0.5, + ), + algorithm: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='algorithm', + value_range=('parallel', 'deflation'), + default_value='parallel', + ), + whiten: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='whiten', + value_range=(True, False), + default_value=False, + ), + fun: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='fun', + value_range=('logcosh', 'exp', 'cube'), + default_value='logcosh', + ), + ) -> ConfigurationSpace: + n_components = percentage_value_range_to_integer_range( + hyperparameter_search_space=n_components, + default_value_range=(10, 2000), + default_value=100, + dataset_properties=dataset_properties, + ) + cs = ConfigurationSpace() + + n_components_hp = get_hyperparameter(n_components, UniformIntegerHyperparameter) + whiten_hp = get_hyperparameter(whiten, CategoricalHyperparameter) + add_hyperparameter(cs, algorithm, CategoricalHyperparameter) + add_hyperparameter(cs, fun, CategoricalHyperparameter) + cs.add_hyperparameter(whiten_hp) + + if True in whiten_hp.choices: + cs.add_hyperparameter(n_components_hp) + cs.add_condition(EqualsCondition(n_components_hp, whiten_hp, True)) + + return cs + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Any]: + return {'shortname': 'FastICA', + 'name': 'Fast Independent Component Analysis', + 'handles_sparse': False, + 
'handles_classification': True, +                'handles_regression': True +                } diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/FeatureAgglomeration.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/FeatureAgglomeration.py new file mode 100644 index 000000000..63519e301 --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/FeatureAgglomeration.py @@ -0,0 +1,129 @@ +from typing import Any, Callable, Dict, Optional + +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.forbidden import ( +    ForbiddenAndConjunction, +    ForbiddenEqualsClause, +    ForbiddenInClause +) +from ConfigSpace.hyperparameters import ( +    CategoricalHyperparameter, +    UniformIntegerHyperparameter, +) + +import numpy as np + +from sklearn.base import BaseEstimator +from sklearn.cluster import FeatureAgglomeration as SklearnFeatureAgglomeration + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \ +    base_feature_preprocessor import autoPyTorchFeaturePreprocessingComponent +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \ +    utils import percentage_value_range_to_integer_range +from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter + + +class FeatureAgglomeration(autoPyTorchFeaturePreprocessingComponent): +    """ +    Recursively merges pairs of feature clusters constructed +    using agglomerative clustering. + +    Args: +        n_clusters (int): +            The number of clusters to find. Defaults to 25. +            Note: +                This number needs to be less than the total number of +                features. To keep the hyperparameter search space general +                to different datasets, autoPyTorch defines its value +                range as the percentage of the number of features (in float). +                This is then used to construct the range of n_clusters using +                n_clusters = percentage of features * number of features. +        affinity (str): +            Metric used to compute the linkage. If linkage is “ward”, only +            “euclidean” is accepted. Defaults to 'euclidean'. +        linkage (str): +            Which linkage criterion to use. The linkage criterion determines +            which distance to use between sets of features. Defaults to 'ward'. +        pooling_func (str): +            Combines the values of agglomerated features into a single value; +            autoPyTorch uses the (max, mean and median) functions from numpy. Defaults to "max". 
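+    Note: +        A worked example of the percentage-to-integer conversion (the numbers here are illustrative only, not taken from the code): with 100 numerical features and an n_clusters percentage value of 0.25, the constructed value would be n_clusters = 0.25 * 100 = 25 clusters.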
+ """ + def __init__(self, n_clusters: int = 25, + affinity: str = 'euclidean', linkage: str = 'ward', + pooling_func: str = "max", + random_state: Optional[np.random.RandomState] = None + ): + self.n_clusters = n_clusters + self.affinity = affinity + self.linkage = linkage + self.pooling_func: Callable = getattr(np, pooling_func) + + super().__init__(random_state=random_state) + + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: + + self.check_requirements(X, y) + + self.preprocessor['numerical'] = SklearnFeatureAgglomeration( + n_clusters=self.n_clusters, affinity=self.affinity, + linkage=self.linkage, pooling_func=self.pooling_func) + + return self + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + n_clusters: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='n_clusters', + value_range=(0.5, 0.9), + default_value=0.5, + ), + affinity: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='affinity', + value_range=("euclidean", + "manhattan", + "cosine"), + default_value="euclidean", + ), + linkage: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='linkage', + value_range=("ward", "complete", "average"), + default_value="ward", + ), + pooling_func: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='pooling_func', + value_range=("mean", "median", "max"), + default_value="max", + ), + ) -> ConfigurationSpace: + n_clusters = percentage_value_range_to_integer_range( + hyperparameter_search_space=n_clusters, + default_value_range=(2, 400), + default_value=25, + dataset_properties=dataset_properties, + ) + cs = ConfigurationSpace() + + add_hyperparameter(cs, n_clusters, UniformIntegerHyperparameter) + affinity_hp = get_hyperparameter(affinity, CategoricalHyperparameter) + linkage_hp = get_hyperparameter(linkage, CategoricalHyperparameter) + add_hyperparameter(cs, pooling_func, CategoricalHyperparameter) + cs.add_hyperparameters([affinity_hp, linkage_hp]) + + # If linkage is “ward”, only “euclidean” is accepted. 
+ non_euclidian_affinity = [choice for choice in ["manhattan", "cosine"] if choice in affinity_hp.choices] + + if "ward" in linkage_hp.choices and len(non_euclidian_affinity) > 0: + forbidden_condition = ForbiddenAndConjunction( + ForbiddenInClause(affinity_hp, non_euclidian_affinity), + ForbiddenEqualsClause(linkage_hp, "ward") + ) + cs.add_forbidden_clause(forbidden_condition) + + return cs + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Any]: + return {'shortname': 'FeatureAgglomeration', + 'name': 'Feature Agglomeration', + 'handles_sparse': False, + 'handles_classification': True, + 'handles_regression': True + } diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/KernelPCA.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/KernelPCA.py index afa0334cb..f6a8db28f 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/KernelPCA.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/KernelPCA.py @@ -1,5 +1,4 @@ -from math import ceil, floor -from typing import Any, Dict, List, Optional +from typing import Any, Dict, Optional from ConfigSpace.conditions import EqualsCondition, InCondition from ConfigSpace.configuration_space import ConfigurationSpace @@ -17,10 +16,35 @@ from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \ base_feature_preprocessor import autoPyTorchFeaturePreprocessingComponent +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \ + utils import percentage_value_range_to_integer_range from autoPyTorch.utils.common import FitRequirement, HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter class KernelPCA(autoPyTorchFeaturePreprocessingComponent): + """ + Non-linear dimensionality reduction through the use of kernels + + Args: + n_components (int): + Number of components. + Note: + This number needs to be less than the total number of + features. To keep the hyperparameter search space general + to different datasets, autoPyTorch defines its value + range as the percentage of the number of features (in float). + This is then used to construct the range of n_components using + n_components = percentage of features * number of features. + Defaults to 10. + kernel (str): + Kernel used for PCA. Defaults to 'rbf'. + degree (int): + Degree for poly kernels. Defaults to 3. + gamma (float): + Kernel coefficient for rbf, poly and sigmoid kernels. Defaults to 0.01. + coef0 (float): + Independent term in poly and sigmoid kernels. Defaults to 0.0. 
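+    Note: +        Illustration of the conversion done by percentage_value_range_to_integer_range (the numbers are assumed for the example and mirror the floor/ceil logic this utility replaces): with 50 numerical features and an n_components value range of (0.5, 0.9), the integer range becomes (floor(0.5 * 50), ceil(0.9 * 50)) = (25, 45).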
+ """ def __init__(self, n_components: int = 10, kernel: str = 'rbf', degree: int = 3, gamma: float = 0.01, coef0: float = 0.0, @@ -38,6 +62,8 @@ def __init__(self, n_components: int = 10, def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: + self.check_requirements(X, y) + self.preprocessor['numerical'] = sklearn.decomposition.KernelPCA( n_components=self.n_components, kernel=self.kernel, degree=self.degree, gamma=self.gamma, coef0=self.coef0, @@ -72,24 +98,12 @@ def get_hyperparameter_search_space( cs = ConfigurationSpace() - if dataset_properties is not None: - n_features = len(dataset_properties['numerical_columns']) if isinstance( - dataset_properties['numerical_columns'], List) else 0 - if n_features == 1: - log = False - else: - log = n_components.log - n_components = HyperparameterSearchSpace(hyperparameter='n_components', - value_range=( - floor(float(n_components.value_range[0]) * n_features), - ceil(float(n_components.value_range[1]) * n_features)), - default_value=ceil(float(n_components.default_value) * n_features), - log=log) - else: - n_components = HyperparameterSearchSpace(hyperparameter='n_components', - value_range=(10, 2000), - default_value=100, - log=n_components.log) + n_components = percentage_value_range_to_integer_range( + hyperparameter_search_space=n_components, + default_value_range=(10, 2000), + default_value=100, + dataset_properties=dataset_properties, + ) add_hyperparameter(cs, n_components, UniformIntegerHyperparameter) kernel_hp = get_hyperparameter(kernel, CategoricalHyperparameter) @@ -121,5 +135,7 @@ def get_hyperparameter_search_space( def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Any]: return {'shortname': 'KernelPCA', 'name': 'Kernel Principal Component Analysis', - 'handles_sparse': True + 'handles_sparse': True, + 'handles_classification': True, + 'handles_regression': True } diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/LibLinearSVCPreprocessor.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/LibLinearSVCPreprocessor.py new file mode 100644 index 000000000..f9d0c996d --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/LibLinearSVCPreprocessor.py @@ -0,0 +1,141 @@ +from typing import Any, Dict, Optional + +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.forbidden import ( + ForbiddenAndConjunction, + ForbiddenEqualsClause, +) +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformFloatHyperparameter, +) + +import numpy as np + +from sklearn.base import BaseEstimator +from sklearn.feature_selection import SelectFromModel +from sklearn.svm import LinearSVC + + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. 
\ + base_feature_preprocessor import autoPyTorchFeaturePreprocessingComponent +from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter + + +class LibLinearSVCPreprocessor(autoPyTorchFeaturePreprocessingComponent): + """ + Selects features based on importance weights using svm classifier + """ + def __init__(self, dual: bool = False, penalty: str = "l1", + loss: str = "squared_hinge", tol: float = 1e-4, + C: float = 1, multi_class: str = "ovr", + intercept_scaling: float = 1, fit_intercept: bool = True, + random_state: Optional[np.random.RandomState] = None): + + self.dual = dual + self.penalty = penalty + self.loss = loss + self.multi_class = multi_class + self.intercept_scaling = intercept_scaling + self.fit_intercept = fit_intercept + self.tol = tol + self.C = C + + super().__init__(random_state=random_state) + + def get_components_kwargs(self) -> Dict[str, Any]: + """ + returns keyword arguments required by the feature preprocessor + + Returns: + Dict[str, Any]: kwargs + """ + return dict( + dual=self.dual, + penalty=self.penalty, + loss=self.loss, + multi_class=self.multi_class, + intercept_scaling=self.intercept_scaling, + tol=self.tol, + fit_intercept=self.fit_intercept, + C=self.C, + random_state=self.random_state + ) + + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: + + self.check_requirements(X, y) + # TODO: add class_weights + estimator = LinearSVC(**self.get_components_kwargs()) + + self.preprocessor['numerical'] = SelectFromModel(estimator=estimator, + threshold='mean', + prefit=False) + return self + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Any]: + return {'shortname': 'LinearSVC Preprocessor', + 'name': 'linear Support Vector Classification Preprocessing', + 'handles_sparse': True, + 'handles_classification': True, + 'handles_regression': False + } + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + penalty: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='penalty', + value_range=("l1",), + default_value="l1", + ), + loss: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='loss', + value_range=("squared_hinge", "hinge"), + default_value="squared_hinge", + ), + dual: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='dual', + value_range=(False,), + default_value=False, + ), + tol: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='tol', + value_range=(1e-5, 1e-1), + default_value=1e-4, + log=True + ), + C: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='C', + value_range=(0.03125, 32768), + default_value=1, + log=True + ), + multi_class: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='multi_class', + value_range=("ovr",), + default_value="ovr"), + fit_intercept: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='fit_intercept', + value_range=(True,), + default_value=True, + ), + intercept_scaling: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='intercept_scaling', + value_range=(1,), + default_value=1, + ), + ) -> ConfigurationSpace: + + cs = ConfigurationSpace() + add_hyperparameter(cs, fit_intercept, CategoricalHyperparameter) + penalty_hp = get_hyperparameter(penalty, CategoricalHyperparameter) + add_hyperparameter(cs, multi_class, CategoricalHyperparameter) + loss_hp = 
get_hyperparameter(loss, CategoricalHyperparameter) + add_hyperparameter(cs, dual, CategoricalHyperparameter) + add_hyperparameter(cs, tol, UniformFloatHyperparameter) + add_hyperparameter(cs, C, UniformFloatHyperparameter) + add_hyperparameter(cs, intercept_scaling, UniformFloatHyperparameter) + + cs.add_hyperparameters([loss_hp, penalty_hp]) + if "l1" in penalty_hp.choices and "hinge" in loss_hp.choices: + penalty_and_loss = ForbiddenAndConjunction( + ForbiddenEqualsClause(penalty_hp, "l1"), + ForbiddenEqualsClause(loss_hp, "hinge") + ) + cs.add_forbidden_clause(penalty_and_loss) + return cs diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/NoFeaturePreprocessor.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/NoFeaturePreprocessor.py index 9eb83a003..11e12e7df 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/NoFeaturePreprocessor.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/NoFeaturePreprocessor.py @@ -47,8 +47,9 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: @staticmethod def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None ) -> Dict[str, Union[str, bool]]: - return { - 'shortname': 'NoFeaturePreprocessing', - 'name': 'No Feature Preprocessing', - 'handles_sparse': True - } + return {'shortname': 'NoFeaturePreprocessing', + 'name': 'No Feature Preprocessing', + 'handles_sparse': True, + 'handles_classification': True, + 'handles_regression': True + } diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/Nystroem.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/Nystroem.py index a0bd953cb..6fe2a617f 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/Nystroem.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/Nystroem.py @@ -1,5 +1,5 @@ -from math import ceil, floor -from typing import Any, Dict, List, Optional +import warnings +from typing import Any, Dict, Optional from ConfigSpace.conditions import EqualsCondition, InCondition from ConfigSpace.configuration_space import ConfigurationSpace @@ -17,10 +17,34 @@ from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \ base_feature_preprocessor import autoPyTorchFeaturePreprocessingComponent -from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \ + utils import percentage_value_range_to_integer_range +from autoPyTorch.utils.common import FitRequirement, HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter class Nystroem(autoPyTorchFeaturePreprocessingComponent): + """ + Construct an approximate feature map for an arbitrary kernel using a subset of the data as basis. + + Args: + n_components (int): + Note: + This number needs to be less than the total number of + features. To keep the hyperparameter search space general + to different datasets, autoPyTorch defines its value + range as the percentage of the number of features (in float). 
+            This is then used to construct the range of n_components using +            n_components = percentage of features * number of features. Defaults to 10. +        kernel (str): +            Kernel map to be approximated. Defaults to 'rbf'. +        degree (int): +            Degree of the polynomial kernel. Defaults to 3. +        gamma (float): +            Gamma parameter for the RBF, laplacian, polynomial, exponential chi2 and +            sigmoid kernels. Defaults to 0.01. +        coef0 (float): +            Zero coefficient for polynomial and sigmoid kernels. Defaults to 0.0. +    """ def __init__(self, n_components: int = 10, kernel: str = 'rbf', degree: int = 3, gamma: float = 0.01, coef0: float = 0.0, @@ -32,6 +56,8 @@ def __init__(self, n_components: int = 10, self.gamma = gamma self.coef0 = coef0 super().__init__(random_state=random_state) +        self.add_fit_requirements([ +            FitRequirement('issigned', (bool,), user_defined=True, dataset_property=True)]) def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: @@ -50,7 +76,11 @@ def get_hyperparameter_search_space( default_value=0.5, ), kernel: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='kernel', -                                                               value_range=('poly', 'rbf', 'sigmoid', 'cosine'), +                                                               value_range=('poly', +                                                                            'rbf', +                                                                            'sigmoid', +                                                                            'cosine', +                                                                            'chi2'), default_value='rbf', ), gamma: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='gamma', @@ -69,27 +99,42 @@ cs = ConfigurationSpace() +        n_components = percentage_value_range_to_integer_range( +            hyperparameter_search_space=n_components, +            default_value_range=(10, 2000), +            default_value=100, +            dataset_properties=dataset_properties, +        ) + +        add_hyperparameter(cs, n_components, UniformIntegerHyperparameter) +        value_range = list(kernel.value_range) + +        allow_chi = True +        if dataset_properties is not None: -            n_features = len(dataset_properties['numerical_columns']) \ -                if isinstance(dataset_properties['numerical_columns'], List) else 0 -            # if numerical features are 1, set log to False -            if n_features == 1: -                log = False +            if ( +                dataset_properties.get("issigned") +                or dataset_properties.get("issparse") +            ): +                # the chi2 kernel supports neither negative values +                # nor a sparse matrix +                allow_chi = False             else: -                log = n_components.log -            n_components = HyperparameterSearchSpace(hyperparameter='n_components', -                                                     value_range=( -                                                         floor(float(n_components.value_range[0]) * n_features), -                                                         ceil(float(n_components.value_range[1]) * n_features)), -                                                     default_value=ceil(float(n_components.default_value) * n_features), -                                                     log=log) -        else: -            n_components = HyperparameterSearchSpace(hyperparameter='n_components', -                                                     value_range=(10, 2000), -                                                     default_value=100, -                                                     log=n_components.log) +                allow_chi = True +        if not allow_chi: +            value_range = [value for value in value_range if value != "chi2"] +            if len(value_range) == 0: +                value_range = ["poly"] + +        if value_range != list(kernel.value_range): +            warnings.warn(f"Given choices for `kernel` are not compatible with the dataset. 
" + f"Updating choices to {value_range}") + + kernel = HyperparameterSearchSpace(hyperparameter='kernel', + value_range=value_range, + default_value=value_range[-1], + ) - add_hyperparameter(cs, n_components, UniformIntegerHyperparameter) kernel_hp = get_hyperparameter(kernel, CategoricalHyperparameter) gamma = get_hyperparameter(gamma, UniformFloatHyperparameter) coef0 = get_hyperparameter(coef0, UniformFloatHyperparameter) @@ -119,5 +164,8 @@ def get_hyperparameter_search_space( def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Any]: return {'shortname': 'Nystroem', 'name': 'Nystroem kernel approximation', - 'handles_sparse': True + 'handles_sparse': True, + 'handles_classification': True, + 'handles_regression': True, + 'handles_signed': True } diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/PCA.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/PCA.py new file mode 100644 index 000000000..1a64e1ed5 --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/PCA.py @@ -0,0 +1,69 @@ +from typing import Any, Dict, Optional + +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformFloatHyperparameter, +) + +import numpy as np + +import sklearn.decomposition +from sklearn.base import BaseEstimator + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \ + base_feature_preprocessor import autoPyTorchFeaturePreprocessingComponent +from autoPyTorch.utils.common import FitRequirement, HyperparameterSearchSpace, add_hyperparameter + + +class PCA(autoPyTorchFeaturePreprocessingComponent): + def __init__(self, keep_variance: float = 0.9999, + whiten: bool = False, + random_state: Optional[np.random.RandomState] = None + ): + self.keep_variance = keep_variance + self.whiten = whiten + super().__init__(random_state=random_state) + + self.add_fit_requirements([ + FitRequirement('issparse', (bool,), user_defined=True, dataset_property=True)]) + + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: + self.check_requirements(X, y) + + n_components = float(self.keep_variance) + self.preprocessor['numerical'] = sklearn.decomposition.PCA( + n_components=n_components, whiten=self.whiten, + random_state=self.random_state) + + return self + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + keep_variance: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='keep_variance', + value_range=(0.5, 0.9999), + default_value=0.9999, + log=True), + whiten: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='whiten', + value_range=(True, False), + default_value=False, + ), + ) -> ConfigurationSpace: + + cs = ConfigurationSpace() + + add_hyperparameter(cs, keep_variance, UniformFloatHyperparameter) + add_hyperparameter(cs, whiten, CategoricalHyperparameter) + + return cs + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Any]: + return {'shortname': 'PCA', + 'name': 'Principal Component Analysis', + 'handles_sparse': False, + 'handles_classification': True, + 'handles_regression': True + } diff --git 
a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/PolynomialFeatures.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/PolynomialFeatures.py index 38ca15b1c..dfc085d24 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/PolynomialFeatures.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/PolynomialFeatures.py @@ -37,7 +37,10 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Any]: return {'shortname': 'PolynomialFeatures', 'name': 'PolynomialFeatures', - 'handles_sparse': True} + 'handles_sparse': True, + 'handles_classification': True, + 'handles_regression': True + } @staticmethod def get_hyperparameter_search_space( diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/PowerTransformer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/PowerTransformer.py deleted file mode 100644 index cb3eb2b54..000000000 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/PowerTransformer.py +++ /dev/null @@ -1,49 +0,0 @@ -from typing import Any, Dict, Optional - -from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import ( - CategoricalHyperparameter, -) - -import numpy as np - -import sklearn.preprocessing -from sklearn.base import BaseEstimator - -from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. 
\ - base_feature_preprocessor import autoPyTorchFeaturePreprocessingComponent -from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter - - -class PowerTransformer(autoPyTorchFeaturePreprocessingComponent): - def __init__(self, standardize: bool = True, - random_state: Optional[np.random.RandomState] = None): - self.standardize = standardize - - super().__init__(random_state=random_state) - - def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: - self.preprocessor['numerical'] = sklearn.preprocessing.PowerTransformer(method="yeo-johnson", - standardize=self.standardize, - copy=False) - return self - - @staticmethod - def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Any]: - return {'shortname': 'PowerTransformer', - 'name': 'Power Transformer', - 'handles_sparse': True} - - @staticmethod - def get_hyperparameter_search_space( - dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, - standardize: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='standardize', - value_range=(True, False), - default_value=True, - ), - ) -> ConfigurationSpace: - cs = ConfigurationSpace() - add_hyperparameter(cs, standardize, CategoricalHyperparameter) - - return cs diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/RandomKitchenSinks.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/RandomKitchenSinks.py index 44cc169f4..cc0b1d628 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/RandomKitchenSinks.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/RandomKitchenSinks.py @@ -1,5 +1,4 @@ -from math import ceil, floor -from typing import Any, Dict, List, Optional +from typing import Any, Dict, Optional from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import ( @@ -15,10 +14,30 @@ from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \ base_feature_preprocessor import autoPyTorchFeaturePreprocessingComponent +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \ + utils import percentage_value_range_to_integer_range from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter class RandomKitchenSinks(autoPyTorchFeaturePreprocessingComponent): + """ + Approximate a RBF kernel feature map using random Fourier features. + + Args: + n_components (int): + Number of Monte Carlo samples per original feature. + Equals the dimensionality of the computed feature space. + Note: + This number needs to be less than the total number of + features. To keep the hyperparameter search space general + to different datasets, autoPyTorch defines its value + range as the percentage of the number of features (in float). + This is then used to construct the range of n_components using + n_components = percentage of features * number of features. + Defaults to 100. + gamma (float): + Parameter of RBF kernel: exp(-gamma * x^2). Defaults to 1.0. 
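+    Note: +        A numeric illustration (values assumed, not taken from the code): with gamma = 1.0, the approximated RBF kernel weight at distance x = 2 is exp(-1.0 * 2 ** 2), roughly 0.018, so larger gamma values make the kernel more local.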
+ """ def __init__(self, n_components: int = 100, gamma: float = 1.0, random_state: Optional[np.random.RandomState] = None @@ -47,24 +66,12 @@ def get_hyperparameter_search_space( ) -> ConfigurationSpace: cs = ConfigurationSpace() - if dataset_properties is not None: - n_features = len(dataset_properties['numerical_columns']) \ - if isinstance(dataset_properties['numerical_columns'], List) else 0 - if n_features == 1: - log = False - else: - log = n_components.log - n_components = HyperparameterSearchSpace(hyperparameter='n_components', - value_range=( - floor(float(n_components.value_range[0]) * n_features), - ceil(float(n_components.value_range[1]) * n_features)), - default_value=ceil(float(n_components.default_value) * n_features), - log=log) - else: - n_components = HyperparameterSearchSpace(hyperparameter='n_components', - value_range=(10, 2000), - default_value=100, - log=n_components.log) + n_components = percentage_value_range_to_integer_range( + hyperparameter_search_space=n_components, + default_value_range=(10, 2000), + default_value=100, + dataset_properties=dataset_properties, + ) add_hyperparameter(cs, n_components, UniformIntegerHyperparameter) @@ -75,5 +82,7 @@ def get_hyperparameter_search_space( def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Any]: return {'shortname': 'KitchenSink', 'name': 'Random Kitchen Sinks', - 'handles_sparse': True + 'handles_sparse': True, + 'handles_classification': True, + 'handles_regression': True } diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/RandomTreesEmbedding.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/RandomTreesEmbedding.py new file mode 100644 index 000000000..10c92fdd1 --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/RandomTreesEmbedding.py @@ -0,0 +1,104 @@ +from typing import Any, Dict, Optional, Union + +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import ( + UniformIntegerHyperparameter +) + +import numpy as np + +from sklearn.base import BaseEstimator +from sklearn.ensemble import RandomTreesEmbedding as SklearnRandomTreesEmbedding + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \ + base_feature_preprocessor import autoPyTorchFeaturePreprocessingComponent +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. 
\ + utils import NoneType_ +from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, check_none + + +class RandomTreesEmbedding(autoPyTorchFeaturePreprocessingComponent): + def __init__(self, n_estimators: int = 10, + max_depth: Union[int, NoneType_] = 5, min_samples_split: int = 2, + min_samples_leaf: int = 1, + max_leaf_nodes: Union[int, NoneType_] = "none", + sparse_output: bool = False, + random_state: Optional[np.random.RandomState] = None): + self.n_estimators = n_estimators + self.max_depth = max_depth + self.min_samples_split = min_samples_split + self.min_samples_leaf = min_samples_leaf + self.max_leaf_nodes = max_leaf_nodes + self.sparse_output = sparse_output + + super().__init__(random_state=random_state) + + def get_components_kwargs(self) -> Dict[str, Any]: + """ + returns keyword arguments required by the feature preprocessor + + Returns: + Dict[str, Any]: kwargs + """ + return dict( + n_estimators=self.n_estimators, + max_depth=self.max_depth, + min_samples_split=self.min_samples_split, + min_samples_leaf=self.min_samples_leaf, + max_leaf_nodes=self.max_leaf_nodes, + sparse_output=self.sparse_output, + random_state=self.random_state + ) + + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: + if check_none(self.max_leaf_nodes): + self.max_leaf_nodes = None + if check_none(self.max_depth): + self.max_depth = None + + self.preprocessor['numerical'] = SklearnRandomTreesEmbedding(**self.get_components_kwargs()) + return self + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Any]: + return {'shortname': 'RandomTreesEmbedding', + 'name': 'Random Trees Embedding', + 'handles_sparse': True, + 'handles_classification': True, + 'handles_regression': True + } + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + n_estimators: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='n_estimators', + value_range=(10, 100), + default_value=10, + ), + max_depth: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='max_depth', + value_range=(2, 10), + default_value=5, + ), + min_samples_split: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='min_samples_split', + value_range=(2, 20), + default_value=2, + ), + min_samples_leaf: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='min_samples_leaf', + value_range=(1, 20), + default_value=1, + ), + max_leaf_nodes: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='max_leaf_nodes', + value_range=("none",), + default_value="none", + ), + ) -> ConfigurationSpace: + + cs = ConfigurationSpace() + add_hyperparameter(cs, n_estimators, UniformIntegerHyperparameter) + add_hyperparameter(cs, max_depth, UniformIntegerHyperparameter) + add_hyperparameter(cs, min_samples_split, UniformIntegerHyperparameter) + add_hyperparameter(cs, min_samples_leaf, UniformIntegerHyperparameter) + add_hyperparameter(cs, max_leaf_nodes, UniformIntegerHyperparameter) + + return cs diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/SelectPercentileClassification.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/SelectPercentileClassification.py new file mode 100644 index 000000000..1ba4d5307 --- /dev/null +++ 
b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/SelectPercentileClassification.py @@ -0,0 +1,88 @@ +from functools import partial +from typing import Any, Dict, Optional + +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import ( +    CategoricalHyperparameter, +    UniformIntegerHyperparameter, +) + +import numpy as np + +from sklearn.base import BaseEstimator +from sklearn.feature_selection import SelectPercentile, chi2, f_classif, mutual_info_classif + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \ +    base_feature_preprocessor import autoPyTorchFeaturePreprocessingComponent +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing.utils \ +    import filter_score_func_choices +from autoPyTorch.utils.common import FitRequirement, HyperparameterSearchSpace, add_hyperparameter + + +SCORE_FUNC_CHOICES = ("chi2", "mutual_info_classif", "f_classif") + + +class SelectPercentileClassification(autoPyTorchFeaturePreprocessingComponent): +    """ +    Select features according to a percentile of the highest scores. +    Scores are calculated using one of SCORE_FUNC_CHOICES +    """ +    def __init__(self, score_func: str = "chi2", +                 percentile: int = 50, +                 random_state: Optional[np.random.RandomState] = None +                 ): +        self.percentile = percentile +        if score_func == "chi2": +            self.score_func = chi2 +        elif score_func == "f_classif": +            self.score_func = f_classif +        elif score_func == "mutual_info_classif": +            self.score_func = partial(mutual_info_classif, random_state=random_state) +        else: +            raise ValueError(f"score_func of {self.__class__.__name__} must be in {SCORE_FUNC_CHOICES}, " +                             f"but is: {score_func}") + +        super().__init__(random_state=random_state) +        self.add_fit_requirements([ +            FitRequirement('issigned', (bool,), user_defined=True, dataset_property=True)]) + +    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: + +        self.check_requirements(X, y) + +        self.preprocessor['numerical'] = SelectPercentile( +            percentile=self.percentile, score_func=self.score_func) + +        return self + +    @staticmethod +    def get_hyperparameter_search_space( +        dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, +        percentile: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="percentile", +                                                                          value_range=(1, 99), +                                                                          default_value=50, +                                                                          ), +        score_func: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="score_func", +                                                                          value_range=SCORE_FUNC_CHOICES, +                                                                          default_value="chi2", +                                                                          ), +    ) -> ConfigurationSpace: +        score_func = filter_score_func_choices(class_name="SelectPercentileClassification", +                                               dataset_properties=dataset_properties, +                                               score_func=score_func) +        cs = ConfigurationSpace() + +        add_hyperparameter(cs, score_func, CategoricalHyperparameter) +        add_hyperparameter(cs, percentile, UniformIntegerHyperparameter) + +        return cs + +    @staticmethod +    def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Any]: +        return {'shortname': 'SPC', +                'name': 'Select Percentile Classification', +                'handles_sparse': True, +                'handles_regression': False, +                'handles_classification': True +                } diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/SelectPercentileRegression.py 
b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/SelectPercentileRegression.py new file mode 100644 index 000000000..7a51b9f86 --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/SelectPercentileRegression.py @@ -0,0 +1,81 @@ +from functools import partial +from typing import Any, Dict, Optional + +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import ( +    CategoricalHyperparameter, +    UniformIntegerHyperparameter, +) + +import numpy as np + +from sklearn.base import BaseEstimator +from sklearn.feature_selection import SelectPercentile, f_regression, mutual_info_regression + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \ +    base_feature_preprocessor import autoPyTorchFeaturePreprocessingComponent +from autoPyTorch.utils.common import FitRequirement, HyperparameterSearchSpace, add_hyperparameter + + +SCORE_FUNC_CHOICES = ('f_regression', 'mutual_info') + + +class SelectPercentileRegression(autoPyTorchFeaturePreprocessingComponent): +    """ +    Select features according to a percentile of the highest scores. +    Scores are calculated using one of SCORE_FUNC_CHOICES +    """ +    def __init__(self, score_func: str = "f_regression", +                 percentile: int = 50, +                 random_state: Optional[np.random.RandomState] = None +                 ): +        self.percentile = percentile +        if score_func == "f_regression": +            self.score_func = f_regression +        elif score_func == "mutual_info": +            self.score_func = partial(mutual_info_regression, random_state=random_state) +        else: +            raise ValueError(f"score_func of {self.__class__.__name__} must be in {SCORE_FUNC_CHOICES}, " +                             f"but is: {score_func}") + +        super().__init__(random_state=random_state) +        self.add_fit_requirements([ +            FitRequirement('issigned', (bool,), user_defined=True, dataset_property=True)]) + +    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: + +        self.check_requirements(X, y) + +        self.preprocessor['numerical'] = SelectPercentile( +            percentile=self.percentile, score_func=self.score_func) + +        return self + +    @staticmethod +    def get_hyperparameter_search_space( +        dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, +        percentile: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="percentile", +                                                                          value_range=(1, 99), +                                                                          default_value=50, +                                                                          ), +        score_func: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="score_func", +                                                                          value_range=SCORE_FUNC_CHOICES, +                                                                          default_value="f_regression", +                                                                          ), +    ) -> ConfigurationSpace: +        cs = ConfigurationSpace() + +        add_hyperparameter(cs, score_func, CategoricalHyperparameter) +        add_hyperparameter(cs, percentile, UniformIntegerHyperparameter) + +        return cs + +    @staticmethod +    def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Any]: +        return {'shortname': 'SPR', +                'name': 'Select Percentile Regression', +                'handles_sparse': True, +                'handles_regression': True, +                'handles_classification': False +                } diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/SelectRatesClassification.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/SelectRatesClassification.py new file mode 100644 index 000000000..d760e3f6b --- /dev/null +++ 
b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/SelectRatesClassification.py @@ -0,0 +1,109 @@ +from functools import partial +from typing import Any, Dict, Optional + +from ConfigSpace.conditions import NotEqualsCondition +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import ( +    CategoricalHyperparameter, +    UniformFloatHyperparameter, +) + +import numpy as np + +from sklearn.base import BaseEstimator +from sklearn.feature_selection import GenericUnivariateSelect, chi2, f_classif, mutual_info_classif + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \ +    base_feature_preprocessor import autoPyTorchFeaturePreprocessingComponent +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing.utils \ +    import filter_score_func_choices +from autoPyTorch.utils.common import FitRequirement, HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter + + +SCORE_FUNC_CHOICES = ("chi2", "mutual_info_classif", "f_classif") + + +class SelectRatesClassification(autoPyTorchFeaturePreprocessingComponent): +    """ +    Univariate feature selector by selecting the best features based on +    univariate statistical tests. Tests can be one of SCORE_FUNC_CHOICES +    """ +    def __init__(self, alpha: float = 0.1, +                 score_func: str = "chi2", +                 mode: str = "fpr", +                 random_state: Optional[np.random.RandomState] = None +                 ):
 +        self.mode = mode +        self.alpha = alpha +        if score_func == "chi2": +            self.score_func = chi2 +        elif score_func == "f_classif": +            self.score_func = f_classif +        elif score_func == "mutual_info_classif": +            self.score_func = partial(mutual_info_classif, +                                      random_state=random_state) +            # mutual info classif constantly crashes without mode percentile +            self.mode = "percentile" +        else: +            raise ValueError(f"score_func of {self.__class__.__name__} must be in {SCORE_FUNC_CHOICES}, " +                             f"but is: {score_func}") + +        super().__init__(random_state=random_state) +        self.add_fit_requirements([ +            FitRequirement('issparse', (bool,), user_defined=True, dataset_property=True), +            FitRequirement('issigned', (bool,), user_defined=True, dataset_property=True)]) + +    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: + +        self.check_requirements(X, y) + +        self.preprocessor['numerical'] = GenericUnivariateSelect( +            mode=self.mode, score_func=self.score_func, param=self.alpha) + +        return self + +    @staticmethod +    def get_hyperparameter_search_space( +        dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, +        alpha: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="alpha", +                                                                     value_range=(0.01, 0.5), +                                                                     default_value=0.1, +                                                                     ), +        mode: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="mode", +                                                                    value_range=('fpr', 'fdr', 'fwe', "percentile"), +                                                                    default_value='fpr', +                                                                    ), +        score_func: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="score_func", +                                                                          value_range=SCORE_FUNC_CHOICES, +                                                                          default_value="chi2", +                                                                          ), +    ) -> ConfigurationSpace: + +        score_func = filter_score_func_choices(class_name="SelectRatesClassification", +                                               dataset_properties=dataset_properties, +                                               score_func=score_func) + +        cs = ConfigurationSpace() + +        score_func_hp = get_hyperparameter(score_func, CategoricalHyperparameter) +        add_hyperparameter(cs, alpha, UniformFloatHyperparameter) +        mode_hp = get_hyperparameter(mode, 
CategoricalHyperparameter) + + cs.add_hyperparameters([mode_hp, score_func_hp]) + # mutual_info_classif constantly crashes if mode is not percentile + # as a WA, fix the mode for this score + if "mutual_info_classif" in score_func_hp.choices: + cond = NotEqualsCondition(mode_hp, score_func_hp, 'mutual_info_classif') + cs.add_condition(cond) + + return cs + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Any]: + return {'shortname': 'SRC', + 'name': 'Select Rates Classification', + 'handles_sparse': True, + 'handles_regression': False, + 'handles_classification': True + } diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/SelectRatesRegression.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/SelectRatesRegression.py new file mode 100644 index 000000000..f683e99c9 --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/SelectRatesRegression.py @@ -0,0 +1,83 @@ +from typing import Any, Dict, Optional + +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformFloatHyperparameter, +) + +import numpy as np + +from sklearn.base import BaseEstimator +from sklearn.feature_selection import GenericUnivariateSelect, f_regression + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \ + base_feature_preprocessor import autoPyTorchFeaturePreprocessingComponent +from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter + + +SCORE_FUNC_CHOICES = ('f_regression',) + + +class SelectRatesRegression(autoPyTorchFeaturePreprocessingComponent): + """ + Univariate feature selector by selecting the best features based on + univariate statistical tests. 
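Aside (not part of the diff): the conditional-activation pattern used above, reduced to a standalone ConfigSpace sketch with the same hyperparameter names, so the `NotEqualsCondition` is easier to read in isolation:

```python
from ConfigSpace.conditions import NotEqualsCondition
from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import CategoricalHyperparameter

cs = ConfigurationSpace()
score_func = CategoricalHyperparameter("score_func", ["chi2", "f_classif", "mutual_info_classif"])
mode = CategoricalHyperparameter("mode", ["fpr", "fdr", "fwe", "percentile"])
cs.add_hyperparameters([score_func, mode])

# 'mode' is only active while score_func != 'mutual_info_classif';
# otherwise the component falls back to its hard-coded 'percentile' mode.
cs.add_condition(NotEqualsCondition(mode, score_func, "mutual_info_classif"))
print(cs)
```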
diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/SelectRatesRegression.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/SelectRatesRegression.py
new file mode 100644
index 000000000..f683e99c9
--- /dev/null
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/SelectRatesRegression.py
@@ -0,0 +1,83 @@
+from typing import Any, Dict, Optional
+
+from ConfigSpace.configuration_space import ConfigurationSpace
+from ConfigSpace.hyperparameters import (
+    CategoricalHyperparameter,
+    UniformFloatHyperparameter,
+)
+
+import numpy as np
+
+from sklearn.base import BaseEstimator
+from sklearn.feature_selection import GenericUnivariateSelect, f_regression
+
+from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \
+    base_feature_preprocessor import autoPyTorchFeaturePreprocessingComponent
+from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter
+
+
+SCORE_FUNC_CHOICES = ('f_regression',)
+
+
+class SelectRatesRegression(autoPyTorchFeaturePreprocessingComponent):
+    """
+    Univariate feature selector that selects the best features based on
+    univariate statistical tests. Tests can be one of SCORE_FUNC_CHOICES
+    """
+    def __init__(self, score_func: str = "f_regression",
+                 alpha: float = 0.1, mode: str = "fpr",
+                 random_state: Optional[np.random.RandomState] = None
+                 ):
+        self.mode = mode
+        self.alpha = alpha
+        if score_func == "f_regression":
+            self.score_func = f_regression
+        else:
+            raise ValueError(f"score_func of {self.__class__.__name__} must be in {SCORE_FUNC_CHOICES}, "
+                             f"but is: {score_func}")
+
+        super().__init__(random_state=random_state)
+
+    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:
+
+        self.check_requirements(X, y)
+
+        self.preprocessor['numerical'] = GenericUnivariateSelect(
+            mode=self.mode, score_func=self.score_func, param=self.alpha)
+
+        return self
+
+    @staticmethod
+    def get_hyperparameter_search_space(
+        dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None,
+        alpha: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="alpha",
+                                                                     value_range=(0.01, 0.5),
+                                                                     default_value=0.1,
+                                                                     ),
+        mode: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="mode",
+                                                                    value_range=('fpr', 'fdr', 'fwe'),
+                                                                    default_value='fpr',
+                                                                    ),
+        score_func: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="score_func",
+                                                                          value_range=SCORE_FUNC_CHOICES,
+                                                                          default_value="f_regression",
+                                                                          ),
+    ) -> ConfigurationSpace:
+
+        cs = ConfigurationSpace()
+
+        add_hyperparameter(cs, score_func, CategoricalHyperparameter)
+        add_hyperparameter(cs, alpha, UniformFloatHyperparameter)
+        add_hyperparameter(cs, mode, CategoricalHyperparameter)
+
+        return cs
+
+    @staticmethod
+    def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Any]:
+        return {'shortname': 'SRR',
+                'name': 'Select Rates Regression',
+                'handles_sparse': True,
+                'handles_regression': True,
+                'handles_classification': False
+                }
diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/TruncatedSVD.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/TruncatedSVD.py
index 55576a58f..2b830c8ae 100644
--- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/TruncatedSVD.py
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/TruncatedSVD.py
@@ -1,5 +1,4 @@
-from math import floor
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, Optional
 
 from ConfigSpace.configuration_space import ConfigurationSpace
 from ConfigSpace.hyperparameters import (
@@ -14,10 +13,26 @@
 from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing \
     .base_feature_preprocessor import autoPyTorchFeaturePreprocessingComponent
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \
+    utils import percentage_value_range_to_integer_range
 from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter
 
 
 class TruncatedSVD(autoPyTorchFeaturePreprocessingComponent):
+    """
+    Linear dimensionality reduction by means of truncated singular value decomposition (SVD).
+
+    Args:
+        target_dim (int):
+            Desired dimensionality of the output data.
+            Note:
+                This number needs to be less than the total number of
+                features. To keep the hyperparameter search space general
+                across different datasets, autoPyTorch defines its value
+                range as a percentage of the number of features (a float).
+                This is then used to construct the range of target_dim via
+                target_dim = percentage of features * number of features. Defaults to 128.
+    """
     def __init__(self, target_dim: int = 128,
                  random_state: Optional[np.random.RandomState] = None):
         self.target_dim = target_dim
@@ -35,7 +50,9 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:
     def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Any]:
         return {'shortname': 'TruncSVD',
                 'name': 'Truncated Singular Value Decomposition',
-                'handles_sparse': True}
+                'handles_sparse': True,
+                'handles_classification': True,
+                'handles_regression': True}
 
     @staticmethod
     def get_hyperparameter_search_space(
@@ -47,19 +64,12 @@ def get_hyperparameter_search_space(
     ) -> ConfigurationSpace:
         cs = ConfigurationSpace()
 
-        if dataset_properties is not None:
-            n_features = len(dataset_properties['numerical_columns']) if isinstance(
-                dataset_properties['numerical_columns'], List) else 0
-            target_dim = HyperparameterSearchSpace(hyperparameter=target_dim.hyperparameter,
-                                                   value_range=(floor(float(target_dim.value_range[0]) * n_features),
-                                                                floor(float(target_dim.value_range[1]) * n_features)),
-                                                   default_value=floor(float(target_dim.default_value) * n_features),
-                                                   log=target_dim.log)
-        else:
-            target_dim = HyperparameterSearchSpace(hyperparameter=target_dim.hyperparameter,
-                                                   value_range=(10, 256),
-                                                   default_value=128,
-                                                   log=target_dim.log)
+        target_dim = percentage_value_range_to_integer_range(
+            hyperparameter_search_space=target_dim,
+            default_value_range=(10, 256),
+            default_value=128,
+            dataset_properties=dataset_properties,
+        )
 
         add_hyperparameter(cs, target_dim, UniformIntegerHyperparameter)
         return cs
diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/__init__.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/__init__.py
index a3937a626..0e964ab56 100644
--- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/__init__.py
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/__init__.py
@@ -5,6 +5,7 @@
 import ConfigSpace.hyperparameters as CSH
 from ConfigSpace.configuration_space import ConfigurationSpace
 
+from autoPyTorch.constants import CLASSIFICATION_TASKS, REGRESSION_TASKS, STRING_TO_TASK_TYPES
 from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
 from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice
 from autoPyTorch.pipeline.components.base_component import (
@@ -46,6 +47,79 @@ def get_components(self) -> Dict[str, autoPyTorchComponent]:
         components.update(_addons.components)
         return components
 
+    def get_available_components(
+        self,
+        dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None,
+        include: Optional[List[str]] = None,
+        exclude: Optional[List[str]] = None,
+    ) -> Dict[str, autoPyTorchComponent]:
+        """Filters out components based on user-provided
+        include/exclude directives, as well as the dataset properties
+
+        Args:
+            dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]):
+                Characteristics of the dataset to guide the pipeline choices of components
+            include (Optional[List[str]]):
+                which components to honor when creating the configuration space
+            exclude (Optional[List[str]]):
+                which components to remove from the configuration space
+
+        Returns:
+            Dict[str, autoPyTorchComponent]: A filtered dict of available
+                feature preprocessor components
+
+        """
+        if dataset_properties is None:
+            dataset_properties = {}
+
+        if include is not None and exclude is not None:
+            raise ValueError(
+                "The arguments include and exclude cannot be used together.")
+
+        available_comp = self.get_components()
+
+        if include is not None:
+            for incl in include:
+                if incl not in available_comp:
+                    raise ValueError("Trying to include unknown component: "
+                                     "%s" % incl)
+
+        components_dict = OrderedDict()
+        for name in available_comp:
+            if include is not None and name not in include:
+                continue
+            elif exclude is not None and name in exclude:
+                continue
+
+            entry = available_comp[name]
+
+            # Exclude itself to avoid infinite loop
+            if entry is FeatureProprocessorChoice or hasattr(entry, 'get_components'):
+                continue
+
+            task_type = str(dataset_properties['task_type'])
+            properties = entry.get_properties()
+            if (
+                STRING_TO_TASK_TYPES[task_type] in CLASSIFICATION_TASKS
+                and not bool(properties['handles_classification'])
+            ):
+                continue
+            elif (
+                STRING_TO_TASK_TYPES[task_type] in REGRESSION_TASKS
+                and not bool(properties['handles_regression'])
+            ):
+                continue
+
+            # target_type = dataset_properties['target_type']
+            # Apply some automatic filtering here for
+            # backbones based on the dataset!
+            # TODO: Think if there is any case where a preprocessor
+            # is not compatible for a certain dataset
+
+            components_dict[name] = entry
+
+        return components_dict
+
     def get_hyperparameter_search_space(self,
                                         dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None,
                                         default: Optional[str] = None,
@@ -72,8 +146,16 @@ def get_hyperparameter_search_space(self,
             'RandomKitchenSinks',
             'Nystroem',
             'PolynomialFeatures',
-            'PowerTransformer',
             'TruncatedSVD',
+            'ExtraTreesPreprocessorClassification',
+            'ExtraTreesPreprocessorRegression',
+            'FeatureAgglomeration',
+            'RandomTreesEmbedding',
+            'SelectPercentileClassification',
+            'SelectPercentileRegression',
+            'SelectRatesClassification',
+            'SelectRatesRegression',
+            'LibLinearSVCPreprocessor'
         ]
         for default_ in defaults:
             if default_ in available_:
diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/base_feature_preprocessor.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/base_feature_preprocessor.py
index d11f69b90..eb576d472 100644
--- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/base_feature_preprocessor.py
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/base_feature_preprocessor.py
@@ -10,7 +10,8 @@
 
 class autoPyTorchFeaturePreprocessingComponent(autoPyTorchTabularPreprocessingComponent):
-    _required_properties: List[str] = ['handles_sparse']
+    _required_properties: List[str] = [
+        'handles_sparse', 'handles_classification', 'handles_regression']
 
     def __init__(self, random_state: Optional[np.random.RandomState] = None):
         if random_state is None:
@@ -30,7 +31,7 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
         Returns:
             (Dict[str, Any]): the updated 'X' dictionary
         """
-        if self.preprocessor['numerical'] is None and self.preprocessor['categorical'] is None:
+        if self.preprocessor['numerical'] is None:
             raise AttributeError("{} can't transform without fitting first"
                                  .format(self.__class__.__name__))
         X.update({'feature_preprocessor': self.preprocessor})
diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/utils.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/utils.py
new file mode 100644
index 000000000..5d91ac2b6
--- /dev/null
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/utils.py
@@ -0,0 +1,100 @@
+import warnings
+from math import ceil, floor
+from typing import Dict, List, Optional, Sequence
+
+from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
+from autoPyTorch.utils.common import HyperparameterSearchSpace, HyperparameterValueType
+
+
+NoneType_ = Optional[str]  # This typing is exclusively for Literal["none", "None", None]
+# TODO: when we drop support for 3.7 use the following line
+# NoneType_ = Optional[Literal["none", "None"]]
+
+
+def filter_score_func_choices(
+    class_name: str,
+    score_func: HyperparameterSearchSpace,
+    dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
+) -> HyperparameterSearchSpace:
+    """
+    In the context of select rates classification or select percentile classification,
+    some score functions are not compatible with sparse or signed data.
+    This function filters those score functions out of the search space of the
+    component, depending on the dataset.
+
+    Args:
+        class_name (str):
+            name of the calling component, used only in warning and error messages
+        score_func (HyperparameterSearchSpace):
+            search space of the score function choices
+        dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]):
+            Information about the dataset. Defaults to None.
+
+    Raises:
+        ValueError:
+            if all of the score function choices are incompatible with the dataset
+
+    Returns:
+        HyperparameterSearchSpace:
+            updated score function search space
+    """
+    value_range = list(score_func.value_range)
+    if dataset_properties is not None:
+        if dataset_properties.get("issigned", False):
+            value_range = [value for value in value_range if value not in ("chi2", "mutual_info_classif")]
+        if dataset_properties.get("issparse", False):
+            value_range = [value for value in value_range if value != "f_classif"]
+
+    if sorted(value_range) != sorted(list(score_func.value_range)):
+        warnings.warn(f"Given choices for `score_func` are not compatible with the dataset. "
+                      f"Updating choices to {value_range}")
+
+    if len(value_range) == 0:
+        raise ValueError(f"`{class_name}` is not compatible with the"
+                         f" current dataset as it is both `signed` and `sparse`")
+    default_value = score_func.default_value if score_func.default_value in value_range else value_range[-1]
+    score_func = HyperparameterSearchSpace(hyperparameter="score_func",
+                                           value_range=value_range,
+                                           default_value=default_value,
+                                           )
+    return score_func
+
+
+def percentage_value_range_to_integer_range(
+    hyperparameter_search_space: HyperparameterSearchSpace,
+    default_value_range: Sequence[HyperparameterValueType],
+    default_value: int,
+    dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None,
+) -> HyperparameterSearchSpace:
+    """
+    For some feature preprocessors, the value of an integer hyperparameter
+    needs to be lower than the number of features. To facilitate this,
+    autoPyTorch uses a value range based on the percentage of the number
+    of features. This function converts that hyperparameter search space
+    to an integer value range as is required by the underlying sklearn
+    preprocessors.
+    """
+    hyperparameter_name = hyperparameter_search_space.hyperparameter
+    if dataset_properties is not None:
+        n_features = len(dataset_properties['numerical_columns']) if isinstance(
+            dataset_properties['numerical_columns'], List) else 0
+        if n_features == 1:
+            # log=True is not supported in ConfigSpace when the value range contains 0,
+            # raising ValueError: Negative lower bound (0) for log-scale hyperparameter is forbidden.
+            log = False
+        else:
+            log = hyperparameter_search_space.log
+        hyperparameter_search_space = HyperparameterSearchSpace(
+            hyperparameter=hyperparameter_name,
+            value_range=(
+                floor(float(hyperparameter_search_space.value_range[0]) * n_features),
+                floor(float(hyperparameter_search_space.value_range[1]) * n_features)),
+            default_value=ceil(float(hyperparameter_search_space.default_value) * n_features),
+            log=log)
+    else:
+        hyperparameter_search_space = HyperparameterSearchSpace(hyperparameter=hyperparameter_name,
+                                                                value_range=default_value_range,
+                                                                default_value=default_value,
+                                                                log=hyperparameter_search_space.log)
+
+    return hyperparameter_search_space
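Aside (not part of the diff): a worked example of the conversion with illustrative numbers, mirroring the floor/ceil arithmetic in the function above:

```python
from math import ceil, floor

# Hypothetical dataset with 20 numerical features and a percentage
# search space of (0.1, 0.9) with a default of 0.5 for target_dim:
n_features = 20
value_range = (floor(0.1 * n_features), floor(0.9 * n_features))  # -> (2, 18)
default_value = ceil(0.5 * n_features)                            # -> 10
print(value_range, default_value)
```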
+ """ + hyperparameter_name = hyperparameter_search_space.hyperparameter + if dataset_properties is not None: + n_features = len(dataset_properties['numerical_columns']) if isinstance( + dataset_properties['numerical_columns'], List) else 0 + if n_features == 1: + # log=True is not supported in ConfigSpace when the value range consists of 0 + # raising ValueError: Negative lower bound (0) for log-scale hyperparameter is forbidden. + log = False + else: + log = hyperparameter_search_space.log + hyperparameter_search_space = HyperparameterSearchSpace( + hyperparameter=hyperparameter_name, + value_range=( + floor(float(hyperparameter_search_space.value_range[0]) * n_features), + floor(float(hyperparameter_search_space.value_range[1]) * n_features)), + default_value=ceil(float(hyperparameter_search_space.default_value) * n_features), + log=log) + else: + hyperparameter_search_space = HyperparameterSearchSpace(hyperparameter=hyperparameter_name, + value_range=default_value_range, + default_value=default_value, + log=hyperparameter_search_space.log) + + return hyperparameter_search_space diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py index ea09798ce..608ee8ec5 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py @@ -1,9 +1,7 @@ from typing import Any, Dict, List, Optional, Union from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import ( - CategoricalHyperparameter -) +from ConfigSpace.hyperparameters import CategoricalHyperparameter import numpy as np @@ -16,91 +14,101 @@ class SimpleImputer(BaseImputer): """ - Impute missing values for categorical columns with '!missing!' - (In case of numpy data, the constant value is set to -1, under - the assumption that categorical data is fit with an Ordinal Scaler) + An imputer for numerical columns + + Attributes: + random_state (Optional[np.random.RandomState]): + The random state to use for the imputer. + numerical_strategy (str: default='mean'): + The strategy to use for imputing numerical columns. + Can be one of ['most_frequent', 'constant_!missing!'] """ - def __init__(self, - random_state: Optional[Union[np.random.RandomState, int]] = None, - numerical_strategy: str = 'mean', - categorical_strategy: str = 'most_frequent'): + def __init__( + self, + random_state: Optional[np.random.RandomState] = None, + numerical_strategy: str = 'mean', + ): super().__init__() self.random_state = random_state self.numerical_strategy = numerical_strategy - self.categorical_strategy = categorical_strategy - def fit(self, X: Dict[str, Any], y: Any = None) -> BaseImputer: + def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> BaseImputer: """ - The fit function calls the fit function of the underlying model - and returns the transformed array. + Builds the preprocessor based on the given fit dictionary 'X'. + Args: - X (np.ndarray): input features - y (Optional[np.ndarray]): input labels + X (Dict[str, Any]): + The fit dictionary + y (Optional[Any]): + Not Used -- to comply with API Returns: - instance of self + self: + returns an instance of self. 
""" self.check_requirements(X, y) - categorical_columns = X['dataset_properties']['categorical_columns'] \ - if isinstance(X['dataset_properties']['categorical_columns'], List) else [] - if len(categorical_columns) != 0: - if self.categorical_strategy == 'constant_!missing!': - self.preprocessor['categorical'] = SklearnSimpleImputer(strategy='constant', - # Train data is numpy - # as of this point, where - # Ordinal Encoding is using - # for categorical. Only - # Numbers are allowed - # fill_value='!missing!', - fill_value=-1, - copy=False) - else: - self.preprocessor['categorical'] = SklearnSimpleImputer(strategy=self.categorical_strategy, - copy=False) - numerical_columns = X['dataset_properties']['numerical_columns'] \ - if isinstance(X['dataset_properties']['numerical_columns'], List) else [] - if len(numerical_columns) != 0: + + # Choose an imputer for any numerical columns + numerical_columns = X['dataset_properties']['numerical_columns'] + + if isinstance(numerical_columns, List) and len(numerical_columns) > 0: if self.numerical_strategy == 'constant_zero': - self.preprocessor['numerical'] = SklearnSimpleImputer(strategy='constant', - fill_value=0, - copy=False) + imputer = SklearnSimpleImputer(strategy='constant', fill_value=0, copy=False) + self.preprocessor['numerical'] = imputer else: - self.preprocessor['numerical'] = SklearnSimpleImputer(strategy=self.numerical_strategy, copy=False) + imputer = SklearnSimpleImputer(strategy=self.numerical_strategy, copy=False) + self.preprocessor['numerical'] = imputer return self @staticmethod def get_hyperparameter_search_space( dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, - numerical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='numerical_strategy', - value_range=("mean", "median", - "most_frequent", - "constant_zero"), - default_value="mean", - ), - categorical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace( - hyperparameter='categorical_strategy', - value_range=("most_frequent", - "constant_!missing!"), - default_value="most_frequent") + numerical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='numerical_strategy', + value_range=("mean", "median", "most_frequent", "constant_zero"), + default_value="mean", + ), ) -> ConfigurationSpace: + """Get the hyperparameter search space for the SimpleImputer + + Args: + dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]) + Properties that describe the dataset + Note: Not actually Optional, just adhering to its supertype + numerical_strategy (HyperparameterSearchSpace: default = ...) 
+ The strategy to use for numerical imputation + + Returns: + ConfigurationSpace + The space of possible configurations for a SimpleImputer with the given + `dataset_properties` + """ cs = ConfigurationSpace() - assert dataset_properties is not None, "To create hyperparameter search space" \ - ", dataset_properties should not be None" - if len(dataset_properties['numerical_columns']) \ - if isinstance(dataset_properties['numerical_columns'], List) else 0 != 0: - add_hyperparameter(cs, numerical_strategy, CategoricalHyperparameter) - if len(dataset_properties['categorical_columns']) \ - if isinstance(dataset_properties['categorical_columns'], List) else 0 != 0: - add_hyperparameter(cs, categorical_strategy, CategoricalHyperparameter) + if dataset_properties is None: + raise ValueError("SimpleImputer requires `dataset_properties` for generating" + " a search space.") + + if ( + isinstance(dataset_properties['numerical_columns'], List) + and len(dataset_properties['numerical_columns']) != 0 + ): + add_hyperparameter(cs, numerical_strategy, CategoricalHyperparameter) return cs @staticmethod - def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None - ) -> Dict[str, Union[str, bool]]: + def get_properties( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + """Get the properties of the SimpleImputer class and what it can handle + + Returns: + Dict[str, Union[str, bool]]: + A dict from property names to values + """ return { 'shortname': 'SimpleImputer', 'name': 'Simple Imputer', diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/base_imputer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/base_imputer.py index b65f3c229..1f33a765a 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/base_imputer.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/base_imputer.py @@ -14,8 +14,7 @@ class BaseImputer(autoPyTorchTabularPreprocessingComponent): def __init__(self) -> None: super().__init__() self.add_fit_requirements([ - FitRequirement('numerical_columns', (List,), user_defined=True, dataset_property=True), - FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True)]) + FitRequirement('numerical_columns', (List,), user_defined=True, dataset_property=True)]) def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: """ @@ -26,7 +25,7 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: Returns: (Dict[str, Any]): the updated 'X' dictionary """ - if self.preprocessor['numerical'] is None and self.preprocessor['categorical'] is None: + if self.preprocessor['numerical'] is None and len(X["dataset_properties"]["numerical_columns"]) != 0: raise ValueError("cant call transform on {} without fitting first." 
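Aside (not part of the diff): what the two most distinctive numerical strategies map to in plain scikit-learn, on illustrative data:

```python
import numpy as np
from sklearn.impute import SimpleImputer

X = np.array([[1.0, np.nan], [2.0, 4.0], [np.nan, 6.0]])

# numerical_strategy='mean' -> column means replace the NaNs
print(SimpleImputer(strategy='mean').fit_transform(X))

# numerical_strategy='constant_zero' -> strategy='constant' with fill_value=0
print(SimpleImputer(strategy='constant', fill_value=0).fit_transform(X))
```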
diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/base_imputer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/base_imputer.py
index b65f3c229..1f33a765a 100644
--- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/base_imputer.py
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/base_imputer.py
@@ -14,8 +14,7 @@ class BaseImputer(autoPyTorchTabularPreprocessingComponent):
     def __init__(self) -> None:
         super().__init__()
         self.add_fit_requirements([
-            FitRequirement('numerical_columns', (List,), user_defined=True, dataset_property=True),
-            FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True)])
+            FitRequirement('numerical_columns', (List,), user_defined=True, dataset_property=True)])
 
     def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
         """
@@ -26,7 +25,7 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
         Returns:
             (Dict[str, Any]): the updated 'X' dictionary
         """
-        if self.preprocessor['numerical'] is None and self.preprocessor['categorical'] is None:
+        if self.preprocessor['numerical'] is None and len(X["dataset_properties"]["numerical_columns"]) != 0:
             raise ValueError("can't call transform on {} without fitting first."
                              .format(self.__class__.__name__))
         X.update({'imputer': self.preprocessor})
diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/PowerTransformer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/PowerTransformer.py
new file mode 100644
index 000000000..7dd2502f9
--- /dev/null
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/PowerTransformer.py
@@ -0,0 +1,38 @@
+from typing import Any, Dict, Optional, Union
+
+import numpy as np
+
+from sklearn.preprocessing import PowerTransformer as SklearnPowerTransformer
+
+from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler import BaseScaler
+
+
+class PowerTransformer(BaseScaler):
+    """
+    Map data to as close to a Gaussian distribution as possible
+    in order to reduce variance and minimize skewness.
+
+    Uses the `yeo-johnson` power transform method. Also, the data is
+    normalised to zero mean and unit variance.
+    """
+    def __init__(self,
+                 random_state: Optional[np.random.RandomState] = None):
+        super().__init__()
+        self.random_state = random_state
+
+    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler:
+
+        self.check_requirements(X, y)
+
+        self.preprocessor['numerical'] = SklearnPowerTransformer(method='yeo-johnson', copy=False)
+        return self
+
+    @staticmethod
+    def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
+                       ) -> Dict[str, Union[str, bool]]:
+        return {
+            'shortname': 'PowerTransformer',
+            'name': 'PowerTransformer',
+            'handles_sparse': False
+        }
diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/QuantileTransformer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/QuantileTransformer.py
new file mode 100644
index 000000000..cc0b4fa7a
--- /dev/null
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/QuantileTransformer.py
@@ -0,0 +1,73 @@
+from typing import Any, Dict, Optional, Union
+
+from ConfigSpace.configuration_space import ConfigurationSpace
+from ConfigSpace.hyperparameters import (
+    CategoricalHyperparameter,
+    UniformIntegerHyperparameter
+)
+
+import numpy as np
+
+from sklearn.preprocessing import QuantileTransformer as SklearnQuantileTransformer
+
+from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler import BaseScaler
+from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter
+
+
+class QuantileTransformer(BaseScaler):
+    """
+    Transform the features to follow a uniform or a normal distribution
+    using quantiles information.
+
+    For more details of each attribute, see:
+    https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.QuantileTransformer.html
+    """
+    def __init__(
+        self,
+        n_quantiles: int = 1000,
+        output_distribution: str = "normal",  # Literal["normal", "uniform"]
+        random_state: Optional[np.random.RandomState] = None
+    ):
+        super().__init__()
+        self.random_state = random_state
+        self.n_quantiles = n_quantiles
+        self.output_distribution = output_distribution
+
+    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler:
+
+        self.check_requirements(X, y)
+
+        self.preprocessor['numerical'] = SklearnQuantileTransformer(n_quantiles=self.n_quantiles,
+                                                                    output_distribution=self.output_distribution,
+                                                                    copy=False)
+        return self
+
+    @staticmethod
+    def get_hyperparameter_search_space(
+        dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None,
+        n_quantiles: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="n_quantiles",
+                                                                           value_range=(10, 2000),
+                                                                           default_value=1000,
+                                                                           ),
+        output_distribution: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="output_distribution",
+                                                                                   value_range=("uniform", "normal"),
+                                                                                   default_value="normal",
+                                                                                   )
+    ) -> ConfigurationSpace:
+        cs = ConfigurationSpace()
+
+        # TODO parametrize like the Random Forest as n_quantiles = n_features^param
+        add_hyperparameter(cs, n_quantiles, UniformIntegerHyperparameter)
+        add_hyperparameter(cs, output_distribution, CategoricalHyperparameter)
+
+        return cs
+
+    @staticmethod
+    def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
+                       ) -> Dict[str, Union[str, bool]]:
+        return {
+            'shortname': 'QuantileTransformer',
+            'name': 'QuantileTransformer',
+            'handles_sparse': False
+        }
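Aside (not part of the diff): a minimal sketch of the wrapped scikit-learn transformer on illustrative skewed data. Each feature is mapped through its empirical CDF and then (for `output_distribution="normal"`) through the inverse Gaussian CDF:

```python
import numpy as np
from sklearn.preprocessing import QuantileTransformer

rng = np.random.RandomState(0)
X = rng.exponential(size=(1000, 1))  # heavily skewed input

qt = QuantileTransformer(n_quantiles=100, output_distribution="normal")
X_t = qt.fit_transform(X)
print(X_t.mean(), X_t.std())  # roughly 0 and 1 after the transform
```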
diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/RobustScaler.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/RobustScaler.py
new file mode 100644
index 000000000..2c59d77c2
--- /dev/null
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/RobustScaler.py
@@ -0,0 +1,73 @@
+from typing import Any, Dict, Optional, Union
+
+from ConfigSpace.configuration_space import ConfigurationSpace
+from ConfigSpace.hyperparameters import (
+    UniformFloatHyperparameter,
+)
+
+import numpy as np
+
+from sklearn.preprocessing import RobustScaler as SklearnRobustScaler
+
+from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler import BaseScaler
+from autoPyTorch.utils.common import FitRequirement, HyperparameterSearchSpace, add_hyperparameter
+
+
+class RobustScaler(BaseScaler):
+    """
+    Remove the median and scale features according to the quantile_range to make
+    the features robust to outliers.
+
+    For more details of the preprocessor, see:
+    https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html
+    """
+    def __init__(
+        self,
+        q_min: float = 0.25,
+        q_max: float = 0.75,
+        random_state: Optional[np.random.RandomState] = None
+    ):
+        super().__init__()
+        self.add_fit_requirements([
+            FitRequirement('issparse', (bool,), user_defined=True, dataset_property=True)])
+        self.random_state = random_state
+        self.q_min = q_min
+        self.q_max = q_max
+
+    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler:
+
+        self.check_requirements(X, y)
+        with_centering = bool(not X['dataset_properties']['issparse'])
+
+        self.preprocessor['numerical'] = SklearnRobustScaler(quantile_range=(self.q_min, self.q_max),
+                                                             with_centering=with_centering,
+                                                             copy=False)
+
+        return self
+
+    @staticmethod
+    def get_hyperparameter_search_space(
+        dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None,
+        q_min: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="q_min",
+                                                                     value_range=(0.001, 0.3),
+                                                                     default_value=0.25),
+        q_max: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="q_max",
+                                                                     value_range=(0.7, 0.999),
+                                                                     default_value=0.75)
+    ) -> ConfigurationSpace:
+        cs = ConfigurationSpace()
+
+        add_hyperparameter(cs, q_min, UniformFloatHyperparameter)
+        add_hyperparameter(cs, q_max, UniformFloatHyperparameter)
+
+        return cs
+
+    @staticmethod
+    def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
+                       ) -> Dict[str, Union[str, bool]]:
+        return {
+            'shortname': 'RobustScaler',
+            'name': 'RobustScaler',
+            'handles_sparse': True
+        }
diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/__init__.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/__init__.py
index 082b17cb9..d4d3ffeb5 100644
--- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/__init__.py
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/__init__.py
@@ -66,9 +66,21 @@ def get_hyperparameter_search_space(self,
             raise ValueError("no scalers found, please add a scaler")
 
         if default is None:
-            defaults = ['StandardScaler', 'Normalizer', 'MinMaxScaler', 'NoScaler']
+            defaults = [
+                'StandardScaler',
+                'Normalizer',
+                'MinMaxScaler',
+                'PowerTransformer',
+                'QuantileTransformer',
+                'RobustScaler',
+                'NoScaler'
+            ]
             for default_ in defaults:
                 if default_ in available_scalers:
+                    if include is not None and default_ not in include:
+                        continue
+                    if exclude is not None and default_ in exclude:
+                        continue
                     default = default_
                     break
diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/variance_thresholding/VarianceThreshold.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/variance_thresholding/VarianceThreshold.py
new file mode 100644
index 000000000..e5e71ea1e
--- /dev/null
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/variance_thresholding/VarianceThreshold.py
@@ -0,0 +1,44 @@
+from typing import Any, Dict, Optional, Union
+
+import numpy as np
+
+from sklearn.feature_selection import VarianceThreshold as SklearnVarianceThreshold
+
+from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.base_tabular_preprocessing import \
+    autoPyTorchTabularPreprocessingComponent
+
+
+class VarianceThreshold(autoPyTorchTabularPreprocessingComponent):
+    """
+    Removes features that have the same value in the training data.
+    """
+    def __init__(self, random_state: Optional[np.random.RandomState] = None):
+        super().__init__()
+
+    def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> 'VarianceThreshold':
+
+        self.check_requirements(X, y)
+
+        self.preprocessor['numerical'] = SklearnVarianceThreshold(
+            threshold=0.0
+        )
+        return self
+
+    def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
+        if self.preprocessor['numerical'] is None:
+            raise ValueError("cannot call transform on {} without fitting first."
+                             .format(self.__class__.__name__))
+        X.update({'variance_threshold': self.preprocessor})
+        return X
+
+    @staticmethod
+    def get_properties(
+        dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
+    ) -> Dict[str, Union[str, bool]]:
+
+        return {
+            'shortname': 'Variance Threshold',
+            'name': 'Variance Threshold (constant feature removal)',
+            'handles_sparse': True,
+        }
diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/variance_thresholding/__init__.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/variance_thresholding/__init__.py
new file mode 100644
index 000000000..e69de29bb
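Aside (not part of the diff): the underlying scikit-learn behaviour at `threshold=0.0`, on illustrative data:

```python
import numpy as np
from sklearn.feature_selection import VarianceThreshold

X = np.array([[1.0, 0.0, 3.0],
              [2.0, 0.0, 3.5],
              [3.0, 0.0, 2.9]])  # column 1 is constant

# threshold=0.0 keeps every feature with non-zero variance,
# i.e. it only drops features that are constant on the training data
X_t = VarianceThreshold(threshold=0.0).fit_transform(X)
print(X_t.shape)  # (3, 2)
```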
diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py
new file mode 100644
index 000000000..ecca60570
--- /dev/null
+++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py
@@ -0,0 +1,182 @@
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+
+import pandas as pd
+
+from sklearn.base import BaseEstimator
+from sklearn.compose import ColumnTransformer
+from sklearn.pipeline import make_pipeline
+
+from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.base_time_series_preprocessing import (
+    autoPyTorchTimeSeriesPreprocessingComponent,
+    autoPyTorchTimeSeriesTargetPreprocessingComponent)
+from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.utils import (
+    get_time_series_preprocessers, get_time_series_target_preprocessers)
+from autoPyTorch.utils.common import FitRequirement
+
+
+class TimeSeriesFeatureTransformer(autoPyTorchTimeSeriesPreprocessingComponent):
+    def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = None):
+        super().__init__()
+        self.random_state = random_state
+        self.preprocessor: Optional[ColumnTransformer] = None
+        self.add_fit_requirements([
+            FitRequirement('numerical_features', (List,), user_defined=True, dataset_property=True),
+            FitRequirement('categorical_features', (List,), user_defined=True, dataset_property=True)])
+
+    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:
+        """
+        Creates a column transformer for the chosen time series
+        preprocessors
+
+        Args:
+            X (Dict[str, Any]): fit dictionary
+
+        Returns:
+            "TimeSeriesFeatureTransformer": an instance of self
+        """
+        self.check_requirements(X, y)
+
+        preprocessors = get_time_series_preprocessers(X)
+        column_transformers: List[Tuple[str, BaseEstimator, List[int]]] = []
+        if len(preprocessors['numerical']) > 0:
+            numerical_pipeline = make_pipeline(*preprocessors['numerical'])
+            column_transformers.append(
+                ('numerical_pipeline', numerical_pipeline, X['dataset_properties']['numerical_columns'])
+            )
+        if len(preprocessors['categorical']) > 0:
+            categorical_pipeline = make_pipeline(*preprocessors['categorical'])
+            column_transformers.append(
+                ('categorical_pipeline', categorical_pipeline, X['dataset_properties']['categorical_columns'])
+            )
+
+        # in case the preprocessing steps are disabled,
+        # i.e. NoEncoder for categorical, we want to
+        # let the data in categorical columns pass through
+        self.preprocessor = ColumnTransformer(
+            column_transformers,
+            remainder='passthrough'
+        )
+
+        # Where to get the data -- prioritize X_train if present, else
+        # load it from the backend
+        if 'X_train' in X:
+            X_train = X['X_train']
+        else:
+            X_train = X['backend'].load_datamanager().train_tensors[0]
+
+        self.preprocessor.fit(X_train)
+        return self
+
+    def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Adds the time series feature transformer to the fit dictionary
+
+        Args:
+            X (Dict[str, Any]): fit dictionary
+
+        Returns:
+            X (Dict[str, Any]): updated fit dictionary
+        """
+        X.update({'time_series_feature_transformer': self})
+        return X
+
+    def __call__(self, X: pd.DataFrame) -> pd.DataFrame:
+        if self.preprocessor is None:
+            raise ValueError("can't call {} without fitting the column transformer first."
+                             .format(self.__class__.__name__))
+
+        return self.preprocessor.transform(X)
+
+    def get_column_transformer(self) -> ColumnTransformer:
+        """
+        Get the fitted column transformer that is wrapped around
+        the sklearn early_preprocessor. Can only be called if fit()
+        has been called on the object.
+        Returns:
+            BaseEstimator: Fitted sklearn column transformer
+        """
+        if self.preprocessor is None:
+            raise AttributeError("{} can't return the column transformer before fit is called"
+                                 .format(self.__class__.__name__))
+        return self.preprocessor
+
+
+class TimeSeriesTargetTransformer(autoPyTorchTimeSeriesTargetPreprocessingComponent):
+    def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = None):
+        super().__init__()
+        self.random_state = random_state
+        self.preprocessor: Optional[ColumnTransformer] = None
+
+    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:
+        """
+        Creates a column transformer for the chosen time series
+        target preprocessors
+
+        Args:
+            X (Dict[str, Any]): fit dictionary
+
+        Returns:
+            "TimeSeriesTargetTransformer": an instance of self
+        """
+        self.check_requirements(X, y)
+
+        preprocessors = get_time_series_target_preprocessers(X)
+        column_transformers: List[Tuple[str, BaseEstimator, List[int]]] = []
+        if len(preprocessors['target_numerical']) > 0:
+            numerical_pipeline = make_pipeline(*preprocessors['target_numerical'])
+            # TODO the last item needs to be adapted accordingly!
+            column_transformers.append(
+                ('target_numerical_pipeline', numerical_pipeline, list(range(len(preprocessors['target_numerical']))))
+            )
+
+        # in case the preprocessing steps are disabled,
+        # i.e. NoEncoder for categorical, we want to
+        # let the data in categorical columns pass through
+        self.preprocessor = ColumnTransformer(
+            column_transformers,
+            remainder='passthrough'
+        )
+
+        # Where to get the data -- prioritize y_train if present, else
+        # load it from the backend
+        if 'y_train' in X:
+            y_train = X['y_train']
+        else:
+            y_train = X['backend'].load_datamanager().train_tensors[1]
+
+        self.preprocessor.fit(y_train)
+        return self
+
+    def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Adds the time series target transformer to the fit dictionary
+        Args:
+            X (Dict[str, Any]): fit dictionary
+
+        Returns:
+            X (Dict[str, Any]): updated fit dictionary
+        """
+        X.update({'time_series_target_transformer': self})
+        return X
+
+    def __call__(self, y: pd.DataFrame) -> pd.DataFrame:
+        if self.preprocessor is None:
+            raise ValueError("can't call {} without fitting the column transformer first."
+                             .format(self.__class__.__name__))
+
+        return self.preprocessor.transform(y)
+
+    def get_target_transformer(self) -> ColumnTransformer:
+        """
+        Get the fitted column transformer that is wrapped around
+        the sklearn early_preprocessor. Can only be called if fit()
+        has been called on the object.
+        Returns:
+            BaseEstimator: Fitted sklearn column transformer
+        """
+        if self.preprocessor is None:
+            raise AttributeError("{} can't return the column transformer before fit is called"
+                                 .format(self.__class__.__name__))
+        return self.preprocessor
diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/__init__.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/base_time_series_preprocessing.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/base_time_series_preprocessing.py
new file mode 100644
index 000000000..e924d360d
--- /dev/null
+++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/base_time_series_preprocessing.py
@@ -0,0 +1,40 @@
+from typing import Dict, Optional, Union
+
+from sklearn.base import BaseEstimator
+
+from autoPyTorch.pipeline.components.preprocessing.base_preprocessing import (
+    autoPyTorchPreprocessingComponent, autoPyTorchTargetPreprocessingComponent)
+
+
+class autoPyTorchTimeSeriesPreprocessingComponent(autoPyTorchPreprocessingComponent):
+    """
+    Provides abstract interface for time series preprocessing algorithms in AutoPyTorch.
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+        self.preprocessor: Union[Dict[str, Optional[BaseEstimator]], BaseEstimator] = dict(
+            numerical=None, categorical=None)
+
+    def __str__(self) -> str:
+        """ Allow a nice understanding of what components were used """
+        string = self.__class__.__name__
+        return string
+
+
+class autoPyTorchTimeSeriesTargetPreprocessingComponent(autoPyTorchTargetPreprocessingComponent):
+    """
+    Provides abstract interface for time series target preprocessing algorithms in AutoPyTorch.
+    Currently only numerical target preprocessing is supported.
+    # TODO add support for categorical targets!
+    # TODO define an inverse transformation for each invertible numerical
+    #  transformation (log, deseasonalization, etc.)
+    """
+    def __init__(self) -> None:
+        super().__init__()
+        self.preprocessor: Union[Dict[str, Optional[BaseEstimator]], BaseEstimator] = dict(
+            numerical=None, categorical=None)
+
+    def __str__(self) -> str:
+        """ Allow a nice understanding of what components were used """
+        string = self.__class__.__name__
+        return string
diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/NoEncoder.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/NoEncoder.py
new file mode 100644
index 000000000..b08300724
--- /dev/null
+++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/NoEncoder.py
@@ -0,0 +1,44 @@
+from typing import Any, Dict, Optional, Union
+
+import numpy as np
+
+from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding.NoEncoder import \
+    NoEncoder
+from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.encoding.time_series_base_encoder import \
+    TimeSeriesBaseEncoder
+
+
+class TimeSeriesNoEncoder(TimeSeriesBaseEncoder):
+    def __init__(self,
+                 random_state: Optional[Union[np.random.RandomState, int]] = None
+                 ):
+        super().__init__()
+        self.random_state = random_state
+
+    def fit(self, X: Dict[str, Any], y: Any = None) -> "TimeSeriesBaseEncoder":
+        NoEncoder.fit(self, X, y)
+        self.feature_shapes = X['dataset_properties']['feature_shapes']
+        return self
+
+    @staticmethod
+    def get_properties(
+        dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
+    ) -> Dict[str, Union[str, bool]]:
+        return {
+            'shortname': 'TimeSeriesNoEncoder',
+            'name': 'Time Series No Encoder',
+            'handles_sparse': True
+        }
+
+    def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Adds self into the 'X' dictionary and returns it.
+
+        Args:
+            X (Dict[str, Any]): 'X' dictionary
+
+        Returns:
+            (Dict[str, Any]): the updated 'X' dictionary
+        """
+        return NoEncoder.transform(self, X)
diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/OneHotEncoder.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/OneHotEncoder.py
new file mode 100644
index 000000000..5ac5e2550
--- /dev/null
+++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/OneHotEncoder.py
@@ -0,0 +1,41 @@
+from typing import Any, Dict, Optional, Union
+
+import numpy as np
+
+from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding.OneHotEncoder import \
+    OneHotEncoder
+from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.encoding.time_series_base_encoder import \
+    TimeSeriesBaseEncoder
+
+
+class TimeSeriesOneHotEncoder(TimeSeriesBaseEncoder):
+    def __init__(self,
+                 random_state: Optional[Union[np.random.RandomState, int]] = None
+                 ):
+        super(TimeSeriesOneHotEncoder, self).__init__()
+        self.random_state = random_state
+
+    def fit(self, X: Dict[str, Any], y: Any = None) -> TimeSeriesBaseEncoder:
+        OneHotEncoder.fit(self, X, y)
+        categorical_columns = X['dataset_properties']['categorical_columns']
+        n_features_cat = X['dataset_properties']['categories']
+        feature_names = X['dataset_properties']['feature_names']
+        feature_shapes = X['dataset_properties']['feature_shapes']
+
+        if len(n_features_cat) == 0:
+            n_features_cat = self.preprocessor['categorical'].categories  # type: ignore
+        for i, cat_column in enumerate(categorical_columns):
+            feature_shapes[feature_names[cat_column]] = len(n_features_cat[i])
+        self.feature_shapes = feature_shapes
+        return self
+
+    @staticmethod
+    def get_properties(
+        dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
+    ) -> Dict[str, Union[str, bool]]:
+        return {
+            'shortname': 'TimeSeriesOneHotEncoder',
+            'name': 'Time Series One Hot Encoder',
+            'handles_sparse': False
+        }
diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/__init__.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/__init__.py
new file mode 100644
index 000000000..4170fff8e
--- /dev/null
+++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/__init__.py
@@ -0,0 +1,41 @@
+import os
+from collections import OrderedDict
+from typing import Dict
+
+from autoPyTorch.pipeline.components.base_component import (
+    ThirdPartyComponents, autoPyTorchComponent, find_components)
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding import \
+    EncoderChoice
+from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.encoding.time_series_base_encoder import \
+    TimeSeriesBaseEncoder
+
+encoding_directory = os.path.split(__file__)[0]
+_encoders = find_components(__package__,
+                            encoding_directory,
+                            TimeSeriesBaseEncoder)
+_addons = ThirdPartyComponents(TimeSeriesBaseEncoder)
+
+
+def add_encoder(encoder: TimeSeriesBaseEncoder) -> None:
+    _addons.add_component(encoder)
+
+
+class TimeSeriesEncoderChoice(EncoderChoice):
+    """
+    Allows for dynamically choosing the encoding component at runtime
+    """
+
+    def get_components(self) -> Dict[str, autoPyTorchComponent]:
+        """Returns the available encoder components
+
+        Args:
+            None
+
+        Returns:
+            Dict[str, autoPyTorchComponent]: all TimeSeriesBaseEncoder components available
+                as choices for encoding the categorical columns
+        """
+        components = OrderedDict()
+        components.update(_encoders)
+        components.update(_addons.components)
+        return components
diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/time_series_base_encoder.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/time_series_base_encoder.py
new file mode 100644
index 000000000..a3d64ee92
--- /dev/null
+++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/time_series_base_encoder.py
@@ -0,0 +1,35 @@
+from typing import Any, Dict, List, Union
+
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding.base_encoder import \
+    BaseEncoder
+from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.base_time_series_preprocessing import \
+    autoPyTorchTimeSeriesPreprocessingComponent
+from autoPyTorch.utils.common import FitRequirement
+
+
+class TimeSeriesBaseEncoder(autoPyTorchTimeSeriesPreprocessingComponent):
+    """
+    Base class for time series encoders
+    """
+    def __init__(self) -> None:
+        super(TimeSeriesBaseEncoder, self).__init__()
+        self.add_fit_requirements([
+            FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True),
+            FitRequirement('categories', (List,), user_defined=True, dataset_property=True),
+            FitRequirement('feature_names', (tuple,), user_defined=True, dataset_property=True),
+            FitRequirement('feature_shapes', (Dict, ), user_defined=True, dataset_property=True),
+        ])
+        self.feature_shapes: Dict[str, int] = {}
+
+    def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Adds self into the 'X' dictionary and returns it.
+
+        Args:
+            X (Dict[str, Any]): 'X' dictionary
+
+        Returns:
+            (Dict[str, Any]): the updated 'X' dictionary
+        """
+        X['dataset_properties'].update({'feature_shapes': self.feature_shapes})
+        return BaseEncoder.transform(self, X)
diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/TimeSeriesImputer.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/TimeSeriesImputer.py
new file mode 100644
index 000000000..22cb0062c
--- /dev/null
+++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/TimeSeriesImputer.py
@@ -0,0 +1,188 @@
+from typing import Any, Dict, List, Optional
+
+from ConfigSpace import ConfigurationSpace
+from ConfigSpace.hyperparameters import CategoricalHyperparameter
+
+import numpy as np
+
+from sklearn.base import BaseEstimator
+
+from sktime.transformations.series.impute import Imputer
+
+from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
+from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.base_time_series_preprocessing import (
+    autoPyTorchTimeSeriesPreprocessingComponent,
+    autoPyTorchTimeSeriesTargetPreprocessingComponent)
+from autoPyTorch.utils.common import (FitRequirement,
+                                      HyperparameterSearchSpace,
+                                      add_hyperparameter)
+
+
+class TimeSeriesFeatureImputer(autoPyTorchTimeSeriesPreprocessingComponent):
+    def __init__(self,
+                 random_state: Optional[np.random.RandomState] = None,
+                 imputation_strategy: str = 'mean'):
+        super().__init__()
+        self.random_state = random_state
+        self.imputation_strategy = imputation_strategy
+        self.add_fit_requirements([
+            FitRequirement('numerical_columns', (List,), user_defined=True, dataset_property=True)])
+
+    def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> BaseEstimator:
+        """
+        Builds the preprocessor based on the given fit dictionary 'X'.
+
+        Args:
+            X (Dict[str, Any]):
+                The fit dictionary
+            y (Optional[Any]):
+                Not Used -- to comply with API
+
+        Returns:
+            self:
+                returns an instance of self.
+        """
+        # Choose an imputer for any numerical columns
+        numerical_columns = X['dataset_properties']['numerical_columns']
+
+        if isinstance(numerical_columns, List) and len(numerical_columns) > 0:
+            if self.imputation_strategy == 'constant_zero':
+                imputer = Imputer(method='constant', random_state=self.random_state, value=0)
+            else:
+                imputer = Imputer(method=self.imputation_strategy, random_state=self.random_state)
+            self.preprocessor['numerical'] = imputer
+
+        return self
+
+    def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Adds self into the 'X' dictionary and returns it.
+
+        Args:
+            X (Dict[str, Any]): 'X' dictionary
+
+        Returns:
+            (Dict[str, Any]): the updated 'X' dictionary
+        """
+        if self.preprocessor['numerical'] is None and len(X["dataset_properties"]["numerical_columns"]) != 0:
+            raise ValueError("can't call transform on {} without fitting first."
+                             .format(self.__class__.__name__))
+        X.update({'imputer': self.preprocessor})
+        return X
+
+    @staticmethod
+    def get_hyperparameter_search_space(
+        dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None,
+        imputation_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace(
+            hyperparameter='imputation_strategy',
+            value_range=("drift", "linear", "nearest", "constant_zero", "bfill", "ffill"),
+            default_value="drift",
+        ),
+    ) -> ConfigurationSpace:
+        """Get the hyperparameter search space for the TimeSeriesFeatureImputer
+
+        Args:
+            dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]])
+                Properties that describe the dataset
+            imputation_strategy (HyperparameterSearchSpace: default = ...)
+                The strategy to use for imputation; the methods themselves are defined by sktime
+
+        Returns:
+            ConfigurationSpace
+                The space of possible configurations for a TimeSeriesFeatureImputer with the given
+                `dataset_properties`
+        """
+        if dataset_properties is None:
+            raise ValueError("TimeSeriesFeatureImputer requires `dataset_properties` for generating"
+                             " a search space.")
+
+        cs = ConfigurationSpace()
+        if (
+            dataset_properties.get('features_have_missing_values', True)
+            and isinstance(dataset_properties['numerical_columns'], List)
+            and len(dataset_properties['numerical_columns']) != 0
+        ):
+            add_hyperparameter(cs, imputation_strategy, CategoricalHyperparameter)
+        return cs
+
+
+class TimeSeriesTargetImputer(autoPyTorchTimeSeriesTargetPreprocessingComponent):
+    def __init__(self,
+                 random_state: Optional[np.random.RandomState] = None,
+                 imputation_strategy: str = 'mean', ):
+        super().__init__()
+        self.random_state = random_state
+        self.imputation_strategy = imputation_strategy
+
+    def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> BaseEstimator:
+        """
+        Builds the preprocessor based on the given fit dictionary 'X'.
+
+        Args:
+            X (Dict[str, Any]):
+                The fit dictionary
+            y (Optional[Any]):
+                Not Used -- to comply with API
+
+        Returns:
+            self:
+                returns an instance of self.
+        """
+        # Forecasting tasks always have numerical outputs (TODO add support for categorical HPs)
+        if self.imputation_strategy == 'constant_zero':
+            imputer = Imputer(method='constant', random_state=self.random_state, value=0)
+        else:
+            imputer = Imputer(method=self.imputation_strategy, random_state=self.random_state)
+        self.preprocessor['target_numerical'] = imputer
+
+        return self
+
+    def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Adds self into the 'X' dictionary and returns it.
+
+        Args:
+            X (Dict[str, Any]): 'X' dictionary
+
+        Returns:
+            (Dict[str, Any]): the updated 'X' dictionary
+        """
+        if self.preprocessor['target_numerical'] is None:
+            raise ValueError("can't call transform on {} without fitting first."
+                             .format(self.__class__.__name__))
+        X.update({'target_imputer': self.preprocessor})
+        return X
+
+    @staticmethod
+    def get_hyperparameter_search_space(
+        dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None,
+        imputation_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace(
+            hyperparameter='imputation_strategy',
+            value_range=("linear", "nearest", "constant_zero", "bfill", "ffill"),
+            default_value="linear",
+        ),
+    ) -> ConfigurationSpace:
+        """
+        Time series target imputer. For the sake of speed, we only allow local imputation
+        here (i.e., the filled value only depends on its neighbours).
+        # TODO: Transformer for mean and median: df.fillna(df.groupby(df.index).agg('mean'))...
+
+        Args:
+            dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]): dataset properties
+            imputation_strategy (HyperparameterSearchSpace): which strategy to use; the available
+                methods are defined by sktime.transformations.series.impute.Imputer
+
+        Returns:
+            ConfigurationSpace
+                The space of possible configurations for a TimeSeriesTargetImputer with the given
+                `dataset_properties`
+        """
+        if dataset_properties is None:
+            raise ValueError("TimeSeriesTargetImputer requires `dataset_properties` for generating"
+                             " a search space.")
+
+        cs = ConfigurationSpace()
+        if dataset_properties.get('targets_have_missing_values', True):
+            add_hyperparameter(cs, imputation_strategy, CategoricalHyperparameter)
+        return cs
diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/__init__.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/__init__.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/__init__.py
new file mode 100644
index 000000000..e69de29bb
HyperparameterSearchSpace, + add_hyperparameter) + + +class BaseScaler(autoPyTorchTimeSeriesPreprocessingComponent): + """ + Provides abstract class interface for time series scalers in AutoPytorch + """ + + def __init__(self, + random_state: Optional[Union[np.random.RandomState, int]] = None, + scaling_mode: str = 'standard'): + super().__init__() + self.add_fit_requirements([ + FitRequirement('numerical_columns', (List,), user_defined=True, dataset_property=True), + FitRequirement('is_small_preprocess', (bool,), user_defined=True, dataset_property=True) + ]) + self.random_state = random_state + self.scaling_mode = scaling_mode + + def fit(self, X: Dict[str, Any], y: Any = None) -> 'BaseScaler': + self.check_requirements(X, y) + dataset_is_small_preprocess = X["dataset_properties"]["is_small_preprocess"] + static_features = X['dataset_properties'].get('static_features', ()) + self.preprocessor['numerical'] = TimeSeriesScaler(mode=self.scaling_mode, + dataset_is_small_preprocess=dataset_is_small_preprocess, + static_features=static_features) + return self + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + """ + Adds the fitted scalar into the 'X' dictionary and returns it. + + Args: + X (Dict[str, Any]): 'X' dictionary + + Returns: + (Dict[str, Any]): the updated 'X' dictionary + """ + if self.preprocessor['numerical'] is None and self.preprocessor['categorical'] is None: + raise ValueError(f"can not call transform on {self.__class__.__name__} without fitting first.") + X.update({'scaler': self.preprocessor}) + return X + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + scaling_mode: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='scaling_mode', + value_range=("standard", "min_max", "max_abs", "mean_abs", "none"), + default_value="standard", + ), + ) -> ConfigurationSpace: + """Get the hyperparameter search space for the Time Series Imputator + + Args: + dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]) + Properties that describe the dataset + scaling_mode (HyperparameterSearchSpace: default = ...) 
+ The strategy to use for scaling, its hyperparameters are defined by sktime + + Returns: + ConfigurationSpace + The space of possible configurations for a Time Series Imputor with the given + `dataset_properties` + """ + cs = ConfigurationSpace() + add_hyperparameter(cs, scaling_mode, CategoricalHyperparameter) + return cs diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py new file mode 100644 index 000000000..abd246072 --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py @@ -0,0 +1,159 @@ +from typing import Any, Tuple, Union + +import numpy as np + +import pandas as pd + +from sklearn.base import BaseEstimator + + +# Similar to / inspired by +# https://github.com/tslearn-team/tslearn/blob/a3cf3bf/tslearn/preprocessing/preprocessing.py +class TimeSeriesScaler(BaseEstimator): + def __init__(self, mode: str, + dataset_is_small_preprocess: bool = True, + static_features: Union[Tuple[Union[str, int], ...], Tuple[()]] = ()): + self.mode = mode + self.dataset_is_small_preprocess = dataset_is_small_preprocess + self.static_features = static_features + + def fit(self, X: Union[pd.DataFrame, np.ndarray], y: Any = None) -> "TimeSeriesScaler": + """ + The transformer is transformed on the fly (for each batch) + """ + if self.dataset_is_small_preprocess: + if not isinstance(X, pd.DataFrame): + raise ValueError(f'Scaling that works on small_preprocess dataset must work with pd.DataFrame.' + f'However, it gets {type(X)}') + + static_features = [static_fea for static_fea in self.static_features if static_fea in X.columns] + self.static_features = static_features # type: ignore[assignment] + + if self.mode == "standard": + X_grouped = X.groupby(X.index) + + self.loc = X_grouped.agg("mean") + self.scale = X_grouped.agg("std").fillna(0.0) + + # for static features, if we do normalization w.r.t. each group, then they will become the same values, + # thus we treat them differently: normalize with the entire dataset + self.scale[self.static_features] = X[self.static_features].std().fillna(0.0) + self.loc[self.static_features] = X[self.static_features].mean() + + # ensure that if all the values are the same in a group, we could still normalize them correctly + self.scale[self.scale == 0] = 1. 
+ + elif self.mode == "min_max": + X_grouped = X.groupby(X.index) + min_ = X_grouped.agg("min") + max_ = X_grouped.agg("max") + + min_[self.static_features] = min_[self.static_features].min() + max_[self.static_features] = max_[self.static_features].max() + + diff_ = max_ - min_ + self.loc = min_ + self.scale = diff_ + self.scale.mask(self.scale == 0.0, self.loc) + self.scale[self.scale == 0.0] = 1.0 + + elif self.mode == "max_abs": + X_abs = X.transform("abs") + max_abs_ = X_abs.groupby(X_abs.index).agg("max") + max_abs_[self.static_features] = max_abs_[self.static_features].max() + + max_abs_[max_abs_ == 0.0] = 1.0 + self.loc = None + self.scale = max_abs_ + + elif self.mode == 'mean_abs': + X_abs = X.transform("abs") + X_abs = X_abs.groupby(X_abs.index) + mean_abs_ = X_abs.agg("mean") + mean_abs_[self.static_features] = mean_abs_[self.static_features].mean() + self.scale = mean_abs_.mask(mean_abs_ == 0.0, X_abs.agg("max")) + + self.scale[self.scale == 0] = 1 + self.loc = None + + elif self.mode == "none": + self.loc = None + self.scale = None + + else: + raise ValueError(f"Unknown mode {self.mode} for time series scaler") + else: + static_features = [static_fea for static_fea in self.static_features if static_fea < X.shape[1]] + self.static_features = static_features # type: ignore[assignment] + + return self + + def transform(self, X: Union[pd.DataFrame, np.ndarray]) -> Union[pd.DataFrame, np.ndarray]: + """ + X = sklearn.utils.check_array( + X, + force_all_finite=True, + ensure_2d=False, + allow_nd=True, + accept_sparse=False, + accept_large_sparse=False + ) # type: np.ndarray + """ + if not self.dataset_is_small_preprocess: + if not isinstance(X, np.ndarray): + raise ValueError(f'Scaling that works on none-small_preprocess dataset must work with np.ndarray.' + f'However, it gets {type(X)}') + if self.mode == 'standard': + # in this case X is a np array + loc = X.mean(axis=0, keepdims=True) + scale = np.nan_to_num(X.std(axis=0, ddof=1, keepdims=True)) + scale = np.where(scale == 0, loc, scale) + scale[scale == 0] = 1. 
+ return (X - loc) / scale + + elif self.mode == 'min_max': + min_ = X.min(axis=0, keepdims=True) + max_ = X.max(axis=0, keepdims=True) + + diff_ = max_ - min_ + loc = min_ + scale = diff_ + scale = np.where(scale == 0., loc, scale) + scale[scale == 0.0] = 1.0 + return (X - loc) / scale + + elif self.mode == "max_abs": + X_abs = np.abs(X) + max_abs_ = X_abs.max(0, keepdims=True) + max_abs_[max_abs_ == 0.0] = 1.0 + scale = max_abs_ + return X / scale + + elif self.mode == 'mean_abs': + X_abs = np.abs(X) + mean_abs_ = X_abs.mean(0, keepdims=True) + scale = np.where(mean_abs_ == 0.0, np.max(X_abs), mean_abs_) + scale[scale == 0] = 1 + return X / scale + + elif self.mode == "none": + return X + else: + raise ValueError(f"Unknown mode {self.mode} for time series scaler") + + if self.mode == "standard": + return (X - self.loc) / self.scale + + elif self.mode == "min_max": + return (X - self.loc) / self.scale + + elif self.mode == "max_abs": + return X / self.scale + + elif self.mode == 'mean_abs': + return X / self.scale + + elif self.mode == "none": + return X + else: + raise ValueError(f"Unknown mode {self.mode} for time series scaler") diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/utils.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/utils.py new file mode 100644 index 000000000..22252f0dd --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/utils.py @@ -0,0 +1,57 @@ +from typing import Any, Dict, List + +from sklearn.base import BaseEstimator + + +def get_time_series_preprocessers(X: Dict[str, Any]) -> Dict[str, List[BaseEstimator]]: + """ + Expects fit_dictionary(X) to have numerical/categorical preprocessors + (fitted numerical/categorical preprocessing nodes) that will build a pipeline in the TimeSeriesTransformer. + This function parses X and extracts such components. + Creates a dictionary with two keys, + numerical- containing list of numerical preprocessors + categorical- containing list of categorical preprocessors + + Args: + X: fit dictionary + + Returns: + (Dict[str, List[BaseEstimator]]): dictionary with list of numerical and categorical preprocessors + """ + preprocessor = dict(numerical=list(), categorical=list()) # type: Dict[str, List[BaseEstimator]] + for key, value in X.items(): + if isinstance(value, dict): + # as each preprocessor is child of BaseEstimator + if 'numerical' in value and isinstance(value['numerical'], BaseEstimator): + preprocessor['numerical'].append(value['numerical']) + if 'categorical' in value and isinstance(value['categorical'], BaseEstimator): + preprocessor['categorical'].append(value['categorical']) + + return preprocessor + + +def get_time_series_target_preprocessers(X: Dict[str, Any]) -> Dict[str, List[BaseEstimator]]: + """ + Expects fit_dictionary(X) to have target preprocessors + I leave here interface to target categorical + (fitted numerical/categorical preprocessing nodes) that will build a pipeline in the TimeSeriesTransformer. + This function parses X and extracts such components. 
+ Creates a dictionary with two keys, + numerical- containing list of numerical preprocessors + categorical- containing list of categorical preprocessors + + Args: + X: fit dictionary + + Returns: + (Dict[str, List[BaseEstimator]]): dictionary with list of numerical and categorical preprocessors + """ + preprocessor = dict(target_numerical=list(), target_categorical=list()) # type: Dict[str, List[BaseEstimator]] + for key, value in X.items(): + if isinstance(value, dict): + # as each preprocessor is child of BaseEstimator + if 'target_numerical' in value and isinstance(value['target_numerical'], BaseEstimator): + preprocessor['target_numerical'].append(value['target_numerical']) + if 'target_categorical' in value and isinstance(value['target_categorical'], BaseEstimator): + preprocessor['target_categorical'].append(value['target_categorical']) + return preprocessor diff --git a/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py b/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py index 7fbf33f99..aa2b4c25f 100644 --- a/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py +++ b/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py @@ -6,7 +6,7 @@ import pandas as pd -from scipy.sparse import csr_matrix +from scipy.sparse import spmatrix from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.setup.base_setup import autoPyTorchSetupComponent @@ -21,7 +21,7 @@ def __init__(self, random_state: Optional[np.random.RandomState] = None) -> None self.random_state = random_state self.add_fit_requirements([ FitRequirement('is_small_preprocess', (bool,), user_defined=True, dataset_property=True), - FitRequirement('X_train', (np.ndarray, pd.DataFrame, csr_matrix), user_defined=True, + FitRequirement('X_train', (np.ndarray, pd.DataFrame, spmatrix), user_defined=True, dataset_property=False)]) def fit(self, X: Dict[str, Any], y: Any = None) -> "EarlyPreprocessing": diff --git a/autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py b/autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py new file mode 100644 index 000000000..59035869e --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py @@ -0,0 +1,112 @@ +from typing import Any, Dict, List, Optional, Union + +import numpy as np + +import pandas as pd + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.preprocessing.base_preprocessing import \ + autoPyTorchTargetPreprocessingComponent +from autoPyTorch.pipeline.components.setup.early_preprocessor.EarlyPreprocessing import \ + EarlyPreprocessing +from autoPyTorch.pipeline.components.setup.early_preprocessor.utils import ( + get_preprocess_transforms, time_series_preprocess) +from autoPyTorch.utils.common import FitRequirement + + +class TimeSeriesEarlyPreprocessing(EarlyPreprocessing): + def __init__(self, random_state: Optional[np.random.RandomState] = None) -> None: + super(EarlyPreprocessing, self).__init__() + self.random_state = random_state + self.add_fit_requirements([ + FitRequirement('is_small_preprocess', (bool,), user_defined=True, dataset_property=True), + FitRequirement('X_train', (pd.DataFrame, ), user_defined=True, + dataset_property=False), + FitRequirement('feature_names', (tuple,), user_defined=True, dataset_property=True), + 
FitRequirement('numerical_columns', (List,), user_defined=True, dataset_property=True), + FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True), + ]) + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + """ + if dataset is small process, we transform the entire dataset here. + Before transformation, the order of the dataset is: + [(unknown_columns), categorical_columns, numerical_columns] + While after transformation, the order of the dataset is: + [numerical_columns, categorical_columns, unknown_columns] + we need to change feature_names and feature_shapes accordingly + + Args: + X(Dict): fit dictionary + + Returns: + X_transformed(Dict): transformed fit dictionary + """ + + transforms = get_preprocess_transforms(X) + if X['dataset_properties']['is_small_preprocess']: + if 'X_train' in X: + X_train = X['X_train'] + else: + # Incorporate the transform to the dataset + X_train = X['backend'].load_datamanager().train_tensors[0] + + X['X_train'] = time_series_preprocess(dataset=X_train, transforms=transforms) + + feature_names = X['dataset_properties']['feature_names'] + numerical_columns = X['dataset_properties']['numerical_columns'] + categorical_columns = X['dataset_properties']['categorical_columns'] + + # resort feature_names + new_feature_names = [feature_names[num_col] for num_col in numerical_columns] + new_feature_names += [feature_names[cat_col] for cat_col in categorical_columns] + if set(feature_names) != set(new_feature_names): + new_feature_names += list(set(feature_names) - set(new_feature_names)) + X['dataset_properties']['feature_names'] = tuple(new_feature_names) + + # We need to also save the preprocess transforms for inference + X.update({'preprocess_transforms': transforms}) + return X + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'TimeSeriesEarlyPreprocessing', + 'name': 'TIme Series Early Preprocessing Node', + } + + +class TimeSeriesTargetEarlyPreprocessing(EarlyPreprocessing): + + def __init__(self, random_state: Optional[np.random.RandomState] = None) -> None: + super(EarlyPreprocessing, self).__init__() + self.random_state = random_state + self.add_fit_requirements([ + FitRequirement('is_small_preprocess', (bool,), user_defined=True, dataset_property=True), + FitRequirement('y_train', (pd.DataFrame,), user_defined=True, + dataset_property=False)]) + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + # TODO consider inverse transformation + transforms = get_preprocess_transforms(X, preprocess_type=autoPyTorchTargetPreprocessingComponent) + if X['dataset_properties']['is_small_preprocess']: + if 'y_train' in X: + y_train = X['y_train'] + else: + # Incorporate the transform to the dataset + y_train = X['backend'].load_datamanager().train_tensors[1] + + X['y_train'] = time_series_preprocess(dataset=y_train, transforms=transforms) + + # We need to also save the preprocess transforms for inference + X.update({'preprocess_target_transforms': transforms}) + return X + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'TimeSeriesTargetEarlyPreprocessing', + 'name': 'TIme Series Target Early Preprocessing Node', + } diff --git a/autoPyTorch/pipeline/components/setup/early_preprocessor/utils.py b/autoPyTorch/pipeline/components/setup/early_preprocessor/utils.py index 
d74faffa6..830beced9 100644 --- a/autoPyTorch/pipeline/components/setup/early_preprocessor/utils.py +++ b/autoPyTorch/pipeline/components/setup/early_preprocessor/utils.py @@ -1,19 +1,26 @@ import copy -from typing import Any, Dict, List +from typing import Any, Dict, List, Optional, Type, Union import numpy as np +import pandas as pd + from sklearn.utils import check_array import torchvision.transforms -from autoPyTorch.pipeline.components.preprocessing.base_preprocessing import autoPyTorchPreprocessingComponent +from autoPyTorch.pipeline.components.preprocessing.base_preprocessing import ( + autoPyTorchPreprocessingComponent as aPTPre, + autoPyTorchTargetPreprocessingComponent as aPTTPre +) -def get_preprocess_transforms(X: Dict[str, Any]) -> torchvision.transforms.Compose: - candidate_transforms: List[autoPyTorchPreprocessingComponent] = list() +def get_preprocess_transforms(X: Dict[str, Any], + preprocess_type: Union[Type[aPTPre], Type[aPTTPre]] = aPTPre) \ + -> List[Union[Type[aPTPre], Type[aPTTPre]]]: + candidate_transforms = [] for key, value in X.items(): - if isinstance(value, autoPyTorchPreprocessingComponent): + if isinstance(value, preprocess_type): candidate_transforms.append(copy.deepcopy(value)) return candidate_transforms @@ -37,3 +44,30 @@ def preprocess(dataset: np.ndarray, transforms: torchvision.transforms.Compose, ensure_2d=False, allow_nd=True, ) + + +def time_series_preprocess(dataset: pd.DataFrame, transforms: torchvision.transforms.Compose, + indices: Optional[List[int]] = None) -> pd.DataFrame: + """ + preprocess time series data (both features and targets). Dataset should be pandas DataFrame whose index identifies + which series the data belongs to. + + Args: + dataset (pd.DataFrame): a dataset contains multiple series, its index identifies the series number + transforms (torchvision.transforms.Compose): transformation applied to dataset + indices (Optional[List[int]]): the indices that the transformer needs to work with + + Returns: + + """ + # TODO consider Numpy implementation + composite_transforms = torchvision.transforms.Compose(transforms) + if indices is None: + index = dataset.index + dataset = composite_transforms(dataset) + dataset = pd.DataFrame(dataset, index=index) + else: + sub_dataset = dataset.iloc[:, indices] + sub_dataset = composite_transforms(sub_dataset) + dataset.iloc[:, indices] = sub_dataset + return dataset diff --git a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/__init__.py b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/base_target_scaler.py b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/base_target_scaler.py new file mode 100644 index 000000000..a8c31081b --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/base_target_scaler.py @@ -0,0 +1,86 @@ +from typing import Any, Dict, Optional, Union + +from ConfigSpace import CategoricalHyperparameter, ConfigurationSpace + +import numpy as np + +from sklearn.base import BaseEstimator +from sklearn.pipeline import Pipeline + +import torch + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.base_component import autoPyTorchComponent +from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.utils import TargetScaler +from autoPyTorch.utils.common import HyperparameterSearchSpace, 
add_hyperparameter + + +class BaseTargetScaler(autoPyTorchComponent): + def __init__(self, + random_state: Optional[Union[np.random.RandomState, int]] = None, + scaling_mode: str = 'none'): + super().__init__() + self.random_state = random_state + self.scaling_mode = scaling_mode + self.preprocessor: Optional[Pipeline] = None + + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: + """ + Creates a column transformer for the chosen tabular + preprocessors + Args: + X (Dict[str, Any]): + fit dictionary + + Returns: + "BaseEstimator": + an instance of self + """ + self.check_requirements(X, y) + self.scaler = TargetScaler(mode=self.scaling_mode) + return self + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + """ + Adds the time series transformer to fit dictionary + Args: + X (Dict[str, Any]): + fit dictionary + + Returns: + X (Dict[str, Any]): + updated fit dictionary + """ + X.update({'target_scaler': self}) + return X + + def __call__(self, + past_target: Union[np.ndarray, torch.tensor], + past_observed_values: Optional[torch.BoolTensor] = None, + future_targets: Optional[Union[np.ndarray, torch.Tensor]] = None, + ) -> Union[np.ndarray, torch.tensor]: + + if self.scaler is None: + raise ValueError("cant call {} without fitting the column transformer first." + .format(self.__class__.__name__)) + + if len(past_target.shape) == 2: + # expand batch dimension when called on a single record + past_target = past_target[np.newaxis, ...] + past_target, future_targets, loc, scale = self.scaler.transform(past_target, + past_observed_values, + future_targets) + return past_target, future_targets, loc, scale + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + scaling_mode: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='scaling_mode', + value_range=("standard", "min_max", "max_abs", "mean_abs", "none"), + default_value="standard", + ), + ) -> ConfigurationSpace: + cs = ConfigurationSpace() + add_hyperparameter(cs, scaling_mode, CategoricalHyperparameter) + return cs diff --git a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/utils.py b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/utils.py new file mode 100644 index 000000000..7b4782206 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/utils.py @@ -0,0 +1,137 @@ +from typing import Any, Dict, Optional, Tuple + +from sklearn.base import BaseEstimator + +import torch + + +# Similar to / inspired by +# https://github.com/tslearn-team/tslearn/blob/a3cf3bf/tslearn/preprocessing/preprocessing.py +class TargetScaler(BaseEstimator): + """ + To accelerate training, this scaler is only applied under trainer (after the data is loaded by dataloader) + """ + + def __init__(self, mode: str): + self.mode = mode + + def fit(self, X: Dict, y: Any = None) -> "TargetScaler": + return self + + def transform(self, + past_targets: torch.Tensor, + past_observed_values: torch.BoolTensor, + future_targets: Optional[torch.Tensor] = None) -> \ + Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: + if past_observed_values is None or torch.all(past_observed_values): + if self.mode == "standard": + loc = torch.mean(past_targets, dim=1, keepdim=True) + scale = torch.std(past_targets, dim=1, keepdim=True) + + offset_targets = past_targets - loc + scale = torch.where(torch.logical_or(scale == 0.0, scale == torch.nan), offset_targets[:, 
[-1]], scale) + scale[scale == 0.0] = 1.0 + if future_targets is not None: + future_targets = (future_targets - loc) / scale + return (past_targets - loc) / scale, future_targets, loc, scale + + elif self.mode == "min_max": + min_ = torch.min(past_targets, dim=1, keepdim=True)[0] + max_ = torch.max(past_targets, dim=1, keepdim=True)[0] + + diff_ = max_ - min_ + loc = min_ + scale = torch.where(diff_ == 0, past_targets[:, [-1]], diff_) + scale[scale == 0.0] = 1.0 + if future_targets is not None: + future_targets = (future_targets - loc) / scale + return (past_targets - loc) / scale, future_targets, loc, scale + + elif self.mode == "max_abs": + max_abs_ = torch.max(torch.abs(past_targets), dim=1, keepdim=True)[0] + max_abs_[max_abs_ == 0.0] = 1.0 + scale = max_abs_ + if future_targets is not None: + future_targets = future_targets / scale + return past_targets / scale, future_targets, None, scale + + elif self.mode == 'mean_abs': + mean_abs = torch.mean(torch.abs(past_targets), dim=1, keepdim=True) + scale = torch.where(mean_abs == 0.0, past_targets[:, [-1]], mean_abs) + scale[scale == 0.0] = 1.0 + if future_targets is not None: + future_targets = future_targets / scale + return past_targets / scale, future_targets, None, scale + + elif self.mode == "none": + return past_targets, future_targets, None, None + + else: + raise ValueError(f"Unknown mode {self.mode} for Forecasting scaler") + else: + valid_past_targets = past_observed_values * past_targets + valid_past_obs = torch.sum(past_observed_values, dim=1, keepdim=True) + if self.mode == "standard": + dfredom = 1 + loc = torch.sum(valid_past_targets, dim=1, keepdim=True) / valid_past_obs + scale = torch.sum(torch.square((valid_past_targets - loc * past_observed_values)), dim=1, keepdim=True) + + scale /= valid_past_obs - dfredom + scale = torch.sqrt(scale) + + offset_targets = past_targets - loc + # ensure that all the targets are scaled properly + scale = torch.where(torch.logical_or(scale == 0.0, scale == torch.nan), offset_targets[:, [-1]], scale) + scale[scale == 0.0] = 1.0 + + if future_targets is not None: + future_targets = (future_targets - loc) / scale + + scaled_past_targets = torch.where(past_observed_values, offset_targets / scale, past_targets) + return scaled_past_targets, future_targets, loc, scale + + elif self.mode == "min_max": + obs_mask = ~past_observed_values + min_masked_past_targets = past_targets.masked_fill(obs_mask, value=torch.inf) + max_masked_past_targets = past_targets.masked_fill(obs_mask, value=-torch.inf) + min_ = torch.min(min_masked_past_targets, dim=1, keepdim=True)[0] + max_ = torch.max(max_masked_past_targets, dim=1, keepdim=True)[0] + + diff_ = max_ - min_ + loc = min_ + scale = torch.where(diff_ == 0, past_targets[:, [-1]], diff_) + scale[scale == 0.0] = 1.0 + + if future_targets is not None: + future_targets = (future_targets - loc) / scale + scaled_past_targets = torch.where(past_observed_values, (past_targets - loc) / scale, past_targets) + + return scaled_past_targets, future_targets, loc, scale + + elif self.mode == "max_abs": + max_abs_ = torch.max(torch.abs(valid_past_targets), dim=1, keepdim=True)[0] + max_abs_[max_abs_ == 0.0] = 1.0 + scale = max_abs_ + if future_targets is not None: + future_targets = future_targets / scale + + scaled_past_targets = torch.where(past_observed_values, past_targets / scale, past_targets) + + return scaled_past_targets, future_targets, None, scale + + elif self.mode == 'mean_abs': + mean_abs = torch.sum(torch.abs(valid_past_targets), dim=1, keepdim=True) / 
valid_past_obs + scale = torch.where(mean_abs == 0.0, valid_past_targets[:, [-1]], mean_abs) + # in case that all values in the tensor is 0 + scale[scale == 0.0] = 1.0 + if future_targets is not None: + future_targets = future_targets / scale + + scaled_past_targets = torch.where(past_observed_values, past_targets / scale, past_targets) + return scaled_past_targets, future_targets, None, scale + + elif self.mode == "none": + return past_targets, future_targets, None, None + + else: + raise ValueError(f"Unknown mode {self.mode} for Forecasting scaler") diff --git a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/DistributionLoss.py b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/DistributionLoss.py new file mode 100644 index 000000000..5039a09d5 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/DistributionLoss.py @@ -0,0 +1,89 @@ +from typing import Any, Dict, Optional, Union + +from ConfigSpace import ConfigurationSpace +from ConfigSpace.conditions import EqualsCondition +from ConfigSpace.hyperparameters import CategoricalHyperparameter, UniformIntegerHyperparameter + +import numpy as np + + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.setup.forecasting_training_loss.base_forecasting_loss import ( + ForecastingLossComponents +) +from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distribution import ( + ALL_DISTRIBUTIONS, + DisForecastingStrategy +) +from autoPyTorch.pipeline.components.training.losses import LogProbLoss +from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter + + +class DistributionLoss(ForecastingLossComponents): + loss = LogProbLoss + net_output_type = 'distribution' # type: ignore[assignment] + + def __init__(self, + dist_cls: str, + random_state: Optional[np.random.RandomState] = None, + forecast_strategy: str = "sample", + num_samples: int = 100, + aggregation: str = "mean", + ): + super(DistributionLoss, self).__init__() + self.dist_cls = dist_cls + self.random_state = random_state + self.forecasting_strategy = DisForecastingStrategy(dist_cls=dist_cls, + forecast_strategy=forecast_strategy, + num_samples=num_samples, + aggregation=aggregation) + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'DistributionLoss', + 'name': 'DistributionLoss', + "handles_tabular": False, + "handles_image": False, + "handles_time_series": True, + 'handles_regression': True, + 'handles_classification': False + } + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + required_padding_value = ALL_DISTRIBUTIONS[self.dist_cls].value_in_support + X.update({"required_padding_value": required_padding_value, + "dist_forecasting_strategy": self.forecasting_strategy}) + return super().transform(X) + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + dist_cls: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter="dist_cls", + value_range=tuple(ALL_DISTRIBUTIONS.keys()), + default_value=list(ALL_DISTRIBUTIONS.keys())[0]), + forecast_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='forecast_strategy', + value_range=('sample', 'mean'), + default_value='sample'), + num_samples: HyperparameterSearchSpace = 
HyperparameterSearchSpace(hyperparameter='num_samples', + value_range=(50, 200), + default_value=100), + aggregation: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='aggregation', + value_range=('mean', 'median'), + default_value='mean') + ) -> ConfigurationSpace: + cs = ConfigurationSpace() + add_hyperparameter(cs, dist_cls, CategoricalHyperparameter) + + forecast_strategy = get_hyperparameter(forecast_strategy, CategoricalHyperparameter) + num_samples = get_hyperparameter(num_samples, UniformIntegerHyperparameter) + aggregation = get_hyperparameter(aggregation, CategoricalHyperparameter) + + cs.add_hyperparameters([forecast_strategy, num_samples, aggregation]) + + cond_n_samples = EqualsCondition(num_samples, forecast_strategy, 'sample') + cond_agg = EqualsCondition(aggregation, forecast_strategy, 'sample') + cs.add_conditions([cond_n_samples, cond_agg]) + return cs diff --git a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/QuantileLoss.py b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/QuantileLoss.py new file mode 100644 index 000000000..581fc8828 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/QuantileLoss.py @@ -0,0 +1,66 @@ +from functools import partial +from typing import Any, Dict, Optional, Union + +from ConfigSpace import ConfigurationSpace +from ConfigSpace.hyperparameters import UniformFloatHyperparameter + +import numpy as np + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.setup.forecasting_training_loss.base_forecasting_loss import \ + ForecastingLossComponents +from autoPyTorch.pipeline.components.training.losses import QuantileLoss +from autoPyTorch.utils.common import ( + HyperparameterSearchSpace, + add_hyperparameter +) + + +class NetworkQuantileLoss(ForecastingLossComponents): + loss = QuantileLoss + net_output_type = 'quantile' + + def __init__(self, + random_state: Optional[np.random.RandomState] = None, + lower_quantile: float = 0.1, + upper_quantile: float = 0.9, + ): + super().__init__() + self.random_state = random_state + self.quantiles = [0.5, lower_quantile, upper_quantile] + # To make it compatible with + # autoPyTorch.pipeline.components.training.trainer.forecasting_trainer.forecasting_base_trainer + self.loss = partial(QuantileLoss, quantiles=self.quantiles) # type: ignore[assignment] + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + X.update({"quantile_values": self.quantiles}) + return super().transform(X) + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'QuantileLoss', + 'name': 'QuantileLoss', + "handles_tabular": False, + "handles_image": False, + "handles_time_series": True, + 'handles_regression': True, + 'handles_classification': False + } + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + lower_quantile: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='lower_quantile', + value_range=(0.0, 0.4), + default_value=0.1), + upper_quantile: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='upper_quantile', + value_range=(0.6, 1.0), + default_value=0.9) + + ) -> ConfigurationSpace: + cs = ConfigurationSpace() + add_hyperparameter(cs, lower_quantile, UniformFloatHyperparameter) + add_hyperparameter(cs, upper_quantile, 
UniformFloatHyperparameter) + return cs diff --git a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/RegressionLoss.py b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/RegressionLoss.py new file mode 100644 index 000000000..a77cd9cb9 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/RegressionLoss.py @@ -0,0 +1,61 @@ +from typing import Dict, Optional, Union + +from ConfigSpace import CategoricalHyperparameter, ConfigurationSpace + +import numpy as np + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.setup.forecasting_training_loss.base_forecasting_loss import \ + ForecastingLossComponents +from autoPyTorch.pipeline.components.training.losses import ( + L1Loss, + MAPELoss, + MASELoss, + MSELoss +) +from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter + + +class RegressionLoss(ForecastingLossComponents): + net_output_type = 'regression' + + def __init__(self, + loss_name: str, + random_state: Optional[np.random.RandomState] = None, + ): + super(RegressionLoss, self).__init__() + if loss_name == "l1": + self.loss = L1Loss + elif loss_name == 'mse': + self.loss = MSELoss + elif loss_name == 'mase': + self.loss = MASELoss + elif loss_name == 'mape': + self.loss = MAPELoss + else: + raise ValueError(f"Unsupported loss type {loss_name}!") + self.random_state = random_state + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'RegressionLoss', + 'name': 'RegressionLoss', + "handles_tabular": True, + "handles_image": True, + "handles_time_series": True, + 'handles_regression': True, + 'handles_classification': False + } + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + loss_name: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="loss_name", + value_range=('l1', 'mse', 'mase', 'mape'), + default_value='mse'), + ) -> ConfigurationSpace: + cs = ConfigurationSpace() + add_hyperparameter(cs, loss_name, CategoricalHyperparameter) + return cs diff --git a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/__init__.py b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/__init__.py new file mode 100644 index 000000000..f9e2b0789 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/__init__.py @@ -0,0 +1,196 @@ +import os +from collections import OrderedDict +from typing import Any, Dict, List, Optional + +import ConfigSpace.hyperparameters as CSH +from ConfigSpace.configuration_space import ConfigurationSpace + +from autoPyTorch.constants import (CLASSIFICATION_TASKS, FORECASTING_TASKS, + REGRESSION_TASKS, STRING_TO_TASK_TYPES) +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice +from autoPyTorch.pipeline.components.base_component import ( + ThirdPartyComponents, autoPyTorchComponent, find_components) +from autoPyTorch.pipeline.components.setup.forecasting_training_loss.base_forecasting_loss import \ + ForecastingLossComponents + +directory = os.path.split(__file__)[0] +_optimizers = find_components(__package__, + directory, + ForecastingLossComponents) +_addons = ThirdPartyComponents(ForecastingLossComponents) + + +class ForecastingLossChoices(autoPyTorchChoice): + """This 
class select the training loss + training loss can be one of the following choice: distriubtion (log_prob), regression and quantile (TODO) + each losses corresponds to a network output head: + DistributionHead (log_prob) + RegressionHead + + """ + def get_components(self) -> Dict[str, autoPyTorchComponent]: + """Returns the available optimizer components + + Args: + None + + Returns: + Dict[str, autoPyTorchComponent]: all BaseOptimizerComponents available + as choices + """ + components = OrderedDict() + components.update(_optimizers) + components.update(_addons.components) + return components + + def get_available_components( + self, + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + include: List[str] = None, + exclude: List[str] = None, + ) -> Dict[str, autoPyTorchComponent]: + """Filters out components based on user provided + include/exclude directives, as well as the dataset properties + + Args: + include (Optional[Dict[str, Any]]): + what hyper-parameter configurations to honor when creating the configuration space + exclude (Optional[Dict[str, Any]]): + what hyper-parameter configurations to remove from the configuration space + dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]): + Characteristics of the dataset to guide the pipeline choices of components + + Returns: + Dict[str, autoPyTorchComponent]: A filtered dict of learning + rate heads + + """ + if dataset_properties is None: + dataset_properties = {} + + if include is not None and exclude is not None: + raise ValueError( + "The argument include and exclude cannot be used together.") + + available_comp = self.get_components() + + if include is not None: + for incl in include: + if incl not in available_comp: + raise ValueError("Trying to include unknown component: " + "%s" % incl) + + components_dict = OrderedDict() + for name in available_comp: + if include is not None and name not in include: + continue + elif exclude is not None and name in exclude: + continue + + entry = available_comp[name] + + # Exclude itself to avoid infinite loop + if entry == ForecastingLossChoices or hasattr(entry, 'get_components'): + continue + + task_type_name = str(dataset_properties['task_type']) + properties = entry.get_properties() + if 'tabular' in task_type_name and not bool(properties['handles_tabular']): + continue + elif 'image' in task_type_name and not bool(properties['handles_image']): + continue + elif 'time_series' in task_type_name and not bool(properties['handles_time_series']): + continue + + task_type = STRING_TO_TASK_TYPES[task_type_name] + + if task_type in CLASSIFICATION_TASKS and not bool(properties['handles_classification']): + continue + elif task_type in [*REGRESSION_TASKS, *FORECASTING_TASKS] and not bool(properties['handles_regression']): + continue + + components_dict[name] = entry + return components_dict + + def get_hyperparameter_search_space( + self, + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + default: Optional[str] = None, + include: Optional[List[str]] = None, + exclude: Optional[List[str]] = None, + ) -> ConfigurationSpace: + """Returns the configuration space of the current chosen components + + Args: + dataset_properties (Optional[Dict[str, str]]): + Describes the dataset to work on + default (Optional[str]): + Default component to use + include: Optional[Dict[str, Any]]: + what components to include. It is an exhaustive list, and will exclusively use this components. 
+ exclude: Optional[Dict[str, Any]]: + which components to skip + + Returns: + ConfigurationSpace: + the configuration space of the hyper-parameters of the chosen component + """ + cs = ConfigurationSpace() + + if dataset_properties is None: + dataset_properties = {} + + # Compile a list of legal preprocessors for this problem + available_losses = self.get_available_components( + dataset_properties=dataset_properties, + include=include, exclude=exclude) + + if len(available_losses) == 0: + raise ValueError("No Loss found") + + if default is None: + defaults = [ + 'DistributionLoss', + 'RegressionLoss', + ] + for default_ in defaults: + if default_ in available_losses: + default = default_ + break + updates = self._get_search_space_updates() + if '__choice__' in updates.keys(): + choice_hyperparameter = updates['__choice__'] + if not set(choice_hyperparameter.value_range).issubset(available_losses): + raise ValueError("Expected given update for {} to have " + "choices in {} got {}".format(self.__class__.__name__, + available_losses, + choice_hyperparameter.value_range)) + optimizer = CSH.CategoricalHyperparameter('__choice__', + choice_hyperparameter.value_range, + default_value=choice_hyperparameter.default_value) + else: + optimizer = CSH.CategoricalHyperparameter( + '__choice__', + list(available_losses.keys()), + default_value=default + ) + cs.add_hyperparameter(optimizer) + for name in optimizer.choices: + updates = self._get_search_space_updates(prefix=name) + config_space = available_losses[name].get_hyperparameter_search_space(dataset_properties, # type: ignore + **updates) + parent_hyperparameter = {'parent': optimizer, 'value': name} + cs.add_configuration_space( + name, + config_space, + parent_hyperparameter=parent_hyperparameter + ) + + self.configuration_space_ = cs + self.dataset_properties_ = dataset_properties + return cs + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + assert self.choice is not None, "Cannot call transform before the object is initialized" + return self.choice.transform(X) # type: ignore[no-any-return] diff --git a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/base_forecasting_loss.py b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/base_forecasting_loss.py new file mode 100644 index 000000000..d5d4b36a2 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/base_forecasting_loss.py @@ -0,0 +1,27 @@ +from typing import Any, Callable, Dict, Optional + +from autoPyTorch.pipeline.components.base_component import autoPyTorchComponent +from autoPyTorch.utils.common import FitRequirement + + +class ForecastingLossComponents(autoPyTorchComponent): + _required_properties = ["name", "handles_tabular", "handles_image", "handles_time_series", + 'handles_regression', 'handles_classification'] + loss: Optional[Callable] = None + net_output_type: Optional[str] = None + + def __init__(self, + **kwargs: Any): + super().__init__() + self.add_fit_requirements([ + FitRequirement('task_type', (str,), user_defined=True, dataset_property=True), + ]) + + def fit(self, X: Dict[str, Any], y: Any = None) -> "autoPyTorchComponent": + self.check_requirements(X, y) + return self + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + X.update({"loss": self.loss, + 'net_output_type': self.net_output_type}) + return X diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py new file mode 100644 
index 000000000..fc7ac3ae1 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py @@ -0,0 +1,1292 @@ +import warnings +from abc import abstractmethod +from typing import Any, Dict, List, Optional, Tuple, Union + +import torch +from torch import nn +from torch.distributions import AffineTransform, TransformedDistribution + +from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.base_target_scaler import BaseTargetScaler +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.cells import ( + StackedDecoder, + StackedEncoder, + TemporalFusionLayer, + VariableSelector +) +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import \ + NetworkStructure +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.components import \ + DecoderBlockInfo +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.components import \ + EncoderBlockInfo +from autoPyTorch.pipeline.components.setup.network_embedding.NoEmbedding import \ + _NoEmbedding + +ALL_NET_OUTPUT = Union[torch.Tensor, List[torch.Tensor], torch.distributions.Distribution] + + +class TransformedDistribution_(TransformedDistribution): + """ + We implement the mean function such that we do not need to enquire base mean every time + """ + + @property + def mean(self) -> torch.Tensor: + mean = self.base_dist.mean + for transform in self.transforms: + mean = transform(mean) + return mean + + +def get_lagged_subsequences( + sequence: torch.Tensor, + subsequences_length: int, + lags_seq: Optional[List[int]] = None, + mask: Optional[torch.Tensor] = None +) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + """ + Returns lagged subsequences of a given sequence, this allows the model to receive the input from the past targets + outside the sliding windows. This implementation is similar to gluonTS's implementation + the only difference is that we pad the sequence that is not long enough + + Args: + sequence (torch.Tensor): + the sequence from which lagged subsequences should be extracted, Shape: (N, T, C). + subsequences_length (int): + length of the subsequences to be extracted. + lags_seq (Optional[List[int]]): + lags of the sequence, indicating the sequence that needs to be extracted + mask (Optional[torch.Tensor]): + a mask tensor indicating, it is a cached mask tensor that allows the model to quickly extract the desired + lagged values + + Returns: + lagged (Tensor) + A tensor of shape (N, S, I * C), where S = subsequences_length and I = len(indices), + containing lagged subsequences. 
+ mask (torch.Tensor): + cached mask + """ + batch_size = sequence.shape[0] + num_features = sequence.shape[2] + if mask is None: + if lags_seq is None: + warnings.warn('Neither lag_mask or lags_seq is given, we simply return the input value') + return sequence, None + # generate mask + num_lags = len(lags_seq) + + # build a mask + mask_length = max(lags_seq) + subsequences_length + mask = torch.zeros((num_lags, mask_length), dtype=torch.bool) + for i, lag_index in enumerate(lags_seq): + begin_index = -lag_index - subsequences_length + end_index = -lag_index if lag_index > 0 else None + mask[i, begin_index: end_index] = True + else: + num_lags = mask.shape[0] + mask_length = mask.shape[1] + + mask_extend = mask.clone() + + if mask_length > sequence.shape[1]: + sequence = torch.cat([sequence.new_zeros([batch_size, mask_length - sequence.shape[1], num_features]), + sequence], dim=1) + elif mask_length < sequence.shape[1]: + mask_extend = torch.cat([mask.new_zeros([num_lags, sequence.shape[1] - mask_length]), mask_extend], dim=1) + # (N, 1, T, C) + sequence = sequence.unsqueeze(1) + + # (I, T, 1) + mask_extend = mask_extend.unsqueeze(-1) + + # (N, I, S, C) + lagged_seq = torch.masked_select(sequence, mask_extend).reshape(batch_size, num_lags, subsequences_length, -1) + + lagged_seq = torch.transpose(lagged_seq, 1, 2).reshape(batch_size, subsequences_length, -1) + + return lagged_seq, mask + + +def get_lagged_subsequences_inference( + sequence: torch.Tensor, + subsequences_length: int, + lags_seq: List[int]) -> torch.Tensor: + """ + this function works exactly the same as get_lagged_subsequences. However, this implementation is faster when no + cached value is available, thus it is applied during inference times. + + Args: + sequence (torch.Tensor): + the sequence from which lagged subsequences should be extracted, Shape: (N, T, C). + subsequences_length (int): + length of the subsequences to be extracted. + lags_seq (Optional[List[int]]): + lags of the sequence, indicating the sequence that needs to be extracted + + Returns: + lagged (Tensor) + A tensor of shape (N, S, I * C), where S = subsequences_length and I = len(indices), + containing lagged subsequences. + """ + sequence_length = sequence.shape[1] + batch_size = sequence.shape[0] + lagged_values = [] + for lag_index in lags_seq: + begin_index = -lag_index - subsequences_length + end_index = -lag_index if lag_index > 0 else None + if end_index is not None and end_index < -sequence_length: + lagged_values.append(torch.zeros([batch_size, subsequences_length, *sequence.shape[2:]])) + continue + if begin_index < -sequence_length: + if end_index is not None: + pad_shape = [batch_size, subsequences_length - sequence_length - end_index, *sequence.shape[2:]] + lagged_values.append(torch.cat([torch.zeros(pad_shape), sequence[:, :end_index, ...]], dim=1)) + else: + pad_shape = [batch_size, subsequences_length - sequence_length, *sequence.shape[2:]] + lagged_values.append(torch.cat([torch.zeros(pad_shape), sequence], dim=1)) + continue + else: + lagged_values.append(sequence[:, begin_index:end_index, ...]) + + lagged_seq = torch.stack(lagged_values, -1).transpose(-1, -2).reshape(batch_size, subsequences_length, -1) + return lagged_seq + + +class AbstractForecastingNet(nn.Module): + """ + This is a basic forecasting network. It is only composed of a embedding net, an encoder and a head (including + MLP decoder and the final head). 
+ + This structure is active when the decoder is a MLP with auto_regressive set as false + + Attributes: + network_structure (NetworkStructure): + network structure information + network_embedding (nn.Module): + network embedding + network_encoder (Dict[str, EncoderBlockInfo]): + Encoder network, could be selected to return a sequence or a 2D Matrix + network_decoder (Dict[str, DecoderBlockInfo]): + network decoder + temporal_fusion Optional[TemporalFusionLayer]: + Temporal Fusion Layer + network_head (nn.Module): + network head, maps the output of decoder to the final output + dataset_properties (Dict): + dataset properties + auto_regressive (bool): + if the model is auto-regressive model + output_type (str): + the form that the network outputs. It could be regression, distribution or quantile + forecast_strategy (str): + only valid if output_type is distribution or quantile, how the network transforms + its output to predicted values, could be mean or sample + num_samples (int): + only valid if output_type is not regression and forecast_strategy is sample. This indicates the + number of the points to sample when doing prediction + aggregation (str): + only valid if output_type is not regression and forecast_strategy is sample. The way that the samples + are aggregated. We could take their mean or median values. + """ + future_target_required = False + dtype = torch.float + + def __init__(self, + network_structure: NetworkStructure, + network_embedding: nn.Module, + network_encoder: Dict[str, EncoderBlockInfo], + network_decoder: Dict[str, DecoderBlockInfo], + temporal_fusion: Optional[TemporalFusionLayer], + network_head: nn.Module, + window_size: int, + target_scaler: BaseTargetScaler, + dataset_properties: Dict, + auto_regressive: bool, + feature_names: Union[Tuple[str], Tuple[()]] = (), + known_future_features: Union[Tuple[str], Tuple[()]] = (), + feature_shapes: Dict[str, int] = {}, + static_features: Union[Tuple[str], Tuple[()]] = (), + time_feature_names: Union[Tuple[str], Tuple[()]] = (), + output_type: str = 'regression', + forecast_strategy: Optional[str] = 'mean', + num_samples: int = 50, + aggregation: str = 'mean' + ): + super().__init__() + self.network_structure = network_structure + self.embedding = network_embedding + if len(known_future_features) > 0: + known_future_features_idx = [feature_names.index(kff) for kff in known_future_features] + self.decoder_embedding = self.embedding.get_partial_models(known_future_features_idx) + else: + self.decoder_embedding = _NoEmbedding() + # modules that generate tensors while doing forward pass + self.lazy_modules = [] + if network_structure.variable_selection: + self.variable_selector = VariableSelector(network_structure=network_structure, + dataset_properties=dataset_properties, + network_encoder=network_encoder, + auto_regressive=auto_regressive, + feature_names=feature_names, + known_future_features=known_future_features, + feature_shapes=feature_shapes, + static_features=static_features, + time_feature_names=time_feature_names, + ) + self.lazy_modules.append(self.variable_selector) + has_temporal_fusion = network_structure.use_temporal_fusion + self.encoder = StackedEncoder(network_structure=network_structure, + has_temporal_fusion=has_temporal_fusion, + encoder_info=network_encoder, + decoder_info=network_decoder) + self.decoder = StackedDecoder(network_structure=network_structure, + encoder=self.encoder.encoder, + encoder_info=network_encoder, + decoder_info=network_decoder) + if has_temporal_fusion: + if temporal_fusion is 
None: + raise ValueError("When the network structure uses temporal fusion layer, " + "temporal_fusion must be given!") + self.temporal_fusion = temporal_fusion # type: TemporalFusionLayer + self.lazy_modules.append(self.temporal_fusion) + self.has_temporal_fusion = has_temporal_fusion + self.head = network_head + + first_decoder = 'block_0' + for i in range(1, network_structure.num_blocks + 1): + block_number = f'block_{i}' + if block_number in network_decoder: + if first_decoder == 'block_0': + first_decoder = block_number + + if first_decoder == 0: + raise ValueError("At least one decoder must be specified!") + + self.target_scaler = target_scaler + + self.n_prediction_steps = dataset_properties['n_prediction_steps'] # type: int + self.window_size = window_size + + self.output_type = output_type + self.forecast_strategy = forecast_strategy + self.num_samples = num_samples + self.aggregation = aggregation + + self._device = torch.device('cpu') + + if not network_structure.variable_selection: + self.encoder_lagged_input = network_encoder['block_1'].encoder_properties.lagged_input + self.decoder_lagged_input = network_decoder[first_decoder].decoder_properties.lagged_input + else: + self.encoder_lagged_input = False + self.decoder_lagged_input = False + + if self.encoder_lagged_input: + self.cached_lag_mask_encoder = None + self.encoder_lagged_value = network_encoder['block_1'].encoder.lagged_value + if self.decoder_lagged_input: + self.cached_lag_mask_decoder = None + self.decoder_lagged_value = network_decoder[first_decoder].decoder.lagged_value + + @property + def device(self) -> torch.device: + return self._device + + @device.setter + def device(self, device: torch.device) -> None: + self.to(device) + self._device = device + for model in self.lazy_modules: + model.device = device + + def rescale_output(self, + outputs: ALL_NET_OUTPUT, + loc: Optional[torch.Tensor], + scale: Optional[torch.Tensor], + device: torch.device = torch.device('cpu')) -> ALL_NET_OUTPUT: + """ + rescale the network output to its raw scale + + Args: + outputs (ALL_NET_OUTPUT): + network head output + loc (Optional[torch.Tensor]): + scaling location value + scale (Optional[torch.Tensor]): + scaling scale value + device (torch.device): + which device the output is stored + + Return: + ALL_NET_OUTPUT: + rescaleed network output + """ + if isinstance(outputs, List): + return [self.rescale_output(output, loc, scale, device) for output in outputs] + if loc is not None or scale is not None: + if isinstance(outputs, torch.distributions.Distribution): + transform = AffineTransform(loc=0.0 if loc is None else loc.to(device), + scale=1.0 if scale is None else scale.to(device), + ) + outputs = TransformedDistribution_(outputs, [transform]) + else: + if loc is None: + outputs = outputs * scale.to(device) # type: ignore[union-attr] + elif scale is None: + outputs = outputs + loc.to(device) + else: + outputs = outputs * scale.to(device) + loc.to(device) + return outputs + + def scale_value(self, + raw_value: torch.Tensor, + loc: Optional[torch.Tensor], + scale: Optional[torch.Tensor], + device: torch.device = torch.device('cpu')) -> torch.Tensor: + """ + scale the outputs + + Args: + raw_value (torch.Tensor): + network head output + loc (Optional[torch.Tensor]): + scaling location value + scale (Optional[torch.Tensor]): + scaling scale value + device (torch.device): + which device the output is stored + + Return: + torch.Tensor: + scaled input value + """ + if loc is not None or scale is not None: + if loc is None: + outputs = 
raw_value / scale.to(device) # type: ignore[union-attr] + elif scale is None: + outputs = raw_value - loc.to(device) + else: + outputs = (raw_value - loc.to(device)) / scale.to(device) + return outputs + + @abstractmethod + def forward(self, + past_targets: torch.Tensor, + future_targets: Optional[torch.Tensor] = None, + past_features: Optional[torch.Tensor] = None, + future_features: Optional[torch.Tensor] = None, + past_observed_targets: Optional[torch.BoolTensor] = None, + decoder_observed_values: Optional[torch.Tensor] = None, + ) -> ALL_NET_OUTPUT: + raise NotImplementedError + + @abstractmethod + def pred_from_net_output(self, net_output: ALL_NET_OUTPUT) -> torch.Tensor: + """ + This function is applied to transform the network head output to torch tensor to create the point prediction + + Args: + net_output (ALL_NET_OUTPUT): + network head output + + Return: + torch.Tensor: + point prediction + """ + raise NotImplementedError + + @abstractmethod + def predict(self, + past_targets: torch.Tensor, + past_features: Optional[torch.Tensor] = None, + future_features: Optional[torch.Tensor] = None, + past_observed_targets: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + raise NotImplementedError + + def repeat_intermediate_values(self, + intermediate_values: List[Optional[Union[torch.Tensor, Tuple[torch.Tensor]]]], + is_hidden_states: List[bool], + repeats: int) -> List[Optional[Union[torch.Tensor, Tuple[torch.Tensor]]]]: + """ + This function is often applied for auto-regressive model where we sample multiple points to form several + trajectories and we need to repeat the intermediate values to ensure that the batch sizes match + + Args: + intermediate_values (List[Optional[Union[torch.Tensor, Tuple[torch.Tensor]]]]) + a list of intermediate values to be repeated + is_hidden_states (List[bool]): + if the intermediate_value is hidden states in RNN-form network, we need to consider the + hidden states differently + repeats (int): + number of repeats + + Return: + List[Optional[Union[torch.Tensor, Tuple[torch.Tensor]]]]: + repeated values + """ + for i, (is_hx, inter_value) in enumerate(zip(is_hidden_states, intermediate_values)): + if isinstance(inter_value, torch.Tensor): + repeated_value = inter_value.repeat_interleave(repeats=repeats, dim=1 if is_hx else 0) + intermediate_values[i] = repeated_value + elif isinstance(inter_value, tuple): + dim = 1 if is_hx else 0 + repeated_value = tuple(hx.repeat_interleave(repeats=repeats, dim=dim) for hx in inter_value) + intermediate_values[i] = repeated_value + return intermediate_values + + def pad_tensor(self, tensor_to_be_padded: torch.Tensor, target_length: int) -> torch.Tensor: + """ + pad tensor to meet the required length + + Args: + tensor_to_be_padded (torch.Tensor) + tensor to be padded + target_length (int): + target length + + Return: + torch.Tensor: + padded tensors + """ + tensor_shape = tensor_to_be_padded.shape + padding_size = [tensor_shape[0], target_length - tensor_shape[1], tensor_shape[-1]] + tensor_to_be_padded = torch.cat([tensor_to_be_padded.new_zeros(padding_size), tensor_to_be_padded], dim=1) + return tensor_to_be_padded + + +class ForecastingNet(AbstractForecastingNet): + def pre_processing(self, + past_targets: torch.Tensor, + past_observed_targets: torch.BoolTensor, + past_features: Optional[torch.Tensor] = None, + future_features: Optional[torch.Tensor] = None, + length_past: int = 0, + length_future: int = 0, + variable_selector_kwargs: Dict = {}, + ) -> Tuple[torch.Tensor, ...]: + if self.encoder_lagged_input: + 
if self.window_size < past_targets.shape[1]: + past_targets[:, -self.window_size:], _, loc, scale = self.target_scaler( + past_targets[:, -self.window_size:], + past_observed_targets[:, -self.window_size:] + ) + past_targets[:, :-self.window_size] = torch.where( + past_observed_targets[:, :-self.window_size], + self.scale_value(past_targets[:, :-self.window_size], loc, scale), + past_targets[:, :-self.window_size]) + else: + past_targets, _, loc, scale = self.target_scaler( + past_targets, + past_observed_targets + ) + truncated_past_targets, self.cached_lag_mask_encoder = get_lagged_subsequences(past_targets, + self.window_size, + self.encoder_lagged_value, + self.cached_lag_mask_encoder) + else: + if self.window_size < past_targets.shape[1]: + past_targets = past_targets[:, -self.window_size:] + past_observed_targets = past_observed_targets[:, -self.window_size:] + past_targets, _, loc, scale = self.target_scaler(past_targets, past_observed_targets) + truncated_past_targets = past_targets + if past_features is not None: + if self.window_size <= past_features.shape[1]: + past_features = past_features[:, -self.window_size:] + elif self.encoder_lagged_input: + past_features = self.pad_tensor(past_features, self.window_size) + + if self.network_structure.variable_selection: + batch_size = truncated_past_targets.shape[0] + feat_dict_static = {} + if length_past > 0: + if past_features is not None: + past_features = self.embedding(past_features.to(self.device)) + feat_dict_past = {'past_targets': truncated_past_targets.to(device=self.device)} + + if past_features is not None: + for feature_name in self.variable_selector.feature_names: + tensor_idx = self.variable_selector.feature_names2tensor_idx[feature_name] + if feature_name not in self.variable_selector.static_features: + feat_dict_past[feature_name] = past_features[:, :, tensor_idx[0]: tensor_idx[1]] + else: + static_feature = past_features[:, 0, tensor_idx[0]: tensor_idx[1]] + feat_dict_static[feature_name] = static_feature + + if hasattr(self.variable_selector, 'placeholder_features'): + for placehold in self.variable_selector.placeholder_features: + feat_dict_past[placehold] = torch.zeros((batch_size, length_past, 1), + dtype=past_targets.dtype, + device=self.device) + else: + feat_dict_past = None # type: ignore[assignment] + if length_future > 0: + if future_features is not None: + future_features = self.decoder_embedding(future_features.to(self.device)) + feat_dict_future = {} + if hasattr(self.variable_selector, 'placeholder_features'): + for placehold in self.variable_selector.placeholder_features: + feat_dict_future[placehold] = torch.zeros((batch_size, + length_future, 1), + dtype=past_targets.dtype, + device=self.device) + if future_features is not None: + for feature_name in self.variable_selector.known_future_features: + tensor_idx = self.variable_selector.future_feature_name2tensor_idx[feature_name] + if feature_name not in self.variable_selector.static_features: + feat_dict_future[feature_name] = future_features[:, :, tensor_idx[0]: tensor_idx[1]] + else: + if length_past == 0: + # Otherwise static_feature is acquired when processing with encoder network + static_feature = future_features[:, 0, tensor_idx[0]: tensor_idx[1]] + feat_dict_static[feature_name] = static_feature + + else: + feat_dict_future = None # type: ignore[assignment] + + x_past, x_future, x_static, static_context_initial_hidden = self.variable_selector( + x_past=feat_dict_past, + x_future=feat_dict_future, + x_static=feat_dict_static, + 
batch_size=batch_size, + length_past=length_past, + length_future=length_future, + **variable_selector_kwargs + ) + + return x_past, x_future, x_static, loc, scale, static_context_initial_hidden, past_targets + else: + if past_features is not None: + x_past = torch.cat([truncated_past_targets, past_features], dim=-1).to(device=self.device) + x_past = self.embedding(x_past.to(device=self.device)) + else: + x_past = self.embedding(truncated_past_targets.to(device=self.device)) + if future_features is not None and length_future > 0: + future_features = self.decoder_embedding(future_features.to(self.device)) + return x_past, future_features, None, loc, scale, None, past_targets + + def forward(self, + past_targets: torch.Tensor, + future_targets: Optional[torch.Tensor] = None, + past_features: Optional[torch.Tensor] = None, + future_features: Optional[torch.Tensor] = None, + past_observed_targets: Optional[torch.BoolTensor] = None, + decoder_observed_values: Optional[torch.Tensor] = None, + ) -> ALL_NET_OUTPUT: + x_past, x_future, x_static, loc, scale, static_context_initial_hidden, _ = self.pre_processing( + past_targets=past_targets, + past_observed_targets=past_observed_targets, + past_features=past_features, + future_features=future_features, + length_past=min(self.window_size, past_targets.shape[1]), + length_future=self.n_prediction_steps + ) + + encoder_additional = [static_context_initial_hidden] + encoder_additional.extend([None] * (self.network_structure.num_blocks - 1)) + + encoder2decoder, encoder_output = self.encoder(encoder_input=x_past, additional_input=encoder_additional) + + decoder_output = self.decoder(x_future=x_future, encoder_output=encoder2decoder, + pos_idx=(x_past.shape[1], x_past.shape[1] + self.n_prediction_steps)) + + if self.has_temporal_fusion: + decoder_output = self.temporal_fusion(encoder_output=encoder_output, + decoder_output=decoder_output, + past_observed_targets=past_observed_targets, + decoder_length=self.n_prediction_steps, + static_embedding=x_static + ) + + output = self.head(decoder_output) + + return self.rescale_output(output, loc, scale, self.device) + + def pred_from_net_output(self, net_output: ALL_NET_OUTPUT) -> torch.Tensor: + if self.output_type == 'regression': + return net_output + elif self.output_type == 'quantile': + return net_output[0] + elif self.output_type == 'distribution': + if self.forecast_strategy == 'mean': + if isinstance(net_output, list): + return torch.cat([dist.mean for dist in net_output], dim=-2) + else: + return net_output.mean + elif self.forecast_strategy == 'sample': + if isinstance(net_output, list): + samples = torch.cat([dist.sample((self.num_samples,)) for dist in net_output], dim=-2) + else: + samples = net_output.sample((self.num_samples,)) + if self.aggregation == 'mean': + return torch.mean(samples, dim=0) + elif self.aggregation == 'median': + return torch.median(samples, 0)[0] + else: + raise NotImplementedError(f'Unknown aggregation: {self.aggregation}') + else: + raise NotImplementedError(f'Unknown forecast_strategy: {self.forecast_strategy}') + else: + raise NotImplementedError(f'Unknown output_type: {self.output_type}') + + def predict(self, + past_targets: torch.Tensor, + past_features: Optional[torch.Tensor] = None, + future_features: Optional[torch.Tensor] = None, + past_observed_targets: Optional[torch.BoolTensor] = None, + ) -> torch.Tensor: + net_output = self(past_targets=past_targets, + past_features=past_features, + future_features=future_features, + 
past_observed_targets=past_observed_targets) + return self.pred_from_net_output(net_output) + + +class ForecastingSeq2SeqNet(ForecastingNet): + future_target_required = True + """ + Forecasting network with Seq2Seq structure, Encoder/ Decoder need to be the same recurrent models while + + This structure is activate when the decoder is recurrent (RNN or transformer). + We train the network with teacher forcing, thus + future_targets is required for the network. To train the network, past targets and past features are fed to the + encoder to obtain the hidden states whereas future targets and future features. + When the output type is distribution and forecast_strategy is sampling, + this model is equivalent to a deepAR model during inference. + """ + + def decoder_select_variable(self, future_targets: torch.tensor, + future_features: Optional[torch.Tensor]) -> torch.Tensor: + batch_size = future_targets.shape[0] + length_future = future_targets.shape[1] + future_targets = future_targets.to(self.device) + if future_features is not None: + future_features = self.decoder_embedding(future_features.to(self.device)) + feat_dict_future = {} + if hasattr(self.variable_selector, 'placeholder_features'): + for placeholder in self.variable_selector.placeholder_features: + feat_dict_future[placeholder] = torch.zeros((batch_size, + length_future, 1), + dtype=future_targets.dtype, + device=self.device) + + for feature_name in self.variable_selector.known_future_features: + tensor_idx = self.variable_selector.future_feature_name2tensor_idx[feature_name] + if feature_name not in self.variable_selector.static_features: + feat_dict_future[feature_name] = future_features[:, :, tensor_idx[0]: tensor_idx[1]] + + feat_dict_future['future_prediction'] = future_targets + _, x_future, _, _ = self.variable_selector(x_past=None, + x_future=feat_dict_future, + x_static=None, + length_past=0, + length_future=length_future, + batch_size=batch_size, + use_cached_static_contex=True + ) + return x_future + + def forward(self, + past_targets: torch.Tensor, + future_targets: Optional[torch.Tensor] = None, + past_features: Optional[torch.Tensor] = None, + future_features: Optional[torch.Tensor] = None, + past_observed_targets: Optional[torch.BoolTensor] = None, + decoder_observed_values: Optional[torch.Tensor] = None, ) -> ALL_NET_OUTPUT: + x_past, _, x_static, loc, scale, static_context_initial_hidden, past_targets = self.pre_processing( + past_targets=past_targets, + past_observed_targets=past_observed_targets, + past_features=past_features, + future_features=future_features, + length_past=min(self.window_size, past_targets.shape[1]), + length_future=0, + variable_selector_kwargs={'cache_static_contex': True} + ) + encoder_additional = [static_context_initial_hidden] + encoder_additional.extend([None] * (self.network_structure.num_blocks - 1)) + + if self.training: + future_targets = self.scale_value(future_targets, loc, scale) + # we do one step ahead forecasting + if self.decoder_lagged_input: + future_targets = torch.cat([past_targets, future_targets[:, :-1, :]], dim=1) + future_targets, self.cached_lag_mask_decoder = get_lagged_subsequences(future_targets, + self.n_prediction_steps, + self.decoder_lagged_value, + self.cached_lag_mask_decoder) + else: + future_targets = torch.cat([past_targets[:, [-1], :], future_targets[:, :-1, :]], dim=1) + + if self.network_structure.variable_selection: + decoder_input = self.decoder_select_variable(future_targets, future_features) + else: + decoder_input = future_targets if 
future_features is None else torch.cat([future_features, + future_targets], dim=-1) + decoder_input = decoder_input.to(self.device) + decoder_input = self.decoder_embedding(decoder_input) + + encoder2decoder, encoder_output = self.encoder(encoder_input=x_past, + additional_input=encoder_additional) + + decoder_output = self.decoder(x_future=decoder_input, encoder_output=encoder2decoder, + pos_idx=(x_past.shape[1], x_past.shape[1] + self.n_prediction_steps)) + + if self.has_temporal_fusion: + decoder_output = self.temporal_fusion(encoder_output=encoder_output, + decoder_output=decoder_output, + past_observed_targets=past_observed_targets, + decoder_length=self.n_prediction_steps, + static_embedding=x_static + ) + net_output = self.head(decoder_output) + + return self.rescale_output(net_output, loc, scale, self.device) + else: + encoder2decoder, encoder_output = self.encoder(encoder_input=x_past, additional_input=encoder_additional) + + if self.has_temporal_fusion: + decoder_output_all: Optional[torch.Tensor] = None + + if self.forecast_strategy != 'sample': + all_predictions = [] + predicted_target = past_targets[:, [-1]] + past_targets = past_targets[:, :-1] + for idx_pred in range(self.n_prediction_steps): + predicted_target = predicted_target.cpu() + if self.decoder_lagged_input: + past_targets = torch.cat([past_targets, predicted_target], dim=1) + ar_future_target = get_lagged_subsequences_inference(past_targets, 1, + self.decoder_lagged_value) + else: + ar_future_target = predicted_target[:, [-1]] + + if self.network_structure.variable_selection: + decoder_input = self.decoder_select_variable( + future_targets=predicted_target[:, -1:].to(self.device), + future_features=future_features[:, [idx_pred]] if future_features is not None else None + ) + else: + decoder_input = ar_future_target if future_features is None else torch.cat( + [future_features[:, [idx_pred]], + ar_future_target, + ], + dim=-1) + decoder_input = decoder_input.to(self.device) + decoder_input = self.decoder_embedding(decoder_input) + + decoder_output = self.decoder(decoder_input, + encoder_output=encoder2decoder, + pos_idx=(x_past.shape[1] + idx_pred, x_past.shape[1] + idx_pred + 1), + cache_intermediate_state=True, + incremental_update=idx_pred > 0) + + if self.has_temporal_fusion: + if decoder_output_all is not None: + decoder_output_all = torch.cat([decoder_output_all, decoder_output], dim=1) + else: + decoder_output_all = decoder_output + decoder_output = self.temporal_fusion(encoder_output=encoder_output, + decoder_output=decoder_output_all, + past_observed_targets=past_observed_targets, + decoder_length=idx_pred + 1, + static_embedding=x_static + )[:, -1:] + + net_output = self.head(decoder_output) + predicted_target = torch.cat([predicted_target, self.pred_from_net_output(net_output).cpu()], + dim=1) + + all_predictions.append(net_output) + + if self.output_type == 'regression': + all_predictions = torch.cat(all_predictions, dim=1) + elif self.output_type == 'quantile': + all_predictions = torch.cat([self.pred_from_net_output(pred) for pred in all_predictions], dim=1) + else: + all_predictions = self.pred_from_net_output(all_predictions) + + return self.rescale_output(all_predictions, loc, scale, self.device) + + else: + # we follow the DeepAR implementation: + batch_size = past_targets.shape[0] + + encoder2decoder = self.repeat_intermediate_values( + encoder2decoder, + is_hidden_states=self.encoder.encoder_has_hidden_states, + repeats=self.num_samples) + + if self.has_temporal_fusion: + intermediate_values = 
self.repeat_intermediate_values([encoder_output, past_observed_targets], + is_hidden_states=[False, False], + repeats=self.num_samples) + + encoder_output = intermediate_values[0] + past_observed_targets = intermediate_values[1] + + if self.decoder_lagged_input: + max_lag_seq_length = max(self.decoder_lagged_value) + 1 + else: + max_lag_seq_length = 1 + self.window_size + repeated_past_target = past_targets[:, -max_lag_seq_length:].repeat_interleave(repeats=self.num_samples, + dim=0).squeeze(1) + repeated_predicted_target = repeated_past_target[:, [-1]] + repeated_past_target = repeated_past_target[:, :-1, ] + + repeated_x_static = x_static.repeat_interleave( + repeats=self.num_samples, dim=0 + ) if x_static is not None else None + + repeated_future_features = future_features.repeat_interleave( + repeats=self.num_samples, dim=0 + ) if future_features is not None else None + + if self.network_structure.variable_selection: + self.variable_selector.cached_static_contex = self.repeat_intermediate_values( + [self.variable_selector.cached_static_contex], + is_hidden_states=[False], + repeats=self.num_samples + )[0] + + for idx_pred in range(self.n_prediction_steps): + if self.decoder_lagged_input: + ar_future_target = torch.cat([repeated_past_target, repeated_predicted_target.cpu()], dim=1) + ar_future_target = get_lagged_subsequences_inference(ar_future_target, 1, + self.decoder_lagged_value) + else: + ar_future_target = repeated_predicted_target[:, [-1]] + + if self.network_structure.variable_selection: + decoder_input = self.decoder_select_variable( + future_targets=ar_future_target, + future_features=None if repeated_future_features is None else + repeated_future_features[:, [idx_pred]]) + else: + decoder_input = ar_future_target if repeated_future_features is None else torch.cat( + [repeated_future_features[:, [idx_pred], :], ar_future_target], dim=-1) + + decoder_input = decoder_input.to(self.device) + decoder_input = self.decoder_embedding(decoder_input) + + decoder_output = self.decoder(decoder_input, + encoder_output=encoder2decoder, + pos_idx=(x_past.shape[1] + idx_pred, x_past.shape[1] + idx_pred + 1), + cache_intermediate_state=True, + incremental_update=idx_pred > 0) + + if self.has_temporal_fusion: + if decoder_output_all is not None: + decoder_output_all = torch.cat([decoder_output_all, decoder_output], dim=1) + else: + decoder_output_all = decoder_output + decoder_output = self.temporal_fusion(encoder_output=encoder_output, + decoder_output=decoder_output_all, + past_observed_targets=past_observed_targets, + decoder_length=idx_pred + 1, + static_embedding=repeated_x_static, + )[:, -1:, ] + + net_output = self.head(decoder_output) + samples = net_output.sample().cpu() + + repeated_predicted_target = torch.cat([repeated_predicted_target, + samples], + dim=1) + + all_predictions = repeated_predicted_target[:, 1:].unflatten(0, (batch_size, self.num_samples)) + + if self.aggregation == 'mean': + return self.rescale_output(torch.mean(all_predictions, dim=1), loc, scale) + elif self.aggregation == 'median': + return self.rescale_output(torch.median(all_predictions, dim=1)[0], loc, scale) + else: + raise ValueError(f'Unknown aggregation: {self.aggregation}') + + def predict(self, + past_targets: torch.Tensor, + past_features: Optional[torch.Tensor] = None, + future_features: Optional[torch.Tensor] = None, + past_observed_targets: Optional[torch.BoolTensor] = None, + ) -> torch.Tensor: + net_output = self(past_targets=past_targets, + past_features=past_features, + 
future_features=future_features, + past_observed_targets=past_observed_targets) + if self.output_type == 'regression': + return self.pred_from_net_output(net_output) + else: + return net_output + + +class ForecastingDeepARNet(ForecastingSeq2SeqNet): + future_target_required = True + + def __init__(self, + **kwargs: Any): + """ + Forecasting network with DeepAR structure. + + This structure is activate when the decoder is not recurrent (MLP) and its hyperparameter "auto_regressive" is + set as True. We train the network to let it do a one-step prediction. This structure is compatible with any + sorts of encoder (except MLP). + """ + super(ForecastingDeepARNet, self).__init__(**kwargs) + # this determines the training targets + self.encoder_bijective_seq_output = kwargs['network_encoder']['block_1'].encoder_properties.bijective_seq_output + + self.cached_lag_mask_encoder_test = None + self.only_generate_future_dist = False + + def train(self, mode: bool = True) -> nn.Module: + self.only_generate_future_dist = False + return super().train(mode=mode) + + def encoder_select_variable(self, past_targets: torch.tensor, past_features: Optional[torch.Tensor], + length_past: int, + **variable_selector_kwargs: Any) -> Tuple[torch.Tensor, Tuple[torch.Tensor]]: + batch_size = past_targets.shape[0] + past_targets = past_targets.to(self.device) + if past_features is not None: + past_features = past_features.to(self.device) + past_features = self.embedding(past_features) + feat_dict_past = {'past_targets': past_targets.to(device=self.device)} + feat_dict_static = {} + if hasattr(self.variable_selector, 'placeholder_features'): + for placehold in self.variable_selector.placeholder_features: + feat_dict_past[placehold] = torch.zeros((batch_size, length_past, 1), + dtype=past_targets.dtype, + device=self.device) + + for feature_name in self.variable_selector.feature_names: + tensor_idx = self.variable_selector.feature_names2tensor_idx[feature_name] + if feature_name not in self.variable_selector.static_features: + feat_dict_past[feature_name] = past_features[:, :, tensor_idx[0]: tensor_idx[1]] + else: + static_feature = past_features[:, 0, tensor_idx[0]: tensor_idx[1]] + feat_dict_static[feature_name] = static_feature + + x_past, _, _, static_context_initial_hidden = self.variable_selector(x_past=feat_dict_past, + x_future=None, + x_static=feat_dict_static, + length_past=length_past, + length_future=0, + batch_size=batch_size, + **variable_selector_kwargs, + ) + return x_past, static_context_initial_hidden + + def forward(self, + past_targets: torch.Tensor, + future_targets: Optional[torch.Tensor] = None, + past_features: Optional[torch.Tensor] = None, + future_features: Optional[torch.Tensor] = None, + past_observed_targets: Optional[torch.BoolTensor] = None, + decoder_observed_values: Optional[torch.Tensor] = None, ) -> ALL_NET_OUTPUT: + encode_length = min(self.window_size, past_targets.shape[1]) + + if past_observed_targets is None: + past_observed_targets = torch.ones_like(past_targets, dtype=torch.bool) + + if self.training: + if self.encoder_lagged_input: + if self.window_size < past_targets.shape[1]: + past_targets[:, -self.window_size:], _, loc, scale = self.target_scaler( + past_targets[:, -self.window_size:], + past_observed_targets[:, -self.window_size:] + ) + + past_targets[:, :-self.window_size] = torch.where( + past_observed_targets[:, :-self.window_size], + self.scale_value(past_targets[:, :-self.window_size], loc, scale), + past_targets[:, :-self.window_size]) + else: + past_targets, _, loc, 
scale = self.target_scaler( + past_targets, + past_observed_targets + ) + + future_targets = self.scale_value(future_targets, loc, scale) + + targets_all = torch.cat([past_targets, future_targets[:, :-1]], dim=1) + seq_length = self.window_size + self.n_prediction_steps + targets_all, self.cached_lag_mask_encoder = get_lagged_subsequences(targets_all, + seq_length - 1, + self.encoder_lagged_value, + self.cached_lag_mask_encoder) + targets_all = targets_all[:, -(encode_length + self.n_prediction_steps - 1):] + else: + if self.window_size < past_targets.shape[1]: + past_targets = past_targets[:, -self.window_size:] + past_observed_targets = past_observed_targets[:, -self.window_size:] + past_targets, _, loc, scale = self.target_scaler(past_targets, past_observed_targets) + future_targets = self.scale_value(future_targets, loc, scale) + targets_all = torch.cat([past_targets, future_targets[:, :-1]], dim=1) + + if self.network_structure.variable_selection: + if past_features is not None: + assert future_features is not None + past_features = past_features[:, -self.window_size:] + features_all = torch.cat([past_features, future_features[:, :-1]], dim=1) + else: + features_all = None + length_past = min(self.window_size, past_targets.shape[1]) + self.n_prediction_steps - 1 + encoder_input, static_context_initial_hidden = self.encoder_select_variable(targets_all, + past_features=features_all, + length_past=length_past) + else: + if past_features is not None: + assert future_features is not None + if self.window_size <= past_features.shape[1]: + past_features = past_features[:, -self.window_size:] + + features_all = torch.cat([past_features, future_features[:, :-1]], dim=1) + encoder_input = torch.cat([features_all, targets_all], dim=-1) + else: + encoder_input = targets_all + + encoder_input = encoder_input.to(self.device) + + encoder_input = self.embedding(encoder_input) + static_context_initial_hidden = None # type: ignore[assignment] + + encoder_additional: List[Optional[torch.Tensor]] = [static_context_initial_hidden] + encoder_additional.extend([None] * (self.network_structure.num_blocks - 1)) + + encoder2decoder, encoder_output = self.encoder(encoder_input=encoder_input, + additional_input=encoder_additional, + output_seq=True) + + if self.only_generate_future_dist: + # DeepAR only receives the output of the last encoder + encoder2decoder = [encoder2decoder[-1][:, -self.n_prediction_steps:]] + net_output = self.head(self.decoder(x_future=None, encoder_output=encoder2decoder)) + # DeepAR does not allow tf layers + return self.rescale_output(net_output, loc, scale, self.device) + else: + if self.encoder_lagged_input: + if self.window_size < past_targets.shape[1]: + past_targets[:, -self.window_size:], _, loc, scale = self.target_scaler( + past_targets[:, -self.window_size:], + past_observed_targets[:, -self.window_size:], + ) + + past_targets[:, :-self.window_size] = torch.where( + past_observed_targets[:, :-self.window_size], + self.scale_value(past_targets[:, :-self.window_size], loc, scale), + past_targets[:, :-self.window_size]) + else: + past_targets, _, loc, scale = self.target_scaler( + past_targets, + past_observed_targets, + ) + + truncated_past_targets, self.cached_lag_mask_encoder_test = get_lagged_subsequences( + past_targets, + self.window_size, + self.encoder_lagged_value, + self.cached_lag_mask_encoder_test + ) + truncated_past_targets = truncated_past_targets[:, -encode_length:] + else: + if self.window_size < past_targets.shape[1]: + past_targets = past_targets[:, 
-self.window_size:] + past_observed_targets = past_observed_targets[:, -self.window_size] + past_targets, _, loc, scale = self.target_scaler(past_targets, past_observed_targets) + truncated_past_targets = past_targets + + if self.network_structure.variable_selection: + if past_features is not None: + features_all = past_features[:, -self.window_size:] + else: + features_all = None + variable_selector_kwargs = dict(cache_static_contex=True, + use_cached_static_contex=False) + + encoder_input, static_context_initial_hidden = self.encoder_select_variable(truncated_past_targets, + past_features=features_all, + length_past=encode_length, + **variable_selector_kwargs) + + else: + if past_features is not None: + assert future_features is not None + features_all = torch.cat([past_features[:, -encode_length:], future_features[:, :-1]], dim=1) + else: + features_all = None + + encoder_input = truncated_past_targets if features_all is None else torch.cat( + [features_all[:, :encode_length], truncated_past_targets], dim=-1 + ) + + encoder_input = encoder_input.to(self.device) + encoder_input = self.embedding(encoder_input) + static_context_initial_hidden = None # type: ignore[assignment] + + all_samples = [] + batch_size: int = past_targets.shape[0] + + encoder_additional: List[Optional[torch.Tensor]] = [static_context_initial_hidden] # type: ignore[no-redef] + encoder_additional.extend([None] * (self.network_structure.num_blocks - 1)) + + encoder2decoder, encoder_output = self.encoder(encoder_input=encoder_input, + additional_input=encoder_additional, + cache_intermediate_state=True, + ) + + self.encoder.cached_intermediate_state = self.repeat_intermediate_values( + self.encoder.cached_intermediate_state, + is_hidden_states=self.encoder.encoder_has_hidden_states, + repeats=self.num_samples) + + if self.network_structure.variable_selection: + self.variable_selector.cached_static_contex = self.repeat_intermediate_values( + [self.variable_selector.cached_static_contex], + is_hidden_states=[False], + repeats=self.num_samples)[0] + + if self.encoder_lagged_input: + max_lag_seq_length = max(max(self.encoder_lagged_value), encode_length) + else: + max_lag_seq_length = encode_length + + net_output = self.head(self.decoder(x_future=None, encoder_output=encoder2decoder)) + + next_sample = net_output.sample(sample_shape=(self.num_samples,)) + + next_sample = next_sample.transpose(0, 1).reshape( + (next_sample.shape[0] * next_sample.shape[1], 1, -1) + ).cpu() + + all_samples.append(next_sample) + + # TODO considering padding targets here instead of inside get_lagged function + if self.n_prediction_steps > 1: + repeated_past_target = past_targets[:, -max_lag_seq_length:, ].repeat_interleave( + repeats=self.num_samples, + dim=0).squeeze(1) + + if future_features is not None: + future_features = future_features[:, 1:] + else: + future_features = None + + repeated_future_features = future_features.repeat_interleave( + repeats=self.num_samples, dim=0 + ) if future_features is not None else None + + for k in range(1, self.n_prediction_steps): + if self.encoder_lagged_input: + repeated_past_target = torch.cat([repeated_past_target, all_samples[-1]], dim=1) + ar_future_target = get_lagged_subsequences_inference(repeated_past_target, 1, + self.encoder_lagged_value) + else: + ar_future_target = next_sample + + if self.network_structure.variable_selection: + length_past = 1 + variable_selector_kwargs = dict(use_cached_static_contex=True) + if repeated_future_features is not None: + feature_next = 
repeated_future_features[:, [k - 1]] + else: + feature_next = None + encoder_input, _ = self.encoder_select_variable(ar_future_target, past_features=feature_next, + length_past=1, + **variable_selector_kwargs) + + else: + if repeated_future_features is not None: + encoder_input = torch.cat([repeated_future_features[:, [k - 1]], ar_future_target], dim=-1) + else: + encoder_input = ar_future_target + encoder_input = encoder_input.to(self.device) + encoder_input = self.embedding(encoder_input) + + encoder2decoder, _ = self.encoder(encoder_input=encoder_input, + additional_input=[None] * self.network_structure.num_blocks, + output_seq=False, cache_intermediate_state=True, + incremental_update=True) + + net_output = self.head(self.decoder(x_future=None, encoder_output=encoder2decoder)) + + next_sample = net_output.sample().cpu() + all_samples.append(next_sample) + + all_predictions = torch.cat(all_samples, dim=1).unflatten(0, (batch_size, self.num_samples)) + + if not self.output_type == 'distribution' and self.forecast_strategy == 'sample': + raise ValueError( + f"A DeepAR network must have output type as Distribution and forecast_strategy as sample," + f"but this network has {self.output_type} and {self.forecast_strategy}") + if self.aggregation == 'mean': + return self.rescale_output(torch.mean(all_predictions, dim=1), loc, scale) + elif self.aggregation == 'median': + return self.rescale_output(torch.median(all_predictions, dim=1)[0], loc, scale) + else: + raise ValueError(f'Unknown aggregation: {self.aggregation}') + + def predict(self, + past_targets: torch.Tensor, + past_features: Optional[torch.Tensor] = None, + future_features: Optional[torch.Tensor] = None, + past_observed_targets: Optional[torch.BoolTensor] = None, + ) -> torch.Tensor: + net_output = self(past_targets=past_targets, + past_features=past_features, + future_features=future_features, + past_observed_targets=past_observed_targets) + return net_output + + +class NBEATSNet(ForecastingNet): + future_target_required = False + + def forward(self, # type: ignore[override] + past_targets: torch.Tensor, + future_targets: Optional[torch.Tensor] = None, + past_features: Optional[torch.Tensor] = None, + future_features: Optional[torch.Tensor] = None, + past_observed_targets: Optional[torch.BoolTensor] = None, + decoder_observed_values: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, + Tuple[torch.Tensor, torch.Tensor]]: + + # Unlike other networks, NBEATS network is required to predict both past and future targets. 
+ # Thereby, we return two tensors for backcast and forecast + if past_observed_targets is None: + past_observed_targets = torch.ones_like(past_targets, dtype=torch.bool) + + if self.window_size <= past_targets.shape[1]: + past_targets = past_targets[:, -self.window_size:] + past_observed_targets = past_observed_targets[:, -self.window_size:] + else: + past_targets = self.pad_tensor(past_targets, self.window_size) + + past_targets, _, loc, scale = self.target_scaler(past_targets, past_observed_targets) + + past_targets = past_targets.to(self.device) + + batch_size = past_targets.shape[0] + output_shape = past_targets.shape[2:] + forcast_shape = [batch_size, self.n_prediction_steps, *output_shape] + + forecast = torch.zeros(forcast_shape).to(self.device).flatten(1) + backcast, _ = self.encoder(past_targets, [None]) + backcast = backcast[0] + # nbeats network only has one decoder block (flat decoder) + for block in self.decoder.decoder['block_1']: + backcast_block, forecast_block = block([None], backcast) + + backcast = backcast - backcast_block + forecast = forecast + forecast_block + backcast = backcast.reshape(past_targets.shape) + forecast = forecast.reshape(forcast_shape) + + forecast = self.rescale_output(forecast, loc, scale, self.device) + if self.training: + backcast = self.rescale_output(backcast, loc, scale, self.device) + return backcast, forecast + else: + return forecast + + def pred_from_net_output(self, net_output: torch.Tensor) -> torch.Tensor: + return net_output diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py new file mode 100644 index 000000000..2750348a5 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -0,0 +1,159 @@ +from typing import Any, Dict, Iterable, Optional + +import numpy as np + +import torch +from torch import nn + +from autoPyTorch.constants import CLASSIFICATION_TASKS, STRING_TO_TASK_TYPES +from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.base_target_scaler import BaseTargetScaler +from autoPyTorch.pipeline.components.setup.network.base_network import NetworkComponent +from autoPyTorch.pipeline.components.setup.network.forecasting_architecture import ( + ForecastingDeepARNet, + ForecastingNet, + ForecastingSeq2SeqNet, + NBEATSNet +) +from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distribution import \ + DisForecastingStrategy +from autoPyTorch.pipeline.components.training.base_training import autoPyTorchTrainingComponent +from autoPyTorch.utils.common import ( + FitRequirement, + get_device_from_fit_dictionary +) + + +class ForecastingNetworkComponent(NetworkComponent): + def __init__( + self, + network: Optional[torch.nn.Module] = None, + random_state: Optional[np.random.RandomState] = None, + ) -> None: + super(ForecastingNetworkComponent, self).__init__(network=network, random_state=random_state) + self._fit_requirements.clear() + self.add_fit_requirements([ + FitRequirement('dataset_properties', (Dict,), user_defined=False, dataset_property=True), + FitRequirement('window_size', (int,), user_defined=False, dataset_property=False), + FitRequirement('network_structure', (Dict,), user_defined=False, dataset_property=False), + FitRequirement("network_embedding", (torch.nn.Module,), user_defined=False, dataset_property=False), + FitRequirement("network_encoder", (Dict,), user_defined=False, + dataset_property=False), + FitRequirement("network_decoder", 
(Dict,), user_defined=False, + dataset_property=False), + FitRequirement("network_head", (Optional[torch.nn.Module],), user_defined=False, dataset_property=False), + FitRequirement("auto_regressive", (bool,), user_defined=False, dataset_property=False), + FitRequirement("target_scaler", (BaseTargetScaler,), user_defined=False, dataset_property=False), + FitRequirement("net_output_type", (str,), user_defined=False, dataset_property=False), + FitRequirement("feature_names", (Iterable,), user_defined=False, dataset_property=True), + FitRequirement("feature_shapes", (Iterable,), user_defined=False, dataset_property=True), + FitRequirement('transform_time_features', (bool,), user_defined=False, dataset_property=False), + FitRequirement('static_features', (tuple,), user_defined=True, dataset_property=True), + FitRequirement('time_feature_names', (Iterable,), user_defined=True, dataset_property=True), + ]) + + def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchTrainingComponent: + # Make sure that input dictionary X has the required + # information to fit this stage + self.check_requirements(X, y) + + network_structure = X['network_structure'] + network_encoder = X['network_encoder'] + network_decoder = X['network_decoder'] + + net_output_type = X['net_output_type'] + + feature_names = X['dataset_properties']['feature_names'] + feature_shapes = X['dataset_properties']['feature_shapes'] + transform_time_features = X['transform_time_features'] + known_future_features = X['dataset_properties']['known_future_features'] + if transform_time_features: + time_feature_names = X['dataset_properties']['time_feature_names'] + else: + time_feature_names = () + + network_init_kwargs = dict(network_structure=network_structure, + network_embedding=X['network_embedding'], + network_encoder=network_encoder, + network_decoder=network_decoder, + temporal_fusion=X.get("temporal_fusion", None), + network_head=X['network_head'], + auto_regressive=X['auto_regressive'], + window_size=X['window_size'], + dataset_properties=X['dataset_properties'], + target_scaler=X['target_scaler'], + output_type=net_output_type, + feature_names=feature_names, + feature_shapes=feature_shapes, + known_future_features=known_future_features, + time_feature_names=time_feature_names, + static_features=X['dataset_properties']['static_features'] + ) + if net_output_type == 'distribution': + dist_forecasting_strategy = X['dist_forecasting_strategy'] # type: DisForecastingStrategy + + network_init_kwargs.update(dict(forecast_strategy=dist_forecasting_strategy.forecast_strategy, + num_samples=dist_forecasting_strategy.num_samples, + aggregation=dist_forecasting_strategy.aggregation, )) + + if X['auto_regressive']: + first_decoder = next(iter(network_decoder.items()))[1] + if first_decoder.decoder_properties.recurrent: + self.network = ForecastingSeq2SeqNet(**network_init_kwargs) + else: + self.network = ForecastingDeepARNet(**network_init_kwargs) + else: + first_decoder = next(iter(network_decoder.items()))[1] + if first_decoder.decoder_properties.multi_blocks: + self.network = NBEATSNet(**network_init_kwargs) + else: + self.network = ForecastingNet(**network_init_kwargs) + + # Properly set the network training device + if self.device is None: + self.device = get_device_from_fit_dictionary(X) + + self.to(self.device) + + if STRING_TO_TASK_TYPES[X['dataset_properties']['task_type']] in CLASSIFICATION_TASKS: + self.final_activation = nn.Softmax(dim=1) + + self.is_fitted_ = True + + return self + + def predict(self, loader: 
torch.utils.data.DataLoader) -> torch.Tensor: + """ + Performs batched prediction given a loader object + """ + assert self.network is not None + self.network.eval() + + # Batch prediction + Y_batch_preds = list() + + for i, (X_batch, Y_batch) in enumerate(loader): + # Predict on batch + past_targets = X_batch['past_targets'] + past_features = X_batch['past_features'] + future_features = X_batch["future_features"] + past_observed_targets = X_batch['past_observed_targets'] + + if past_targets.ndim == 2: + past_targets = past_targets.unsqueeze(-1) + + pred_kwargs = {"past_targets": past_targets, + "past_features": past_features, + "future_features": future_features} + + for key in pred_kwargs.keys(): + if pred_kwargs[key] is not None: + pred_kwargs[key] = pred_kwargs[key].float() + + pred_kwargs.update({'past_observed_targets': past_observed_targets}) + + with torch.no_grad(): + Y_batch_pred = self.network.predict(**pred_kwargs) + + Y_batch_preds.append(Y_batch_pred.cpu()) + + return torch.cat(Y_batch_preds, 0).cpu().numpy() diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/MLPBackbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/MLPBackbone.py index f3fb4d7a2..f2ed459c3 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/MLPBackbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/MLPBackbone.py @@ -28,9 +28,11 @@ class MLPBackbone(NetworkBackboneComponent): """ def build_backbone(self, input_shape: Tuple[int, ...]) -> nn.Module: - layers: List[nn.Module] = list() in_features = input_shape[0] + return self._build_backbone(in_features) + def _build_backbone(self, in_features: int, ) -> nn.Module: + layers: List[nn.Module] = list() self._add_layer(layers, in_features, self.config['num_units_1'], 1) for i in range(2, self.config['num_groups'] + 1): diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/TCNBackbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/TCNBackbone.py deleted file mode 100644 index 6ea5a179e..000000000 --- a/autoPyTorch/pipeline/components/setup/network_backbone/TCNBackbone.py +++ /dev/null @@ -1,172 +0,0 @@ -from typing import Any, Dict, List, Optional, Tuple - -import ConfigSpace as CS -from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import ( - CategoricalHyperparameter, - UniformFloatHyperparameter, - UniformIntegerHyperparameter -) - -import torch -from torch import nn -from torch.nn.utils import weight_norm - -from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.pipeline.components.setup.network_backbone.base_network_backbone import NetworkBackboneComponent -from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter - - -# _Chomp1d, _TemporalBlock and _TemporalConvNet copied from -# https://github.com/locuslab/TCN/blob/master/TCN/tcn.py, Carnegie Mellon University Locus Labs -# Paper: https://arxiv.org/pdf/1803.01271.pdf -class _Chomp1d(nn.Module): - def __init__(self, chomp_size: int): - super(_Chomp1d, self).__init__() - self.chomp_size = chomp_size - - def forward(self, x: torch.Tensor) -> torch.Tensor: - return x[:, :, :-self.chomp_size].contiguous() - - -class _TemporalBlock(nn.Module): - def __init__(self, - n_inputs: int, - n_outputs: int, - kernel_size: int, - stride: int, - dilation: int, - padding: int, - dropout: float = 0.2): - super(_TemporalBlock, self).__init__() - self.conv1 = weight_norm(nn.Conv1d(n_inputs, n_outputs, kernel_size, - 
stride=stride, padding=padding, dilation=dilation)) - self.chomp1 = _Chomp1d(padding) - self.relu1 = nn.ReLU() - self.dropout1 = nn.Dropout(dropout) - - self.conv2 = weight_norm(nn.Conv1d(n_outputs, n_outputs, kernel_size, - stride=stride, padding=padding, dilation=dilation)) - self.chomp2 = _Chomp1d(padding) - self.relu2 = nn.ReLU() - self.dropout2 = nn.Dropout(dropout) - - self.net = nn.Sequential(self.conv1, self.chomp1, self.relu1, self.dropout1, - self.conv2, self.chomp2, self.relu2, self.dropout2) - self.downsample = nn.Conv1d( - n_inputs, n_outputs, 1) if n_inputs != n_outputs else None - self.relu = nn.ReLU() - # self.init_weights() - - def init_weights(self) -> None: - self.conv1.weight.data.normal_(0, 0.01) - self.conv2.weight.data.normal_(0, 0.01) - if self.downsample is not None: - self.downsample.weight.data.normal_(0, 0.01) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - out = self.net(x) - res = x if self.downsample is None else self.downsample(x) - return self.relu(out + res) - - -class _TemporalConvNet(nn.Module): - def __init__(self, num_inputs: int, num_channels: List[int], kernel_size: int = 2, dropout: float = 0.2): - super(_TemporalConvNet, self).__init__() - layers: List[Any] = [] - num_levels = len(num_channels) - for i in range(num_levels): - dilation_size = 2 ** i - in_channels = num_inputs if i == 0 else num_channels[i - 1] - out_channels = num_channels[i] - layers += [_TemporalBlock(in_channels, - out_channels, - kernel_size, - stride=1, - dilation=dilation_size, - padding=(kernel_size - 1) * dilation_size, - dropout=dropout)] - self.network = nn.Sequential(*layers) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - # swap sequence and feature dimensions for use with convolutional nets - x = x.transpose(1, 2).contiguous() - x = self.network(x) - x = x.transpose(1, 2).contiguous() - return x - - -class TCNBackbone(NetworkBackboneComponent): - """ - Temporal Convolutional Network backbone for time series data (see https://arxiv.org/pdf/1803.01271.pdf). 
- """ - - def build_backbone(self, input_shape: Tuple[int, ...]) -> nn.Module: - num_channels = [self.config["num_filters_0"]] - for i in range(1, self.config["num_blocks"]): - num_channels.append(self.config[f"num_filters_{i}"]) - backbone = _TemporalConvNet(input_shape[-1], - num_channels, - kernel_size=self.config["kernel_size"], - dropout=self.config["dropout"] if self.config["use_dropout"] else 0.0 - ) - self.backbone = backbone - return backbone - - @staticmethod - def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None - ) -> Dict[str, Any]: - return { - "shortname": "TCNBackbone", - "name": "TCNBackbone", - 'handles_tabular': False, - 'handles_image': False, - 'handles_time_series': True, - } - - @staticmethod - def get_hyperparameter_search_space( - dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, - num_blocks: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_blocks", - value_range=(1, 10), - default_value=5), - num_filters: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_filters", - value_range=(4, 64), - default_value=32), - kernel_size: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="kernel_size", - value_range=(4, 64), - default_value=32), - use_dropout: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="use_dropout", - value_range=(True, False), - default_value=False), - dropout: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="dropout", - value_range=(0, 0.5), - default_value=0.1), - ) -> ConfigurationSpace: - cs = ConfigurationSpace() - - min_num_blocks, max_num_blocks = num_blocks.value_range - num_blocks_hp = get_hyperparameter(num_blocks, UniformIntegerHyperparameter) - cs.add_hyperparameter(num_blocks_hp) - - add_hyperparameter(cs, kernel_size, UniformIntegerHyperparameter) - - use_dropout_hp = get_hyperparameter(use_dropout, CategoricalHyperparameter) - cs.add_hyperparameter(use_dropout_hp) - - dropout_hp = get_hyperparameter(dropout, UniformFloatHyperparameter) - cs.add_hyperparameter(dropout_hp) - cs.add_condition(CS.EqualsCondition(dropout_hp, use_dropout_hp, True)) - - for i in range(0, int(max_num_blocks)): - num_filter_search_space = HyperparameterSearchSpace(f"num_filters_{i}", - value_range=num_filters.value_range, - default_value=num_filters.default_value, - log=num_filters.log) - num_filters_hp = get_hyperparameter(num_filter_search_space, UniformIntegerHyperparameter) - cs.add_hyperparameter(num_filters_hp) - if i >= int(min_num_blocks): - cs.add_condition(CS.GreaterThanCondition( - num_filters_hp, num_blocks_hp, i)) - - return cs diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/__init__.py index 8d5339389..67e877960 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/__init__.py @@ -1,12 +1,10 @@ import os from collections import OrderedDict -from typing import Dict, List, Optional +from typing import Any, Dict, List, Optional import ConfigSpace.hyperparameters as CSH from ConfigSpace.configuration_space import ConfigurationSpace -import numpy as np - from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice from autoPyTorch.pipeline.components.base_component import ( @@ -149,12 +147,7 @@ def get_hyperparameter_search_space( 
raise ValueError("No backbone found") if default is None: - defaults = [ - 'ShapedMLPBackbone', - 'MLPBackbone', - 'ConvNetImageBackbone', - 'InceptionTimeBackbone', - ] + defaults = self._defaults_network for default_ in defaults: if default_ in available_backbones: default = default_ @@ -192,6 +185,14 @@ def get_hyperparameter_search_space( self.dataset_properties_ = dataset_properties return cs - def transform(self, X: np.ndarray) -> np.ndarray: + @property + def _defaults_network(self) -> List[str]: + return [ + 'ShapedMLPBackbone', + 'MLPBackbone', + 'ConvNetImageBackbone', + ] + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: assert self.choice is not None, "Cannot call transform before the object is initialized" - return self.choice.transform(X) + return self.choice.transform(X) # type: ignore[no-any-return] diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py index 1a04d6645..7ff914a98 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py @@ -5,7 +5,7 @@ import pandas as pd -from scipy.sparse import csr_matrix +from scipy.sparse import spmatrix import torch from torch import nn @@ -29,7 +29,7 @@ def __init__(self, super().__init__() self.add_fit_requirements([ FitRequirement('is_small_preprocess', (bool,), user_defined=True, dataset_property=True), - FitRequirement('X_train', (np.ndarray, pd.DataFrame, csr_matrix), user_defined=True, + FitRequirement('X_train', (np.ndarray, pd.DataFrame, spmatrix), user_defined=True, dataset_property=False), FitRequirement('input_shape', (Iterable,), user_defined=True, dataset_property=True), FitRequirement('tabular_transformer', (BaseEstimator,), user_defined=False, dataset_property=False), diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py new file mode 100644 index 000000000..e0417f587 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py @@ -0,0 +1,314 @@ +from collections import OrderedDict +from typing import Any, Dict, List, Optional + +import ConfigSpace.hyperparameters as CSH +from ConfigSpace.configuration_space import Configuration, ConfigurationSpace + +import numpy as np + + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice +from autoPyTorch.pipeline.components.base_component import autoPyTorchComponent +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder import \ + AbstractForecastingEncoderChoice +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.\ + flat_encoder import FlatForecastingEncoderChoice +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.\ + seq_encoder import SeqForecastingEncoderChoice +from autoPyTorch.utils.hyperparameter_search_space_update import \ + HyperparameterSearchSpaceUpdate + + +class ForecastingNetworkChoice(autoPyTorchChoice): + """ + A network is composed of an encoder and decoder. In most of the case, the choice of decoder is heavily dependent on + the choice of encoder. 
Therefore, here "choice" indicates the choice of encoder, then decoder will be determined by + the encoder. + """ + + def __init__(self, + dataset_properties: Dict[str, BaseDatasetPropertiesType], + random_state: Optional[np.random.RandomState] = None + ): + super().__init__(dataset_properties, random_state) + self.include_components: Dict[str, List[str]] = {} + self.exclude_components: Dict[str, List[str]] = {} + + self.default_components = OrderedDict( + {"flat_encoder": FlatForecastingEncoderChoice(dataset_properties=self.dataset_properties, + random_state=self.random_state), + "seq_encoder": SeqForecastingEncoderChoice(dataset_properties=self.dataset_properties, + random_state=self.random_state)}) + + def get_components(self) -> Dict[str, AbstractForecastingEncoderChoice]: # type: ignore[override] + """Returns the available backbone components + + Args: + None + + Returns: + Dict[str, autoPyTorchComponent]: all basebackbone components available + as choices for learning rate scheduling + """ + return self.default_components + + def get_available_components( # type: ignore[override] + self, + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + include: List[str] = None, + exclude: List[str] = None, + components: Optional[Dict[str, AbstractForecastingEncoderChoice]] = None + ) -> Dict[str, AbstractForecastingEncoderChoice]: + """Filters out components based on user provided + include/exclude directives, as well as the dataset properties + + Args: + include (Optional[Dict[str, Any]]): + what hyper-parameter configurations to honor when creating the configuration space. It can also include + nested components, for instance, flat_encoder:MLPEncoder + exclude (Optional[Dict[str, Any]]): + what hyper-parameter configurations to remove from the configuration space. 
It can also include + nested components, for instance, flat_encoder:MLPEncoder + dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]): + Characteristics of the dataset to guide the pipeline choices of components + + Returns: + Dict[str, autoPyTorchComponent]: + A filtered dict of learning rate backbones + + """ + if dataset_properties is None: + dataset_properties = {} + + if include is not None and exclude is not None: + raise ValueError( + "The argument include and exclude cannot be used together.") + + if components is None: + available_comp = self.get_components() + else: + available_comp = components # type: ignore[assignment] + + if include is not None: + include_top = set() + for incl in include: + if incl not in available_comp: + for comp in available_comp.keys(): + self.include_components[comp] = [] + if incl.startswith(comp): + incl_sub = ":".join(incl.split(":")[1:]) + if comp in self.include_components: + self.include_components[comp].append(incl_sub) + else: + self.include_components[comp] = [incl_sub] + include_top.add(comp) + else: + include_top.add(incl) + if not include_top: + raise ValueError(f"Trying to include unknown component: {include}") + include = list(include_top) + elif exclude is not None: + for excl in exclude: + for comp in available_comp.keys(): + if excl.startswith(comp): + excl_sub = ":".join(excl.split(":")[1:]) + if comp in self.exclude_components: + self.exclude_components[comp].append(excl_sub) + else: + self.exclude_components[comp] = [excl_sub] + + components_dict = OrderedDict() + for name in available_comp: + if include is not None and name not in include: + continue + elif exclude is not None and name in exclude: + continue + + entry = available_comp[name] + + # Exclude itself to avoid infinite loop + if entry == ForecastingNetworkChoice: + continue + + task_type = str(dataset_properties['task_type']) + properties = entry.get_properties() + if 'tabular' in task_type and not bool(properties['handles_tabular']): + continue + elif 'image' in task_type and not bool(properties['handles_image']): + continue + elif 'time_series' in task_type and not bool(properties['handles_time_series']): + continue + + # target_type = dataset_properties['target_type'] + # Apply some automatic filtering here for + # backbones based on the dataset! + # TODO: Think if there is any case where a backbone is not recommended for a certain dataset + + components_dict[name] = entry + + return components_dict + + def get_hyperparameter_search_space( + self, + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + default: Optional[str] = None, + include: Optional[List[str]] = None, + exclude: Optional[List[str]] = None, + ) -> ConfigurationSpace: + """Returns the configuration space of the current chosen components + + Args: + dataset_properties (Optional[Dict[str, str]]): + Describes the dataset to work on + default (Optional[str]): + Default backbone to use + include: Optional[Dict[str, Any]]: + what components to include. It is an exhaustive list, and will exclusively use this components. + It can also include nested components, for instance, flat_encoder:MLPEncoder + exclude: Optional[Dict[str, Any]]: + which components to skip. 
It can also include nested components, for instance, flat_encoder:MLPEncoder + + Returns: + ConfigurationSpace: + the configuration space of the hyper-parameters of the chosen component + """ + if dataset_properties is None: + dataset_properties = {} + + cs = ConfigurationSpace() + # Compile a list of legal preprocessors for this problem + available_encoders = self.get_available_components( + dataset_properties=dataset_properties, + include=include, exclude=exclude) + + if len(available_encoders) == 0: + raise ValueError("No Encoder found") + + if default is None: + defaults = self._defaults_network + for default_ in defaults: + if default_ in available_encoders: + default = default_ + break + updates = self._get_search_space_updates() + if '__choice__' in updates.keys(): + choice_hyperparameter = updates['__choice__'] + if not set(choice_hyperparameter.value_range).issubset(available_encoders): + raise ValueError("Expected given update for {} to have " + "choices in {} got {}".format(self.__class__.__name__, + available_encoders, + choice_hyperparameter.value_range)) + hp_encoder = CSH.CategoricalHyperparameter('__choice__', + choice_hyperparameter.value_range, + default_value=choice_hyperparameter.default_value) + else: + hp_encoder = CSH.CategoricalHyperparameter( + '__choice__', + list(available_encoders.keys()), + default_value=default + ) + cs.add_hyperparameter(hp_encoder) + + for name in hp_encoder.choices: + updates = self._get_search_space_updates(prefix=name) + include_encoder = None + exclude_encoder = None + if include is not None: + if name in self.include_components: + include_encoder = self.include_components[name] + if exclude is not None: + if name in self.exclude_components: + exclude_encoder = self.exclude_components[name] + + config_space = available_encoders[name].get_hyperparameter_search_space( + dataset_properties=dataset_properties, + include=include_encoder, + exclude=exclude_encoder, + **updates # type: ignore[call-arg, arg-type] + ) + parent_hyperparameter = {'parent': hp_encoder, 'value': name} + cs.add_configuration_space( + name, + config_space, + parent_hyperparameter=parent_hyperparameter + ) + + self.configuration_space_ = cs + self.dataset_properties_ = dataset_properties + + return cs + + def set_hyperparameters(self, + configuration: Configuration, + init_params: Optional[Dict[str, Any]] = None + ) -> 'autoPyTorchChoice': + new_params = {} + + params = configuration.get_dictionary() + choice = params['__choice__'] + del params['__choice__'] + + for param, value in params.items(): + param = param.replace(choice + ':', '') + new_params[param] = value + + if init_params is not None: + for param, value in init_params.items(): + param = param.replace(choice + ':', '') + new_params[param] = value + + choice_component = self.get_components()[choice] + + updates = self._get_search_space_updates(prefix=choice) + + self.new_params = new_params + sub_configuration_space = choice_component.get_hyperparameter_search_space( + self.dataset_properties, + **updates # type: ignore[call-arg, arg-type] + ) + + sub_configuration = Configuration(sub_configuration_space, + values=new_params) + self.choice = choice_component.set_hyperparameters(sub_configuration) # type: ignore[assignment] + + return self + + def _apply_search_space_update(self, hyperparameter_search_space_update: HyperparameterSearchSpaceUpdate) -> None: + sub_module_name_component = hyperparameter_search_space_update.hyperparameter.split(':') + if len(sub_module_name_component) <= 2: + 
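+            # updates whose hyperparameter name has at most two levels
+            # (e.g. 'flat_encoder:hp') are applied by the base class; deeper names
+            # (e.g. 'flat_encoder:MLPEncoder:hp') are rebuilt below with the
+            # sub-module prefix stripped and forwarded to that sub-module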
+            super()._apply_search_space_update(hyperparameter_search_space_update)
+        else:
+            sub_module_name = sub_module_name_component[0]
+            # TODO create a new update and consider special HPs for seq encoder!!!
+            update_sub_module = HyperparameterSearchSpaceUpdate(
+                hyperparameter_search_space_update.node_name,
+                hyperparameter=hyperparameter_search_space_update.hyperparameter.replace(f'{sub_module_name}:', ''),
+                value_range=hyperparameter_search_space_update.value_range,
+                default_value=hyperparameter_search_space_update.default_value,
+                log=hyperparameter_search_space_update.log
+            )
+            self.get_components()[sub_module_name]._apply_search_space_update(update_sub_module)
+
+    @property
+    def _defaults_network(self) -> List[str]:
+        return ['flat_encoder',
+                'seq_encoder']
+
+    def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchComponent:
+        """Fits the chosen component and marks this choice object as fitted
+
+        Args:
+            X (Dict[str, Any]):
+                Dependencies needed by current component to perform fit
+            y (Any):
+                not used. To comply with sklearn API
+        """
+        # Allows to use check_is_fitted on the choice object
+        self.fitted_ = True
+        assert self.choice is not None, "Cannot call fit without initializing the component"
+        return self.choice.fit(X, y)
+
+    def transform(self, X: Dict) -> Dict:
+        assert self.choice is not None, "Cannot call transform before the object is initialized"
+        return self.choice.transform(X)  # type: ignore[no-any-return]
diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py
new file mode 100644
index 000000000..6da9d42bb
--- /dev/null
+++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py
@@ -0,0 +1,715 @@
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+from pytorch_forecasting.models.temporal_fusion_transformer.sub_modules import (
+    GateAddNorm,
+    GatedResidualNetwork,
+    InterpretableMultiHeadAttention,
+    VariableSelectionNetwork
+)
+
+import torch
+from torch import nn
+
+
+from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import (
+    AddLayer, NetworkStructure)
+from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.components import \
+    DecoderBlockInfo
+from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.components import (
+    EncoderBlockInfo, EncoderOutputForm)
+
+
+class TemporalFusionLayer(nn.Module):
+    """
+    (Lim et al.
+ Temporal Fusion Transformers for Interpretable Multi-horizon Time Series Forecasting, + https://arxiv.org/abs/1912.09363) + we follow the implementation from pytorch forecasting: + https://github.com/jdb78/pytorch-forecasting/blob/master/pytorch_forecasting/models/temporal_fusion_transformer/__init__.py + """ + + def __init__(self, + window_size: int, + network_structure: NetworkStructure, + network_encoder: Dict[str, EncoderBlockInfo], + n_decoder_output_features: int, + d_model: int, + n_head: int, + dropout: Optional[float] = None): + super().__init__() + num_blocks = network_structure.num_blocks + last_block = f'block_{num_blocks}' + n_encoder_output = network_encoder[last_block].encoder_output_shape[-1] + self.window_size = window_size + + if n_decoder_output_features != n_encoder_output: + self.decoder_proj_layer = nn.Linear(n_decoder_output_features, n_encoder_output, bias=False) + else: + self.decoder_proj_layer = None + if network_structure.variable_selection: + if network_structure.skip_connection: + # static feature selector needs to generate the same number of features as the output of the encoder + n_encoder_output_first = network_encoder['block_1'].encoder_output_shape[-1] + self.static_context_enrichment = GatedResidualNetwork( + n_encoder_output_first, n_encoder_output_first, n_encoder_output_first, dropout + ) + self.enrichment = GatedResidualNetwork( + input_size=n_encoder_output, + hidden_size=n_encoder_output, + output_size=d_model, + dropout=dropout, + context_size=n_encoder_output_first, + residual=False, + ) + self.enrich_with_static = True + if not hasattr(self, 'enrichment'): + self.enrichment = GatedResidualNetwork( + input_size=n_encoder_output, + hidden_size=n_encoder_output, + output_size=d_model, + dropout=dropout, + residual=False, + ) + self.enrich_with_static = False + + self.attention_fusion = InterpretableMultiHeadAttention( + d_model=d_model, + n_head=n_head, + dropout=dropout or 0.0 + ) + self.post_attn_gate_norm = GateAddNorm(d_model, dropout=dropout, trainable_add=False) + self.pos_wise_ff = GatedResidualNetwork(input_size=d_model, hidden_size=d_model, + output_size=d_model, dropout=dropout) + + self.network_structure = network_structure + if network_structure.skip_connection: + if network_structure.skip_connection_type == 'add': + self.residual_connection = AddLayer(d_model, n_encoder_output) + elif network_structure.skip_connection_type == 'gate_add_norm': + self.residual_connection = GateAddNorm(d_model, skip_size=n_encoder_output, + dropout=None, trainable_add=False) + self._device = 'cpu' + + def forward(self, + encoder_output: torch.Tensor, + decoder_output: torch.Tensor, + past_observed_targets: torch.BoolTensor, + decoder_length: int, + static_embedding: Optional[torch.Tensor] = None) -> torch.Tensor: + """ + Args: + encoder_output (torch.Tensor): + the output of the last layer of encoder network + decoder_output (torch.Tensor): + the output of the last layer of decoder network + past_observed_targets (torch.BoolTensor): + observed values in the past + decoder_length (int): + length of decoder network + static_embedding Optional[torch.Tensor]: + embeddings of static features (if available) + """ + + if self.decoder_proj_layer is not None: + decoder_output = self.decoder_proj_layer(decoder_output) + + network_output = torch.cat([encoder_output, decoder_output], dim=1) + + if self.enrich_with_static and static_embedding is not None: + static_context_enrichment = self.static_context_enrichment(static_embedding) + attn_input = self.enrichment( + 
network_output, static_context_enrichment[:, None].expand(-1, network_output.shape[1], -1) + ) + else: + attn_input = self.enrichment(network_output) + + # Attention + encoder_out_length = encoder_output.shape[1] + past_observed_targets = past_observed_targets[:, -encoder_out_length:] + past_observed_targets = past_observed_targets.to(self.device) + + mask = self.get_attention_mask(past_observed_targets=past_observed_targets, decoder_length=decoder_length) + if mask.shape[-1] < attn_input.shape[1]: + # in case that none of the samples has length greater than window_size + mask = torch.cat([ + mask.new_full((*mask.shape[:-1], attn_input.shape[1] - mask.shape[-1]), True), + mask + ], dim=-1) + + attn_output, attn_output_weights = self.attention_fusion( + q=attn_input[:, -decoder_length:], # query only for predictions + k=attn_input, + v=attn_input, + mask=mask) + + # skip connection over attention + attn_output = self.post_attn_gate_norm(attn_output, attn_input[:, -decoder_length:]) + output = self.pos_wise_ff(attn_output) + + if self.network_structure.skip_connection: + return self.residual_connection(output, decoder_output) + else: + return output + + @property + def device(self) -> torch.device: + return self._device + + @device.setter + def device(self, device: torch.device) -> None: + self.to(device) + self._device = device + + def get_attention_mask(self, past_observed_targets: torch.BoolTensor, decoder_length: int) -> torch.Tensor: + """ + https://github.com/jdb78/pytorch-forecasting/blob/master/pytorch_forecasting/models/ + temporal_fusion_transformer/__init__.py + """ + # indices to which is attended + attend_step = torch.arange(decoder_length, device=self.device) + # indices for which is predicted + predict_step = torch.arange(0, decoder_length, device=self.device)[:, None] + # do not attend to steps to self or after prediction + # todo: there is potential value in attending to future forecasts if they are made with knowledge currently + # available + # one possibility is here to use a second attention layer for future attention (assuming different effects + # matter in the future than the past) + # or alternatively using the same layer but allowing forward attention - i.e. only masking out non-available + # data and self + decoder_mask = attend_step >= predict_step + # do not attend to steps where data is padded + # this is the result of our padding strategy: we pad values at the start of the tensors + encoder_mask = ~past_observed_targets.squeeze(-1) + + # combine masks along attended time - first encoder and then decoder + mask = torch.cat( + ( + encoder_mask.unsqueeze(1).expand(-1, decoder_length, -1), + decoder_mask.unsqueeze(0).expand(encoder_mask.size(0), -1, -1), + ), + dim=2, + ) + return mask + + +class VariableSelector(nn.Module): + def __init__(self, + network_structure: NetworkStructure, + dataset_properties: Dict[str, Any], + network_encoder: Dict[str, EncoderBlockInfo], + auto_regressive: bool = False, + feature_names: Union[Tuple[str], Tuple[()]] = (), + known_future_features: Union[Tuple[str], Tuple[()]] = (), + feature_shapes: Dict[str, int] = {}, + static_features: Union[Tuple[Union[str, int]], Tuple[()]] = (), + time_feature_names: Union[Tuple[str], Tuple[()]] = (), + ): + """ + Variable Selector. 
This module follows the implementation of
+        pytorch_forecasting.models.temporal_fusion_transformer.sub_modules.VariableSelectionNetwork.
+        However, we adjust the structure to fit the data extracted from our dataloader: we record the
+        tensor indices of each feature name and split the input features on the fly.
+
+        The order of the input variables is as follows:
+        [features (from the dataset), time_features (from time feature transformers), targets]
+        Args:
+            network_structure (NetworkStructure):
+                contains the overall architecture information
+            dataset_properties (Dict):
+                dataset properties
+            network_encoder (Dict[str, EncoderBlockInfo]):
+                Network encoders
+            auto_regressive (bool):
+                whether it belongs to an auto-regressive model
+            feature_names (Tuple[str]):
+                feature names, used to construct the selection network
+            known_future_features (Tuple[str]):
+                known future features
+            feature_shapes (Dict[str, int]):
+                shape of each feature
+            static_features (Tuple[Union[str, int]]):
+                names of the features that are static over time
+            time_feature_names (Tuple[str]):
+                time feature names, used to complement feature_shapes
+        """
+        super().__init__()
+        first_encoder_output_shape = network_encoder['block_1'].encoder_output_shape[-1]
+        self.hidden_size = first_encoder_output_shape
+
+        assert set(feature_names) == set(feature_shapes.keys()), f"feature_names and feature_shapes must have " \
+                                                                 f"the same variable names but they are different " \
+                                                                 f"at {set(feature_names) ^ set(feature_shapes.keys())}"
+        pre_scalar = {'past_targets': nn.Linear(dataset_properties['output_shape'][-1], self.hidden_size)}
+        encoder_input_sizes = {'past_targets': self.hidden_size}
+        decoder_input_sizes = {}
+        future_feature_name2tensor_idx = {}
+        feature_names2tensor_idx = {}
+        idx_tracker = 0
+        idx_tracker_future = 0
+
+        static_features = set(static_features)  # type: ignore[assignment]
+        static_features_input_size = {}
+
+        # static_features should always be known beforehand
+        known_future_features = tuple(known_future_features)  # type: ignore[assignment]
+        feature_names = tuple(feature_names)  # type: ignore[assignment]
+        time_feature_names = tuple(time_feature_names)  # type: ignore[assignment]
+
+        if feature_names:
+            for name in feature_names:
+                feature_shape = feature_shapes[name]
+                feature_names2tensor_idx[name] = [idx_tracker, idx_tracker + feature_shape]
+                idx_tracker += feature_shape
+                pre_scalar[name] = nn.Linear(feature_shape, self.hidden_size)
+                if name in static_features:
+                    static_features_input_size[name] = self.hidden_size
+                else:
+                    encoder_input_sizes[name] = self.hidden_size
+                    if name in known_future_features:
+                        decoder_input_sizes[name] = self.hidden_size
+
+        for future_name in known_future_features:
+            feature_shape = feature_shapes[future_name]
+            future_feature_name2tensor_idx[future_name] = [idx_tracker_future, idx_tracker_future + feature_shape]
+            idx_tracker_future += feature_shape
+
+        if time_feature_names:
+            for name in time_feature_names:
+                feature_names2tensor_idx[name] = [idx_tracker, idx_tracker + 1]
+                future_feature_name2tensor_idx[name] = [idx_tracker_future, idx_tracker_future + 1]
+                idx_tracker += 1
+                idx_tracker_future += 1
+                pre_scalar[name] = nn.Linear(1, self.hidden_size)
+                encoder_input_sizes[name] = self.hidden_size
+                decoder_input_sizes[name] = self.hidden_size
+
+        if not feature_names or not known_future_features:
+            # Ensure that at least one feature is applied
+            placeholder_features = 'placeholder_features'
+            i = 0
+
+            self.placeholder_features: List[str] = []
+            while placeholder_features in feature_names or placeholder_features in self.placeholder_features:
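+                # a genuine feature may already be named 'placeholder_features';
+                # keep bumping the numeric suffix until an unused name is found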
i += 1 + placeholder_features = f'placeholder_features_{i}' + if i == 5000: + raise RuntimeError( + "Cannot assign name to placeholder features, please considering rename your features") + + name = placeholder_features + pre_scalar[name] = nn.Linear(1, self.hidden_size) + encoder_input_sizes[name] = self.hidden_size + decoder_input_sizes[name] = self.hidden_size + self.placeholder_features.append(placeholder_features) + + feature_names = time_feature_names + feature_names # type: ignore[assignment] + known_future_features = time_feature_names + known_future_features # type: ignore[assignment] + + self.feature_names = feature_names + self.feature_names2tensor_idx = feature_names2tensor_idx + self.future_feature_name2tensor_idx = future_feature_name2tensor_idx + self.known_future_features = known_future_features + + if auto_regressive: + pre_scalar.update({'future_prediction': nn.Linear(dataset_properties['output_shape'][-1], + self.hidden_size)}) + decoder_input_sizes.update({'future_prediction': self.hidden_size}) + self.pre_scalars = nn.ModuleDict(pre_scalar) + + self._device = torch.device('cpu') + + if not dataset_properties['uni_variant']: + self.static_variable_selection = VariableSelectionNetwork( + input_sizes=static_features_input_size, + hidden_size=self.hidden_size, + input_embedding_flags={}, + dropout=network_structure.grn_dropout_rate, + prescalers=self.pre_scalars + ) + self.static_input_sizes = static_features_input_size + self.static_features = static_features + + self.auto_regressive = auto_regressive + + # create single variable grns that are shared across decoder and encoder + if network_structure.share_single_variable_networks: + self.shared_single_variable_grns = nn.ModuleDict() + for name, input_size in encoder_input_sizes.items(): + self.shared_single_variable_grns[name] = GatedResidualNetwork( + input_size, + min(input_size, self.hidden_size), + self.hidden_size, + network_structure.grn_dropout_rate, + ) + for name, input_size in decoder_input_sizes.items(): + if name not in self.shared_single_variable_grns: + self.shared_single_variable_grns[name] = GatedResidualNetwork( + input_size, + min(input_size, self.hidden_size), + self.hidden_size, + network_structure.grn_dropout_rate, + ) + + self.encoder_variable_selection = VariableSelectionNetwork( + input_sizes=encoder_input_sizes, + hidden_size=self.hidden_size, + input_embedding_flags={}, + dropout=network_structure.grn_dropout_rate, + context_size=self.hidden_size, + single_variable_grns={} + if not network_structure.share_single_variable_networks + else self.shared_single_variable_grns, + prescalers=self.pre_scalars, + ) + + self.decoder_variable_selection = VariableSelectionNetwork( + input_sizes=decoder_input_sizes, + hidden_size=self.hidden_size, + input_embedding_flags={}, + dropout=network_structure.grn_dropout_rate, + context_size=self.hidden_size, + single_variable_grns={} + if not network_structure.share_single_variable_networks + else self.shared_single_variable_grns, + prescalers=self.pre_scalars, + ) + + self.static_context_variable_selection = GatedResidualNetwork( + input_size=self.hidden_size, + hidden_size=self.hidden_size, + output_size=self.hidden_size, + dropout=network_structure.grn_dropout_rate, + ) + + n_hidden_states = 0 + if network_encoder['block_1'].encoder_properties.has_hidden_states: + n_hidden_states = network_encoder['block_1'].n_hidden_states + + static_context_initial_hidden = [GatedResidualNetwork(input_size=self.hidden_size, + hidden_size=self.hidden_size, + 
output_size=self.hidden_size, + dropout=network_structure.grn_dropout_rate, + ) for _ in range(n_hidden_states)] + + self.static_context_initial_hidden = nn.ModuleList(static_context_initial_hidden) + self.cached_static_contex: Optional[torch.Tensor] = None + self.cached_static_embedding: Optional[torch.Tensor] = None + + @property + def device(self) -> torch.device: + return self._device + + @device.setter + def device(self, device: torch.device) -> None: + self.to(device) + self._device = device + + def forward(self, + x_past: Optional[Dict[str, torch.Tensor]], + x_future: Optional[Dict[str, torch.Tensor]], + x_static: Optional[Dict[str, torch.Tensor]], + length_past: int = 0, + length_future: int = 0, + batch_size: int = 0, + cache_static_contex: bool = False, + use_cached_static_contex: bool = False, + ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], torch.Tensor, Optional[torch.Tensor]]: + if x_past is None and x_future is None: + raise ValueError('Either past input or future inputs need to be given!') + if length_past == 0 and length_future == 0: + raise ValueError("Either length_past or length_future must be given!") + timesteps = length_past + length_future + + if not use_cached_static_contex: + if len(self.static_input_sizes) > 0: + static_embedding, _ = self.static_variable_selection(x_static) + else: + if length_past > 0: + assert x_past is not None, "x_past must be given when length_past is greater than 0!" + model_dtype = next(iter(x_past.values())).dtype + else: + assert x_future is not None, "x_future must be given when length_future is greater than 0!" + model_dtype = next(iter(x_future.values())).dtype + + static_embedding = torch.zeros( + (batch_size, self.hidden_size), dtype=model_dtype, device=self.device + ) + + static_context_variable_selection = self.static_context_variable_selection(static_embedding)[:, None] + static_context_initial_hidden: Optional[Tuple[torch.Tensor, ...]] = tuple( + init_hidden(static_embedding) for init_hidden in self.static_context_initial_hidden + ) + if cache_static_contex: + self.cached_static_contex = static_context_variable_selection + self.cached_static_embedding = static_embedding + else: + static_embedding = self.cached_static_embedding + static_context_initial_hidden = None + static_context_variable_selection = self.cached_static_contex + static_context_variable_selection = static_context_variable_selection.expand(-1, timesteps, -1) + if x_past is not None: + embeddings_varying_encoder, _ = self.encoder_variable_selection( + x_past, + static_context_variable_selection[:, :length_past], + ) + else: + embeddings_varying_encoder = None + if x_future is not None: + embeddings_varying_decoder, _ = self.decoder_variable_selection( + x_future, + static_context_variable_selection[:, length_past:], + ) + else: + embeddings_varying_decoder = None + return embeddings_varying_encoder, embeddings_varying_decoder, static_embedding, static_context_initial_hidden + + +class StackedEncoder(nn.Module): + """ + Encoder network that is stacked by several encoders. Skip-connections can be applied to each stack. 
Each stack
+    needs to generate a sequence of encoded features that is passed to the next stack and to the
+    corresponding decoder at the same layer (encoder2decoder). Additionally, if the temporal fusion
+    transformer is applied, the last encoder also needs to output the full encoded feature sequence.
+    """
+    def __init__(self,
+                 network_structure: NetworkStructure,
+                 has_temporal_fusion: bool,
+                 encoder_info: Dict[str, EncoderBlockInfo],
+                 decoder_info: Dict[str, DecoderBlockInfo],
+                 ):
+        super().__init__()
+        self.num_blocks = network_structure.num_blocks
+        self.skip_connection = network_structure.skip_connection
+        self.has_temporal_fusion = has_temporal_fusion
+
+        self.encoder_output_type = [EncoderOutputForm.NoOutput] * self.num_blocks
+        self.encoder_has_hidden_states = [False] * self.num_blocks
+        len_cached_intermediate_states = self.num_blocks + 1 if self.has_temporal_fusion else self.num_blocks
+        self.cached_intermediate_state = [torch.empty(0) for _ in range(len_cached_intermediate_states)]
+
+        self.encoder_num_hidden_states = [0] * self.num_blocks
+        encoder = nn.ModuleDict()
+        for i, block_idx in enumerate(range(1, self.num_blocks + 1)):
+            block_id = f'block_{block_idx}'
+            encoder[block_id] = encoder_info[block_id].encoder
+            if self.skip_connection:
+                input_size = encoder_info[block_id].encoder_output_shape[-1]
+                skip_size = encoder_info[block_id].encoder_input_shape[-1]
+                if network_structure.skip_connection_type == 'add':
+                    encoder[f'skip_connection_{block_idx}'] = AddLayer(input_size, skip_size)
+                elif network_structure.skip_connection_type == 'gate_add_norm':
+                    encoder[f'skip_connection_{block_idx}'] = GateAddNorm(input_size,
+                                                                          hidden_size=input_size,
+                                                                          skip_size=skip_size,
+                                                                          dropout=network_structure.grn_dropout_rate)
+            if block_id in decoder_info:
+                if decoder_info[block_id].decoder_properties.recurrent:
+                    if decoder_info[block_id].decoder_properties.has_hidden_states:
+                        # RNN -> RNN
+                        self.encoder_output_type[i] = EncoderOutputForm.HiddenStates
+                    else:
+                        # Transformer -> Transformer
+                        self.encoder_output_type[i] = EncoderOutputForm.Sequence
+                else:
+                    # Deep AR, MLP as decoder
+                    self.encoder_output_type[i] = EncoderOutputForm.SequenceLast
+            if encoder_info[block_id].encoder_properties.has_hidden_states:
+                self.encoder_has_hidden_states[i] = True
+                self.encoder_num_hidden_states[i] = encoder_info[block_id].n_hidden_states
+            else:
+                self.encoder_has_hidden_states[i] = False
+        self.encoder = encoder
+
+    def forward(self,
+                encoder_input: torch.Tensor,
+                additional_input: List[Optional[torch.Tensor]],
+                output_seq: bool = False,
+                cache_intermediate_state: bool = False,
+                incremental_update: bool = False) -> Tuple[List[torch.Tensor], Optional[torch.Tensor]]:
+        """
+        A forward pass through the encoder
+
+        Args:
+            encoder_input (torch.Tensor):
+                encoder input
+            additional_input (List[Optional[torch.Tensor]]):
+                additional input to the encoder, e.g., initial hidden states
+            output_seq (bool):
+                whether the encoder generates a sequence of multiple time steps or only a single time step
+            cache_intermediate_state (bool):
+                whether the intermediate values are cached
+            incremental_update (bool):
+                whether an incremental update is applied; this is normally used for
+                auto-regressive models, however, only DeepAR requires incremental updates in the encoder
+
+        Returns:
+            encoder2decoder (List[torch.Tensor]):
+                encoder outputs that will be passed to the decoders
+            encoder_output (torch.Tensor):
+                full sequential encoded features from the last encoder layer;
+                only used when the temporal fusion transformer is applied
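+
+        Example (an illustrative sketch; variable names are assumptions and the
+        network is assumed to have a single block without temporal fusion):
+
+            >>> encoder2decoder, full_seq = stacked_encoder(past_features, [None])
+            >>> full_seq is None
+            True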
+        """
+        encoder2decoder = []
+        x = encoder_input
+        for i, block_id in enumerate(range(1, self.num_blocks + 1)):
+            output_seq_i = (output_seq or self.has_temporal_fusion or block_id < self.num_blocks)
+            encoder_i = self.encoder[f'block_{block_id}']
+            if self.encoder_has_hidden_states[i]:
+                if incremental_update:
+                    hx = self.cached_intermediate_state[i]
+                    fx, hx = encoder_i(x, output_seq=False, hx=hx)
+                else:
+                    rnn_num_layers = encoder_i.config['num_layers']
+                    hx = additional_input[i]
+                    if hx is None:
+                        fx, hx = encoder_i(x, output_seq=output_seq_i, hx=hx)
+                    else:
+                        if self.encoder_num_hidden_states[i] == 1:
+                            fx, hx = encoder_i(x, output_seq=output_seq_i,
+                                               hx=hx[0].expand((rnn_num_layers, -1, -1)).contiguous())
+                        else:
+                            hx = tuple(hx_i.expand(rnn_num_layers, -1, -1).contiguous() for hx_i in hx)
+                            fx, hx = encoder_i(x, output_seq=output_seq_i, hx=hx)
+            else:
+                if incremental_update:
+                    x_all = torch.cat([self.cached_intermediate_state[i], x], dim=1)
+                    fx = encoder_i(x_all, output_seq=False)
+                else:
+                    fx = encoder_i(x, output_seq=output_seq_i)
+            if self.skip_connection:
+                if output_seq_i:
+                    fx = self.encoder[f'skip_connection_{block_id}'](fx, x)
+                else:
+                    fx = self.encoder[f'skip_connection_{block_id}'](fx, x[:, -1:])
+
+            if self.encoder_output_type[i] == EncoderOutputForm.HiddenStates:
+                encoder2decoder.append(hx)
+            elif self.encoder_output_type[i] == EncoderOutputForm.Sequence:
+                encoder2decoder.append(fx)
+            elif self.encoder_output_type[i] == EncoderOutputForm.SequenceLast:
+                if output_seq_i and not output_seq:
+                    encoder2decoder.append(encoder_i.get_last_seq_value(fx).squeeze(1))
+                else:
+                    encoder2decoder.append(fx)
+            else:
+                raise NotImplementedError
+
+            if cache_intermediate_state:
+                if self.encoder_has_hidden_states[i]:
+                    self.cached_intermediate_state[i] = hx
+                else:
+                    if incremental_update:
+                        self.cached_intermediate_state[i] = x_all
+                    else:
+                        self.cached_intermediate_state[i] = x
+                # otherwise the decoder does not exist for this layer
+            x = fx
+
+        if self.has_temporal_fusion:
+            if incremental_update:
+                self.cached_intermediate_state[i + 1] = torch.cat([self.cached_intermediate_state[i + 1], x], dim=1)
+            else:
+                self.cached_intermediate_state[i + 1] = x
+            return encoder2decoder, x
+        else:
+            return encoder2decoder, None
+
+
+class StackedDecoder(nn.Module):
+    """
+    Decoder network composed of several stacked decoders. Skip-connections can be applied to each stack. It decodes
+    the encoded features (encoder2decoder) from each corresponding stack together with the known_future_features to
+    generate the decoded output features that are then fed to the network head.
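+
+    A rough usage sketch (variable names are illustrative):
+
+        decoder_output = stacked_decoder(x_future=future_features,
+                                         encoder_output=encoder2decoder)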
+ """ + def __init__(self, + network_structure: NetworkStructure, + encoder: nn.ModuleDict, + encoder_info: Dict[str, EncoderBlockInfo], + decoder_info: Dict[str, DecoderBlockInfo], + ): + super().__init__() + self.num_blocks = network_structure.num_blocks + self.first_block = -1 + self.skip_connection = network_structure.skip_connection + + self.decoder_has_hidden_states = [] + decoder = nn.ModuleDict() + for i in range(1, self.num_blocks + 1): + block_id = f'block_{i}' + if block_id in decoder_info: + self.first_block = i if self.first_block == -1 else self.first_block + decoder[block_id] = decoder_info[block_id].decoder + if decoder_info[block_id].decoder_properties.has_hidden_states: + self.decoder_has_hidden_states.append(True) + else: + self.decoder_has_hidden_states.append(False) + if self.skip_connection: + input_size_encoder = encoder_info[block_id].encoder_output_shape[-1] + skip_size_encoder = encoder_info[block_id].encoder_input_shape[-1] + + input_size_decoder = decoder_info[block_id].decoder_output_shape[-1] + skip_size_decoder = decoder_info[block_id].decoder_input_shape[-1] + if skip_size_decoder > 0: + if input_size_encoder == input_size_decoder and skip_size_encoder == skip_size_decoder: + decoder[f'skip_connection_{i}'] = encoder[f'skip_connection_{i}'] + else: + if network_structure.skip_connection_type == 'add': + decoder[f'skip_connection_{i}'] = AddLayer(input_size_decoder, skip_size_decoder) + elif network_structure.skip_connection_type == 'gate_add_norm': + decoder[f'skip_connection_{i}'] = GateAddNorm(input_size_decoder, + hidden_size=input_size_decoder, + skip_size=skip_size_decoder, + dropout=network_structure.grn_dropout_rate + ) + self.cached_intermediate_state = [torch.empty(0) for _ in range(self.num_blocks + 1 - self.first_block)] + self.decoder = decoder + + def forward(self, + x_future: Optional[torch.Tensor], + encoder_output: List[torch.Tensor], + pos_idx: Optional[Tuple[int]] = None, + cache_intermediate_state: bool = False, + incremental_update: bool = False + ) -> torch.Tensor: + """ + A forward pass through the decoder + + Args: + x_future (Optional[torch.Tensor]): + known future features + encoder_output (List[torch.Tensor]) + encoded features, stored as List, whereas each element in the list indicates encoded features from an + encoder stack + pos_idx (int) + position index of the current x_future. This is applied to transformer decoder + cache_intermediate_state (bool): + if the intermediate values are cached + incremental_update (bool): + if an incremental update is applied, this is normally applied for auto-regressive model + + Returns: + decoder_output (torch.Tensor): + decoder output that will be passed to the network head + """ + x = x_future + for i, block_id in enumerate(range(self.first_block, self.num_blocks + 1)): + decoder_i = self.decoder[f'block_{block_id}'] + if self.decoder_has_hidden_states[i]: + if incremental_update: + hx = self.cached_intermediate_state[i] + fx, hx = decoder_i(x_future=x, encoder_output=hx, pos_idx=pos_idx) + else: + fx, hx = decoder_i(x_future=x, encoder_output=encoder_output[i], pos_idx=pos_idx) + else: + if incremental_update: + # in this case, we only have Transformer, thus x_all needs to be None value! + # TODO make this argument clearer! 
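+                    # note: both branches currently issue the same call; the TODO above
+                    # refers to making the incremental variant explicit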
+                    fx = decoder_i(x, encoder_output=encoder_output[i], pos_idx=pos_idx)
+                else:
+                    fx = decoder_i(x, encoder_output=encoder_output[i], pos_idx=pos_idx)
+            skip_id = f'skip_connection_{block_id}'
+            if self.skip_connection and skip_id in self.decoder and x is not None:
+                fx = self.decoder[skip_id](fx, x)
+            if cache_intermediate_state:
+                if self.decoder_has_hidden_states[i]:
+                    self.cached_intermediate_state[i] = hx
+                # TODO consider if there are other cases that could make use of cached intermediate states
+            x = fx
+        return x
diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/components_util.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/components_util.py
new file mode 100644
index 000000000..9fcbc14e0
--- /dev/null
+++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/components_util.py
@@ -0,0 +1,145 @@
+import math
+from typing import Any, Dict, NamedTuple, Optional, Tuple
+
+from sklearn.base import BaseEstimator
+
+import torch
+from torch import nn
+
+
+class NetworkStructure(NamedTuple):
+    num_blocks: int = 1
+    variable_selection: bool = False
+    share_single_variable_networks: bool = False
+    use_temporal_fusion: bool = False
+    skip_connection: bool = False
+    skip_connection_type: str = "add"  # could be 'add' or 'gate_add_norm'
+    grn_dropout_rate: float = 0.0
+
+
+class ForecastingNetworkStructure(BaseEstimator):
+    def __init__(self,
+                 num_blocks: int = 1,
+                 variable_selection: bool = False,
+                 share_single_variable_networks: bool = False,
+                 use_temporal_fusion: bool = False,
+                 skip_connection: bool = False,
+                 skip_connection_type: str = "add",
+                 grn_dropout_rate: float = 0.0,
+                 ) -> None:
+        super().__init__()
+        self.num_blocks = num_blocks
+        self.variable_selection = variable_selection
+        self.share_single_variable_networks = share_single_variable_networks
+        self.use_temporal_fusion = use_temporal_fusion
+        self.skip_connection = skip_connection
+        self.skip_connection_type = skip_connection_type
+        self.grn_dropout_rate = grn_dropout_rate
+        self.network_structure: Optional[NetworkStructure] = None
+
+    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:
+        self.network_structure = NetworkStructure(num_blocks=self.num_blocks,
+                                                  variable_selection=self.variable_selection,
+                                                  share_single_variable_networks=self.share_single_variable_networks,
+                                                  use_temporal_fusion=self.use_temporal_fusion,
+                                                  skip_connection=self.skip_connection,
+                                                  skip_connection_type=self.skip_connection_type,
+                                                  grn_dropout_rate=self.grn_dropout_rate)
+        return self
+
+    def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
+        X.update({
+            'network_structure': self.network_structure,
+        })
+        return X
+
+    def __str__(self) -> str:
+        """ Allows a nice understanding of what components were used """
+        string = self.__class__.__name__
+        return string
+
+
+class AddLayer(nn.Module):
+    def __init__(self, input_size: int, skip_size: int):
+        super().__init__()
+        # a projection is only required when the skip tensor and the residual
+        # branch have different widths; with equal widths they can be summed directly
+        if input_size != skip_size:
+            self.fc = nn.Linear(skip_size, input_size)
+        self.norm = nn.LayerNorm(input_size)
+
+    def forward(self, input: torch.Tensor, skip: torch.Tensor) -> torch.Tensor:
+        if hasattr(self, 'fc'):
+            return self.norm(input + self.fc(skip))
+        else:
+            return self.norm(input + skip)
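+
+# An illustrative sketch (comment only, not executed): AddLayer learns a
+# projection only when the skip width differs from the residual width, e.g.
+#
+#     layer = AddLayer(input_size=16, skip_size=8)        # fc maps 8 -> 16
+#     out = layer(torch.randn(4, 16), torch.randn(4, 8))  # shape (4, 16)
+#
+# With equal widths the two tensors are summed directly before the LayerNorm.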
+
+
+def build_transformer_layers(d_model: int, config: Dict[str, Any], layer_type: str = 'encoder') -> nn.Module:
+    nhead = 2 ** config['n_head_log']
+    dim_feedforward = 2 ** config['d_feed_forward_log']
+    dropout = config.get('dropout', 0.0)
+    activation = config['activation']
+    layer_norm_eps = config['layer_norm_eps']
+    norm_first = config['norm_first']
+    if layer_type == 'encoder':
+        return nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward,
+                                          dropout=dropout, activation=activation, norm_first=norm_first,
+                                          layer_norm_eps=layer_norm_eps, batch_first=True)
+    elif layer_type == 'decoder':
+        return nn.TransformerDecoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward,
+                                          dropout=dropout, activation=activation, norm_first=norm_first,
+                                          layer_norm_eps=layer_norm_eps, batch_first=True)
+    else:
+        raise ValueError('layer_type must be encoder or decoder!')
+
+
+class PositionalEncoding(nn.Module):
+    r"""https://github.com/pytorch/examples/blob/master/word_language_model/model.py
+
+    NOTE: different from the raw implementation, this module is designed for batch_first inputs!
+    Injects some information about the relative or absolute position of the tokens
+    in the sequence. The positional encodings have the same dimension as
+    the embeddings, so that the two can be summed. Here, we use sine and cosine
+    functions of different frequencies.
+    .. math::
+        \text{PosEncoder}(pos, 2i) = \sin(pos / 10000^{2i / d_{model}})
+
+        \text{PosEncoder}(pos, 2i + 1) = \cos(pos / 10000^{2i / d_{model}})
+
+    where :math:`pos` is the position in the sequence and :math:`i` is the embedding index.
+    Args:
+        d_model (int):
+            the embed dim (required).
+        dropout (float):
+            the dropout value (default=0.1).
+        max_len (int):
+            the max. length of the incoming sequence (default=5000).
+    Examples:
+        >>> pos_encoder = PositionalEncoding(d_model)
+    """
+
+    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
+        super(PositionalEncoding, self).__init__()
+        self.dropout = nn.Dropout(p=dropout)
+
+        pe = torch.zeros(max_len, d_model)
+        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
+        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
+        pe[:, 0::2] = torch.sin(position * div_term)
+        pe[:, 1::2] = torch.cos(position * div_term)
+        pe = pe.unsqueeze(0)
+        self.register_buffer('pe', pe)
+
+    def forward(self, x: torch.Tensor, pos_idx: Optional[Tuple[int]] = None) -> torch.Tensor:
+        r"""Inputs of forward function
+        Args:
+            x (torch.Tensor(B, L, N)):
+                the sequence fed to the positional encoder model (required).
+ pos_idx (Tuple[int]): + position idx indicating the start (first) and end (last) time index of x in a sequence + + Examples: + >>> output = pos_encoder(x) + """ + if pos_idx is None: + x = x + self.pe[:, :x.size(1), :] + else: + x = x + self.pe[:, pos_idx[0]: pos_idx[1], :] # type: ignore[misc] + return self.dropout(x) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py new file mode 100644 index 000000000..b5d9eead8 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py @@ -0,0 +1,240 @@ +from typing import Any, Dict, List, Optional, Tuple, Union + +from ConfigSpace.conditions import EqualsCondition, GreaterThanCondition +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import CategoricalHyperparameter, UniformIntegerHyperparameter + +import numpy as np + +import torch +from torch import nn + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder. \ + base_forecasting_decoder import BaseForecastingDecoder +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.components import \ + DecoderNetwork +from autoPyTorch.pipeline.components.setup.network_head.utils import \ + _activations +from autoPyTorch.utils.common import HyperparameterSearchSpace, get_hyperparameter + + +class MLPDecoderModule(DecoderNetwork): + def __init__(self, + global_layers: nn.Module, + local_layers: Optional[nn.Module], + auto_regressive: bool = False + ): + super().__init__() + self.global_layers = global_layers + self.local_layers = local_layers + self.auto_regressive = auto_regressive + + def forward(self, x_future: Optional[torch.Tensor], encoder_output: torch.Tensor, + pos_idx: Optional[Tuple[int]] = None) -> torch.Tensor: + if not self.auto_regressive: + if len(encoder_output.shape) == 3: + encoder_output = encoder_output.squeeze(1) + + if x_future is None or self.auto_regressive: + # for auto-regressive model, x_future is fed to the encoders + x = self.global_layers(encoder_output) + if self.local_layers is None: + return x + else: + return self.local_layers(x) + + if self.local_layers is None: + x = torch.concat([encoder_output, x_future.flatten(-2)], dim=-1) + return self.global_layers(x) + + x = self.global_layers(encoder_output) + x = self.local_layers(x) + + return torch.concat([x, x_future], dim=-1) + + +class ForecastingMLPDecoder(BaseForecastingDecoder): + def _build_decoder(self, + encoder_output_shape: Tuple[int, ...], + future_variable_input: Tuple[int, ...], + n_prediction_heads: int, + dataset_properties: Dict) -> Tuple[nn.Module, int]: + global_layers = [] + in_features = encoder_output_shape[-1] + has_local_layer = 'units_local_layer' in self.config + if not has_local_layer and not self.auto_regressive: + in_features += int(np.prod(future_variable_input)) + if 'num_layers' in self.config and self.config["num_layers"] > 0: + for i in range(1, self.config["num_layers"] + 1): + global_layers.append(nn.Linear(in_features=in_features, + out_features=self.config[f"units_layer_{i}"])) + global_layers.append(_activations[self.config["activation"]]()) + in_features = self.config[f"units_layer_{i}"] + num_decoder_output_features = 
in_features + if has_local_layer: + local_layers = [nn.Linear(in_features=in_features, + out_features=self.config['units_local_layer'] * n_prediction_heads)] + if 'activation' in self.config: + local_layers.append(_activations[self.config["activation"]]()) + local_layers.append(nn.Unflatten(-1, (n_prediction_heads, self.config['units_local_layer']))) + num_decoder_output_features = self.config['units_local_layer'] + future_variable_input[-1] + + return MLPDecoderModule(global_layers=nn.Sequential(*global_layers), + local_layers=nn.Sequential(*local_layers) if has_local_layer else None, + auto_regressive=self.auto_regressive), num_decoder_output_features + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'MLPDecoder', + 'name': 'MLPDecoder', + 'handles_tabular': False, + 'handles_image': False, + 'handles_time_series': True, + } + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + if self.is_last_decoder: + X.update({'mlp_has_local_layer': self.config.get('has_local_layer', True)}) + return super().transform(X) + + @property + def fitted_encoder(self) -> List[str]: + return ['RNNEncoder', 'TCNEncoder', 'MLEncoder', 'NBEATSEncoder'] + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + can_be_auto_regressive: bool = False, + num_layers: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_layers", + value_range=(0, 3), + default_value=1), + units_layer: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="units_layer", + value_range=(16, 512), + default_value=32, + log=True), + activation: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="activation", + value_range=tuple(_activations.keys()), + default_value=list(_activations.keys())[ + 0]), + auto_regressive: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="auto_regressive", + value_range=(True, False), + default_value=False, + ), + has_local_layer: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='has_local_layer', + value_range=(True, False), + default_value=True), + units_local_layer: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="units_local_layer", + value_range=(8, 128), + default_value=16, + log=True), + ) -> ConfigurationSpace: + """ + Builds the mlp head layer. The decoder implementation follows the idea from: + + Wen et al, A Multi-Horizon Quantile Recurrent Forecaster, NeurIPS 2017, Time Series Workshop + https://arxiv.org/abs/1711.11053 + + This model acts as the global MLP, local MLP is implemented under forecasting_head, that maps the output + features to the final output + + Additionally, this model also determines if DeepAR is applied to do prediction + + Salinas et al. DeepAR: Probabilistic Forecasting with Autoregressive Recurrent Networks + https://arxiv.org/abs/1704.04110 + + Args: + dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]): + Dataset Properties + can_be_auto_regressive (bool): + if this decoder is allowed to be auto-regressive + is_top_layer (bool): + if this mlp decoder is at the top layer as seq decoders. Only top layer MLP allows deactivating local + layers. 
(Otherwise, the decoder cannot output a sequence) + num_layers (HyperparameterSearchSpace): + number of decoder layers (the last layer is not included, thus it starts from 0) + units_layer (HyperparameterSearchSpace): + number of units of each layer (except for the last layer) + activation (HyperparameterSearchSpace): + activation function + auto_regressive (HyperparameterSearchSpace): + if the model acts as a DeepAR model, the corresponding hyperparaemter is controlled by seq_encoder + has_local_layer (HyperparameterSearchSpace): + if local MLP layer is applied, if not, the output of the network will be directly attached + with different heads + units_local_layer (HyperparameterSearchSpace): + number of units of local layer. The size of this layer is smaller as it needs to be + expanded to adapt to the number of predictions + Returns: + cs (ConfigurationSpace): + ConfigurationSpace + """ + if dataset_properties is not None: + encoder_can_be_auto_regressive = dataset_properties.get('encoder_can_be_auto_regressive', False) + if not encoder_can_be_auto_regressive: + # deepAR model cannot be applied + auto_regressive = HyperparameterSearchSpace(hyperparameter=auto_regressive.hyperparameter, + value_range=[False], + default_value=False, ) + cs = ConfigurationSpace() + + min_num_layers: int = num_layers.value_range[0] # type: ignore + max_num_layers: int = num_layers.value_range[-1] # type: ignore + num_layers_is_constant = (min_num_layers == max_num_layers) + + num_layers_hp = get_hyperparameter(num_layers, UniformIntegerHyperparameter) + activation_hp = get_hyperparameter(activation, CategoricalHyperparameter) + cs.add_hyperparameter(num_layers_hp) + + if not num_layers_is_constant: + cs.add_hyperparameter(activation_hp) + # HERE WE replace 1 with 0 to be compatible with our modification + cs.add_condition(GreaterThanCondition(activation_hp, num_layers_hp, 0)) + elif max_num_layers > 1: + # only add activation if we have more than 1 layer + cs.add_hyperparameter(activation_hp) + + for i in range(1, max_num_layers + 1): + num_units_search_space = HyperparameterSearchSpace( + hyperparameter=f"units_layer_{i}", + value_range=units_layer.value_range, + default_value=units_layer.default_value, + log=units_layer.log, + ) + num_units_hp = get_hyperparameter(num_units_search_space, UniformIntegerHyperparameter) + cs.add_hyperparameter(num_units_hp) + + if i >= min_num_layers and not num_layers_is_constant: + # In the case of a constant, the max and min number of layers are the same. + # So no condition is needed. If it is not a constant but a hyperparameter, + # then a condition has to be made so that it accounts for the value of the + # hyperparameter. 
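+                # For example, units_layer_3 only becomes active once num_layers > 2,
+                # which is exactly what the GreaterThanCondition below encodes.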
+ cs.add_condition(GreaterThanCondition(num_units_hp, num_layers_hp, i - 1)) + + # add_hyperparameter(cs, units_final_layer, UniformIntegerHyperparameter) + has_local_layer = get_hyperparameter(has_local_layer, CategoricalHyperparameter) + units_local_layer = get_hyperparameter(units_local_layer, UniformIntegerHyperparameter) + + cond_units_local_layer = EqualsCondition(units_local_layer, has_local_layer, True) + + if can_be_auto_regressive: + auto_regressive_hp: CategoricalHyperparameter = get_hyperparameter( # type:ignore[assignment] + auto_regressive, CategoricalHyperparameter + ) + cs.add_hyperparameters([auto_regressive_hp]) + + if False in auto_regressive_hp.choices: + cs.add_hyperparameters([has_local_layer, units_local_layer]) + cs.add_conditions([cond_units_local_layer]) + + cond_use_local_layer = EqualsCondition(has_local_layer, auto_regressive_hp, False) + cs.add_conditions([cond_use_local_layer]) + return cs + else: + return cs + + cs.add_hyperparameters([has_local_layer, units_local_layer]) + cs.add_conditions([cond_units_local_layer]) + return cs diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py new file mode 100644 index 000000000..419c9ef34 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py @@ -0,0 +1,570 @@ +from typing import Any, Dict, List, Optional, Tuple, Union + +from ConfigSpace import ConfigurationSpace +from ConfigSpace.conditions import ( + AndConjunction, + EqualsCondition, + GreaterThanCondition +) +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformFloatHyperparameter, + UniformIntegerHyperparameter +) + +import torch +from torch import nn + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.\ + base_forecasting_decoder import BaseForecastingDecoder, DecoderProperties +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.components import \ + DecoderNetwork +from autoPyTorch.pipeline.components.setup.network_head.utils import _activations +from autoPyTorch.utils.common import ( + HyperparameterSearchSpace, + add_hyperparameter, + get_hyperparameter +) + + +class NBEATSBlock(DecoderNetwork): + """ + An N-BEATS block. An N-BEATS network is stacked by multiple Blocks. + For detail, we refer to + Oreshkin et al., N-BEATS: Neural basis expansion analysis for interpretable time series forecasting + https://arxiv.org/abs/1905.10437 + + The hyperaprameter definitions are quite similar to + https://github.com/jdb78/pytorch-forecasting/tree/master/pytorch_forecasting/models/nbeats + + However, we only construct the forecast/ backcast head under + autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head. As we only get to know the + output shape and forecasting horizon there + + Attributes: + n_in_features (int): + number of input features + stack_idx (int): + index of the current stack + stack_type (str): + type of this stack. 
Could be one of 'generic', 'seasonality', 'trend' + num_blocks (int): + number of blocks exist in this stack + num_layers (int): + number of network layer inside each block + width (int): + network width (number of features) + normalization (str): + normalization type, could be BN or LN + activation (str): + activation function type + weight_sharing (bool): + if weights are shared for this block + expansion_coefficient_length (int): + expansion_coefficient_length + use_dropout (bool): + if dropout is applied + dropout_rate (Optional[float]). + dropout rate + """ + + def __init__(self, + n_in_features: int, + stack_idx: int, + stack_type: str, + num_blocks: int, + num_layers: int, + width: int, + normalization: str, + activation: str, + weight_sharing: bool, + expansion_coefficient_length: int, + use_dropout: bool, + dropout_rate: Optional[float] = None, + ): + super().__init__() + self.n_in_features = n_in_features + self.stack_idx = stack_idx + self.stack_type = stack_type + + self.num_blocks = num_blocks + self.num_layers = num_layers + self.width = width + self.normalization = normalization + self.activation = activation + self.use_dropout = use_dropout + self.dropout_rate = dropout_rate + + self.expansion_coefficient_length = expansion_coefficient_length + + self.weight_sharing = weight_sharing + + self.backbone = nn.Sequential(*self.build_backbone()) + + self.backcast_head: Optional[nn.Module] = None + self.forecast_head: Optional[nn.Module] = None + + def build_backbone(self) -> List[nn.Module]: + layers: List[nn.Module] = list() + n_in_features = self.n_in_features + for _ in range(self.num_layers): + self._add_layer(layers, n_in_features) + n_in_features = self.width + return layers + + def _add_layer(self, layers: List[nn.Module], in_features: int) -> None: + layers.append(nn.Linear(in_features, self.width)) + if self.normalization == 'BN': + layers.append(nn.BatchNorm1d(self.width)) + elif self.normalization == 'LN': + layers.append(nn.LayerNorm(self.width)) + layers.append(_activations[self.activation]()) + if self.use_dropout: + layers.append(nn.Dropout(self.dropout_rate)) + + def forward(self, x_future: Optional[torch.Tensor], encoder_output: torch.Tensor, + pos_idx: Optional[Tuple[int]] = None) -> Union[nn.Module, Tuple[nn.Module, nn.Module]]: + if self.backcast_head is None or self.forecast_head is None: + # used to compute head dimensions + return self.backbone(encoder_output) + else: + x = self.backbone(encoder_output) + forecast = self.forecast_head(x) + backcast = self.backcast_head(x) + return backcast, forecast + + +class NBEATSDecoder(BaseForecastingDecoder): + _fixed_seq_length = True + window_size = 1 + fill_lower_resolution_seq = False + + @staticmethod + def decoder_properties() -> DecoderProperties: + return DecoderProperties(multi_blocks=True) + + def _build_decoder(self, + encoder_output_shape: Tuple[int, ...], + future_variable_input: Tuple[int, ...], + n_prediction_heads: int, + dataset_properties: Dict) -> Tuple[List[List[NBEATSBlock]], int]: + in_features = encoder_output_shape[-1] + n_beats_type = self.config['n_beats_type'] + if n_beats_type == 'G': + stacks: List[List[NBEATSBlock]] = [[] for _ in range(self.config['num_stacks_g'])] + for stack_idx in range(1, self.config['num_stacks_g'] + 1): + for block_idx in range(self.config['num_blocks_g']): + if self.config['weight_sharing_g'] and block_idx > 0: + # for weight sharing, we only create one instance + break + ecl = self.config['expansion_coefficient_length_g'] + stacks[stack_idx - 
1].append(NBEATSBlock(in_features, + stack_idx=stack_idx, + stack_type='generic', + num_blocks=self.config['num_blocks_g'], + num_layers=self.config['num_layers_g'], + width=self.config['width_g'], + normalization=self.config['normalization'], + activation=self.config['activation'], + weight_sharing=self.config['weight_sharing_g'], + expansion_coefficient_length=ecl, + use_dropout=self.config['use_dropout_g'], + dropout_rate=self.config.get('dropout_g', None), + )) + + elif n_beats_type == 'I': + stacks: List[List[NBEATSBlock]] = [[] for _ in range(self.config['num_stacks_i'])] # type:ignore + for stack_idx in range(1, self.config['num_stacks_i'] + 1): + for block_idx in range(self.config['num_blocks_i_%d' % stack_idx]): + if self.config['weight_sharing_i_%d' % stack_idx] and block_idx > 0: + # for weight sharing, we only create one instance + break + stack_type = self.config['stack_type_i_%d' % stack_idx] + if stack_type == 'generic': + ecl = self.config['expansion_coefficient_length_i_generic_%d' % stack_idx] + elif stack_type == 'trend': + ecl = self.config['expansion_coefficient_length_i_trend_%d' % stack_idx] + elif stack_type == 'seasonality': + ecl = self.config['expansion_coefficient_length_i_seasonality_%d' % stack_idx] + else: + raise ValueError(f"Unsupported stack_type {stack_type}") + + stacks[stack_idx - 1].append(NBEATSBlock( + in_features, + stack_idx=stack_idx, + stack_type=stack_type, + num_blocks=self.config['num_blocks_i_%d' % stack_idx], + num_layers=self.config['num_layers_i_%d' % stack_idx], + width=self.config['width_i_%d' % stack_idx], + normalization=self.config['normalization'], + activation=self.config['activation'], + weight_sharing=self.config['weight_sharing_i_%d' % stack_idx], + expansion_coefficient_length=ecl, + use_dropout=self.config['use_dropout_i'], + dropout_rate=self.config.get('dropout_i_%d' % stack_idx, None), + )) + else: + raise ValueError(f"Unsupported n_beats_type: {n_beats_type}") + return stacks, stacks[-1][-1].width + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'NBEATSDecoder', + 'name': 'NBEATSDecoder', + 'handles_tabular': False, + 'handles_image': False, + 'handles_time_series': True, + } + + @property + def fitted_encoder(self) -> List[str]: + return ['NBEATSEncoder'] + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + X.update({'backcast_loss_ration': self.config['backcast_loss_ration']}) + return super().transform(X) + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + n_beats_type: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter="n_beats_type", + value_range=('I', 'G'), + default_value='I' + ), + num_stacks_g: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter="num_stacks_g", + value_range=(2, 32), + default_value=30, + log=True, + ), + num_blocks_g: HyperparameterSearchSpace = HyperparameterSearchSpace( + 'num_blocks_g', + value_range=(1, 2), + default_value=1 + ), + num_layers_g: HyperparameterSearchSpace = HyperparameterSearchSpace( + 'num_layers_g', + value_range=(1, 4), + default_value=4 + ), + width_g: HyperparameterSearchSpace = HyperparameterSearchSpace( + 'width_g', + value_range=(16, 512), + default_value=256, + log=True + ), + num_stacks_i: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter="num_stacks_i", + value_range=(1, 4), + 
default_value=2
+            ),
+            num_blocks_i: HyperparameterSearchSpace = HyperparameterSearchSpace(
+                'num_blocks_i',
+                value_range=(1, 5),
+                default_value=3
+            ),
+            num_layers_i: HyperparameterSearchSpace = HyperparameterSearchSpace(
+                'num_layers_i',
+                value_range=(1, 5),
+                default_value=3
+            ),
+            width_i: HyperparameterSearchSpace = HyperparameterSearchSpace(
+                'width_i',
+                value_range=(16, 2048),
+                default_value=512,
+                log=True
+            ),
+            weight_sharing: HyperparameterSearchSpace = HyperparameterSearchSpace(
+                'weight_sharing',
+                value_range=(True, False),
+                default_value=False,
+            ),
+            stack_type: HyperparameterSearchSpace = HyperparameterSearchSpace(
+                'stack_type',
+                value_range=('generic', 'seasonality', 'trend'),
+                default_value='generic'),
+            expansion_coefficient_length_generic: HyperparameterSearchSpace = HyperparameterSearchSpace(
+                'expansion_coefficient_length_generic',
+                value_range=(16, 64),
+                default_value=32,
+                log=True
+            ),
+            expansion_coefficient_length_seasonality: HyperparameterSearchSpace = HyperparameterSearchSpace(
+                'expansion_coefficient_length_seasonality',
+                value_range=(1, 8),
+                default_value=3,
+            ),
+            expansion_coefficient_length_trend: HyperparameterSearchSpace = HyperparameterSearchSpace(
+                'expansion_coefficient_length_trend',
+                value_range=(1, 4),
+                default_value=3,
+            ),
+            activation: HyperparameterSearchSpace = HyperparameterSearchSpace(
+                hyperparameter="activation",
+                value_range=tuple(_activations.keys()),
+                default_value=list(_activations.keys())[0],
+            ),
+            use_dropout: HyperparameterSearchSpace = HyperparameterSearchSpace(
+                hyperparameter="use_dropout",
+                value_range=(True, False),
+                default_value=False,
+            ),
+            normalization: HyperparameterSearchSpace = HyperparameterSearchSpace(
+                hyperparameter="normalization",
+                value_range=('BN', 'LN', 'NoNorm'),
+                default_value='BN'
+            ),
+            dropout: HyperparameterSearchSpace = HyperparameterSearchSpace(
+                hyperparameter="dropout",
+                value_range=(0, 0.8),
+                default_value=0.1,
+            ),
+            backcast_loss_ration: HyperparameterSearchSpace = HyperparameterSearchSpace(
+                hyperparameter="backcast_loss_ration",
+                value_range=(0., 1.),
+                default_value=1.,
+            )
+    ) -> ConfigurationSpace:
+        """
+        Configuration space for N-BEATS. The network is composed of several stacks, each of which is composed of
+        several blocks. Following the N-BEATS implementation, blocks consist only of fully-connected layers with
+        the same width.
+        The design of the configuration space follows pytorch-forecasting:
+        https://github.com/jdb78/pytorch-forecasting/tree/master/pytorch_forecasting/models/nbeats
+        Given that N-BEATS-I and N-BEATS-G have totally different default hyperparameter configurations, we
+        consider them as two separate configuration spaces: N-BEATS-G contains only generic blocks and can thus
+        be scaled up to 32 stacks, where all stacks share the same number of blocks, width and dropout rate,
+        whereas N-BEATS-I is restricted to a much smaller number of stacks. However, the block type of N-BEATS-I
+        can be freely selected for each stack.
+
+        Args:
+            dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]):
+                dataset properties
+            n_beats_type (str):
+                type of N-BEATS network, could be I (N-BEATS-I) or G (N-BEATS-G)
+            num_stacks_g (int):
+                number of stacks for N-BEATS-G
+            num_blocks_g (int):
+                number of blocks per stack for N-BEATS-G
+            num_layers_g (int):
+                number of fc layers per block for N-BEATS-G, this value is the same across all the blocks
+                within one stack
+            width_g (int):
+                fc layer width for N-BEATS-G, this value is the same across all the blocks within one stack
+            num_stacks_i (int):
+                number of stacks for N-BEATS-I
+            num_blocks_i (int):
+                number of blocks per stack for N-BEATS-I
+            num_layers_i (int):
+                number of fc layers per block for N-BEATS-I, this value is the same across all the
+                blocks within one stack
+            width_i (int):
+                fc layer width for N-BEATS-I, this value is the same across all the blocks within one stack
+            weight_sharing (bool):
+                whether weights are shared across the blocks within one stack
+            stack_type (str):
+                stack type, used to define the final output
+            expansion_coefficient_length_generic (int):
+                length of the expansion coefficient for generic blocks, active if stack_type is 'generic'
+            expansion_coefficient_length_seasonality (int):
+                length of the expansion coefficient for seasonality blocks, active if stack_type is
+                'seasonality' (the output dimension is expansion_coefficient_length_seasonality
+                * n_prediction_steps)
+            expansion_coefficient_length_trend (int):
+                length of the expansion coefficient for trend blocks, active if stack_type is 'trend' (it
+                corresponds to the degree of the polynomial)
+            activation (str):
+                activation function across fc layers
+            use_dropout (bool):
+                whether dropout is applied
+            normalization (str):
+                normalization type, could be BN, LN or no normalization
+            dropout (float):
+                dropout rate
+            backcast_loss_ration (float):
+                weight of the backcast loss relative to the forecast loss. A weight of 1.0 indicates that
+                forecast and backcast losses are weighted equally (regardless of backcast and forecast
+                lengths). Defaults to 1.0.
+ Returns: + ConfigurationSpace: + Configuration Space + """ + + cs = ConfigurationSpace() + + n_beats_type = get_hyperparameter(n_beats_type, CategoricalHyperparameter) + + # General Hyperparameters + add_hyperparameter(cs, activation, CategoricalHyperparameter) + add_hyperparameter(cs, normalization, CategoricalHyperparameter) + add_hyperparameter(cs, backcast_loss_ration, UniformFloatHyperparameter) + + cs.add_hyperparameter(n_beats_type) + # N-BEATS-G + + weight_sharing_g = HyperparameterSearchSpace(hyperparameter='weight_sharing_g', + value_range=weight_sharing.value_range, + default_value=weight_sharing.default_value, + log=weight_sharing.log) + use_dropout_g = HyperparameterSearchSpace(hyperparameter='use_dropout_g', + value_range=use_dropout.value_range, + default_value=use_dropout.default_value, + log=use_dropout.log) + dropout_g = HyperparameterSearchSpace(hyperparameter='dropout_g', + value_range=dropout.value_range, + default_value=dropout.default_value, + log=dropout.log) + ecl_g_search_space = HyperparameterSearchSpace( + hyperparameter='expansion_coefficient_length_g', + value_range=expansion_coefficient_length_generic.value_range, + default_value=expansion_coefficient_length_generic.default_value, + log=expansion_coefficient_length_generic.log + ) + + num_stacks_g = get_hyperparameter(num_stacks_g, UniformIntegerHyperparameter) + num_blocks_g = get_hyperparameter(num_blocks_g, UniformIntegerHyperparameter) + num_layers_g = get_hyperparameter(num_layers_g, UniformIntegerHyperparameter) + width_g = get_hyperparameter(width_g, UniformIntegerHyperparameter) + weight_sharing_g = get_hyperparameter(weight_sharing_g, CategoricalHyperparameter) + ecl_g = get_hyperparameter(ecl_g_search_space, UniformIntegerHyperparameter) + use_dropout_g = get_hyperparameter(use_dropout_g, CategoricalHyperparameter) + + dropout_g = get_hyperparameter(dropout_g, UniformFloatHyperparameter) + + n_beats_g_hps = [num_stacks_g, num_blocks_g, num_layers_g, width_g, weight_sharing_g, ecl_g, use_dropout_g] + n_beats_g_conds = [EqualsCondition(hp_nbeats_g, n_beats_type, 'G') for hp_nbeats_g in n_beats_g_hps] + cs.add_hyperparameters(n_beats_g_hps) + cs.add_hyperparameter(dropout_g) + cs.add_conditions(n_beats_g_conds) + cs.add_condition(AndConjunction(EqualsCondition(dropout_g, n_beats_type, 'G'), + EqualsCondition(dropout_g, use_dropout_g, True))) + + min_num_stacks_i, max_num_stacks_i = num_stacks_i.value_range + + use_dropout_i = HyperparameterSearchSpace(hyperparameter='use_dropout_i', + value_range=use_dropout.value_range, + default_value=use_dropout.default_value, + log=use_dropout.log) + + num_stacks_i = get_hyperparameter(num_stacks_i, UniformIntegerHyperparameter) + use_dropout_i = get_hyperparameter(use_dropout_i, CategoricalHyperparameter) + + cs.add_hyperparameters([num_stacks_i, use_dropout_i]) + cs.add_conditions([EqualsCondition(num_stacks_i, n_beats_type, 'I'), + EqualsCondition(use_dropout_i, n_beats_type, 'I') + ]) + + for stack_idx in range(1, int(max_num_stacks_i) + 1): + num_blocks_i_search_space = HyperparameterSearchSpace(hyperparameter='num_blocks_i_%d' % stack_idx, + value_range=num_blocks_i.value_range, + default_value=num_blocks_i.default_value, + log=num_blocks_i.log) + num_layers_i_search_space = HyperparameterSearchSpace(hyperparameter='num_layers_i_%d' % stack_idx, + value_range=num_layers_i.value_range, + default_value=num_layers_i.default_value, + log=num_layers_i.log) + width_i_search_space = HyperparameterSearchSpace(hyperparameter='width_i_%d' % stack_idx, + 
value_range=width_i.value_range, + default_value=width_i.default_value, + log=width_i.log) + weight_sharing_i_search_space = HyperparameterSearchSpace(hyperparameter='weight_sharing_i_%d' % stack_idx, + value_range=weight_sharing.value_range, + default_value=weight_sharing.default_value, + log=weight_sharing.log) + stack_type_i_search_space = HyperparameterSearchSpace(hyperparameter='stack_type_i_%d' % stack_idx, + value_range=stack_type.value_range, + default_value=stack_type.default_value, + log=stack_type.log) + expansion_coefficient_length_generic_search_space = HyperparameterSearchSpace( + hyperparameter='expansion_coefficient_length_i_generic_%d' % stack_idx, + value_range=expansion_coefficient_length_generic.value_range, + default_value=expansion_coefficient_length_generic.default_value, + log=expansion_coefficient_length_generic.log + ) + expansion_coefficient_length_seasonality_search_space = HyperparameterSearchSpace( + hyperparameter='expansion_coefficient_length_i_seasonality_%d' % stack_idx, + value_range=expansion_coefficient_length_seasonality.value_range, + default_value=expansion_coefficient_length_seasonality.default_value, + log=expansion_coefficient_length_seasonality.log + ) + expansion_coefficient_length_trend_search_space = HyperparameterSearchSpace( + hyperparameter='expansion_coefficient_length_i_trend_%d' % stack_idx, + value_range=expansion_coefficient_length_trend.value_range, + default_value=expansion_coefficient_length_trend.default_value, + log=expansion_coefficient_length_trend.log + ) + + num_blocks_i_hp = get_hyperparameter(num_blocks_i_search_space, UniformIntegerHyperparameter) + num_layers_i_hp = get_hyperparameter(num_layers_i_search_space, UniformIntegerHyperparameter) + width_i_hp = get_hyperparameter(width_i_search_space, UniformIntegerHyperparameter) + weight_sharing_i_hp = get_hyperparameter(weight_sharing_i_search_space, CategoricalHyperparameter) + stack_type_i_hp = get_hyperparameter(stack_type_i_search_space, CategoricalHyperparameter) + + expansion_coefficient_length_generic_hp = get_hyperparameter( + expansion_coefficient_length_generic_search_space, + UniformIntegerHyperparameter + ) + expansion_coefficient_length_seasonality_hp = get_hyperparameter( + expansion_coefficient_length_seasonality_search_space, + UniformIntegerHyperparameter + ) + expansion_coefficient_length_trend_hp = get_hyperparameter( + expansion_coefficient_length_trend_search_space, + UniformIntegerHyperparameter + ) + + hps = [num_blocks_i_hp, num_layers_i_hp, width_i_hp, stack_type_i_hp, weight_sharing_i_hp] + cs.add_hyperparameters([*hps, + expansion_coefficient_length_generic_hp, + expansion_coefficient_length_seasonality_hp, + expansion_coefficient_length_trend_hp]) + + cond_ecls = [ + EqualsCondition(expansion_coefficient_length_generic_hp, stack_type_i_hp, 'generic'), + EqualsCondition(expansion_coefficient_length_seasonality_hp, stack_type_i_hp, 'seasonality'), + EqualsCondition(expansion_coefficient_length_trend_hp, stack_type_i_hp, 'trend'), + ] + + if stack_idx > int(min_num_stacks_i): + # The units of layer i should only exist + # if there are at least i layers + for hp in hps: + cs.add_condition( + AndConjunction(GreaterThanCondition(hp, num_stacks_i, stack_idx - 1), + EqualsCondition(hp, n_beats_type, 'I')) + ) + for cond_ecl in cond_ecls: + cs.add_condition( + AndConjunction(cond_ecl, + GreaterThanCondition(cond_ecl.child, num_stacks_i, stack_idx - 1), + EqualsCondition(cond_ecl.child, n_beats_type, 'I')) + ) + else: + cs.add_conditions([EqualsCondition(hp, 
n_beats_type, 'I') for hp in hps]) + cs.add_conditions([ + AndConjunction(cond_ecl, + EqualsCondition(cond_ecl.child, n_beats_type, 'I')) for cond_ecl in cond_ecls + ] + ) + + dropout_search_space = HyperparameterSearchSpace(hyperparameter='dropout_i_%d' % stack_idx, + value_range=dropout.value_range, + default_value=dropout.default_value, + log=dropout.log) + + dropout_hp = get_hyperparameter(dropout_search_space, UniformFloatHyperparameter) + cs.add_hyperparameter(dropout_hp) + + dropout_condition_1 = EqualsCondition(dropout_hp, use_dropout_i, True) + dropout_condition_2 = EqualsCondition(dropout_hp, n_beats_type, 'I') + + if stack_idx > int(min_num_stacks_i): + dropout_condition_3 = GreaterThanCondition(dropout_hp, num_stacks_i, stack_idx - 1) + cs.add_condition(AndConjunction(dropout_condition_1, dropout_condition_2, dropout_condition_3)) + else: + cs.add_condition(AndConjunction(dropout_condition_1, dropout_condition_2)) + + return cs diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py new file mode 100644 index 000000000..848a2a4cd --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py @@ -0,0 +1,118 @@ +from typing import Any, Dict, List, Optional, Tuple, Union + +import ConfigSpace as CS +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import Constant + +import numpy as np + +import torch +from torch import nn + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.base_component import BaseEstimator +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.\ + base_forecasting_decoder import BaseForecastingDecoder, DecoderProperties +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.components import \ + DecoderNetwork +from autoPyTorch.utils.common import FitRequirement + + +class RNN_Module(DecoderNetwork): + def __init__(self, + in_features: int, + hidden_size: int, + num_layers: int, + cell_type: str, + dropout: float, + lagged_value: Optional[Union[List, np.ndarray]] = None): + super().__init__() + if cell_type == 'lstm': + cell = nn.LSTM + else: + cell = nn.GRU + self.lagged_value = lagged_value + in_features = in_features + self.lstm = cell(input_size=in_features, + hidden_size=hidden_size, + num_layers=num_layers, + dropout=dropout, + bidirectional=False, + batch_first=True) + + def forward(self, + x_future: torch.Tensor, + encoder_output: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + pos_idx: Optional[Tuple[int]] = None) -> Tuple[torch.Tensor, ...]: + if x_future.ndim == 2: + x_future = x_future.unsqueeze(1) + outputs, hidden_state, = self.lstm(x_future, encoder_output) + return outputs, hidden_state + + +class ForecastingRNNDecoder(BaseForecastingDecoder): + """ + Standard searchable RNN decoder for time series data, only works when the encoder is an RNN encoder + """ + + def __init__(self, **kwargs: Any): + super().__init__(**kwargs) + # RNN is naturally auto-regressive. 
However, we will not consider it as a decoder for deep AR model + self.rnn_kwargs: Optional[Dict] = None + self.lagged_value = [1, 2, 3, 4, 5, 6, 7] + self.add_fit_requirements([FitRequirement('rnn_kwargs', (Dict,), user_defined=False, dataset_property=False)]) + + def _build_decoder(self, + encoder_output_shape: Tuple[int, ...], + future_variable_input: Tuple[int, ...], + n_prediction_heads: int, + dataset_properties: Dict) -> Tuple[nn.Module, int]: + assert self.rnn_kwargs is not None + # RNN decoder only allows RNN encoder, these parameters need to exists. + hidden_size = self.rnn_kwargs['hidden_size'] + num_layers = self.rnn_kwargs['num_layers'] + cell_type = self.rnn_kwargs['cell_type'] + dropout = self.rnn_kwargs['dropout'] + decoder = RNN_Module(in_features=future_variable_input[-1], + hidden_size=hidden_size, + num_layers=num_layers, + cell_type=cell_type, + dropout=dropout, + lagged_value=self.lagged_value + ) + return decoder, hidden_size + + @property + def fitted_encoder(self) -> List[str]: + return ['RNNEncoder'] + + @staticmethod + def decoder_properties() -> DecoderProperties: + decoder_properties = DecoderProperties(has_hidden_states=True, + recurrent=True, + lagged_input=True) + return decoder_properties + + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: + self.rnn_kwargs = X['rnn_kwargs'] + if 'lagged_value' in X['dataset_properties']: + self.lagged_value = X['dataset_properties']['lagged_value'] + return super().fit(X, y) + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Any]: + return { + 'shortname': 'RNNDecoder', + 'name': 'RNNDecoder', + 'handles_tabular': False, + 'handles_image': False, + 'handles_time_series': True, + } + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict] = None, + ) -> ConfigurationSpace: + cs = CS.ConfigurationSpace() + cs.add_hyperparameter(Constant('decoder_type', 'RNNDecoder')) # this helps the encoder to recognize the decoder + return cs diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py new file mode 100644 index 000000000..021846e8a --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py @@ -0,0 +1,264 @@ +from typing import Any, Dict, List, Optional, Tuple, Union + +import ConfigSpace as CS +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformFloatHyperparameter, + UniformIntegerHyperparameter +) + +import numpy as np + +import torch +from torch import nn + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.base_component import BaseEstimator +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import ( + PositionalEncoding, build_transformer_layers) +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder. 
\ + base_forecasting_decoder import BaseForecastingDecoder, DecoderProperties +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.components import \ + DecoderNetwork +from autoPyTorch.utils.common import ( + FitRequirement, + HyperparameterSearchSpace, + add_hyperparameter, + get_hyperparameter +) + + +class _TransformerDecoder(DecoderNetwork): + def __init__(self, + in_features: int, + d_model: int, + num_layers: int, + transformer_decoder_layers: nn.Module, + use_positional_decoder: bool, + use_layer_norm_output: bool, + dropout_pd: float = 0.0, + layer_norm_eps_output: Optional[float] = None, + n_prediction_steps: int = 1, + lagged_value: Optional[Union[List, np.ndarray]] = None): + super().__init__() + self.lagged_value = lagged_value + in_features = in_features + + # self.input_layer = [nn.Linear(in_features, d_model, bias=False)] + self.input_layer = nn.Linear(in_features, d_model, bias=False) + + self.use_positional_decoder = use_positional_decoder + if use_positional_decoder: + self.pos_encoding = PositionalEncoding(d_model, dropout_pd) + + self.use_layer_norm_output = use_layer_norm_output + + if use_layer_norm_output: + norm = nn.LayerNorm(d_model, eps=layer_norm_eps_output) + else: + norm = None + self.transformer_decoder_layers = nn.TransformerDecoder(decoder_layer=transformer_decoder_layers, + num_layers=num_layers, + norm=norm) + self.tgt_mask = nn.Transformer.generate_square_subsequent_mask(n_prediction_steps) + + def forward(self, + x_future: torch.Tensor, + encoder_output: torch.Tensor, + pos_idx: Optional[Tuple[int]] = None) -> torch.Tensor: + output = self.input_layer(x_future) + if self.use_positional_decoder: + output = self.pos_encoding(output, pos_idx) + if self.training: + output = self.transformer_decoder_layers(output, encoder_output, + tgt_mask=self.tgt_mask.to(encoder_output.device)) + else: + output = self.transformer_decoder_layers(output, encoder_output) + return output + + +class ForecastingTransformerDecoder(BaseForecastingDecoder): + """ + Standard searchable Transformer decoder for time series data, only works when the encoder is a + Transformer Encoder + """ + def __init__(self, **kwargs: Any): + super().__init__(**kwargs) + # RNN is naturally auto-regressive. 
However, we will not consider it as a decoder for deep AR model + self.transformer_encoder_kwargs: Optional[dict] = None + self.lagged_value = [1, 2, 3, 4, 5, 6, 7] + self.add_fit_requirements([FitRequirement('transformer_encoder_kwargs', (Dict,), user_defined=False, + dataset_property=False)]) + + def _build_decoder(self, + encoder_output_shape: Tuple[int, ...], + future_variable_input: Tuple[int, ...], + n_prediction_heads: int, + dataset_properties: Dict) -> Tuple[nn.Module, int]: + assert self.transformer_encoder_kwargs is not None + d_model = 2 ** self.transformer_encoder_kwargs['d_model_log'] + transformer_decoder_layers = build_transformer_layers(d_model=d_model, config=self.config, layer_type='decoder') + n_prediction_steps = dataset_properties['n_prediction_steps'] + + decoder = _TransformerDecoder(in_features=future_variable_input[-1], + d_model=d_model, + num_layers=self.config['num_layers'], + transformer_decoder_layers=transformer_decoder_layers, + use_positional_decoder=self.config['use_positional_decoder'], + use_layer_norm_output=self.config['use_layer_norm_output'], + dropout_pd=self.config.get('dropout_positional_decoder', 0.0), + layer_norm_eps_output=self.config.get('layer_norm_eps_output', None), + n_prediction_steps=n_prediction_steps, + lagged_value=self.lagged_value) + + return decoder, d_model + + @staticmethod + def decoder_properties() -> DecoderProperties: + return DecoderProperties(recurrent=True, + lagged_input=True) + + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: + self.transformer_encoder_kwargs = X['transformer_encoder_kwargs'] + if 'lagged_value' in X['dataset_properties']: + self.lagged_value = X['dataset_properties']['lagged_value'] + return super().fit(X, y) + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'TransformerDecoder', + 'name': 'TransformerDecoder', + 'handles_tabular': False, + 'handles_image': False, + 'handles_time_series': True, + } + + @property + def fitted_encoder(self) -> List[str]: + return ['TransformerEncoder'] + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict] = None, + num_layers: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter='num_layers', + value_range=(1, 4), + default_value=1), + n_head_log: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter='n_head_log', + value_range=(1, 4), + default_value=3), + d_feed_forward_log: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter='d_feed_forward_log', + value_range=(6, 12), + default_value=7), + norm_first: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter="norm_first", + value_range=(True, False), + default_value=True), + layer_norm_eps: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter='layer_norm_eps', + value_range=(1e-7, 1e-3), + default_value=1e-5, + log=True), + use_positional_decoder: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter='use_positional_decoder', + value_range=(True, False), + default_value=True), + use_layer_norm_output: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter='use_layer_norm_output', + value_range=(True, False), + default_value=True), + activation: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter="activation", + value_range=('relu', 'gelu'), + default_value='relu', + ), + use_dropout: 
HyperparameterSearchSpace =
+            HyperparameterSearchSpace(hyperparameter="use_dropout",
+                                      value_range=(True, False),
+                                      default_value=False,
+                                      ),
+            dropout: HyperparameterSearchSpace =
+            HyperparameterSearchSpace(hyperparameter="dropout",
+                                      value_range=(0, 0.1),
+                                      default_value=0.1,
+                                      ),
+    ) -> ConfigurationSpace:
+        """
+        Get the hyperparameter search space for the Transformer decoder. Given that d_model must be divisible
+        by the number of attention heads, we search over their log values (with base 2) as hyperparameters.
+
+        Args:
+            num_layers (int):
+                number of transformer layers
+            n_head_log (int):
+                log value of the number of attention heads (as for all the following hyperparameters with a
+                _log suffix, the log is taken with base 2)
+            d_feed_forward_log (int):
+                log value of the feed-forward network width
+            norm_first (bool):
+                if ``True``, layer norm is applied prior to the attention and feed-forward operations,
+                respectively; otherwise it is applied afterwards. The default here is ``True``.
+            layer_norm_eps (float):
+                eps for layer norm
+            use_positional_decoder (bool):
+                whether positional encoding is applied to the decoder input
+            use_layer_norm_output (bool):
+                whether layer norm is applied to the output
+            activation (str):
+                activation function type
+            use_dropout (bool):
+                whether dropout is applied
+            dropout (float):
+                dropout rate
+
+        Returns:
+            ConfigurationSpace:
+                configuration space
+        """
+        cs = CS.ConfigurationSpace()
+
+        add_hyperparameter(cs, activation, CategoricalHyperparameter)
+        add_hyperparameter(cs, norm_first, CategoricalHyperparameter)
+
+        min_transformer_layers, max_transformer_layers = num_layers.value_range
+
+        num_layers = get_hyperparameter(num_layers, UniformIntegerHyperparameter)
+        use_dropout = get_hyperparameter(use_dropout, CategoricalHyperparameter)
+
+        # We can have dropout in the network for
+        # better generalization
+        use_positional_decoder = get_hyperparameter(use_positional_decoder, CategoricalHyperparameter)
+
+        dropout_pd = HyperparameterSearchSpace(hyperparameter='dropout_positional_decoder',
+                                               value_range=dropout.value_range,
+                                               default_value=dropout.default_value,
+                                               log=dropout.log)
+        dropout_pd = get_hyperparameter(dropout_pd, UniformFloatHyperparameter)
+
+        cs.add_hyperparameters([num_layers, use_dropout, use_positional_decoder, dropout_pd])
+        cs.add_condition(CS.AndConjunction(
+            CS.EqualsCondition(dropout_pd, use_dropout, True),
+            CS.EqualsCondition(dropout_pd, use_positional_decoder, True)
+        ))
+
+        add_hyperparameter(cs, n_head_log, UniformIntegerHyperparameter)
+        add_hyperparameter(cs, d_feed_forward_log, UniformIntegerHyperparameter)
+        add_hyperparameter(cs, layer_norm_eps, UniformFloatHyperparameter)
+
+        dropout = get_hyperparameter(dropout, UniformFloatHyperparameter)
+        cs.add_hyperparameter(dropout)
+        cs.add_condition(CS.EqualsCondition(dropout, use_dropout, True))
+
+        use_layer_norm_output = get_hyperparameter(use_layer_norm_output, CategoricalHyperparameter)
+        layer_norm_eps_output = HyperparameterSearchSpace(hyperparameter='layer_norm_eps_output',
+                                                          value_range=layer_norm_eps.value_range,
+                                                          default_value=layer_norm_eps.default_value,
+                                                          log=layer_norm_eps.log)
+        layer_norm_eps_output = get_hyperparameter(layer_norm_eps_output, UniformFloatHyperparameter)
+
+        cs.add_hyperparameters([use_layer_norm_output, layer_norm_eps_output])
+        cs.add_condition(CS.EqualsCondition(layer_norm_eps_output, use_layer_norm_output, True))
+
+        return cs
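
A minimal usage sketch (an editorial aside, not part of this diff), assuming autoPyTorch with this new module is importable: the ConfigurationSpace built above can be sampled directly with ConfigSpace, and the sampled values become the component's config. The name decoder_component is a hypothetical placeholder.

from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.\
    forecasting_decoder.TransformerDecoder import ForecastingTransformerDecoder

# Build the search space defined above and draw a random configuration from it
cs = ForecastingTransformerDecoder.get_hyperparameter_search_space()
config = cs.sample_configuration()

# A sketch only: inside the pipeline, set_hyperparameters performs this with
# prefix handling; here the sampled values simply become self.config
decoder_component = ForecastingTransformerDecoder(**config.get_dictionary())
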
diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/__init__.py new file mode 100644 index 000000000..62fb78240 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/__init__.py @@ -0,0 +1,17 @@
+import os
+
+from autoPyTorch.pipeline.components.base_component import (
+    ThirdPartyComponents, find_components)
+from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.\
+    base_forecasting_decoder import BaseForecastingDecoder
+
+directory = os.path.split(__file__)[0]
+decoders = find_components(__package__,
+                           directory,
+                           BaseForecastingDecoder)
+
+decoder_addons = ThirdPartyComponents(BaseForecastingDecoder)
+
+
+def add_decoder(decoder: BaseForecastingDecoder) -> None:
+    decoder_addons.add_component(decoder)
diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py new file mode 100644 index 000000000..8d816a413 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py @@ -0,0 +1,220 @@
+from abc import abstractmethod
+from collections import OrderedDict
+from typing import Any, Dict, Iterable, List, Optional, Tuple
+
+from torch import nn
+
+from autoPyTorch.pipeline.components.base_component import (
+    BaseEstimator, autoPyTorchComponent)
+from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import \
+    NetworkStructure
+from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.components import (
+    DecoderBlockInfo, DecoderProperties)
+from autoPyTorch.utils.common import FitRequirement
+
+
+class BaseForecastingDecoder(autoPyTorchComponent):
+    """
+    Base class for network decoders used for forecasting. Holds the decoder module and the config which was used
+    to create it.
+ """ + _required_properties = ["name", "shortname", "handles_tabular", "handles_image", "handles_time_series"] + + def __init__(self, + block_number: int = 1, + auto_regressive: bool = False, + **kwargs: Any): + super().__init__() + self.block_number = block_number + self.add_fit_requirements([ + FitRequirement('known_future_features', (tuple,), user_defined=False, dataset_property=True), + FitRequirement('feature_shapes', (Dict,), user_defined=False, dataset_property=True), + FitRequirement('network_encoder', (OrderedDict,), user_defined=False, dataset_property=False), + FitRequirement('n_prediction_steps', (int,), user_defined=False, dataset_property=True), + FitRequirement('network_structure', (NetworkStructure,), user_defined=False, dataset_property=False), + FitRequirement('transform_time_features', (bool,), user_defined=False, dataset_property=False), + FitRequirement('time_feature_transform', (Iterable,), user_defined=False, dataset_property=True) + ]) + self.auto_regressive = auto_regressive + self.config = kwargs + self.decoder: Optional[nn.Module] = None + self.n_decoder_output_features: Optional[int] = None + self.decoder_input_shape: Optional[Tuple[int, ...]] = None + self.n_prediction_heads = 1 + self.is_last_decoder: Optional[bool] = False + + @property + def fitted_encoder(self) -> List[str]: + return [] + + @staticmethod + def decoder_properties() -> DecoderProperties: + return DecoderProperties() + + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: + """ + Builds the head component and assigns it to self.decoder + + Args: + X (X: Dict[str, Any]): + Dependencies needed by current component to perform fit + y (Any): + not used. To comply with sklearn API + Returns: + Self + """ + self.check_requirements(X, y) + output_shape = X['dataset_properties']['output_shape'] + + encoder_output_shape = X['network_encoder'][f'block_{self.block_number}'].encoder_output_shape + + auto_regressive = self.auto_regressive + + if auto_regressive: + self.n_prediction_heads = 1 + else: + self.n_prediction_heads = X['dataset_properties']['n_prediction_steps'] + + network_structure = X['network_structure'] + variable_selection = network_structure.variable_selection + + if 'n_decoder_output_features' not in X: + future_features = X['dataset_properties']['known_future_features'] + feature_shapes = X['dataset_properties']['feature_shapes'] + + future_in_features = sum([feature_shapes[fu_feat] for fu_feat in future_features]) + + if X['transform_time_features']: + n_time_feature_transform = len(X['dataset_properties']['time_feature_transform']) + else: + n_time_feature_transform = 0 + + if variable_selection: + future_in_features = X['network_encoder']['block_1'].encoder_output_shape[-1] + else: + if auto_regressive: + if self.decoder_properties().lagged_input and hasattr(self, 'lagged_value'): + future_in_features += len(self.lagged_value) * output_shape[-1] + elif self.decoder_properties().recurrent: + future_in_features += output_shape[-1] + future_in_features += n_time_feature_transform + future_variable_input = (self.n_prediction_heads, future_in_features) + else: + future_variable_input = (self.n_prediction_heads, X['n_decoder_output_features']) + + if self.block_number == network_structure.num_blocks: + self.is_last_decoder = True + + # TODO consider decoder auto regressive and fill in decoder part + + self.decoder, self.n_decoder_output_features = self.build_decoder( + encoder_output_shape=encoder_output_shape, + future_variable_input=future_variable_input, + 
n_prediction_heads=self.n_prediction_heads, + dataset_properties=X['dataset_properties'] + ) + + self.decoder_input_shape = future_variable_input + + return self + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + """ + Adds the network head into the fit dictionary 'X' and returns it. + + Args: + X (Dict[str, Any]): + 'X' dictionary + Returns: + (Dict[str, Any]): + the updated 'X' dictionary + """ + # 'auto_regressive' needs to be the same across all the decoders, + # 'n_prediction_heads' and 'n_decoder_output_features' are only applied to the head such that they could be + # overwritten by the following decoders + network_decoder = X.get('network_decoder', OrderedDict()) + assert self.decoder_input_shape is not None + assert self.n_decoder_output_features is not None + network_decoder[f'block_{self.block_number}'] = DecoderBlockInfo( + decoder=self.decoder, + decoder_properties=self.decoder_properties(), + decoder_input_shape=self.decoder_input_shape, + decoder_output_shape=(self.n_prediction_heads, self.n_decoder_output_features) + ) + if self.is_last_decoder: + X.update({'network_decoder': network_decoder, + 'n_prediction_heads': self.n_prediction_heads, + 'n_decoder_output_features': self.n_decoder_output_features, + 'auto_regressive': self.auto_regressive}) + else: + X.update({'network_decoder': network_decoder, + 'n_decoder_output_features': self.n_decoder_output_features, + }) + + return X + + def build_decoder(self, + encoder_output_shape: Tuple[int, ...], + future_variable_input: Tuple[int, ...], + n_prediction_heads: int, + dataset_properties: Dict) -> Tuple[nn.Module, int]: + """ + Builds the head module and returns it + + Args: + encoder_output_shape (Tuple[int, ...]): + shape of the input to the decoder, this value is the encoder output + future_variable_input (Tuple[int, ...]): + shape of the known future input values + n_prediction_heads (int): + how many prediction heads the network has, used for final forecasting heads + dataset_properties (Dict): + dataset properties + Returns: + nn.Module: + head module + """ + decoder, n_decoder_features = self._build_decoder(encoder_output_shape, future_variable_input, + n_prediction_heads, dataset_properties) + return decoder, int(n_decoder_features) + + @abstractmethod + def _build_decoder(self, + encoder_output_shape: Tuple[int, ...], + future_variable_input: Tuple[int, ...], + n_prediction_heads: int, + dataset_properties: Dict) -> Tuple[nn.Module, int]: + """ + Builds the head module and returns it + + Args: + encoder_output_shape (Tuple[int, ...]): + shape of the input to the decoder, this value is the encoder output + future_variable_input (Tuple[int, ...]): + shape of the known future input values + n_prediction_heads (int): + how many prediction heads the network has, used for final forecasting heads + dataset_properties (Dict): + dataset properties + + Returns: + decoder (nn.Module): + decoder module + n_decoder_features (int): + output of decoder features, used for initialize network head. 
+ """ + raise NotImplementedError() + + @classmethod + def get_name(cls) -> str: + """ + Get the name of the decoder + + Args: + None + + Returns: + str: + Name of the decoder + """ + return str(cls.get_properties()["shortname"]) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/components.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/components.py new file mode 100644 index 000000000..5cb9d8ff2 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/components.py @@ -0,0 +1,69 @@ +from typing import NamedTuple, Optional, Tuple + +import torch +from torch import nn + + +class DecoderProperties(NamedTuple): + """ + Decoder properties + + Args: + has_hidden_states (bool): + if the decoder has hidden states. A decoder with hidden states might have additional output and requires + additional inputs + has_local_layer (bool): + if the decoder has local layer, in which case the output is also a 3D sequential feature + recurrent (bool): + if the decoder is recurrent. This determines if decoders can be auto-regressive + lagged_input (bool): + if the decoder accepts past targets as additional features + multi_blocks (bool): + If the decoder is stacked by multiple blocks (only for N-BEATS) + """ + has_hidden_states: bool = False + has_local_layer: bool = True + recurrent: bool = False + lagged_input: bool = False + multi_blocks: bool = False + + +class DecoderBlockInfo(NamedTuple): + """ + Decoder block infos + + Args: + decoder (nn.Module): + decoder network + decoder_properties (EncoderProperties): + decoder properties + decoder_output_shape (Tuple[int, ...]): + output shape that the decoder ought to output + + decoder_input_shape (Tuple[int, ...]): + requried input shape of the decoder + + """ + decoder: nn.Module + decoder_properties: DecoderProperties + decoder_output_shape: Tuple[int, ...] + decoder_input_shape: Tuple[int, ...] 
+
+
+class DecoderNetwork(nn.Module):
+    def forward(self, x_future: torch.Tensor,
+                encoder_output: torch.Tensor,
+                pos_idx: Optional[Tuple[int]] = None) -> torch.Tensor:
+        """
+        Base forecasting decoder network; its output needs to be a 3-d Tensor.
+
+        Args:
+            x_future: torch.Tensor(B, L_future, N_out), the future features
+            encoder_output: torch.Tensor(B, L_encoder, N), output of the encoder network, or the hidden states
+            pos_idx: positional index, indicating the position of the forecasted tensor, used for transformer
+
+        Returns:
+            net_output: torch.Tensor of shape (B, L_future, N)
+        """
+        raise NotImplementedError
diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py new file mode 100644 index 000000000..e4b905fee --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py @@ -0,0 +1,394 @@
+import os
+import warnings
+from abc import abstractmethod
+from collections import OrderedDict
+from typing import Any, Callable, Dict, List, Optional, Type, Union
+
+import ConfigSpace.hyperparameters as CSH
+from ConfigSpace.conditions import EqualsCondition, OrConjunction
+from ConfigSpace.configuration_space import Configuration, ConfigurationSpace
+
+from sklearn.pipeline import Pipeline
+
+from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
+from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice
+from autoPyTorch.pipeline.components.base_component import (
+    ThirdPartyComponents,
+    autoPyTorchComponent,
+    find_components
+)
+from autoPyTorch.pipeline.components.setup.network_backbone import NetworkBackboneChoice
+from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import \
+    ForecastingNetworkStructure
+from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder import (
+    decoder_addons, decoders)
+from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.\
+    base_forecasting_decoder import BaseForecastingDecoder
+from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.\
+    base_forecasting_encoder import BaseForecastingEncoder
+
+directory = os.path.split(__file__)[0]
+_encoders = find_components(__package__,
+                            directory,
+                            BaseForecastingEncoder)
+_addons = ThirdPartyComponents(BaseForecastingEncoder)
+
+
+class AbstractForecastingEncoderChoice(autoPyTorchChoice):
+    """
+    A network is composed of an encoder and a decoder. In most cases, the choice of decoder depends heavily on
+    the choice of encoder. Thus, "choice" here refers to the choice of encoder; the decoder is then determined
+    by the encoder.
+ """ + + def __init__(self, + **kwargs: Any, + ): + super().__init__(**kwargs) + self.pipeline: Optional[Pipeline] = None + self.decoder_choice: Optional[List[BaseForecastingDecoder]] = None + + @abstractmethod + def get_components(self) -> Dict[str, Type[autoPyTorchComponent]]: # type: ignore[override] + """Returns the available backbone components + + Args: + None + + Returns: + Dict[str, autoPyTorchComponent]: all basebackbone components available + as choices for learning rate scheduling + """ + raise NotImplementedError + + def get_decoder_components(self) -> Dict[str, Type[autoPyTorchComponent]]: + components = OrderedDict() + components.update(decoders) + components.update(decoder_addons.components) + return components + + @property + def additional_components(self) -> List[Callable]: + # This function is deigned to add additional components rather than the components in __choice__ + return [self.get_decoder_components] + + def get_available_components( # type: ignore[override] + self, + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + include: List[str] = None, + exclude: List[str] = None, + components: Optional[Dict[str, Type[autoPyTorchComponent]]] = None + ) -> Dict[str, Type[autoPyTorchComponent]]: + """Filters out components based on user provided + include/exclude directives, as well as the dataset properties + + Args: + include (Optional[Dict[str, Any]]): what hyper-parameter configurations + to honor when creating the configuration space + exclude (Optional[Dict[str, Any]]): what hyper-parameter configurations + to remove from the configuration space + dataset_properties (Optional[Dict[str, Union[str, int]]]): Caracteristics + of the dataset to guide the pipeline choices of components + components (Optional[Dict[str, Type[autoPyTorchComponent]]]): components + + Returns: + Dict[str, autoPyTorchComponent]: A filtered dict of learning + rate backbones + + """ + if dataset_properties is None: + dataset_properties = {} + + if include is not None and exclude is not None: + raise ValueError( + "The argument include and exclude cannot be used together.") + + if components is None: + available_comp = self.get_components() + else: + available_comp = components + + if include is not None: + for incl in include: + if incl not in available_comp: + warnings.warn("Trying to include unknown component: ""%s" % incl) + + components_dict = OrderedDict() + for name in available_comp: + if include is not None and name not in include: + continue + elif exclude is not None and name in exclude: + continue + + entry = available_comp[name] + + # Exclude itself to avoid infinite loop + if entry == NetworkBackboneChoice or hasattr(entry, 'get_components'): + continue + + task_type = str(dataset_properties['task_type']) + properties = entry.get_properties() + if 'tabular' in task_type and not bool(properties['handles_tabular']): + continue + elif 'image' in task_type and not bool(properties['handles_image']): + continue + elif 'time_series' in task_type and not bool(properties['handles_time_series']): + continue + + # target_type = dataset_properties['target_type'] + # Apply some automatic filtering here for + # backbones based on the dataset! 
+ # TODO: Think if there is any case where a backbone + # is not recommended for a certain dataset + + components_dict[name] = entry + + return components_dict + + def get_hyperparameter_search_space( + self, + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + default: Optional[str] = None, + include: Optional[List[str]] = None, + exclude: Optional[List[str]] = None, + ) -> ConfigurationSpace: + """Returns the configuration space of the current chosen components + + Args: + dataset_properties (Optional[Dict[str, str]]): + Describes the dataset to work on + default (Optional[str]): + Default encoder to use + include: Optional[Dict[str, Any]]: + what components to include. It is an exhaustive list, and will exclusively use this components. It + allows nested encoder such as flat_encoder:MLPEncoder + exclude: Optional[Dict[str, Any]]: + which components to skip. It allows nested encoder as such flat_encoder:MLPEncoder + + Returns: + ConfigurationSpace: the configuration space of the hyper-parameters of the + chosen component + """ + cs = ConfigurationSpace() + + if dataset_properties is None: + dataset_properties = {} + + # Compile a list of legal components for this problem + available_encoders = self.get_available_components( + dataset_properties=dataset_properties, + include=include, exclude=exclude) + + available_decoders = self.get_available_components( + dataset_properties=dataset_properties, + include=None, exclude=None, + components=self.get_decoder_components()) + + if len(available_encoders) == 0: + raise ValueError("No Encoder found") + if len(available_decoders) == 0: + raise ValueError("No Decoder found") + + if default is None: + defaults = self._defaults_network + for default_ in defaults: + if default_ in available_encoders: + default = default_ + break + updates = self._get_search_space_updates() + if '__choice__' in updates.keys(): + choice_hyperparameter = updates['__choice__'] + if not set(choice_hyperparameter.value_range).issubset(available_encoders): + raise ValueError("Expected given update for {} to have " + "choices in {} got {}".format(self.__class__.__name__, + available_encoders, + choice_hyperparameter.value_range)) + hp_encoder = CSH.CategoricalHyperparameter('__choice__', + choice_hyperparameter.value_range, + default_value=choice_hyperparameter.default_value) + else: + hp_encoder = CSH.CategoricalHyperparameter( + '__choice__', + list(available_encoders.keys()), + default_value=default + ) + cs.add_hyperparameter(hp_encoder) + + decoder2encoder: Dict[str, List[str]] = {key: [] for key in available_decoders.keys()} + encoder2decoder: Dict[str, List[str]] = {} + for encoder_name in hp_encoder.choices: + updates = self._get_search_space_updates(prefix=encoder_name) + config_space = available_encoders[encoder_name].get_hyperparameter_search_space( # type: ignore[call-arg] + dataset_properties, + **updates # type: ignore[call-arg] + ) + parent_hyperparameter = {'parent': hp_encoder, 'value': encoder_name} + cs.add_configuration_space( + encoder_name, + config_space, + parent_hyperparameter=parent_hyperparameter + ) + + allowed_decoders = available_encoders[encoder_name].allowed_decoders() + if len(allowed_decoders) > 1: + if 'decoder_type' not in config_space: + raise ValueError('When a specific encoder has more than one allowed decoder, its ConfigSpace' + 'must contain the hyperparameter "decoder_type" ! 
Please check your encoder ' + 'setting!') + hp_decoder_choice = config_space.get_hyperparameter('decoder_type').choices + if not set(hp_decoder_choice).issubset(allowed_decoders): + raise ValueError('The encoder hyperparameter decoder_type must be a subset of the allowed_decoders') + allowed_decoders = hp_decoder_choice + for decoder_name in allowed_decoders: + decoder2encoder[decoder_name].append(encoder_name) + encoder2decoder[encoder_name] = allowed_decoders + + for decoder_name in available_decoders.keys(): + if not decoder2encoder[decoder_name]: + continue + updates = self._get_search_space_updates(prefix=decoder_name) + config_space = available_decoders[decoder_name].get_hyperparameter_search_space( # type: ignore[call-arg] + dataset_properties, + **updates # type: ignore[call-arg] + ) + compatible_encoders = decoder2encoder[decoder_name] + encoders_with_multi_decoder = [] + encoder_with_uni_decoder = [] + + for encoder in compatible_encoders: + if len(encoder2decoder[encoder]) > 1: + encoders_with_multi_decoder.append(encoder) + else: + encoder_with_uni_decoder.append(encoder) + + cs.add_configuration_space( + decoder_name, + config_space, + # parent_hyperparameter=parent_hyperparameter + ) + hps = cs.get_hyperparameters() # type: List[CSH.Hyperparameter] + conditions_to_add = [] + for hp in hps: + # TODO consider if this will raise any unexpected behavior + if hp.name.startswith(decoder_name): + # From the implementation of ConfigSpace + # Only add a condition if the parameter is a top-level + # parameter of the new configuration space (this will be some + # kind of tree structure). + if cs.get_parents_of(hp): + continue + or_cond = [] + for encoder_uni in encoder_with_uni_decoder: + or_cond.append(EqualsCondition(hp, + hp_encoder, + encoder_uni)) + for encode_multi in encoders_with_multi_decoder: + hp_decoder_type = cs.get_hyperparameter(f'{encode_multi}:decoder_type') + or_cond.append(EqualsCondition(hp, + hp_decoder_type, + decoder_name)) + if len(or_cond) == 0: + continue + elif len(or_cond) > 1: + conditions_to_add.append(OrConjunction(*or_cond)) + else: + conditions_to_add.append(or_cond[0]) + cs.add_conditions(conditions_to_add) + self.configuration_space_ = cs + self.dataset_properties_ = dataset_properties + return cs + + def set_hyperparameters(self, + configuration: Configuration, + init_params: Optional[Dict[str, Any]] = None + ) -> 'autoPyTorchChoice': + """ + Applies a configuration to the given component. + This method translate a hierarchical configuration key, + to an actual parameter of the autoPyTorch component. 
+ + Args: + configuration (Configuration): + Which configuration to apply to the chosen component + init_params (Optional[Dict[str, any]]): + Optional arguments to initialize the chosen component + + Returns: + self: returns an instance of self + """ + new_params = {} + + params = configuration.get_dictionary() + choice = params.pop('__choice__') + + for param, value in params.items(): + param = param.replace(choice + ':', '') + new_params[param] = value + + if init_params is not None: + for param, value in init_params.items(): + param = param.replace(choice + ':', '') + new_params[param] = value + + decoder_components = self.get_decoder_components() + + decoder_type: Optional[str] = None + + decoder_params = {} + decoder_params_names = [] + for param, value in new_params.items(): + if decoder_type is None: + for decoder_component in decoder_components.keys(): + if param.startswith(decoder_component): + decoder_type = decoder_component + decoder_params_names.append(param) + param = param.replace(decoder_type + ':', '') + decoder_params[param] = value + else: + if param.startswith(decoder_type): + decoder_params_names.append(param) + param = param.replace(decoder_type + ':', '') + decoder_params[param] = value + assert decoder_type is not None, 'Decoder must be given to initialize a network backbone' + + for param_name in decoder_params_names: + del new_params[param_name] + + new_params['random_state'] = self.random_state + decoder_params['random_state'] = self.random_state + + self.new_params = new_params + self.choice = self.get_components()[choice](**new_params) + self.decoder_choice = decoder_components[decoder_type](**decoder_params) + + self.pipeline = Pipeline([('net_structure', ForecastingNetworkStructure()), + ('encoder', self.choice), + ('decoder', self.decoder_choice)]) + return self + + @property + def _defaults_network(self) -> List[str]: + return ['MLPEncoder', 'RNNEncoder', 'NBEATSEncoder'] + + def fit(self, X: Dict[str, Any], y: Any = None) -> Pipeline: # type: ignore[override] + """Handy method to check if a component is fitted + + Args: + X (X: Dict[str, Any]): + Dependencies needed by current component to perform fit + y (Any): + not used. 
To comply with sklearn API + """ + # Allows to use check_is_fitted on the choice object + self.fitted_ = True + assert self.pipeline is not None, "Cannot call fit without initializing the component" + return self.pipeline.fit(X, y) + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + assert self.pipeline is not None, "Cannot call transform before the object is initialized" + return self.pipeline.transform(X) # type: ignore[no-any-return] + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + raise NotImplementedError diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py new file mode 100644 index 000000000..a82db4f95 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py @@ -0,0 +1,161 @@ +from abc import abstractmethod +from collections import OrderedDict +from typing import Any, Dict, Iterable, List, Optional, Tuple + +import numpy as np + +from torch import nn + +import torchvision + +from autoPyTorch.pipeline.components.base_component import BaseEstimator, autoPyTorchComponent +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import NetworkStructure +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.components import ( + EncoderBlockInfo, + EncoderProperties +) +from autoPyTorch.pipeline.components.setup.network_backbone.utils import get_output_shape +from autoPyTorch.utils.common import FitRequirement + + +class BaseForecastingEncoder(autoPyTorchComponent): + """ + Base class for network backbones. Holds the encoder module and the config which was used to create it. 
+ """ + _required_properties = ["name", "shortname", "handles_tabular", "handles_image", "handles_time_series"] + + def __init__(self, + block_number: int = 1, + **kwargs: Any): + autoPyTorchComponent.__init__(self) + self.add_fit_requirements( + self._required_fit_arguments + ) + self.encoder: nn.Module = None + self.config = kwargs + self.input_shape: Optional[Tuple[int, ...]] = None + self.block_number = block_number + self.encoder_output_shape: Optional[Tuple[int, ...]] = None + + @property + def _required_fit_arguments(self) -> List[FitRequirement]: + return [ + FitRequirement('is_small_preprocess', (bool,), user_defined=True, dataset_property=True), + FitRequirement('uni_variant', (bool,), user_defined=False, dataset_property=True), + FitRequirement('input_shape', (Iterable,), user_defined=True, dataset_property=True), + FitRequirement('output_shape', (Iterable,), user_defined=True, dataset_property=True), + FitRequirement('network_structure', (NetworkStructure,), user_defined=False, dataset_property=False), + FitRequirement('transform_time_features', (bool,), user_defined=False, dataset_property=False), + FitRequirement('time_feature_transform', (Iterable,), user_defined=False, dataset_property=True), + FitRequirement('network_embedding', (nn.Module, ), user_defined=False, dataset_property=False), + FitRequirement('window_size', (int,), user_defined=False, dataset_property=False) + ] + + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: + self.check_requirements(X, y) + + input_shape = (*X["dataset_properties"]['input_shape'][:-1], 0) + output_shape = X["dataset_properties"]['output_shape'] + + if self.block_number == 1: + if not X["dataset_properties"]["uni_variant"]: + X_train = X.get('X_train', None) + if X_train is None: + raise ValueError('Non uni_variant dataset must contain X_train!') + + if X["dataset_properties"]["is_small_preprocess"]: + input_shape = X_train.shape[1:] + else: + # get input shape by transforming first two elements of the training set + transforms = torchvision.transforms.Compose(X['preprocess_transforms']) + X_train = X_train.values[:1, np.newaxis, ...] 
+ X_train = transforms(X_train) + input_shape = np.concatenate(X_train).shape[1:] + + if X['transform_time_features']: + n_time_feature_transform = len(X['dataset_properties']['time_feature_transform']) + else: + n_time_feature_transform = 0 + + input_shape = (*input_shape[:-1], input_shape[-1] + n_time_feature_transform) + + if 'network_embedding' in X.keys(): + input_shape = get_output_shape(X['network_embedding'], input_shape=input_shape) + + variable_selection = X['network_structure'].variable_selection + if variable_selection: + in_features = self.n_encoder_output_feature() + elif self.encoder_properties().lagged_input and hasattr(self, 'lagged_value'): + in_features = len(self.lagged_value) * output_shape[-1] + input_shape[-1] + else: + in_features = output_shape[-1] + input_shape[-1] + + input_shape = (X['window_size'], in_features) + else: + if 'network_encoder' not in X or f'block_{self.block_number -1}' not in X['network_encoder']: + raise ValueError('Lower block layers must be fitted and transformed first!') + network_block_info = X['network_encoder'][f'block_{self.block_number -1}'] + input_shape = network_block_info.encoder_output_shape + + self.encoder = self.build_encoder( + input_shape=input_shape, + ) + + self.input_shape = input_shape + + has_hidden_states = self.encoder_properties().has_hidden_states + self.encoder_output_shape = get_output_shape(self.encoder, input_shape, has_hidden_states) + if self.n_encoder_output_feature() != self.encoder_output_shape[-1]: + raise ValueError(f'n_encoder_output_feature ({ self.n_encoder_output_feature()}) ' + f'must equal to the output dimension f({self.encoder_output_shape})') + return self + + @staticmethod + def allowed_decoders() -> List[str]: + raise NotImplementedError + + @abstractmethod + def n_encoder_output_feature(self) -> int: + # We need this to compute the output of the variable selection network + raise NotImplementedError + + def n_hidden_states(self) -> int: + return 0 + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + X['dataset_properties'].update({'input_shape': self.input_shape}) + network_encoder = X.get('network_encoder', OrderedDict()) + assert self.input_shape is not None + assert self.encoder_output_shape is not None + network_encoder[f'block_{self.block_number}'] = EncoderBlockInfo(encoder=self.encoder, + encoder_properties=self.encoder_properties(), + encoder_input_shape=self.input_shape, + encoder_output_shape=self.encoder_output_shape, + n_hidden_states=self.n_hidden_states()) + + X.update({'network_encoder': network_encoder}) + return X + + @abstractmethod + def build_encoder(self, + input_shape: Tuple[int, ...]) -> nn.Module: + """ + Builds the backbone module and returns it + + Args: + input_shape (Tuple[int, ...]): + input feature shape + + Returns: + nn.Module: backbone module + """ + pass + + @staticmethod + def encoder_properties() -> EncoderProperties: + """ + Encoder properties, this determines how the data flows over the forecasting networks + + """ + encoder_properties = EncoderProperties() + return encoder_properties diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/components.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/components.py new file mode 100644 index 000000000..f3286f827 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/components.py @@ -0,0 +1,99 @@ +from enum import Enum +from typing 
import NamedTuple, Tuple + +import torch +from torch import nn + + +class EncoderProperties(NamedTuple): + """ + Encoder properties + + Args: + has_hidden_states (bool): + if the encoder has hidden states. An encoder with hidden states might have additional output + bijective_seq_output (bool): + if the encoder's output sequence has the same length as its input sequence's length + fixed_input_seq_length (bool): + if the encoder requries a fixed length of input (for instance, MLP) + lagged_input (bool): + if the encoder accepts past targets as additional features + is_casual (bool): + If the output of the encoder only depends on the past targets + """ + has_hidden_states: bool = False + bijective_seq_output: bool = True + fixed_input_seq_length: bool = False + lagged_input: bool = False + is_casual: bool = True + + +class EncoderBlockInfo(NamedTuple): + """ + Encoder block infos + + Args: + encoder (nn.Module): + encoder network + encoder_properties (EncoderProperties): + encoder properties + encoder_input_shape (Tuple[int, ...]): + requried input shape of the encoder + encoder_output_shape (Tuple[int, ...]): + output shape that the encoder ought to output + n_hidden_states (int): + number of hidden states + """ + encoder: nn.Module + encoder_properties: EncoderProperties + encoder_input_shape: Tuple[int, ...] + encoder_output_shape: Tuple[int, ...] + n_hidden_states: int + + +class EncoderNetwork(nn.Module): + def forward(self, + x: torch.Tensor, + output_seq: bool = False) -> torch.Tensor: + """ + Base forecasting network, its output needs to be a 2-d or 3-d Tensor: + When the decoder is an auto-regressive model, then it needs to output a 3-d Tensor, in which case, output_seq + needs to be set as True + When the decoder is a seq2seq model, the network needs to output a 2-d Tensor (B, N), in which case, + output_seq needs to be set as False + + Args: + x: torch.Tensor(B, L_in, N) + input data + output_seq (bool): if the network outputs a sequence tensor. If it is set True, + output will be a 3-d Tensor (B, L_out, N). L_out = L_in if encoder_properties['recurrent'] is True. + If this value is set as False, the network only returns the last item of the sequence. 
+ hx (Optional[torch.Tensor]): addational input to the network, this could be a hidden states or a sequence + from previous inputs + + Returns: + net_output: torch.Tensor with shape either (B, N) or (B, L_out, N) + + """ + raise NotImplementedError + + def get_last_seq_value(self, x: torch.Tensor) -> torch.Tensor: + """ + get the last value of the sequential output + Args: + x (torch.Tensor(B, L, N)): + a sequential value output by the network, usually this value needs to be fed to the decoder + (or a 2D tensor for a flat encoder) + Returns: + output (torch.Tensor(B, 1, M)): + last element of the sequential value (or a 2D tensor for flat encoder) + + """ + raise NotImplementedError + + +class EncoderOutputForm(Enum): + NoOutput = 0 + HiddenStates = 1 # RNN -> RNN + Sequence = 2 # Transformer -> Transformer + SequenceLast = 3 # RNN/TCN/Transformer -> MLP diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py new file mode 100644 index 000000000..10e67ff8a --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py @@ -0,0 +1,173 @@ +from typing import Any, Dict, List, Optional, Tuple, Union + +from ConfigSpace import ConfigurationSpace +from ConfigSpace.hyperparameters import CategoricalHyperparameter + +import torch +from torch import nn + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.base_component import BaseEstimator +from autoPyTorch.pipeline.components.setup.network_backbone.MLPBackbone import MLPBackbone +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.\ + base_forecasting_encoder import BaseForecastingEncoder, EncoderProperties +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.components import \ + EncoderNetwork +from autoPyTorch.pipeline.components.setup.network_backbone.utils import _activations +from autoPyTorch.utils.common import ( + FitRequirement, + HyperparameterSearchSpace, + add_hyperparameter +) + + +class TimeSeriesMLP(EncoderNetwork): + def __init__(self, + window_size: int, + network: Optional[nn.Module] = None + ): + """ + Transform the input features (B, T, N) to fit the requirement of MLP + Args: + window_size (int): T + fill_lower_resolution_seq: if sequence with lower resolution needs to be filled with 0 + (for multi-fidelity problems with resolution as fidelity) + """ + super().__init__() + self.window_size = window_size + self.network = network + + def forward(self, x: torch.Tensor, output_seq: bool = False) -> torch.Tensor: + """ + + Args: + x: torch.Tensor(B, L_in, N) + output_seq (bool), if the MLP outputs a squence, in which case, the input will be rolled to fit the size of + the network. 
For Instance if self.window_size = 3, and we obtain a squence with [1, 2, 3, 4, 5] + the input of this mlp is rolled as : + [[1, 2, 3] + [2, 3, 4] + [3, 4 ,5]] + + Returns: + + """ + if output_seq: + x = x.unfold(1, self.window_size, 1).transpose(-1, -2) + # x.shape = [B, L_in - self.window + 1, self.window, N] + else: + if x.shape[1] > self.window_size: + # we need to ensure that the input size fits the network shape + x = x[:, -self.window_size:] # x.shape = (B, self.window, N) + x = x.flatten(-2) + return x if self.network is None else self.network(x) + + def get_last_seq_value(self, x: torch.Tensor) -> torch.Tensor: + return x + + +class MLPEncoder(BaseForecastingEncoder, MLPBackbone): # type: ignore[misc] + _fixed_seq_length = True + window_size = 1 + + @staticmethod + def encoder_properties() -> EncoderProperties: + return EncoderProperties(bijective_seq_output=False, fixed_input_seq_length=True) + + @staticmethod + def allowed_decoders() -> List[str]: + """ + decoder that is compatible with the encoder + """ + return ['MLPDecoder'] + + @property + def _required_fit_arguments(self) -> List[FitRequirement]: + requirements_list = super()._required_fit_arguments + requirements_list.append(FitRequirement('window_size', (int,), user_defined=False, dataset_property=False)) + return requirements_list + + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: + self.window_size = X["window_size"] + # when resolution is smaller + return super().fit(X, y) + + def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: + in_features = input_shape[-1] + network = nn.Sequential(*self._build_backbone(in_features * self.window_size)) + return TimeSeriesMLP(window_size=self.window_size, + network=network) + + def n_encoder_output_feature(self) -> int: + # This function should never be called!! 
+ num_out_features: int = self.config["num_units_%d" % (self.config['num_groups'])] + return num_out_features + + def _add_layer(self, layers: List[nn.Module], in_features: int, out_features: int, + layer_id: int) -> None: + """ + Dynamically add a layer given the in->out specification + + Args: + layers (List[nn.Module]): The list where all modules are added + in_features (int): input dimensionality of the new layer + out_features (int): output dimensionality of the new layer + + """ + layers.append(nn.Linear(in_features, out_features)) + if self.config['normalization'] == 'BN': + layers.append(nn.BatchNorm1d(out_features)) + elif self.config['normalization'] == 'LN': + layers.append(nn.LayerNorm(out_features)) + layers.append(_activations[self.config["activation"]]()) + if self.config['use_dropout']: + layers.append(nn.Dropout(self.config["dropout_%d" % layer_id])) + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'TSMLPEncoder', + 'name': 'TimeSeriesMLPEncoder', + 'handles_tabular': False, + 'handles_image': False, + 'handles_time_series': True, + } + + @staticmethod + def get_hyperparameter_search_space( # type: ignore[override] + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + num_groups: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_groups", + value_range=(1, 5), + default_value=3, + ), + activation: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="activation", + value_range=tuple(_activations.keys()), + default_value=list(_activations.keys())[ + 0], + ), + use_dropout: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="use_dropout", + value_range=(True, False), + default_value=False, + ), + num_units: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_units", + value_range=(16, 1024), + default_value=64, + log=True + ), + normalization: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='normalization', + value_range=('BN', 'LN', 'NoNorm'), + default_value='BN'), + dropout: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="dropout", + value_range=(0, 0.8), + default_value=0.1, + ), + ) -> ConfigurationSpace: + cs = MLPBackbone.get_hyperparameter_search_space(dataset_properties=dataset_properties, # type: ignore + num_groups=num_groups, + activation=activation, + use_dropout=use_dropout, + num_units=num_units, + dropout=dropout) + add_hyperparameter(cs, normalization, CategoricalHyperparameter) + return cs diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/NBEATSEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/NBEATSEncoder.py new file mode 100644 index 000000000..b6abe9ffe --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/NBEATSEncoder.py @@ -0,0 +1,85 @@ +from typing import Any, Dict, List, Optional, Tuple, Union + +from ConfigSpace import ConfigurationSpace + +from torch import nn + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.base_component import BaseEstimator +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.\ + base_forecasting_encoder import 
BaseForecastingEncoder, EncoderProperties +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.flat_encoder.\ + MLPEncoder import TimeSeriesMLP +from autoPyTorch.pipeline.components.setup.network_backbone.utils import get_output_shape +from autoPyTorch.utils.common import FitRequirement + + +class NBEATSEncoder(BaseForecastingEncoder): + """ + Encoder for NBEATS-like network. It flatten the input sequence to fit the requirement of MLP, the main part is + implemented under decoder + """ + _fixed_seq_length = True + window_size = 1 + + @staticmethod + def encoder_properties() -> EncoderProperties: + return EncoderProperties(fixed_input_seq_length=True) + + @staticmethod + def allowed_decoders() -> List[str]: + """ + decoder that is compatible with the encoder + """ + return ['NBEATSDecoder'] + + @property + def _required_fit_arguments(self) -> List[FitRequirement]: + requirements_list = super()._required_fit_arguments + requirements_list.append(FitRequirement('window_size', (int,), user_defined=False, dataset_property=False)) + return requirements_list + + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: + self.check_requirements(X, y) + self.window_size = X["window_size"] + + # n-BEATS only requires targets as its input + # TODO add support for multi-variant + output_shape = X["dataset_properties"]['output_shape'] + + self.encoder = self.build_encoder( + input_shape=output_shape, + ) + + input_shape = (self.window_size, output_shape[-1]) + self.input_shape = input_shape + + has_hidden_states = self.encoder_properties().has_hidden_states + self.encoder_output_shape = get_output_shape(self.encoder, input_shape, has_hidden_states) + return self + + def n_encoder_output_feature(self) -> int: + # This function should never be called!!! 
+ raise NotImplementedError + + def build_encoder(self, + input_shape: Tuple[int, ...]) -> nn.Module: + preprocessor = TimeSeriesMLP(window_size=self.window_size) + return preprocessor + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'NBEATSEncoder', + 'name': 'NBEATSEncoder', + 'handles_tabular': False, + 'handles_image': False, + 'handles_time_series': True, + } + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> ConfigurationSpace: + cs = ConfigurationSpace() + return cs diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/__init__.py new file mode 100644 index 000000000..79b1e5cc5 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/__init__.py @@ -0,0 +1,52 @@ +import os +from collections import OrderedDict +from typing import Dict, Optional, Type, Union + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.base_component import ( + ThirdPartyComponents, + autoPyTorchComponent, + find_components +) +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder import \ + AbstractForecastingEncoderChoice +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.\ + base_forecasting_encoder import BaseForecastingEncoder + +directory = os.path.split(__file__)[0] +_encoders = find_components(__package__, + directory, + BaseForecastingEncoder) +_addons = ThirdPartyComponents(BaseForecastingEncoder) + + +def add_encoder(encoder: BaseForecastingEncoder) -> None: + _addons.add_component(encoder) + + +class FlatForecastingEncoderChoice(AbstractForecastingEncoderChoice): + def get_components(self) -> Dict[str, Type[autoPyTorchComponent]]: # type: ignore[override] + """Returns the available backbone components + + Args: + None + + Returns: + Dict[str, autoPyTorchComponent]: all basebackbone components available + as choices for learning rate scheduling + """ + components = OrderedDict() + components.update(_encoders) + components.update(_addons.components) + return components + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'FlatEncoder', + 'name': 'FlatEncoder', + 'handles_tabular': False, + 'handles_image': False, + 'handles_time_series': True, + } diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/InceptionTimeBackbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/InceptionTimeEncoder.py similarity index 76% rename from autoPyTorch/pipeline/components/setup/network_backbone/InceptionTimeBackbone.py rename to autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/InceptionTimeEncoder.py index 869f808ed..b3decba68 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/InceptionTimeBackbone.py +++ 
b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/InceptionTimeEncoder.py @@ -1,16 +1,16 @@ -from typing import Any, Dict, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import ( - UniformIntegerHyperparameter -) +from ConfigSpace.hyperparameters import UniformIntegerHyperparameter import torch from torch import nn from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.pipeline.components.setup.network_backbone.base_network_backbone import NetworkBackboneComponent -from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.\ + base_forecasting_encoder import BaseForecastingEncoder +from autoPyTorch.utils.common import (HyperparameterSearchSpace, + add_hyperparameter) # Code inspired by https://github.com/hfawaz/InceptionTime @@ -94,11 +94,14 @@ def __init__(self, bottleneck_size = self.config["bottleneck_size"] kernel_size = self.config["kernel_size"] n_res_inputs = in_features + + receptive_field = 1 for i in range(self.config["num_blocks"]): block = _InceptionBlock(n_inputs=n_inputs, n_filters=n_filters, bottleneck=bottleneck_size, kernel_size=kernel_size) + receptive_field += max(kernel_size, 3) - 1 self.__setattr__(f"inception_block_{i}", block) # add a residual block after every 3 inception blocks @@ -108,8 +111,9 @@ def __init__(self, n_outputs=n_res_outputs)) n_res_inputs = n_res_outputs n_inputs = block.get_n_outputs() + self.receptive_field = receptive_field - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, x: torch.Tensor, output_seq: bool = False) -> torch.Tensor: # swap sequence and feature dimensions for use with convolutional nets x = x.transpose(1, 2).contiguous() res = x @@ -119,48 +123,75 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: x = self.__getattr__(f"residual_block_{i}")(x, res) res = x x = x.transpose(1, 2).contiguous() - return x + if output_seq: + return x + else: + return self.get_last_seq_value(x) + def get_last_seq_value(self, x: torch.Tensor) -> torch.Tensor: + return x[:, -1:, :] -class InceptionTimeBackbone(NetworkBackboneComponent): + +class InceptionTimeEncoder(BaseForecastingEncoder): + _receptive_field = 1 """ InceptionTime backbone for time series data (see https://arxiv.org/pdf/1909.04939.pdf). """ - def build_backbone(self, input_shape: Tuple[int, ...]) -> nn.Module: - backbone = _InceptionTime(in_features=input_shape[-1], - config=self.config) - self.backbone = backbone - return backbone + def build_encoder(self, input_shape: Tuple[int, ...] 
= (0,)) -> nn.Module: + in_features = input_shape[-1] + encoder = _InceptionTime(in_features=in_features, + config=self.config) + self._receptive_field = encoder.receptive_field + return encoder + + def n_encoder_output_feature(self) -> int: + # see _InceptionBlock.forward() + num_filters_out: int = self.config['num_filters'] + return num_filters_out * 4 + + @staticmethod + def allowed_decoders() -> List[str]: + """ + decoder that is compatible with the encoder + """ + return ['MLPDecoder'] @staticmethod def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Any]: return { - 'shortname': 'InceptionTimeBackbone', - 'name': 'InceptionTimeBackbone', + 'shortname': 'InceptionTimeEncoder', + 'name': 'InceptionTimeEncoder', 'handles_tabular': False, 'handles_image': False, 'handles_time_series': True, } + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + X.update({'window_size': self._receptive_field}) + return super().transform(X) + @staticmethod def get_hyperparameter_search_space( dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, num_blocks: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_blocks", - value_range=(1, 10), - default_value=5, + value_range=(1, 5), + default_value=3, ), num_filters: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_filters", value_range=(4, 64), default_value=32, + log=True, ), kernel_size: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="kernel_size", value_range=(4, 64), default_value=32, + log=True, ), bottleneck_size: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="bottleneck_size", value_range=(16, 64), default_value=32, + log=True ), ) -> ConfigurationSpace: cs = ConfigurationSpace() @@ -169,5 +200,4 @@ def get_hyperparameter_search_space( add_hyperparameter(cs, num_filters, UniformIntegerHyperparameter) add_hyperparameter(cs, kernel_size, UniformIntegerHyperparameter) add_hyperparameter(cs, bottleneck_size, UniformIntegerHyperparameter) - return cs diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/RNNEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/RNNEncoder.py new file mode 100644 index 000000000..152936e1b --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/RNNEncoder.py @@ -0,0 +1,196 @@ +from typing import Any, Dict, List, Optional, Tuple, Union + +import ConfigSpace as CS +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import (CategoricalHyperparameter, + UniformFloatHyperparameter, + UniformIntegerHyperparameter) + +import torch +from torch import nn + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.base_component import BaseEstimator +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.\ + base_forecasting_encoder import BaseForecastingEncoder +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.components import ( + EncoderNetwork, + EncoderProperties +) +from autoPyTorch.utils.common import ( + HyperparameterSearchSpace, + add_hyperparameter, + get_hyperparameter +) + + +class _RNN(EncoderNetwork): + # we only consder GRU and LSTM here + def 
__init__(self, + in_features: int, + config: Dict[str, Any], + lagged_value: Optional[List[int]] = None): + super().__init__() + if lagged_value is None: + self.lagged_value = [0] + else: + self.lagged_value = lagged_value + self.config = config + if config['cell_type'] == 'lstm': + cell_type = nn.LSTM + else: + cell_type = nn.GRU + self.lstm = cell_type(input_size=in_features, + hidden_size=config["hidden_size"], + num_layers=config["num_layers"], + dropout=config.get("dropout", 0.0), + bidirectional=config["bidirectional"], + batch_first=True) + self.cell_type = config['cell_type'] + + def forward(self, + x: torch.Tensor, + output_seq: bool = False, + hx: Optional[Tuple[torch.Tensor, torch.Tensor]] = None) -> Tuple[torch.Tensor, ...]: + B, T, _ = x.shape + + x, hidden_state = self.lstm(x, hx) + + if output_seq: + return x, hidden_state + else: + return self.get_last_seq_value(x), hidden_state + + def get_last_seq_value(self, x: torch.Tensor) -> torch.Tensor: + B, T, _ = x.shape + if not self.config["bidirectional"]: + return x[:, -1:, ] + else: + x_by_direction = x.view(B, + T, + 2, + self.config["hidden_size"]) + x = torch.cat([ + x_by_direction[:, -1, [0], :], + x_by_direction[:, 0, [1], :] + ], dim=-1) + return x + + +class RNNEncoder(BaseForecastingEncoder): + """ + Standard searchable LSTM backbone for time series data + """ + _fixed_seq_length = False + + def __init__(self, **kwargs: Any): + super().__init__(**kwargs) + self.lagged_value = [1, 2, 3, 4, 5, 6, 7] + + def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: + in_features = input_shape[-1] + encoder = _RNN(in_features=in_features, + config=self.config, + lagged_value=self.lagged_value, + ) + return encoder + + def n_encoder_output_feature(self) -> int: + hidden_size: int = self.config['hidden_size'] + return 2 * hidden_size if self.config['bidirectional'] else hidden_size + + def n_hidden_states(self) -> int: + if self.config['cell_type'] == 'lstm': + return 2 + elif self.config['cell_type'] == 'gru': + return 1 + else: + raise NotImplementedError + + @staticmethod + def allowed_decoders() -> List[str]: + """ + decoder that is compatible with the encoder + """ + return ['MLPDecoder', 'RNNDecoder'] + + @staticmethod + def encoder_properties() -> EncoderProperties: + return EncoderProperties(has_hidden_states=True, lagged_input=True) + + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: + if 'lagged_value' in X['dataset_properties']: + self.lagged_value = X['dataset_properties']['lagged_value'] + return super().fit(X, y) + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + rnn_kwargs = {'hidden_size': self.config['hidden_size'], + 'num_layers': self.config['num_layers'], + 'bidirectional': self.config['bidirectional'], + 'cell_type': self.config['cell_type'], + 'dropout': self.config.get('dropout', 0.0)} # used for initialize + X.update({'rnn_kwargs': rnn_kwargs}) + return super().transform(X) + + @staticmethod + def get_properties( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'RNNEncoder', + 'name': 'RNNEncoder', + 'handles_tabular': False, + 'handles_image': False, + 'handles_time_series': True, + } + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict] = None, + cell_type: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="cell_type", + value_range=['lstm', 'gru'], + default_value='lstm'), + num_layers: HyperparameterSearchSpace = 
HyperparameterSearchSpace(hyperparameter='num_layers', + value_range=(1, 3), + default_value=1), + hidden_size: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='hidden_size', + value_range=(32, 512), + default_value=64, + log=True), + use_dropout: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='use_dropout', + value_range=(True, False), + default_value=False), + dropout: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='dropout', + value_range=(0., 0.5), + default_value=0.1), + bidirectional: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='bidirectional', + value_range=(False,), + default_value=False), + decoder_type: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter='decoder_type', + value_range=('MLPDecoder', 'RNNDecoder'), + default_value='MLPDecoder') + ) -> ConfigurationSpace: + """ + get hyperparameter search space, bidirectional is not casual so I do not allow it to be set as True, + However, it might be further implemented to NLP tasks + + """ + cs = CS.ConfigurationSpace() + + # TODO consider lstm layers with different hidden size + # TODO bidirectional needs to be set as false for DeepAR model + num_layers = get_hyperparameter(num_layers, UniformIntegerHyperparameter) + use_dropout = get_hyperparameter(use_dropout, CategoricalHyperparameter) + dropout = get_hyperparameter(dropout, UniformFloatHyperparameter) + cs.add_hyperparameters([num_layers, use_dropout, dropout]) + + # Add plain hyperparameters + add_hyperparameter(cs, cell_type, CategoricalHyperparameter) + add_hyperparameter(cs, hidden_size, UniformIntegerHyperparameter) + add_hyperparameter(cs, bidirectional, CategoricalHyperparameter) + add_hyperparameter(cs, decoder_type, CategoricalHyperparameter) + + cs.add_condition(CS.AndConjunction(CS.EqualsCondition(dropout, use_dropout, True), + CS.GreaterThanCondition(dropout, num_layers, 1))) + + return cs diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TCNEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TCNEncoder.py new file mode 100644 index 000000000..ee9293e8d --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TCNEncoder.py @@ -0,0 +1,226 @@ +from typing import Any, Dict, List, Optional, Tuple, Union + +import ConfigSpace as CS +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import (CategoricalHyperparameter, + UniformFloatHyperparameter, + UniformIntegerHyperparameter) + +import torch +from torch import nn +from torch.nn.utils import weight_norm + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.\ + base_forecasting_encoder import BaseForecastingEncoder +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.components import \ + EncoderNetwork +from autoPyTorch.utils.common import ( + HyperparameterSearchSpace, + get_hyperparameter +) + + +# _Chomp1d, _TemporalBlock and _TemporalConvNet copied from +# https://github.com/locuslab/TCN/blob/master/TCN/tcn.py, Carnegie Mellon University Locus Labs +# Paper: https://arxiv.org/pdf/1803.01271.pdf +class _Chomp1d(nn.Module): + def __init__(self, chomp_size: int): + super(_Chomp1d, 
self).__init__() + self.chomp_size = chomp_size + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return x[:, :, :-self.chomp_size].contiguous() + + +class _TemporalBlock(nn.Module): + def __init__(self, + n_inputs: int, + n_outputs: int, + kernel_size: int, + stride: int, + dilation: int, + padding: int, + dropout: float = 0.2): + super(_TemporalBlock, self).__init__() + self.conv1 = weight_norm(nn.Conv1d(n_inputs, n_outputs, kernel_size, + stride=stride, padding=padding, dilation=dilation)) + self.chomp1 = _Chomp1d(padding) + self.relu1 = nn.ReLU() + self.dropout1 = nn.Dropout(dropout) + + self.conv2 = weight_norm(nn.Conv1d(n_outputs, n_outputs, kernel_size, + stride=stride, padding=padding, dilation=dilation)) + self.chomp2 = _Chomp1d(padding) + self.relu2 = nn.ReLU() + self.dropout2 = nn.Dropout(dropout) + + self.net = nn.Sequential(self.conv1, self.chomp1, self.relu1, self.dropout1, + self.conv2, self.chomp2, self.relu2, self.dropout2) + self.downsample = nn.Conv1d( + n_inputs, n_outputs, 1) if n_inputs != n_outputs else None + self.relu = nn.ReLU() + # self.init_weights() + + def init_weights(self) -> None: + self.conv1.weight.data.normal_(0, 0.01) + self.conv2.weight.data.normal_(0, 0.01) + if self.downsample is not None: + self.downsample.weight.data.normal_(0, 0.01) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + out = self.net(x) + res = x if self.downsample is None else self.downsample(x) + return self.relu(out + res) + + +class _TemporalConvNet(EncoderNetwork): + def __init__(self, num_inputs: int, num_channels: List[int], kernel_size: List[int], dropout: float = 0.2): + super(_TemporalConvNet, self).__init__() + layers: List[Any] = [] + num_levels = len(num_channels) + receptive_field = 1 + + # stride_values = [] + + for i in range(num_levels): + dilation_size = 2 ** i + in_channels = num_inputs if i == 0 else num_channels[i - 1] + out_channels = num_channels[i] + stride = 1 + # stride_values.extend([stride, stride]) + layers += [_TemporalBlock(in_channels, + out_channels, + kernel_size[i], + stride=stride, + dilation=dilation_size, + padding=(kernel_size[i] - 1) * dilation_size, + dropout=dropout)] + # receptive_field_block = 1 + (kernel_size - 1) * dilation_size * \ + # (int(np.prod(stride_values[:-2])) * (1 + stride_values[-2])) + # stride = 1, we ignore stride computation + receptive_field_block = 1 + 2 * (kernel_size[i] - 1) * dilation_size + receptive_field += receptive_field_block + self.receptive_field = receptive_field + self.network = nn.Sequential(*layers) + + def forward(self, x: torch.Tensor, output_seq: bool = False) -> torch.Tensor: + # swap sequence and feature dimensions for use with convolutional nets + x = x.transpose(1, 2).contiguous() + x = self.network(x) + x = x.transpose(1, 2).contiguous() + if output_seq: + return x + else: + return self.get_last_seq_value(x) + + def get_last_seq_value(self, x: torch.Tensor) -> torch.Tensor: + return x[:, -1:] + + +class TCNEncoder(BaseForecastingEncoder): + _receptive_field = 1 + """ + Temporal Convolutional Network backbone for time series data (see https://arxiv.org/pdf/1803.01271.pdf). 
+ """ + + def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: + num_channels = [self.config["num_filters_1"]] + kernel_size = [self.config["kernel_size_1"]] + dropout = self.config["dropout"] if self.config["use_dropout"] else 0.0 + for i in range(2, self.config["num_blocks"] + 1): + num_channels.append(self.config[f"num_filters_{i}"]) + kernel_size.append(self.config[f"kernel_size_{i}"]) + in_features = input_shape[-1] + encoder = _TemporalConvNet(in_features, + num_channels, + kernel_size=kernel_size, + dropout=dropout + ) + self._receptive_field = encoder.receptive_field + return encoder + + def n_encoder_output_feature(self) -> int: + num_blocks = self.config["num_blocks"] + num_filter_out: int = self.config[f"num_filters_{num_blocks}"] + return num_filter_out + + @staticmethod + def allowed_decoders() -> List[str]: + """ + decoder that is compatible with the encoder + """ + return ['MLPDecoder'] + + @staticmethod + def get_properties( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Union[str, bool]]: + return { + "shortname": "TCNBackbone", + "name": "TCNBackbone", + 'handles_tabular': False, + 'handles_image': False, + 'handles_time_series': True, + } + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + X.update({'window_size': self._receptive_field}) + return super().transform(X) + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + num_blocks: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_blocks", + value_range=(1, 4), + default_value=3), + num_filters: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_filters", + value_range=(4, 64), + default_value=16, + log=True), + kernel_size: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="kernel_size", + value_range=(2, 64), + default_value=8, + log=True), + use_dropout: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="use_dropout", + value_range=(True, False), + default_value=False), + dropout: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="dropout", + value_range=(0, 0.5), + default_value=0.1), + ) -> ConfigurationSpace: + cs = ConfigurationSpace() + + min_num_blocks, max_num_blocks = num_blocks.value_range + num_blocks = get_hyperparameter(num_blocks, UniformIntegerHyperparameter) + cs.add_hyperparameter(num_blocks) + + use_dropout = get_hyperparameter(use_dropout, CategoricalHyperparameter) + cs.add_hyperparameter(use_dropout) + + dropout_hp = get_hyperparameter(dropout, UniformFloatHyperparameter) + cs.add_hyperparameter(dropout_hp) + + dropout_condition = CS.EqualsCondition(dropout_hp, use_dropout, True) + + cs.add_condition(dropout_condition) + + for i in range(1, int(max_num_blocks) + 1): + num_filter_search_space = HyperparameterSearchSpace(f"num_filters_{i}", + value_range=num_filters.value_range, + default_value=num_filters.default_value, + log=num_filters.log) + kernel_size_search_space = HyperparameterSearchSpace(f"kernel_size_{i}", + value_range=kernel_size.value_range, + default_value=kernel_size.default_value, + log=kernel_size.log) + num_filters_hp = get_hyperparameter(num_filter_search_space, UniformIntegerHyperparameter) + kernel_size_hp = get_hyperparameter(kernel_size_search_space, UniformIntegerHyperparameter) + cs.add_hyperparameter(num_filters_hp) + cs.add_hyperparameter(kernel_size_hp) + if i > int(min_num_blocks): + cs.add_conditions([ + 
CS.GreaterThanCondition(num_filters_hp, num_blocks, i - 1), + CS.GreaterThanCondition(kernel_size_hp, num_blocks, i - 1) + ]) + + return cs diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py new file mode 100644 index 000000000..521efc7df --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py @@ -0,0 +1,278 @@ +from typing import Any, Dict, List, Optional, Tuple, Union + +import ConfigSpace as CS +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import (CategoricalHyperparameter, + UniformFloatHyperparameter, + UniformIntegerHyperparameter) + +import torch +from torch import nn + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.base_component import BaseEstimator +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import ( + PositionalEncoding, + build_transformer_layers +) +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.\ + base_forecasting_encoder import BaseForecastingEncoder, EncoderProperties +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.components import \ + EncoderNetwork +from autoPyTorch.utils.common import ( + HyperparameterSearchSpace, + add_hyperparameter, + get_hyperparameter +) + + +class _TransformerEncoder(EncoderNetwork): + def __init__(self, + in_features: int, + d_model: int, + num_layers: int, + transformer_encoder_layers: nn.Module, + use_positional_encoder: bool, + use_layer_norm_output: bool, + dropout_pe: float = 0.0, + layer_norm_eps_output: Optional[float] = None, + lagged_value: Optional[List[int]] = None): + super().__init__() + if lagged_value is None: + self.lagged_value = [0] + else: + self.lagged_value = lagged_value + if in_features != d_model: + input_layer = [nn.Linear(in_features, d_model, bias=False)] + else: + input_layer = [] + if use_positional_encoder: + input_layer.append(PositionalEncoding(d_model, dropout_pe)) + self.input_layer = nn.Sequential(*input_layer) + + self.use_layer_norm_output = use_layer_norm_output + if use_layer_norm_output: + norm = nn.LayerNorm(d_model, eps=layer_norm_eps_output) + else: + norm = None + self.transformer_encoder_layers = nn.TransformerEncoder(encoder_layer=transformer_encoder_layers, + num_layers=num_layers, + norm=norm) + + def forward(self, + x: torch.Tensor, + output_seq: bool = False, + mask: Optional[torch.Tensor] = None, + src_key_padding_mask: Optional[torch.Tensor] = None) -> torch.Tensor: + x = self.input_layer(x) + x = self.transformer_encoder_layers(x) + if output_seq: + return x + else: + return self.get_last_seq_value(x) + + def get_last_seq_value(self, x: torch.Tensor) -> torch.Tensor: + return x[:, -1:] + + +class TransformerEncoder(BaseForecastingEncoder): + """ + Standard searchable Transformer Encoder for time series data + """ + _fixed_seq_length = False + + def __init__(self, **kwargs: Any): + super().__init__(**kwargs) + self.lagged_value = [1, 2, 3, 4, 5, 6, 7] + + def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: + in_features = input_shape[-1] + + d_model = 2 ** self.config['d_model_log'] + 
transformer_encoder_layers = build_transformer_layers(d_model=d_model, config=self.config, layer_type='encoder') + + encoder = _TransformerEncoder(in_features=in_features, + d_model=d_model, + num_layers=self.config['num_layers'], + transformer_encoder_layers=transformer_encoder_layers, + use_positional_encoder=self.config['use_positional_encoder'], + use_layer_norm_output=self.config['use_layer_norm_output'], + dropout_pe=self.config.get('dropout_positional_encoder', 0.0), + layer_norm_eps_output=self.config.get('layer_norm_eps_output', None), + lagged_value=self.lagged_value) + return encoder + + def n_encoder_output_feature(self) -> int: + d_model: int = 2 ** self.config['d_model_log'] + return d_model + + @staticmethod + def allowed_decoders() -> List[str]: + """ + decoder that is compatible with the encoder + """ + return ['MLPDecoder', 'TransformerDecoder'] + + @staticmethod + def encoder_properties() -> EncoderProperties: + return EncoderProperties(lagged_input=True, + is_casual=False) + + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: + if 'lagged_value' in X['dataset_properties']: + self.lagged_value = X['dataset_properties']['lagged_value'] + return super().fit(X, y) + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + transformer_encoder_kwargs = {'d_model_log': self.config['d_model_log']} # used for initialize + X.update({'transformer_encoder_kwargs': transformer_encoder_kwargs}) + return super().transform(X) + + @staticmethod + def get_properties( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'TransformerEncoder', + 'name': 'TransformerEncoder', + 'handles_tabular': False, + 'handles_image': False, + 'handles_time_series': True, + } + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict] = None, + num_layers: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter='num_layers', + value_range=(1, 4), + default_value=1), + n_head_log: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter='n_head_log', + value_range=(1, 4), + default_value=3), + d_model_log: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter='d_model_log', + value_range=(4, 9), + default_value=5), + d_feed_forward_log: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter='d_feed_forward_log', + value_range=(6, 12), + default_value=7), + norm_first: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter="norm_first", + value_range=(True, False), + default_value=True), + layer_norm_eps: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter='layer_norm_eps', + value_range=(1e-7, 1e-3), + default_value=1e-5, + log=True), + use_positional_encoder: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter='use_positional_encoder', + value_range=(True, False), + default_value=True), + use_layer_norm_output: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter='use_layer_norm_output', + value_range=(True, False), + default_value=True), + activation: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter="activation", + value_range=('relu', 'gelu'), + default_value='relu', + ), + use_dropout: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter="use_dropout", + value_range=(True, False), + default_value=False, + ), + dropout: HyperparameterSearchSpace = + 
HyperparameterSearchSpace(hyperparameter="dropout", + value_range=(0, 0.8), + default_value=0.1, + ), + decoder_type: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter='decoder_type', + value_range=('MLPDecoder', 'TransformerDecoder'), + default_value='MLPDecoder') + ) -> ConfigurationSpace: + """ + get hyperparameter search space for Transformer, Given that d_model must be a multiple of n_head_log, we + consider their log value (with base 2) as the hyperparameters + + Args: + num_layers (int): + number of transformer layers + n_head_log (int): + log value (base 2, this should work for all the following hyperparameters with logs) of number of head + d_model_log (int): + log values of input of dimensions passed to feed forward network + d_feed_forward_log (int): + log values of feed forward network width + norm_first (bool): + if ``True``, layer norm is done prior to attention and feedforward operations, respectivaly. + Otherwise, it's done after. Default: ``False`` (after). + layer_norm_eps (float): + eps for layer norm + use_positional_encoder (bool): + if positional encoder is applied + use_layer_norm_output (bool): + if layer norm output is applied + activation (str): + activation function type + use_dropout (bool): + if dropout is applied + dropout (float): + dropout rate + decoder_type (str): + type of decoder, could be MLPDecoder (DeepAR) or TransformerDecoder (seq2seq) + + Returns: + ConfigurationSpace: + configuration space + """ + cs = CS.ConfigurationSpace() + + add_hyperparameter(cs, activation, CategoricalHyperparameter) + add_hyperparameter(cs, d_model_log, UniformIntegerHyperparameter) + add_hyperparameter(cs, norm_first, CategoricalHyperparameter) + + num_layers = get_hyperparameter(num_layers, UniformIntegerHyperparameter) + use_dropout = get_hyperparameter(use_dropout, CategoricalHyperparameter) + + # We can have dropout in the network for + # better generalization + use_positional_encoder = get_hyperparameter(use_positional_encoder, CategoricalHyperparameter) + + dropout_pe = HyperparameterSearchSpace(hyperparameter='dropout_positional_encoder', + value_range=dropout.value_range, + default_value=dropout.default_value, + log=dropout.log) + dropout_pe = get_hyperparameter(dropout_pe, UniformFloatHyperparameter) + + cs.add_hyperparameters([num_layers, use_dropout, use_positional_encoder, dropout_pe]) + cs.add_condition(CS.AndConjunction( + CS.EqualsCondition(dropout_pe, use_dropout, True), + CS.EqualsCondition(dropout_pe, use_positional_encoder, True) + )) + + add_hyperparameter(cs, n_head_log, UniformIntegerHyperparameter) + add_hyperparameter(cs, d_feed_forward_log, UniformIntegerHyperparameter) + add_hyperparameter(cs, layer_norm_eps, UniformFloatHyperparameter) + + dropout = get_hyperparameter(dropout, UniformFloatHyperparameter) + cs.add_hyperparameter(dropout) + cs.add_condition(CS.EqualsCondition(dropout, use_dropout, True)) + + use_layer_norm_output = get_hyperparameter(use_layer_norm_output, CategoricalHyperparameter) + layer_norm_eps_output = HyperparameterSearchSpace(hyperparameter='layer_norm_eps_output', + value_range=layer_norm_eps.value_range, + default_value=layer_norm_eps.default_value, + log=layer_norm_eps.log) + + layer_norm_eps_output = get_hyperparameter(layer_norm_eps_output, UniformFloatHyperparameter) + cs.add_hyperparameters([use_layer_norm_output, layer_norm_eps_output]) + cs.add_condition(CS.EqualsCondition(layer_norm_eps_output, use_layer_norm_output, True)) + + add_hyperparameter(cs, decoder_type, CategoricalHyperparameter) + 
+ return cs diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py new file mode 100644 index 000000000..6a4b85a8b --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py @@ -0,0 +1,747 @@ +import inspect +import os +from collections import OrderedDict +from typing import Any, Dict, List, Optional, Tuple, Type, Union + +import ConfigSpace as CS +from ConfigSpace.conditions import ( + EqualsCondition, + GreaterThanCondition, + OrConjunction +) +from ConfigSpace.configuration_space import Configuration, ConfigurationSpace +from ConfigSpace.forbidden import ( + ForbiddenAndConjunction, + ForbiddenEqualsClause, + ForbiddenInClause +) +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + Constant, + Hyperparameter, + OrdinalHyperparameter, + UniformFloatHyperparameter +) + +from sklearn.pipeline import Pipeline + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice +from autoPyTorch.pipeline.components.base_component import ( + ThirdPartyComponents, autoPyTorchComponent, find_components) +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import \ + ForecastingNetworkStructure +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder. \ + base_forecasting_decoder import BaseForecastingDecoder +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder import \ + AbstractForecastingEncoderChoice +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder. \ + base_forecasting_encoder import BaseForecastingEncoder +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.other_components. 
\ + TemporalFusion import TemporalFusion +from autoPyTorch.utils.common import ( + HyperparameterSearchSpace, + get_hyperparameter +) + +directory = os.path.split(__file__)[0] +_encoders = find_components(__package__, + directory, + BaseForecastingEncoder) +_addons = ThirdPartyComponents(BaseForecastingEncoder) + + +def add_encoder(encoder: BaseForecastingEncoder) -> None: + _addons.add_component(encoder) + + +class SeqForecastingEncoderChoice(AbstractForecastingEncoderChoice): + deepAR_decoder_name = 'MLPDecoder' + deepAR_decoder_prefix = 'block_1' + tf_prefix = "temporal_fusion" + + def get_components(self) -> Dict[str, Type[autoPyTorchComponent]]: # type: ignore[override] + """Returns the available backbone components + + Args: + None + + Returns: + Dict[str, autoPyTorchComponent]: all basebackbone components available + as choices for learning rate scheduling + """ + components = OrderedDict() + components.update(_encoders) + components.update(_addons.components) + return components + + def get_hyperparameter_search_space( # type: ignore[override] + self, + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + num_blocks: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_blocks", + value_range=(1, 1), + default_value=1), + variable_selection: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter="variable_selection", + value_range=(True, False), + default_value=False + ), + variable_selection_use_dropout: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter="variable_selection_use_dropout", + value_range=(True, False), + default_value=False, + ), + variable_selection_dropout_rate: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter="variable_selection_dropout_rate", + value_range=(0.0, 0.8), + default_value=0.1, + ), + + share_single_variable_networks: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter="share_single_variable_networks", + value_range=(True, False), + default_value=False, + ), + use_temporal_fusion: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='use_temporal_fusion', + value_range=(True, False), + default_value=False, + ), + decoder_auto_regressive: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter="decoder_auto_regressive", + value_range=(True, False), + default_value=False, + ), + skip_connection: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="skip_connection", + value_range=(True, False), + default_value=False), + skip_connection_type: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter="skip_connection_type", + value_range=("add", "gate_add_norm"), + default_value="gate_add_norm", + ), + grn_use_dropout: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="grn_use_dropout", + value_range=(True, False), + default_value=True), + grn_dropout_rate: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='grn_dropout_rate', + value_range=(0.0, 0.8), + default_value=0.1), + default: Optional[str] = None, + include: Optional[List[str]] = None, + exclude: Optional[List[str]] = None, + ) -> ConfigurationSpace: + """Returns the configuration space of the current chosen components + + Args: + dataset_properties (Optional[Dict[str, str]]): + Describes the dataset to work on + num_blocks (int): + number of encoder-decoder structure blocks + variable_selection (bool): + if variable selection is applied, if True, then the first block will 
be attached with a variable + selection block while the following will be enriched with static features. + variable_selection_use_dropout (bool): + if variable selection network uses dropout + variable_selection_dropout_rate (float): + dropout rate of variable selection network + share_single_variable_networks (bool): + if single variable networks are shared between encoder and decoder + skip_connection (int): + if skip connection is applied + use_temporal_fusion (int): + if temporal fusion layer is applied + skip_connection_type (str): + skip connection type, it could be directly added or a GRN network + (Lim et al, Temporal Fusion Transformers for Interpretable Multi-horizon Time Series Forecasting: + https://arxiv.org/abs/1912.09363) TODO consider hidden size of grn as a new HP + grn_use_dropout (bool): + if dropout layer is applied to GRN, since variable selection network also contains GRN, + this parameter also influence variable selection network + grn_dropout_rate (float): + dropout rate of GRN, same as above, this variable also influence variable selection network + decoder_auto_regressive (int): + if decoder is auto_regressive, e.g., if the decoder receives the output as its input, + this only works for auto_regressive decoder models + default (Optional[str]): + Default backbone to use + include: Optional[Dict[str, Any]]: + what components to include. It is an exhaustive list, and will exclusively use this components. + exclude: Optional[Dict[str, Any]]: + which components to skip + + Returns: + ConfigurationSpace: the configuration space of the hyper-parameters of the + chosen component + """ + if dataset_properties is None: + dataset_properties = {} + + static_features_shape: int = dataset_properties.get("static_features_shape", 0) # type: ignore[assignment] + future_feature_shapes: Tuple[int] = dataset_properties.get("future_feature_shapes", # type: ignore[assignment] + (0,)) + + cs = ConfigurationSpace() + + min_num_blocks: int = num_blocks.value_range[0] # type: ignore[assignment] + max_num_blocks: int = num_blocks.value_range[1] # type: ignore[assignment] + + variable_selection_hp: CategoricalHyperparameter = get_hyperparameter( # type: ignore[assignment] + variable_selection, CategoricalHyperparameter) + share_single_variable_networks = get_hyperparameter(share_single_variable_networks, CategoricalHyperparameter) + + decoder_auto_regressive_hp: CategoricalHyperparameter = get_hyperparameter( # type: ignore[assignment] + decoder_auto_regressive, CategoricalHyperparameter + ) + + if min_num_blocks == max_num_blocks: + num_blocks = Constant(num_blocks.hyperparameter, num_blocks.value_range[0]) + else: + num_blocks = OrdinalHyperparameter( + num_blocks.hyperparameter, + sequence=list(range(min_num_blocks, max_num_blocks + 1)) + ) + + skip_connection_hp: CategoricalHyperparameter = get_hyperparameter(skip_connection, # type: ignore[assignment] + CategoricalHyperparameter) + + hp_network_structures = [num_blocks, decoder_auto_regressive_hp, variable_selection_hp, + skip_connection_hp] + cond_skip_connections = [] + + if True in skip_connection_hp.choices: + skip_connection_type_hp: CategoricalHyperparameter = get_hyperparameter( # type: ignore[assignment] + skip_connection_type, CategoricalHyperparameter + ) + hp_network_structures.append(skip_connection_type_hp) + cond_skip_connections.append(EqualsCondition(skip_connection_type_hp, skip_connection_hp, True)) + if 'gate_add_norm' in skip_connection_type_hp.choices: + grn_use_dropout_hp: CategoricalHyperparameter = 
+        skip_connection_hp: CategoricalHyperparameter = get_hyperparameter(skip_connection,  # type: ignore[assignment]
+                                                                           CategoricalHyperparameter)
+
+        hp_network_structures = [num_blocks, decoder_auto_regressive_hp, variable_selection_hp,
+                                 skip_connection_hp]
+        cond_skip_connections = []
+
+        if True in skip_connection_hp.choices:
+            skip_connection_type_hp: CategoricalHyperparameter = get_hyperparameter(  # type: ignore[assignment]
+                skip_connection_type, CategoricalHyperparameter
+            )
+            hp_network_structures.append(skip_connection_type_hp)
+            cond_skip_connections.append(EqualsCondition(skip_connection_type_hp, skip_connection_hp, True))
+            if 'gate_add_norm' in skip_connection_type_hp.choices:
+                grn_use_dropout_hp: CategoricalHyperparameter = get_hyperparameter(  # type: ignore[assignment]
+                    grn_use_dropout, CategoricalHyperparameter
+                )
+                hp_network_structures.append(grn_use_dropout_hp)
+                cond_skip_connections.append(
+                    EqualsCondition(grn_use_dropout_hp, skip_connection_type_hp, "gate_add_norm")
+                )
+                if True in grn_use_dropout_hp.choices:
+                    grn_dropout_rate_hp = get_hyperparameter(grn_dropout_rate, UniformFloatHyperparameter)
+                    hp_network_structures.append(grn_dropout_rate_hp)
+                    cond_skip_connections.append(EqualsCondition(grn_dropout_rate_hp, grn_use_dropout_hp, True))
+        cs.add_hyperparameters(hp_network_structures)
+        if cond_skip_connections:
+            cs.add_conditions(cond_skip_connections)
+
+        if True in variable_selection_hp.choices:
+            variable_selection_use_dropout_hp = get_hyperparameter(variable_selection_use_dropout,
+                                                                   CategoricalHyperparameter)
+            variable_selection_dropout_rate_hp = get_hyperparameter(variable_selection_dropout_rate,
+                                                                    UniformFloatHyperparameter)
+            cs.add_hyperparameters([variable_selection_use_dropout_hp, variable_selection_dropout_rate_hp])
+
+            cond_vs_dropout = EqualsCondition(variable_selection_use_dropout_hp, variable_selection_hp, True)
+            cond_vs_dropoutrate = EqualsCondition(variable_selection_dropout_rate_hp,
+                                                  variable_selection_use_dropout_hp,
+                                                  True)
+            cs.add_conditions([cond_vs_dropout, cond_vs_dropoutrate])
+
+            cs.add_hyperparameter(share_single_variable_networks)
+            cs.add_condition(EqualsCondition(share_single_variable_networks, variable_selection_hp, True))
+
+        # Compile a list of legal encoders for this problem
+        available_encoders: Dict[str, BaseForecastingEncoder] = self.get_available_components(  # type: ignore
+            dataset_properties=dataset_properties,
+            include=include, exclude=exclude)
+
+        available_decoders: Dict[str, BaseForecastingDecoder] = self.get_available_components(  # type: ignore
+            dataset_properties=dataset_properties,
+            include=None, exclude=exclude,
+            components=self.get_decoder_components())
+
+        if len(available_encoders) == 0:
+            raise ValueError("No Encoder found")
+        if len(available_decoders) == 0:
+            raise ValueError("No Decoder found")
+
+        if default is None:
+            defaults = self._defaults_network
+            for default_ in defaults:
+                if default_ in available_encoders:
+                    default = default_
+                    break
+
+        updates_choice = self._get_search_space_updates()
+
+        forbiddens_decoder_auto_regressive = []
+
+        # TODO this is only a temporary solution, needs to be updated when ConfigSpace allows more complex conditions!
+ # General Idea to work with auto-regressive decoders: + # decoder cannot be auto-regressive if it is not recurrent + # decoder_auto_regressive is conditioned on the HPs that allow recurrent decoders: + # encoders that only have recurrent decoders -> EqCond(dar, encoder, en_name) + # decoder_types of Encoders that contain recurrent decoders -> EqCond(dar, encoder:de_type, de_name) + # + # When no future data can be fed to the decoder (no future features), decoder must be auto-regressive: + # disable the recurrent decoders without auto-regressive or variable selection + # this is judged by add_forbidden_for_non_ar_recurrent_decoder + + if True in decoder_auto_regressive_hp.choices: + forbidden_decoder_ar: Optional[ForbiddenEqualsClause] = ForbiddenEqualsClause(decoder_auto_regressive_hp, + True) + else: + forbidden_decoder_ar = None + + add_forbidden_for_non_ar_recurrent_decoder = False + if static_features_shape + future_feature_shapes[-1] == 0: + if False in decoder_auto_regressive_hp.choices and False in variable_selection_hp.choices: + add_forbidden_for_non_ar_recurrent_decoder = True + + if len(decoder_auto_regressive_hp.choices) == 1 and True in decoder_auto_regressive_hp.choices: + conds_decoder_ar: Optional[List[CS.conditions.ConditionComponent]] = None + else: + conds_decoder_ar = [] + + for i in range(1, int(max_num_blocks) + 1): + block_prefix = f'block_{i}:' + + if '__choice__' in updates_choice.keys(): + choice_hyperparameter = updates_choice['__choice__'] + if not set(choice_hyperparameter.value_range).issubset(available_encoders): + raise ValueError("Expected given update for {} to have " + "choices in {} got {}".format(self.__class__.__name__, + available_encoders, + choice_hyperparameter.value_range)) + hp_encoder = CategoricalHyperparameter(block_prefix + '__choice__', + choice_hyperparameter.value_range, + default_value=choice_hyperparameter.default_value) + else: + hp_encoder = CategoricalHyperparameter( + block_prefix + '__choice__', + list(available_encoders.keys()), + default_value=default + ) + if conds_decoder_ar is None: + # In this case we only allow encoders that has recurrent decoders + available_encoders_w_recurrent_decoder = [] + for encoder_name in hp_encoder.choices: + decoders = available_encoders[encoder_name].allowed_decoders() + for decoder_name in decoders: + if available_decoders[decoder_name].decoder_properties().recurrent: + available_encoders_w_recurrent_decoder.append(encoder_name) + break + if not available_encoders_w_recurrent_decoder: + raise ValueError('If only auto-regressive decoder is allowed, at least one encoder must contain ' + 'recurrent decoder!') + hp_encoder = CategoricalHyperparameter( + block_prefix + '__choice__', + available_encoders_w_recurrent_decoder, + default_value=available_encoders_w_recurrent_decoder[0]) + + cs.add_hyperparameter(hp_encoder) + if i > int(min_num_blocks): + cs.add_condition( + GreaterThanCondition(hp_encoder, num_blocks, i - 1) + ) + + decoder2encoder: Dict[str, List[str]] = {key: [] for key in available_decoders.keys()} + encoder2decoder = {} + for encoder_name in hp_encoder.choices: + updates = self._get_search_space_updates(prefix=block_prefix + encoder_name) + config_space = available_encoders[encoder_name].get_hyperparameter_search_space( # type: ignore + dataset_properties, + **updates # type: ignore[call-arg] + ) + allowed_decoders = available_encoders[encoder_name].allowed_decoders() + if len(allowed_decoders) > 1: + if 'decoder_type' not in config_space: + raise ValueError('When a specific encoder 
has more than one allowed decoder, its ConfigSpace' + 'must contain the hyperparameter "decoder_type" ! Please check your encoder ' + 'setting!') + hp_decoder_choice = config_space.get_hyperparameter('decoder_type').choices + if not set(hp_decoder_choice).issubset(allowed_decoders): + raise ValueError( + 'The encoder hyperparameter decoder_type must be a subset of the allowed_decoders') + recurrent_decoders = [] + for decoder_name in allowed_decoders: + if available_decoders[decoder_name].decoder_properties().recurrent: + recurrent_decoders.append(decoder_name) + if conds_decoder_ar is None: + if recurrent_decoders: + updates['decoder_type'] = HyperparameterSearchSpace('decoder_type', + tuple(recurrent_decoders), + recurrent_decoders[0] + ) + ecd = available_encoders[encoder_name] + config_space = ecd.get_hyperparameter_search_space( # type:ignore + dataset_properties, + **updates # type: ignore[call-arg] + ) + hp_decoder_choice = recurrent_decoders + else: + cs.add_forbidden_clause(ForbiddenEqualsClause(hp_encoder, encoder_name)) + + allowed_decoders = hp_decoder_choice + valid_decoders = [] + for decoder_name in allowed_decoders: + if decoder_name in decoder2encoder: + valid_decoders.append(decoder_name) + decoder2encoder[decoder_name].append(encoder_name) + encoder2decoder[encoder_name] = allowed_decoders + if len(allowed_decoders) > 1: + + if len(valid_decoders) < len(config_space.get_hyperparameter('decoder_type').choices): + updates['decoder_type'] = HyperparameterSearchSpace(hyperparameter='decoder_type', + value_range=tuple(valid_decoders), + default_value=valid_decoders[0]) + config_space = available_encoders[encoder_name].get_hyperparameter_search_space( # type:ignore + dataset_properties, + **updates # type: ignore[call-arg] + ) + parent_hyperparameter = {'parent': hp_encoder, 'value': encoder_name} + cs.add_configuration_space( + block_prefix + encoder_name, + config_space, + parent_hyperparameter=parent_hyperparameter + ) + + for decoder_name in available_decoders.keys(): + if not decoder2encoder[decoder_name]: + continue + updates = self._get_search_space_updates(prefix=block_prefix + decoder_name) + if i == 1 and decoder_name == self.deepAR_decoder_name: + # TODO this is only a temporary solution, a fix on ConfigSpace needs to be implemented + updates['can_be_auto_regressive'] = True # type: ignore[assignment] + + config_space = available_decoders[decoder_name].get_hyperparameter_search_space( # type: ignore + dataset_properties, + **updates # type: ignore[call-arg] + ) + compatible_encoders = decoder2encoder[decoder_name] + encoders_with_multi_decoder_l = [] + encoder_with_single_decoder_l = [] + for encoder in compatible_encoders: + if len(encoder2decoder[encoder]) > 1: + encoders_with_multi_decoder_l.append(encoder) + else: + encoder_with_single_decoder_l.append(encoder) + encoders_with_multi_decoder = set(encoders_with_multi_decoder_l) + encoder_with_single_decoder = set(encoder_with_single_decoder_l) + + cs.add_configuration_space( + block_prefix + decoder_name, + config_space, + # parent_hyperparameter=parent_hyperparameter + ) + + hps = cs.get_hyperparameters() # type: List[Hyperparameter] + conditions_to_add = [] + for hp in hps: + # TODO consider if this will raise any unexpected behavior + if hp.name.startswith(block_prefix + decoder_name): + # From the implementation of ConfigSpace + # Only add a condition if the parameter is a top-level + # parameter of the new configuration space (this will be some + # kind of tree structure). 
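+                        # Illustration with hypothetical names: if 'EncA' only supports decoder 'DecX'
+                        # while 'EncAB' chooses among several decoders via its own 'decoder_type',
+                        # then a hyperparameter hp of 'DecX' becomes active through
+                        #     OrConjunction(EqualsCondition(hp, hp_encoder, 'EncA'),
+                        #                   EqualsCondition(hp, hp_decoder_type, 'DecX'))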
+                        if cs.get_parents_of(hp):
+                            continue
+                        or_cond = []
+                        for encoder_single in encoder_with_single_decoder:
+                            or_cond.append(EqualsCondition(hp,
+                                                           hp_encoder,
+                                                           encoder_single))
+                        for encoder_multi in encoders_with_multi_decoder:
+                            hp_decoder_type = cs.get_hyperparameter(f'{block_prefix + encoder_multi}:decoder_type')
+                            or_cond.append(EqualsCondition(hp, hp_decoder_type, decoder_name))
+                        if len(or_cond) == 0:
+                            continue
+                        elif len(or_cond) > 1:
+                            conditions_to_add.append(OrConjunction(*or_cond))
+                        else:
+                            conditions_to_add.append(or_cond[0])
+
+                cs.add_conditions(conditions_to_add)
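The block that follows leans on a ConfigSpace subtlety worth spelling out: an inactive conditional hyperparameter is simply absent from a sampled configuration, so downstream code silently falls back to a default value. A minimal, self-contained sketch of that behaviour (hypothetical names; `Configuration.get` usage assumed):

from ConfigSpace import ConfigurationSpace
from ConfigSpace.conditions import EqualsCondition
from ConfigSpace.hyperparameters import CategoricalHyperparameter

cs = ConfigurationSpace()
enc = CategoricalHyperparameter('encoder', ['RNNEncoder', 'MLPEncoder'])
dar = CategoricalHyperparameter('decoder_auto_regressive', [True, False])
cs.add_hyperparameters([enc, dar])
# only the encoder with a recurrent decoder may toggle auto-regression
cs.add_condition(EqualsCondition(dar, enc, 'RNNEncoder'))

config = cs.sample_configuration()
# with 'MLPEncoder', 'decoder_auto_regressive' is inactive (absent from config), so a
# consumer falls back to the default False -- hence the extra forbidden clauses below
print(config.get('decoder_auto_regressive', False))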
+            if conds_decoder_ar is not None or forbidden_decoder_ar is not None:
+                forbiddens_ar_non_recurrent: List[CS.forbidden.AbstractForbiddenClause] = []
+                for encoder in hp_encoder.choices:
+                    if len(encoder2decoder[encoder]) == 1:
+                        if available_decoders[encoder2decoder[encoder][0]].decoder_properties().recurrent:
+                            # conds_decoder_ar is not None: False can be in decoder_auto_regressive. In this case,
+                            # if hp_encoder selects encoder, then decoder_auto_regressive becomes inactive
+                            # (this indicates a default of decoder_auto_regressive=False, thus we need to add another
+                            # forbidden in case add_forbidden_for_non_ar_recurrent_decoder is required)
+                            # forbidden_decoder_ar is not None: only False in decoder_auto_regressive
+                            # add_forbidden_for_non_ar_recurrent_decoder is True: False in decoder_auto_regressive
+                            if conds_decoder_ar is not None:
+                                conds_decoder_ar.append(
+                                    EqualsCondition(decoder_auto_regressive_hp, hp_encoder, encoder)
+                                )
+                                if add_forbidden_for_non_ar_recurrent_decoder:
+                                    forbiddens_decoder_auto_regressive.append(
+                                        ForbiddenAndConjunction(
+                                            ForbiddenEqualsClause(variable_selection_hp, False),
+                                            ForbiddenEqualsClause(hp_encoder, encoder)
+                                        )
+                                    )
+                            else:
+                                if add_forbidden_for_non_ar_recurrent_decoder:
+                                    forbiddens_decoder_auto_regressive.append(
+                                        ForbiddenAndConjunction(
+                                            ForbiddenAndConjunction(
+                                                ForbiddenEqualsClause(variable_selection_hp, False),
+                                                ForbiddenEqualsClause(decoder_auto_regressive_hp, False)
+                                            ),
+                                            ForbiddenEqualsClause(hp_encoder, encoder)
+                                        )
+                                    )
+
+                    elif len(encoder2decoder[encoder]) > 1:
+                        hp_decoder_type = cs.get_hyperparameter(f'{block_prefix + encoder}:decoder_type')
+                        for decoder in hp_decoder_type.choices:
+                            if not available_decoders[decoder].decoder_properties().recurrent:
+                                # TODO this is a temporary solution: currently ConfigSpace is not able to correctly
+                                # activate/deactivate a complex nested configspace; too many forbiddens might also
+                                # raise errors. Thus we only allow decoder_ar to be conditioned on the top-level hps
+                                # and put forbidden clauses here
+                                if forbidden_decoder_ar is not None:
+                                    forbiddens_decoder_auto_regressive.append(
+                                        ForbiddenAndConjunction(
+                                            forbidden_decoder_ar,
+                                            ForbiddenEqualsClause(hp_decoder_type, decoder)
+                                        )
+                                    )
+                                else:
+                                    if add_forbidden_for_non_ar_recurrent_decoder:
+                                        forbiddens_decoder_auto_regressive.append(
+                                            ForbiddenAndConjunction(
+                                                ForbiddenAndConjunction(
+                                                    ForbiddenEqualsClause(variable_selection_hp, False),
+                                                    ForbiddenEqualsClause(decoder_auto_regressive_hp, False)
+                                                ),
+                                                ForbiddenEqualsClause(hp_decoder_type, decoder)
+                                            )
+                                        )
+
+        if forbiddens_ar_non_recurrent:
+            cs.add_forbidden_clauses(forbiddens_ar_non_recurrent)
+        if conds_decoder_ar:
+            cs.add_condition(OrConjunction(*conds_decoder_ar))
+
+        use_temporal_fusion_hp = get_hyperparameter(use_temporal_fusion, CategoricalHyperparameter)
+        cs.add_hyperparameter(use_temporal_fusion_hp)
+        if True in use_temporal_fusion_hp.choices:
+            update = self._get_search_space_updates(prefix=self.tf_prefix)
+            cs_tf = TemporalFusion.get_hyperparameter_search_space(dataset_properties,
+                                                                   **update)
+            parent_hyperparameter = {'parent': use_temporal_fusion_hp, 'value': True}
+            cs.add_configuration_space(
+                self.tf_prefix,
+                cs_tf,
+                parent_hyperparameter=parent_hyperparameter
+            )
+
+        for encoder_name, encoder in available_encoders.items():
+            encoder_is_casual = encoder.encoder_properties().is_casual
+            if not encoder_is_casual:
+                # we do not allow a non-causal encoder to appear in the lower layers of the network, e.g., if we
+                # have a network with 3 blocks, a non-causal encoder is only allowed to appear in the third layer
+                for i in range(max(min_num_blocks, 2), max_num_blocks + 1):
+                    for j in range(1, i):
+                        choice_hp = cs.get_hyperparameter(f"block_{j}:__choice__")
+                        if encoder_name in choice_hp.choices:
+                            forbidden_encoder_uncasual = [ForbiddenEqualsClause(num_blocks, i),
+                                                          ForbiddenEqualsClause(choice_hp, encoder_name)]
+                            if forbidden_decoder_ar is not None:
+                                forbidden_encoder_uncasual.append(forbidden_decoder_ar)
+                            forbiddens_decoder_auto_regressive.append(
+                                ForbiddenAndConjunction(*forbidden_encoder_uncasual)
+                            )
+
+        cs.add_forbidden_clauses(forbiddens_decoder_auto_regressive)
+
+        if self.deepAR_decoder_name in available_decoders:
+            deep_ar_hp_name = ':'.join([self.deepAR_decoder_prefix, self.deepAR_decoder_name, 'auto_regressive'])
+            if deep_ar_hp_name in cs:
+                deep_ar_hp = cs.get_hyperparameter(deep_ar_hp_name)
+                if True in deep_ar_hp.choices:
+                    forbidden_deep_ar = ForbiddenEqualsClause(deep_ar_hp, True)
+                    if min_num_blocks == 1:
+                        if max_num_blocks > 1:
+                            forbidden = ForbiddenAndConjunction(
+                                ForbiddenInClause(num_blocks, list(range(2, max_num_blocks + 1))),
+                                forbidden_deep_ar
+                            )
+                            cs.add_forbidden_clause(forbidden)
+                    else:
+                        cs.add_forbidden_clause(forbidden_deep_ar)
+
+                    forbidden_deep_ars = []
+
+                    hps_forbidden_deep_ar = [use_temporal_fusion_hp]
+                    for hp_forbidden_deep_ar in hps_forbidden_deep_ar:
+                        if True in hp_forbidden_deep_ar.choices:
+                            forbidden_deep_ars.append(ForbiddenAndConjunction(
+                                ForbiddenEqualsClause(hp_forbidden_deep_ar, True),
+                                forbidden_deep_ar
+                            ))
+                    if True in skip_connection_hp.choices:
+                        forbidden_deep_ars.append(ForbiddenAndConjunction(
+                            ForbiddenEqualsClause(skip_connection_hp, True),
+                            forbidden_deep_ar
+                        ))
+                    if forbidden_deep_ars:
+                        cs.add_forbidden_clauses(forbidden_deep_ars)
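The DeepAR-style restriction just added combines ForbiddenInClause with ForbiddenEqualsClause. Reduced to a self-contained sketch with hypothetical names, mirroring the num_blocks restriction above:

from ConfigSpace import ConfigurationSpace
from ConfigSpace.forbidden import (ForbiddenAndConjunction, ForbiddenEqualsClause,
                                   ForbiddenInClause)
from ConfigSpace.hyperparameters import CategoricalHyperparameter, OrdinalHyperparameter

cs = ConfigurationSpace()
num_blocks = OrdinalHyperparameter('num_blocks', sequence=[1, 2, 3])
auto_regressive = CategoricalHyperparameter('auto_regressive', [True, False])
cs.add_hyperparameters([num_blocks, auto_regressive])
# a DeepAR-style head only supports single-block networks: forbid it for 2+ blocks
cs.add_forbidden_clause(ForbiddenAndConjunction(
    ForbiddenInClause(num_blocks, [2, 3]),
    ForbiddenEqualsClause(auto_regressive, True),
))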
+        forbidden_mlp_local_layer = []
+        for i in range(1, max_num_blocks + 1):
+            hp_mlp_has_local_layer_name = f"block_{i}:MLPDecoder:has_local_layer"
+            if hp_mlp_has_local_layer_name in cs:
+                hp_mlp_has_local_layer = cs.get_hyperparameter(hp_mlp_has_local_layer_name)
+                if i < max_num_blocks:
+                    forbidden_mlp_local_layer.append(ForbiddenAndConjunction(
+                        ForbiddenEqualsClause(hp_mlp_has_local_layer, False),
+                        ForbiddenInClause(num_blocks, list(range(i + 1, max_num_blocks + 1))),
+                    ))
+                c1 = isinstance(skip_connection_hp, CategoricalHyperparameter) and True in skip_connection_hp.choices
+                c2 = isinstance(skip_connection_hp, Constant) and skip_connection_hp.value
+                if c1 or c2:
+                    forbidden_mlp_local_layer.append(ForbiddenAndConjunction(
+                        ForbiddenEqualsClause(hp_mlp_has_local_layer, False),
+                        ForbiddenEqualsClause(skip_connection_hp, True),
+                    ))
+                c1 = isinstance(
+                    use_temporal_fusion_hp, CategoricalHyperparameter
+                ) and True in use_temporal_fusion_hp.choices
+                c2 = isinstance(use_temporal_fusion_hp, Constant) and use_temporal_fusion_hp.value
+                if c1 or c2:
+                    forbidden_mlp_local_layer.append(ForbiddenAndConjunction(
+                        ForbiddenEqualsClause(hp_mlp_has_local_layer, False),
+                        ForbiddenEqualsClause(use_temporal_fusion_hp, True),
+                    ))
+
+        cs.add_forbidden_clauses(forbidden_mlp_local_layer)
+        return cs
+
+    @property
+    def _defaults_network(self) -> List[str]:
+        return ['RNNEncoder', 'NBEATSEncoder']
+
+    def set_hyperparameters(self,
+                            configuration: Configuration,
+                            init_params: Optional[Dict[str, Any]] = None
+                            ) -> 'autoPyTorchChoice':
+        """
+        Applies a configuration to the given component.
+        This method translates a hierarchical configuration key
+        to an actual parameter of the autoPyTorch component.
+
+        Args:
+            configuration (Configuration):
+                Which configuration to apply to the chosen component
+            init_params (Optional[Dict[str, any]]):
+                Optional arguments to initialize the chosen component
+
+        Returns:
+            self: returns an instance of self
+        """
+
+        params = configuration.get_dictionary()
+        decoder_auto_regressive = params.pop('decoder_auto_regressive', False)
+        net_structure_default_kwargs = inspect.signature(ForecastingNetworkStructure.__init__).parameters
+
+        forecasting_structure_kwargs = {
+            key: params.pop(key, value.default) for key, value in net_structure_default_kwargs.items()
+            if key != 'self'
+        }
+        if not params.pop('grn_use_dropout', False):
+            forecasting_structure_kwargs['grn_dropout_rate'] = 0.0
+
+        num_blocks = forecasting_structure_kwargs['num_blocks']
+        use_temporal_fusion = forecasting_structure_kwargs['use_temporal_fusion']
+
+        pipeline_steps = [('net_structure', ForecastingNetworkStructure(**forecasting_structure_kwargs))]
+        self.encoder_choice: List[BaseForecastingEncoder] = []
+        self.decoder_choice: List[BaseForecastingDecoder] = []
+
+        decoder_components = self.get_decoder_components()
+
+        for i in range(1, num_blocks + 1):
+            new_params = {}
+
+            block_prefix = f'block_{i}:'
+            choice = params.pop(block_prefix + '__choice__')
+
+            for param, value in params.items():
+                if param.startswith(block_prefix):
+                    param = param.replace(block_prefix + choice + ':', '')
+                    new_params[param] = value
+
+            if init_params is not None:
+                for param, value in init_params.items():
+                    if param.startswith(block_prefix):
+                        param = param.replace(block_prefix + choice + ':', '')
+                        new_params[param] = value
+
+            decoder_type: Optional[str] = None
+
+            decoder_params = {}
+            decoder_params_names = []
+            for param, value in new_params.items():
+                if decoder_type is None:
+                    for decoder_component in decoder_components.keys():
+                        if
param.startswith(block_prefix + decoder_component): + decoder_type: str = decoder_component # type:ignore[no-redef] + decoder_params_names.append(param) + param = param.replace(block_prefix + decoder_type + ':', '') # type:ignore[operator] + decoder_params[param] = value + else: + if param.startswith(block_prefix + decoder_type): + decoder_params_names.append(param) + param = param.replace(block_prefix + decoder_type + ':', '') + decoder_params[param] = value + assert decoder_type is not None, 'Decoder must be given to initialize a forecasting backbone!' + + for param_name in decoder_params_names: + del new_params[param_name] + new_params['random_state'] = self.random_state + new_params['block_number'] = i + decoder_params['random_state'] = self.random_state + decoder_params['block_number'] = i + # for mlp decoder, to avoid decoder's auto_regressive being overwritten by decoder_auto_regressive + if 'auto_regressive' not in decoder_params: + decoder_params['auto_regressive'] = decoder_auto_regressive + encoder = self.get_components()[choice](**new_params) + decoder = decoder_components[decoder_type](**decoder_params) + pipeline_steps.extend([(f'encoder_{i}', encoder), (f'decoder_{i}', decoder)]) + self.encoder_choice.append(encoder) + self.decoder_choice.append(decoder) + + new_params = {} + if use_temporal_fusion: + for param, value in params.items(): + if param.startswith(self.tf_prefix): + param = param.replace(self.tf_prefix + ':', '') + new_params[param] = value + temporal_fusion = TemporalFusion(self.random_state, + **new_params) + pipeline_steps.extend([('temporal_fusion', temporal_fusion)]) + + self.pipeline = Pipeline(pipeline_steps) + self.choice = self.encoder_choice[0] + return self + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'SeqEncoder', + 'name': 'SeqEncoder', + 'handles_tabular': False, + 'handles_image': False, + 'handles_time_series': True, + } diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/other_components/TemporalFusion.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/other_components/TemporalFusion.py new file mode 100644 index 000000000..b428e1a16 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/other_components/TemporalFusion.py @@ -0,0 +1,140 @@ +from typing import Any, Dict, Iterable, List, Optional + +from ConfigSpace import ConfigurationSpace +from ConfigSpace.conditions import EqualsCondition +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformFloatHyperparameter, + UniformIntegerHyperparameter +) + +import numpy as np + +import torch + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.base_component import autoPyTorchComponent +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.cells import \ + TemporalFusionLayer +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import \ + NetworkStructure +from autoPyTorch.utils.common import (FitRequirement, + HyperparameterSearchSpace, + add_hyperparameter, get_hyperparameter) + + +class TemporalFusion(autoPyTorchComponent): + """ + Temporal Fusion layer. For details we refer to + Lim et al. 
Temporal Fusion Transformers for Interpretable Multi-horizon Time Series Forecasting
+    https://arxiv.org/abs/1912.09363
+    """
+    _required_properties = ["name", "shortname", "handles_tabular", "handles_image", "handles_time_series"]
+
+    def __init__(self,
+                 random_state: Optional[np.random.RandomState] = None,
+                 attention_n_head_log: int = 2,
+                 attention_d_model_log: int = 4,
+                 use_dropout: bool = False,
+                 dropout_rate: Optional[float] = None):
+        autoPyTorchComponent.__init__(self, random_state=random_state)
+        self.add_fit_requirements(
+            self._required_fit_requirements
+        )
+        self.attention_n_head_log = attention_n_head_log
+        self.attention_d_model_log = attention_d_model_log
+        self.use_dropout = use_dropout
+        self.dropout_rate = dropout_rate
+
+        self.temporal_fusion: Optional[torch.nn.Module] = None
+        self.n_decoder_output_features = 0
+
+    @property
+    def _required_fit_requirements(self) -> List[FitRequirement]:
+        return [
+            FitRequirement('window_size', (Iterable,), user_defined=True, dataset_property=True),
+            FitRequirement('n_decoder_output_features', (int,), user_defined=False, dataset_property=False),
+            FitRequirement('network_encoder', (Dict,), user_defined=False, dataset_property=False),
+            FitRequirement('network_structure', (NetworkStructure,), user_defined=False, dataset_property=False),
+        ]
+
+    def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchComponent:
+        network_structure = X['network_structure']  # type: NetworkStructure
+
+        self.temporal_fusion = TemporalFusionLayer(window_size=X['window_size'],
+                                                   network_structure=network_structure,
+                                                   network_encoder=X['network_encoder'],
+                                                   n_decoder_output_features=X['n_decoder_output_features'],
+                                                   d_model=2 ** self.attention_d_model_log,
+                                                   n_head=2 ** self.attention_n_head_log,
+                                                   dropout=self.dropout_rate
+                                                   )
+        self.n_decoder_output_features = 2 ** self.attention_d_model_log
+        return self
+
+    def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
+        X.update({"n_decoder_output_features": self.n_decoder_output_features,
+                  "temporal_fusion": self.temporal_fusion})
+        return X
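fit above maps the two log-scale hyperparameters to the attention dimensions via 2 ** x. A rough sketch of what the resulting attention module sees, using plain nn.MultiheadAttention as a stand-in for TemporalFusionLayer (the concrete values are assumptions taken from the component defaults):

import torch
from torch import nn

attention_d_model_log, attention_n_head_log = 4, 2
d_model, n_head = 2 ** attention_d_model_log, 2 ** attention_n_head_log  # 16 features, 4 heads

attn = nn.MultiheadAttention(embed_dim=d_model, num_heads=n_head,
                             dropout=0.1, batch_first=True)
x = torch.randn(8, 24, d_model)  # (batch, time steps, features)
out, _ = attn(x, x, x)           # self-attention over the window
print(out.shape)                 # torch.Size([8, 24, 16])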
+    @staticmethod
+    def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Any]:
+        return {
+            'shortname': 'TemporalFusion',
+            'name': 'TemporalFusion',
+            'handles_tabular': False,
+            'handles_image': False,
+            'handles_time_series': True,
+        }
+
+    @staticmethod
+    def get_hyperparameter_search_space(
+        dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None,
+        attention_n_head_log: HyperparameterSearchSpace = HyperparameterSearchSpace(
+            hyperparameter='attention_n_head_log',
+            value_range=(1, 3),
+            default_value=2,
+        ),
+        attention_d_model_log: HyperparameterSearchSpace = HyperparameterSearchSpace(
+            hyperparameter='attention_d_model_log',
+            value_range=(4, 8),
+            default_value=4,
+        ),
+        use_dropout: HyperparameterSearchSpace = HyperparameterSearchSpace(
+            hyperparameter='use_dropout',
+            value_range=(True, False),
+            default_value=True,
+        ),
+        dropout_rate: HyperparameterSearchSpace = HyperparameterSearchSpace(
+            hyperparameter='dropout_rate',
+            value_range=(0.0, 0.8),
+            default_value=0.1,
+        )
+    ) -> ConfigurationSpace:
+        """Return the configuration space of this component.
+
+        Args:
+            dataset_properties (Optional[Dict[str, Union[str, int]]):
+                Describes the dataset to work on
+            attention_n_head_log (HyperparameterSearchSpace):
+                log2 of the number of heads of the interpretable multi-head attention
+            attention_d_model_log (HyperparameterSearchSpace):
+                log2 of the input dimension (d_model) of the attention model
+            use_dropout (HyperparameterSearchSpace):
+                whether dropout is applied to the temporal fusion layer
+            dropout_rate (HyperparameterSearchSpace):
+                dropout rate of the temporal fusion layer
+        Returns:
+            ConfigurationSpace:
+                The configuration space of this algorithm.
+        """
+        cs = ConfigurationSpace()
+        add_hyperparameter(cs, attention_n_head_log, UniformIntegerHyperparameter)
+        add_hyperparameter(cs, attention_d_model_log, UniformIntegerHyperparameter)
+        use_dropout = get_hyperparameter(use_dropout, CategoricalHyperparameter)
+        dropout_rate = get_hyperparameter(dropout_rate, UniformFloatHyperparameter)
+
+        cs.add_hyperparameters([use_dropout, dropout_rate])
+        cond_dropout = EqualsCondition(dropout_rate, use_dropout, True)
+        cs.add_condition(cond_dropout)
+        return cs
diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/other_components/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/other_components/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/utils.py b/autoPyTorch/pipeline/components/setup/network_backbone/utils.py
index bb1a93ac1..0539df422 100644
--- a/autoPyTorch/pipeline/components/setup/network_backbone/utils.py
+++ b/autoPyTorch/pipeline/components/setup/network_backbone/utils.py
@@ -16,18 +16,23 @@
 }
 
 
-def get_output_shape(network: torch.nn.Module, input_shape: Tuple[int, ...]
+def get_output_shape(network: torch.nn.Module, input_shape: Tuple[int, ...], has_hidden_states: bool = False
                      ) -> Tuple[int, ...]:
     """
     Run a dummy forward pass to get the output shape of the backbone.
     Can and should be overridden by subclasses that know the output shape
     without running a dummy forward pass.
     :param input_shape: shape of the input
+    :param has_hidden_states: bool, whether the network backbone also returns hidden states;
+        if yes, the network returns a tuple and only its first item is considered
     :return: output_shape
     """
     placeholder = torch.randn((2, *input_shape), dtype=torch.float)
     with torch.no_grad():
-        output = network(placeholder)
+        if has_hidden_states:
+            output = network(placeholder)[0]
+        else:
+            output = network(placeholder)
     return tuple(output.shape[1:])
diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py
index 49ecf40b7..fdcf051bd 100644
--- a/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py
+++ b/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py
@@ -1,4 +1,4 @@
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 from ConfigSpace.configuration_space import ConfigurationSpace
 from ConfigSpace.hyperparameters import (
@@ -34,7 +34,7 @@ def __init__(self, config: Dict[str, Any], num_input_features: np.ndarray, num_n
         # list of number of categories of categorical data
         # or 0 for numerical data
         self.num_input_features = num_input_features
-        categorical_features = self.num_input_features > 0
+        categorical_features: np.ndarray = self.num_input_features > 0
 
         self.num_categorical_features = self.num_input_features[categorical_features]
@@ -52,6 +52,36 @@ def __init__(self, config: Dict[str, Any], num_input_features: np.ndarray, num_n
 
         self.ee_layers = self._create_ee_layers()
 
+    def get_partial_models(self, subset_features: List[int]) -> "_LearnedEntityEmbedding":
+        """
+        Extract a partial model that only works on the subset of the data that is passed to the embedding
+        network. This function is implemented for time series forecasting tasks, where the known future
+        features are only a subset of the past features
+        Args:
+            subset_features (List[int]):
+                a set of indices identifying which features will pass through the partial model
+
+        Returns:
+            partial_model (_LearnedEntityEmbedding)
+                a new partial model
+        """
+        num_input_features = self.num_input_features[subset_features]
+        num_numerical_features = sum([sf < self.num_numerical for sf in subset_features])
+
+        num_output_dimensions = [self.num_output_dimensions[sf] for sf in subset_features]
+        embed_features = [self.embed_features[sf] for sf in subset_features]
+
+        ee_layers = []
+        ee_layer_tracker = 0
+        for sf in subset_features:
+            if self.embed_features[sf]:
+                ee_layers.append(self.ee_layers[ee_layer_tracker])
+                ee_layer_tracker += 1
+        ee_layers = nn.ModuleList(ee_layers)
+
+        return PartialLearnedEntityEmbedding(num_input_features, num_numerical_features, embed_features,
+                                             num_output_dimensions, ee_layers)
+
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         # pass the columns of each categorical feature through entity embedding layer
         # before passing it through the model
@@ -64,15 +94,15 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
                 x_pointer += 1
                 continue
             if x_pointer > last_concat:
-                concat_seq.append(x[:, last_concat: x_pointer])
-            categorical_feature_slice = x[:, x_pointer: x_pointer + num_in]
+                concat_seq.append(x[..., last_concat: x_pointer])
+            categorical_feature_slice = x[..., x_pointer: x_pointer + num_in]
             concat_seq.append(self.ee_layers[layer_pointer](categorical_feature_slice))
             layer_pointer += 1
             x_pointer += num_in
             last_concat = x_pointer
-        concat_seq.append(x[:, last_concat:])
-        return torch.cat(concat_seq, dim=1)
+        concat_seq.append(x[..., last_concat:])
+        return
torch.cat(concat_seq, dim=-1) def _create_ee_layers(self) -> nn.ModuleList: # entity embeding layers are Linear Layers @@ -85,6 +115,35 @@ def _create_ee_layers(self) -> nn.ModuleList: return layers +class PartialLearnedEntityEmbedding(_LearnedEntityEmbedding): + """ + Construct a partial Embedding network that is derived from a learned embedding network and only applied to a subset + of the input features. This is applied to forecasting tasks where not all the features might be known beforehand + """ + def __init__(self, + num_input_features: np.ndarray, + num_numerical_features: int, + embed_features: List[bool], + num_output_dimensions: List[int], + ee_layers: nn.Module + ): + super(_LearnedEntityEmbedding, self).__init__() + self.num_numerical = num_numerical_features + # list of number of categories of categorical data + # or 0 for numerical data + self.num_input_features = num_input_features + categorical_features: np.ndarray = self.num_input_features > 0 + + self.num_categorical_features = self.num_input_features[categorical_features] + + self.embed_features = embed_features + + self.num_output_dimensions = num_output_dimensions + self.num_out_feats = self.num_numerical + sum(self.num_output_dimensions) + + self.ee_layers = ee_layers + + class LearnedEntityEmbedding(NetworkEmbeddingComponent): """ Class to learn an embedding for categorical hyperparameters. @@ -94,10 +153,14 @@ def __init__(self, random_state: Optional[np.random.RandomState] = None, **kwarg super().__init__(random_state=random_state) self.config = kwargs - def build_embedding(self, num_input_features: np.ndarray, num_numerical_features: int) -> nn.Module: - return _LearnedEntityEmbedding(config=self.config, - num_input_features=num_input_features, - num_numerical_features=num_numerical_features) + def build_embedding(self, + num_input_features: np.ndarray, + num_numerical_features: int) -> Tuple[nn.Module, List[int]]: + + embedding = _LearnedEntityEmbedding(config=self.config, + num_input_features=num_input_features, + num_numerical_features=num_numerical_features) + return embedding, embedding.num_output_dimensions @staticmethod def get_hyperparameter_search_space( @@ -107,9 +170,10 @@ def get_hyperparameter_search_space( value_range=(3, 7), default_value=5, log=True), - dimension_reduction: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="dimension_reduction", - value_range=(0, 1), - default_value=0.5), + dimension_reduction: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter="dimension_reduction", + value_range=(0, 1), + default_value=0.5), ) -> ConfigurationSpace: cs = ConfigurationSpace() add_hyperparameter(cs, min_unique_values_for_embedding, UniformIntegerHyperparameter) @@ -131,5 +195,5 @@ def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesT 'name': 'LearnedEntityEmbedding', 'handles_tabular': True, 'handles_image': False, - 'handles_time_series': False, + 'handles_time_series': True, } diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py index aded4f84d..52c56bc00 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py @@ -1,4 +1,4 @@ -from typing import Dict, Optional, Union +from typing import Dict, List, Optional, Tuple, Union from ConfigSpace.configuration_space import ConfigurationSpace @@ -12,6 +12,9 @@ class 
_NoEmbedding(nn.Module): + def get_partial_models(self, subset_features: List[int]) -> "_NoEmbedding": + return self + def forward(self, x: torch.Tensor) -> torch.Tensor: return x @@ -24,8 +27,10 @@ class NoEmbedding(NetworkEmbeddingComponent): def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = None): super().__init__(random_state=random_state) - def build_embedding(self, num_input_features: np.ndarray, num_numerical_features: int) -> nn.Module: - return _NoEmbedding() + def build_embedding(self, + num_input_features: np.ndarray, + num_numerical_features: int) -> Tuple[nn.Module, Optional[List[int]]]: + return _NoEmbedding(), None @staticmethod def get_hyperparameter_search_space( @@ -42,5 +47,5 @@ def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesT 'name': 'NoEmbedding', 'handles_tabular': True, 'handles_image': False, - 'handles_time_series': False, + 'handles_time_series': True, } diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/__init__.py b/autoPyTorch/pipeline/components/setup/network_embedding/__init__.py index 381e0735d..452e74cc1 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/__init__.py @@ -1,12 +1,10 @@ import os from collections import OrderedDict -from typing import Dict, List, Optional +from typing import Any, Dict, List, Optional import ConfigSpace.hyperparameters as CSH from ConfigSpace.configuration_space import ConfigurationSpace -import numpy as np - from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice from autoPyTorch.pipeline.components.base_component import ( @@ -205,6 +203,6 @@ def get_hyperparameter_search_space( self.dataset_properties_ = dataset_properties return cs - def transform(self, X: np.ndarray) -> np.ndarray: + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: assert self.choice is not None, "Cannot call transform before the object is initialized" - return self.choice.transform(X) + return self.choice.transform(X) # type: ignore[no-any-return] diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py index 8652c347c..1ff5df13e 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py @@ -1,5 +1,5 @@ import copy -from typing import Any, Dict, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union import numpy as np @@ -15,21 +15,38 @@ def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = N super().__init__() self.embedding: Optional[nn.Module] = None self.random_state = random_state + self.feature_shapes: Dict[str, int] = {} def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: num_numerical_columns, num_input_features = self._get_args(X) - self.embedding = self.build_embedding( + self.embedding, num_output_features = self.build_embedding( num_input_features=num_input_features, - num_numerical_features=num_numerical_columns) + num_numerical_features=num_numerical_columns + ) + if "feature_shapes" in X['dataset_properties']: + if num_output_features is not None: + feature_shapes = X['dataset_properties']['feature_shapes'] + # forecasting tasks + feature_names = 
X['dataset_properties']['feature_names'] + for idx_cat, n_output_cat in enumerate(num_output_features[num_numerical_columns:]): + cat_feature_name = feature_names[idx_cat + num_numerical_columns] + feature_shapes[cat_feature_name] = n_output_cat + self.feature_shapes = feature_shapes + else: + self.feature_shapes = X['dataset_properties']['feature_shapes'] return self def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: X.update({'network_embedding': self.embedding}) + if "feature_shapes" in X['dataset_properties']: + X['dataset_properties'].update({"feature_shapes": self.feature_shapes}) return X - def build_embedding(self, num_input_features: np.ndarray, num_numerical_features: int) -> nn.Module: + def build_embedding(self, + num_input_features: np.ndarray, + num_numerical_features: int) -> Tuple[nn.Module, Optional[List[int]]]: raise NotImplementedError def _get_args(self, X: Dict[str, Any]) -> Tuple[int, np.ndarray]: @@ -39,12 +56,22 @@ def _get_args(self, X: Dict[str, Any]) -> Tuple[int, np.ndarray]: else: X_train = copy.deepcopy(X['backend'].load_datamanager().train_tensors[0][:2]) - numerical_column_transformer = X['tabular_transformer'].preprocessor. \ - named_transformers_['numerical_pipeline'] - num_numerical_columns = numerical_column_transformer.transform( - X_train[:, X['dataset_properties']['numerical_columns']]).shape[1] + if 'tabular_transformer' in X: + numerical_column_transformer = X['tabular_transformer'].preprocessor. \ + named_transformers_['numerical_pipeline'] + elif 'time_series_feature_transformer' in X: + numerical_column_transformer = X['time_series_feature_transformer'].preprocessor. \ + named_transformers_['numerical_pipeline'] + else: + raise ValueError("Either a tabular or time_series transformer must be contained!") + if hasattr(X_train, 'iloc'): + num_numerical_columns = numerical_column_transformer.transform( + X_train.iloc[:, X['dataset_properties']['numerical_columns']]).shape[1] + else: + num_numerical_columns = numerical_column_transformer.transform( + X_train[:, X['dataset_properties']['numerical_columns']]).shape[1] num_input_features = np.zeros((num_numerical_columns + len(X['dataset_properties']['categorical_columns'])), - dtype=int) + dtype=np.int32) categories = X['dataset_properties']['categories'] for i, category in enumerate(categories): diff --git a/autoPyTorch/pipeline/components/setup/network_head/__init__.py b/autoPyTorch/pipeline/components/setup/network_head/__init__.py index 34163b986..ac52cf1c9 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_head/__init__.py @@ -1,12 +1,10 @@ import os from collections import OrderedDict -from typing import Dict, List, Optional +from typing import Any, Dict, List, Optional import ConfigSpace.hyperparameters as CSH from ConfigSpace.configuration_space import ConfigurationSpace -import numpy as np - from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice from autoPyTorch.pipeline.components.base_component import ( @@ -189,6 +187,6 @@ def get_hyperparameter_search_space( self.dataset_properties_ = dataset_properties return cs - def transform(self, X: np.ndarray) -> np.ndarray: + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: assert self.choice is not None, "Cannot call transform before the object is initialized" - return self.choice.transform(X) + return self.choice.transform(X) # type: ignore[no-any-return] diff --git 
a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/NBEATS_head.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/NBEATS_head.py new file mode 100644 index 000000000..8ca713882 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/NBEATS_head.py @@ -0,0 +1,157 @@ +# THE MIT License + +# Copyright 2020 Jan Beitner + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +# This part of implementation follows pytorch-forecasting: +# https://github.com/jdb78/pytorch-forecasting/blob/master/pytorch_forecasting/models/nbeats/sub_modules.py + +from typing import List, Tuple + +import numpy as np + +import torch +from torch import nn + +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.\ + NBEATSDecoder import NBEATSBlock + + +class TransposeLinear(nn.Module): + def __init__(self, weights: torch.Tensor): + super().__init__() + self.register_buffer('weights', weights) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return x.mm(self.weights) + + +def linspace(backcast_length: int, forecast_length: int, centered: bool = False) -> Tuple[np.ndarray, np.ndarray]: + """ + a function to generate a linear space to encode the positions of the components. For details. We refer to + Oreshkin et al. 
N-BEATS: Neural basis expansion analysis for interpretable time series forecasting + https://arxiv.org/abs/1905.10437 + """ + if centered: + norm = max(backcast_length, forecast_length) + start = -backcast_length + stop = forecast_length - 1 + else: + norm = backcast_length + forecast_length + start = 0 + stop = backcast_length + forecast_length - 1 + lin_space = np.linspace(start / norm, stop / norm, backcast_length + forecast_length, dtype=np.float32) + b_ls = lin_space[:backcast_length] + f_ls = lin_space[backcast_length:] + return b_ls, f_ls + + +def get_generic_heads(block_width: int, thetas_dim: int, + forecast_length: int, backcast_length: int) -> Tuple[nn.Module, nn.Module]: + backcast_head = nn.Sequential(nn.Linear(block_width, thetas_dim, bias=False), + nn.Linear(thetas_dim, backcast_length, bias=False)) + forecast_head = nn.Sequential(nn.Linear(block_width, thetas_dim, bias=False), + nn.Linear(thetas_dim, forecast_length, bias=False)) + return backcast_head, forecast_head + + +def get_trend_heads(block_width: int, thetas_dim: int, + forecast_length: int, backcast_length: int) -> Tuple[nn.Module, nn.Module]: + base_layer = nn.Linear(block_width, thetas_dim, bias=False) + + backcast_linspace, forecast_linspace = linspace(backcast_length, forecast_length, centered=True) + norm = np.sqrt(forecast_length / thetas_dim) # ensure range of predictions is comparable to input + + coefficients_backcast = torch.tensor([backcast_linspace ** i for i in range(thetas_dim)], dtype=torch.float32) + + coefficients_forecast = torch.tensor([forecast_linspace ** i for i in range(thetas_dim)], dtype=torch.float32) + + backcast_head = nn.Sequential(base_layer, + TransposeLinear(coefficients_backcast * norm)) + forecast_head = nn.Sequential(base_layer, + TransposeLinear(coefficients_forecast * norm)) + + return backcast_head, forecast_head + + +def get_seasonality_heads(block_width: int, thetas_dim: int, + forecast_length: int, backcast_length: int) -> Tuple[nn.Module, nn.Module]: + base_layer = nn.Linear(block_width, forecast_length, bias=False) + + backcast_linspace, forecast_linspace = linspace(backcast_length, forecast_length, centered=False) + + def get_frequencies(n: int) -> np.ndarray: + return np.linspace(0, (backcast_length + forecast_length) / thetas_dim, n) + + p1, p2 = (forecast_length // 2, forecast_length // 2) if forecast_length % 2 == 0 else \ + (forecast_length // 2, forecast_length // 2 + 1) + + s1_b = torch.tensor( + [np.cos(2 * np.pi * i * backcast_linspace) for i in get_frequencies(p1)], dtype=torch.float32) # H/2-1 + s2_b = torch.tensor( + [np.sin(2 * np.pi * i * backcast_linspace) for i in get_frequencies(p2)], dtype=torch.float32) + + s1_f = torch.tensor( + [np.cos(2 * np.pi * i * forecast_linspace) for i in get_frequencies(p1)], dtype=torch.float32 + ) # H/2-1 + s2_f = torch.tensor( + [np.sin(2 * np.pi * i * forecast_linspace) for i in get_frequencies(p2)], dtype=torch.float32 + ) + + backcast_head = nn.Sequential(base_layer, + TransposeLinear(torch.cat([s1_b, s2_b]))) + forecast_head = nn.Sequential(base_layer, + TransposeLinear(torch.cat([s1_f, s2_f]))) + return backcast_head, forecast_head + + +def build_NBEATS_network(nbeats_decoder: List[List[NBEATSBlock]], + output_shape: Tuple[int]) -> nn.ModuleList: + nbeats_blocks = [] + for stack_idx, stack in enumerate(nbeats_decoder): + for block_idx, block in enumerate(nbeats_decoder[stack_idx]): + stack_type = block.stack_type + if stack_type == 'generic': + backcast_head, forecast_head = get_generic_heads(block_width=block.width, + 
thetas_dim=block.expansion_coefficient_length, + forecast_length=np.product(output_shape).item(), + backcast_length=block.n_in_features) + elif stack_type == 'trend': + backcast_head, forecast_head = get_trend_heads(block_width=block.width, + thetas_dim=block.expansion_coefficient_length, + forecast_length=np.product(output_shape).item(), + backcast_length=block.n_in_features) + elif stack_type == 'seasonality': + backcast_head, forecast_head = get_seasonality_heads(block_width=block.width, + thetas_dim=block.expansion_coefficient_length, + forecast_length=np.product( + output_shape).item(), + backcast_length=block.n_in_features) + else: + raise ValueError(f"Unsupported stack_type {stack_type}") + block.backcast_head = backcast_head + block.forecast_head = forecast_head + + nbeats_blocks.append(block) + if nbeats_blocks[-1].weight_sharing: + block = nbeats_blocks[-1] + for _ in range(block.num_blocks - 1): + nbeats_blocks.append(nbeats_blocks[-1]) + return nn.ModuleList(nbeats_blocks) diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/__init__.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py new file mode 100644 index 000000000..2cc3178a9 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py @@ -0,0 +1,218 @@ +# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# or in the "license" file accompanying this file. This file is distributed +# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either +# express or implied. See the License for the specific language governing +# permissions and limitations under the License. + +# This part of codes mainly follow the implementation in gluonts: +# https://github.com/awslabs/gluon-ts/blob/master/src/gluonts/torch/modules/distribution_output.py +# However, we don't simply follow their implementation mainly due to the different network backbone. +# Additionally, scale information is not presented here to avoid + +from abc import abstractmethod +from typing import Any, Dict, NamedTuple, Tuple, Type + +import numpy as np + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.distributions import Beta, Distribution, Gamma, Normal, Poisson, StudentT + + +class ProjectionLayer(nn.Module): + """ + A projection layer that project features to a torch distribution + """ + + value_in_support = 0.0 + + # https://github.com/awslabs/gluon-ts/blob/master/src/gluonts/torch/modules/distribution_output.py + + def __init__( + self, + num_in_features: int, + output_shape: Tuple[int, ...], + n_prediction_heads: int, + decoder_has_local_layer: bool, + **kwargs: Any, + ): + super().__init__(**kwargs) + + # we consider all the prediction steps holistically. 
thus, the output of the poj layer is + # n_prediction_steps * dim *output_shape + + def build_single_proj_layer(arg_dim: int) -> nn.Module: + """ + build a single proj layer given the input dims, the output is unflattened to fit the required output_shape + and n_prediction_steps. + we note that output_shape's first dimensions is always n_prediction_steps + Args: + arg_dim (int): + dimension of the target distribution + + Returns: + proj_layer (nn.Module): + projection layer that maps the decoder output to parameterize distributions + """ + if decoder_has_local_layer: + return nn.Sequential( + nn.Linear(num_in_features, np.prod(output_shape).item() * arg_dim), + nn.Unflatten(-1, (*output_shape, arg_dim)), + ) + else: + return nn.Sequential( + nn.Linear( + num_in_features, + n_prediction_heads * np.prod(output_shape).item() * arg_dim, + ), + nn.Unflatten(-1, (n_prediction_heads, *output_shape, arg_dim)), + ) + + self.proj = nn.ModuleList( + [build_single_proj_layer(dim) for dim in self.arg_dims.values()] + ) + + def forward(self, x: torch.Tensor) -> torch.distributions: + """ + get a target distribution + Args: + x: input tensor ([batch_size, in_features]): + input tensor, acquired by the base header, have the shape [batch_size, in_features] + + Returns: + dist: torch.distributions ([batch_size, n_prediction_steps, output_shape]): + an output torch distribution with shape (batch_size, n_prediction_steps, output_shape) + """ + params_unbounded = [proj(x) for proj in self.proj] + return self.dist_cls(*self.domain_map(*params_unbounded)) + + @property + @abstractmethod + def arg_dims(self) -> Dict[str, int]: + raise NotImplementedError + + @abstractmethod + def domain_map(self, *args: torch.Tensor) -> Tuple[torch.Tensor, ...]: + raise NotImplementedError + + @property + @abstractmethod + def dist_cls(self) -> Type[Distribution]: + raise NotImplementedError + + +class NormalOutput(ProjectionLayer): + @property + def arg_dims(self) -> Dict[str, int]: + return {"loc": 1, "scale": 1} + + def domain_map(self, loc: torch.Tensor, scale: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: # type: ignore + scale = F.softplus(scale) + 1e-10 + return loc.squeeze(-1), scale.squeeze(-1) + + @property + def dist_cls(self) -> Type[Distribution]: + return Normal # type: ignore[no-any-return] + + +class StudentTOutput(ProjectionLayer): + @property + def arg_dims(self) -> Dict[str, int]: + return {"df": 1, "loc": 1, "scale": 1} + + def domain_map( # type: ignore[override] + self, df: torch.Tensor, loc: torch.Tensor, scale: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + scale = F.softplus(scale) + 1e-10 + df = 2.0 + F.softplus(df) + return df.squeeze(-1), loc.squeeze(-1), scale.squeeze(-1) + + @property + def dist_cls(self) -> Type[Distribution]: + return StudentT # type: ignore[no-any-return] + + +class BetaOutput(ProjectionLayer): + value_in_support = 0.5 + + @property + def arg_dims(self) -> Dict[str, int]: + return {"concentration1": 1, "concentration0": 1} + + def domain_map( # type: ignore[override] + self, concentration1: torch.Tensor, concentration0: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + # TODO we need to adapt epsilon value given the datatype of this module + epsilon = 1e-10 + concentration1 = F.softplus(concentration1) + epsilon + concentration0 = F.softplus(concentration0) + epsilon + return concentration1.squeeze(-1), concentration0.squeeze(-1) + + @property + def dist_cls(self) -> Type[Distribution]: + # TODO consider constraints on Beta!!! 
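+        # Sketch of what such constraint handling could involve (not implemented here):
+        # Beta support is the open interval (0, 1), so targets would need an affine
+        # map into the unit interval before fitting, e.g.
+        #     y01 = (y - y_min) / (y_max - y_min + 1e-10)
+        # and samples mapped back via y01 * (y_max - y_min) + y_min.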
+ return Beta # type: ignore[no-any-return] + + +class GammaOutput(ProjectionLayer): + value_in_support = 0.5 + + @property + def arg_dims(self) -> Dict[str, int]: + return {"concentration": 1, "rate": 1} + + def domain_map( # type: ignore[override] + self, concentration: torch.Tensor, rate: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + # TODO we need to adapt epsilon value given the datatype of this module + epsilon = 1e-10 + concentration = F.softplus(concentration) + epsilon + rate = F.softplus(rate) + epsilon + return concentration.squeeze(-1), rate.squeeze(-1) + + @property + def dist_cls(self) -> Type[Distribution]: + return Gamma # type: ignore[no-any-return] + + +class PoissonOutput(ProjectionLayer): + @property + def arg_dims(self) -> Dict[str, int]: + return {"rate": 1} + + def domain_map(self, rate: torch.Tensor) -> Tuple[torch.Tensor]: # type: ignore[override] + rate_pos = F.softplus(rate).clone() + return (rate_pos.squeeze(-1),) + + @property + def dist_cls(self) -> Type[Distribution]: + return Poisson # type: ignore[no-any-return] + + +ALL_DISTRIBUTIONS = { + "studentT": StudentTOutput, + "normal": NormalOutput, + # 'beta': BetaOutput, + # 'gamma': GammaOutput, + # 'poisson': PoissonOutput +} # type: Dict[str, Type[ProjectionLayer]] + + +class DisForecastingStrategy(NamedTuple): + dist_cls: str + forecast_strategy: str = "sample" + num_samples: int = 100 + aggregation: str = "mean" + + +# TODO find components that are compatible with beta, gamma and poisson distribution! + +# TODO consider how to implement NegativeBinomialOutput without scale information diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py new file mode 100644 index 000000000..7cfc0bbf9 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py @@ -0,0 +1,245 @@ +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union + +from ConfigSpace import ConfigurationSpace + +import numpy as np + +import torch +from torch import nn + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.base_component import BaseEstimator +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.components import \ + DecoderBlockInfo +from autoPyTorch.pipeline.components.setup.network_head.base_network_head import NetworkHeadComponent +from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.NBEATS_head import build_NBEATS_network +from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distribution import ( + ALL_DISTRIBUTIONS, + DisForecastingStrategy +) +from autoPyTorch.utils.common import FitRequirement + + +class QuantileHead(nn.Module): + def __init__(self, head_components: List[nn.Module]): + super().__init__() + self.net = nn.ModuleList(head_components) + + def forward(self, x: torch.Tensor) -> List[torch.Tensor]: + return [net(x) for net in self.net] + + +class ForecastingHead(NetworkHeadComponent): + """ + Base class for network heads used for forecasting. + Holds the head module and the config which was used to create it. 
+ """ + _required_properties = ["name", "shortname", "handles_tabular", "handles_image", "handles_time_series"] + + def __init__(self, + random_state: Optional[np.random.RandomState] = None, + ): + super(NetworkHeadComponent, self).__init__(random_state=random_state) + + self.add_fit_requirements(self._required_fit_requirements) + self.head: Optional[nn.Module] = None + self.output_shape: Optional[Tuple[int]] = None + + @property + def _required_fit_requirements(self) -> List[FitRequirement]: + return [ + FitRequirement('input_shape', (Iterable,), user_defined=True, dataset_property=True), + FitRequirement('auto_regressive', (bool,), user_defined=False, dataset_property=False), + FitRequirement('n_decoder_output_features', (int,), user_defined=False, dataset_property=False), + FitRequirement('network_decoder', (Dict,), user_defined=False, dataset_property=False), + FitRequirement('n_prediction_heads', (int,), user_defined=False, dataset_property=False), + FitRequirement('output_shape', (Iterable,), user_defined=True, dataset_property=True), + FitRequirement('net_output_type', (str,), user_defined=False, dataset_property=False), + FitRequirement('n_prediction_steps', (int,), user_defined=False, dataset_property=True) + + ] + + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: + """ + Builds the head component and assigns it to self.head + + Args: + X (X: Dict[str, Any]): + Dependencies needed by current component to perform fit + y (Any): + not used. To comply with sklearn API + Returns: + Self + """ + self.check_requirements(X, y) + + output_shape = X['dataset_properties']['output_shape'] + + net_output_type = X['net_output_type'] + + if 'block_1' in X['network_decoder'] and X['network_decoder']['block_1'].decoder_properties.multi_blocks: + # if the decoder is a stacked block, we directly build head inside the decoder + if net_output_type != 'regression': + raise ValueError("decoder with multi block structure only allow regression loss!") + self.output_shape = (X['dataset_properties']['n_prediction_steps'], output_shape[-1]) # type: ignore + return self + + num_quantiles = 0 + dist_cls = None + if net_output_type == 'distribution': + if 'dist_forecasting_strategy' not in X: + raise ValueError('Distribution output type must contain dis_forecasting_strategy!') + dist_forecasting_strategy = X['dist_forecasting_strategy'] # type: DisForecastingStrategy + dist_cls = dist_forecasting_strategy.dist_cls + elif net_output_type == 'quantile': + if 'quantile_values' not in X: + raise ValueError("For Quantile losses, quantiles must be given in X!") + num_quantiles = len(X['quantile_values']) + + head_n_in_features: int = X["n_decoder_output_features"] + n_prediction_heads = X["n_prediction_heads"] + + decoder_has_local_layer = X.get('mlp_has_local_layer', True) + + head_components = self.build_head( + head_n_in_features=head_n_in_features, + output_shape=output_shape, + decoder_has_local_layer=decoder_has_local_layer, + net_output_type=net_output_type, + dist_cls=dist_cls, + n_prediction_heads=n_prediction_heads, + num_quantiles=num_quantiles, + ) + self.head = head_components + return self + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + """ + Adds the network head into the fit dictionary 'X' and returns it. 
+
+        Args:
+            X (Dict[str, Any]):
+                'X' dictionary
+        Returns:
+            (Dict[str, Any]):
+                the updated 'X' dictionary
+        """
+        if self.head is not None:
+            X.update({'network_head': self.head})
+        else:
+            decoder = X['network_decoder']
+            # NBEATS is a flat encoder; it only has one decoder
+            first_decoder = decoder['block_1']
+            assert self.output_shape is not None
+            nbeats_decoder = build_NBEATS_network(first_decoder.decoder, self.output_shape)
+            decoder['block_1'] = DecoderBlockInfo(decoder=nbeats_decoder,
+                                                  decoder_properties=first_decoder.decoder_properties,
+                                                  decoder_output_shape=first_decoder.decoder_output_shape,
+                                                  decoder_input_shape=first_decoder.decoder_input_shape)
+            X.update({'network_head': self.head,
+                      'network_decoder': decoder})
+        return X
+
+    @staticmethod
+    def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
+                       ) -> Dict[str, Union[str, bool]]:
+        """Get the properties of the underlying algorithm.
+
+        Args:
+            dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]):
+                Describes the dataset to work on
+
+        Returns:
+            Dict[str, Any]:
+                Properties of the algorithm
+        """
+        return {
+            'shortname': 'ForecastingHead',
+            'name': 'ForecastingHead',
+            'handles_tabular': False,
+            'handles_image': False,
+            'handles_time_series': True,
+        }
+
+    def build_head(self,  # type: ignore[override]
+                   head_n_in_features: int,
+                   output_shape: Tuple[int, ...],
+                   decoder_has_local_layer: bool = True,
+                   net_output_type: str = "distribution",
+                   dist_cls: Optional[str] = None,
+                   n_prediction_heads: int = 1,
+                   num_quantiles: int = 3,
+                   ) -> nn.Module:
+        """
+        Builds the head module and returns it
+
+        Args:
+            head_n_in_features (int):
+                number of input features to the head (usually the size of the backbone output)
+            output_shape (Tuple[int, ...]):
+                shape of the output of the head
+            decoder_has_local_layer (bool):
+                whether the decoder has a local layer
+            net_output_type (str):
+                network output type
+            dist_cls (Optional[str]):
+                output distribution; only used if net_output_type is 'distribution'
+            n_prediction_heads (int):
+                additional parameter for initializing the architecture:
How many heads to predict + num_quantiles (int): + number of quantile losses + + Returns: + nn.Module: + head module + """ + if net_output_type == 'distribution': + assert dist_cls is not None + proj_layer_d = ALL_DISTRIBUTIONS[dist_cls](num_in_features=head_n_in_features, + output_shape=output_shape[1:], + n_prediction_heads=n_prediction_heads, + decoder_has_local_layer=decoder_has_local_layer + ) + return proj_layer_d + elif net_output_type == 'regression': + if decoder_has_local_layer: + proj_layer_r = nn.Sequential(nn.Linear(head_n_in_features, np.product(output_shape[1:]))) + else: + proj_layer_r = nn.Sequential( + nn.Linear(head_n_in_features, n_prediction_heads * np.product(output_shape[1:])), + nn.Unflatten(-1, (n_prediction_heads, *output_shape[1:])), + ) + return proj_layer_r + elif net_output_type == "quantile": + if decoder_has_local_layer: + proj_layer_quantiles = [ + nn.Sequential(nn.Linear(head_n_in_features, np.product(output_shape[1:]))) + for _ in range(num_quantiles) + ] + else: + proj_layer_quantiles = [ + nn.Sequential( + nn.Linear(head_n_in_features, n_prediction_heads * np.product(output_shape[1:])), + nn.Unflatten(-1, (n_prediction_heads, *output_shape[1:])), + ) for _ in range(num_quantiles) + ] + proj_layer_q = QuantileHead(proj_layer_quantiles) + return proj_layer_q + else: + raise NotImplementedError(f"Unsupported network type " + f"{net_output_type} (should be one of the following: " + f"regression, distribution or quantiles)") + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + ) -> ConfigurationSpace: + """Return the configuration space of network head. + + Returns: + ConfigurationSpace: + The configuration space of this algorithm. + """ + cs = ConfigurationSpace() + + return cs diff --git a/autoPyTorch/pipeline/components/setup/network_initializer/SparseInit.py b/autoPyTorch/pipeline/components/setup/network_initializer/SparseInit.py index 1e6dbdbf3..b048293e9 100644 --- a/autoPyTorch/pipeline/components/setup/network_initializer/SparseInit.py +++ b/autoPyTorch/pipeline/components/setup/network_initializer/SparseInit.py @@ -11,7 +11,6 @@ class SparseInit(BaseNetworkInitializerComponent): """ Fills the 2D input Tensor as a sparse matrix """ - def weights_init(self) -> Callable: """Returns the actual PyTorch model, that is dynamically created from a self.config object. @@ -19,6 +18,7 @@ def weights_init(self) -> Callable: self.config is a dictionary created form a given config in the config space. It contains the necessary information to build a network. 
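+        The returned callable is typically applied to every submodule of the network
+        (e.g. via ``module.apply``) and fills eligible 2D weight tensors sparsely.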
""" + def initialization(m: torch.nn.Module) -> None: if isinstance(m, (torch.nn.Conv1d, torch.nn.Conv2d, diff --git a/autoPyTorch/pipeline/components/setup/network_initializer/__init__.py b/autoPyTorch/pipeline/components/setup/network_initializer/__init__.py index fc878b669..bae589570 100644 --- a/autoPyTorch/pipeline/components/setup/network_initializer/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_initializer/__init__.py @@ -1,12 +1,10 @@ import os from collections import OrderedDict -from typing import Dict, List, Optional +from typing import Any, Dict, List, Optional import ConfigSpace.hyperparameters as CSH from ConfigSpace.configuration_space import ConfigurationSpace -import numpy as np - from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice from autoPyTorch.pipeline.components.base_component import ( @@ -175,6 +173,6 @@ def get_hyperparameter_search_space( self.dataset_properties_ = dataset_properties return cs - def transform(self, X: np.ndarray) -> np.ndarray: + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: assert self.choice is not None, "Cannot call transform before the object is initialized" - return self.choice.transform(X) + return self.choice.transform(X) # type: ignore[no-any-return] diff --git a/autoPyTorch/pipeline/components/setup/optimizer/__init__.py b/autoPyTorch/pipeline/components/setup/optimizer/__init__.py index 010cbad81..f89b80849 100644 --- a/autoPyTorch/pipeline/components/setup/optimizer/__init__.py +++ b/autoPyTorch/pipeline/components/setup/optimizer/__init__.py @@ -1,12 +1,10 @@ import os from collections import OrderedDict -from typing import Dict, List, Optional +from typing import Any, Dict, List, Optional import ConfigSpace.hyperparameters as CSH from ConfigSpace.configuration_space import ConfigurationSpace -import numpy as np - from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice from autoPyTorch.pipeline.components.base_component import ( @@ -177,6 +175,6 @@ def get_hyperparameter_search_space( self.dataset_properties_ = dataset_properties return cs - def transform(self, X: np.ndarray) -> np.ndarray: + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: assert self.choice is not None, "Cannot call transform before the object is initialized" - return self.choice.transform(X) + return self.choice.transform(X) # type: ignore[no-any-return] diff --git a/autoPyTorch/pipeline/components/training/base_training.py b/autoPyTorch/pipeline/components/training/base_training.py index ebf7ccbc4..48d0a3e06 100644 --- a/autoPyTorch/pipeline/components/training/base_training.py +++ b/autoPyTorch/pipeline/components/training/base_training.py @@ -12,15 +12,15 @@ class autoPyTorchTrainingComponent(autoPyTorchComponent): def __init__(self, random_state: Optional[np.random.RandomState] = None) -> None: super(autoPyTorchTrainingComponent, self).__init__(random_state=random_state) - def transform(self, X: np.ndarray) -> np.ndarray: + def transform(self, X: Dict) -> Dict: """The transform function calls the transform function of the underlying model and returns the transformed array. 
Args: - X (np.ndarray): input features + X (Dict): input features Returns: - np.ndarray: Transformed features + Dict: Transformed features """ raise NotImplementedError() diff --git a/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py index f39194477..483ac98d4 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py @@ -59,7 +59,7 @@ def __init__(self, batch_size: int = 64, FitRequirement("Backend", (Backend,), user_defined=True, dataset_property=False), FitRequirement("is_small_preprocess", (bool,), user_defined=True, dataset_property=True)]) - def transform(self, X: np.ndarray) -> np.ndarray: + def transform(self, X: Dict) -> Dict: """The transform function calls the transform function of the underlying model and returns the transformed array. @@ -106,7 +106,8 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: # This parameter indicates that the data has been pre-processed for speed # Overwrite the datamanager with the pre-processes data datamanager.replace_data(X['X_train'], X['X_test'] if 'X_test' in X else None) - train_dataset, val_dataset = datamanager.get_dataset_for_training(split_id=X['split_id']) + + train_dataset = datamanager.get_dataset(split_id=X['split_id'], train=True) self.train_data_loader = torch.utils.data.DataLoader( train_dataset, @@ -118,15 +119,17 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: collate_fn=custom_collate_fn, ) - self.val_data_loader = torch.utils.data.DataLoader( - val_dataset, - batch_size=min(self.batch_size, len(val_dataset)), - shuffle=False, - num_workers=X.get('num_workers', 0), - pin_memory=X.get('pin_memory', True), - drop_last=X.get('drop_last', False), - collate_fn=custom_collate_fn, - ) + if X.get('val_indices', None) is not None: + val_dataset = datamanager.get_dataset(split_id=X['split_id'], train=False) + self.val_data_loader = torch.utils.data.DataLoader( + val_dataset, + batch_size=min(self.batch_size, len(val_dataset)), + shuffle=False, + num_workers=X.get('num_workers', 0), + pin_memory=X.get('pin_memory', True), + drop_last=X.get('drop_last', True), + collate_fn=custom_collate_fn, + ) if X.get('X_test', None) is not None: self.test_data_loader = self.get_loader(X=X['X_test'], @@ -135,7 +138,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: return self - def get_loader(self, X: np.ndarray, y: Optional[np.ndarray] = None, batch_size: int = np.inf, + def get_loader(self, X: np.ndarray, y: Optional[np.ndarray] = None, batch_size: int = np.iinfo(np.int32).max, ) -> torch.utils.data.DataLoader: """ Creates a data loader object from the provided data, @@ -184,7 +187,6 @@ def get_val_data_loader(self) -> torch.utils.data.DataLoader: Returns: torch.utils.data.DataLoader: A validation data loader """ - assert self.val_data_loader is not None, "No val data loader fitted" return self.val_data_loader def get_test_data_loader(self) -> torch.utils.data.DataLoader: diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py new file mode 100644 index 000000000..3ddd66b2a --- /dev/null +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -0,0 +1,618 @@ +import 
warnings
+from functools import partial
+from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Union
+
+from ConfigSpace.conditions import EqualsCondition
+from ConfigSpace.configuration_space import ConfigurationSpace
+from ConfigSpace.hyperparameters import (
+    CategoricalHyperparameter,
+    UniformIntegerHyperparameter
+)
+
+from gluonts.time_feature import TimeFeature
+
+import numpy as np
+
+import pandas as pd
+
+from sklearn.compose import ColumnTransformer
+
+import torch
+
+import torchvision
+
+
+from autoPyTorch.datasets.time_series_dataset import (
+    TimeSeriesForecastingDataset,
+    TimeSeriesSequence,
+    extract_feature_index
+)
+from autoPyTorch.pipeline.components.training.data_loader.feature_data_loader import FeatureDataLoader
+from autoPyTorch.pipeline.components.training.data_loader.time_series_util import (
+    ExpandTransformTimeSeries,
+    PadSequenceCollector,
+    SequentialSubSetSampler,
+    TestSequenceDataset,
+    TimeSeriesSampler
+)
+from autoPyTorch.utils.common import (
+    FitRequirement,
+    HyperparameterSearchSpace,
+    add_hyperparameter,
+    custom_collate_fn,
+    get_hyperparameter
+)
+
+
+class TimeSeriesForecastingDataLoader(FeatureDataLoader):
+    """This class is an interface for loading time series sequence data.
+
+    It supports the various types of map-style datasets described in:
+    https://pytorch.org/docs/stable/data.html
+    """
+
+    def __init__(self,
+                 batch_size: int = 64,
+                 backcast: bool = False,
+                 backcast_period: int = 2,
+                 window_size: int = 1,
+                 num_batches_per_epoch: Optional[int] = 50,
+                 n_prediction_steps: int = 1,
+                 sample_strategy: str = 'SeqUniform',
+                 transform_time_features: bool = False,
+                 random_state: Optional[np.random.RandomState] = None) -> None:
+        """
+        Initialize a data loader.
+        Args:
+            batch_size (int):
+                batch size
+            backcast (bool):
+                whether backcast is applied; if so, window_size is determined by the forecasting horizon
+            backcast_period (int):
+                backcast period; window_size is computed as horizon * backcast_period
+            window_size (int):
+                window size; active when backcast is False
+            num_batches_per_epoch (int):
+                number of batches per epoch
+            n_prediction_steps (int):
+                forecasting horizon
+            sample_strategy (str):
+                sampling strategy: whether all sequences are sampled with the same frequency ('SeqUniform')
+                or all time steps are sampled with the same frequency ('LengthUniform')
+            transform_time_features (bool):
+                whether time features are transformed
+            random_state (Optional[np.random.RandomState]):
+                random state
+
+        """
+        super().__init__(batch_size=batch_size, random_state=random_state)
+        self.backcast = backcast
+        self.backcast_period = backcast_period
+
+        self.n_prediction_steps = n_prediction_steps
+        self.window_size = window_size
+
+        self.window_size = self.adjust_window_size(1)
+
+        self.sample_interval = 1
+        # length of the tail: for instance, with sequence_length = 2, sample_interval = 2 and n_prediction = 2,
+        # the time sequence should look like: [X, y, X, y, y] [test_data] (values in the tail are marked with X)
+        # self.subseq_length = self.sample_interval * (self.window_size - 1) + 1
+        self.sample_strategy = sample_strategy
+        self.num_batches_per_epoch = num_batches_per_epoch if num_batches_per_epoch is not None else np.inf
+        self.padding_collector: Optional[Callable] = None
+
+        self.known_future_features_index: Union[Tuple[int], Tuple[()]] = ()
+        self._is_uni_variant = False
+
+        self.transform_time_features = transform_time_features
+        self.freq = "1Y"
+        self.time_feature_transform: List[TimeFeature] = []
+        self.dataset_columns: Union[Tuple[Union[int, str]], Tuple[()]] = ()
+        self.sampler_train: Optional[Union[Iterator, torch.utils.data.sampler.Sampler]] = None
+
+        # Applied for get_loader
+        self.feature_preprocessor: Optional[ColumnTransformer] = None
+
+        self.add_fit_requirements(
+            [FitRequirement("known_future_features", (tuple,), user_defined=True, dataset_property=True),
+             FitRequirement("feature_shapes", (Dict,), user_defined=True, dataset_property=True),
+             FitRequirement("feature_names", (tuple,), user_defined=True, dataset_property=True),
+             FitRequirement("sequence_lengths_train", (List,), user_defined=True, dataset_property=True),
+             FitRequirement("freq", (str,), user_defined=True, dataset_property=True),
+             FitRequirement("n_prediction_steps", (int,), user_defined=True, dataset_property=True)])
+
+    def adjust_window_size(self, sample_interval: int = 1) -> int:
+        """
+        Adjust the sliding window size according to the given sample_interval.
+        Args:
+            sample_interval (int): resolution of the window size
+
+        Returns:
+            window_size (int): window size
+
+        """
+        window_size = self.window_size
+        if self.backcast:
+            window_size = self.backcast_period * self.n_prediction_steps
+
+        if sample_interval > 1:
+            # for a lower resolution, window_size should be smaller
+            window_size = (self.window_size - 1) // sample_interval + 1
+        return window_size
+
+    def compute_expected_num_instances_per_seq(self,
+                                               num_instances_dataset: int,
+                                               seq_train_length: np.ndarray,
+                                               min_start: int = 0,
+                                               fraction_seq: float = 1.0,
+                                               fraction_samples_per_seq: float = 1.0,
+                                               ) -> np.ndarray:
+        """
+        Compute the number of expected sample instances within each sequence.
+        Args:
+            num_instances_dataset (int):
+                total number of possible instances inside the dataset
+            seq_train_length (np.ndarray):
+                length of each sequence
+            min_start (int):
+                minimal start index (number of initial time steps to skip)
+            fraction_seq (float):
+                fraction of the sequences that will be sampled during training.
+            fraction_samples_per_seq (float):
+                fraction of samples inside each series
+
+        Returns:
+            num_instances_per_seqs (np.ndarray): expected number of instances to be sampled inside each sequence
+        """
+        seq_train_length = np.asarray(seq_train_length)
+        num_instances_epoch = self.num_batches_per_epoch * self.batch_size
+        # create a mask of sequences to deactivate
+        seq_idx_inactivate = self.random_state.choice(seq_train_length.size,
+                                                      int(np.floor(seq_train_length.size * (1 - fraction_seq))),
+                                                      replace=False)
+        if len(seq_idx_inactivate) == seq_train_length.size:
+            # we don't want to deactivate all of the sequences
+            seq_idx_inactivate = self.random_state.choice(seq_idx_inactivate, len(seq_idx_inactivate) - 1,
+                                                          replace=False)
+
+        if self.sample_strategy == 'LengthUniform':
+            available_seq_length: np.ndarray = seq_train_length - min_start
+            available_seq_length = np.where(available_seq_length <= 0, 0, available_seq_length)
+            num_instances_per_seqs = num_instances_epoch / np.sum(available_seq_length) * available_seq_length
+        elif self.sample_strategy == 'SeqUniform':
+            num_seq_train = len(seq_train_length)
+            num_instances_per_seqs = np.repeat(num_instances_epoch / num_seq_train, num_seq_train)
+        else:
+            raise NotImplementedError(f'Unsupported sample strategy: {self.sample_strategy}')
+
+        num_instances_per_seqs[seq_idx_inactivate] = 0
+        num_instances_per_seqs *= fraction_samples_per_seq
+        return num_instances_per_seqs
+
+    def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader:
+        """
+        Fits a component by using an input dictionary with pre-requisites
+
+        Args:
+            X (Dict[str, Any]):
+                Dependencies needed by current component to perform fit
+            y (Any):
+                not used. To comply with sklearn API
+
+        Returns:
+            An instance of self
+        """
+        self.check_requirements(X, y)
+
+        # Incorporate the transform into the dataset
+        datamanager: TimeSeriesForecastingDataset = X['backend'].load_datamanager()
+        dataset_properties = X['dataset_properties']
+
+        # this value corresponds to budget type resolution
+        sample_interval = X.get('sample_interval', 1)
+        padding_value = X.get('required_padding_value', 0.0)
+
+        self.n_prediction_steps = dataset_properties['n_prediction_steps']
+
+        self.window_size = self.adjust_window_size(sample_interval)
+
+        max_lagged_value = max(X['dataset_properties'].get('lagged_value', [np.inf]))
+        max_lagged_value += self.window_size + self.n_prediction_steps
+
+        # we want the feature names from the raw dataset
+        self.dataset_columns = datamanager.feature_names  # type: ignore[assignment]
+
+        known_future_features_index = extract_feature_index(
+            feature_shapes=X['dataset_properties']['feature_shapes'],
+            feature_names=X['dataset_properties']['feature_names'],
+            queried_features=X['dataset_properties']['known_future_features']
+        )
+        self.known_future_features_index = known_future_features_index
+
+        self.padding_collector = PadSequenceCollector(self.window_size, sample_interval, padding_value,
+                                                      max_lagged_value)
+
+        # this value corresponds to budget type num_sequence
+        fraction_seq = X.get('fraction_seq', 1.0)
+        # this value corresponds to budget type num_sample_per_seq
+        fraction_samples_per_seq = X.get('fraction_samples_per_seq', 1.0)
+        self.sample_interval = sample_interval
+
+        # TODO consider bucket settings
+        self.train_transform = self.build_transform(X, mode='train')
+        self.val_transform = self.build_transform(X, mode='val')
+        self.test_transform = self.build_transform(X, mode='test')
+        if 'time_series_feature_transformer' in X:
+            self.feature_preprocessor =
X['time_series_feature_transformer'].preprocessor + datamanager.update_transform( + self.train_transform, + train=True, + ) + datamanager.update_transform( + self.val_transform, + train=False, + ) + + if X['dataset_properties']["is_small_preprocess"]: + # This parameter indicates that the data has been pre-processed for speed + # Overwrite the datamanager with the pre-processes data + datamanager.replace_data(X['X_train'], + X['X_test'] if 'X_test' in X else None, + known_future_features_index=known_future_features_index) + self.dataset_small_preprocess = True + else: + self.dataset_small_preprocess = False + + datamanager.transform_time_features = self.transform_time_features + + self._is_uni_variant = X['dataset_properties']['uni_variant'] + + self.freq = X['dataset_properties']['freq'] + self.time_feature_transform = X['dataset_properties']['time_feature_transform'] + + train_dataset = datamanager.get_dataset(split_id=X['split_id'], train=True) + val_dataset = datamanager.get_dataset(split_id=X['split_id'], train=False) + + train_split, test_split = datamanager.splits[X['split_id']] + + num_instances_dataset = np.size(train_split) + + # get the length of each sequence of training data (after split), as we know that validation sets are always + # place on the tail of the series, the discontinuity only happens if a new series is concated. + # for instance, if we have a train indices is expressed as [0, 1, 2 ,3, 7 ,8 ]. + # A new sequence must start from the index 7. We could then split each unique values to represent the length + # of each split + + # TODO consider min_start as a hp (multiple of self.n_prediction_steps?) + min_start = self.n_prediction_steps + + dataset_seq_length_train_all = X['dataset_properties']['sequence_lengths_train'] + if np.sum(dataset_seq_length_train_all) == len(train_split): + # this applies if we want to fit the entire datasets + seq_train_length = np.array(dataset_seq_length_train_all) + else: + _, seq_train_length = np.unique(train_split - np.arange(len(train_split)), return_counts=True) + + num_instances_per_seqs = self.compute_expected_num_instances_per_seq(num_instances_dataset, + seq_train_length, + min_start, + fraction_seq, + fraction_samples_per_seq, + ) + + # TODO consider the case where num_instances_train is greater than num_instances_dataset, + # In which case we simply iterate through all the datasets + + sampler_indices_train = np.arange(num_instances_dataset) + + self.sampler_train = TimeSeriesSampler(indices=sampler_indices_train, seq_lengths=seq_train_length, + num_instances_per_seqs=num_instances_per_seqs, + min_start=min_start) + + self.train_data_loader = torch.utils.data.DataLoader( + train_dataset, + batch_size=min(self.batch_size, len(sampler_indices_train)), + shuffle=False, + num_workers=X.get('num_workers', 0), + pin_memory=X.get('pin_memory', True), + drop_last=X.get('drop_last', True), + collate_fn=partial(custom_collate_fn, x_collector=self.padding_collector), + sampler=self.sampler_train, + ) + + # validation set is not so important here, we make the size of validation set to be 20% of training instances + num_samples_val = int(np.sum(num_instances_per_seqs)) // 5 + if num_samples_val > len(val_dataset): + sampler_val = None + else: + sampler_val = SequentialSubSetSampler(data_source=val_dataset, + num_samples=num_samples_val) + + self.val_data_loader = torch.utils.data.DataLoader( + val_dataset, + batch_size=min(1000, len(val_dataset)), + shuffle=False, + num_workers=X.get('num_workers', 0), + pin_memory=X.get('pin_memory', 
True), + drop_last=X.get('drop_last', False), + collate_fn=partial(custom_collate_fn, x_collector=self.padding_collector), + sampler=sampler_val + ) + return self + + def transform(self, X: Dict) -> Dict: + X.update({"window_size": self.window_size, + 'transform_time_features': self.transform_time_features}) + return super().transform(X) + + def build_transform(self, X: Dict[str, Any], mode: str) -> torchvision.transforms.Compose: + """ + Method to build a transformation that can pre-process input data + + Args: + X (X: Dict[str, Any]): + Dependencies needed by current component to perform fit + mode (str): + train/val/test + + Returns: + A composition of transformations + """ + + if mode not in ['train', 'val', 'test']: + raise ValueError("Unsupported mode provided {}. ".format(mode)) + + candidate_transformations = [] # type: List[Callable] + + # if 'test' in mode or not X['dataset_properties']['is_small_preprocess']: + # candidate_transformations.extend(X['preprocess_transforms']) + + candidate_transformations.append(ExpandTransformTimeSeries()) + if mode == 'test' or not X['dataset_properties']['is_small_preprocess']: + if "preprocess_transforms" in X: + candidate_transformations.extend(X['preprocess_transforms']) + + # We transform to tensor under dataset + return torchvision.transforms.Compose(candidate_transformations) + + def get_loader(self, X: Union[TimeSeriesSequence, List[TimeSeriesSequence]], y: Optional[np.ndarray] = None, + batch_size: int = np.iinfo(np.int32).max, + ) -> torch.utils.data.DataLoader: + """ + Creates a data loader object from the provided data, + applying the transformations meant to validation objects + This is a lazy loaded test set, each time only one piece of series + """ + if isinstance(X, TimeSeriesSequence): + X = [X] + if isinstance(X, List): + if self.dataset_small_preprocess and not self._is_uni_variant: + + num_sequences = len(X) + sequence_lengths = [0] * num_sequences + for seq_idx, x_seq in enumerate(X): + sequence_lengths[seq_idx] = len(x_seq.X) + series_number = np.arange(len(sequence_lengths)).repeat(sequence_lengths) + + if len(self.known_future_features_index) > 0: + sequence_lengths_test = [0] * num_sequences + for seq_idx, x_seq in enumerate(X): + sequence_lengths_test[seq_idx] = len(x_seq.X_test) + series_number_test = np.arange(len(sequence_lengths_test)).repeat(sequence_lengths_test) + + if not X[0].is_pre_processed: # type: ignore[union-attr] + + x_all = pd.DataFrame(np.concatenate([x_seq.X for x_seq in X]), columns=self.dataset_columns) + + x_all.index = series_number + + if self.dataset_small_preprocess and self.feature_preprocessor is not None: + self.feature_preprocessor = self.feature_preprocessor.fit(x_all) + x_all = self.feature_preprocessor.transform(x_all.copy()) + + x_all = pd.DataFrame(x_all) + x_all.index = series_number + + if len(self.known_future_features_index) > 0: + x_all_test = pd.DataFrame(np.concatenate([x_seq.X_test for x_seq in X]), + columns=self.dataset_columns) + + x_all_test.index = series_number_test + + if self.dataset_small_preprocess and self.feature_preprocessor is not None: + x_all_test = self.feature_preprocessor.transform(x_all_test.copy()) + + x_all_test = pd.DataFrame(x_all_test) + x_all_test.index = series_number_test + + else: + x_all = pd.DataFrame(np.concatenate([x_seq.X for x_seq in X])) + x_all.index = series_number + if len(self.known_future_features_index) > 0: + x_all_test = pd.DataFrame(np.concatenate([x_seq.X_test for x_seq in X])) + x_all_test.index = series_number_test + + 
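+            # Grouping by the series index lets us write the preprocessed values back to
+            # each TimeSeriesSequence below (one group per original series).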
x_all_grouped = x_all.groupby(x_all.index) + if len(self.known_future_features_index) > 0: + x_all_test_grouped = x_all_test.groupby(x_all_test.index) + + for i, x_seq in enumerate(X): + if not isinstance(x_seq, TimeSeriesSequence): + raise NotImplementedError('Test Set must be a TimeSeriesSequence or a' + ' list of time series objects!') + x_seq.is_pre_processed = True + if x_seq.freq != self.freq: + # WE need to recompute the cached time features (However, this should not happen) + x_seq._cached_time_features = None + + if self.dataset_small_preprocess and not self._is_uni_variant: + x_seq.X = x_all_grouped.get_group(i).transform(np.array).values + update_dict: Dict[str, Any] = {"known_future_features_index": self.known_future_features_index} + if len(self.known_future_features_index) > 0: + x_seq.X_test = x_all_test_grouped.get_group(i).transform(np.array).values + + else: + update_dict = {} + update_dict.update({'freq': self.freq, + 'transform_time_features': self.transform_time_features, + 'time_feature_transform': self.time_feature_transform, }) + + x_seq.update_attribute(**update_dict) + if self.transform_time_features: + x_seq.cache_time_features() + + x_seq.freq = self.freq + x_seq.is_test_set = True + if not self.dataset_small_preprocess: + x_seq.update_transform(self.test_transform, train=False) + else: + raise NotImplementedError('Unsupported data type for time series data loader!') + + dataset = X + dataset_test = TestSequenceDataset(dataset, train=False) + + return torch.utils.data.DataLoader( + dataset_test, + batch_size=min(batch_size, len(dataset)), + shuffle=False, + collate_fn=partial(custom_collate_fn, x_collector=self.padding_collector), + ) + + def get_train_data_loader(self) -> torch.utils.data.DataLoader: + """Returns a data loader object for the train data + + Returns: + torch.utils.data.DataLoader: A train data loader + """ + assert self.train_data_loader is not None, "No train data loader fitted" + return self.train_data_loader + + def get_val_data_loader(self) -> torch.utils.data.DataLoader: + """Returns a data loader object for the validation data + + Returns: + torch.utils.data.DataLoader: A validation data loader + """ + assert self.val_data_loader is not None, "No val data loader fitted" + return self.val_data_loader + + def get_test_data_loader(self) -> torch.utils.data.DataLoader: + """Returns a data loader object for the test data + + Returns: + torch.utils.data.DataLoader: A validation data loader + """ + return self.test_data_loader + + @staticmethod + def get_hyperparameter_search_space(dataset_properties: Optional[Dict] = {}, + batch_size: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter="batch_size", + value_range=(32, 320), + default_value=64), + window_size: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter='window_size', + value_range=(20, 50), + default_value=30), + num_batches_per_epoch: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter="num_batches_per_epoch", + value_range=(30, 100), + default_value=50), + sample_strategy: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter="sample_strategy", + value_range=('LengthUniform', 'SeqUniform'), + default_value='SeqUniform'), + backcast: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter='backcast', + value_range=(True, False), + default_value=False), + backcast_period: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter='backcast_period', + value_range=(1, 7), + default_value=2), 
+                                           transform_time_features: HyperparameterSearchSpace =
+                                           HyperparameterSearchSpace(hyperparameter='transform_time_features',
+                                                                     value_range=(True, False),
+                                                                     default_value=False)
+                                           ) -> ConfigurationSpace:
+        """
+        Hyperparameter search space for the forecasting dataloader. The forecasting dataloader constructs the
+        window size in one of two ways: either window_size is assigned directly, or it is computed as
+        backcast_period * n_prediction_steps (introduced by N-BEATS:
+        Oreshkin et al., N-BEATS: Neural basis expansion analysis for interpretable time series forecasting,
+        ICLR 2020, https://arxiv.org/abs/1905.10437).
+        Currently backcast_period is only active when backcast is active.
+        Args:
+            dataset_properties (Optional[Dict]):
+                dataset properties
+            batch_size (int):
+                batch size
+            window_size (int):
+                window size; (if active) this value directly determines the window_size of the data loader
+            num_batches_per_epoch (int):
+                how many batches are trained in each epoch
+            sample_strategy (str):
+                how samples are distributed. If it is LengthUniform, every single data point has the same
+                probability of being sampled, in which case longer sequences receive more samples. If it is
+                SeqUniform, every sequence has the same probability of being sampled, regardless of its length.
+            backcast (bool):
+                whether the backcast module is active (in which case the window size is a multiple of
+                n_prediction_steps)
+            backcast_period (int):
+                active if backcast is active; the window size is then computed as
+                backcast_period * n_prediction_steps
+            transform_time_features (bool):
+                whether time feature transformation is applied
+
+        Returns:
+            cs:
+                Configuration Space
+
+        """
+        cs = ConfigurationSpace()
+        add_hyperparameter(cs, batch_size, UniformIntegerHyperparameter)
+        add_hyperparameter(cs, num_batches_per_epoch, UniformIntegerHyperparameter)
+        add_hyperparameter(cs, sample_strategy, CategoricalHyperparameter)
+
+        if dataset_properties is None:
+            dataset_properties = {}
+
+        seq_length_max = dataset_properties.get('seq_length_max', np.inf)
+
+        if seq_length_max <= window_size.value_range[1]:
+            if seq_length_max <= window_size.value_range[0]:
+                warnings.warn('The minimal candidate window_size is at least the maximal sequence length in the '
+                              'dataset; the window_size range is therefore capped at the maximal sequence length.')
+                window_size = HyperparameterSearchSpace(hyperparameter=window_size.hyperparameter,
+                                                        value_range=(1, seq_length_max),
+                                                        default_value=seq_length_max)
+                window_size = get_hyperparameter(window_size, UniformIntegerHyperparameter)
+            else:
+                window_size_value_range = window_size.value_range
+                window_size = HyperparameterSearchSpace(hyperparameter='window_size',
+                                                        value_range=(window_size_value_range[0], seq_length_max),
+                                                        default_value=min(window_size.default_value, seq_length_max))
+                window_size = get_hyperparameter(window_size, UniformIntegerHyperparameter)
+        else:
+            window_size = get_hyperparameter(window_size, UniformIntegerHyperparameter)
+
+        backcast = get_hyperparameter(backcast, CategoricalHyperparameter)
+        backcast_period = get_hyperparameter(backcast_period, UniformIntegerHyperparameter)
+
+        cs.add_hyperparameters([window_size, backcast, backcast_period])
+
+        window_size_cond = EqualsCondition(window_size, backcast, False)
+        backcast_period_cond = EqualsCondition(backcast_period, backcast, True)
+        cs.add_conditions([window_size_cond, backcast_period_cond])
+
+        time_feature_transform = dataset_properties.get('time_feature_transform', [])
+        if time_feature_transform:
+            add_hyperparameter(cs, transform_time_features, CategoricalHyperparameter)
+
+        return cs
+
+    def __str__(self) -> str:
+        """ Allow a nice understanding of what components were used """
+        string = self.train_data_loader.__class__.__name__
+        return string
diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py
new file mode 100644
index 000000000..20c83b396
--- /dev/null
+++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py
@@ -0,0 +1,294 @@
+import collections
+from typing import Iterator, List, Mapping, Optional, Sequence, Sized, Union
+
+import numpy as np
+
+import torch
+from torch._six import string_classes
+from torch.utils.data._utils.collate import default_collate, default_collate_err_msg_format, np_str_obj_array_pattern
+from torch.utils.data.sampler import SequentialSampler, SubsetRandomSampler
+
+from autoPyTorch.datasets.base_dataset import TransformSubset
+from autoPyTorch.datasets.time_series_dataset import TimeSeriesSequence
+
+
+class TestSequenceDataset(TransformSubset):
+    def __init__(self, dataset: List[TimeSeriesSequence], train: bool = False) -> None:
+        self.dataset = dataset
+        self.indices = torch.arange(len(dataset))
+        self.train = train
+
+    def __getitem__(self, idx: int) -> np.ndarray:
+        # we only consider the entire sequence
+        seq = self.dataset[idx]
+        return seq.__getitem__(len(seq) - 1, self.train)
+
+
+def pad_sequence_with_minimal_length(sequences: List[torch.Tensor],
+                                     seq_minimal_length: int = 1,
+                                     seq_max_length: int = np.iinfo(np.int32).max,
+                                     batch_first: bool = True,
+                                     padding_value: float = 0.0) -> torch.Tensor:
+    r"""
+    This function is quite similar to torch.nn.utils.rnn.pad_sequence, except that we constrain the sequences
+    to be at least seq_minimal_length and at most seq_max_length long.
+    """
+
+    # assuming trailing dimensions and type of all the Tensors
+    # in sequences are the same; fetch those from sequences[0]
+    max_size = sequences[0].size()
+    trailing_dims = max_size[1:]
+    max_len = min(max(max([s.size(0) for s in sequences]), seq_minimal_length), seq_max_length)
+    if seq_max_length > max_len:
+        seq_max_length = max_len
+    if batch_first:
+        out_dims = (len(sequences), max_len) + trailing_dims
+    else:
+        out_dims = (max_len, len(sequences)) + trailing_dims
+    if sequences[0].dtype == torch.bool:
+        out_tensor = sequences[0].new_full(out_dims, False)
+    else:
+        out_tensor = sequences[0].new_full(out_dims, padding_value)
+
+    for i, tensor in enumerate(sequences):
+        length = min(tensor.size(0), seq_max_length)
+        # use index notation to prevent duplicate references to the tensor
+        if batch_first:
+            out_tensor[i, -length:, ...] = tensor[-length:]
+        else:
+            out_tensor[-length:, i, ...] = tensor[-length:]
+
+    return out_tensor
+
+
+class PadSequenceCollector:
+    """
+    A collector that transforms the sequences from the dataset. Since the sequences might have different
+    lengths, they need to be padded with a constant value.
Given that target value might require special value to + fit the requirement of distribution, past_target will be padded with special values + + """ + + def __init__(self, window_size: int, sample_interval: int = 1, target_padding_value: float = 0.0, + seq_max_length: int = np.iinfo(np.int32).max): + self.window_size = window_size + self.sample_interval = sample_interval + self.target_padding_value = target_padding_value + self.seq_max_length = seq_max_length + + def __call__(self, batch: Sequence[torch.Tensor], sample_interval: int = 1, + seq_minimal_length: int = 1, padding_value: float = 0.0) -> Union[torch.Tensor, Mapping]: + elem = batch[0] + elem_type = type(elem) + if isinstance(elem, torch.Tensor): + seq: torch.Tensor = pad_sequence_with_minimal_length(batch, # type: ignore[arg-type] + seq_minimal_length=seq_minimal_length, + seq_max_length=self.seq_max_length, + batch_first=True, padding_value=padding_value) + + if sample_interval > 1: + subseq_length = seq.shape[1] + first_indices = -(sample_interval * ((subseq_length - 1) // sample_interval) + 1) + sample_indices = torch.arange(first_indices, 0, step=sample_interval) + return seq[:, sample_indices] + else: + return seq + + elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' \ + and elem_type.__name__ != 'string_': + if elem_type.__name__ == 'ndarray' or elem_type.__name__ == 'memmap': + # array of string classes and object + if np_str_obj_array_pattern.search(elem.dtype.str) is not None: + raise TypeError(default_collate_err_msg_format.format(elem.dtype)) + + return default_collate([torch.as_tensor(b) for b in batch]) + elif elem.shape == (): # scalars + return torch.as_tensor(batch) + elif isinstance(elem, float): + return torch.tensor(batch, dtype=torch.float64) + elif isinstance(elem, int): + return torch.tensor(batch) + elif isinstance(elem, string_classes): + return batch + elif isinstance(elem, collections.abc.Mapping): + # only past targets and features needs to be transformed + return { + key: self([d[key] for d in batch]) if "past" not in key else self( + [d[key] for d in batch], + self.sample_interval, + self.window_size, + self.target_padding_value if "targets" in key else 0.0 + ) for key + in elem} + + elif elem is None: + return None + raise TypeError(f"Unsupported data type {elem_type}") + + +class TimeSeriesSampler(SubsetRandomSampler): + """ + A sampler designed for time series sequence. For the sake of efficiency, it will not sample each possible + sequences from indices. Instead, it samples 'num_instances_per_seqs' for each sequence. This sampler samples + the instances in a Latin-Hypercube likewise way: we divide each sequence in to num_instances_per_seqs interval + and randomly sample one instance from each interval. 
If num_instances_per_seqs is not an integer, the
+    first interval is selected with a certain probability:
+    for instance, if we want to sample 1.3 instances from a sequence [0, 1, 2, 3, 4, 5], we first divide the
+    sequence into two parts: [0, 3) and [3, 6); one sample is drawn from the second part, while an expected
+    0.3 samples are drawn from the first part (this part is sampled at the very end with torch.multinomial).
+
+    Attributes:
+        indices (Sequence[int]):
+            The set of all the possible indices that can be sampled from
+        seq_lengths (Union[Sequence[int], np.ndarray]):
+            length of each sequence, used to split the flat indices into sequences
+        num_instances_per_seqs (Optional[List[int]]):
+            expected number of instances to be sampled from each sequence; if it is None, all the sequences
+            will be sampled
+        min_start (int):
+            how many initial time steps we want to skip (windows starting there would need to be padded with 0)
+        generator (Optional[torch.Generator]):
+            pytorch generator to control the randomness
+    """
+    def __init__(self,
+                 indices: Sequence[int],
+                 seq_lengths: Union[Sequence[int], np.ndarray],
+                 num_instances_per_seqs: Optional[Union[List[float], np.ndarray]] = None,
+                 min_start: int = 0,
+                 generator: Optional[torch.Generator] = None) -> None:
+        super().__init__(indices, generator)
+        if num_instances_per_seqs is None:
+            self.iter_all_seqs = True
+        else:
+            self.iter_all_seqs = False
+            if len(seq_lengths) != len(num_instances_per_seqs):
+                raise ValueError(f'The length of seq_lengths must equal the length of num_instances_per_seqs. '
+                                 f'However, they are {len(seq_lengths)} versus {len(num_instances_per_seqs)}')
+            seq_intervals_int = []
+            seq_intervals_decimal = []
+
+            num_expected_ins_decimal = []
+            idx_tracker = 0
+            for seq_idx, (num_instances, seq_length) in enumerate(zip(num_instances_per_seqs, seq_lengths)):
+                idx_end = idx_tracker + seq_length
+                idx_start = idx_tracker + min_start
+                if idx_start > idx_end:
+                    idx_start = idx_tracker
+
+                num_interval = int(np.ceil(num_instances))
+                if num_interval > idx_end - idx_start or num_interval == 0:
+                    interval = np.linspace(idx_start, idx_end, 2, endpoint=True, dtype=np.intp)
+                    # In this case, seq_intervals_decimal contains the entire interval of the sequence.
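+                    # The fractional expected count for this interval is resolved later in
+                    # __iter__ via torch.multinomial over num_expected_ins_decimal.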
+                    num_expected_ins_decimal.append(num_instances)
+                    seq_intervals_decimal.append(interval[:2])
+                    seq_intervals_int.append(interval[1:])
+                else:
+                    interval = np.linspace(idx_start, idx_end, num_interval + 1, endpoint=True, dtype=np.intp)
+                    # The first two items determine the first sub-interval, where most of the samples would
+                    # need to be padded; we therefore make it the interval for the expected decimal part
+                    num_expected_ins_decimal.append(np.modf(num_instances)[0])
+                    seq_intervals_decimal.append(interval[:2])
+
+                    seq_intervals_int.append(interval[1:])
+                idx_tracker += seq_length
+
+            num_expected_ins_decimal_stacked = np.stack(num_expected_ins_decimal)
+
+            self.seq_lengths = seq_lengths
+            self.seq_lengths_sum = np.sum(seq_lengths)
+            self.num_instances = int(np.round(np.sum(num_instances_per_seqs)))
+
+            self.seq_intervals_decimal = torch.from_numpy(np.stack(seq_intervals_decimal))
+            self.seq_intervals_int = seq_intervals_int
+
+            self.num_expected_ins_decimal = torch.from_numpy(num_expected_ins_decimal_stacked) + 1e-8
+
+    def __iter__(self) -> Iterator[int]:
+        if self.iter_all_seqs:
+            return super().__iter__()
+        samples = torch.ones(self.num_instances, dtype=torch.int)
+        idx_samples_start = 0
+        idx_samples_end = 0
+        for idx_seq, (interval, seq_length) in enumerate(zip(self.seq_intervals_int, self.seq_lengths)):
+            if len(interval) == 1:
+                continue
+            num_samples = len(interval) - 1
+            idx_samples_end = idx_samples_start + num_samples
+
+            samples_shift = torch.rand(num_samples, generator=self.generator) * (interval[1:] - interval[:-1])
+            samples_seq = torch.floor(samples_shift + interval[:-1]).int()
+            samples[idx_samples_start: idx_samples_end] = samples_seq
+
+            idx_samples_start = idx_samples_end
+        num_samples_remain = self.num_instances - idx_samples_end
+        if num_samples_remain > 0:
+            if num_samples_remain > self.num_expected_ins_decimal.shape[-1]:
+                replacement = True
+            else:
+                replacement = False
+
+            samples_idx = torch.multinomial(self.num_expected_ins_decimal, num_samples_remain, replacement)
+            seq_interval = self.seq_intervals_decimal[samples_idx]
+
+            samples_shift = torch.rand(num_samples_remain, generator=self.generator)
+            samples_shift *= (seq_interval[:, 1] - seq_interval[:, 0])
+            samples_seq_remain = torch.floor(samples_shift).int() + seq_interval[:, 0]
+            samples[-num_samples_remain:] = samples_seq_remain
+
+        # sometimes, if self.seq_lengths_sum is too large, floating point might not be accurate enough
+        samples = torch.where(samples == self.seq_lengths_sum, samples - 1, samples)
+
+        yield from (samples[i] for i in torch.randperm(self.num_instances, generator=self.generator))
+
+    def __len__(self) -> int:
+        return self.num_instances
+
+
+class SequentialSubSetSampler(SequentialSampler):
+    """
+    Sampler for the validation set that samples only a fraction of the dataset. For datasets with a large
+    number of data points, this helps to reduce the inference time during validation after each epoch.
+
+    Attributes:
+        data_source (Dataset):
+            dataset to sample from; it is composed of several TimeSeriesSequence objects, and
for each TimeSeriesSequence only 1 + sample is allowed + num_samples (int): + number of samples to be sampled from the dataset source + generator (Optional[torch.Generator]): + torch random generator + """ + data_source: Sized + + def __init__(self, data_source: Sized, num_samples: int, generator: Optional[torch.Generator] = None) -> None: + super(SequentialSubSetSampler, self).__init__(data_source) + if num_samples > len(data_source): + self.eval_all_sequences = True + self.num_samples = len(data_source) + else: + self.eval_all_sequences = False + self.num_samples = num_samples + self.generator = generator + + def __iter__(self) -> Iterator[int]: + if self.eval_all_sequences: + yield from super(SequentialSubSetSampler, self).__iter__() + else: + yield from torch.randperm(len(self.data_source), generator=self.generator)[:self.num_samples] + + def __len__(self) -> int: + return self.num_samples + + +class ExpandTransformTimeSeries(object): + """Expand Dimensionality so tabular transformations see + a 2d Array, unlike the ExpandTransform defined under tabular dataset, the dimension is expanded + along the last axis + """ + + def __call__(self, data: np.ndarray) -> np.ndarray: + if len(data.shape) <= 1: + data = np.expand_dims(data, axis=-1) + return data diff --git a/autoPyTorch/pipeline/components/training/losses.py b/autoPyTorch/pipeline/components/training/losses.py index 0eb12fe28..37b3e2bcf 100644 --- a/autoPyTorch/pipeline/components/training/losses.py +++ b/autoPyTorch/pipeline/components/training/losses.py @@ -9,9 +9,16 @@ MSELoss: supports continuous output types L1Loss: supports continuous output types Default: MSELoss +Forecasting: + LogProbLoss: supports distribution output types + QuantileLoss: supports quantile output type + MAPELoss: supports continuous output types + MASELoss: supports continuous output types + L1Loss: supports continuous output types """ -from typing import Any, Dict, Optional, Type +from typing import Any, Dict, List, Optional, Type, Union +import torch from torch.nn.modules.loss import ( BCEWithLogitsLoss, CrossEntropyLoss, @@ -20,22 +27,134 @@ ) from torch.nn.modules.loss import _Loss as Loss -from autoPyTorch.constants import BINARY, CLASSIFICATION_TASKS, CONTINUOUS, MULTICLASS, REGRESSION_TASKS, \ - STRING_TO_OUTPUT_TYPES, STRING_TO_TASK_TYPES, TASK_TYPES_TO_STRING +from autoPyTorch.constants import BINARY, CLASSIFICATION_TASKS, CONTINUOUS, FORECASTING_TASKS, MULTICLASS, \ + REGRESSION_TASKS, STRING_TO_OUTPUT_TYPES, STRING_TO_TASK_TYPES, TASK_TYPES_TO_STRING + + +class AbstractForecastingLoss(Loss): + __constants__ = ['reduction'] + + def __init__(self, reduction: str = 'mean') -> None: + super(AbstractForecastingLoss, self).__init__(reduction=reduction) + + def aggregate_loss(self, loss_values: torch.Tensor) -> torch.Tensor: + if self.reduction == 'mean': + return loss_values.mean() + elif self.reduction == 'sum': + return loss_values.sum() + else: + return loss_values + + +class LogProbLoss(AbstractForecastingLoss): + def forward(self, input_dist: torch.distributions.Distribution, target_tensor: torch.Tensor) -> torch.Tensor: + scores = input_dist.log_prob(target_tensor) + return self.aggregate_loss(-scores) + + +class MAPELoss(AbstractForecastingLoss): + def forward(self, predictions: torch.Tensor, target_tensor: torch.Tensor) -> torch.Tensor: + # https://github.com/awslabs/gluon-ts/blob/master/src/gluonts/model/n_beats/_network.py + denominator = torch.abs(target_tensor) + diff = torch.abs(predictions - target_tensor) + + flag = (denominator == 
0).float() + + mape = (diff * (1 - flag)) / (denominator + flag) + + return self.aggregate_loss(mape) + + +class MASELoss(AbstractForecastingLoss): + def __init__(self, reduction: str = 'mean') -> None: + super(MASELoss, self).__init__(reduction=reduction) + self._mase_coefficient: Union[float, torch.Tensor] = 1.0 + + def set_mase_coefficient(self, mase_coefficient: torch.Tensor) -> 'MASELoss': + """ + set mase coefficient for computing MASE losses + Args: + mase_coefficient (torch.Tensor): mase coefficient, its dimensions corresponds to [B, L, N] and can be + broadcasted + + Returns: + """ + if len(mase_coefficient.shape) == 2: + mase_coefficient = mase_coefficient.unsqueeze(1) -losses = dict(classification=dict( - CrossEntropyLoss=dict( - module=CrossEntropyLoss, supported_output_types=[MULTICLASS, BINARY]), - BCEWithLogitsLoss=dict( - module=BCEWithLogitsLoss, supported_output_types=[BINARY])), + self._mase_coefficient = mase_coefficient + return self + + def forward(self, + predictions: torch.Tensor, + target_tensor: torch.Tensor) -> torch.Tensor: + if isinstance(self._mase_coefficient, torch.Tensor): + mase_shape = self._mase_coefficient.shape + pred_shape = predictions.shape + if len(mase_shape) == len(pred_shape): + if mase_shape[0] != pred_shape[0] or mase_shape[-1] != pred_shape[-1]: + raise ValueError(f"If self._mase_coefficient is a Tensor, it must have the same batch size and " + f"num_targets as the predictions, However, their shapes are {mase_shape}" + f"(self._mase_coefficient) and {pred_shape}(pred_shape)") + loss_values = torch.abs(predictions - target_tensor) * self._mase_coefficient + return self.aggregate_loss(loss_values) + + +class QuantileLoss(AbstractForecastingLoss): + def __init__(self, reduction: str = 'mean', quantiles: List[float] = [0.5]) -> None: + super(QuantileLoss, self).__init__(reduction=reduction) + self.quantiles = quantiles + + def set_quantiles(self, quantiles: List[float]) -> None: + self.quantiles = quantiles + + def forward(self, + predictions: List[torch.Tensor], + target_tensor: torch.Tensor) -> torch.Tensor: + assert len(self.quantiles) == len(predictions) + losses_all = [] + for q, y_pred in zip(self.quantiles, predictions): + diff = target_tensor - y_pred + + loss_q = torch.max(q * diff, (q - 1) * diff) + losses_all.append(loss_q.unsqueeze(-1)) + + losses_all = torch.mean(torch.concat(losses_all, dim=-1), dim=-1) + + return self.aggregate_loss(losses_all) + + +losses = dict( + classification=dict( + CrossEntropyLoss=dict( + module=CrossEntropyLoss, supported_output_types=[MULTICLASS, BINARY]), + BCEWithLogitsLoss=dict( + module=BCEWithLogitsLoss, supported_output_types=[BINARY])), regression=dict( MSELoss=dict( module=MSELoss, supported_output_types=[CONTINUOUS]), L1Loss=dict( - module=L1Loss, supported_output_types=[CONTINUOUS]))) + module=L1Loss, supported_output_types=[CONTINUOUS])), + forecasting=dict( + LogProbLoss=dict( + module=LogProbLoss, supported_output_types=[CONTINUOUS]), + MSELoss=dict( + module=MSELoss, supported_output_types=[CONTINUOUS]), + L1Loss=dict( + module=L1Loss, supported_output_types=[CONTINUOUS]), + MAPELoss=dict( + module=MAPELoss, supported_output_types=[CONTINUOUS]), + MASELoss=dict( + module=MASELoss, supported_output_types=[CONTINUOUS]), + ) +) + +default_losses: Dict[str, Type[Loss]] = dict(classification=CrossEntropyLoss, + regression=MSELoss, + forecasting=MASELoss) -default_losses: Dict[str, Type[Loss]] = dict(classification=CrossEntropyLoss, regression=MSELoss) +LOSS_TYPES = ['regression', 'distribution'] 
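+# A minimal usage sketch of QuantileLoss (illustrative only; the shapes below are
+# assumptions, not requirements of this module):
+#
+#     loss_fn = QuantileLoss(quantiles=[0.1, 0.5, 0.9])
+#     predictions = [torch.randn(8, 6, 1) for _ in loss_fn.quantiles]  # one tensor per quantile
+#     target = torch.randn(8, 6, 1)
+#     value = loss_fn(predictions, target)  # scalar pinball loss, averaged over quantiles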
def get_default(task: int) -> Type[Loss]: @@ -51,6 +170,8 @@ def get_default(task: int) -> Type[Loss]: return default_losses['classification'] elif task in REGRESSION_TASKS: return default_losses['regression'] + elif task in FORECASTING_TASKS: + return default_losses['forecasting'] else: raise ValueError("Invalid task type {}".format(TASK_TYPES_TO_STRING[task])) @@ -75,6 +196,10 @@ def get_supported_losses(task: int, output_type: int) -> Dict[str, Type[Loss]]: for key, value in losses['regression'].items(): if output_type in value['supported_output_types']: supported_losses[key] = value['module'] + elif task in FORECASTING_TASKS: + for key, value in losses['forecasting'].items(): + if output_type in value['supported_output_types']: + supported_losses[key] = value['module'] return supported_losses diff --git a/autoPyTorch/pipeline/components/training/metrics/base.py b/autoPyTorch/pipeline/components/training/metrics/base.py index c3f247cd3..0cac3c560 100644 --- a/autoPyTorch/pipeline/components/training/metrics/base.py +++ b/autoPyTorch/pipeline/components/training/metrics/base.py @@ -1,5 +1,5 @@ from abc import ABCMeta -from typing import Any, Callable, List, Optional +from typing import Any, Callable, List, Optional, Union import numpy as np @@ -11,7 +11,7 @@ class autoPyTorchMetric(object, metaclass=ABCMeta): def __init__(self, name: str, - score_func: Callable[..., float], + score_func: Callable[..., Union[float, np.ndarray]], optimum: float, worst_possible_result: float, sign: float, @@ -26,7 +26,7 @@ def __init__(self, def __call__(self, y_true: np.ndarray, y_pred: np.ndarray, - sample_weight: Optional[List[float]] = None + sample_weight: Optional[List[float]] = None, ) -> float: raise NotImplementedError() @@ -37,6 +37,20 @@ def __repr__(self) -> str: return self.name +# This is a mixin for computing time series forecasting losses, the parameters are defined by: +# https://www.sktime.org/en/stable/api_reference/performance_metrics.html +# TODO considering adding more arguments to this function to allow advanced loss functions, e.g. asymmetric_error +class ForecastingMetricMixin: + def __call__(self, + y_true: np.ndarray, + y_pred: np.ndarray, + sp: int, + n_prediction_steps: int, + horizon_weight: Optional[List[float]] = None + ) -> float: + raise NotImplementedError() + + class _PredictMetric(autoPyTorchMetric): def __call__( self, @@ -176,6 +190,89 @@ def __call__( return self._sign * self._metric_func(y_true, y_pred, **self._kwargs) +class _ForecastingMetric(ForecastingMetricMixin, autoPyTorchMetric): + def __call__( # type: ignore[override] + self, + y_true: np.ndarray, + y_pred: np.ndarray, + sp: int, + n_prediction_steps: int, + horizon_weight: Optional[List[float]] = None, + sample_weight: Optional[List[float]] = None, + **kwarg: Any, + ) -> float: + """ + Evaluate time series forecasting losses given input data + The description is nearly the same as the one defined under + https://www.sktime.org/en/stable/api_reference/performance_metrics.html + + Args: + y_true (np.ndarray): + array-like ([n_seq x n_prediction_steps, n_output]). Ground truth (correct) target values. + y_pred (np.ndarray): + array-like ([n_seq x n_prediction_steps, n_output]). Forecasted values. + sp (int): + Seasonal periodicity of training data. + horizon_weight (Optional[List[float]]): + Forecast horizon weights. + sample_weight (Optional[List[float]]): + weights w.r.t. each sample + + Returns + ------- + score (float): + Score function applied to prediction of estimator on X. 
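+            Losses are first computed per forecast step and then aggregated across steps
+            with the 'aggregation' strategy given at metric construction ('mean' or 'median').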
+        """
+
+        agg = self._kwargs['aggregation']
+
+        if not len(y_pred) == len(y_true):
+            raise ValueError(f"The lengths of y_true and y_pred must be equal; however, they are "
+                             f"{len(y_pred)} and {len(y_true)}, respectively")
+
+        # we want to compute the loss w.r.t. each sequence, so the first dimension needs to be n_prediction_steps
+
+        n_outputs = y_true.shape[-1]
+
+        if sample_weight is not None:
+            if n_outputs != len(sample_weight):
+                raise ValueError(("There must be equally many custom weights "
+                                  "(%d) as outputs (%d).") %
+                                 (len(sample_weight), n_outputs))
+
+        # shape is [n_prediction_steps, n_sequence, n_outputs]
+        y_true = np.transpose(y_true.reshape((-1, n_prediction_steps, n_outputs)),
+                              (1, 0, 2))
+        y_pred = np.transpose(y_pred.reshape((-1, n_prediction_steps, n_outputs)),
+                              (1, 0, 2))
+
+        # shape is [n_prediction_steps, n_sequence * n_outputs]
+        y_true = y_true.reshape((n_prediction_steps, -1))
+        y_pred = y_pred.reshape((n_prediction_steps, -1))
+        # TODO consider weights for each individual prediction, i.e., we could mask the unobserved values
+        losses_all: np.ndarray = self._metric_func(y_true=y_true,
+                                                   y_pred=y_pred,
+                                                   sp=sp,
+                                                   horizon_weight=horizon_weight,
+                                                   multioutput='raw_values',
+                                                   **self._kwargs)
+
+        losses_all = losses_all.reshape([-1, n_outputs])
+
+        # multi output aggregation
+        if sample_weight is not None:
+            losses_all = np.sum(losses_all * sample_weight, axis=-1)
+        else:
+            losses_all = np.mean(losses_all, -1)
+
+        if agg == 'mean':
+            return float(self._sign * np.mean(losses_all))
+        elif agg == 'median':
+            return float(self._sign * np.median(losses_all))
+        else:
+            raise NotImplementedError(f'Unsupported aggregation type {agg}')
+
+
 def make_metric(
     name: str,
     score_func: Callable,
@@ -184,6 +281,7 @@ def make_metric(
     greater_is_better: bool = True,
     needs_proba: bool = False,
     needs_threshold: bool = False,
+    do_forecasting: bool = False,
     **kwargs: Any
 ) -> autoPyTorchMetric:
     """
@@ -223,5 +321,7 @@ def make_metric(
         return _ProbaMetric(name, score_func, optimum, worst_possible_result, sign, kwargs)
     elif needs_threshold:
         return _ThresholdMetric(name, score_func, optimum, worst_possible_result, sign, kwargs)
+    elif do_forecasting:
+        return _ForecastingMetric(name, score_func, optimum, worst_possible_result, sign, kwargs)
     else:
         return _PredictMetric(name, score_func, optimum, worst_possible_result, sign, kwargs)
diff --git a/autoPyTorch/pipeline/components/training/metrics/metrics.py b/autoPyTorch/pipeline/components/training/metrics/metrics.py
index 0d82b9622..5fa60a24d 100644
--- a/autoPyTorch/pipeline/components/training/metrics/metrics.py
+++ b/autoPyTorch/pipeline/components/training/metrics/metrics.py
@@ -1,8 +1,16 @@
 from functools import partial
+from typing import List, Union

+import numpy as np
 import sklearn.metrics

+try:
+    import sktime.performance_metrics.forecasting as forecasting_metrics
+    forecasting_dependencies_installed = True
+except ModuleNotFoundError:
+    forecasting_dependencies_installed = False
+
 from smac.utils.constants import MAXINT

 from autoPyTorch.pipeline.components.training.metrics.base import make_metric
@@ -47,6 +55,7 @@
 f1 = make_metric('f1',
                  sklearn.metrics.f1_score)

+
 # Score functions that need decision values
 roc_auc = make_metric('roc_auc', sklearn.metrics.roc_auc_score, needs_threshold=True)
 average_precision = make_metric('average_precision',
@@ -88,3 +97,134 @@
                                         pos_label=None, average=average))
     CLASSIFICATION_METRICS[qualified_name] = globals()[qualified_name]
+
+
+# Standard Forecasting Scores
+
+# To avoid storing unnecessary scale values here, we scale all the values under
+# autoPyTorch.evaluation.time_series_forecasting_train_evaluator
+
+def compute_mase_coefficient(past_target: Union[List, np.ndarray], sp: int) -> np.ndarray:
+    """
+    Compute the MASE coefficient; the MASE value is then computed as mase_coefficient * MAE.
+    This function aims at reducing the memory requirement.
+
+    Args:
+        past_target (Union[List, np.ndarray]):
+            past target observations
+        sp (int):
+            seasonal periodicity used to compute the MASE denominator
+
+    Returns:
+        mase_coefficient (np.ndarray):
+            inverse of the MASE denominator
+    """
+    past_target = np.nan_to_num(past_target)
+    max_past_target_abs = np.max(np.abs(past_target))
+    if max_past_target_abs == 0.:
+        return np.asarray([1.])
+    if sp >= len(past_target):
+        # in this case, we simply consider the mean value of the entire sequence
+        # TODO consider if there is a better way of handling this
+        try:
+            mase_denominator = forecasting_metrics.mean_absolute_error(past_target,
+                                                                       np.zeros_like(past_target),
+                                                                       multioutput="raw_values")
+        except ValueError:
+            return np.asarray([1.])
+
+    else:
+        mase_denominator = forecasting_metrics.mean_absolute_error(past_target[sp:],
+                                                                   past_target[:-sp],
+                                                                   multioutput="raw_values")
+
+    return np.where(mase_denominator == 0.0,
+                    np.min([1., 1. / max_past_target_abs]),
+                    1.0 / np.maximum(mase_denominator, forecasting_metrics._functions.EPS)
+                    )
+
+
+if forecasting_dependencies_installed:
+    mean_MASE_forecasting = make_metric('mean_MASE_forecasting',
+                                        forecasting_metrics.mean_absolute_error,
+                                        optimum=0,
+                                        worst_possible_result=MAXINT,
+                                        greater_is_better=False,
+                                        do_forecasting=True,
+                                        aggregation='mean',
+                                        )
+
+    median_MASE_forecasting = make_metric('median_MASE_forecasting',
+                                          forecasting_metrics.mean_absolute_error,
+                                          optimum=0,
+                                          worst_possible_result=MAXINT,
+                                          greater_is_better=False,
+                                          do_forecasting=True,
+                                          aggregation='median',
+                                          )
+
+    MASE_LOSSES = [mean_MASE_forecasting, median_MASE_forecasting]
+
+    mean_MAE_forecasting = make_metric('mean_MAE_forecasting',
+                                       forecasting_metrics.mean_absolute_error,
+                                       optimum=0,
+                                       worst_possible_result=MAXINT,
+                                       greater_is_better=False,
+                                       do_forecasting=True,
+                                       aggregation='mean',
+                                       )
+
+    median_MAE_forecasting = make_metric('median_MAE_forecasting',
+                                         forecasting_metrics.mean_absolute_error,
+                                         optimum=0,
+                                         worst_possible_result=MAXINT,
+                                         greater_is_better=False,
+                                         do_forecasting=True,
+                                         aggregation='median',
+                                         )
+
+    mean_MAPE_forecasting = make_metric('mean_MAPE_forecasting',
+                                        forecasting_metrics.mean_absolute_percentage_error,
+                                        optimum=0,
+                                        worst_possible_result=MAXINT,
+                                        greater_is_better=False,
+                                        do_forecasting=True,
+                                        aggregation='mean',
+                                        )
+
+    median_MAPE_forecasting = make_metric('median_MAPE_forecasting',
+                                          forecasting_metrics.mean_absolute_percentage_error,
+                                          optimum=0,
+                                          worst_possible_result=MAXINT,
+                                          greater_is_better=False,
+                                          do_forecasting=True,
+                                          aggregation='median',
+                                          )
+
+    mean_MSE_forecasting = make_metric('mean_MSE_forecasting',
+                                       forecasting_metrics.mean_squared_error,
+                                       optimum=0,
+                                       worst_possible_result=MAXINT,
+                                       greater_is_better=False,
+                                       do_forecasting=True,
+                                       aggregation='mean',
+                                       )
+
+    median_MSE_forecasting = make_metric('median_MSE_forecasting',
+                                         forecasting_metrics.mean_squared_error,
+                                         optimum=0,
+                                         worst_possible_result=MAXINT,
+                                         greater_is_better=False,
+                                         do_forecasting=True,
+                                         aggregation='median',
+                                         )
+
+    FORECASTING_METRICS = dict()
+    for scorer in [mean_MASE_forecasting, median_MASE_forecasting,
+                   mean_MAE_forecasting, median_MAE_forecasting,
+                   mean_MAPE_forecasting, median_MAPE_forecasting,
+                   mean_MSE_forecasting, median_MSE_forecasting]:
+        FORECASTING_METRICS[scorer.name] = scorer
+else:
+    MASE_LOSSES = []
+    FORECASTING_METRICS = dict()
diff --git a/autoPyTorch/pipeline/components/training/metrics/utils.py b/autoPyTorch/pipeline/components/training/metrics/utils.py
index bbf7f86d4..e72c1afce 100644
--- a/autoPyTorch/pipeline/components/training/metrics/utils.py
+++ b/autoPyTorch/pipeline/components/training/metrics/utils.py
@@ -5,12 +5,19 @@
 from autoPyTorch.constants import (
     CLASSIFICATION_TASKS,
+    FORECASTING_TASKS,
+    ForecastingDependenciesNotInstalledMSG,
     REGRESSION_TASKS,
     STRING_TO_TASK_TYPES,
     TASK_TYPES,
 )
 from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric
-from autoPyTorch.pipeline.components.training.metrics.metrics import CLASSIFICATION_METRICS, REGRESSION_METRICS
+from autoPyTorch.pipeline.components.training.metrics.metrics import (
+    CLASSIFICATION_METRICS,
+    FORECASTING_METRICS,
+    MASE_LOSSES,
+    REGRESSION_METRICS,
+)


 def sanitize_array(array: np.ndarray) -> np.ndarray:
@@ -40,6 +47,10 @@ def get_supported_metrics(dataset_properties: Dict[str, Any]) -> Dict[str, autoP
         return REGRESSION_METRICS
     elif STRING_TO_TASK_TYPES[task_type] in CLASSIFICATION_TASKS:
         return CLASSIFICATION_METRICS
+    elif STRING_TO_TASK_TYPES[task_type] in FORECASTING_TASKS:
+        if len(FORECASTING_METRICS) == 0:
+            raise ModuleNotFoundError(ForecastingDependenciesNotInstalledMSG)
+        return FORECASTING_METRICS
     else:
         raise NotImplementedError(task_type)
@@ -78,7 +89,10 @@ def get_metrics(dataset_properties: Dict[str, Any],
                              'binary': 'accuracy',
                              'multiclass-multioutput': 'f1'}),
         regression=dict({'continuous': 'r2',
-                         'continuous-multioutput': 'r2'}))
+                         'continuous-multioutput': 'r2'}),
+        forecasting=dict({'continuous': 'mean_MASE_forecasting',
+                          'continuous-multioutput': 'mean_MASE_forecasting'})
+    )

     supported_metrics = get_supported_metrics(dataset_properties)
     metrics: List[autoPyTorchMetric] = list()
@@ -99,6 +113,8 @@ def get_metrics(dataset_properties: Dict[str, Any],
         metrics.append(supported_metrics[default_metrics['classification'][dataset_properties['output_type']]])
     if STRING_TO_TASK_TYPES[dataset_properties['task_type']] in REGRESSION_TASKS:
         metrics.append(supported_metrics[default_metrics['regression'][dataset_properties['output_type']]])
+    if STRING_TO_TASK_TYPES[dataset_properties['task_type']] in FORECASTING_TASKS:
+        metrics.append(supported_metrics[default_metrics['forecasting'][dataset_properties['output_type']]])
     return metrics
@@ -108,9 +124,27 @@ def calculate_score(
     prediction: np.ndarray,
     task_type: int,
     metrics: Iterable[autoPyTorchMetric],
+    **score_kwargs: Any
 ) -> Dict[str, float]:
     score_dict = dict()
-    if task_type in REGRESSION_TASKS:
+    if task_type in FORECASTING_TASKS:
+        if len(MASE_LOSSES) == 0:
+            raise ModuleNotFoundError(ForecastingDependenciesNotInstalledMSG)
+        cprediction = sanitize_array(prediction)
+        for metric_ in metrics:
+            if metric_ in MASE_LOSSES and 'mase_coefficient' in score_kwargs:
+                mase_coe_shape = score_kwargs['mase_coefficient'].shape
+                target_shape = target.shape
+                if mase_coe_shape[0] != target_shape[0] or mase_coe_shape[-1] != target_shape[-1]:
+                    raise ValueError(f"The shapes of the MASE coefficient and the targets must be consistent in the "
+                                     f"first and last dimensions. 
However, their shapes are {mase_coe_shape}" + f"(MASE coefficient) and {target_shape} (targets)") + target_scaled = target * score_kwargs['mase_coefficient'] + cprediction_scaled = cprediction * score_kwargs['mase_coefficient'] + score_dict[metric_.name] = metric_._sign * metric_(target_scaled, cprediction_scaled, **score_kwargs) + else: + score_dict[metric_.name] = metric_._sign * metric_(target, cprediction, **score_kwargs) + elif task_type in REGRESSION_TASKS: cprediction = sanitize_array(prediction) for metric_ in metrics: try: @@ -150,6 +184,7 @@ def calculate_loss( prediction: np.ndarray, task_type: int, metrics: Iterable[autoPyTorchMetric], + **score_kwargs: Dict ) -> Dict[str, float]: """ Returns a loss (a magnitude that allows casting the @@ -169,6 +204,8 @@ def calculate_loss( prediction is according to the solution. scoring_functions: List[Scorer] A list of metrics to calculate multiple losses + score_kwargs: Dict + additional arguments for computing scores Returns ------- float or Dict[str, float] @@ -179,6 +216,7 @@ def calculate_loss( prediction=prediction, task_type=task_type, metrics=metrics, + **score_kwargs, ) loss_dict = dict() diff --git a/autoPyTorch/pipeline/components/training/trainer/__init__.py b/autoPyTorch/pipeline/components/training/trainer/__init__.py index e54006d10..3134db201 100755 --- a/autoPyTorch/pipeline/components/training/trainer/__init__.py +++ b/autoPyTorch/pipeline/components/training/trainer/__init__.py @@ -18,7 +18,7 @@ from torch.optim.lr_scheduler import _LRScheduler from torch.utils.tensorboard.writer import SummaryWriter -from autoPyTorch.constants import STRING_TO_TASK_TYPES +from autoPyTorch.constants import CLASSIFICATION_TASKS, STRING_TO_TASK_TYPES from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice from autoPyTorch.pipeline.components.base_component import ( @@ -66,6 +66,7 @@ def __init__(self, random_state=random_state) self.run_summary: Optional[RunSummary] = None self.writer: Optional[SummaryWriter] = None + self.early_stopping_split_type: Optional[str] = None self._fit_requirements: Optional[List[FitRequirement]] = [ FitRequirement("lr_scheduler", (_LRScheduler,), user_defined=False, dataset_property=False), FitRequirement("num_run", (int,), user_defined=False, dataset_property=False), @@ -203,8 +204,7 @@ def fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> autoPyTorchCom self.logger = get_named_client_logger( name=f"{X['num_run']}_{time.time()}", # Log to a user provided port else to the default logging port - port=X['logger_port' - ] if 'logger_port' in X else logging.handlers.DEFAULT_TCP_LOGGING_PORT, + port=X['logger_port'] if 'logger_port' in X else logging.handlers.DEFAULT_TCP_LOGGING_PORT, ) # Call the actual fit function. 
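(Editor's note: a hedged sketch of how the MASE machinery above fits together; the history and
forecast values are invented, and only numpy plus compute_mase_coefficient from metrics.py, with
sktime installed, are assumed.)

    import numpy as np

    # a univariate history of 10 observations
    past_target = np.arange(1.0, 11.0).reshape(-1, 1)

    # with sp=1 the denominator is the in-sample naive-forecast MAE
    # (here |x_t - x_{t-1}| == 1 everywhere), so the coefficient is 1 / 1
    coefficient = compute_mase_coefficient(past_target, sp=1)  # -> array([1.])

    # MASE is then just the coefficient times the forecast MAE, which is why
    # calculate_score above can pre-multiply targets and predictions by it
    y_true = np.array([[11.0], [12.0]])
    y_pred = np.array([[10.5], [12.5]])
    mase = coefficient * np.mean(np.abs(y_true - y_pred))  # -> array([0.5])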
@@ -216,6 +216,44 @@ def fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> autoPyTorchCom

         return cast(autoPyTorchComponent, self.choice)

+    def prepare_trainer(self, X: Dict) -> None:
+        """
+        Prepare the trainer; forecasting tasks require additional parameters.
+        """
+        assert self.choice is not None
+
+        # Support additional user metrics
+        metrics = get_metrics(dataset_properties=X['dataset_properties'])
+        if 'additional_metrics' in X:
+            metrics.extend(get_metrics(dataset_properties=X['dataset_properties'], names=X['additional_metrics']))
+        if 'optimize_metric' in X and X['optimize_metric'] not in [m.name for m in metrics]:
+            metrics.extend(get_metrics(dataset_properties=X['dataset_properties'], names=[X['optimize_metric']]))
+        additional_losses = X['additional_losses'] if 'additional_losses' in X else None
+
+        labels = self._get_train_label(X)
+
+        self.choice.prepare(
+            model=X['network'],
+            metrics=metrics,
+            criterion=get_loss(X['dataset_properties'],
+                               name=additional_losses),
+            budget_tracker=self.budget_tracker,
+            optimizer=X['optimizer'],
+            device=get_device_from_fit_dictionary(X),
+            metrics_during_training=X['metrics_during_training'],
+            scheduler=X['lr_scheduler'],
+            task_type=STRING_TO_TASK_TYPES[X['dataset_properties']['task_type']],
+            labels=labels,
+            step_interval=X['step_interval']
+        )
+
+    def get_budget_tracker(self, X: Dict) -> BudgetTracker:
+        return BudgetTracker(
+            budget_type=X['budget_type'],
+            max_runtime=X['runtime'] if 'runtime' in X else None,
+            max_epochs=X['epochs'] if 'epochs' in X else None,
+        )
+
     def _fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> 'TrainerChoice':
         """
         Fits a component by using an input dictionary with pre-requisites
@@ -243,33 +281,10 @@ def _fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> 'TrainerChoic
         if X["torch_num_threads"] > 0:
             torch.set_num_threads(X["torch_num_threads"])

-        self.budget_tracker = BudgetTracker(
-            budget_type=X['budget_type'],
-            max_runtime=X['runtime'] if 'runtime' in X else None,
-            max_epochs=X['epochs'] if 'epochs' in X else None,
-        )
+        self.budget_tracker = self.get_budget_tracker(X)
+
+        self.prepare_trainer(X)

-        # Support additional user metrics
-        metrics = get_metrics(dataset_properties=X['dataset_properties'])
-        if 'additional_metrics' in X:
-            metrics.extend(get_metrics(dataset_properties=X['dataset_properties'], names=X['additional_metrics']))
-        if 'optimize_metric' in X and X['optimize_metric'] not in [m.name for m in metrics]:
-            metrics.extend(get_metrics(dataset_properties=X['dataset_properties'], names=[X['optimize_metric']]))
-        additional_losses = X['additional_losses'] if 'additional_losses' in X else None
-        self.choice.prepare(
-            model=X['network'],
-            metrics=metrics,
-            criterion=get_loss(X['dataset_properties'],
-                               name=additional_losses),
-            budget_tracker=self.budget_tracker,
-            optimizer=X['optimizer'],
-            device=get_device_from_fit_dictionary(X),
-            metrics_during_training=X['metrics_during_training'],
-            scheduler=X['lr_scheduler'],
-            task_type=STRING_TO_TASK_TYPES[X['dataset_properties']['task_type']],
-            labels=X['y_train'][X['backend'].load_datamanager().splits[X['split_id']][0]],
-            step_interval=X['step_interval']
-        )
         total_parameter_count, trainable_parameter_count = self.count_parameters(X['network'])
         self.run_summary = RunSummary(
             total_parameter_count,
@@ -277,6 +292,11 @@ def _fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> 'TrainerChoic
             optimize_metric=None if not X['metrics_during_training'] else X.get('optimize_metric'),
         )

+        if X['val_data_loader'] is not None:
+            self.early_stopping_split_type = 'val'
+        else:
+            self.early_stopping_split_type = 'train'
+
         epoch = 1
         while True:
@@ -293,9 +313,17 @@ def _fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> 'TrainerChoic
                 writer=writer,
             )

+            # it's fine if train_loss is None due to `is_max_time_reached()`
+            if train_loss is None:
+                if self.budget_tracker.is_max_time_reached():
+                    break
+                else:
+                    raise RuntimeError("Got an unexpected None in `train_loss`.")
+
             val_loss, val_metrics, test_loss, test_metrics = None, {}, None, {}
             if self.eval_valid_each_epoch(X):
-                val_loss, val_metrics = self.choice.evaluate(X['val_data_loader'], epoch, writer)
+                if X['val_data_loader']:
+                    val_loss, val_metrics = self.choice.evaluate(X['val_data_loader'], epoch, writer)
                 if 'test_data_loader' in X and X['test_data_loader']:
                     test_loss, test_metrics = self.choice.evaluate(X['test_data_loader'], epoch, writer)
@@ -334,9 +362,13 @@ def _fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> 'TrainerChoic
         if 'cuda' in X['device']:
             torch.cuda.empty_cache()

+        if self.run_summary.is_empty():
+            raise RuntimeError("Budget exhausted without finishing an epoch.")
+
         # wrap up -- add score if not evaluating every epoch
         if not self.eval_valid_each_epoch(X):
-            val_loss, val_metrics = self.choice.evaluate(X['val_data_loader'], epoch, writer)
+            if X['val_data_loader']:
+                val_loss, val_metrics = self.choice.evaluate(X['val_data_loader'], epoch, writer)
             if 'test_data_loader' in X and X['test_data_loader']:
                 test_loss, test_metrics = self.choice.evaluate(X['test_data_loader'], epoch, writer)
             self.run_summary.add_performance(
@@ -363,6 +395,21 @@ def _fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> 'TrainerChoic

         return self

+    def _get_train_label(self, X: Dict[str, Any]) -> List[int]:
+        """
+        Verifies and validates the labels from the train split.
+        """
+        # Ensure that the split is not missing any class.
+        labels: List[int] = X['y_train'][X['backend'].load_datamanager().splits[X['split_id']][0]]
+        if STRING_TO_TASK_TYPES[X['dataset_properties']['task_type']] in CLASSIFICATION_TASKS:
+            unique_labels = len(np.unique(labels))
+            if unique_labels < X['dataset_properties']['output_shape']:
+                raise ValueError(f"Expected the number of unique labels ({unique_labels}) in train split"
+                                 f" {X['split_id']} to equal num_classes"
+                                 f" ({X['dataset_properties']['output_shape']})."
+                                 f" Consider using stratified splitting strategies.")
+
+        return labels
+
     def _load_best_weights_and_clean_checkpoints(self, X: Dict[str, Any]) -> None:
         """
         Load the best model until the last epoch and delete all the files for checkpoints.
@@ -372,14 +419,17 @@ def _load_best_weights_and_clean_checkpoints(self, X: Dict[str, Any]) -> None:
         """
         assert self.checkpoint_dir is not None  # mypy
         assert self.run_summary is not None  # mypy
+        assert self.early_stopping_split_type is not None  # mypy

         best_path = os.path.join(self.checkpoint_dir, 'best.pth')
-        self.logger.debug(f" Early stopped model {X['num_run']} on epoch {self.run_summary.get_best_epoch()}")
+        best_epoch = self.run_summary.get_best_epoch(split_type=self.early_stopping_split_type)
+        self.logger.debug(f" Early stopped model {X['num_run']} on epoch {best_epoch}")

         # We will stop the training. Load the last best performing weights
         X['network'].load_state_dict(torch.load(best_path))

         # Clean the temp dir
         shutil.rmtree(self.checkpoint_dir)
+        self.checkpoint_dir = None

     def early_stop_handler(self, X: Dict[str, Any]) -> bool:
         """
@@ -394,6 +444,7 @@ def early_stop_handler(self, X: Dict[str, Any]) -> bool:
             bool: If true, training should be stopped
         """
         assert self.run_summary is not None
+        assert self.early_stopping_split_type is not None  # mypy

         # Allow to disable early stopping
         if X['early_stopping'] is None or X['early_stopping'] < 0:
@@ -403,7 +454,12 @@ def early_stop_handler(self, X: Dict[str, Any]) -> bool:
         if self.checkpoint_dir is None:
             self.checkpoint_dir = tempfile.mkdtemp(dir=X['backend'].temporary_directory)

-        epochs_since_best = self.run_summary.get_last_epoch() - self.run_summary.get_best_epoch()
+        if not os.path.exists(self.checkpoint_dir):
+            os.makedirs(self.checkpoint_dir, exist_ok=True)
+
+        last_epoch = self.run_summary.get_last_epoch()
+        best_epoch = self.run_summary.get_best_epoch(split_type=self.early_stopping_split_type)
+        epochs_since_best = last_epoch - best_epoch

         # Save the checkpoint if there is a new best epoch
         best_path = os.path.join(self.checkpoint_dir, 'best.pth')
diff --git a/autoPyTorch/pipeline/components/training/trainer/base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/base_trainer.py
index 4909f56ce..0dba1e869 100644
--- a/autoPyTorch/pipeline/components/training/trainer/base_trainer.py
+++ b/autoPyTorch/pipeline/components/training/trainer/base_trainer.py
@@ -12,11 +12,14 @@
 from torch.optim.lr_scheduler import _LRScheduler
 from torch.utils.tensorboard.writer import SummaryWriter

-
-from autoPyTorch.constants import REGRESSION_TASKS
+from autoPyTorch.constants import FORECASTING_TASKS, REGRESSION_TASKS
 from autoPyTorch.pipeline.components.setup.lr_scheduler.constants import StepIntervalUnit
 from autoPyTorch.pipeline.components.training.base_training import autoPyTorchTrainingComponent
-from autoPyTorch.pipeline.components.training.metrics.metrics import CLASSIFICATION_METRICS, REGRESSION_METRICS
+from autoPyTorch.pipeline.components.training.metrics.metrics import (
+    CLASSIFICATION_METRICS,
+    FORECASTING_METRICS,
+    REGRESSION_METRICS,
+)
 from autoPyTorch.pipeline.components.training.metrics.utils import calculate_score
 from autoPyTorch.utils.implementations import get_loss_weight_strategy
@@ -119,25 +122,29 @@ def add_performance(self,
         self.performance_tracker['val_metrics'][epoch] = val_metrics
         self.performance_tracker['test_metrics'][epoch] = test_metrics

-    def get_best_epoch(self, loss_type: str = 'val_loss') -> int:
-        # If we compute validation scores, prefer the performance
+    def get_best_epoch(self, split_type: str = 'val') -> int:
+        # If a metric is being optimized, prefer the performance
         # metric to the loss
         if self.optimize_metric is not None:
-            scorer = CLASSIFICATION_METRICS[
-                self.optimize_metric
-            ] if self.optimize_metric in CLASSIFICATION_METRICS else REGRESSION_METRICS[
-                self.optimize_metric
-            ]
+            metrics_type = f"{split_type}_metrics"
+            if self.optimize_metric in CLASSIFICATION_METRICS:
+                scorer = CLASSIFICATION_METRICS[self.optimize_metric]
+            elif self.optimize_metric in REGRESSION_METRICS:
+                scorer = REGRESSION_METRICS[self.optimize_metric]
+            elif self.optimize_metric in FORECASTING_METRICS:
+                scorer = FORECASTING_METRICS[self.optimize_metric]
+            else:
+                raise NotImplementedError(f"Unsupported optimization metric: {self.optimize_metric}")
+
             # Some metrics maximize, others minimize!
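+            # (Illustration, not part of the original diff: `_sign` is derived from
+            # `greater_is_better` in make_metric, so e.g. accuracy has _sign == 1 and the best
+            # epoch is found with np.argmax, while mean_MASE_forecasting was registered with
+            # greater_is_better=False (_sign == -1) and therefore uses np.argmin.)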
opt_func = np.argmax if scorer._sign > 0 else np.argmin return int(opt_func( - [self.performance_tracker['val_metrics'][e][self.optimize_metric] - for e in range(1, len(self.performance_tracker['val_metrics']) + 1)] + [metrics[self.optimize_metric] for metrics in self.performance_tracker[metrics_type].values()] )) + 1 # Epochs start at 1 else: + loss_type = f"{split_type}_loss" return int(np.argmin( - [self.performance_tracker[loss_type][e] - for e in range(1, len(self.performance_tracker[loss_type]) + 1)], + list(self.performance_tracker[loss_type].values()), )) + 1 # Epochs start at 1 def get_last_epoch(self) -> int: @@ -179,6 +186,16 @@ def repr_last_epoch(self) -> str: string += '=' * 40 return string + def is_empty(self) -> bool: + """ + Checks if the object is empty or not + + Returns: + bool + """ + # if train_loss is empty, we can be sure that RunSummary is empty. + return not bool(self.performance_tracker['train_loss']) + class BaseTrainerComponent(autoPyTorchTrainingComponent): @@ -205,7 +222,8 @@ def prepare( scheduler: _LRScheduler, task_type: int, labels: Union[np.ndarray, torch.Tensor, pd.DataFrame], - step_interval: Union[str, StepIntervalUnit] = StepIntervalUnit.batch + step_interval: Union[str, StepIntervalUnit] = StepIntervalUnit.batch, + **kwargs: Dict ) -> None: # Save the device to be used @@ -277,7 +295,7 @@ def _scheduler_step( def train_epoch(self, train_loader: torch.utils.data.DataLoader, epoch: int, writer: Optional[SummaryWriter], - ) -> Tuple[float, Dict[str, float]]: + ) -> Tuple[Optional[float], Dict[str, float]]: """ Train the model for a single epoch. @@ -302,9 +320,10 @@ def train_epoch(self, train_loader: torch.utils.data.DataLoader, epoch: int, loss, outputs = self.train_step(data, targets) - # save for metric evaluation - outputs_data.append(outputs.detach().cpu()) - targets_data.append(targets.detach().cpu()) + if self.metrics_during_training: + # save for metric evaluation + outputs_data.append(outputs.detach().cpu()) + targets_data.append(targets.detach().cpu()) batch_size = data.size(0) loss_sum += loss * batch_size @@ -317,6 +336,9 @@ def train_epoch(self, train_loader: torch.utils.data.DataLoader, epoch: int, epoch * len(train_loader) + step, ) + if N == 0: + return None, {} + self._scheduler_step(step_interval=StepIntervalUnit.epoch, loss=loss_sum / N) if self.metrics_during_training: @@ -325,7 +347,7 @@ def train_epoch(self, train_loader: torch.utils.data.DataLoader, epoch: int, return loss_sum / N, {} def cast_targets(self, targets: torch.Tensor) -> torch.Tensor: - if self.task_type in REGRESSION_TASKS: + if self.task_type in (REGRESSION_TASKS + FORECASTING_TASKS): targets = targets.float().to(self.device) # make sure that targets will have same shape as outputs (really important for mse loss for example) if targets.ndim == 1: diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/ForecastingMixUpTrainer.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/ForecastingMixUpTrainer.py new file mode 100644 index 000000000..197887339 --- /dev/null +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/ForecastingMixUpTrainer.py @@ -0,0 +1,16 @@ +from typing import Dict, Optional, Union + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.training.trainer.MixUpTrainer import MixUpTrainer +from autoPyTorch.pipeline.components.training.trainer.forecasting_trainer.forecasting_base_trainer import \ + 
ForecastingBaseTrainerComponent + + +class ForecastingMixUpTrainer(ForecastingBaseTrainerComponent, MixUpTrainer): + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'ForecastingMixUpTrainer', + 'name': 'MixUp Regularized Trainer', + } diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/ForecastingStandardTrainer.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/ForecastingStandardTrainer.py new file mode 100644 index 000000000..9235565fe --- /dev/null +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/ForecastingStandardTrainer.py @@ -0,0 +1,16 @@ +from typing import Dict, Optional, Union + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.training.trainer.StandardTrainer import StandardTrainer +from autoPyTorch.pipeline.components.training.trainer.forecasting_trainer.forecasting_base_trainer import \ + ForecastingBaseTrainerComponent + + +class ForecastingStandardTrainer(ForecastingBaseTrainerComponent, StandardTrainer): + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'ForecastingStandardTrainer', + 'name': 'Forecasting Standard Trainer', + } diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py new file mode 100644 index 000000000..30b504ec3 --- /dev/null +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py @@ -0,0 +1,106 @@ +import collections +import os +from typing import Dict, Optional + +import numpy as np + +from autoPyTorch.constants import FORECASTING_BUDGET_TYPE, STRING_TO_TASK_TYPES +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.base_component import ( + ThirdPartyComponents, + autoPyTorchComponent, + find_components +) +from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.base_target_scaler import BaseTargetScaler +from autoPyTorch.pipeline.components.training.metrics.utils import get_metrics +from autoPyTorch.pipeline.components.training.trainer import TrainerChoice +from autoPyTorch.pipeline.components.training.trainer.base_trainer import BudgetTracker +from autoPyTorch.pipeline.components.training.trainer.forecasting_trainer.forecasting_base_trainer import ( + ForecastingBaseTrainerComponent +) +from autoPyTorch.utils.common import ( + FitRequirement, + get_device_from_fit_dictionary +) + +trainer_directory = os.path.split(__file__)[0] +_trainers = find_components(__package__, + trainer_directory, + ForecastingBaseTrainerComponent) +_addons = ThirdPartyComponents(ForecastingBaseTrainerComponent) + + +def add_trainer(trainer: ForecastingBaseTrainerComponent) -> None: + _addons.add_component(trainer) + + +class ForecastingTrainerChoice(TrainerChoice): + def __init__(self, + dataset_properties: Dict[str, BaseDatasetPropertiesType], + random_state: Optional[np.random.RandomState] = None + ): + super().__init__(dataset_properties=dataset_properties, random_state=random_state) + assert self._fit_requirements is not None + self._fit_requirements.extend([FitRequirement("target_scaler", (BaseTargetScaler,), + user_defined=False, dataset_property=False), + 
FitRequirement("window_size", (int,), user_defined=False, + dataset_property=False)]) + + def get_budget_tracker(self, X: Dict) -> BudgetTracker: + if 'epochs' in X: + max_epochs = X['epochs'] + elif X['budget_type'] in FORECASTING_BUDGET_TYPE: + max_epochs = 50 + else: + max_epochs = None + return BudgetTracker( + budget_type='epochs' if X['budget_type'] in FORECASTING_BUDGET_TYPE else X['budget_type'], + max_runtime=X['runtime'] if 'runtime' in X else None, + max_epochs=max_epochs, + ) + + def prepare_trainer(self, X: Dict) -> None: + assert self.choice is not None + # Support additional user metrics + metrics = get_metrics(dataset_properties=X['dataset_properties']) + if 'additional_metrics' in X: + metrics.extend(get_metrics(dataset_properties=X['dataset_properties'], names=X['additional_metrics'])) + if 'optimize_metric' in X and X['optimize_metric'] not in [m.name for m in metrics]: + metrics.extend(get_metrics(dataset_properties=X['dataset_properties'], names=[X['optimize_metric']])) + if hasattr(X['y_train'], "to_numpy"): + labels = X['y_train'].to_numpy()[X['backend'].load_datamanager().splits[X['split_id']][0]] + else: + labels = X['y_train'][X['backend'].load_datamanager().splits[X['split_id']][0]] + + self.choice.prepare( + model=X['network'], + metrics=metrics, + criterion=X['loss'], + budget_tracker=self.budget_tracker, + optimizer=X['optimizer'], + device=get_device_from_fit_dictionary(X), + metrics_during_training=X['metrics_during_training'], + scheduler=X['lr_scheduler'], + task_type=STRING_TO_TASK_TYPES[X['dataset_properties']['task_type']], + labels=labels, + step_interval=X['step_interval'], + window_size=X['window_size'], + dataset_properties=X['dataset_properties'], + target_scaler=X['target_scaler'], + backcast_loss_ratio=X.get('backcast_loss_ratio', 0.0) + ) + + def get_components(self) -> Dict[str, autoPyTorchComponent]: + """Returns the available trainer components + + Args: + None + + Returns: + Dict[str, autoPyTorchComponent]: all components available + as choices for learning rate scheduling + """ + components: Dict[str, autoPyTorchComponent] = collections.OrderedDict() + components.update(_trainers) + components.update(_addons.components) + return components diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py new file mode 100644 index 000000000..858cf775b --- /dev/null +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py @@ -0,0 +1,345 @@ +from abc import ABC +from typing import Any, Dict, List, Optional, Tuple, Type, Union + +import numpy as np + +import pandas as pd + +import torch +from torch.optim import Optimizer +from torch.optim.lr_scheduler import _LRScheduler +from torch.utils.tensorboard.writer import SummaryWriter + +from autoPyTorch.constants import FORECASTING_TASKS, REGRESSION_TASKS +from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.base_target_scaler import BaseTargetScaler +from autoPyTorch.pipeline.components.setup.lr_scheduler.constants import StepIntervalUnit +from autoPyTorch.pipeline.components.setup.network.forecasting_network import ( + ForecastingDeepARNet, + ForecastingNet, + ForecastingSeq2SeqNet, + NBEATSNet +) +from autoPyTorch.pipeline.components.training.losses import MASELoss +from autoPyTorch.pipeline.components.training.metrics.utils import calculate_score +from 
+    BaseTrainerComponent,
+    BudgetTracker
+)
+
+
+class ForecastingBaseTrainerComponent(BaseTrainerComponent, ABC):
+    def prepare(  # type: ignore[override]
+        self,
+        metrics: List[Any],
+        model: ForecastingNet,
+        criterion: Type[torch.nn.Module],
+        budget_tracker: BudgetTracker,
+        optimizer: Optimizer,
+        device: torch.device,
+        metrics_during_training: bool,
+        scheduler: _LRScheduler,
+        task_type: int,
+        labels: Union[np.ndarray, torch.Tensor, pd.DataFrame],
+        step_interval: Union[str, StepIntervalUnit] = StepIntervalUnit.batch,
+        window_size: int = 20,
+        dataset_properties: Dict = {},
+        target_scaler: BaseTargetScaler = BaseTargetScaler(),
+        backcast_loss_ratio: Optional[float] = None,
+    ) -> None:
+        # metrics_during_training is not applicable when computing scaled values
+        metrics_during_training = False
+        super().prepare(metrics=metrics,
+                        model=model,
+                        criterion=criterion,
+                        budget_tracker=budget_tracker,
+                        optimizer=optimizer,
+                        device=device,
+                        metrics_during_training=metrics_during_training,
+                        scheduler=scheduler,
+                        task_type=task_type,
+                        labels=labels,
+                        step_interval=step_interval
+                        )
+        # Weights for the loss function
+        kwargs = {}
+        if self.weighted_loss:
+            kwargs = self.get_class_weights(criterion, labels)
+        kwargs["reduction"] = 'none'
+        # Setup the loss function
+        self.criterion = criterion(**kwargs)
+        metric_kwargs = {"sp": dataset_properties.get("sp", 1),
+                         "n_prediction_steps": dataset_properties.get("n_prediction_steps", 1)}
+        self.metrics_kwargs = metric_kwargs
+        self.target_scaler = target_scaler  # type: BaseTargetScaler
+        self.backcast_loss_ratio = backcast_loss_ratio
+        self.window_size = window_size
+        self.model.device = self.device
+
+    def train_epoch(self, train_loader: torch.utils.data.DataLoader, epoch: int,
+                    writer: Optional[SummaryWriter],
+                    ) -> Tuple[float, Dict[str, float]]:
+        """
+        Train the model for a single epoch.
+
+        Args:
+            train_loader (torch.utils.data.DataLoader):
+                generator of features/label
+            epoch (int):
+                The current epoch used solely for tracking purposes
+            writer (Optional[SummaryWriter]):
+                TensorBoard writer used to log the training loss
+
+        Returns:
+            float:
+                training loss
+            Dict[str, float]:
+                scores for each desired metric
+        """
+        loss_sum = 0.0
+        N = 0
+        self.model.train()
+        outputs_data = list()
+        targets_data = list()
+
+        for step, (data, targets) in enumerate(train_loader):
+            if self.budget_tracker.is_max_time_reached():
+                break
+
+            loss, outputs = self.train_step(data, targets)
+
+            if self.metrics_during_training:
+                # save for metric evaluation
+                outputs_data.append(outputs.detach().cpu())
+                targets_data.append(targets.detach().cpu())
+
+            batch_size = data["past_targets"].size(0)
+            loss_sum += loss * batch_size
+            N += batch_size
+
+            if writer:
+                writer.add_scalar(
+                    'Train/loss',
+                    loss,
+                    epoch * len(train_loader) + step,
+                )
+
+        self._scheduler_step(step_interval=StepIntervalUnit.epoch, loss=loss_sum / N)
+
+        if self.metrics_during_training:
+            return loss_sum / N, self.compute_metrics(outputs_data, targets_data)
+        else:
+            return loss_sum / N, {}
+
+    def train_step(self, data: Dict[str, torch.Tensor], future_targets: Dict[str, torch.Tensor]) \
+            -> Tuple[float, torch.Tensor]:
+        """
+        Trains one step of gradient descent, given a batch of features/labels.
+
+        Args:
+            data (Dict[str, torch.Tensor]):
+                input features to the network
+            future_targets (Dict[str, torch.Tensor]):
+                ground truth to calculate loss
+
+        Returns:
+            float:
+                the loss incurred in the prediction
+            torch.Tensor:
+                the predictions of the network
+        """
+        past_observed_targets = data['past_observed_targets']
+
+        past_features = data["past_features"]
+        if past_features is not None:
+            past_features = past_features.float()
+        future_features = data['future_features']
+        if future_features is not None:
+            future_features = future_features.float()
+
+        future_observed_targets = future_targets["future_observed_targets"]
+        future_targets_values = future_targets["future_targets"]
+
+        past_target = self.cast_targets(data['past_targets'])
+        future_targets_values = self.cast_targets(future_targets_values)
+
+        if isinstance(self.criterion, MASELoss):
+            self.criterion.set_mase_coefficient(data['mase_coefficient'].float().to(self.device))
+
+        # training
+        self.optimizer.zero_grad()
+
+        if isinstance(self.model, NBEATSNet):
+            past_target = past_target[:, -self.window_size:]
+            past_observed_targets = past_observed_targets[:, -self.window_size:]
+            past_target, criterion_kwargs_past = self.data_preparation(past_target,
+                                                                       past_target.to(self.device))
+            past_target, criterion_kwargs_future = self.data_preparation(past_target,
+                                                                         future_targets_values.to(self.device))
+            backcast, forecast = self.model(past_targets=past_target, past_observed_targets=past_observed_targets)
+
+            loss_func_backcast = self.criterion_preparation(**criterion_kwargs_past)
+            loss_func_forecast = self.criterion_preparation(**criterion_kwargs_future)
+
+            loss_backcast = loss_func_backcast(self.criterion, backcast) * past_observed_targets.to(self.device)
+            loss_forecast = loss_func_forecast(self.criterion, forecast) * future_observed_targets.to(self.device)
+
+            loss = loss_forecast.mean() + loss_backcast.mean() * self.backcast_loss_ratio
+
+            outputs = forecast
+        else:
+            if isinstance(self.model, ForecastingDeepARNet) and self.model.encoder_bijective_seq_output:
+                if self.window_size > past_target.shape[1]:
+                    all_targets = torch.cat([past_target[:, 1:, ], future_targets_values], dim=1)
+                    future_observed_targets = torch.cat([past_observed_targets[:,
1:, ], + future_observed_targets], dim=1) + else: + if self.window_size == 1: + all_targets = future_targets_values + else: + all_targets = torch.cat([past_target[:, 1 - self.window_size:, ], + future_targets_values], dim=1) + future_observed_targets = torch.cat([past_observed_targets[:, 1 - self.window_size:, ], + future_observed_targets], dim=1) + past_target, criterion_kwargs = self.data_preparation(past_target, all_targets.to(self.device)) + else: + past_target, criterion_kwargs = self.data_preparation(past_target, + future_targets_values.to(self.device)) + + outputs = self.model(past_targets=past_target, + past_features=past_features, + future_features=future_features, + future_targets=future_targets_values, + past_observed_targets=past_observed_targets) + + loss_func = self.criterion_preparation(**criterion_kwargs) + + loss = torch.mean(loss_func(self.criterion, outputs) * future_observed_targets.to(self.device)) + + loss.backward() + self.optimizer.step() + self._scheduler_step(step_interval=StepIntervalUnit.batch, loss=loss.item()) + + return loss.item(), outputs + + def evaluate(self, test_loader: torch.utils.data.DataLoader, epoch: int, + writer: Optional[SummaryWriter], + ) -> Tuple[float, Dict[str, float]]: + """ + Evaluate the model in both metrics and criterion + + Args: + test_loader (torch.utils.data.DataLoader): + generator of features/label + epoch (int): + the current epoch for tracking purposes + + Returns: + float: + test loss + Dict[str, float]: + scores for each desired metric + """ + if not isinstance(self.model, (ForecastingDeepARNet, ForecastingSeq2SeqNet)): + # To save time, we simply make one-step prediction for DeepAR and Seq2Seq + self.model.eval() + if isinstance(self.model, ForecastingDeepARNet): + self.model.only_generate_future_dist = True + + loss_sum = 0.0 + N = 0 + outputs_data = list() + targets_data = list() + + mase_coefficients = list() + + with torch.no_grad(): + for step, (data, future_targets) in enumerate(test_loader): + past_target = data['past_targets'].float() + past_observed_targets = data['past_observed_targets'] + + past_features = data["past_features"] + if past_features is not None: + past_features = past_features.float() + future_features = data['future_features'] + if future_features is not None: + future_features = future_features.float() + + mase_coefficients.append(data['mase_coefficient']) + if isinstance(self.criterion, MASELoss): + self.criterion.set_mase_coefficient(data['mase_coefficient'].float().to(self.device)) + + batch_size = past_target.shape[0] + + future_observed_targets = future_targets["future_observed_targets"].to(self.device) + future_targets_values = future_targets["future_targets"] + + future_targets_values = self.cast_targets(future_targets_values) + + past_target, criterion_kwargs = self.data_preparation(past_target, future_targets_values) + + if isinstance(self.model, (ForecastingDeepARNet, ForecastingSeq2SeqNet)): + outputs = self.model(past_targets=past_target, + past_features=past_features, + future_targets=future_targets_values, + future_features=future_features, + past_observed_targets=past_observed_targets) + else: + outputs = self.model(past_targets=past_target, + past_features=past_features, + future_features=future_features, + past_observed_targets=past_observed_targets) + + # prepare + future_targets_values = future_targets_values.to(self.device) + + if isinstance(outputs, list) and self.model.output_type != 'quantile': + losses = [self.criterion(output, future_targets_values) for output in outputs] + 
                    loss = torch.mean(torch.Tensor(losses) * future_observed_targets)
+                else:
+                    loss = torch.mean(self.criterion(outputs, future_targets_values) * future_observed_targets)
+                outputs = self.model.pred_from_net_output(outputs)
+                outputs = outputs.detach().cpu()
+
+                loss_sum += loss.item() * batch_size
+                N += batch_size
+
+                outputs_data.append(outputs)
+                targets_data.append(future_targets_values.detach().cpu())
+
+                if writer:
+                    writer.add_scalar(
+                        'Val/loss',
+                        loss.item(),
+                        epoch * len(test_loader) + step,
+                    )
+
+        # mase_coefficient has the shape [B, 1, 1]
+        # to be compatible with outputs_data with shape [B, n_prediction_steps, num_output]
+        mase_coefficients = np.expand_dims(torch.cat(mase_coefficients, dim=0).numpy(), axis=[1])
+        self.metrics_kwargs.update({'mase_coefficient': mase_coefficients})
+
+        self._scheduler_step(step_interval=StepIntervalUnit.valid, loss=loss_sum / N)
+
+        self.model.train()
+        return loss_sum / N, self.compute_metrics(outputs_data, targets_data)
+
+    def compute_metrics(self, outputs_data: List[torch.Tensor], targets_data: List[torch.Tensor]
+                        ) -> Dict[str, float]:
+        # TODO: change once Ravin provides the PR
+        outputs_data = torch.cat(outputs_data, dim=0).numpy()
+        targets_data = torch.cat(targets_data, dim=0).numpy()
+
+        return calculate_score(targets_data, outputs_data, self.task_type, self.metrics, **self.metrics_kwargs)
+
+    def cast_targets(self, targets: torch.Tensor) -> torch.Tensor:
+        """
+        This function is quite similar to the base class implementation, except that we do not move targets to
+        self.device
+        """
+        if self.task_type in (REGRESSION_TASKS + FORECASTING_TASKS):
+            targets = targets.float()
+            # make sure that targets will have same shape as outputs (really important for mse loss for example)
+            if targets.ndim == 1:
+                targets = targets.unsqueeze(1)
+        else:
+            targets = targets.long()
+        return targets
diff --git a/autoPyTorch/pipeline/create_searchspace_util.py b/autoPyTorch/pipeline/create_searchspace_util.py
index f66371917..640a787e2 100644
--- a/autoPyTorch/pipeline/create_searchspace_util.py
+++ b/autoPyTorch/pipeline/create_searchspace_util.py
@@ -47,7 +47,7 @@ def get_match_array(
     matches_dimensions = [len(choices) for choices in node_i_choices]
     # Start by allowing every combination of nodes. Go through all
     # combinations/pipelines and erase the illegal ones
-    matches = np.ones(matches_dimensions, dtype=int)
+    matches = np.ones(matches_dimensions, dtype=np.int32)

     # TODO: Check if we need this, like are there combinations from the
     # pipeline we should dynamically avoid?
diff --git a/autoPyTorch/pipeline/tabular_classification.py b/autoPyTorch/pipeline/tabular_classification.py
index b95de512e..720d0af64 100644
--- a/autoPyTorch/pipeline/tabular_classification.py
+++ b/autoPyTorch/pipeline/tabular_classification.py
@@ -19,6 +19,9 @@
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.TabularColumnTransformer import (
     TabularColumnTransformer
 )
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer import (
+    CoalescerChoice
+)
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding import (
     EncoderChoice
 )
@@ -27,6 +30,8 @@
 )
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.imputation.SimpleImputer import SimpleImputer
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling import ScalerChoice
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.variance_thresholding. 
\ + VarianceThreshold import VarianceThreshold from autoPyTorch.pipeline.components.setup.early_preprocessor.EarlyPreprocessing import EarlyPreprocessing from autoPyTorch.pipeline.components.setup.lr_scheduler import SchedulerChoice from autoPyTorch.pipeline.components.setup.network.base_network import NetworkComponent @@ -307,6 +312,8 @@ def _get_pipeline_steps( steps.extend([ ("imputer", SimpleImputer(random_state=self.random_state)), + ("variance_threshold", VarianceThreshold(random_state=self.random_state)), + ("coalescer", CoalescerChoice(default_dataset_properties, random_state=self.random_state)), ("encoder", EncoderChoice(default_dataset_properties, random_state=self.random_state)), ("scaler", ScalerChoice(default_dataset_properties, random_state=self.random_state)), ("feature_preprocessor", FeatureProprocessorChoice(default_dataset_properties, diff --git a/autoPyTorch/pipeline/tabular_regression.py b/autoPyTorch/pipeline/tabular_regression.py index 57d0126d0..06da9cabb 100644 --- a/autoPyTorch/pipeline/tabular_regression.py +++ b/autoPyTorch/pipeline/tabular_regression.py @@ -19,6 +19,9 @@ from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.TabularColumnTransformer import ( TabularColumnTransformer ) +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer import ( + CoalescerChoice +) from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding import ( EncoderChoice ) @@ -27,6 +30,8 @@ ) from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.imputation.SimpleImputer import SimpleImputer from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling import ScalerChoice +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.variance_thresholding. 
\ + VarianceThreshold import VarianceThreshold from autoPyTorch.pipeline.components.setup.early_preprocessor.EarlyPreprocessing import EarlyPreprocessing from autoPyTorch.pipeline.components.setup.lr_scheduler import SchedulerChoice from autoPyTorch.pipeline.components.setup.network.base_network import NetworkComponent @@ -257,6 +262,8 @@ def _get_pipeline_steps( steps.extend([ ("imputer", SimpleImputer(random_state=self.random_state)), + ("variance_threshold", VarianceThreshold(random_state=self.random_state)), + ("coalescer", CoalescerChoice(default_dataset_properties, random_state=self.random_state)), ("encoder", EncoderChoice(default_dataset_properties, random_state=self.random_state)), ("scaler", ScalerChoice(default_dataset_properties, random_state=self.random_state)), ("feature_preprocessor", FeatureProprocessorChoice(default_dataset_properties, diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py new file mode 100644 index 000000000..53143e4df --- /dev/null +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -0,0 +1,441 @@ +import copy +import warnings +from typing import Any, Dict, List, Optional, Tuple, Union + +from ConfigSpace.configuration_space import Configuration, ConfigurationSpace +from ConfigSpace.forbidden import ( + ForbiddenAndConjunction, + ForbiddenEqualsClause, + ForbiddenInClause +) + +import numpy as np + +import pandas as pd + +from sklearn.base import RegressorMixin + +import torch + +from autoPyTorch.constants import STRING_TO_TASK_TYPES +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.datasets.time_series_dataset import TimeSeriesSequence +from autoPyTorch.pipeline.base_pipeline import BasePipeline +from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice +from autoPyTorch.pipeline.components.base_component import autoPyTorchComponent +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.TimeSeriesTransformer import ( + TimeSeriesFeatureTransformer +) +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.encoding import TimeSeriesEncoderChoice +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.imputation.TimeSeriesImputer import ( + TimeSeriesFeatureImputer, + TimeSeriesTargetImputer +) +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.base_scaler import ( + BaseScaler +) +from autoPyTorch.pipeline.components.setup.early_preprocessor.TimeSeriesEarlyPreProcessing import ( + TimeSeriesEarlyPreprocessing, + TimeSeriesTargetEarlyPreprocessing +) +from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.base_target_scaler import BaseTargetScaler +from autoPyTorch.pipeline.components.setup.forecasting_training_loss import ForecastingLossChoices +from autoPyTorch.pipeline.components.setup.lr_scheduler import SchedulerChoice +from autoPyTorch.pipeline.components.setup.network.forecasting_network import ForecastingNetworkComponent +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone import ForecastingNetworkChoice +from autoPyTorch.pipeline.components.setup.network_embedding import NetworkEmbeddingChoice +from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.forecasting_head import ForecastingHead +from autoPyTorch.pipeline.components.setup.network_initializer import NetworkInitializerChoice +from autoPyTorch.pipeline.components.setup.optimizer import OptimizerChoice 
+from autoPyTorch.pipeline.components.training.data_loader.time_series_forecasting_data_loader import (
+    TimeSeriesForecastingDataLoader
+)
+from autoPyTorch.pipeline.components.training.trainer.forecasting_trainer import ForecastingTrainerChoice
+from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates
+
+
+class TimeSeriesForecastingPipeline(RegressorMixin, BasePipeline):
+    """This class is a proof of concept to integrate AutoPyTorch components.
+
+    It implements a pipeline, which includes as steps:
+
+        -> One preprocessing step
+        -> One neural network
+
+    Contrary to the sklearn API it is not possible to enumerate the
+    possible parameters in the __init__ function because we only know the
+    available regressors at runtime. For this reason the user must
+    specify the parameters by passing an instance of
+    ConfigSpace.configuration_space.Configuration.
+
+    Args:
+        config (Configuration):
+            The configuration to evaluate.
+        random_state (Optional[np.random.RandomState]):
+            random_state is the random number generator
+
+    Attributes:
+    """
+
+    def __init__(self,
+                 config: Optional[Configuration] = None,
+                 steps: Optional[List[Tuple[str, Union[autoPyTorchComponent, autoPyTorchChoice]]]] = None,
+                 dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None,
+                 include: Optional[Dict[str, Any]] = None,
+                 exclude: Optional[Dict[str, Any]] = None,
+                 random_state: Optional[np.random.RandomState] = None,
+                 init_params: Optional[Dict[str, Any]] = None,
+                 search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None,
+                 ):
+        BasePipeline.__init__(self,
+                              config, steps, dataset_properties, include, exclude,
+                              random_state, init_params, search_space_updates)
+
+        # Because a pipeline is passed to a worker, we need to honor the random seed
+        # in this context. A tabular regression pipeline will implement a torch
+        # model, so we comply with https://pytorch.org/docs/stable/notes/randomness.html
+        torch.manual_seed(self.random_state.get_state()[1][0])
+
+    def score(self, X: List[Union[np.ndarray, pd.DataFrame, TimeSeriesSequence]],
+              y: np.ndarray, batch_size: Optional[int] = None, **score_kwargs: Any) -> float:
+        """Scores the fitted estimator on (X, y)
+
+        Args:
+            X (List[Union[np.ndarray, pd.DataFrame, TimeSeriesSequence]]):
+                input to the pipeline, from which to guess targets
+            y (np.ndarray):
+                ground truth target values
+            batch_size (Optional[int]):
+                batch_size controls whether the pipeline will be called on small chunks of the data.
+                Useful when calling the predict method on the whole array X results in a MemoryError.
+
+        Returns:
+            float:
+                mean MAPE forecasting score of the prediction
+        """
+        from autoPyTorch.pipeline.components.training.metrics.utils import (
+            calculate_score, get_metrics)
+        metrics = get_metrics(self.dataset_properties, ['mean_MAPE_forecasting'])
+        y_pred = self.predict(X, batch_size=batch_size)  # type: ignore[arg-type]
+        score = calculate_score(y, y_pred, task_type=STRING_TO_TASK_TYPES[str(self.dataset_properties['task_type'])],
+                                metrics=metrics, **score_kwargs)['mean_MAPE_forecasting']
+        return score
+
+    def _get_hyperparameter_search_space(self,
+                                         dataset_properties: Dict[str, Any],
+                                         include: Optional[Dict[str, Any]] = None,
+                                         exclude: Optional[Dict[str, Any]] = None,
+                                         ) -> ConfigurationSpace:
+        """Create the hyperparameter configuration space.
+
+        For the given steps, and the Choices within those steps,
+        this procedure returns a configuration space object to
+        explore.
+
+        Args:
+            include (Optional[Dict[str, Any]]):
+                what hyper-parameter configurations to honor when creating the configuration space
+            exclude (Optional[Dict[str, Any]]):
+                what hyper-parameter configurations to remove from the configuration space
+            dataset_properties (Optional[Dict[str, Union[str, int]]]):
+                Characteristics of the dataset to guide the pipeline choices of components
+
+        Returns:
+            cs (ConfigurationSpace):
+                The configuration space describing the TimeSeriesForecastingPipeline.
+        """
+        cs = ConfigurationSpace()
+
+        if not isinstance(dataset_properties, dict):
+            warnings.warn('The given dataset_properties argument contains an illegal value. '
+                          'Proceeding with the default value')
+            dataset_properties = dict()
+
+        if 'target_type' not in dataset_properties:
+            dataset_properties['target_type'] = 'time_series_forecasting'
+        if dataset_properties['target_type'] != 'time_series_forecasting':
+            warnings.warn('Time series forecasting is being used, however the target_type '
+                          'is not given as "time_series_forecasting". Overriding it.')
+            dataset_properties['target_type'] = 'time_series_forecasting'
+        # get the base search space given these
+        # dataset properties. Then overwrite with custom
+        # regression requirements
+        cs = self._get_base_search_space(
+            cs=cs, dataset_properties=dataset_properties,
+            exclude=exclude, include=include, pipeline=self.steps)
+
+        # Here we add custom code, e.g. to forbid combinations that are
+        # not a valid configuration
+        # Learned Entity Embedding is only valid when encoder is one hot encoder
+        if 'network_embedding' in self.named_steps.keys():
+            embeddings = cs.get_hyperparameter('network_embedding:__choice__').choices
+            if 'LearnedEntityEmbedding' in embeddings:
+                if 'feature_encoding' in self.named_steps.keys():
+                    feature_encodings = cs.get_hyperparameter('feature_encoding:__choice__').choices
+                    default = cs.get_hyperparameter('network_embedding:__choice__').default_value
+                    possible_default_embeddings = copy.copy(list(embeddings))
+                    del possible_default_embeddings[possible_default_embeddings.index(default)]
+
+                    for encoding in feature_encodings:
+                        if encoding == 'OneHotEncoder':
+                            continue
+                        while True:
+                            try:
+                                cs.add_forbidden_clause(ForbiddenAndConjunction(
+                                    ForbiddenEqualsClause(cs.get_hyperparameter(
+                                        'network_embedding:__choice__'), 'LearnedEntityEmbedding'),
+                                    ForbiddenEqualsClause(
+                                        cs.get_hyperparameter('feature_encoding:__choice__'), encoding
+                                    )))
+                                break
+                            except ValueError:
+                                # change the default and try again
+                                try:
+                                    default = possible_default_embeddings.pop()
+                                except IndexError:
+                                    raise ValueError("Cannot find a legal default configuration")
+                                cs.get_hyperparameter('network_embedding:__choice__').default_value = default
+
+        if 'network_backbone:flat_encoder:__choice__' in cs:
+            hp_flat_encoder = cs.get_hyperparameter('network_backbone:flat_encoder:__choice__')
+            if 'NBEATSEncoder' in hp_flat_encoder.choices:
+                cs.add_forbidden_clause(ForbiddenAndConjunction(
+                    ForbiddenEqualsClause(hp_flat_encoder, 'NBEATSEncoder'),
+                    ForbiddenEqualsClause(cs.get_hyperparameter(
+                        'network_embedding:__choice__'), 'LearnedEntityEmbedding'))
+                )
+
+        # dist_cls and auto_regressive are only active if the network outputs a distribution
+        if 'loss' in self.named_steps.keys() and 'network_backbone' in self.named_steps.keys():
+            hp_loss = cs.get_hyperparameter('loss:__choice__')
+
+            ar_forbidden = True
+
+            hp_deepAR = []
+            for hp_name in cs.get_hyperparameter_names():
+                if hp_name.startswith('network_backbone:'):
+                    if hp_name.endswith(':auto_regressive'):
+
hp_deepAR.append(cs.get_hyperparameter(hp_name)) + + # DeepAR + forbidden_losses_all = [] + losses_non_ar = [] + for loss in hp_loss.choices: + if loss != "DistributionLoss": + losses_non_ar.append(loss) + if losses_non_ar: + forbidden_hp_regression_loss = ForbiddenInClause(hp_loss, losses_non_ar) + for hp_ar in hp_deepAR: + if True in hp_ar.choices: + forbidden_hp_dist = ForbiddenEqualsClause(hp_ar, ar_forbidden) + forbidden_hp_dist = ForbiddenAndConjunction(forbidden_hp_dist, forbidden_hp_regression_loss) + forbidden_losses_all.append(forbidden_hp_dist) + + if "network_backbone:seq_encoder:decoder_auto_regressive" in cs: + decoder_auto_regressive = cs.get_hyperparameter("network_backbone:seq_encoder:decoder_auto_regressive") + forecast_strategy = cs.get_hyperparameter("loss:DistributionLoss:forecast_strategy") + use_tf = cs.get_hyperparameter("network_backbone:seq_encoder:use_temporal_fusion") + + if True in decoder_auto_regressive.choices and\ + 'sample' in forecast_strategy.choices and True in use_tf.choices: + cs.add_forbidden_clause( + ForbiddenAndConjunction( + ForbiddenEqualsClause(decoder_auto_regressive, True), + ForbiddenEqualsClause(forecast_strategy, 'sample'), + ForbiddenEqualsClause(use_tf, True) + ) + ) + + if 'network_backbone:flat_encoder:__choice__' in cs: + network_flat_encoder_hp = cs.get_hyperparameter('network_backbone:flat_encoder:__choice__') + + if 'MLPEncoder' in network_flat_encoder_hp.choices: + forbidden = ['MLPEncoder'] + forbidden_deepAREncoder = [ + forbid for forbid in forbidden if forbid in network_flat_encoder_hp.choices + ] + for hp_ar in hp_deepAR: + if True in hp_ar.choices: + forbidden_hp_ar = ForbiddenEqualsClause(hp_ar, ar_forbidden) + forbidden_hp_mlpencoder = ForbiddenInClause(network_flat_encoder_hp, + forbidden_deepAREncoder) + forbidden_hp_ar_mlp = ForbiddenAndConjunction(forbidden_hp_ar, forbidden_hp_mlpencoder) + forbidden_losses_all.append(forbidden_hp_ar_mlp) + + if 'loss:DistributionLoss:forecast_strategy' in cs: + forecast_strategy = cs.get_hyperparameter('loss:DistributionLoss:forecast_strategy') + if 'mean' in forecast_strategy.choices: + for hp_ar in hp_deepAR: + if True in hp_ar.choices: + + forbidden_hp_ar = ForbiddenEqualsClause(hp_ar, ar_forbidden) + forbidden_hp_forecast_strategy = ForbiddenEqualsClause(forecast_strategy, 'mean') + forbidden_hp_ar_forecast_strategy = ForbiddenAndConjunction(forbidden_hp_ar, + forbidden_hp_forecast_strategy) + forbidden_losses_all.append(forbidden_hp_ar_forecast_strategy) + if forbidden_losses_all: + cs.add_forbidden_clauses(forbidden_losses_all) + + # NBEATS + network_encoder_hp = cs.get_hyperparameter("network_backbone:__choice__") + forbidden_NBEATS = [] + encoder_non_flat = [choice for choice in network_encoder_hp.choices if choice != 'flat_encoder'] + loss_non_regression = [choice for choice in hp_loss.choices if choice != 'RegressionLoss'] + data_loader_backcast = cs.get_hyperparameter('data_loader:backcast') + + forbidden_encoder_non_flat = ForbiddenInClause(network_encoder_hp, encoder_non_flat) + forbidden_loss_non_regression = ForbiddenInClause(hp_loss, loss_non_regression) + forbidden_backcast = ForbiddenEqualsClause(data_loader_backcast, True) + + if 'network_backbone:flat_encoder:__choice__' in cs: + hp_flat_encoder = cs.get_hyperparameter("network_backbone:flat_encoder:__choice__") + + # Ensure that NBEATS encoder only works with NBEATS decoder + if 'NBEATSEncoder' in hp_flat_encoder.choices: + forbidden_NBEATS.append(ForbiddenAndConjunction( + ForbiddenEqualsClause(hp_flat_encoder, 
'NBEATSEncoder'),
+                    forbidden_loss_non_regression)
+                )
+                transform_time_features = "data_loader:transform_time_features"
+                if transform_time_features in cs:
+                    hp_ttf = cs.get_hyperparameter(transform_time_features)
+                    forbidden_NBEATS.append(ForbiddenAndConjunction(
+                        ForbiddenEqualsClause(hp_flat_encoder, 'NBEATSEncoder'),
+                        ForbiddenEqualsClause(hp_ttf, True))
+                    )
+
+        forbidden_NBEATS.append(ForbiddenAndConjunction(
+            forbidden_backcast,
+            forbidden_encoder_non_flat
+        ))
+
+        cs.add_forbidden_clauses(forbidden_NBEATS)
+
+        self.configuration_space = cs
+        self.dataset_properties = dataset_properties
+        return cs
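The search-space construction above leans heavily on ConfigSpace forbidden clauses; a minimal standalone sketch of the mechanism, with hypothetical hyperparameter names and choices:

```python
from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.forbidden import ForbiddenAndConjunction, ForbiddenEqualsClause
from ConfigSpace.hyperparameters import CategoricalHyperparameter

cs = ConfigurationSpace()
encoder = CategoricalHyperparameter('flat_encoder', ['MLPEncoder', 'NBEATSEncoder'])
loss = CategoricalHyperparameter('loss', ['RegressionLoss', 'DistributionLoss'])
cs.add_hyperparameters([encoder, loss])

# declare the (NBEATSEncoder, DistributionLoss) combination illegal, so the
# optimizer never samples or evaluates that pair
cs.add_forbidden_clause(ForbiddenAndConjunction(
    ForbiddenEqualsClause(encoder, 'NBEATSEncoder'),
    ForbiddenEqualsClause(loss, 'DistributionLoss'),
))

print(cs.sample_configuration(5))  # none of the samples contain the forbidden pair
```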
+    def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]]) -> List[Tuple[str, autoPyTorchChoice]]:
+        """
+        Defines what steps a pipeline should follow.
+        The step itself has choices given via autoPyTorchChoice.
+        One key difference between the forecasting pipeline and the others is that we put
+        "data_loader" before "network_backbone", so that the network components can be
+        built with the window size and sequence information determined by the data loader.
+
+        Returns:
+            List[Tuple[str, autoPyTorchChoice]]:
+                list of steps sequentially exercised by the pipeline.
+        """
+        steps = []  # type: List[Tuple[str, autoPyTorchChoice]]
+
+        default_dataset_properties: Dict[str, BaseDatasetPropertiesType] = {'target_type': 'time_series_prediction'}
+        if dataset_properties is not None:
+            default_dataset_properties.update(dataset_properties)
+
+        if not default_dataset_properties.get("uni_variant", False):
+            steps.extend([("impute", TimeSeriesFeatureImputer(random_state=self.random_state)),
+                          ("scaler", BaseScaler(random_state=self.random_state)),
+                          ('feature_encoding', TimeSeriesEncoderChoice(default_dataset_properties,
+                                                                       random_state=self.random_state)),
+                          ("time_series_transformer", TimeSeriesFeatureTransformer(random_state=self.random_state)),
+                          ("preprocessing", TimeSeriesEarlyPreprocessing(random_state=self.random_state)),
+                          ])
+
+        steps.extend([
+            ("target_imputer", TimeSeriesTargetImputer(random_state=self.random_state)),
+            ("target_preprocessing", TimeSeriesTargetEarlyPreprocessing(random_state=self.random_state)),
+            ('loss', ForecastingLossChoices(default_dataset_properties, random_state=self.random_state)),
+            ("target_scaler", BaseTargetScaler(random_state=self.random_state)),
+            ("data_loader", TimeSeriesForecastingDataLoader(random_state=self.random_state)),
+            ("network_embedding", NetworkEmbeddingChoice(default_dataset_properties,
+                                                         random_state=self.random_state)),
+            ("network_backbone", ForecastingNetworkChoice(dataset_properties=default_dataset_properties,
+                                                          random_state=self.random_state)),
+            ("network_head", ForecastingHead(random_state=self.random_state)),
+            ("network", ForecastingNetworkComponent(random_state=self.random_state)),
+            ("network_init", NetworkInitializerChoice(default_dataset_properties,
+                                                      random_state=self.random_state)),
+            ("optimizer", OptimizerChoice(default_dataset_properties,
+                                          random_state=self.random_state)),
+            ("lr_scheduler", SchedulerChoice(default_dataset_properties,
+                                             random_state=self.random_state)),
+            ("trainer", ForecastingTrainerChoice(default_dataset_properties, random_state=self.random_state)),
+        ])
+        return steps
+
+    def get_pipeline_representation(self) -> Dict[str, str]:
+        """
+        Returns a representation of the pipeline, so that it can be
+        consumed and formatted by the API.
+
+        It should be a representation that follows:
+        {'Preprocessing': <>, 'Estimator': <>}
+
+        Returns:
+            Dict: contains the pipeline representation in a short format
+        """
+        preprocessing: List[str] = []
+        estimator: List[str] = []
+        skip_steps = ['data_loader', 'trainer', 'lr_scheduler', 'optimizer', 'network_init',
+                      'preprocessing', 'time_series_transformer']
+        for step_name, step_component in self.steps:
+            if step_name in skip_steps:
+                continue
+            properties: Dict[str, Any] = {}
+            if isinstance(step_component, autoPyTorchChoice) and step_component.choice is not None:
+                properties = step_component.choice.get_properties()
+            elif isinstance(step_component, autoPyTorchComponent):
+                properties = step_component.get_properties()
+            if 'shortname' in properties:
+                if 'network' in step_name:
+                    estimator.append(properties['shortname'])
+                else:
+                    preprocessing.append(properties['shortname'])
+        return {
+            'Preprocessing': ','.join(preprocessing),
+            'Estimator': ','.join(estimator),
+        }
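A self-contained toy illustration of the representation format assembled above (the shortnames are hypothetical):

```python
# hypothetical shortnames, purely to illustrate the output format
preprocessing = ['TimeSeriesImputer', 'StandardScaler']
estimator = ['RNNEncoder', 'ForecastingNet']

representation = {
    'Preprocessing': ','.join(preprocessing),
    'Estimator': ','.join(estimator),
}
print(representation)
# {'Preprocessing': 'TimeSeriesImputer,StandardScaler', 'Estimator': 'RNNEncoder,ForecastingNet'}
```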
" + "Will use 1000 instead") + batch_size = 1000 + + loader = self.named_steps['data_loader'].get_loader(X=X, batch_size=batch_size) + try: + return self.named_steps['network'].predict(loader).flatten() + except Exception as e: + # https://github.com/pytorch/fairseq/blob/50a671f78d0c8de0392f924180db72ac9b41b801/fairseq/trainer.py#L283 + if 'out of memory' in str(e): + if batch_size <= 1: + raise e + warnings.warn('| WARNING: ran out of memory, retrying batch') + torch.cuda.empty_cache() + batch_size = batch_size // 2 + return self.predict(X, batch_size=batch_size // 2).flatten() + else: + raise e diff --git a/autoPyTorch/py.typed b/autoPyTorch/py.typed new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/autoPyTorch/py.typed @@ -0,0 +1 @@ + diff --git a/autoPyTorch/utils/common.py b/autoPyTorch/utils/common.py index 7be8a233c..77f250164 100644 --- a/autoPyTorch/utils/common.py +++ b/autoPyTorch/utils/common.py @@ -1,4 +1,5 @@ -from typing import Any, Dict, Iterable, List, NamedTuple, Optional, Sequence, Type, Union +from enum import Enum +from typing import Any, Callable, Dict, Iterable, List, NamedTuple, Optional, Sequence, Type, Union from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import ( @@ -13,7 +14,7 @@ import pandas as pd -import scipy.sparse +from scipy.sparse import spmatrix import torch from torch.utils.data.dataloader import default_collate @@ -21,6 +22,11 @@ HyperparameterValueType = Union[int, str, float] +def ispandas(X: Any) -> bool: + """ Whether X is pandas.DataFrame or pandas.Series """ + return hasattr(X, "iloc") + + class FitRequirement(NamedTuple): """ A class that holds inputs required to fit a pipeline. Also indicates whether @@ -75,7 +81,31 @@ def __str__(self) -> str: self.hyperparameter, self.value_range, self.default_value, self.log) -def custom_collate_fn(batch: List) -> List[Optional[torch.Tensor]]: +class autoPyTorchEnum(str, Enum): + """ + Utility class for enums in autoPyTorch. + Allows users to use strings, while we internally use + this enum + """ + def __eq__(self, other: Any) -> bool: + if isinstance(other, autoPyTorchEnum): + return type(self) == type(other) and self.value == other.value + elif isinstance(other, str): + return bool(self.value == other) + else: + enum_name = self.__class__.__name__ + raise RuntimeError(f"Unsupported type {type(other)}. " + f"{enum_name} only supports `str` and" + f"`{enum_name}`") + + def __hash__(self) -> int: + return hash(self.value) + + def __str__(self) -> str: + return str(self.value) + + +def custom_collate_fn(batch: List, x_collector: Callable = default_collate) -> List[Optional[torch.Tensor]]: """ In the case of not providing a y tensor, in a dataset of form {X, y}, y would be None. @@ -86,6 +116,8 @@ def custom_collate_fn(batch: List) -> List[Optional[torch.Tensor]]: Args: batch (List): a batch from a dataset + x_collector (callable): how the data is collected, e.g., when one want to pad sequences with different lengths. + collate is only applied to X, for y, the normal default_collate is applied. 
+
+
+def custom_collate_fn(batch: List, x_collector: Callable = default_collate) -> List[Optional[torch.Tensor]]:
     """
     In the case of not providing a y tensor,
     in a dataset of form {X, y}, y would be None.
@@ -86,6 +116,8 @@
     Args:
         batch (List): a batch from a dataset
+        x_collector (Callable): how X is collected, e.g., when one wants to pad sequences with different lengths.
+            x_collector is only applied to X; for y, the normal default_collate is applied.

     Returns:
         List[Optional[torch.Tensor]]
@@ -94,7 +126,7 @@
     items = list(zip(*batch))

     # The feature will always be available
-    items[0] = default_collate(items[0])
+    items[0] = x_collector(items[0])
     if None in items[1]:
         items[1] = list(items[1])
     else:
@@ -146,10 +178,10 @@ def get_device_from_fit_dictionary(X: Dict[str, Any]) -> torch.device:
     return torch.device(X.get("device", "cpu"))


-def subsampler(data: Union[np.ndarray, pd.DataFrame, scipy.sparse.csr_matrix],
+def subsampler(data: Union[np.ndarray, pd.DataFrame, spmatrix],
                x: Union[np.ndarray, List[int]]
-               ) -> Union[np.ndarray, pd.DataFrame, scipy.sparse.csr_matrix]:
-    return data[x] if isinstance(data, (np.ndarray, scipy.sparse.csr_matrix)) else data.iloc[x]
+               ) -> Union[np.ndarray, pd.DataFrame, spmatrix]:
+    return data[x] if isinstance(data, (np.ndarray, spmatrix)) else data.iloc[x]


 def get_hyperparameter(hyperparameter: HyperparameterSearchSpace,
@@ -214,3 +246,20 @@ def add_hyperparameter(cs: ConfigurationSpace,
         None
     """
     cs.add_hyperparameter(get_hyperparameter(hyperparameter, hyperparameter_type))
+
+
+def check_none(p: Any) -> bool:
+    """
+    utility function to check if `p` is None.
+
+    Args:
+        p (Any):
+            variable to check
+
+    Returns:
+        bool:
+            True, if `p` is in (None, "none", "None")
+    """
+    if p in ("None", "none", None):
+        return True
+    return False
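A plausible `x_collector` for the forecasting case is sequence padding; a minimal sketch, assuming the `pad_collector` helper below (it is not part of the library):

```python
import torch
from torch.nn.utils.rnn import pad_sequence

from autoPyTorch.utils.common import custom_collate_fn


def pad_collector(xs):
    # pad variable-length sequences with zeros up to the longest in the batch
    return pad_sequence([torch.as_tensor(x) for x in xs], batch_first=True)


batch = [(torch.ones(3, 2), 0.5), (torch.ones(5, 2), 1.5)]
X, y = custom_collate_fn(batch, x_collector=pad_collector)
print(X.shape)  # torch.Size([2, 5, 2]); the first sequence is zero-padded
```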
diff --git a/autoPyTorch/utils/implementations.py b/autoPyTorch/utils/implementations.py
index a0b020622..4b699e3c3 100644
--- a/autoPyTorch/utils/implementations.py
+++ b/autoPyTorch/utils/implementations.py
@@ -1,7 +1,11 @@
-from typing import Any, Callable, Dict, Type, Union
+from typing import Any, Callable, Dict, List, Optional, Type, Union

 import numpy as np

+from scipy import sparse
+
+from sklearn.base import BaseEstimator, TransformerMixin
+
 import torch

@@ -59,3 +63,124 @@
     @staticmethod
     def get_properties() -> Dict[str, Any]:
         return {'supported_losses': ['BCEWithLogitsLoss']}
+
+
+class MinorityCoalesceTransformer(BaseEstimator, TransformerMixin):
+    """ Group together categories whose occurrence is less than a specified min_frac."""
+    def __init__(self, min_frac: Optional[float] = None):
+        self.min_frac = min_frac
+        self._categories_to_coalesce: Optional[List[np.ndarray]] = None
+
+        if self.min_frac is not None and (self.min_frac < 0 or self.min_frac > 1):
+            raise ValueError(f"min_frac for {self.__class__.__name__} must be in [0, 1], but got {min_frac}")
+
+    def _check_dataset(self, X: Union[np.ndarray, sparse.csr_matrix]) -> None:
+        """
+        When transforming datasets, we modify values to:
+        * 0 for nan values
+        * -1 for unknown values
+        * -2 for values to be coalesced
+        For this reason, we need to check whether datasets have values
+        smaller than -2 to avoid mis-transformation.
+        Note that zero-imputation is the default setting in SimpleImputer of sklearn.
+
+        Args:
+            X (np.ndarray):
+                The input features from the user, likely transformed by an encoder and imputer.
+        """
+        X_data = X.data if sparse.issparse(X) else X
+        if np.nanmin(X_data) <= -2:
+            raise ValueError("The categoricals in input features for MinorityCoalesceTransformer "
+                             "cannot have integers smaller than -2.")
+
+    @staticmethod
+    def _get_column_data(
+        X: Union[np.ndarray, sparse.csr_matrix],
+        col_idx: int,
+        is_sparse: bool
+    ) -> Union[np.ndarray, sparse.csr_matrix]:
+        """
+        Args:
+            X (Union[np.ndarray, sparse.csr_matrix]):
+                The feature tensor with only categoricals.
+            col_idx (int):
+                The index of the column to get the data.
+            is_sparse (bool):
+                Whether the tensor is sparse or not.
+
+        Return:
+            col_data (Union[np.ndarray, sparse.csr_matrix]):
+                The column data of the tensor.
+        """
+
+        if is_sparse:
+            assert not isinstance(X, np.ndarray)  # mypy check
+            indptr_start = X.indptr[col_idx]
+            indptr_end = X.indptr[col_idx + 1]
+            col_data = X.data[indptr_start:indptr_end]
+        else:
+            col_data = X[:, col_idx]
+
+        return col_data
+
+    def fit(self, X: Union[np.ndarray, sparse.csr_matrix],
+            y: Optional[np.ndarray] = None) -> 'MinorityCoalesceTransformer':
+        """
+        Train the estimator to identify low frequency classes on the input train data.
+
+        Args:
+            X (Union[np.ndarray, sparse.csr_matrix]):
+                The input features from the user, likely transformed by an encoder and imputer.
+            y (Optional[np.ndarray]):
+                Optional labels for the given task, not used by this estimator.
+        """
+        self._check_dataset(X)
+        n_instances, n_features = X.shape
+
+        if self.min_frac is None:
+            self._categories_to_coalesce = [np.array([]) for _ in range(n_features)]
+            return self
+
+        categories_to_coalesce: List[np.ndarray] = []
+        is_sparse = sparse.issparse(X)
+        for col in range(n_features):
+            col_data = self._get_column_data(X=X, col_idx=col, is_sparse=is_sparse)
+            unique_vals, counts = np.unique(col_data, return_counts=True)
+            frac = counts / n_instances
+            categories_to_coalesce.append(unique_vals[frac < self.min_frac])
+
+        self._categories_to_coalesce = categories_to_coalesce
+        return self
+
+    def transform(
+        self,
+        X: Union[np.ndarray, sparse.csr_matrix]
+    ) -> Union[np.ndarray, sparse.csr_matrix]:
+        """
+        Coalesce categories with low frequency in X.
+
+        Args:
+            X (Union[np.ndarray, sparse.csr_matrix]):
+                The input features from the user, likely transformed by an encoder and imputer.
+        """
+        self._check_dataset(X)
+
+        if self._categories_to_coalesce is None:
+            raise RuntimeError("fit() must be called before transform()")
+
+        if self.min_frac is None:
+            return X
+
+        n_features = X.shape[1]
+        is_sparse = sparse.issparse(X)
+
+        for col in range(n_features):
+            # -2 stands for coalesced.
For more details, see the doc in _check_dataset + col_data = self._get_column_data(X=X, col_idx=col, is_sparse=is_sparse) + mask = np.isin(col_data, self._categories_to_coalesce[col]) + col_data[mask] = -2 + + return X + + def fit_transform(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> np.ndarray: + return self.fit(X, y).transform(X) diff --git a/autoPyTorch/utils/pipeline.py b/autoPyTorch/utils/pipeline.py index f3cc07003..3a92e3cf3 100644 --- a/autoPyTorch/utils/pipeline.py +++ b/autoPyTorch/utils/pipeline.py @@ -4,6 +4,9 @@ from ConfigSpace.configuration_space import ConfigurationSpace from autoPyTorch.constants import ( + CLASSIFICATION_TASKS, + FORECASTING_TASKS, + ForecastingDependenciesNotInstalledMSG, IMAGE_TASKS, REGRESSION_TASKS, STRING_TO_TASK_TYPES, @@ -12,6 +15,11 @@ from autoPyTorch.pipeline.image_classification import ImageClassificationPipeline from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline from autoPyTorch.pipeline.tabular_regression import TabularRegressionPipeline +try: + from autoPyTorch.pipeline.time_series_forecasting import TimeSeriesForecastingPipeline + forecasting_dependencies_installed = True +except ModuleNotFoundError: + forecasting_dependencies_installed = False from autoPyTorch.utils.common import FitRequirement from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates @@ -63,12 +71,20 @@ def get_dataset_requirements(info: Dict[str, Any], exclude if exclude is not None else {}, search_space_updates=search_space_updates ) - else: + elif task_type in CLASSIFICATION_TASKS: return _get_classification_dataset_requirements(info, include if include is not None else {}, exclude if exclude is not None else {}, search_space_updates=search_space_updates ) + else: + if not forecasting_dependencies_installed: + raise ModuleNotFoundError(ForecastingDependenciesNotInstalledMSG) + return _get_forecasting_dataset_requirements(info, + include if include is not None else {}, + exclude if exclude is not None else {}, + search_space_updates=search_space_updates + ) def _get_regression_dataset_requirements(info: Dict[str, Any], @@ -78,13 +94,13 @@ def _get_regression_dataset_requirements(info: Dict[str, Any], ) -> List[FitRequirement]: task_type = STRING_TO_TASK_TYPES[info['task_type']] if task_type in TABULAR_TASKS: - fit_requirements = TabularRegressionPipeline( + return TabularRegressionPipeline( dataset_properties=info, include=include, exclude=exclude, search_space_updates=search_space_updates ).get_dataset_requirements() - return fit_requirements + else: raise ValueError("Task_type not supported") @@ -100,14 +116,34 @@ def _get_classification_dataset_requirements(info: Dict[str, Any], return TabularClassificationPipeline( dataset_properties=info, include=include, exclude=exclude, - search_space_updates=search_space_updates). \ - get_dataset_requirements() + search_space_updates=search_space_updates + ).get_dataset_requirements() elif task_type in IMAGE_TASKS: return ImageClassificationPipeline( dataset_properties=info, include=include, exclude=exclude, - search_space_updates=search_space_updates). 
\ - get_dataset_requirements() + search_space_updates=search_space_updates + ).get_dataset_requirements() + else: + raise ValueError("Task_type not supported") + + +def _get_forecasting_dataset_requirements(info: Dict[str, Any], + include: Optional[Dict] = None, + exclude: Optional[Dict] = None, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None + ) -> List[FitRequirement]: + task_type = STRING_TO_TASK_TYPES[info['task_type']] + + if task_type in FORECASTING_TASKS: + if not forecasting_dependencies_installed: + raise ModuleNotFoundError(ForecastingDependenciesNotInstalledMSG) + return TimeSeriesForecastingPipeline( + dataset_properties=info, + include=include, + exclude=exclude, + search_space_updates=search_space_updates + ).get_dataset_requirements() else: raise ValueError("Task_type not supported") @@ -152,6 +188,12 @@ def get_configuration_space(info: Dict[str, Any], exclude if exclude is not None else {}, search_space_updates=search_space_updates ) + elif task_type in FORECASTING_TASKS: + return _get_forecasting_configuration_space(info, + include if include is not None else {}, + exclude if exclude is not None else {}, + search_space_updates=search_space_updates + ) else: return _get_classification_configuration_space(info, include if include is not None else {}, @@ -165,13 +207,13 @@ def _get_regression_configuration_space(info: Dict[str, Any], include: Dict[str, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None ) -> ConfigurationSpace: if STRING_TO_TASK_TYPES[info['task_type']] in TABULAR_TASKS: - configuration_space = TabularRegressionPipeline( + pipeline = TabularRegressionPipeline( dataset_properties=info, include=include, exclude=exclude, - search_space_updates=search_space_updates - ).get_hyperparameter_search_space() - return configuration_space + search_space_updates=search_space_updates) + return pipeline.get_hyperparameter_search_space() + else: raise ValueError("Task_type not supported") @@ -185,6 +227,7 @@ def _get_classification_configuration_space(info: Dict[str, Any], include: Dict[ include=include, exclude=exclude, search_space_updates=search_space_updates) return pipeline.get_hyperparameter_search_space() + elif STRING_TO_TASK_TYPES[info['task_type']] in IMAGE_TASKS: return ImageClassificationPipeline( dataset_properties=info, @@ -193,3 +236,13 @@ def _get_classification_configuration_space(info: Dict[str, Any], include: Dict[ get_hyperparameter_search_space() else: raise ValueError("Task_type not supported") + + +def _get_forecasting_configuration_space(info: Dict[str, Any], include: Dict[str, List[str]], + exclude: Dict[str, List[str]], + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None + ) -> ConfigurationSpace: + pipeline = TimeSeriesForecastingPipeline(dataset_properties=info, + include=include, exclude=exclude, + search_space_updates=search_space_updates) + return pipeline.get_hyperparameter_search_space() diff --git a/autoPyTorch/utils/results_manager.py b/autoPyTorch/utils/results_manager.py new file mode 100644 index 000000000..c1860b0f6 --- /dev/null +++ b/autoPyTorch/utils/results_manager.py @@ -0,0 +1,686 @@ +import io +from datetime import datetime +from typing import Any, Dict, List, Tuple, Union + +from ConfigSpace.configuration_space import Configuration + +import numpy as np + +import scipy + +from smac.runhistory.runhistory import RunHistory, RunKey, RunValue +from smac.tae import StatusType +from smac.utils.io.traj_logging import TrajEntry + +from 
autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric
+
+
+# TODO remove StatusType.RUNNING at some point in the future when the new SMAC 0.13.2
+# is the new minimum required version!
+STATUS_TYPES = [
+    StatusType.SUCCESS,
+    # Success (but did not advance to higher budget such as cutoff by hyperband)
+    StatusType.DONOTADVANCE,
+    StatusType.TIMEOUT,
+    StatusType.CRASHED,
+    StatusType.ABORT,
+    StatusType.MEMOUT
+]
+
+
+def cost2metric(cost: float, metric: autoPyTorchMetric) -> float:
+    """
+    Revert cost metric evaluated in SMAC to the original metric.
+
+    The conversion is defined in:
+        autoPyTorch/pipeline/components/training/metrics/utils.py::calculate_loss
+        cost = metric._optimum - metric._sign * original_metric_value
+        ==> original_metric_value = metric._sign * (metric._optimum - cost)
+    """
+    return metric._sign * (metric._optimum - cost)
+
+
+def get_start_time(run_history: RunHistory) -> float:
+    """
+    Get start time of optimization.
+
+    Args:
+        run_history (RunHistory):
+            The history of config evals from SMAC.
+
+    Returns:
+        starttime (float):
+            The start time of the first training.
+    """
+
+    start_times = []
+    for run_value in run_history.data.values():
+        if run_value.status in (StatusType.STOP, StatusType.RUNNING):
+            continue
+        elif run_value.status not in STATUS_TYPES:
+            raise ValueError(f'Unexpected run status: {run_value.status}')
+
+        start_times.append(run_value.starttime)
+
+    return float(np.min(start_times))  # mypy redefinition
+
+
+def _extract_metrics_info(
+    run_value: RunValue,
+    scoring_functions: List[autoPyTorchMetric],
+    inference_name: str
+) -> Dict[str, float]:
+    """
+    Extract the metric information given a run_value
+    and a list of metrics of interest.
+
+    Args:
+        run_value (RunValue):
+            The information for each config evaluation.
+        scoring_functions (List[autoPyTorchMetric]):
+            The list of metrics to retrieve the info.
+        inference_name (str):
+            The name of the inference. Either `train`, `opt` or `test`.
+
+    Returns:
+        metric_info (Dict[str, float]):
+            The metric values of interest.
+            Since the metrics in additional_info are `cost`,
+            we transform them into the original form.
+    """
+
+    if run_value.status not in (StatusType.SUCCESS, StatusType.DONOTADVANCE):
+        # Additional info for metrics is not available in this case.
+        return {metric.name: metric._worst_possible_result for metric in scoring_functions}
+
+    inference_choices = ['train', 'opt', 'test']
+    if inference_name not in inference_choices:
+        raise ValueError(f'inference_name must be in {inference_choices}, but got {inference_name}')
+
+    cost_info = run_value.additional_info[f'{inference_name}_loss']
+    avail_metrics = cost_info.keys()
+
+    return {
+        metric.name: cost2metric(cost=cost_info[metric.name], metric=metric)
+        if metric.name in avail_metrics else metric._worst_possible_result
+        for metric in scoring_functions
+    }
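A quick numeric check of `cost2metric`, using a stub metric (not a real autoPyTorchMetric) with `_optimum = 1.0` and `_sign = 1`, as for accuracy-like scores:

```python
from autoPyTorch.utils.results_manager import cost2metric


class _StubMetric:
    # hypothetical stand-in for an autoPyTorchMetric
    _optimum = 1.0
    _sign = 1
    name = 'accuracy'


# SMAC minimizes cost = 1.0 - accuracy, so a cost of 0.1 maps back to 0.9
assert cost2metric(cost=0.1, metric=_StubMetric()) == 0.9
```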
+
+
+class EnsembleResults:
+    def __init__(
+        self,
+        metric: autoPyTorchMetric,
+        ensemble_performance_history: List[Dict[str, Any]],
+        order_by_endtime: bool = False
+    ):
+        """
+        The wrapper class for ensemble_performance_history.
+        This class extracts the information from ensemble_performance_history
+        and allows other classes to easily handle the history.
+
+        Attributes:
+            train_scores (List[float]):
+                The ensemble scores on the training dataset.
+            test_scores (List[float]):
+                The ensemble scores on the test dataset.
+            end_times (List[float]):
+                The end time of each ensemble evaluation.
+                Each element is a float timestamp.
+            empty (bool):
+                Whether the ensemble history about `self.metric` is empty or not.
+            metric (autoPyTorchMetric):
+                The metric whose information this class maintains.
+                If such a metric does not exist in the record,
+                this class raises KeyError.
+        """
+        self._test_scores: List[float] = []
+        self._train_scores: List[float] = []
+        self._end_times: List[float] = []
+        self._metric = metric
+        self._empty = True  # Initial state is empty.
+        self._instantiated = False
+
+        self._extract_results_from_ensemble_performance_history(ensemble_performance_history)
+        if order_by_endtime:
+            self._sort_by_endtime()
+
+        self._instantiated = True
+
+    @property
+    def train_scores(self) -> np.ndarray:
+        return np.asarray(self._train_scores)
+
+    @property
+    def test_scores(self) -> np.ndarray:
+        return np.asarray(self._test_scores)
+
+    @property
+    def end_times(self) -> np.ndarray:
+        return np.asarray(self._end_times)
+
+    @property
+    def metric_name(self) -> str:
+        return self._metric.name
+
+    def empty(self) -> bool:
+        """ This is a method rather than a property to follow the coding conventions. """
+        return self._empty
+
+    def _update(self, data: Dict[str, Any]) -> None:
+        if self._instantiated:
+            raise RuntimeError(
+                'EnsembleResults should not be overwritten once instantiated. '
+                'Instantiate new object rather than using update.'
+            )
+
+        self._train_scores.append(data[f'train_{self.metric_name}'])
+        self._test_scores.append(data[f'test_{self.metric_name}'])
+        self._end_times.append(datetime.timestamp(data['Timestamp']))
+
+    def _sort_by_endtime(self) -> None:
+        """
+        Since the default order is by start time
+        and parallel computation might change the order of ending,
+        this method provides the feature to sort by end time.
+        Note that this method is destructive.
+        """
+        if self._instantiated:
+            raise RuntimeError(
+                'EnsembleResults should not be overwritten once instantiated. '
+                'Instantiate new object with order_by_endtime=True.'
+            )
+
+        order = np.argsort(self._end_times)
+
+        self._train_scores = self.train_scores[order].tolist()
+        self._test_scores = self.test_scores[order].tolist()
+        self._end_times = self.end_times[order].tolist()
+
+    def _extract_results_from_ensemble_performance_history(
+        self,
+        ensemble_performance_history: List[Dict[str, Any]]
+    ) -> None:
+        """
+        Extract information from `ensemble_performance_history`
+        to match the format of this class.
+
+        Args:
+            ensemble_performance_history (List[Dict[str, Any]]):
+                The history of the ensemble performance from EnsembleBuilder.
+                Its key must be either `train_xxx`, `test_xxx` or `Timestamp`.
+        """
+
+        if (
+            len(ensemble_performance_history) == 0
+            or f'train_{self.metric_name}' not in ensemble_performance_history[0].keys()
+        ):
+            self._empty = True
+            return
+
+        self._empty = False  # We can extract ==> not empty
+        for data in ensemble_performance_history:
+            self._update(data)
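`_sort_by_endtime` reorders several parallel sequences with a single `np.argsort`; the idiom in isolation, with toy values:

```python
import numpy as np

end_times = [30.0, 10.0, 20.0]
scores = [0.7, 0.9, 0.8]

order = np.argsort(end_times)                  # array([1, 2, 0])
scores_by_endtime = [scores[i] for i in order]
print(scores_by_endtime)                       # [0.9, 0.8, 0.7]
```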
+
+
+class SearchResults:
+    def __init__(
+        self,
+        metric: autoPyTorchMetric,
+        scoring_functions: List[autoPyTorchMetric],
+        run_history: RunHistory,
+        order_by_endtime: bool = False
+    ):
+        """
+        The wrapper class for run_history.
+        This class extracts the information from run_history
+        and allows other classes to easily handle the history.
+        Note that the data is sorted by starttime by default and
+        metric_dict has the original form of metric value, i.e. not necessarily cost.
+
+        Attributes:
+            train_metric_dict (Dict[str, List[float]]):
+                The extracted train metric information at each evaluation.
+                Each list keeps the metric information specified by scoring_functions and metric.
+            opt_metric_dict (Dict[str, List[float]]):
+                The extracted opt metric information at each evaluation.
+                Each list keeps the metric information specified by scoring_functions and metric.
+            test_metric_dict (Dict[str, List[float]]):
+                The extracted test metric information at each evaluation.
+                Each list keeps the metric information specified by scoring_functions and metric.
+            fit_times (List[float]):
+                The time needed to fit each model.
+            end_times (List[float]):
+                The end time of each evaluation.
+                Each element is a float timestamp.
+            configs (List[Configuration]):
+                The configurations at each evaluation.
+            status_types (List[StatusType]):
+                The list of status types of each evaluation (e.g. success, crash).
+            budgets (List[float]):
+                The budgets used for each evaluation.
+                Here, budget refers to the definition in Hyperband or Successive halving.
+            config_ids (List[int]):
+                The ID of each configuration. Since we use cutoff such as in Hyperband,
+                we need to store it to know whether each configuration is a survivor.
+            is_traditionals (List[bool]):
+                Whether each configuration is from traditional machine learning methods.
+            additional_infos (List[Dict[str, float]]):
+                It usually serves as the source of each metric at each evaluation.
+                In other words, train or test performance is extracted from this info.
+            rank_opt_scores (np.ndarray):
+                The rank of each evaluation among all the evaluations.
+            metric (autoPyTorchMetric):
+                The metric of main interest.
+            scoring_functions (List[autoPyTorchMetric]):
+                The list of metrics to contain in the additional_infos.
+        """
+        if metric not in scoring_functions:
+            scoring_functions.append(metric)
+
+        self.train_metric_dict: Dict[str, List[float]] = {metric.name: [] for metric in scoring_functions}
+        self.opt_metric_dict: Dict[str, List[float]] = {metric.name: [] for metric in scoring_functions}
+        self.test_metric_dict: Dict[str, List[float]] = {metric.name: [] for metric in scoring_functions}
+
+        self._fit_times: List[float] = []
+        self._end_times: List[float] = []
+        self.configs: List[Configuration] = []
+        self.status_types: List[StatusType] = []
+        self.budgets: List[float] = []
+        self.config_ids: List[int] = []
+        self.is_traditionals: List[bool] = []
+        self.additional_infos: List[Dict[str, float]] = []
+        self.rank_opt_scores: np.ndarray = np.array([])
+        self._scoring_functions = scoring_functions
+        self._metric = metric
+        self._instantiated = False
+
+        self._extract_results_from_run_history(run_history)
+        if order_by_endtime:
+            self._sort_by_endtime()
+
+        self._instantiated = True
+
+    @property
+    def train_scores(self) -> np.ndarray:
+        """ training metric values at each evaluation """
+        return np.asarray(self.train_metric_dict[self.metric_name])
+
+    @property
+    def opt_scores(self) -> np.ndarray:
+        """ validation metric values at each evaluation """
+        return np.asarray(self.opt_metric_dict[self.metric_name])
+
+    @property
+    def test_scores(self) -> np.ndarray:
+        """ test metric values at each evaluation """
+        return np.asarray(self.test_metric_dict[self.metric_name])
+
+    @property
+    def fit_times(self) -> np.ndarray:
+        return np.asarray(self._fit_times)
+
+    @property
+    def end_times(self) -> np.ndarray:
+        return np.asarray(self._end_times)
+
+    @property
+    def metric_name(self) -> str:
+        return self._metric.name
+
+    def _update(
+        self,
+        config: Configuration,
+        run_key: RunKey,
+        run_value: RunValue
+    ) -> None:
+
+        if self._instantiated:
+            raise
RuntimeError( + 'SearchResults should not be overwritten once instantiated. ' + 'Instantiate new object rather than using update.' + ) + elif run_value.status in (StatusType.STOP, StatusType.RUNNING): + return + elif run_value.status not in STATUS_TYPES: + raise ValueError(f'Unexpected run status: {run_value.status}') + + is_traditional = False # If run is not successful, unsure ==> not True ==> False + if run_value.additional_info is not None: + is_traditional = run_value.additional_info['configuration_origin'] == 'traditional' + + self.status_types.append(run_value.status) + self.configs.append(config) + self.budgets.append(run_key.budget) + self.config_ids.append(run_key.config_id) + self.is_traditionals.append(is_traditional) + self.additional_infos.append(run_value.additional_info) + self._fit_times.append(run_value.time) + self._end_times.append(run_value.endtime) + + for inference_name in ['train', 'opt', 'test']: + metric_info = _extract_metrics_info( + run_value=run_value, + scoring_functions=self._scoring_functions, + inference_name=inference_name + ) + for metric_name, val in metric_info.items(): + getattr(self, f'{inference_name}_metric_dict')[metric_name].append(val) + + def _sort_by_endtime(self) -> None: + """ + Since the default order is by start time + and parallel computation might change the order of ending, + this method provides the feature to sort by end time. + Note that this method is destructive. + """ + if self._instantiated: + raise RuntimeError( + 'SearchResults should not be overwritten once instantiated. ' + 'Instantiate new object with order_by_endtime=True.' + ) + + order = np.argsort(self._end_times) + + self.train_metric_dict = {name: [arr[idx] for idx in order] for name, arr in self.train_metric_dict.items()} + self.opt_metric_dict = {name: [arr[idx] for idx in order] for name, arr in self.opt_metric_dict.items()} + self.test_metric_dict = {name: [arr[idx] for idx in order] for name, arr in self.test_metric_dict.items()} + + self._fit_times = [self._fit_times[idx] for idx in order] + self._end_times = [self._end_times[idx] for idx in order] + self.status_types = [self.status_types[idx] for idx in order] + self.budgets = [self.budgets[idx] for idx in order] + self.config_ids = [self.config_ids[idx] for idx in order] + self.is_traditionals = [self.is_traditionals[idx] for idx in order] + self.additional_infos = [self.additional_infos[idx] for idx in order] + + # Don't use numpy slicing to avoid version dependency (cast config to object might cause issues) + self.configs = [self.configs[idx] for idx in order] + + # Only rank_opt_scores is np.ndarray + self.rank_opt_scores = self.rank_opt_scores[order] + + def _extract_results_from_run_history(self, run_history: RunHistory) -> None: + """ + Extract the information to match this class format. + + Args: + run_history (RunHistory): + The history of config evals from SMAC. + """ + + for run_key, run_value in run_history.data.items(): + config = run_history.ids_config[run_key.config_id] + self._update(config=config, run_key=run_key, run_value=run_value) + + self.rank_opt_scores = scipy.stats.rankdata( + -1 * self._metric._sign * self.opt_scores, # rank order + method='min' + ) + + +class MetricResults: + def __init__( + self, + metric: autoPyTorchMetric, + run_history: RunHistory, + ensemble_performance_history: List[Dict[str, Any]] + ): + """ + The wrapper class for ensemble_performance_history. 
+        This class merges the information from run_history and
+        ensemble_performance_history and allows other classes to
+        easily handle the combined history.
+        Note that all the data is sorted by endtime!
+
+        Attributes:
+            start_time (float):
+                The timestamp at the very beginning of the optimization.
+            cum_times (np.ndarray):
+                The runtime needed to reach the end of each evaluation.
+                The time unit is second.
+            metric (autoPyTorchMetric):
+                The metric whose information this class maintains.
+            search_results (SearchResults):
+                The instance to fetch the metric values of `self.metric`
+                from run_history.
+            ensemble_results (EnsembleResults):
+                The instance to fetch the metric values of `self.metric`
+                from ensemble_performance_history.
+                If there is no information available, self.empty() returns True.
+            data (Dict[str, np.ndarray]):
+                Keys are `{single, ensemble}::{train, opt, test}::{metric.name}`.
+                Each array contains the evaluated values for the corresponding category.
+        """
+        self.start_time = get_start_time(run_history)
+        self.metric = metric
+        self.search_results = SearchResults(
+            metric=metric,
+            run_history=run_history,
+            scoring_functions=[],
+            order_by_endtime=True
+        )
+        self.ensemble_results = EnsembleResults(
+            metric=metric,
+            ensemble_performance_history=ensemble_performance_history,
+            order_by_endtime=True
+        )
+
+        if (
+            not self.ensemble_results.empty()
+            and self.search_results.end_times[-1] < self.ensemble_results.end_times[-1]
+        ):
+            # Augment runtime table with the final available end time
+            self.cum_times = np.hstack(
+                [self.search_results.end_times - self.start_time,
+                 [self.ensemble_results.end_times[-1] - self.start_time]]
+            )
+        else:
+            self.cum_times = self.search_results.end_times - self.start_time
+
+        self.data: Dict[str, np.ndarray] = {}
+        self._extract_results()
+
+    def _extract_results(self) -> None:
+        """ Extract metric values of `self.metric` and store them in `self.data`. """
+        metric_name = self.metric.name
+        for inference_name in ['train', 'test', 'opt']:
+            # TODO: Extract information from self.search_results
+            data = getattr(self.search_results, f'{inference_name}_metric_dict')[metric_name]
+            self.data[f'single::{inference_name}::{metric_name}'] = np.array(data)
+
+            if self.ensemble_results.empty() or inference_name == 'opt':
+                continue
+
+            data = getattr(self.ensemble_results, f'{inference_name}_scores')
+            self.data[f'ensemble::{inference_name}::{metric_name}'] = np.array(data)
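Before the merge routine below, a toy walk-through of its allocation rule with assumed values: each ensemble snapshot is assigned to the first run-history checkpoint at or after its end time, keeping the better score on collisions.

```python
import numpy as np

cum_times = np.array([10.0, 20.0, 30.0])        # run-history checkpoints (seconds)
ens_times = [12.0, 14.0, 40.0]                  # ensemble snapshot end times
ens_scores = [0.6, 0.7, 0.9]                    # for a maximized metric

perfs = np.full_like(cum_times, -np.inf)        # worst possible value everywhere
cur = 0
for t, score in zip(ens_times, ens_scores):
    while cur < cum_times.size and cum_times[cur] < t:
        cur += 1
    idx = min(cur, cum_times.size - 1)          # clamp past the last checkpoint
    perfs[idx] = max(perfs[idx], score)

print(perfs)                                    # [-inf  0.7  0.9]
```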
+    def get_ensemble_merged_data(self) -> Dict[str, np.ndarray]:
+        """
+        Merge the ensemble performance data to the closest time step
+        available in the run_history.
+        One performance metric will be allocated to one time step.
+        Other time steps will be filled by the worst possible value.
+
+        Returns:
+            data (Dict[str, np.ndarray]):
+                Merged data as mentioned above
+        """
+
+        data = {k: v.copy() for k, v in self.data.items()}  # deep copy
+
+        if self.ensemble_results.empty():  # no ensemble data available
+            return data
+
+        train_scores, test_scores = self.ensemble_results.train_scores, self.ensemble_results.test_scores
+        end_times = self.ensemble_results.end_times
+        cur, timestep_size, sign = 0, self.cum_times.size, self.metric._sign
+        key_train, key_test = f'ensemble::train::{self.metric.name}', f'ensemble::test::{self.metric.name}'
+
+        train_perfs = np.full_like(self.cum_times, self.metric._worst_possible_result)
+        test_perfs = np.full_like(self.cum_times, self.metric._worst_possible_result)
+
+        for timestamp, train_score, test_score in zip(end_times, train_scores, test_scores):
+            avail_time = timestamp - self.start_time
+            while cur < timestep_size and self.cum_times[cur] < avail_time:
+                # Guarantee that cum_times[cur] >= avail_time
+                cur += 1
+
+            # results[cur] is the closest available checkpoint after or at the avail_time
+            # ==> Assign this data to that checkpoint
+            time_index = min(cur, timestep_size - 1)
+            # If there already exists a previous allocated value, update by a better value
+            train_perfs[time_index] = sign * max(sign * train_perfs[time_index], sign * train_score)
+            test_perfs[time_index] = sign * max(sign * test_perfs[time_index], sign * test_score)
+
+        data.update({key_train: train_perfs, key_test: test_perfs})
+        return data
+
+
+class ResultsManager:
+    def __init__(self, *args: Any, **kwargs: Any):
+        """
+        This module is used to gather result information for BaseTask.
+        In other words, this module is supposed to be wrapped by BaseTask.
+
+        Attributes:
+            run_history (RunHistory):
+                A `SMAC RunHistory `_
+                object that holds information about the runs of the target algorithm made during search
+            ensemble_performance_history (List[Dict[str, Any]]):
+                The history of the ensemble performance from EnsembleBuilder.
+                Its keys are `train_xxx`, `test_xxx` or `Timestamp`.
+            trajectory (List[TrajEntry]):
+                A list of all incumbent configurations during search
+        """
+        self.run_history: RunHistory = RunHistory()
+        self.ensemble_performance_history: List[Dict[str, Any]] = []
+        self.trajectory: List[TrajEntry] = []
+
+    def _check_run_history(self) -> None:
+        if self.run_history is None:
+            raise RuntimeError("No Run History found, search has not been called.")
+
+        if self.run_history.empty():
+            raise RuntimeError("Run History is empty. Something went wrong, "
+                               "SMAC was not able to fit any model?")
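`get_incumbent_results` below selects the best run with a sign-aware argmax, so one code path serves maximized and minimized metrics; the idiom in isolation:

```python
import numpy as np

opt_scores = np.array([0.81, 0.89, 0.85])

sign = 1                                 # 1 for maximized metrics (e.g. accuracy)
print(np.argmax(sign * opt_scores))      # 1 -> highest score wins

sign = -1                                # -1 for minimized metrics (e.g. RMSE)
print(np.argmax(sign * opt_scores))      # 0 -> lowest score wins
```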
+
+    def get_incumbent_results(
+        self,
+        metric: autoPyTorchMetric,
+        include_traditional: bool = False
+    ) -> Tuple[Configuration, Dict[str, Union[int, str, float]]]:
+        """
+        Get the incumbent config and the corresponding results
+
+        Args:
+            metric (autoPyTorchMetric):
+                A metric that is evaluated when searching with AutoPyTorch.
+            include_traditional (bool):
+                Whether to include results from traditional pipelines
+
+        Returns:
+            Configuration (CS.Configuration):
+                The incumbent configuration
+            Dict[str, Union[int, str, float]]:
+                Additional information about the run of the incumbent configuration.
+        """
+        self._check_run_history()
+
+        results = SearchResults(metric=metric, scoring_functions=[], run_history=self.run_history)
+
+        if not include_traditional:
+            non_traditional = ~np.array(results.is_traditionals)
+            scores = results.opt_scores[non_traditional]
+            indices = np.arange(len(results.configs))[non_traditional]
+        else:
+            scores = results.opt_scores
+            indices = np.arange(len(results.configs))
+
+        incumbent_idx = indices[np.argmax(metric._sign * scores)]
+        incumbent_config = results.configs[incumbent_idx]
+        incumbent_results = results.additional_infos[incumbent_idx]
+
+        assert incumbent_results is not None  # mypy check
+        return incumbent_config, incumbent_results
+
+    def get_search_results(
+        self,
+        scoring_functions: List[autoPyTorchMetric],
+        metric: autoPyTorchMetric
+    ) -> SearchResults:
+        """
+        The returned object is populated with data from `self.run_history`
+        and contains information about the configurations, their
+        corresponding metric results, status of runs, parameters and
+        the budget
+
+        Args:
+            scoring_functions (List[autoPyTorchMetric]):
+                Metrics to show in the results.
+            metric (autoPyTorchMetric):
+                A metric that is evaluated when searching with AutoPyTorch.
+
+        Returns:
+            SearchResults:
+                An instance that contains the results from search
+        """
+        self._check_run_history()
+        return SearchResults(metric=metric, scoring_functions=scoring_functions, run_history=self.run_history)
+
+    def sprint_statistics(
+        self,
+        dataset_name: str,
+        scoring_functions: List[autoPyTorchMetric],
+        metric: autoPyTorchMetric
+    ) -> str:
+        """
+        Returns a formatted string with statistics about the SMAC search.
+
+        These statistics include:
+
+        1. Optimisation Metric
+        2. Best Optimisation score achieved by individual pipelines
+        3. Total number of target algorithm runs
+        4. Total number of successful target algorithm runs
+        5. Total number of crashed target algorithm runs
+        6. Total number of target algorithm runs that exceeded the time limit
+        7. Total number of target algorithm runs that exceeded the memory limit
+
+        Args:
+            dataset_name (str):
+                The dataset name that was used in the run.
+            scoring_functions (List[autoPyTorchMetric]):
+                Metrics to show in the results.
+            metric (autoPyTorchMetric):
+                A metric that is evaluated when searching with AutoPyTorch.
+ + Returns: + (str): + Formatted string with statistics + """ + search_results = self.get_search_results(scoring_functions, metric) + success_status = (StatusType.SUCCESS, StatusType.DONOTADVANCE) + sio = io.StringIO() + sio.write("autoPyTorch results:\n") + sio.write(f"\tDataset name: {dataset_name}\n") + sio.write(f"\tOptimisation Metric: {metric}\n") + + num_runs = len(search_results.status_types) + num_success = sum([s in success_status for s in search_results.status_types]) + num_crash = sum([s == StatusType.CRASHED for s in search_results.status_types]) + num_timeout = sum([s == StatusType.TIMEOUT for s in search_results.status_types]) + num_memout = sum([s == StatusType.MEMOUT for s in search_results.status_types]) + + if num_success > 0: + best_score = metric._sign * np.max(metric._sign * search_results.opt_scores) + sio.write(f"\tBest validation score: {best_score}\n") + + sio.write(f"\tNumber of target algorithm runs: {num_runs}\n") + sio.write(f"\tNumber of successful target algorithm runs: {num_success}\n") + sio.write(f"\tNumber of crashed target algorithm runs: {num_crash}\n") + sio.write(f"\tNumber of target algorithms that exceeded the time " + f"limit: {num_timeout}\n") + sio.write(f"\tNumber of target algorithms that exceeded the memory " + f"limit: {num_memout}\n") + + return sio.getvalue() diff --git a/autoPyTorch/utils/results_visualizer.py b/autoPyTorch/utils/results_visualizer.py new file mode 100644 index 000000000..e1debe29c --- /dev/null +++ b/autoPyTorch/utils/results_visualizer.py @@ -0,0 +1,334 @@ +from dataclasses import dataclass +from enum import Enum +from typing import Any, Dict, NamedTuple, Optional, Tuple + +import matplotlib.pyplot as plt + +import numpy as np + +from autoPyTorch.utils.results_manager import MetricResults + + +plt.rcParams["font.family"] = "Times New Roman" +plt.rcParams["font.size"] = 18 + + +@dataclass(frozen=True) +class ColorLabelSettings: + """ + The settings for each plot. + If None is provided, those plots are omitted. + + Attributes: + single_train (Optional[Tuple[Optional[str], Optional[str]]]): + The setting for the plot of the optimal single train result. + single_opt (Optional[Tuple[Optional[str], Optional[str]]]): + The setting for the plot of the optimal single result used in optimization. + single_test (Optional[Tuple[Optional[str], Optional[str]]]): + The setting for the plot of the optimal single test result. + ensemble_train (Optional[Tuple[Optional[str], Optional[str]]]): + The setting for the plot of the optimal ensemble train result. + ensemble_test (Optional[Tuple[Optional[str], Optional[str]]]): + The setting for the plot of the optimal ensemble test result. + """ + single_train: Optional[Tuple[Optional[str], Optional[str]]] = ('red', None) + single_opt: Optional[Tuple[Optional[str], Optional[str]]] = ('blue', None) + single_test: Optional[Tuple[Optional[str], Optional[str]]] = ('green', None) + ensemble_train: Optional[Tuple[Optional[str], Optional[str]]] = ('brown', None) + ensemble_test: Optional[Tuple[Optional[str], Optional[str]]] = ('purple', None) + + def extract_dicts( + self, + results: MetricResults + ) -> Tuple[Dict[str, Optional[str]], Dict[str, Optional[str]]]: + """ + Args: + results (MetricResults): + The results of the optimization in the base task API. + It determines what keys to include. + + Returns: + colors, labels (Tuple[Dict[str, Optional[str]], Dict[str, Optional[str]]]): + The dicts for colors and labels. 
+ The keys are determined by results and each label and color + are determined by each instantiation. + Note that the keys include the metric name. + """ + + colors, labels = {}, {} + + for key, color_label in vars(self).items(): + if color_label is None: + continue + + prefix = '::'.join(key.split('_')) + try: + new_key = [key for key in results.data.keys() if key.startswith(prefix)][0] + colors[new_key], labels[new_key] = color_label + except IndexError: # ensemble does not always have results + pass + + return colors, labels + + +class PlotSettingParams(NamedTuple): + """ + Parameters for the plot environment. + + Attributes: + n_points (int): + The number of points to plot. + xlabel (Optional[str]): + The label in the x axis. + ylabel (Optional[str]): + The label in the y axis. + xscale (str): + The scale of x axis. + yscale (str): + The scale of y axis. + title (Optional[str]): + The title of the subfigure. + xlim (Tuple[float, float]): + The range of x axis. + ylim (Tuple[float, float]): + The range of y axis. + grid (bool): + Whether to have grid lines. + If users would like to define lines in detail, + they need to deactivate it. + legend (bool): + Whether to have legend in the figure. + legend_kwargs (Dict[str, Any]): + The kwargs for ax.legend. + Ref: https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.legend.html + title (Optional[str]): + The title of the figure. + title_kwargs (Dict[str, Any]): + The kwargs for ax.set_title except title label. + Ref: https://matplotlib.org/3.1.1/api/_as_gen/matplotlib.axes.Axes.set_title.html + show (bool): + Whether to show the plot. + If figname is not None, the save will be prioritized. + figname (Optional[str]): + Name of a figure to save. If None, no figure will be saved. + savefig_kwargs (Dict[str, Any]): + The kwargs for plt.savefig except filename. + Ref: https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.savefig.html + args, kwargs (Any): + Arguments for the ax.plot. + """ + n_points: int = 20 + xscale: str = 'linear' + yscale: str = 'linear' + xlabel: Optional[str] = None + ylabel: Optional[str] = None + title: Optional[str] = None + title_kwargs: Dict[str, Any] = {} + xlim: Optional[Tuple[float, float]] = None + ylim: Optional[Tuple[float, float]] = None + grid: bool = True + legend: bool = True + legend_kwargs: Dict[str, Any] = {} + show: bool = False + figname: Optional[str] = None + figsize: Optional[Tuple[int, int]] = None + savefig_kwargs: Dict[str, Any] = {} + + +class ScaleChoices(Enum): + linear = 'linear' + log = 'log' + + +def _get_perf_and_time( + cum_results: np.ndarray, + cum_times: np.ndarray, + plot_setting_params: PlotSettingParams, + worst_val: float +) -> Tuple[np.ndarray, np.ndarray]: + """ + Get the performance and time step to plot. + + Args: + cum_results (np.ndarray): + The cumulated performance per evaluation. + cum_times (np.ndarray): + The cumulated runtime at the end of each evaluation. + plot_setting_params (PlotSettingParams): + Parameters for the plot. + worst_val (float): + The worst possible value given a metric. + + Returns: + check_points (np.ndarray): + The time in second where the plot will happen. + perf_by_time_step (np.ndarray): + The best performance at the corresponding time in second + where the plot will happen. 
+    """
+
+    scale_choices = [s.name for s in ScaleChoices]
+    if plot_setting_params.xscale not in scale_choices or plot_setting_params.yscale not in scale_choices:
+        raise ValueError(f'xscale and yscale must be in {scale_choices}, '
+                         f'but got xscale={plot_setting_params.xscale}, yscale={plot_setting_params.yscale}')
+
+    n_evals, runtime_lb, runtime_ub = cum_results.size, cum_times[0], cum_times[-1]
+
+    if plot_setting_params.xscale == 'log':
+        # Take the even time interval in the log scale and revert
+        check_points = np.exp(np.linspace(np.log(runtime_lb), np.log(runtime_ub), plot_setting_params.n_points))
+    else:
+        check_points = np.linspace(runtime_lb, runtime_ub, plot_setting_params.n_points)
+
+    check_points += 1e-8  # Prevent float error
+
+    # The worst possible value is always at the head
+    perf_by_time_step = np.full_like(check_points, worst_val)
+    cur = 0
+
+    for i, check_point in enumerate(check_points):
+        while cur < n_evals and cum_times[cur] <= check_point:
+            # Guarantee that cum_times[cur] > check_point
+            # ==> cum_times[cur - 1] <= check_point
+            cur += 1
+        if cur:  # filter cur - 1 == -1
+            # results[cur - 1] was obtained before or at the checkpoint
+            # ==> The best performance up to this checkpoint
+            perf_by_time_step[i] = cum_results[cur - 1]
+
+    if plot_setting_params.yscale == 'log' and np.any(perf_by_time_step < 0):
+        raise ValueError('log scale is not available when performance metric can be negative.')
+
+    return check_points, perf_by_time_step
+
+
+class ResultsVisualizer:
+    @staticmethod
+    def _set_plot_args(
+        ax: plt.Axes,
+        plot_setting_params: PlotSettingParams
+    ) -> None:
+        if plot_setting_params.xlim is not None:
+            ax.set_xlim(*plot_setting_params.xlim)
+        if plot_setting_params.ylim is not None:
+            ax.set_ylim(*plot_setting_params.ylim)
+
+        if plot_setting_params.xlabel is not None:
+            ax.set_xlabel(plot_setting_params.xlabel)
+        if plot_setting_params.ylabel is not None:
+            ax.set_ylabel(plot_setting_params.ylabel)
+
+        ax.set_xscale(plot_setting_params.xscale)
+        ax.set_yscale(plot_setting_params.yscale)
+
+        if plot_setting_params.grid:
+            if plot_setting_params.xscale == 'log' or plot_setting_params.yscale == 'log':
+                ax.grid(True, which='minor', color='gray', linestyle=':')
+
+            ax.grid(True, which='major', color='black')
+
+        if plot_setting_params.legend:
+            ax.legend(**plot_setting_params.legend_kwargs)
+
+        if plot_setting_params.title is not None:
+            ax.set_title(plot_setting_params.title, **plot_setting_params.title_kwargs)
+
+        if plot_setting_params.figname is not None:
+            plt.savefig(plot_setting_params.figname, **plot_setting_params.savefig_kwargs)
+        elif plot_setting_params.show:
+            plt.show()
+
+    @staticmethod
+    def _plot_individual_perf_over_time(
+        ax: plt.Axes,
+        cum_times: np.ndarray,
+        cum_results: np.ndarray,
+        worst_val: float,
+        plot_setting_params: PlotSettingParams,
+        label: Optional[str] = None,
+        color: Optional[str] = None,
+        *args: Any,
+        **kwargs: Any
+    ) -> None:
+        """
+        Plot the incumbent performance of Auto-PyTorch over time.
+        This method exists to make plot_perf_over_time more readable
+        and it is supposed to be used only inside this class, not from outside.
+
+        Args:
+            ax (plt.Axes):
+                axis to plot (subplots of matplotlib).
+            cum_times (np.ndarray):
+                The cumulated time until each end of config evaluation.
+            cum_results (np.ndarray):
+                The cumulated performance per evaluation.
+            worst_val (float):
+                The worst possible value given a metric.
+            plot_setting_params (PlotSettingParams):
+                Parameters for the plot.
+ label (Optional[str]): + The name of the plot. + color (Optional[str]): + Color of the plot. + args, kwargs (Any): + Arguments for the ax.plot. + """ + check_points, perf_by_time_step = _get_perf_and_time( + cum_results=cum_results, + cum_times=cum_times, + plot_setting_params=plot_setting_params, + worst_val=worst_val + ) + + ax.plot(check_points, perf_by_time_step, color=color, label=label, *args, **kwargs) + + def plot_perf_over_time( + self, + results: MetricResults, + plot_setting_params: PlotSettingParams, + colors: Dict[str, Optional[str]], + labels: Dict[str, Optional[str]], + ax: Optional[plt.Axes] = None, + *args: Any, + **kwargs: Any + ) -> None: + """ + Plot the incumbent performance of the AutoPytorch over time. + + Args: + results (MetricResults): + The module that handles results from various sources. + plot_setting_params (PlotSettingParams): + Parameters for the plot. + labels (Dict[str, Optional[str]]): + The name of the plot. + colors (Dict[str, Optional[str]]): + Color of the plot. + ax (Optional[plt.Axes]): + axis to plot (subplots of matplotlib). + If None, it will be created automatically. + args, kwargs (Any): + Arguments for the ax.plot. + """ + if ax is None: + _, ax = plt.subplots(nrows=1, ncols=1) + + data = results.get_ensemble_merged_data() + cum_times = results.cum_times + minimize = (results.metric._sign == -1) + + for key in data.keys(): + _label, _color, _perfs = labels[key], colors[key], data[key] + # Take the best results over time + _cum_perfs = np.minimum.accumulate(_perfs) if minimize else np.maximum.accumulate(_perfs) + + self._plot_individual_perf_over_time( # type: ignore + ax=ax, cum_results=_cum_perfs, cum_times=cum_times, + plot_setting_params=plot_setting_params, + worst_val=results.metric._worst_possible_result, + label=_label if _label is not None else ' '.join(key.split('::')), + color=_color, + *args, **kwargs + ) + + self._set_plot_args(ax=ax, plot_setting_params=plot_setting_params) diff --git a/autoPyTorch/utils/single_thread_client.py b/autoPyTorch/utils/single_thread_client.py index 9bb0fe3eb..30fd05b94 100644 --- a/autoPyTorch/utils/single_thread_client.py +++ b/autoPyTorch/utils/single_thread_client.py @@ -61,8 +61,24 @@ def submit( func: Callable, *args: List, priority: int = 0, + key: Any = None, + workers: Any = None, + resources: Any = None, + retries: Any = None, + fifo_timeout: Any = "100 ms", + allow_other_workers: Any = False, + actor: Any = False, + actors: Any = False, + pure: Any = None, **kwargs: Any, ) -> Any: + """ + Note + ---- + The keyword arguments caught in `dask.distributed.Client` need to + be specified here so they don't get passed in as ``**kwargs`` to the + ``func``. + """ return DummyFuture(func(*args, **kwargs)) def close(self) -> None: diff --git a/docs/api.rst b/docs/api.rst index 00ff11d08..f54dd1e90 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -25,6 +25,15 @@ Regression :members: :inherited-members: search, refit, predict, score +~~~~~~~~~~~~~~ +Time Series Forecasting +~~~~~~~~~~~~~~ + +.. autoclass:: autoPyTorch.api.time_series_forecasting.TimeSeriesForecastingTask + :members: + :inherited-members: search, refit, predict, score + + ========= Pipelines @@ -50,6 +59,14 @@ Tabular Regression .. autoclass:: autoPyTorch.pipeline.traditional_tabular_regression.TraditionalTabularRegressionPipeline :members: +~~~~~~~~~~~~~~~~~~ +Time Series Forecasting +~~~~~~~~~~~~~~~~~~ + +.. 
diff --git a/docs/api.rst b/docs/api.rst
index 00ff11d08..f54dd1e90 100644
--- a/docs/api.rst
+++ b/docs/api.rst
@@ -25,6 +25,15 @@ Regression
 :members:
 :inherited-members: search, refit, predict, score
 
+~~~~~~~~~~~~~~~~~~~~~~~
+Time Series Forecasting
+~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: autoPyTorch.api.time_series_forecasting.TimeSeriesForecastingTask
+ :members:
+ :inherited-members: search, refit, predict, score
+
+
 =========
 Pipelines
 =========
@@ -50,6 +59,14 @@ Tabular Regression
 .. autoclass:: autoPyTorch.pipeline.traditional_tabular_regression.TraditionalTabularRegressionPipeline
 :members:
 
+~~~~~~~~~~~~~~~~~~~~~~~
+Time Series Forecasting
+~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: autoPyTorch.pipeline.time_series_forecasting.TimeSeriesForecastingPipeline
+ :members:
+
+
 =================
 Steps in Pipeline
 =================
diff --git a/docs/dev.rst b/docs/dev.rst
index a3c154cd7..f1fec96c9 100644
--- a/docs/dev.rst
+++ b/docs/dev.rst
@@ -60,10 +60,23 @@ handle column-reordering.
 Note that column-reordering shifts categorical columns to the earlier indices
 and it is activated only if one uses a ColumnTransformer.
 
+Similar procedures can be found in time series forecasting tasks:
+
+#. `Feature Imputation `_
+#. `Feature scaling `_
+#. `Feature Encoding `_
+#. `Feature preprocessing `_
+#. `Target Imputation `_
+#. `Target Preprocessing `_
+#. `Target Scaling `_
+#. `Loss Types `_
+#. `Algorithm setup `_
+#. `Training `_
+
 Training of individual models
 -----------------------------
 
-Auto-PyTorch can fit 3 types of pipelines:
+**Auto-PyTorch Tabular** can fit 3 types of pipelines:
 
 #. Dummy pipeline: Use sklearn.dummy to construct an estimator that predicts using simple rules such as most frequent class
 #. Traditional machine learning pipelines: Use LightGBM, CatBoost, RandomForest, ExtraTrees, K-Nearest-Neighbors, and SupportVectorMachines
@@ -78,6 +91,9 @@ and data loaders required to perform the neural architecture search.
 After the training (fitting a pipeline), we use pickle to save it to disk as stated
 `here `_.
 
+**Auto-PyTorch Time Series Forecasting** currently only supports dummy pipelines and PyTorch neural networks. Traditional machine learning pipelines
+will be introduced in a future iteration.
+
 Optimization of pipeline
 ------------------------
diff --git a/docs/installation.rst b/docs/installation.rst
index c9f236d14..56d344e33 100644
--- a/docs/installation.rst
+++ b/docs/installation.rst
@@ -12,7 +12,7 @@ System requirements
 Auto-PyTorch has the following system requirements:
 
 * Linux operating system (for example Ubuntu) `(get Linux here) `_,
-* Python (>=3.6) `(get Python here) `_.
+* Python (>=3.7) `(get Python here) `_.
 * C++ compiler (with C++11 supports) `(get GCC here) `_ and
 * SWIG (version 3.0.* is required; >=4.0.0 is not supported) `(get SWIG here) `_.
 
@@ -25,6 +25,12 @@ PyPI Installation
 .. code:: bash
 pip install autoPyTorch
 
+Auto-PyTorch for Time Series Forecasting requires additional dependencies:
+
+.. code:: bash
+ pip install autoPyTorch[forecasting]
+
+
 Manual Installation
 -------------------
 
@@ -44,7 +50,43 @@ Manual Installation
 cat requirements.txt | xargs -n 1 -L 1 pip install
 python setup.py install
 
+Similarly, Auto-PyTorch for time series forecasting requires additional dependencies:
+
+.. code:: bash
+ git submodule update --init --recursive
+
+ conda create -n auto-pytorch python=3.8
+ conda activate auto-pytorch
+ conda install swig
+ pip install -e .[forecasting]
+
 Docker Image
-=========================
- TODO
+============
+A Docker image is also provided on Docker Hub. To download it, use:
+
+.. code:: bash
+
+ docker pull automlorg/autopytorch:master
+
+You can also verify that the image was downloaded via:
+
+.. code:: bash
+
+ docker images # Verify that the image was downloaded
+
+This image can be used to start an interactive session as follows:
+
+.. code:: bash
+
+ docker run -it automlorg/autopytorch:master
+
+To start a Jupyter notebook, you could instead run e.g.:
+
+.. code:: bash
+
+ docker run -it -v ${PWD}:/opt/nb -p 8888:8888 automlorg/autopytorch:master /bin/bash -c "mkdir -p /opt/nb && jupyter notebook --notebook-dir=/opt/nb --ip='0.0.0.0' --port=8888 --no-browser --allow-root"
+
+Alternatively, it is possible to use the development version of autoPyTorch by replacing all
+occurrences of ``master`` with ``development``.
diff --git a/docs/manual.rst b/docs/manual.rst
index fabee8422..5277dffe1 100644
--- a/docs/manual.rst
+++ b/docs/manual.rst
@@ -18,23 +18,35 @@ Examples
 ========
 * `Classification `_
 * `Regression `_
+* `Forecasting `_
 * `Customizing the search space `_
 * `Changing the resampling strategy `_
 * `Visualizing the results `_
 
 Data validation
 ===============
-For tabular tasks, *Auto-PyTorch* uses a feature and target validator on the input feature set and target set respectively.
+For **tabular tasks**, *Auto-PyTorch* uses a feature and target validator on the input feature set and target set respectively.
 
 The feature validator checks whether the data is supported by *Auto-PyTorch* or not. Additionally, a sklearn column transformer is also used which imputes and ordinally encodes the categorical columns of the dataset. This ensures that no unseen category is found while fitting the data.
 
-The target validator applies a label encoder on the target column.
+The target validator applies a label encoder on the target column.
+
+For **time series forecasting tasks**, besides the functions described above, the time series forecasting validators also
+check the information specific to time series forecasting tasks. They check
+
+* The index of the series that each data point belongs to
+* Whether the dataset is univariate (only target information is contained in the dataset)
+* The sampling frequency of the dataset
+* The static features in the dataset, i.e., features that contain only one value within each series
+
+The time series forecasting validator then transforms the features and targets into a `pd.DataFrame `_
+whose index identifies the series that each time step belongs to.
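As an illustration of the long format described above, two series can be stacked into a single frame whose index carries the series id (the values and column names here are made up):

.. code:: python

    import pandas as pd

    # Two series of three steps each; the index tells the validator which
    # series every row belongs to (a dedicated id column works as well).
    targets = pd.Series([1.0, 2.0, 3.0, 10.0, 20.0, 30.0], index=[0, 0, 0, 1, 1, 1])
    features = pd.DataFrame(
        {"price": [0.1, 0.2, 0.3, 1.1, 1.2, 1.3],
         "store_size": [5, 5, 5, 9, 9, 9]},  # static: constant within a series
        index=[0, 0, 0, 1, 1, 1],
    )
    print(features.loc[1])  # all rows of the second series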
 Data Preprocessing
 ==================
 
-The tabular preprocessing pipeline in *Auto-PyTorch* consists of
+The **tabular preprocessing pipeline** in *Auto-PyTorch* consists of
 
 #. `Imputation `_
 #. `Encoding `_
@@ -47,7 +59,24 @@ The tabular preprocessing pipeline in *Auto-PyTorch* consists of
 Along with the choices, their corresponding hyperparameters are also tuned. A sklearn ColumnTransformer is created which includes a categorical pipeline and a numerical pipeline. These pipelines are made up of the relevant preprocessors chosen in the previous steps. The column transformer is compatible with `torchvision transforms `_
-and is therefore passed to the DataLoader.
+and is therefore passed to the DataLoader.
+
+The **time series forecasting pipeline** has two sorts of setup (a sketch of narrowing these choices follows the footnotes below):
+
+- Univariate models only require target transformations (*1). They include:
+ #. `Target Imputation `_
+ Choice of `linear`, `nearest`, `constant_zero`, `bfill` and `ffill`
+- Multivariate models contain target transformations (see above) and feature transformations. They include:
+ #. `Imputation `_
+ Choice of `linear`, `nearest`, `constant_zero`, `bfill` and `ffill`
+ #. `Scaling `_
+ Choice of `standard`, `min_max`, `max_abs`, `mean_abs`, or no transformation (*2)
+ #. `Encoding `_
+ Choice of `OneHotEncoder` or no encoding.
+
+*1 Target scaling is considered part of `setup `_ and the transformation is applied within each batch iteration.
+
+*2 Scaling is applied within each series.
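A hedged sketch of narrowing these choices through a search space update; the ``node_name`` and ``hyperparameter`` strings below are assumptions, and the exact names can be read from the pipeline's configuration space (e.g. via ``api.get_search_space()``):

.. code:: python

    from autoPyTorch.utils.hyperparameter_search_space_update import (
        HyperparameterSearchSpaceUpdates,
    )

    updates = HyperparameterSearchSpaceUpdates()
    updates.append(
        node_name="imputer",                   # assumed node name
        hyperparameter="imputation_strategy",  # assumed hyperparameter name
        value_range=["linear", "ffill"],       # subset of the choices above
        default_value="ffill",
    )
    # The updates object is then passed to the task, e.g.
    # TimeSeriesForecastingTask(search_space_updates=updates)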
 
 Resource Allocation
 ===================
diff --git a/docs/releases.rst b/docs/releases.rst
index 287a5e8c7..4ce326bb8 100644
--- a/docs/releases.rst
+++ b/docs/releases.rst
@@ -12,6 +12,48 @@ Releases
 ========
 
+Version 0.2
+===========
+[FIX] Documentation and docker workflow file (#449)
+[RELEASE] Changes for release v0.2 (#446)
+[ADD] Allow users to pass feat types to tabular validator (#441)
+[ADD] docs for forecasting task (#443)
+[FIX] fit updates in gluonts (#445)
+[ADD] Time series forecasting (#434)
+[FIX] fix dist twine check for github (#439)
+[ADD] Subsampling Dataset (#398)
+[feat] Add __str__ to autoPyTorchEnum (#405)
+[ADD] feature preprocessors from autosklearn (#378)
+[refactor] Fix SparseMatrixType --> spmatrix and add ispandas (#397)
+[ADD] dataset compression (#387)
+[fix] Update the SMAC version (#388)
+[feat] Add new task inference for APT (#386)
+[FIX] Datamanager in memory (#382)
+[FIX] Fix: keyword arguments to submit (#384)
+[feat] Add coalescer (#376)
+[FIX] Remove redundant categorical imputation (#375)
+[ADD] scalers from autosklearn (#372)
+[ADD] variance thresholding (#373)
+[fix] Change int to np.int32 for the ndarray dtype specification (#371)
+[fix] Hotfix debug no training in simple intensifier (#370)
+[ADD] Test evaluator (#368)
+[FIX] Fix 361 (#367)
+[FIX] fix error after merge
+[ADD] Docker publish workflow (#357)
+[ADD] fit pipeline honoring API constraints with tests (#348)
+[FIX] Update workflow files (#363)
+[feat] Add the option to save a figure in plot setting params (#351)
+[FIX] Cleanup of simple_imputer (#346)
+[feat] Add an object that realizes the perf over time viz (#331)
+
+Contributors v0.2
+*****************
+
+* Ravin Kohli
+* Shuhei Watanabe
+* Eddie Bergman
+* Difan Deng
+
 Version 0.1.1
 ==============
 [refactor] Completely refactored version with a new scikit-learn compatible API.
diff --git a/examples/20_basics/example_time_series_forecasting.py b/examples/20_basics/example_time_series_forecasting.py
new file mode 100644
index 000000000..a7adba025
--- /dev/null
+++ b/examples/20_basics/example_time_series_forecasting.py
@@ -0,0 +1,93 @@
+"""
+=======================
+Time Series Forecasting
+=======================
+
+The following example shows how to fit a sample forecasting model
+with AutoPyTorch. This is only a dummy example because of the limited size of the dataset;
+thus, it is possible that the AutoPyTorch model does not perform better than a dummy predictor.
+"""
+import os
+import tempfile as tmp
+import warnings
+import copy
+
+os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir()
+os.environ['OMP_NUM_THREADS'] = '1'
+os.environ['OPENBLAS_NUM_THREADS'] = '1'
+os.environ['MKL_NUM_THREADS'] = '1'
+
+warnings.simplefilter(action='ignore', category=UserWarning)
+warnings.simplefilter(action='ignore', category=FutureWarning)
+
+from sktime.datasets import load_longley
+targets, features = load_longley()
+
+forecasting_horizon = 3
+
+# A dataset for APT-TS can be a list of np.ndarray/pd.DataFrame, where each element in the
+# list represents one series, or a single pd.DataFrame that records all the series. In the
+# latter case, the id of the series that each time step belongs to can be stored as the
+# DataFrame's index or in a separate column.
+# Within each series, we take the last forecasting_horizon steps as test targets and the
+# items before them as training targets.
+# Normally, the values to be forecasted directly follow the training set.
+y_train = [targets[: -forecasting_horizon]]
+y_test = [targets[-forecasting_horizon:]]
+
+# The same applies to the features. For univariate models, X_train and X_test can be omitted or set to None
+X_train = [features[: -forecasting_horizon]]
+# Here X_test indicates the 'known future features': features whose future values are known in
+# advance. Features that are unknown could be replaced with NaN or zeros (which will not be used
+# by our networks). If no feature is known beforehand, we could also omit X_test
+known_future_features = list(features.columns)
+X_test = [features[-forecasting_horizon:]]
+
+start_times = [targets.index.to_timestamp()[0]]
+freq = '1Y'
+
+from autoPyTorch.api.time_series_forecasting import TimeSeriesForecastingTask
+############################################################################
+# Build and fit a forecaster
+# ==========================
+api = TimeSeriesForecastingTask()
+
+############################################################################
+# Search for an ensemble of machine learning algorithms
+# =====================================================
+api.search(
+ X_train=X_train,
+ y_train=copy.deepcopy(y_train),
+ X_test=X_test,
+ optimize_metric='mean_MASE_forecasting',
+ n_prediction_steps=forecasting_horizon,
+ memory_limit=16 * 1024, # Currently, forecasting models use much more memory
+ freq=freq,
+ start_times=start_times,
+ func_eval_time_limit_secs=50,
+ total_walltime_limit=60,
+ min_num_test_instances=1000, # proxy validation sets; this only works for tasks with more than 1000 series
+ known_future_features=known_future_features,
+)
+
+
+from autoPyTorch.datasets.time_series_dataset import TimeSeriesSequence
+
+test_sets = []
+
+# We could construct the test sets from scratch
+for feature, future_feature, target, start_time in zip(X_train, X_test, y_train, start_times):
+ test_sets.append(
+ TimeSeriesSequence(X=feature.values,
+ Y=target.values,
+ X_test=future_feature.values,
+ start_time=start_time,
+ is_test_set=True,
+ # additional information required to construct a new time series sequence
+ **api.dataset.sequences_builder_kwargs
+ )
+ )
+# Alternatively, if we only want to forecast the values right after X_train, we could directly
+# ask the datamanager to generate a test set:
+# test_sets2 = api.dataset.generate_test_seqs()
+
+pred = api.predict(test_sets)
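As the comments above note, the list-of-series format is not the only option; a single long Series/DataFrame with the series id in its index should also be accepted. A small hedged sketch of that alternative (the values are made up):

.. code:: python

    import pandas as pd

    # Two series, four observations each, identified through the index.
    y_long = pd.Series(
        [112.0, 118.0, 132.0, 129.0, 5.0, 8.0, 11.0, 14.0],
        index=[0, 0, 0, 0, 1, 1, 1, 1],
    )
    # Such an object could be passed as `y_train` to api.search(...) in place
    # of the list of series used in this example.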
+""" +import os +import tempfile as tmp +import warnings + +os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir() +os.environ['OMP_NUM_THREADS'] = '1' +os.environ['OPENBLAS_NUM_THREADS'] = '1' +os.environ['MKL_NUM_THREADS'] = '1' + +warnings.simplefilter(action='ignore', category=UserWarning) +warnings.simplefilter(action='ignore', category=FutureWarning) + +import openml +import sklearn.model_selection + +from autoPyTorch.api.tabular_classification import TabularClassificationTask + + +############################################################################ +# Data Loading +# ============ +task = openml.tasks.get_task(task_id=146821) +dataset = task.get_dataset() +X, y, categorical_indicator, _ = dataset.get_data( + dataset_format='array', + target=dataset.default_target_attribute, +) +X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X, + y, + random_state=1, +) + +feat_types = ["numerical" if not indicator else "categorical" for indicator in categorical_indicator] + +# +############################################################################ +# Build and fit a classifier +# ========================== +api = TabularClassificationTask( + # To maintain logs of the run, you can uncomment the + # Following lines + # temporary_directory='./tmp/autoPyTorch_example_tmp_01', + # output_directory='./tmp/autoPyTorch_example_out_01', + # delete_tmp_folder_after_terminate=False, + # delete_output_folder_after_terminate=False, + seed=42, +) + +############################################################################ +# Search for an ensemble of machine learning algorithms +# ===================================================== +api.search( + X_train=X_train, + y_train=y_train, + X_test=X_test.copy(), + y_test=y_test.copy(), + dataset_name='Australian', + optimize_metric='accuracy', + total_walltime_limit=100, + func_eval_time_limit_secs=50, + feat_types=feat_types, + enable_traditional_pipeline=False +) + +############################################################################ +# Print the final ensemble performance +# ==================================== +y_pred = api.predict(X_test) +score = api.score(y_pred, y_test) +print(score) +# Print the final ensemble built by AutoPyTorch +print(api.show_models()) + +# Print statistics from search +print(api.sprint_statistics()) diff --git a/examples/40_advanced/example_plot_over_time.py b/examples/40_advanced/example_plot_over_time.py new file mode 100644 index 000000000..cf672fc46 --- /dev/null +++ b/examples/40_advanced/example_plot_over_time.py @@ -0,0 +1,81 @@ +""" +============================== +Plot the Performance over Time +============================== + +Auto-Pytorch uses SMAC to fit individual machine learning algorithms +and then ensembles them together using `Ensemble Selection +`_. + +The following examples shows how to plot both the performance +of the individual models and their respective ensemble. + +Additionally, as we are compatible with matplotlib, +you can input any args or kwargs that are compatible with ax.plot. +In the case when you would like to create multipanel visualization, +please input plt.Axes obtained from matplotlib.pyplot.subplots. 
diff --git a/examples/40_advanced/example_plot_over_time.py b/examples/40_advanced/example_plot_over_time.py
new file mode 100644
index 000000000..cf672fc46
--- /dev/null
+++ b/examples/40_advanced/example_plot_over_time.py
@@ -0,0 +1,81 @@
+"""
+==============================
+Plot the Performance over Time
+==============================
+
+Auto-PyTorch uses SMAC to fit individual machine learning algorithms
+and then ensembles them together using `Ensemble Selection
+`_.
+
+The following example shows how to plot both the performance
+of the individual models and their respective ensemble.
+
+Additionally, as we are compatible with matplotlib,
+you can pass any args or kwargs that are compatible with ax.plot.
+If you would like to create a multipanel visualization,
+please pass the plt.Axes obtained from matplotlib.pyplot.subplots.
+
+"""
+import warnings
+
+import numpy as np
+import pandas as pd
+
+from sklearn import model_selection
+
+import matplotlib.pyplot as plt
+
+from autoPyTorch.api.tabular_classification import TabularClassificationTask
+from autoPyTorch.utils.results_visualizer import PlotSettingParams
+
+
+warnings.simplefilter(action='ignore', category=UserWarning)
+warnings.simplefilter(action='ignore', category=FutureWarning)
+
+
+############################################################################
+# Task Definition
+# ===============
+n_samples, dim = 100, 2
+X = np.random.random((n_samples, dim)) * 2 - 1
+y = ((X ** 2).sum(axis=-1) < 2 / np.pi).astype(np.int32)
+
+X, y = pd.DataFrame(X), pd.DataFrame(y)
+X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y)
+
+############################################################################
+# API Instantiation and Searching
+# ===============================
+api = TabularClassificationTask(seed=42)
+
+api.search(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test,
+ optimize_metric='accuracy', total_walltime_limit=120, func_eval_time_limit_secs=10)
+
+############################################################################
+# Create Setting Parameters Object
+# ================================
+metric_name = 'accuracy'
+
+params = PlotSettingParams(
+ xscale='log',
+ xlabel='Runtime',
+ ylabel='Accuracy',
+ title='Toy Example',
+ figname='example_plot_over_time.png',
+ savefig_kwargs={'bbox_inches': 'tight'},
+ show=False # If you would like to show the plot, set this to True and figname=None
+)
+
+############################################################################
+# Plot with the Specified Setting Parameters
+# ==========================================
+# _, ax = plt.subplots() <=== You can pass it in to post-process the figure.
+
+# You might need to run `export DISPLAY=:0.0` if you are using a non-GUI environment.
+api.plot_perf_over_time(
+ metric_name=metric_name,
+ plot_setting_params=params,
+ marker='*',
+ markersize=10
+)
diff --git a/examples/40_advanced/example_resampling_strategy.py b/examples/40_advanced/example_resampling_strategy.py
index d02859f1b..852375589 100644
--- a/examples/40_advanced/example_resampling_strategy.py
+++ b/examples/40_advanced/example_resampling_strategy.py
@@ -93,7 +93,7 @@
 ############################################################################
 # Search for an ensemble of machine learning algorithms
-# -----------------------------------------------------------------------
+# -----------------------------------------------------
 api.search(
 X_train=X_train,
@@ -107,7 +107,7 @@
 ############################################################################
 # Print the final ensemble performance
-# ------------
+# ------------------------------------
 y_pred = api.predict(X_test)
 score = api.score(y_pred, y_test)
 print(score)
diff --git a/examples/40_advanced/example_single_configuration.py b/examples/40_advanced/example_single_configuration.py
new file mode 100644
index 000000000..7f87c6de3
--- /dev/null
+++ b/examples/40_advanced/example_single_configuration.py
@@ -0,0 +1,81 @@
+# -*- encoding: utf-8 -*-
+"""
+==========================
+Fit a single configuration
+==========================
+*Auto-PyTorch* searches for the best combination of machine learning algorithms
+and their hyper-parameter configuration for a given task.
+This example shows how one can fit one of these pipelines, both with a user-defined
+configuration and with one randomly sampled from the configuration space.
+The pipelines that Auto-PyTorch fits are compatible with the Scikit-Learn API. You can
+get further documentation about Scikit-Learn models here: _
+"""
+import os
+import tempfile as tmp
+import warnings
+
+os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir()
+os.environ['OMP_NUM_THREADS'] = '1'
+os.environ['OPENBLAS_NUM_THREADS'] = '1'
+os.environ['MKL_NUM_THREADS'] = '1'
+
+warnings.simplefilter(action='ignore', category=UserWarning)
+warnings.simplefilter(action='ignore', category=FutureWarning)
+
+import sklearn.datasets
+import sklearn.metrics
+import sklearn.model_selection
+
+from autoPyTorch.api.tabular_classification import TabularClassificationTask
+from autoPyTorch.datasets.resampling_strategy import HoldoutValTypes
+
+
+############################################################################
+# Data Loading
+# ============
+
+X, y = sklearn.datasets.fetch_openml(data_id=3, return_X_y=True, as_frame=True)
+X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
+ X, y, test_size=0.5, random_state=3
+)
+
+############################################################################
+# Define an estimator
+# ===================
+
+estimator = TabularClassificationTask(
+ resampling_strategy=HoldoutValTypes.holdout_validation,
+ resampling_strategy_args={'val_share': 0.5},
+)
+
+############################################################################
+# Get a configuration of the pipeline for the current dataset
+# ===============================================================
+
+dataset = estimator.get_dataset(X_train=X_train,
+ y_train=y_train,
+ X_test=X_test,
+ y_test=y_test,
+ dataset_name='kr-vs-kp')
+configuration = estimator.get_search_space(dataset).get_default_configuration()
+
+print("Passed Configuration:", configuration)
+###########################################################################
+# Fit the configuration
+# =====================
+
+pipeline, run_info, run_value, dataset = estimator.fit_pipeline(dataset=dataset,
+ configuration=configuration,
+ budget_type='epochs',
+ budget=5,
+ run_time_limit_secs=75
+ )
+
+# The fit_pipeline command also returns a named tuple with the pipeline constraints
+print(run_info)
+
+# The fit_pipeline command also returns a named tuple with train/test performance
+print(run_value)
+
+# This object complies with the Scikit-Learn Pipeline API.
+# https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
+print(pipeline.named_steps)
diff --git a/examples/40_advanced/example_visualization.py b/examples/40_advanced/example_visualization.py
index 37c1c6dc3..a88899e81 100644
--- a/examples/40_advanced/example_visualization.py
+++ b/examples/40_advanced/example_visualization.py
@@ -149,18 +149,3 @@
 grid=True,
 )
 plt.show()
-
-# We then can understand the importance of each input feature using
-# a permutation importance analysis. This is done as a proof of concept, to
-# showcase that we can leverage of scikit-learn API. 
-result = permutation_importance(estimator, X_train, y_train, n_repeats=5, - scoring='accuracy', - random_state=seed) -sorted_idx = result.importances_mean.argsort() - -fig, ax = plt.subplots() -ax.boxplot(result.importances[sorted_idx].T, - vert=False, labels=X_test.columns[sorted_idx]) -ax.set_title("Permutation Importances (Train set)") -fig.tight_layout() -plt.show() diff --git a/requirements.txt b/requirements.txt index 6f81bfcb7..3f37e131c 100755 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ pandas -torch +torch>=1.10.1 torchvision tensorboard scikit-learn>=0.24.0,<0.25.0 @@ -7,10 +7,10 @@ numpy scipy>=1.7 lockfile imgaug>=0.4.0 -ConfigSpace>=0.4.14,<0.5 +ConfigSpace>=0.5.0 pynisher>=0.6.3 pyrfr>=0.7,<0.9 -smac==0.14.0 +smac>=1.2 dask distributed>=2.2.0 catboost diff --git a/setup.py b/setup.py index 96cafefe9..bd524276d 100755 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ # noinspection PyInterpreter setuptools.setup( name="autoPyTorch", - version="0.1.1", + version="0.2", author="AutoML Freiburg", author_email="eddiebergmanhs@gmail.com", description=("Auto-PyTorch searches neural architectures using smac"), @@ -32,6 +32,7 @@ keywords="machine learning algorithm configuration hyperparameter" "optimization tuning neural architecture deep learning", packages=setuptools.find_packages(), + package_data={"autoPyTorch": ['py.typed']}, classifiers=[ "Development Status :: 3 - Alpha", "Topic :: Utilities", @@ -47,6 +48,11 @@ install_requires=requirements, include_package_data=True, extras_require={ + "forecasting": [ + "gluonts>=0.10.0", + "sktime", + "pytorch-forecasting", + ], "test": [ "matplotlib", "pytest", @@ -57,6 +63,7 @@ "pre-commit", "pytest-cov", 'pytest-forked', + 'pytest-subtests', "codecov", "pep8", "mypy", @@ -70,6 +77,7 @@ "jupyter", "notebook", "seaborn", + "openml" ], "docs": ["sphinx", "sphinx-gallery", "sphinx_bootstrap_theme", "numpydoc"], }, diff --git a/test/conftest.py b/test/conftest.py index 604d8f00e..2cf976d7a 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -1,3 +1,4 @@ +import datetime import logging.handlers import os import re @@ -13,15 +14,21 @@ import pandas as pd + import pytest +from scipy import sparse + from sklearn.datasets import fetch_openml, make_classification, make_regression +from sklearn.utils import check_random_state import torch from autoPyTorch.automl_common.common.utils.backend import create from autoPyTorch.data.tabular_validator import TabularInputValidator +from autoPyTorch.data.time_series_forecasting_validator import TimeSeriesForecastingInputValidator from autoPyTorch.datasets.tabular_dataset import TabularDataset +from autoPyTorch.datasets.time_series_dataset import TimeSeriesForecastingDataset from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates from autoPyTorch.utils.pipeline import get_dataset_requirements @@ -52,6 +59,10 @@ def callattr_ahead_of_alltests(request): 4871, # sensory 4857, # boston 3916, # kc1 + 2295, # cholesterol + 3916, # kc1-binary + 293554, # reuters + 294846 # rf1 ] # Populate the cache @@ -458,6 +469,16 @@ def loss_mse(): return dataset_properties, predictions, name, targets, labels +@pytest.fixture +def loss_mape(): + dataset_properties = {'task_type': 'time_series_forecasting', 'output_type': 'continuous'} + predictions = torch.randn(4) + name = 'MAPELoss' + targets = torch.randn(4) + labels = None + return dataset_properties, predictions, name, targets, labels + + @pytest.fixture def loss_details(request): return 
request.getfixturevalue(request.param) @@ -466,3 +487,415 @@ def loss_details(request): @pytest.fixture def n_samples(): return N_SAMPLES + + +# Fixtures for input validators. By default all elements have 100 datapoints +@pytest.fixture +def input_data_featuretest(request): + if request.param == 'numpy_categoricalonly_nonan': + return np.random.randint(10, size=(100, 10)) + elif request.param == 'numpy_numericalonly_nonan': + return np.random.uniform(10, size=(100, 10)) + elif request.param == 'numpy_mixed_nonan': + return np.column_stack([ + np.random.uniform(10, size=(100, 3)), + np.random.randint(10, size=(100, 3)), + np.random.uniform(10, size=(100, 3)), + np.random.randint(10, size=(100, 1)), + ]) + elif request.param == 'numpy_string_nonan': + return np.array([ + ['a', 'b', 'c', 'a', 'b', 'c'], + ['a', 'b', 'd', 'r', 'b', 'c'], + ]) + elif request.param == 'numpy_categoricalonly_nan': + array = np.random.randint(10, size=(100, 10)).astype('float') + array[50, 0:5] = np.nan + return array + elif request.param == 'numpy_numericalonly_nan': + array = np.full(fill_value=10.0, shape=(100, 10), dtype=np.float64) + array[50, 0:5] = np.nan + # Somehow array is changed to dtype object after np.nan + return array.astype('float') + elif request.param == 'numpy_mixed_nan': + array = np.column_stack([ + np.random.uniform(10, size=(100, 3)), + np.random.randint(10, size=(100, 3)), + np.random.uniform(10, size=(100, 3)), + np.random.randint(10, size=(100, 1)), + ]) + array[50, 0:5] = np.nan + return array + elif request.param == 'numpy_string_nan': + return np.array([ + ['a', 'b', 'c', 'a', 'b', 'c'], + [np.nan, 'b', 'd', 'r', 'b', 'c'], + ]) + elif request.param == 'pandas_categoricalonly_nonan': + return pd.DataFrame([ + {'A': 1, 'B': 2}, + {'A': 3, 'B': 4}, + ], dtype='category') + elif request.param == 'pandas_numericalonly_nonan': + return pd.DataFrame([ + {'A': 1, 'B': 2}, + {'A': 3, 'B': 4}, + ], dtype='float') + elif request.param == 'pandas_mixed_nonan': + frame = pd.DataFrame([ + {'A': 1, 'B': 2}, + {'A': 3, 'B': 4}, + ], dtype='category') + frame['B'] = pd.to_numeric(frame['B']) + return frame + elif request.param == 'pandas_categoricalonly_nan': + return pd.DataFrame([ + {'A': 1, 'B': 2, 'C': np.nan}, + {'A': 3, 'C': np.nan}, + ], dtype='category') + elif request.param == 'pandas_numericalonly_nan': + return pd.DataFrame([ + {'A': 1, 'B': 2, 'C': np.nan}, + {'A': 3, 'C': np.nan}, + ], dtype='float') + elif request.param == 'pandas_mixed_nan': + frame = pd.DataFrame([ + {'A': 1, 'B': 2, 'C': 8}, + {'A': 3, 'B': 4}, + ], dtype='category') + frame['B'] = pd.to_numeric(frame['B']) + return frame + elif request.param == 'pandas_string_nonan': + return pd.DataFrame([ + {'A': 1, 'B': 2}, + {'A': 3, 'B': 4}, + ], dtype='string') + elif request.param == 'list_categoricalonly_nonan': + return [ + ['a', 'b', 'c', 'd'], + ['e', 'f', 'c', 'd'], + ] + elif request.param == 'list_numericalonly_nonan': + return [ + [1, 2, 3, 4], + [5, 6, 7, 8] + ] + elif request.param == 'list_mixed_nonan': + return [ + ['a', 2, 3, 4], + ['b', 6, 7, 8] + ] + elif request.param == 'list_categoricalonly_nan': + return [ + ['a', 'b', 'c', np.nan], + ['e', 'f', 'c', 'd'], + ] + elif request.param == 'list_numericalonly_nan': + return [ + [1, 2, 3, np.nan], + [5, 6, 7, 8] + ] + elif request.param == 'list_mixed_nan': + return [ + ['a', np.nan, 3, 4], + ['b', 6, 7, 8] + ] + elif 'sparse' in request.param: + # We expect the names to be of the type sparse_csc_nonan + sparse_, type_, nan_ = request.param.split('_') + if 'nonan' 
in nan_:
+ data = np.ones(3)
+ else:
+ data = np.array([1, 2, np.nan])
+
+ # Then the type of sparse
+ row_ind = np.array([0, 1, 2])
+ col_ind = np.array([1, 2, 1])
+ if 'csc' in type_:
+ return sparse.csc_matrix((data, (row_ind, col_ind)))
+ elif 'csr' in type_:
+ return sparse.csr_matrix((data, (row_ind, col_ind)))
+ elif 'coo' in type_:
+ return sparse.coo_matrix((data, (row_ind, col_ind)))
+ elif 'bsr' in type_:
+ return sparse.bsr_matrix((data, (row_ind, col_ind)))
+ elif 'lil' in type_:
+ return sparse.lil_matrix((data))
+ elif 'dok' in type_:
+ return sparse.dok_matrix(np.vstack((data, data, data)))
+ elif 'dia' in type_:
+ return sparse.dia_matrix(np.vstack((data, data, data)))
+ else:
+ raise ValueError("Unsupported indirect fixture {}".format(request.param))
+ elif 'openml' in request.param:
+ _, openml_id = request.param.split('_')
+ X, y = fetch_openml(data_id=int(openml_id),
+ return_X_y=True, as_frame=True)
+ return X
+ else:
+ raise ValueError("Unsupported indirect fixture {}".format(request.param))
+
+
+# Forecasting tasks
+def get_forecasting_data(request):
+ uni_variant = False
+ with_missing_values = False
+ type_X = 'pd'
+ with_series_id = False
+ if request == 'uni_variant_wo_missing':
+ uni_variant = True
+ elif request == 'uni_variant_w_missing':
+ uni_variant = True
+ with_missing_values = True
+ elif request == 'multi_variant_wo_missing':
+ with_missing_values = False
+ elif request == 'multi_variant_w_missing':
+ with_missing_values = True
+
+ generator = check_random_state(0)
+ n_seq = 10
+ base_length = 50
+ targets = []
+
+ start_times = []
+ # the first character indicates the type of the feature:
+ # n: numerical, c: categorical, s: static
+ # for categorical features, the following character indicates how the feature is stored:
+ # s: stored as string; n: stored as integer
+ if type_X == 'pd':
+ if 'only_cat' in request:
+ feature_columns = ['cs2_10', 'cn4_5']
+ elif 'only_num' in request:
+ feature_columns = ['n1', 'n3', 'n5']
+ else:
+ feature_columns = ['n1', 'cs2_10', 'n3', 'cn4_5', 'n5']
+ else:
+ if 'only_cat' in request:
+ feature_columns = ['cn2_5', 'cn4_5']
+ elif 'only_num' in request:
+ feature_columns = ['n1', 'n3', 'n5']
+ else:
+ feature_columns = ['n1', 'cn2_5', 'n3', 'cn4_5', 'n5']
+
+ def generate_forecasting_features(feature_type, length):
+ feature_type_content = list(feature_type)
+ if feature_type_content[0] == 'n':
+ # numerical features
+ return generator.rand(length)
+ elif feature_type_content[0] == 'c':
+ num_class = int(feature_type.split("_")[-1])
+ if feature_type_content[1] == 's':
+ return generator.choice([f'value_{feature_id}' for feature_id in range(num_class)],
+ size=length, replace=True)
+ elif feature_type_content[1] == 'n':
+ return generator.choice(list(range(num_class)), size=length, replace=True)
+ else:
+ raise NotImplementedError
+ else:
+ raise NotImplementedError
+
+ features = []
+ for i in range(n_seq):
+ series_length = base_length + i * 10
+ new_seq = np.arange(i * 1000, series_length + i * 1000).astype(float)
+
+ # append the array itself (not a fresh copy) so that the missing values
+ # inserted below actually end up in the targets
+ targets.append(new_seq)
+ if not uni_variant:
+ if type_X == 'np':
+ feature = np.asarray([generate_forecasting_features(col, series_length) for col in feature_columns])
+ elif type_X == 'pd':
+ feature = {col: generate_forecasting_features(col, series_length) for col in feature_columns}
+ if with_series_id:
+ feature["series_id"] = [i] * series_length
+ feature = pd.DataFrame(
+ feature
+ )
+
+ for col in feature.columns:
+ if col.startswith("n"):
+ feature[col] = feature[col].astype('float')
+ elif col.startswith("cs"):
+ feature[col] = feature[col].astype('category')
+ elif col.startswith("cn"):
+ feature[col] = feature[col].astype('int')
+ else:
+ raise NotImplementedError
+ features.append(feature)
+
+ if with_missing_values:
+ new_seq[5] = np.NAN
+ new_seq[-5] = np.NAN
+
+ start_time = datetime.datetime.strptime(f'190{i // 5}-01-01 00-00-00', '%Y-%m-%d %H-%M-%S')
+ start_times.append(start_time)
+ input_validator = TimeSeriesForecastingInputValidator(is_classification=False)
+ features = features if len(features) > 0 else None
+ return features, targets, input_validator.fit(features, targets, start_times=start_times)
+
+
+def get_forecasting_datamangaer(X, y, validator, with_y_test=True, forecast_horizon=3, freq='1D'):
+ if X is not None:
+ X_test = []
+ for x in X:
+ if hasattr(x, 'iloc'):
+ X_test.append(x.iloc[-forecast_horizon:].copy())
+ else:
+ X_test.append(x[-forecast_horizon:].copy())
+ known_future_features = tuple(X[0].columns) if isinstance(X[0], pd.DataFrame) else \
+ np.arange(X[0].shape[-1]).tolist()
+ else:
+ X_test = None
+ known_future_features = None
+
+ if with_y_test:
+ y_test = []
+ for y_seq in y:
+ if hasattr(y_seq, 'iloc'):
+ y_test.append(y_seq.iloc[-forecast_horizon:].copy() + 1)
+ else:
+ y_test.append(y_seq[-forecast_horizon:].copy() + 1)
+ else:
+ y_test = None
+ datamanager = TimeSeriesForecastingDataset(
+ X=X, Y=y,
+ X_test=X_test,
+ Y_test=y_test,
+ validator=validator,
+ freq=freq,
+ n_prediction_steps=forecast_horizon,
+ known_future_features=known_future_features
+ )
+ return datamanager
+
+
+def get_forecasting_fit_dictionary(datamanager, backend, forecasting_budgets='epochs'):
+ info = datamanager.get_required_dataset_info()
+
+ dataset_properties = datamanager.get_dataset_properties(get_dataset_requirements(info))
+
+ fit_dictionary = {
+ 'X_train': datamanager.train_tensors[0],
+ 'y_train': datamanager.train_tensors[1],
+ 'dataset_properties': dataset_properties,
+ # Training configuration
+ 'num_run': 1,
+ 'working_dir': './tmp/example_ensemble_1', # Hopefully generated by backend
+ 'device': 'cpu',
+ 'torch_num_threads': 1,
+ 'early_stopping': 1,
+ 'use_tensorboard_logger': False,
+ 'use_pynisher': False,
+ 'metrics_during_training': False,
+ 'seed': 1,
+ 'budget_type': 'epochs',
+ 'epochs': 1,
+ 'split_id': 0,
+ 'backend': backend,
+ 'logger_port': logging.handlers.DEFAULT_TCP_LOGGING_PORT,
+ }
+ if forecasting_budgets == 'epochs':
+ fit_dictionary.update({'forecasting_budgets': 'epochs',
+ 'epochs': 1})
+ elif forecasting_budgets == 'resolution':
+ fit_dictionary.update({'forecasting_budgets': 'resolution',
+ 'sample_interval': 2})
+ elif forecasting_budgets == 'num_sample_per_seq':
+ fit_dictionary.update({'forecasting_budgets': 'num_sample_per_seq',
+ 'fraction_samples_per_seq': 0.5})
+ elif forecasting_budgets == 'num_seq':
+ fit_dictionary.update({'forecasting_budgets': 'num_seq',
+ 'fraction_seq': 0.5})
+ else:
+ raise NotImplementedError
+ backend.save_datamanager(datamanager)
+ return fit_dictionary
+
+
+# Fixtures for forecasting input validators
+@pytest.fixture
+def input_data_forecastingfeaturetest(request):
+ if request.param == 'numpy_nonan':
+ return np.random.uniform(10, size=(100, 10)), None, None
+ elif request.param == 'numpy_with_static':
+ return np.zeros([2, 3], dtype=int), None, None
+ elif request.param == 'numpy_with_seq_length':
+ return np.zeros([5, 3], dtype=int), None, [2, 3]
+ elif request.param == 'pandas_wo_seriesid':
+ return pd.DataFrame([
+ {'A': 1, 'B': 2},
{'A': 3, 'B': 4},
+ ], dtype='category'), None, [2]
+ elif request.param == 'pandas_w_seriesid':
+ return pd.DataFrame([
+ {'A': 1, 'B': 0},
+ {'A': 0, 'B': 1},
+ ], dtype='category'), 'A', [2]
+ elif request.param == 'pandas_only_seriesid':
+ return pd.DataFrame([
+ {'A': 1, 'B': 0},
+ {'A': 0, 'B': 1},
+ ], dtype='category'), ['A', 'B'], [2]
+ elif request.param == 'pandas_without_seriesid':
+ return pd.DataFrame([
+ {'A': 1, 'B': 2},
+ {'A': 3, 'B': 4},
+ ], dtype='category'), None, [2]
+ elif request.param == 'pandas_with_static_features':
+ return pd.DataFrame([
+ {'A': 1, 'B': 2},
+ {'A': 1, 'B': 4},
+ ], dtype='category'), None, [2]
+ elif request.param == 'pandas_multi_seq':
+ return pd.DataFrame([
+ {'A': 1, 'B': 2},
+ {'A': 1, 'B': 4},
+ {'A': 3, 'B': 2},
+ {'A': 2, 'B': 4},
+ ], dtype='category'), None, [2, 2]
+ elif request.param == 'pandas_multi_seq_w_idx':
+ return pd.DataFrame([
+ {'A': 1, 'B': 2},
+ {'A': 1, 'B': 4},
+ {'A': 3, 'B': 2},
+ {'A': 2, 'B': 4},
+ ], dtype='category', index=[0, 0, 1, 1]), None, None
+ elif request.param == 'pandas_with_static_features_multi_series':
+ return pd.DataFrame([
+ {'A': 1, 'B': 2},
+ {'A': 1, 'B': 2},
+ {'A': 2, 'B': 3},
+ {'A': 2, 'B': 3},
+ ], dtype='category'), 'A', None
+ else:
+ raise ValueError("Unsupported indirect fixture {}".format(request.param))
+
+
+@pytest.fixture(scope="class")
+def get_forecasting_datamanager(request):
+ X, y, validator = get_forecasting_data(request.param)
+ datamanager = get_forecasting_datamangaer(X, y, validator)
+ return datamanager
+
+
+@pytest.fixture
+def forecasting_toy_dataset(request):
+ x, y, _ = get_forecasting_data(request.param)
+ return x, y
+
+
+@pytest.fixture(params=['epochs'])
+def forecasting_budgets(request):
+ return request.param
+
+
+@pytest.fixture
+def fit_dictionary_forecasting(request, forecasting_budgets, backend):
+ X, y, validator = get_forecasting_data(request.param)
+ datamanager = get_forecasting_datamangaer(X, y, validator)
+ return get_forecasting_fit_dictionary(datamanager, backend, forecasting_budgets=forecasting_budgets)
+
+
+# Fixtures for forecasting validators.
+@pytest.fixture +def input_data_forecasting_featuretest(request): + return [input_data_featuretest(request) for _ in range(3)] diff --git a/test/test_api/.tmp_api/runhistory.json b/test/test_api/.tmp_api/runhistory.json index 6f61e1395..28c0cbd32 100644 --- a/test/test_api/.tmp_api/runhistory.json +++ b/test/test_api/.tmp_api/runhistory.json @@ -705,6 +705,7 @@ "1": { "data_loader:batch_size": 64, "encoder:__choice__": "NoEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "NoFeaturePreprocessor", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "ReduceLROnPlateau", @@ -737,6 +738,7 @@ "2": { "data_loader:batch_size": 101, "encoder:__choice__": "NoEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "PowerTransformer", "imputer:numerical_strategy": "most_frequent", "lr_scheduler:__choice__": "CyclicLR", @@ -801,6 +803,7 @@ "3": { "data_loader:batch_size": 242, "encoder:__choice__": "NoEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "RandomKitchenSinks", "imputer:numerical_strategy": "median", "lr_scheduler:__choice__": "NoScheduler", @@ -831,6 +834,7 @@ "4": { "data_loader:batch_size": 115, "encoder:__choice__": "NoEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "Nystroem", "imputer:numerical_strategy": "median", "lr_scheduler:__choice__": "CosineAnnealingLR", @@ -864,6 +868,7 @@ "5": { "data_loader:batch_size": 185, "encoder:__choice__": "NoEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "RandomKitchenSinks", "imputer:numerical_strategy": "median", "lr_scheduler:__choice__": "ReduceLROnPlateau", @@ -904,6 +909,7 @@ "6": { "data_loader:batch_size": 95, "encoder:__choice__": "NoEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "RandomKitchenSinks", "imputer:numerical_strategy": "most_frequent", "lr_scheduler:__choice__": "ExponentialLR", @@ -937,6 +943,7 @@ "7": { "data_loader:batch_size": 119, "encoder:__choice__": "NoEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "Nystroem", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "StepLR", @@ -979,6 +986,7 @@ "8": { "data_loader:batch_size": 130, "encoder:__choice__": "NoEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "PolynomialFeatures", "imputer:numerical_strategy": "median", "lr_scheduler:__choice__": "CyclicLR", @@ -1032,6 +1040,7 @@ "9": { "data_loader:batch_size": 137, "encoder:__choice__": "NoEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "Nystroem", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", diff --git a/test/test_api/test_api.py b/test/test_api/test_api.py index 5cb271eb0..465d74c6b 100644 --- a/test/test_api/test_api.py +++ b/test/test_api/test_api.py @@ -2,8 +2,13 @@ import os import pathlib import pickle +import tempfile import unittest -from test.test_api.utils import dummy_do_dummy_prediction, dummy_eval_function +from test.test_api.utils import ( + dummy_do_dummy_prediction, + dummy_eval_train_function, + dummy_forecasting_eval_train_function +) import ConfigSpace as CS from ConfigSpace.configuration_space import Configuration @@ -17,17 +22,19 @@ import sklearn import sklearn.datasets -from sklearn.base import BaseEstimator -from sklearn.base import clone +from sklearn.base import BaseEstimator, clone from sklearn.ensemble 
import VotingClassifier, VotingRegressor -from smac.runhistory.runhistory import RunHistory +from smac.runhistory.runhistory import RunHistory, RunInfo, RunValue from autoPyTorch.api.tabular_classification import TabularClassificationTask from autoPyTorch.api.tabular_regression import TabularRegressionTask +from autoPyTorch.api.time_series_forecasting import TimeSeriesForecastingTask +from autoPyTorch.datasets.base_dataset import BaseDataset from autoPyTorch.datasets.resampling_strategy import ( CrossValTypes, HoldoutValTypes, + NoResamplingStrategyTypes, ) from autoPyTorch.optimizer.smbo import AutoMLSMBO from autoPyTorch.pipeline.base_pipeline import BasePipeline @@ -41,8 +48,8 @@ # Test # ==== -@unittest.mock.patch('autoPyTorch.evaluation.train_evaluator.eval_function', - new=dummy_eval_function) +@unittest.mock.patch('autoPyTorch.evaluation.tae.eval_train_function', + new=dummy_eval_train_function) @pytest.mark.parametrize('openml_id', (40981, )) @pytest.mark.parametrize('resampling_strategy,resampling_strategy_args', ((HoldoutValTypes.holdout_validation, None), @@ -216,13 +223,10 @@ def test_tabular_classification(openml_id, resampling_strategy, backend, resampl # Make sure that a configuration space is stored in the estimator assert isinstance(estimator.get_search_space(), CS.ConfigurationSpace) - # test fit on dummy data - assert isinstance(estimator.fit(dataset=backend.load_datamanager()), BasePipeline) - @pytest.mark.parametrize('openml_name', ("boston", )) -@unittest.mock.patch('autoPyTorch.evaluation.train_evaluator.eval_function', - new=dummy_eval_function) +@unittest.mock.patch('autoPyTorch.evaluation.tae.eval_train_function', + new=dummy_eval_train_function) @pytest.mark.parametrize('resampling_strategy,resampling_strategy_args', ((HoldoutValTypes.holdout_validation, None), (CrossValTypes.k_fold_cross_validation, {'num_splits': CV_NUM_SPLITS}) @@ -406,6 +410,181 @@ def test_tabular_regression(openml_name, resampling_strategy, backend, resamplin assert 'Estimator' in representation +@pytest.mark.parametrize('forecasting_toy_dataset', ['uni_variant_wo_missing'], indirect=True) +@unittest.mock.patch('autoPyTorch.evaluation.tae.forecasting_eval_train_function', + new=dummy_forecasting_eval_train_function) +@pytest.mark.parametrize('resampling_strategy,resampling_strategy_args', + ((HoldoutValTypes.time_series_hold_out_validation, None), + (CrossValTypes.time_series_cross_validation, {'num_splits': CV_NUM_SPLITS}), + )) +def test_time_series_forecasting(forecasting_toy_dataset, resampling_strategy, backend, resampling_strategy_args): + forecast_horizon = 3 + freq = '1Y' + X, Y = forecasting_toy_dataset + + if X is not None: + X_train = [] + X_test = [] + for x in X: + if hasattr(x, 'iloc'): + X_train.append(x.iloc[:-forecast_horizon].copy()) + X_test.append(x.iloc[-forecast_horizon:].copy()) + else: + X_train.append(x[:-forecast_horizon].copy()) + X_test.append(x[-forecast_horizon:].copy()) + known_future_features = tuple(X[0].columns) if isinstance(X[0], pd.DataFrame) else \ + np.arange(X[0].shape[-1]).tolist() + else: + X_train = None + X_test = None + known_future_features = None + + y_train = [] + y_test = [] + + for y in Y: + if hasattr(y, 'iloc'): + y_train.append(y.iloc[:-forecast_horizon].copy()) + y_test.append(y.iloc[-forecast_horizon:].copy()) + else: + y_train.append(y[:-forecast_horizon].copy()) + y_test.append(y[-forecast_horizon:].copy()) + + # Search for a good configuration + # patch.mock is not applied to partial func. 
We only test lightweight FFNN networks + estimator = TimeSeriesForecastingTask( + backend=backend, + resampling_strategy=resampling_strategy, + resampling_strategy_args=resampling_strategy_args, + ensemble_size=2, + seed=42, + ) + + with unittest.mock.patch.object(estimator, '_do_dummy_prediction', new=dummy_do_dummy_prediction): + estimator.search( + X_train=X_train, + y_train=y_train, + X_test=X_test, + y_test=y_test, + memory_limit=None, + optimize_metric='mean_MSE_forecasting', + n_prediction_steps=forecast_horizon, + freq=freq, + total_walltime_limit=30, + func_eval_time_limit_secs=10, + known_future_features=known_future_features, + ) + + # Internal dataset has expected settings + assert estimator.dataset.task_type == 'time_series_forecasting' + expected_num_splits = HOLDOUT_NUM_SPLITS if resampling_strategy == HoldoutValTypes.time_series_hold_out_validation \ + else CV_NUM_SPLITS + assert estimator.resampling_strategy == resampling_strategy + assert estimator.dataset.resampling_strategy == resampling_strategy + + assert len(estimator.dataset.splits) == expected_num_splits + + # Check for the created files + tmp_dir = estimator._backend.temporary_directory + loaded_datamanager = estimator._backend.load_datamanager() + assert len(loaded_datamanager.train_tensors) == len(estimator.dataset.train_tensors) + + expected_files = [ + 'smac3-output/run_42/configspace.json', + 'smac3-output/run_42/runhistory.json', + 'smac3-output/run_42/scenario.txt', + 'smac3-output/run_42/stats.json', + 'smac3-output/run_42/train_insts.txt', + 'smac3-output/run_42/trajectory.json', + '.autoPyTorch/datamanager.pkl', + '.autoPyTorch/ensemble_read_preds.pkl', + '.autoPyTorch/start_time_42', + '.autoPyTorch/ensemble_history.json', + '.autoPyTorch/ensemble_read_losses.pkl', + '.autoPyTorch/true_targets_ensemble.npy', + ] + for expected_file in expected_files: + assert os.path.exists(os.path.join(tmp_dir, expected_file)), expected_file + + # Check that smac was able to find proper models + succesful_runs = [run_value.status for run_value in estimator.run_history.data.values( + ) if 'SUCCESS' in str(run_value.status)] + assert len(succesful_runs) >= 1, [(k, v) for k, v in estimator.run_history.data.items()] + + # Search for an existing run key in disc. 
An individual model might have
+ # a timeout and hence was not written to disc
+ successful_num_run = None
+ SUCCESS = False
+ for i, (run_key, value) in enumerate(estimator.run_history.data.items()):
+ if 'SUCCESS' in str(value.status):
+ run_key_model_run_dir = estimator._backend.get_numrun_directory(
+ estimator.seed, run_key.config_id + 1, run_key.budget)
+ successful_num_run = run_key.config_id + 1
+ if os.path.exists(run_key_model_run_dir):
+ # Runkey config id is different from the num_run
+ # more specifically num_run = config_id + 1(dummy)
+ SUCCESS = True
+ break
+
+ assert SUCCESS, f"Successful run was not properly saved for num_run: {successful_num_run}"
+
+ if resampling_strategy == HoldoutValTypes.time_series_hold_out_validation:
+ model_file = os.path.join(run_key_model_run_dir,
+ f"{estimator.seed}.{successful_num_run}.{run_key.budget}.model")
+ assert os.path.exists(model_file), model_file
+ model = estimator._backend.load_model_by_seed_and_id_and_budget(
+ estimator.seed, successful_num_run, run_key.budget)
+ elif resampling_strategy == CrossValTypes.time_series_cross_validation:
+ model_file = os.path.join(
+ run_key_model_run_dir,
+ f"{estimator.seed}.{successful_num_run}.{run_key.budget}.cv_model"
+ )
+ assert os.path.exists(model_file), model_file
+ model = estimator._backend.load_cv_model_by_seed_and_id_and_budget(
+ estimator.seed, successful_num_run, run_key.budget)
+ assert isinstance(model, VotingRegressor)
+ assert len(model.estimators_) == CV_NUM_SPLITS
+ else:
+ pytest.fail(resampling_strategy)
+
+ # Make sure that predictions on the test data are printed and make sense
+ test_prediction = os.path.join(run_key_model_run_dir,
+ estimator._backend.get_prediction_filename(
+ 'test', estimator.seed, successful_num_run,
+ run_key.budget))
+ assert os.path.exists(test_prediction), test_prediction
+ assert np.shape(np.load(test_prediction, allow_pickle=True))[0] == forecast_horizon * np.shape(y_test)[0]
+
+ # Also, for ensemble builder, the OOF predictions should be there and match
+ # the ground truth that is also physically printed to disk
+ ensemble_prediction = os.path.join(run_key_model_run_dir,
+ estimator._backend.get_prediction_filename(
+ 'ensemble',
+ estimator.seed, successful_num_run,
+ run_key.budget))
+ assert os.path.exists(ensemble_prediction), ensemble_prediction
+ assert np.shape(np.load(ensemble_prediction, allow_pickle=True))[0] == np.shape(
+ estimator._backend.load_targets_ensemble()
+ )[0]
+
+ # Ensemble Builder produced an ensemble
+ assert estimator.ensemble_ is not None
+
+ # There should be a weight for each element of the ensemble
+ assert len(estimator.ensemble_.identifiers_) == len(estimator.ensemble_.weights_)
+
+ X_test = backend.load_datamanager().generate_test_seqs()
+
+ y_pred = estimator.predict(X_test)
+
+ assert np.shape(y_pred) == np.shape(y_test)
+
+ # Test refit on dummy data
+ estimator.refit(dataset=backend.load_datamanager())
+ # Make sure that a configuration space is stored in the estimator
+ assert isinstance(estimator.get_search_space(), CS.ConfigurationSpace)
+
+
@pytest.mark.parametrize('openml_id', (
 1590, # Adult to test NaN in categorical columns
))
@@ -467,7 +646,7 @@ def test_do_dummy_prediction(dask_client, fit_dictionary_tabular):
 estimator._all_supported_metrics = False
 
 with pytest.raises(ValueError, match=r".*Dummy prediction failed with run state.*"):
- with 
unittest.mock.patch('autoPyTorch.evaluation.tae.eval_train_function') as dummy: dummy.side_effect = MemoryError estimator._do_dummy_prediction() @@ -498,8 +677,8 @@ def test_do_dummy_prediction(dask_client, fit_dictionary_tabular): del estimator -@unittest.mock.patch('autoPyTorch.evaluation.train_evaluator.eval_function', - new=dummy_eval_function) +@unittest.mock.patch('autoPyTorch.evaluation.tae.eval_train_function', + new=dummy_eval_train_function) @pytest.mark.parametrize('openml_id', (40981, )) def test_portfolio_selection(openml_id, backend, n_samples): @@ -540,8 +719,8 @@ def test_portfolio_selection(openml_id, backend, n_samples): assert any(successful_config in portfolio_configs for successful_config in successful_configs) -@unittest.mock.patch('autoPyTorch.evaluation.train_evaluator.eval_function', - new=dummy_eval_function) +@unittest.mock.patch('autoPyTorch.evaluation.tae.eval_train_function', + new=dummy_eval_train_function) @pytest.mark.parametrize('openml_id', (40981, )) def test_portfolio_selection_failure(openml_id, backend, n_samples): @@ -645,3 +824,289 @@ def test_build_pipeline(api_type, fit_dictionary_tabular): pipeline = api.build_pipeline(fit_dictionary_tabular['dataset_properties']) assert isinstance(pipeline, BaseEstimator) assert len(pipeline.steps) > 0 + + +@pytest.mark.parametrize("disable_file_output", [['all'], None]) +@pytest.mark.parametrize('openml_id', (40984,)) +@pytest.mark.parametrize('resampling_strategy,resampling_strategy_args', + ((HoldoutValTypes.holdout_validation, {'val_share': 0.8}), + (CrossValTypes.k_fold_cross_validation, {'num_splits': 2}), + (NoResamplingStrategyTypes.no_resampling, {}) + ) + ) +@pytest.mark.parametrize("budget", [15, 20]) +def test_pipeline_fit(openml_id, + resampling_strategy, + resampling_strategy_args, + backend, + disable_file_output, + budget, + n_samples): + # Get the data and check that contents of data-manager make sense + X, y = sklearn.datasets.fetch_openml( + data_id=int(openml_id), + return_X_y=True, as_frame=True + ) + X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X[:n_samples], y[:n_samples], random_state=1) + + # Search for a good configuration + estimator = TabularClassificationTask( + backend=backend, + resampling_strategy=resampling_strategy, + ensemble_size=0 + ) + + dataset = estimator.get_dataset(X_train=X_train, + y_train=y_train, + X_test=X_test, + y_test=y_test, + resampling_strategy=resampling_strategy, + resampling_strategy_args=resampling_strategy_args) + + configuration = estimator.get_search_space(dataset).get_default_configuration() + pipeline, run_info, run_value, dataset = estimator.fit_pipeline(dataset=dataset, + configuration=configuration, + run_time_limit_secs=50, + disable_file_output=disable_file_output, + budget_type='epochs', + budget=budget + ) + assert isinstance(dataset, BaseDataset) + assert isinstance(run_info, RunInfo) + assert isinstance(run_info.config, Configuration) + + assert isinstance(run_value, RunValue) + assert 'SUCCESS' in str(run_value.status) + + if disable_file_output is None: + if resampling_strategy in CrossValTypes: + assert isinstance(pipeline, BaseEstimator) + X_test = dataset.test_tensors[0] + preds = pipeline.predict_proba(X_test) + assert isinstance(preds, np.ndarray) + + score = accuracy(dataset.test_tensors[1], preds) + assert isinstance(score, float) + assert score > 0.65 + else: + assert isinstance(pipeline, BasePipeline) + # To make sure we fitted the model, there should be a + # run summary object with accuracy + 
run_summary = pipeline.named_steps['trainer'].run_summary + assert run_summary is not None + X_test = dataset.test_tensors[0] + preds = pipeline.predict(X_test) + assert isinstance(preds, np.ndarray) + + score = accuracy(dataset.test_tensors[1], preds) + assert isinstance(score, float) + assert score > 0.65 + else: + assert pipeline is None + assert run_value.cost < 0.35 + + # Make sure that the pipeline can be pickled + dump_file = os.path.join(tempfile.gettempdir(), 'automl.dump.pkl') + with open(dump_file, 'wb') as f: + pickle.dump(pipeline, f) + + num_run_dir = estimator._backend.get_numrun_directory( + run_info.seed, run_value.additional_info['num_run'], budget=float(budget)) + + cv_model_path = os.path.join(num_run_dir, estimator._backend.get_cv_model_filename( + run_info.seed, run_value.additional_info['num_run'], budget=float(budget))) + model_path = os.path.join(num_run_dir, estimator._backend.get_model_filename( + run_info.seed, run_value.additional_info['num_run'], budget=float(budget))) + + if disable_file_output: + # No file output is expected + assert not os.path.exists(num_run_dir) + else: + # We expect the model path always + # And the cv model only on 'cv' + assert os.path.exists(model_path) + if resampling_strategy in CrossValTypes: + assert os.path.exists(cv_model_path) + elif resampling_strategy in HoldoutValTypes: + assert not os.path.exists(cv_model_path) + + +@pytest.mark.parametrize('openml_id', (40984,)) +@pytest.mark.parametrize('resampling_strategy,resampling_strategy_args', + ((HoldoutValTypes.holdout_validation, {'val_share': 0.8}), + ) + ) +def test_pipeline_fit_error( + openml_id, + resampling_strategy, + resampling_strategy_args, + backend, + n_samples +): + # Get the data and check that contents of data-manager make sense + X, y = sklearn.datasets.fetch_openml( + data_id=int(openml_id), + return_X_y=True, as_frame=True + ) + X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X[:n_samples], y[:n_samples], random_state=1) + + # Search for a good configuration + estimator = TabularClassificationTask( + backend=backend, + resampling_strategy=resampling_strategy, + ) + + dataset = estimator.get_dataset(X_train=X_train, + y_train=y_train, + X_test=X_test, + y_test=y_test, + resampling_strategy=resampling_strategy, + resampling_strategy_args=resampling_strategy_args) + + configuration = estimator.get_search_space(dataset).get_default_configuration() + pipeline, run_info, run_value, dataset = estimator.fit_pipeline(dataset=dataset, + configuration=configuration, + run_time_limit_secs=7, + ) + + assert 'TIMEOUT' in str(run_value.status) + assert pipeline is None + + +@pytest.mark.parametrize('openml_id', (40981, )) +def test_tabular_classification_test_evaluator(openml_id, backend, n_samples): + + # Get the data and check that contents of data-manager make sense + X, y = sklearn.datasets.fetch_openml( + data_id=int(openml_id), + return_X_y=True, as_frame=True + ) + X, y = X.iloc[:n_samples], y.iloc[:n_samples] + + X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X, y, random_state=42) + + # Search for a good configuration + estimator = TabularClassificationTask( + backend=backend, + resampling_strategy=NoResamplingStrategyTypes.no_resampling, + seed=42, + ensemble_size=0 + ) + + with unittest.mock.patch.object(estimator, '_do_dummy_prediction', new=dummy_do_dummy_prediction): + estimator.search( + X_train=X_train, y_train=y_train, + X_test=X_test, y_test=y_test, + optimize_metric='accuracy', + 
total_walltime_limit=50,
+ func_eval_time_limit_secs=20,
+ enable_traditional_pipeline=False,
+ )
+
+ # Internal dataset has expected settings
+ assert estimator.dataset.task_type == 'tabular_classification'
+
+ assert estimator.resampling_strategy == NoResamplingStrategyTypes.no_resampling
+ assert estimator.dataset.resampling_strategy == NoResamplingStrategyTypes.no_resampling
+ # Check for the created files
+ tmp_dir = estimator._backend.temporary_directory
+ loaded_datamanager = estimator._backend.load_datamanager()
+ assert len(loaded_datamanager.train_tensors) == len(estimator.dataset.train_tensors)
+
+ expected_files = [
+ 'smac3-output/run_42/configspace.json',
+ 'smac3-output/run_42/runhistory.json',
+ 'smac3-output/run_42/scenario.txt',
+ 'smac3-output/run_42/stats.json',
+ 'smac3-output/run_42/train_insts.txt',
+ 'smac3-output/run_42/trajectory.json',
+ '.autoPyTorch/datamanager.pkl',
+ '.autoPyTorch/start_time_42',
+ ]
+ for expected_file in expected_files:
+ assert os.path.exists(os.path.join(tmp_dir, expected_file)), "{}/{}/{}".format(
+ tmp_dir,
+ [data for data in pathlib.Path(tmp_dir).glob('*')],
+ expected_file,
+ )
+
+ # Check that smac was able to find proper models
+ successful_runs = [run_value.status for run_value in estimator.run_history.data.values(
+ ) if 'SUCCESS' in str(run_value.status)]
+ assert len(successful_runs) > 1, [(k, v) for k, v in estimator.run_history.data.items()]
+
+ # Search for an existing run key on disk. An individual model might have
+ # timed out and hence was not written to disk
+ successful_num_run = None
+ SUCCESS = False
+ for i, (run_key, value) in enumerate(estimator.run_history.data.items()):
+ if 'SUCCESS' in str(value.status):
+ run_key_model_run_dir = estimator._backend.get_numrun_directory(
+ estimator.seed, run_key.config_id + 1, run_key.budget)
+ successful_num_run = run_key.config_id + 1
+ if os.path.exists(run_key_model_run_dir):
+ # The run key's config_id differs from num_run;
+ # more specifically, num_run = config_id + 1 (the dummy takes num_run 1)
+ SUCCESS = True
+ break
+
+ assert SUCCESS, f"Successful run was not properly saved for num_run: {successful_num_run}"
+
+ model_file = os.path.join(run_key_model_run_dir,
+ f"{estimator.seed}.{successful_num_run}.{run_key.budget}.model")
+ assert os.path.exists(model_file), model_file
+
+ # Make sure that predictions on the test data were written to disk and make sense
+ test_prediction = os.path.join(run_key_model_run_dir,
+ estimator._backend.get_prediction_filename(
+ 'test', estimator.seed, successful_num_run,
+ run_key.budget))
+ assert os.path.exists(test_prediction), test_prediction
+ assert np.shape(np.load(test_prediction, allow_pickle=True))[0] == np.shape(X_test)[0]
+
+ y_pred = estimator.predict(X_test)
+ assert np.shape(y_pred)[0] == np.shape(X_test)[0]
+
+ # Make sure that predict proba has the expected shape
+ probabilities = estimator.predict_proba(X_test)
+ assert np.shape(probabilities) == (np.shape(X_test)[0], 2)
+
+ score = estimator.score(y_pred, y_test)
+ assert 'accuracy' in score
+
+ # check incumbent config and results
+ incumbent_config, incumbent_results = estimator.get_incumbent_results()
+ assert isinstance(incumbent_config, Configuration)
+ assert isinstance(incumbent_results, dict)
+ assert 'opt_loss' in incumbent_results, "run history: {}, successful_num_run: {}".format(estimator.run_history.data,
+ successful_num_run)
+ assert 'train_loss' in incumbent_results
+
+
+@pytest.mark.parametrize("ans,task_class", (
+ ("continuous", TabularRegressionTask),
+ ("multiclass",
TabularClassificationTask)) +) +def test_task_inference(ans, task_class, backend): + # Get the data and check that contents of data-manager make sense + X = np.random.random((6, 1)) + y = np.array([-10 ** 12, 0, 1, 2, 3, 4], dtype=np.int64) + 10 ** 12 + + estimator = task_class( + backend=backend, + resampling_strategy=HoldoutValTypes.holdout_validation, + resampling_strategy_args=None, + seed=42, + ) + dataset = estimator.get_dataset(X, y) + assert dataset.output_type == ans + + y += 10 ** 12 + 10 # Check if the function catches overflow possibilities + if ans == 'continuous': + with pytest.raises(ValueError): # ValueError due to `Too large value` + estimator.get_dataset(X, y) + else: + estimator.get_dataset(X, y) diff --git a/test/test_api/test_base_api.py b/test/test_api/test_base_api.py index 126b702e6..bb8f9c061 100644 --- a/test/test_api/test_base_api.py +++ b/test/test_api/test_base_api.py @@ -12,6 +12,7 @@ from autoPyTorch.api.base_task import BaseTask, _pipeline_predict from autoPyTorch.constants import TABULAR_CLASSIFICATION, TABULAR_REGRESSION +from autoPyTorch.datasets.resampling_strategy import NoResamplingStrategyTypes from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline @@ -20,6 +21,7 @@ # ==== @pytest.mark.parametrize("fit_dictionary_tabular", ['classification_categorical_only'], indirect=True) def test_nonsupported_arguments(fit_dictionary_tabular): + BaseTask.__abstractmethods__ = set() with pytest.raises(ValueError, match=r".*Expected search space updates to be of instance.*"): api = BaseTask(search_space_updates='None') @@ -82,6 +84,7 @@ def test_pipeline_predict_function(): @pytest.mark.parametrize("fit_dictionary_tabular", ['classification_categorical_only'], indirect=True) def test_show_models(fit_dictionary_tabular): + BaseTask.__abstractmethods__ = set() api = BaseTask() api.ensemble_ = MagicMock() api.models_ = [TabularClassificationPipeline(dataset_properties=fit_dictionary_tabular['dataset_properties'])] @@ -94,6 +97,7 @@ def test_show_models(fit_dictionary_tabular): def test_set_pipeline_config(): # checks if we can correctly change the pipeline options + BaseTask.__abstractmethods__ = set() estimator = BaseTask() pipeline_options = {"device": "cuda", "budget_type": "epochs", @@ -110,6 +114,7 @@ def test_set_pipeline_config(): (3, 50, 'runtime', {'budget_type': 'runtime', 'runtime': 50}), ]) def test_pipeline_get_budget(fit_dictionary_tabular, min_budget, max_budget, budget_type, expected): + BaseTask.__abstractmethods__ = set() estimator = BaseTask(task_type='tabular_classification', ensemble_size=0) # Fixture pipeline config @@ -139,3 +144,60 @@ def test_pipeline_get_budget(fit_dictionary_tabular, min_budget, max_budget, bud assert list(smac_mock.call_args)[1]['ta_kwargs']['pipeline_config'] == default_pipeline_config assert list(smac_mock.call_args)[1]['max_budget'] == max_budget assert list(smac_mock.call_args)[1]['initial_budget'] == min_budget + + +def test_no_resampling_error(backend): + """ + Checks if an error is raised when trying to construct ensemble + using `NoResamplingStrategy`. 
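+ (Ensemble selection needs held-out validation predictions, which
+ `NoResamplingStrategy` by construction never produces.)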
+ """ + BaseTask.__abstractmethods__ = set() + + with pytest.raises(ValueError, match=r"`NoResamplingStrategy` cannot be used for ensemble construction"): + BaseTask( + backend=backend, + resampling_strategy=NoResamplingStrategyTypes.no_resampling, + seed=42, + ensemble_size=1 + ) + + +@pytest.mark.parametrize("fit_dictionary_forecasting", ['uni_variant_wo_missing'], indirect=True) +@pytest.mark.parametrize( + "min_budget,max_budget,budget_type,expected", [ + (5, 75, 'epochs', {'budget_type': 'epochs', 'epochs': 75}), + (0.01, 1.0, 'resolution', {'budget_type': 'resolution', 'resolution': 1.0}), + (0.01, 1.0, 'num_seq', {'budget_type': 'num_seq', 'num_seq': 1.0}), + (0.01, 1.0, 'num_sample_per_seq', {'budget_type': 'num_sample_per_seq', 'num_sample_per_seq': 1.0}), + ]) +def test_pipeline_get_budget_forecasting(fit_dictionary_forecasting, min_budget, max_budget, budget_type, expected): + BaseTask.__abstractmethods__ = set() + estimator = BaseTask(task_type='time_series_forecasting', ensemble_size=0) + # Fixture pipeline config + default_pipeline_config = { + 'device': 'cpu', 'budget_type': 'epochs', 'epochs': 50, 'runtime': 3600, + 'torch_num_threads': 1, 'early_stopping': 20, 'use_tensorboard_logger': False, + 'metrics_during_training': True, 'optimize_metric': 'mean_MASE_forecasting' + } + default_pipeline_config.update(expected) + + # Create pre-requisites + dataset = fit_dictionary_forecasting['backend'].load_datamanager() + pipeline_fit = unittest.mock.Mock() + + smac = unittest.mock.Mock() + smac.solver.runhistory = RunHistory() + smac.solver.intensifier.traj_logger.trajectory = [] + smac.solver.tae_runner = unittest.mock.Mock(spec=SerialRunner) + smac.solver.tae_runner.budget_type = 'epochs' + with unittest.mock.patch('autoPyTorch.optimizer.smbo.get_smac_object') as smac_mock: + smac_mock.return_value = smac + estimator._search(optimize_metric='mean_MASE_forecasting', dataset=dataset, tae_func=pipeline_fit, + min_budget=min_budget, max_budget=max_budget, budget_type=budget_type, + enable_traditional_pipeline=False, + total_walltime_limit=20, func_eval_time_limit_secs=10, + memory_limit=8192, + load_models=False) + assert list(smac_mock.call_args)[1]['ta_kwargs']['pipeline_config'] == default_pipeline_config + assert list(smac_mock.call_args)[1]['max_budget'] == max_budget + assert list(smac_mock.call_args)[1]['initial_budget'] == min_budget diff --git a/test/test_api/test_results_manager.py b/test/test_api/test_results_manager.py deleted file mode 100644 index 4c6e7a7ae..000000000 --- a/test/test_api/test_results_manager.py +++ /dev/null @@ -1,232 +0,0 @@ -import json -import os -from test.test_api.utils import make_dict_run_history_data -from unittest.mock import MagicMock - -import ConfigSpace.hyperparameters as CSH -from ConfigSpace.configuration_space import Configuration, ConfigurationSpace - -import numpy as np - -import pytest - -from smac.runhistory.runhistory import RunHistory, StatusType - -from autoPyTorch.api.base_task import BaseTask -from autoPyTorch.api.results_manager import ResultsManager, STATUS2MSG, SearchResults, cost2metric -from autoPyTorch.metrics import accuracy, balanced_accuracy, log_loss - - -def _check_status(status): - """ Based on runhistory_B.json """ - ans = [ - STATUS2MSG[StatusType.SUCCESS], STATUS2MSG[StatusType.SUCCESS], - STATUS2MSG[StatusType.SUCCESS], STATUS2MSG[StatusType.SUCCESS], - STATUS2MSG[StatusType.SUCCESS], STATUS2MSG[StatusType.SUCCESS], - STATUS2MSG[StatusType.CRASHED], STATUS2MSG[StatusType.SUCCESS], - 
STATUS2MSG[StatusType.SUCCESS], STATUS2MSG[StatusType.SUCCESS], - STATUS2MSG[StatusType.SUCCESS], STATUS2MSG[StatusType.SUCCESS], - STATUS2MSG[StatusType.SUCCESS], STATUS2MSG[StatusType.SUCCESS], - STATUS2MSG[StatusType.TIMEOUT], STATUS2MSG[StatusType.TIMEOUT], - ] - assert isinstance(status, list) - assert isinstance(status[0], str) - assert status == ans - - -def _check_costs(costs): - """ Based on runhistory_B.json """ - ans = [0.15204678362573099, 0.4444444444444444, 0.5555555555555556, 0.29824561403508776, - 0.4444444444444444, 0.4444444444444444, 1.0, 0.5555555555555556, 0.4444444444444444, - 0.15204678362573099, 0.15204678362573099, 0.4035087719298246, 0.4444444444444444, - 0.4444444444444444, 1.0, 1.0] - assert np.allclose(1 - np.array(costs), ans) - assert isinstance(costs, np.ndarray) - assert costs.dtype is np.dtype(np.float) - - -def _check_fit_times(fit_times): - """ Based on runhistory_B.json """ - ans = [3.154788017272949, 3.2763524055480957, 22.723600149154663, 4.990685224533081, 10.684926509857178, - 9.947429180145264, 11.687273979187012, 8.478890419006348, 5.485020637512207, 11.514830589294434, - 15.370736837387085, 23.846530199050903, 6.757539510726929, 15.061991930007935, 50.010520696640015, - 22.011935234069824] - - assert np.allclose(fit_times, ans) - assert isinstance(fit_times, np.ndarray) - assert fit_times.dtype is np.dtype(np.float) - - -def _check_budgets(budgets): - """ Based on runhistory_B.json """ - ans = [5.555555555555555, 5.555555555555555, 5.555555555555555, 5.555555555555555, - 5.555555555555555, 5.555555555555555, 5.555555555555555, 5.555555555555555, - 5.555555555555555, 16.666666666666664, 50.0, 16.666666666666664, 16.666666666666664, - 16.666666666666664, 50.0, 50.0] - assert np.allclose(budgets, ans) - assert isinstance(budgets, list) - assert isinstance(budgets[0], float) - - -def _check_additional_infos(status_types, additional_infos): - for i, status in enumerate(status_types): - info = additional_infos[i] - if status in (STATUS2MSG[StatusType.SUCCESS], STATUS2MSG[StatusType.DONOTADVANCE]): - metric_info = info.get('opt_loss', None) - assert metric_info is not None - elif info is not None: - metric_info = info.get('opt_loss', None) - assert metric_info is None - - -def _check_metric_dict(metric_dict, status_types): - assert isinstance(metric_dict['accuracy'], list) - assert metric_dict['accuracy'][0] > 0 - assert isinstance(metric_dict['balanced_accuracy'], list) - assert metric_dict['balanced_accuracy'][0] > 0 - - for key, vals in metric_dict.items(): - # ^ is a XOR operator - # True and False / False and True must be fulfilled - assert all([(s == STATUS2MSG[StatusType.SUCCESS]) ^ isnan - for s, isnan in zip(status_types, np.isnan(vals))]) - - -def test_extract_results_from_run_history(): - # test the raise error for the `status_msg is None` - run_history = RunHistory() - cs = ConfigurationSpace() - config = Configuration(cs, {}) - run_history.add( - config=config, - cost=0.0, - time=1.0, - status=StatusType.CAPPED, - ) - with pytest.raises(ValueError) as excinfo: - SearchResults(metric=accuracy, scoring_functions=[], run_history=run_history) - - assert excinfo._excinfo[0] == ValueError - - -def test_search_results_sprint_statistics(): - api = BaseTask() - for method in ['get_search_results', 'sprint_statistics', 'get_incumbent_results']: - with pytest.raises(RuntimeError) as excinfo: - getattr(api, method)() - - assert excinfo._excinfo[0] == RuntimeError - - run_history_data = json.load(open(os.path.join(os.path.dirname(__file__), - 
'.tmp_api/runhistory_B.json'), - mode='r'))['data'] - api._results_manager.run_history = MagicMock() - api.run_history.empty = MagicMock(return_value=False) - - # The run_history has 16 runs + 1 run interruption ==> 16 runs - api.run_history.data = make_dict_run_history_data(run_history_data) - api._metric = accuracy - api.dataset_name = 'iris' - api._scoring_functions = [accuracy, balanced_accuracy] - api.search_space = MagicMock(spec=ConfigurationSpace) - search_results = api.get_search_results() - - _check_status(search_results.status_types) - _check_costs(search_results.opt_scores) - _check_fit_times(search_results.fit_times) - _check_budgets(search_results.budgets) - _check_metric_dict(search_results.metric_dict, search_results.status_types) - _check_additional_infos(status_types=search_results.status_types, - additional_infos=search_results.additional_infos) - - # config_ids can duplicate because of various budget size - config_ids = [1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 1, 10, 11, 12, 10, 13] - assert config_ids == search_results.config_ids - - # assert that contents of search_results are of expected types - assert isinstance(search_results.rank_test_scores, np.ndarray) - assert search_results.rank_test_scores.dtype is np.dtype(np.int) - assert isinstance(search_results.configs, list) - - n_success, n_timeout, n_memoryout, n_crashed = 13, 2, 0, 1 - msg = ["autoPyTorch results:", f"\tDataset name: {api.dataset_name}", - f"\tOptimisation Metric: {api._metric.name}", - f"\tBest validation score: {max(search_results.opt_scores)}", - "\tNumber of target algorithm runs: 16", f"\tNumber of successful target algorithm runs: {n_success}", - f"\tNumber of crashed target algorithm runs: {n_crashed}", - f"\tNumber of target algorithms that exceeded the time limit: {n_timeout}", - f"\tNumber of target algorithms that exceeded the memory limit: {n_memoryout}"] - - assert isinstance(api.sprint_statistics(), str) - assert all([m1 == m2 for m1, m2 in zip(api.sprint_statistics().split("\n"), msg)]) - - -@pytest.mark.parametrize('run_history', (None, RunHistory())) -def test_check_run_history(run_history): - manager = ResultsManager() - manager.run_history = run_history - - with pytest.raises(RuntimeError) as excinfo: - manager._check_run_history() - - assert excinfo._excinfo[0] == RuntimeError - - -T, NT = 'traditional', 'non-traditional' -SCORES = [0.1 * (i + 1) for i in range(10)] - - -@pytest.mark.parametrize('include_traditional', (True, False)) -@pytest.mark.parametrize('metric', (accuracy, log_loss)) -@pytest.mark.parametrize('origins', ([T] * 5 + [NT] * 5, [T, NT] * 5, [NT] * 5 + [T] * 5)) -@pytest.mark.parametrize('scores', (SCORES, SCORES[::-1])) -def test_get_incumbent_results(include_traditional, metric, origins, scores): - manager = ResultsManager() - cs = ConfigurationSpace() - cs.add_hyperparameter(CSH.UniformFloatHyperparameter('a', lower=0, upper=1)) - - configs = [0.1 * (i + 1) for i in range(len(scores))] - if metric.name == "log_loss": - # This is to detect mis-computation in reversion - metric._optimum = 0.1 - - best_cost, best_idx = np.inf, -1 - for idx, (a, origin, score) in enumerate(zip(configs, origins, scores)): - config = Configuration(cs, {'a': a}) - - # conversion defined in: - # autoPyTorch/pipeline/components/training/metrics/utils.py::calculate_loss - cost = metric._optimum - metric._sign * score - manager.run_history.add( - config=config, - cost=cost, - time=1.0, - status=StatusType.SUCCESS, - additional_info={'opt_loss': {metric.name: score}, - 'configuration_origin': 
origin} - ) - if cost > best_cost: - continue - - if include_traditional: - best_cost, best_idx = cost, idx - elif origin != T: - best_cost, best_idx = cost, idx - - incumbent_config, incumbent_results = manager.get_incumbent_results( - metric=metric, - include_traditional=include_traditional - ) - - assert isinstance(incumbent_config, Configuration) - assert isinstance(incumbent_results, dict) - best_score, best_a = scores[best_idx], configs[best_idx] - assert np.allclose( - [best_score, best_score, best_a], - [cost2metric(best_cost, metric), - incumbent_results['opt_loss'][metric.name], - incumbent_config['a']] - ) - - if not include_traditional: - assert incumbent_results['configuration_origin'] != T diff --git a/test/test_api/utils.py b/test/test_api/utils.py index a8c258fe9..bbee9a3c4 100644 --- a/test/test_api/utils.py +++ b/test/test_api/utils.py @@ -2,13 +2,15 @@ from smac.runhistory.runhistory import DataOrigin, RunHistory, RunKey, RunValue, StatusType -from autoPyTorch.constants import REGRESSION_TASKS +from autoPyTorch.constants import FORECASTING_TASKS, REGRESSION_TASKS from autoPyTorch.evaluation.abstract_evaluator import ( DummyClassificationPipeline, DummyRegressionPipeline, fit_and_suppress_warnings ) +from autoPyTorch.evaluation.time_series_forecasting_train_evaluator import TimeSeriesForecastingTrainEvaluator from autoPyTorch.evaluation.train_evaluator import TrainEvaluator +from autoPyTorch.evaluation.utils_extra import DummyTimeSeriesForecastingPipeline from autoPyTorch.pipeline.traditional_tabular_classification import TraditionalTabularClassificationPipeline @@ -33,8 +35,9 @@ def _fit_and_predict(self, pipeline, fold: int, train_indices, test_indices, add_pipeline_to_self ): - - if self.task_type in REGRESSION_TASKS: + if self.task_type in FORECASTING_TASKS: + pipeline = DummyTimeSeriesForecastingPipeline(config=1) + elif self.task_type in REGRESSION_TASKS: pipeline = DummyRegressionPipeline(config=1) else: pipeline = DummyClassificationPipeline(config=1) @@ -68,8 +71,18 @@ def _fit_and_predict(self, pipeline, fold: int, train_indices, return Y_train_pred, Y_opt_pred, Y_valid_pred, Y_test_pred +class DummyForecastingEvaluator(TimeSeriesForecastingTrainEvaluator): + def _fit_and_predict(self, pipeline, fold: int, train_indices, + test_indices, + add_pipeline_to_self + ): + return DummyTrainEvaluator._fit_and_predict(self, + pipeline, fold, train_indices, test_indices, + add_pipeline_to_self) + + # create closure for evaluating an algorithm -def dummy_eval_function( +def dummy_eval_train_function( backend, queue, metric, @@ -106,7 +119,54 @@ def dummy_eval_function( logger_port=logger_port, all_supported_metrics=all_supported_metrics, pipeline_config=pipeline_config, - search_space_updates=search_space_updates + search_space_updates=search_space_updates, + ) + evaluator.fit_predict_and_loss() + + +# create closure for evaluating an algorithm +def dummy_forecasting_eval_train_function( + backend, + queue, + metric, + budget: float, + config, + seed: int, + output_y_hat_optimization: bool, + num_run: int, + include, + exclude, + disable_file_output, + pipeline_config=None, + budget_type=None, + init_params=None, + logger_port=None, + all_supported_metrics=True, + search_space_updates=None, + instance: str = None, + max_budget=1.0, + min_num_test_instances=None +) -> None: + evaluator = DummyForecastingEvaluator( + backend=backend, + queue=queue, + metric=metric, + configuration=config, + seed=seed, + num_run=num_run, + output_y_hat_optimization=output_y_hat_optimization, 
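+ # The dummy evaluator above mirrors the signature of the real
+ # TimeSeriesForecastingTrainEvaluator; the forecasting-specific
+ # arguments max_budget and min_num_test_instances are simply
+ # forwarded to it below.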
+ include=include, + exclude=exclude, + disable_file_output=disable_file_output, + init_params=init_params, + budget=budget, + budget_type=budget_type, + logger_port=logger_port, + all_supported_metrics=all_supported_metrics, + pipeline_config=pipeline_config, + search_space_updates=search_space_updates, + max_budget=max_budget, + min_num_test_instances=min_num_test_instances, ) evaluator.fit_predict_and_loss() diff --git a/test/test_data/test_feature_validator.py b/test/test_data/test_feature_validator.py index 7f2ff2507..08da7d7fd 100644 --- a/test/test_data/test_feature_validator.py +++ b/test/test_data/test_feature_validator.py @@ -15,153 +15,6 @@ from autoPyTorch.data.tabular_feature_validator import TabularFeatureValidator -# Fixtures to be used in this class. By default all elements have 100 datapoints -@pytest.fixture -def input_data_featuretest(request): - if request.param == 'numpy_categoricalonly_nonan': - return np.random.randint(10, size=(100, 10)) - elif request.param == 'numpy_numericalonly_nonan': - return np.random.uniform(10, size=(100, 10)) - elif request.param == 'numpy_mixed_nonan': - return np.column_stack([ - np.random.uniform(10, size=(100, 3)), - np.random.randint(10, size=(100, 3)), - np.random.uniform(10, size=(100, 3)), - np.random.randint(10, size=(100, 1)), - ]) - elif request.param == 'numpy_string_nonan': - return np.array([ - ['a', 'b', 'c', 'a', 'b', 'c'], - ['a', 'b', 'd', 'r', 'b', 'c'], - ]) - elif request.param == 'numpy_categoricalonly_nan': - array = np.random.randint(10, size=(100, 10)).astype('float') - array[50, 0:5] = np.nan - return array - elif request.param == 'numpy_numericalonly_nan': - array = np.full(fill_value=10.0, shape=(100, 10), dtype=np.float64) - array[50, 0:5] = np.nan - # Somehow array is changed to dtype object after np.nan - return array.astype('float') - elif request.param == 'numpy_mixed_nan': - array = np.column_stack([ - np.random.uniform(10, size=(100, 3)), - np.random.randint(10, size=(100, 3)), - np.random.uniform(10, size=(100, 3)), - np.random.randint(10, size=(100, 1)), - ]) - array[50, 0:5] = np.nan - return array - elif request.param == 'numpy_string_nan': - return np.array([ - ['a', 'b', 'c', 'a', 'b', 'c'], - [np.nan, 'b', 'd', 'r', 'b', 'c'], - ]) - elif request.param == 'pandas_categoricalonly_nonan': - return pd.DataFrame([ - {'A': 1, 'B': 2}, - {'A': 3, 'B': 4}, - ], dtype='category') - elif request.param == 'pandas_numericalonly_nonan': - return pd.DataFrame([ - {'A': 1, 'B': 2}, - {'A': 3, 'B': 4}, - ], dtype='float') - elif request.param == 'pandas_mixed_nonan': - frame = pd.DataFrame([ - {'A': 1, 'B': 2}, - {'A': 3, 'B': 4}, - ], dtype='category') - frame['B'] = pd.to_numeric(frame['B']) - return frame - elif request.param == 'pandas_categoricalonly_nan': - return pd.DataFrame([ - {'A': 1, 'B': 2, 'C': np.nan}, - {'A': 3, 'C': np.nan}, - ], dtype='category') - elif request.param == 'pandas_numericalonly_nan': - return pd.DataFrame([ - {'A': 1, 'B': 2, 'C': np.nan}, - {'A': 3, 'C': np.nan}, - ], dtype='float') - elif request.param == 'pandas_mixed_nan': - frame = pd.DataFrame([ - {'A': 1, 'B': 2, 'C': 8}, - {'A': 3, 'B': 4}, - ], dtype='category') - frame['B'] = pd.to_numeric(frame['B']) - return frame - elif request.param == 'pandas_string_nonan': - return pd.DataFrame([ - {'A': 1, 'B': 2}, - {'A': 3, 'B': 4}, - ], dtype='string') - elif request.param == 'list_categoricalonly_nonan': - return [ - ['a', 'b', 'c', 'd'], - ['e', 'f', 'c', 'd'], - ] - elif request.param == 'list_numericalonly_nonan': - return [ 
- [1, 2, 3, 4], - [5, 6, 7, 8] - ] - elif request.param == 'list_mixed_nonan': - return [ - ['a', 2, 3, 4], - ['b', 6, 7, 8] - ] - elif request.param == 'list_categoricalonly_nan': - return [ - ['a', 'b', 'c', np.nan], - ['e', 'f', 'c', 'd'], - ] - elif request.param == 'list_numericalonly_nan': - return [ - [1, 2, 3, np.nan], - [5, 6, 7, 8] - ] - elif request.param == 'list_mixed_nan': - return [ - ['a', np.nan, 3, 4], - ['b', 6, 7, 8] - ] - elif 'sparse' in request.param: - # We expect the names to be of the type sparse_csc_nonan - sparse_, type_, nan_ = request.param.split('_') - if 'nonan' in nan_: - data = np.ones(3) - else: - data = np.array([1, 2, np.nan]) - - # Then the type of sparse - row_ind = np.array([0, 1, 2]) - col_ind = np.array([1, 2, 1]) - if 'csc' in type_: - return sparse.csc_matrix((data, (row_ind, col_ind))) - elif 'csr' in type_: - return sparse.csr_matrix((data, (row_ind, col_ind))) - elif 'coo' in type_: - return sparse.coo_matrix((data, (row_ind, col_ind))) - elif 'bsr' in type_: - return sparse.bsr_matrix((data, (row_ind, col_ind))) - elif 'lil' in type_: - return sparse.lil_matrix((data)) - elif 'dok' in type_: - return sparse.dok_matrix(np.vstack((data, data, data))) - elif 'dia' in type_: - return sparse.dia_matrix(np.vstack((data, data, data))) - else: - ValueError("Unsupported indirect fixture {}".format(request.param)) - elif 'openml' in request.param: - _, openml_id = request.param.split('_') - X, y = sklearn.datasets.fetch_openml(data_id=int(openml_id), - return_X_y=True, as_frame=True) - return X - else: - ValueError("Unsupported indirect fixture {}".format(request.param)) - - # Actual checks for the features @pytest.mark.parametrize( 'input_data_featuretest', @@ -406,12 +259,12 @@ def test_column_transformer_created(input_data_featuretest): transformed_columns, feature_types = validator._get_columns_to_encode(input_data_featuretest) # At least one categorical - assert 'categorical' in validator.feat_type + assert 'categorical' in validator.feat_types # Numerical if the original data has numerical only columns if np.any([pd.api.types.is_numeric_dtype(input_data_featuretest[col] ) for col in input_data_featuretest.columns]): - assert 'numerical' in validator.feat_type + assert 'numerical' in validator.feat_types for i, feat_type in enumerate(feature_types): if 'numerical' in feat_type: np.testing.assert_array_equal( @@ -508,10 +361,6 @@ def test_featurevalidator_new_data_after_fit(openml_id, transformed_X = validator.transform(X_test) # Basic Checking - if sparse.issparse(input_data_featuretest): - assert sparse.issparse(transformed_X) - else: - assert isinstance(transformed_X, np.ndarray) assert np.shape(X_test) == np.shape(transformed_X) # And then check proper error messages @@ -557,3 +406,123 @@ def test_comparator(): key=functools.cmp_to_key(validator._comparator) ) assert ans == feat_type + + +@pytest.fixture +def input_data_feature_feat_types(request): + if request.param == 'pandas_categoricalonly': + return pd.DataFrame([ + {'A': 1, 'B': 2}, + {'A': 3, 'B': 4}, + ], dtype='category'), ['categorical', 'categorical'] + elif request.param == 'pandas_numericalonly': + return pd.DataFrame([ + {'A': 1, 'B': 2}, + {'A': 3, 'B': 4}, + ], dtype='float'), ['numerical', 'numerical'] + elif request.param == 'pandas_mixed': + frame = pd.DataFrame([ + {'A': 1, 'B': 2}, + {'A': 3, 'B': 4}, + ], dtype='category') + frame['B'] = pd.to_numeric(frame['B']) + return frame, ['categorical', 'numerical'] + elif request.param == 'pandas_string_error': + frame = 
pd.DataFrame([
+ {'A': 1, 'B': '2'},
+ {'A': 3, 'B': '4'},
+ ], dtype='category')
+ return frame, ['categorical', 'numerical']
+ elif request.param == 'pandas_length_error':
+ frame = pd.DataFrame([
+ {'A': 1, 'B': '2'},
+ {'A': 3, 'B': '4'},
+ ], dtype='category')
+ return frame, ['categorical', 'categorical', 'numerical']
+ elif request.param == 'pandas_feat_type_error':
+ frame = pd.DataFrame([
+ {'A': 1, 'B': '2'},
+ {'A': 3, 'B': '4'},
+ ], dtype='category')
+ return frame, ['not_categorical', 'numerical']
+ else:
+ raise ValueError("Unsupported indirect fixture {}".format(request.param))
+
+
+@pytest.mark.parametrize(
+ 'input_data_feature_feat_types',
+ (
+ 'pandas_categoricalonly',
+ 'pandas_numericalonly',
+ 'pandas_mixed',
+ ),
+ indirect=True
+)
+def test_feature_validator_get_columns_to_encode(input_data_feature_feat_types):
+ X, feat_types = input_data_feature_feat_types
+ validator = TabularFeatureValidator(feat_types=feat_types)
+ transformed_columns, val_feat_types = validator.get_columns_to_encode(X)
+
+ assert feat_types == val_feat_types
+
+ for col, feat_type in zip(X.columns, val_feat_types):
+ if feat_type.lower() == 'categorical':
+ assert col in transformed_columns
+
+
+@pytest.mark.parametrize(
+ 'input_data_feature_feat_types',
+ (
+ 'pandas_string_error',
+ ),
+ indirect=True
+)
+def test_feature_validator_get_columns_to_encode_error_string(input_data_feature_feat_types):
+ """
+ Tests the correct error is raised when feat types passed to
+ the validator disagree with the column dtypes.
+
+ """
+ X, feat_types = input_data_feature_feat_types
+ validator = TabularFeatureValidator(feat_types=feat_types)
+ with pytest.raises(ValueError, match=r"Passed numerical as the feature type for column: B but "
+ r"the column is categorical"):
+ validator.get_columns_to_encode(X)
+
+
+@pytest.mark.parametrize(
+ 'input_data_feature_feat_types',
+ (
+ 'pandas_length_error',
+ ),
+ indirect=True
+)
+def test_feature_validator_get_columns_to_encode_error_length(input_data_feature_feat_types):
+ """
+ Tests the correct error is raised when the length of feat types passed to
+ the validator is not the same as the number of features
+
+ """
+ X, feat_types = input_data_feature_feat_types
+ validator = TabularFeatureValidator(feat_types=feat_types)
+ with pytest.raises(ValueError, match=r"Expected number of `feat_types`: .*"):
+ validator._validate_feat_types(X)
+
+
+@pytest.mark.parametrize(
+ 'input_data_feature_feat_types',
+ (
+ 'pandas_feat_type_error',
+ ),
+ indirect=True
+)
+def test_feature_validator_get_columns_to_encode_error_feat_type(input_data_feature_feat_types):
+ """
+ Tests the correct error is raised when the feat types passed to the
+ validator contain a value other than 'categorical' or 'numerical'
+
+ """
+ X, feat_types = input_data_feature_feat_types
+ validator = TabularFeatureValidator(feat_types=feat_types)
+ with pytest.raises(ValueError, match=r"Expected type of features to be in .*"):
+ validator._validate_feat_types(X)
diff --git a/test/test_data/test_forecasting_feature_validator.py b/test/test_data/test_forecasting_feature_validator.py
new file mode 100644
index 000000000..be3f4d1bb
--- /dev/null
+++ b/test/test_data/test_forecasting_feature_validator.py
@@ -0,0 +1,99 @@
+import numpy as np
+
+import pandas as pd
+
+import pytest
+
+from autoPyTorch.data.time_series_feature_validator import TimeSeriesFeatureValidator
+
+
+# Actual checks for the features
+@pytest.mark.parametrize(
+ 'input_data_forecastingfeaturetest',
+ (
+ 'numpy_nonan',
+ 'numpy_with_static',
+ 'numpy_with_seq_length',
+ 'pandas_wo_seriesid',
+ 'pandas_w_seriesid',
+ 'pandas_only_seriesid',
+ 'pandas_without_seriesid',
+ 'pandas_with_static_features',
+ 'pandas_multi_seq',
+ 'pandas_multi_seq_w_idx',
+ 'pandas_with_static_features_multi_series',
+ ),
+ indirect=True
+)
+def test_forecasting_validator_supported_types(input_data_forecastingfeaturetest):
+ data, series_idx, seq_lengths = input_data_forecastingfeaturetest
+ validator = TimeSeriesFeatureValidator()
+ validator.fit(data, data, series_idx, seq_lengths)
+
+ if series_idx is not None:
+ index = pd.MultiIndex.from_frame(pd.DataFrame(data[series_idx]))
+ elif seq_lengths is not None:
+ index = np.arange(len(seq_lengths)).repeat(seq_lengths)
+ else:
+ index = None
+ if series_idx is not None and np.all(series_idx == data.columns):
+ assert validator.only_contain_series_idx is True
+ return
+
+ transformed_X = validator.transform(data, index)
+ assert isinstance(transformed_X, pd.DataFrame)
+ if series_idx is None and seq_lengths is None:
+ if not (isinstance(data, pd.DataFrame) and len(data.index.unique()) > 1):
+ assert np.all(transformed_X.index == 0)
+ else:
+ if series_idx is not None:
+ assert series_idx not in transformed_X
+ else:
+ if seq_lengths is not None:
+ for i, group in enumerate(transformed_X.groupby(transformed_X.index)):
+ assert len(group[1]) == seq_lengths[i]
+ # static features
+ all_columns = transformed_X.columns
+ all_columns_are_unique = {col: True for col in all_columns}
+ for group in transformed_X.groupby(transformed_X.index):
+ for col in group[1].columns:
+ unique = np.unique(group[1][col])
+ all_columns_are_unique[col] = all_columns_are_unique[col] and len(unique) == 1
+ for key, value in all_columns_are_unique.items():
+ if key in validator.static_features:
+ assert value is True
+ else:
+ assert value is False
+ assert validator._is_fitted
+
+
+def test_forecasting_validator_get_reordered_columns():
+ df = pd.DataFrame([
+ {'category': 'one', 'int': 1, 'float': 1.0, 'bool': True},
+ {'category': 'two', 'int': 2, 'float': 2.0, 'bool': False},
+ ])
+
+ for col in df.columns:
+ df[col] = df[col].astype(col)
+
+ validator = TimeSeriesFeatureValidator()
+ validator.fit(df)
+ reorder_cols = validator.get_reordered_columns()
+ assert reorder_cols == ['category', 'bool', 'int', 'float']
+
+
+def test_forecasting_validator_handle_exception():
+ df = pd.DataFrame([
+ {'A': 1, 'B': 2},
+ {'A': np.NAN, 'B': 3},
+ ])
+ validator = TimeSeriesFeatureValidator()
+ with pytest.raises(ValueError, match=r"All Series ID must be contained in the training column"):
+ validator.fit(df, series_idx=['B', 'C'])
+ with pytest.raises(ValueError, match=r'NaN should not exit in Series ID!'):
+ validator.fit(df, series_idx=['A'])
+ validator2 = TimeSeriesFeatureValidator()
+ validator2.fit(df)
+ with pytest.raises(ValueError, match=r'Given index must have length as the input features!'):
+ validator2.transform(df, index=[0] * 5)
diff --git a/test/test_data/test_forecasting_input_validator.py b/test/test_data/test_forecasting_input_validator.py
new file mode 100644
index 000000000..8148f1f1d
--- /dev/null
+++ b/test/test_data/test_forecasting_input_validator.py
@@ -0,0 +1,155 @@
+import numpy as np
+
+import pandas as pd
+
+import pytest
+
+from autoPyTorch.data.time_series_forecasting_validator import TimeSeriesForecastingInputValidator
+
+
+def test_uni_variant_validator_only_y():
+ validator = TimeSeriesForecastingInputValidator(is_classification=False)
+ y_train = [[0.] * 5, [1.] * 10]
+ y_test = [[0.] * 3, [1.] * 3]
+ validator.fit(X_train=None, y_train=y_train, X_test=None, y_test=y_test)
+ assert validator.start_times == [pd.Timestamp('1900-01-01')] * len(y_train)
+
+ assert validator._is_fitted
+ assert validator._is_uni_variant
+ assert validator.feature_validator.num_features == 0
+ assert len(validator.feature_validator.numerical_columns) == 0
+ assert len(validator.feature_validator.categorical_columns) == 0
+ assert validator.feature_validator._is_fitted is False
+ assert len(validator.feature_shapes) == 0
+ assert len(validator.feature_names) == 0
+
+ x_transformed, y_transformed, sequence_lengths = validator.transform(None, y_train)
+ assert x_transformed is None
+ assert isinstance(y_transformed, pd.DataFrame)
+ assert np.all(sequence_lengths == [5, 10])
+ assert y_transformed.index.tolist() == sum([[i] * l_seq for i, l_seq in enumerate(sequence_lengths)], [])
+
+
+@pytest.mark.parametrize(
+ 'input_data_forecastingfeaturetest',
+ (
+ 'pandas_only_seriesid',
+ ),
+ indirect=True
+)
+def test_uni_variant_validator_with_series_id(input_data_forecastingfeaturetest):
+ data, series_idx, seq_lengths = input_data_forecastingfeaturetest
+ validator = TimeSeriesForecastingInputValidator(is_classification=False)
+ start_times = [pd.Timestamp('2000-01-01')]
+ x = [data]
+ y = [list(range(len(data)))]
+ validator.fit(x, y, start_times=start_times, series_idx=series_idx)
+ assert validator._is_uni_variant is True
+ assert validator.start_times == start_times
+ x_transformed, y_transformed, sequence_lengths = validator.transform(x, y)
+ assert x_transformed is None
+ # even for a uni-variant validator, X may not be None here: the series
+ # identifiers still have to be read from it
+ with pytest.raises(ValueError, match=r"X must be given as series_idx!"):
+ _ = validator.transform(None, y)
+
+
+@pytest.mark.parametrize(
+ 'input_data_forecastingfeaturetest',
+ (
+ 'pandas_w_seriesid',
+ ),
+ indirect=True
+)
+def test_multi_variant_validator_with_series_id(input_data_forecastingfeaturetest):
+ data, series_idx, seq_lengths = input_data_forecastingfeaturetest
+ validator = TimeSeriesForecastingInputValidator(is_classification=False)
+ start_times = [pd.Timestamp('2000-01-01')]
+ x = [data]
+ y = [list(range(len(data)))]
+ validator.fit(x, y, start_times=start_times, series_idx=series_idx)
+ x_transformed, y_transformed, sequence_lengths = validator.transform(x, y)
+ assert series_idx not in x_transformed
+
+
+@pytest.mark.parametrize(
+ 'input_data_forecastingfeaturetest',
+ (
+ 'pandas_wo_seriesid',
+ 'pandas_w_seriesid',
+ 'pandas_only_seriesid',
+ 'pandas_without_seriesid',
+ 'pandas_with_static_features',
+ 'pandas_multi_seq',
+ 'pandas_multi_seq_w_idx',
+ 'pandas_with_static_features_multi_series',
+ ),
+ indirect=True
+)
+def test_transform_pds(input_data_forecastingfeaturetest):
+ data, series_idx, _ = input_data_forecastingfeaturetest
+ validator = TimeSeriesForecastingInputValidator(is_classification=False)
+ # start_times = [pd.Timestamp('2000-01-01')]
+ start_times = None
+ x = data
+ y = pd.DataFrame(range(len(data)))
+ validator.fit(x, y, start_times=start_times, series_idx=series_idx)
+
+ x_transformed, y_transformed, sequence_lengths = validator.transform(x, y)
+ assert np.all(sequence_lengths == y_transformed.index.value_counts(sort=False).values)
+
+ if x_transformed is not None:
+ assert series_idx not in x_transformed
+ assert np.all(sequence_lengths == x_transformed.index.value_counts(sort=False).values)
+ if series_idx is not None:
+ for seq_len, group in zip(sequence_lengths, data.groupby(series_idx)):
+ assert seq_len == len(group[1])
+
+
+def test_forecasting_validator():
+ df = pd.DataFrame([
+ {'category': 'one', 'int': 1, 'float': 1.0, 'bool': True},
+ {'category': 'two', 'int': 2, 'float': 2.0, 'bool': False},
+ ])
+
+ for col in df.columns:
+ df[col] = df[col].astype(col)
+
+ x = [df, df]
+ y = [[1., 2.], [1., 2.]]
+
+ validator = TimeSeriesForecastingInputValidator()
+ validator.fit(x, y, start_times=[pd.Timestamp('1900-01-01')] * 2)
+ feature_names = ['category', 'bool', 'int', 'float']
+ assert validator._is_uni_variant is False
+ assert validator.feature_names == feature_names
+
+ for fea_name in feature_names:
+ assert fea_name in validator.feature_shapes
+ assert validator.feature_shapes[fea_name] == 1
+
+ x_transformed, y_transformed, sequence_lengths = validator.transform(x, y)
+ assert isinstance(x_transformed, pd.DataFrame)
+ assert isinstance(y_transformed, pd.DataFrame)
+ assert np.all(x_transformed.index == y_transformed.index)
+ assert len(x_transformed) == sum(sequence_lengths)
+
+ # y is only allowed to be None if validate_for_future_features is True
+ _ = validator.transform(x, None, validate_for_future_features=True)
+ with pytest.raises(ValueError, match=r"Targets must be given!"):
+ validator.transform(x)
+ with pytest.raises(ValueError, match=r"Multi Variant dataset requires X as input!"):
+ validator.transform(None, y)
+
+
+def test_forecasting_handle_exception():
+ validator = TimeSeriesForecastingInputValidator()
+ # X and y have different numbers of sequences
+ X = [np.ones(3), np.ones(3)]
+ y = [[1], ]
+ with pytest.raises(ValueError, match="Inconsistent number of sequences for features and targets"):
+ validator.fit(X, y)
+
+ y = [[1], [1]]
+ # test data must contain the same number of sequences as train data,
+ # since they are attached to the tails of the training series
+ with pytest.raises(ValueError, match="Inconsistent number of test datapoints for features and targets"):
+ validator.fit(X, y, X_test=X, y_test=y)
diff --git a/test/test_data/test_forecasting_target_validator.py b/test/test_data/test_forecasting_target_validator.py
new file mode 100644
index 000000000..0d4288cbc
--- /dev/null
+++ b/test/test_data/test_forecasting_target_validator.py
@@ -0,0 +1,60 @@
+import numpy as np
+
+import pandas as pd
+
+import pytest
+
+from scipy import sparse
+
+from autoPyTorch.data.time_series_target_validator import TimeSeriesTargetValidator
+
+
+def test_forecasting_target_transform():
+ validator = TimeSeriesTargetValidator(is_classification=False)
+ series_length = 10
+ y = np.ones(series_length)
+ validator.fit(y)
+ y_transformed_0 = validator.transform(y)
+ assert isinstance(y_transformed_0, pd.DataFrame)
+ assert np.all(y_transformed_0.index.values == np.zeros(series_length, dtype=np.int64))
+
+ index_1 = np.full(series_length, 1)
+ y_transformed_1 = validator.transform(y, index_1)
+ assert np.all(y_transformed_1.index.values == index_1)
+
+ index_2 = pd.Index([f"a{i}" for i in range(series_length)])
+ y_transformed_2 = validator.transform(y, index_2)
+ assert np.all(y_transformed_2.index.values == index_2)
+
+ index_3 = [('a', 'a')] * (series_length // 3) + \
+ [('a', 'b')] * (series_length // 3) + \
+ [('b', 'a')] * (series_length - series_length // 3 * 2)
+ index_3 = pd.MultiIndex.from_tuples(index_3)
+ y_transformed_3 = validator.transform(y, index_3)
+ assert isinstance(y_transformed_3.index, pd.MultiIndex)
+ assert np.all(y_transformed_3.index == index_3)
+
+
+def test_forecasting_target_handle_exception():
+ validator = TimeSeriesTargetValidator(is_classification=False)
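+ # A sparse target cannot be NaN-filled, hence fitting the forecasting
+ # target validator on one is expected to fail outright.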
+ target_sparse = sparse.csr_matrix(np.array([1, 1, 1]))
+ with pytest.raises(NotImplementedError, match=r"Sparse Target is unsupported for forecasting task!"):
+ # sparse matrix is unsupported for nan filling
+ validator.fit(target_sparse)
+
+ series_length = 10
+ y = np.ones(series_length)
+ validator.fit(y)
+ with pytest.raises(ValueError, match=r"Index must have length as the input targets!"):
+ validator.transform(y, np.asarray([1, 2, 3]))
+
+
+def test_forecasting_target_missing_values():
+ """
+ Makes sure that missing target values are tolerated and preserved
+ through transform instead of being rejected
+ """
+ validator1 = TimeSeriesTargetValidator(is_classification=False)
+ target_1 = np.array([np.nan, 1, 2])
+ validator1.fit(target_1)
+ assert validator1.transform(target_1).isnull().values.sum() == 1
diff --git a/test/test_data/test_target_validator.py b/test/test_data/test_target_validator.py
index aadc73416..3866bfb79 100644
--- a/test/test_data/test_target_validator.py
+++ b/test/test_data/test_target_validator.py
@@ -126,7 +126,7 @@ def input_data_targettest(request):
 'sparse_csc_nonan',
 'sparse_csr_nonan',
 'sparse_lil_nonan',
- 'openml_204',
+ 'openml_204', # openml cholesterol dataset
 ),
 indirect=True
 )
@@ -150,17 +150,17 @@ def test_targetvalidator_supported_types_noclassification(input_data_targettest)
 assert validator.encoder is None
 
 if hasattr(input_data_targettest, "iloc"):
- np.testing.assert_array_equal(
+ assert np.allclose(
 np.ravel(input_data_targettest.to_numpy()),
 np.ravel(transformed_y)
 )
 elif sparse.issparse(input_data_targettest):
- np.testing.assert_array_equal(
+ assert np.allclose(
 np.ravel(input_data_targettest.todense()),
 np.ravel(transformed_y.todense())
 )
 else:
- np.testing.assert_array_equal(
+ assert np.allclose(
 np.ravel(np.array(input_data_targettest)),
 np.ravel(transformed_y)
 )
@@ -182,7 +182,7 @@ def test_targetvalidator_supported_types_noclassification(input_data_targettest)
 'sparse_csc_nonan',
 'sparse_csr_nonan',
 'sparse_lil_nonan',
- 'openml_2',
+ 'openml_2', # anneal dataset
 ),
 indirect=True
 )
@@ -246,7 +246,7 @@ def test_targetvalidator_supported_types_classification(input_data_targettest):
 'pandas_binary',
 'numpy_binary',
 'list_binary',
- 'openml_1066',
+ 'openml_1066', # kc1-binary dataset
 ),
 indirect=True
 )
@@ -266,7 +266,7 @@ def test_targetvalidator_binary(input_data_targettest):
 'pandas_multiclass',
 'numpy_multiclass',
 'list_multiclass',
- 'openml_54',
+ 'openml_54', # vehicle dataset
 ),
 indirect=True
 )
@@ -285,7 +285,7 @@ def test_targetvalidator_multiclass(input_data_targettest):
 'pandas_multilabel',
 'numpy_multilabel',
 'list_multilabel',
- 'openml_40594',
+ 'openml_40594', # reuters dataset
 ),
 indirect=True
 )
@@ -305,7 +305,7 @@ def test_targetvalidator_multilabel(input_data_targettest):
 'pandas_continuous',
 'numpy_continuous',
 'list_continuous',
- 'openml_531',
+ 'openml_531', # boston dataset
 ),
 indirect=True
 )
@@ -324,7 +324,7 @@ def test_targetvalidator_continuous(input_data_targettest):
 'pandas_continuous-multioutput',
 'numpy_continuous-multioutput',
 'list_continuous-multioutput',
- 'openml_41483',
+ 'openml_41483', # rf1 dataset
 ),
 indirect=True
 )
diff --git a/test/test_data/test_utils.py b/test/test_data/test_utils.py
new file mode 100644
index 000000000..4269c4e5f
--- /dev/null
+++ b/test/test_data/test_utils.py
@@ -0,0 +1,218 @@
+import warnings
+from test.test_data.utils import convert, dtype, size
+from typing import Mapping
+
+import numpy as np
+
+import pandas as pd
+
+import pytest
+
+from scipy.sparse import csr_matrix
+
+from
sklearn.datasets import fetch_openml + +from autoPyTorch.constants import ( + BINARY, + CLASSIFICATION_TASKS, + CONTINUOUS, + CONTINUOUSMULTIOUTPUT, + MULTICLASS, + MULTICLASSMULTIOUTPUT, + TABULAR_CLASSIFICATION, + TABULAR_REGRESSION +) +from autoPyTorch.data.utils import ( + default_dataset_compression_arg, + get_dataset_compression_mapping, + megabytes, + reduce_dataset_size_if_too_large, + reduce_precision, + subsample, + validate_dataset_compression_arg +) +from autoPyTorch.utils.common import subsampler + + +@pytest.mark.parametrize('openmlid', [2, 40984]) +@pytest.mark.parametrize('as_frame', [True, False]) +def test_reduce_dataset_if_too_large(openmlid, as_frame, n_samples): + X, y = fetch_openml(data_id=openmlid, return_X_y=True, as_frame=as_frame) + X = subsampler(data=X, x=range(n_samples)) + y = subsampler(data=y, x=range(n_samples)) + + X_converted, y_converted = reduce_dataset_size_if_too_large( + X.copy(), + y=y.copy(), + is_classification=True, + random_state=1, + memory_allocation=0.001) + + assert X_converted.shape[0] < X.shape[0] + assert y_converted.shape[0] < y.shape[0] + + assert megabytes(X_converted) < megabytes(X) + + +@pytest.mark.parametrize("X", [np.asarray([[1, 1, 1]] * 30)]) +@pytest.mark.parametrize("x_type", [list, np.ndarray, csr_matrix, pd.DataFrame]) +@pytest.mark.parametrize( + "y, task, output", + [ + (np.asarray([0] * 15 + [1] * 15), TABULAR_CLASSIFICATION, BINARY), + (np.asarray([0] * 10 + [1] * 10 + [2] * 10), TABULAR_CLASSIFICATION, MULTICLASS), + (np.asarray([[1, 0, 1]] * 30), TABULAR_CLASSIFICATION, MULTICLASSMULTIOUTPUT), + (np.asarray([1.0] * 30), TABULAR_REGRESSION, CONTINUOUS), + (np.asarray([[1.0, 1.0, 1.0]] * 30), TABULAR_REGRESSION, CONTINUOUSMULTIOUTPUT), + ], +) +@pytest.mark.parametrize("y_type", [list, np.ndarray, pd.DataFrame, pd.Series]) +@pytest.mark.parametrize("random_state", [0]) +@pytest.mark.parametrize("sample_size", [0.25, 0.5, 5, 10]) +def test_subsample_validity(X, x_type, y, y_type, random_state, sample_size, task, output): + """Asserts the validity of the function with all valid types + We want to make sure that `subsample` works correctly with all the types listed + as x_type and y_type. + We also want to make sure it works with all kinds of target types. + The output should maintain the types, and subsample the correct amount. 
+ (test adapted from autosklearn)
+ """
+ assert len(X) == len(y) # Make sure our test data is correct
+
+ if y_type == pd.Series and output in [
+ MULTICLASSMULTIOUTPUT,
+ CONTINUOUSMULTIOUTPUT,
+ ]:
+ # We can't have a pd.Series with multiple values as it's 1 dimensional
+ pytest.skip("Can't have pd.Series as y when task is n-dimensional")
+
+ # Convert our data to its given x_type or y_type
+ X = convert(X, x_type)
+ y = convert(y, y_type)
+
+ # Subsample the data, ignoring any warnings
+ with warnings.catch_warnings():
+ warnings.simplefilter("ignore")
+ X_sampled, y_sampled = subsample(
+ X,
+ y=y,
+ random_state=random_state,
+ sample_size=sample_size,
+ is_classification=task in CLASSIFICATION_TASKS,
+ )
+
+ # Check that the types of X remain the same after subsampling
+ if isinstance(X, pd.DataFrame):
+ # Dataframe can have multiple types, one per column
+ assert list(dtype(X_sampled)) == list(dtype(X))
+ else:
+ assert dtype(X_sampled) == dtype(X)
+
+ # Check that the types of y remain the same after subsampling
+ if isinstance(y, pd.DataFrame):
+ assert list(dtype(y_sampled)) == list(dtype(y))
+ else:
+ assert dtype(y_sampled) == dtype(y)
+
+ # check the right amount of samples were taken
+ if sample_size < 1:
+ assert size(X_sampled) == int(sample_size * size(X))
+ else:
+ assert size(X_sampled) == sample_size
+
+
+def test_validate_dataset_compression_arg():
+
+ data_compression_args = validate_dataset_compression_arg({}, 10)
+ # check whether the function uses default args
+ # to fill in case args is empty
+ assert data_compression_args is not None
+
+ # assert memory allocation is a float after validation
+ assert isinstance(data_compression_args['memory_allocation'], float)
+
+ # check whether the function raises an error
+ # in case an unknown key is in args
+ with pytest.raises(ValueError, match=r'Unknown key in dataset_compression, .*'):
+ validate_dataset_compression_arg({'not_there': 1}, 1)
+
+ # check whether the function raises an error
+ # in case memory_allocation is neither an int nor a float
+ with pytest.raises(ValueError, match=r"key 'memory_allocation' must be an `int` or `float`.*"):
+ validate_dataset_compression_arg({'memory_allocation': 'not int'}, 1)
+
+ # check whether the function raises an error
+ # in case memory_allocation is an int greater than memory limit
+ with pytest.raises(ValueError, match=r"key 'memory_allocation' if int must be in.*"):
+ validate_dataset_compression_arg({'memory_allocation': 1}, 0)
+
+ # check whether the function raises an error
+ # in case memory_allocation is a float greater than 1
+ with pytest.raises(ValueError, match=r"key 'memory_allocation' if float must be in.*"):
+ validate_dataset_compression_arg({'memory_allocation': 1.5}, 0)
+
+ # check whether the function raises an error
+ # in case an unknown method is passed in args
+ with pytest.raises(ValueError, match=r"key 'methods' can only contain .*"):
+ validate_dataset_compression_arg({'methods': 'unknown'}, 1)
+
+ # check whether the function raises an error
+ # in case dataset_compression itself has an unsupported type
+ with pytest.raises(ValueError, match=r'Unknown type for `dataset_compression` .*'):
+ validate_dataset_compression_arg(1, 1)
+
+
+def test_error_raised_reduce_precision():
+ # check whether the function raises an error
+ # in case X is not an expected type
+ with pytest.raises(ValueError, match=r'Unrecognised data type of X, expected data type to .*'):
+ reduce_precision(X='not expected')
+
+
+def _verify_dataset_compression_mapping(mapping, expected_mapping):
+ assert isinstance(mapping, Mapping)
+ assert 'methods' in mapping
+ assert 'memory_allocation' in mapping
+ assert mapping == expected_mapping
+
+
+@pytest.mark.parametrize('memory_limit', [2048])
+def test_get_dataset_compression_mapping(memory_limit):
+ """
+ Tests the functionalities of `get_dataset_compression_mapping`
+ """
+ dataset_compression_mapping = get_dataset_compression_mapping(
+ dataset_compression=True,
+ memory_limit=memory_limit)
+ # validation converts the memory allocation from float to integer based on the memory limit
+ expected_mapping = validate_dataset_compression_arg(default_dataset_compression_arg, memory_limit)
+ _verify_dataset_compression_mapping(dataset_compression_mapping, expected_mapping)
+
+ mapping = {'memory_allocation': 0.01, 'methods': ['precision']}
+ dataset_compression_mapping = get_dataset_compression_mapping(
+ dataset_compression=mapping,
+ memory_limit=memory_limit
+ )
+ expected_mapping = validate_dataset_compression_arg(mapping, memory_limit)
+ _verify_dataset_compression_mapping(dataset_compression_mapping, expected_mapping)
+
+ dataset_compression_mapping = get_dataset_compression_mapping(
+ dataset_compression=False,
+ memory_limit=memory_limit
+ )
+ assert dataset_compression_mapping is None
+
+
+def test_unsupported_errors():
+ """
+ Checks if errors are raised when unsupported data is passed to reduce
+ """
+ X = np.array([
+ ['a', 'b', 'c', 'a', 'b', 'c'],
+ ['a', 'b', 'd', 'r', 'b', 'c']])
+ with pytest.raises(ValueError, match=r'X.dtype = .*'):
+ reduce_dataset_size_if_too_large(X, is_classification=True, random_state=1, memory_allocation=0)
+
+ X = [[1, 2], [2, 3]]
+ with pytest.raises(ValueError, match=r'Unrecognised data type of X, expected data type to be in .*'):
+ reduce_dataset_size_if_too_large(X, is_classification=True, random_state=1, memory_allocation=0)
diff --git a/test/test_data/test_validation.py b/test/test_data/test_validation.py
index 482c99769..ba60a1760 100644
--- a/test/test_data/test_validation.py
+++ b/test/test_data/test_validation.py
@@ -10,6 +10,7 @@
 import sklearn.model_selection
 
 from autoPyTorch.data.tabular_validator import TabularInputValidator
+from autoPyTorch.data.utils import megabytes
 
 
 @pytest.mark.parametrize('openmlid', [2, 40975, 40984])
@@ -48,8 +49,8 @@ def test_data_validation_for_classification(openmlid, as_frame):
 
 # Categorical columns are sorted to the beginning
 if as_frame:
- validator.feature_validator.feat_type is not None
- ordered_unique_elements = list(dict.fromkeys(validator.feature_validator.feat_type))
+ assert validator.feature_validator.feat_types is not None
+ ordered_unique_elements = list(dict.fromkeys(validator.feature_validator.feat_types))
 if len(ordered_unique_elements) > 1:
 assert ordered_unique_elements[0] == 'categorical'
 
@@ -90,8 +91,8 @@ def test_data_validation_for_regression(openmlid, as_frame):
 
 # Categorical columns are sorted to the beginning
 if as_frame:
- validator.feature_validator.feat_type is not None
- ordered_unique_elements = list(dict.fromkeys(validator.feature_validator.feat_type))
+ assert validator.feature_validator.feat_types is not None
+ ordered_unique_elements = list(dict.fromkeys(validator.feature_validator.feat_types))
 if len(ordered_unique_elements) > 1:
 assert ordered_unique_elements[0] == 'categorical'
 
@@ -137,3 +138,50 @@ def test_validation_unsupported():
 X=np.array([[0, 1, 0], [0, 1, 1]]),
 y=np.array([0, 1]),
 )
+
+
+@pytest.mark.parametrize(
+ 'input_data_featuretest',
+ (
+ 'numpy_numericalonly_nonan',
+ 'numpy_numericalonly_nan',
+ 'numpy_mixed_nan',
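+ # the names above and below follow the `<container>_<content>_<nan|nonan>`
+ # (and `sparse_<format>_<nan|nonan>`) convention of the shared
+ # input_data_featuretest fixture
+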
'pandas_numericalonly_nan', + 'sparse_bsr_nonan', + 'sparse_bsr_nan', + 'sparse_coo_nonan', + 'sparse_coo_nan', + 'sparse_csc_nonan', + 'sparse_csc_nan', + 'sparse_csr_nonan', + 'sparse_csr_nan', + 'sparse_dia_nonan', + 'sparse_dia_nan', + 'sparse_dok_nonan', + 'sparse_dok_nan', + 'openml_40981', # Australian + ), + indirect=True +) +def test_featurevalidator_dataset_compression(input_data_featuretest): + n_samples = input_data_featuretest.shape[0] + input_data_targets = np.random.random_sample((n_samples)) + X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + input_data_featuretest, input_data_targets, test_size=0.1, random_state=1) + validator = TabularInputValidator( + dataset_compression={'memory_allocation': 0.8 * megabytes(X_train), 'methods': ['precision', 'subsample']} + ) + validator.fit(X_train=X_train, y_train=y_train) + transformed_X_train, _ = validator.transform(X_train.copy(), y_train.copy()) + + assert validator._reduced_dtype is not None + assert megabytes(transformed_X_train) < megabytes(X_train) + + transformed_X_test, _ = validator.transform(X_test.copy(), y_test.copy()) + assert megabytes(transformed_X_test) < megabytes(X_test) + if hasattr(transformed_X_train, 'iloc'): + assert all(transformed_X_train.dtypes == transformed_X_test.dtypes) + assert all(transformed_X_train.dtypes == validator._precision) + else: + assert transformed_X_train.dtype == transformed_X_test.dtype + assert transformed_X_test.dtype == validator._reduced_dtype diff --git a/test/test_data/utils.py b/test/test_data/utils.py new file mode 100644 index 000000000..f1fff440a --- /dev/null +++ b/test/test_data/utils.py @@ -0,0 +1,34 @@ +from typing import List + +import numpy as np + +import pandas as pd + +from scipy.sparse import spmatrix + + +def convert(arr, objtype): + if objtype == np.ndarray: + return arr + elif objtype == list: + return arr.tolist() + else: + return objtype(arr) + + +# Function to get the type of an obj +def dtype(obj): + if isinstance(obj, List): + return type(obj[0][0]) if isinstance(obj[0], List) else type(obj[0]) + elif isinstance(obj, pd.DataFrame): + return obj.dtypes + else: + return obj.dtype + + +# Function to get the size of an object +def size(obj): + if isinstance(obj, spmatrix): # spmatrix doesn't support __len__ + return obj.shape[0] if obj.shape[0] > 1 else obj.shape[1] + else: + return len(obj) diff --git a/test/test_datasets/test_base_dataset.py b/test/test_datasets/test_base_dataset.py new file mode 100644 index 000000000..52b2fa9a5 --- /dev/null +++ b/test/test_datasets/test_base_dataset.py @@ -0,0 +1,19 @@ +import numpy as np + +import pytest + +from autoPyTorch.datasets.base_dataset import _get_output_properties + + +@pytest.mark.parametrize( + "target_labels,dim,task_type", ( + (np.arange(5), 5, "multiclass"), + (np.linspace(0, 1, 3), 1, "continuous"), + (np.linspace(0, 1, 3)[:, np.newaxis], 1, "continuous") + ) +) +def test_get_output_properties(target_labels, dim, task_type): + train_tensors = np.array([np.empty_like(target_labels), target_labels]) + output_dim, output_type = _get_output_properties(train_tensors) + assert output_dim == dim + assert output_type == task_type diff --git a/test/test_datasets/test_resampling_strategies.py b/test/test_datasets/test_resampling_strategies.py index 7f14275a3..c37467433 100644 --- a/test/test_datasets/test_resampling_strategies.py +++ b/test/test_datasets/test_resampling_strategies.py @@ -20,6 +20,42 @@ def test_holdoutfuncs(): assert 0 in y[val] assert 0 in y[train] + # Forecasting + 
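# Worked example, assuming `X` covers the indices 0..9 as asserted further
+ # below: with n_prediction_steps=3 and n_repeats=1 the split becomes
+ # train = [0, ..., 6] and val = [9], i.e. the single validation point sits
+ # exactly n_prediction_steps after the last training index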
n_prediction_steps = 3
+ n_repeats = 1
+ train, val = split.time_series_hold_out_validation(0, 0, X, n_prediction_steps=n_prediction_steps,
+ n_repeats=n_repeats)
+ # val must start n_prediction_steps after train
+ assert val[0] - train[-1] == n_prediction_steps
+ assert len(val) == n_repeats
+
+ n_prediction_steps = 2
+ n_repeats = 2
+ train, val = split.time_series_hold_out_validation(0, 0, X, n_prediction_steps=n_prediction_steps,
+ n_repeats=n_repeats)
+ assert val[0] - train[-1] == n_prediction_steps
+ assert len(val) == n_repeats
+ # No overlap between the validation points of different repeats
+ assert val[1] - val[0] == n_prediction_steps
+
+ # Failure case
+ # Forecasting
+ n_prediction_steps = 10
+ n_repeats = 1
+ train, val = split.time_series_hold_out_validation(0, 0, X, n_prediction_steps=n_prediction_steps,
+ n_repeats=n_repeats)
+ # n_prediction_steps is larger than the length of the sequence
+ assert len(train) == 0
+ assert val == 9
+
+ # TODO Theoretically, this should work properly; we need to write our own splitter
+ n_prediction_steps = 2
+ n_repeats = 3
+ train, val = split.time_series_hold_out_validation(0, 0, X, n_prediction_steps=n_prediction_steps,
+ n_repeats=n_repeats)
+ assert len(train) == 0
+ assert val == 9
+
 def test_crossvalfuncs():
 split = CrossValFuncs()
@@ -40,3 +76,49 @@ def test_crossvalfuncs():
 splits = split.stratified_k_fold_cross_validation(0, 10, X, stratify=y)
 assert len(splits) == 10
 assert all([0 in y[s[1]] for s in splits])
+
+ def eval_ts_cv(num_splits, n_prediction_steps, n_repeats):
+ splits = split.time_series_cross_validation(0, num_splits, X,
+ n_prediction_steps=n_prediction_steps, n_repeats=n_repeats)
+ assert len(splits) == num_splits
+ for i, sp in enumerate(splits):
+ assert len(sp[1]) == n_repeats
+ assert sp[1][0] - sp[0][-1] == n_prediction_steps
+ if i > 0:
+ assert sp[1][0] - splits[i - 1][1][-1] == n_prediction_steps
+
+ eval_ts_cv(2, 10, 1)
+ eval_ts_cv(3, 10, 3)
+
+ def eval_ts_sea_cv(num_splits, n_prediction_steps, n_repeats, freq_value):
+ seasonality_h_value = int(np.round((n_prediction_steps // int(freq_value) + 1) * freq_value))
+ splits = split.time_series_ts_cross_validation(0, num_splits=num_splits,
+ indices=X,
+ n_prediction_steps=n_prediction_steps,
+ n_repeats=n_repeats,
+ seasonality_h_value=seasonality_h_value)
+ assert len(splits) == num_splits
+ assert splits[0][1][-1] == len(X) - 1
+ if num_splits > 1:
+ for i in range(1, num_splits):
+ dis_val_start_to_test = len(X) - 1 - (splits[i][1] - n_prediction_steps)
+ assert np.all(dis_val_start_to_test % freq_value == 0)
+
+ eval_ts_sea_cv(2, 10, 2, 6)
+ eval_ts_sea_cv(2, 10, 1, 12)
+ eval_ts_sea_cv(3, 10, 1, 6)
+
+ n_prediction_steps = 10
+ freq_value = 24
+ n_repeats = 1
+ num_splits = 2
+ seasonality_h_value = int(np.round((n_prediction_steps // int(freq_value) + 1) * freq_value))
+
+ sp2 = split.time_series_ts_cross_validation(0, num_splits=num_splits,
+ indices=X[:10],
+ n_prediction_steps=n_prediction_steps,
+ n_repeats=n_repeats,
+ seasonality_h_value=seasonality_h_value)
+ # We cannot do a split, thus the two splits are the same
+
+ assert np.all(sp2[1][1] == sp2[0][1])
diff --git a/test/test_datasets/test_tabular_dataset.py b/test/test_datasets/test_tabular_dataset.py
index 409e6bdec..2ee8b608e 100644
--- a/test/test_datasets/test_tabular_dataset.py
+++ b/test/test_datasets/test_tabular_dataset.py
@@ -2,6 +2,9 @@
 import pytest
+from autoPyTorch.data.tabular_validator import TabularInputValidator
+from autoPyTorch.datasets.base_dataset import TransformSubset
+from autoPyTorch.datasets.resampling_strategy import CrossValTypes, HoldoutValTypes, NoResamplingStrategyTypes
 from autoPyTorch.datasets.tabular_dataset import TabularDataset
 from autoPyTorch.utils.pipeline import get_dataset_requirements
@@ -46,3 +49,34 @@ def test_get_dataset_properties(backend, fit_dictionary_tabular):
 def test_not_supported():
 with pytest.raises(ValueError, match=r".*A feature validator is required to build.*"):
 TabularDataset(np.ones(10), np.ones(10))
+
+
+@pytest.mark.parametrize('resampling_strategy',
+ (HoldoutValTypes.holdout_validation,
+ CrossValTypes.k_fold_cross_validation,
+ NoResamplingStrategyTypes.no_resampling
+ ))
+def test_get_dataset(resampling_strategy, n_samples):
+ """
+ Checks the functionality of the `get_dataset` method of TabularDataset, and that it
+ raises an error when the validation subset is requested under a no-resampling strategy
+ """
+ X = np.zeros(shape=(n_samples, 4))
+ Y = np.ones(n_samples)
+ validator = TabularInputValidator(is_classification=True)
+ validator.fit(X, Y)
+ dataset = TabularDataset(
+ resampling_strategy=resampling_strategy,
+ X=X,
+ Y=Y,
+ validator=validator
+ )
+ transform_subset = dataset.get_dataset(split_id=0, train=True)
+ assert isinstance(transform_subset, TransformSubset)
+
+ if isinstance(resampling_strategy, NoResamplingStrategyTypes):
+ with pytest.raises(ValueError):
+ dataset.get_dataset(split_id=0, train=False)
+ else:
+ transform_subset = dataset.get_dataset(split_id=0, train=False)
+ assert isinstance(transform_subset, TransformSubset)
diff --git a/test/test_datasets/test_time_series_datasets.py b/test/test_datasets/test_time_series_datasets.py
new file mode 100644
index 000000000..fa8faa625
--- /dev/null
+++ b/test/test_datasets/test_time_series_datasets.py
@@ -0,0 +1,476 @@
+import unittest
+from typing import Callable, List, Tuple
+
+from gluonts.time_feature import Constant as ConstantTransform
+from gluonts.time_feature import DayOfMonth
+
+import numpy as np
+
+import pandas as pd
+
+import pytest
+
+import torch
+
+
+from autoPyTorch.datasets.resampling_strategy import CrossValTypes, HoldoutValTypes
+from autoPyTorch.datasets.time_series_dataset import (
+ TimeSeriesForecastingDataset,
+ TimeSeriesSequence,
+ extract_feature_index
+)
+from autoPyTorch.utils.pipeline import get_dataset_requirements
+
+
+class ZeroTransformer:
+ def __call__(self, x: np.ndarray):
+ return np.zeros_like(x)
+
+
+class TestTimeSeriesSequence(unittest.TestCase):
+ def setUp(self) -> None:
+ rng = np.random.RandomState(1)
+ self.data_length = 10
+ self.n_prediction_steps = 3
+
+ n_features = 5
+
+ self.x_data = rng.rand(self.data_length, n_features)
+ self.y = rng.rand(self.data_length, 1)
+
+ self.x_test_data = rng.rand(self.n_prediction_steps, 5)
+ self.y_test = rng.rand(self.n_prediction_steps, 1)
+ self.time_feature_transform = [DayOfMonth(), ConstantTransform(10.0)]
+ self.known_future_features_index = [0, 2]
+ self.seq_uni = TimeSeriesSequence(X=None, Y=self.y,
+ n_prediction_steps=self.n_prediction_steps,
+ time_feature_transform=self.time_feature_transform)
+ self.seq_multi = TimeSeriesSequence(X=self.x_data,
+ Y=self.y,
+ X_test=self.x_test_data,
+ Y_test=self.y_test, n_prediction_steps=self.n_prediction_steps,
+ time_feature_transform=self.time_feature_transform,
+ freq="1M")
+ self.seq_multi_with_future = TimeSeriesSequence(X=self.x_data,
+ Y=self.y,
+ X_test=self.x_test_data,
+ Y_test=self.y_test, n_prediction_steps=self.n_prediction_steps,
+ time_feature_transform=self.time_feature_transform,
+ known_future_features_index=self.known_future_features_index,
+ freq="1M")
+
+ def test_sequence_uni_variant_base(self):
+ self.assertEqual(len(self.seq_uni), self.data_length - self.n_prediction_steps)
+ idx = 6
+ data, target = self.seq_uni[idx]
+ self.assertTrue(isinstance(data['past_targets'], torch.Tensor))
+ self.assertEqual(len(data['past_targets']), idx + 1)
+ self.assertEqual(data['decoder_lengths'], self.n_prediction_steps)
+ self.assertEqual(self.seq_uni.start_time, pd.Timestamp('1900-01-01'))
+ mase_coefficient_1 = data['mase_coefficient']
+ self.assertEqual(mase_coefficient_1.size, 1)
+ # all data is observed
+ self.assertTrue(data['past_observed_targets'].all())
+
+ self.assertTrue(np.allclose(data['past_targets'].numpy(),
+ self.y[:idx + 1]))
+ self.assertTrue(np.allclose(target['future_targets'].numpy(),
+ self.y[idx + 1:1 + idx + self.n_prediction_steps]))
+
+ self.assertTrue(target['future_observed_targets'].all())
+
+ self.assertEqual(self.seq_uni[-2][0]["past_targets"].size, self.data_length - self.n_prediction_steps - 2 + 1)
+
+ def test_uni_get_val_seq_and_test_targets(self):
+ val_seq = self.seq_uni.get_val_seq_set(-1)
+ self.assertEqual(len(val_seq), len(self.seq_uni))
+
+ self.seq_uni.cache_time_features()
+ val_seq = self.seq_uni.get_val_seq_set(5)
+ self.assertEqual(len(val_seq), 5 + 1)
+ self.assertEqual(len(val_seq._cached_time_features), 5 + 1 + self.n_prediction_steps)
+
+ test_targets = self.seq_uni.get_test_target(-1)
+ self.assertTrue(np.all(self.y[-self.n_prediction_steps:] == test_targets))
+
+ test_targets = self.seq_uni.get_test_target(5)
+ self.assertTrue(np.all(self.y[5 + 1: 5 + 1 + self.n_prediction_steps] == test_targets))
+
+ def test_multi_get_val_seq(self):
+ val_seq = self.seq_multi_with_future.get_val_seq_set(-1)
+ self.assertEqual(len(val_seq), len(self.seq_multi_with_future))
+
+ val_seq = self.seq_multi_with_future.get_val_seq_set(3)
+ self.assertTrue(np.array_equal(val_seq.X, self.seq_multi_with_future.X[:4]))
+ self.assertTrue(np.array_equal(val_seq.X_test, self.seq_multi_with_future.X[4:7]))
+
+ val_seq = self.seq_multi_with_future.get_val_seq_set(len(self.seq_multi_with_future) - 1)
+ self.assertEqual(len(val_seq), len(self.seq_multi_with_future))
+
+ val_seq = self.seq_multi_with_future.get_val_seq_set(len(self.seq_multi_with_future) - 2)
+
+ self.assertTrue(np.array_equal(val_seq.X, self.seq_multi_with_future.X[:6]))
+ self.assertTrue(np.array_equal(val_seq.X_test, self.seq_multi_with_future.X[6:9]))
+
+ def test_uni_get_update_time_features(self):
+ self.seq_uni.update_attribute(transform_time_features=True)
+
+ data, target = self.seq_uni[3]
+ past_features = data["past_features"]
+ future_features = data["future_features"]
+
+ self.assertEqual(len(self.seq_uni._cached_time_features), len(self.y))
+ self.assertTrue(list(past_features.shape) == [3 + 1, len(self.time_feature_transform)])
+ self.assertTrue(list(future_features.shape) == [self.n_prediction_steps, len(self.time_feature_transform)])
+ self.assertTrue(torch.all(past_features[:, 1] == 10.))
+ self.assertTrue(torch.all(future_features[:, 1] == 10.))
+
+ def test_uni_to_test_set(self):
+ self.seq_uni.transform_time_features = True
+ self.seq_uni.cache_time_features()
+ # For a test set, its length should equal y's length
+ self.seq_uni.is_test_set = True
+ self.assertEqual(len(self.seq_uni), len(self.y))
+
+ data, target = self.seq_uni[-1]
+ self.assertTrue(target is None)
+ self.assertEqual(len(data["past_targets"]), len(self.y))
+ self.assertEqual(len(data["past_features"]),
len(self.y)) + self.assertEqual(len(self.seq_uni._cached_time_features), len(self.y) + self.n_prediction_steps) + + def test_observed_values(self): + y_with_nan = self.seq_uni.Y.copy() + y_with_nan[[3, -2]] = np.nan + seq_1 = TimeSeriesSequence(X=None, Y=y_with_nan, n_prediction_steps=self.n_prediction_steps) + data, target = seq_1[-1] + self.assertFalse(data["past_observed_targets"][3]) + self.assertTrue(target["future_observed_targets"][2]) + + def test_compute_mase_coefficient(self): + seq_2 = TimeSeriesSequence(X=None, Y=self.y, n_prediction_steps=self.n_prediction_steps, is_test_set=True) + self.assertNotEqual(self.seq_uni.mase_coefficient, seq_2.mase_coefficient) + + def test_sequence_multi_variant_base(self): + data, _ = self.seq_multi[-1] + self.assertEqual(list(data["past_features"].shape), [len(self.seq_multi), self.x_data.shape[-1]]) + self.assertTrue(data['future_features'] is None) + + data, _ = self.seq_multi[-1] + + def test_multi_known_future_variant(self): + data, _ = self.seq_multi_with_future[-1] + num_future_var = len(self.known_future_features_index) + future_features = data['future_features'] + self.assertEqual(list(future_features.shape), [self.n_prediction_steps, num_future_var]) + self.assertTrue(np.allclose( + future_features.numpy(), + self.x_data[-self.n_prediction_steps:, self.known_future_features_index]) + ) + + def test_multi_transform_features(self): + self.seq_multi_with_future.transform_time_features = True + num_future_var = len(self.known_future_features_index) + + data, _ = self.seq_multi_with_future[-1] + past_features = data["past_features"] + self.assertEqual(list(past_features.shape), + [len(self.seq_multi_with_future), self.x_data.shape[-1] + len(self.time_feature_transform)]) + + self.assertTrue(np.allclose( + past_features[:, -len(self.time_feature_transform):].numpy(), + self.seq_multi_with_future._cached_time_features[:-self.n_prediction_steps] + )) + + future_features = data["future_features"] + self.assertEqual(list(future_features.shape), + [self.n_prediction_steps, num_future_var + len(self.time_feature_transform)]) + + self.assertTrue(np.allclose( + future_features[:, -len(self.time_feature_transform):].numpy(), + self.seq_multi_with_future._cached_time_features[-self.n_prediction_steps:] + )) + + def test_multi_to_test_set(self): + self.seq_multi_with_future.is_test_set = True + self.assertEqual(len(self.seq_multi_with_future.X), len(self.x_data)) + data, _ = self.seq_multi_with_future[-1] + + self.assertTrue(np.allclose(data["past_features"].numpy(), self.x_data)) + self.assertTrue( + np.allclose(data["future_features"].numpy(), self.x_test_data[:, self.known_future_features_index]) + ) + + self.seq_multi_with_future.is_test_set = False + self.assertEqual(len(self.seq_multi_with_future.X), len(self.x_data)) + + seq_2 = self.seq_multi_with_future.get_val_seq_set(6) + self.assertEqual(len(seq_2), 6 + 1) + + def test_get_target_values(self): + last_visible_target = self.seq_uni.get_target_values(-1) + self.assertEqual(last_visible_target, self.seq_uni[-1][0]['past_targets'][-1].numpy()) + + self.seq_uni.is_test_set = True + last_visible_target = self.seq_uni.get_target_values(-1) + self.assertEqual(last_visible_target, self.seq_uni[-1][0]['past_targets'][-1].numpy()) + + def test_transformation(self): + self.seq_multi.update_transform(ZeroTransformer(), train=True) + data, _ = self.seq_multi[-1] + self.assertTrue(torch.all(data['past_features'][:, :-len(self.time_feature_transform)] == 0.)) + + 
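# Train- and inference-time transforms are kept separately: update_transform(train=False)
+ # below swaps only the validation/test transform, and the second argument of __getitem__
+ # appears to select which of the two is applied.
+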
self.seq_multi.update_transform(ZeroTransformer(), train=False)
+ data, _ = self.seq_multi.__getitem__(-1, False)
+ self.assertTrue(torch.all(data['past_features'][:, :-len(self.time_feature_transform)] == 0.))
+
+ def test_exception(self):
+ seq_1 = TimeSeriesSequence(X=self.x_data, Y=self.y, X_test=None,
+ known_future_features_index=self.known_future_features_index,
+ is_test_set=False)
+
+ with self.assertRaises(ValueError):
+ seq_1.is_test_set = True
+
+ seq_2 = TimeSeriesSequence(X=self.x_data, Y=self.y, X_test=self.x_test_data,
+ is_test_set=True)
+
+ with self.assertRaises(ValueError):
+ seq_2.get_val_seq_set(5)
+
+ with self.assertRaises(ValueError):
+ seq_2.get_test_target(5)
+
+
+@pytest.mark.parametrize("fit_dictionary_forecasting", ['uni_variant_wo_missing',
+ 'uni_variant_w_missing',
+ 'multi_variant_wo_missing',
+ 'multi_variant_w_missing'], indirect=True)
+def test_dataset_properties(backend, fit_dictionary_forecasting):
+ # The fixture creates a datamanager by itself
+ datamanager: TimeSeriesForecastingDataset = backend.load_datamanager()
+ info = {'task_type': datamanager.task_type,
+ 'numerical_features': datamanager.numerical_features,
+ 'categorical_features': datamanager.categorical_features,
+ 'output_type': datamanager.output_type,
+ 'numerical_columns': datamanager.numerical_columns,
+ 'categorical_columns': datamanager.categorical_columns,
+ 'target_columns': (1,),
+ 'issparse': False}
+
+ dataset_properties = datamanager.get_dataset_properties(get_dataset_requirements(info))
+ assert dataset_properties['n_prediction_steps'] == datamanager.n_prediction_steps
+ assert dataset_properties['sp'] == datamanager.seasonality
+ assert dataset_properties['freq'] == datamanager.freq
+ assert isinstance(dataset_properties['input_shape'], Tuple)
+ assert isinstance(dataset_properties['time_feature_transform'], List)
+ for item in dataset_properties['time_feature_transform']:
+ assert isinstance(item, Callable)
+ assert dataset_properties['uni_variant'] == (fit_dictionary_forecasting['X_train'] is None)
+ assert dataset_properties['targets_have_missing_values'] == \
+ fit_dictionary_forecasting['y_train'].isnull().values.any()
+ if fit_dictionary_forecasting['X_train'] is not None:
+ assert dataset_properties['features_have_missing_values'] == \
+ fit_dictionary_forecasting['X_train'].isnull().values.any()
+
+
+def test_freq_values():
+ freq = '1H'
+ n_prediction_steps = 12
+
+ seasonality, freq, freq_value = TimeSeriesForecastingDataset.compute_freq_values(freq, n_prediction_steps)
+ assert seasonality == 24
+ assert freq == '1H'
+ assert freq_value == 24
+
+ n_prediction_steps = 36
+ seasonality, freq, freq_value = TimeSeriesForecastingDataset.compute_freq_values(freq, n_prediction_steps)
+ assert seasonality == 24
+ assert freq_value == 168
+
+ freq = [2, 3, 4]
+ n_prediction_steps = 10
+ seasonality, freq, freq_value = TimeSeriesForecastingDataset.compute_freq_values(freq, n_prediction_steps)
+ assert seasonality == 2
+ assert freq == '1Y'
+ assert freq_value == 4
+
+
+def test_target_normalization():
+ Y = [[1, 2], [3, 4, 5]]
+ dataset = TimeSeriesForecastingDataset(None, Y, normalize_y=True)
+
+ assert np.allclose(dataset.y_mean.values, np.vstack([np.mean(y) for y in Y]))
+ assert np.allclose(dataset.y_std.values, np.vstack([np.std(y, ddof=1) for y in Y]))
+ assert np.allclose(dataset.train_tensors[1].values.flatten(),
+ np.hstack([(y - np.mean(y)) / np.std(y, ddof=1) for y in Y]))
+
+
+@pytest.mark.parametrize("fit_dictionary_forecasting", ['uni_variant_wo_missing'], indirect=True)
+def test_dataset_index(backend, fit_dictionary_forecasting):
+ datamanager: TimeSeriesForecastingDataset = backend.load_datamanager()
+ assert np.allclose(datamanager[5][0]['past_targets'][-1].numpy(), 5.0)
+ assert np.allclose(datamanager[50][0]['past_targets'][-1].numpy(), 1003.0)
+ assert np.allclose(datamanager[150][0]['past_targets'][-1].numpy(), 2046.0)
+ assert np.allclose(datamanager[-1][0]['past_targets'][-1].numpy(), 9136.0)
+
+ assert datamanager.get_time_series_seq(50) == datamanager.datasets[1]
+
+ # test for validation indices
+ val_indices = datamanager.splits[0][1]
+ val_set = [datamanager.get_validation_set(val_idx) for val_idx in val_indices]
+
+ val_targets = np.concatenate([val_seq[-1][1]['future_targets'].numpy() for val_seq in val_set])
+ assert np.allclose(val_targets, datamanager.get_test_target(val_indices))
+
+
+@pytest.mark.parametrize("fit_dictionary_forecasting", ['multi_variant_wo_missing'], indirect=True)
+def test_update_dataset(backend, fit_dictionary_forecasting):
+ datamanager: TimeSeriesForecastingDataset = backend.load_datamanager()
+ X = datamanager.train_tensors[0]
+ for col in X.columns:
+ X[col] = X.index
+ datamanager.replace_data(X, None)
+ for i, data in enumerate(datamanager.datasets):
+ assert np.allclose(data.X, np.ones_like(data.X) * i)
+
+ datamanager.update_transform(ZeroTransformer(), train=True)
+ assert np.allclose(datamanager[0][0]['past_features'].numpy(), np.zeros(len(X.columns)))
+ assert datamanager.transform_time_features is False
+
+ datamanager.transform_time_features = True
+ for dataset in datamanager.datasets:
+ assert dataset.transform_time_features is True
+ seq_lengths = datamanager.sequence_lengths_train
+ new_test_seq = datamanager.generate_test_seqs()
+ for seq_len, test_seq in zip(seq_lengths, new_test_seq):
+ # seq_len is len(y) - n_prediction_steps; here we expand X_test by another n_prediction_steps
+ assert test_seq.X.shape[0] - seq_len == datamanager.n_prediction_steps
+
+
+@pytest.mark.parametrize("fit_dictionary_forecasting", ['multi_variant_wo_missing'], indirect=True)
+def test_test_tensors(backend, fit_dictionary_forecasting):
+ datamanager: TimeSeriesForecastingDataset = backend.load_datamanager()
+ test_tensors = datamanager.test_tensors
+ forecast_horizon = datamanager.n_prediction_steps
+ n_seq = len(datamanager.datasets)
+ assert test_tensors[0].shape == (n_seq * forecast_horizon, datamanager.num_features)
+ assert test_tensors[1].shape == (n_seq * forecast_horizon, datamanager.num_targets)
+
+ datamanager2 = TimeSeriesForecastingDataset(X=None, Y=[[1, 2]])
+ assert datamanager2.test_tensors is None
+
+
+def test_splits():
+ y = [np.arange(100 + i * 10) for i in range(10)]
+ resampling_strategy_args = {'num_splits': 5}
+ dataset = TimeSeriesForecastingDataset(None, y,
+ resampling_strategy=CrossValTypes.time_series_ts_cross_validation,
+ resampling_strategy_args=resampling_strategy_args,
+ n_prediction_steps=10,
+ freq='1M')
+ assert len(dataset.splits) == 5
+ assert dataset.splits[0][1][0] == (100 - 10 - 1)
+ for split in dataset.splits:
+ # We need to ensure that the training indices only break where the validation sets start, i.e.,
+ # at the tail of each sequence
+ assert len(np.unique(split[0] - np.arange(len(split[0])))) == len(y)
+ assert np.all(split[1][1:] - split[1][:-1] == [100 + i * 10 for i in range(9)])
+ assert len(split[1]) == len(y)
+
+ y = [np.arange(100) for _ in range(10)]
+ resampling_strategy_args = {'num_splits': 5,
+ 'n_repeats': 2}
+ dataset = TimeSeriesForecastingDataset(None, y,
+ resampling_strategy=CrossValTypes.time_series_ts_cross_validation,
+ resampling_strategy_args=resampling_strategy_args,
+ n_prediction_steps=10,
+ freq='1M')
+ assert len(dataset.splits) == 5
+ for split in dataset.splits:
+ assert len(split[1]) == len(y) * 1
+
+ y = [np.arange(40) for _ in range(10)]
+ resampling_strategy_args = {'num_splits': 5}
+ dataset = TimeSeriesForecastingDataset(None, y,
+ resampling_strategy=CrossValTypes.time_series_ts_cross_validation,
+ resampling_strategy_args=resampling_strategy_args,
+ n_prediction_steps=10,
+ freq='1M')
+ # the length of each sequence does not support 5 splits
+ assert len(dataset.splits) == 3
+
+ # datasets with few but long sequences
+ y = [np.arange(4000) for _ in range(2)]
+ dataset = TimeSeriesForecastingDataset(None, y,
+ resampling_strategy=CrossValTypes.time_series_ts_cross_validation,
+ n_prediction_steps=10,
+ freq='1M')
+ # the length of each sequence does not support 5 splits
+ assert len(dataset.splits) == 2
+ for split in dataset.splits:
+ assert len(split[1]) == len(y) * 50
+
+ resampling_strategy = CrossValTypes.time_series_cross_validation
+
+ y = [np.arange(40) for _ in range(10)]
+ resampling_strategy_args = {'num_splits': 5,
+ 'n_repeats': 5}
+
+ resampling_strategy, resampling_strategy_args = TimeSeriesForecastingDataset.get_split_strategy(
+ [60] * 10,
+ 10,
+ 25,
+ CrossValTypes.time_series_ts_cross_validation,
+ resampling_strategy_args=resampling_strategy_args,
+ )
+ assert resampling_strategy_args['num_splits'] == 3
+ assert resampling_strategy_args['n_repeats'] == 1
+
+ resampling_strategy, resampling_strategy_args = TimeSeriesForecastingDataset.get_split_strategy(
+ [15] * 10,
+ 10,
+ 25,
+ CrossValTypes.time_series_cross_validation,
+ )
+ assert resampling_strategy == HoldoutValTypes.time_series_hold_out_validation
+
+ resampling_strategy_args = {'num_splits': 5,
+ 'n_repeats': 5}
+ resampling_strategy, resampling_strategy_args = TimeSeriesForecastingDataset.get_split_strategy(
+ [60] * 10,
+ 10,
+ 25,
+ CrossValTypes.time_series_cross_validation,
+ resampling_strategy_args=resampling_strategy_args,
+ )
+ assert resampling_strategy_args['num_splits'] == 4
+ assert resampling_strategy_args['n_repeats'] == 1
+
+ y = [np.arange(60) for _ in range(10)]
+ dataset = TimeSeriesForecastingDataset(None, y,
+ resampling_strategy=CrossValTypes.time_series_cross_validation,
+ resampling_strategy_args=resampling_strategy_args,
+ n_prediction_steps=10,
+ freq='1M')
+ assert len(dataset.splits) == 4
+
+ refit_set = dataset.create_refit_set()
+ assert len(refit_set.splits[0][0]) == len(refit_set)
+
+
+def test_extract_time_features():
+ feature_shapes = {'b': 5, 'a': 3, 'c': 7, 'd': 12}
+ feature_names = ['a', 'b', 'c', 'd']
+ queried_features = ('b', 'd')
+ feature_index = extract_feature_index(feature_shapes, feature_names, queried_features)
+ feature_index2 = []
+ idx_tracker = 0
+ for fea_name in feature_names:
+ feature_s = feature_shapes[fea_name]
+ if fea_name in queried_features:
+ feature_index2.append(list(range(idx_tracker, idx_tracker + feature_s)))
+ idx_tracker += feature_s
+
+ assert feature_index == tuple(sum(feature_index2, []))
+ # the result should not depend on the order of queried_features
+ assert feature_index == extract_feature_index(feature_shapes, feature_names, ('d', 'b'))
diff --git a/test/test_evaluation/evaluation_util.py b/test/test_evaluation/evaluation_util.py
index 088726963..a40b0e112 100644
--- a/test/test_evaluation/evaluation_util.py
+++
b/test/test_evaluation/evaluation_util.py @@ -12,8 +12,10 @@ from sklearn import preprocessing from autoPyTorch.data.tabular_validator import TabularInputValidator +from autoPyTorch.data.time_series_forecasting_validator import TimeSeriesForecastingInputValidator from autoPyTorch.datasets.resampling_strategy import HoldoutValTypes from autoPyTorch.datasets.tabular_dataset import TabularDataset +from autoPyTorch.datasets.time_series_dataset import TimeSeriesForecastingDataset from autoPyTorch.pipeline.components.training.metrics.metrics import ( accuracy, balanced_accuracy, @@ -241,9 +243,37 @@ def get_500_classes_datamanager(resampling_strategy=HoldoutValTypes.holdout_vali return dataset +def get_forecasting_dataset(n_seq=10, + n_prediction_steps=3, + resampling_strategy=HoldoutValTypes.time_series_hold_out_validation): + base_length = 50 + X = [] + targets = [] + X_test = [] + Y_test = [] + + for i in range(n_seq): + series_length = base_length + i * 10 + + targets.append(np.arange(i * 1000, series_length + i * 1000)) + X.append(targets[-1] - 1) + X_test.append(np.arange(X[-1][-1] + 1, X[-1][-1] + 1 + n_prediction_steps)) + Y_test.append(np.arange(targets[-1][-1] + 1, targets[-1][-1] + 1 + n_prediction_steps)) + + input_validator = TimeSeriesForecastingInputValidator(is_classification=False).fit(X, targets) + return TimeSeriesForecastingDataset(X=X, Y=targets, X_test=X_test, + Y_test=Y_test, + known_future_features=(0,), + validator=input_validator, + resampling_strategy=resampling_strategy, + n_prediction_steps=n_prediction_steps + ) + + def get_dataset_getters(): return [get_binary_classification_datamanager, get_multiclass_classification_datamanager, get_500_classes_datamanager, get_abalone_datamanager, - get_regression_datamanager] + get_regression_datamanager, + get_forecasting_dataset] diff --git a/test/test_evaluation/test_abstract_evaluator.py b/test/test_evaluation/test_abstract_evaluator.py index 6cec57fb4..a0be2c3f3 100644 --- a/test/test_evaluation/test_abstract_evaluator.py +++ b/test/test_evaluation/test_abstract_evaluator.py @@ -13,6 +13,7 @@ from autoPyTorch.automl_common.common.utils.backend import Backend, BackendContext from autoPyTorch.evaluation.abstract_evaluator import AbstractEvaluator +from autoPyTorch.evaluation.utils import DisableFileOutputParameters from autoPyTorch.pipeline.components.training.metrics.metrics import accuracy this_directory = os.path.dirname(__file__) @@ -129,7 +130,7 @@ def test_disable_file_output(self): ae = AbstractEvaluator( backend=self.backend_mock, queue=queue_mock, - disable_file_output=True, + disable_file_output=[DisableFileOutputParameters.all], metric=accuracy, logger_port=unittest.mock.Mock(), budget=0, @@ -314,3 +315,35 @@ def test_error_unsupported_budget_type(self): self.assertIsInstance(e, ValueError) shutil.rmtree(self.working_directory, ignore_errors=True) + + def test_error_unsupported_disable_file_output_parameters(self): + shutil.rmtree(self.working_directory, ignore_errors=True) + os.mkdir(self.working_directory) + + queue_mock = unittest.mock.Mock() + + context = BackendContext( + prefix='autoPyTorch', + temporary_directory=os.path.join(self.working_directory, 'tmp'), + output_directory=os.path.join(self.working_directory, 'out'), + delete_tmp_folder_after_terminate=True, + delete_output_folder_after_terminate=True, + ) + with unittest.mock.patch.object(Backend, 'load_datamanager') as load_datamanager_mock: + load_datamanager_mock.return_value = get_multiclass_classification_datamanager() + + backend = Backend(context, 
prefix='autoPyTorch')
+
+ try:
+ AbstractEvaluator(
+ backend=backend,
+ output_y_hat_optimization=False,
+ queue=queue_mock,
+ metric=accuracy,
+ budget=0,
+ configuration=1,
+ disable_file_output=['model'])
+ except Exception as e:
+ self.assertIsInstance(e, ValueError)
+
+ shutil.rmtree(self.working_directory, ignore_errors=True)
diff --git a/test/test_evaluation/test_evaluation.py b/test/test_evaluation/test_evaluation.py
index 222755b6e..8e12d2f71 100644
--- a/test/test_evaluation/test_evaluation.py
+++ b/test/test_evaluation/test_evaluation.py
@@ -22,7 +22,7 @@
 this_directory = os.path.dirname(__file__)
 sys.path.append(this_directory)
-from evaluation_util import get_multiclass_classification_datamanager # noqa E402
+from evaluation_util import get_forecasting_dataset, get_multiclass_classification_datamanager # noqa E402
 def safe_eval_success_mock(*args, **kwargs):
@@ -44,6 +44,18 @@ def load_datamanager(self):
 return get_multiclass_classification_datamanager()
+class BackendMockForecasting(object):
+ def __init__(self):
+ self.temporary_directory = './.tmp_evaluation'
+ try:
+ os.mkdir(self.temporary_directory)
+ except: # noqa: E722
+ pass
+
+ def load_datamanager(self):
+ return get_forecasting_dataset()
+
+
 class EvaluationTest(unittest.TestCase):
 def setUp(self):
 self.datamanager = get_multiclass_classification_datamanager()
@@ -92,13 +104,14 @@ def run_over_time():
 ############################################################################
 # Test ExecuteTaFuncWithQueue.run_wrapper()
- @unittest.mock.patch('autoPyTorch.evaluation.train_evaluator.eval_function')
+ @unittest.mock.patch('autoPyTorch.evaluation.tae.eval_train_function')
 def test_eval_with_limits_holdout(self, pynisher_mock):
 pynisher_mock.side_effect = safe_eval_success_mock
 config = unittest.mock.Mock()
 config.config_id = 198
 ta = ExecuteTaFuncWithQueue(backend=BackendMock(), seed=1,
 stats=self.stats,
+ multi_objectives=["cost"],
 memory_limit=3072,
 metric=accuracy,
 cost_for_crash=get_cost_of_crash(accuracy),
@@ -106,7 +119,7 @@ def test_eval_with_limits_holdout(self, pynisher_mock):
 logger_port=self.logger_port,
 pynisher_context='fork',
 )
- info = ta.run_wrapper(RunInfo(config=config, cutoff=30, instance=None,
+ info = ta.run_wrapper(RunInfo(config=config, cutoff=2000000, instance=None,
 instance_specific=None, seed=1, capped=False))
 self.assertEqual(info[0].config.config_id, 198)
 self.assertEqual(info[1].status, StatusType.SUCCESS, info)
@@ -120,6 +133,7 @@ def test_cutoff_lower_than_remaining_time(self, pynisher_mock):
 ta = ExecuteTaFuncWithQueue(backend=BackendMock(), seed=1,
 stats=self.stats,
 memory_limit=3072,
+ multi_objectives=["cost"],
 metric=accuracy,
 cost_for_crash=get_cost_of_crash(accuracy),
 abort_on_first_run_crash=False,
@@ -146,6 +160,7 @@ def test_eval_with_limits_holdout_fail_timeout(self, pynisher_mock):
 ta = ExecuteTaFuncWithQueue(backend=BackendMock(), seed=1,
 stats=self.stats,
 memory_limit=3072,
+ multi_objectives=["cost"],
 metric=accuracy,
 cost_for_crash=get_cost_of_crash(accuracy),
 abort_on_first_run_crash=False,
@@ -166,6 +181,7 @@ def test_zero_or_negative_cutoff(self, pynisher_mock):
 ta = ExecuteTaFuncWithQueue(backend=BackendMock(), seed=1,
 stats=self.stats,
 memory_limit=3072,
+ multi_objectives=["cost"],
 metric=accuracy,
 cost_for_crash=get_cost_of_crash(accuracy),
 abort_on_first_run_crash=False,
@@ -178,7 +194,7 @@ def test_zero_or_negative_cutoff(self, pynisher_mock):
 instance_specific=None, seed=1, capped=False))
 self.assertEqual(run_value.status, StatusType.STOP)
-
@unittest.mock.patch('autoPyTorch.evaluation.train_evaluator.eval_function') + @unittest.mock.patch('autoPyTorch.evaluation.tae.eval_train_function') def test_eval_with_limits_holdout_fail_silent(self, pynisher_mock): pynisher_mock.return_value = None config = unittest.mock.Mock() @@ -187,6 +203,7 @@ def test_eval_with_limits_holdout_fail_silent(self, pynisher_mock): ta = ExecuteTaFuncWithQueue(backend=BackendMock(), seed=1, stats=self.stats, memory_limit=3072, + multi_objectives=["cost"], metric=accuracy, cost_for_crash=get_cost_of_crash(accuracy), abort_on_first_run_crash=False, @@ -220,7 +237,7 @@ def test_eval_with_limits_holdout_fail_silent(self, pynisher_mock): 'subprocess_stdout': '', 'subprocess_stderr': ''}) - @unittest.mock.patch('autoPyTorch.evaluation.train_evaluator.eval_function') + @unittest.mock.patch('autoPyTorch.evaluation.tae.eval_train_function') def test_eval_with_limits_holdout_fail_memory_error(self, pynisher_mock): pynisher_mock.side_effect = MemoryError config = unittest.mock.Mock() @@ -228,6 +245,7 @@ def test_eval_with_limits_holdout_fail_memory_error(self, pynisher_mock): ta = ExecuteTaFuncWithQueue(backend=BackendMock(), seed=1, stats=self.stats, memory_limit=3072, + multi_objectives=["cost"], metric=accuracy, cost_for_crash=get_cost_of_crash(accuracy), abort_on_first_run_crash=False, @@ -266,6 +284,7 @@ def side_effect(**kwargs): ta = ExecuteTaFuncWithQueue(backend=BackendMock(), seed=1, stats=self.stats, memory_limit=3072, + multi_objectives=["cost"], metric=accuracy, cost_for_crash=get_cost_of_crash(accuracy), abort_on_first_run_crash=False, @@ -289,6 +308,7 @@ def side_effect(**kwargs): ta = ExecuteTaFuncWithQueue(backend=BackendMock(), seed=1, stats=self.stats, memory_limit=3072, + multi_objectives=["cost"], metric=accuracy, cost_for_crash=get_cost_of_crash(accuracy), abort_on_first_run_crash=False, @@ -302,7 +322,7 @@ def side_effect(**kwargs): self.assertIsInstance(info[1].time, float) self.assertNotIn('exitcode', info[1].additional_info) - @unittest.mock.patch('autoPyTorch.evaluation.train_evaluator.eval_function') + @unittest.mock.patch('autoPyTorch.evaluation.tae.eval_train_function') def test_eval_with_limits_holdout_2(self, eval_houldout_mock): config = unittest.mock.Mock() config.config_id = 198 @@ -316,6 +336,7 @@ def side_effect(*args, **kwargs): ta = ExecuteTaFuncWithQueue(backend=BackendMock(), seed=1, stats=self.stats, memory_limit=3072, + multi_objectives=["cost"], metric=accuracy, cost_for_crash=get_cost_of_crash(accuracy), abort_on_first_run_crash=False, @@ -331,7 +352,7 @@ def side_effect(*args, **kwargs): self.assertIn('configuration_origin', info[1].additional_info) self.assertEqual(info[1].additional_info['message'], "{'subsample': 30}") - @unittest.mock.patch('autoPyTorch.evaluation.train_evaluator.eval_function') + @unittest.mock.patch('autoPyTorch.evaluation.tae.eval_train_function') def test_exception_in_target_function(self, eval_holdout_mock): config = unittest.mock.Mock() config.config_id = 198 @@ -340,6 +361,7 @@ def test_exception_in_target_function(self, eval_holdout_mock): ta = ExecuteTaFuncWithQueue(backend=BackendMock(), seed=1, stats=self.stats, memory_limit=3072, + multi_objectives=["cost"], metric=accuracy, cost_for_crash=get_cost_of_crash(accuracy), abort_on_first_run_crash=False, @@ -363,6 +385,7 @@ def test_silent_exception_in_target_function(self): ta = ExecuteTaFuncWithQueue(backend=BackendMock(), seed=1, stats=self.stats, memory_limit=3072, + multi_objectives=["cost"], metric=accuracy, 
cost_for_crash=get_cost_of_crash(accuracy),
 abort_on_first_run_crash=False,
@@ -394,6 +417,61 @@ def test_silent_exception_in_target_function(self):
 self.assertNotIn('exit_status', info[1].additional_info)
 self.assertNotIn('traceback', info[1])
+
+ def test_eval_with_simple_intensification(self):
+ config = unittest.mock.Mock(spec=int)
+ config.config_id = 198
+
+ ta = ExecuteTaFuncWithQueue(backend=BackendMock(), seed=1,
+ stats=self.stats,
+ memory_limit=3072,
+ multi_objectives=["cost"],
+ metric=accuracy,
+ cost_for_crash=get_cost_of_crash(accuracy),
+ abort_on_first_run_crash=False,
+ logger_port=self.logger_port,
+ pynisher_context='fork',
+ budget_type='runtime'
+ )
+ ta.pynisher_logger = unittest.mock.Mock()
+ run_info = RunInfo(config=config, cutoff=3000, instance=None,
+ instance_specific=None, seed=1, capped=False)
+
+ for budget in [0.0, 50.0]:
+ # Simple intensification always returns budget = 0
+ # Other intensifications return a non-zero value
+ self.stats.submitted_ta_runs += 1
+ run_info = run_info._replace(budget=budget)
+ run_info_out, _ = ta.run_wrapper(run_info)
+ self.assertEqual(run_info_out.budget, budget)
+
+ def test_eval_forecasting(self):
+ config = unittest.mock.Mock(spec=int)
+ config.config_id = 198
+
+ ta = ExecuteTaFuncWithQueue(backend=BackendMockForecasting(), seed=1,
+ stats=self.stats,
+ memory_limit=3072,
+ multi_objectives=["cost"],
+ metric=accuracy,
+ cost_for_crash=get_cost_of_crash(accuracy),
+ abort_on_first_run_crash=False,
+ logger_port=self.logger_port,
+ pynisher_context='fork',
+ budget_type='runtime',
+ )
+
+ ta.pynisher_logger = unittest.mock.Mock()
+ run_info = RunInfo(config=config, cutoff=3000, instance=None,
+ instance_specific=None, seed=1, capped=False)
+
+ for budget in [0.0, 50.0]:
+ # Simple intensification always returns budget = 0
+ # Other intensifications return a non-zero value
+ self.stats.submitted_ta_runs += 1
+ run_info = run_info._replace(budget=budget)
+ run_info_out, _ = ta.run_wrapper(run_info)
+ self.assertEqual(run_info_out.budget, budget)
+
 @pytest.mark.parametrize("metric,expected", [(accuracy, 1.0), (log_loss, MAXINT)])
 def test_get_cost_of_crash(metric, expected):
diff --git a/test/test_evaluation/test_train_evaluator.py b/test/test_evaluation/test_evaluators.py
similarity index 65%
rename from test/test_evaluation/test_train_evaluator.py
rename to test/test_evaluation/test_evaluators.py
index a3ff067f1..2ca32af10 100644
--- a/test/test_evaluation/test_train_evaluator.py
+++ b/test/test_evaluation/test_evaluators.py
@@ -15,7 +15,8 @@
 from smac.tae import StatusType
 from autoPyTorch.automl_common.common.utils.backend import create
-from autoPyTorch.datasets.resampling_strategy import CrossValTypes
+from autoPyTorch.datasets.resampling_strategy import CrossValTypes, NoResamplingStrategyTypes
+from autoPyTorch.evaluation.test_evaluator import TestEvaluator
 from autoPyTorch.evaluation.train_evaluator import TrainEvaluator
 from autoPyTorch.evaluation.utils import read_queue
 from autoPyTorch.pipeline.base_pipeline import BasePipeline
@@ -294,3 +295,155 @@ def test_additional_metrics_during_training(self, pipeline_mock):
 self.assertIn('additional_run_info', result)
 self.assertIn('opt_loss', result['additional_run_info'])
 self.assertGreater(len(result['additional_run_info']['opt_loss'].keys()), 1)
+
+
+class TestTestEvaluator(BaseEvaluatorTest, unittest.TestCase):
+ _multiprocess_can_split_ = True
+
+ def setUp(self):
+ """
+ Creates a backend mock
+ """
+ tmp_dir_name = self.id()
+ self.ev_path =
os.path.join(this_directory, '.tmp_evaluations', tmp_dir_name)
+ if os.path.exists(self.ev_path):
+ shutil.rmtree(self.ev_path)
+ os.makedirs(self.ev_path, exist_ok=False)
+ dummy_model_files = [os.path.join(self.ev_path, str(n)) for n in range(100)]
+ dummy_pred_files = [os.path.join(self.ev_path, str(n)) for n in range(100, 200)]
+ dummy_cv_model_files = [os.path.join(self.ev_path, str(n)) for n in range(200, 300)]
+ backend_mock = unittest.mock.Mock()
+ backend_mock.get_model_dir.return_value = self.ev_path
+ backend_mock.get_cv_model_dir.return_value = self.ev_path
+ backend_mock.get_model_path.side_effect = dummy_model_files
+ backend_mock.get_cv_model_path.side_effect = dummy_cv_model_files
+ backend_mock.get_prediction_output_path.side_effect = dummy_pred_files
+ backend_mock.temporary_directory = self.ev_path
+ self.backend_mock = backend_mock
+
+ self.tmp_dir = os.path.join(self.ev_path, 'tmp_dir')
+ self.output_dir = os.path.join(self.ev_path, 'out_dir')
+
+ def tearDown(self):
+ if os.path.exists(self.ev_path):
+ shutil.rmtree(self.ev_path)
+
+ @unittest.mock.patch('autoPyTorch.pipeline.tabular_classification.TabularClassificationPipeline')
+ def test_no_resampling(self, pipeline_mock):
+ # Binary iris, contains 69 train samples, 31 test samples
+ D = get_binary_classification_datamanager(NoResamplingStrategyTypes.no_resampling)
+ pipeline_mock.predict_proba.side_effect = \
+ lambda X, batch_size=None: np.tile([0.6, 0.4], (len(X), 1))
+ pipeline_mock.side_effect = lambda **kwargs: pipeline_mock
+ pipeline_mock.get_additional_run_info.return_value = None
+ pipeline_mock.get_default_pipeline_options.return_value = {'budget_type': 'epochs', 'epochs': 10}
+
+ configuration = unittest.mock.Mock(spec=Configuration)
+ backend_api = create(self.tmp_dir, self.output_dir, 'autoPyTorch')
+ backend_api.load_datamanager = lambda: D
+ queue_ = multiprocessing.Queue()
+
+ evaluator = TestEvaluator(backend_api, queue_, configuration=configuration, metric=accuracy, budget=0)
+ evaluator.file_output = unittest.mock.Mock(spec=evaluator.file_output)
+ evaluator.file_output.return_value = (None, {})
+
+ evaluator.fit_predict_and_loss()
+
+ rval = read_queue(evaluator.queue)
+ self.assertEqual(len(rval), 1)
+ result = rval[0]['loss']
+ self.assertEqual(len(rval[0]), 3)
+ self.assertRaises(queue.Empty, evaluator.queue.get, timeout=1)
+
+ self.assertEqual(evaluator.file_output.call_count, 1)
+ self.assertEqual(result, 0.5806451612903225)
+ self.assertEqual(pipeline_mock.fit.call_count, 1)
+ # 2 calls: one for the train set and one for the test set
+ self.assertEqual(pipeline_mock.predict_proba.call_count, 2)
+ self.assertEqual(evaluator.file_output.call_count, 1)
+ # Should be None, as no validation predictions are produced
+ self.assertIsNone(evaluator.file_output.call_args[0][1])
+ # The number of y_test_preds and Y_test should be the same
+ self.assertEqual(evaluator.file_output.call_args[0][0].shape[0],
+ D.test_tensors[1].shape[0])
+ self.assertEqual(evaluator.pipeline.fit.call_count, 1)
+
+ @unittest.mock.patch.object(TestEvaluator, '_loss')
+ def test_file_output(self, loss_mock):
+
+ D = get_regression_datamanager(NoResamplingStrategyTypes.no_resampling)
+ D.name = 'test'
+ self.backend_mock.load_datamanager.return_value = D
+ configuration = unittest.mock.Mock(spec=Configuration)
+ queue_ = multiprocessing.Queue()
+ loss_mock.return_value = None
+
+ evaluator = TestEvaluator(self.backend_mock, queue_, configuration=configuration, metric=accuracy, budget=0)
+
+ self.backend_mock.get_model_dir.return_value = True
+
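# A plain string serves as a stand-in for a fitted pipeline here: file_output presumably
+ # only forwards the object to backend.save_numrun_to_dir, as the assertions below suggest.
+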
evaluator.pipeline = 'model'
+ evaluator.Y_optimization = D.train_tensors[1]
+ rval = evaluator.file_output(
+ D.train_tensors[1],
+ None,
+ D.test_tensors[1],
+ )
+
+ self.assertEqual(rval, (None, {}))
+ # These targets are not saved, as the test evaluator is not used to build an ensemble
+ self.assertEqual(self.backend_mock.save_targets_ensemble.call_count, 0)
+ self.assertEqual(self.backend_mock.save_numrun_to_dir.call_count, 1)
+ self.assertEqual(self.backend_mock.save_numrun_to_dir.call_args_list[-1][1].keys(),
+ {'seed', 'idx', 'budget', 'model', 'cv_model',
+ 'ensemble_predictions', 'valid_predictions', 'test_predictions'})
+ self.assertIsNotNone(self.backend_mock.save_numrun_to_dir.call_args_list[-1][1]['model'])
+ self.assertIsNone(self.backend_mock.save_numrun_to_dir.call_args_list[-1][1]['cv_model'])
+
+ # Check for not containing NaNs - that the models don't predict nonsense
+ # for unseen data
+ D.test_tensors[1][0] = np.NaN
+ rval = evaluator.file_output(
+ D.train_tensors[1],
+ None,
+ D.test_tensors[1],
+ )
+ self.assertEqual(
+ rval,
+ (
+ 1.0,
+ {
+ 'error':
+ 'Model predictions for test set contains NaNs.'
+ },
+ )
+ )
+
+ @unittest.mock.patch('autoPyTorch.pipeline.tabular_classification.TabularClassificationPipeline')
+ def test_predict_proba_binary_classification(self, mock):
+ D = get_binary_classification_datamanager(NoResamplingStrategyTypes.no_resampling)
+ self.backend_mock.load_datamanager.return_value = D
+ mock.predict_proba.side_effect = lambda y, batch_size=None: np.array(
+ [[0.1, 0.9]] * y.shape[0]
+ )
+ mock.side_effect = lambda **kwargs: mock
+ mock.get_default_pipeline_options.return_value = {'budget_type': 'epochs', 'epochs': 10}
+ configuration = unittest.mock.Mock(spec=Configuration)
+ queue_ = multiprocessing.Queue()
+
+ evaluator = TestEvaluator(self.backend_mock, queue_, configuration=configuration, metric=accuracy, budget=0)
+
+ evaluator.fit_predict_and_loss()
+ Y_test_pred = self.backend_mock.save_numrun_to_dir.call_args_list[0][-1][
+ 'ensemble_predictions']
+
+ for i in range(7):
+ self.assertEqual(0.9, Y_test_pred[i][1])
+
+ def test_get_results(self):
+ queue_ = multiprocessing.Queue()
+ for i in range(5):
+ queue_.put((i * 1, 1 - (i * 0.2), 0, "", StatusType.SUCCESS))
+ result = read_queue(queue_)
+ self.assertEqual(len(result), 5)
+ self.assertEqual(result[0][0], 0)
+ self.assertAlmostEqual(result[0][1], 1.0)
diff --git a/test/test_evaluation/test_forecasting_evaluators.py b/test/test_evaluation/test_forecasting_evaluators.py
new file mode 100644
index 000000000..580402d5c
--- /dev/null
+++ b/test/test_evaluation/test_forecasting_evaluators.py
@@ -0,0 +1,276 @@
+import multiprocessing
+import os
+import queue
+import sys
+import unittest
+import unittest.mock
+
+from ConfigSpace import Configuration
+
+import numpy as np
+
+from smac.tae import StatusType
+
+from autoPyTorch.automl_common.common.utils.backend import create
+from autoPyTorch.datasets.resampling_strategy import CrossValTypes
+from autoPyTorch.evaluation.time_series_forecasting_train_evaluator import TimeSeriesForecastingTrainEvaluator
+from autoPyTorch.evaluation.utils import read_queue
+from autoPyTorch.pipeline.components.training.metrics.metrics import mean_MASE_forecasting
+
+this_directory = os.path.dirname(__file__)
+sys.path.append(this_directory)
+from evaluation_util import ( # noqa (E402: module level import not at top of file)
+ BaseEvaluatorTest, get_binary_classification_datamanager,
+ get_forecasting_dataset, get_multiclass_classification_datamanager,
+
get_regression_datamanager) + +from test_evaluators import TestTrainEvaluator + + +class BackendMock(object): + def load_datamanager(self): + return get_multiclass_classification_datamanager() + + +class TestTimeSeriesForecastingTrainEvaluator(unittest.TestCase): + def setUp(self): + TestTrainEvaluator.setUp(self) + + def tearDown(self): + TestTrainEvaluator.tearDown(self) + + @unittest.mock.patch('autoPyTorch.pipeline.time_series_forecasting.TimeSeriesForecastingPipeline') + def test_budget_type_choices(self, pipeline_mock): + D = get_forecasting_dataset() + n_prediction_steps = D.n_prediction_steps + pipeline_mock.predict.side_effect = \ + lambda X, batch_size=None: np.tile([0.], (len(X), n_prediction_steps)) + pipeline_mock.side_effect = lambda **kwargs: pipeline_mock + pipeline_mock.get_additional_run_info.return_value = None + + configuration = unittest.mock.Mock(spec=Configuration) + backend_api = create(self.tmp_dir, self.output_dir, prefix='autoPyTorch') + backend_api.load_datamanager = lambda: D + queue_ = multiprocessing.Queue() + + budget_value = 0.1 + + for budget_type in ['resolution', 'num_seq', 'num_sample_per_seq']: + evaluator = TimeSeriesForecastingTrainEvaluator(backend_api, + queue_, + configuration=configuration, + metric=mean_MASE_forecasting, budget=0, + pipeline_config={'budget_type': budget_type, + budget_type: 0.1}, + min_num_test_instances=100) + self.assertTrue('epochs' not in evaluator.fit_dictionary) + if budget_type == 'resolution': + self.assertTrue('sample_interval' in evaluator.fit_dictionary) + self.assertEqual(int(np.ceil(1.0 / budget_value)), evaluator.fit_dictionary['sample_interval']) + elif budget_type == 'num_seq': + self.assertTrue('fraction_seq' in evaluator.fit_dictionary) + self.assertEqual(budget_value, evaluator.fit_dictionary['fraction_seq']) + if budget_type == 'num_sample_per_seq': + self.assertTrue('fraction_samples_per_seq' in evaluator.fit_dictionary) + self.assertEqual(budget_value, evaluator.fit_dictionary['fraction_samples_per_seq']) + + @unittest.mock.patch('autoPyTorch.pipeline.time_series_forecasting.TimeSeriesForecastingPipeline') + def test_holdout(self, pipeline_mock): + pipeline_mock.fit_dictionary = {'budget_type': 'epochs', 'epochs': 50} + D = get_forecasting_dataset() + n_prediction_steps = D.n_prediction_steps + pipeline_mock.predict.side_effect = \ + lambda X, batch_size=None: np.tile([0.], (len(X), n_prediction_steps)) + pipeline_mock.side_effect = lambda **kwargs: pipeline_mock + pipeline_mock.get_additional_run_info.return_value = None + + configuration = unittest.mock.Mock(spec=Configuration) + backend_api = create(self.tmp_dir, self.output_dir, prefix='autoPyTorch') + backend_api.load_datamanager = lambda: D + queue_ = multiprocessing.Queue() + + evaluator = TimeSeriesForecastingTrainEvaluator(backend_api, + queue_, + configuration=configuration, + metric=mean_MASE_forecasting, budget=0, + pipeline_config={'budget_type': 'epochs', 'epochs': 50}, + min_num_test_instances=100) + self.assertTrue('epochs' in evaluator.fit_dictionary) + evaluator.file_output = unittest.mock.Mock(spec=evaluator.file_output) + evaluator.file_output.return_value = (None, {}) + + evaluator.fit_predict_and_loss() + + rval = read_queue(evaluator.queue) + + self.assertEqual(len(rval), 1) + result = rval[0]['loss'] + self.assertEqual(len(rval[0]), 3) + + self.assertRaises(queue.Empty, evaluator.queue.get, timeout=1) + + self.assertEqual(evaluator.file_output.call_count, 1) + self.assertAlmostEqual(result, 4591.5, places=4) + 
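# The loss pinned above is assumed to be the mean MASE of an all-zero forecast over the
+ # deterministic np.arange sequences built in get_forecasting_dataset; changing that
+ # helper would shift this constant.
+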
self.assertEqual(pipeline_mock.fit.call_count, 1)
+ # As forecasting inference can be quite expensive, we only allow one optimization prediction and one test prediction
+ self.assertEqual(pipeline_mock.predict.call_count, 2)
+
+ self.assertEqual(evaluator.file_output.call_count, 1)
+ self.assertEqual(evaluator.file_output.call_args[0][0].shape[0], len(D.splits[0][1]) * n_prediction_steps)
+ self.assertIsNone(evaluator.file_output.call_args[0][1])
+
+ self.assertEqual(evaluator.file_output.call_args[0][2].shape[0],
+ D.test_tensors[1].shape[0])
+ self.assertEqual(evaluator.pipeline.fit.call_count, 1)
+
+ res = evaluator.file_output.call_args[0][0].reshape(-1, n_prediction_steps, evaluator.num_targets)
+ assert np.all(res == 0.)
+
+ @unittest.mock.patch('autoPyTorch.pipeline.time_series_forecasting.TimeSeriesForecastingPipeline')
+ def test_cv(self, pipeline_mock):
+ D = get_forecasting_dataset(resampling_strategy=CrossValTypes.time_series_cross_validation)
+ assert D.resampling_strategy_args['num_splits'] == 3
+
+ n_prediction_steps = D.n_prediction_steps
+
+ pipeline_mock.predict.side_effect = \
+ lambda X, batch_size=None: np.tile([0.], (len(X), n_prediction_steps))
+
+ pipeline_mock.side_effect = lambda **kwargs: pipeline_mock
+ pipeline_mock.get_additional_run_info.return_value = None
+
+ configuration = unittest.mock.Mock(spec=Configuration)
+ backend_api = create(self.tmp_dir, self.output_dir, prefix='autoPyTorch')
+ backend_api.load_datamanager = lambda: D
+ queue_ = multiprocessing.Queue()
+
+ evaluator = TimeSeriesForecastingTrainEvaluator(backend_api,
+ queue_,
+ configuration=configuration,
+ metric=mean_MASE_forecasting, budget=0,
+ pipeline_config={'budget_type': 'epochs', 'epochs': 50})
+
+ evaluator.file_output = unittest.mock.Mock(spec=evaluator.file_output)
+ evaluator.file_output.return_value = (None, {})
+
+ evaluator.fit_predict_and_loss()
+
+ rval = read_queue(evaluator.queue)
+ self.assertEqual(len(rval), 1)
+ result = rval[0]['loss']
+ self.assertEqual(len(rval[0]), 3)
+ self.assertRaises(queue.Empty, evaluator.queue.get, timeout=1)
+
+ self.assertEqual(evaluator.file_output.call_count, 1)
+ self.assertAlmostEqual(result, 4590.06977, places=4)
+ self.assertEqual(pipeline_mock.fit.call_count, 3)
+ # 4 predict calls: one per validation fold, while the test targets are only evaluated once
+ self.assertEqual(pipeline_mock.predict.call_count, 4)
+ # the optimization predictions in CV are the concatenation of the 3 folds,
+ # hence 3 * the size of a single split
+ self.assertEqual(evaluator.file_output.call_args[0][0].shape[0],
+ 3 * len(D.splits[0][1]) * n_prediction_steps, evaluator.file_output.call_args)
+ self.assertIsNone(evaluator.file_output.call_args[0][1])
+ # the test predictions must cover the full test set
+ self.assertEqual(evaluator.file_output.call_args[0][2].shape[0],
+ D.test_tensors[1].shape[0])
+
+ res = evaluator.file_output.call_args[0][0].reshape(-1, n_prediction_steps, evaluator.num_targets)
+ assert np.all(res == 0.)
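+
+ # A sketch of the proxy-validation behaviour exercised by test_proxy_val_set below,
+ # inferred from its assertions rather than from the evaluator's source: when the number
+ # of validation sequences exceeds min_num_test_instances, only an evenly spaced subset,
+ # roughly
+ #     keep = np.arange(0, n_val_seqs, n_val_seqs // n_evaluated)[:n_evaluated]  # hypothetical
+ # is forecast by the real pipeline, while each remaining sequence falls back to a dummy
+ # forecast that repeats its last observed target value.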
+
+ @unittest.mock.patch('autoPyTorch.pipeline.time_series_forecasting.TimeSeriesForecastingPipeline')
+ def test_proxy_val_set(self, pipeline_mock):
+ pipeline_mock.fit_dictionary = {'budget_type': 'epochs', 'epochs': 0.1}
+ D = get_forecasting_dataset(n_prediction_steps=5)
+ n_prediction_steps = D.n_prediction_steps
+ pipeline_mock.predict.side_effect = \
+ lambda X, batch_size=None: np.tile([0.], (len(X), n_prediction_steps))
+ pipeline_mock.side_effect = lambda **kwargs: pipeline_mock
+ pipeline_mock.get_additional_run_info.return_value = None
+
+ configuration = unittest.mock.Mock(spec=Configuration)
+ backend_api = create(self.tmp_dir, self.output_dir, prefix='autoPyTorch')
+ backend_api.load_datamanager = lambda: D
+ queue_ = multiprocessing.Queue()
+
+ evaluator = TimeSeriesForecastingTrainEvaluator(backend_api,
+ queue_,
+ configuration=configuration,
+ metric=mean_MASE_forecasting, budget=0.3,
+ pipeline_config={'budget_type': 'epochs', 'epochs': 50},
+ min_num_test_instances=1)
+ evaluator.file_output = unittest.mock.Mock(spec=evaluator.file_output)
+ evaluator.file_output.return_value = (None, {})
+
+ evaluator.fit_predict_and_loss()
+
+ rval = read_queue(evaluator.queue)
+
+ self.assertEqual(len(rval), 1)
+ result = rval[0]['loss']
+
+ self.assertAlmostEqual(result, 925.2, places=4)
+ res = evaluator.file_output.call_args[0][0].reshape(-1, n_prediction_steps, evaluator.num_targets)
+
+ n_evaluated_pip_mock = 0
+ val_split = D.splits[0][1]
+
+ for i_seq, seq_output in enumerate(res):
+ if i_seq % 3 == 0 and n_evaluated_pip_mock < 3:
+ n_evaluated_pip_mock += 1
+ assert np.all(seq_output == 0.)
+ else:
+ # predicted with the dummy predictor
+ assert np.all(seq_output == D.get_validation_set(val_split[i_seq])[-1][0]['past_targets'][-1].numpy())
+
+ @unittest.mock.patch('autoPyTorch.pipeline.time_series_forecasting.TimeSeriesForecastingPipeline')
+ @unittest.mock.patch('multiprocessing.Queue')
+ def test_finish_up(self, queue_mock, pipeline_mock):
+ # mock.patch decorators inject mocks bottom-up, so the Queue mock comes first
+ pipeline_mock.fit_dictionary = {'budget_type': 'epochs', 'epochs': 50}
+
+ rs = np.random.RandomState(1)
+ D = get_forecasting_dataset(n_prediction_steps=3)
+
+ n_prediction_steps = D.n_prediction_steps
+
+ pipeline_mock.predict.side_effect = \
+ lambda X, batch_size=None: np.tile([0.], (len(X), n_prediction_steps))
+
+ pipeline_mock.side_effect = lambda **kwargs: pipeline_mock
+ pipeline_mock.get_additional_run_info.return_value = None
+
+ configuration = unittest.mock.Mock(spec=Configuration)
+ backend_api = create(self.tmp_dir, self.output_dir, prefix='autoPyTorch')
+ backend_api.load_datamanager = lambda: D
+
+ ae = TimeSeriesForecastingTrainEvaluator(backend_api,
+ queue_mock,
+ configuration=configuration,
+ metric=mean_MASE_forecasting, budget=0.3,
+ pipeline_config={'budget_type': 'epochs', 'epochs': 50},
+ min_num_test_instances=1)
+
+ val_splits = D.splits[0][1]
+ mase_val = ae.generate_mase_coefficient_for_validation(val_splits)
+
+ ae.Y_optimization = rs.rand(len(val_splits) * n_prediction_steps, D.num_targets) * mase_val
+ predictions_ensemble = rs.rand(len(val_splits) * n_prediction_steps, D.num_targets) * mase_val
+ predictions_test = rs.rand(len(D.datasets) * n_prediction_steps, D.num_targets)
+
+ metric_kwargs = {'sp': ae.seasonality,
+ 'n_prediction_steps': ae.n_prediction_steps,
+ 'mase_coefficient': ae.generate_mase_coefficient_for_test_set()}
+
+ # finish_up should forward the test loss through the queue
+ ae.finish_up(
+ loss={'mean_MASE_forecasting': 0.1},
+ train_loss=None,
+ opt_pred=predictions_ensemble,
+ valid_pred=None,
+
test_pred=predictions_test, + additional_run_info=None, + file_output=True, + status=StatusType.SUCCESS, + **metric_kwargs + ) + self.assertTrue('test_loss' in queue_mock.put.call_args[0][0]['additional_run_info']) diff --git a/test/test_evaluation/test_utils.py b/test/test_evaluation/test_utils.py new file mode 100644 index 000000000..e81eea38b --- /dev/null +++ b/test/test_evaluation/test_utils.py @@ -0,0 +1,35 @@ +""" +Tests the functionality in autoPyTorch.evaluation.utils +""" +import pytest + +from autoPyTorch.evaluation.utils import DisableFileOutputParameters + + +@pytest.mark.parametrize('disable_file_output', + [['pipeline', 'pipelines'], + [DisableFileOutputParameters.pipelines, DisableFileOutputParameters.pipeline]]) +def test_disable_file_output_no_error(disable_file_output): + """ + Checks that `DisableFileOutputParameters.check_compatibility` + does not raise an error for the parameterized values of `disable_file_output`. + + Args: + disable_file_output ([List[Union[str, DisableFileOutputParameters]]]): + Options that should be compatible with the `DisableFileOutputParameters` + defined in `autoPyTorch`. + """ + DisableFileOutputParameters.check_compatibility(disable_file_output=disable_file_output) + + +def test_disable_file_output_error(): + """ + Checks that `DisableFileOutputParameters.check_compatibility` raises an error + for a value not present in `DisableFileOutputParameters` and ensures that the + expected error is raised. + """ + disable_file_output = ['model'] + with pytest.raises(ValueError, match=r"Expected .*? to be in the members (.*?) of" + r" DisableFileOutputParameters or as string value" + r" of a member."): + DisableFileOutputParameters.check_compatibility(disable_file_output=disable_file_output) diff --git a/test/test_pipeline/components/preprocessing/base.py b/test/test_pipeline/components/preprocessing/base.py index ac16e286a..a2705e19b 100644 --- a/test/test_pipeline/components/preprocessing/base.py +++ b/test/test_pipeline/components/preprocessing/base.py @@ -3,9 +3,12 @@ from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.TabularColumnTransformer import \ TabularColumnTransformer +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer import CoalescerChoice from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding import EncoderChoice from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.imputation.SimpleImputer import SimpleImputer from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling import ScalerChoice +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.variance_thresholding. 
\ + VarianceThreshold import VarianceThreshold from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline @@ -28,6 +31,8 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]], steps.extend([ ("imputer", SimpleImputer()), + ("variance_threshold", VarianceThreshold()), + ("coalescer", CoalescerChoice(default_dataset_properties)), ("encoder", EncoderChoice(default_dataset_properties)), ("scaler", ScalerChoice(default_dataset_properties)), ("tabular_transformer", TabularColumnTransformer()), diff --git a/test/test_pipeline/components/preprocessing/forecasting/__init__.py b/test/test_pipeline/components/preprocessing/forecasting/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/test_pipeline/components/preprocessing/forecasting/base.py b/test/test_pipeline/components/preprocessing/forecasting/base.py new file mode 100644 index 000000000..eed947113 --- /dev/null +++ b/test/test_pipeline/components/preprocessing/forecasting/base.py @@ -0,0 +1,47 @@ +from typing import Any, Dict, List, Optional, Tuple, Union + +from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice +from autoPyTorch.pipeline.components.base_component import autoPyTorchComponent +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.TimeSeriesTransformer import ( + TimeSeriesFeatureTransformer, + TimeSeriesTargetTransformer +) +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.encoding import TimeSeriesEncoderChoice +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.imputation.TimeSeriesImputer import ( + TimeSeriesFeatureImputer, + TimeSeriesTargetImputer +) +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.base_scaler import BaseScaler +from autoPyTorch.pipeline.time_series_forecasting import TimeSeriesForecastingPipeline + + +class ForecastingPipeline(TimeSeriesForecastingPipeline): + def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]], + ) -> List[Tuple[str, autoPyTorchChoice]]: + """ + Defines what steps a pipeline should follow. + The step itself has choices given via autoPyTorchChoice. + + Returns: + List[Tuple[str, autoPyTorchChoice]]: list of steps sequentially exercised + by the pipeline. 
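+            For uni-variant datasets only the target preprocessing steps are added.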
+ """ + steps: List[Tuple[str, Union[autoPyTorchChoice, autoPyTorchComponent]]] = [] + + default_dataset_properties = {'target_type': 'time_series_forecasting'} + if dataset_properties is not None: + default_dataset_properties.update(dataset_properties) + if not default_dataset_properties['uni_variant']: + + steps.extend([("imputer", TimeSeriesFeatureImputer(random_state=self.random_state)), + ("scaler", BaseScaler(random_state=self.random_state)), + ('encoding', TimeSeriesEncoderChoice(default_dataset_properties, + random_state=self.random_state)), + ("time_series_transformer", TimeSeriesFeatureTransformer(random_state=self.random_state)), + ]) + + steps.extend([("target_imputer", TimeSeriesTargetImputer(random_state=self.random_state)), + ("time_series_target_transformer", TimeSeriesTargetTransformer(random_state=self.random_state)), + ]) + + return steps diff --git a/test/test_pipeline/components/preprocessing/forecasting/test_encoder_choice.py b/test/test_pipeline/components/preprocessing/forecasting/test_encoder_choice.py new file mode 100644 index 000000000..9079c6bec --- /dev/null +++ b/test/test_pipeline/components/preprocessing/forecasting/test_encoder_choice.py @@ -0,0 +1,22 @@ +import unittest + +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.encoding import TimeSeriesEncoderChoice + + +class TestEncoderChoice(unittest.TestCase): + def test_get_set_config_space(self): + """Make sure that we can setup a valid choice in the encoder + choice""" + dataset_properties = {'numerical_columns': list(range(4)), 'categorical_columns': [5]} + encoder_choice = TimeSeriesEncoderChoice(dataset_properties) + cs = encoder_choice.get_hyperparameter_search_space() + + # Make sure that all hyperparameters are part of the search space + self.assertListEqual( + sorted(cs.get_hyperparameter('__choice__').choices), + sorted(list(encoder_choice.get_components().keys())) + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_pipeline/components/preprocessing/forecasting/test_encoders.py b/test/test_pipeline/components/preprocessing/forecasting/test_encoders.py new file mode 100644 index 000000000..5769650f2 --- /dev/null +++ b/test/test_pipeline/components/preprocessing/forecasting/test_encoders.py @@ -0,0 +1,93 @@ +import unittest + +import numpy as np +from numpy.testing import assert_array_equal + +import pandas as pd + +from sklearn.base import BaseEstimator +from sklearn.compose import make_column_transformer + +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.encoding.NoEncoder import \ + TimeSeriesNoEncoder +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.encoding.OneHotEncoder import \ + TimeSeriesOneHotEncoder + + +class TestEncoders(unittest.TestCase): + def setUp(self) -> None: + data = np.array([[1, 'male', 1], + [1, 'female', 2], + [1, 'unknown', 2], + [2, 'male', 2], + [2, 'female', 2]]) + feature_names = ("feature_n1", "feature_c", "feature_n2") + + self.data = pd.DataFrame(data, columns=feature_names) + + categorical_columns = [1] + numerical_columns = [0, 2] + self.train_indices = np.array([0, 1, 2]) + self.test_indices = np.array([3, 4]) + + self.dataset_properties = { + 'categorical_columns': categorical_columns, + 'numerical_columns': numerical_columns, + 'categories': [['female', 'male', 'unknown']], + 'feature_names': feature_names, + 'feature_shapes': {fea: 1 for fea in feature_names} + } + + def test_one_hot_encoder_no_unknown(self): + X = { + 'X_train': 
self.data.iloc[self.train_indices], + 'dataset_properties': self.dataset_properties + } + encoder_component = TimeSeriesOneHotEncoder() + encoder_component.fit(X) + X = encoder_component.transform(X) + encoder = X['encoder']['categorical'] + + # check if the fit dictionary X is modified as expected + self.assertIsInstance(X['encoder'], dict) + self.assertIsInstance(encoder, BaseEstimator) + self.assertIsNone(X['encoder']['numerical']) + + # make column transformer with returned encoder to fit on data + column_transformer = make_column_transformer((encoder, X['dataset_properties']['categorical_columns']), + remainder='passthrough') + column_transformer = column_transformer.fit(X['X_train']) + transformed = column_transformer.transform(self.data.iloc[self.test_indices]) + # check if the transform is correct + + assert_array_equal(transformed.tolist(), [[0.0, 1.0, 0.0, '2', '2'], [1.0, 0.0, 0.0, '2', '2']]) + + dataset_properties = X['dataset_properties'] + + idx_cat = 0 + for i, fea_name in enumerate(dataset_properties['feature_names']): + if i in dataset_properties['categorical_columns']: + self.assertEqual(dataset_properties['feature_shapes'][fea_name], + len(dataset_properties['categories'][idx_cat])) + idx_cat += 1 + else: + assert dataset_properties['feature_shapes'][fea_name] == 1 + + def test_none_encoder(self): + X = { + 'X_train': self.data.iloc[self.train_indices], + 'dataset_properties': self.dataset_properties + } + + encoder_component = TimeSeriesNoEncoder() + encoder_component.fit(X) + X = encoder_component.transform(X) + + # check if the fit dictionary X is modified as expected + self.assertIsInstance(X['encoder'], dict) + self.assertIsNone(X['encoder']['categorical']) + self.assertIsNone(X['encoder']['numerical']) + + dataset_properties = X['dataset_properties'] + for i, fea_name in enumerate(dataset_properties['feature_names']): + self.assertEqual(dataset_properties['feature_shapes'][fea_name], 1) diff --git a/test/test_pipeline/components/preprocessing/forecasting/test_imputer.py b/test/test_pipeline/components/preprocessing/forecasting/test_imputer.py new file mode 100644 index 000000000..6c0143609 --- /dev/null +++ b/test/test_pipeline/components/preprocessing/forecasting/test_imputer.py @@ -0,0 +1,282 @@ +import unittest + +import numpy as np +from numpy.testing import assert_array_equal + +import pandas as pd + +import pytest + +from sklearn.base import BaseEstimator, clone +from sklearn.compose import make_column_transformer + +from sktime.transformations.series.impute import Imputer as SKTImpute + +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.imputation.TimeSeriesImputer import ( + TimeSeriesFeatureImputer, + TimeSeriesTargetImputer +) + + +class TestTimeSeriesFeatureImputer(unittest.TestCase): + def setUp(self) -> None: + data = np.array([[1.0, np.nan, 3], + [np.nan, 8, 9], + [4.0, 5, np.nan], + [np.nan, 2, 3], + [7.0, np.nan, 9], + [4.0, np.nan, np.nan]]) + numerical_columns = [0, 1, 2] + categorical_columns = [] + train_indices = np.array([0, 1, 2]) + self.test_indices = np.array([3, 4, 5]) + dataset_properties = { + 'categorical_columns': categorical_columns, + 'numerical_columns': numerical_columns, + } + self.X = { + 'X_train': data[train_indices], + 'dataset_properties': dataset_properties + } + self.data = data + self.dataset_properties = dataset_properties + + def test_get_config_space(self): + dataset_properties = dict(categorical_columns=[0, 1], + numerical_columns=[1, 2], + features_have_missing_values=True) + config = 
TimeSeriesFeatureImputer.get_hyperparameter_search_space(dataset_properties).sample_configuration() + estimator = TimeSeriesFeatureImputer(**config) + estimator_clone = clone(estimator) + estimator_clone_params = estimator_clone.get_params() + + # Make sure all keys are copied properly + for k, v in estimator.get_params().items(): + self.assertIn(k, estimator_clone_params) + + # Make sure the params getter of estimator are honored + klass = estimator.__class__ + new_object_params = estimator.get_params(deep=False) + for name, param in new_object_params.items(): + new_object_params[name] = clone(param, safe=False) + new_object = klass(**new_object_params) + params_set = new_object.get_params(deep=False) + + for name in new_object_params: + param1 = new_object_params[name] + param2 = params_set[name] + self.assertEqual(param1, param2) + + dataset_properties['features_have_missing_values'] = False + cs = TimeSeriesFeatureImputer.get_hyperparameter_search_space(dataset_properties) + self.assertEqual(len(cs.get_hyperparameters()), 0) + + with self.assertRaises(ValueError): + TimeSeriesFeatureImputer.get_hyperparameter_search_space() + + def test_drift_imputation(self): + imputer_component = TimeSeriesFeatureImputer(imputation_strategy='drift') + data = pd.DataFrame(self.data) + + imputer_component = imputer_component.fit(self.X) + X = imputer_component.transform(self.X) + categorical_imputer = X['imputer']['categorical'] + numerical_imputer = X['imputer']['numerical'] + + # check if the fit dictionary X is modified as expected + self.assertIsInstance(X['imputer'], dict) + self.assertIsNone(categorical_imputer) + self.assertIsInstance(numerical_imputer, BaseEstimator) + + # make column transformer with returned encoder to fit on data + column_transformer = make_column_transformer((numerical_imputer, + X['dataset_properties']['numerical_columns']), + remainder='passthrough') + column_transformer = column_transformer.fit(X['X_train']) + transformed = column_transformer.transform(data.iloc[self.test_indices]) + + skt_imputer = SKTImpute(method='drift', random_state=imputer_component.random_state) + skt_imputer.fit(X['X_train']) + + self.assertTrue(np.allclose(transformed, skt_imputer.transform(data.iloc[self.test_indices]).values)) + + def test_linear_imputation(self): + imputer_component = TimeSeriesFeatureImputer(imputation_strategy='linear') + + imputer_component = imputer_component.fit(self.X) + X = imputer_component.transform(self.X) + numerical_imputer = X['imputer']['numerical'] + + # make column transformer with returned encoder to fit on data + column_transformer = make_column_transformer((numerical_imputer, + X['dataset_properties']['numerical_columns']), + remainder='passthrough') + column_transformer = column_transformer.fit(X['X_train']) + transformed = column_transformer.transform(self.data[self.test_indices]) + + skt_imputer = SKTImpute(method='linear', random_state=imputer_component.random_state) + skt_imputer.fit(X['X_train']) + + assert_array_equal(transformed, skt_imputer.transform(self.data[self.test_indices])) + + def test_nearest_imputation(self): + data = np.array([[1.0, np.nan, 7], + [np.nan, 9, 10], + [10.0, 7, 7], + [9.0, np.nan, 11], + [9.0, 9, np.nan], + [np.nan, 5, 6], + [12.0, np.nan, 8], + [9.0, 7.0, np.nan]]) + numerical_columns = [0, 1, 2] + categorical_columns = [] + train_indices = np.array([0, 1, 2, 3, 4]) + test_indices = np.array([5, 6, 7]) + dataset_properties = { + 'categorical_columns': categorical_columns, + 'numerical_columns': numerical_columns, + } + X = 
{
+            'X_train': data[train_indices],
+            'dataset_properties': dataset_properties
+        }
+        imputer_component = TimeSeriesFeatureImputer(imputation_strategy='nearest')
+
+        imputer_component = imputer_component.fit(X)
+        X = imputer_component.transform(X)
+        numerical_imputer = X['imputer']['numerical']
+
+        # make column transformer with returned encoder to fit on data
+        column_transformer = make_column_transformer((numerical_imputer,
+                                                      X['dataset_properties']['numerical_columns']),
+                                                     remainder='passthrough')
+        column_transformer = column_transformer.fit(X['X_train'])
+        transformed = column_transformer.transform(data[test_indices])
+
+        skt_imputer = SKTImpute(method='nearest', random_state=imputer_component.random_state)
+        skt_imputer.fit(X['X_train'])
+
+        assert_array_equal(transformed, skt_imputer.transform(data[test_indices]))
+
+    def test_constant_imputation(self):
+        imputer_component = TimeSeriesFeatureImputer(imputation_strategy='constant_zero')
+
+        imputer_component = imputer_component.fit(self.X)
+        X = imputer_component.transform(self.X)
+        numerical_imputer = X['imputer']['numerical']
+
+        # make column transformer with returned encoder to fit on data
+        column_transformer = make_column_transformer((numerical_imputer,
+                                                      X['dataset_properties']['numerical_columns']),
+                                                     remainder='passthrough')
+        column_transformer = column_transformer.fit(X['X_train'])
+        transformed = column_transformer.transform(self.data[self.test_indices])
+        assert_array_equal(transformed, np.array([[0, 2, 3],
+                                                  [7, 0, 9],
+                                                  [4, 0, 0]]))
+
+    def test_bfill_imputation(self):
+        imputer_component = TimeSeriesFeatureImputer(imputation_strategy='bfill')
+
+        imputer_component = imputer_component.fit(self.X)
+        X = imputer_component.transform(self.X)
+        numerical_imputer = X['imputer']['numerical']
+
+        # make column transformer with returned encoder to fit on data
+        column_transformer = make_column_transformer((numerical_imputer,
+                                                      X['dataset_properties']['numerical_columns']),
+                                                     remainder='passthrough')
+        column_transformer = column_transformer.fit(X['X_train'])
+        transformed = column_transformer.transform(self.data[self.test_indices])
+
+        assert_array_equal(transformed, np.array([[7., 2, 3],
+                                                  [7, 2., 9],
+                                                  [4, 2., 9.]]))
+
+    def test_ffill_imputation(self):
+        imputer_component = TimeSeriesFeatureImputer(imputation_strategy='ffill')
+
+        imputer_component = imputer_component.fit(self.X)
+        X = imputer_component.transform(self.X)
+        numerical_imputer = X['imputer']['numerical']
+
+        # make column transformer with returned encoder to fit on data
+        column_transformer = make_column_transformer((numerical_imputer,
+                                                      X['dataset_properties']['numerical_columns']),
+                                                     remainder='passthrough')
+        column_transformer = column_transformer.fit(X['X_train'])
+        transformed = column_transformer.transform(self.data[self.test_indices])
+        assert_array_equal(transformed, np.array([[7, 2, 3],
+                                                  [7, 2, 9],
+                                                  [4, 2, 9]]))
+
+
+class TestTimeSeriesTargetImputer(unittest.TestCase):
+    def test_get_config_space(self):
+        dataset_properties = dict(categorical_columns=[0, 1],
+                                  numerical_columns=[1, 2])
+        config = TimeSeriesTargetImputer.get_hyperparameter_search_space(dataset_properties).sample_configuration()
+        estimator = TimeSeriesTargetImputer(**config)
+        estimator_clone = clone(estimator)
+        estimator_clone_params = estimator_clone.get_params()
+
+        # Make sure all keys are copied properly
+        for k, v in estimator.get_params().items():
+            self.assertIn(k, estimator_clone_params)
+
+        # Make sure the params getter of estimator are honored
+        klass = 
estimator.__class__
+        new_object_params = estimator.get_params(deep=False)
+        for name, param in new_object_params.items():
+            new_object_params[name] = clone(param, safe=False)
+        new_object = klass(**new_object_params)
+        params_set = new_object.get_params(deep=False)
+
+        for name in new_object_params:
+            param1 = new_object_params[name]
+            param2 = params_set[name]
+            self.assertEqual(param1, param2)
+
+        dataset_properties = dict(targets_have_missing_values=False)
+        cs = TimeSeriesTargetImputer.get_hyperparameter_search_space(dataset_properties)
+        self.assertEqual(len(cs.get_hyperparameters()), 0)
+
+        with pytest.raises(ValueError):
+            TimeSeriesTargetImputer.get_hyperparameter_search_space()
+
+    def test_ffill_imputation(self):
+        y = np.array([1.0, np.nan, 8, 9, 4.0, 5, np.nan]).reshape([-1, 1])
+        numerical_columns = [0, 1, 2]
+        categorical_columns = []
+        dataset_properties = {
+            'categorical_columns': categorical_columns,
+            'numerical_columns': numerical_columns,
+        }
+        self.X = {
+            'y_train': y,
+            'dataset_properties': dataset_properties
+        }
+        self.dataset_properties = dataset_properties
+
+        imputer_component = TimeSeriesTargetImputer(imputation_strategy='ffill')
+
+        imputer_component = imputer_component.fit(self.X)
+        X = imputer_component.transform(self.X)
+        numerical_imputer = X['target_imputer']['target_numerical']
+
+        # check if the fit dictionary X is modified as expected
+        self.assertIsInstance(X['target_imputer'], dict)
+        self.assertIsInstance(numerical_imputer, BaseEstimator)
+
+        # make column transformer with returned encoder to fit on data
+        column_transformer = make_column_transformer((numerical_imputer, [0]),
+                                                     remainder='passthrough')
+        column_transformer = column_transformer.fit(X['y_train'])
+        transformed = column_transformer.transform(y)
+        assert_array_equal(transformed, np.array([[1.], [1.], [8.], [9.], [4.], [5.], [5.]]))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/test_pipeline/components/preprocessing/forecasting/test_scaling.py b/test/test_pipeline/components/preprocessing/forecasting/test_scaling.py
new file mode 100644
index 000000000..047806bc5
--- /dev/null
+++ b/test/test_pipeline/components/preprocessing/forecasting/test_scaling.py
@@ -0,0 +1,186 @@
+import unittest
+
+import numpy as np
+
+import pandas as pd
+
+from sklearn.base import BaseEstimator
+from sklearn.compose import make_column_transformer
+
+from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.base_scaler import BaseScaler
+from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.utils import TimeSeriesScaler
+
+
+class TestScaling(unittest.TestCase):
+    def setUp(self) -> None:
+        data_seq_1 = np.array([[1, 2, 3],
+                               [0, 2, 3],
+                               [2, 2, 3],
+                               ])
+
+        data_seq_2 = np.array([[0, 1, 1],
+                               [0, 1, 2],
+                               [0, 1, 4],
+                               [0, 1, 6]
+                               ])
+
+        columns = ['f1', 's', 'f2']
+        self.raw_data = [data_seq_1, data_seq_2]
+        self.data = pd.DataFrame(np.concatenate([data_seq_1, data_seq_2]), columns=columns, index=[0] * 3 + [1] * 4)
+        self.static_features = ('s',)
+        self.static_features_column = (1, )
+
+        categorical_columns = list()
+        numerical_columns = [0, 1, 2]
+
+        self.dataset_properties = {'categorical_columns': categorical_columns,
+                                   'numerical_columns': numerical_columns,
+                                   'static_features': self.static_features,
+                                   'is_small_preprocess': True}
+
+    def test_base_and_standard_scaler(self):
+        scaler_component = BaseScaler(scaling_mode='standard')
+        X = {
+            'X_train': self.data,
+
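+            # self.data stacks both sequences; the DataFrame index (0 and 1) marks the sequence id
+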
'dataset_properties': self.dataset_properties
+        }
+
+        scaler_component = scaler_component.fit(dict(dataset_properties=self.dataset_properties))
+        X = scaler_component.transform(X)
+
+        scaler: TimeSeriesScaler = X['scaler']['numerical']
+
+        # check if the fit dictionary X is modified as expected
+        self.assertIsInstance(X['scaler'], dict)
+        self.assertIsInstance(scaler, BaseEstimator)
+        self.assertIsNone(X['scaler']['categorical'])
+
+        # make column transformer with returned encoder to fit on data
+        column_transformer = make_column_transformer((scaler, X['dataset_properties']['numerical_columns']),
+                                                     remainder='passthrough')
+        column_transformer = column_transformer.fit(X['X_train'])
+        transformer = column_transformer.named_transformers_['timeseriesscaler']
+
+        self.assertTrue(np.allclose(transformer.loc.values, np.asarray([[1.0, 1.428571, 3.00],
+                                                                        [0.0, 1.428571, 3.25]])))
+
+        self.assertTrue(np.allclose(transformer.scale.values, np.asarray([[1.0, 0.534522, 1.000000],
+                                                                          [1.0, 0.534522, 2.217356]])))
+        transformed = column_transformer.transform(self.data)
+
+        self.assertTrue(np.allclose(transformed, np.asarray([[0., 1.06904497, 0.],
+                                                             [-1., 1.06904497, 0.],
+                                                             [1., 1.06904497, 0.],
+                                                             [0., -0.80178373, -1.01472214],
+                                                             [0., -0.80178373, -0.56373452],
+                                                             [0., -0.80178373, 0.33824071],
+                                                             [0., -0.80178373, 1.24021595]])))
+
+        # second column is static features. It needs to be the mean and std value across all sequences
+        scaler.dataset_is_small_preprocess = False
+
+        transformed_test = np.concatenate([scaler.transform(raw_data) for raw_data in self.raw_data])
+        # the non-static columns must match the batch transform above
+        self.assertTrue(np.allclose(transformed[:, [0, -1]], transformed_test[:, [0, -1]]))
+
+    def test_min_max(self):
+        scaler = TimeSeriesScaler(mode='min_max',
+                                  static_features=self.static_features
+                                  )
+
+        scaler = scaler.fit(self.data)
+        self.assertTrue(np.allclose(scaler.loc.values, np.asarray([[0, 1, 3],
+                                                                   [0, 1, 1]])))
+
+        self.assertTrue(np.allclose(scaler.scale.values, np.asarray([[2, 1, 1],
+                                                                     [1, 1, 5]])))
+
+        transformed_data = scaler.transform(self.data).values
+        self.assertTrue(np.allclose(transformed_data, np.asarray([[0.5, 1., 0.],
+                                                                  [0., 1., 0.],
+                                                                  [1., 1., 0.],
+                                                                  [0., 0., 0.],
+                                                                  [0., 0., 0.2],
+                                                                  [0., 0., 0.6],
+                                                                  [0., 0., 1.]])))
+
+        scaler.dataset_is_small_preprocess = False
+        scaler.static_features = self.static_features_column
+
+        transformed_test = np.concatenate([scaler.transform(raw_data) for raw_data in self.raw_data])
+        self.assertTrue(np.allclose(transformed_data[:, [0, -1]], transformed_test[:, [0, -1]]))
+
+    def test_max_abs_scaler(self):
+        scaler = TimeSeriesScaler(mode='max_abs',
+                                  static_features=self.static_features
+                                  )
+
+        scaler = scaler.fit(self.data)
+
+        self.assertIsNone(scaler.loc)
+
+        self.assertTrue(np.allclose(scaler.scale.values, np.asarray([[2, 2, 3],
+                                                                     [1, 2, 6]])))
+
+        transformed_data = scaler.transform(self.data).values
+
+        self.assertTrue(np.allclose(transformed_data, np.asarray([[0.5, 1., 1.],
+                                                                  [0., 1., 1.],
+                                                                  [1., 1., 1.],
+                                                                  [0., 0.5, 0.16666667],
+                                                                  [0., 0.5, 0.33333333],
+                                                                  [0., 0.5, 0.66666667],
+                                                                  [0., 0.5, 1.]])))
+
+        scaler.dataset_is_small_preprocess = False
+
+        transformed_test = np.concatenate([scaler.transform(raw_data) for raw_data in self.raw_data])
+        self.assertTrue(np.allclose(transformed_data[:, [0, -1]], transformed_test[:, [0, -1]]))
+
+    def test_mean_abs_scaler(self):
+        scaler = TimeSeriesScaler(mode='mean_abs',
+                                  static_features=self.static_features
+                                  )
+
+        scaler = scaler.fit(self.data)
+        transformed_data = scaler.transform(self.data).values
+
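+        # mean_abs scales by the per-sequence mean of |x|; the static column 's'
+        # is scaled by the average of the per-sequence statistics (1.5)
+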
self.assertTrue(np.allclose(transformed_data, np.asarray([[1., 1.33333333, 1.],
+                                                                  [0., 1.33333333, 1.],
+                                                                  [2., 1.33333333, 1.],
+                                                                  [0., 0.66666667, 0.30769231],
+                                                                  [0., 0.66666667, 0.61538462],
+                                                                  [0., 0.66666667, 1.23076923],
+                                                                  [0., 0.66666667, 1.84615385]])))
+        self.assertIsNone(scaler.loc)
+
+        self.assertTrue(np.allclose(scaler.scale.values, np.asarray([[1., 1.5, 3.],
+                                                                     [1., 1.5, 3.25]])))
+        scaler.dataset_is_small_preprocess = False
+        scaler.static_features = self.static_features_column
+        scaler = scaler.fit(self.raw_data[0])
+
+        transformed_test = np.concatenate([scaler.transform(raw_data) for raw_data in self.raw_data])
+        self.assertTrue(np.allclose(transformed_data[:, [0, -1]], transformed_test[:, [0, -1]]))
+
+    def test_no_scaler(self):
+        scaler = TimeSeriesScaler(mode='none',
+                                  static_features=self.static_features
+                                  )
+
+        scaler = scaler.fit(self.data)
+        transformed_data = scaler.transform(self.data).values
+
+        self.assertTrue(np.allclose(transformed_data, self.data.values))
+        self.assertIsNone(scaler.loc)
+        self.assertIsNone(scaler.scale)
+
+        scaler.dataset_is_small_preprocess = False
+
+        transformed_test = np.concatenate([scaler.transform(raw_data) for raw_data in self.raw_data])
+        self.assertTrue(np.allclose(transformed_data[:, [0, -1]], transformed_test[:, [0, -1]]))
+
+        with self.assertRaises(ValueError):
+            scaler = TimeSeriesScaler(mode='random',
+                                      static_features=self.static_features
+                                      )
+            _ = scaler.fit(self.data)
diff --git a/test/test_pipeline/components/preprocessing/forecasting/test_time_series_transformer.py b/test/test_pipeline/components/preprocessing/forecasting/test_time_series_transformer.py
new file mode 100644
index 000000000..d65c9070f
--- /dev/null
+++ b/test/test_pipeline/components/preprocessing/forecasting/test_time_series_transformer.py
@@ -0,0 +1,67 @@
+from test.test_pipeline.components.preprocessing.forecasting.base import \
+    ForecastingPipeline
+
+import numpy as np
+
+import pytest
+
+from sklearn.compose import ColumnTransformer
+
+from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.TimeSeriesTransformer import (
+    TimeSeriesFeatureTransformer,
+    TimeSeriesTargetTransformer
+)
+
+
+@pytest.mark.parametrize("fit_dictionary_forecasting", ['uni_variant_wo_missing',
+                                                        'uni_variant_w_missing',
+                                                        'multi_variant_wo_missing',
+                                                        'multi_variant_w_missing',
+                                                        'multi_variant_w_missing_only_cat',
+                                                        'multi_variant_w_missing_only_num',
+                                                        ], indirect=True)
+def test_time_series_preprocess(fit_dictionary_forecasting):
+    pipeline = ForecastingPipeline(dataset_properties=fit_dictionary_forecasting['dataset_properties'])
+    pipeline = pipeline.fit(fit_dictionary_forecasting)
+    X = pipeline.transform(fit_dictionary_forecasting)
+
+    assert 'time_series_target_transformer' in X.keys()
+    target_transformer = X['time_series_target_transformer']
+
+    # check if transformer is of expected type
+    # In this case we expect the target transformer, not the underlying column transformer,
+    # as the latter is not callable and runs into an error in the compose transform
+    assert isinstance(target_transformer, TimeSeriesTargetTransformer)
+
+    targets = target_transformer.preprocessor.fit_transform(X['y_train'])
+    assert isinstance(targets, np.ndarray)
+
+    targets_2 = target_transformer(X['y_train'])
+    assert np.allclose(targets, targets_2)
+
+    assert isinstance(target_transformer.get_target_transformer(), ColumnTransformer)
+
+    if not X['dataset_properties']['uni_variant']:
+        assert 'time_series_feature_transformer' in X.keys()
+        time_series_feature_transformer = 
X['time_series_feature_transformer'] + assert isinstance(time_series_feature_transformer, TimeSeriesFeatureTransformer) + + features = time_series_feature_transformer.preprocessor.fit_transform(X['X_train']) + assert isinstance(features, np.ndarray) + + features_2 = time_series_feature_transformer(X['X_train']) + assert np.allclose(features, features_2) + + assert isinstance(time_series_feature_transformer.get_column_transformer(), ColumnTransformer) + + # Make sure no columns are unintentionally dropped after preprocessing + if len(fit_dictionary_forecasting['dataset_properties']["numerical_columns"]) == 0: + categorical_pipeline = time_series_feature_transformer.preprocessor.named_transformers_[ + 'categorical_pipeline' + ] + categorical_data = categorical_pipeline.transform(X['X_train']) + assert features.shape[1] == categorical_data.shape[1] + elif len(fit_dictionary_forecasting['dataset_properties']["categorical_columns"]) == 0: + numerical_pipeline = time_series_feature_transformer.preprocessor.named_transformers_['numerical_pipeline'] + numerical_data = numerical_pipeline.transform(X['X_train']) + assert features.shape[1] == numerical_data.shape[1] diff --git a/test/test_pipeline/components/preprocessing/test_coalescer.py b/test/test_pipeline/components/preprocessing/test_coalescer.py new file mode 100644 index 000000000..811cf8b6e --- /dev/null +++ b/test/test_pipeline/components/preprocessing/test_coalescer.py @@ -0,0 +1,86 @@ +import copy +import unittest + +import numpy as np + +import pytest + +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer import ( + CoalescerChoice +) +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer.MinorityCoalescer import ( + MinorityCoalescer +) + + +def test_transform_before_fit(): + with pytest.raises(RuntimeError): + mc = MinorityCoalescer(min_frac=None, random_state=np.random.RandomState()) + mc.transform(np.random.random((4, 4))) + + +class TestCoalescerChoice(unittest.TestCase): + def test_raise_error_in_check_update_compatiblity(self): + dataset_properties = {'numerical_columns': [], 'categorical_columns': []} + cc = CoalescerChoice(dataset_properties) + choices = ["NoCoescer"] # component name with typo + with pytest.raises(ValueError): + # raise error because no categorical columns, but choices do not have no coalescer + cc._check_update_compatiblity(choices_in_update=choices, dataset_properties=dataset_properties) + + def test_raise_error_in_get_component_without_updates(self): + dataset_properties = {'numerical_columns': [], 'categorical_columns': []} + cc = CoalescerChoice(dataset_properties) + with pytest.raises(ValueError): + # raise error because no categorical columns, but choices do not have no coalescer + cc._get_component_without_updates( + avail_components={}, + dataset_properties=dataset_properties, + default="", + include=[] + ) + + def test_get_set_config_space(self): + """Make sure that we can setup a valid choice in the Coalescer + choice""" + dataset_properties = {'numerical_columns': list(range(4)), 'categorical_columns': [5]} + coalescer_choice = CoalescerChoice(dataset_properties) + cs = coalescer_choice.get_hyperparameter_search_space() + + # Make sure that all hyperparameters are part of the search space + self.assertListEqual( + sorted(cs.get_hyperparameter('__choice__').choices), + sorted(list(coalescer_choice.get_components().keys())) + ) + + # Make sure we can properly set some random configs + # Whereas just one iteration will make sure the algorithm 
works,
+        # doing five iterations increases our confidence. We will be able to
+        # catch component-specific crashes
+        for _ in range(5):
+            config = cs.sample_configuration()
+            config_dict = copy.deepcopy(config.get_dictionary())
+            coalescer_choice.set_hyperparameters(config)
+
+            self.assertEqual(coalescer_choice.choice.__class__,
+                             coalescer_choice.get_components()[config_dict['__choice__']])
+
+            # Then check the choice configuration
+            selected_choice = config_dict.pop('__choice__', None)
+            for key, value in config_dict.items():
+                # Remove the selected_choice string from the parameter
+                # so we can query in the object for it
+                key = key.replace(selected_choice + ':', '')
+                self.assertIn(key, vars(coalescer_choice.choice))
+                self.assertEqual(value, coalescer_choice.choice.__dict__[key])
+
+    def test_only_numerical(self):
+        dataset_properties = {'numerical_columns': list(range(4)), 'categorical_columns': []}
+
+        chooser = CoalescerChoice(dataset_properties)
+        configspace = chooser.get_hyperparameter_search_space().sample_configuration().get_dictionary()
+        self.assertEqual(configspace['__choice__'], 'NoCoalescer')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/test_pipeline/components/preprocessing/test_feature_preprocessor.py b/test/test_pipeline/components/preprocessing/test_feature_preprocessor.py
index 99fad6b1f..c4c03641c 100644
--- a/test/test_pipeline/components/preprocessing/test_feature_preprocessor.py
+++ b/test/test_pipeline/components/preprocessing/test_feature_preprocessor.py
@@ -7,6 +7,7 @@
 from sklearn.base import BaseEstimator
 from sklearn.compose import make_column_transformer
 
+from autoPyTorch.constants import CLASSIFICATION_TASKS, REGRESSION_TASKS, STRING_TO_TASK_TYPES
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing import (
     FeatureProprocessorChoice
 )
@@ -20,24 +21,49 @@ def random_state():
     return 11
 
 
-@pytest.fixture(params=['TruncatedSVD', 'PolynomialFeatures', 'PowerTransformer',
-                        'Nystroem', 'KernelPCA', 'RandomKitchenSinks'])
+@pytest.fixture(params=['NoFeaturePreprocessor',
+                        'FastICA',
+                        'KernelPCA',
+                        'RandomKitchenSinks',
+                        'Nystroem',
+                        'PolynomialFeatures',
+                        'TruncatedSVD',
+                        'ExtraTreesPreprocessorClassification',
+                        'ExtraTreesPreprocessorRegression',
+                        'FeatureAgglomeration',
+                        'RandomTreesEmbedding',
+                        'SelectPercentileClassification',
+                        'SelectPercentileRegression',
+                        'SelectRatesClassification',
+                        'SelectRatesRegression',
+                        'LibLinearSVCPreprocessor'
+                        ])
 def preprocessor(request):
     return request.param
 
 
 @pytest.mark.parametrize("fit_dictionary_tabular", ['classification_numerical_only',
-                                                    'classification_numerical_and_categorical'], indirect=True)
+                                                    'classification_numerical_and_categorical',
+                                                    'regression_numerical_only'], indirect=True)
 class TestFeaturePreprocessors:
 
     def test_feature_preprocessor(self, fit_dictionary_tabular, preprocessor, random_state):
+        task_type = str(fit_dictionary_tabular['dataset_properties']['task_type'])
+        if (
+            ("Classification" in preprocessor or preprocessor == "LibLinearSVCPreprocessor")
+            and STRING_TO_TASK_TYPES[task_type] not in CLASSIFICATION_TASKS
+        ):
+            pytest.skip("Tests not relevant for {}".format(preprocessor))
+        elif "Regression" in preprocessor and STRING_TO_TASK_TYPES[task_type] not in REGRESSION_TASKS:
+            pytest.skip("Tests not relevant for {}".format(preprocessor))
         preprocessor = FeatureProprocessorChoice(
             dataset_properties=fit_dictionary_tabular['dataset_properties']
-
).get_components()[preprocessor](random_state=random_state)
+        ).get_components()[preprocessor]
+
         configuration = preprocessor. \
             get_hyperparameter_search_space(dataset_properties=fit_dictionary_tabular["dataset_properties"]) \
             .get_default_configuration().get_dictionary()
-        preprocessor = preprocessor.set_params(**configuration)
+        preprocessor = preprocessor(**configuration, random_state=random_state)
         preprocessor.fit(fit_dictionary_tabular)
         X = preprocessor.transform(fit_dictionary_tabular)
         sklearn_preprocessor = X['feature_preprocessor']['numerical']
@@ -54,7 +80,7 @@ def test_feature_preprocessor(self, fit_dictionary_tabular, preprocessor, random
         column_transformer = make_column_transformer((sklearn_preprocessor,
                                                       X['dataset_properties']['numerical_columns']),
                                                      remainder='passthrough')
-        column_transformer.fit(X['X_train'])
+        column_transformer.fit(X['X_train'], X['y_train'])
         transformed = column_transformer.transform(X['X_train'])
 
         assert isinstance(transformed, np.ndarray)
@@ -67,6 +93,14 @@ def test_pipeline_fit_include(self, fit_dictionary_tabular, preprocessor):
         in the include
 
         """
+        task_type = str(fit_dictionary_tabular['dataset_properties']['task_type'])
+        if (
+            ("Classification" in preprocessor or preprocessor == "LibLinearSVCPreprocessor")
+            and STRING_TO_TASK_TYPES[task_type] not in CLASSIFICATION_TASKS
+        ):
+            pytest.skip("Tests not relevant for {}".format(preprocessor))
+        elif "Regression" in preprocessor and STRING_TO_TASK_TYPES[task_type] not in REGRESSION_TASKS:
+            pytest.skip("Tests not relevant for {}".format(preprocessor))
         fit_dictionary_tabular['epochs'] = 1
 
         pipeline = TabularClassificationPipeline(
@@ -78,6 +112,11 @@ def test_pipeline_fit_include(self, fit_dictionary_tabular, preprocessor):
         try:
             pipeline.fit(fit_dictionary_tabular)
         except Exception as e:
+            if (
+                ("must be non-negative" in e.args[0] or "contains negative values" in e.args[0])
+                and not fit_dictionary_tabular['dataset_properties']['issigned']
+            ):
+                pytest.skip("Failure because scaler made data nonnegative.")
             pytest.fail(f"For config {config} failed with {e}")
 
         # To make sure we fitted the model, there should be a
diff --git a/test/test_pipeline/components/preprocessing/test_feature_preprocessor_choice.py b/test/test_pipeline/components/preprocessing/test_feature_preprocessor_choice.py
index 57841aef0..d159b70e5 100644
--- a/test/test_pipeline/components/preprocessing/test_feature_preprocessor_choice.py
+++ b/test/test_pipeline/components/preprocessing/test_feature_preprocessor_choice.py
@@ -10,14 +10,17 @@ class TestFeaturePreprocessorChoice(unittest.TestCase):
     def test_get_set_config_space(self):
         """Make sure that we can setup a valid choice in the feature
         preprocessor choice"""
-        dataset_properties = {'numerical_columns': list(range(4)), 'categorical_columns': [5]}
+        dataset_properties = {'numerical_columns': list(range(4)),
+                              'categorical_columns': [5],
+                              'task_type': 'tabular_classification'}
         feature_preprocessor_choice = FeatureProprocessorChoice(dataset_properties)
         cs = feature_preprocessor_choice.get_hyperparameter_search_space()
 
         # Make sure that all hyperparameters are part of the search space
         self.assertListEqual(
             sorted(cs.get_hyperparameter('__choice__').choices),
-            sorted(list(feature_preprocessor_choice.get_components().keys()))
+            sorted(list(feature_preprocessor_choice.get_available_components(
+                dataset_properties=dataset_properties).keys()))
         )
 
         # Make sure we can properly set some random configs
@@ -39,10 +42,16 @@ def test_get_set_config_space(self):
             # so we can query in the object for it
             key = key.replace(selected_choice + ':', '')
             self.assertIn(key, vars(feature_preprocessor_choice.choice))
+            # hyperparameters that store a callable, such as score_func or
+            # pooling_func, cannot be checked by plain value comparison, so skip them
+            if 'score_func' in key or 'pooling_func' in key:
+                continue
             self.assertEqual(value, feature_preprocessor_choice.choice.__dict__[key])
 
     def test_only_categorical(self):
-        dataset_properties = {'numerical_columns': [], 'categorical_columns': list(range(4))}
+        dataset_properties = {'numerical_columns': [],
+                              'categorical_columns': [5],
+                              'task_type': 'tabular_classification'}
 
         chooser = FeatureProprocessorChoice(dataset_properties)
         configspace = chooser.get_hyperparameter_search_space().sample_configuration().get_dictionary()
diff --git a/test/test_pipeline/components/preprocessing/test_imputers.py b/test/test_pipeline/components/preprocessing/test_imputers.py
index 983737dfe..0db460b77 100644
--- a/test/test_pipeline/components/preprocessing/test_imputers.py
+++ b/test/test_pipeline/components/preprocessing/test_imputers.py
@@ -3,6 +3,8 @@
 import numpy as np
 from numpy.testing import assert_array_equal
 
+import pytest
+
 from sklearn.base import BaseEstimator, clone
 from sklearn.compose import make_column_transformer
 
@@ -37,14 +39,14 @@ def test_get_config_space(self):
             self.assertEqual(param1, param2)
 
     def test_mean_imputation(self):
-        data = np.array([['1.0', np.nan, 3],
+        data = np.array([[1.0, np.nan, 3],
                          [np.nan, 8, 9],
-                         ['4.0', 5, np.nan],
+                         [4.0, 5, np.nan],
                          [np.nan, 2, 3],
-                         ['7.0', np.nan, 9],
-                         ['4.0', np.nan, np.nan]], dtype=object)
-        numerical_columns = [1, 2]
-        categorical_columns = [0]
+                         [7.0, np.nan, 9],
+                         [4.0, np.nan, np.nan]])
+        numerical_columns = [0, 1, 2]
+        categorical_columns = []
         train_indices = np.array([0, 2, 3])
         test_indices = np.array([1, 4, 5])
         dataset_properties = {
@@ -64,33 +66,33 @@ def test_mean_imputation(self):
 
         # check if the fit dictionary X is modified as expected
         self.assertIsInstance(X['imputer'], dict)
-        self.assertIsInstance(categorical_imputer, BaseEstimator)
+        self.assertIsNone(categorical_imputer)
         self.assertIsInstance(numerical_imputer, BaseEstimator)
 
         # make column transformer with returned encoder to fit on data
-        column_transformer = make_column_transformer((categorical_imputer,
-                                                      X['dataset_properties']['categorical_columns']),
-                                                     (numerical_imputer,
+        column_transformer = make_column_transformer((numerical_imputer,
                                                       X['dataset_properties']['numerical_columns']),
                                                      remainder='passthrough')
         column_transformer = column_transformer.fit(X['X_train'])
         transformed = column_transformer.transform(data[test_indices])
 
-        assert_array_equal(transformed.astype(str), np.array([[1.0, 8.0, 9.0],
-                                                              [7.0, 3.5, 9.0],
-                                                              [4.0, 3.5, 3.0]], dtype=str))
+        assert_array_equal(transformed, np.array([[2.5, 8, 9],
+                                                  [7, 3.5, 9],
+                                                  [4, 3.5, 3]]))
 
     def test_median_imputation(self):
-        data = np.array([['1.0', np.nan, 3],
-                         [np.nan, 8, 9],
-                         ['4.0', 5, np.nan],
-                         [np.nan, 2, 3],
-                         ['7.0', np.nan, 9],
-                         ['4.0', np.nan, np.nan]], dtype=object)
-        numerical_columns = [1, 2]
-        categorical_columns = [0]
-        train_indices = np.array([0, 2, 3])
-        test_indices = np.array([1, 4, 5])
+        data = np.array([[1.0, np.nan, 7],
+                         [np.nan, 9, 10],
+                         [10.0, 7, 7],
+                         [9.0, np.nan, 11],
+                         [9.0, 9, np.nan],
+                         [np.nan, 5, 6],
+                         [12.0, np.nan, 8],
+                         [9.0, np.nan, np.nan]])
+        numerical_columns = [0, 1, 2]
+        categorical_columns = []
+        train_indices = np.array([0, 2, 3, 4, 7])
+        test_indices = np.array([1, 5, 6])
         dataset_properties = {
             'categorical_columns': categorical_columns,
             'numerical_columns': 
numerical_columns, @@ -108,33 +110,33 @@ def test_median_imputation(self): # check if the fit dictionary X is modified as expected self.assertIsInstance(X['imputer'], dict) - self.assertIsInstance(categorical_imputer, BaseEstimator) + self.assertIsNone(categorical_imputer) self.assertIsInstance(numerical_imputer, BaseEstimator) # make column transformer with returned encoder to fit on data - column_transformer = make_column_transformer( - (categorical_imputer, X['dataset_properties']['categorical_columns']), - (numerical_imputer, X['dataset_properties']['numerical_columns']), - remainder='passthrough' - ) + column_transformer = make_column_transformer((numerical_imputer, + X['dataset_properties']['numerical_columns']), + remainder='passthrough') column_transformer = column_transformer.fit(X['X_train']) transformed = column_transformer.transform(data[test_indices]) - assert_array_equal(transformed.astype(str), np.array([[1.0, 8.0, 9.0], - [7.0, 3.5, 9.0], - [4.0, 3.5, 3.0]], dtype=str)) + assert_array_equal(transformed, np.array([[9, 9, 10], + [9, 5, 6], + [12, 8, 8]])) def test_frequent_imputation(self): - data = np.array([['1.0', np.nan, 3], - [np.nan, 8, 9], - ['4.0', 5, np.nan], - [np.nan, 2, 3], - ['7.0', np.nan, 9], - ['4.0', np.nan, np.nan]], dtype=object) - numerical_columns = [1, 2] - categorical_columns = [0] - train_indices = np.array([0, 2, 3]) - test_indices = np.array([1, 4, 5]) + data = np.array([[1.0, np.nan, 7], + [np.nan, 9, 10], + [10.0, 7, 7], + [9.0, np.nan, 11], + [9.0, 9, np.nan], + [np.nan, 5, 6], + [12.0, np.nan, 8], + [9.0, np.nan, np.nan]]) + numerical_columns = [0, 1, 2] + categorical_columns = [] + train_indices = np.array([0, 2, 4, 5, 7]) + test_indices = np.array([1, 3, 6]) dataset_properties = { 'categorical_columns': categorical_columns, 'numerical_columns': numerical_columns, @@ -143,8 +145,7 @@ def test_frequent_imputation(self): 'X_train': data[train_indices], 'dataset_properties': dataset_properties } - imputer_component = SimpleImputer(numerical_strategy='most_frequent', - categorical_strategy='most_frequent') + imputer_component = SimpleImputer(numerical_strategy='most_frequent') imputer_component = imputer_component.fit(X) X = imputer_component.transform(X) @@ -153,31 +154,29 @@ def test_frequent_imputation(self): # check if the fit dictionary X is modified as expected self.assertIsInstance(X['imputer'], dict) - self.assertIsInstance(categorical_imputer, BaseEstimator) + self.assertIsNone(categorical_imputer) self.assertIsInstance(numerical_imputer, BaseEstimator) # make column transformer with returned encoder to fit on data - column_transformer = make_column_transformer( - (categorical_imputer, X['dataset_properties']['categorical_columns']), - (numerical_imputer, X['dataset_properties']['numerical_columns']), - remainder='passthrough' - ) + column_transformer = make_column_transformer((numerical_imputer, + X['dataset_properties']['numerical_columns']), + remainder='passthrough') column_transformer = column_transformer.fit(X['X_train']) transformed = column_transformer.transform(data[test_indices]) - assert_array_equal(transformed.astype(str), np.array([[1.0, 8, 9], - [7.0, 2, 9], - [4.0, 2, 3]], dtype=str)) + assert_array_equal(transformed, np.array([[9, 9, 10], + [9, 5, 11], + [12, 5, 8]])) def test_constant_imputation(self): - data = np.array([['1.0', np.nan, 3], + data = np.array([[1.0, np.nan, 3], [np.nan, 8, 9], - ['4.0', 5, np.nan], + [4.0, 5, np.nan], [np.nan, 2, 3], - ['7.0', np.nan, 9], - ['4.0', np.nan, np.nan]], dtype=object) - 
numerical_columns = [1, 2] - categorical_columns = [0] + [7.0, np.nan, 9], + [4.0, np.nan, np.nan]]) + numerical_columns = [0, 1, 2] + categorical_columns = [] train_indices = np.array([0, 2, 3]) test_indices = np.array([1, 4, 5]) dataset_properties = { @@ -188,8 +187,7 @@ def test_constant_imputation(self): 'X_train': data[train_indices], 'dataset_properties': dataset_properties } - imputer_component = SimpleImputer(numerical_strategy='constant_zero', - categorical_strategy='constant_!missing!') + imputer_component = SimpleImputer(numerical_strategy='constant_zero') imputer_component = imputer_component.fit(X) X = imputer_component.transform(X) @@ -198,20 +196,28 @@ def test_constant_imputation(self): # check if the fit dictionary X is modified as expected self.assertIsInstance(X['imputer'], dict) - self.assertIsInstance(categorical_imputer, BaseEstimator) + self.assertIsNone(categorical_imputer) self.assertIsInstance(numerical_imputer, BaseEstimator) # make column transformer with returned encoder to fit on data - column_transformer = make_column_transformer( - (categorical_imputer, X['dataset_properties']['categorical_columns']), - (numerical_imputer, X['dataset_properties']['numerical_columns']), - remainder='passthrough' - ) + column_transformer = make_column_transformer((numerical_imputer, + X['dataset_properties']['numerical_columns']), + remainder='passthrough') column_transformer = column_transformer.fit(X['X_train']) transformed = column_transformer.transform(data[test_indices]) - assert_array_equal(transformed.astype(str), np.array([['-1', 8, 9], - [7.0, '0', 9], - [4.0, '0', '0']], dtype=str)) + assert_array_equal(transformed, np.array([[0, 8, 9], + [7, 0, 9], + [4, 0, 0]])) + + def test_imputation_without_dataset_properties_raises_error(self): + """Tests SimpleImputer checks for dataset properties when querying for + HyperparameterSearchSpace, even though the arg is marked `Optional`. 
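+        The search space depends on which column types are present, so the
+        properties are effectively required here.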
+ + Expects: + * Should raise a ValueError that no dataset_properties were passed + """ + with pytest.raises(ValueError): + SimpleImputer.get_hyperparameter_search_space() if __name__ == '__main__': diff --git a/test/test_pipeline/components/preprocessing/test_scalers.py b/test/test_pipeline/components/preprocessing/test_scalers.py index 94ba0f2dc..7cbc12b07 100644 --- a/test/test_pipeline/components/preprocessing/test_scalers.py +++ b/test/test_pipeline/components/preprocessing/test_scalers.py @@ -9,6 +9,11 @@ from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.MinMaxScaler import MinMaxScaler from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.NoScaler import NoScaler from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.Normalizer import Normalizer +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.PowerTransformer import \ + PowerTransformer +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.QuantileTransformer import \ + QuantileTransformer +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.RobustScaler import RobustScaler from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.StandardScaler import StandardScaler @@ -239,3 +244,163 @@ def test_none_scaler(self): self.assertIsInstance(X['scaler'], dict) self.assertIsNone(X['scaler']['categorical']) self.assertIsNone(X['scaler']['numerical']) + + +def test_power_transformer(): + data = np.array([[1, 2, 3], + [7, 8, 9], + [4, 5, 6], + [11, 12, 13], + [17, 18, 19], + [14, 15, 16]]) + train_indices = np.array([0, 2, 5]) + test_indices = np.array([1, 4, 3]) + categorical_columns = list() + numerical_columns = [0, 1, 2] + dataset_properties = {'categorical_columns': categorical_columns, + 'numerical_columns': numerical_columns, + 'issparse': False} + X = { + 'X_train': data[train_indices], + 'dataset_properties': dataset_properties + } + scaler_component = PowerTransformer() + + scaler_component = scaler_component.fit(X) + X = scaler_component.transform(X) + scaler = X['scaler']['numerical'] + + # check if the fit dictionary X is modified as expected + assert isinstance(X['scaler'], dict) + assert isinstance(scaler, BaseEstimator) + assert X['scaler']['categorical'] is None + + # make column transformer with returned encoder to fit on data + column_transformer = make_column_transformer((scaler, X['dataset_properties']['numerical_columns']), + remainder='passthrough') + column_transformer = column_transformer.fit(X['X_train']) + transformed = column_transformer.transform(data[test_indices]) + + assert_allclose(transformed, np.array([[0.531648, 0.522782, 0.515394], + [1.435794, 1.451064, 1.461685], + [0.993609, 1.001055, 1.005734]]), rtol=1e-06) + + +def test_robust_scaler(): + data = np.array([[1, 2, 3], + [7, 8, 9], + [4, 5, 6], + [11, 12, 13], + [17, 18, 19], + [14, 15, 16]]) + train_indices = np.array([0, 2, 5]) + test_indices = np.array([1, 4, 3]) + categorical_columns = list() + numerical_columns = [0, 1, 2] + dataset_properties = {'categorical_columns': categorical_columns, + 'numerical_columns': numerical_columns, + 'issparse': False} + X = { + 'X_train': data[train_indices], + 'dataset_properties': dataset_properties + } + scaler_component = RobustScaler() + + scaler_component = scaler_component.fit(X) + X = scaler_component.transform(X) + scaler = X['scaler']['numerical'] + + # check if the fit dictionary X is modified as expected + 
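+    # X['scaler'] maps each column kind to its fitted component; only the
+    # 'numerical' entry is populated for this all-numerical dataset
+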
assert isinstance(X['scaler'], dict) + assert isinstance(scaler, BaseEstimator) + assert X['scaler']['categorical'] is None + + # make column transformer with returned encoder to fit on data + column_transformer = make_column_transformer((scaler, X['dataset_properties']['numerical_columns']), + remainder='passthrough') + column_transformer = column_transformer.fit(X['X_train']) + transformed = column_transformer.transform(data[test_indices]) + + assert_allclose(transformed, np.array([[100, 100, 100], + [433.33333333, 433.33333333, 433.33333333], + [233.33333333, 233.33333333, 233.33333333]])) + + +class TestQuantileTransformer(): + def test_quantile_transformer_uniform(self): + data = np.array([[1, 2, 3], + [7, 8, 9], + [4, 5, 6], + [11, 12, 13], + [17, 18, 19], + [14, 15, 16]]) + train_indices = np.array([0, 2, 5]) + test_indices = np.array([1, 4, 3]) + categorical_columns = list() + numerical_columns = [0, 1, 2] + dataset_properties = {'categorical_columns': categorical_columns, + 'numerical_columns': numerical_columns, + 'issparse': False} + X = { + 'X_train': data[train_indices], + 'dataset_properties': dataset_properties + } + scaler_component = QuantileTransformer(output_distribution='uniform') + + scaler_component = scaler_component.fit(X) + X = scaler_component.transform(X) + scaler = X['scaler']['numerical'] + + # check if the fit dictionary X is modified as expected + assert isinstance(X['scaler'], dict) + assert isinstance(scaler, BaseEstimator) + assert X['scaler']['categorical'] is None + + # make column transformer with returned encoder to fit on data + column_transformer = make_column_transformer((scaler, X['dataset_properties']['numerical_columns']), + remainder='passthrough') + column_transformer = column_transformer.fit(X['X_train']) + transformed = column_transformer.transform(data[test_indices]) + + assert_allclose(transformed, np.array([[0.65, 0.65, 0.65], + [1, 1, 1], + [0.85, 0.85, 0.85]]), rtol=1e-06) + + def test_quantile_transformer_normal(self): + data = np.array([[1, 2, 3], + [7, 8, 9], + [4, 5, 6], + [11, 12, 13], + [17, 18, 19], + [14, 15, 16]]) + train_indices = np.array([0, 2, 5]) + test_indices = np.array([1, 4, 3]) + categorical_columns = list() + numerical_columns = [0, 1, 2] + dataset_properties = {'categorical_columns': categorical_columns, + 'numerical_columns': numerical_columns, + 'issparse': False} + X = { + 'X_train': data[train_indices], + 'dataset_properties': dataset_properties + } + scaler_component = QuantileTransformer(output_distribution='normal') + + scaler_component = scaler_component.fit(X) + X = scaler_component.transform(X) + scaler = X['scaler']['numerical'] + + # check if the fit dictionary X is modified as expected + assert isinstance(X['scaler'], dict) + assert isinstance(scaler, BaseEstimator) + assert X['scaler']['categorical'] is None + + # make column transformer with returned encoder to fit on data + column_transformer = make_column_transformer((scaler, X['dataset_properties']['numerical_columns']), + remainder='passthrough') + column_transformer = column_transformer.fit(X['X_train']) + transformed = column_transformer.transform(data[test_indices]) + + assert_allclose(transformed, np.array([[0.38532, 0.38532, 0.38532], + [5.199338, 5.199338, 5.199338], + [1.036433, 1.036433, 1.036433]]), rtol=1e-05) diff --git a/test/test_pipeline/components/preprocessing/test_tabular_column_transformer.py b/test/test_pipeline/components/preprocessing/test_tabular_column_transformer.py index 66a96f27f..36de9f275 100644 --- 
a/test/test_pipeline/components/preprocessing/test_tabular_column_transformer.py +++ b/test/test_pipeline/components/preprocessing/test_tabular_column_transformer.py @@ -47,6 +47,7 @@ def test_sparse_data(self, fit_dictionary_tabular): X = np.random.binomial(1, 0.1, (100, 2000)) sparse_X = csr_matrix(X) + y = np.random.randint(0, 1, 100) numerical_columns = list(range(2000)) categorical_columns = [] train_indices = np.array(range(50)) @@ -56,6 +57,7 @@ def test_sparse_data(self, fit_dictionary_tabular): issparse=True) X = { 'X_train': sparse_X, + 'y_train': y, 'train_indices': train_indices, 'dataset_properties': dataset_properties } diff --git a/test/test_pipeline/components/preprocessing/test_variance_thresholding.py b/test/test_pipeline/components/preprocessing/test_variance_thresholding.py new file mode 100644 index 000000000..3f22835b3 --- /dev/null +++ b/test/test_pipeline/components/preprocessing/test_variance_thresholding.py @@ -0,0 +1,49 @@ +import numpy as np +from numpy.testing import assert_array_equal + + +from sklearn.base import BaseEstimator +from sklearn.compose import make_column_transformer + +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.variance_thresholding. \ + VarianceThreshold import VarianceThreshold + + +def test_variance_threshold(): + data = np.array([[1, 2, 1], + [7, 8, 9], + [4, 5, 1], + [11, 12, 1], + [17, 18, 19], + [14, 15, 16]]) + numerical_columns = [0, 1, 2] + train_indices = np.array([0, 2, 3]) + test_indices = np.array([1, 4, 5]) + dataset_properties = { + 'categorical_columns': [], + 'numerical_columns': numerical_columns, + } + X = { + 'X_train': data[train_indices], + 'dataset_properties': dataset_properties + } + component = VarianceThreshold() + + component = component.fit(X) + X = component.transform(X) + variance_threshold = X['variance_threshold']['numerical'] + + # check if the fit dictionary X is modified as expected + assert isinstance(X['variance_threshold'], dict) + assert isinstance(variance_threshold, BaseEstimator) + + # make column transformer with returned encoder to fit on data + column_transformer = make_column_transformer((variance_threshold, + X['dataset_properties']['numerical_columns']), + remainder='passthrough') + column_transformer = column_transformer.fit(X['X_train']) + transformed = column_transformer.transform(data[test_indices]) + + assert_array_equal(transformed, np.array([[7, 8], + [17, 18], + [14, 15]])) diff --git a/test/test_pipeline/components/setup/forecasting/__init__.py b/test/test_pipeline/components/setup/forecasting/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/test_pipeline/components/setup/forecasting/forecasting_networks/__init__.py b/test/test_pipeline/components/setup/forecasting/forecasting_networks/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_base_components.py b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_base_components.py new file mode 100644 index 000000000..a32117b0c --- /dev/null +++ b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_base_components.py @@ -0,0 +1,394 @@ +import copy +import itertools +import unittest + +from ConfigSpace import Configuration + +import numpy as np + +import pandas as pd + +import torch + +from autoPyTorch.constants import TASK_TYPES_TO_STRING, TIMESERIES_FORECASTING +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone 
import ForecastingNetworkChoice +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import NetworkStructure +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.MLPDecoder import ( + ForecastingMLPDecoder +) +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.components import ( + DecoderBlockInfo +) +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.\ + base_forecasting_encoder import BaseForecastingEncoder +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.components import ( + EncoderBlockInfo, + EncoderNetwork +) +from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distribution import ( + ALL_DISTRIBUTIONS, + DisForecastingStrategy +) +from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.forecasting_head import ForecastingHead +from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdate + + +class DummyEmbedding(torch.nn.Module): + def forward(self, x): + if x.shape[-1] > 10: + return x[..., :-10] + return x + + +class DummyEncoderNetwork(EncoderNetwork): + def forward(self, x, output_seq=False): + if output_seq: + return torch.ones((*x.shape[:-1], 10)) + return torch.ones((*x.shape[:-2], 1, 10)) + + +class DummyForecastingEncoder(BaseForecastingEncoder): + def n_encoder_output_feature(self): + return 10 + + def build_encoder(self, input_shape): + return DummyEncoderNetwork() + + +class DummyTransformers: + def __call__(self, x): + return x[..., :(x.shape[-1] // 2)] + + +def generate_fit_dict_and_dataset_property(): + embedding = DummyEmbedding() + + transformation = [DummyTransformers()] + n_prediction_steps = 3 + input_shape = (100, 50) + output_shape = (n_prediction_steps, 1) + time_feature_transform = [1, 2] + + feature_names = ('f1', 'f2', 'f3', 'f4', 'f5') + feature_shapes = {'f1': 10, 'f2': 10, 'f3': 10, 'f4': 10, 'f5': 10} + known_future_features = ('f1', 'f2', 'f3', 'f4', 'f5') + + dataset_properties = dict(input_shape=input_shape, + output_shape=output_shape, + transform_time_features=True, + time_feature_transform=time_feature_transform, + feature_shapes=feature_shapes, + known_future_features=known_future_features, + n_prediction_steps=n_prediction_steps, + encoder_can_be_auto_regressive=True, + feature_names=feature_names, + is_small_preprocess=True, + task_type=TASK_TYPES_TO_STRING[TIMESERIES_FORECASTING], + uni_variant=False, + future_feature_shapes=(n_prediction_steps, 50), + ) + + fit_dictionary = dict(X_train=pd.DataFrame(np.random.randn(*input_shape)), + y_train=pd.DataFrame(np.random.randn(*output_shape)), + network_embedding=embedding, + preprocess_transforms=transformation, + transform_time_features=True, + window_size=5 + ) + + return dataset_properties, fit_dictionary + + +class TestForecastingNetworkBases(unittest.TestCase): + def setUp(self) -> None: + self.dataset_properties, self.fit_dictionary = generate_fit_dict_and_dataset_property() + + self.encoder = DummyForecastingEncoder() + + mlp_cs = ForecastingMLPDecoder.get_hyperparameter_search_space(self.dataset_properties, + can_be_auto_regressive=True) + mlp_cfg_non_ar_w_local = mlp_cs.get_default_configuration() + mlp_cfg_non_ar_wo_local = copy.copy(mlp_cfg_non_ar_w_local.get_dictionary()) + + mlp_cfg_non_ar_wo_local['has_local_layer'] = False +
mlp_cfg_non_ar_wo_local.pop('units_local_layer') + + mlp_cfg_ar = copy.copy(mlp_cfg_non_ar_wo_local) + mlp_cfg_ar.pop('has_local_layer') + mlp_cfg_ar['auto_regressive'] = True + + mlp_cfg_non_ar_wo_local = Configuration(mlp_cs, values=mlp_cfg_non_ar_wo_local) + mlp_cfg_ar = Configuration(mlp_cs, values=mlp_cfg_ar) + + self.decoder_ar = ForecastingMLPDecoder(**mlp_cfg_ar) + self.decoder_w_local = ForecastingMLPDecoder(**mlp_cfg_non_ar_w_local) + self.decoder_wo_local = ForecastingMLPDecoder(**mlp_cfg_non_ar_wo_local) + + self.decoders = {"non_ar_w_local": self.decoder_w_local, + "non_ar_wo_local": self.decoder_wo_local, + "ar": self.decoder_ar} + + def test_encoder_choices(self): + dataset_properties = {'task_type': TASK_TYPES_TO_STRING[TIMESERIES_FORECASTING]} + encoder_choices = ForecastingNetworkChoice(dataset_properties) + cs = encoder_choices.get_hyperparameter_search_space(dataset_properties) + self.assertListEqual(list(cs.get_hyperparameter('__choice__').choices), ['flat_encoder', 'seq_encoder']) + + cs_only_flat = encoder_choices.get_hyperparameter_search_space(dataset_properties, include=['flat_encoder']) + for hp_name in cs_only_flat.get_hyperparameter_names(): + self.assertFalse(hp_name.startswith('seq_encoder')) + + cs_only_rnn = encoder_choices.get_hyperparameter_search_space(dataset_properties, + include=['seq_encoder:RNNEncoder']) + + self.assertListEqual(list(cs_only_rnn.get_hyperparameter('__choice__').choices), ['seq_encoder']) + self.assertListEqual(list(cs_only_rnn.get_hyperparameter('seq_encoder:block_1:__choice__').choices), + ['RNNEncoder']) + + cs_no_rnn = encoder_choices.get_hyperparameter_search_space(dataset_properties, + exclude=['seq_encoder:RNNEncoder']) + for hp_name in cs_no_rnn.get_hyperparameter_names(): + self.assertFalse('RNNEncoder' in hp_name) + + sample = cs.sample_configuration() + + encoder_choices = encoder_choices.set_hyperparameters(sample) + self.assertIsInstance(encoder_choices.choice.choice, BaseForecastingEncoder) + + encoder_choices = ForecastingNetworkChoice(dataset_properties) + + update_seq = HyperparameterSearchSpaceUpdate(node_name="network_backbone", + hyperparameter='__choice__', + value_range=('seq_encoder',), + default_value='seq_encoder', ) + + encoder_choices._apply_search_space_update(update_seq) + cs_seq = encoder_choices.get_hyperparameter_search_space(dataset_properties) + self.assertListEqual(list(cs_seq.get_hyperparameter('__choice__').choices), ['seq_encoder']) + + encoder_choices = ForecastingNetworkChoice(dataset_properties) + update_rnn_decoder_type = HyperparameterSearchSpaceUpdate( + node_name="network_backbone", + hyperparameter='seq_encoder:block_1:RNNEncoder:decoder_type', + value_range=('MLPDecoder',), + default_value='MLPDecoder', ) + encoder_choices._apply_search_space_update(update_rnn_decoder_type) + cs_seq = encoder_choices.get_hyperparameter_search_space(dataset_properties) + hp_rnn_decoder_type = cs_seq.get_hyperparameter(update_rnn_decoder_type.hyperparameter) + self.assertListEqual(list(hp_rnn_decoder_type.choices), ['MLPDecoder']) + + def test_base_encoder(self): + window_size = self.fit_dictionary['window_size'] + all_settings = [(True, False)] * 4 + for hp_values in itertools.product(*all_settings): + uni_variant = hp_values[0] + variable_selection = hp_values[1] +
transform_time_features = hp_values[2] + is_small_preprocess = hp_values[3] + with self.subTest(uni_variant=uni_variant, + variable_selection=variable_selection, + transform_time_features=transform_time_features, + is_small_preprocess=is_small_preprocess): + network_structure = NetworkStructure(variable_selection=variable_selection) + + dataset_properties = copy.copy(self.dataset_properties) + fit_dictionary = copy.copy(self.fit_dictionary) + + dataset_properties['is_small_preprocess'] = is_small_preprocess + dataset_properties['uni_variant'] = uni_variant + + fit_dictionary['dataset_properties'] = self.dataset_properties + fit_dictionary['network_structure'] = network_structure + fit_dictionary['transform_time_features'] = transform_time_features + fit_dictionary['dataset_properties'] = dataset_properties + + encoder_block_1 = copy.deepcopy(self.encoder) + + encoder_block_2 = copy.deepcopy(self.encoder) + encoder_block_2.block_number = 2 + + encoder_block_1 = encoder_block_1.fit(fit_dictionary) + fit_dictionary = encoder_block_1.transform(fit_dictionary) + network_encoder = fit_dictionary['network_encoder'] + self.assertIsInstance(network_encoder['block_1'], EncoderBlockInfo) + self.assertEqual(network_encoder['block_1'].encoder_output_shape, (1, 10)) + + if variable_selection: + self.assertEqual(network_encoder['block_1'].encoder_input_shape, (window_size, 10)) + else: + if uni_variant: + n_input_features = 0 + else: + if is_small_preprocess: + n_input_features = 40 + else: + n_input_features = 15 + + if transform_time_features: + n_input_features += len(dataset_properties['time_feature_transform']) + + n_input_features += dataset_properties['output_shape'][-1] + self.assertEqual(network_encoder['block_1'].encoder_input_shape, (window_size, + n_input_features)) + + encoder_block_2 = encoder_block_2.fit(fit_dictionary) + fit_dictionary = encoder_block_2.transform(fit_dictionary) + + network_encoder = fit_dictionary['network_encoder'] + self.assertIsInstance(network_encoder['block_2'], EncoderBlockInfo) + self.assertEqual(network_encoder['block_2'].encoder_output_shape, (1, 10)) + self.assertEqual(network_encoder['block_2'].encoder_input_shape, (1, 10)) + + def test_base_decoder(self): + n_prediction_steps = self.dataset_properties['n_prediction_steps'] + for variable_selection in (True, False): + with self.subTest(variable_selection=variable_selection): + network_structure = NetworkStructure(variable_selection=variable_selection, num_blocks=2) + dataset_properties = copy.copy(self.dataset_properties) + fit_dictionary = copy.copy(self.fit_dictionary) + + fit_dictionary['network_structure'] = network_structure + fit_dictionary['dataset_properties'] = dataset_properties + + encoder_block_1 = copy.deepcopy(self.encoder) + encoder_block_2 = copy.deepcopy(self.encoder) + encoder_block_2.block_number = 2 + + encoder_block_1 = encoder_block_1.fit(fit_dictionary) + fit_dictionary = encoder_block_1.transform(fit_dictionary) + encoder_block_2 = encoder_block_2.fit(fit_dictionary) + fit_dictionary = encoder_block_2.transform(fit_dictionary) + + decoder1 = copy.deepcopy(self.decoder_w_local) + decoder1 = decoder1.fit(fit_dictionary) + self.assertEqual(decoder1.n_prediction_heads, n_prediction_steps) + fit_dictionary = decoder1.transform(fit_dictionary) + + network_decoder = fit_dictionary['network_decoder'] + self.assertIsInstance(network_decoder['block_1'], DecoderBlockInfo) + if variable_selection: + self.assertEqual(network_decoder['block_1'].decoder_input_shape, + (n_prediction_steps, 10)) # Pure 
variable selection + self.assertEqual(network_decoder['block_1'].decoder_output_shape, + (n_prediction_steps, 26)) # 10 (input features) + 16 (n_output_dims) + else: + self.assertEqual(network_decoder['block_1'].decoder_input_shape, + (n_prediction_steps, 52)) # 50 (input features) + 2 (time_transforms) + self.assertEqual(network_decoder['block_1'].decoder_output_shape, + (n_prediction_steps, 68)) # 52 (input features) + 16 (n_out_dims) + + for name, decoder in self.decoders.items(): + with self.subTest(decoder_name=name): + fit_dictionary_ = copy.deepcopy(fit_dictionary) + decoder2 = copy.deepcopy(decoder) + decoder2.block_number = 2 + decoder2 = decoder2.fit(fit_dictionary_) + fit_dictionary_ = decoder2.transform(fit_dictionary_) + self.assertTrue(decoder2.is_last_decoder) + if name == 'ar': + self.assertEqual(fit_dictionary_['n_prediction_heads'], 1) + else: + self.assertEqual(fit_dictionary_['n_prediction_heads'], n_prediction_steps) + n_prediction_heads = fit_dictionary_['n_prediction_heads'] + + network_decoder = fit_dictionary_['network_decoder']['block_2'] + self.assertIsInstance(network_decoder, DecoderBlockInfo) + if variable_selection: + self.assertEqual(network_decoder.decoder_input_shape, (n_prediction_heads, 26)) + + if name == 'non_ar_w_local': + # 26+16 + self.assertEqual(network_decoder.decoder_output_shape, (n_prediction_heads, 42)) + elif name == 'non_ar_wo_local': + # num_global + self.assertEqual(network_decoder.decoder_output_shape, (n_prediction_heads, 32)) + elif name == 'ar': + self.assertEqual(network_decoder.decoder_output_shape, (n_prediction_heads, 32)) # 32 + else: + self.assertEqual(network_decoder.decoder_input_shape, (n_prediction_heads, 68)) + + if name == 'non_ar_w_local': + # 26+16 + self.assertEqual(network_decoder.decoder_output_shape, (n_prediction_heads, 84)) + elif name == 'non_ar_wo_local': + # num_global + self.assertEqual(network_decoder.decoder_output_shape, (n_prediction_heads, 32)) + elif name == 'ar': + self.assertEqual(network_decoder.decoder_output_shape, (n_prediction_heads, 32)) # 32 + + def test_forecasting_heads(self): + variable_selection = False + n_prediction_steps = self.dataset_properties["n_prediction_steps"] + + network_structure = NetworkStructure(variable_selection=variable_selection, num_blocks=1) + + dataset_properties = copy.copy(self.dataset_properties) + fit_dictionary = copy.copy(self.fit_dictionary) + + input_tensor = torch.randn([10, 20, 3 + fit_dictionary['X_train'].shape[-1]]) + input_tensor_future = torch.randn([10, n_prediction_steps, 2 + fit_dictionary['X_train'].shape[-1]]) + + network_embedding = self.fit_dictionary['network_embedding'] + input_tensor = network_embedding(input_tensor) + + fit_dictionary['dataset_properties'] = self.dataset_properties + fit_dictionary['network_structure'] = network_structure + fit_dictionary['transform_time_features'] = True + fit_dictionary['dataset_properties'] = dataset_properties + encoder = copy.deepcopy(self.encoder) + encoder = encoder.fit(fit_dictionary) + fit_dictionary = encoder.transform(fit_dictionary) + + quantiles = [0.5, 0.1, 0.9] + for name, decoder in self.decoders.items(): + with self.subTest(decoder_name=name): + fit_dictionary_ = copy.deepcopy(fit_dictionary) + decoder = decoder.fit(fit_dictionary_) + fit_dictionary_ = decoder.transform(fit_dictionary_) + + for net_output_type in ['regression', 'distribution', 'quantile']: + def eval_heads_output(fit_dict): + head = ForecastingHead() + head = head.fit(fit_dict) + fit_dictionary_copy = head.transform(fit_dict) + 
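+ # retrieve the fitted modules from the fit dictionary and chain them by hand: + # encoder -> decoder -> head mirrors the forward pass of the assembled network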
+ encoder = fit_dictionary_copy['network_encoder']['block_1'].encoder + decoder = fit_dictionary_copy['network_decoder']['block_1'].decoder + + head = fit_dictionary_copy['network_head'] + output = head(decoder(input_tensor_future, encoder(input_tensor, output_seq=False))) + if name != "ar": + if net_output_type == 'regression': + self.assertListEqual(list(output.shape), [10, n_prediction_steps, 1]) + elif net_output_type == 'distribution': + self.assertListEqual(list(output.sample().shape), [10, n_prediction_steps, 1]) + elif net_output_type == 'quantile': + self.assertEqual(len(output), len(quantiles)) + for output_quantile in output: + self.assertListEqual(list(output_quantile.shape), [10, n_prediction_steps, 1]) + else: + if net_output_type == 'regression': + self.assertListEqual(list(output.shape), [10, 1, 1]) + elif net_output_type == 'distribution': + self.assertListEqual(list(output.sample().shape), [10, 1, 1]) + elif net_output_type == 'quantile': + self.assertEqual(len(output), len(quantiles)) + for output_quantile in output: + self.assertListEqual(list(output_quantile.shape), [10, 1, 1]) + with self.subTest(net_output_type=net_output_type): + fit_dictionary_copy = copy.deepcopy(fit_dictionary_) + fit_dictionary_copy['net_output_type'] = net_output_type + + if net_output_type == 'distribution': + for dist in ALL_DISTRIBUTIONS.keys(): + fit_dictionary_copy['dist_forecasting_strategy'] = DisForecastingStrategy(dist_cls=dist) + eval_heads_output(fit_dictionary_copy) + elif net_output_type == 'quantile': + fit_dictionary_copy['quantile_values'] = quantiles + eval_heads_output(fit_dictionary_copy) + else: + eval_heads_output(fit_dictionary_copy) diff --git a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_flat_backbones.py b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_flat_backbones.py new file mode 100644 index 000000000..7f1c225e0 --- /dev/null +++ b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_flat_backbones.py @@ -0,0 +1,224 @@ +import copy +import unittest +from test.test_pipeline.components.setup.forecasting.forecasting_networks.test_base_components import \ + generate_fit_dict_and_dataset_property + +from ConfigSpace import Configuration + +from sklearn.pipeline import Pipeline + +import torch + +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.cells import ( + StackedDecoder, + StackedEncoder +) +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import NetworkStructure +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.MLPDecoder import ( + ForecastingMLPDecoder +) +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.\ + NBEATSDecoder import NBEATSDecoder +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.\ + flat_encoder import FlatForecastingEncoderChoice +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.flat_encoder.\ + MLPEncoder import MLPEncoder +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.flat_encoder.\ + NBEATSEncoder import NBEATSEncoder +from autoPyTorch.pipeline.components.setup.network_embedding.NoEmbedding import _NoEmbedding +from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.forecasting_head import ForecastingHead + 
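+# The tests below exercise the flat encoders (MLPEncoder and NBEATSEncoder) end to end: +# a configuration is sampled, the components are fitted in sequence, and the output +# shapes of a full encoder -> decoder -> head forward pass are verified.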
+ +class TestFlatEncoder(unittest.TestCase): + def setUp(self) -> None: + self.dataset_properties, self.fit_dictionary = generate_fit_dict_and_dataset_property() + self.fit_dictionary['net_output_type'] = 'regression' + self.fit_dictionary['network_embedding'] = _NoEmbedding() + + def test_flat_encoder_choice(self): + encoder_choices = FlatForecastingEncoderChoice(dataset_properties=self.dataset_properties) + cs_flat = encoder_choices.get_hyperparameter_search_space(self.dataset_properties) + available_encoder = cs_flat.get_hyperparameter("__choice__") + + self.assertTrue('MLPEncoder' in available_encoder.choices) + self.assertTrue('NBEATSEncoder' in available_encoder.choices) + + sample = cs_flat.sample_configuration() + encoder_choices.set_hyperparameters(sample) + + fit_dict = copy.copy(self.fit_dictionary) + fit_dict['dataset_properties'] = self.dataset_properties + self.assertIsInstance(encoder_choices.pipeline, Pipeline) + encoder_choices = encoder_choices.fit(fit_dict) + fit_dict = encoder_choices.transform(fit_dict) + + self.assertTrue('network_structure' in fit_dict) + network_structure = fit_dict['network_structure'] + self.assertIsInstance(network_structure, NetworkStructure) + self.assertEqual(network_structure.num_blocks, 1) + + self.assertTrue('network_encoder' in fit_dict) + self.assertEqual(len(fit_dict['network_encoder']), 1) + + self.assertTrue('network_decoder' in fit_dict) + self.assertEqual(len(fit_dict['network_decoder']), 1) + + def test_mlp_network(self): + n_prediction_steps = self.dataset_properties['n_prediction_steps'] + network_structure = NetworkStructure() + + encoder_cfg = MLPEncoder().get_hyperparameter_search_space().get_default_configuration() + encoder = MLPEncoder(**encoder_cfg) + + mlp_cs = ForecastingMLPDecoder.get_hyperparameter_search_space(self.dataset_properties, + can_be_auto_regressive=True) + mlp_cfg_non_ar_w_local = mlp_cs.get_default_configuration() + mlp_cfg_non_ar_wo_local = copy.copy(mlp_cfg_non_ar_w_local.get_dictionary()) + + mlp_cfg_non_ar_wo_local['has_local_layer'] = False + mlp_cfg_non_ar_wo_local.pop('units_local_layer') + + mlp_cfg_non_ar_wo_local = Configuration(mlp_cs, values=mlp_cfg_non_ar_wo_local) + + decoder_w_local = ForecastingMLPDecoder(**mlp_cfg_non_ar_w_local) + decoder_wo_local = ForecastingMLPDecoder(**mlp_cfg_non_ar_wo_local) + + decoders = {"non_ar_w_local": decoder_w_local, + "non_ar_wo_local": decoder_wo_local} + + fit_dict = copy.copy(self.fit_dictionary) + fit_dict['dataset_properties'] = self.dataset_properties + fit_dict['network_structure'] = network_structure + + encoder = encoder.fit(fit_dict) + fit_dict = encoder.transform(fit_dict) + + for name, decoder in decoders.items(): + with self.subTest(decoder_name=name): + fit_dict_ = copy.copy(fit_dict) + + decoder = decoder.fit(fit_dict_) + fit_dict_ = decoder.transform(fit_dict_) + + input_tensor = torch.randn([10, 20, 3 + fit_dict_['X_train'].shape[-1]]) + input_tensor_future = torch.randn([10, n_prediction_steps, 2 + fit_dict_['X_train'].shape[-1]]) + + head = ForecastingHead() + head = head.fit(fit_dict_) + fit_dict_ = head.transform(fit_dict_) + + net_encoder = StackedEncoder(network_structure, False, + fit_dict_['network_encoder'], fit_dict_['network_decoder']) + net_decoder = StackedDecoder(network_structure, net_encoder.encoder, fit_dict_['network_encoder'], + fit_dict_['network_decoder']) + + head = fit_dict_['network_head'] + + encoder2decoder, _ = net_encoder(input_tensor, [None]) + output = head(net_decoder(input_tensor_future, encoder2decoder)) + +
self.assertListEqual(list(output.shape), [10, n_prediction_steps, 1]) + + def test_nbeats_network(self): + n_prediction_steps = self.dataset_properties['n_prediction_steps'] + window_size = self.fit_dictionary['window_size'] + network_structure = NetworkStructure() + + encoder_cfg = NBEATSEncoder().get_hyperparameter_search_space().get_default_configuration() + encoder = NBEATSEncoder(**encoder_cfg) + + nbeats_cs = NBEATSDecoder.get_hyperparameter_search_space(self.dataset_properties) + + nbeatsI_cfg = { + "backcast_loss_ration": 0.0, + "normalization": "LN", + "activation": "relu", + + "n_beats_type": "I", + + "use_dropout_i": True, + "num_stacks_i": 2, + + "num_blocks_i_1": 2, + "num_layers_i_1": 2, + "width_i_1": 16, + "weight_sharing_i_1": True, + "stack_type_i_1": 'trend', + "expansion_coefficient_length_i_trend_1": 3, + "dropout_i_1": 0.1, + + "num_blocks_i_2": 3, + "num_layers_i_2": 2, + "width_i_2": 16, + "weight_sharing_i_2": False, + "stack_type_i_2": 'seasonality', + "expansion_coefficient_length_i_seasonality_2": 7, + "dropout_i_2": 0.1, + } + + nbeatsG_cfg = { + "backcast_loss_ration": 0.0, + "normalization": "NoNorm", + "activation": "relu", + + "n_beats_type": "G", + + "use_dropout_g": True, + "num_stacks_g": 2, + + "num_blocks_g": 1, + "num_layers_g": 4, + "width_g": 512, + "weight_sharing_g": False, + "expansion_coefficient_length_g": 32, + "dropout_g": 0.1, + } + + nbeatsI_cfg = Configuration(nbeats_cs, values=nbeatsI_cfg) + nbeatsG_cfg = Configuration(nbeats_cs, values=nbeatsG_cfg) + + nbeats_i = NBEATSDecoder(**nbeatsI_cfg) + nbeats_g = NBEATSDecoder(**nbeatsG_cfg) + + fit_dict = copy.copy(self.fit_dictionary) + fit_dict['dataset_properties'] = self.dataset_properties + fit_dict['network_structure'] = network_structure + + encoder = encoder.fit(fit_dict) + fit_dict = encoder.transform(fit_dict) + + for decoder_idx, decoder in enumerate([nbeats_i, nbeats_g]): + with self.subTest(decoder_idx=decoder_idx): + fit_dict = copy.copy(fit_dict) + fit_dict_ = copy.copy(fit_dict) + + decoder = decoder.fit(fit_dict_) + fit_dict_ = decoder.transform(fit_dict_) + + input_tensor = torch.randn([10, 20, 1]) + + head = ForecastingHead() + head = head.fit(fit_dict_) + fit_dict_ = head.transform(fit_dict_) + + encoder_net = fit_dict_['network_encoder']['block_1'].encoder + decoder_net = fit_dict_['network_decoder']['block_1'].decoder + idx_tracker = 0 + if decoder_idx == 0: + # only check nbeats_i + for i_stack in range(1, 1 + nbeatsI_cfg['num_stacks_i']): + num_blocks = nbeatsI_cfg[f'num_blocks_i_{i_stack}'] + idx_end = idx_tracker + num_blocks + num_individual_models = len(set(decoder_net[idx_tracker:idx_end])) + if nbeatsI_cfg[f'weight_sharing_i_{i_stack}']: + self.assertEqual(num_individual_models, 1) + else: + self.assertEqual(num_individual_models, num_blocks) + idx_tracker = idx_end + + input_tensor = encoder_net(input_tensor, output_seq=False) + + for block in decoder_net: + backcast_block, forecast_block = block([None], input_tensor) + self.assertListEqual(list(backcast_block.shape), [10, window_size * 1]) + self.assertListEqual(list(forecast_block.shape), [10, n_prediction_steps * 1]) diff --git a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_forecasting_architecture.py b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_forecasting_architecture.py new file mode 100644 index 000000000..252fe7d1d --- /dev/null +++ b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_forecasting_architecture.py @@ -0,0 
+1,371 @@ +import copy +import unittest +from test.test_pipeline.components.setup.forecasting.forecasting_networks.test_base_components import \ + generate_fit_dict_and_dataset_property + +import pytest + +import torch + +from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.base_target_scaler import BaseTargetScaler +from autoPyTorch.pipeline.components.setup.network.forecasting_architecture import ( + AbstractForecastingNet, + get_lagged_subsequences, + get_lagged_subsequences_inference +) +from autoPyTorch.pipeline.components.setup.network.forecasting_network import ForecastingNetworkComponent +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone import ForecastingNetworkChoice +from autoPyTorch.pipeline.components.setup.network_embedding.NoEmbedding import _NoEmbedding +from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distribution import ( + ALL_DISTRIBUTIONS, + DisForecastingStrategy +) +from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.forecasting_head import ForecastingHead +from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdate + + +class ReducedEmbedding(torch.nn.Module): + # a dummy reduced embedding: it simply cuts one column for each categorical feature + def __init__(self, num_input_features, num_numerical_features: int): + super(ReducedEmbedding, self).__init__() + self.num_input_features = num_input_features + self.num_numerical_features = num_numerical_features + self.n_cat_features = len(num_input_features) - num_numerical_features + + def forward(self, x): + x = x[..., :-self.n_cat_features] + return x + + def get_partial_models(self, subset_features): + num_numerical_features = sum([sf < self.num_numerical_features for sf in subset_features]) + num_input_features = [self.num_input_features[sf] for sf in subset_features] + return ReducedEmbedding(num_input_features, num_numerical_features) + + +@pytest.fixture(params=['ForecastingNet', 'ForecastingSeq2SeqNet', 'ForecastingDeepARNet', 'NBEATSNet']) +def network_type(request): + return request.param + + +@pytest.fixture(params=['RNNEncoder', 'TCNEncoder']) +def network_encoder(request): + return request.param + + +@pytest.fixture(params=['ReducedEmbedding', 'NoEmbedding']) +def embedding(request): + return request.param + + +@pytest.fixture(params=['distribution_mean', 'distribution_sample', 'regression', 'quantile']) +def net_output_type(request): + return request.param + + +@pytest.fixture(params=[True, False]) +def variable_selection(request): + return request.param + + +@pytest.fixture(params=[True, False]) +def with_static_features(request): + return request.param + + +@pytest.fixture(params=[True, False]) +def uni_variant_data(request): + return request.param + + +class TestForecastingNetworks: + dataset_properties, fit_dictionary = generate_fit_dict_and_dataset_property() + + def test_network_forward(self, + embedding, + net_output_type, + variable_selection, + with_static_features, + network_encoder, + network_type, + uni_variant_data): + if network_type == 'ForecastingDeepARNet' and net_output_type != 'distribution_sample': + return + if network_type == 'ForecastingSeq2SeqNet' and network_encoder == 'TCNEncoder': + return + if network_type == 'NBEATSNet': + # NBEATS only needs to be tested with a single combination + if not (embedding == 'NoEmbedding' and net_output_type == 'regression' + and not variable_selection and not with_static_features and network_encoder == 'RNNEncoder' + and not
uni_variant_data): + return + if uni_variant_data: + if not (embedding == 'NoEmbedding' and not with_static_features): + return + + dataset_properties = copy.copy(self.dataset_properties) + time_feature_names = ('t1', 't2') + dataset_properties['time_feature_names'] = time_feature_names + + if network_type != 'ForecastingDeepARNet': + dataset_properties['known_future_features'] = ('f1', 'f3', 'f5') + + if with_static_features: + dataset_properties['static_features'] = (0, 4) + else: + dataset_properties['static_features'] = tuple() + + fit_dictionary = copy.copy(self.fit_dictionary) + fit_dictionary['dataset_properties'] = dataset_properties + fit_dictionary['target_scaler'] = BaseTargetScaler(scaling_mode='standard').fit(fit_dictionary) + + if net_output_type.startswith("distribution"): + fit_dictionary['dist_forecasting_strategy'] = DisForecastingStrategy( + list(ALL_DISTRIBUTIONS.keys())[0], + forecast_strategy=net_output_type.split("_")[1] + ) + net_output_type = net_output_type.split("_")[0] + elif net_output_type == 'quantile': + fit_dictionary['quantile_values'] = [0.5, 0.1, 0.9] + + fit_dictionary['net_output_type'] = net_output_type + + if embedding == 'NoEmbedding': + fit_dictionary['network_embedding'] = _NoEmbedding() + else: + fit_dictionary['network_embedding'] = ReducedEmbedding([10] * 5, 2) + dataset_properties['feature_shapes'] = {'f1': 10, 'f2': 10, 'f3': 9, 'f4': 9, 'f5': 9} + + if uni_variant_data: + fit_dictionary['X_train'] = None + fit_dictionary['transform_time_features'] = False + dataset_properties.update({'feature_shapes': {}, + 'feature_names': tuple(), + 'known_future_features': tuple(), + 'uni_variant': True, + 'input_shape': (100, 0), + 'static_features': tuple(), + 'future_feature_shapes': (dataset_properties['n_prediction_steps'], 0), + }) + + n_prediction_steps = dataset_properties['n_prediction_steps'] + window_size = fit_dictionary['window_size'] + n_features_past = 10 * len(dataset_properties['feature_names']) + len(time_feature_names) + n_features_future = 10 * len(dataset_properties['known_future_features']) + len(time_feature_names) + n_targets = 1 + + backbone = ForecastingNetworkChoice(dataset_properties) + head = ForecastingHead() + network = ForecastingNetworkComponent() + + if network_type == 'NBEATSNet': + updates = [HyperparameterSearchSpaceUpdate(node_name="network_backbone", + hyperparameter='__choice__', + value_range=('flat_encoder',), + default_value='flat_encoder', )] + include = ['flat_encoder:NBEATSEncoder'] + + else: + updates = [HyperparameterSearchSpaceUpdate(node_name="network_backbone", + hyperparameter='__choice__', + value_range=('seq_encoder',), + default_value='seq_encoder', ), + HyperparameterSearchSpaceUpdate(node_name="network_backbone", + hyperparameter='seq_encoder:num_blocks', + value_range=(1, 1), + default_value=1, ), + ] + include = [f'seq_encoder:{network_encoder}'] + + if network_type == 'ForecastingNet': + updates.append(HyperparameterSearchSpaceUpdate( + node_name="network_backbone", + hyperparameter='seq_encoder:block_1:MLPDecoder:auto_regressive', + value_range=(False,), + default_value=False, )) + + updates.append(HyperparameterSearchSpaceUpdate( + node_name="network_backbone", + hyperparameter='seq_encoder:decoder_auto_regressive', + value_range=(False,), + default_value=False, )) + if uni_variant_data and network_encoder == 'RNNEncoder': + updates.append(HyperparameterSearchSpaceUpdate( + node_name="network_backbone", + hyperparameter='seq_encoder:block_1:RNNEncoder:decoder_type', + 
value_range=('MLPDecoder',), + default_value='MLPDecoder', )) + + elif network_type == 'ForecastingSeq2SeqNet': + updates.append(HyperparameterSearchSpaceUpdate( + node_name="network_backbone", + hyperparameter='seq_encoder:block_1:RNNEncoder:decoder_type', + value_range=("RNNDecoder",), + default_value="RNNDecoder", )) + updates.append(HyperparameterSearchSpaceUpdate( + node_name="network_backbone", + hyperparameter='seq_encoder:decoder_auto_regressive', + value_range=(True,), + default_value=True, )) + + elif network_type == 'ForecastingDeepARNet': + updates.append(HyperparameterSearchSpaceUpdate( + node_name="network_backbone", + hyperparameter='seq_encoder:block_1:RNNEncoder:decoder_type', + value_range=('MLPDecoder',), + default_value='MLPDecoder', )) + + updates.append(HyperparameterSearchSpaceUpdate( + node_name="network_backbone", + hyperparameter='seq_encoder:block_1:MLPDecoder:auto_regressive', + value_range=(True,), + default_value=True, )) + + if variable_selection: + updates.append(HyperparameterSearchSpaceUpdate( + node_name="network_backbone", + hyperparameter='seq_encoder:variable_selection', + value_range=(True,), + default_value=True, )) + else: + updates.append(HyperparameterSearchSpaceUpdate( + node_name="network_backbone", + hyperparameter='seq_encoder:variable_selection', + value_range=(False,), + default_value=False, )) + + for update in updates: + backbone._apply_search_space_update(update) + + cs = backbone.get_hyperparameter_search_space(dataset_properties=dataset_properties, include=include) + + sample = cs.sample_configuration() + backbone.set_hyperparameters(sample) + + backbone = backbone.fit(fit_dictionary) + fit_dictionary = backbone.transform(fit_dictionary) + + head = head.fit(fit_dictionary) + fit_dictionary = head.transform(fit_dictionary) + + network = network.fit(fit_dictionary) + fit_dictionary = network.transform(fit_dictionary) + + neu_arch = fit_dictionary['network'] + + assert isinstance(neu_arch, AbstractForecastingNet) + batch_size = 2 + + past_targets = torch.ones([batch_size, 50, n_targets]) + future_targets = torch.ones([batch_size, n_prediction_steps, n_targets]) + past_observed_targets = torch.ones([batch_size, 50, n_targets]).bool() + if uni_variant_data: + past_features = None + future_features = None + else: + past_features = torch.ones([batch_size, 50, n_features_past]) + future_features = torch.ones([batch_size, n_prediction_steps, n_features_future]) + + output = neu_arch(past_targets=past_targets, + future_targets=future_targets, + past_features=past_features, + future_features=future_features, + past_observed_targets=past_observed_targets) + + if net_output_type.startswith('distribution'): + assert isinstance(output, torch.distributions.Distribution) + output = output.mean + elif net_output_type == 'quantile': + assert len(output) == 3 + output = output[0] + if network_type in ["ForecastingNet", "ForecastingSeq2SeqNet"]: + assert list(output.shape) == [batch_size, n_prediction_steps, n_targets] + + elif network_type == "ForecastingDeepARNet": + assert list(output.shape) == [batch_size, n_prediction_steps + min(50, neu_arch.window_size) - 1, n_targets] + else: + backcast = output[0] + forecast = output[1] + assert list(backcast.shape) == [batch_size, window_size, n_targets] + assert list(forecast.shape) == [batch_size, n_prediction_steps, n_targets] + + neu_arch.eval() + output = neu_arch.predict(past_targets=past_targets, + past_features=past_features, + future_features=future_features, + past_observed_targets=past_observed_targets) 
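+ # regardless of the network type and output head, predict() should yield point + # forecasts of shape (batch_size, n_prediction_steps, n_targets)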
+ + assert list(output.shape) == [batch_size, n_prediction_steps, n_targets] + + neu_arch.train() + + past_targets = torch.ones([batch_size, 3, n_targets]) + future_targets = torch.ones([batch_size, n_prediction_steps, n_targets]) + past_observed_targets = torch.ones([batch_size, 3, n_targets]).bool() + if uni_variant_data: + past_features = None + future_features = None + else: + past_features = torch.ones([batch_size, 3, n_features_past]) + future_features = torch.ones([batch_size, n_prediction_steps, n_features_future]) + + output = neu_arch(past_targets=past_targets, + future_targets=future_targets, + past_features=past_features, + future_features=future_features, + past_observed_targets=past_observed_targets) + if net_output_type.startswith('distribution'): + assert isinstance(output, torch.distributions.Distribution) + output = output.mean + elif net_output_type == 'quantile': + assert len(output) == 3 + output = output[0] + if network_type in ["ForecastingNet", "ForecastingSeq2SeqNet"]: + assert list(output.shape) == [batch_size, n_prediction_steps, n_targets] + elif network_type == "ForecastingDeepARNet": + assert list(output.shape) == [batch_size, n_prediction_steps + min(3, neu_arch.window_size) - 1, n_targets] + else: + backcast = output[0] + forecast = output[1] + assert list(backcast.shape) == [batch_size, window_size, n_targets] + assert list(forecast.shape) == [batch_size, n_prediction_steps, n_targets] + + if network_type in ["ForecastingNet", "ForecastingSeq2SeqNet"]: + assert list(output.shape) == [batch_size, n_prediction_steps, n_targets] + neu_arch.eval() + + output = neu_arch.predict(past_targets=past_targets, + past_features=past_features, + future_features=future_features, + past_observed_targets=past_observed_targets) + + assert list(output.shape) == [batch_size, n_prediction_steps, n_targets] + + +class TestForecastingNetworkUtil(unittest.TestCase): + def test_get_lagged_values(self): + seq_raw = torch.arange(10).reshape([1, -1, 1]).float() + window_size = 3 + lag_sequence = [0, 1, 2, 3, 5] + lagged_seq1, mask = get_lagged_subsequences(seq_raw, window_size, lag_sequence) + lagged_seq2, _ = get_lagged_subsequences(seq_raw, window_size, lag_sequence, mask) + lagged_seq3 = get_lagged_subsequences_inference(seq_raw, window_size, lag_sequence) + + self.assertTrue(torch.equal(lagged_seq1, lagged_seq2)) + self.assertTrue(torch.equal(lagged_seq2, lagged_seq3)) + self.assertTrue(torch.equal(lagged_seq1[0], torch.Tensor([[7, 6, 5, 4, 2], + [8, 7, 6, 5, 3], + [9, 8, 7, 6, 4]]).float())) + self.assertListEqual(list(mask.shape), [len(lag_sequence), max(lag_sequence) + window_size]) + + seq_raw = torch.arange(5, 5 + 3).reshape([1, -1, 1]).float() + window_size = 3 + lag_sequence = [0, 1, 2, 3, 5] + lagged_seq1, mask = get_lagged_subsequences(seq_raw, window_size, lag_sequence) + lagged_seq2, mask2 = get_lagged_subsequences(seq_raw, window_size, lag_sequence, mask) + lagged_seq3 = get_lagged_subsequences_inference(seq_raw, window_size, lag_sequence) + + self.assertTrue(torch.all(lagged_seq1 == lagged_seq2)) + self.assertTrue(torch.all(lagged_seq2 == lagged_seq3)) + self.assertTrue(torch.equal(lagged_seq1[0], torch.Tensor([[5, 0, 0, 0, 0], + [6, 5, 0, 0, 0], + [7, 6, 5, 0, 0]]).float())) diff --git a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_seq_encoder.py b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_seq_encoder.py new file mode 100644 index 000000000..67b6f85f6 --- /dev/null +++ 
b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_seq_encoder.py @@ -0,0 +1,375 @@ +import copy +import unittest +from itertools import product +from test.test_pipeline.components.setup.forecasting.forecasting_networks.test_base_components import \ + generate_fit_dict_and_dataset_property + +from sklearn.pipeline import Pipeline + +import torch + +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.cells import ( + StackedDecoder, + StackedEncoder, + TemporalFusionLayer +) +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import NetworkStructure +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.\ + seq_encoder import SeqForecastingEncoderChoice +from autoPyTorch.pipeline.components.setup.network_embedding.NoEmbedding import _NoEmbedding +from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.forecasting_head import ForecastingHead +from autoPyTorch.utils.common import HyperparameterSearchSpace +from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdate + + +class TestSeqEncoder(unittest.TestCase): + def setUp(self) -> None: + self.dataset_properties, self.fit_dictionary = generate_fit_dict_and_dataset_property() + self.fit_dictionary['net_output_type'] = 'regression' + self.fit_dictionary['network_embedding'] = _NoEmbedding() + + def test_config_space(self): + seq_encoder_choice = SeqForecastingEncoderChoice(dataset_properties=self.dataset_properties) + cs_seq = seq_encoder_choice.get_hyperparameter_search_space( + dataset_properties=self.dataset_properties, + num_blocks=HyperparameterSearchSpace(hyperparameter="num_blocks", + value_range=(2, 3), + default_value=2), ) + sample = cs_seq.sample_configuration() + + num_blocks = sample['num_blocks'] + seq_encoder_choice.set_hyperparameters(sample) + + fit_dict = copy.copy(self.fit_dictionary) + fit_dict['dataset_properties'] = self.dataset_properties + self.assertIsInstance(seq_encoder_choice.pipeline, Pipeline) + encoder_choices = seq_encoder_choice.fit(fit_dict) + fit_dict = encoder_choices.transform(fit_dict) + + self.assertTrue('network_structure' in fit_dict) + network_structure = fit_dict['network_structure'] + self.assertIsInstance(network_structure, NetworkStructure) + self.assertEqual(network_structure.num_blocks, num_blocks) + + self.assertTrue('network_encoder' in fit_dict) + self.assertEqual(len(fit_dict['network_encoder']), num_blocks) + + self.assertTrue('network_decoder' in fit_dict) + self.assertEqual(len(fit_dict['network_decoder']), num_blocks) + + # test error: + dataset_properties = copy.copy(self.dataset_properties) + dataset_properties.update({'feature_shapes': {}, + 'feature_names': tuple(), + 'known_future_features': tuple(), + 'uni_variant': True, + 'input_shape': (100, 0), + 'static_features': tuple(), + 'future_feature_shapes': (dataset_properties['n_prediction_steps'], 0), + }) + + def test_deepar(self): + for i, valid_encoder in enumerate(['RNNEncoder', 'TransformerEncoder', 'TCNEncoder', 'InceptionTimeEncoder']): + with self.subTest(valid_encoder=valid_encoder): + seq_encoder_choice = SeqForecastingEncoderChoice(dataset_properties=self.dataset_properties) + update_ar = HyperparameterSearchSpaceUpdate(node_name="network_backbone", + hyperparameter='auto_regressive', + value_range=(True,), + default_value=True, ) + update_rnn_mlp = HyperparameterSearchSpaceUpdate(node_name="network_backbone", +
hyperparameter='decoder_type', + value_range=('MLPDecoder',), + default_value='MLPDecoder', ) + update_transformer_mlp = HyperparameterSearchSpaceUpdate(node_name="network_backbone", + hyperparameter='decoder_type', + value_range=('MLPDecoder',), + default_value='MLPDecoder', ) + seq_encoder_choice._cs_updates = {"block_1:RNNEncoder:decoder_type": update_rnn_mlp, + "block_1:TransformerEncoder:decoder_type": update_transformer_mlp, + "block_1:MLPDecoder:auto_regressive": update_ar} + + cs_seq = seq_encoder_choice.get_hyperparameter_search_space(dataset_properties=self.dataset_properties, + include=[valid_encoder]) + sample = cs_seq.get_default_configuration() + + seq_encoder_choice.set_hyperparameters(sample) + + fit_dict = copy.copy(self.fit_dictionary) + fit_dict['dataset_properties'] = self.dataset_properties + + encoder_choices = seq_encoder_choice.fit(fit_dict) + fit_dict = encoder_choices.transform(fit_dict) + + head = ForecastingHead() + head = head.fit(fit_dict) + fit_dict = head.transform(fit_dict) + + net_encoder = StackedEncoder(fit_dict['network_structure'], False, + fit_dict['network_encoder'], fit_dict['network_decoder']) + net_decoder = StackedDecoder(fit_dict['network_structure'], net_encoder.encoder, + fit_dict['network_encoder'], + fit_dict['network_decoder']) + + head = fit_dict['network_head'] + if i < 2: + input_tensor = torch.randn([10, 20, 59]) # 53 + 6(lag values) + input_tensor_future = torch.randn([10, 1, 59]) + else: + input_tensor = torch.randn([10, 20, 53]) # no lag + input_tensor_future = torch.randn([10, 1, 53]) + + encoder2decoder, encoder_output = net_encoder(encoder_input=input_tensor, + additional_input=[None], + cache_intermediate_state=True, + ) + output = head(net_decoder(x_future=None, encoder_output=encoder2decoder)) + self.assertListEqual(list(output.shape), [10, 1, 1]) + + encoder2decoder, encoder_output = net_encoder(encoder_input=input_tensor_future, + additional_input=[None], + output_seq=False, cache_intermediate_state=True, + incremental_update=True + ) + output = head(net_decoder(x_future=None, encoder_output=encoder2decoder)) + self.assertListEqual(list(output.shape), [10, 1, 1]) + + def test_seq2seq(self): + n_prediction_steps = self.dataset_properties['n_prediction_steps'] + + for i, valid_encoder in enumerate(['RNNEncoder', 'TransformerEncoder']): + with self.subTest(valid_encoder=valid_encoder): + seq_encoder_choice = SeqForecastingEncoderChoice(dataset_properties=self.dataset_properties) + + update_rnn_rnn = HyperparameterSearchSpaceUpdate(node_name="network_backbone", + hyperparameter='decoder_type', + value_range=('RNNDecoder',), + default_value='RNNDecoder', ) + update_trans_trans = HyperparameterSearchSpaceUpdate(node_name="network_backbone", + hyperparameter='decoder_type', + value_range=('TransformerDecoder',), + default_value='TransformerDecoder', ) + + seq_encoder_choice._cs_updates = {"block_1:RNNEncoder:decoder_type": update_rnn_rnn, + "block_1:TransformerEncoder:decoder_type": update_trans_trans} + decoder_auto_regressive = HyperparameterSearchSpace( + hyperparameter="decoder_auto_regressive", + value_range=(True,), + default_value=True, + ) + + cs_seq = seq_encoder_choice.get_hyperparameter_search_space( + dataset_properties=self.dataset_properties, + decoder_auto_regressive=decoder_auto_regressive, + include=[valid_encoder] + ) + sample = cs_seq.get_default_configuration() + + seq_encoder_choice.set_hyperparameters(sample) + + fit_dict = copy.copy(self.fit_dictionary) + fit_dict['dataset_properties'] = 
self.dataset_properties + + encoder_choices = seq_encoder_choice.fit(fit_dict) + fit_dict = encoder_choices.transform(fit_dict) + + head = ForecastingHead() + head = head.fit(fit_dict) + fit_dict = head.transform(fit_dict) + + net_encoder = StackedEncoder(fit_dict['network_structure'], False, + fit_dict['network_encoder'], fit_dict['network_decoder']) + net_decoder = StackedDecoder(fit_dict['network_structure'], net_encoder.encoder, + fit_dict['network_encoder'], + fit_dict['network_decoder']) + + head = fit_dict['network_head'] + + input_tensor = torch.randn([10, 20, 59]) # 53 + 6(lag values) + input_tensor_future = torch.randn([10, n_prediction_steps, 59]) + + encoder2decoder, encoder_output = net_encoder(encoder_input=input_tensor, + additional_input=[None], + cache_intermediate_state=True, + ) + output = head(net_decoder(x_future=input_tensor_future, encoder_output=encoder2decoder)) + self.assertListEqual(list(output.shape), [10, n_prediction_steps, 1]) + + net_encoder.eval() + net_decoder.eval() + input_tensor_future = torch.randn([10, 1, 59]) + + encoder2decoder, encoder_output = net_encoder(encoder_input=input_tensor_future, + additional_input=[None], + output_seq=False, cache_intermediate_state=True, + incremental_update=True + ) + output = head(net_decoder(x_future=input_tensor_future, encoder_output=encoder2decoder)) + self.assertListEqual(list(output.shape), [10, 1, 1]) + + def test_seq_models(self): + update = HyperparameterSearchSpaceUpdate(node_name="network_backbone", + hyperparameter='auto_regressive', + value_range=(False,), + default_value=False, ) + # To avoid that default setting raises conflict for forbidden clauses + update_rnn_default = HyperparameterSearchSpaceUpdate(node_name="network_backbone", + hyperparameter='decoder_type', + value_range=('MLPDecoder', 'RNNDecoder'), + default_value='RNNDecoder', ) + num_blocks = HyperparameterSearchSpace(hyperparameter="num_blocks", + value_range=(2, 2), + default_value=2) + window_size: int = self.fit_dictionary['window_size'] + n_prediction_steps = self.dataset_properties['n_prediction_steps'] + n_features = self.dataset_properties['input_shape'][-1] + n_targets = self.dataset_properties['output_shape'][-1] + n_time_features = len(self.dataset_properties['time_feature_transform']) + all_settings = [(True, False), (True, False), (True, False), (True, False), ('gate_add_norm', 'add')] + for hp_values in product(*all_settings): + hp_variable_selection = hp_values[0] + hp_use_temporal_fusion = hp_values[1] + hp_decoder_auto_regressive = hp_values[2] + hp_skip_connection = hp_values[3] + hp_skip_connection_type = hp_values[4] + with self.subTest(hp_variable_selection=hp_variable_selection, + hp_use_temporal_fusion=hp_use_temporal_fusion, + hp_decoder_auto_regressive=hp_decoder_auto_regressive, + hp_skip_connection=hp_skip_connection, + hp_skip_connection_type=hp_skip_connection_type): + variable_selection = HyperparameterSearchSpace('variable_selection', + (hp_variable_selection,), hp_variable_selection) + use_temporal_fusion = HyperparameterSearchSpace('use_temporal_fusion', + (hp_use_temporal_fusion,), hp_use_temporal_fusion) + decoder_auto_regressive = HyperparameterSearchSpace('decoder_auto_regressive', + (hp_decoder_auto_regressive,), + hp_decoder_auto_regressive) + skip_connection = HyperparameterSearchSpace('skip_connection', + (hp_skip_connection,), + hp_skip_connection) + skip_connection_type = HyperparameterSearchSpace('skip_connection_type', + (hp_skip_connection_type,), + hp_skip_connection_type) + + 
seq_encoder_choice = SeqForecastingEncoderChoice(dataset_properties=self.dataset_properties) + seq_encoder_choice._cs_updates = {"block_1:MLPDecoder:auto_regressive": update, + "block_1:RNNEncoder:decoder_type": update_rnn_default, + "block_2:RNNEncoder:decoder_type": update_rnn_default, + } + cs_seq_encoder = seq_encoder_choice.get_hyperparameter_search_space( + dataset_properties=self.dataset_properties, + num_blocks=num_blocks, + variable_selection=variable_selection, + use_temporal_fusion=use_temporal_fusion, + decoder_auto_regressive=decoder_auto_regressive, + skip_connection=skip_connection, + skip_connection_type=skip_connection_type + ) + sample = cs_seq_encoder.sample_configuration() + seq_encoder_choice.set_hyperparameters(sample) + + fit_dict = copy.copy(self.fit_dictionary) + fit_dict['dataset_properties'] = self.dataset_properties + + encoder_choices = seq_encoder_choice.fit(fit_dict) + fit_dict = encoder_choices.transform(fit_dict) + + head = ForecastingHead() + head = head.fit(fit_dict) + fit_dict = head.transform(fit_dict) + + network_structure = fit_dict['network_structure'] + net_encoder = StackedEncoder(fit_dict['network_structure'], + network_structure.use_temporal_fusion, + fit_dict['network_encoder'], fit_dict['network_decoder']) + net_decoder = StackedDecoder(fit_dict['network_structure'], net_encoder.encoder, + fit_dict['network_encoder'], + fit_dict['network_decoder']) + if hp_use_temporal_fusion: + temporal_fusion: TemporalFusionLayer = fit_dict['temporal_fusion'] + + head = fit_dict['network_head'] + + if hp_variable_selection: + n_feature_encoder = fit_dict['network_encoder']['block_1'].encoder_output_shape[-1] + if hp_decoder_auto_regressive: + n_feature_decoder = n_feature_encoder + else: + n_feature_decoder = n_feature_encoder - 1 + else: + if hasattr(net_encoder.encoder['block_1'], 'lagged_value'): + n_feature_encoder = n_features + n_time_features + n_feature_encoder += n_targets * len(net_encoder.encoder['block_1'].lagged_value) + else: + n_feature_encoder = n_features + n_time_features + n_targets + if hp_decoder_auto_regressive: + if hasattr(net_decoder.decoder['block_1'], 'lagged_value'): + n_feature_decoder = n_features + n_time_features + n_feature_decoder += n_targets * len( + net_decoder.decoder['block_1'].lagged_value) + else: + n_feature_decoder = n_features + n_time_features + n_targets + else: + n_feature_decoder = n_features + n_time_features + + input_tensor = torch.ones([10, window_size, n_feature_encoder]) + input_tensor_future = torch.randn([10, n_prediction_steps, n_feature_decoder]) + input_tensor_future_ar = torch.randn([10, 1, n_feature_decoder]) + past_observed_values = torch.ones([10, window_size, 1]).bool() + + encoder2decoder, encoder_output = net_encoder(encoder_input=input_tensor, + additional_input=[None] * 2, + ) + + decoder_output = net_decoder(x_future=input_tensor_future, + encoder_output=encoder2decoder, + pos_idx=(window_size, window_size + n_prediction_steps)) + + if hp_use_temporal_fusion: + decoder_output = temporal_fusion(encoder_output=encoder_output, + decoder_output=decoder_output, + past_observed_targets=past_observed_values, + decoder_length=n_prediction_steps, + ) + + output = head(decoder_output) + self.assertListEqual(list(output.shape), [10, n_prediction_steps, 1]) + + if hp_decoder_auto_regressive: + net_encoder.eval() + net_decoder.eval() + head.eval() + + encoder2decoder, encoder_output = net_encoder(encoder_input=input_tensor, + additional_input=[None] * 2, + cache_intermediate_state=False, + ) +
decoder_output = net_decoder(x_future=input_tensor_future_ar, + encoder_output=encoder2decoder, + pos_idx=(window_size, window_size + 1), + cache_intermediate_state=True, + ) + if hp_use_temporal_fusion: + temporal_fusion.eval() + decoder_output = temporal_fusion(encoder_output=encoder_output, + decoder_output=decoder_output, + past_observed_targets=past_observed_values, + decoder_length=1, + ) + output = head(decoder_output) + self.assertListEqual(list(output.shape), [10, 1, 1]) + + decoder_output = net_decoder.forward(x_future=input_tensor_future_ar, + encoder_output=encoder2decoder, + pos_idx=(window_size, window_size + 1), + cache_intermediate_state=True, + incremental_update=True, + ) + if hp_use_temporal_fusion: + decoder_output = temporal_fusion(encoder_output=encoder_output, + decoder_output=decoder_output, + past_observed_targets=past_observed_values, + decoder_length=1, + ) + output = head(decoder_output) + self.assertListEqual(list(output.shape), [10, 1, 1]) diff --git a/test/test_pipeline/components/setup/forecasting/test_forecasting_target_scaling.py b/test/test_pipeline/components/setup/forecasting/test_forecasting_target_scaling.py new file mode 100644 index 000000000..a415e2e22 --- /dev/null +++ b/test/test_pipeline/components/setup/forecasting/test_forecasting_target_scaling.py @@ -0,0 +1,311 @@ +import unittest + +import torch + +from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.base_target_scaler import BaseTargetScaler + + +class TestTargetScalar(unittest.TestCase): + def test_target_no_scalar(self): + X = {'dataset_properties': {}} + scalar = BaseTargetScaler(scaling_mode='none') + scalar = scalar.fit(X) + X = scalar.transform(X) + self.assertIsInstance(X['target_scaler'], BaseTargetScaler) + + past_targets = torch.rand([5, 6, 7]) + future_targets = torch.rand(([5, 3, 7])) + + past_observed_values = torch.rand([5, 6, 7]) > 0.5 + + transformed_past_target, transformed_future_targets, loc, scale = scalar( + past_targets, past_observed_values=past_observed_values, future_targets=future_targets) + self.assertTrue(torch.equal(past_targets, transformed_past_target)) + self.assertTrue(torch.equal(future_targets, transformed_future_targets)) + self.assertIsNone(loc) + self.assertIsNone(scale) + + _, transformed_future_targets, _, _ = scalar(past_targets) + self.assertIsNone(transformed_future_targets) + + def test_target_mean_abs_scalar(self): + X = {'dataset_properties': {}} + scalar = BaseTargetScaler(scaling_mode='mean_abs') + scalar = scalar.fit(X) + X = scalar.transform(X) + self.assertIsInstance(X['target_scaler'], BaseTargetScaler) + + past_targets = torch.vstack( + [ + torch.zeros(10), + torch.Tensor([0.] * 2 + [1.] * 5 + [2.] * 3), + torch.ones(10) * 4 + ] + ).unsqueeze(-1) + past_observed_values = torch.vstack( + [ + torch.Tensor([False] * 3 + [True] * 7), + torch.Tensor([False] * 2 + [True] * 8), + torch.Tensor([True] * 10) + + ]).unsqueeze(-1).bool() + future_targets = torch.ones([3, 10, 1]) * 10 + + transformed_past_target, transformed_future_targets, loc, scale = scalar( + past_targets, past_observed_values=past_observed_values, future_targets=future_targets + ) + + self.assertTrue(torch.equal(transformed_past_target[0], torch.zeros([10, 1]))) + self.assertTrue(torch.allclose(transformed_past_target[1], + torch.Tensor([0.] * 2 + [8. / 11.] * 5 + [16. / 11.] 
* 3).unsqueeze(-1))) + self.assertTrue(torch.equal(transformed_past_target[2], torch.ones([10, 1]))) + + self.assertTrue(torch.equal(transformed_future_targets[0], torch.ones([10, 1]) * 10)) + self.assertTrue(torch.equal(transformed_future_targets[1], torch.ones([10, 1]) * 80. / 11.)) + self.assertTrue(torch.equal(transformed_future_targets[2], torch.ones([10, 1]) * 2.5)) + + self.assertTrue( + torch.equal(scale, torch.Tensor([1., 11. / 8., 4.]).reshape([len(past_targets), 1, past_targets.shape[-1]])) + ) + self.assertIsNone(loc) + + transformed_past_target, transformed_future_targets, loc, scale = scalar(past_targets, + future_targets=future_targets) + + self.assertTrue(torch.equal(transformed_past_target[0], torch.zeros([10, 1]))) + self.assertTrue(torch.allclose(transformed_past_target[1], + torch.Tensor([0.] * 2 + [10. / 11.] * 5 + [20. / 11.] * 3).unsqueeze(-1))) + self.assertTrue(torch.equal(transformed_past_target[2], torch.ones([10, 1]))) + + self.assertTrue(torch.equal(transformed_future_targets[0], torch.ones([10, 1]) * 10)) + self.assertTrue(torch.equal(transformed_future_targets[1], torch.ones([10, 1]) * 100. / 11)) + self.assertTrue(torch.equal(transformed_future_targets[2], torch.ones([10, 1]) * 2.5)) + + self.assertTrue( + torch.equal(scale, torch.Tensor([1., 1.1, 4.]).reshape([len(past_targets), 1, past_targets.shape[-1]])) + ) + + transformed_past_target_full, transformed_future_targets_full, loc_full, scale_full = scalar( + past_targets, past_observed_values=torch.ones([2, 10, 1], dtype=torch.bool), future_targets=future_targets + ) + + self.assertTrue(torch.equal(transformed_past_target, transformed_past_target_full)) + self.assertTrue(torch.equal(transformed_future_targets_full, transformed_future_targets_full)) + self.assertTrue(torch.equal(scale, scale_full)) + + self.assertIsNone(loc_full) + + def test_target_standard_scalar(self): + X = {'dataset_properties': {}} + scalar = BaseTargetScaler(scaling_mode='standard') + scalar = scalar.fit(X) + X = scalar.transform(X) + self.assertIsInstance(X['target_scaler'], BaseTargetScaler) + + past_targets = torch.vstack( + [ + torch.zeros(10), + torch.Tensor([0.] * 2 + [1.] * 5 + [2.] * 3), + torch.ones(10) * 4 + ] + ).unsqueeze(-1) + past_observed_values = torch.vstack( + [ + torch.Tensor([False] * 3 + [True] * 7), + torch.Tensor([False] * 2 + [True] * 8), + torch.Tensor([True] * 10) + + ]).unsqueeze(-1).bool() + future_targets = torch.ones([3, 10, 1]) * 10 + + transformed_past_target, transformed_future_targets, loc, scale = scalar( + past_targets, past_observed_values=past_observed_values, future_targets=future_targets + ) + + self.assertTrue(torch.equal(transformed_past_target[0], torch.zeros([10, 1]))) + self.assertTrue(torch.allclose(transformed_past_target[1], + torch.Tensor([0.] * 2 + [-0.7246] * 5 + [1.2076] * 3).unsqueeze(-1), atol=1e-4)) + self.assertTrue(torch.equal(transformed_past_target[2], torch.zeros([10, 1]))) + + self.assertTrue(torch.equal(transformed_future_targets[0], torch.ones([10, 1]) * 10)) + self.assertTrue(torch.allclose(transformed_future_targets[1], torch.ones([10, 1]) * 16.6651)) + self.assertTrue(torch.equal(transformed_future_targets[2], torch.ones([10, 1]) * 6.0000)) + + self.assertTrue( + torch.allclose(loc, + torch.Tensor([0., 11. 
+        self.assertTrue(torch.equal(transformed_past_target[0], torch.zeros([10, 1])))
+        self.assertTrue(torch.allclose(transformed_past_target[1],
+                                       torch.Tensor([0.] * 2 + [-0.7246] * 5 + [1.2076] * 3).unsqueeze(-1), atol=1e-4))
+        self.assertTrue(torch.equal(transformed_past_target[2], torch.zeros([10, 1])))
+
+        self.assertTrue(torch.equal(transformed_future_targets[0], torch.ones([10, 1]) * 10))
+        self.assertTrue(torch.allclose(transformed_future_targets[1], torch.ones([10, 1]) * 16.6651))
+        self.assertTrue(torch.equal(transformed_future_targets[2], torch.ones([10, 1]) * 6.0000))
+
+        self.assertTrue(
+            torch.allclose(loc,
+                           torch.Tensor([0., 11. / 8., 4.]).reshape([len(past_targets), 1, past_targets.shape[-1]]))
+        )
+
+        self.assertTrue(
+            torch.allclose(scale,
+                           torch.Tensor([1., 0.5175, 1.]).reshape([len(past_targets), 1, past_targets.shape[-1]]),
+                           atol=1e-4)
+        )
+
+        transformed_past_target, transformed_future_targets, loc, scale = scaler(past_targets,
+                                                                                 future_targets=future_targets)
+
+        self.assertTrue(torch.equal(transformed_past_target[0], torch.zeros([10, 1])))
+
+        self.assertTrue(torch.allclose(transformed_past_target[1],
+                                       torch.Tensor([-1.4908] * 2 + [-0.1355] * 5 + [1.2197] * 3).unsqueeze(-1),
+                                       atol=1e-4)
+                        )
+        self.assertTrue(torch.equal(transformed_past_target[2], torch.zeros([10, 1])))
+
+        self.assertTrue(torch.equal(transformed_future_targets[0], torch.ones([10, 1]) * 10))
+        self.assertTrue(torch.allclose(transformed_future_targets[1], torch.ones([10, 1]) * 12.0618, atol=1e-4))
+        self.assertTrue(torch.equal(transformed_future_targets[2], torch.ones([10, 1]) * 6.))
+
+        self.assertTrue(
+            torch.allclose(loc,
+                           torch.Tensor([0., 1.1, 4.]).reshape([len(past_targets), 1, past_targets.shape[-1]]),
+                           atol=1e-4
+                           )
+        )
+        self.assertTrue(
+            torch.allclose(scale,
+                           torch.Tensor([1., 0.7379, 1.]).reshape([len(past_targets), 1, past_targets.shape[-1]]),
+                           atol=1e-4
+                           )
+        )
+
+        transformed_past_target_full, transformed_future_targets_full, loc_full, scale_full = scaler(
+            past_targets, past_observed_values=torch.ones([3, 10, 1], dtype=torch.bool), future_targets=future_targets
+        )
+        self.assertTrue(torch.equal(transformed_past_target, transformed_past_target_full))
+        self.assertTrue(torch.equal(transformed_future_targets, transformed_future_targets_full))
+        self.assertTrue(torch.equal(loc, loc_full))
+        self.assertTrue(torch.equal(scale, scale_full))
+
+    def test_target_min_max_scaler(self):
+        X = {'dataset_properties': {}}
+        scaler = BaseTargetScaler(scaling_mode='min_max')
+        scaler = scaler.fit(X)
+        X = scaler.transform(X)
+        self.assertIsInstance(X['target_scaler'], BaseTargetScaler)
+
+        past_targets = torch.vstack(
+            [
+                torch.zeros(10),
+                torch.Tensor([0.] * 2 + [1.] * 5 + [2.] * 3),
+                torch.ones(10) * 4
+            ]
+        ).unsqueeze(-1)
+        past_observed_values = torch.vstack(
+            [
+                torch.Tensor([False] * 3 + [True] * 7),
+                torch.Tensor([False] * 2 + [True] * 8),
+                torch.Tensor([True] * 10)
+            ]).unsqueeze(-1).bool()
+        future_targets = torch.ones([3, 10, 1]) * 10
+
+        transformed_past_target, transformed_future_targets, loc, scale = scaler(
+            past_targets, past_observed_values=past_observed_values, future_targets=future_targets
+        )
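+        # 'min_max' scaling appears to use loc = observed minimum and
+        # scale = max - min, falling back to scale = max (or 1 when the
+        # maximum is 0) for constant sequences.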
+        self.assertTrue(torch.equal(transformed_past_target[0], torch.zeros([10, 1])))
+        self.assertTrue(torch.allclose(transformed_past_target[1],
+                                       torch.Tensor([0.] * 2 + [0.] * 5 + [1.] * 3).unsqueeze(-1)))
+        self.assertTrue(torch.equal(transformed_past_target[2], torch.zeros([10, 1])))
+
+        self.assertTrue(torch.equal(transformed_future_targets[0], torch.ones([10, 1]) * 10))
+        self.assertTrue(torch.equal(transformed_future_targets[1], torch.ones([10, 1]) * 9))
+        self.assertTrue(torch.equal(transformed_future_targets[2], torch.ones([10, 1]) * 1.5))
+
+        self.assertTrue(
+            torch.equal(loc, torch.Tensor([0., 1., 4.]).reshape([len(past_targets), 1, past_targets.shape[-1]]))
+        )
+        self.assertTrue(
+            torch.equal(scale, torch.Tensor([1., 1., 4.]).reshape([len(past_targets), 1, past_targets.shape[-1]]))
+        )
+
+        transformed_past_target, transformed_future_targets, loc, scale = scaler(past_targets,
+                                                                                 future_targets=future_targets)
+        self.assertTrue(torch.equal(transformed_past_target[0], torch.zeros([10, 1])))
+        self.assertTrue(torch.equal(transformed_past_target[1],
+                                    torch.Tensor([0.] * 2 + [0.5] * 5 + [1.] * 3).unsqueeze(-1)))
+        self.assertTrue(torch.equal(transformed_past_target[2], torch.zeros([10, 1])))
+
+        self.assertTrue(torch.equal(transformed_future_targets[0], torch.ones([10, 1]) * 10))
+        self.assertTrue(torch.equal(transformed_future_targets[1], torch.ones([10, 1]) * 5))
+        self.assertTrue(torch.equal(transformed_future_targets[2], torch.ones([10, 1]) * 1.5))
+        self.assertTrue(
+            torch.equal(loc, torch.Tensor([0., 0., 4.]).reshape([len(past_targets), 1, past_targets.shape[-1]]))
+        )
+        self.assertTrue(
+            torch.equal(scale, torch.Tensor([1., 2., 4.]).reshape([len(past_targets), 1, past_targets.shape[-1]]))
+        )
+
+        transformed_past_target_full, transformed_future_targets_full, loc_full, scale_full = scaler(
+            past_targets, past_observed_values=torch.ones([3, 10, 1], dtype=torch.bool), future_targets=future_targets
+        )
+        self.assertTrue(torch.equal(transformed_past_target, transformed_past_target_full))
+        self.assertTrue(torch.equal(transformed_future_targets, transformed_future_targets_full))
+        self.assertTrue(torch.equal(scale, scale_full))
+
+    def test_target_max_abs_scaler(self):
+        X = {'dataset_properties': {}}
+        scaler = BaseTargetScaler(scaling_mode='max_abs')
+        scaler = scaler.fit(X)
+        X = scaler.transform(X)
+        self.assertIsInstance(X['target_scaler'], BaseTargetScaler)
+
+        past_targets = torch.vstack(
+            [
+                torch.zeros(10),
+                torch.Tensor([0.] * 2 + [1.] * 5 + [2.] * 3),
+                torch.ones(10) * 4
+            ]
+        ).unsqueeze(-1)
+        past_observed_values = torch.vstack(
+            [
+                torch.Tensor([False] * 3 + [True] * 7),
+                torch.Tensor([False] * 2 + [True] * 8),
+                torch.Tensor([True] * 10)
+            ]).unsqueeze(-1).bool()
+        future_targets = torch.ones([3, 10, 1]) * 10
+
+        transformed_past_target, transformed_future_targets, loc, scale = scaler(
+            past_targets, past_observed_values=past_observed_values, future_targets=future_targets
+        )
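+        # 'max_abs' scaling has no loc; scale is the maximum absolute observed
+        # value, with 1 replacing a zero maximum (first, all-zero sequence).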
+        self.assertTrue(torch.equal(transformed_past_target[0], torch.zeros([10, 1])))
+        self.assertTrue(torch.allclose(transformed_past_target[1],
+                                       torch.Tensor([0.] * 2 + [0.5] * 5 + [1.] * 3).unsqueeze(-1)))
+        self.assertTrue(torch.equal(transformed_past_target[2], torch.ones([10, 1])))
+
+        self.assertTrue(torch.equal(transformed_future_targets[0], torch.ones([10, 1]) * 10))
+        self.assertTrue(torch.equal(transformed_future_targets[1], torch.ones([10, 1]) * 5.))
+        self.assertTrue(torch.equal(transformed_future_targets[2], torch.ones([10, 1]) * 2.5))
+
+        self.assertIsNone(loc)
+        self.assertTrue(
+            torch.equal(scale, torch.Tensor([1., 2., 4.]).reshape([len(past_targets), 1, past_targets.shape[-1]]))
+        )
+
+        transformed_past_target, transformed_future_targets, loc, scale = scaler(past_targets,
+                                                                                 future_targets=future_targets)
+        self.assertTrue(torch.equal(transformed_past_target[0], torch.zeros([10, 1])))
+        self.assertTrue(torch.equal(transformed_past_target[1],
+                                    torch.Tensor([0.] * 2 + [0.5] * 5 + [1.] * 3).unsqueeze(-1)))
+        self.assertTrue(torch.equal(transformed_past_target[2], torch.ones([10, 1])))
+
+        self.assertTrue(torch.equal(transformed_future_targets[0], torch.ones([10, 1]) * 10))
+        self.assertTrue(torch.equal(transformed_future_targets[1], torch.ones([10, 1]) * 5))
+        self.assertTrue(torch.equal(transformed_future_targets[2], torch.ones([10, 1]) * 2.5))
+
+        self.assertIsNone(loc)
+        self.assertTrue(
+            torch.equal(scale, torch.Tensor([1., 2., 4.]).reshape([len(past_targets), 1, past_targets.shape[-1]]))
+        )
+
+        transformed_past_target_full, transformed_future_targets_full, loc_full, scale_full = scaler(
+            past_targets, past_observed_values=torch.ones([3, 10, 1], dtype=torch.bool), future_targets=future_targets
+        )
+        self.assertTrue(torch.equal(transformed_past_target, transformed_past_target_full))
+        self.assertTrue(torch.equal(transformed_future_targets, transformed_future_targets_full))
+        self.assertIsNone(loc_full)
+        self.assertTrue(torch.equal(scale, scale_full))
diff --git a/test/test_pipeline/components/setup/forecasting/test_forecasting_training_losses.py b/test/test_pipeline/components/setup/forecasting/test_forecasting_training_losses.py
new file mode 100644
index 000000000..1b795a0cd
--- /dev/null
+++ b/test/test_pipeline/components/setup/forecasting/test_forecasting_training_losses.py
@@ -0,0 +1,100 @@
+import copy
+import unittest
+
+from autoPyTorch.constants import TASK_TYPES_TO_STRING, TIMESERIES_FORECASTING
+from autoPyTorch.pipeline.components.setup.forecasting_training_loss import ForecastingLossChoices
+from autoPyTorch.pipeline.components.setup.forecasting_training_loss.DistributionLoss import DistributionLoss
+from autoPyTorch.pipeline.components.setup.forecasting_training_loss.QuantileLoss import NetworkQuantileLoss
+from autoPyTorch.pipeline.components.setup.forecasting_training_loss.RegressionLoss import RegressionLoss
+from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distribution import (
+    ALL_DISTRIBUTIONS,
+    DisForecastingStrategy
+)
+from autoPyTorch.pipeline.components.training.losses import (
+    L1Loss,
+    LogProbLoss,
+    MAPELoss,
+    MASELoss,
+    MSELoss,
+    QuantileLoss
+)
+
+
+class TestForecastingTrainingLoss(unittest.TestCase):
+    def test_get_set_config_space(self):
+        """Make sure that we can set up a valid choice in the training-loss choice"""
+        dataset_properties = {'task_type': TASK_TYPES_TO_STRING[TIMESERIES_FORECASTING]}
+        loss_choice = ForecastingLossChoices(dataset_properties)
+        cs = loss_choice.get_hyperparameter_search_space(dataset_properties)
+
+        # Make sure that all hyperparameters are part of the search space
+        self.assertListEqual(
+            sorted(cs.get_hyperparameter('__choice__').choices),
+            sorted(list(loss_choice.get_components().keys()))
+        )
+
+        # Make sure we can properly set some random configs.
+        # Whereas just one iteration will make sure the algorithm works,
+        # doing five iterations increases the confidence. We will be able to
+        # catch component-specific crashes
+        for i in range(5):
+            config = cs.sample_configuration()
+            config_dict = copy.deepcopy(config.get_dictionary())
+            loss_choice.set_hyperparameters(config)
+
+            self.assertEqual(loss_choice.choice.__class__,
+                             loss_choice.get_components()[config_dict['__choice__']])
+
+        include = ['DistributionLoss', 'QuantileLoss']
+        cs = loss_choice.get_hyperparameter_search_space(dataset_properties=dataset_properties, include=include)
+        self.assertListEqual(
+            sorted(cs.get_hyperparameter('__choice__').choices),
+            sorted(include),
+        )
+
+    def test_distribution_loss(self):
+        for dist_cls in ALL_DISTRIBUTIONS.keys():
+            loss = DistributionLoss(dist_cls)
+            self.assertEqual(loss.dist_cls, dist_cls)
+
+            dataset_properties = {'task_type': TASK_TYPES_TO_STRING[TIMESERIES_FORECASTING]}
+            fit_dictionary = {'dataset_properties': dataset_properties}
+            loss = loss.fit(fit_dictionary)
+            fit_dictionary = loss.transform(fit_dictionary)
+
+            self.assertEqual(fit_dictionary['loss'], LogProbLoss)
+            self.assertEqual(fit_dictionary['required_padding_value'], ALL_DISTRIBUTIONS[dist_cls].value_in_support)
+            self.assertIsInstance(fit_dictionary['dist_forecasting_strategy'], DisForecastingStrategy)
+
+    def test_quantile_loss(self):
+        lower = 0.2
+        upper = 0.8
+        loss = NetworkQuantileLoss(lower_quantile=lower, upper_quantile=upper)
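+        # The component is expected to predict the median alongside the two
+        # requested quantiles, hence the leading 0.5 below.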
+        self.assertEqual(loss.quantiles, [0.5, lower, upper])
+
+        dataset_properties = {'task_type': TASK_TYPES_TO_STRING[TIMESERIES_FORECASTING]}
+        fit_dictionary = {'dataset_properties': dataset_properties}
+        loss = loss.fit(fit_dictionary)
+        fit_dictionary = loss.transform(fit_dictionary)
+        train_loss = fit_dictionary['loss']()
+
+        self.assertIsInstance(train_loss, QuantileLoss)
+        self.assertListEqual(train_loss.quantiles, loss.quantiles)
+        self.assertListEqual(fit_dictionary['quantile_values'], loss.quantiles)
+
+    def test_regression_loss(self):
+        loss_dict = dict(l1=L1Loss,
+                         mse=MSELoss,
+                         mape=MAPELoss,
+                         mase=MASELoss)
+        for loss_name, loss_type in loss_dict.items():
+            loss = RegressionLoss(loss_name)
+
+            dataset_properties = {'task_type': TASK_TYPES_TO_STRING[TIMESERIES_FORECASTING]}
+            fit_dictionary = {'dataset_properties': dataset_properties}
+            loss = loss.fit(fit_dictionary)
+            fit_dictionary = loss.transform(fit_dictionary)
+            train_loss = fit_dictionary['loss']
+
+            self.assertEqual(train_loss, loss_type)
diff --git a/test/test_pipeline/components/setup/test_setup.py b/test/test_pipeline/components/setup/test_setup.py
index 9d66953b2..e4b8deeb4 100644
--- a/test/test_pipeline/components/setup/test_setup.py
+++ b/test/test_pipeline/components/setup/test_setup.py
@@ -317,12 +317,10 @@ class TestNetworkBackbone:
     def test_all_backbones_available(self):
         backbone_choice = NetworkBackboneChoice(dataset_properties={})
 
-        assert len(backbone_choice.get_components().keys()) == 8
+        assert len(backbone_choice.get_components().keys()) == 6
 
     @pytest.mark.parametrize('task_type_input_shape', [(constants.IMAGE_CLASSIFICATION, (3, 64, 64)),
                                                        (constants.IMAGE_REGRESSION, (3, 64, 64)),
-                                                       (constants.TIMESERIES_CLASSIFICATION, (32, 6)),
-                                                       (constants.TIMESERIES_REGRESSION, (32, 6)),
                                                        (constants.TABULAR_CLASSIFICATION, (100,)),
                                                        (constants.TABULAR_REGRESSION, (100,))])
     def test_dummy_forward_backward_pass(self, task_type_input_shape):
@@ -369,8 +367,7 @@ def test_dummy_forward_backward_pass(self, task_type_input_shape):
 
     def test_every_backbone_is_valid(self):
         backbone_choice = NetworkBackboneChoice(dataset_properties={})
-
-        assert len(backbone_choice.get_components().keys()) == 8
+        assert len(backbone_choice.get_components().keys()) == 6
 
         for name, backbone in backbone_choice.get_components().items():
             config = backbone.get_hyperparameter_search_space().sample_configuration()
@@ -401,6 +398,9 @@ def test_get_set_config_space(self):
         """
         network_backbone_choice = NetworkBackboneChoice(dataset_properties={})
         for task_type in constants.TASK_TYPES:
+            if task_type in constants.FORECASTING_TASKS:
+                # Forecasting tasks have their own backbones
+                continue
             dataset_properties = {"task_type": constants.TASK_TYPES_TO_STRING[task_type]}
             cs = network_backbone_choice.get_hyperparameter_search_space(dataset_properties)
@@ -506,8 +506,6 @@ def test_all_heads_available(self):
 
     @pytest.mark.parametrize('task_type_input_output_shape', [(constants.IMAGE_CLASSIFICATION, (3, 64, 64), (5,)),
                                                               (constants.IMAGE_REGRESSION, (3, 64, 64), (1,)),
-                                                              (constants.TIMESERIES_CLASSIFICATION, (32, 6), (5,)),
-                                                              (constants.TIMESERIES_REGRESSION, (32, 6), (1,)),
                                                               (constants.TABULAR_CLASSIFICATION, (100,), (5,)),
                                                               (constants.TABULAR_REGRESSION, (100,), (1,))])
     def test_dummy_forward_backward_pass(self, task_type_input_output_shape):
diff --git a/test/test_pipeline/components/setup/test_setup_preprocessing_node.py b/test/test_pipeline/components/setup/test_setup_preprocessing_node.py
index 0fc0bb4c0..1ec858864 100644
--- a/test/test_pipeline/components/setup/test_setup_preprocessing_node.py
+++ b/test/test_pipeline/components/setup/test_setup_preprocessing_node.py
@@ -23,7 +23,7 @@ def setUp(self):
         dataset = mock.MagicMock()
         dataset.__len__.return_value = 1
         datamanager = mock.MagicMock()
-        datamanager.get_dataset_for_training.return_value = (dataset, dataset)
+        datamanager.get_dataset.return_value = (dataset, dataset)
         datamanager.train_tensors = (np.random.random((10, 15)), np.random.random(10))
         datamanager.test_tensors = None
         self.backend.load_datamanager.return_value = datamanager
diff --git a/test/test_pipeline/components/training/test_forecasting_training.py b/test/test_pipeline/components/training/test_forecasting_training.py
new file mode 100644
index 000000000..3780ea206
--- /dev/null
+++ b/test/test_pipeline/components/training/test_forecasting_training.py
@@ -0,0 +1,23 @@
+import unittest
+
+from autoPyTorch.constants import FORECASTING_BUDGET_TYPE
+from autoPyTorch.pipeline.components.training.trainer.forecasting_trainer import ForecastingTrainerChoice
+
+
+class TestGetBudgetTracker(unittest.TestCase):
+    def test_get_budget_tracker(self):
+        trainer = ForecastingTrainerChoice({})
+        max_epoch = 50
+
+        X = {'budget_type': 'epochs',
+             'epochs': 5,
+             }
+        budget_tracker = trainer.get_budget_tracker(X)
+        self.assertEqual(budget_tracker.max_epochs, 5)
+
+        for budget_type in FORECASTING_BUDGET_TYPE:
+            budget_tracker = trainer.get_budget_tracker({'budget_type': budget_type})
+            self.assertEqual(budget_tracker.max_epochs, max_epoch)
+
+        budget_tracker = trainer.get_budget_tracker({'budget_type': 'runtime'})
+        self.assertIsNone(budget_tracker.max_epochs)
diff --git a/test/test_pipeline/components/training/test_time_series_data_loader.py b/test/test_pipeline/components/training/test_time_series_data_loader.py
new file mode 100644
index 000000000..f17c4e10e
--- /dev/null
+++ b/test/test_pipeline/components/training/test_time_series_data_loader.py
@@ -0,0 +1,496 @@
+import copy
+import unittest
+import unittest.mock
+from typing import List
+from unittest import mock
+
+import numpy as np
+
+import pandas as pd
+
+import torch
+
+import torchvision
+
+from autoPyTorch.datasets.resampling_strategy import HoldOutFuncs, HoldoutValTypes
+from autoPyTorch.datasets.time_series_dataset import TimeSeriesForecastingDataset, TimeSeriesSequence
+from autoPyTorch.pipeline.components.training.data_loader.time_series_forecasting_data_loader import (
+    TimeSeriesForecastingDataLoader
+)
+from autoPyTorch.pipeline.components.training.data_loader.time_series_util import (
+    PadSequenceCollector,
+    SequentialSubSetSampler,
+    TestSequenceDataset,
+    TimeSeriesSampler,
+    pad_sequence_with_minimal_length
+)
+from autoPyTorch.utils.common import HyperparameterSearchSpace
+
+
+class TestTimeSeriesForecastingDataLoader(unittest.TestCase):
+    def setUp(self) -> None:
+        feature_names = ['f1']
+        feature_shapes = {'f1': 1}
+        known_future_features = ('f1',)
+        freq = '1Y'
+        n_prediction_steps = 3
+
+        sequence_lengths_train = [10, 20, 30, 40, 50, 60, 70, 80, 90, 1000]
+
+        backend = unittest.mock.Mock()
+        n_repeats = 2
+
+        with mock.patch('autoPyTorch.datasets.time_series_dataset.TimeSeriesForecastingDataset') as MockDataSet:
+            mockdataset = MockDataSet.return_value
+            mockdataset.holdout_validators = HoldOutFuncs.get_holdout_validators(
+                HoldoutValTypes.time_series_hold_out_validation
+            )
+            datasets = []
+            mockdataset.sequence_lengths_train = sequence_lengths_train
+            for seq_len in sequence_lengths_train:
+                mock_ser = mock.MagicMock()
+                mock_ser.__len__.return_value = seq_len
+                datasets.append(mock_ser)
+            mockdataset.datasets = datasets
+            mockdataset.n_prediction_steps = n_prediction_steps
+
+            split = TimeSeriesForecastingDataset.create_holdout_val_split(
+                mockdataset,
+                HoldoutValTypes.time_series_hold_out_validation,
+                0.1,
+                n_repeats=n_repeats)
+
+        with mock.patch('autoPyTorch.datasets.time_series_dataset.TimeSeriesForecastingDataset') as MockDataSet:
+            dataset = MockDataSet.return_value
+
+            dataset.__len__.return_value = sum(sequence_lengths_train)
+            datamanager = unittest.mock.MagicMock()
+            datamanager.get_dataset.return_value = dataset
+            datamanager.feature_names = ['f1']
+            datamanager.splits.__getitem__.return_value = split
+
+        dataset_properties = dict(feature_names=feature_names,
+                                  feature_shapes=feature_shapes,
+                                  known_future_features=known_future_features,
+                                  freq=freq,
+                                  is_small_preprocess=True,
+                                  uni_variant=False,
+                                  time_feature_transform=True,
+                                  sequence_lengths_train=sequence_lengths_train,
+                                  n_prediction_steps=n_prediction_steps,
+                                  n_repeats=n_repeats)
+
+        self.n_prediction_steps = n_prediction_steps
+
+        backend.load_datamanager.return_value = datamanager
+        self.fit_dictionary = {
+            'dataset_properties': dataset_properties,
+            'lagged_value': [1, 2, 3],
+            'X_train': pd.DataFrame([0.] * sum(sequence_lengths_train)),
+            'y_train': pd.DataFrame([0.] * sum(sequence_lengths_train)),
+            'train_indices': split[0],
+            'test_indices': split[1],
+            'working_dir': '/tmp',
+            'backend': backend,
+            'split_id': 0,
+        }
+
+    def test_get_set_config_space(self):
+        """
+        Makes sure that the configuration space of the base data loader
+        is properly working"""
+        loader = TimeSeriesForecastingDataLoader()
+
+        dataset_properties = {'seq_length_max': 70}
+        cs = loader.get_hyperparameter_search_space(dataset_properties)
+        self.assertEqual(cs.get_hyperparameter('window_size').upper, 50)
+
+        dataset_properties = {'seq_length_max': 25}
+        cs = loader.get_hyperparameter_search_space(dataset_properties)
+        self.assertEqual(cs.get_hyperparameter('window_size').upper, 25)
+        self.assertEqual(cs.get_hyperparameter('window_size').default_value, 25)
+
+        dataset_properties = {'seq_length_max': 20}
+        cs = loader.get_hyperparameter_search_space(dataset_properties)
+        self.assertEqual(cs.get_hyperparameter('window_size').upper, 20)
+        self.assertEqual(cs.get_hyperparameter('window_size').lower, 1)
+
+        dataset_properties = {'seq_length_max': 10}
+        cs = loader.get_hyperparameter_search_space(dataset_properties)
+        self.assertEqual(cs.get_hyperparameter('window_size').upper, 10)
+        self.assertEqual(cs.get_hyperparameter('window_size').lower, 1)
+
+        cs = loader.get_hyperparameter_search_space(dataset_properties,
+                                                    window_size=HyperparameterSearchSpace(hyperparameter='window_size',
+                                                                                          value_range=(2, 5),
+                                                                                          default_value=3))
+
+        self.assertEqual(cs.get_hyperparameter('window_size').upper, 5)
+        self.assertEqual(cs.get_hyperparameter('window_size').lower, 2)
+
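+        # window_size and backcast_period are alternative ways of fixing the
+        # look-back horizon, so a sampled configuration should contain exactly
+        # one of the two hyperparameters.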
+        for _ in range(5):
+            sample = cs.sample_configuration()
+            self.assertTrue(
+                ('backcast_period' in sample) ^ ('window_size' in sample)
+            )
+
+    def test_base_fit(self):
+        """ Makes sure that fit and transform work as intended """
+        fit_dictionary = copy.copy(self.fit_dictionary)
+
+        # Mock child classes requirements
+        loader = TimeSeriesForecastingDataLoader()
+        loader.build_transform = unittest.mock.Mock()
+        loader._check_transform_requirements = unittest.mock.Mock()
+
+        loader.fit(fit_dictionary)
+
+        # Fit means that we created the data loaders
+        self.assertIsInstance(loader.train_data_loader, torch.utils.data.DataLoader)
+        self.assertIsInstance(loader.val_data_loader, torch.utils.data.DataLoader)
+
+        # transform adds the loaders to the fit dictionary
+        transformed_fit_dictionary = loader.transform(fit_dictionary)
+        self.assertIn('train_data_loader', transformed_fit_dictionary)
+        self.assertIn('val_data_loader', transformed_fit_dictionary)
+
+        self.assertEqual(transformed_fit_dictionary['train_data_loader'],
+                         loader.train_data_loader)
+        self.assertEqual(transformed_fit_dictionary['val_data_loader'],
+                         loader.val_data_loader)
+        self.assertEqual(transformed_fit_dictionary['window_size'], loader.window_size)
+
+    def test_build_transform_small_preprocess_true(self):
+        """
+        Makes sure a proper composition is created
+        """
+        loader = TimeSeriesForecastingDataLoader()
+        fit_dictionary = copy.deepcopy(self.fit_dictionary)
+        fit_dictionary['dataset_properties']['is_small_preprocess'] = True
+        for thing in ['imputer', 'scaler', 'encoder']:
+            fit_dictionary[thing] = [unittest.mock.Mock()]
+
+        compose = loader.build_transform(fit_dictionary, mode='train')
+
+        self.assertIsInstance(compose, torchvision.transforms.Compose)
+
+        # No preprocessing needed here as it was done before
+        self.assertEqual(len(compose.transforms), 1)
+
+    def test_build_transform_small_preprocess_false(self):
+        """
+        Makes sure a proper composition is created
+        """
+        loader = TimeSeriesForecastingDataLoader()
+        fit_dictionary = copy.deepcopy(self.fit_dictionary)
+        fit_dictionary['dataset_properties']['is_small_preprocess'] = False
+        fit_dictionary['preprocess_transforms'] = [unittest.mock.Mock()]
+
+        compose = loader.build_transform(fit_dictionary, mode='train')
+
+        self.assertIsInstance(compose, torchvision.transforms.Compose)
+
+        # We expect the expand_transformer and Mock
+        self.assertEqual(len(compose.transforms), 2)
+
+    def test_adjust_window_size(self):
+        window_size = 2
+        n_prediction_steps = 5
+        backcast_period = 3
+        time_series_dataloader = TimeSeriesForecastingDataLoader(batch_size=1,
+                                                                 window_size=window_size,
+                                                                 n_prediction_steps=n_prediction_steps)
+        self.assertEqual(time_series_dataloader.window_size, window_size)
+
+        time_series_dataloader = TimeSeriesForecastingDataLoader(batch_size=1,
+                                                                 backcast=True,
+                                                                 backcast_period=backcast_period,
+                                                                 window_size=window_size,
+                                                                 n_prediction_steps=n_prediction_steps)
+        self.assertEqual(time_series_dataloader.window_size, backcast_period * n_prediction_steps)
+
+        sample_interval = 3
+        self.assertEqual(time_series_dataloader.adjust_window_size(sample_interval),
+                         (backcast_period * n_prediction_steps) // sample_interval)
+
+    @mock.patch("autoPyTorch.pipeline.components.training.data_loader.time_series_util.TimeSeriesSampler.__init__",
+                spec=True)
+    def test_compute_expected_num_instances_per_seq(self, sampler_mock_init):
+        sampler_mock_init.return_value = None
+        batch_size = 5
+        window_size = 5
+        num_batches_per_epoch = 4
+        time_series_dataloader = TimeSeriesForecastingDataLoader(batch_size=batch_size,
+                                                                 window_size=window_size,
+                                                                 num_batches_per_epoch=num_batches_per_epoch)
+        fit_dictionary = copy.copy(self.fit_dictionary)
+        time_series_dataloader = time_series_dataloader.fit(fit_dictionary)
+
+        self.assertEqual(time_series_dataloader.window_size, window_size)
+        self.assertEqual(time_series_dataloader.known_future_features_index, (0,))
+
+        sampler = time_series_dataloader.sampler_train
+        self.assertIsInstance(sampler, TimeSeriesSampler)
+        train_split = fit_dictionary['train_indices']
+        self.assertEqual(len(train_split), len(sampler_mock_init.call_args[1]['indices']))
+
+        train_seq_length = fit_dictionary['dataset_properties']['sequence_lengths_train']
+
+        seq_lengths = []
+        for train_seq_len in train_seq_length:
+            n_train_seq = len(
+                HoldOutFuncs.time_series_hold_out_validation(
+                    None, None,
+                    np.arange(train_seq_len),
+                    n_prediction_steps=fit_dictionary['dataset_properties']['n_prediction_steps'],
+                    n_repeats=fit_dictionary['dataset_properties']['n_repeats'])[0])
+            if n_train_seq > 0:
+                seq_lengths.append(n_train_seq)
+        self.assertTrue(np.all(seq_lengths == sampler_mock_init.call_args[1]['seq_lengths']))
+
+        num_instances_per_seqs_full = sampler_mock_init.call_args[1]['num_instances_per_seqs']
+        unique_num_instances_per_seqs = np.unique(num_instances_per_seqs_full)
+        self.assertEqual(len(unique_num_instances_per_seqs), 1)
+
+        self.assertAlmostEqual(unique_num_instances_per_seqs.item(),
+                               num_batches_per_epoch * batch_size / len(seq_lengths))
+
+        self.assertEqual(sampler_mock_init.call_args[1]['min_start'],
+                         fit_dictionary['dataset_properties']['n_prediction_steps'])
+
+        num_instances_dataset = sum(train_seq_length)
+        seq_train_length = seq_lengths
+        min_start = fit_dictionary['dataset_properties']['n_prediction_steps']
+
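+        # Under the default (sequence-uniform) strategy, each sequence is expected to
+        # contribute batch_size * num_batches_per_epoch / n_sequences samples per epoch.
+        # fraction_seq below restricts sampling to a fraction of the sequences, whereas
+        # fraction_samples_per_seq scales the per-sequence sample count instead.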
+        fraction_seq = 0.3
+        num_instances_per_seqs_frac_seq = time_series_dataloader.compute_expected_num_instances_per_seq(
+            num_instances_dataset,
+            seq_train_length,
+            min_start,
+            fraction_seq)
+        instances_to_be_sampled = np.where(num_instances_per_seqs_frac_seq)[0]
+        self.assertEqual(len(instances_to_be_sampled), int(np.ceil(fraction_seq * len(seq_train_length))))
+        self.assertAlmostEqual(np.unique(num_instances_per_seqs_frac_seq[instances_to_be_sampled]),
+                               unique_num_instances_per_seqs)
+
+        fraction_samples_per_seq = 0.3
+        num_instances_per_seqs_frac_per_seq = time_series_dataloader.compute_expected_num_instances_per_seq(
+            num_instances_dataset,
+            seq_train_length,
+            min_start,
+            fraction_samples_per_seq=fraction_samples_per_seq)
+        self.assertTrue(np.allclose(num_instances_per_seqs_frac_per_seq,
+                                    fraction_samples_per_seq * num_instances_per_seqs_full))
+
+        time_series_dataloader.sample_strategy = 'LengthUniform'
+
+        seq_lengths_reduced = np.asarray(seq_lengths) - min_start
+        seq_lengths_reduced = np.where(seq_lengths_reduced <= 0, 0, seq_lengths_reduced)
+
+        num_instances_per_seqs_full = time_series_dataloader.compute_expected_num_instances_per_seq(
+            num_instances_dataset,
+            seq_train_length,
+            min_start)
+
+        self.assertTrue(
+            np.allclose(num_instances_per_seqs_full,
+                        batch_size * num_batches_per_epoch * seq_lengths_reduced / np.sum(seq_lengths_reduced))
+        )
+
+        fraction_seq = 0.3
+        num_instances_per_seqs_frac_seq = time_series_dataloader.compute_expected_num_instances_per_seq(
+            num_instances_dataset,
+            seq_train_length,
+            min_start, fraction_seq)
+        instances_to_be_sampled = np.where(num_instances_per_seqs_frac_seq)[0]
+
+        self.assertTrue(np.allclose(np.unique(num_instances_per_seqs_frac_seq[instances_to_be_sampled]),
+                                    num_instances_per_seqs_full[instances_to_be_sampled]))
+
+        fraction_samples_per_seq = 0.3
+        num_instances_per_seqs_frac_per_seq = time_series_dataloader.compute_expected_num_instances_per_seq(
+            num_instances_dataset,
+            seq_train_length,
+            min_start,
+            fraction_samples_per_seq=fraction_samples_per_seq)
+        self.assertTrue(np.allclose(num_instances_per_seqs_frac_per_seq,
+                                    fraction_samples_per_seq * num_instances_per_seqs_full))
+
+    @mock.patch("autoPyTorch.pipeline.components.training.data_loader.time_series_util.TestSequenceDataset.__init__",
+                spec=True)
+    def test_get_loader(self, loader_init_mock):
+        loader_init_mock.return_value = None
+        batch_size = 5
+        window_size = 5
+        num_batches_per_epoch = 4
+        time_series_dataloader = TimeSeriesForecastingDataLoader(batch_size=batch_size,
+                                                                 window_size=window_size,
+                                                                 num_batches_per_epoch=num_batches_per_epoch)
+        fit_dictionary = copy.copy(self.fit_dictionary)
+        time_series_dataloader.fit(fit_dictionary)
+        x_test = TimeSeriesSequence(X=np.array([1, 2, 3, 4, 5]),
+                                    Y=np.array([1, 2, 3, 4, 5]),
+                                    X_test=np.array([1, 2, 3]))
+        test_loader = time_series_dataloader.get_loader(X=copy.deepcopy(x_test))
+        self.assertIsInstance(test_loader, torch.utils.data.DataLoader)
+        self.assertIsInstance(test_loader.dataset, TestSequenceDataset)
+        test_set = loader_init_mock.call_args[0][0]
+        self.assertIsInstance(test_set, List)
+        self.assertEqual(len(test_set), 1)
+
+        x_test = [x_test, x_test]
+        _ = time_series_dataloader.get_loader(X=copy.deepcopy(x_test))
+        test_set = loader_init_mock.call_args[0][0]
+        self.assertEqual(len(test_set), len(x_test))
+
+        for seq in test_set:
+            self.assertIsInstance(seq, TimeSeriesSequence)
+            self.assertTrue(seq.is_test_set)
+            self.assertEqual(seq.freq, time_series_dataloader.freq)
+
+        class DummyEncoder:
+            def fit(self, data):
+                return self
+
+            def transform(self, data: pd.DataFrame):
+                return np.concatenate([data.values, data.values], axis=-1)
+
+        transform = DummyEncoder()
+        time_series_dataloader.feature_preprocessor = transform
+        x_test_copy = copy.deepcopy(x_test)
+        _ = time_series_dataloader.get_loader(X=x_test_copy)
+
+        test_set = loader_init_mock.call_args[0][0]
+        for seq_raw, seq in zip(x_test, test_set):
+            self.assertTrue(seq.X.shape[-1] == 2 * seq_raw.X.shape[-1])
+
+        # ensure that we do not transform the dataset twice
+        _ = time_series_dataloader.get_loader(X=x_test_copy)
+        test_set = loader_init_mock.call_args[0][0]
+        for seq_raw, seq in zip(x_test, test_set):
+            self.assertTrue(seq.X.shape[-1] == 2 * seq_raw.X.shape[-1])
+
+
+class TestTimeSeriesUtil(unittest.TestCase):
+    def test_test_seq_length(self):
+        x_test = TimeSeriesSequence(X=np.array([0, 1, 2, 3, 4]),
+                                    Y=np.array([1, 2, 3, 4, 5]),
+                                    X_test=np.array([1, 2, 3]),
+                                    n_prediction_steps=3,
+                                    is_test_set=True)
+        x_test = [x_test, x_test]
+        test_set = TestSequenceDataset(x_test)
+        self.assertEqual(len(test_set), len(x_test))
+        self.assertTrue(np.allclose(test_set[0][0]['past_targets'].numpy(), x_test[0].Y))
+
+    def test_pad_sequence_with_minimal_length(self):
+        sequences = [torch.ones([10, 1]),
+                     torch.ones([3, 1]),
+                     torch.ones([17, 1])]
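+        # The assertions below suggest that sequences are left-padded up to the
+        # length of the longest one (here 17) and, when a maximal length is
+        # given, truncated at the front down to that cap.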
+        pad_seq_1 = pad_sequence_with_minimal_length(sequences, 5)
+        self.assertEqual(list(pad_seq_1.shape), [3, 17, 1])
+        self.assertTrue(torch.all(pad_seq_1[0] == torch.tensor([0.] * 7 + [1.] * 10).unsqueeze(-1)))
+
+        pad_seq_2 = pad_sequence_with_minimal_length(sequences, 5, batch_first=False)
+        self.assertEqual(list(pad_seq_2.shape), [17, 3, 1])
+        self.assertTrue(torch.all(pad_seq_2[:, 0] == torch.tensor([0.] * 7 + [1.] * 10).unsqueeze(-1)))
+
+        pad_seq_3 = pad_sequence_with_minimal_length(sequences, 5, padding_value=0.5)
+        self.assertTrue(torch.all(pad_seq_3[0] == torch.tensor([0.5] * 7 + [1.] * 10).unsqueeze(-1)))
+
+        pad_seq_4 = pad_sequence_with_minimal_length(sequences, 5, 10)
+        self.assertEqual(list(pad_seq_4.shape), [3, 10, 1])
+        self.assertTrue(torch.all(pad_seq_4[0] == torch.ones(10).unsqueeze(-1)))
+        self.assertTrue(torch.all(pad_seq_4[1] == torch.tensor([0] * 7 + [1.] * 3).unsqueeze(-1)))
+        self.assertTrue(torch.all(pad_seq_4[2] == torch.ones(10).unsqueeze(-1)))
+
+        pad_seq_5 = pad_sequence_with_minimal_length(sequences, 20)
+        self.assertEqual(list(pad_seq_5.shape), [3, 20, 1])
+        self.assertTrue(torch.all(pad_seq_5[0] == torch.tensor([0] * 10 + [1.] * 10).unsqueeze(-1)))
+        self.assertTrue(torch.all(pad_seq_5[1] == torch.tensor([0] * 17 + [1.] * 3).unsqueeze(-1)))
+        self.assertTrue(torch.all(pad_seq_5[2] == torch.tensor([0] * 3 + [1.] * 17).unsqueeze(-1)))
+
+        sequences = [torch.ones(3, dtype=torch.bool),
+                     torch.ones(15, dtype=torch.bool)]
+        pad_seq_6 = pad_sequence_with_minimal_length(sequences, 5)
+        self.assertTrue(pad_seq_6.dtype == torch.bool)
+        self.assertTrue(torch.all(pad_seq_6[0] == torch.tensor([False] * 12 + [True] * 3, dtype=torch.bool)))
+
+    def test_pad_sequence_controller(self):
+        window_size = 3
+        seq_max_length = 5
+        target_padding_value = 0.5
+        pad_seq_controller = PadSequenceCollector(window_size=window_size,
+                                                  sample_interval=1,
+                                                  target_padding_value=target_padding_value,
+                                                  seq_max_length=seq_max_length)
+        n_prediction_steps = 2
+        seq = TimeSeriesSequence(np.arange(10).astype(np.float64), np.arange(10).astype(np.float64),
+                                 n_prediction_steps=n_prediction_steps)
+        features_padded = pad_seq_controller([seq[0][0], seq[-1][0]])
+        past_targets = features_padded['past_targets']
+        past_features = features_padded['past_features']
+        self.assertEqual(list(past_targets.shape), [2, seq_max_length])
+        self.assertEqual(list(past_features.shape), [2, seq_max_length, 1])
+        self.assertTrue(features_padded['past_observed_targets'].dtype == torch.bool)
+        self.assertTrue(features_padded['decoder_lengths'].dtype == torch.int64)
+
+        self.assertTrue(torch.all(torch.ones(seq_max_length - 1) * target_padding_value == past_targets[0, :-1]))
+        self.assertTrue(torch.all(torch.zeros(seq_max_length - 1) == past_features[0, :-1]))
+
+        targets_padded = pad_seq_controller([seq[0][1], seq[-1][1]])
+        self.assertEqual(list(targets_padded['future_targets'].shape), [2, n_prediction_steps])
+
+        features_padded = pad_seq_controller([seq[0][0], seq[0][0]])
+        self.assertEqual(list(features_padded['past_targets'].shape), [2, window_size])
+
+        pad_seq_controller.sample_interval = 2
+        features_padded = pad_seq_controller([seq[0][0], seq[-1][0]])
+        self.assertEqual(list(features_padded['past_targets'].shape), [2, 3])
+
+        self.assertTrue(torch.all(
+            pad_seq_controller([{'x': 0}, {'x': 1}])['x'] == torch.Tensor([0, 1]))
+        )
+        self.assertTrue(torch.all(
+            pad_seq_controller([{'x': np.array(0)}, {'x': np.array(1)}])['x'] == torch.Tensor([0, 1]))
+        )
+
+    def test_time_series_sampler(self):
+        indices = np.arange(100)
+        seq_lengths = [5, 10, 15, 20, 50]
+        num_instances_per_seqs = [3.3, 1.3, 0.0, 10, 20.1]
+
+        sampler = TimeSeriesSampler(indices, seq_lengths, num_instances_per_seqs, min_start=2)
+        self.assertEqual(sampler.num_instances, int(np.round(np.sum(num_instances_per_seqs))))
+        # The first sequence does not contain enough data to allow 3.3 samples, so it only has 1 interval.
+        # For the others, the number of interval boundaries should be np.floor(n_inst) + 1
+        # (resulting in np.floor(n_inst) intervals)
+        self.assertEqual(list(map(len, sampler.seq_intervals_int)), [1, 2, 1, 10, 21])
+        self.assertTrue(torch.equal(sampler.seq_intervals_decimal, torch.tensor([[2, 5],
+                                                                                 [7, 11],
+                                                                                 [17, 30],
+                                                                                 [32, 33],
+                                                                                 [52, 54]])))
+        self.assertTrue(
+            torch.allclose(sampler.num_expected_ins_decimal,
+                           torch.Tensor(
+                               [3.3000e+00, 3.0000e-01, 1.0000e-08, 1.0000e-08, 1.0000e-01]).type(torch.float64))
+        )
+
+        for _ in range(5):
+            samples = torch.stack(list(sampler)).sort()[0].numpy()
+            for seq_intervals_int in sampler.seq_intervals_int:
+                if len(seq_intervals_int) > 1:
+                    for i in range(len(seq_intervals_int) - 1):
+                        self.assertTrue(
+                            len(np.where((seq_intervals_int[i] < samples) & (samples < seq_intervals_int[i + 1]))) == 1
+                        )
+
+    def test_sequential_sub_set_sampler(self):
+        n_samples = 5
+        n_indices = np.arange(100)
+        sampler = SequentialSubSetSampler(n_indices, n_samples)
+        self.assertEqual(len(sampler), n_samples)
+        self.assertEqual(len(list(sampler)), n_samples)
+
+        sampler = SequentialSubSetSampler(n_indices, 150)
+        self.assertEqual(len(sampler), len(n_indices))
+        self.assertEqual(len(list(sampler)), len(n_indices))
diff --git a/test/test_pipeline/components/training/test_training.py b/test/test_pipeline/components/training/test_training.py
index 8ae2759db..6deda30ad 100644
--- a/test/test_pipeline/components/training/test_training.py
+++ b/test/test_pipeline/components/training/test_training.py
@@ -108,7 +108,7 @@ def test_fit_transform(self):
         dataset = unittest.mock.MagicMock()
         dataset.__len__.return_value = 1
         datamanager = unittest.mock.MagicMock()
-        datamanager.get_dataset_for_training.return_value = (dataset, dataset)
+        datamanager.get_dataset.return_value = (dataset, dataset)
         fit_dictionary['backend'].load_datamanager.return_value = datamanager
 
         # Mock child classes requirements
@@ -236,6 +236,43 @@ def test_train_step(self):
             lr = optimizer.param_groups[0]['lr']
             assert lr == target_lr
 
+    def test_train_epoch_no_step(self):
+        """
+        This test checks if max runtime is reached
+        for an epoch before any train_step has been
+        completed. In this case we would like to
+        return None for train_loss and an empty
+        dictionary for the metrics.
+        """
+        device = torch.device('cpu')
+        model = torch.nn.Linear(1, 1).to(device)
+        optimizer = torch.optim.Adam(model.parameters(), lr=1)
+        data_loader = unittest.mock.MagicMock(spec=torch.utils.data.DataLoader)
+        ms = [3, 5, 6]
+        params = {
+            'metrics': [],
+            'device': device,
+            'task_type': constants.TABULAR_REGRESSION,
+            'labels': torch.Tensor([]),
+            'metrics_during_training': False,
+            'budget_tracker': BudgetTracker(budget_type='runtime', max_runtime=0),
+            'criterion': torch.nn.MSELoss,
+            'optimizer': optimizer,
+            'scheduler': torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=ms, gamma=2),
+            'model': model,
+            'step_interval': StepIntervalUnit.epoch
+        }
+        trainer = StandardTrainer()
+        trainer.prepare(**params)
+
+        loss, metrics = trainer.train_epoch(
+            train_loader=data_loader,
+            epoch=0,
+            writer=None
+        )
+        assert loss is None
+        assert metrics == {}
+
 
 class TestStandardTrainer(BaseTraining):
     def test_regression_epoch_training(self, n_samples):
@@ -386,7 +423,7 @@ def test_get_set_config_space(self):
 
 
 def test_early_stopping():
-    dataset_properties = {'task_type': 'tabular_classification', 'output_type': 'binary'}
+    dataset_properties = {'task_type': 'tabular_classification', 'output_type': 'binary', 'output_shape': 0}
     trainer_choice = TrainerChoice(dataset_properties=dataset_properties)
 
     def dummy_performance(*args, **kwargs):
diff --git a/test/test_pipeline/test_losses.py b/test/test_pipeline/test_losses.py
index 3eeba6a70..d68f030fb 100644
--- a/test/test_pipeline/test_losses.py
+++ b/test/test_pipeline/test_losses.py
@@ -6,11 +6,18 @@
 from torch import nn
 from torch.nn.modules.loss import _Loss as Loss
 
-from autoPyTorch.pipeline.components.training.losses import get_loss, losses
+from autoPyTorch.pipeline.components.training.losses import (
+    LogProbLoss,
+    MAPELoss,
+    MASELoss,
+    QuantileLoss,
+    get_loss,
+    losses
+)
 from autoPyTorch.utils.implementations import (
     LossWeightStrategyWeighted,
     LossWeightStrategyWeightedBinary,
-    get_loss_weight_strategy,
+    get_loss_weight_strategy
 )
 
 
@@ -45,7 +52,8 @@ def test_get_name_error():
 @pytest.mark.parametrize('loss_details', ['loss_cross_entropy_multiclass',
                                           'loss_cross_entropy_binary',
                                           'loss_bce',
-                                          'loss_mse'], indirect=True)
+                                          'loss_mse',
+                                          'loss_mape'], indirect=True)
 def test_losses(weighted, loss_details):
     dataset_properties, predictions, name, targets, labels = loss_details
     loss = get_loss(dataset_properties=dataset_properties, name=name)
@@ -66,6 +74,7 @@ def test_losses(weighted, loss_details):
 def test_loss_dict():
     assert 'classification' in losses.keys()
     assert 'regression' in losses.keys()
+    assert 'forecasting' in losses.keys()
     for task in losses.values():
         for loss in task.values():
             assert 'module' in loss.keys()
@@ -139,3 +148,60 @@ def test_lossweightstrategyweightedbinary(target, expected_weights):
         torch.from_numpy(target).float(),
         torch.from_numpy(target).float(),
     ) > 0
+
+
+def test_forecasting_losses():
+    target_dims = [2, 3, 1]
+    targets = torch.Tensor([[0.0, 1.0, 2.0],
+                            [0.0, 0.0, 0.0]]).reshape(target_dims)
+    prediction_prob = torch.distributions.normal.Normal(
+        torch.zeros(2, 3, 1),
+        torch.ones(2, 3, 1)
+    )
+    prediction_value = torch.Tensor([[[0.0, 0.0, 0.0],
+                                      [0.5, 0.5, 0.5]]]
+                                    ).reshape(target_dims)
+
+    log_prob_loss_raw = LogProbLoss(reduction="raw")
+    loss_prob_raw = log_prob_loss_raw(prediction_prob, targets)
+    assert torch.allclose(loss_prob_raw, -prediction_prob.log_prob(targets))
+
+    log_prob_loss_mean = LogProbLoss(reduction="mean")
+    loss_prob_mean = log_prob_loss_mean(prediction_prob, targets)
+    assert loss_prob_mean == torch.mean(loss_prob_raw)
+
+    log_prob_loss_sum = LogProbLoss(reduction="sum")
+    loss_prob_sum = log_prob_loss_sum(prediction_prob, targets)
+    assert loss_prob_sum == torch.sum(loss_prob_raw)
+
+    mape_loss = MAPELoss(reduction="raw")
+    loss_mape = mape_loss(prediction_value, targets)
+    assert torch.allclose(loss_mape, torch.Tensor([[0., 1., 1.], [0., 0., 0.]]).reshape(target_dims))
+
+    mase_loss = MASELoss(reduction="raw")
+    loss_mase_1 = mase_loss(prediction_value, targets)
+    assert torch.allclose(loss_mase_1, torch.Tensor([[0., 1., 2.], [0.5, 0.5, 0.5]]).reshape(target_dims))
+
+    mase_loss.set_mase_coefficient(torch.Tensor([[2.0], [1.0]]))
+    loss_mase_2 = mase_loss(prediction_value, targets)
+    assert torch.allclose(loss_mase_2, torch.Tensor([[0., 2., 4.], [0.5, 0.5, 0.5]]).reshape(target_dims))
+
+    mase_loss.set_mase_coefficient(torch.Tensor([[2.0, 2.0]]))
+    with pytest.raises(ValueError, match="If self._mase_coefficient is a Tensor"):
+        _ = mase_loss(prediction_value, targets)
+
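+    # Pinball (quantile) loss: for quantile q and error e = y - y_hat the loss is
+    # max(q * e, (q - 1) * e), so over-predicting by `diff` costs (1 - q) * diff and
+    # under-predicting costs q * diff; this yields the constants asserted below.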
+    quantile_loss = QuantileLoss(reduction="raw")
+    diff = 0.5
+    quantile_prediction = [
+        targets + diff
+    ]
+    loss_quantile_1 = quantile_loss(quantile_prediction, targets)
+    assert torch.all(loss_quantile_1 == diff / 2)
+
+    quantiles = [0.1, 0.5, 0.8]
+    quantile_loss.set_quantiles(quantiles)
+    quantile_prediction = [
+        targets - diff, targets - diff, targets - diff
+    ]
+    loss_quantile_2 = quantile_loss(quantile_prediction, targets)
+    assert torch.allclose(loss_quantile_2, torch.ones_like(loss_quantile_2) * diff * np.mean(quantiles))
diff --git a/test/test_pipeline/test_metrics.py b/test/test_pipeline/test_metrics.py
index 1f9889807..0a40d84bb 100644
--- a/test/test_pipeline/test_metrics.py
+++ b/test/test_pipeline/test_metrics.py
@@ -4,6 +4,7 @@
 
 import sklearn.metrics
 
+import sktime.performance_metrics.forecasting as forecasting_metrics
 
 from autoPyTorch.constants import (
     BINARY,
@@ -12,20 +13,24 @@
     STRING_TO_TASK_TYPES,
     TABULAR_CLASSIFICATION,
     TABULAR_REGRESSION,
-    TASK_TYPES_TO_STRING
+    TASK_TYPES_TO_STRING,
+    TIMESERIES_FORECASTING
+)
+from autoPyTorch.metrics import (
+    accuracy,
+    balanced_accuracy,
+    compute_mase_coefficient,
+    mean_squared_error
 )
-from autoPyTorch.metrics import accuracy, balanced_accuracy, mean_squared_error
 from autoPyTorch.pipeline.components.training.metrics.base import (
+    ForecastingMetricMixin,
+    _ForecastingMetric,
     _PredictMetric,
     _ThresholdMetric,
     autoPyTorchMetric,
-    make_metric,
-)
-from autoPyTorch.pipeline.components.training.metrics.utils import (
-    calculate_loss,
-    calculate_score,
-    get_metrics,
+    make_metric
 )
+from autoPyTorch.pipeline.components.training.metrics.utils import calculate_loss, calculate_score, get_metrics
 
 
 @pytest.mark.parametrize('output_type', ['multiclass',
@@ -48,6 +53,15 @@ def test_get_no_name_regression(output_type):
         assert isinstance(metric, autoPyTorchMetric)
 
 
+@pytest.mark.parametrize('output_type', ['continuous', 'continuous-multioutput'])
+def test_get_no_name_forecasting(output_type):
+    dataset_properties = {'task_type': 'time_series_forecasting',
+                          'output_type': output_type}
+    metrics = get_metrics(dataset_properties)
+    for metric in metrics:
+        assert isinstance(metric, ForecastingMetricMixin)
+
+
 @pytest.mark.parametrize('metric', ['accuracy', 'average_precision',
                                     'balanced_accuracy', 'f1'])
 def test_get_name(metric):
@@ -96,6 +110,37 @@ def test_regression_metrics():
         assert isinstance(score, float)
 
 
+def test_forecasting_metric():
+    # test of all forecasting metrics
+    dataset_properties = {'task_type': TASK_TYPES_TO_STRING[TIMESERIES_FORECASTING],
+                          'output_type': OUTPUT_TYPES_TO_STRING[CONTINUOUS]}
+    n_prediction_steps = 5
+    n_seq = 2
+    n_targets = 2
+
+    y_target = np.zeros([n_seq, n_prediction_steps, n_targets])
+    y_pred = np.ones([n_seq, n_prediction_steps, n_targets])
+    mase_coefficient = np.ones([n_seq, n_prediction_steps, n_targets]) * 2
+    metrics = get_metrics(dataset_properties=dataset_properties, all_supported_metrics=True)
+    forecasting_kwargs = {'sp': 4,
+                          'n_prediction_steps': n_prediction_steps,
+                          'mase_coefficient': mase_coefficient,
+                          }
+    score_dict = calculate_score(y_pred, y_target, STRING_TO_TASK_TYPES[dataset_properties['task_type']], metrics,
+                                 **forecasting_kwargs)
+    assert isinstance(score_dict, dict)
+    for name, score in score_dict.items():
+        assert isinstance(name, str)
+        assert isinstance(score, float)
+    forecasting_kwargs = {'sp': 4,
+                          'n_prediction_steps': n_prediction_steps,
+                          'mase_coefficient': np.ones([1, n_prediction_steps, n_targets]),
+                          }
+    with pytest.raises(ValueError, match="the shape of MASE coefficient and target_shape must be consistent"):
+        score_dict = calculate_score(y_pred, y_target, STRING_TO_TASK_TYPES[dataset_properties['task_type']], metrics,
+                                     **forecasting_kwargs)
+
+
 def test_predictmetric_binary():
     y_true = np.array([0, 0, 1, 1])
     y_pred = np.array([[1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0]])
@@ -155,6 +200,44 @@ def test_threshold_scorer_binary():
     assert score == pytest.approx(-1.0)
 
 
+def test_forecastingcomputation():
+    scorer_mean = _ForecastingMetric(
+        'mean_mape', forecasting_metrics.mean_absolute_percentage_error, 0.0, np.finfo(np.float64).max, 1,
+        kwargs=dict(aggregation='mean'),
+    )
+    scorer_median = _ForecastingMetric(
+        'median_mape', forecasting_metrics.mean_absolute_percentage_error, 0.0, np.finfo(np.float64).max, 1,
+        kwargs=dict(aggregation='median'),
+    )
+
+    n_seq = 3
+    n_prediction_steps = 5
+    n_targets = 2
+
+    y_true = np.expand_dims(
+        [np.arange(n_prediction_steps) + i * 10 for i in range(n_seq)], -1
+    ).repeat(n_targets, axis=-1)
+    y_pred = y_true + 1
+    score_mean = scorer_mean(y_true=y_true, y_pred=y_pred, sp=1, n_prediction_steps=n_prediction_steps)
+    score_median = scorer_median(y_true=y_true, y_pred=y_pred, sp=1,
+                                 n_prediction_steps=n_prediction_steps)
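+    # The forecasting scorers aggregate a per-sequence metric over sequences, so
+    # recomputing MAPE sequence by sequence must reproduce the mean/median scores.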
+    score_all = []
+    for true_seq, pred_seq in zip(y_true, y_pred):
+        score_all.append(forecasting_metrics.mean_absolute_percentage_error(y_true=true_seq, y_pred=pred_seq))
+    assert score_mean == np.mean(score_all)
+    assert score_median == np.median(score_all)
+
+    # Additional parameters
+    horizon_weight = [0.1, 0.2, 0.3, 0.4, 0.5]
+    score_mean = scorer_mean(y_true=y_true, y_pred=y_pred, sp=1,
+                             n_prediction_steps=n_prediction_steps, horizon_weight=horizon_weight)
+    score_all = []
+    for true_seq, pred_seq in zip(y_true, y_pred):
+        score_all.append(forecasting_metrics.mean_absolute_percentage_error(y_true=true_seq, y_pred=pred_seq,
+                                                                            horizon_weight=horizon_weight))
+    assert score_mean == np.mean(score_all)
+
+
 def test_sign_flip():
     y_true = np.arange(0, 1.01, 0.1)
     y_pred = y_true.copy()
@@ -255,3 +338,19 @@ def test_calculate_loss():
         task_type=TABULAR_REGRESSION,
         metrics=[mean_squared_error],
     )['mean_squared_error']
+
+
+def test_compute_mase_coefficient():
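+    # The assertions below suggest the coefficient is the inverse mean absolute
+    # seasonal difference of the past targets, falling back to the inverse mean
+    # absolute value when sp exceeds the series length or the differences vanish,
+    # and to 1 for an all-zero series.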
+    past_target = np.arange(12)
+    mase_value_1 = compute_mase_coefficient(past_target, 15)
+    assert mase_value_1 == 1 / np.mean(past_target)
+    mase_value_2 = compute_mase_coefficient(past_target, 5)
+    assert mase_value_2 == 0.2
+
+    past_target = np.ones(12) * 2
+    assert compute_mase_coefficient(past_target, 15) == 0.5
+    assert compute_mase_coefficient(past_target, 5) == 0.5
+
+    past_target = np.zeros(12)
+    assert compute_mase_coefficient(past_target, 15) == 1.
+    assert compute_mase_coefficient(past_target, 5) == 1.
diff --git a/test/test_pipeline/test_tabular_classification.py b/test/test_pipeline/test_tabular_classification.py
index 52288b199..c679b931d 100644
--- a/test/test_pipeline/test_tabular_classification.py
+++ b/test/test_pipeline/test_tabular_classification.py
@@ -1,6 +1,7 @@
 import os
 import re
 import unittest
+import unittest.mock
 
 from ConfigSpace.hyperparameters import (
     CategoricalHyperparameter,
@@ -8,6 +9,8 @@
     UniformIntegerHyperparameter,
 )
 
+from flaky import flaky
+
 import numpy as np
 
 import pytest
@@ -23,6 +26,11 @@
     parse_hyperparameter_search_space_updates
 
 
+@pytest.fixture
+def exclude():
+    return {'feature_preprocessor': ['SelectRatesClassification', 'SelectPercentileClassification']}
+
+
 @pytest.mark.parametrize("fit_dictionary_tabular", ['classification_categorical_only',
                                                     'classification_numerical_only',
                                                     'classification_numerical_and_categorical'], indirect=True)
@@ -53,12 +61,14 @@ def _assert_pipeline_search_space(self, pipeline, search_space_updates):
             elif isinstance(hyperparameter, CategoricalHyperparameter):
                 assert update.value_range == hyperparameter.choices
 
-    def test_pipeline_fit(self, fit_dictionary_tabular):
+    @flaky(max_runs=3)
+    def test_pipeline_fit(self, fit_dictionary_tabular, exclude):
         """This test makes sure that the pipeline is able to fit
         given random combinations of hyperparameters across the pipeline"""
 
         pipeline = TabularClassificationPipeline(
-            dataset_properties=fit_dictionary_tabular['dataset_properties'])
+            dataset_properties=fit_dictionary_tabular['dataset_properties'],
+            exclude=exclude)
         cs = pipeline.get_hyperparameter_search_space()
         config = cs.sample_configuration()
         pipeline.set_hyperparameters(config)
@@ -80,12 +90,13 @@ def test_pipeline_fit(self, fit_dictionary_tabular):
         # Make sure a network was fit
         assert isinstance(pipeline.named_steps['network'].get_network(), torch.nn.Module)
 
-    def test_pipeline_predict(self, fit_dictionary_tabular):
+    def test_pipeline_predict(self, fit_dictionary_tabular, exclude):
         """This test makes sure that the
         pipeline is able to predict given a random configuration"""
         X = fit_dictionary_tabular['X_train'].copy()
         pipeline = TabularClassificationPipeline(
-            dataset_properties=fit_dictionary_tabular['dataset_properties'])
+            dataset_properties=fit_dictionary_tabular['dataset_properties'],
+            exclude=exclude)
 
         cs = pipeline.get_hyperparameter_search_space()
         config = cs.sample_configuration()
@@ -104,14 +115,15 @@ def test_pipeline_predict(self, fit_dictionary_tabular):
         assert isinstance(prediction, np.ndarray)
         assert prediction.shape == expected_output_shape
 
-    def test_pipeline_predict_proba(self, fit_dictionary_tabular):
+    def test_pipeline_predict_proba(self, fit_dictionary_tabular, exclude):
         """This test makes sure that the pipeline is able to fit
         given random combinations of hyperparameters across the pipeline
         And then predict using predict probability
         """
         X = fit_dictionary_tabular['X_train'].copy()
         pipeline = TabularClassificationPipeline(
-            dataset_properties=fit_dictionary_tabular['dataset_properties'])
+            dataset_properties=fit_dictionary_tabular['dataset_properties'],
+            exclude=exclude)
 
         cs = pipeline.get_hyperparameter_search_space()
         config = cs.sample_configuration()
@@ -133,7 +145,7 @@ def test_pipeline_predict_proba(self, fit_dictionary_tabular):
         assert isinstance(prediction, np.ndarray)
         assert prediction.shape == expected_output_shape
 
-    def test_pipeline_transform(self, fit_dictionary_tabular):
+    def test_pipeline_transform(self, fit_dictionary_tabular, exclude):
         """
         In the context of autopytorch, transform expands a fit dictionary with
         components that where previously fit. We can use this as a nice way to make sure
@@ -142,7 +154,8 @@ def test_pipeline_transform(self, fit_dictionary_tabular):
         """
 
         pipeline = TabularClassificationPipeline(
-            dataset_properties=fit_dictionary_tabular['dataset_properties'])
+            dataset_properties=fit_dictionary_tabular['dataset_properties'],
+            exclude=exclude)
         cs = pipeline.get_hyperparameter_search_space()
         config = cs.sample_configuration()
         pipeline.set_hyperparameters(config)
@@ -171,14 +184,15 @@ def test_pipeline_transform(self, fit_dictionary_tabular):
         assert 'preprocess_transforms' in transformed_fit_dictionary_tabular.keys()
 
     @pytest.mark.parametrize("is_small_preprocess", [True, False])
-    def test_default_configuration(self, fit_dictionary_tabular, is_small_preprocess):
+    def test_default_configuration(self, fit_dictionary_tabular, is_small_preprocess, exclude):
         """Makes sure that when no config is set, we can trust the
        default configuration from the space"""
 
         fit_dictionary_tabular['is_small_preprocess'] = is_small_preprocess
 
         pipeline = TabularClassificationPipeline(
-            dataset_properties=fit_dictionary_tabular['dataset_properties'])
+            dataset_properties=fit_dictionary_tabular['dataset_properties'],
+            exclude=exclude)
 
         with unittest.mock.patch.object(pipeline.named_steps['trainer'].choice, 'train_epoch') \
                 as patch_train:
             patch_train.return_value = 1, {}
@@ -258,7 +272,8 @@ def test_get_fit_requirements(self, fit_dictionary_tabular):
     def test_apply_search_space_updates(self, fit_dictionary_tabular, search_space_updates):
         dataset_properties = {'numerical_columns': [1], 'categorical_columns': [2],
-                              'task_type': 'tabular_classification'}
+                              'task_type': 'tabular_classification', 'issparse': False,
+                              'issigned': False}
         pipeline = TabularClassificationPipeline(dataset_properties=dataset_properties,
                                                  search_space_updates=search_space_updates)
         self._assert_pipeline_search_space(pipeline, search_space_updates)
@@ -275,14 +290,16 @@ def test_read_and_update_search_space(self, fit_dictionary_tabular, search_space
         file_search_space_updates = parse_hyperparameter_search_space_updates(updates_file=path)
         assert isinstance(file_search_space_updates, HyperparameterSearchSpaceUpdates)
         dataset_properties = {'numerical_columns': [1], 'categorical_columns': [2],
-                              'task_type': 'tabular_classification'}
+                              'task_type': 'tabular_classification', 'issparse': False,
+                              'issigned': False}
         pipeline = TabularClassificationPipeline(dataset_properties=dataset_properties,
                                                  search_space_updates=file_search_space_updates)
         assert file_search_space_updates == pipeline.search_space_updates
 
     def test_error_search_space_updates(self, fit_dictionary_tabular, error_search_space_updates):
         dataset_properties = {'numerical_columns': [1], 'categorical_columns': [2],
-                              'task_type': 'tabular_classification'}
+                              'task_type': 'tabular_classification', 'issparse': False,
+                              'issigned': False}
         try:
             _ = TabularClassificationPipeline(dataset_properties=dataset_properties,
                                               search_space_updates=error_search_space_updates)
@@ -293,7 +310,8 @@ def test_error_search_space_updates(self, fit_dictionary_tabular, error_search_s
     def test_set_range_search_space_updates(self, fit_dictionary_tabular):
         dataset_properties = {'numerical_columns': [1], 'categorical_columns': [2],
-                              'task_type': 'tabular_classification'}
+                              'task_type': 'tabular_classification', 'issparse': False,
+                              'issigned': False}
         config_dict = TabularClassificationPipeline(dataset_properties=dataset_properties). \
             get_hyperparameter_search_space()._hyperparameters
         updates = HyperparameterSearchSpaceUpdates()
@@ -325,7 +343,8 @@ def test_set_range_search_space_updates(self, fit_dictionary_tabular):
     def test_set_choices_updates(self, fit_dictionary_tabular):
         dataset_properties = {'numerical_columns': [1], 'categorical_columns': [2],
-                              'task_type': 'tabular_classification'}
+                              'task_type': 'tabular_classification', 'issparse': False,
+                              'issigned': False}
         config_dict = TabularClassificationPipeline(dataset_properties=dataset_properties). \
             get_hyperparameter_search_space()._hyperparameters
         updates = HyperparameterSearchSpaceUpdates()
@@ -491,3 +510,30 @@ def test_train_pipeline_with_runtime(fit_dictionary_tabular_dummy):
 
     # More than 200 epochs would have pass in 5 seconds for this dataset
     assert len(run_summary.performance_tracker['start_time']) > 100
+
+
+@pytest.mark.parametrize("fit_dictionary_tabular_dummy", ["classification"], indirect=True)
+def test_train_pipeline_with_runtime_max_reached(fit_dictionary_tabular_dummy):
+    """
+    This test makes sure that the pipeline raises an
+    error in case no epoch has finished successfully
+    due to max runtime reached
+    """
+
+    # Convert the training to runtime
+    fit_dictionary_tabular_dummy.pop('epochs', None)
+    fit_dictionary_tabular_dummy['budget_type'] = 'runtime'
+    fit_dictionary_tabular_dummy['runtime'] = 5
+    fit_dictionary_tabular_dummy['early_stopping'] = -1
+
+    pipeline = TabularClassificationPipeline(
+        dataset_properties=fit_dictionary_tabular_dummy['dataset_properties'])
+
+    cs = pipeline.get_hyperparameter_search_space()
+    config = cs.get_default_configuration()
+    pipeline.set_hyperparameters(config)
+
+    with unittest.mock.patch('autoPyTorch.pipeline.components.training.trainer.BudgetTracker') as patch:
+        patch.is_max_time_reached.return_value = True
+        with pytest.raises(RuntimeError):
+            pipeline.fit(fit_dictionary_tabular_dummy)
diff --git a/test/test_pipeline/test_tabular_regression.py b/test/test_pipeline/test_tabular_regression.py
index 75dc8a415..c6c475b91 100644
--- a/test/test_pipeline/test_tabular_regression.py
+++ b/test/test_pipeline/test_tabular_regression.py
@@ -1,6 +1,7 @@
 import os
 import re
 import unittest
+import unittest.mock
 
 from ConfigSpace.hyperparameters import (
     CategoricalHyperparameter,
diff --git a/test/test_pipeline/test_time_series_forecasting_pipeline.py b/test/test_pipeline/test_time_series_forecasting_pipeline.py
new file mode 100644
index 000000000..3e34b71b7
--- /dev/null
+++ b/test/test_pipeline/test_time_series_forecasting_pipeline.py
@@ -0,0 +1,187 @@
+import copy
+
+import pytest
+
+from autoPyTorch.pipeline.time_series_forecasting import TimeSeriesForecastingPipeline
+from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates
+
+
+@pytest.fixture(params=['ForecastingNet', 'ForecastingSeq2SeqNet', 'ForecastingDeepARNet', 'NBEATSNet'])
+def network_type(request):
+    return request.param
+
+
+@pytest.fixture(params=['LearnedEntityEmbedding', 'NoEmbedding'])
+def embedding(request):
+    return request.param
+
+
+@pytest.fixture(params=['OneHotEncoder', 'NoEncoder'])
+def feature_encoding(request):
+    return request.param
+
+
+def generate_light_updates(updates: HyperparameterSearchSpaceUpdates):
+    updates.append(node_name='data_loader',
+                   hyperparameter='window_size',
+                   value_range=[3, 10],
+                   default_value=5)
+    updates.append(node_name='data_loader',
+                   hyperparameter='batch_size',
+                   value_range=[2, 5],
+                   default_value=4)
+    updates.append(node_name='data_loader',
+                   hyperparameter="num_batches_per_epoch",
+                   value_range=(3, 10),
+                   default_value=5)
+    return updates
+
+
+class TestTimeSeriesForecastingPipeline:
+    @pytest.mark.parametrize("fit_dictionary_forecasting", ["uni_variant_wo_missing",
+                                                            "uni_variant_w_missing",
+                                                            "multi_variant_wo_missing",
+                                                            "multi_variant_w_missing",
+                                                            "multi_variant_only_cat",
+                                                            "multi_variant_only_num"], indirect=True)
+    def test_fit_predict(self, fit_dictionary_forecasting, forecasting_budgets):
+        dataset_properties = fit_dictionary_forecasting['dataset_properties']
+        if not dataset_properties['uni_variant'] and len(dataset_properties['categories']) > 0:
+            include = {'network_embedding': ['LearnedEntityEmbedding']}
+        else:
+            include = None
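+        # Learned entity embeddings are only meaningful when categorical features
+        # exist, hence they are only enforced for multi-variant data with categories.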
+
+
+class TestTimeSeriesForecastingPipeline:
+    @pytest.mark.parametrize("fit_dictionary_forecasting", ["uni_variant_wo_missing",
+                                                            "uni_variant_w_missing",
+                                                            "multi_variant_wo_missing",
+                                                            "multi_variant_w_missing",
+                                                            "multi_variant_only_cat",
+                                                            "multi_variant_only_num"], indirect=True)
+    def test_fit_predict(self, fit_dictionary_forecasting, forecasting_budgets):
+        dataset_properties = fit_dictionary_forecasting['dataset_properties']
+        if not dataset_properties['uni_variant'] and len(dataset_properties['categories']) > 0:
+            include = {'network_embedding': ['LearnedEntityEmbedding']}
+        else:
+            include = None
+        updates = HyperparameterSearchSpaceUpdates()
+        updates = generate_light_updates(updates)
+        pipeline = TimeSeriesForecastingPipeline(dataset_properties=dataset_properties,
+                                                 include=include,
+                                                 search_space_updates=updates)
+        step_names = pipeline.named_steps.keys()
+        step_names_multi_processing = ['impute', 'scaler', 'feature_encoding',
+                                       'time_series_transformer', 'preprocessing']
+
+        steps_multi_in_pipeline = [step_name_multi in step_names for step_name_multi in step_names_multi_processing]
+
+        if not dataset_properties['uni_variant']:
+            assert sum(steps_multi_in_pipeline) == len(steps_multi_in_pipeline)
+        else:
+            assert sum(steps_multi_in_pipeline) == 0
+
+        fit_dict = copy.copy(fit_dictionary_forecasting)
+        pipeline = pipeline.fit(fit_dict)
+        datamanager = fit_dictionary_forecasting['backend'].load_datamanager()
+        test_sets = datamanager.generate_test_seqs()
+        predict = pipeline.predict(test_sets)
+
+        assert list(predict.shape) == [len(test_sets) * dataset_properties['n_prediction_steps']]
+
+    @pytest.mark.parametrize("fit_dictionary_forecasting, forecasting_budgets", [
+        ["multi_variant_wo_missing", 'resolution'],
+        ["multi_variant_wo_missing", 'num_seq'],
+        ["multi_variant_wo_missing", 'num_sample_per_seq'],
+    ], indirect=True)
+    def test_fit_budgets_types(self, fit_dictionary_forecasting, forecasting_budgets):
+        dataset_properties = fit_dictionary_forecasting['dataset_properties']
+        updates = HyperparameterSearchSpaceUpdates()
+        updates = generate_light_updates(updates)
+
+        pipeline = TimeSeriesForecastingPipeline(dataset_properties=dataset_properties,
+                                                 search_space_updates=updates)
+        fit_dict = copy.copy(fit_dictionary_forecasting)
+        pipeline = pipeline.fit(fit_dict)
+        datamanager = fit_dictionary_forecasting['backend'].load_datamanager()
+        test_sets = datamanager.generate_test_seqs()
+        predict = pipeline.predict(test_sets)
+
+        assert list(predict.shape) == [len(test_sets) * dataset_properties['n_prediction_steps']]
+
+    @pytest.mark.parametrize("fit_dictionary_forecasting", [["multi_variant_wo_missing"]], indirect=True)
+    def test_network_encoding_variable_selection(self, fit_dictionary_forecasting, embedding, feature_encoding):
+        if embedding == 'LearnedEntityEmbedding' and feature_encoding == 'NoEncoder':
+            return
+        include = {'network_embedding': [embedding],
+                   'feature_encoding': [feature_encoding],
+                   'network_backbone': ['seq_encoder']
+                   }
+        updates = HyperparameterSearchSpaceUpdates()
+        updates = generate_light_updates(updates)
+
+        updates.append(node_name='network_backbone',
+                       hyperparameter='seq_encoder:num_blocks',
+                       value_range=[1, 1],
+                       default_value=1)
+        updates.append(node_name='network_backbone',
+                       hyperparameter='seq_encoder:variable_selection',
+                       value_range=[True, ],
+                       default_value=True)
+
+        dataset_properties = fit_dictionary_forecasting['dataset_properties']
+
+        pipeline = TimeSeriesForecastingPipeline(dataset_properties=dataset_properties,
+                                                 include=include,
+                                                 search_space_updates=updates)
+        fit_dict = copy.copy(fit_dictionary_forecasting)
+        # No error should be raised
+        _ = pipeline.fit(fit_dict)
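+
+    # Editor's sketch: `include` whitelists component choices per pipeline step,
+    # as used throughout this class, e.g.
+    #
+    #   pipeline = TimeSeriesForecastingPipeline(
+    #       dataset_properties=dataset_properties,
+    #       include={'network_embedding': ['LearnedEntityEmbedding']},
+    #   )
+    #
+    # leaves only that embedding choice in the resulting search space.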
+
+    @pytest.mark.parametrize("fit_dictionary_forecasting", ["multi_variant_w_missing"], indirect=True)
+    def test_networks(self, fit_dictionary_forecasting, network_type):
+        dataset_properties = fit_dictionary_forecasting['dataset_properties']
+
+        updates = HyperparameterSearchSpaceUpdates()
+        updates = generate_light_updates(updates)
+
+        if network_type == 'NBEATSNet':
+            include = {'network_backbone': ['flat_encoder:NBEATSEncoder'],
+                       'loss': ['RegressionLoss']}
+
+            updates.append(node_name='network_backbone',
+                           hyperparameter='flat_encoder:NBEATSDecoder:backcast_loss_ration',
+                           value_range=[0.1, 0.9],
+                           default_value=0.5)
+        else:
+            updates.append(node_name='network_backbone',
+                           hyperparameter='seq_encoder:num_blocks',
+                           value_range=[1, 1],
+                           default_value=1)
+            include = None
+            if network_type == 'ForecastingNet':
+                updates.append(node_name='network_backbone',
+                               hyperparameter='seq_encoder:block_1:MLPDecoder:auto_regressive',
+                               value_range=[False, ],
+                               default_value=False)
+                updates.append(node_name='network_backbone',
+                               hyperparameter='seq_encoder:decoder_auto_regressive',
+                               value_range=[False, ],
+                               default_value=False)
+
+            elif network_type == 'ForecastingSeq2SeqNet':
+                include = {'network_backbone': ['seq_encoder']}
+                updates.append(node_name='network_backbone',
+                               hyperparameter='seq_encoder:decoder_auto_regressive',
+                               value_range=[True, ],
+                               default_value=True)
+
+            elif network_type == 'ForecastingDeepARNet':
+                include = {'network_backbone': ['seq_encoder:RNNEncoder'],
+                           'loss': ['DistributionLoss']}
+
+                updates.append(node_name='network_backbone',
+                               hyperparameter='seq_encoder:block_1:MLPDecoder:auto_regressive',
+                               value_range=[True, ],
+                               default_value=True)
+
+        pipeline = TimeSeriesForecastingPipeline(dataset_properties=dataset_properties,
+                                                 include=include,
+                                                 search_space_updates=updates)
+
+        cs = pipeline.get_hyperparameter_search_space()
+
+        pipeline.set_hyperparameters(cs.get_default_configuration())
+
+        fit_dict = copy.copy(fit_dictionary_forecasting)
+        pipeline = pipeline.fit(fit_dict)
+        datamanager = fit_dictionary_forecasting['backend'].load_datamanager()
+        test_sets = datamanager.generate_test_seqs()
+        predict = pipeline.predict(test_sets)
+
+        assert list(predict.shape) == [len(test_sets) * dataset_properties['n_prediction_steps']]
diff --git a/test/test_api/.tmp_api/runhistory_B.json b/test/test_utils/runhistory.json
similarity index 99%
rename from test/test_api/.tmp_api/runhistory_B.json
rename to test/test_utils/runhistory.json
index 37e499664..a2c3658a8 100755
--- a/test/test_api/.tmp_api/runhistory_B.json
+++ b/test/test_utils/runhistory.json
@@ -1133,6 +1133,7 @@
     "1": {
       "data_loader:batch_size": 64,
       "encoder:__choice__": "OneHotEncoder",
+      "coalescer:__choice__": "NoCoalescer",
       "feature_preprocessor:__choice__": "NoFeaturePreprocessor",
       "imputer:categorical_strategy": "most_frequent",
       "imputer:numerical_strategy": "mean",
@@ -1166,6 +1167,7 @@
     "2": {
       "data_loader:batch_size": 142,
       "encoder:__choice__": "NoEncoder",
+      "coalescer:__choice__": "NoCoalescer",
       "feature_preprocessor:__choice__": "PowerTransformer",
       "imputer:categorical_strategy": "constant_!missing!",
       "imputer:numerical_strategy": "median",
@@ -1203,6 +1205,7 @@
     "3": {
       "data_loader:batch_size": 246,
       "encoder:__choice__": "OneHotEncoder",
+      "coalescer:__choice__": "NoCoalescer",
       "feature_preprocessor:__choice__": "PowerTransformer",
       "imputer:categorical_strategy": "constant_!missing!",
       "imputer:numerical_strategy": "most_frequent",
@@ -1281,6 +1284,7 @@
     "4": {
       "data_loader:batch_size": 269,
       "encoder:__choice__": "OneHotEncoder",
+      "coalescer:__choice__": "NoCoalescer",
       "feature_preprocessor:__choice__": "PowerTransformer",
       "imputer:categorical_strategy": "constant_!missing!",
       "imputer:numerical_strategy": "median",
@@ -1324,6 +1328,7 @@
     "5": {
       "data_loader:batch_size": 191,
       "encoder:__choice__": "OneHotEncoder",
"OneHotEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "RandomKitchenSinks", "imputer:categorical_strategy": "constant_!missing!", "imputer:numerical_strategy": "most_frequent", @@ -1373,6 +1378,7 @@ "6": { "data_loader:batch_size": 53, "encoder:__choice__": "OneHotEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "PowerTransformer", "imputer:categorical_strategy": "constant_!missing!", "imputer:numerical_strategy": "median", @@ -1429,6 +1435,7 @@ "7": { "data_loader:batch_size": 232, "encoder:__choice__": "NoEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "RandomKitchenSinks", "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "most_frequent", @@ -1506,6 +1513,7 @@ "8": { "data_loader:batch_size": 164, "encoder:__choice__": "NoEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "NoFeaturePreprocessor", "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", @@ -1540,6 +1548,7 @@ "9": { "data_loader:batch_size": 94, "encoder:__choice__": "NoEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "PolynomialFeatures", "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", @@ -1589,6 +1598,7 @@ "10": { "data_loader:batch_size": 70, "encoder:__choice__": "OneHotEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "PowerTransformer", "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "constant_zero", @@ -1637,6 +1647,7 @@ "11": { "data_loader:batch_size": 274, "encoder:__choice__": "NoEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "RandomKitchenSinks", "imputer:categorical_strategy": "constant_!missing!", "imputer:numerical_strategy": "mean", @@ -1675,6 +1686,7 @@ "12": { "data_loader:batch_size": 191, "encoder:__choice__": "NoEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "NoFeaturePreprocessor", "imputer:categorical_strategy": "constant_!missing!", "imputer:numerical_strategy": "median", @@ -1730,6 +1742,7 @@ "13": { "data_loader:batch_size": 35, "encoder:__choice__": "OneHotEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "PowerTransformer", "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "most_frequent", @@ -1766,6 +1779,7 @@ "14": { "data_loader:batch_size": 154, "encoder:__choice__": "OneHotEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "KernelPCA", "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", diff --git a/test/test_utils/test_coalescer_transformer.py b/test/test_utils/test_coalescer_transformer.py new file mode 100644 index 000000000..eccd6b7bd --- /dev/null +++ b/test/test_utils/test_coalescer_transformer.py @@ -0,0 +1,101 @@ +import numpy as np + +import pytest + +import scipy.sparse + +from autoPyTorch.utils.implementations import MinorityCoalesceTransformer + + +@pytest.fixture +def X1(): + # Generates an array with categories 3, 4, 5, 6, 7 and occurences of 30%, + # 30%, 30%, 5% and 5% respectively + X = np.vstack(( + np.ones((30, 10)) * 3, + np.ones((30, 10)) * 4, + np.ones((30, 10)) * 5, + np.ones((5, 10)) * 6, + np.ones((5, 10)) * 7, + )) + for col in range(X.shape[1]): + np.random.shuffle(X[:, col]) + return X + + +@pytest.fixture 
+
+
+@pytest.fixture
+def X2():
+    # Generates an array with categories 3, 4, 5, 6, 7 and occurrences of 5%,
+    # 5%, 5%, 35% and 50% respectively
+    X = np.vstack((
+        np.ones((5, 10)) * 3,
+        np.ones((5, 10)) * 4,
+        np.ones((5, 10)) * 5,
+        np.ones((35, 10)) * 6,
+        np.ones((50, 10)) * 7,
+    ))
+    for col in range(X.shape[1]):
+        np.random.shuffle(X[:, col])
+    return X
+
+
+def test_default(X1):
+    X = X1
+    X_copy = np.copy(X)
+    Y = MinorityCoalesceTransformer().fit_transform(X)
+    np.testing.assert_array_almost_equal(Y, X_copy)
+    # Assert no copies were made
+    assert id(X) == id(Y)
+
+
+def test_coalesce_10_percent(X1):
+    X = X1
+    Y = MinorityCoalesceTransformer(min_frac=.1).fit_transform(X)
+    for col in range(Y.shape[1]):
+        hist = np.histogram(Y[:, col], bins=np.arange(-2, 7))
+        np.testing.assert_array_almost_equal(hist[0], [10, 0, 0, 0, 0, 30, 30, 30])
+    # Assert no copies were made
+    assert id(X) == id(Y)
+
+
+def test_coalesce_10_percent_sparse(X1):
+    X = scipy.sparse.csc_matrix(X1)
+    Y = MinorityCoalesceTransformer(min_frac=.1).fit_transform(X)
+    # Assert no copies were made
+    assert id(X) == id(Y)
+    Y = Y.todense()
+    for col in range(Y.shape[1]):
+        hist = np.histogram(Y[:, col], bins=np.arange(-2, 7))
+        np.testing.assert_array_almost_equal(hist[0], [10, 0, 0, 0, 0, 30, 30, 30])
+
+
+def test_invalid_X(X1):
+    X = X1 - 5
+    with pytest.raises(ValueError):
+        MinorityCoalesceTransformer().fit_transform(X)
+
+
+@pytest.mark.parametrize("min_frac", [-0.1, 1.1])
+def test_invalid_min_frac(min_frac):
+    with pytest.raises(ValueError):
+        MinorityCoalesceTransformer(min_frac=min_frac)
+
+
+def test_transform_before_fit(X1):
+    with pytest.raises(RuntimeError):
+        MinorityCoalesceTransformer().transform(X1)
+
+
+def test_transform_after_fit(X1, X2):
+    # On both X_fit and X_transf, the categories 3, 4, 5, 6, 7 are present.
+    X_fit = X1  # Here categories 3, 4, 5 have an occurrence above 10%
+    X_transf = X2  # Here it is the opposite, only categories 6 and 7 are above 10%
+
+    mc = MinorityCoalesceTransformer(min_frac=.1).fit(X_fit)
+
+    # transform() should coalesce categories as learned during fit.
+    # Category distribution in X_transf should be irrelevant.
+    Y = mc.transform(X_transf)
+    for col in range(Y.shape[1]):
+        hist = np.histogram(Y[:, col], bins=np.arange(-2, 7))
+        np.testing.assert_array_almost_equal(hist[0], [85, 0, 0, 0, 0, 5, 5, 5])
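+
+# Behavioural summary of the transformer as exercised above (not its
+# implementation): categories whose relative frequency is below `min_frac`
+# are merged into a single reserved (negative) category, which is why the
+# lowest histogram bin collects exactly the rare-category counts, e.g.
+#
+#   mc = MinorityCoalesceTransformer(min_frac=.1).fit(X_fit)
+#   Y = mc.transform(X_transf)  # rare categories from fit-time are coalesced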
diff --git a/test/test_utils/test_common.py b/test/test_utils/test_common.py
new file mode 100644
index 000000000..ea3dec563
--- /dev/null
+++ b/test/test_utils/test_common.py
@@ -0,0 +1,72 @@
+"""
+This tests the functionality in autoPyTorch/utils/common.
+"""
+from enum import Enum
+
+import pytest
+
+from autoPyTorch.utils.common import autoPyTorchEnum
+
+
+class SubEnum(autoPyTorchEnum):
+    x = "x"
+    y = "y"
+
+
+class DummyEnum(Enum):  # NOTE: this should be moved to the top of the file
+    x = "x"
+
+
+@pytest.mark.parametrize('iter',
+                         ([SubEnum.x],
+                          ["x"],
+                          {SubEnum.x: "hello"},
+                          {'x': 'hello'},
+                          SubEnum,
+                          ["x", "y"]))
+def test_autopytorch_enum(iter):
+    """
+    This test ensures that a subclass of `autoPyTorchEnum`
+    can be used with strings.
+
+    Args:
+        iter (Iterable):
+            iterable to check for compatibility
+    """
+
+    e = SubEnum.x
+
+    assert e in iter
+
+
+@pytest.mark.parametrize('iter',
+                         [[SubEnum.y],
+                          ["y"],
+                          {SubEnum.y: "hello"},
+                          {'y': 'hello'}])
+def test_autopytorch_enum_false(iter):
+    """
+    This test ensures that a subclass of `autoPyTorchEnum`
+    does not falsely match strings it is not equal to.
+
+    Args:
+        iter (Iterable):
+            iterable to check for compatibility
+    """
+
+    e = SubEnum.x
+
+    assert e not in iter
+
+
+@pytest.mark.parametrize('others', (1, 2.0, SubEnum, DummyEnum.x))
+def test_raise_errors_autopytorch_enum(others):
+    """
+    This test ensures that a subclass of `autoPyTorchEnum`
+    raises an error when compared against unsupported types.
+
+    Args:
+        others (Any):
+            Variable to compare with SubEnum.
+    """
+
+    with pytest.raises(RuntimeError):
+        SubEnum.x == others
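+
+# A rough sketch of the comparison behaviour exercised above; the real
+# implementation lives in autoPyTorch/utils/common.py and may differ:
+#
+#   class autoPyTorchEnum(str, Enum):
+#       def __eq__(self, other):
+#           if isinstance(other, str):
+#               return self.value == other
+#           if isinstance(other, autoPyTorchEnum):
+#               return self.value == other.value
+#           raise RuntimeError(f"Unsupported type {type(other)}")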
diff --git a/test/test_utils/test_results_manager.py b/test/test_utils/test_results_manager.py
new file mode 100644
index 000000000..496aec7fa
--- /dev/null
+++ b/test/test_utils/test_results_manager.py
@@ -0,0 +1,471 @@
+import json
+import os
+from datetime import datetime
+from test.test_api.utils import make_dict_run_history_data
+from unittest.mock import MagicMock
+
+import ConfigSpace.hyperparameters as CSH
+from ConfigSpace.configuration_space import Configuration, ConfigurationSpace
+
+import numpy as np
+
+import pytest
+
+from smac.runhistory.runhistory import RunHistory, RunKey, RunValue, StatusType
+
+from autoPyTorch.api.base_task import BaseTask
+from autoPyTorch.metrics import accuracy, balanced_accuracy, log_loss
+from autoPyTorch.utils.results_manager import (
+    EnsembleResults,
+    MetricResults,
+    ResultsManager,
+    SearchResults,
+    cost2metric,
+    get_start_time
+)
+
+
+T, NT = 'traditional', 'non-traditional'
+SCORES = [0.1 * (i + 1) for i in range(10)]
+END_TIMES = [8, 4, 3, 6, 0, 7, 1, 9, 2, 5]
+
+
+def _check_status(status):
+    """ Based on runhistory.json """
+    ans = [
+        StatusType.SUCCESS, StatusType.SUCCESS,
+        StatusType.SUCCESS, StatusType.SUCCESS,
+        StatusType.SUCCESS, StatusType.SUCCESS,
+        StatusType.CRASHED, StatusType.SUCCESS,
+        StatusType.SUCCESS, StatusType.SUCCESS,
+        StatusType.SUCCESS, StatusType.SUCCESS,
+        StatusType.SUCCESS, StatusType.SUCCESS,
+        StatusType.TIMEOUT, StatusType.TIMEOUT,
+    ]
+    assert isinstance(status, list)
+    assert isinstance(status[0], StatusType)
+    assert status == ans
+
+
+def _check_costs(costs):
+    """ Based on runhistory.json """
+    ans = [0.15204678362573099, 0.4444444444444444, 0.5555555555555556, 0.29824561403508776,
+           0.4444444444444444, 0.4444444444444444, 1.0, 0.5555555555555556, 0.4444444444444444,
+           0.15204678362573099, 0.15204678362573099, 0.4035087719298246, 0.4444444444444444,
+           0.4444444444444444, 1.0, 1.0]
+    assert np.allclose(1 - np.array(costs), ans)
+    assert isinstance(costs, np.ndarray)
+    assert costs.dtype is np.dtype(np.float)
+
+
+def _check_end_times(end_times):
+    """ Based on runhistory.json """
+    ans = [1637342642.7887495, 1637342647.2651122, 1637342675.2555833, 1637342681.334954,
+           1637342693.2717755, 1637342704.341065, 1637342726.1866672, 1637342743.3274522,
+           1637342749.9442234, 1637342762.5487585, 1637342779.192385, 1637342804.3368232,
+           1637342820.8067145, 1637342846.0210106, 1637342897.1205413, 1637342928.7456856]
+
+    assert np.allclose(end_times, ans)
+    assert isinstance(end_times, np.ndarray)
+    assert end_times.dtype is np.dtype(np.float)
+
+
+def _check_fit_times(fit_times):
+    """ Based on runhistory.json """
+    ans = [3.154788017272949, 3.2763524055480957, 22.723600149154663, 4.990685224533081, 10.684926509857178,
+           9.947429180145264, 11.687273979187012, 8.478890419006348, 5.485020637512207, 11.514830589294434,
+           15.370736837387085, 23.846530199050903, 6.757539510726929, 15.061991930007935, 50.010520696640015,
+           22.011935234069824]
+
+    assert np.allclose(fit_times, ans)
+    assert isinstance(fit_times, np.ndarray)
+    assert fit_times.dtype is np.dtype(np.float)
+
+
+def _check_budgets(budgets):
+    """ Based on runhistory.json """
+    ans = [5.555555555555555, 5.555555555555555, 5.555555555555555, 5.555555555555555,
+           5.555555555555555, 5.555555555555555, 5.555555555555555, 5.555555555555555,
+           5.555555555555555, 16.666666666666664, 50.0, 16.666666666666664, 16.666666666666664,
+           16.666666666666664, 50.0, 50.0]
+    assert np.allclose(budgets, ans)
+    assert isinstance(budgets, list)
+    assert isinstance(budgets[0], float)
+
+
+def _check_additional_infos(status_types, additional_infos):
+    for i, status in enumerate(status_types):
+        info = additional_infos[i]
+        if status in (StatusType.SUCCESS, StatusType.DONOTADVANCE):
+            metric_info = info.get('opt_loss', None)
+            assert metric_info is not None
+        elif info is not None:
+            metric_info = info.get('opt_loss', None)
+            assert metric_info is None
+
+
+def _check_metric_dict(metric_dict, status_types, worst_val):
+    assert isinstance(metric_dict['accuracy'], list)
+    assert metric_dict['accuracy'][0] > 0
+    assert isinstance(metric_dict['balanced_accuracy'], list)
+    assert metric_dict['balanced_accuracy'][0] > 0
+
+    for key, vals in metric_dict.items():
+        # ^ is the XOR operator: for each run, exactly one of
+        # "the run succeeded" and "the value equals the worst possible result" must hold
+        assert all([(s == StatusType.SUCCESS) ^ np.isclose([val], [worst_val])
+                    for s, val in zip(status_types, vals)])
+
+
+def _check_metric_results(scores, metric, run_history, ensemble_performance_history):
+    if metric.name == 'accuracy':  # Check the case when ensemble does not have the metric name
+        dummy_history = [{'Timestamp': datetime(2000, 1, 1), 'train_log_loss': 1, 'test_log_loss': 1}]
+        mr = MetricResults(metric, run_history, dummy_history)
+        # ensemble_results should be None because ensemble evaluated log_loss
+        assert mr.ensemble_results.empty()
+        data = mr.get_ensemble_merged_data()
+        # since ensemble_results is None, merged_data must be identical to the run_history data
+        assert all(np.allclose(data[key], mr.data[key]) for key in data.keys())
+
+    mr = MetricResults(metric, run_history, ensemble_performance_history)
+    perfs = np.array([cost2metric(s, metric) for s in scores])
+    modified_scores = scores[::2] + [0]
+    modified_scores.insert(2, 0)
+    ens_perfs = np.array([s for s in modified_scores])
+    assert np.allclose(mr.data[f'single::train::{metric.name}'], perfs)
+    assert np.allclose(mr.data[f'single::opt::{metric.name}'], perfs)
+    assert np.allclose(mr.data[f'single::test::{metric.name}'], perfs)
+    assert np.allclose(mr.data[f'ensemble::train::{metric.name}'], ens_perfs)
+    assert np.allclose(mr.data[f'ensemble::test::{metric.name}'], ens_perfs)
+
+    # the end times of the synthetic ensemble are [0.25, 0.45, 0.45, 0.65, 0.85, 0.85]
+    # the end times of the synthetic run history are 0.1 * np.arange(1, 9) or 0.1 * np.arange(2, 10)
+    ensemble_ends_later = mr.search_results.end_times[-1] < mr.ensemble_results.end_times[-1]
+    indices = [2, 4, 4, 6, 8, 8] if ensemble_ends_later else [1, 3, 3, 5, 7, 7]
+
+    merged_data = mr.get_ensemble_merged_data()
+    worst_val = metric._worst_possible_result
+    minimize = metric._sign == -1
+    ans = np.full_like(mr.cum_times, worst_val)
+    for idx, s in zip(indices, mr.ensemble_results.train_scores):
+        ans[idx] = min(ans[idx], s) if minimize else max(ans[idx], s)
+
+    assert np.allclose(ans, merged_data[f'ensemble::train::{metric.name}'])
+    assert np.allclose(ans, merged_data[f'ensemble::test::{metric.name}'])
+
+
+def test_extract_results_from_run_history():
+    # Test that an error is raised when `status_msg` is None
+    run_history = RunHistory()
+    cs = ConfigurationSpace()
+    config = Configuration(cs, {})
+    run_history.add(
+        config=config,
+        cost=0.0,
+        time=1.0,
+        status=StatusType.CAPPED,
+    )
+    with pytest.raises(ValueError):
+        SearchResults(metric=accuracy, scoring_functions=[], run_history=run_history)
+
+
+def test_raise_error_in_update_and_sort_by_time():
+    cs = ConfigurationSpace()
+    cs.add_hyperparameter(CSH.UniformFloatHyperparameter('a', lower=0, upper=1))
+    config = Configuration(cs, {'a': 0.1})
+
+    sr = SearchResults(metric=accuracy, scoring_functions=[], run_history=RunHistory())
+    er = EnsembleResults(metric=accuracy, ensemble_performance_history=[])
+
+    with pytest.raises(RuntimeError):
+        sr._update(
+            config=config,
+            run_key=RunKey(config_id=0, instance_id=0, seed=0),
+            run_value=RunValue(
+                cost=0, time=1, status=StatusType.SUCCESS,
+                starttime=0, endtime=1, additional_info={}
+            )
+        )
+
+    with pytest.raises(RuntimeError):
+        sr._sort_by_endtime()
+
+    with pytest.raises(RuntimeError):
+        er._update(data={})
+
+    with pytest.raises(RuntimeError):
+        er._sort_by_endtime()
+
+
+@pytest.mark.parametrize('starttimes', (list(range(10)), list(range(10))[::-1]))
+@pytest.mark.parametrize('status_types', (
+    [StatusType.SUCCESS] * 9 + [StatusType.STOP],
+    [StatusType.RUNNING] + [StatusType.SUCCESS] * 9
+))
+def test_get_start_time(starttimes, status_types):
+    run_history = RunHistory()
+    cs = ConfigurationSpace()
+    cs.add_hyperparameter(CSH.UniformFloatHyperparameter('a', lower=0, upper=1))
+    endtime = 1e9
+    kwargs = dict(cost=1.0, endtime=endtime)
+    for starttime, status_type in zip(starttimes, status_types):
+        config = Configuration(cs, {'a': 0.1 * starttime})
+        run_history.add(
+            config=config,
+            starttime=starttime,
+            time=endtime - starttime,
+            status=status_type,
+            **kwargs
+        )
+    starttime = get_start_time(run_history)
+
+    # This rule follows directly from the inputs parametrized above
+    ans = min(t for s, t in zip(status_types, starttimes) if s == StatusType.SUCCESS)
+    assert starttime == ans
+
+
+def test_raise_error_in_get_start_time():
+    # Test that an error is raised when `status_msg` is None
+    run_history = RunHistory()
+    cs = ConfigurationSpace()
+    config = Configuration(cs, {})
+    run_history.add(
+        config=config,
+        cost=0.0,
+        time=1.0,
+        status=StatusType.CAPPED,
+    )
+
+    with pytest.raises(ValueError):
+        get_start_time(run_history)
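+
+# Reminder for the assertions below: metric scores and SMAC costs are related
+# by the conversion defined in
+# autoPyTorch/pipeline/components/training/metrics/utils.py::calculate_loss,
+#
+#   cost = metric._optimum - metric._sign * score
+#
+# and `cost2metric` (imported above) maps costs back to scores.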
+
+
+def test_search_results_sort_by_endtime():
+    run_history = RunHistory()
+    n_configs = len(SCORES)
+    cs = ConfigurationSpace()
+    cs.add_hyperparameter(CSH.UniformFloatHyperparameter('a', lower=0, upper=1))
+    order = np.argsort(END_TIMES)
+    ans = np.array(SCORES)[order].tolist()
+    status_types = [StatusType.SUCCESS, StatusType.DONOTADVANCE] * (n_configs // 2)
+
+    for i, (fixed_val, et, status) in enumerate(zip(SCORES, END_TIMES, status_types)):
+        config = Configuration(cs, {'a': fixed_val})
+        run_history.add(
+            config=config, cost=fixed_val,
+            status=status, budget=fixed_val,
+            time=et - fixed_val, starttime=fixed_val, endtime=et,
+            additional_info={
+                'a': fixed_val,
+                'configuration_origin': [T, NT][i % 2],
+                'train_loss': {accuracy.name: fixed_val - 0.1},
+                'opt_loss': {accuracy.name: fixed_val},
+                'test_loss': {accuracy.name: fixed_val + 0.1}
+            }
+        )
+
+    sr = SearchResults(accuracy, scoring_functions=[], run_history=run_history, order_by_endtime=True)
+    assert sr.budgets == ans
+    assert np.allclose(accuracy._optimum - accuracy._sign * sr.opt_scores, ans)
+    assert np.allclose(accuracy._optimum - accuracy._sign * sr.train_scores, np.array(ans) - accuracy._sign * 0.1)
+    assert np.allclose(accuracy._optimum - accuracy._sign * sr.test_scores, np.array(ans) + accuracy._sign * 0.1)
+    assert np.allclose(1 - sr.opt_scores, ans)
+    assert sr._end_times == list(range(n_configs))
+    assert all(c.get('a') == val for val, c in zip(ans, sr.configs))
+    assert all(info['a'] == val for val, info in zip(ans, sr.additional_infos))
+    assert np.all(np.array([s for s in status_types])[order] == np.array(sr.status_types))
+    assert sr.is_traditionals == np.array([True, False] * 5)[order].tolist()
+    assert np.allclose(sr.fit_times, np.subtract(np.arange(n_configs), ans))
+
+
+def test_ensemble_results():
+    order = np.argsort(END_TIMES)
+    end_times = [datetime.timestamp(datetime(2000, et + 1, 1)) for et in END_TIMES]
+    ensemble_performance_history = [
+        {'Timestamp': datetime(2000, et + 1, 1), 'train_accuracy': s1, 'test_accuracy': s2}
+        for et, s1, s2 in zip(END_TIMES, SCORES, SCORES[::-1])
+    ]
+
+    er = EnsembleResults(log_loss, ensemble_performance_history)
+    assert er.empty()
+
+    er = EnsembleResults(accuracy, ensemble_performance_history)
+    assert er._train_scores == SCORES
+    assert np.allclose(er.train_scores, SCORES)
+    assert er._test_scores == SCORES[::-1]
+    assert np.allclose(er.test_scores, SCORES[::-1])
+    assert np.allclose(er.end_times, end_times)
+
+    er = EnsembleResults(accuracy, ensemble_performance_history, order_by_endtime=True)
+    assert np.allclose(er.train_scores, np.array(SCORES)[order])
+    assert np.allclose(er.test_scores, np.array(SCORES[::-1])[order])
+    assert np.allclose(er.end_times, np.array(end_times)[order])
+
+
+@pytest.mark.parametrize('metric', (accuracy, log_loss))
+@pytest.mark.parametrize('scores', (SCORES[:8], SCORES[:8][::-1]))
+@pytest.mark.parametrize('ensemble_ends_later', (True, False))
+def test_metric_results(metric, scores, ensemble_ends_later):
+    # Since the datetime --> timestamp conversion varies between machines and
+    # float64 might not be able to handle time precisely enough, we might need
+    # to change t0 in the future.
+    # Basically, this is because the test checks timestamps at millisecond precision.
+    t0, ms_unit = (1970, 1, 1, 9, 0, 0), 100000
+    ensemble_performance_history = [
+        {'Timestamp': datetime(*t0, ms_unit * 2 * (i + 1) + ms_unit // 2),
+         f'train_{metric.name}': s,
+         f'test_{metric.name}': s}
+        for i, s in enumerate(scores[::2])
+    ]
+    # Add a record with the exact same timestamp as the last one
+    ensemble_performance_history.append(
+        {'Timestamp': datetime(*t0, ms_unit * 8 + ms_unit // 2),
+         f'train_{metric.name}': 0,
+         f'test_{metric.name}': 0}
+    )
+    # Add a record with the exact same timestamp as a middle one
+    ensemble_performance_history.append(
+        {'Timestamp': datetime(*t0, ms_unit * 4 + ms_unit // 2),
+         f'train_{metric.name}': 0,
+         f'test_{metric.name}': 0}
+    )
+
+    run_history = RunHistory()
+    cs = ConfigurationSpace()
+    cs.add_hyperparameter(CSH.UniformFloatHyperparameter('a', lower=0, upper=1))
+
+    for i, fixed_val in enumerate(scores):
+        config = Configuration(cs, {'a': fixed_val})
+        st = datetime.timestamp(datetime(*t0, ms_unit * (i + 1 - ensemble_ends_later)))
+        et = datetime.timestamp(datetime(*t0, ms_unit * (i + 2 - ensemble_ends_later)))
+        run_history.add(
+            config=config, cost=1, budget=0,
+            time=0.1, starttime=st, endtime=et,
+            status=StatusType.SUCCESS,
+            additional_info={
+                'configuration_origin': T,
+                'train_loss': {f'{metric.name}': fixed_val},
+                'opt_loss': {f'{metric.name}': fixed_val},
+                'test_loss': {f'{metric.name}': fixed_val}
+            }
+        )
+    _check_metric_results(scores, metric, run_history, ensemble_performance_history)
+
+
+def test_search_results_sprint_statistics():
+    BaseTask.__abstractmethods__ = set()
+    api = BaseTask()
+    for method in ['get_search_results', 'sprint_statistics', 'get_incumbent_results']:
+        with pytest.raises(RuntimeError):
+            getattr(api, method)()
+
+    run_history_data = json.load(open(os.path.join(os.path.dirname(__file__),
+                                                   'runhistory.json'),
+                                      mode='r'))['data']
+    api._results_manager.run_history = MagicMock()
+    api.run_history.empty = MagicMock(return_value=False)
+
+    # The run history contains 16 runs plus one interruption entry, i.e. 16 usable runs
+    api.run_history.data = make_dict_run_history_data(run_history_data)
+    api._metric = accuracy
+    api.dataset_name = 'iris'
+    api._scoring_functions = [accuracy, balanced_accuracy]
+    api.search_space = MagicMock(spec=ConfigurationSpace)
+    worst_val = api._metric._worst_possible_result
+    search_results = api.get_search_results()
+
+    _check_status(search_results.status_types)
+    _check_costs(search_results.opt_scores)
+    _check_end_times(search_results.end_times)
+    _check_fit_times(search_results.fit_times)
+    _check_budgets(search_results.budgets)
+    _check_metric_dict(search_results.opt_metric_dict, search_results.status_types, worst_val)
+    _check_additional_infos(status_types=search_results.status_types,
+                            additional_infos=search_results.additional_infos)
+
+    # config_ids can duplicate because of various budget sizes
+    config_ids = [1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 1, 10, 11, 12, 10, 13]
+    assert config_ids == search_results.config_ids
+
+    # assert that contents of search_results are of expected types
+    assert isinstance(search_results.rank_opt_scores, np.ndarray)
+    assert search_results.rank_opt_scores.dtype is np.dtype(np.int)
+    assert isinstance(search_results.configs, list)
+
+    n_success, n_timeout, n_memoryout, n_crashed = 13, 2, 0, 1
+    msg = ["autoPyTorch results:", f"\tDataset name: {api.dataset_name}",
+           f"\tOptimisation Metric: {api._metric.name}",
+           f"\tBest validation score: {max(search_results.opt_scores)}",
"\tNumber of target algorithm runs: 16", f"\tNumber of successful target algorithm runs: {n_success}", + f"\tNumber of crashed target algorithm runs: {n_crashed}", + f"\tNumber of target algorithms that exceeded the time limit: {n_timeout}", + f"\tNumber of target algorithms that exceeded the memory limit: {n_memoryout}"] + + assert isinstance(api.sprint_statistics(), str) + assert all([m1 == m2 for m1, m2 in zip(api.sprint_statistics().split("\n"), msg)]) + + +@pytest.mark.parametrize('run_history', (None, RunHistory())) +def test_check_run_history(run_history): + manager = ResultsManager() + manager.run_history = run_history + + with pytest.raises(RuntimeError): + manager._check_run_history() + + +@pytest.mark.parametrize('include_traditional', (True, False)) +@pytest.mark.parametrize('metric', (accuracy, log_loss)) +@pytest.mark.parametrize('origins', ([T] * 5 + [NT] * 5, [T, NT] * 5, [NT] * 5 + [T] * 5)) +@pytest.mark.parametrize('scores', (SCORES, SCORES[::-1])) +def test_get_incumbent_results(include_traditional, metric, origins, scores): + manager = ResultsManager() + cs = ConfigurationSpace() + cs.add_hyperparameter(CSH.UniformFloatHyperparameter('a', lower=0, upper=1)) + + configs = [0.1 * (i + 1) for i in range(len(scores))] + if metric.name == "log_loss": + # This is to detect mis-computation in reversion + metric._optimum = 0.1 + + best_cost, best_idx = np.inf, -1 + for idx, (a, origin, score) in enumerate(zip(configs, origins, scores)): + config = Configuration(cs, {'a': a}) + + # conversion defined in: + # autoPyTorch/pipeline/components/training/metrics/utils.py::calculate_loss + cost = metric._optimum - metric._sign * score + manager.run_history.add( + config=config, + cost=cost, + time=1.0, + status=StatusType.SUCCESS, + additional_info={'train_loss': {metric.name: cost}, + 'opt_loss': {metric.name: cost}, + 'test_loss': {metric.name: cost}, + 'configuration_origin': origin} + ) + if cost > best_cost: + continue + + if include_traditional: + best_cost, best_idx = cost, idx + elif origin != T: + best_cost, best_idx = cost, idx + + incumbent_config, incumbent_results = manager.get_incumbent_results( + metric=metric, + include_traditional=include_traditional + ) + + assert isinstance(incumbent_config, Configuration) + assert isinstance(incumbent_results, dict) + best_score, best_a = scores[best_idx], configs[best_idx] + assert np.allclose( + [best_score, best_score, best_a], + [cost2metric(best_cost, metric), + cost2metric(incumbent_results['opt_loss'][metric.name], metric), + incumbent_config['a']] + ) + + if not include_traditional: + assert incumbent_results['configuration_origin'] != T diff --git a/test/test_utils/test_results_visualizer.py b/test/test_utils/test_results_visualizer.py new file mode 100644 index 000000000..e31571ef0 --- /dev/null +++ b/test/test_utils/test_results_visualizer.py @@ -0,0 +1,302 @@ +import json +import os +from datetime import datetime +from test.test_api.utils import make_dict_run_history_data +from unittest.mock import MagicMock + +from ConfigSpace import ConfigurationSpace + +import matplotlib.pyplot as plt + +import numpy as np + +import pytest + +from autoPyTorch.api.base_task import BaseTask +from autoPyTorch.metrics import accuracy, balanced_accuracy +from autoPyTorch.utils.results_visualizer import ( + ColorLabelSettings, + PlotSettingParams, + ResultsVisualizer, + _get_perf_and_time +) + + +TEST_CL = ('test color', 'test label') + + +@pytest.mark.parametrize('cl_settings', ( + ColorLabelSettings(single_opt=TEST_CL), + 
diff --git a/test/test_utils/test_results_visualizer.py b/test/test_utils/test_results_visualizer.py
new file mode 100644
index 000000000..e31571ef0
--- /dev/null
+++ b/test/test_utils/test_results_visualizer.py
@@ -0,0 +1,302 @@
+import json
+import os
+from datetime import datetime
+from test.test_api.utils import make_dict_run_history_data
+from unittest.mock import MagicMock
+
+from ConfigSpace import ConfigurationSpace
+
+import matplotlib.pyplot as plt
+
+import numpy as np
+
+import pytest
+
+from autoPyTorch.api.base_task import BaseTask
+from autoPyTorch.metrics import accuracy, balanced_accuracy
+from autoPyTorch.utils.results_visualizer import (
+    ColorLabelSettings,
+    PlotSettingParams,
+    ResultsVisualizer,
+    _get_perf_and_time
+)
+
+
+TEST_CL = ('test color', 'test label')
+
+
+@pytest.mark.parametrize('cl_settings', (
+    ColorLabelSettings(single_opt=TEST_CL),
+    ColorLabelSettings(single_opt=TEST_CL, single_test=None, single_train=None)
+))
+@pytest.mark.parametrize('with_ensemble', (True, False))
+def test_extract_dicts(cl_settings, with_ensemble):
+    dummy_keys = [name for name in [
+        'single::train::dummy',
+        'single::opt::dummy',
+        'single::test::dummy',
+        'ensemble::train::dummy',
+        'ensemble::test::dummy'
+    ] if (
+        (with_ensemble or not name.startswith('ensemble'))
+        and getattr(cl_settings, "_".join(name.split('::')[:2])) is not None
+    )
+    ]
+
+    results = MagicMock()
+    results.data.keys = MagicMock(return_value=dummy_keys)
+    cd, ld = cl_settings.extract_dicts(results)
+    assert set(dummy_keys) == set(cd.keys())
+    assert set(dummy_keys) == set(ld.keys())
+
+    opt_key = 'single::opt::dummy'
+    assert TEST_CL == (cd[opt_key], ld[opt_key])
+
+
+@pytest.mark.parametrize('params', (
+    PlotSettingParams(show=True),
+    PlotSettingParams(show=False),
+    PlotSettingParams(show=True, figname='dummy')
+))
+def test_plt_show_in_set_plot_args(params):  # TODO
+    plt.show = MagicMock()
+    plt.savefig = MagicMock()
+    _, ax = plt.subplots(nrows=1, ncols=1)
+    viz = ResultsVisualizer()
+
+    viz._set_plot_args(ax, params)
+    # If figname is not None, show will not be called (due to the matplotlib design)
+    assert plt.show._mock_called == (params.figname is None and params.show)
+    plt.close()
+
+
+@pytest.mark.parametrize('params', (
+    PlotSettingParams(),
+    PlotSettingParams(figname='fig')
+))
+def test_plt_savefig_in_set_plot_args(params):  # TODO
+    plt.savefig = MagicMock()
+    _, ax = plt.subplots(nrows=1, ncols=1)
+    viz = ResultsVisualizer()
+
+    viz._set_plot_args(ax, params)
+    assert plt.savefig._mock_called == (params.figname is not None)
+    plt.close()
+
+
+@pytest.mark.parametrize('params', (
+    PlotSettingParams(grid=True),
+    PlotSettingParams(grid=False)
+))
+def test_ax_grid_in_set_plot_args(params):  # TODO
+    _, ax = plt.subplots(nrows=1, ncols=1)
+    ax.grid = MagicMock()
+    viz = ResultsVisualizer()
+
+    viz._set_plot_args(ax, params)
+    assert ax.grid._mock_called == params.grid
+    plt.close()
+
+
+@pytest.mark.parametrize('params', (
+    PlotSettingParams(xscale='none', yscale='none'),
+    PlotSettingParams(xscale='none', yscale='log'),
+    PlotSettingParams(xscale='log', yscale='none'),
+    PlotSettingParams(xscale='none', yscale='log')
+))
+def test_raise_value_error_in_set_plot_args(params):  # TODO
+    _, ax = plt.subplots(nrows=1, ncols=1)
+    viz = ResultsVisualizer()
+
+    with pytest.raises(ValueError):
+        viz._set_plot_args(ax, params)
+
+    plt.close()
+
+
+@pytest.mark.parametrize('params', (
+    PlotSettingParams(xlim=(-100, 100), ylim=(-200, 200)),
+    PlotSettingParams(xlabel='x label', ylabel='y label'),
+    PlotSettingParams(xscale='log', yscale='log'),
+    PlotSettingParams(legend=False, title='Title')
+))
+def test_set_plot_args(params):  # TODO
+    _, ax = plt.subplots(nrows=1, ncols=1)
+    viz = ResultsVisualizer()
+    viz._set_plot_args(ax, params)
+
+    if params.xlim is not None:
+        assert ax.get_xlim() == params.xlim
+    if params.ylim is not None:
+        assert ax.get_ylim() == params.ylim
+
+    assert ax.xaxis.get_label()._text == ('' if params.xlabel is None else params.xlabel)
+    assert ax.yaxis.get_label()._text == ('' if params.ylabel is None else params.ylabel)
+    assert ax.get_title() == ('' if params.title is None else params.title)
+    assert params.xscale == ax.get_xscale()
+    assert params.yscale == ax.get_yscale()
+
+    if params.legend:
+        assert ax.get_legend() is not None
+    else:
+        assert ax.get_legend() is None
+
+    plt.close()
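+
+# Editor's note: the tests above monkeypatch `plt.show` / `plt.savefig` with
+# MagicMock and then read the private `_mock_called` flag; an equivalent and
+# more conventional check would be `plt.show.called`.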
+
+
+@pytest.mark.parametrize('metric_name', ('unknown', 'accuracy'))
+def test_raise_error_in_plot_perf_over_time_in_base_task(metric_name):
+    BaseTask.__abstractmethods__ = set()
+    api = BaseTask()
+
+    if metric_name == 'unknown':
+        with pytest.raises(ValueError):
+            api.plot_perf_over_time(metric_name)
+    else:
+        with pytest.raises(RuntimeError):
+            api.plot_perf_over_time(metric_name)
+
+
+@pytest.mark.parametrize('metric_name', ('balanced_accuracy', 'accuracy'))
+def test_plot_perf_over_time(metric_name):  # TODO
+    dummy_history = [{'Timestamp': datetime(2022, 1, 1), 'train_accuracy': 1, 'test_accuracy': 1}]
+    BaseTask.__abstractmethods__ = set()
+    api = BaseTask()
+    run_history_data = json.load(open(os.path.join(os.path.dirname(__file__),
+                                                   'runhistory.json'),
+                                      mode='r'))['data']
+    api._results_manager.run_history = MagicMock()
+    api.run_history.empty = MagicMock(return_value=False)
+
+    # The run history contains 16 runs plus one interruption entry, i.e. 16 usable runs
+    api.run_history.data = make_dict_run_history_data(run_history_data)
+    api._results_manager.ensemble_performance_history = dummy_history
+    api._metric = accuracy
+    api.dataset_name = 'iris'
+    api._scoring_functions = [accuracy, balanced_accuracy]
+    api.search_space = MagicMock(spec=ConfigurationSpace)
+
+    api.plot_perf_over_time(metric_name=metric_name)
+    _, ax = plt.subplots(nrows=1, ncols=1)
+    api.plot_perf_over_time(metric_name=metric_name, ax=ax)
+
+    # Remove the ensemble keys if the metric name is not the one used for the opt score
+    ans = set([
+        name
+        for name in [f'single train {metric_name}',
+                     f'single test {metric_name}',
+                     f'single opt {metric_name}',
+                     f'ensemble train {metric_name}',
+                     f'ensemble test {metric_name}']
+        if metric_name == api._metric.name or not name.startswith('ensemble')
+    ])
+    legend_set = set([txt._text for txt in ax.get_legend().texts])
+    assert ans == legend_set
+    plt.close()
+
+
+@pytest.mark.parametrize('params', (
+    PlotSettingParams(xscale='none', yscale='none'),
+    PlotSettingParams(xscale='none', yscale='log'),
+    PlotSettingParams(xscale='log', yscale='none'),
+    PlotSettingParams(yscale='log')
+))
+def test_raise_error_get_perf_and_time(params):
+    results = np.linspace(-1, 1, 10)
+    cum_times = np.linspace(0, 1, 10)
+
+    with pytest.raises(ValueError):
+        _get_perf_and_time(
+            cum_results=results,
+            cum_times=cum_times,
+            plot_setting_params=params,
+            worst_val=np.inf
+        )
+
+
+@pytest.mark.parametrize('params', (
+    PlotSettingParams(n_points=20, xscale='linear', yscale='linear'),
+    PlotSettingParams(n_points=20, xscale='log', yscale='log')
+))
+def test_get_perf_and_time(params):
+    y_min, y_max = 1e-5, 1
+    results = np.linspace(y_min, y_max, 10)
+    cum_times = np.linspace(y_min, y_max, 10)
+
+    check_points, perf_by_time_step = _get_perf_and_time(
+        cum_results=results,
+        cum_times=cum_times,
+        plot_setting_params=params,
+        worst_val=np.inf
+    )
+
+    times_ans = np.linspace(
+        y_min if params.xscale == 'linear' else np.log(y_min),
+        y_max if params.xscale == 'linear' else np.log(y_max),
+        params.n_points
+    )
+    times_ans = times_ans if params.xscale == 'linear' else np.exp(times_ans)
+    assert np.allclose(check_points, times_ans)
+
+    if params.xscale == 'linear':
+        """
+        The check points, i.e. the time steps at which results are sampled:
+        [1.00000000e-05, 5.26410526e-02, 1.05272105e-01, 1.57903158e-01,
+         2.10534211e-01, 2.63165263e-01, 3.15796316e-01, 3.68427368e-01,
+         4.21058421e-01, 4.73689474e-01, 5.26320526e-01, 5.78951579e-01,
+         6.31582632e-01, 6.84213684e-01, 7.36844737e-01, 7.89475789e-01,
+         8.42106842e-01, 8.94737895e-01, 9.47368947e-01, 1.00000000e+00]
+
+        The time steps at which each result was recorded:
+        [
+            1.0000e-05, # cover index 0 ~ 2
+            1.1112e-01, # cover index 3, 4
+            2.2223e-01, # cover index 5, 6
+            3.3334e-01, # cover index 7, 8
+            4.4445e-01, # cover index 9, 10
+            5.5556e-01, # cover index 11, 12
+            6.6667e-01, # cover index 13, 14
+            7.7778e-01, # cover index 15, 16
+            8.8889e-01, # cover index 17, 18
+            1.0000e+00  # cover index 19
+        ]
+        Since the sequence is monotonically increasing,
+        if multiple elements cover the same index, take the best.
+        """
+        results_ans = [r for r in results]
+        results_ans = [results[0]] + results_ans + results_ans[:-1]
+        results_ans = np.sort(results_ans)
+    else:
+        """
+        The check points, i.e. the time steps at which results are sampled:
+        [1.00000000e-05, 1.83298071e-05, 3.35981829e-05, 6.15848211e-05,
+         1.12883789e-04, 2.06913808e-04, 3.79269019e-04, 6.95192796e-04,
+         1.27427499e-03, 2.33572147e-03, 4.28133240e-03, 7.84759970e-03,
+         1.43844989e-02, 2.63665090e-02, 4.83293024e-02, 8.85866790e-02,
+         1.62377674e-01, 2.97635144e-01, 5.45559478e-01, 1.00000000e+00]
+
+        The time steps at which each result was recorded:
+        [
+            1.0000e-05, # cover index 0 ~ 15
+            1.1112e-01, # cover index 16
+            2.2223e-01, # cover index 17
+            3.3334e-01, # cover index 18
+            4.4445e-01, # cover index 18
+            5.5556e-01, # cover index 19
+            6.6667e-01, # cover index 19
+            7.7778e-01, # cover index 19
+            8.8889e-01, # cover index 19
+            1.0000e+00  # cover index 19
+        ]
+        Since the sequence is monotonically increasing,
+        if multiple elements cover the same index, take the best.
+        """
+        results_ans = [
+            *([results[0]] * 16),
+            results[1],
+            results[2],
+            results[4],
+            results[-1]
+        ]
+
+    assert np.allclose(perf_by_time_step, results_ans)