diff --git a/.github/workflows/flake8.yml b/.github/workflows/flake8.yml deleted file mode 100644 index 36cda1470e..0000000000 --- a/.github/workflows/flake8.yml +++ /dev/null @@ -1,24 +0,0 @@ -name: Flake8 & Docformat - -on: [push, pull_request] - -jobs: - flake8: - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v3 - - name: Set up Python 3.8 - uses: actions/setup-python@v4 - with: - python-version: '3.8' - - name: Install dependencies - run: | - pip install ruff - pip install docformatter[tomli]==1.5.0 - - name: Ruff (Flake8) - run: | - ruff src - - name: DocFormatter - run: | - docformatter -r src/gluonts diff --git a/.github/workflows/lints.yml b/.github/workflows/lints.yml new file mode 100644 index 0000000000..0931abe329 --- /dev/null +++ b/.github/workflows/lints.yml @@ -0,0 +1,23 @@ +name: Ruff & Docformat + +on: [push, pull_request] + +jobs: + lint: + runs-on: ubuntu-latest + strategy: + matrix: + check: ["ruff", "docformatter"] + + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + - name: Install tools + run: pip install "ruff==0.2.2" "docformatter[tomli]==1.5.0" + - name: Ruff (Flake8) + if: matrix.check == 'ruff' + working-directory: src/ + run: ruff check . + - name: Docformatter + if: matrix.check == 'docformatter' + run: docformatter --check -r src/ diff --git a/Justfile b/Justfile index db6f48cedb..986927c91c 100644 --- a/Justfile +++ b/Justfile @@ -34,7 +34,7 @@ release: python setup.py sdist black: - black --check --color src test examples + black --check --diff --color src test examples mypy: python setup.py type_check diff --git a/examples/persist_model.py b/examples/persist_model.py index 149550f48f..de07e3edcc 100644 --- a/examples/persist_model.py +++ b/examples/persist_model.py @@ -12,7 +12,7 @@ # permissions and limitations under the License. """ -This example shows how to serialize and deserialize a model +This example shows how to serialize and deserialize a model. """ import os import pprint diff --git a/pyproject.toml b/pyproject.toml index 0e409fddbe..96524e4daa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,17 +22,15 @@ filterwarnings = "ignore" line-length = 79 lint.ignore = [ - # line-length is handled by black - "E501", - - # TODO: remove usage of `l` - "E741" + "E501", # line-length is handled by black + "E741" # TODO: remove usage of `l` ] exclude = ["src/gluonts/nursery"] [tool.docformatter] +black = true pre-summary-newline = true make-summary-multi-line = true wrap-descriptions = 79 diff --git a/setup.py b/setup.py index 8673c79c8b..eb628f654f 100644 --- a/setup.py +++ b/setup.py @@ -41,7 +41,9 @@ def get_version_cmdclass(version_file) -> dict: class TypeCheckCommand(distutils.cmd.Command): - """A custom command to run MyPy on the project sources.""" + """ + A custom command to run MyPy on the project sources. + """ description = "run MyPy on Python source files" user_options = [] diff --git a/src/gluonts/dataset/artificial/recipe.py b/src/gluonts/dataset/artificial/recipe.py index 36cdbede42..5a35113dab 100644 --- a/src/gluonts/dataset/artificial/recipe.py +++ b/src/gluonts/dataset/artificial/recipe.py @@ -1063,10 +1063,10 @@ def normalized_ar1(tau, x0=None, norm="minmax", sigma=1.0): r""" Returns an ar1 process with an auto correlation time of tau. 
- norm can be - None -> no normalization - 'minmax' -> min_max_scaled - 'standard' -> 0 mean, unit variance + norm can be: + - None -> no normalization + - 'minmax' -> min_max_scaled + - 'standard' -> 0 mean, unit variance """ assert norm in [None, "minmax", "standard"] phi = lifted_numpy.exp(-1.0 / tau) diff --git a/src/gluonts/dataset/common.py b/src/gluonts/dataset/common.py index 8bde27dd2d..6aa530a9c0 100644 --- a/src/gluonts/dataset/common.py +++ b/src/gluonts/dataset/common.py @@ -128,10 +128,11 @@ def infer_file_type(path): def _rglob(path: Path, pattern="*", levels=1): - """Like ``path.rglob(pattern)`` except this limits the number of sub - directories that are traversed. ``levels = 0`` is thus the same as - ``path.glob(pattern)``. + """ + Like ``path.rglob(pattern)`` except this limits the number of sub + directories that are traversed. + ``levels = 0`` is thus the same as ``path.glob(pattern)``. """ if levels is not None: levels -= 1 diff --git a/src/gluonts/dataset/jsonl.py b/src/gluonts/dataset/jsonl.py index 310e68cd95..30ed1c4090 100644 --- a/src/gluonts/dataset/jsonl.py +++ b/src/gluonts/dataset/jsonl.py @@ -146,6 +146,7 @@ def __len__(self): def _line_starts(self): """ Calculate the position for each line in the file. + This information can be used with ``file.seek`` to directly jump to a specific line in the file. """ diff --git a/src/gluonts/dataset/multivariate_grouper.py b/src/gluonts/dataset/multivariate_grouper.py index ddc4909f62..a16207299c 100644 --- a/src/gluonts/dataset/multivariate_grouper.py +++ b/src/gluonts/dataset/multivariate_grouper.py @@ -92,8 +92,9 @@ def _preprocess(self, dataset: Dataset) -> None: The preprocess function iterates over the dataset to gather data that is necessary for alignment. - This includes 1) Storing first/last timestamp in the dataset 2) - Storing the frequency of the dataset + This includes: + 1. Storing first/last timestamp in the dataset + 2. Storing the frequency of the dataset """ for data in dataset: timestamp = data[FieldName.START] diff --git a/src/gluonts/dataset/schema/translate.py b/src/gluonts/dataset/schema/translate.py index 5ea7c41955..f9d4be95dd 100644 --- a/src/gluonts/dataset/schema/translate.py +++ b/src/gluonts/dataset/schema/translate.py @@ -44,7 +44,9 @@ def fields(self): @dataclass class Get(Op): - """Extracts the field ``name`` from the input.""" + """ + Extracts the field ``name`` from the input. + """ name: str @@ -69,7 +71,9 @@ def fields(self): @dataclass class GetAttr(Op): - """Invokes ``obj.name``""" + """ + Invokes ``obj.name``. + """ obj: Op name: str @@ -298,7 +302,8 @@ def parse(x: Union[str, list]) -> Op: @dataclass class Translator: - """Simple translation for GluonTS Datasets. + """ + Simple translation for GluonTS Datasets. A given translator transforms an input dictionary (data-entry) into an output dictionary. diff --git a/src/gluonts/dataset/split.py b/src/gluonts/dataset/split.py index 07a8a769dc..22d9e1393d 100644 --- a/src/gluonts/dataset/split.py +++ b/src/gluonts/dataset/split.py @@ -86,8 +86,8 @@ def periods_between( end: pd.Period, ) -> int: """ - Count how many periods fit between ``start`` and ``end`` - (inclusive). The frequency is taken from ``start``. + Count how many periods fit between ``start`` and ``end`` (inclusive). The + frequency is taken from ``start``. 
For example: diff --git a/src/gluonts/ev/aggregations.py b/src/gluonts/ev/aggregations.py index 634175ad71..6b9232eee5 100644 --- a/src/gluonts/ev/aggregations.py +++ b/src/gluonts/ev/aggregations.py @@ -34,7 +34,8 @@ def get(self) -> np.ndarray: @dataclass class Sum(Aggregation): - """Map-reduce way of calculating the sum of a stream of values. + """ + Map-reduce way of calculating the sum of a stream of values. `partial_result` represents one of two things, depending on the axis: Case 1 - axis 0 is aggregated (axis is None or 0): @@ -75,7 +76,8 @@ def get(self) -> np.ndarray: @dataclass class Mean(Aggregation): - """Map-reduce way of calculating the mean of a stream of values. + """ + Map-reduce way of calculating the mean of a stream of values. `partial_result` represents one of two things, depending on the axis: Case 1 - axis 0 is aggregated (axis is None or 0): diff --git a/src/gluonts/ev/metrics.py b/src/gluonts/ev/metrics.py index 4d3e55b335..c14189eae2 100644 --- a/src/gluonts/ev/metrics.py +++ b/src/gluonts/ev/metrics.py @@ -50,7 +50,9 @@ class MetricCollection: metrics: List[Metric] def update(self, data: Mapping[str, np.ndarray]) -> Self: - """Update metrics using a single data instance.""" + """ + Update metrics using a single data instance. + """ for metric in self.metrics: metric.update(data) @@ -58,7 +60,9 @@ def update(self, data: Mapping[str, np.ndarray]) -> Self: return self def update_all(self, stream: Iterator[Mapping[str, np.ndarray]]) -> Self: - """Update metrics using a stream of data instances.""" + """ + Update metrics using a stream of data instances. + """ for element in stream: self.update(element) @@ -74,12 +78,16 @@ class Metric: name: str def update(self, data: Mapping[str, np.ndarray]) -> Self: - """Update metric using a single data instance.""" + """ + Update metric using a single data instance. + """ raise NotImplementedError def update_all(self, stream: Iterator[Mapping[str, np.ndarray]]) -> Self: - """Update metric using a stream of data instances.""" + """ + Update metric using a stream of data instances. + """ for element in stream: self.update(element) @@ -92,7 +100,9 @@ def get(self) -> np.ndarray: @dataclass class DirectMetric(Metric): - """A Metric which uses a single function and aggregation strategy.""" + """ + A Metric which uses a single function and aggregation strategy. + """ stat: Callable aggregate: Aggregation @@ -108,10 +118,11 @@ def get(self) -> np.ndarray: @dataclass class DerivedMetric(Metric): - """A Metric that is computed using other metrics. + """ + A Metric that is computed using other metrics. - A derived metric updates multiple, simpler metrics independently and in - the end combines their results as defined in `post_process`. + A derived metric updates multiple, simpler metrics independently and in the + end combines their results as defined in `post_process`. """ metrics: Dict[str, Metric] @@ -237,7 +248,9 @@ def __call__(self, axis: Optional[int] = None) -> DirectMetric: @dataclass class MAE(BaseMetricDefinition): - """Mean Absolute Error""" + """ + Mean Absolute Error. + """ forecast_type: str = "0.5" @@ -254,7 +267,9 @@ def __call__(self, axis: Optional[int] = None) -> DirectMetric: @dataclass class MSE(BaseMetricDefinition): - """Mean Squared Error""" + """ + Mean Squared Error. + """ forecast_type: str = "mean" @@ -295,7 +310,9 @@ def __call__(self, axis: Optional[int] = None) -> DirectMetric: @dataclass class MAPE(BaseMetricDefinition): - """Mean Absolute Percentage Error""" + """ + Mean Absolute Percentage Error. 
+ """ forecast_type: str = "0.5" @@ -314,7 +331,9 @@ def __call__(self, axis: Optional[int] = None) -> DirectMetric: @dataclass class SMAPE(BaseMetricDefinition): - """Symmetric Mean Absolute Percentage Error""" + """ + Symmetric Mean Absolute Percentage Error. + """ forecast_type: str = "0.5" @@ -334,7 +353,9 @@ def __call__(self, axis: Optional[int] = None) -> DirectMetric: @dataclass class MSIS(BaseMetricDefinition): - """Mean Scaled Interval Score""" + """ + Mean Scaled Interval Score. + """ alpha: float = 0.05 @@ -351,7 +372,9 @@ def __call__(self, axis: Optional[int] = None) -> DirectMetric: @dataclass class MASE(BaseMetricDefinition): - """Mean Absolute Scaled Error""" + """ + Mean Absolute Scaled Error. + """ forecast_type: str = "0.5" @@ -382,7 +405,9 @@ def __call__(self, axis: Optional[int] = None) -> DirectMetric: @dataclass class ND(BaseMetricDefinition): - """Normalized Deviation""" + """ + Normalized Deviation. + """ forecast_type: str = "0.5" @@ -410,7 +435,9 @@ def __call__(self, axis: Optional[int] = None) -> DerivedMetric: @dataclass class RMSE(BaseMetricDefinition): - """Root Mean Squared Error""" + """ + Root Mean Squared Error. + """ forecast_type: str = "mean" @@ -435,7 +462,9 @@ def __call__(self, axis: Optional[int] = None) -> DerivedMetric: @dataclass class NRMSE(BaseMetricDefinition): - """RMSE, normalized by the mean absolute label""" + """ + RMSE, normalized by the mean absolute label. + """ forecast_type: str = "mean" @@ -582,7 +611,9 @@ def __call__(self, axis: Optional[int] = None) -> DerivedMetric: @dataclass class OWA(BaseMetricDefinition): - """Overall Weighted Average""" + """ + Overall Weighted Average. + """ forecast_type: str = "0.5" diff --git a/src/gluonts/ev/ts_stats.py b/src/gluonts/ev/ts_stats.py index 6851b5651e..dea6557c8e 100644 --- a/src/gluonts/ev/ts_stats.py +++ b/src/gluonts/ev/ts_stats.py @@ -17,9 +17,11 @@ def seasonal_error( time_series: np.ndarray, seasonality: int, time_axis=0 ) -> np.ndarray: - """The mean abs. difference of a time series, shifted by its seasonality. + """ + The mean abs. difference of a time series, shifted by its seasonality. - Some metrics use the seasonal error for normalization.""" + Some metrics use the seasonal error for normalization. + """ time_length = time_series.shape[time_axis] diff --git a/src/gluonts/evaluation/_base.py b/src/gluonts/evaluation/_base.py index b623bf3d75..5f624522af 100644 --- a/src/gluonts/evaluation/_base.py +++ b/src/gluonts/evaluation/_base.py @@ -108,8 +108,9 @@ def aggregate_valid( def validate_forecast( forecast: Forecast, quantiles: Iterable[Quantile] ) -> bool: - """Validates a Forecast object by checking it for `NaN` values. - The supplied quantiles and mean (if available) are checked. + """ + Validates a Forecast object by checking it for `NaN` values. The supplied + quantiles and mean (if available) are checked. Parameters ---------- @@ -767,7 +768,8 @@ def __call__( fcst_iterator: Iterable[Forecast], num_series=None, ) -> Tuple[Dict[str, float], pd.DataFrame]: - """Compute accuracy metrics for multivariate forecasts. + """ + Compute accuracy metrics for multivariate forecasts. Parameters ---------- diff --git a/src/gluonts/evaluation/metrics.py b/src/gluonts/evaluation/metrics.py index 41a2c97bea..342602468e 100644 --- a/src/gluonts/evaluation/metrics.py +++ b/src/gluonts/evaluation/metrics.py @@ -65,8 +65,9 @@ def mse(target: np.ndarray, forecast: np.ndarray) -> float: def abs_error(target: np.ndarray, forecast: np.ndarray) -> float: r""" - .. math:: + Absolute error. + .. 
math:: abs\_error = sum(|Y - \hat{Y}|) """ return np.sum(np.abs(target - forecast)) @@ -74,8 +75,9 @@ def quantile_loss(target: np.ndarray, forecast: np.ndarray, q: float) -> float: r""" - .. math:: + Quantile loss. + .. math:: quantile\_loss = 2 * sum(|(Y - \hat{Y}) * (Y <= \hat{Y}) - q|) """ return 2 * np.sum(np.abs((forecast - target) * ((target <= forecast) - q))) @@ -83,8 +85,9 @@ def coverage(target: np.ndarray, forecast: np.ndarray) -> float: r""" - .. math:: + Coverage. + .. math:: coverage = mean(Y <= \hat{Y}) """ return float(np.mean(target <= forecast)) @@ -156,8 +159,9 @@ def msis( def abs_target_sum(target) -> float: r""" - .. math:: + Absolute target sum. + .. math:: abs\_target\_sum = sum(|Y|) """ return np.sum(np.abs(target)) @@ -165,8 +169,9 @@ def abs_target_mean(target) -> float: r""" - .. math:: + Absolute target mean. + .. math:: abs\_target\_mean = mean(|Y|) """ return np.mean(np.abs(target)) @@ -174,7 +179,7 @@ def num_masked_values(target) -> float: """ - Count number of masked values in target + Count number of masked values in target. """ if np.ma.isMaskedArray(target): return np.ma.count_masked(target) diff --git a/src/gluonts/ext/hierarchicalforecast.py b/src/gluonts/ext/hierarchicalforecast.py index 44ed9d7b54..e05b3ec0dc 100644 --- a/src/gluonts/ext/hierarchicalforecast.py +++ b/src/gluonts/ext/hierarchicalforecast.py @@ -44,9 +44,8 @@ def get_formatted_S( ts_names: List[str], ) -> pd.DataFrame: """ - We format the summation matrix S as a dataframe, - where the index and columns have the - corresponding time series names. + We format the summation matrix S as a dataframe, where the index and + columns have the corresponding time series names. """ S = np.array(_S) @@ -60,12 +59,13 @@ def format_data_entry(entry: DataEntry, S: pd.DataFrame) -> pd.DataFrame: """ Format data entry as required by hierarchicalforecast. - ``entry`` is a dictionary with keys: ``"start"``, ``"item_id"``, ``"target"``. - ``entry["target"]`` is a ``np.ndarray`` with shape ``(num_ts, num_timestamps)``, - and each row corresponds to one time series of the hierarchy. - The goal is to reshape this DataEntry as a dataframe where: - 1) the index corresponds to the name of the time series, - 2) the columns ``"ds"`` and ``"y"`` correspond to timestamps and actuals, respectively. + ``entry`` is a dictionary with keys: ``"start"``, ``"item_id"``, + ``"target"``. ``entry["target"]`` is a ``np.ndarray`` with shape ``(num_ts, + num_timestamps)``, and each row corresponds to one time series of the + hierarchy. The goal is to reshape this DataEntry as a dataframe where: + + 1) the index corresponds to the name of the time series, + 2) the columns ``"ds"`` and ``"y"`` correspond to timestamps and actuals, respectively. """ df = pd.DataFrame(entry["target"]).T @@ -86,13 +86,12 @@ def unpivot(df: pd.DataFrame) -> pd.DataFrame: """ Unpivot data frame. - The input dataframe has as index the time stamps, - and one column per each time series of the hierarchy. - We unpivot this so that the final dataframe has - three columns, i.e. ``"unique_id"``, ``"ds"``, and ``"y"``, where - 1) ``"unique_id"`` has the name of the corresponding time series, - 2) ``"ds"`` has the corresponding time stamps, - 3) ``"y"`` has the actuals.
+ The input dataframe has as index the time stamps, and one column per each + time series of the hierarchy. We unpivot this so that the final dataframe + has three columns, i.e. ``"unique_id"``, ``"ds"``, and ``"y"``, where: + + 1) ``"unique_id"`` has the name of the corresponding time series, + 2) ``"ds"`` has the corresponding time stamps, + 3) ``"y"`` has the actuals. """ n, k = df.shape diff --git a/src/gluonts/ext/naive_2/_predictor.py b/src/gluonts/ext/naive_2/_predictor.py index ca4937b091..7bc58f42d3 100644 --- a/src/gluonts/ext/naive_2/_predictor.py +++ b/src/gluonts/ext/naive_2/_predictor.py @@ -107,8 +107,9 @@ def naive_2( class Naive2Predictor(RepresentablePredictor): """ - Naïve 2 forecaster as described in the M4 Competition Guide: - http://www.unic.ac.cy/test/wp-content/uploads/sites/2/2018/09/M4-Competitors-Guide.pdf + Naïve 2 forecaster as described in the M4 Competition Guide. + + See: http://www.unic.ac.cy/test/wp-content/uploads/sites/2/2018/09/M4-Competitors-Guide.pdf The Python analogue implementation to: https://github.com/Mcompetitions/M4-methods/blob/master/Benchmarks%20and%20Evaluation.R#L118 diff --git a/src/gluonts/ext/r_forecast/_hierarchical_predictor.py b/src/gluonts/ext/r_forecast/_hierarchical_predictor.py index 63a0374443..f42968afb0 100644 --- a/src/gluonts/ext/r_forecast/_hierarchical_predictor.py +++ b/src/gluonts/ext/r_forecast/_hierarchical_predictor.py @@ -40,9 +40,8 @@ class RHierarchicalForecastPredictor(RBasePredictor): - """ - Wrapper for calling the `R hts package - `_. + r""" + Wrapper for calling the `R hts package `_. In order to use it you need to install R and rpy2. You also need the R `hts` package which can be installed by running: diff --git a/src/gluonts/ext/r_forecast/_predictor.py b/src/gluonts/ext/r_forecast/_predictor.py index 09b93417d1..8d144676e5 100644 --- a/src/gluonts/ext/r_forecast/_predictor.py +++ b/src/gluonts/ext/r_forecast/_predictor.py @@ -57,8 +57,8 @@ class RBasePredictor(RepresentablePredictor): """ - The `RBasePredictor` is a thin wrapper for calling R packages. - In order to use it you need to install R and rpy2. + The `RBasePredictor` is a thin wrapper for calling R packages. In order to + use it you need to install R and rpy2. Note that specific R packages need to be installed, depending on which wrapper one needs to run. @@ -140,7 +140,6 @@ def _get_r_forecast(self, data: Dict) -> Dict: ------- Dictionary Forecasts saved in a dictionary. - """ raise NotImplementedError() @@ -156,7 +155,6 @@ def _run_r_forecast(self, data: Dict) -> Tuple[Dict, List]: Returns ------- Tuple[Dict, List]: - """ buf = [] @@ -184,8 +182,8 @@ def dont_save(x): def _preprocess_data(self, data: Dict) -> Dict: """ - Preprocessing of target time series, e.g., truncating length or - slicing bottom time series in case of hierarchical forecasting etc. + Preprocessing of target time series, e.g., truncating length or slicing + bottom time series in case of hierarchical forecasting etc. Parameters ---------- @@ -195,7 +193,6 @@ Returns ------- Dict - """ raise NotImplementedError() @@ -203,10 +200,6 @@ def _warning_message(self) -> None: """ Prints warning messages (once per whole dataset), e.g., if default parameters are overridden. - - Returns - ------- - """ return @@ -235,7 +228,6 @@ def _forecast_dict_to_obj( ------- Forecast Sample based or quantile based forecasts.
- """ raise NotImplementedError() diff --git a/src/gluonts/ext/r_forecast/_univariate_predictor.py b/src/gluonts/ext/r_forecast/_univariate_predictor.py index d437017592..aaf259e326 100644 --- a/src/gluonts/ext/r_forecast/_univariate_predictor.py +++ b/src/gluonts/ext/r_forecast/_univariate_predictor.py @@ -43,7 +43,8 @@ class RForecastPredictor(RBasePredictor): """ - Wrapper for calling the `R forecast package + Wrapper for calling the `R forecast package. + `_. In order to use it you need to install R and rpy2. You also need the R `forecast` package which diff --git a/src/gluonts/ext/r_forecast/util.py b/src/gluonts/ext/r_forecast/util.py index 0679685633..f972771a27 100644 --- a/src/gluonts/ext/r_forecast/util.py +++ b/src/gluonts/ext/r_forecast/util.py @@ -31,7 +31,9 @@ def unlist(l): def interval_to_quantile_level(interval_level: int, side: str) -> float: - """Convert a prediction interval level (upper or lower) into a quantile level.""" + """ + Convert a prediction interval level (upper or lower) into a quantile level. + """ if side == "upper": level = 50 + interval_level / 2 elif side == "lower": @@ -42,6 +44,8 @@ def interval_to_quantile_level(interval_level: int, side: str) -> float: def quantile_to_interval_level(quantile_level: float) -> Tuple[int, str]: - """Convert a quantile level into a prediction interval level (upper or lower).""" + """ + Convert a quantile level into a prediction interval level (upper or lower). + """ side = "upper" if quantile_level >= 0.5 else "lower" return round(200 * abs(0.5 - quantile_level)), side diff --git a/src/gluonts/ext/rotbaum/_predictor.py b/src/gluonts/ext/rotbaum/_predictor.py index 6631e8dde0..bab40f11fd 100644 --- a/src/gluonts/ext/rotbaum/_predictor.py +++ b/src/gluonts/ext/rotbaum/_predictor.py @@ -348,9 +348,10 @@ def predict( # type: ignore def serialize(self, path: Path) -> None: """ - This function calls parent class serialize() in order to serialize - the class name, version information and constuctor arguments. It - persists the tree predictor by pickling the model list that is + This function calls parent class serialize() in order to serialize the + class name, version information and constructor arguments. + + It persists the tree predictor by pickling the model list that is generated when pickling the TreePredictor. """ super().serialize(path) @@ -360,9 +361,10 @@ def serialize(self, path: Path) -> None: @classmethod def deserialize(cls, path: Path, **kwargs: Any) -> "TreePredictor": """ - This function loads and returns the serialized model. It loads - the predictor class with the serialized arguments. It then loads - the trained model list by reading the pickle file. + This function loads and returns the serialized model. + + It loads the predictor class with the serialized arguments. It then + loads the trained model list by reading the pickle file. """ predictor = super().deserialize(path) @@ -375,13 +377,13 @@ def explain( self, importance_type: str = "gain", percentage: bool = True ) -> ExplanationResult: """ - This function only works for self.method == "QuantileRegression", - and uses lightgbm's feature importance functionality. 
It takes the - mean feature importance across quantiles and timestamps in the - forecast horizon; and then adds these mean values across all of the - feature coordinates that are associated to "target", - "feat_static_real", "feat_static_cat", "past_feat_dynamic_real", - "feat_dynamic_real", "feat_dynamic_cat" + This function only works for ``self.method == "QuantileRegression"``, + and uses lightgbm's feature importance functionality. It takes the mean + feature importance across quantiles and timestamps in the forecast + horizon; and then adds these mean values across all of the feature + coordinates that are associated with "target", "feat_static_real", + "feat_static_cat", "past_feat_dynamic_real", "feat_dynamic_real", + "feat_dynamic_cat". Parameters ---------- diff --git a/src/gluonts/ext/rotbaum/_types.py b/src/gluonts/ext/rotbaum/_types.py index d8dbf1d019..98f7b6352e 100644 --- a/src/gluonts/ext/rotbaum/_types.py +++ b/src/gluonts/ext/rotbaum/_types.py @@ -29,10 +29,10 @@ class FeatureImportanceResult(BaseModel): @root_validator() def check_shape(cls, values): """ - Validate the second dimension is the same for 2d results and all fields share the same dimensionality - For example, time aligned results with dimension of (features, pred_length), the pred_length shall be the same - :param values: - :return: + Validate the second dimension is the same for 2d results and all fields + share the same dimensionality. For example, time aligned results with + dimension of (features, pred_length), the pred_length shall be the + same. """ dim = np.array(values.get("target")).ndim assert ( diff --git a/src/gluonts/ext/statsforecast.py b/src/gluonts/ext/statsforecast.py index f9f6502ddc..b52bc05f09 100644 --- a/src/gluonts/ext/statsforecast.py +++ b/src/gluonts/ext/statsforecast.py @@ -254,7 +254,8 @@ class IMAPAPredictor(StatsForecastPredictor): class DynamicOptimizedThetaPredictor(StatsForecastPredictor): """ - A predictor wrapping the ``DynamicOptimizedTheta`` model from `statsforecast`_. + A predictor wrapping the ``DynamicOptimizedTheta`` model from + `statsforecast`_. See :class:`StatsForecastPredictor` for the list of arguments. @@ -350,7 +351,8 @@ class OptimizedThetaPredictor(StatsForecastPredictor): class RandomWalkWithDriftPredictor(StatsForecastPredictor): """ - A predictor wrapping the ``RandomWalkWithDrift`` model from `statsforecast`_. + A predictor wrapping the ``RandomWalkWithDrift`` model from + `statsforecast`_. See :class:`StatsForecastPredictor` for the list of arguments. @@ -362,7 +364,8 @@ class SeasonalExponentialSmoothingPredictor(StatsForecastPredictor): """ - A predictor wrapping the ``SeasonalExponentialSmoothing`` model from `statsforecast`_. + A predictor wrapping the ``SeasonalExponentialSmoothing`` model from + `statsforecast`_. See :class:`StatsForecastPredictor` for the list of arguments. @@ -374,7 +377,8 @@ class SeasonalExponentialSmoothingOptimizedPredictor(StatsForecastPredictor): """ - A predictor wrapping the ``SeasonalExponentialSmoothingOptimized`` model from `statsforecast`_. + A predictor wrapping the ``SeasonalExponentialSmoothingOptimized`` model + from `statsforecast`_. See :class:`StatsForecastPredictor` for the list of arguments.
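The interval/quantile conversion helpers touched in ``gluonts/ext/r_forecast/util.py`` above are small enough to check by hand. A minimal standalone sketch — the two branches and the second function are taken from the hunks shown earlier, while the error handling and the final division by 100 are assumptions based on the ``float`` return type (the hunk elides the function tail):

```python
from typing import Tuple


def interval_to_quantile_level(interval_level: int, side: str) -> float:
    # Branches as shown in the diff; the else-branch and the /100
    # normalization are assumed, not visible in the hunk.
    if side == "upper":
        level = 50 + interval_level / 2
    elif side == "lower":
        level = 50 - interval_level / 2
    else:
        raise ValueError(f"unknown side: {side}")
    return level / 100


def quantile_to_interval_level(quantile_level: float) -> Tuple[int, str]:
    # Taken verbatim from the hunk above.
    side = "upper" if quantile_level >= 0.5 else "lower"
    return round(200 * abs(0.5 - quantile_level)), side


# A 90% prediction interval has its bounds at the 5% and 95% quantiles:
assert interval_to_quantile_level(90, "upper") == 0.95
assert interval_to_quantile_level(90, "lower") == 0.05
assert quantile_to_interval_level(0.95) == (90, "upper")
assert quantile_to_interval_level(0.05) == (90, "lower")
```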
@@ -398,7 +402,8 @@ class SeasonalNaivePredictor(StatsForecastPredictor): class SeasonalWindowAveragePredictor(StatsForecastPredictor): """ - A predictor wrapping the ``SeasonalWindowAverage`` model from `statsforecast`_. + A predictor wrapping the ``SeasonalWindowAverage`` model from + `statsforecast`_. See :class:`StatsForecastPredictor` for the list of arguments. @@ -410,7 +415,8 @@ class SeasonalWindowAveragePredictor(StatsForecastPredictor): class SimpleExponentialSmoothingPredictor(StatsForecastPredictor): """ - A predictor wrapping the ``SimpleExponentialSmoothing`` model from `statsforecast`_. + A predictor wrapping the ``SimpleExponentialSmoothing`` model from + `statsforecast`_. See :class:`StatsForecastPredictor` for the list of arguments. @@ -422,7 +428,8 @@ class SimpleExponentialSmoothingPredictor(StatsForecastPredictor): class SimpleExponentialSmoothingOptimizedPredictor(StatsForecastPredictor): """ - A predictor wrapping the ``SimpleExponentialSmoothingOptimized`` model from `statsforecast`_. + A predictor wrapping the ``SimpleExponentialSmoothingOptimized`` model from + `statsforecast`_. See :class:`StatsForecastPredictor` for the list of arguments. diff --git a/src/gluonts/itertools.py b/src/gluonts/itertools.py index 198559d6f6..db9587c735 100644 --- a/src/gluonts/itertools.py +++ b/src/gluonts/itertools.py @@ -105,7 +105,6 @@ def stream(self): >>> s = Cyclic([1, 2, 3, 4]).stream() >>> assert list(take(5, s)) == [1, 2, 3, 4, 1] >>> assert list(take(5, s)) == [2, 3, 4, 1, 2] - """ return iter(self) @@ -160,7 +159,8 @@ class _SubIndex(NamedTuple): @dataclass class Fuse: - """Fuse collections together to act as single collections. + """ + Fuse collections together to act as single collections. >>> a = [0, 1, 2] >>> b = [3, 4, 5] @@ -209,8 +209,9 @@ def _get_range(self, start: _SubIndex, stop: _SubIndex) -> "Fuse": return Fuse(items) def _location_for(self, idx, side="right") -> _SubIndex: - """Map global index to pair of index to collection and index within - that collection. + """ + Map global index to pair of index to collection and index within that + collection. >>> fuse = Fuse([[0, 0], [1, 1]]) >>> fuse._location_for(0) @@ -221,7 +222,6 @@ def _location_for(self, idx, side="right") -> _SubIndex: _SubIndex(item=1, local=0) >>> fuse._location_for(3) _SubIndex(item=1, local=1) - """ if idx == 0 or not self: return _SubIndex(0, 0) @@ -262,7 +262,8 @@ def __repr__(self): def split(xs: Sequence, indices: List[int]) -> List[Sequence]: - """Split ``xs`` into subsets given ``indices``. + """ + Split ``xs`` into subsets given ``indices``. >>> split("abcdef", [1, 3]) ['a', 'bc', 'def'] @@ -281,13 +282,13 @@ def split(xs: Sequence, indices: List[int]) -> List[Sequence]: def split_into(xs: Sequence, n: int) -> Sequence: - """Split ``xs`` into ``n`` parts of similar size. + """ + Split ``xs`` into ``n`` parts of similar size. >>> split_into("abcd", 2) ['ab', 'cd'] >>> split_into("abcd", 3) ['ab', 'c', 'd'] - """ bucket_size, remainder = divmod(len(xs), n) @@ -336,8 +337,9 @@ def __len__(self) -> int: @dataclass class PickleCached: - """A caching wrapper for ``iterable`` using ``pickle`` to store cached - values on disk. + """ + A caching wrapper for ``iterable`` using ``pickle`` to store cached values + on disk. See :class:`Cached` for more information. 
""" @@ -659,7 +661,9 @@ def inverse(dct: Dict[K, V]) -> Dict[V, K]: @curry def pluck_attr(seq, name, default=_no_default): - """Get attribute ``name`` from elements in ``seq``.""" + """ + Get attribute ``name`` from elements in ``seq``. + """ if default is _no_default: return [getattr(el, name) for el in seq] @@ -696,7 +700,6 @@ def join_items(left, right, how="outer", default=None): * ``right``: use only keys from ``right`` If a key is not present in either input, ``default`` is chosen instead. - """ if how == "outer": @@ -718,13 +721,13 @@ def join_items(left, right, how="outer", default=None): def replace(values: Sequence[T], idx: int, value: T) -> Sequence[T]: - """Replace value at index ``idx`` with ``value``. + """ + Replace value at index ``idx`` with ``value``. Like ``setitem``, but for tuples. >>> replace((1, 2, 3, 4), -1, 99) (1, 2, 3, 99) - """ xs = list(values) xs[idx] = value diff --git a/src/gluonts/maybe.py b/src/gluonts/maybe.py index 0f1460996a..5d3de43cca 100644 --- a/src/gluonts/maybe.py +++ b/src/gluonts/maybe.py @@ -35,7 +35,6 @@ compared to their Rust counterparts. ``do`` is not implemented in Rust but mimics ``toolz.do`` instead. - """ from __future__ import annotations @@ -90,7 +89,8 @@ def unbox(val: OptionalOrMaybe[T]) -> Optional[T]: def flatten(val: Optional[Optional[T]]) -> Optional[T]: - """Flatten nested optional value. + """ + Flatten nested optional value. Note: This just returns the value, but changes the type from ``Optional[Optional[T]]`` to ``Optional[T].`` @@ -110,7 +110,6 @@ def expect(val: OptionalOrMaybe[T], msg: str) -> T: Traceback (most recent call last): ... ValueError: My message - """ return box(val).expect(msg) @@ -123,7 +122,6 @@ def do(val: OptionalOrMaybe[T], fn: Callable[[T], U]) -> Optional[T]: a 'a' >>> do(None, print) - """ return box(val).do(fn).unbox() @@ -160,7 +158,6 @@ def map_or(val: OptionalOrMaybe[T], fn: Callable[[T], U], default: U) -> U: 1 >>> map_or(None, len, 0) 0 - """ return box(val).map_or(fn, default) @@ -179,7 +176,6 @@ def map_or_else( [1] >>> map_or_else(None, lambda n: [n], list) [] - """ return box(val).map_or_else(fn, factory) @@ -194,7 +190,6 @@ def unwrap(val: OptionalOrMaybe[T]) -> T: Traceback (most recent call last): ... ValueError: Trying to unwrap `None` value. - """ return box(val).expect("Trying to unwrap `None` value.") @@ -207,7 +202,6 @@ def unwrap_or(val: OptionalOrMaybe[T], default: T) -> T: 1 >>> unwrap_or(None, 2) 2 - """ return box(val).unwrap_or(default) @@ -220,7 +214,6 @@ def unwrap_or_else(val: OptionalOrMaybe[T], factory: Callable[[], T]) -> T: [1, 2, 3] >>> unwrap_or_else(None, list) [] - """ return box(val).unwrap_or_else(factory) @@ -237,7 +230,6 @@ def and_(val: OptionalOrMaybe[T], other: OptionalOrMaybe[U]) -> Optional[U]: 2 >>> and_(1, None) >>> and_(None, 2) - """ return box(val).and_(other).unbox() @@ -329,15 +321,15 @@ def xor(val: OptionalOrMaybe[T], other: OptionalOrMaybe[T]) -> Optional[T]: 2 >>> xor(1, 2) >>> xor(None, None) - """ return box(val).xor(other).unbox() def iter(val: OptionalOrMaybe[T]) -> List[T]: """ - Wrap ``val`` into a list, if it is not ``None``. Allows to use for loops - on optional values. + Wrap ``val`` into a list, if it is not ``None``. + + Allows to use for loops on optional values. 
""" return box(val).iter() @@ -363,7 +355,6 @@ def zip_with( 3 >>> zip_with(1, None, add) >>> zip_with(None, 2, add) - """ return box(val).zip_with(other, fn).unbox() @@ -378,7 +369,6 @@ def unbox(self) -> Optional[T]: 1 >>> Some(None).unbox() is None True - """ @abstractmethod @@ -400,7 +390,6 @@ def expect(self, msg: str) -> T: Traceback (most recent call last): ... ValueError: My message - """ @abstractmethod @@ -432,7 +421,6 @@ def map( >>> Some(10).map(divmod, 3) Some((3, 1)) - """ @abstractmethod @@ -448,7 +436,6 @@ def map_or(self, fn: Callable[[T], U], default: U) -> U: 1 >>> Nothing.map_or(len, 0) 0 - """ @abstractmethod @@ -468,7 +455,6 @@ def map_or_else( [1] >>> Nothing.map_or_else(lambda n: [n], list) [] - """ @abstractmethod @@ -482,7 +468,6 @@ def unwrap(self) -> T: Traceback (most recent call last): ... ValueError: Trying to unwrap `None` value. - """ @abstractmethod @@ -494,7 +479,6 @@ def unwrap_or(self, default: T) -> T: 1 >>> Nothing.unwrap_or(2) 2 - """ @abstractmethod @@ -507,7 +491,6 @@ def unwrap_or_else(self, fn: Callable[[], T]) -> T: [1, 2, 3] >>> Nothing.unwrap_or_else(list) [] - """ @abstractmethod @@ -524,7 +507,6 @@ def and_(self, other: OptionalOrMaybe[U]) -> Maybe[U]: Nothing >>> Nothing.and_(2) Nothing - """ def __and__(self, other: OptionalOrMaybe[U]) -> Maybe[U]: @@ -540,7 +522,6 @@ def __and__(self, other: OptionalOrMaybe[U]) -> Maybe[U]: Nothing >>> Nothing & 2 Nothing - """ return self.and_(other) @@ -563,7 +544,6 @@ def and_then( Nothing >>> Nothing.and_then(lambda xs: xs[0] if xs else None) Nothing - """ @abstractmethod @@ -577,7 +557,6 @@ def or_(self, default: Optional[T]) -> Maybe[T]: Some(1) >>> Nothing.or_(2) Some(2) - """ def __or__(self, default: Optional[T]) -> Maybe[T]: @@ -590,7 +569,6 @@ def __or__(self, default: Optional[T]) -> Maybe[T]: Some(1) >>> Nothing | 2 Some(2) - """ return self.or_(default) @@ -604,7 +582,6 @@ def or_else(self, factory: Callable[[], Optional[T]]) -> Maybe[T]: Some([42]) >>> Nothing.or_else(list) Some([]) - """ @abstractmethod @@ -619,7 +596,6 @@ def contains(self, other: U) -> bool: False >>> Nothing.contains(3) False - """ @abstractmethod @@ -635,14 +611,13 @@ def filter(self, pred: Callable[[T], bool]) -> Maybe[T]: Some(2) >>> Nothing.filter(is_even) Nothing - """ @abstractmethod def xor(self, other: OptionalOrMaybe[T]) -> Maybe[T]: """ - Return either ``val`` or ``other`` if the other is ``None``. Also return - ``None`` if both are not ``None``. + Return either ``val`` or ``other`` if the other is ``None``. Also + return ``None`` if both are not ``None``. >>> xor(1, None) 1 @@ -650,7 +625,6 @@ def xor(self, other: OptionalOrMaybe[T]) -> Maybe[T]: 2 >>> xor(1, 2) >>> xor(None, None) - """ def __xor__(self, other: OptionalOrMaybe[T]) -> Maybe[T]: @@ -659,8 +633,9 @@ def __xor__(self, other: OptionalOrMaybe[T]) -> Maybe[T]: @abstractmethod def iter(self) -> List[T]: """ - Wrap ``val`` into a list, if it is not ``None``. Allows to use for loops - on optional values. + Wrap ``val`` into a list, if it is not ``None``. + + Allows to use for loops on optional values. """ def __iter__(self): @@ -668,14 +643,16 @@ def __iter__(self): @abstractmethod def zip(self, other: OptionalOrMaybe[U]) -> Maybe[Tuple[T, U]]: - """ """ + """ + Abstract zip. 
+ """ @abstractmethod def zip_with( self, other: OptionalOrMaybe[U], fn: Callable[[T, U], R] ) -> Maybe[R]: - """ - Apply function to two optional values, if neither of them is ``None``: + """Apply function to two optional values, if neither of them is + ``None``: >>> add = lambda left, right: left + right >>> Some(1).zip_with(2, add) @@ -684,12 +661,12 @@ def zip_with( Nothing >>> Nothing.zip_with(2, add) Nothing - """ @abstractmethod def flatten(self: "Maybe[OptionalOrMaybe[T]]") -> Maybe[T]: - """Flatten nested optional value. + """ + Flatten nested optional value. Note: This just returns the value, but changes the type from ``Optional[Optional[T]]`` to ``Optional[T].`` diff --git a/src/gluonts/model/evaluation.py b/src/gluonts/model/evaluation.py index 473aa2397e..c5a62ab047 100644 --- a/src/gluonts/model/evaluation.py +++ b/src/gluonts/model/evaluation.py @@ -34,9 +34,8 @@ @dataclass class BatchForecast: """ - Wrapper around ``Forecast`` objects, that adds a batch dimension - to arrays returned by ``__getitem__``, for compatibility with - ``gluonts.ev``. + Wrapper around ``Forecast`` objects, that adds a batch dimension to arrays + returned by ``__getitem__``, for compatibility with ``gluonts.ev``. """ forecasts: List[Forecast] @@ -107,8 +106,8 @@ def evaluate_forecasts_raw( seasonality: Optional[int] = None, ) -> dict: """ - Evaluate ``forecasts`` by comparing them with ``test_data``, according - to ``metrics``. + Evaluate ``forecasts`` by comparing them with ``test_data``, according to + ``metrics``. .. note:: This feature is experimental and may be subject to changes. @@ -192,8 +191,8 @@ def evaluate_forecasts( seasonality: Optional[int] = None, ) -> pd.DataFrame: """ - Evaluate ``forecasts`` by comparing them with ``test_data``, according - to ``metrics``. + Evaluate ``forecasts`` by comparing them with ``test_data``, according to + ``metrics``. .. note:: This feature is experimental and may be subject to changes. @@ -246,8 +245,7 @@ def evaluate_model( seasonality: Optional[int] = None, ) -> pd.DataFrame: """ - Evaluate ``model`` when applied to ``test_data``, according - to ``metrics``. + Evaluate ``model`` when applied to ``test_data``, according to ``metrics``. .. note:: This feature is experimental and may be subject to changes. diff --git a/src/gluonts/model/predictor.py b/src/gluonts/model/predictor.py index 1b64124574..000ccb9b28 100644 --- a/src/gluonts/model/predictor.py +++ b/src/gluonts/model/predictor.py @@ -149,8 +149,8 @@ def from_inputs(cls, train_iter, **params): class RepresentablePredictor(Predictor): """ An abstract predictor that can be subclassed by framework-specific models. - Subclasses should have ``@validated()`` constructors: - (de)serialization and equality test are all implemented on top of its logic. + Subclasses should have ``@validated()`` constructors: (de)serialization and + equality test are all implemented on top of its logic. Parameters ---------- diff --git a/src/gluonts/mx/component.py b/src/gluonts/mx/component.py index 29b2debe7e..adb1d31119 100644 --- a/src/gluonts/mx/component.py +++ b/src/gluonts/mx/component.py @@ -27,9 +27,8 @@ def equals_parameter_dict( this: mx.gluon.ParameterDict, that: mx.gluon.ParameterDict ) -> bool: - """ - Structural equality check between two :class:`~mxnet.gluon.ParameterDict` - objects. + """Structural equality check between two + :class:`~mxnet.gluon.ParameterDict` objects. 
Two parameter dictionaries ``this`` and ``that`` are considered *structurally equal* if the following conditions are satisfied: diff --git a/src/gluonts/mx/distribution/bijection.py b/src/gluonts/mx/distribution/bijection.py index 858216530d..2dfac4f8f0 100644 --- a/src/gluonts/mx/distribution/bijection.py +++ b/src/gluonts/mx/distribution/bijection.py @@ -36,20 +36,20 @@ def __init__(self): def f(self, x: Tensor) -> Tensor: r""" - Forward transformation x -> y + Forward transformation x -> y. """ raise NotImplementedError def f_inv(self, y: Tensor) -> Tensor: r""" - Inverse transformation y -> x + Inverse transformation y -> x. """ raise NotImplementedError def log_abs_det_jac(self, x: Tensor, y: Tensor) -> Tensor: r""" Receives (x, y) and returns log of the absolute value of the Jacobian - determinant + determinant. .. math:: \log |dy/dx| diff --git a/src/gluonts/mx/distribution/binned.py b/src/gluonts/mx/distribution/binned.py index 61cc71b413..eb33a07012 100644 --- a/src/gluonts/mx/distribution/binned.py +++ b/src/gluonts/mx/distribution/binned.py @@ -26,8 +26,8 @@ class Binned(Distribution): r""" - A binned distribution defined by a set of bins via - bin centers and bin probabilities. + A binned distribution defined by a set of bins via bin centers and bin + probabilities. Parameters ---------- diff --git a/src/gluonts/mx/distribution/box_cox_transform.py b/src/gluonts/mx/distribution/box_cox_transform.py index 99125a032a..cbf899c71b 100644 --- a/src/gluonts/mx/distribution/box_cox_transform.py +++ b/src/gluonts/mx/distribution/box_cox_transform.py @@ -26,7 +26,8 @@ class BoxCoxTransform(Bijection): r""" Implements Box-Cox transformation of a uni-variate random variable. - The Box-Cox transformation of an observation :math:`z` is given by + + The Box-Cox transformation of an observation :math:`z` is given by: .. math:: @@ -178,7 +179,8 @@ def f(self, z: Tensor) -> Tensor: ) def f_inv(self, y: Tensor) -> Tensor: - r"""Inverse of the Box-Cox Transform + r""" + Inverse of the Box-Cox Transform. Parameters ---------- @@ -189,7 +191,6 @@ def f_inv(self, y: Tensor) -> Tensor: ------- Tensor Observations - """ F = self.F lambda_1 = self.lambda_1 @@ -211,7 +212,7 @@ def f_inv(self, y: Tensor) -> Tensor: def log_abs_det_jac(self, z: Tensor, y: Tensor = None) -> Tensor: r""" Logarithm of the absolute value of the Jacobian determinant - corresponding to the Box-Cox Transform is given by + corresponding to the Box-Cox Transform is given by. .. math:: \log \frac{d}{dz} BoxCox(z; \lambda_1, \lambda_2) = \begin{cases} @@ -231,7 +232,6 @@ def log_abs_det_jac(self, z: Tensor, y: Tensor = None) -> Tensor: Returns ------- Tensor - """ # noqa: E501 F = self.F lambda_1 = self.lambda_1 diff --git a/src/gluonts/mx/distribution/dirichlet.py b/src/gluonts/mx/distribution/dirichlet.py index 3b065a008f..31d57d7cfb 100644 --- a/src/gluonts/mx/distribution/dirichlet.py +++ b/src/gluonts/mx/distribution/dirichlet.py @@ -26,7 +26,7 @@ class Dirichlet(Distribution): r""" Dirichlet distribution, specified by the concentration vector alpha of - length d. https://en.wikipedia.org/wiki/Dirichlet_distribution + length d. https://en.wikipedia.org/wiki/Dirichlet_distribution. 
The Dirichlet distribution is defined on the open (d-1)-simplex, which means that a sample (or observation) x = (x_0,..., x_{d-1}) must satisfy: diff --git a/src/gluonts/mx/distribution/dirichlet_multinomial.py b/src/gluonts/mx/distribution/dirichlet_multinomial.py index f691d2fcc5..f79ec7a5df 100644 --- a/src/gluonts/mx/distribution/dirichlet_multinomial.py +++ b/src/gluonts/mx/distribution/dirichlet_multinomial.py @@ -27,7 +27,7 @@ class DirichletMultinomial(Distribution): r""" Dirichlet-Multinomial distribution, specified by the concentration vector alpha of length dim, and a number of trials n_trials. - https://en.wikipedia.org/wiki/Dirichlet-multinomial_distribution + https://en.wikipedia.org/wiki/Dirichlet-multinomial_distribution. The Dirichlet-Multinomial distribution is a discrete multivariate probability distribution, a sample (or observation) diff --git a/src/gluonts/mx/distribution/distribution.py b/src/gluonts/mx/distribution/distribution.py index 6033b4967c..d7c0144357 100644 --- a/src/gluonts/mx/distribution/distribution.py +++ b/src/gluonts/mx/distribution/distribution.py @@ -197,8 +197,8 @@ def event_dim(self) -> int: r""" Number of event dimensions, i.e., length of the `event_shape` tuple. - This is `0` for distributions over scalars, `1` over vectors, - `2` over matrices, and so on. + This is `0` for distributions over scalars, `1` over vectors, `2` over + matrices, and so on. """ raise NotImplementedError() @@ -276,7 +276,8 @@ def variance(self) -> Tensor: def cdf(self, x: Tensor) -> Tensor: r""" - Return the value of the cumulative distribution function evaluated at x + Return the value of the cumulative distribution function evaluated at + x. """ raise NotImplementedError() @@ -353,18 +354,17 @@ def _tensor_cdf_bisection( def quantile(self, level: Tensor) -> Tensor: r""" - Calculates quantiles for the given levels. Parameters ---------- - level + level: Level values to use for computing the quantiles. `level` should be a 1d tensor of level values between 0 and 1. Returns ------- - quantiles + quantiles: Quantile values corresponding to the levels passed. The return shape is diff --git a/src/gluonts/mx/distribution/distribution_output.py b/src/gluonts/mx/distribution/distribution_output.py index 974e3a35fc..d24dcbd45c 100644 --- a/src/gluonts/mx/distribution/distribution_output.py +++ b/src/gluonts/mx/distribution/distribution_output.py @@ -73,7 +73,7 @@ def hybrid_forward(self, F, x: Tensor, **kwargs) -> Tuple[Tensor]: class Output: r""" - Class to connect a network to some output + Class to connect a network to some output. """ args_dim: Dict[str, int] @@ -144,33 +144,35 @@ def distribution( @property def event_shape(self) -> Tuple: r""" - Shape of each individual event contemplated by the distributions - that this object constructs. + Shape of each individual event contemplated by the distributions that + this object constructs. """ raise NotImplementedError() @property def event_dim(self) -> int: r""" - Number of event dimensions, i.e., length of the `event_shape` tuple, - of the distributions that this object constructs. + Number of event dimensions, i.e., length of the `event_shape` tuple, of + the distributions that this object constructs. """ return len(self.event_shape) @property def value_in_support(self) -> float: r""" - A float that will have a valid numeric value when computing the - log-loss of the corresponding distribution. By default 0.0. 
+ A float that will have a valid numeric value when computing the + log-loss of the corresponding distribution; by default 0.0. + + This value will be used when padding data series. """ return 0.0 def domain_map(self, F, *args: Tensor): r""" - Converts arguments to the right shape and domain. The domain depends - on the type of distribution, while the correct shape is obtained by - reshaping the trailing axis in such a way that the returned tensors - define a distribution of the right event_shape. + Converts arguments to the right shape and domain. + + The domain depends on the type of distribution, while the correct shape + is obtained by reshaping the trailing axis in such a way that the + returned tensors define a distribution of the right event_shape. """ raise NotImplementedError() diff --git a/src/gluonts/mx/distribution/empirical_distribution.py b/src/gluonts/mx/distribution/empirical_distribution.py index 0b83b8eb70..5a3f2eeaa3 100644 --- a/src/gluonts/mx/distribution/empirical_distribution.py +++ b/src/gluonts/mx/distribution/empirical_distribution.py @@ -161,7 +161,8 @@ def quantile_losses( ) def crps_univariate(self, x: Tensor) -> Tensor: - r"""Compute the *continuous rank probability score* (CRPS) of `obs` + r""" + Compute the *continuous rank probability score* (CRPS) of `obs` according to the empirical distribution. The last dimension of `obs` specifies the "event dimension" of the diff --git a/src/gluonts/mx/distribution/gaussian.py b/src/gluonts/mx/distribution/gaussian.py index a492c2956b..6c3dd26f31 100644 --- a/src/gluonts/mx/distribution/gaussian.py +++ b/src/gluonts/mx/distribution/gaussian.py @@ -81,7 +81,8 @@ def stddev(self) -> Tensor: @classmethod def fit(cls, F, samples: Tensor): """ - Returns an instance of `Gaussian` after fitting parameters to the given data. + Returns an instance of `Gaussian` after fitting parameters to the given + data. Parameters ---------- @@ -92,7 +93,6 @@ Returns ------- Distribution instance of type `Gaussian`. - """ # Compute mean and standard deviations diff --git a/src/gluonts/mx/distribution/inflated_beta.py b/src/gluonts/mx/distribution/inflated_beta.py index 93c0781893..06f145883d 100644 --- a/src/gluonts/mx/distribution/inflated_beta.py +++ b/src/gluonts/mx/distribution/inflated_beta.py @@ -96,9 +96,8 @@ def log_prob(self, x: Tensor) -> Tensor: class ZeroInflatedBeta(ZeroAndOneInflatedBeta): - r""" - Zero Inflated Beta distribution as in Raydonal Ospina, Silvia L.P. Ferrari: - Inflated Beta Distributions + r"""Zero Inflated Beta distribution as in Raydonal Ospina, Silvia L.P. + Ferrari: Inflated Beta Distributions. Parameters ---------- @@ -129,9 +128,8 @@ def __init__( class OneInflatedBeta(ZeroAndOneInflatedBeta): - r""" - One Inflated Beta distribution as in Raydonal Ospina, Silvia L.P. Ferrari: - Inflated Beta Distributions + r"""One Inflated Beta distribution as in Raydonal Ospina, Silvia L.P. + Ferrari: Inflated Beta Distributions.
Parameters ---------- diff --git a/src/gluonts/mx/distribution/isqf.py b/src/gluonts/mx/distribution/isqf.py index d707bf2ee9..f2dfee5cca 100644 --- a/src/gluonts/mx/distribution/isqf.py +++ b/src/gluonts/mx/distribution/isqf.py @@ -29,8 +29,8 @@ class ISQF(Distribution): r""" Distribution class for the Incremental (Spline) Quantile Function in the paper ``Learning Quantile Functions without Quantile Crossing for - Distribution-free Time Series Forecasting`` - by Park, Robinson, Aubet, Kan, Gasthaus, Wang + Distribution-free Time Series Forecasting`` by Park, Robinson, Aubet, Kan, + Gasthaus, Wang. Parameters ---------- @@ -136,8 +136,8 @@ def parametrize_qk( F, quantile_knots: Tensor ) -> Tuple[Tensor, Tensor, Tensor, Tensor]: r""" - Function to parametrize the x or y positions - of the num_qk quantile knots + Function to parametrize the x or y positions of the num_qk quantile + knots. Parameters ---------- @@ -233,7 +233,7 @@ def parametrize_tail( F, beta: Tensor, qk_x: Tensor, qk_y: Tensor ) -> Tuple[Tensor, Tensor]: r""" - Function to parametrize the tail parameters + Function to parametrize the tail parameters. Note that the exponential tails are given by q(alpha) @@ -388,8 +388,8 @@ def quantile_spline( self, alpha: Tensor, axis: Optional[int] = None, ) -> Tensor: r""" - Evaluates the spline functions at the - quantile levels contained in alpha + Evaluates the spline functions at the quantile levels contained in + alpha. Parameters ---------- alpha Input quantile levels axis Axis along which to expand For details of input_alpha shape and axis, refer to the description in quantile_internal + Returns ------- Tensor @@ -486,9 +487,9 @@ def quantile_tail( def cdf_spline(self, z: Tensor) -> Tensor: r""" - For observations z and splines defined in [qk_x[k], qk_x[k+1]] - Computes the quantile level alpha_tilde such that - alpha_tilde + For observations z and splines defined in [qk_x[k], qk_x[k+1]], computes + the quantile level alpha_tilde such that alpha_tilde + = q^{-1}(z) if z is in-between qk_x[k] and qk_x[k+1] = qk_x[k] if z < qk_x[k] = qk_x[k+1] if z > qk_x[k+1] @@ -565,10 +566,11 @@ def cdf_spline(self, z: Tensor) -> Tensor: def cdf_tail(self, z: Tensor, left_tail: bool = True) -> Tensor: r""" - Computes the quantile level alpha_tilde such that - alpha_tilde + Computes the quantile level alpha_tilde such that alpha_tilde + = q^{-1}(z) if z is in the tail region = qk_x_l or qk_x_r if z is in the non-tail region + Parameters ---------- z Observation left_tail If True, compute alpha_tilde for the left tail Otherwise, compute alpha_tilde for the right tail + Returns ------- alpha_tilde @@ -595,7 +598,8 @@ def cdf_tail(self, z: Tensor, left_tail: bool = True) -> Tensor: def crps_tail(self, z: Tensor, left_tail: bool = True) -> Tensor: r""" - Compute CRPS in analytical form for left/right tails + Compute CRPS in analytical form for left/right tails. + Parameters ---------- z Observation to evaluate. left_tail If True, compute CRPS for the left tail Otherwise, compute CRPS for the right tail + Returns ------- Tensor @@ -828,7 +833,7 @@ def event_dim(self) -> int: class ISQFOutput(DistributionOutput): r""" - DistributionOutput class for the Incremental (Spline) Quantile Function + DistributionOutput class for the Incremental (Spline) Quantile Function.
Parameters ---------- diff --git a/src/gluonts/mx/distribution/lowrank_multivariate_gaussian.py b/src/gluonts/mx/distribution/lowrank_multivariate_gaussian.py index f446799367..2409aac6ad 100644 --- a/src/gluonts/mx/distribution/lowrank_multivariate_gaussian.py +++ b/src/gluonts/mx/distribution/lowrank_multivariate_gaussian.py @@ -76,7 +76,6 @@ def log_det(F, batch_D: Tensor, batch_capacitance_tril: Tensor) -> Tensor: Returns ------- - """ log_D = batch_D.log().sum(axis=-1) log_C = 2 * F.linalg.sumlogdiag(batch_capacitance_tril) @@ -87,7 +86,7 @@ def mahalanobis_distance( F, W: Tensor, D: Tensor, capacitance_tril: Tensor, x: Tensor ) -> Tensor: r""" - Uses the Woodbury matrix identity + Uses the Woodbury matrix identity. .. math:: (W W^T + D)^{-1} = D^{-1} - D^{-1} W C^{-1} W^T D^{-1}, @@ -109,7 +108,6 @@ def mahalanobis_distance( Returns ------- - """ xx = x.expand_dims(axis=-1) @@ -166,8 +164,8 @@ def lowrank_log_likelihood( class LowrankMultivariateGaussian(Distribution): r""" - Multivariate Gaussian distribution, with covariance matrix parametrized - as the sum of a diagonal matrix and a low-rank matrix + Multivariate Gaussian distribution, with covariance matrix parametrized as + the sum of a diagonal matrix and a low-rank matrix. .. math:: \Sigma = D + W W^T diff --git a/src/gluonts/mx/distribution/multivariate_gaussian.py b/src/gluonts/mx/distribution/multivariate_gaussian.py index 73a8b29c0f..725f408e31 100644 --- a/src/gluonts/mx/distribution/multivariate_gaussian.py +++ b/src/gluonts/mx/distribution/multivariate_gaussian.py @@ -26,8 +26,8 @@ class MultivariateGaussian(Distribution): r""" - Multivariate Gaussian distribution, specified by the mean vector - and the Cholesky factor of its covariance matrix. + Multivariate Gaussian distribution, specified by the mean vector and the + Cholesky factor of its covariance matrix. Parameters ---------- @@ -104,8 +104,8 @@ def sample_rep( self, num_samples: Optional[int] = None, dtype=np.float32 ) -> Tensor: r""" - Draw samples from the multivariate Gaussian distributions. - Internally, Cholesky factorization of the covariance matrix is used: + Draw samples from the multivariate Gaussian distributions. Internally, + Cholesky factorization of the covariance matrix is used: sample = L v + mu, diff --git a/src/gluonts/mx/distribution/nan_mixture.py b/src/gluonts/mx/distribution/nan_mixture.py index 5333741c5d..3666d6a783 100644 --- a/src/gluonts/mx/distribution/nan_mixture.py +++ b/src/gluonts/mx/distribution/nan_mixture.py @@ -27,7 +27,7 @@ class NanMixture(MixtureDistribution): r""" A mixture distribution of a NaN-valued Deterministic distribution and - Distribution + Distribution. Parameters ---------- diff --git a/src/gluonts/mx/distribution/poisson.py b/src/gluonts/mx/distribution/poisson.py index 31dd1e743c..39e7fc9517 100644 --- a/src/gluonts/mx/distribution/poisson.py +++ b/src/gluonts/mx/distribution/poisson.py @@ -26,8 +26,8 @@ class Poisson(Distribution): r""" - Poisson distribution, i.e. the distribution of the number of - successes in a specified region. + Poisson distribution, i.e. the distribution of the number of successes in a + specified region. 
Parameters ---------- diff --git a/src/gluonts/mx/distribution/transformed_distribution.py b/src/gluonts/mx/distribution/transformed_distribution.py index ef5151a8b8..ea3e5cc822 100644 --- a/src/gluonts/mx/distribution/transformed_distribution.py +++ b/src/gluonts/mx/distribution/transformed_distribution.py @@ -27,8 +27,8 @@ class TransformedDistribution(Distribution): r""" - A distribution obtained by applying a sequence of transformations on top - of a base distribution. + A distribution obtained by applying a sequence of transformations on top of + a base distribution. """ @validated() diff --git a/src/gluonts/mx/distribution/transformed_distribution_output.py b/src/gluonts/mx/distribution/transformed_distribution_output.py index 713c3da298..357449da50 100644 --- a/src/gluonts/mx/distribution/transformed_distribution_output.py +++ b/src/gluonts/mx/distribution/transformed_distribution_output.py @@ -29,8 +29,8 @@ class TransformedDistributionOutput(DistributionOutput): r""" - Class to connect a network to a distribution that is transformed - by a sequence of learnable bijections. + Class to connect a network to a distribution that is transformed by a + sequence of learnable bijections. """ @validated() diff --git a/src/gluonts/mx/kernels/_periodic_kernel.py b/src/gluonts/mx/kernels/_periodic_kernel.py index a3c4fdcafc..612afc7362 100644 --- a/src/gluonts/mx/kernels/_periodic_kernel.py +++ b/src/gluonts/mx/kernels/_periodic_kernel.py @@ -150,8 +150,8 @@ def gp_params_scaling( @classmethod def domain_map(cls, F, amplitude, length_scale, frequency): r""" - This function applies the softmax to the Periodic Kernel - hyper-parameters. + This function applies the softmax to the Periodic Kernel + hyper-parameters. Parameters ---------- diff --git a/src/gluonts/mx/kernels/_rbf_kernel.py b/src/gluonts/mx/kernels/_rbf_kernel.py index 4c16053f91..e2cf4ff6a9 100644 --- a/src/gluonts/mx/kernels/_rbf_kernel.py +++ b/src/gluonts/mx/kernels/_rbf_kernel.py @@ -85,8 +85,8 @@ def gp_params_scaling( self, F, past_target: Tensor, past_time_feat: Tensor ) -> Tuple[Tensor, Tensor, Tensor]: """ - This function returns the scales for the GP RBF Kernel hyper-parameters - by using the standard deviations of the past_target and + This function returns the scales for the GP RBF Kernel + hyper-parameters by using the standard deviations of the past_target and past_time_features. Parameters diff --git a/src/gluonts/mx/model/deepstate/issm.py b/src/gluonts/mx/model/deepstate/issm.py index 28ae8edd5c..ffa747f3f1 100644 --- a/src/gluonts/mx/model/deepstate/issm.py +++ b/src/gluonts/mx/model/deepstate/issm.py @@ -101,7 +101,6 @@ class ISSM: * dimension of the latent state * transition and innovation coefficients of the transition model * emission coefficient of the observation model - """ @validated() diff --git a/src/gluonts/mx/model/deepvar_hierarchical/_network.py b/src/gluonts/mx/model/deepvar_hierarchical/_network.py index 1764aad498..8544a332ba 100755 --- a/src/gluonts/mx/model/deepvar_hierarchical/_network.py +++ b/src/gluonts/mx/model/deepvar_hierarchical/_network.py @@ -99,7 +99,7 @@ def reconcile_samples( def coherency_error(S: np.ndarray, samples: np.ndarray) -> float: r""" - Computes the maximum relative coherency error + Computes the maximum relative coherency error. ..
math:: diff --git a/src/gluonts/mx/model/gp_forecaster/gaussian_process.py b/src/gluonts/mx/model/gp_forecaster/gaussian_process.py index 50d5d1821d..5883d69824 100644 --- a/src/gluonts/mx/model/gp_forecaster/gaussian_process.py +++ b/src/gluonts/mx/model/gp_forecaster/gaussian_process.py @@ -166,7 +166,7 @@ def _compute_cholesky_gp( def log_prob(self, x_train: Tensor, y_train: Tensor) -> Tensor: r""" - This method computes the negative marginal log likelihood + This method computes the negative marginal log likelihood. .. math:: :nowrap: diff --git a/src/gluonts/mx/model/seq2seq/_forking_estimator.py b/src/gluonts/mx/model/seq2seq/_forking_estimator.py index 7bb38915f2..b1af11396c 100644 --- a/src/gluonts/mx/model/seq2seq/_forking_estimator.py +++ b/src/gluonts/mx/model/seq2seq/_forking_estimator.py @@ -69,8 +69,8 @@ class ForkingSeq2SeqEstimator(GluonEstimator): r""" - Sequence-to-Sequence (seq2seq) structure with the so-called - "Forking Sequence" proposed in [WTN+17]_. + Sequence-to-Sequence (seq2seq) structure with the so-called "Forking + Sequence" proposed in [WTN+17]_. The basic idea is that, given a sequence :math:`x_1, x_2, \cdots, x_T`, with a decoding length :math:`\tau`, we learn a NN that solves the diff --git a/src/gluonts/mx/model/tpp/distribution/base.py b/src/gluonts/mx/model/tpp/distribution/base.py index 990d57d880..0cacfedbb7 100644 --- a/src/gluonts/mx/model/tpp/distribution/base.py +++ b/src/gluonts/mx/model/tpp/distribution/base.py @@ -126,8 +126,9 @@ def log_intensity(self, y: Tensor) -> Tensor: def log_survival(self, y: Tensor) -> Tensor: r""" - Logarithm of the survival function - :math:`\log S(y) = \log(1 - CDF(y))`. + Logarithm of the survival function. + + :math:`\log S(y) = \log(1 - CDF(y))` """ x = y for t in self.transforms[::-1]: diff --git a/src/gluonts/mx/model/tpp/distribution/loglogistic.py b/src/gluonts/mx/model/tpp/distribution/loglogistic.py index 247307551c..7dafa73fe9 100644 --- a/src/gluonts/mx/model/tpp/distribution/loglogistic.py +++ b/src/gluonts/mx/model/tpp/distribution/loglogistic.py @@ -87,8 +87,9 @@ def log_intensity(self, x: Tensor) -> Tensor: def log_survival(self, x: Tensor) -> Tensor: r""" - Logarithm of the survival function - :math:`\log S(x) = \log(1 - CDF(x))`. + Logarithm of the survival function. + + :math:`\log S(x) = \log(1 - CDF(x))` We define :math:`z = (\log(x) - \mu) / \sigma` and obtain the survival function as :math:`S(x) = sigmoid(-z)`, or equivalently diff --git a/src/gluonts/mx/model/tpp/distribution/weibull.py b/src/gluonts/mx/model/tpp/distribution/weibull.py index 09b73b51f6..6930efa914 100644 --- a/src/gluonts/mx/model/tpp/distribution/weibull.py +++ b/src/gluonts/mx/model/tpp/distribution/weibull.py @@ -74,7 +74,8 @@ def log_intensity(self, x: Tensor) -> Tensor: def log_survival(self, x: Tensor) -> Tensor: r""" - Logarithm of the survival function + Logarithm of the survival function. + :math:`\log S(x) = \log(1 - CDF(x))`. The survival function of the Weibull distribution is diff --git a/src/gluonts/mx/model/transformer/layers.py b/src/gluonts/mx/model/transformer/layers.py index 0126d44d1f..1be302bb8d 100644 --- a/src/gluonts/mx/model/transformer/layers.py +++ b/src/gluonts/mx/model/transformer/layers.py @@ -491,6 +491,7 @@ def hybrid_forward(self, F, x: Tensor, *args) -> Tensor: class TransformerProcessBlock(HybridBlock): r""" Block to perform pre/post processing on layer inputs. 
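The identity in the ``log_survival`` hunks above is easy to check numerically. A hedged sketch, using a SciPy Weibull as a stand-in for the MXNet TPP distributions (unit scale, so S(y) = exp(-y^k)):

```python
import numpy as np
from scipy import stats

y, k = 1.7, 1.5
dist = stats.weibull_min(c=k)            # unit-scale Weibull with shape k
log_survival = np.log1p(-dist.cdf(y))    # log S(y) = log(1 - CDF(y))
assert np.isclose(log_survival, dist.logsf(y))
assert np.isclose(log_survival, -(y ** k))  # S(y) = exp(-y**k) here
```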
+ The processing steps are determined by the sequence argument, which can contain one of the three operations: n: layer normalization diff --git a/src/gluonts/mx/trainer/callback.py b/src/gluonts/mx/trainer/callback.py index 002207879d..286211e73c 100644 --- a/src/gluonts/mx/trainer/callback.py +++ b/src/gluonts/mx/trainer/callback.py @@ -379,7 +379,8 @@ def is_running(self) -> bool: class TrainingTimeLimit(BaseModel, Callback): - """Limit time spent for training. + """ + Limit time spent for training. This is useful when ensuring that training for a given model doesn't exceed a budget, for example when doing AutoML. diff --git a/src/gluonts/mx/trainer/learning_rate_scheduler.py b/src/gluonts/mx/trainer/learning_rate_scheduler.py index 91ecd43aa8..e550d1d0cf 100644 --- a/src/gluonts/mx/trainer/learning_rate_scheduler.py +++ b/src/gluonts/mx/trainer/learning_rate_scheduler.py @@ -66,7 +66,8 @@ def should_update(self, metric: float) -> bool: @dataclass class Patience: - """Simple patience tracker. + """ + Simple patience tracker. Given an `Objective`, it will check whether the metric has improved and update its patience count. A better value sets the patience back to zero. @@ -110,11 +111,11 @@ def step(self, metric_value: float) -> bool: class MetricAttentiveScheduler: """ This scheduler decreases the learning rate based on the value of some - validation metric to be optimized (maximized or minimized). The value - of such metric is provided by calling the `step` method on the scheduler. - A `patience` parameter must be provided, and the scheduler will reduce - the learning rate if no improvement in the metric is done before - `patience` observations of the metric. + validation metric to be optimized (maximized or minimized). The value of + such a metric is provided by calling the `step` method on the scheduler. A + `patience` parameter must be provided, and the scheduler will reduce the + learning rate if no improvement in the metric is seen within `patience` + observations of the metric. Examples: @@ -196,11 +197,11 @@ def step(self, metric_value: float) -> bool: class LearningRateReduction(Callback): """ This Callback decreases the learning rate based on the value of some - validation metric to be optimized (maximized or minimized). The value - of such metric is provided by calling the `step` method on the scheduler. - A `patience` parameter must be provided, and the scheduler will reduce - the learning rate if no improvement in the metric is done before - `patience` observations of the metric. + validation metric to be optimized (maximized or minimized). The value of + such a metric is provided by calling the `step` method on the scheduler. A + `patience` parameter must be provided, and the scheduler will reduce the + learning rate if no improvement in the metric is seen within `patience` + observations of the metric.
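The behaviour that the ``Patience`` and ``MetricAttentiveScheduler`` docstrings above describe boils down to a little bookkeeping. A framework-free sketch, assuming a metric to be minimized (the real scheduler also supports maximization; all names here are illustrative):

```python
class PatienceLR:
    """Decay the learning rate after `patience` non-improving observations."""

    def __init__(self, lr=1e-3, patience=3, decay=0.5, min_lr=1e-5):
        self.lr, self.patience = lr, patience
        self.decay, self.min_lr = decay, min_lr
        self.best = float("inf")
        self.bad_steps = 0

    def step(self, metric: float) -> float:
        if metric < self.best:      # improvement resets the patience count
            self.best, self.bad_steps = metric, 0
        else:
            self.bad_steps += 1
            if self.bad_steps >= self.patience:
                self.lr = max(self.lr * self.decay, self.min_lr)
                self.bad_steps = 0
        return self.lr

scheduler = PatienceLR(lr=0.01, patience=2)
for value in [1.0, 0.9, 0.95, 0.93]:    # no improvement after 0.9
    lr = scheduler.step(value)
print(lr)  # 0.005: reduced once after two non-improving observations
```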
Examples: diff --git a/src/gluonts/mx/trainer/model_iteration_averaging.py b/src/gluonts/mx/trainer/model_iteration_averaging.py index e4e93637b1..239af586f1 100644 --- a/src/gluonts/mx/trainer/model_iteration_averaging.py +++ b/src/gluonts/mx/trainer/model_iteration_averaging.py @@ -25,11 +25,12 @@ class IterationAveragingStrategy: r""" - The model averaging is based on paper - "Stochastic Gradient Descent for Non-smooth Optimization: Convergence - Results and Optimal Averaging Schemes", + The model averaging is based on the paper "Stochastic Gradient Descent for Non-smooth Optimization: Convergence Results and Optimal Averaging Schemes", (http://proceedings.mlr.press/v28/shamir13.pdf), which implements - polynomial-decay averaging, parameterized by eta. When eta = 0, it is + polynomial-decay averaging, parameterized by eta. + + When eta = 0, it is equivalent to simple average over all iterations with same weights. """ @@ -156,6 +157,7 @@ def load_cached_model(self, model: nn.HybridBlock): class NTA(IterationAveragingStrategy): r""" Implement Non-monotonically Triggered AvSGD (NTA). + This method is based on paper "Regularizing and Optimizing LSTM Language Models", (https://openreview.net/pdf?id=SyyGPP0TZ), and an implementation is available in Salesforce GitHub @@ -255,6 +257,7 @@ def update_average_trigger( class Alpha_Suffix(IterationAveragingStrategy): r""" Implement Alpha Suffix model averaging. + This method is based on paper "Making Gradient Descent Optimalfor Strongly Convex Stochastic Optimization" (https://arxiv.org/pdf/1109.5647.pdf). """ @@ -264,7 +267,7 @@ class Alpha_Suffix(IterationAveragingStrategy): @validated() def __init__(self, epochs: int, alpha: float = 0.75, eta: float = 0): r""" - Taking iteration average for the last epoch*alpha epochs + Take the iteration average for the last epoch*alpha epochs. Parameters ---------- diff --git a/src/gluonts/nursery/daf/tslib/dataset/loader.py b/src/gluonts/nursery/daf/tslib/dataset/loader.py index ad9eb3da65..a0f104568d 100644 --- a/src/gluonts/nursery/daf/tslib/dataset/loader.py +++ b/src/gluonts/nursery/daf/tslib/dataset/loader.py @@ -30,7 +30,9 @@ def _default_collate(batch): - r"""Puts each data field into a tensor with outer dimension batch size""" + r""" + Puts each data field into a tensor with outer dimension batch size. + """ elem = batch[0] elem_type = type(elem) @@ -101,7 +103,8 @@ def copy_to_gpu(data, cuda_device: int, non_blocking: bool): class MetaDataset(NamedTuple): """ - Dataset Split Manager. Possess train/valid/test datasets and provide data loaders + Dataset Split Manager. Possesses train/valid/test datasets and provides data + loaders. Parameters: -------------- diff --git a/src/gluonts/nursery/daf/tslib/metrics/meters.py b/src/gluonts/nursery/daf/tslib/metrics/meters.py index 38e43d9bd5..d88d7b7a55 100644 --- a/src/gluonts/nursery/daf/tslib/metrics/meters.py +++ b/src/gluonts/nursery/daf/tslib/metrics/meters.py @@ -24,8 +24,8 @@ class Meter(ABC): """ - Abstract class for meters used in metric stats. - Every subclass has cached values for some metric + Abstract class for meters used in metric stats. Every subclass has cached + values for some metric.
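The polynomial-decay averaging that the ``IterationAveragingStrategy`` docstring above refers to has a compact recursive form. A hedged NumPy sketch (illustrative names, not the gluonts API):

```python
import numpy as np

def poly_decay_average(iterates, eta=0.0):
    """avg_t = (1 - (eta+1)/(eta+t)) * avg_{t-1} + ((eta+1)/(eta+t)) * w_t."""
    avg = None
    for t, w in enumerate(iterates, start=1):
        gamma = (eta + 1.0) / (eta + t)
        avg = w if avg is None else (1.0 - gamma) * avg + gamma * w
    return avg

iterates = [np.array([float(t)]) for t in range(1, 101)]
# eta = 0 reduces to the plain mean over all iterates ...
assert np.isclose(poly_decay_average(iterates, eta=0.0), 50.5)
# ... while larger eta weights recent iterates more heavily.
assert poly_decay_average(iterates, eta=3.0) > 50.5
```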
Parameters ---------- @@ -52,7 +52,8 @@ def restart(self) -> None: """ reset the meter to clear the cached values; - at the same time, the optimal value so far is compared with current metric and updated + at the same time, the optimal value so far is compared with the current + metric and updated """ if self.is_optimal: self.best = self.value @@ -65,14 +66,14 @@ def update(self, *args) -> None: @abstractmethod def value(self): """ - read currently measured metric + Read currently measured metric. """ pass @property def is_optimal(self) -> bool: """ - indicates whether the current metric reaches optimality + Indicates whether the current metric reaches optimality. Raises ------ @@ -104,7 +105,7 @@ def __repr__(self) -> str: class Timer(Meter): """ - Timer to save elapsed time + Timer to save elapsed time. """ def __init__(self): @@ -113,7 +114,7 @@ def __init__(self): @property def value(self) -> float: """ - elapsed time in seconds since instantiation or last restart call + Elapsed time in seconds since instantiation or last restart call. """ return time.time() - self.start_time @@ -123,7 +124,7 @@ def _initialize(self) -> None: class NumericalAverageMeter(Meter): """ - Maintains running average for scalar metrics + Maintains running average for scalar metrics. Call .restart() to clear added values and to start a new round of averaging """ @@ -135,7 +136,7 @@ def _initialize(self) -> None: def update(self, value: Union[Tensor, float]) -> None: """ - add the current value to be further averaged + Add the current value to be further averaged. Parameters ---------- @@ -153,7 +154,7 @@ def update(self, value: Union[Tensor, float]) -> None: @property def value(self) -> float: """ - average of all values added since instantiation or last restart call + Average of all values added since instantiation or last restart call. Returns ------- @@ -169,8 +170,8 @@ def value(self) -> float: class BatchAverageMeter(NumericalAverageMeter): """ - Maintains running average for a stream of batched data, in which the outer dimension is - assumed to be batches + Maintains running average for a stream of batched data, in which the outer + dimension is assumed to be batches. Call .restart() to clear added values and to start a new round of averaging """ @@ -183,7 +184,7 @@ def __init__( def update(self, values: Tensor) -> None: """ - add a batch of values to be further averaged + Add a batch of values to be further averaged. Parameters ---------- @@ -206,10 +207,11 @@ def update(self, values: Tensor) -> None: class MeanDeviationMeter(Meter): """ - Maintains a stream of deviation and base values, and compute the ratio of their sums - e.g. WAPE in demand forecasting. + Maintains a stream of deviation and base values, and computes the ratio of + their sums, e.g. WAPE in demand forecasting. - MD = \sigma{deviation} / \sigma{base} + .. math:: + MD = \sum{deviation} / \sum{base} Call .restart() to clear added values and to start a new round of averaging """ @@ -224,7 +226,7 @@ def _initialize(self): def update(self, deviation: Tensor, base: Tensor) -> None: """ - add new deviation and base values + Add new deviation and base values. Parameters ---------- @@ -255,16 +257,18 @@ def value(self) -> float: class RootMeanSquareDeviationMeter(MeanDeviationMeter): """ - Maintains a stream of deviation and base values, and compute the following ratio + Maintains a stream of deviation and base values, and computes the following + ratio. - MD = \sqrt{\sigma{deviation^2}} / \sqrt{\sigma{base^2}} + ..
math:: + MD = \sqrt{\sum{deviation^2}} / \sqrt{\sum{base^2}} Call .restart() to clear added values and to start a new round of averaging """ def update(self, deviation: Tensor, base: Tensor) -> None: """ - add new deviation and base values + Add new deviation and base values. Parameters ---------- diff --git a/src/gluonts/nursery/daf/tslib/nn/activations.py b/src/gluonts/nursery/daf/tslib/nn/activations.py index a4b80b9ba6..e77fd007ed 100644 --- a/src/gluonts/nursery/daf/tslib/nn/activations.py +++ b/src/gluonts/nursery/daf/tslib/nn/activations.py @@ -21,8 +21,10 @@ class GeLU(nn.Module): """ - Gaussian error Linear Unit - y = 1/2 * x * (1 + tanh(\sqrt{2/pi} * (x + 0.044715*x^3))) + Gaussian Error Linear Unit. + + .. math:: + y = 1/2 \cdot x \cdot (1 + \tanh(\sqrt{2/\pi} \cdot (x + 0.044715 x^3))) """ def forward(self, x: Tensor) -> Tensor: @@ -40,8 +42,10 @@ def forward(self, x: Tensor) -> Tensor: class Swish(nn.Sigmoid): """ - Swish activation by google https://arxiv.org/pdf/1710.05941v1.pdf - y = \sigma(x) * x + Swish activation by Google, https://arxiv.org/pdf/1710.05941v1.pdf. + + .. math:: + y = \sigma(x) \cdot x """ def forward(self, x: Tensor) -> Tensor: @@ -50,16 +54,14 @@ def forward(self, x: Tensor) -> Tensor: class PositiveSoftplus(nn.Softplus): """ - Softplus function that ensures a strictly positive activation + Softplus function that ensures a strictly positive activation. Parameters ---------- margin : float the minimum value of activation. when =0, same as vanilla softplus beta: float - threshold: float - """ def __init__( @@ -78,7 +80,8 @@ def forward(self, x: Tensor) -> Tensor: class GatedLinearUnit(nn.Module): """ - Gated Linear Unit activation proposed by https://arxiv.org/pdf/1612.08083.pdf + Gated Linear Unit activation proposed by + https://arxiv.org/pdf/1612.08083.pdf. Parameters ---------- diff --git a/src/gluonts/nursery/daf/tslib/nn/attention/base.py b/src/gluonts/nursery/daf/tslib/nn/attention/base.py index 40c0b5aa87..110d29f9c8 100644 --- a/src/gluonts/nursery/daf/tslib/nn/attention/base.py +++ b/src/gluonts/nursery/daf/tslib/nn/attention/base.py @@ -22,7 +22,8 @@ class Attention(nn.Module): """ - Base class of attention modules + Base class of attention modules. + *NOTE*: d_hidden must be divisible by n_head Parameters ---------- @@ -61,10 +62,10 @@ def __init__( def _split_head(self, x: Tensor) -> Tensor: """ - Split hidden state into multi-heads + Split hidden state into multi-heads. Args - ---------- + ---- x : Tensor [batch, length, d_hidden] Returns ------- @@ -77,7 +78,7 @@ def _split_head(self, x: Tensor) -> Tensor: def _merge_head(self, x: Tensor) -> Tensor: """ - Merge multi-heads into one hidden state + Merge multi-heads into one hidden state. Args ---------- diff --git a/src/gluonts/nursery/daf/tslib/nn/attention/interattn.py b/src/gluonts/nursery/daf/tslib/nn/attention/interattn.py index f84ebde911..9dfbd022b3 100644 --- a/src/gluonts/nursery/daf/tslib/nn/attention/interattn.py +++ b/src/gluonts/nursery/daf/tslib/nn/attention/interattn.py @@ -27,7 +27,7 @@ class InterAttention(Attention): """ - Inter-attention module with k,v from source and q from target + Inter-attention module with k,v from source and q from target.
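The ``_split_head`` / ``_merge_head`` pair documented above is a pure reshaping exercise. A hedged PyTorch sketch of one way to implement it (the daf code may differ in layout details):

```python
import torch

def split_head(x: torch.Tensor, n_head: int) -> torch.Tensor:
    """[batch, length, d_hidden] -> [batch * n_head, length, d_hidden // n_head]."""
    b, t, d = x.shape
    assert d % n_head == 0, "d_hidden must be divisible by n_head"
    return (
        x.view(b, t, n_head, d // n_head)
        .transpose(1, 2)                    # [b, n_head, t, d_head]
        .reshape(b * n_head, t, d // n_head)
    )

def merge_head(x: torch.Tensor, n_head: int) -> torch.Tensor:
    """Inverse of split_head."""
    bh, t, d_head = x.shape
    b = bh // n_head
    return (
        x.view(b, n_head, t, d_head)
        .transpose(1, 2)                    # [b, t, n_head, d_head]
        .reshape(b, t, n_head * d_head)
    )

x = torch.randn(2, 7, 12)
assert torch.equal(merge_head(split_head(x, n_head=3), n_head=3), x)
```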
Parameters ---------- diff --git a/src/gluonts/nursery/daf/tslib/nn/attention/selfattn.py b/src/gluonts/nursery/daf/tslib/nn/attention/selfattn.py index 0d8964d804..c9ce187510 100644 --- a/src/gluonts/nursery/daf/tslib/nn/attention/selfattn.py +++ b/src/gluonts/nursery/daf/tslib/nn/attention/selfattn.py @@ -27,7 +27,7 @@ class SelfAttention(Attention): """ - Self-attention module with q,k,v from the same input + Self-attention module with q,k,v from the same input. Parameters ---------- @@ -228,9 +228,10 @@ def forward( class GroupSelfAttention(SelfAttention): """ - Self-attention module with q,k from the same input tensor. - The input tensor is the concatenation of `n_groups` of slightly different feature maps. - Thus the projections are 1x1 group convolutions. + Self-attention module with q,k from the same input tensor. The input tensor + is the concatenation of `n_groups` of slightly different feature maps. Thus + the projections are 1x1 group convolutions. + *NOTE*: d_qk, d_hidden, n_head must be divisible by n_groups """ diff --git a/src/gluonts/nursery/daf/tslib/nn/transformer.py b/src/gluonts/nursery/daf/tslib/nn/transformer.py index c97ea46ea7..638cbe4599 100644 --- a/src/gluonts/nursery/daf/tslib/nn/transformer.py +++ b/src/gluonts/nursery/daf/tslib/nn/transformer.py @@ -38,7 +38,7 @@ class PositionwiseFFN(nn.Module): """ - Positionwise feedforward network in transformers + Positionwise feedforward network in transformers. """ def __init__( diff --git a/src/gluonts/nursery/daf/tslib/nn/utils.py b/src/gluonts/nursery/daf/tslib/nn/utils.py index c535b96971..a94932a0e3 100644 --- a/src/gluonts/nursery/daf/tslib/nn/utils.py +++ b/src/gluonts/nursery/daf/tslib/nn/utils.py @@ -25,7 +25,7 @@ class ResidualBlock(nn.Module): """ - Network module wrapped by residual connection + Network module wrapped by residual connection. Args ---------- diff --git a/src/gluonts/nursery/few_shot_prediction/src/meta/callbacks/count.py b/src/gluonts/nursery/few_shot_prediction/src/meta/callbacks/count.py index b91af33b6c..a1f48846b4 100644 --- a/src/gluonts/nursery/few_shot_prediction/src/meta/callbacks/count.py +++ b/src/gluonts/nursery/few_shot_prediction/src/meta/callbacks/count.py @@ -17,7 +17,9 @@ class ParameterCountCallback(Callback): # type: ignore """ This callback allows counting model parameters during training. - The output is printed to the console and can be retrieved from the log files. + + The output is printed to the console and can be retrieved from the log + files. """ def __init__(self) -> None: diff --git a/src/gluonts/nursery/few_shot_prediction/src/meta/callbacks/metric.py b/src/gluonts/nursery/few_shot_prediction/src/meta/callbacks/metric.py index 8812a06c88..9e626d5b32 100644 --- a/src/gluonts/nursery/few_shot_prediction/src/meta/callbacks/metric.py +++ b/src/gluonts/nursery/few_shot_prediction/src/meta/callbacks/metric.py @@ -23,8 +23,9 @@ class QuantileMetricLoggerCallback(Callback): """ - A callback that computes additional metrics on a numpy representation of the dataset every n epochs. - The computed values are logged to the output file of the pytorch lightning logger. + A callback that computes additional metrics on a numpy representation of + the dataset every n epochs. The computed values are logged to the output + file of the pytorch lightning logger. Args: quantiles: The quantiles that are predicted. 
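A hedged sketch of the quantity the ``ParameterCountCallback`` above reports (assuming standard PyTorch modules; the actual callback prints this via Lightning during training):

```python
from torch import nn

def count_parameters(model: nn.Module) -> dict:
    """Count total and trainable parameters of a model."""
    total = sum(p.numel() for p in model.parameters())
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    return {"total": total, "trainable": trainable}

model = nn.Sequential(nn.Linear(10, 32), nn.ReLU(), nn.Linear(32, 1))
print(count_parameters(model))  # {'total': 385, 'trainable': 385}
```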
diff --git a/src/gluonts/nursery/few_shot_prediction/src/meta/callbacks/plot.py b/src/gluonts/nursery/few_shot_prediction/src/meta/callbacks/plot.py index 7169bd6cb4..a627a4ae48 100644 --- a/src/gluonts/nursery/few_shot_prediction/src/meta/callbacks/plot.py +++ b/src/gluonts/nursery/few_shot_prediction/src/meta/callbacks/plot.py @@ -27,9 +27,10 @@ class ForecastPlotLoggerCallback(Callback): """ - A callback that stores plots of the predictions for a collection of samples every n epochs. - The plots display the query (past and future) and forecasted quantiles. - This callback is intended for models without support set and attention mechanism. + A callback that stores plots of the predictions for a collection of + samples every n epochs. The plots display the query (past and future) and + forecasted quantiles. This callback is intended for models without a support + set and attention mechanism. Args: log_batch: For each sample in the batch the prediction is plotted when the callback is called. @@ -83,9 +84,11 @@ def on_validation_epoch_end( class ForecastSupportSetAttentionPlotLoggerCallback(Callback): """ - A callback that stores plots of the predictions for a collection of samples every n epochs. - The plots display the query (past and future), forecasted quantiles and the time series in the support set - of this sample aligned with their attention scores. This callback works only for models with attention mechanism! + A callback that stores plots of the predictions for a collection of + samples every n epochs. The plots display the query (past and future), + forecasted quantiles and the time series in the support set of this sample + aligned with their attention scores. This callback works only for models + with an attention mechanism! Args: log_batch: For each sample in the batch the prediction is plotted when the callback is called. @@ -160,7 +163,8 @@ def on_validation_epoch_end( class LossPlotLoggerCallback(Callback): """ - A callback that stores plots of the training and macro-averaged validation loss curve every n epochs. + A callback that stores plots of the training and macro-averaged validation + loss curve every n epochs. Args: every_n_epochs: Specifies how often the plots are generated. @@ -233,7 +237,8 @@ def plot_loss( class CheatLossPlotLoggerCallback(LossPlotLoggerCallback): """ - A callback that stores plots of the training and multiple validation losses curve every n epochs. + A callback that stores plots of the training and multiple validation loss + curves every n epochs. Args: every_n_epochs: Specifies how often the plots are generated. @@ -272,7 +277,8 @@ def plot_loss( class MacroCRPSPlotCallback(Callback): """ - A callback that stores plots of the validation losses and a macro-averaged validation loss curve every n epochs. + A callback that stores plots of the validation losses and a macro-averaged + validation loss curve every n epochs. Args: every_n_epochs: Specifies how often the plots are generated. diff --git a/src/gluonts/nursery/few_shot_prediction/src/meta/callbacks/save.py b/src/gluonts/nursery/few_shot_prediction/src/meta/callbacks/save.py index 45381d0e9c..67183a4c86 100644 --- a/src/gluonts/nursery/few_shot_prediction/src/meta/callbacks/save.py +++ b/src/gluonts/nursery/few_shot_prediction/src/meta/callbacks/save.py @@ -20,9 +20,10 @@ class InitialSaveCallback(Callback): # type: ignore """ - This callback saves the initial model using the save method of the LightKit ConfigModule.
- This allows to load the model later without access to the hyper parameters needed to instantiate the class. - Additionally, a dictionary of arguments is stored. + This callback saves the initial model using the save method of the LightKit + ConfigModule. This allows loading the model later without access to the + hyper-parameters needed to instantiate the class. Additionally, a + dictionary of arguments is stored. Args: args_to_save: Contains the arguments that are stored. diff --git a/src/gluonts/nursery/few_shot_prediction/src/meta/data/batch.py b/src/gluonts/nursery/few_shot_prediction/src/meta/data/batch.py index 6512d7f5df..e4accdc270 100644 --- a/src/gluonts/nursery/few_shot_prediction/src/meta/data/batch.py +++ b/src/gluonts/nursery/few_shot_prediction/src/meta/data/batch.py @@ -25,8 +25,9 @@ @dataclass class SeriesBatch: """ - A batch of series from different base datasets, represented by the padded batch, the lengths of the series - and the splits sizes indicating the corresponding base dataset. + A batch of series from different base datasets, represented by the padded + batch, the lengths of the series and the split sizes indicating the + corresponding base dataset. """ sequences: ( @@ -106,7 +107,10 @@ def to(self, device) -> SeriesBatch: def rescale(self) -> SeriesBatch: """ - Redo standardization. The series must contain the same time series in the same order as the dataset. + Redo standardization. + + The series must contain the same time series in the same order as the + dataset. """ m = self.scales[:, 0].unsqueeze(1) std = self.scales[:, 1].unsqueeze(1) @@ -118,8 +122,11 @@ def rescale(self) -> SeriesBatch: def one_per_split(self) -> SeriesBatch: """ - Choose the first element of every split section and return the resulting series batch. - This method should only be used on query (past and future) series batches. + Choose the first element of every split section and return the + resulting series batch. + + This method should only be used on query (past and future) series + batches. """ splits = torch.split(self.sequences, self.split_sections.tolist()) lengths = torch.split(self.lengths, self.split_sections.tolist()) @@ -131,7 +138,9 @@ def first_n(self, n: int) -> SeriesBatch: """ - Choose the first n splits of the sequences as defined by split_sections. + Choose the first n splits of the sequences as defined by + ``split_sections``. + Self.sequences has thus sum(split_section[i], i=0, ..., n-1) elements. """ splits = torch.split(self.sequences, self.split_sections.tolist()) @@ -157,7 +166,9 @@ def __getitem__(self, index: int) -> SeriesBatch: @dataclass class TripletBatch: """ - A triplet batch, composed of a batch of support sets, query contexts and prediction horizons. + A triplet batch, composed of a batch of support sets, query contexts and + prediction horizons. + Compared to a simple triplet, it also manages the lengths of all samples. """ @classmethod def collate(cls, triplets: List[Triplet]) -> TripletBatch: """ - Combines a list of triplets into a batched triplet to pass to a network. + Combines a list of triplets into a batched triplet to pass to a + network. """ s, p, f = zip(*triplets) return TripletBatch( @@ -192,8 +204,10 @@ def pin_memory(self): def reduce_to_unique_query(self) -> TripletBatch: """ - Selects the first query of every group of queries that use the same support set - (see split_sections of queries) and returns the resulting triplet batch.
+ Selects the first query of every group of queries that use the same + support set (see split_sections of queries) and returns the resulting + triplet batch. + Query past and future are reduced, support sets are not touched. """ return TripletBatch( @@ -204,7 +218,8 @@ def first_n(self, n: int) -> TripletBatch: """ - Choose the first n splits of each series batch and return the resulting triplet batch. + Choose the first n splits of each series batch and return the resulting + triplet batch. """ return TripletBatch( support_set=self.support_set.first_n(n), diff --git a/src/gluonts/nursery/few_shot_prediction/src/meta/data/dataset.py b/src/gluonts/nursery/few_shot_prediction/src/meta/data/dataset.py index 40d50ff6fd..4e773fa0fb 100644 --- a/src/gluonts/nursery/few_shot_prediction/src/meta/data/dataset.py +++ b/src/gluonts/nursery/few_shot_prediction/src/meta/data/dataset.py @@ -22,8 +22,8 @@ @dataclass class TimeSeries: """ - A time series contains the time series data along with metadata about the time series as well - as static and dynamic features. + A time series contains the time series data along with metadata about the + time series as well as static and dynamic features. """ dataset_name: str @@ -103,7 +103,10 @@ def __init__( def rescale_dataset(self, series: torch.Tensor): """ - Redo standardization. The series must contain the same time series in the same order as the dataset. + Redo standardization. + + The series must contain the same time series in the same order as the + dataset. """ return ( (series * self.stds.unsqueeze(2)) + self.means.unsqueeze(2) diff --git a/src/gluonts/nursery/few_shot_prediction/src/meta/data/sampling.py b/src/gluonts/nursery/few_shot_prediction/src/meta/data/sampling.py index 9fc089fafe..a9f3adceca 100644 --- a/src/gluonts/nursery/few_shot_prediction/src/meta/data/sampling.py +++ b/src/gluonts/nursery/few_shot_prediction/src/meta/data/sampling.py @@ -23,11 +23,13 @@ class WeightedIndexIterator: - """Iterator that caches a number of indices sampled according to given weights. + """ + Iterator that caches a number of indices sampled according to given + weights. - This gives a great performance speedup since np.random.choice is the bottleneck - of the data loading. This class samples and caches a certain number of indices and - return them until new ones need to be sampled. + This gives a great performance speedup since np.random.choice is the + bottleneck of the data loading. This class samples and caches a certain + number of indices and returns them until new ones need to be sampled. """ def __init__(self, weights: np.ndarray, num_cache: int = 1024): @@ -58,7 +60,8 @@ def __next__(self): @dataclass class Triplet: """ - A triplet is composed of a support set, observed queries, and corresponding (unobserved) future queries. + A triplet is composed of a support set, observed queries, and corresponding + (unobserved) future queries. """ support_set: List[ @@ -77,7 +80,8 @@ def __iter__(self): class TripletDataset(Dataset[Triplet]): """ - The triplet dataset gets a list of queries and corresponding support set and returns them as triplets. + The triplet dataset gets a list of queries and corresponding support set + and returns them as triplets. """ def __init__( @@ -104,9 +108,11 @@ class SamplingTripletDataset(IterableDataset[Triplet]): # type: ignore """ The sampling triplet dataset randomly samples support sets and past queries along with their future prediction horizon.
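The padded-batch bookkeeping that the ``SeriesBatch`` hunks above describe can be pictured with a few tensors. A minimal sketch (illustrative, not the actual dataclass):

```python
import torch
from torch.nn.utils.rnn import pad_sequence

series_a = [torch.arange(5.0), torch.arange(3.0)]  # two series from dataset A
series_b = [torch.arange(4.0)]                     # one series from dataset B

sequences = pad_sequence(series_a + series_b, batch_first=True)  # [3, 5], zero-padded
lengths = torch.tensor([5, 3, 4])                  # true lengths before padding
split_sections = torch.tensor([2, 1])              # series per base dataset

# Recover the per-dataset groups, as one_per_split / first_n do above.
groups = torch.split(sequences, split_sections.tolist())
assert [g.shape[0] for g in groups] == [2, 1]
```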
+ + All three sets consist of time series windows sliced from the original time + series. The support set time series end before the prediction horizon + begins to avoid time leakage. The dataset yields infinitely many items. + For now, the support set time series length is context_length. """ def __init__( @@ -225,9 +231,11 @@ def _sample_queries(self): class SequentialTripletDataset(Dataset[Triplet]): # type: ignore """ - The sequential triplet dataset traverses the dataset and uses the last prediction length slice as future query. - The support set is sampled randomly. The length of dataset is the number of times series - divided by the number of queries. + The sequential triplet dataset traverses the dataset and uses the last + prediction length slice as the future query. + + The support set is sampled randomly. The length of the dataset is the + number of time series divided by the number of queries. """ def __init__( @@ -386,11 +394,13 @@ def sample_supps( class SuperSamplingTripletDataset(IterableDataset[Triplet]): # type: ignore """ - The super sampling triplet dataset randomly samples support sets and past queries - along with their future prediction horizon from a list of sampling datasets. - First a sampling dataset is randomly chosen. - Then the chosen triplet dataset samples support, query past and query future set. - The dataset yields infinitely many items. + The super sampling triplet dataset randomly samples support sets and past + queries along with their future prediction horizon from a list of sampling + datasets. + + First a sampling dataset is randomly chosen. Then the chosen triplet + dataset samples the support, query past and query future sets. The dataset + yields infinitely many items. """ def __init__( diff --git a/src/gluonts/nursery/few_shot_prediction/src/meta/datasets/artificial.py b/src/gluonts/nursery/few_shot_prediction/src/meta/datasets/artificial.py index 5a2dab200b..2d6353d137 100644 --- a/src/gluonts/nursery/few_shot_prediction/src/meta/datasets/artificial.py +++ b/src/gluonts/nursery/few_shot_prediction/src/meta/datasets/artificial.py @@ -37,7 +37,7 @@ @dataclass class MetaData: """ - Meta data for artificial datasets + Meta data for artificial datasets. """ context_length_multiple: int @@ -116,7 +116,8 @@ def prediction_length(self) -> int: @property def root(self) -> Path: """ - Returns the directory where all the data pertaining to this dataset is stored. + Returns the directory where all the data pertaining to this dataset is + stored. """ return self.data_dir / "artificial" / self.dataset_name diff --git a/src/gluonts/nursery/few_shot_prediction/src/meta/datasets/cheat.py b/src/gluonts/nursery/few_shot_prediction/src/meta/datasets/cheat.py index 8f53989c7f..51e205155f 100644 --- a/src/gluonts/nursery/few_shot_prediction/src/meta/datasets/cheat.py +++ b/src/gluonts/nursery/few_shot_prediction/src/meta/datasets/cheat.py @@ -39,7 +39,7 @@ @dataclass(frozen=True) class CheatMetaData: """ - Meta data for a cheating dataset + Meta data for a cheating dataset.
""" context_length_multiple: int @@ -76,8 +76,8 @@ def get_hash(self): @register_data_module class CheatArtificialDataModule(pl.LightningDataModule): """ - A data module which provides datasets with different - ground truth / counter factual injection modes for the support set. + A data module which provides datasets with different ground truth / counter + factual injection modes for the support set. Parameters ---------- @@ -200,7 +200,8 @@ def dataset_name(self) -> str: @property def root(self) -> Path: """ - Returns the directory where all the data pertaining to this dataset is stored. + Returns the directory where all the data pertaining to this dataset is + stored. """ return self.data_dir / "artificial" / self.dataset_name @@ -491,7 +492,7 @@ def name(cls) -> str: @dataclass(frozen=True) class CheatCounterfactual(CheatMetaData): """ - Meta data for a cheating dataset + Meta data for a cheating dataset. """ counterfactual_size: int diff --git a/src/gluonts/nursery/few_shot_prediction/src/meta/datasets/gluonts.py b/src/gluonts/nursery/few_shot_prediction/src/meta/datasets/gluonts.py index 6755d83b6f..92b1572dce 100644 --- a/src/gluonts/nursery/few_shot_prediction/src/meta/datasets/gluonts.py +++ b/src/gluonts/nursery/few_shot_prediction/src/meta/datasets/gluonts.py @@ -138,7 +138,8 @@ def prediction_length(self) -> int: @property def root(self) -> Path: """ - Returns the directory where all the data pertaining to this dataset is stored. + Returns the directory where all the data pertaining to this dataset is + stored. """ return self.data_dir / "datasets" / self.dataset_name diff --git a/src/gluonts/nursery/few_shot_prediction/src/meta/datasets/m1.py b/src/gluonts/nursery/few_shot_prediction/src/meta/datasets/m1.py index f243bf4bde..4a9a8fedc7 100644 --- a/src/gluonts/nursery/few_shot_prediction/src/meta/datasets/m1.py +++ b/src/gluonts/nursery/few_shot_prediction/src/meta/datasets/m1.py @@ -29,7 +29,8 @@ @register_data_module class M1DataModule(GluonTSDataModule): """ - A data module which provides a frequency-category split of the M1 dataset as a standalone dataset. + A data module which provides a frequency-category split of the M1 dataset + as a standalone dataset. """ def __init__(self, **kwargs): diff --git a/src/gluonts/nursery/few_shot_prediction/src/meta/datasets/m3.py b/src/gluonts/nursery/few_shot_prediction/src/meta/datasets/m3.py index e49de52262..56df977054 100644 --- a/src/gluonts/nursery/few_shot_prediction/src/meta/datasets/m3.py +++ b/src/gluonts/nursery/few_shot_prediction/src/meta/datasets/m3.py @@ -29,7 +29,8 @@ @register_data_module class M3DataModule(GluonTSDataModule): """ - A data module which provides a frequency-category split of the M3 dataset as a standalone dataset. + A data module which provides a frequency-category split of the M3 dataset + as a standalone dataset. """ def __init__(self, **kwargs): diff --git a/src/gluonts/nursery/few_shot_prediction/src/meta/datasets/m4.py b/src/gluonts/nursery/few_shot_prediction/src/meta/datasets/m4.py index a510676208..2a292bed04 100644 --- a/src/gluonts/nursery/few_shot_prediction/src/meta/datasets/m4.py +++ b/src/gluonts/nursery/few_shot_prediction/src/meta/datasets/m4.py @@ -36,7 +36,8 @@ @register_data_module class M4DataModule(GluonTSDataModule): """ - A data module which provides a frequency-category split of the M4 dataset as a standalone dataset. + A data module which provides a frequency-category split of the M4 dataset + as a standalone dataset. 
""" def __init__(self, **kwargs): diff --git a/src/gluonts/nursery/few_shot_prediction/src/meta/datasets/preprocessing/filters.py b/src/gluonts/nursery/few_shot_prediction/src/meta/datasets/preprocessing/filters.py index 8955f4ec4f..ed01ce0858 100644 --- a/src/gluonts/nursery/few_shot_prediction/src/meta/datasets/preprocessing/filters.py +++ b/src/gluonts/nursery/few_shot_prediction/src/meta/datasets/preprocessing/filters.py @@ -27,7 +27,8 @@ class Filter(ABC): @abstractmethod def __call__(self, items: List[Item]) -> List[Item]: """ - Filters the given items and returns the ones that should be kept in the dataset. + Filters the given items and returns the ones that should be kept in the + dataset. Args: items: The items to filter. @@ -42,8 +43,9 @@ def __call__(self, items: List[Item]) -> List[Item]: class ConstantTargetFilter(Filter): """ - A filter which removes items having only constant target values. This filter should be used - whenever metrics such as the MASE are required. + A filter which removes items having only constant target values. + + This filter should be used whenever metrics such as the MASE are required. """ def __init__(self, prediction_length: int, required_length: int = 0): @@ -67,7 +69,8 @@ def __call__(self, items: List[Item]) -> List[Item]: class AbsoluteValueFilter(Filter): """ - A filter which removes items having absolute average values of more than the provided value. + A filter which removes items having absolute average values of more than + the provided value. """ def __init__(self, value: float): diff --git a/src/gluonts/nursery/few_shot_prediction/src/meta/datasets/preprocessing/transform.py b/src/gluonts/nursery/few_shot_prediction/src/meta/datasets/preprocessing/transform.py index f5041f0e04..572a44fc87 100644 --- a/src/gluonts/nursery/few_shot_prediction/src/meta/datasets/preprocessing/transform.py +++ b/src/gluonts/nursery/few_shot_prediction/src/meta/datasets/preprocessing/transform.py @@ -26,7 +26,8 @@ class Transform(ABC): """ - A transform enables transforming the set of time series contained in a dataset. + A transform enables transforming the set of time series contained in a + dataset. """ @abstractmethod @@ -44,8 +45,10 @@ def __call__(self, items: List[Item]) -> List[Item]: class ItemIDTransform(Transform): """ - Adds an id to the time series. For rolling test sets - the time series that are an extension of each other have the same id. + Adds an id to the time series. + + For rolling test sets the time series that are an extension of each other + have the same id. """ def __init__(self, required_length: int = 0): @@ -68,8 +71,8 @@ def read_transform_write( source: Optional[Path] = None, ) -> None: """ - Reads the dataset from the provided path, applies the given transform and writes it back to the - same file. + Reads the dataset from the provided path, applies the given transform and + writes it back to the same file. Args: file: The path from where to read the data. diff --git a/src/gluonts/nursery/few_shot_prediction/src/meta/datasets/registry.py b/src/gluonts/nursery/few_shot_prediction/src/meta/datasets/registry.py index 27f7cd3a6b..2bd5b1284a 100644 --- a/src/gluonts/nursery/few_shot_prediction/src/meta/datasets/registry.py +++ b/src/gluonts/nursery/few_shot_prediction/src/meta/datasets/registry.py @@ -33,8 +33,9 @@ def register_data_module(cls: M) -> M: def get_data_module(name: str, **kwargs: Any) -> pl.LightningDataModule: """ This method creates the data module with the specified name. 
The provided - keyword arguments must contain ALL arguments required by the model configuration. - Superfluous arguments may be provided and are simply ignored. + keyword arguments must contain ALL arguments required by the model + configuration. Superfluous arguments may be provided and are simply + ignored. Parameters ---------- diff --git a/src/gluonts/nursery/few_shot_prediction/src/meta/datasets/splits.py b/src/gluonts/nursery/few_shot_prediction/src/meta/datasets/splits.py index ad0cbf24ce..069ad1fda3 100644 --- a/src/gluonts/nursery/few_shot_prediction/src/meta/datasets/splits.py +++ b/src/gluonts/nursery/few_shot_prediction/src/meta/datasets/splits.py @@ -30,9 +30,11 @@ @dataclass class DatasetSplits: """ - The dataset splits provide train, validation and test data for a particular dataset. Calling - any of the functions here, is a noop. Data is only loaded once a particular representation of - the data is accessed. + The dataset splits provide train, validation and test data for a particular + dataset. + + Calling any of the functions here is a no-op. Data is only loaded once a + particular representation of the data is accessed. """ _metadata: MetaData @@ -65,7 +67,9 @@ def train(self, val: bool = True, name: str = "") -> DatasetSplit: def val(self, name: str = "") -> DatasetSplit: """ - Returns the validation data for the dataset. This is the same as :meth:`train(False)`. + Returns the validation data for the dataset. + + This is the same as :meth:`train(False)`. """ return DatasetSplit( self._metadata, @@ -93,8 +97,8 @@ def test(self, name: str = "") -> DatasetSplit: @dataclass class DatasetSplit: """ - A dataset split provides all the representations for a particular split (train/val/test) of a - dataset. + A dataset split provides all the representations for a particular split + (train/val/test) of a dataset. """ _metadata: MetaData @@ -106,8 +110,9 @@ class DatasetSplit: def data(self, evaluation: bool = False) -> TimeSeriesDataset: """ - Returns a time series dataset for the dataset split. This loads the associated JSON file and - is, thus, potentially slow. + Returns a time series dataset for the dataset split. + + This loads the associated JSON file and is thus potentially slow. """ gluonts = FileDataset( self._directory / self._split, freq=self._metadata.freq @@ -170,8 +175,8 @@ def evaluation(self) -> EvaluationDataset: def prepare(self) -> None: """ - Prepares all required representations provided that the GluonTS dataset is already - generated. + Prepares all required representations provided that the GluonTS dataset + is already generated. """ target = self._directory / "numpy" / self._split if self._split == "train": @@ -202,9 +207,11 @@ def prepare(self) -> None: @dataclass class EvaluationDataset: """ - The evaluation dataset is a simple dataset representation that contains a two-dimensional array - of future values as well as a two-dimensional (masked) array of the past values that a model - sees during training. This representation is very efficient for evaluation. + The evaluation dataset is a simple dataset representation that contains a + two-dimensional array of future values as well as a two-dimensional + (masked) array of the past values that a model sees during training. + + This representation is very efficient for evaluation.
""" future: np.ndarray diff --git a/src/gluonts/nursery/few_shot_prediction/src/meta/metrics/crps.py b/src/gluonts/nursery/few_shot_prediction/src/meta/metrics/crps.py index 3044a7ef58..99cc4f2a54 100644 --- a/src/gluonts/nursery/few_shot_prediction/src/meta/metrics/crps.py +++ b/src/gluonts/nursery/few_shot_prediction/src/meta/metrics/crps.py @@ -18,7 +18,8 @@ class CRPS(Metric): """ - Same as mean_weighted_quantile_loss in meta.evaluation.metrics just for pytorch + Same as mean_weighted_quantile_loss in meta.evaluation.metrics just for + pytorch. Parameters ---------- diff --git a/src/gluonts/nursery/few_shot_prediction/src/meta/metrics/numpy.py b/src/gluonts/nursery/few_shot_prediction/src/meta/metrics/numpy.py index bf79e186c7..e13c00670f 100644 --- a/src/gluonts/nursery/few_shot_prediction/src/meta/metrics/numpy.py +++ b/src/gluonts/nursery/few_shot_prediction/src/meta/metrics/numpy.py @@ -82,7 +82,8 @@ def compute_metrics( seasonality: int, ) -> Dict[str, float]: """ - Evaluates the forecasts on the provided dataset and returns summary metrics. + Evaluates the forecasts on the provided dataset and returns summary + metrics. Parameters ---------- diff --git a/src/gluonts/nursery/few_shot_prediction/src/meta/models/EcDc/components/attention.py b/src/gluonts/nursery/few_shot_prediction/src/meta/models/EcDc/components/attention.py index b741d91871..2d6758345b 100644 --- a/src/gluonts/nursery/few_shot_prediction/src/meta/models/EcDc/components/attention.py +++ b/src/gluonts/nursery/few_shot_prediction/src/meta/models/EcDc/components/attention.py @@ -29,7 +29,8 @@ def forward( self, query: SeriesBatch, supps: SeriesBatch ) -> Union[Tuple[torch.Tensor, torch.Tensor], torch.Tensor]: """ - Each query attends to each time point in each series of its support set. + Each query attends to each time point in each series of its support + set. Note that we cannot use a canonical attention mechanism here because of the composition of the batches. The support time series are stacked along the batch dimension. Suppose a fixed support set size `x` diff --git a/src/gluonts/nursery/few_shot_prediction/src/meta/models/EcDc/components/decoder.py b/src/gluonts/nursery/few_shot_prediction/src/meta/models/EcDc/components/decoder.py index 41a9b9c0df..8af4a039e6 100644 --- a/src/gluonts/nursery/few_shot_prediction/src/meta/models/EcDc/components/decoder.py +++ b/src/gluonts/nursery/few_shot_prediction/src/meta/models/EcDc/components/decoder.py @@ -27,7 +27,8 @@ def forward( self, query: torch.Tensor, value: torch.Tensor ) -> torch.Tensor: """ - Forecast from encoded (query, support set) (typically via attention) and encoded queries. + Forecast from encoded (query, support set) (typically via attention) + and encoded queries. Parameters ---------- @@ -55,7 +56,6 @@ class FeedForwardQuantileDecoder(Decoder): Returns ------- torch.Tensor: predictions of size [batch, prediction_length, num_quantiles] - """ def __init__( diff --git a/src/gluonts/nursery/few_shot_prediction/src/meta/models/EcDc/components/supps.py b/src/gluonts/nursery/few_shot_prediction/src/meta/models/EcDc/components/supps.py index 396c135e2d..41dfe8463f 100644 --- a/src/gluonts/nursery/few_shot_prediction/src/meta/models/EcDc/components/supps.py +++ b/src/gluonts/nursery/few_shot_prediction/src/meta/models/EcDc/components/supps.py @@ -29,7 +29,8 @@ class SupportSetEncoder(nn.Module, ABC): @abstractmethod def forward(self, supps: SeriesBatch) -> SeriesBatch: """ - Encodes each time point of each support set time series with a vector of fixed size. 
+ Encodes each time point of each support set time series with a vector + of fixed size. Parameters ---------- @@ -46,7 +47,8 @@ def forward(self, supps: SeriesBatch) -> SeriesBatch: class LSTMSupportSetEncoder(SupportSetEncoder): """ - Encodes each time step in the support set times series via the hidden states of a LSTM. + Encodes each time step in the support set time series via the hidden + states of an LSTM. Parameters ---------- @@ -145,7 +147,7 @@ def forward(self, supps: SeriesBatch) -> torch.Tensor: class TcnSupportSetEncoder(SupportSetEncoder): """ - Encodes each time point of the support set time series via WaveNet + Encodes each time point of the support set time series via WaveNet. Parameters ---------- diff --git a/src/gluonts/nursery/few_shot_prediction/src/meta/models/EcDc/components/tcn.py b/src/gluonts/nursery/few_shot_prediction/src/meta/models/EcDc/components/tcn.py index 6d0db4d1bc..62b18eebd1 100644 --- a/src/gluonts/nursery/few_shot_prediction/src/meta/models/EcDc/components/tcn.py +++ b/src/gluonts/nursery/few_shot_prediction/src/meta/models/EcDc/components/tcn.py @@ -56,8 +56,8 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class CausalConvolution(torch.nn.Module): """ - A single causal convolution applies the causal convolution itself, weight normalization, and - an activation function. + A single causal convolution applies the causal convolution itself, weight + normalization, and an activation function. """ def __init__( @@ -88,8 +88,8 @@ def __init__( def forward(self, sequences: torch.Tensor) -> torch.Tensor: """ - Computes the causal convolution for the provided sequence. Inputs are padded such that - the output sequence length + Computes the causal convolution for the provided sequence. Inputs are + padded such that the output sequence length matches the input length. Args: sequences: Tensor of shape `[batch_size, in_channels, sequence_length]`. diff --git a/src/gluonts/nursery/few_shot_prediction/src/meta/models/EcDc/meta.py b/src/gluonts/nursery/few_shot_prediction/src/meta/models/EcDc/meta.py index f8379a6601..6d84344b10 100644 --- a/src/gluonts/nursery/few_shot_prediction/src/meta/models/EcDc/meta.py +++ b/src/gluonts/nursery/few_shot_prediction/src/meta/models/EcDc/meta.py @@ -99,6 +99,7 @@ def forward( class IwataKumagaiEcDcConfig: """ Configuration class for a LSTMEncoderFeedforwardDecoder. + See also: :class:`LSTMEncoderFeedforwardDecoder` """ @@ -130,9 +131,11 @@ class IwataKumagaiEcDc( EncoderDecoderMetaModel, Configurable[IwataKumagaiEcDcConfig] ): """ - The base model from the paper https://arxiv.org/abs/2009.14379 by Iwata and Kumagai. - Differences are quantile prediction and (optionally) longer prediction window. + The base model from the paper https://arxiv.org/abs/2009.14379 by Iwata and + Kumagai. + Differences are quantile prediction and (optionally) a longer prediction + window. """ def __init__(self, config: IwataKumagaiEcDcConfig): @@ -175,6 +178,7 @@ def name(cls) -> str: class CNNLSTMEcDcConfig: """ Configuration class for a LSTMEncoderFeedforwardDecoder. + See also: :class:`LSTMEncoderFeedforwardDecoder` """ @@ -239,6 +243,7 @@ def name(cls) -> str: class TcnEcDcConfig: """ Configuration class for a TcnEcDc. + See also: :class:` TcnEcDc` """ @@ -271,6 +276,7 @@ class TcnEcDcConfig: class TcnEcDc(EncoderDecoderMetaModel, Configurable[TcnEcDcConfig]): """ Shared WaveNet-like encoder for query and support set. + Multi-head attention to match the encoded query and support set. Feedforward decoder for multi-step quantile prediction.
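The padding rule that the ``CausalConvolution`` docstring above now states, sketched in PyTorch (a hypothetical minimal module, not the nursery implementation):

```python
import torch
from torch import nn

class CausalConv1d(nn.Module):
    """Left-pad by (kernel_size - 1) * dilation so no position sees the future."""

    def __init__(self, in_channels, out_channels, kernel_size, dilation=1):
        super().__init__()
        self.pad = (kernel_size - 1) * dilation
        self.conv = nn.Conv1d(
            in_channels, out_channels, kernel_size, dilation=dilation
        )

    def forward(self, x):  # x: [batch, in_channels, length]
        return self.conv(nn.functional.pad(x, (self.pad, 0)))

x = torch.randn(8, 4, 100)
y = CausalConv1d(4, 16, kernel_size=3, dilation=2)(x)
assert y.shape == (8, 16, 100)  # output length equals input length
```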
""" diff --git a/src/gluonts/nursery/few_shot_prediction/src/meta/models/EcDc/series.py b/src/gluonts/nursery/few_shot_prediction/src/meta/models/EcDc/series.py index 420ef6a00a..d6f481a7e6 100644 --- a/src/gluonts/nursery/few_shot_prediction/src/meta/models/EcDc/series.py +++ b/src/gluonts/nursery/few_shot_prediction/src/meta/models/EcDc/series.py @@ -31,6 +31,7 @@ class LSTMEncoderFeedforwardDecoderConfig: """ Configuration class for a LSTMEncoderFeedforwardDecoder. + See also: :class:`LSTMEncoderFeedforwardDecoder` """ diff --git a/src/gluonts/nursery/few_shot_prediction/src/meta/models/model.py b/src/gluonts/nursery/few_shot_prediction/src/meta/models/model.py index aeee080fe7..178c31902f 100644 --- a/src/gluonts/nursery/few_shot_prediction/src/meta/models/model.py +++ b/src/gluonts/nursery/few_shot_prediction/src/meta/models/model.py @@ -19,7 +19,8 @@ class MetaModel(ABC, nn.Module): """ - Base class for all meta models that make predictions based on a support set and queries + Base class for all meta models that make predictions based on a support set + and queries. """ @abstractmethod @@ -48,7 +49,7 @@ def device(self): class SeriesModel(ABC, nn.Module): """ - Base class for all models that make predictions based on a query only + Base class for all models that make predictions based on a query only. """ @abstractmethod diff --git a/src/gluonts/nursery/few_shot_prediction/src/meta/models/registry.py b/src/gluonts/nursery/few_shot_prediction/src/meta/models/registry.py index d34e19c31c..1fcc0fabd2 100644 --- a/src/gluonts/nursery/few_shot_prediction/src/meta/models/registry.py +++ b/src/gluonts/nursery/few_shot_prediction/src/meta/models/registry.py @@ -33,9 +33,10 @@ def register_model(cls: M) -> M: def get_model(name: str, **kwargs: Any) -> nn.Module: """ - This method creates the model configuration of the model with the specified name. The provided - keyword arguments must contain ALL arguments required by the model configuration. Superfluous - arguments may be provided and are simply ignored. + This method creates the model configuration of the model with the specified + name. The provided keyword arguments must contain ALL arguments required by + the model configuration. Superfluous arguments may be provided and are + simply ignored. In case the name is unknown or parameters for the model config's initializer are missing, an assertion error occurs. diff --git a/src/gluonts/nursery/few_shot_prediction/src/meta/vis/forecast.py b/src/gluonts/nursery/few_shot_prediction/src/meta/vis/forecast.py index 9129df2d30..238fd262c9 100644 --- a/src/gluonts/nursery/few_shot_prediction/src/meta/vis/forecast.py +++ b/src/gluonts/nursery/few_shot_prediction/src/meta/vis/forecast.py @@ -25,8 +25,9 @@ def plot_forecast_supportset_attention( quantiles: List[str], ) -> plt.Figure: """ - Plots the provided forecasts for each sample with confidence intervals using all provided quantiles. - Furthermore, plots the time series in the support set of this sample aligned with their attention scores. + Plots the provided forecasts for each sample with confidence intervals + using all provided quantiles. Furthermore, plots the time series in the + support set of this sample aligned with their attention scores. Parameters ---------- @@ -101,7 +102,8 @@ def plot_quantile_forecast( quantiles: List[str], ) -> plt.Figure: """ - Plots the provided forecasts for each sample with confidence intervals using all provided quantiles. 
+ Plots the provided forecasts for each sample with confidence intervals + using all provided quantiles. Parameters ---------- @@ -144,7 +146,8 @@ def _plot_quantile_forecast( quantiles: List[str], ): """ - Plots the provided forecast with confidence intervals using all provided quantiles. + Plots the provided forecast with confidence intervals using all provided + quantiles. Parameters ---------- diff --git a/src/gluonts/nursery/few_shot_prediction/src/scripts/data.py b/src/gluonts/nursery/few_shot_prediction/src/scripts/data.py index 0f0faf6589..7ff9a71e4a 100644 --- a/src/gluonts/nursery/few_shot_prediction/src/scripts/data.py +++ b/src/gluonts/nursery/few_shot_prediction/src/scripts/data.py @@ -100,7 +100,9 @@ def artificial( @main.command() def statistics(): - """Compute statistics of real-world datasets.""" + """ + Compute statistics of real-world datasets. + """ path_to_data = Path.home() / ".mxnet" / "gluon-ts" / "datasets" path_save_plots = Path.home() / "data" / "plots" diff --git a/src/gluonts/nursery/few_shot_prediction/src/scripts/train.py b/src/gluonts/nursery/few_shot_prediction/src/scripts/train.py index b250d44577..e5342c70b9 100644 --- a/src/gluonts/nursery/few_shot_prediction/src/scripts/train.py +++ b/src/gluonts/nursery/few_shot_prediction/src/scripts/train.py @@ -195,7 +195,7 @@ def main( **kwargs: int, ): """ - Trains a model in the meta learning framework + Trains a model in the meta learning framework. """ args_to_save = locals() random.seed(seed) diff --git a/src/gluonts/nursery/robust-mts-attack/eval.py b/src/gluonts/nursery/robust-mts-attack/eval.py index 501c0f0ae1..56632dc352 100644 --- a/src/gluonts/nursery/robust-mts-attack/eval.py +++ b/src/gluonts/nursery/robust-mts-attack/eval.py @@ -101,7 +101,7 @@ ) with torch.no_grad(): """ - Clean data + Clean data. """ inputs = dict( [(key, batch[key]) for key in PREDICTION_INPUT_NAMES] diff --git a/src/gluonts/nursery/robust-mts-attack/multivariate/datasets/grouper.py b/src/gluonts/nursery/robust-mts-attack/multivariate/datasets/grouper.py index 5f65081d1e..416e206920 100644 --- a/src/gluonts/nursery/robust-mts-attack/multivariate/datasets/grouper.py +++ b/src/gluonts/nursery/robust-mts-attack/multivariate/datasets/grouper.py @@ -81,10 +81,11 @@ def _preprocess(self, dataset: Dataset) -> None: """ The preprocess function iterates over the dataset to gather data that is necessary for grouping. + This includes: - 1) Storing first/last timestamp in the dataset - 2) Aligning time series - 3) Calculating groups + 1. Storing first/last timestamp in the dataset + 2. Aligning time series + 3. Calculating groups """ for data in dataset: timestamp = data["start"] diff --git a/src/gluonts/nursery/robust-mts-attack/pts/distributions/implicit_quantile.py b/src/gluonts/nursery/robust-mts-attack/pts/distributions/implicit_quantile.py index 32a609b781..57cc9273b1 100644 --- a/src/gluonts/nursery/robust-mts-attack/pts/distributions/implicit_quantile.py +++ b/src/gluonts/nursery/robust-mts-attack/pts/distributions/implicit_quantile.py @@ -42,9 +42,10 @@ def __init__( @torch.no_grad() def sample(self, sample_shape=torch.Size()): - """See arXiv: 1806.06923 - Once the model has learned how to predict a given quantile tau, one can sample from the - distribution of the target, by sampling tau values. + """ + See arXiv:1806.06923. Once the model has learned how to predict a given + quantile tau, one can sample from the distribution of the target by + sampling tau values.
""" if len(sample_shape) == 0: num_parallel_samples = 1 diff --git a/src/gluonts/nursery/robust-mts-attack/pts/distributions/utils.py b/src/gluonts/nursery/robust-mts-attack/pts/distributions/utils.py index 4c0bbf877f..c7fd3c8a02 100644 --- a/src/gluonts/nursery/robust-mts-attack/pts/distributions/utils.py +++ b/src/gluonts/nursery/robust-mts-attack/pts/distributions/utils.py @@ -17,8 +17,8 @@ def broadcast_shape(*shapes, **kwargs): """ - Similar to ``np.broadcast()`` but for shapes. - Equivalent to ``np.broadcast(*map(np.empty, shapes)).shape``. + Similar to ``np.broadcast()`` but for shapes. Equivalent to + ``np.broadcast(*map(np.empty, shapes)).shape``. :param tuple shapes: shapes of tensors. :param bool strict: whether to use extend-but-not-resize broadcasting. diff --git a/src/gluonts/nursery/robust-mts-attack/pts/feature/holiday.py b/src/gluonts/nursery/robust-mts-attack/pts/feature/holiday.py index aef6823c36..4d860f4d70 100644 --- a/src/gluonts/nursery/robust-mts-attack/pts/feature/holiday.py +++ b/src/gluonts/nursery/robust-mts-attack/pts/feature/holiday.py @@ -22,8 +22,8 @@ class CustomDateFeatureSet: """ Implements calculation of date features. The CustomDateFeatureSet is - applied on a pandas Series with Datetimeindex and returns a 1D array of - the shape (1, len(date_indices)). + applied on a pandas Series with Datetimeindex and returns a 1D array of the + shape (1, len(date_indices)). Note that for lower than daily granularity the distance to the holiday is still computed on a per-day basis. @@ -104,9 +104,9 @@ def __call__(self, dates): class CustomHolidayFeatureSet: """ Implements calculation of holiday features. The CustomHolidayFeatureSet is - applied on a pandas Series with Datetimeindex and returns a 2D array of - the shape (len(dates), num_features), where num_features are the number - of holidays. + applied on a pandas Series with Datetimeindex and returns a 2D array of the + shape (len(dates), num_features), where num_features are the number of + holidays. Note that for lower than daily granularity the distance to the holiday is still computed on a per-day basis. @@ -140,7 +140,6 @@ class CustomHolidayFeatureSet: 1.12535175e-07, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00], [3.67879441e-01, 1.00000000e+00, 3.67879441e-01, 1.83156389e-02, 1.23409804e-04, 1.12535175e-07, 0.00000000e+00, 0.00000000e+00]]) - """ def __init__( diff --git a/src/gluonts/nursery/robust-mts-attack/pts/model/deepvar/deepvar_network.py b/src/gluonts/nursery/robust-mts-attack/pts/model/deepvar/deepvar_network.py index 64353b4adf..f6a1c58939 100644 --- a/src/gluonts/nursery/robust-mts-attack/pts/model/deepvar/deepvar_network.py +++ b/src/gluonts/nursery/robust-mts-attack/pts/model/deepvar/deepvar_network.py @@ -240,11 +240,10 @@ def unroll_encoder( torch.Tensor, ]: """ - Unrolls the RNN encoder over past and, if present, future data. - Returns outputs and state of the encoder, plus the scale of - past_target_cdf and a vector of static features that was constructed - and fed as input to the encoder. All tensor arguments should have NTC - layout. + Unrolls the RNN encoder over past and, if present, future data. Returns + outputs and state of the encoder, plus the scale of past_target_cdf and + a vector of static features that was constructed and fed as input to + the encoder. All tensor arguments should have NTC layout. 
Parameters ---------- @@ -282,7 +281,6 @@ def unroll_encoder( Scaled lags(batch_size, sub_seq_len, target_dim, num_lags) inputs inputs to the RNN - """ # print(past_observed_values.shape) # print(past_is_pad.unsqueeze(-1).shape) diff --git a/src/gluonts/nursery/robust-mts-attack/pts/model/estimator.py b/src/gluonts/nursery/robust-mts-attack/pts/model/estimator.py index 378f81c24b..953bef8bb5 100644 --- a/src/gluonts/nursery/robust-mts-attack/pts/model/estimator.py +++ b/src/gluonts/nursery/robust-mts-attack/pts/model/estimator.py @@ -74,7 +74,8 @@ def create_transformation(self) -> Transformation: def create_instance_splitter(self, mode: str) -> Transformation: """ - Create and return the instance splitter needed for training, validation or testing. + Create and return the instance splitter needed for training, validation + or testing. Returns ------- diff --git a/src/gluonts/nursery/robust-mts-attack/pts/model/n_beats/n_beats_ensemble.py b/src/gluonts/nursery/robust-mts-attack/pts/model/n_beats/n_beats_ensemble.py index ff9234273d..386dceebc3 100644 --- a/src/gluonts/nursery/robust-mts-attack/pts/model/n_beats/n_beats_ensemble.py +++ b/src/gluonts/nursery/robust-mts-attack/pts/model/n_beats/n_beats_ensemble.py @@ -113,8 +113,8 @@ def predict( class NBEATSEnsembleEstimator(PyTorchEstimator): """ - An ensemble N-BEATS Estimator (approximately) as described - in the paper: https://arxiv.org/abs/1905.10437. + An ensemble N-BEATS Estimator (approximately) as described in the paper: + https://arxiv.org/abs/1905.10437. The three meta parameters 'meta_context_length', 'meta_loss_function' and 'meta_bagging_size' together define the way the sub-models are assembled together. diff --git a/src/gluonts/nursery/robust-mts-attack/pts/model/simple_feedforward/simple_feedforward_estimator.py b/src/gluonts/nursery/robust-mts-attack/pts/model/simple_feedforward/simple_feedforward_estimator.py index 18e33a0546..450b43ea97 100644 --- a/src/gluonts/nursery/robust-mts-attack/pts/model/simple_feedforward/simple_feedforward_estimator.py +++ b/src/gluonts/nursery/robust-mts-attack/pts/model/simple_feedforward/simple_feedforward_estimator.py @@ -109,7 +109,9 @@ def __init__( num_parallel_samples: int = 100, ) -> None: """ - Defines an estimator. All parameters should be serializable. + Defines an estimator. + + All parameters should be serializable. """ super().__init__(trainer=trainer) diff --git a/src/gluonts/nursery/robust-mts-attack/pts/model/tempflow/tempflow_network.py b/src/gluonts/nursery/robust-mts-attack/pts/model/tempflow/tempflow_network.py index 6fc6ae1fa2..0fa842076f 100644 --- a/src/gluonts/nursery/robust-mts-attack/pts/model/tempflow/tempflow_network.py +++ b/src/gluonts/nursery/robust-mts-attack/pts/model/tempflow/tempflow_network.py @@ -216,11 +216,10 @@ def unroll_encoder( torch.Tensor, ]: """ - Unrolls the RNN encoder over past and, if present, future data. - Returns outputs and state of the encoder, plus the scale of - past_target_cdf and a vector of static features that was constructed - and fed as input to the encoder. All tensor arguments should have NTC - layout. + Unrolls the RNN encoder over past and, if present, future data. Returns + outputs and state of the encoder, plus the scale of past_target_cdf and + a vector of static features that was constructed and fed as input to + the encoder. All tensor arguments should have NTC layout. 
Parameters ---------- @@ -256,7 +255,6 @@ def unroll_encoder( Scaled lags(batch_size, sub_seq_len, target_dim, num_lags) inputs inputs to the RNN - """ past_observed_values = torch.min( diff --git a/src/gluonts/nursery/robust-mts-attack/pts/model/time_grad/time_grad_network.py b/src/gluonts/nursery/robust-mts-attack/pts/model/time_grad/time_grad_network.py index ecd657bf2f..edaa324211 100644 --- a/src/gluonts/nursery/robust-mts-attack/pts/model/time_grad/time_grad_network.py +++ b/src/gluonts/nursery/robust-mts-attack/pts/model/time_grad/time_grad_network.py @@ -232,11 +232,10 @@ def unroll_encoder( torch.Tensor, ]: """ - Unrolls the RNN encoder over past and, if present, future data. - Returns outputs and state of the encoder, plus the scale of - past_target_cdf and a vector of static features that was constructed - and fed as input to the encoder. All tensor arguments should have NTC - layout. + Unrolls the RNN encoder over past and, if present, future data. Returns + outputs and state of the encoder, plus the scale of past_target_cdf and + a vector of static features that was constructed and fed as input to + the encoder. All tensor arguments should have NTC layout. Parameters ---------- @@ -272,7 +271,6 @@ def unroll_encoder( Scaled lags(batch_size, sub_seq_len, target_dim, num_lags) inputs inputs to the RNN - """ past_observed_values = torch.min( diff --git a/src/gluonts/nursery/robust-mts-attack/pts/model/transformer/transformer_network.py b/src/gluonts/nursery/robust-mts-attack/pts/model/transformer/transformer_network.py index 3ac17fef6e..0557ee148c 100644 --- a/src/gluonts/nursery/robust-mts-attack/pts/model/transformer/transformer_network.py +++ b/src/gluonts/nursery/robust-mts-attack/pts/model/transformer/transformer_network.py @@ -160,6 +160,7 @@ def create_network_input( ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ Creates inputs for the transformer network. + All tensor arguments should have NTC layout. """ diff --git a/src/gluonts/nursery/robust-mts-attack/pts/model/transformer_tempflow/transformer_tempflow_network.py b/src/gluonts/nursery/robust-mts-attack/pts/model/transformer_tempflow/transformer_tempflow_network.py index 55d60d1537..66e49cc9b7 100644 --- a/src/gluonts/nursery/robust-mts-attack/pts/model/transformer_tempflow/transformer_tempflow_network.py +++ b/src/gluonts/nursery/robust-mts-attack/pts/model/transformer_tempflow/transformer_tempflow_network.py @@ -171,11 +171,10 @@ def create_network_input( torch.Tensor, ]: """ - Unrolls the RNN encoder over past and, if present, future data. - Returns outputs and state of the encoder, plus the scale of - past_target_cdf and a vector of static features that was constructed - and fed as input to the encoder. All tensor arguments should have NTC - layout. + Unrolls the RNN encoder over past and, if present, future data. Returns + outputs and state of the encoder, plus the scale of past_target_cdf and + a vector of static features that was constructed and fed as input to + the encoder. All tensor arguments should have NTC layout. 
Parameters
----------
@@ -211,7 +210,6 @@ def create_network_input(
Scaled lags(batch_size, sub_seq_len, target_dim, num_lags)
inputs
inputs to the RNN
-
"""

past_observed_values = torch.min(
diff --git a/src/gluonts/nursery/robust-mts-attack/pts/model/utils.py b/src/gluonts/nursery/robust-mts-attack/pts/model/utils.py
index e4051d124c..25a5cdc022 100644
--- a/src/gluonts/nursery/robust-mts-attack/pts/model/utils.py
+++ b/src/gluonts/nursery/robust-mts-attack/pts/model/utils.py
@@ -30,6 +30,7 @@ def weighted_average(
"""
Computes the weighted average of a given tensor across a given dim,
masking values associated with weight zero,
+
meaning instead of `nan * 0 = nan` you will get `0 * 0 = 0`.

Parameters
diff --git a/src/gluonts/nursery/robust-mts-attack/pts/modules/flows.py b/src/gluonts/nursery/robust-mts-attack/pts/modules/flows.py
index ad5c106923..66ab4e5def 100644
--- a/src/gluonts/nursery/robust-mts-attack/pts/modules/flows.py
+++ b/src/gluonts/nursery/robust-mts-attack/pts/modules/flows.py
@@ -74,7 +74,9 @@ def create_masks(
class FlowSequential(nn.Sequential):
- """Container for layers of a normalizing flow"""
+ """
+ Container for layers of a normalizing flow.
+ """

def forward(self, x, y):
sum_log_abs_det_jacobians = 0
@@ -92,7 +94,9 @@ def inverse(self, u, y):
class BatchNorm(nn.Module):
- """RealNVP BatchNorm layer"""
+ """
+ RealNVP BatchNorm layer.
+ """

def __init__(self, input_size, momentum=0.9, eps=1e-5):
super().__init__()
@@ -152,7 +156,9 @@ def inverse(self, y, cond_y=None):
class LinearMaskedCoupling(nn.Module):
- """Modified RealNVP Coupling Layers per the MAF paper"""
+ """
+ Modified RealNVP Coupling Layers per the MAF paper.
+ """

def __init__(
self, input_size, hidden_size, n_hidden, mask, cond_label_size=None
@@ -227,7 +233,9 @@ def inverse(self, u, y=None):
class MaskedLinear(nn.Linear):
- """MADE building block layer"""
+ """
+ MADE building block layer.
+ """

def __init__(self, input_size, n_outputs, mask, cond_label_size=None):
super().__init__(input_size, n_outputs)
diff --git a/src/gluonts/nursery/robust-mts-attack/pts/modules/gaussian_diffusion.py b/src/gluonts/nursery/robust-mts-attack/pts/modules/gaussian_diffusion.py
index 5666592254..acacdde352 100644
--- a/src/gluonts/nursery/robust-mts-attack/pts/modules/gaussian_diffusion.py
+++ b/src/gluonts/nursery/robust-mts-attack/pts/modules/gaussian_diffusion.py
@@ -43,8 +43,7 @@ def noise_like(shape, device, repeat=False):
def cosine_beta_schedule(timesteps, s=0.008):
"""
- cosine schedule
- as proposed in https://openreview.net/forum?id=-NEXDKk8gZ
+ Cosine schedule as proposed in https://openreview.net/forum?id=-NEXDKk8gZ.
"""
steps = timesteps + 1
x = np.linspace(0, steps, steps)
diff --git a/src/gluonts/nursery/robust-mts-attack/pts/modules/iqn_modules.py b/src/gluonts/nursery/robust-mts-attack/pts/modules/iqn_modules.py
index 4f02eb0651..9b519069c6 100644
--- a/src/gluonts/nursery/robust-mts-attack/pts/modules/iqn_modules.py
+++ b/src/gluonts/nursery/robust-mts-attack/pts/modules/iqn_modules.py
@@ -18,12 +18,13 @@
class ImplicitQuantileModule(nn.Module):
- """See arXiv: 1806.06923
- This module, in combination with quantile loss,
+ """
+ See arXiv: 1806.06923. This module, in combination with quantile loss,
learns how to generate the quantile of the distribution of the target.
- A quantile value, tau, is randomly generated with a Uniform([0, 1])).
- This quantile value is embedded in this module and also passed to the quantile loss:
- this should force the model to learn the appropriate quantile.
+
+ A quantile value, tau, is randomly generated with a Uniform([0, 1]). This
+ quantile value is embedded in this module and also passed to the quantile
+ loss: this should force the model to learn the appropriate quantile.
"""

def __init__(self, in_features, output_domain_cls):
@@ -46,7 +47,11 @@ def forward(self, input_data, tau):
class QuantileLayer(nn.Module):
- """Define quantile embedding layer, i.e. phi in the IQN paper (arXiv: 1806.06923)."""
+ """
+ Define the quantile embedding layer.
+
+ That is, phi in the IQN paper (arXiv: 1806.06923).
+ """

def __init__(self, num_output):
super(QuantileLayer, self).__init__()
diff --git a/src/gluonts/nursery/robust-mts-attack/pts/modules/scaler.py b/src/gluonts/nursery/robust-mts-attack/pts/modules/scaler.py
index 55b70983d0..9cfa9e0a98 100644
--- a/src/gluonts/nursery/robust-mts-attack/pts/modules/scaler.py
+++ b/src/gluonts/nursery/robust-mts-attack/pts/modules/scaler.py
@@ -124,8 +124,8 @@ def compute_scale(
class NOPScaler(Scaler):
"""
- The ``NOPScaler`` assigns a scale equals to 1 to each input item, i.e.,
- no scaling is applied upon calling the ``NOPScaler``.
+ The ``NOPScaler`` assigns a scale equal to 1 to each input item, i.e., no
+ scaling is applied upon calling the ``NOPScaler``.
"""

@validated()
diff --git a/src/gluonts/nursery/robust-mts-attack/utils.py b/src/gluonts/nursery/robust-mts-attack/utils.py
index 75a36ec169..9ad63f528c 100644
--- a/src/gluonts/nursery/robust-mts-attack/utils.py
+++ b/src/gluonts/nursery/robust-mts-attack/utils.py
@@ -37,7 +37,9 @@
class Params:
- """Class that loads hyperparameters from a json file.
+ """
+ Class that loads hyperparameters from a json file.
+
Example:
params = Params(json_path)
print(params.learning_rate)
@@ -55,14 +57,19 @@ def save(self, json_path):
json.dump(self.__dict__, f, indent=4, ensure_ascii=False)

def update(self, json_path):
- """Loads parameters from json file"""
+ """
+ Loads parameters from a json file.
+ """
with open(json_path) as f:
params = json.load(f)
self.__dict__.update(params)

@property
def dict(self):
- """Gives dict-like access to Params instance by params.dict['learning_rate']"""
+ """
+ Gives dict-like access to the Params instance via
+ params.dict['learning_rate'].
+ """
return self.__dict__
diff --git a/src/gluonts/nursery/spliced_binned_pareto/spliced_binned_pareto.py b/src/gluonts/nursery/spliced_binned_pareto/spliced_binned_pareto.py
index 844af9dfdf..7c5eb29e85 100644
--- a/src/gluonts/nursery/spliced_binned_pareto/spliced_binned_pareto.py
+++ b/src/gluonts/nursery/spliced_binned_pareto/spliced_binned_pareto.py
@@ -25,7 +25,7 @@
class Binned(torch.nn.Module):
r"""
- Binned univariate distribution designed as an nn.Module
+ Binned univariate distribution designed as an nn.Module.

Arguments
----------
diff --git a/src/gluonts/nursery/temporal_hierarchical_forecasting/model/cop_deepar/_network.py b/src/gluonts/nursery/temporal_hierarchical_forecasting/model/cop_deepar/_network.py
index 72b8329e25..ecce005c9d 100644
--- a/src/gluonts/nursery/temporal_hierarchical_forecasting/model/cop_deepar/_network.py
+++ b/src/gluonts/nursery/temporal_hierarchical_forecasting/model/cop_deepar/_network.py
@@ -134,7 +134,8 @@ def get_target_related_feat_at_agg_level(
future_observed_values: Optional[Tensor] = None,
) -> Dict:
"""
- Aggregate target at the given aggregate level along with updating observed value and pad indicators.
+ Aggregate target at the given aggregate level along with updating
+ observed value and pad indicators.
:param agg_level:
:param past_target:
@@ -341,8 +342,8 @@ def hybrid_forward(
agg_features_dict: Dict,
) -> Tensor:
"""
- Computes the loss for training COPDeepAR, all inputs tensors representing
- time series have NTC layout.
+ Computes the loss for training COPDeepAR; all input tensors
+ representing time series have NTC layout.

Parameters
----------
@@ -359,7 +360,6 @@ def hybrid_forward(
Returns loss with shape (batch_size, context + prediction_length, 1)
-------
-
"""

embeddings_at_all_levels_ls = []
diff --git a/src/gluonts/nursery/temporal_hierarchical_forecasting/model/cop_deepar/gluonts_fixes.py b/src/gluonts/nursery/temporal_hierarchical_forecasting/model/cop_deepar/gluonts_fixes.py
index b66c8f6887..c8de71af0e 100644
--- a/src/gluonts/nursery/temporal_hierarchical_forecasting/model/cop_deepar/gluonts_fixes.py
+++ b/src/gluonts/nursery/temporal_hierarchical_forecasting/model/cop_deepar/gluonts_fixes.py
@@ -69,8 +69,10 @@ def batchify_with_dict(
class RepresentableBlockPredictorBatchifyWithDict(RepresentableBlockPredictor):
"""
- We need the stack function `batchify_with_dict` in order to pass the features at the aggregated level properly
- during prediction. Gluonts does not allow this without changing the line corresponding to the
+ We need the stack function `batchify_with_dict` in order to pass the
+ features at the aggregated level properly during prediction.
+
+ Gluonts does not allow this without changing the line corresponding to the
`InferenceDataLoader` in the `predict` function.
"""
diff --git a/src/gluonts/nursery/temporal_hierarchical_forecasting/utils/utils.py b/src/gluonts/nursery/temporal_hierarchical_forecasting/utils/utils.py
index 289f590e30..a3876eff49 100644
--- a/src/gluonts/nursery/temporal_hierarchical_forecasting/utils/utils.py
+++ b/src/gluonts/nursery/temporal_hierarchical_forecasting/utils/utils.py
@@ -196,8 +196,8 @@ def _check_freqs(deltas: List[pd.Timedelta]):
def freqs_to_agg_mulitples(freq_strs: List[str]) -> List[int]:
"""
- Returns aggregation multiples that are used to construct
- aggregation matrix.
+ Returns aggregation multiples that are used to construct the aggregation
+ matrix.

Parameters
----------
diff --git a/src/gluonts/pydantic.py b/src/gluonts/pydantic.py
index 99269afea7..5ab67e77e4 100644
--- a/src/gluonts/pydantic.py
+++ b/src/gluonts/pydantic.py
@@ -12,7 +12,9 @@
# permissions and limitations under the License.

-"""This modules contains pydantic imports, which are used throughout the codebase."""
+"""
+This module contains pydantic imports, which are used throughout the codebase.
+"""

from pydantic import __version__
diff --git a/src/gluonts/time_feature/_base.py b/src/gluonts/time_feature/_base.py
index ab6ab20935..0d88971002 100644
--- a/src/gluonts/time_feature/_base.py
+++ b/src/gluonts/time_feature/_base.py
@@ -25,7 +25,9 @@
def _normalize(xs, num: float):
- """Scale values of ``xs`` to [-0.5, 0.5]."""
+ """
+ Scale values of ``xs`` to [-0.5, 0.5].
+ """
return np.asarray(xs) / (num - 1) - 0.5
diff --git a/src/gluonts/torch/distributions/binned_uniforms.py b/src/gluonts/torch/distributions/binned_uniforms.py
index 46a8a5e586..eeabfcf154 100644
--- a/src/gluonts/torch/distributions/binned_uniforms.py
+++ b/src/gluonts/torch/distributions/binned_uniforms.py
@@ -147,6 +147,7 @@ def log_bins_prob(self):
def log_prob(self, x):
"""
Log probability for a tensor of datapoints `x`.
+
+ 'x' is to have shape (*batch_shape)
"""
for i in range(0, len(x.shape)):
@@ -158,8 +159,8 @@ def log_prob(self, x):
def log_binned_p(self, x):
"""
Log probability for a tensor of datapoints `x`.
- 'x' is to have shape (*batch_shape)
+
+ 'x' is to have shape (*batch_shape)
"""
one_hot_bin_indicator = self.get_one_hot_bin_indicator(
x, in_float=True
@@ -172,14 +173,15 @@ def log_binned_p(self, x):
def pdf(self, x):
"""
Probability for a tensor of data points `x`.
+
'x' is to have shape (*batch_shape)
"""
return torch.exp(self.log_prob(x))

def get_one_hot_bin_indicator(self, x, in_float=False):
"""
- 'x' is to have shape (*batch_shape) which can be for example () or
- (32, ) or (32, 168, )
+ 'x' is to have shape (*batch_shape), which can be for example (),
+ (32,) or (32, 168,)
"""
for i in range(0, len(x.shape)):
assert (
@@ -221,8 +223,8 @@ def get_one_hot_bin_indicator(self, x, in_float=False):
def icdf(self, quantiles):
"""
- Inverse cdf of a tensor of quantile `quantiles`
- 'quantiles' is of shape (*batch_shape) with values between (0.0, 1.0)
+ Inverse cdf of a tensor of quantiles `quantiles`. 'quantiles' is of
+ shape (*batch_shape) with values between (0.0, 1.0)

This is the function to be called from the outside.
"""
@@ -248,15 +250,15 @@ def icdf(self, quantiles):
def _inverse_cdf(self, quantiles):
"""
- Inverse cdf of a tensor of quantile `quantiles`
- 'quantiles' is of shape (*batch_shape) with values between (0.0, 1.0)
+ Inverse cdf of a tensor of quantiles `quantiles`. 'quantiles' is of
+ shape (*batch_shape) with values between (0.0, 1.0)
"""
return self._icdf_binned(quantiles)

def _icdf_binned(self, quantiles):
"""
- Inverse cdf of a tensor of quantile `quantiles`
- 'quantiles' is of shape (*batch_shape) with values between (0.0, 1.0)
+ Inverse cdf of a tensor of quantiles `quantiles`. 'quantiles' is of
+ shape (*batch_shape) with values between (0.0, 1.0)
"""
quantiles = quantiles.unsqueeze(dim=-1)
# quantiles.shape: (*batch_shape, 1)
@@ -315,6 +317,7 @@ def _icdf_binned(self, quantiles):
def cdf(self, x):
"""
Cumulative density tensor for a tensor of data points `x`.
+
'x' is expected to be of shape (*batch_shape)
"""
for i in range(0, len(x.shape)):
@@ -326,12 +329,11 @@ def cdf(self, x):
def _cdf_binned(self, x):
"""
Cumulative density tensor for a tensor of data points `x`.
- 'x' is expected to be of shape (*batch_shape)
- The cdf is composed of 2 parts:
- the cdf up to the bin
- the cdf within the bin that the point falls into (modeled with a
- uniform distribution)
+
+ 'x' is expected to be of shape (*batch_shape). The cdf is composed of 2
+ parts:
+ - the cdf up to the bin
+ - the cdf within the bin that the point falls into (modeled with a
+   uniform distribution)
"""
bins_prob = self.bins_prob
@@ -399,7 +401,6 @@ def sample(self, sample_shape=torch.Size()):
Returns:
samples of shape (*sample_shape, *batch_shape)
-
"""
if len(sample_shape) == 0:
quantiles = torch.rand(self.batch_shape)
diff --git a/src/gluonts/torch/distributions/discrete_distribution.py b/src/gluonts/torch/distributions/discrete_distribution.py
index e8e957ad33..32fe342e06 100755
--- a/src/gluonts/torch/distributions/discrete_distribution.py
+++ b/src/gluonts/torch/distributions/discrete_distribution.py
@@ -19,9 +19,8 @@
class DiscreteDistribution(torch.distributions.Distribution):
"""
- Implements discrete distribution where the underlying random variable
- takes a value from the finite set `values` with the corresponding
- probabilities.
+ Implements a discrete distribution where the underlying random variable
+ takes a value from the finite set `values` with the corresponding
+ probabilities.

Note: `values` can have duplicates in which case the probability mass of
duplicates is added up.
@@ -127,7 +126,6 @@ def rps(self, obs: torch.Tensor, check_for_duplicates: bool = True):

Returns
-------
-
"""
if self._validate_args:
self._validate_sample(obs)
diff --git a/src/gluonts/torch/distributions/distribution_output.py b/src/gluonts/torch/distributions/distribution_output.py
index be3744b96f..af786ca4ef 100644
--- a/src/gluonts/torch/distributions/distribution_output.py
+++ b/src/gluonts/torch/distributions/distribution_output.py
@@ -93,25 +93,26 @@ def loss(
@property
def event_shape(self) -> Tuple:
r"""
- Shape of each individual event contemplated by the distributions
- that this object constructs.
+ Shape of each individual event contemplated by the distributions that
+ this object constructs.
"""
raise NotImplementedError()

@property
def event_dim(self) -> int:
r"""
- Number of event dimensions, i.e., length of the `event_shape` tuple,
- of the distributions that this object constructs.
+ Number of event dimensions, i.e., length of the `event_shape` tuple, of
+ the distributions that this object constructs.
"""
return len(self.event_shape)

def domain_map(self, *args: torch.Tensor):
r"""
- Converts arguments to the right shape and domain. The domain depends
- on the type of distribution, while the correct shape is obtained by
- reshaping the trailing axis in such a way that the returned tensors
- define a distribution of the right event_shape.
+ Converts arguments to the right shape and domain.
+
+ The domain depends on the type of distribution, while the correct shape
+ is obtained by reshaping the trailing axis in such a way that the
+ returned tensors define a distribution of the right event_shape.
"""
raise NotImplementedError()
diff --git a/src/gluonts/torch/distributions/implicit_quantile_network.py b/src/gluonts/torch/distributions/implicit_quantile_network.py
index 69d089e99f..f81ec6436d 100644
--- a/src/gluonts/torch/distributions/implicit_quantile_network.py
+++ b/src/gluonts/torch/distributions/implicit_quantile_network.py
@@ -27,8 +27,10 @@
class QuantileLayer(nn.Module):
r"""
Implicit Quantile Layer from the paper ``IQN for Distributional
- Reinforcement Learning`` (https://arxiv.org/abs/1806.06923) by
- Dabney et al. 2018.
+ Reinforcement Learning`` (https://arxiv.org/abs/1806.06923) by Dabney et
+ al., 2018.
"""

def __init__(self, num_output: int, cos_embedding_dim: int = 128):
@@ -50,8 +52,10 @@ def forward(self, tau: torch.Tensor) -> torch.Tensor:  # tau: [B, T]
class ImplicitQuantileModule(nn.Module):
r"""
Implicit Quantile Network from the paper ``IQN for Distributional
- Reinforcement Learning`` (https://arxiv.org/abs/1806.06923) by
- Dabney et al. 2018.
+ Reinforcement Learning`` (https://arxiv.org/abs/1806.06923) by Dabney et
+ al., 2018.
"""

def __init__(
@@ -102,8 +106,8 @@ def forward(self, inputs: torch.Tensor):
class ImplicitQuantileNetwork(Distribution):
r"""
- Distribution class for the Implicit Quantile from which
- we can sample or calculate the quantile loss.
+ Distribution class for the Implicit Quantile from which we can sample or
+ calculate the quantile loss.
Parameters
----------
@@ -140,8 +144,8 @@ def quantile_loss(self, value: torch.Tensor) -> torch.Tensor:
class ImplicitQuantileNetworkOutput(DistributionOutput):
r"""
- DistributionOutput class for the IQN from the paper
- ``Probabilistic Time Series Forecasting with Implicit Quantile Networks``
+ DistributionOutput class for the IQN from the paper ``Probabilistic Time
+ Series Forecasting with Implicit Quantile Networks``
(https://arxiv.org/abs/2107.03743) by Gouttes et al. 2021.

Parameters
diff --git a/src/gluonts/torch/distributions/isqf.py b/src/gluonts/torch/distributions/isqf.py
index ee59fad19b..3481adeb3e 100644
--- a/src/gluonts/torch/distributions/isqf.py
+++ b/src/gluonts/torch/distributions/isqf.py
@@ -205,9 +205,9 @@ def parameterize_tail(
beta: torch.Tensor, qk_x: torch.Tensor, qk_y: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
r"""
- Function to parameterize the tail parameters
- Note that the exponential tails are given by
- q(alpha)
+ Function to parameterize the tail parameters. Note that the exponential
+ tails are given by q(alpha)
+
= a_l log(alpha) + b_l if left tail
= a_r log(1-alpha) + b_r if right tail
where
@@ -391,9 +391,9 @@ def quantile_tail(

def cdf_spline(self, z: torch.Tensor) -> torch.Tensor:
r"""
- For observations z and splines defined in [qk_x[k], qk_x[k+1]]
- Computes the quantile level alpha_tilde such that
- alpha_tilde
+ For observations z and splines defined in [qk_x[k], qk_x[k+1]], computes
+ the quantile level alpha_tilde such that alpha_tilde
+
= q^{-1}(z) if z is in-between qk_x[k] and qk_x[k+1]
= qk_x[k] if z < qk_x[k]
= qk_x[k+1] if z > qk_x[k+1]
@@ -467,8 +467,8 @@ def cdf_tail(
self, z: torch.Tensor, left_tail: bool = True
) -> torch.Tensor:
r"""
- Computes the quantile level alpha_tilde such that
- alpha_tilde
+ Computes the quantile level alpha_tilde such that alpha_tilde
+
= q^{-1}(z) if z is in the tail region
= qk_x_l or qk_x_r if z is in the non-tail region

Parameters
diff --git a/src/gluonts/torch/distributions/negative_binomial.py b/src/gluonts/torch/distributions/negative_binomial.py
index 909b20fb1b..d7d75eb0a7 100644
--- a/src/gluonts/torch/distributions/negative_binomial.py
+++ b/src/gluonts/torch/distributions/negative_binomial.py
@@ -27,9 +27,11 @@
class NegativeBinomial(TorchNegativeBinomial):
"""
- Negative binomial distribution with `total_count` and `probs` or `logits` parameters.
+ Negative binomial distribution with `total_count` and `probs` or `logits`
+ parameters.

- Based on torch.distributions.NegativeBinomial, with added `cdf` and `icdf` methods.
+ Based on torch.distributions.NegativeBinomial, with added `cdf` and `icdf`
+ methods.
"""

def __init__(
diff --git a/src/gluonts/torch/distributions/output.py b/src/gluonts/torch/distributions/output.py
index 089cc95772..49385f0e55 100644
--- a/src/gluonts/torch/distributions/output.py
+++ b/src/gluonts/torch/distributions/output.py
@@ -23,8 +23,8 @@
class PtArgProj(nn.Module):
r"""
- A PyTorch module that can be used to project from a dense layer
- to PyTorch distribution arguments.
+ A PyTorch module that can be used to project from a dense layer to PyTorch
+ distribution arguments.

Parameters
----------
@@ -84,7 +84,8 @@ def loss(
loc: Optional[torch.Tensor] = None,
scale: Optional[torch.Tensor] = None,
) -> torch.Tensor:
- """Compute loss for target data given network output.
+ """
+ Compute loss for target data given network output.
Parameters
----------
@@ -117,17 +118,20 @@ def get_args_proj(self, in_features: int) -> nn.Module:

def domain_map(self, *args: torch.Tensor) -> Tuple[torch.Tensor, ...]:
r"""
- Converts arguments to the right shape and domain. The domain depends
- on the type of distribution, while the correct shape is obtained by
- reshaping the trailing axis in such a way that the returned tensors
- define a distribution of the right event_shape.
+ Converts arguments to the right shape and domain.
+
+ The domain depends on the type of distribution, while the correct shape
+ is obtained by reshaping the trailing axis in such a way that the
+ returned tensors define a distribution of the right event_shape.
"""
raise NotImplementedError()

@property
def value_in_support(self) -> float:
r"""
- A float value that is valid for computing the loss of the
- corresponding output. By default 0.0.
+ A float value that is valid for computing the loss of the corresponding
+ output.
+
+ By default 0.0.
"""
return 0.0
diff --git a/src/gluonts/torch/distributions/spliced_binned_pareto.py b/src/gluonts/torch/distributions/spliced_binned_pareto.py
index 94e36a56cf..a62a36d5ce 100644
--- a/src/gluonts/torch/distributions/spliced_binned_pareto.py
+++ b/src/gluonts/torch/distributions/spliced_binned_pareto.py
@@ -178,6 +178,7 @@ def log_prob(self, x: torch.Tensor, for_training=True):
def pdf(self, x):
"""
Probability for a tensor of data points `x`.
+
'x' is to have shape (*batch_shape)
"""
# By default we put the for training parameter of the pdf on false as
@@ -186,8 +187,8 @@ def pdf(self, x):

def _inverse_cdf(self, quantiles: torch.Tensor):
"""
- Inverse cdf of a tensor of quantile `quantiles`
- 'quantiles' is of shape (*batch_shape) with values between (0.0, 1.0)
+ Inverse cdf of a tensor of quantiles `quantiles`. 'quantiles' is of
+ shape (*batch_shape) with values between (0.0, 1.0)
"""

# The quantiles for the body of the distribution:
@@ -229,6 +230,7 @@ def _inverse_cdf(self, quantiles: torch.Tensor):
def cdf(self, x: torch.Tensor):
"""
Cumulative density tensor for a tensor of data points `x`.
+
'x' is expected to be of shape (*batch_shape)
"""
for i in range(0, len(x.shape)):
diff --git a/src/gluonts/torch/distributions/studentT.py b/src/gluonts/torch/distributions/studentT.py
index 135f64bf92..d82bbe8958 100644
--- a/src/gluonts/torch/distributions/studentT.py
+++ b/src/gluonts/torch/distributions/studentT.py
@@ -23,8 +23,9 @@

class StudentT(TorchStudentT):
- """Student's t-distribution parametrized by degree of freedom `df`,
- mean `loc` and scale `scale`.
+ """
+ Student's t-distribution parametrized by degrees of freedom `df`, mean `loc`
+ and scale `scale`.

Based on torch.distributions.StudentT, with added `cdf` and `icdf` methods.
"""
diff --git a/src/gluonts/torch/distributions/truncated_normal.py b/src/gluonts/torch/distributions/truncated_normal.py
index 6328d48759..20eddb4ef3 100644
--- a/src/gluonts/torch/distributions/truncated_normal.py
+++ b/src/gluonts/torch/distributions/truncated_normal.py
@@ -32,7 +32,8 @@

class TruncatedNormal(Distribution):
- """Implements a Truncated Normal distribution with location scaling.
+ """
+ Implements a Truncated Normal distribution with location scaling.

Location scaling prevents the location to be "too far" from 0, which
ultimately leads to numerically unstable samples and poor gradient
computation (e.g. gradient explosion).
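The ``cdf``/``icdf`` pairs documented in the hunks above all share one
contract: quantile levels have shape ``(*batch_shape)`` with values between
(0.0, 1.0), and ``icdf`` inverts ``cdf``. A minimal sketch of that round trip,
using ``torch.distributions.Normal`` as a stand-in (an assumption made for
illustration only, since the GluonTS distributions above are constructed from
network outputs)::

    import torch
    from torch.distributions import Normal

    # batch_shape = (32,): one quantile level per batch element.
    dist = Normal(loc=torch.zeros(32), scale=torch.ones(32))
    quantiles = torch.rand(32)  # shape (*batch_shape), values in [0.0, 1.0)
    x = dist.icdf(quantiles)    # shape (32,)
    # cdf(icdf(q)) recovers q up to floating-point error.
    assert torch.allclose(dist.cdf(x), quantiles, atol=1e-5)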
diff --git a/src/gluonts/torch/model/d_linear/estimator.py b/src/gluonts/torch/model/d_linear/estimator.py
index 0a4691a10a..0d6b796f5f 100644
--- a/src/gluonts/torch/model/d_linear/estimator.py
+++ b/src/gluonts/torch/model/d_linear/estimator.py
@@ -48,7 +48,8 @@
class DLinearEstimator(PyTorchLightningEstimator):
"""
An estimator training the d-linear model from the paper
- https://arxiv.org/pdf/2205.13504.pdf extended for probabilistic forecasting.
+ https://arxiv.org/pdf/2205.13504.pdf extended for probabilistic
+ forecasting.

This class uses the model defined in ``DLinearModel``,
and wraps it into a ``DLinearLightningModule`` for training
diff --git a/src/gluonts/torch/model/d_linear/lightning_module.py b/src/gluonts/torch/model/d_linear/lightning_module.py
index ec80e0e7d2..a2f923a234 100644
--- a/src/gluonts/torch/model/d_linear/lightning_module.py
+++ b/src/gluonts/torch/model/d_linear/lightning_module.py
@@ -22,8 +22,8 @@
class DLinearLightningModule(pl.LightningModule):
"""
- A ``pl.LightningModule`` class that can be used to train a
- ``DLinearModel`` with PyTorch Lightning.
+ A ``pl.LightningModule`` class that can be used to train a ``DLinearModel``
+ with PyTorch Lightning.

This is a thin layer around a (wrapped) ``DLinearModel`` object, that
exposes the methods to evaluate training and validation loss.
diff --git a/src/gluonts/torch/model/d_linear/module.py b/src/gluonts/torch/model/d_linear/module.py
index b9ae146b50..c597517870 100644
--- a/src/gluonts/torch/model/d_linear/module.py
+++ b/src/gluonts/torch/model/d_linear/module.py
@@ -26,7 +26,7 @@
class MovingAvg(nn.Module):
"""
- Moving average block to highlight the trend of time series
+ Moving average block to highlight the trend of time series.
"""

def __init__(self, kernel_size, stride):
@@ -48,7 +48,7 @@ def forward(self, x):
class SeriesDecomp(nn.Module):
"""
- Series decomposition block
+ Series decomposition block.
"""

def __init__(self, kernel_size):
@@ -64,7 +64,8 @@ def forward(self, x):
class DLinearModel(nn.Module):
"""
Module implementing a feed-forward model from the paper
- https://arxiv.org/pdf/2205.13504.pdf extended for probabilistic forecasting.
+ https://arxiv.org/pdf/2205.13504.pdf extended for probabilistic
+ forecasting.

Parameters
----------
diff --git a/src/gluonts/torch/model/deep_npts/_estimator.py b/src/gluonts/torch/model/deep_npts/_estimator.py
index f3cca87da1..028dd5003d 100755
--- a/src/gluonts/torch/model/deep_npts/_estimator.py
+++ b/src/gluonts/torch/model/deep_npts/_estimator.py
@@ -63,9 +63,9 @@

class DeepNPTSEstimator(Estimator):
"""
- Construct a DeepNPTS estimator. This is a tunable extension of NPTS
- where the sampling probabilities are learned from the data. This is a
- global-model unlike NPTS.
+ Construct a DeepNPTS estimator. This is a tunable extension of NPTS where
+ the sampling probabilities are learned from the data. This is a global
+ model, unlike NPTS.
Currently two variants of the model are implemented:
(i) `DeepNPTSNetworkDiscrete`: the forecast distribution is a discrete
diff --git a/src/gluonts/torch/model/deep_npts/_network.py b/src/gluonts/torch/model/deep_npts/_network.py
index c29d1935c3..1eb436f11d 100755
--- a/src/gluonts/torch/model/deep_npts/_network.py
+++ b/src/gluonts/torch/model/deep_npts/_network.py
@@ -45,7 +45,9 @@ def init_weights(module: nn.Module, scale: float = 1.0):

class FeatureEmbedder(nn.Module):
- """Creates a feature embedding for the static categorical features."""
+ """
+ Creates a feature embedding for the static categorical features.
+ """

@validated()
def __init__(
@@ -98,8 +100,9 @@ def forward(self, features: torch.Tensor):

class DeepNPTSNetwork(nn.Module):
- """Base class implementing a simple feed-forward neural network that takes
- in static and dynamic features and produces `num_hidden_nodes` independent
+ """
+ Base class implementing a simple feed-forward neural network that takes in
+ static and dynamic features and produces `num_hidden_nodes` independent
outputs. These outputs are then used by derived classes to construct the
forecast distribution for a single time step.
@@ -210,8 +213,8 @@ def forward(

class DeepNPTSNetworkDiscrete(DeepNPTSNetwork):
"""
- Extends `DeepNTPSNetwork` by implementing the output layer which
- converts the outputs from the base network into probabilities of length
+ Extends `DeepNPTSNetwork` by implementing the output layer which converts
+ the outputs from the base network into probabilities of length
`context_length`. These probabilities together with the past values in the
context window constitute the one-step-ahead forecast distribution.
Specifically, the forecast is always one of the values observed in the
@@ -269,11 +272,11 @@ def forward(

class DeepNPTSNetworkSmooth(DeepNPTSNetwork):
"""
- Extends `DeepNTPSNetwork` by implementing the output layer which
- converts the outputs from the base network into a smoothed mixture
- distribution. The components of the mixture are Gaussians centered around
- the observations in the context window. The mixing probabilities as well as
- the width of the Gaussians are predicted by the network.
+ Extends `DeepNPTSNetwork` by implementing the output layer which converts
+ the outputs from the base network into a smoothed mixture distribution. The
+ components of the mixture are Gaussians centered around the observations in
+ the context window. The mixing probabilities as well as the width of the
+ Gaussians are predicted by the network.

This mixture distribution represents the one-step-ahead forecast
distribution. Note that the forecast can contain values not observed in the
@@ -345,7 +348,8 @@ def forward(
past_time_feat: torch.Tensor,
future_time_feat: torch.Tensor,
):
- """Generates samples from the forecast distribution.
+ """
+ Generates samples from the forecast distribution.

Parameters
----------
diff --git a/src/gluonts/torch/model/deepar/lightning_module.py b/src/gluonts/torch/model/deepar/lightning_module.py
index 7f5a557d08..25fd8bbf43 100644
--- a/src/gluonts/torch/model/deepar/lightning_module.py
+++ b/src/gluonts/torch/model/deepar/lightning_module.py
@@ -24,8 +24,8 @@
class DeepARLightningModule(pl.LightningModule):
"""
- A ``pl.LightningModule`` class that can be used to train a
- ``DeepARModel`` with PyTorch Lightning.
+ A ``pl.LightningModule`` class that can be used to train a ``DeepARModel``
+ with PyTorch Lightning.
This is a thin layer around a (wrapped) ``DeepARModel`` object, that exposes the methods to evaluate training and validation loss. diff --git a/src/gluonts/torch/model/deepar/module.py b/src/gluonts/torch/model/deepar/module.py index ebe4ef2479..138cd06c1f 100644 --- a/src/gluonts/torch/model/deepar/module.py +++ b/src/gluonts/torch/model/deepar/module.py @@ -337,7 +337,7 @@ def output_distribution( self, params, scale=None, trailing_n=None ) -> torch.distributions.Distribution: """ - Instantiate the output distribution + Instantiate the output distribution. Parameters ---------- diff --git a/src/gluonts/torch/model/i_transformer/estimator.py b/src/gluonts/torch/model/i_transformer/estimator.py index 71855820dd..c9a77f0d99 100644 --- a/src/gluonts/torch/model/i_transformer/estimator.py +++ b/src/gluonts/torch/model/i_transformer/estimator.py @@ -48,8 +48,9 @@ class ITransformerEstimator(PyTorchLightningEstimator): """ - An estimator training the iTransformer model for multivariate forecasting as described in - https://arxiv.org/abs/2310.06625 extended to be probabilistic. + An estimator training the iTransformer model for multivariate forecasting + as described in https://arxiv.org/abs/2310.06625 extended to be + probabilistic. This class uses the model defined in ``ITransformerModel``, and wraps it into a ``ITransformerLightningModule`` for training diff --git a/src/gluonts/torch/model/i_transformer/module.py b/src/gluonts/torch/model/i_transformer/module.py index 2355443215..75f7fbcdce 100644 --- a/src/gluonts/torch/model/i_transformer/module.py +++ b/src/gluonts/torch/model/i_transformer/module.py @@ -25,8 +25,8 @@ class ITransformerModel(nn.Module): """ - Module implementing the iTransformer model for multivariate forecasting as described in - https://arxiv.org/abs/2310.06625 extended to be probabilistic. + Module implementing the iTransformer model for multivariate forecasting as + described in https://arxiv.org/abs/2310.06625 extended to be probabilistic. Parameters ---------- diff --git a/src/gluonts/torch/model/lag_tst/lightning_module.py b/src/gluonts/torch/model/lag_tst/lightning_module.py index 5aa6512774..9b10bc25dd 100644 --- a/src/gluonts/torch/model/lag_tst/lightning_module.py +++ b/src/gluonts/torch/model/lag_tst/lightning_module.py @@ -22,8 +22,8 @@ class LagTSTLightningModule(pl.LightningModule): """ - A ``pl.LightningModule`` class that can be used to train a - ``LagTSTModel`` with PyTorch Lightning. + A ``pl.LightningModule`` class that can be used to train a ``LagTSTModel`` + with PyTorch Lightning. This is a thin layer around a (wrapped) ``LagTSTModel`` object, that exposes the methods to evaluate training and validation loss. diff --git a/src/gluonts/torch/model/mqf2/distribution.py b/src/gluonts/torch/model/mqf2/distribution.py index d4a0408bc1..bb010d0639 100644 --- a/src/gluonts/torch/model/mqf2/distribution.py +++ b/src/gluonts/torch/model/mqf2/distribution.py @@ -25,9 +25,9 @@ class MQF2Distribution(torch.distributions.Distribution): r""" - Distribution class for the model MQF2 proposed in the paper - ``Multivariate Quantile Function Forecaster`` - by Kan, Aubet, Januschowski, Park, Benidis, Ruthotto, Gasthaus + Distribution class for the model MQF2 proposed in the paper ``Multivariate + Quantile Function Forecaster`` by Kan, Aubet, Januschowski, Park, Benidis, + Ruthotto, Gasthaus. 
Parameters ---------- diff --git a/src/gluonts/torch/model/mqf2/estimator.py b/src/gluonts/torch/model/mqf2/estimator.py index faca118475..0b406e3c75 100644 --- a/src/gluonts/torch/model/mqf2/estimator.py +++ b/src/gluonts/torch/model/mqf2/estimator.py @@ -23,9 +23,9 @@ class MQF2MultiHorizonEstimator(DeepAREstimator): r""" - Estimator class for the model MQF2 proposed in the paper - ``Multivariate Quantile Function Forecaster`` - by Kan, Aubet, Januschowski, Park, Benidis, Ruthotto, Gasthaus + Estimator class for the model MQF2 proposed in the paper ``Multivariate + Quantile Function Forecaster`` by Kan, Aubet, Januschowski, Park, Benidis, + Ruthotto, Gasthaus. This is the multi-horizon (multivariate in time step) variant of MQF2 diff --git a/src/gluonts/torch/model/mqf2/icnn_utils.py b/src/gluonts/torch/model/mqf2/icnn_utils.py index 7ca2925b71..a530fb5a34 100644 --- a/src/gluonts/torch/model/mqf2/icnn_utils.py +++ b/src/gluonts/torch/model/mqf2/icnn_utils.py @@ -32,9 +32,9 @@ class DeepConvexNet(DeepConvexFlow): r""" - Class that takes a partially input convex neural network (picnn) - as input and equips it with functions of logdet - computation (both estimation and exact computation) + Class that takes a partially input convex neural network (picnn) as input + and equips it with functions of logdet computation (both estimation and + exact computation) This class is based on DeepConvexFlow of the CP-Flow repo (https://github.com/CW-Huang/CP-Flow) @@ -126,8 +126,8 @@ def forward_transform( class SequentialNet(SequentialFlow): r""" - Class that combines a list of DeepConvexNet and ActNorm - layers and provides energy score computation + Class that combines a list of DeepConvexNet and ActNorm layers and provides + energy score computation. This class is based on SequentialFlow of the CP-Flow repo (https://github.com/CW-Huang/CP-Flow) diff --git a/src/gluonts/torch/model/mqf2/lightning_module.py b/src/gluonts/torch/model/mqf2/lightning_module.py index 8bddd0a10a..12e3c395fb 100644 --- a/src/gluonts/torch/model/mqf2/lightning_module.py +++ b/src/gluonts/torch/model/mqf2/lightning_module.py @@ -26,8 +26,8 @@ class MQF2MultiHorizonLightningModule(pl.LightningModule): r""" LightningModule class for the model MQF2 proposed in the paper - ``Multivariate Quantile Function Forecaster`` - by Kan, Aubet, Januschowski, Park, Benidis, Ruthotto, Gasthaus + ``Multivariate Quantile Function Forecaster`` by Kan, Aubet, Januschowski, + Park, Benidis, Ruthotto, Gasthaus. This is the multi-horizon (multivariate in time step) variant of MQF2 diff --git a/src/gluonts/torch/model/mqf2/module.py b/src/gluonts/torch/model/mqf2/module.py index e045e5a0da..f3f70746f8 100644 --- a/src/gluonts/torch/model/mqf2/module.py +++ b/src/gluonts/torch/model/mqf2/module.py @@ -52,9 +52,9 @@ def __init__( estimate_logdet: bool = False, ) -> None: r""" - Model class for the model MQF2 proposed in the paper - ``Multivariate Quantile Function Forecaster`` - by Kan, Aubet, Januschowski, Park, Benidis, Ruthotto, Gasthaus + Model class for the model MQF2 proposed in the paper ``Multivariate + Quantile Function Forecaster`` by Kan, Aubet, Januschowski, Park, + Benidis, Ruthotto, Gasthaus. 
This is the multi-horizon (multivariate in time step) variant of MQF2 diff --git a/src/gluonts/torch/model/patch_tst/module.py b/src/gluonts/torch/model/patch_tst/module.py index 4e829e2ea1..3a59f80299 100644 --- a/src/gluonts/torch/model/patch_tst/module.py +++ b/src/gluonts/torch/model/patch_tst/module.py @@ -26,7 +26,9 @@ class SinusoidalPositionalEmbedding(nn.Embedding): - """This module produces sinusoidal positional embeddings of any length.""" + """ + This module produces sinusoidal positional embeddings of any length. + """ def __init__(self, num_positions: int, embedding_dim: int) -> None: super().__init__(num_positions, embedding_dim) @@ -35,7 +37,9 @@ def __init__(self, num_positions: int, embedding_dim: int) -> None: @staticmethod def _init_weight(out: torch.Tensor) -> torch.Tensor: """ - Features are not interleaved. The cos features are in the 2nd half of the vector. [dim // 2:] + Features are not interleaved. + + The cos features are in the 2nd half of the vector. [dim // 2:] """ n_pos, dim = out.shape position_enc = np.array( @@ -57,7 +61,9 @@ def _init_weight(out: torch.Tensor) -> torch.Tensor: def forward( # type: ignore self, input_ids_shape: torch.Size, past_key_values_length: int = 0 ) -> torch.Tensor: - """`input_ids_shape` is expected to be [bsz x seqlen x ...].""" + """ + `input_ids_shape` is expected to be [bsz x seqlen x ...]. + """ _, seq_len = input_ids_shape[:2] positions = torch.arange( past_key_values_length, diff --git a/src/gluonts/torch/model/simple_feedforward/estimator.py b/src/gluonts/torch/model/simple_feedforward/estimator.py index ebd2467219..3bee9becbf 100644 --- a/src/gluonts/torch/model/simple_feedforward/estimator.py +++ b/src/gluonts/torch/model/simple_feedforward/estimator.py @@ -86,7 +86,6 @@ class SimpleFeedForwardEstimator(PyTorchLightningEstimator): Controls the sampling of windows during training. validation_sampler Controls the sampling of windows during validation. - """ @validated() diff --git a/src/gluonts/torch/model/tft/estimator.py b/src/gluonts/torch/model/tft/estimator.py index 8009046941..6e1ecfacf3 100644 --- a/src/gluonts/torch/model/tft/estimator.py +++ b/src/gluonts/torch/model/tft/estimator.py @@ -63,7 +63,8 @@ class TemporalFusionTransformerEstimator(PyTorchLightningEstimator): """ - Estimator class to train a Temporal Fusion Transformer (TFT) model, as described in [LAL+21]_. + Estimator class to train a Temporal Fusion Transformer (TFT) model, as + described in [LAL+21]_. TFT internally performs feature selection when making forecasts. For this reason, the dimensions of real-valued features can be grouped together if diff --git a/src/gluonts/torch/model/tft/module.py b/src/gluonts/torch/model/tft/module.py index f113160cb7..54e90ed90d 100644 --- a/src/gluonts/torch/model/tft/module.py +++ b/src/gluonts/torch/model/tft/module.py @@ -37,7 +37,8 @@ class TemporalFusionTransformerModel(nn.Module): - """Temporal Fusion Transformer neural network. + """ + Temporal Fusion Transformer neural network. Partially based on the implementation in github.com/kashif/pytorch-transformer-ts. diff --git a/src/gluonts/torch/model/tide/lightning_module.py b/src/gluonts/torch/model/tide/lightning_module.py index e23b2872d6..32b5d1ace6 100644 --- a/src/gluonts/torch/model/tide/lightning_module.py +++ b/src/gluonts/torch/model/tide/lightning_module.py @@ -26,8 +26,8 @@ class TiDELightningModule(pl.LightningModule): """ - A ``pl.LightningModule`` class that can be used to train a - ``TiDEModel`` with PyTorch Lightning. 
+ A ``pl.LightningModule`` class that can be used to train a ``TiDEModel``
+ with PyTorch Lightning.

This is a thin layer around a (wrapped) ``TiDEModel`` object, that
exposes the methods to evaluate training and validation loss.
diff --git a/src/gluonts/torch/model/wavenet/estimator.py b/src/gluonts/torch/model/wavenet/estimator.py
index 4a5b2ff0b1..b73381400d 100644
--- a/src/gluonts/torch/model/wavenet/estimator.py
+++ b/src/gluonts/torch/model/wavenet/estimator.py
@@ -98,7 +98,9 @@ def __init__(
negative_data: bool = False,
trainer_kwargs: Optional[Dict[str, Any]] = None,
) -> None:
- """WaveNet estimator that uses the architecture proposed in
+ """
+ WaveNet estimator that uses the architecture proposed in
[Oord et al., 2016] with quantized targets. The model is trained
using the cross-entropy loss.
diff --git a/src/gluonts/torch/model/wavenet/lightning_module.py b/src/gluonts/torch/model/wavenet/lightning_module.py
index f4e7043ebb..16b7524f34 100644
--- a/src/gluonts/torch/model/wavenet/lightning_module.py
+++ b/src/gluonts/torch/model/wavenet/lightning_module.py
@@ -20,7 +20,8 @@

class WaveNetLightningModule(pl.LightningModule):
- """LightningModule wrapper over WaveNet.
+ """
+ LightningModule wrapper over WaveNet.

Parameters
----------
diff --git a/src/gluonts/torch/model/wavenet/module.py b/src/gluonts/torch/model/wavenet/module.py
index 386e6a7e25..943f876cb7 100644
--- a/src/gluonts/torch/model/wavenet/module.py
+++ b/src/gluonts/torch/model/wavenet/module.py
@@ -83,7 +83,8 @@ def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:

class WaveNet(nn.Module):
- """The WaveNet model.
+ """
+ The WaveNet model.

Parameters
----------
@@ -226,7 +227,8 @@ def get_full_features(
future_observed_values: Optional[torch.Tensor],
scale: torch.Tensor,
) -> torch.Tensor:
- """Prepares the inputs for the network by repeating static feature and
+ """
+ Prepares the inputs for the network by repeating static feature and
concatenating it with time features and observed value indicator.

Parameters
@@ -286,7 +288,8 @@ def get_full_features(
def target_feature_embedding(
self, target: torch.Tensor, features: torch.Tensor
) -> torch.Tensor:
- """Provides a joint embedding for the target and features.
+ """
+ Provides a joint embedding for the target and features.

Parameters
----------
@@ -311,7 +314,8 @@ def base_net(
inputs: torch.Tensor,
queues: Optional[List[torch.Tensor]] = None,
) -> Tuple[torch.Tensor, List[torch.Tensor]]:
- """Forward pass through the WaveNet.
+ """
+ Forward pass through the WaveNet.

Parameters
----------
@@ -368,7 +372,8 @@ def loss(
future_observed_values: torch.Tensor,
scale: torch.Tensor,
) -> torch.Tensor:
- """Computes the training loss for the wavenet model.
+ """
+ Computes the training loss for the wavenet model.

Parameters
----------
@@ -428,7 +433,8 @@ def loss(
def _initialize_conv_queues(
self, past_target: torch.Tensor, features: torch.Tensor
) -> List[torch.Tensor]:
- """Initialize the convolutional queues to speed up predictions.
+ """
+ Initialize the convolutional queues to speed up predictions.

Parameters
----------
@@ -469,7 +475,8 @@ def forward(
num_parallel_samples: Optional[int] = None,
temperature: Optional[float] = None,
) -> torch.Tensor:
- """Generate predictions from the WaveNet model.
+ """
+ Generate predictions from the WaveNet model.
Parameters ---------- diff --git a/src/gluonts/torch/modules/lookup_table.py b/src/gluonts/torch/modules/lookup_table.py index edf362925a..a088e8ad79 100644 --- a/src/gluonts/torch/modules/lookup_table.py +++ b/src/gluonts/torch/modules/lookup_table.py @@ -18,7 +18,8 @@ class LookupValues(nn.Module): - """A lookup table mapping bin indices to values. + """ + A lookup table mapping bin indices to values. Parameters ---------- diff --git a/src/gluonts/torch/scaler.py b/src/gluonts/torch/scaler.py index 38f61dee0c..7d2e68c1e1 100644 --- a/src/gluonts/torch/scaler.py +++ b/src/gluonts/torch/scaler.py @@ -131,7 +131,8 @@ def __call__( class StdScaler(Scaler): """ - Computes a std scaling value along dimension ``dim``, and scales the data accordingly. + Computes a std scaling value along dimension ``dim``, and scales the data + accordingly. Parameters ---------- diff --git a/src/gluonts/torch/util.py b/src/gluonts/torch/util.py index 12a2a1e4f0..0504b5d109 100644 --- a/src/gluonts/torch/util.py +++ b/src/gluonts/torch/util.py @@ -23,8 +23,8 @@ def resolve_device( """ Resolves a torch device to the most appropriate one. - The ``"auto"`` device is resolved to ``"cuda"`` if CUDA is available, - and to ``"cpu"`` otherwise. Otherwise the device is unchanged. + The ``"auto"`` device is resolved to ``"cuda"`` if CUDA is available, and + to ``"cpu"`` otherwise. Otherwise the device is unchanged. """ if device == "auto": if torch.cuda.is_available(): diff --git a/src/gluonts/transform/convert.py b/src/gluonts/transform/convert.py index 156ddcf638..1054d2d73a 100644 --- a/src/gluonts/transform/convert.py +++ b/src/gluonts/transform/convert.py @@ -907,8 +907,9 @@ def flatmap_transform( class QuantizeMeanScaled(SimpleTransformation): - """Rescale and quantize the target variable. - Requires `past_target_field`, and `future_target_field` to be present. + """ + Rescale and quantize the target variable. Requires `past_target_field`, and + `future_target_field` to be present. The mean absolute value of the past_target is used to rescale past_target and future_target. Then the bin_edges are used to quantize diff --git a/src/gluonts/transform/split.py b/src/gluonts/transform/split.py index ac0e9be65c..f435945dce 100644 --- a/src/gluonts/transform/split.py +++ b/src/gluonts/transform/split.py @@ -475,12 +475,13 @@ def flatmap_transform( class TFTInstanceSplitter(InstanceSplitter): - """Instance splitter used by the Temporal Fusion Transformer model. + """ + Instance splitter used by the Temporal Fusion Transformer model. - Unlike ``InstanceSplitter``, this class returns known dynamic features as - a single tensor of shape [..., context_length + prediction_length, ...] - without splitting it into past & future parts. Moreover, this class supports - dynamic features that are known in the past. + Unlike ``InstanceSplitter``, this class returns known dynamic features as a + single tensor of shape [..., context_length + prediction_length, ...] + without splitting it into past & future parts. Moreover, this class + supports dynamic features that are known in the past. """ @validated() diff --git a/src/gluonts/util.py b/src/gluonts/util.py index 544ad20881..1a7f11d917 100644 --- a/src/gluonts/util.py +++ b/src/gluonts/util.py @@ -51,7 +51,8 @@ class MyClass: else: def lazy_property(method): - """Property that is lazily evaluated. + """ + Property that is lazily evaluated. 
This is the same as:: @@ -72,8 +73,8 @@ def my_property(self): def will_extractall_into(tar: tarfile.TarFile, path: Path) -> None: """ - Check that the content of ``tar`` will be extracted within ``path`` - upon calling ``extractall``. + Check that the content of ``tar`` will be extracted within ``path`` upon + calling ``extractall``. Raise a ``PermissionError`` if not. """ diff --git a/src/gluonts/zebras/__init__.py b/src/gluonts/zebras/__init__.py index 1b9f437748..d767b3359f 100644 --- a/src/gluonts/zebras/__init__.py +++ b/src/gluonts/zebras/__init__.py @@ -59,8 +59,10 @@ def batch(xs: list): def from_pandas(obj): - """Convert pandas offsets, date indices and data frames into ``zebras`` - equivalents.""" + """ + Convert pandas offsets, date indices and data frames into ``zebras`` + equivalents. + """ import pandas as pd from pandas.core.base import IndexOpsMixin from pandas.tseries.offsets import BaseOffset diff --git a/src/gluonts/zebras/_base.py b/src/gluonts/zebras/_base.py index f8d2938761..3f4cbf0bb2 100644 --- a/src/gluonts/zebras/_base.py +++ b/src/gluonts/zebras/_base.py @@ -28,7 +28,8 @@ class Pad(NamedTuple): - """Indicator for padded values. + """ + Indicator for padded values. >>> from gluonts.zebras import time_series >>> ts = time_series([1, 2, 3]).pad(0, left=2, right=2) @@ -36,7 +37,6 @@ class Pad(NamedTuple): >>> assert list(ts) == [0, 0, 1, 2, 3, 0, 0] >>> assert ts._pad.left == 2 >>> assert ts._pad.right == 2 - """ left: int = 0 @@ -115,7 +115,8 @@ def resize( pad: LeftOrRight = "l", skip: LeftOrRight = "r", ) -> TimeBase: - """Force time frame to have length ``length``. + """ + Force time frame to have length ``length``. This pads or slices the time frame, depending on whether its size is smaller or bigger than the required length. diff --git a/src/gluonts/zebras/_freq.py b/src/gluonts/zebras/_freq.py index 0face3c95a..2d8923ec43 100644 --- a/src/gluonts/zebras/_freq.py +++ b/src/gluonts/zebras/_freq.py @@ -35,7 +35,8 @@ def _canonical_freqstr(n: int, name: str, suffix: Optional[str] = None) -> str: - """Canonical name of frequency. + """ + Canonical name of frequency. >>> _canonical_freqstr(1, "X") 'X' @@ -173,7 +174,8 @@ def to_pandas(self): return to_offset(str(self)) def align(self, timestamp: np.datetime64) -> np.datetime64: - """Align ``timestamp`` according to the frequency. + """ + Align ``timestamp`` according to the frequency. For example, for daily frequency, any timestamps that fall into the same day align to the same value. 
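Aside on the ``align`` behaviour documented in the ``_freq.py`` hunk above: since ``align`` takes and returns ``np.datetime64`` values, the documented semantics ("for daily frequency, any timestamps that fall into the same day align to the same value") can be illustrated with plain numpy. This is a minimal sketch of the documented contract, not the library implementation:

    import numpy as np

    # Casting to a coarser datetime64 unit truncates, which is exactly the
    # "same day aligns to the same value" behaviour described for daily
    # frequency.
    aligned = np.datetime64("2021-03-15T17:42").astype("datetime64[D]")
    assert aligned == np.datetime64("2021-03-15")
    assert np.datetime64("2021-03-15T03:10").astype("datetime64[D]") == aligned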
diff --git a/src/gluonts/zebras/_period.py b/src/gluonts/zebras/_period.py index 4cda3178a9..2e82ed3341 100644 --- a/src/gluonts/zebras/_period.py +++ b/src/gluonts/zebras/_period.py @@ -202,7 +202,6 @@ def end(self) -> Period: >>> p = periods("2021", "D", 365) >>> assert p.end == period("2021-12-31", "D") - """ return self[-1] @@ -213,7 +212,6 @@ def head(self, count: int) -> Periods: >>> p = periods("2021", "D", 365) >>> assert p.head(5) == periods("2021-01-01", "D", 5) - """ return self[:count] @@ -224,7 +222,6 @@ def tail(self, count: int) -> Periods: >>> p = periods("2021", "D", 365) >>> assert p.tail(5) == periods("2021-12-27", "D", 5) - """ return self[-count:] @@ -235,7 +232,6 @@ def future(self, count: int) -> Periods: >>> p = periods("2021", "D", 365) >>> assert p.future(5) == periods("2022-01-01", "D", 5) - """ return (self.end + 1).periods(count) @@ -245,7 +241,6 @@ def past(self, count: int) -> Periods: >>> p = periods("2021", "D", 365) >>> assert p.past(5) == periods("2020-12-27", "D", 5) - """ return (self.start - count).periods(count) @@ -256,7 +251,6 @@ def prepend(self, count: int) -> Periods: >>> p = periods("2021", "D", 365) >>> assert p.prepend(5) == periods("2020-12-27", "D", 370) - """ return Periods( np.concatenate([self.past(count).data, self.data]), @@ -269,7 +263,6 @@ def extend(self, count: int) -> Periods: >>> p = periods("2021", "D", 365) >>> assert p.extend(5) == periods("2021", "D", 370) - """ return Periods( np.concatenate([self.data, self.future(count).data]), @@ -286,7 +279,8 @@ def to_pandas(self): @classmethod def from_pandas(cls, index): - """Turn ``pandas.PeriodIndex`` or ``pandas.DatetimeIndex`` into + """ + Turn ``pandas.PeriodIndex`` or ``pandas.DatetimeIndex`` into ``Periods``. """ @@ -313,7 +307,6 @@ def index_of(self, period: Union[str, Period]): >>> p = periods("2021", "D", 365) >>> assert p.index_of(period("2021-02-01", "D")) == 31 - """ if isinstance(period, str): @@ -375,7 +368,8 @@ def _encode_zebras_periods(v: Periods): def period( data: Union[Period, str], freq: Optional[Union[Freq, str]] = None ) -> Period: - """Create a ``zebras.Period`` object that represents a period of time. + """ + Create a ``zebras.Period`` object that represents a period of time. Parameters ---------- @@ -422,7 +416,8 @@ def period( def periods( start: Union[Period, str], freq: Union[Freq, str], count: int ) -> Period: - """Create a ``zebras.Periods`` object that represents multiple consecutive + """ + Create a ``zebras.Periods`` object that represents multiple consecutive periods of time. Parameters diff --git a/src/gluonts/zebras/_split_frame.py b/src/gluonts/zebras/_split_frame.py index 93b98259dc..d36568a44d 100644 --- a/src/gluonts/zebras/_split_frame.py +++ b/src/gluonts/zebras/_split_frame.py @@ -331,8 +331,9 @@ def split_frame( metadata=None, default_tdim=-1, ): - """Create a ``zebras.SplitFrame`` where columns can either be `past`, - `future` or `full`, which spans both past and future. + """ + Create a ``zebras.SplitFrame`` where columns can either be `past`, `future` + or `full`, which spans both past and future. ``past_length`` and ``future_length`` is derived from the input data if possible or default to ``0`` in case no respective data is available. 
It is diff --git a/src/gluonts/zebras/_time_frame.py b/src/gluonts/zebras/_time_frame.py index dec7e591e8..b845252586 100644 --- a/src/gluonts/zebras/_time_frame.py +++ b/src/gluonts/zebras/_time_frame.py @@ -99,7 +99,9 @@ def eq_to(self, other: TimeFrame) -> bool: return True def _time_view(self, column): - """View of column with respect to time.""" + """ + View of column with respect to time. + """ return AxisView(self.columns[column], self.tdims[column]) @@ -246,7 +248,9 @@ def _repr_html_(self): @classmethod def from_pandas(cls, df): - """Turn ``pandas.DataFrame`` into ``TimeFrame``.""" + """ + Turn ``pandas.DataFrame`` into ``TimeFrame``. + """ import pandas as pd try: @@ -300,7 +304,8 @@ def like(self, columns=None, static=None): return _replace(self, columns=columns, static=static) def rename(self, mapping=None, **kwargs): - """Rename ``columns`` of ``TimeFrame``. + """ + Rename ``columns`` of ``TimeFrame``. The keys in ``mapping`` denote the target column names, i.e. ``rename({"target": "source"})``. For convenience one can use keyword @@ -320,7 +325,8 @@ def rename(self, mapping=None, **kwargs): return _replace(self, columns=columns, tdims=tdims) def rename_static(self, mapping=None, **kwargs): - """Rename ``static`` fields of ``TimeFrame``. + """ + Rename ``static`` fields of ``TimeFrame``. The keys in ``mapping`` denote the target column names, i.e. ``rename({"target": "source"})``. For convenience one can use keyword @@ -383,7 +389,8 @@ def rolsplit( n: Optional[int] = None, pad_value=0.0, ): - """Create rolling split of past/future pairs. + """ + Create rolling split of past/future pairs. Parameters ---------- @@ -605,8 +612,9 @@ def time_frame( default_tdim: int = -1, metadata: Optional[Mapping] = None, ): - """Create a ``zebras.TimeFrame`` object that represents one - or more time series. + """ + Create a ``zebras.TimeFrame`` object that represents one or more time + series. Parameters ---------- diff --git a/src/gluonts/zebras/_time_series.py b/src/gluonts/zebras/_time_series.py index 7997d13cb8..68cf89cc95 100644 --- a/src/gluonts/zebras/_time_series.py +++ b/src/gluonts/zebras/_time_series.py @@ -238,7 +238,8 @@ def time_series( name: Optional[str] = None, metadata: Optional[Dict] = None, ): - """Create a ``zebras.TimeSeries`` object that represents a time series. + """ + Create a ``zebras.TimeSeries`` object that represents a time series. Parameters ---------- diff --git a/src/gluonts/zebras/_util.py b/src/gluonts/zebras/_util.py index 8f656a3f09..ce2f5cb211 100644 --- a/src/gluonts/zebras/_util.py +++ b/src/gluonts/zebras/_util.py @@ -51,7 +51,8 @@ def pad_axis( def _replace(obj, **kwargs): - """Copy and replace dataclass instance. + """ + Copy and replace dataclass instance. Compared to ``dataclasses.replace`` this first creates a copy where each field in the object is copied. Thus, each field of the returned object is diff --git a/src/gluonts/zebras/schema.py b/src/gluonts/zebras/schema.py index 5bfa777fb2..04c7cac675 100644 --- a/src/gluonts/zebras/schema.py +++ b/src/gluonts/zebras/schema.py @@ -50,7 +50,6 @@ class Field(BaseModel): """ Specification for user provided input data. - """ required: bool = True @@ -155,7 +154,8 @@ class TimeSeries(Field): past_only: bool = True def _load(self, value, length: Optional[int] = None) -> np.ndarray: - """Load field ``name`` from ``data`` and apply validation. + """ + Load field ``name`` from ``data`` and apply validation. Note: We do the lookup of the value in this function, since the field can be optional. 
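Aside on ``_replace`` (from the ``_util.py`` hunk a few files above): its docstring promises that, unlike ``dataclasses.replace``, every field of the result is first copied rather than shared with the original. A minimal self-contained sketch of that behaviour, assuming a plain dataclass whose fields are all settable via ``__init__`` (the actual GluonTS implementation may differ):

    import copy
    from dataclasses import dataclass, fields

    def _replace(obj, **overrides):
        # Shallow-copy every field first, then apply the overrides, so the
        # result shares no mutable field objects with the original.
        values = {
            f.name: copy.copy(getattr(obj, f.name)) for f in fields(obj)
        }
        values.update(overrides)
        return type(obj)(**values)

    @dataclass
    class Example:
        xs: list
        label: str

    a = Example([1, 2], "a")
    b = _replace(a, label="b")
    assert b.xs == a.xs and b.xs is not a.xs  # fields copied, not shared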
@@ -216,7 +216,8 @@ def __init__(self, fields=None, **kwargs):
         )
 
     def _load_static(self, data: Dict[str, Any]) -> Dict[str, np.ndarray]:
-        """Helper to load static data from ``data``.
+        """
+        Helper to load static data from ``data``.
 
         Used by ``load_timeframe`` and ``load_splitframe``.
         """
diff --git a/test/dataset/test_dataset_types.py b/test/dataset/test_dataset_types.py
index 4962a25b1f..c9e9fd2c66 100644
--- a/test/dataset/test_dataset_types.py
+++ b/test/dataset/test_dataset_types.py
@@ -32,7 +32,9 @@
 
 
 class Timer:
-    """Context manager for measuring the time of enclosed code fragments."""
+    """
+    Context manager for measuring the time of enclosed code fragments.
+    """
 
     def __enter__(self):
         self.start = time.perf_counter()
diff --git a/test/evaluation/test_evaluator.py b/test/evaluation/test_evaluator.py
index ca21c56aa8..d43dd2ea93 100644
--- a/test/evaluation/test_evaluator.py
+++ b/test/evaluation/test_evaluator.py
@@ -662,7 +662,7 @@ def rmsle(target, forecast):
     r"""
     .. math::
 
-        rmsle = sqrt(mean(square(log(Y+1) - log(Y_hat+1))))
+        rmsle = sqrt(mean(square(log(Y+1) - log(Y_hat+1))))
     """
     return np.sqrt(
         np.mean(np.square(np.log(target + 1) - np.log(forecast + 1)))
     )
diff --git a/test/model/npts/test_npts.py b/test/model/npts/test_npts.py
index a9b7183b58..ab6887e3ec 100644
--- a/test/model/npts/test_npts.py
+++ b/test/model/npts/test_npts.py
@@ -560,8 +560,8 @@ def _test_nans_in_target(predictor: NPTSPredictor, dataset: Dataset) -> None:
 
 def _inject_nans_in_target(data_entry: DataEntry, p: float) -> DataEntry:
     """
-    Returns a copy of the given `data_entry` where approximately `p` percent
-    of the target values are NaNs.
+    Returns a copy of the given `data_entry` where approximately `p` percent of
+    the target values are NaNs.
 
     Parameters
     ----------
diff --git a/test/mx/distribution/test_default_quantile_method.py b/test/mx/distribution/test_default_quantile_method.py
index fd6ad3e40a..b0ff3bb80a 100644
--- a/test/mx/distribution/test_default_quantile_method.py
+++ b/test/mx/distribution/test_default_quantile_method.py
@@ -25,9 +25,12 @@
 
 def test_quantile() -> None:
     r"""
-    Tests if the quantiles of a single Gaussian and the quantiles of the mixture of two Gaussians
-    identical to the first are equal. The quantiles of the single Gaussian are given by the
-    Gaussian.quantile() method while the quantiles of the mixture from the Distribution.quantile() method.
+    Tests that the quantiles of a single Gaussian and the quantiles of a
+    mixture of two Gaussians, each identical to the first, are equal.
+
+    The quantiles of the single Gaussian are given by the Gaussian.quantile()
+    method, while the quantiles of the mixture come from the
+    Distribution.quantile() method.
     """
     mu = mx.nd.array(
         [[1, 10, 100, 1000, 10000], [-1, -10, -100, -1000, -10000]]
diff --git a/test/mx/distribution/test_lds.py b/test/mx/distribution/test_lds.py
index bf100132c6..d51ea435fb 100644
--- a/test/mx/distribution/test_lds.py
+++ b/test/mx/distribution/test_lds.py
@@ -54,6 +54,7 @@ def test_lds_likelihood(data_filename):
     """
     Test to check that likelihood is correctly computed for different
     innovation state space models (ISSM).
+
     Note that ISSM is a special case of LDS.
""" with gzip.GzipFile(data_filename, "r") as fp: diff --git a/test/mx/distribution/test_mx_distribution_inference.py b/test/mx/distribution/test_mx_distribution_inference.py index 21ca98d86f..b26a2d2ce7 100644 --- a/test/mx/distribution/test_mx_distribution_inference.py +++ b/test/mx/distribution/test_mx_distribution_inference.py @@ -12,8 +12,8 @@ # permissions and limitations under the License. """ -Test that maximizing likelihood allows to correctly recover distribution parameters for all -distributions exposed to the user. +Test that maximizing likelihood allows to correctly recover distribution +parameters for all distributions exposed to the user. """ import random from functools import reduce @@ -178,7 +178,7 @@ def maximum_likelihood_estimate_sgd( @pytest.mark.parametrize("hybridize", [True, False]) def test_beta_likelihood(alpha: float, beta: float, hybridize: bool) -> None: """ - Test to check that maximizing the likelihood recovers the parameters + Test to check that maximizing the likelihood recovers the parameters. """ # generate samples @@ -225,7 +225,7 @@ def test_inflated_beta_likelihood( one_probability: float, ) -> None: """ - Test to check that maximizing the likelihood recovers the parameters + Test to check that maximizing the likelihood recovers the parameters. """ # generate samples @@ -316,7 +316,7 @@ def test_studentT_likelihood( mu: float, sigma: float, nu: float, hybridize: bool ) -> None: """ - Test to check that maximizing the likelihood recovers the parameters + Test to check that maximizing the likelihood recovers the parameters. """ # generate samples @@ -359,7 +359,7 @@ def test_studentT_likelihood( @pytest.mark.parametrize("hybridize", [True, False]) def test_gamma_likelihood(alpha: float, beta: float, hybridize: bool) -> None: """ - Test to check that maximizing the likelihood recovers the parameters + Test to check that maximizing the likelihood recovers the parameters. """ # generate samples @@ -395,7 +395,7 @@ def test_gamma_likelihood(alpha: float, beta: float, hybridize: bool) -> None: @pytest.mark.parametrize("hybridize", [True, False]) def test_gaussian_likelihood(mu: float, sigma: float, hybridize: bool): """ - Test to check that maximizing the likelihood recovers the parameters + Test to check that maximizing the likelihood recovers the parameters. """ # generate samples @@ -609,9 +609,10 @@ def test_lowrank_multivariate_gaussian(hybridize: bool, rank: int) -> None: @pytest.mark.parametrize("hybridize", [True, False]) def test_empirical_distribution(hybridize: bool) -> None: r""" - This verifies if the loss implemented by `EmpiricalDistribution` is correct. - This is done by recovering parameters of a parametric distribution not by maximizing likelihood but by - optimizing CRPS loss on the Monte Carlo samples drawn from the underlying parametric distribution. + This verifies if the loss implemented by `EmpiricalDistribution` is + correct. This is done by recovering parameters of a parametric distribution + not by maximizing likelihood but by optimizing CRPS loss on the Monte Carlo + samples drawn from the underlying parametric distribution. More precisely, given observations `obs` drawn from the true distribution p(x; \theta^*), we solve @@ -623,7 +624,6 @@ def test_empirical_distribution(hybridize: bool) -> None: This test uses Multivariate Gaussian with diagonal covariance. Once multivariate CRPS is implemented in `EmpiricalDistribution` one could use `LowrankMultivariateGaussian` as well. 
    Any univariate distribution whose `sample_rep` is differentiable can also
    be used in this test.
-
     """
     num_obs = 2000
     dim = 2
@@ -689,8 +689,9 @@ def test_empirical_distribution(hybridize: bool) -> None:
 def test_deterministic_l2(mu: float, hybridize: bool) -> None:
     """
     Test to check that maximizing the likelihood recovers the parameters.
-    This tests uses the Gaussian distribution with fixed variance and sample mean.
-    This essentially reduces to determistic L2.
+
+    This test uses the Gaussian distribution with fixed variance and sample
+    mean. This essentially reduces to deterministic L2.
     """
     # generate samples
     mu = mu
@@ -723,8 +724,9 @@ def domain_map(cls, F, mu, sigma):
 def test_deterministic_l1(mu: float, hybridize: bool) -> None:
     """
     Test to check that maximizing the likelihood recovers the parameters.
-    This tests uses the Laplace distribution with fixed variance and sample mean.
-    This essentially reduces to determistic L1.
+
+    This test uses the Laplace distribution with fixed variance and sample
+    mean. This essentially reduces to deterministic L1.
     """
     # generate samples
     mu = mu
@@ -756,7 +758,7 @@ def domain_map(cls, F, mu, b):
 @pytest.mark.parametrize("hybridize", [True, False])
 def test_neg_binomial(mu_alpha: Tuple[float, float], hybridize: bool) -> None:
     """
-    Test to check that maximizing the likelihood recovers the parameters
+    Test to check that maximizing the likelihood recovers the parameters.
     """
     random.seed(0)
     np.random.seed(0)
@@ -796,7 +798,7 @@ def test_neg_binomial(mu_alpha: Tuple[float, float], hybridize: bool) -> None:
 @pytest.mark.parametrize("hybridize", [True, False])
 def test_laplace(mu_b: Tuple[float, float], hybridize: bool) -> None:
     """
-    Test to check that maximizing the likelihood recovers the parameters
+    Test to check that maximizing the likelihood recovers the parameters.
     """
     # test instance
     mu, b = mu_b
@@ -837,7 +839,7 @@ def test_piecewise_linear(
     hybridize: bool,
 ) -> None:
     """
-    Test to check that minimizing the CRPS recovers the quantile function
+    Test to check that minimizing the CRPS recovers the quantile function.
     """
 
     num_samples = 500  # use a few samples for timeout failure
@@ -923,7 +925,7 @@ def test_box_cox_tranform(
     lam_1: float, lam_2: float, mu: float, sigma: float, hybridize: bool
 ):
     """
-    Test to check that maximizing the likelihood recovers the parameters
+    Test to check that maximizing the likelihood recovers the parameters.
     """
 
     # generate samples
@@ -991,7 +993,7 @@ def test_binned_likelihood(
     num_bins: float, bin_probabilites: np.ndarray, hybridize: bool
 ):
     """
-    Test to check that maximizing the likelihood recovers the parameters
+    Test to check that maximizing the likelihood recovers the parameters.
     """
 
     bin_prob = mx.nd.array(bin_probabilites)
@@ -1035,7 +1037,7 @@ def test_categorical_likelihood(
     num_cats: int, cat_probs: np.ndarray, hybridize: bool
 ):
     """
-    Test to check that maximizing the likelihood recovers the parameters
+    Test to check that maximizing the likelihood recovers the parameters.
     """
     cat_prob = mx.nd.array(cat_probs)
     cat_probs = mx.nd.zeros((NUM_SAMPLES, num_cats)) + cat_prob
@@ -1070,7 +1072,7 @@ def test_categorical_likelihood(
 @pytest.mark.parametrize("hybridize", [True, False])
 def test_poisson_likelihood(rate: float, hybridize: bool) -> None:
     """
-    Test to check that maximizing the likelihood recovers the parameters
+    Test to check that maximizing the likelihood recovers the parameters.
""" # generate samples @@ -1099,7 +1101,7 @@ def test_poisson_likelihood(rate: float, hybridize: bool) -> None: @pytest.mark.parametrize("hybridize", [True, False]) def test_logit_normal_likelihood(mu: float, sigma: float, hybridize: bool): """ - Test to check that maximizing the likelihood recovers the parameters + Test to check that maximizing the likelihood recovers the parameters. """ # generate samples @@ -1137,7 +1139,7 @@ def test_loglogistic_likelihood( mu: float, sigma: float, hybridize: bool ) -> None: """ - Test to check that maximizing the likelihood recovers the parameters + Test to check that maximizing the likelihood recovers the parameters. """ # generate samples @@ -1176,7 +1178,7 @@ def test_weibull_likelihood( rate: float, shape: float, hybridize: bool ) -> None: """ - Test to check that maximizing the likelihood recovers the parameters + Test to check that maximizing the likelihood recovers the parameters. """ # generate samples @@ -1213,7 +1215,7 @@ def test_weibull_likelihood( @pytest.mark.parametrize("hybridize", [True, False]) def test_genpareto_likelihood(xi: float, beta: float, hybridize: bool) -> None: """ - Test to check that maximizing the likelihood recovers the parameters + Test to check that maximizing the likelihood recovers the parameters. """ # generate samples @@ -1257,7 +1259,7 @@ def test_inflated_poisson_likelihood( zero_probability: float, ) -> None: """ - Test to check that maximizing the likelihood recovers the parameters + Test to check that maximizing the likelihood recovers the parameters. """ random.seed(0) np.random.seed(0) @@ -1310,7 +1312,7 @@ def test_inflated_neg_binomial_likelihood( hybridize: bool, ) -> None: """ - Test to check that maximizing the likelihood recovers the parameters + Test to check that maximizing the likelihood recovers the parameters. """ # generate samples diff --git a/test/mx/model/deepar/test_deepar_auxiliary_outputs.py b/test/mx/model/deepar/test_deepar_auxiliary_outputs.py index 2b36fe67e7..5c2fb63229 100644 --- a/test/mx/model/deepar/test_deepar_auxiliary_outputs.py +++ b/test/mx/model/deepar/test_deepar_auxiliary_outputs.py @@ -26,7 +26,7 @@ def test_distribution(): """ - Makes sure additional tensors can be accessed and have expected shapes + Makes sure additional tensors can be accessed and have expected shapes. """ prediction_length = ds_info.prediction_length diff --git a/test/mx/model/seq2seq/test_cnn.py b/test/mx/model/seq2seq/test_cnn.py index 796c151f48..1022b7b7b8 100644 --- a/test/mx/model/seq2seq/test_cnn.py +++ b/test/mx/model/seq2seq/test_cnn.py @@ -22,7 +22,7 @@ def compute_causalconv1d( x: np.ndarray, kernels: np.ndarray, dilation: int ) -> np.ndarray: """ - Naive way to compute the 1-d causal convolution + Naive way to compute the 1-d causal convolution. Parameters: x: np.array diff --git a/test/mx/model/tpp/test_tpp_predictor.py b/test/mx/model/tpp/test_tpp_predictor.py index f1753d9753..9a8bbdded9 100644 --- a/test/mx/model/tpp/test_tpp_predictor.py +++ b/test/mx/model/tpp/test_tpp_predictor.py @@ -51,9 +51,9 @@ def hybrid_forward( self, F, past_target: Tensor, past_valid_length: Tensor ) -> Tuple[Tensor, Tensor]: """ - Return two tensors, of shape - (batch_size, num_samples, max_prediction_length, target_dim) - and (batch_size, num_samples) respectively. + Return two tensors, of shape (batch_size, num_samples, + max_prediction_length, target_dim) and (batch_size, num_samples) + respectively. 
""" batch_size = past_target.shape[0] assert past_valid_length.shape[0] == batch_size diff --git a/test/paper_examples/test_axiv_paper_examples.py b/test/paper_examples/test_axiv_paper_examples.py index b9db367e37..de63e2a7ff 100644 --- a/test/paper_examples/test_axiv_paper_examples.py +++ b/test/paper_examples/test_axiv_paper_examples.py @@ -20,7 +20,7 @@ def test_listing_1(): """ Test GluonTS paper examples from arxiv paper: - https://arxiv.org/abs/1906.05264 + https://arxiv.org/abs/1906.05264. Listing 1 """ @@ -56,7 +56,7 @@ def test_listing_1(): def test_appendix_c(): """ Test GluonTS paper examples from arxiv paper: - https://arxiv.org/abs/1906.05264 + https://arxiv.org/abs/1906.05264. Appendix C """ diff --git a/test/time_feature/test_lag.py b/test/time_feature/test_lag.py index 05377ef72f..951a5f9cb4 100644 --- a/test/time_feature/test_lag.py +++ b/test/time_feature/test_lag.py @@ -12,7 +12,7 @@ # permissions and limitations under the License. """ -Test the lags computed for different frequencies +Test the lags computed for different frequencies. """ import gluonts.time_feature.lag as date_feature_set diff --git a/test/torch/modules/test_torch_distribution_inference.py b/test/torch/modules/test_torch_distribution_inference.py index 8e9e617a8b..94df137826 100644 --- a/test/torch/modules/test_torch_distribution_inference.py +++ b/test/torch/modules/test_torch_distribution_inference.py @@ -12,8 +12,8 @@ # permissions and limitations under the License. """ -Test that maximizing likelihood allows to correctly recover distribution parameters for all -distributions exposed to the user. +Test that maximizing likelihood allows to correctly recover distribution +parameters for all distributions exposed to the user. """ from typing import List @@ -65,7 +65,7 @@ def inv_softplus(y: np.ndarray) -> np.ndarray: def inv_softmax(y: np.ndarray) -> np.ndarray: """ - Inverse of the scipy.special.softmax + Inverse of the scipy.special.softmax. """ return np.log(y) @@ -111,8 +111,9 @@ def compare_logits( logits_true: np.array, logits_hat: np.array, TOL: int = 0.3 ): """ - Since logits {x_i} and logits {x_i + K} will result in the same probabilities {exp(x_i)/(sum_j exp(x_j))}, - one needs to apply softmax and inv_softmax before comparing logits within a certain tolerance + Since logits {x_i} and logits {x_i + K} will result in the same + probabilities {exp(x_i)/(sum_j exp(x_j))}, one needs to apply softmax and + inv_softmax before comparing logits within a certain tolerance. """ param_true = inv_softmax(softmax(logits_true, axis=-1)) param_hat = inv_softmax(softmax(logits_hat, axis=-1)) @@ -125,7 +126,7 @@ def compare_logits( @pytest.mark.parametrize("concentration1, concentration0", [(3.75, 1.25)]) def test_beta_likelihood(concentration1: float, concentration0: float) -> None: """ - Test to check that maximizing the likelihood recovers the parameters + Test to check that maximizing the likelihood recovers the parameters. """ # generate samples @@ -164,7 +165,7 @@ def test_beta_likelihood(concentration1: float, concentration0: float) -> None: @pytest.mark.parametrize("concentration, rate", [(3.75, 1.25)]) def test_gamma_likelihood(concentration: float, rate: float) -> None: """ - Test to check that maximizing the likelihood recovers the parameters + Test to check that maximizing the likelihood recovers the parameters. 
""" # generate samples @@ -264,7 +265,7 @@ def test_studentT_likelihood(df: float, loc: float, scale: float): @pytest.mark.parametrize("rate", [1.0]) def test_poisson(rate: float) -> None: """ - Test to check that maximizing the likelihood recovers the parameters + Test to check that maximizing the likelihood recovers the parameters. """ # generate samples rates = torch.zeros((NUM_SAMPLES,)) + rate @@ -297,7 +298,7 @@ def test_poisson(rate: float) -> None: @pytest.mark.flaky(max_runs=5, min_passes=1) def test_neg_binomial(total_count: float, logit: float) -> None: """ - Test to check that maximizing the likelihood recovers the parameters + Test to check that maximizing the likelihood recovers the parameters. """ seed_everything(42) # generate samples