From 3a3a48158e3797d7672ebe98aea5223844f556b0 Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Tue, 27 Dec 2022 14:47:46 +0000
Subject: [PATCH 001/164] added informer to gitignore

---
 .gitignore | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/.gitignore b/.gitignore
index cf8183463613..bca127a3bce2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -163,4 +163,6 @@ tags
 *.lock
 
 # DS_Store (MacOS)
-.DS_Store
\ No newline at end of file
+.DS_Store
+
+Informer2020/
\ No newline at end of file

From 9f8e00d0d781f677029840681382fc0d69a6b7bf Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Tue, 27 Dec 2022 14:47:46 +0000
Subject: [PATCH 002/164] added informer to gitignore

---
 .gitignore | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index cf8183463613..bca127a3bce2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -163,4 +163,6 @@ tags
 *.lock
 
 # DS_Store (MacOS)
-.DS_Store
\ No newline at end of file
+.DS_Store
+
+Informer2020/
\ No newline at end of file

From 965eeb68d9808659c957c91d09fb32ca8c8522ad Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Thu, 12 Jan 2023 11:03:57 +0000
Subject: [PATCH 003/164] WIP informer2020

---
 docs/source/en/model_doc/informer.mdx         |   46 +
 src/transformers/__init__.py                  |   22 +
 src/transformers/models/__init__.py           |    1 +
 .../models/auto/configuration_auto.py         |    3 +
 src/transformers/models/auto/modeling_auto.py |    1 +
 src/transformers/models/informer/__init__.py  |   67 +
 .../models/informer/configuration_informer.py |  227 ++
 .../models/informer/modeling_informer.py      | 1955 +++++++++++++++++
 tests/models/informer/__init__.py             |    0
 .../models/informer/test_modeling_informer.py |  442 ++++
 10 files changed, 2764 insertions(+)
 create mode 100644 docs/source/en/model_doc/informer.mdx
 create mode 100644 src/transformers/models/informer/__init__.py
 create mode 100644 src/transformers/models/informer/configuration_informer.py
 create mode 100644 src/transformers/models/informer/modeling_informer.py
 create mode 100644 tests/models/informer/__init__.py
 create mode 100644 tests/models/informer/test_modeling_informer.py

diff --git a/docs/source/en/model_doc/informer.mdx b/docs/source/en/model_doc/informer.mdx
new file mode 100644
index 000000000000..6765b9768fc6
--- /dev/null
+++ b/docs/source/en/model_doc/informer.mdx
@@ -0,0 +1,46 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Informer
+
+## Overview
+
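+The Informer model was proposed in [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
+It targets long sequence time-series forecasting with a ProbSparse self-attention mechanism, self-attention distilling, and a generative-style decoder.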
+
+The abstract from the paper is the following:
+
+*<INSERT PAPER ABSTRACT HERE>*
+
+Tips:
+
+<INSERT TIPS ABOUT MODEL HERE>
+
+This model was contributed by [elisim](https://huggingface.co/elisim).
+The original code can be found [here](https://github.com/zhouhaoyi/Informer2020).
+
+
+## InformerConfig
+
+[[autodoc]] InformerConfig
+
+
+## InformerModel
+
+[[autodoc]] InformerModel
+    - forward
+
+
+## InformerForPrediction
+
+[[autodoc]] InformerForPrediction
+    - forward
\ No newline at end of file
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 829c0a18bdc2..eb04b8546cb6 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -400,6 +400,10 @@
         "TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
         "TimeSeriesTransformerConfig",
     ],
+    "models.informer": [
+        "INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "InformerConfig",
+    ],
     "models.timesformer": ["TIMESFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "TimesformerConfig"],
     "models.trajectory_transformer": [
         "TRAJECTORY_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
@@ -2208,6 +2212,14 @@
             "TimeSeriesTransformerPreTrainedModel",
         ]
     )
+    _import_structure["models.informer"].extend(
+        [
+            "INFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
+            "InformerForPrediction",
+            "InformerModel",
+            "InformerPreTrainedModel",
+        ]
+    )
     _import_structure["models.timesformer"].extend(
         [
             "TIMESFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -3758,6 +3770,10 @@
         TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
         TimeSeriesTransformerConfig,
     )
+    from .models.informer import (
+        INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        InformerConfig,
+    )
     from .models.timesformer import TIMESFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, TimesformerConfig
     from .models.trajectory_transformer import (
         TRAJECTORY_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
@@ -5253,6 +5269,12 @@
             TimeSeriesTransformerModel,
             TimeSeriesTransformerPreTrainedModel,
         )
+        from .models.informer import (
+            INFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
+            InformerForPrediction,
+            InformerModel,
+            InformerPreTrainedModel,
+        )
         from .models.timesformer import (
             TIMESFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
             TimesformerForVideoClassification,
diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py
index 43ed17f30dee..5ce1df92ae6f 100644
--- a/src/transformers/models/__init__.py
+++ b/src/transformers/models/__init__.py
@@ -160,6 +160,7 @@
     tapas,
     tapex,
     time_series_transformer,
+    informer,
     timesformer,
     trajectory_transformer,
     transfo_xl,
diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
index 6a49d2f4e2c0..d85e19b22f54 100755
--- a/src/transformers/models/auto/configuration_auto.py
+++ b/src/transformers/models/auto/configuration_auto.py
@@ -156,6 +156,7 @@
         ("table-transformer", "TableTransformerConfig"),
         ("tapas", "TapasConfig"),
         ("time_series_transformer", "TimeSeriesTransformerConfig"),
+        ("informer", "InformerConfig"),
         ("timesformer", "TimesformerConfig"),
         ("trajectory_transformer", "TrajectoryTransformerConfig"),
         ("transfo-xl", "TransfoXLConfig"),
@@ -307,6 +308,7 @@
         ("table-transformer", "TABLE_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("tapas", "TAPAS_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("time_series_transformer", "TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"),
+        ("informer", "INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("timesformer", "TIMESFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("transfo-xl", "TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("unispeech", "UNISPEECH_PRETRAINED_CONFIG_ARCHIVE_MAP"),
@@ -482,6 +484,7 @@
         ("tapas", "TAPAS"),
         ("tapex", "TAPEX"),
         ("time_series_transformer", "Time Series Transformer"),
+        ("informer", "Informer"),
         ("timesformer", "TimeSformer"),
         ("trajectory_transformer", "Trajectory Transformer"),
         ("transfo-xl", "Transformer-XL"),
diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
index a6c43a2f1e78..8b2840036fe8 100755
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@@ -152,6 +152,7 @@
         ("table-transformer", "TableTransformerModel"),
         ("tapas", "TapasModel"),
         ("time_series_transformer", "TimeSeriesTransformerModel"),
+        ("informer", "InformerModel"),
         ("timesformer", "TimesformerModel"),
         ("trajectory_transformer", "TrajectoryTransformerModel"),
         ("transfo-xl", "TransfoXLModel"),
diff --git a/src/transformers/models/informer/__init__.py b/src/transformers/models/informer/__init__.py
new file mode 100644
index 000000000000..927fad5e5e7f
--- /dev/null
+++ b/src/transformers/models/informer/__init__.py
@@ -0,0 +1,67 @@
+# flake8: noqa
+# There's no way to ignore "F401 '...' imported but unused" warnings in this
+# module, but to preserve other warnings. So, don't check this module at all.
+
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+# rely on isort to merge the imports
+from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
+
+
+_import_structure = {
+    "configuration_informer": [
+        "INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "InformerConfig",
+    ],
+}
+# The modeling symbols below are only registered when torch is available.
+try:
+    if not is_torch_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    pass
+else:
+    _import_structure["modeling_informer"] = [
+        "INFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
+        "InformerForPrediction",
+        "InformerModel",
+        "InformerPreTrainedModel",
+    ]
+
+# Type checkers get real imports; at runtime this module is swapped for a `_LazyModule` built from the table above.
+if TYPE_CHECKING:
+    from .configuration_informer import (
+        INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        InformerConfig,
+    )
+
+    try:
+        if not is_torch_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        pass
+    else:
+        from .modeling_informer import (
+            INFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
+            InformerForPrediction,
+            InformerModel,
+            InformerPreTrainedModel,
+        )
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
diff --git a/src/transformers/models/informer/configuration_informer.py b/src/transformers/models/informer/configuration_informer.py
new file mode 100644
index 000000000000..b08f1ee982bc
--- /dev/null
+++ b/src/transformers/models/informer/configuration_informer.py
@@ -0,0 +1,227 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Informer model configuration"""
+
+from typing import List, Optional
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {  # checkpoint identifier -> URL of its `config.json`
+    "elisim/informer": "https://huggingface.co/elisim/informer/resolve/main/config.json",
+}
+
+
+
+class InformerConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of an [`InformerModel`]. It is used to
+    instantiate an Informer model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the Time Series
+    Transformer
+    [huggingface/time-series-transformer-tourism-monthly](https://huggingface.co/huggingface/time-series-transformer-tourism-monthly)
+    architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        prediction_length (`int`):
+            The prediction length for the decoder. In other words, the prediction horizon of the model.
+        context_length (`int`, *optional*, defaults to `prediction_length`):
+            The context length for the encoder. If `None`, the context length will be the same as the
+            `prediction_length`.
+        distribution_output (`string`, *optional*, defaults to `"student_t"`):
+            The distribution emission head for the model. Could be either "student_t", "normal" or "negative_binomial".
+        loss (`string`, *optional*, defaults to `"nll"`):
+            The loss function for the model corresponding to the `distribution_output` head. For parametric
+            distributions it is the negative log likelihood (nll) - which currently is the only supported one.
+        input_size (`int`, *optional*, defaults to 1):
+            The size of the target variable which by default is 1 for univariate targets. Would be > 1 in case of
+            multivariate targets.
+        scaling (`bool`, *optional*, defaults to `True`):
+            Whether to scale the input targets.
+        lags_sequence (`list[int]`, *optional*, defaults to `[1, 2, 3, 4, 5, 6, 7]`):
+            The lags of the input time series as covariates often dictated by the frequency. If `None`, a fresh
+            `[1, 2, 3, 4, 5, 6, 7]` list is created for this configuration.
+        num_time_features (`int`, *optional*, defaults to 0):
+            The number of time features in the input time series.
+        num_dynamic_real_features (`int`, *optional*, defaults to 0):
+            The number of dynamic real valued features.
+        num_static_categorical_features (`int`, *optional*, defaults to 0):
+            The number of static categorical features.
+        num_static_real_features (`int`, *optional*, defaults to 0):
+            The number of static real valued features.
+        cardinality (`list[int]`, *optional*):
+            The cardinality (number of different values) for each of the static categorical features. Should be a list
+            of integers, having the same length as `num_static_categorical_features`. Cannot be `None` if
+            `num_static_categorical_features` is > 0.
+        embedding_dimension (`list[int]`, *optional*):
+            The dimension of the embedding for each of the static categorical features. Should be a list of integers,
+            having the same length as `num_static_categorical_features`. Cannot be `None` if
+            `num_static_categorical_features` is > 0.
+        encoder_layers (`int`, *optional*, defaults to 2):
+            Number of encoder layers.
+        decoder_layers (`int`, *optional*, defaults to 2):
+            Number of decoder layers.
+        encoder_attention_heads (`int`, *optional*, defaults to 2):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        decoder_attention_heads (`int`, *optional*, defaults to 2):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        encoder_ffn_dim (`int`, *optional*, defaults to 32):
+            Dimension of the "intermediate" (often named feed-forward) layer in encoder.
+        decoder_ffn_dim (`int`, *optional*, defaults to 32):
+            Dimension of the "intermediate" (often named feed-forward) layer in decoder.
+        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and decoder. If string, `"gelu"` and
+            `"relu"` are supported.
+        dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the encoder, and decoder.
+        encoder_layerdrop (`float`, *optional*, defaults to 0.1):
+            The dropout probability for the attention and fully connected layers for each encoder layer.
+        decoder_layerdrop (`float`, *optional*, defaults to 0.1):
+            The dropout probability for the attention and fully connected layers for each decoder layer.
+        attention_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for the attention probabilities.
+        activation_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability used between the two layers of the feed-forward networks.
+        num_parallel_samples (`int`, *optional*, defaults to 100):
+            The number of samples to generate in parallel for each time step of inference.
+        init_std (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated normal weight initialization distribution.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether to use the past key/values attentions (if applicable to the model) to speed up decoding.
+        is_encoder_decoder (`bool`, *optional*, defaults to `True`):
+            Forwarded unchanged to the [`PretrainedConfig`] constructor.
+
+        Example:
+
+    ```python
+    >>> from transformers import InformerConfig, InformerModel
+
+    >>> # Initializing a default Informer configuration
+    >>> configuration = InformerConfig()
+
+    >>> # Randomly initializing a model (with random weights) from the configuration
+    >>> model = InformerModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "informer"
+    attribute_map = {
+        "hidden_size": "d_model",
+        "num_attention_heads": "encoder_attention_heads",
+        "num_hidden_layers": "encoder_layers",
+    }
+
+    def __init__(
+        self,
+        input_size: int = 1,
+        prediction_length: Optional[int] = None,
+        context_length: Optional[int] = None,
+        distribution_output: str = "student_t",
+        loss: str = "nll",
+        lags_sequence: Optional[List[int]] = None,
+        scaling: bool = True,
+        num_dynamic_real_features: int = 0,
+        num_static_categorical_features: int = 0,
+        num_static_real_features: int = 0,
+        num_time_features: int = 0,
+        cardinality: Optional[List[int]] = None,
+        embedding_dimension: Optional[List[int]] = None,
+        encoder_ffn_dim: int = 32,
+        decoder_ffn_dim: int = 32,
+        encoder_attention_heads: int = 2,
+        decoder_attention_heads: int = 2,
+        encoder_layers: int = 2,
+        decoder_layers: int = 2,
+        is_encoder_decoder: bool = True,
+        activation_function: str = "gelu",
+        dropout: float = 0.1,
+        encoder_layerdrop: float = 0.1,
+        decoder_layerdrop: float = 0.1,
+        attention_dropout: float = 0.1,
+        activation_dropout: float = 0.1,
+        num_parallel_samples: int = 100,
+        init_std: float = 0.02,
+        use_cache: bool = True,
+        **kwargs
+    ):
+        # time series specific configuration
+        self.prediction_length = prediction_length
+        self.context_length = context_length or prediction_length
+        self.distribution_output = distribution_output
+        self.loss = loss
+        self.input_size = input_size
+        self.num_time_features = num_time_features
+        self.lags_sequence = [1, 2, 3, 4, 5, 6, 7] if lags_sequence is None else lags_sequence  # no shared default
+        self.scaling = scaling
+        self.num_dynamic_real_features = num_dynamic_real_features
+        self.num_static_real_features = num_static_real_features
+        self.num_static_categorical_features = num_static_categorical_features
+        if cardinality and num_static_categorical_features > 0:
+            if len(cardinality) != num_static_categorical_features:
+                raise ValueError(
+                    "The cardinality should be a list of the same length as `num_static_categorical_features`"
+                )
+            self.cardinality = cardinality
+        else:
+            self.cardinality = [1]  # fallback when no usable cardinality was given
+        if embedding_dimension and num_static_categorical_features > 0:
+            if len(embedding_dimension) != num_static_categorical_features:
+                raise ValueError(
+                    "The embedding dimension should be a list of the same length as `num_static_categorical_features`"
+                )
+            self.embedding_dimension = embedding_dimension
+        else:
+            self.embedding_dimension = [min(50, (cat + 1) // 2) for cat in self.cardinality]
+        self.num_parallel_samples = num_parallel_samples
+
+        # Transformer architecture configuration (`d_model` is derived from the inputs, it is not an argument)
+        self.d_model = input_size * len(self.lags_sequence) + self._number_of_features
+        self.encoder_attention_heads = encoder_attention_heads
+        self.decoder_attention_heads = decoder_attention_heads
+        self.encoder_ffn_dim = encoder_ffn_dim
+        self.decoder_ffn_dim = decoder_ffn_dim
+        self.encoder_layers = encoder_layers
+        self.decoder_layers = decoder_layers
+
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.activation_dropout = activation_dropout
+        self.encoder_layerdrop = encoder_layerdrop
+        self.decoder_layerdrop = decoder_layerdrop
+
+        self.activation_function = activation_function
+        self.init_std = init_std
+        self.output_attentions = False
+        self.output_hidden_states = False
+        self.use_cache = use_cache
+
+        super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
+
+    @property
+    def _number_of_features(self) -> int:
+        return (
+            sum(self.embedding_dimension)
+            + self.num_dynamic_real_features
+            + self.num_time_features
+            + max(1, self.num_static_real_features)  # there is at least one dummy static real feature
+            + self.input_size  # the log(scale)
+        )
diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
new file mode 100644
index 000000000000..8d36a171da35
--- /dev/null
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -0,0 +1,1955 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch Informer model."""
+
+import random
+from dataclasses import dataclass
+from typing import Callable, Dict, List, Optional, Tuple, Union
+
+import torch
+from torch import nn
+from torch.distributions import (
+    AffineTransform,
+    Distribution,
+    Independent,
+    NegativeBinomial,
+    Normal,
+    StudentT,
+    TransformedDistribution,
+)
+
+from ...activations import ACT2FN
+from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions, ModelOutput
+from ...modeling_utils import PreTrainedModel
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
+from .configuration_informer import InformerConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "InformerConfig"  # presumably read by the docstring decorators; usage not visible here
+
+# Identifiers of the Informer checkpoints this module knows about.
+INFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
+    "elisim/informer",
+    # See all Informer models at https://huggingface.co/models?filter=informer
+]
+
+
+
+class AffineTransformed(TransformedDistribution):
+    """
+    Distribution of `loc + scale * x` where `x` follows `base_distribution`; `loc`/`scale` default to 0.0/1.0.
+    """
+
+    def __init__(self, base_distribution: Distribution, loc=None, scale=None, event_dim=0):
+        self.loc = loc if loc is not None else 0.0
+        self.scale = scale if scale is not None else 1.0
+        affine = AffineTransform(loc=self.loc, scale=self.scale, event_dim=event_dim)
+
+        super().__init__(base_distribution, [affine])
+
+    @property
+    def mean(self):
+        """Mean of the transformed distribution: the base mean scaled by `scale`, then shifted by `loc`."""
+        scaled_mean = self.base_dist.mean * self.scale
+        return scaled_mean + self.loc
+
+    @property
+    def variance(self):
+        """Variance of the transformed distribution: the base variance times `scale` squared."""
+        return self.base_dist.variance * self.scale**2
+
+    @property
+    def stddev(self):
+        """Standard deviation of the transformed distribution, i.e. the square root of `variance`."""
+        return self.variance.sqrt()
+
+
+class ParameterProjection(nn.Module):
+    """One `nn.Linear` head per `args_dim` entry; `forward` feeds all head outputs positionally to `domain_map`."""
+
+    def __init__(
+        self, in_features: int, args_dim: Dict[str, int], domain_map: Callable[..., Tuple[torch.Tensor]], **kwargs
+    ) -> None:
+        super().__init__(**kwargs)
+        self.args_dim = args_dim
+        heads = [nn.Linear(in_features, out_dim) for out_dim in args_dim.values()]
+        self.proj, self.domain_map = nn.ModuleList(heads), domain_map
+
+    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor]:
+        return self.domain_map(*(head(x) for head in self.proj))
+
+
+class LambdaLayer(nn.Module):  # adapter: exposes a plain callable through the `nn.Module` call interface
+    def __init__(self, function):
+        super().__init__()
+        self.function = function  # stored as-is and invoked by `forward`
+
+    def forward(self, x, *args):
+        return self.function(x, *args)  # pure delegation; extra positional arguments are passed through
+
+
+class DistributionOutput:
+    """
+    Describes how raw network outputs are turned into a `torch.distributions.Distribution`. Subclasses provide
+    `distribution_class`, the class-level `args_dim` table (argument name -> size per target dimension) and
+    `domain_map`.
+    """
+
+    distribution_class: type
+    in_features: int
+    args_dim: Dict[str, int]
+
+    def __init__(self, dim: int = 1) -> None:
+        self.dim = dim
+        # instance-level copy of the class table with every size multiplied by `dim`
+        self.args_dim = {name: dim * size for name, size in self.args_dim.items()}
+
+    def _base_distribution(self, distr_args):
+        r"""Instantiate `distribution_class` from `distr_args`, without any affine transformation."""
+        base = self.distribution_class(*distr_args)
+        # for `dim > 1` the right-most batch dimension is reinterpreted as an event dimension
+        return base if self.dim == 1 else Independent(base, 1)
+
+    def distribution(
+        self,
+        distr_args,
+        loc: Optional[torch.Tensor] = None,
+        scale: Optional[torch.Tensor] = None,
+    ) -> Distribution:
+        r"""Build the distribution for `distr_args`, wrapped in `AffineTransformed` when `loc` or `scale` is given."""
+        base = self._base_distribution(distr_args)
+        if loc is not None or scale is not None:
+            return AffineTransformed(base, loc=loc, scale=scale, event_dim=self.event_dim)
+        return base
+
+    @property
+    def event_shape(self) -> Tuple:
+        r"""Shape of each individual event contemplated by the distributions that this object constructs."""
+        return (self.dim,) if self.dim != 1 else ()
+
+    @property
+    def event_dim(self) -> int:
+        r"""Number of event dimensions, i.e. the length of the `event_shape` tuple."""
+        return len(self.event_shape)
+
+    @property
+    def value_in_support(self) -> float:
+        r"""
+        A float that will have a valid numeric value when computing the log-loss of the corresponding distribution;
+        0.0 by default. This value will be used when padding data series.
+        """
+        return 0.0
+
+    def get_parameter_projection(self, in_features: int) -> nn.Module:
+        r"""
+        Return the parameter projection layer that maps `in_features`-sized inputs to the distribution arguments.
+        """
+        constrain = LambdaLayer(self.domain_map)
+        return ParameterProjection(in_features=in_features, args_dim=self.args_dim, domain_map=constrain)
+
+    def domain_map(self, *args: torch.Tensor):
+        r"""
+        Converts arguments to the right shape and domain. The domain depends on the type of distribution, while the
+        correct shape is obtained by reshaping the trailing axis in such a way that the returned tensors define a
+        distribution of the right event_shape. Must be overridden by subclasses.
+        """
+        raise NotImplementedError()
+
+    @classmethod
+    def squareplus(cls, x: torch.Tensor) -> torch.Tensor:
+        r"""
+        Helper to map inputs to the positive orthant by applying the square-plus operation. Reference:
+        https://twitter.com/jon_barron/status/1387167648669048833
+        """
+        root = torch.sqrt(torch.square(x) + 4.0)
+        return (x + root) / 2.0
+
+
+class StudentTOutput(DistributionOutput):
+    args_dim: Dict[str, int] = {"df": 1, "loc": 1, "scale": 1}
+    distribution_class: type = StudentT
+
+    @classmethod
+    def domain_map(cls, df: torch.Tensor, loc: torch.Tensor, scale: torch.Tensor):
+        """Constrain raw projections: `scale > 0` and `df > 2` via square-plus; `loc` stays unbounded."""
+        positive_scale, bounded_df = cls.squareplus(scale), 2.0 + cls.squareplus(df)
+        return bounded_df.squeeze(-1), loc.squeeze(-1), positive_scale.squeeze(-1)
+
+
+class NormalOutput(DistributionOutput):
+    args_dim: Dict[str, int] = {"loc": 1, "scale": 1}
+    distribution_class: type = Normal
+
+    @classmethod
+    def domain_map(cls, loc: torch.Tensor, scale: torch.Tensor):
+        """Keep `loc` unbounded and make `scale` positive via square-plus; both lose a trailing size-1 axis."""
+        return loc.squeeze(-1), cls.squareplus(scale).squeeze(-1)
+
+
+class NegativeBinomialOutput(DistributionOutput):
+    args_dim: Dict[str, int] = {"total_count": 1, "logits": 1}
+    distribution_class: type = NegativeBinomial
+
+    @classmethod
+    def domain_map(cls, total_count: torch.Tensor, logits: torch.Tensor):
+        """Make `total_count` positive via square-plus; `logits` stay unbounded. Both lose a trailing size-1 axis."""
+        total_count = cls.squareplus(total_count)
+        return total_count.squeeze(-1), logits.squeeze(-1)
+
+    def _base_distribution(self, distr_args) -> Distribution:
+        total_count, logits = distr_args
+        base = self.distribution_class(total_count=total_count, logits=logits)
+        return base if self.dim == 1 else Independent(base, 1)
+
+    # Overwrites the parent class method. We cannot scale using the affine
+    # transformation since negative binomial should return integers. Instead
+    # we scale the parameters.
+    def distribution(
+        self, distr_args, loc: Optional[torch.Tensor] = None, scale: Optional[torch.Tensor] = None
+    ) -> Distribution:
+        """Build the distribution; `scale` is folded into the logits and `loc` is not used."""
+        total_count, logits = distr_args
+
+        if scale is not None:
+            # See scaling property of Gamma. Out-of-place on purpose: `logits += ...` would modify the
+            # caller's tensor, so a second call with the same `distr_args` would apply the scale twice.
+            logits = logits + scale.log()
+        return self._base_distribution((total_count, logits))
+
+
+class FeatureEmbedder(nn.Module):
+    """Embeds each categorical feature with its own `nn.Embedding` and concatenates the results on the last axis."""
+
+    def __init__(self, cardinalities: List[int], embedding_dims: List[int]) -> None:
+        super().__init__()
+
+        self.num_features = len(cardinalities)
+        tables = [nn.Embedding(size, width) for size, width in zip(cardinalities, embedding_dims)]
+        self.embedders = nn.ModuleList(tables)
+
+    def forward(self, features: torch.Tensor) -> torch.Tensor:
+        # Split the last dimension into one slice per feature (a single feature needs no splitting);
+        # once squeezed, each slice has shape (N,T) or (N) according to the original author's note.
+        if self.num_features <= 1:
+            per_feature = [features]
+        else:
+            per_feature = torch.chunk(features, self.num_features, dim=-1)
+
+        embedded = []
+        for table, column in zip(self.embedders, per_feature):
+            embedded.append(table(column.squeeze(-1)))
+        return torch.cat(embedded, dim=-1)
+
+
+class MeanScaler(nn.Module):
+    """
+    Scales `data` by the weighted mean of its absolute values taken along dimension `dim`.
+
+    Args:
+        dim (`int`):
+            Dimension along which to compute the scale.
+        keepdim (`bool`, *optional*, defaults to `False`):
+            Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it.
+        minimum_scale (`float`, *optional*, defaults to 1e-10):
+            Lower bound applied to the computed scale, e.g. for elements that are constantly zero along `dim`.
+    """
+
+    def __init__(self, dim: int, keepdim: bool = False, minimum_scale: float = 1e-10):
+        super().__init__()
+        if not dim > 0:
+            raise ValueError("Cannot compute scale along dim = 0 (batch dimension), please provide dim > 0")
+        self.dim = dim
+        self.keepdim = keepdim
+        self.register_buffer("minimum_scale", torch.tensor(minimum_scale))
+
+    def forward(self, data: torch.Tensor, weights: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Scale `data` by its weighted mean absolute value along `dim`.
+
+        Returns a tuple `(data / scale, scale)`; `scale` keeps dimension `dim` (with length 1) only when
+        `keepdim` is set.
+        """
+        # reduce along `dim`; per the original author's note these have shape (N, C)
+        weight_total = weights.sum(dim=self.dim)
+        abs_weighted_sum = (data.abs() * weights).sum(dim=self.dim)
+
+        # batch-wide fallback scale per dimension; the denominator is floored at 1
+        batch_weight = weight_total.sum(dim=0)
+        batch_denominator = torch.max(batch_weight, torch.ones_like(batch_weight))
+        fallback_scale = abs_weighted_sum.sum(dim=0) / batch_denominator
+
+        # per-item, per-dimension scale, again with the denominator floored at 1
+        item_denominator = torch.max(weight_total, torch.ones_like(weight_total))
+        item_scale = abs_weighted_sum / item_denominator
+
+        # items whose weighted sum is not positive (nothing observed, or only zeros) use the batch-wide scale
+        has_signal = abs_weighted_sum > torch.zeros_like(abs_weighted_sum)
+        chosen = torch.where(has_signal, item_scale, fallback_scale * torch.ones_like(weight_total))
+
+        # never go below `minimum_scale`; the scale is detached and regains `dim` so it broadcasts against `data`
+        scale = torch.max(self.minimum_scale, chosen).detach().unsqueeze(dim=self.dim)
+
+        scaled_data = data / scale
+        if self.keepdim:
+            return scaled_data, scale
+        return scaled_data, scale.squeeze(dim=self.dim)
+
+
+class NOPScaler(nn.Module):
+    """
+    Assigns a scaling factor equal to 1 along dimension `dim`, and therefore applies no scaling to the input data.
+
+    Args:
+        dim (`int`):
+            Dimension along which to compute the scale.
+        keepdim (`bool`, *optional*, defaults to `False`):
+            Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it.
+    """
+
+    def __init__(self, dim: int, keepdim: bool = False):
+        super().__init__()
+        self.dim = dim
+        self.keepdim = keepdim
+
+    def forward(self, data: torch.Tensor, observed_indicator: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        scale = torch.ones_like(data).mean(dim=self.dim, keepdim=self.keepdim)
+        return data, scale
+
+
+def weighted_average(input_tensor: torch.Tensor, weights: Optional[torch.Tensor] = None, dim=None) -> torch.Tensor:
+    """
+    Computes the weighted average of a given tensor across a given `dim`, masking values associated with weight zero,
+    meaning instead of `nan * 0 = nan` you will get `0 * 0 = 0`. The sum of weights is clamped to a minimum of 1.
+
+    Args:
+        input_tensor (`torch.FloatTensor`):
+            Input tensor, of which the average must be computed.
+        weights (`torch.FloatTensor`, *optional*):
+            Weights tensor, of the same shape as `input_tensor`. If `None`, a plain mean is returned.
+        dim (`int`, *optional*):
+            The dim along which to average `input_tensor`. If `None`, all elements are averaged.
+
+    Returns:
+        `torch.FloatTensor`: The tensor with values averaged along the specified `dim`.
+    """
+    if weights is None:
+        return input_tensor.mean(dim=dim)
+    # compare against None explicitly: `dim=0` is a valid (falsy) axis and must not collapse to a full reduction
+    weighted_tensor = torch.where(weights != 0, input_tensor * weights, torch.zeros_like(input_tensor))
+    sum_weights = torch.clamp(weights.sum(dim=dim) if dim is not None else weights.sum(), min=1.0)
+    return (weighted_tensor.sum(dim=dim) if dim is not None else weighted_tensor.sum()) / sum_weights
+
+
+class NegativeLogLikelihood:
+    """
+    Computes the negative log likelihood loss from input distribution with respect to target.
+    """
+
+    def __call__(self, input: torch.distributions.Distribution, target: torch.Tensor) -> torch.Tensor:
+        return -input.log_prob(target)
+
+
+# Copied from transformers.models.bart.modeling_bart._make_causal_mask
+def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_values_length: int = 0):
+    """
+    Make causal mask used for bi-directional self-attention.
+    """
+    bsz, tgt_len = input_ids_shape
+    mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min))
+    mask_cond = torch.arange(mask.size(-1))
+    mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
+    mask = mask.to(dtype)
+
+    if past_key_values_length > 0:
+        mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype), mask], dim=-1)
+    return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
+
+
+# Copied from transformers.models.bart.modeling_bart._expand_mask
+def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
+    """
+    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+    """
+    bsz, src_len = mask.size()
+    tgt_len = tgt_len if tgt_len is not None else src_len
+
+    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
+
+    inverted_mask = 1.0 - expanded_mask
+
+    return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
+
+
+@dataclass
+class Seq2SeqTimeSeriesModelOutput(ModelOutput):
+    """
+    Base class for time series model's encoder outputs that also contains pre-computed hidden states that can speed up
+    sequential decoding.
+
+    Args:
+        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+            Sequence of hidden-states at the output of the last layer of the decoder of the model.
+
+            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
+            hidden_size)` is output.
+        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
+            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+
+            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the decoder at the output of each layer plus the optional initial embedding outputs.
+        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attention weights of the decoder, after the attention softmax, used to compute the weighted average in the
+            self-attention heads.
+        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attention weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
+            weighted average in the cross-attention heads.
+        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the encoder of the model.
+        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the encoder at the output of each layer plus the optional initial embedding outputs.
+        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attention weights of the encoder, after the attention softmax, used to compute the weighted average in the
+            self-attention heads.
+        scale (`torch.FloatTensor` of shape `(batch_size,)`, *optional*):
+            Scaling values of each time series' context window which is used to give the model inputs of the same
+            magnitude and then used to rescale to the original scale.
+        static_features (`torch.FloatTensor` of shape `(batch_size, feature size)`, *optional*):
+            Static features of each time series in a batch which are copied to the covariates at inference time.
+    """
+
+    last_hidden_state: torch.FloatTensor = None
+    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+    decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
+    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    scale: Optional[torch.FloatTensor] = None
+    static_features: Optional[torch.FloatTensor] = None
+
+
+@dataclass
+class Seq2SeqTimeSeriesPredictionOutput(ModelOutput):
+    """
+    Base class for time series model's prediction outputs that also contain the loss as well as the parameters of the
+    chosen distribution.
+
+    Args:
+        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `future_values` is provided):
+            Distributional loss.
+        params (`torch.FloatTensor` of shape `(batch_size, num_samples, num_params)`):
+            Parameters of the chosen distribution.
+        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
+            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+
+            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
+        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attention weights of the decoder, after the attention softmax, used to compute the weighted average in the
+            self-attention heads.
+        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attention weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
+            weighted average in the cross-attention heads.
+        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the encoder of the model.
+        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
+        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attention weights of the encoder, after the attention softmax, used to compute the weighted average in the
+            self-attention heads.
+        scale (`torch.FloatTensor` of shape `(batch_size,)`, *optional*):
+            Scaling values of each time series' context window which is used to give the model inputs of the same
+            magnitude and then used to rescale to the original scale.
+        static_features (`torch.FloatTensor` of shape `(batch_size, feature size)`, *optional*):
+            Static features of each time series in a batch which are copied to the covariates at inference time.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    params: Optional[Tuple[torch.FloatTensor]] = None
+    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+    decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
+    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    scale: Optional[torch.FloatTensor] = None
+    static_features: Optional[torch.FloatTensor] = None
+
+
+@dataclass
+class SampleTimeSeriesPredictionOutput(ModelOutput):
+    sequences: torch.FloatTensor = None
+
+
+# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->Informer
+class InformerAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        dropout: float = 0.0,
+        is_decoder: bool = False,
+        bias: bool = True,
+    ):
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.head_dim = embed_dim // num_heads
+
+        if (self.head_dim * num_heads) != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
+                f" and `num_heads`: {num_heads})."
+            )
+        self.scaling = self.head_dim**-0.5
+        self.is_decoder = is_decoder
+
+        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+
+    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        key_value_states: Optional[torch.Tensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        layer_head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """Input shape: Batch x Time x Channel"""
+
+        # if key_value_states are provided this layer is used as a cross-attention layer
+        # for the decoder
+        is_cross_attention = key_value_states is not None
+
+        bsz, tgt_len, _ = hidden_states.size()
+
+        # get query proj
+        query_states = self.q_proj(hidden_states) * self.scaling
+        # get key, value proj
+        # `past_key_value[0].shape[2] == key_value_states.shape[1]`
+        # is checking that the `sequence_length` of the `past_key_value` is the same as
+        # the provided `key_value_states` to support prefix tuning
+        if (
+            is_cross_attention
+            and past_key_value is not None
+            and past_key_value[0].shape[2] == key_value_states.shape[1]
+        ):
+            # reuse k,v, cross_attentions
+            key_states = past_key_value[0]
+            value_states = past_key_value[1]
+        elif is_cross_attention:
+            # cross_attentions
+            key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
+            value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
+        elif past_key_value is not None:
+            # reuse k, v, self_attention
+            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+            key_states = torch.cat([past_key_value[0], key_states], dim=2)
+            value_states = torch.cat([past_key_value[1], value_states], dim=2)
+        else:
+            # self_attention
+            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+
+        if self.is_decoder:
+            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
+            # Further calls to cross_attention layer can then reuse all cross-attention
+            # key/value_states (first "if" case)
+            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
+            # all previous decoder key/value_states. Further calls to uni-directional self-attention
+            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+            # if encoder bi-directional self-attention `past_key_value` is always `None`
+            past_key_value = (key_states, value_states)
+
+        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
+        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
+        key_states = key_states.view(*proj_shape)
+        value_states = value_states.view(*proj_shape)
+
+        src_len = key_states.size(1)
+        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
+
+        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
+            raise ValueError(
+                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
+                )
+            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
+            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+        if layer_head_mask is not None:
+            if layer_head_mask.size() != (self.num_heads,):
+                raise ValueError(
+                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
+                    f" {layer_head_mask.size()}"
+                )
+            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+        if output_attentions:
+            # this operation is a bit awkward, but it's required to
+            # make sure that attn_weights keeps its gradient.
+            # In order to do so, attn_weights have to be reshaped
+            # twice and have to be reused in the following
+            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
+        else:
+            attn_weights_reshaped = None
+
+        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+
+        attn_output = torch.bmm(attn_probs, value_states)
+
+        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
+        attn_output = attn_output.transpose(1, 2)
+
+        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
+        # partitioned aross GPUs when using tensor-parallelism.
+        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
+
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output, attn_weights_reshaped, past_key_value
+
+
+# Copied from transformers.models.bart.modeling_bart.BartEncoderLayer with Bart->Informer
+class InformerEncoderLayer(nn.Module):
+    def __init__(self, config: InformerConfig):
+        super().__init__()
+        self.embed_dim = config.d_model
+        self.self_attn = InformerAttention(
+            embed_dim=self.embed_dim,
+            num_heads=config.encoder_attention_heads,
+            dropout=config.attention_dropout,
+        )
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.dropout = config.dropout
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.activation_dropout = config.activation_dropout
+        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
+        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
+        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        attention_mask: torch.FloatTensor,
+        layer_head_mask: torch.FloatTensor,
+        output_attentions: Optional[bool] = False,
+    ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
+            attention_mask (`torch.FloatTensor`): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
+                `(encoder_attention_heads,)`.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+        """
+        residual = hidden_states
+        hidden_states, attn_weights, _ = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            layer_head_mask=layer_head_mask,
+            output_attentions=output_attentions,
+        )
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = residual + hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+
+        residual = hidden_states
+        hidden_states = self.activation_fn(self.fc1(hidden_states))
+        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+        hidden_states = self.fc2(hidden_states)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = residual + hidden_states
+        hidden_states = self.final_layer_norm(hidden_states)
+
+        if hidden_states.dtype == torch.float16 and (
+            torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
+        ):
+            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
+            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (attn_weights,)
+
+        return outputs
+
+
+# Copied from transformers.models.bart.modeling_bart.BartDecoderLayer with Bart->Informer
+class InformerDecoderLayer(nn.Module):
+    def __init__(self, config: InformerConfig):
+        super().__init__()
+        self.embed_dim = config.d_model
+
+        self.self_attn = InformerAttention(
+            embed_dim=self.embed_dim,
+            num_heads=config.decoder_attention_heads,
+            dropout=config.attention_dropout,
+            is_decoder=True,
+        )
+        self.dropout = config.dropout
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.activation_dropout = config.activation_dropout
+
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.encoder_attn = InformerAttention(
+            self.embed_dim,
+            config.decoder_attention_heads,
+            dropout=config.attention_dropout,
+            is_decoder=True,
+        )
+        self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
+        self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
+        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        layer_head_mask: Optional[torch.Tensor] = None,
+        cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = True,
+    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            encoder_hidden_states (`torch.FloatTensor`):
+                cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
+            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
+                `(encoder_attention_heads,)`.
+            cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
+                size `(decoder_attention_heads,)`.
+            past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+        """
+        residual = hidden_states
+
+        # Self Attention
+        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
+        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
+        # add present self-attn cache to positions 1,2 of present_key_value tuple
+        hidden_states, self_attn_weights, present_key_value = self.self_attn(
+            hidden_states=hidden_states,
+            past_key_value=self_attn_past_key_value,
+            attention_mask=attention_mask,
+            layer_head_mask=layer_head_mask,
+            output_attentions=output_attentions,
+        )
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = residual + hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+
+        # Cross-Attention Block
+        cross_attn_present_key_value = None
+        cross_attn_weights = None
+        if encoder_hidden_states is not None:
+            residual = hidden_states
+
+            # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple
+            cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
+            hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn(
+                hidden_states=hidden_states,
+                key_value_states=encoder_hidden_states,
+                attention_mask=encoder_attention_mask,
+                layer_head_mask=cross_attn_layer_head_mask,
+                past_key_value=cross_attn_past_key_value,
+                output_attentions=output_attentions,
+            )
+            hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+            hidden_states = residual + hidden_states
+            hidden_states = self.encoder_attn_layer_norm(hidden_states)
+
+            # add cross-attn to positions 3,4 of present_key_value tuple
+            present_key_value = present_key_value + cross_attn_present_key_value
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.activation_fn(self.fc1(hidden_states))
+        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+        hidden_states = self.fc2(hidden_states)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = residual + hidden_states
+        hidden_states = self.final_layer_norm(hidden_states)
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (self_attn_weights, cross_attn_weights)
+
+        if use_cache:
+            outputs += (present_key_value,)
+
+        return outputs
+
+
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerPreTrainedModel with TimeSeriesTransformer->Informer
+class InformerPreTrainedModel(PreTrainedModel):
+    config_class = InformerConfig
+    base_model_prefix = "model"
+    main_input_name = "past_values"
+    supports_gradient_checkpointing = True
+
+    def _init_weights(self, module):
+        std = self.config.init_std  # std of the normal init shared by Linear and Embedding weights
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()  # biases start at zero
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()  # padding row is reset to zeros
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        if isinstance(module, (InformerDecoder, InformerEncoder)):  # both define `gradient_checkpointing`
+            module.gradient_checkpointing = value
+
+
+INFORMER_START_DOCSTRING = r"""
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
+    and behavior.
+
+    Parameters:
+        config ([`InformerConfig`]):
+            Model configuration class with all the parameters of the model. Initializing with a config file does not
+            load the weights associated with the model, only the configuration. Check out the
+            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+INFORMER_INPUTS_DOCSTRING = r"""
+    Args:
+        past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
+            Past values of the time series, that serve as context in order to predict the future. These values may
+            contain lags, i.e. additional values from the past which are added in order to serve as "extra context".
+            The `past_values` is what the Transformer encoder gets as input (with optional additional features, such as
+            `static_categorical_features`, `static_real_features`, `past_time_features`).
+
+            The sequence length here is equal to `context_length` + `max(config.lags_sequence)`.
+
+            Missing values need to be replaced with zeros.
+
+        past_time_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_features)`, *optional*):
+            Optional time features, which the model internally will add to `past_values`. These could be things like
+            "month of year", "day of the month", etc. encoded as vectors (for instance as Fourier features). These
+            could also be so-called "age" features, which basically help the model know "at which point in life" a
+            time-series is. Age features have small values for distant past time steps and increase monotonically the
+            more we approach the current time step.
+
+            These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT, where
+            the position encodings are learned from scratch internally as parameters of the model, the Informer
+            requires additional time features to be provided.
+
+            The Informer only learns additional embeddings for `static_categorical_features`.
+
+        past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected in
+            `[0, 1]`:
+
+            - 1 for values that are **observed**,
+            - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
+
+        static_categorical_features (`torch.LongTensor` of shape `(batch_size, number of static categorical features)`, *optional*):
+            Optional static categorical features for which the model will learn an embedding, which it will add to the
+            values of the time series.
+
+            Static categorical features are features which have the same value for all time steps (static over time).
+
+            A typical example of a static categorical feature is a time series ID.
+
+        static_real_features (`torch.FloatTensor` of shape `(batch_size, number of static real features)`, *optional*):
+            Optional static real features which the model will add to the values of the time series.
+
+            Static real features are features which have the same value for all time steps (static over time).
+
+            A typical example of a static real feature is promotion information.
+
+        future_values (`torch.FloatTensor` of shape `(batch_size, prediction_length)`):
+            Future values of the time series, that serve as labels for the model. The `future_values` is what the
+            Transformer needs to learn to output, given the `past_values`.
+
+            See the demo notebook and code snippets for details.
+
+            Missing values need to be replaced with zeros.
+
+        future_time_features (`torch.FloatTensor` of shape `(batch_size, prediction_length, num_features)`, *optional*):
+            Optional time features, which the model internally will add to `future_values`. These could be things like
+            "month of year", "day of the month", etc. encoded as vectors (for instance as Fourier features). These
+            could also be so-called "age" features, which basically help the model know "at which point in life" a
+            time-series is. Age features have small values for distant past time steps and increase monotonically the
+            more we approach the current time step.
+
+            These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT, where
+            the position encodings are learned from scratch internally as parameters of the model, the Informer
+            requires additional features to be provided.
+
+            The Informer only learns additional embeddings for `static_categorical_features`.
+
+        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on certain token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+
+        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Mask to avoid performing attention on certain token indices. By default, a causal mask will be used, to
+            make sure the model can only look at previous inputs in order to predict the future.
+
+        head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+            Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        decoder_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+            Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        encoder_outputs (`tuple(tuple(torch.FloatTensor))`, *optional*):
+            Tuple consists of `last_hidden_state`, `hidden_states` (*optional*) and `attentions` (*optional*)
+            `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` (*optional*) is a sequence of
+            hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
+        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+            `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of shape
+            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+
+            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+
+            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerEncoder with TimeSeriesTransformer->Informer
+class InformerEncoder(InformerPreTrainedModel):
+    """
+    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
+    [`InformerEncoderLayer`].
+
+    Args:
+        config: InformerConfig
+    """
+
+    def __init__(self, config: InformerConfig):
+        super().__init__(config)
+
+        self.dropout = config.dropout
+        self.layerdrop = config.encoder_layerdrop
+
+        embed_dim = config.d_model
+
+        self.layers = nn.ModuleList([InformerEncoderLayer(config) for _ in range(config.encoder_layers)])
+        self.layernorm_embedding = nn.LayerNorm(embed_dim)
+
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def forward(
+        self,
+        attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutput]:
+        r"""
+        Args:
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
+
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+                Embedded representation of the encoder inputs. It is fed directly to the embedding layer norm, so
+                although the argument defaults to `None` a tensor has to be passed (this encoder has no embedding
+                lookup of its own).
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        hidden_states = inputs_embeds
+        hidden_states = self.layernorm_embedding(hidden_states)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+        # expand attention_mask
+        if attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            attention_mask = _expand_mask(attention_mask, inputs_embeds.dtype)
+
+        encoder_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+
+        # check if head_mask has a correct number of layers specified if desired
+        if head_mask is not None:
+            if head_mask.size()[0] != (len(self.layers)):
+                raise ValueError(
+                    f"The head_mask should be specified for {len(self.layers)} layers, but it is for"
+                    f" {head_mask.size()[0]}."
+                )
+
+        for idx, encoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)  # input of this layer
+            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
+            dropout_probability = random.uniform(0, 1)
+            if self.training and (dropout_probability < self.layerdrop):  # skip the layer
+                layer_outputs = (None, None)  # placeholder so `layer_outputs[1]` below still resolves
+            else:
+                if self.gradient_checkpointing and self.training:
+
+                    def create_custom_forward(module):
+                        def custom_forward(*inputs):
+                            return module(*inputs, output_attentions)  # flag goes in as the last positional arg
+
+                        return custom_forward
+
+                    layer_outputs = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(encoder_layer),
+                        hidden_states,
+                        attention_mask,
+                        (head_mask[idx] if head_mask is not None else None),
+                    )
+                else:
+                    layer_outputs = encoder_layer(
+                        hidden_states,
+                        attention_mask,
+                        layer_head_mask=(head_mask[idx] if head_mask is not None else None),
+                        output_attentions=output_attentions,
+                    )
+
+                hidden_states = layer_outputs[0]  # not reached for a dropped layer: its input passes through
+
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)  # None for a dropped layer
+
+        if output_hidden_states:
+            encoder_states = encoder_states + (hidden_states,)  # output of the last layer
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+        )
+
+
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerDecoder with TimeSeriesTransformer->Informer
+class InformerDecoder(InformerPreTrainedModel):
+    """
+    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a
+    [`InformerDecoderLayer`]
+
+    Args:
+        config: InformerConfig
+    """
+
+    def __init__(self, config: InformerConfig):
+        super().__init__(config)
+        self.dropout = config.dropout
+        self.layerdrop = config.decoder_layerdrop
+
+        self.layers = nn.ModuleList([InformerDecoderLayer(config) for _ in range(config.decoder_layers)])
+        self.layernorm_embedding = nn.LayerNorm(config.d_model)
+
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
+        # create causal mask
+        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+        combined_attention_mask = None
+        if input_shape[-1] > 1:  # no causal mask is built for a single target position
+            combined_attention_mask = _make_causal_mask(
+                input_shape, inputs_embeds.dtype, past_key_values_length=past_key_values_length
+            ).to(inputs_embeds.device)
+
+        if attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
+                inputs_embeds.device
+            )
+            combined_attention_mask = (
+                expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
+            )
+
+        return combined_attention_mask
+
+    def forward(
+        self,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        cross_attn_head_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
+        r"""
+        Args:
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
+                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
+                of the decoder.
+            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
+                Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
+                selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
+
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+
+            cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+                Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing
+                cross-attention on hidden heads. Mask values selected in `[0, 1]`:
+
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+
+            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+                Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
+                shape `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of
+                shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+
+                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
+                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+
+                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
+                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
+                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+                Embedded representation of the decoder inputs. Its shape and dtype are used to build the attention
+                masks and it is fed directly to the embedding layer norm, so although the argument defaults to
+                `None` a tensor has to be passed (this decoder has no embedding lookup of its own).
+
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        input_shape = inputs_embeds.size()[:-1]
+
+        # past_key_values_length
+        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
+
+        attention_mask = self._prepare_decoder_attention_mask(
+            attention_mask, input_shape, inputs_embeds, past_key_values_length
+        )
+
+        # expand encoder attention mask
+        if encoder_hidden_states is not None and encoder_attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1])
+
+        hidden_states = inputs_embeds
+        hidden_states = self.layernorm_embedding(hidden_states)
+
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
+        next_decoder_cache = () if use_cache else None
+
+        # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
+        for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
+            if attn_mask is not None:
+                if attn_mask.size()[0] != (len(self.layers)):
+                    raise ValueError(
+                        f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
+                        f" {attn_mask.size()[0]}."
+                    )
+
+        for idx, decoder_layer in enumerate(self.layers):
+            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            dropout_probability = random.uniform(0, 1)
+            if self.training and (dropout_probability < self.layerdrop):
+                continue  # dropped layer: nothing is recorded for it in the cache/attention tuples
+
+            past_key_value = past_key_values[idx] if past_key_values is not None else None
+
+            if self.gradient_checkpointing and self.training:
+
+                if use_cache:
+                    logger.warning(
+                        "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                    )
+                    use_cache = False
+
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        # `inputs` ends with the explicit None for past_key_value; the two flags follow it
+                        return module(*inputs, output_attentions, use_cache)
+
+                    return custom_forward
+
+                layer_outputs = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(decoder_layer),
+                    hidden_states,
+                    attention_mask,
+                    encoder_hidden_states,
+                    encoder_attention_mask,
+                    head_mask[idx] if head_mask is not None else None,
+                    cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None,
+                    None,  # past_key_value: caching was switched off above
+                )
+            else:
+
+                layer_outputs = decoder_layer(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    encoder_hidden_states=encoder_hidden_states,
+                    encoder_attention_mask=encoder_attention_mask,
+                    layer_head_mask=(head_mask[idx] if head_mask is not None else None),
+                    cross_attn_layer_head_mask=(
+                        cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None
+                    ),
+                    past_key_value=past_key_value,
+                    output_attentions=output_attentions,
+                    use_cache=use_cache,
+                )
+            hidden_states = layer_outputs[0]
+
+            if use_cache:
+                next_decoder_cache += (layer_outputs[3 if output_attentions else 1],)  # cache is the last entry
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+                if encoder_hidden_states is not None:
+                    all_cross_attentions += (layer_outputs[2],)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        next_cache = next_decoder_cache if use_cache else None
+        if not return_dict:
+            return tuple(
+                v
+                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions]
+                if v is not None
+            )
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+            cross_attentions=all_cross_attentions,
+        )
+
+
+@add_start_docstrings(
+    "The bare Informer Model outputting raw hidden-states without any specific head on top.",
+    INFORMER_START_DOCSTRING,
+)
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerModel with TimeSeriesTransformer->Informer,TIME_SERIES_TRANSFORMER->INFORMER
+class InformerModel(InformerPreTrainedModel):
+    def __init__(self, config: InformerConfig):
+        super().__init__(config)
+
+        if config.scaling:  # selects the scaler whose scale later divides the model inputs
+            self.scaler = MeanScaler(dim=1, keepdim=True)
+        else:
+            self.scaler = NOPScaler(dim=1, keepdim=True)  # NOTE(review): presumably a unit scale - confirm
+
+        self.embedder = FeatureEmbedder(  # presumably embeds the static categorical features - confirm
+            cardinalities=config.cardinality,
+            embedding_dims=config.embedding_dimension,
+        )
+
+        # transformer encoder and decoder stacks
+        self.encoder = InformerEncoder(config)
+        self.decoder = InformerDecoder(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @property
+    def _past_length(self) -> int:
+        return self.config.context_length + max(self.config.lags_sequence)  # context window plus the largest lag
+
+    def get_lagged_subsequences(
+        self, sequence: torch.Tensor, subsequences_length: int, shift: int = 0
+    ) -> torch.Tensor:
+        """
+        Returns lagged subsequences of a given sequence. Returns a tensor of shape (N, S, C, I),
+            where S = subsequences_length and I = len(indices), containing lagged subsequences. Specifically, lagged[i,
+            j, :, k] = sequence[i, -indices[k]-S+j, :].
+
+        Args:
+            sequence: Tensor
+                The sequence from which lagged subsequences should be extracted. Shape: (N, T, C).
+            subsequences_length : int
+                Length of the subsequences to be extracted.
+            shift: int
+                Shift the lags by this amount back.
+        """
+        sequence_length = sequence.shape[1]
+        indices = [lag - shift for lag in self.config.lags_sequence]  # effective lags once `shift` is applied
+
+        try:  # NOTE(review): assert-based validation is stripped when Python runs with -O
+            assert max(indices) + subsequences_length <= sequence_length, (
+                f"lags cannot go further than history length, found lag {max(indices)} "
+                f"while history length is only {sequence_length}"
+            )
+        except AssertionError as e:
+            e.args += (max(indices), sequence_length)  # attach the offending values to the error
+            raise
+
+        lagged_values = []
+        for lag_index in indices:
+            begin_index = -lag_index - subsequences_length
+            end_index = -lag_index if lag_index > 0 else None  # None: run to the end (a stop of -0 is empty)
+            lagged_values.append(sequence[:, begin_index:end_index, ...])
+        return torch.stack(lagged_values, dim=-1)  # lags become a new trailing dimension
+
+    def create_network_inputs(
+        self,
+        past_values: torch.Tensor,
+        past_time_features: torch.Tensor,
+        static_categorical_features: torch.Tensor,
+        static_real_features: torch.Tensor,
+        past_observed_mask: Optional[torch.Tensor] = None,
+        future_values: Optional[torch.Tensor] = None,
+        future_time_features: Optional[torch.Tensor] = None,
+    ):
+        # time feature
+        time_feat = (
+            torch.cat(
+                (
+                    past_time_features[:, self._past_length - self.config.context_length :, ...],
+                    future_time_features,
+                ),
+                dim=1,
+            )
+            if future_values is not None
+            else past_time_features[:, self._past_length - self.config.context_length :, ...]
+        )
+
+        # target
+        if past_observed_mask is None:
+            past_observed_mask = torch.ones_like(past_values)
+
+        context = past_values[:, -self.config.context_length :]
+        observed_context = past_observed_mask[:, -self.config.context_length :]
+        _, scale = self.scaler(context, observed_context)
+
+        inputs = (
+            torch.cat((past_values, future_values), dim=1) / scale
+            if future_values is not None
+            else past_values / scale
+        )
+
+        inputs_length = (
+            self._past_length + self.config.prediction_length if future_values is not None else self._past_length
+        )
+        try:
+            assert inputs.shape[1] == inputs_length, (
+                f"input length {inputs.shape[1]} and dynamic feature lengths {inputs_length} does not match",
+            )
+        except AssertionError as e:
+            e.args += (inputs.shape[1], inputs_length)
+            raise
+
+        subsequences_length = (
+            self.config.context_length + self.config.prediction_length
+            if future_values is not None
+            else self.config.context_length
+        )
+
+        # embeddings
+        embedded_cat = self.embedder(static_categorical_features)
+        # static features
+        log_scale = scale.log() if self.config.input_size == 1 else scale.squeeze(1).log()
+        static_feat = torch.cat((embedded_cat, static_real_features, log_scale), dim=1)
+        expanded_static_feat = static_feat.unsqueeze(1).expand(-1, time_feat.shape[1], -1)
+
+        # all features
+        features = torch.cat((expanded_static_feat, time_feat), dim=-1)
+
+        lagged_sequence = self.get_lagged_subsequences(sequence=inputs, subsequences_length=subsequences_length)
+
+        lags_shape = lagged_sequence.shape
+        reshaped_lagged_sequence = lagged_sequence.reshape(lags_shape[0], lags_shape[1], -1)
+
+        transformer_inputs = torch.cat((reshaped_lagged_sequence, features), dim=-1)
+
+        return transformer_inputs, scale, static_feat
+
+    def enc_dec_outputs(self, transformer_inputs):
+        enc_input = transformer_inputs[:, : self.config.context_length, ...]
+        dec_input = transformer_inputs[:, self.config.context_length :, ...]
+
+        encoder_outputs = self.encoder(inputs_embeds=enc_input)
+        decoder_outputs = self.decoder(
+            inputs_embeds=dec_input, encoder_hidden_states=encoder_outputs.last_hidden_state
+        )
+        return encoder_outputs, decoder_outputs
+
+    def get_encoder(self):
+        return self.encoder
+
+    def get_decoder(self):
+        return self.decoder
+
+    @add_start_docstrings_to_model_forward(INFORMER_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=Seq2SeqTimeSeriesModelOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        past_values: torch.Tensor,
+        past_time_features: torch.Tensor,
+        past_observed_mask: torch.Tensor,
+        static_categorical_features: torch.Tensor,
+        static_real_features: torch.Tensor,
+        future_values: Optional[torch.Tensor] = None,
+        future_time_features: Optional[torch.Tensor] = None,
+        decoder_attention_mask: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        decoder_head_mask: Optional[torch.Tensor] = None,
+        cross_attn_head_mask: Optional[torch.Tensor] = None,
+        encoder_outputs: Optional[List[torch.FloatTensor]] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        use_cache: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Seq2SeqTimeSeriesModelOutput, Tuple]:
+        r"""
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from huggingface_hub import hf_hub_download
+        >>> import torch
+        >>> from transformers import InformerModel
+
+        >>> file = hf_hub_download(
+        ...     repo_id="kashif/tourism-monthly-batch", filename="train-batch.pt", repo_type="dataset"
+        ... )
+        >>> batch = torch.load(file)
+
+        >>> model = InformerModel.from_pretrained("huggingface/time-series-transformer-tourism-monthly")
+
+        >>> # during training, one provides both past and future values
+        >>> # as well as possible additional features
+        >>> outputs = model(
+        ...     past_values=batch["past_values"],
+        ...     past_time_features=batch["past_time_features"],
+        ...     past_observed_mask=batch["past_observed_mask"],
+        ...     static_categorical_features=batch["static_categorical_features"],
+        ...     static_real_features=batch["static_real_features"],
+        ...     future_values=batch["future_values"],
+        ...     future_time_features=batch["future_time_features"],
+        ... )
+
+        >>> last_hidden_state = outputs.last_hidden_state
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        transformer_inputs, scale, static_feat = self.create_network_inputs(
+            past_values=past_values,
+            past_time_features=past_time_features,
+            past_observed_mask=past_observed_mask,
+            static_categorical_features=static_categorical_features,
+            static_real_features=static_real_features,
+            future_values=future_values,
+            future_time_features=future_time_features,
+        )
+
+        if encoder_outputs is None:
+            enc_input = transformer_inputs[:, : self.config.context_length, ...]
+            encoder_outputs = self.encoder(
+                inputs_embeds=enc_input,
+                head_mask=head_mask,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
+        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+            encoder_outputs = BaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+            )
+
+        dec_input = transformer_inputs[:, self.config.context_length :, ...]
+        decoder_outputs = self.decoder(
+            inputs_embeds=dec_input,
+            attention_mask=decoder_attention_mask,
+            encoder_hidden_states=encoder_outputs[0],
+            head_mask=decoder_head_mask,
+            cross_attn_head_mask=cross_attn_head_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        if not return_dict:
+            return decoder_outputs + encoder_outputs + (scale, static_feat)
+
+        return Seq2SeqTimeSeriesModelOutput(
+            last_hidden_state=decoder_outputs.last_hidden_state,
+            past_key_values=decoder_outputs.past_key_values,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+            scale=scale,
+            static_features=static_feat,
+        )
+
+
+@add_start_docstrings(
+    "The Informer Model with a distribution head on top for time-series forecasting.",
+    INFORMER_START_DOCSTRING,
+)
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerForPrediction with TimeSeriesTransformer->Informer,TIME_SERIES_TRANSFORMER->INFORMER
+class InformerForPrediction(InformerPreTrainedModel):
+    def __init__(self, config: InformerConfig):
+        super().__init__(config)
+        self.model = InformerModel(config)
+        if config.distribution_output == "student_t":
+            self.distribution_output = StudentTOutput(dim=config.input_size)
+        elif config.distribution_output == "normal":
+            self.distribution_output = NormalOutput(dim=config.input_size)
+        elif config.distribution_output == "negative_binomial":
+            self.distribution_output = NegativeBinomialOutput(dim=config.input_size)
+        else:
+            raise ValueError(f"Unknown distribution output {config.distribution_output}")
+
+        self.parameter_projection = self.distribution_output.get_parameter_projection(self.model.config.d_model)
+        self.target_shape = self.distribution_output.event_shape
+
+        if config.loss == "nll":
+            self.loss = NegativeLogLikelihood()
+        else:
+            raise ValueError(f"Unknown loss function {config.loss}")
+
+        # Initialize weights of distribution_output and apply final processing
+        self.post_init()
+
+    def output_params(self, dec_output):
+        return self.parameter_projection(dec_output)
+
+    def get_encoder(self):
+        return self.model.get_encoder()
+
+    def get_decoder(self):
+        return self.model.get_decoder()
+
+    @torch.jit.ignore
+    def output_distribution(self, params, scale=None, trailing_n=None) -> torch.distributions.Distribution:
+        sliced_params = params
+        if trailing_n is not None:
+            sliced_params = [p[:, -trailing_n:] for p in params]
+        return self.distribution_output.distribution(sliced_params, scale=scale)
+
+    @add_start_docstrings_to_model_forward(INFORMER_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=Seq2SeqTimeSeriesModelOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        past_values: torch.Tensor,
+        past_time_features: torch.Tensor,
+        past_observed_mask: torch.Tensor,
+        static_categorical_features: torch.Tensor,
+        static_real_features: torch.Tensor,
+        future_values: Optional[torch.Tensor] = None,
+        future_time_features: Optional[torch.Tensor] = None,
+        future_observed_mask: Optional[torch.Tensor] = None,
+        decoder_attention_mask: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        decoder_head_mask: Optional[torch.Tensor] = None,
+        cross_attn_head_mask: Optional[torch.Tensor] = None,
+        encoder_outputs: Optional[List[torch.FloatTensor]] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        use_cache: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Seq2SeqTimeSeriesModelOutput, Tuple]:
+        r"""
+        Returns:
+
+        future_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Boolean mask to indicate which `future_values` were observed and which were missing. Mask values selected
+            in `[0, 1]`:
+
+            - 1 for values that are **observed**,
+            - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
+
+            This mask is used to filter out missing values for the final loss calculation.
+
+        Examples:
+
+        ```python
+        >>> from huggingface_hub import hf_hub_download
+        >>> import torch
+        >>> from transformers import InformerForPrediction
+
+        >>> file = hf_hub_download(
+        ...     repo_id="kashif/tourism-monthly-batch", filename="train-batch.pt", repo_type="dataset"
+        ... )
+        >>> batch = torch.load(file)
+
+        >>> model = InformerForPrediction.from_pretrained(
+        ...     "huggingface/time-series-transformer-tourism-monthly"
+        ... )
+
+        >>> # during training, one provides both past and future values
+        >>> # as well as possible additional features
+        >>> outputs = model(
+        ...     past_values=batch["past_values"],
+        ...     past_time_features=batch["past_time_features"],
+        ...     past_observed_mask=batch["past_observed_mask"],
+        ...     static_categorical_features=batch["static_categorical_features"],
+        ...     static_real_features=batch["static_real_features"],
+        ...     future_values=batch["future_values"],
+        ...     future_time_features=batch["future_time_features"],
+        ... )
+
+        >>> loss = outputs.loss
+        >>> loss.backward()
+
+        >>> # during inference, one only provides past values
+        >>> # as well as possible additional features
+        >>> # the model autoregressively generates future values
+        >>> outputs = model.generate(
+        ...     past_values=batch["past_values"],
+        ...     past_time_features=batch["past_time_features"],
+        ...     past_observed_mask=batch["past_observed_mask"],
+        ...     static_categorical_features=batch["static_categorical_features"],
+        ...     static_real_features=batch["static_real_features"],
+        ...     future_time_features=batch["future_time_features"],
+        ... )
+
+        >>> mean_prediction = outputs.sequences.mean(dim=1)
+        ```"""
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        if future_values is not None:
+            use_cache = False
+
+        outputs = self.model(
+            past_values=past_values,
+            past_time_features=past_time_features,
+            past_observed_mask=past_observed_mask,
+            static_categorical_features=static_categorical_features,
+            static_real_features=static_real_features,
+            future_values=future_values,
+            future_time_features=future_time_features,
+            decoder_attention_mask=decoder_attention_mask,
+            head_mask=head_mask,
+            decoder_head_mask=decoder_head_mask,
+            cross_attn_head_mask=cross_attn_head_mask,
+            encoder_outputs=encoder_outputs,
+            past_key_values=past_key_values,
+            output_hidden_states=output_hidden_states,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            return_dict=return_dict,
+        )
+
+        prediction_loss = None
+        params = None
+        if future_values is not None:
+            params = self.output_params(outputs[0])  # outputs.last_hidden_state
+            distribution = self.output_distribution(params, outputs[-2])  # outputs.scale
+
+            loss = self.loss(distribution, future_values)
+
+            if future_observed_mask is None:
+                future_observed_mask = torch.ones_like(future_values)
+
+            if len(self.target_shape) == 0:
+                loss_weights = future_observed_mask
+            else:
+                loss_weights, _ = future_observed_mask.min(dim=-1, keepdim=False)
+
+            prediction_loss = weighted_average(loss, weights=loss_weights)
+
+        if not return_dict:
+            outputs = ((params,) + outputs[1:]) if params is not None else outputs[1:]
+            return ((prediction_loss,) + outputs) if prediction_loss is not None else outputs
+
+        return Seq2SeqTimeSeriesPredictionOutput(
+            loss=prediction_loss,
+            params=params,
+            past_key_values=outputs.past_key_values,
+            decoder_hidden_states=outputs.decoder_hidden_states,
+            decoder_attentions=outputs.decoder_attentions,
+            cross_attentions=outputs.cross_attentions,
+            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
+            encoder_hidden_states=outputs.encoder_hidden_states,
+            encoder_attentions=outputs.encoder_attentions,
+            scale=outputs.scale,
+            static_features=outputs.static_features,
+        )
+
+    @torch.no_grad()
+    def generate(
+        self,
+        static_categorical_features: torch.Tensor,
+        static_real_features: torch.Tensor,
+        past_time_features: torch.Tensor,
+        past_values: torch.Tensor,
+        past_observed_mask: torch.Tensor,
+        future_time_features: Optional[torch.Tensor],
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+    ) -> torch.Tensor:
+        outputs = self(
+            static_categorical_features=static_categorical_features,
+            static_real_features=static_real_features,
+            past_time_features=past_time_features,
+            past_values=past_values,
+            past_observed_mask=past_observed_mask,
+            future_time_features=future_time_features,
+            future_values=None,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+            use_cache=True,
+        )
+
+        decoder = self.model.get_decoder()
+        enc_last_hidden = outputs.encoder_last_hidden_state
+        scale = outputs.scale
+        static_feat = outputs.static_features
+
+        num_parallel_samples = self.config.num_parallel_samples
+        repeated_scale = scale.repeat_interleave(repeats=num_parallel_samples, dim=0)
+
+        repeated_past_values = past_values.repeat_interleave(repeats=num_parallel_samples, dim=0) / repeated_scale
+
+        expanded_static_feat = static_feat.unsqueeze(1).expand(-1, future_time_features.shape[1], -1)
+        features = torch.cat((expanded_static_feat, future_time_features), dim=-1)
+        repeated_features = features.repeat_interleave(repeats=num_parallel_samples, dim=0)
+
+        repeated_enc_last_hidden = enc_last_hidden.repeat_interleave(repeats=num_parallel_samples, dim=0)
+
+        future_samples = []
+
+        # greedy decoding
+        for k in range(self.config.prediction_length):
+            lagged_sequence = self.model.get_lagged_subsequences(
+                sequence=repeated_past_values,
+                subsequences_length=1 + k,
+                shift=1,
+            )
+
+            lags_shape = lagged_sequence.shape
+            reshaped_lagged_sequence = lagged_sequence.reshape(lags_shape[0], lags_shape[1], -1)
+
+            decoder_input = torch.cat((reshaped_lagged_sequence, repeated_features[:, : k + 1]), dim=-1)
+
+            dec_output = decoder(inputs_embeds=decoder_input, encoder_hidden_states=repeated_enc_last_hidden)
+            dec_last_hidden = dec_output.last_hidden_state
+
+            params = self.parameter_projection(dec_last_hidden[:, -1:])
+            distr = self.output_distribution(params, scale=repeated_scale)
+            next_sample = distr.sample()
+
+            repeated_past_values = torch.cat((repeated_past_values, next_sample / repeated_scale), dim=1)
+            future_samples.append(next_sample)
+
+        concat_future_samples = torch.cat(future_samples, dim=1)
+
+        return SampleTimeSeriesPredictionOutput(
+            sequences=concat_future_samples.reshape(
+                (-1, num_parallel_samples, self.config.prediction_length) + self.target_shape,
+            )
+        )
diff --git a/tests/models/informer/__init__.py b/tests/models/informer/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/models/informer/test_modeling_informer.py b/tests/models/informer/test_modeling_informer.py
new file mode 100644
index 000000000000..ecbea487e790
--- /dev/null
+++ b/tests/models/informer/test_modeling_informer.py
@@ -0,0 +1,442 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Testing suite for the PyTorch Informer model. """
+
+import inspect
+import tempfile
+import unittest
+
+from huggingface_hub import hf_hub_download
+from transformers import is_torch_available
+from transformers.testing_utils import is_flaky, require_torch, slow, torch_device
+
+from ...test_configuration_common import ConfigTester
+from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
+
+
+TOLERANCE = 1e-4
+
+if is_torch_available():
+    import torch
+
+    from transformers import (
+        InformerConfig,
+        InformerForPrediction,
+        InformerModel,
+    )
+    from transformers.models.informer.modeling_informer import (
+        InformerDecoder,
+        InformerEncoder,
+    )
+
+
+@require_torch
+class InformerModelTester:
+    def __init__(
+        self,
+        parent,
+        batch_size=13,
+        prediction_length=7,
+        context_length=14,
+        cardinality=19,
+        embedding_dimension=5,
+        num_time_features=4,
+        is_training=True,
+        hidden_size=16,
+        num_hidden_layers=2,
+        num_attention_heads=4,
+        intermediate_size=4,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        lags_sequence=[1, 2, 3, 4, 5],
+    ):
+        self.parent = parent
+        self.batch_size = batch_size
+        self.prediction_length = prediction_length
+        self.context_length = context_length
+        self.cardinality = cardinality
+        self.num_time_features = num_time_features
+        self.lags_sequence = lags_sequence
+        self.embedding_dimension = embedding_dimension
+        self.is_training = is_training
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+
+        self.encoder_seq_length = context_length
+        self.decoder_seq_length = prediction_length
+
+    def get_config(self):
+        return InformerConfig(
+            encoder_layers=self.num_hidden_layers,
+            decoder_layers=self.num_hidden_layers,
+            encoder_attention_heads=self.num_attention_heads,
+            decoder_attention_heads=self.num_attention_heads,
+            encoder_ffn_dim=self.intermediate_size,
+            decoder_ffn_dim=self.intermediate_size,
+            dropout=self.hidden_dropout_prob,
+            attention_dropout=self.attention_probs_dropout_prob,
+            prediction_length=self.prediction_length,
+            context_length=self.context_length,
+            lags_sequence=self.lags_sequence,
+            num_time_features=self.num_time_features,
+            num_static_categorical_features=1,
+            cardinality=[self.cardinality],
+            embedding_dimension=[self.embedding_dimension],
+        )
+
+    def prepare_informer_inputs_dict(self, config):
+        _past_length = config.context_length + max(config.lags_sequence)
+
+        static_categorical_features = ids_tensor([self.batch_size, 1], config.cardinality[0])
+        static_real_features = floats_tensor([self.batch_size, 1])
+
+        past_time_features = floats_tensor([self.batch_size, _past_length, config.num_time_features])
+        past_values = floats_tensor([self.batch_size, _past_length])
+        past_observed_mask = floats_tensor([self.batch_size, _past_length])
+
+        # decoder inputs
+        future_time_features = floats_tensor([self.batch_size, config.prediction_length, config.num_time_features])
+        future_values = floats_tensor([self.batch_size, config.prediction_length])
+
+        inputs_dict = {
+            "past_values": past_values,
+            "static_categorical_features": static_categorical_features,
+            "static_real_features": static_real_features,
+            "past_time_features": past_time_features,
+            "past_observed_mask": past_observed_mask,
+            "future_time_features": future_time_features,
+            "future_values": future_values,
+        }
+        return inputs_dict
+
+    def prepare_config_and_inputs(self):
+        config = self.get_config()
+        inputs_dict = self.prepare_informer_inputs_dict(config)
+        return config, inputs_dict
+
+    def prepare_config_and_inputs_for_common(self):
+        config, inputs_dict = self.prepare_config_and_inputs()
+        return config, inputs_dict
+
+    def check_encoder_decoder_model_standalone(self, config, inputs_dict):
+        model = InformerModel(config=config).to(torch_device).eval()
+        outputs = model(**inputs_dict)
+
+        encoder_last_hidden_state = outputs.encoder_last_hidden_state
+        last_hidden_state = outputs.last_hidden_state
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            encoder = model.get_encoder()
+            encoder.save_pretrained(tmpdirname)
+            encoder = InformerEncoder.from_pretrained(tmpdirname).to(torch_device)
+
+        transformer_inputs, _, _ = model.create_network_inputs(**inputs_dict)
+        enc_input = transformer_inputs[:, : config.context_length, ...]
+        dec_input = transformer_inputs[:, config.context_length :, ...]
+
+        encoder_last_hidden_state_2 = encoder(inputs_embeds=enc_input)[0]
+
+        self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3)
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            decoder = model.get_decoder()
+            decoder.save_pretrained(tmpdirname)
+            decoder = InformerDecoder.from_pretrained(tmpdirname).to(torch_device)
+
+        last_hidden_state_2 = decoder(
+            inputs_embeds=dec_input,
+            encoder_hidden_states=encoder_last_hidden_state,
+        )[0]
+
+        self.parent.assertTrue((last_hidden_state_2 - last_hidden_state).abs().max().item() < 1e-3)
+
+
+@require_torch
+class InformerModelTest(ModelTesterMixin, unittest.TestCase):
+    all_model_classes = (
+        (InformerModel, InformerForPrediction) if is_torch_available() else ()
+    )
+    all_generative_model_classes = (InformerForPrediction,) if is_torch_available() else ()
+    is_encoder_decoder = True
+    test_pruning = False
+    test_head_masking = False
+    test_missing_keys = False
+    test_torchscript = False
+    test_inputs_embeds = False
+    test_model_common_attributes = False
+
+    def setUp(self):
+        self.model_tester = InformerModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=InformerConfig, has_text_modality=False)
+
+    def test_config(self):
+        self.config_tester.run_common_tests()
+
+    def test_save_load_strict(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs()
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                model.save_pretrained(tmpdirname)
+                model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True)
+            self.assertEqual(info["missing_keys"], [])
+
+    def test_encoder_decoder_model_standalone(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common()
+        self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs)
+
+    # Ignore since we have no tokens embeddings
+    def test_resize_tokens_embeddings(self):
+        pass
+
+    # # Input is 'static_categorical_features' not 'input_ids'
+    def test_model_main_input_name(self):
+        model_signature = inspect.signature(getattr(InformerModel, "forward"))
+        # The main input is the name of the argument after `self`
+        observed_main_input_name = list(model_signature.parameters.keys())[1]
+        self.assertEqual(InformerModel.main_input_name, observed_main_input_name)
+
+    def test_forward_signature(self):
+        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            signature = inspect.signature(model.forward)
+            # signature.parameters is an OrderedDict => so arg_names order is deterministic
+            arg_names = [*signature.parameters.keys()]
+
+            expected_arg_names = [
+                "past_values",
+                "past_time_features",
+                "past_observed_mask",
+                "static_categorical_features",
+                "static_real_features",
+                "future_values",
+                "future_time_features",
+            ]
+
+            expected_arg_names.extend(
+                [
+                    "future_observed_mask",
+                    "decoder_attention_mask",
+                    "head_mask",
+                    "decoder_head_mask",
+                    "cross_attn_head_mask",
+                    "encoder_outputs",
+                    "past_key_values",
+                    "output_hidden_states",
+                    "output_attentions",
+                    "use_cache",
+                    "return_dict",
+                ]
+                if "future_observed_mask" in arg_names
+                else [
+                    "decoder_attention_mask",
+                    "head_mask",
+                    "decoder_head_mask",
+                    "cross_attn_head_mask",
+                    "encoder_outputs",
+                    "past_key_values",
+                    "output_hidden_states",
+                    "output_attentions",
+                    "use_cache",
+                    "return_dict",
+                ]
+            )
+
+            self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names)
+
+    def test_attention_outputs(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        config.return_dict = True
+
+        seq_len = getattr(self.model_tester, "seq_length", None)
+        decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len)
+        encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len)
+
+        for model_class in self.all_model_classes:
+            inputs_dict["output_attentions"] = True
+            inputs_dict["output_hidden_states"] = False
+            config.return_dict = True
+            model = model_class(config)
+            model.to(torch_device)
+            model.eval()
+            with torch.no_grad():
+                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
+            attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
+            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
+
+            # check that output_attentions also work using config
+            del inputs_dict["output_attentions"]
+            config.output_attentions = True
+            model = model_class(config)
+            model.to(torch_device)
+            model.eval()
+            with torch.no_grad():
+                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
+            attentions = outputs.encoder_attentions
+            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
+
+            self.assertListEqual(
+                list(attentions[0].shape[-3:]),
+                [self.model_tester.num_attention_heads, encoder_seq_length, encoder_seq_length],
+            )
+            out_len = len(outputs)
+
+            correct_outlen = 6
+
+            if "last_hidden_state" in outputs:
+                correct_outlen += 1
+
+            if "past_key_values" in outputs:
+                correct_outlen += 1  # past_key_values have been returned
+
+            if "loss" in outputs:
+                correct_outlen += 1
+
+            if "params" in outputs:
+                correct_outlen += 1
+
+            self.assertEqual(out_len, correct_outlen)
+
+            # decoder attentions
+            decoder_attentions = outputs.decoder_attentions
+            self.assertIsInstance(decoder_attentions, (list, tuple))
+            self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
+            self.assertListEqual(
+                list(decoder_attentions[0].shape[-3:]),
+                [self.model_tester.num_attention_heads, decoder_seq_length, decoder_seq_length],
+            )
+
+            # cross attentions
+            cross_attentions = outputs.cross_attentions
+            self.assertIsInstance(cross_attentions, (list, tuple))
+            self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers)
+            self.assertListEqual(
+                list(cross_attentions[0].shape[-3:]),
+                [
+                    self.model_tester.num_attention_heads,
+                    decoder_seq_length,
+                    encoder_seq_length,
+                ],
+            )
+
+        # Check attention is always last and order is fine
+        inputs_dict["output_attentions"] = True
+        inputs_dict["output_hidden_states"] = True
+        model = model_class(config)
+        model.to(torch_device)
+        model.eval()
+        with torch.no_grad():
+            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
+
+        self.assertEqual(out_len + 2, len(outputs))
+
+        self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
+
+        self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
+        self.assertListEqual(
+            list(self_attentions[0].shape[-3:]),
+            [self.model_tester.num_attention_heads, encoder_seq_length, encoder_seq_length],
+        )
+
+    @is_flaky()
+    def test_retain_grad_hidden_states_attentions(self):
+        super().test_retain_grad_hidden_states_attentions()
+
+
+def prepare_batch(filename="train-batch.pt"):
+    file = hf_hub_download(repo_id="kashif/tourism-monthly-batch", filename=filename, repo_type="dataset")
+    batch = torch.load(file, map_location=torch_device)
+    return batch
+
+
+@require_torch
+@slow
+class InformerModelIntegrationTests(unittest.TestCase):
+    def test_inference_no_head(self):
+        model = InformerModel.from_pretrained("huggingface/time-series-transformer-tourism-monthly").to(
+            torch_device
+        )
+        batch = prepare_batch()
+
+        with torch.no_grad():
+            output = model(
+                past_values=batch["past_values"],
+                past_time_features=batch["past_time_features"],
+                past_observed_mask=batch["past_observed_mask"],
+                static_categorical_features=batch["static_categorical_features"],
+                static_real_features=batch["static_real_features"],
+                future_values=batch["future_values"],
+                future_time_features=batch["future_time_features"],
+            )[0]
+
+        expected_shape = torch.Size((64, model.config.prediction_length, model.config.d_model))
+        self.assertEqual(output.shape, expected_shape)
+
+        expected_slice = torch.tensor(
+            [[-0.3125, -1.2884, -1.1118], [-0.5801, -1.4907, -0.7782], [0.0849, -1.6557, -0.9755]], device=torch_device
+        )
+        self.assertTrue(torch.allclose(output[0, :3, :3], expected_slice, atol=TOLERANCE))
+
+    def test_inference_head(self):
+        model = InformerForPrediction.from_pretrained(
+            "huggingface/time-series-transformer-tourism-monthly"
+        ).to(torch_device)
+        batch = prepare_batch("val-batch.pt")
+        with torch.no_grad():
+            output = model(
+                past_values=batch["past_values"],
+                past_time_features=batch["past_time_features"],
+                past_observed_mask=batch["past_observed_mask"],
+                static_categorical_features=batch["static_categorical_features"],
+                static_real_features=batch["static_real_features"],
+                future_time_features=batch["future_time_features"],
+            )[1]
+        expected_shape = torch.Size((64, model.config.prediction_length, model.config.d_model))
+        self.assertEqual(output.shape, expected_shape)
+
+        expected_slice = torch.tensor(
+            [[0.9127, -0.2056, -0.5259], [1.0572, 1.4104, -0.1964], [0.1358, 2.0348, 0.5739]], device=torch_device
+        )
+        self.assertTrue(torch.allclose(output[0, :3, :3], expected_slice, atol=TOLERANCE))
+
+    def test_seq_to_seq_generation(self):
+        model = InformerForPrediction.from_pretrained(
+            "huggingface/time-series-transformer-tourism-monthly"
+        ).to(torch_device)
+        batch = prepare_batch("val-batch.pt")
+        with torch.no_grad():
+            outputs = model.generate(
+                static_categorical_features=batch["static_categorical_features"],
+                static_real_features=batch["static_real_features"],
+                past_time_features=batch["past_time_features"],
+                past_values=batch["past_values"],
+                future_time_features=batch["future_time_features"],
+                past_observed_mask=batch["past_observed_mask"],
+            )
+        expected_shape = torch.Size((64, model.config.num_parallel_samples, model.config.prediction_length))
+        self.assertEqual(outputs.sequences.shape, expected_shape)
+
+        expected_slice = torch.tensor([2289.5203, 2778.3054, 4648.1313], device=torch_device)
+        mean_prediction = outputs.sequences.mean(dim=1)
+        self.assertTrue(torch.allclose(mean_prediction[0, -3:], expected_slice, rtol=1e-1))

From ec34ab1a6c9a53a145272601ed3b72baea893990 Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Fri, 13 Jan 2023 10:53:54 +0000
Subject: [PATCH 004/164] added checking that instantiate works

---
 src/transformers/models/informer/check_instantiate_works.py | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 src/transformers/models/informer/check_instantiate_works.py

diff --git a/src/transformers/models/informer/check_instantiate_works.py b/src/transformers/models/informer/check_instantiate_works.py
new file mode 100644
index 000000000000..f392af08b552
--- /dev/null
+++ b/src/transformers/models/informer/check_instantiate_works.py
@@ -0,0 +1,5 @@
+from transformers import InformerModel, InformerConfig
+
+if __name__ == '__main__':
+    model = InformerModel(InformerConfig())
+    print(model)

From f274b760d33d0af1e90fbd77958d3f85a7e8e8ca Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Sat, 14 Jan 2023 11:56:07 +0000
Subject: [PATCH 005/164] added config using gluonTS by kashif

---
 .../config_using_gluonTS.py                   | 723 ++++++++++++++++++
 1 file changed, 723 insertions(+)
 create mode 100644 src/transformers/models/time_series_transformer/config_using_gluonTS.py

diff --git a/src/transformers/models/time_series_transformer/config_using_gluonTS.py b/src/transformers/models/time_series_transformer/config_using_gluonTS.py
new file mode 100644
index 000000000000..dde188b9c79e
--- /dev/null
+++ b/src/transformers/models/time_series_transformer/config_using_gluonTS.py
@@ -0,0 +1,723 @@
+from math import sqrt
+from typing import List, Optional
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from gluonts.core.component import validated
+from gluonts.time_feature import get_lags_for_frequency
+from gluonts.torch.distributions import DistributionOutput, StudentTOutput
+from gluonts.torch.modules.feature import FeatureEmbedder
+from gluonts.torch.modules.scaler import MeanScaler, NOPScaler
+
+
+class TriangularCausalMask:
+    def __init__(self, B, L, device="cpu"):
+        mask_shape = [B, 1, L, L]
+        with torch.no_grad():
+            self._mask = torch.triu(
+                torch.ones(mask_shape, dtype=torch.bool), diagonal=1
+            ).to(device)
+
+    @property
+    def mask(self):
+        return self._mask
+
+
+class ProbMask:
+    def __init__(self, B, H, L, index, scores, device="cpu"):
+        _mask = torch.ones(L, scores.shape[-1], dtype=torch.bool).to(device).triu(1)
+        _mask_ex = _mask[None, None, :].expand(B, H, L, scores.shape[-1])
+        indicator = _mask_ex[
+            torch.arange(B)[:, None, None], torch.arange(H)[None, :, None], index, :
+        ].to(device)
+        self._mask = indicator.view(scores.shape).to(device)
+
+    @property
+    def mask(self):
+        return self._mask
+
+
+class FullAttention(nn.Module):
+    def __init__(
+        self,
+        mask_flag=True,
+        factor=5,
+        scale=None,
+        attention_dropout=0.1,
+        output_attention=False,
+    ):
+        super(FullAttention, self).__init__()
+        self.scale = scale
+        self.mask_flag = mask_flag
+        self.output_attention = output_attention
+        self.dropout = nn.Dropout(attention_dropout)
+
+    def forward(self, queries, keys, values, attn_mask):
+        B, L, H, E = queries.shape
+        _, S, _, D = values.shape
+        scale = self.scale or 1.0 / sqrt(E)
+
+        scores = torch.einsum("blhe,bshe->bhls", queries, keys)
+        if self.mask_flag:
+            if attn_mask is None:
+                attn_mask = TriangularCausalMask(B, L, device=queries.device)
+
+            scores.masked_fill_(attn_mask.mask, -np.inf)
+
+        A = self.dropout(torch.softmax(scale * scores, dim=-1))
+        V = torch.einsum("bhls,bshd->blhd", A, values)
+
+        if self.output_attention:
+            return (V.contiguous(), A)
+        else:
+            return (V.contiguous(), None)
+
+
+class ProbAttention(nn.Module):
+    def __init__(
+        self,
+        mask_flag=True,
+        factor=5,
+        scale=None,
+        attention_dropout=0.1,
+        output_attention=False,
+    ):
+        super(ProbAttention, self).__init__()
+        self.factor = factor
+        self.scale = scale
+        self.mask_flag = mask_flag
+        self.output_attention = output_attention
+        self.dropout = nn.Dropout(attention_dropout)
+
+    def _prob_QK(self, Q, K, sample_k, n_top):  # n_top: c*ln(L_q)
+        # Q [B, H, L, D]
+        B, H, L_K, E = K.shape
+        _, _, L_Q, _ = Q.shape
+
+        # calculate the sampled Q_K
+        K_expand = K.unsqueeze(-3).expand(B, H, L_Q, L_K, E)
+        index_sample = torch.randint(
+            L_K, (L_Q, sample_k)
+        )  # real U = U_part(factor*ln(L_k))*L_q
+        K_sample = K_expand[:, :, torch.arange(L_Q).unsqueeze(1), index_sample, :]
+        Q_K_sample = torch.matmul(Q.unsqueeze(-2), K_sample.transpose(-2, -1)).squeeze(
+            -2
+        )
+
+        # find the Top_k query with sparsity measurement
+        M = Q_K_sample.max(-1)[0] - torch.div(Q_K_sample.sum(-1), L_K)
+        M_top = M.topk(n_top, sorted=False)[1]
+
+        # use the reduced Q to calculate Q_K
+        Q_reduce = Q[
+            torch.arange(B)[:, None, None], torch.arange(H)[None, :, None], M_top, :
+        ]  # factor*ln(L_q)
+        Q_K = torch.matmul(Q_reduce, K.transpose(-2, -1))  # factor*ln(L_q)*L_k
+
+        return Q_K, M_top
+
+    def _get_initial_context(self, V, L_Q):
+        B, H, L_V, D = V.shape
+        if not self.mask_flag:
+            # V_sum = V.sum(dim=-2)
+            V_sum = V.mean(dim=-2)
+            contex = V_sum.unsqueeze(-2).expand(B, H, L_Q, V_sum.shape[-1]).clone()
+        else:  # use mask
+            assert L_Q == L_V  # requires that L_Q == L_V, i.e. for self-attention only
+            contex = V.cumsum(dim=-2)
+        return contex
+
+    def _update_context(self, context_in, V, scores, index, L_Q, attn_mask):
+        B, H, L_V, D = V.shape
+
+        if self.mask_flag:
+            attn_mask = ProbMask(B, H, L_Q, index, scores, device=V.device)
+            scores.masked_fill_(attn_mask.mask, -np.inf)
+
+        attn = torch.softmax(scores, dim=-1)  # nn.Softmax(dim=-1)(scores)
+
+        context_in[
+            torch.arange(B)[:, None, None], torch.arange(H)[None, :, None], index, :
+        ] = torch.matmul(attn, V).type_as(context_in)
+        if self.output_attention:
+            attns = (torch.ones([B, H, L_V, L_V]) / L_V).type_as(attn).to(attn.device)
+            attns[
+                torch.arange(B)[:, None, None], torch.arange(H)[None, :, None], index, :
+            ] = attn
+            return (context_in, attns)
+        else:
+            return (context_in, None)
+
+    def forward(self, queries, keys, values, attn_mask):
+        B, L_Q, H, D = queries.shape
+        _, L_K, _, _ = keys.shape
+
+        queries = queries.transpose(2, 1)
+        keys = keys.transpose(2, 1)
+        values = values.transpose(2, 1)
+
+        U_part = self.factor * np.ceil(np.log1p(L_K)).astype("int").item()  # c*ln(L_k)
+        u = self.factor * np.ceil(np.log1p(L_Q)).astype("int").item()  # c*ln(L_q)
+
+        U_part = U_part if U_part < L_K else L_K
+        u = u if u < L_Q else L_Q
+
+        scores_top, index = self._prob_QK(queries, keys, sample_k=U_part, n_top=u)
+
+        # add scale factor
+        scale = self.scale or 1.0 / sqrt(D)
+        if scale is not None:
+            scores_top = scores_top * scale
+        # get the context
+        context = self._get_initial_context(values, L_Q)
+        # update the context with selected top_k queries
+        context, attn = self._update_context(
+            context, values, scores_top, index, L_Q, attn_mask
+        )
+
+        return context.transpose(2, 1).contiguous(), attn
+
+
+class AttentionLayer(nn.Module):
+    def __init__(
+        self, attention, d_model, n_heads, d_keys=None, d_values=None, mix=False
+    ):
+        super(AttentionLayer, self).__init__()
+
+        d_keys = d_keys or (d_model // n_heads)
+        d_values = d_values or (d_model // n_heads)
+
+        self.inner_attention = attention
+        self.query_projection = nn.Linear(d_model, d_keys * n_heads)
+        self.key_projection = nn.Linear(d_model, d_keys * n_heads)
+        self.value_projection = nn.Linear(d_model, d_values * n_heads)
+        self.out_projection = nn.Linear(d_values * n_heads, d_model)
+        self.n_heads = n_heads
+        self.mix = mix
+
+    def forward(self, queries, keys, values, attn_mask):
+        B, L, _ = queries.shape
+        _, S, _ = keys.shape
+        H = self.n_heads
+
+        queries = self.query_projection(queries).view(B, L, H, -1)
+        keys = self.key_projection(keys).view(B, S, H, -1)
+        values = self.value_projection(values).view(B, S, H, -1)
+
+        out, attn = self.inner_attention(queries, keys, values, attn_mask)
+        if self.mix:
+            out = out.transpose(2, 1).contiguous()
+        out = out.view(B, L, -1)
+
+        return self.out_projection(out), attn
+
+
+class ConvLayer(nn.Module):
+    def __init__(self, c_in):
+        super(ConvLayer, self).__init__()
+        self.downConv = nn.Conv1d(
+            in_channels=c_in,
+            out_channels=c_in,
+            kernel_size=3,
+            padding=1,
+            padding_mode="circular",
+        )
+        self.norm = nn.BatchNorm1d(c_in)
+        self.activation = nn.ELU()
+        self.maxPool = nn.MaxPool1d(kernel_size=3, stride=2, padding=1)
+
+    def forward(self, x):
+        x = self.downConv(x.permute(0, 2, 1))
+        x = self.norm(x)
+        x = self.activation(x)
+        x = self.maxPool(x)
+        x = x.transpose(1, 2)
+        return x
+
+
+class EncoderLayer(nn.Module):
+    def __init__(self, attention, d_model, d_ff=None, dropout=0.1, activation="relu"):
+        super(EncoderLayer, self).__init__()
+        d_ff = d_ff or 4 * d_model
+        self.attention = attention
+        self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1)
+        self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1)
+        self.norm1 = nn.LayerNorm(d_model)
+        self.norm2 = nn.LayerNorm(d_model)
+        self.dropout = nn.Dropout(dropout)
+        self.activation = F.relu if activation == "relu" else F.gelu
+
+    def forward(self, x, attn_mask=None):
+        # x [B, L, D]
+        # x = x + self.dropout(self.attention(
+        #     x, x, x,
+        #     attn_mask = attn_mask
+        # ))
+        new_x, attn = self.attention(x, x, x, attn_mask=attn_mask)
+        x = x + self.dropout(new_x)
+
+        y = x = self.norm1(x)
+        y = self.dropout(self.activation(self.conv1(y.transpose(-1, 1))))
+        y = self.dropout(self.conv2(y).transpose(-1, 1))
+
+        return self.norm2(x + y), attn
+
+
+class Encoder(nn.Module):
+    def __init__(self, attn_layers, conv_layers=None, norm_layer=None):
+        super(Encoder, self).__init__()
+        self.attn_layers = nn.ModuleList(attn_layers)
+        self.conv_layers = (
+            nn.ModuleList(conv_layers) if conv_layers is not None else None
+        )
+        self.norm = norm_layer
+
+    def forward(self, x, attn_mask=None):
+        # x [B, L, D]
+        attns = []
+        if self.conv_layers is not None:
+            for attn_layer, conv_layer in zip(self.attn_layers, self.conv_layers):
+                x, attn = attn_layer(x, attn_mask=attn_mask)
+                x = conv_layer(x)
+                attns.append(attn)
+            x, attn = self.attn_layers[-1](x, attn_mask=attn_mask)
+            attns.append(attn)
+        else:
+            for attn_layer in self.attn_layers:
+                x, attn = attn_layer(x, attn_mask=attn_mask)
+                attns.append(attn)
+
+        if self.norm is not None:
+            x = self.norm(x)
+
+        return x, attns
+
+
+class DecoderLayer(nn.Module):
+    def __init__(
+        self,
+        self_attention,
+        cross_attention,
+        d_model,
+        d_ff=None,
+        dropout=0.1,
+        activation="relu",
+    ):
+        super(DecoderLayer, self).__init__()
+        d_ff = d_ff or 4 * d_model
+        self.self_attention = self_attention
+        self.cross_attention = cross_attention
+        self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1)
+        self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1)
+        self.norm1 = nn.LayerNorm(d_model)
+        self.norm2 = nn.LayerNorm(d_model)
+        self.norm3 = nn.LayerNorm(d_model)
+        self.dropout = nn.Dropout(dropout)
+        self.activation = F.relu if activation == "relu" else F.gelu
+
+    def forward(self, x, cross, x_mask=None, cross_mask=None):
+        x = x + self.dropout(self.self_attention(x, x, x, attn_mask=x_mask)[0])
+        x = self.norm1(x)
+
+        x = x + self.dropout(
+            self.cross_attention(x, cross, cross, attn_mask=cross_mask)[0]
+        )
+
+        y = x = self.norm2(x)
+        y = self.dropout(self.activation(self.conv1(y.transpose(-1, 1))))
+        y = self.dropout(self.conv2(y).transpose(-1, 1))
+
+        return self.norm3(x + y)
+
+
+class Decoder(nn.Module):
+    def __init__(self, layers, norm_layer=None):
+        super(Decoder, self).__init__()
+        self.layers = nn.ModuleList(layers)
+        self.norm = norm_layer
+
+    def forward(self, x, cross, x_mask=None, cross_mask=None):
+        for layer in self.layers:
+            x = layer(x, cross, x_mask=x_mask, cross_mask=cross_mask)
+
+        if self.norm is not None:
+            x = self.norm(x)
+
+        return x
+
+
+class InformerModel(nn.Module):
+    @validated()
+    def __init__(
+        self,
+        freq: str,
+        context_length: int,
+        prediction_length: int,
+        num_feat_dynamic_real: int,
+        num_feat_static_real: int,
+        num_feat_static_cat: int,
+        cardinality: List[int],
+        # Informer arguments
+        nhead: int,
+        num_encoder_layers: int,
+        num_decoder_layers: int,
+        dim_feedforward: int,
+        activation: str = "gelu",
+        dropout: float = 0.1,
+        attn: str = "prob",
+        factor: int = 5,
+        distil: bool = True,
+        # univariate input
+        input_size: int = 1,
+        embedding_dimension: Optional[List[int]] = None,
+        distr_output: DistributionOutput = StudentTOutput(),
+        lags_seq: Optional[List[int]] = None,
+        scaling: bool = True,
+        num_parallel_samples: int = 100,
+    ) -> None:
+        super().__init__()
+
+        self.input_size = input_size
+
+        self.target_shape = distr_output.event_shape
+        self.num_feat_dynamic_real = num_feat_dynamic_real
+        self.num_feat_static_cat = num_feat_static_cat
+        self.num_feat_static_real = num_feat_static_real
+        self.embedding_dimension = (
+            embedding_dimension
+            if embedding_dimension is not None or cardinality is None
+            else [min(50, (cat + 1) // 2) for cat in cardinality]
+        )
+        self.lags_seq = lags_seq or get_lags_for_frequency(freq_str=freq)
+        self.num_parallel_samples = num_parallel_samples
+        self.history_length = context_length + max(self.lags_seq)
+        self.embedder = FeatureEmbedder(
+            cardinalities=cardinality,
+            embedding_dims=self.embedding_dimension,
+        )
+        if scaling:
+            self.scaler = MeanScaler(dim=1, keepdim=True)
+        else:
+            self.scaler = NOPScaler(dim=1, keepdim=True)
+
+        # total feature size
+        d_model = self.input_size * len(self.lags_seq) + self._number_of_features
+
+        self.context_length = context_length
+        self.prediction_length = prediction_length
+        self.distr_output = distr_output
+        self.param_proj = distr_output.get_args_proj(d_model)
+
+        # Informer enc-decoder
+        Attn = ProbAttention if attn == "prob" else FullAttention
+        # Encoder
+        self.encoder = Encoder(
+            [
+                EncoderLayer(
+                    AttentionLayer(
+                        Attn(
+                            mask_flag=False,
+                            factor=factor,
+                            attention_dropout=dropout,
+                            output_attention=False,
+                        ),
+                        d_model,
+                        nhead,
+                        mix=False,
+                    ),
+                    d_model,
+                    d_ff=dim_feedforward,
+                    dropout=dropout,
+                    activation=activation,
+                )
+                for l in range(num_encoder_layers)
+            ],
+            [ConvLayer(d_model) for l in range(num_encoder_layers - 1)]
+            if distil
+            else None,
+            norm_layer=torch.nn.LayerNorm(d_model),
+        )
+
+        # Masked Decoder
+        self.decoder = Decoder(
+            [
+                DecoderLayer(
+                    AttentionLayer(
+                        Attn(
+                            mask_flag=True,
+                            factor=factor,
+                            attention_dropout=dropout,
+                            output_attention=False,
+                        ),
+                        d_model,
+                        nhead,
+                        mix=True,
+                    ),
+                    AttentionLayer(
+                        FullAttention(
+                            mask_flag=False,
+                            factor=factor,
+                            attention_dropout=dropout,
+                            output_attention=False,
+                        ),
+                        d_model,
+                        nhead,
+                        mix=False,
+                    ),
+                    d_model,
+                    d_ff=dim_feedforward,
+                    dropout=dropout,
+                    activation=activation,
+                )
+                for l in range(num_decoder_layers)
+            ],
+            norm_layer=torch.nn.LayerNorm(d_model),
+        )
+
+    @property
+    def _number_of_features(self) -> int:
+        return (
+            sum(self.embedding_dimension)
+            + self.num_feat_dynamic_real
+            + self.num_feat_static_real
+            + self.input_size  # the log(scale)
+        )
+
+    @property
+    def _past_length(self) -> int:
+        return self.context_length + max(self.lags_seq)
+
+    def get_lagged_subsequences(
+        self, sequence: torch.Tensor, subsequences_length: int, shift: int = 0
+    ) -> torch.Tensor:
+        """
+        Returns lagged subsequences of a given sequence.
+        Parameters
+        ----------
+        sequence : Tensor
+            the sequence from which lagged subsequences should be extracted.
+            Shape: (N, T, C).
+        subsequences_length : int
+            length of the subsequences to be extracted.
+        shift: int
+            shift the lags by this amount back.
+        Returns
+        --------
+        lagged : Tensor
+            a tensor of shape (N, S, C, I), where S = subsequences_length and
+            I = len(indices), containing lagged subsequences. Specifically,
+            lagged[i, j, :, k] = sequence[i, -indices[k]-S+j, :].
+        """
+        sequence_length = sequence.shape[1]
+        indices = [lag - shift for lag in self.lags_seq]
+
+        assert max(indices) + subsequences_length <= sequence_length, (
+            f"lags cannot go further than history length, found lag {max(indices)} "
+            f"while history length is only {sequence_length}"
+        )
+
+        lagged_values = []
+        for lag_index in indices:
+            begin_index = -lag_index - subsequences_length
+            end_index = -lag_index if lag_index > 0 else None
+            lagged_values.append(sequence[:, begin_index:end_index, ...])
+        return torch.stack(lagged_values, dim=-1)
+
+    def _check_shapes(
+        self,
+        prior_input: torch.Tensor,
+        inputs: torch.Tensor,
+        features: Optional[torch.Tensor],
+    ) -> None:
+        assert len(prior_input.shape) == len(inputs.shape)
+        assert (
+            len(prior_input.shape) == 2 and self.input_size == 1
+        ) or prior_input.shape[2] == self.input_size
+        assert (len(inputs.shape) == 2 and self.input_size == 1) or inputs.shape[
+            -1
+        ] == self.input_size
+        assert (
+            features is None or features.shape[2] == self._number_of_features
+        ), f"{features.shape[2]}, expected {self._number_of_features}"
+
+    def create_network_inputs(
+        self,
+        feat_static_cat: torch.Tensor,
+        feat_static_real: torch.Tensor,
+        past_time_feat: torch.Tensor,
+        past_target: torch.Tensor,
+        past_observed_values: torch.Tensor,
+        future_time_feat: Optional[torch.Tensor] = None,
+        future_target: Optional[torch.Tensor] = None,
+    ):
+        # time feature
+        time_feat = (
+            torch.cat(
+                (
+                    past_time_feat[:, self._past_length - self.context_length :, ...],
+                    future_time_feat,
+                ),
+                dim=1,
+            )
+            if future_target is not None
+            else past_time_feat[:, self._past_length - self.context_length :, ...]
+        )
+
+        # target
+        context = past_target[:, -self.context_length :]
+        observed_context = past_observed_values[:, -self.context_length :]
+        _, scale = self.scaler(context, observed_context)
+
+        inputs = (
+            torch.cat((past_target, future_target), dim=1) / scale
+            if future_target is not None
+            else past_target / scale
+        )
+
+        inputs_length = (
+            self._past_length + self.prediction_length
+            if future_target is not None
+            else self._past_length
+        )
+        assert inputs.shape[1] == inputs_length
+
+        subsequences_length = (
+            self.context_length + self.prediction_length
+            if future_target is not None
+            else self.context_length
+        )
+
+        # embeddings
+        embedded_cat = self.embedder(feat_static_cat)
+        log_scale = scale.log() if self.input_size == 1 else scale.squeeze(1).log()
+        static_feat = torch.cat(
+            (embedded_cat, feat_static_real, log_scale),
+            dim=1,
+        )
+        expanded_static_feat = static_feat.unsqueeze(1).expand(
+            -1, time_feat.shape[1], -1
+        )
+
+        features = torch.cat((expanded_static_feat, time_feat), dim=-1)
+
+        # self._check_shapes(prior_input, inputs, features)
+
+        # sequence = torch.cat((prior_input, inputs), dim=1)
+        lagged_sequence = self.get_lagged_subsequences(
+            sequence=inputs,
+            subsequences_length=subsequences_length,
+        )
+
+        lags_shape = lagged_sequence.shape
+        reshaped_lagged_sequence = lagged_sequence.reshape(
+            lags_shape[0], lags_shape[1], -1
+        )
+
+        transformer_inputs = torch.cat((reshaped_lagged_sequence, features), dim=-1)
+
+        return transformer_inputs, scale, static_feat
+
+    def output_params(self, transformer_inputs):
+        enc_input = transformer_inputs[:, : self.context_length, ...]
+        dec_input = transformer_inputs[:, self.context_length :, ...]
+
+        enc_out, _ = self.encoder(enc_input)
+        dec_output = self.decoder(dec_input, enc_out)
+
+        return self.param_proj(dec_output)
+
+    @torch.jit.ignore
+    def output_distribution(
+        self, params, scale=None, trailing_n=None
+    ) -> torch.distributions.Distribution:
+        sliced_params = params
+        if trailing_n is not None:
+            sliced_params = [p[:, -trailing_n:] for p in params]
+        return self.distr_output.distribution(sliced_params, scale=scale)
+
+    # for prediction
+    def forward(
+        self,
+        feat_static_cat: torch.Tensor,
+        feat_static_real: torch.Tensor,
+        past_time_feat: torch.Tensor,
+        past_target: torch.Tensor,
+        past_observed_values: torch.Tensor,
+        future_time_feat: torch.Tensor,
+        num_parallel_samples: Optional[int] = None,
+    ) -> torch.Tensor:
+
+        if num_parallel_samples is None:
+            num_parallel_samples = self.num_parallel_samples
+
+        encoder_inputs, scale, static_feat = self.create_network_inputs(
+            feat_static_cat,
+            feat_static_real,
+            past_time_feat,
+            past_target,
+            past_observed_values,
+        )
+
+        enc_out, _ = self.encoder(encoder_inputs)
+
+        repeated_scale = scale.repeat_interleave(
+            repeats=self.num_parallel_samples, dim=0
+        )
+
+        repeated_past_target = (
+            past_target.repeat_interleave(repeats=self.num_parallel_samples, dim=0)
+            / repeated_scale
+        )
+
+        expanded_static_feat = static_feat.unsqueeze(1).expand(
+            -1, future_time_feat.shape[1], -1
+        )
+        features = torch.cat((expanded_static_feat, future_time_feat), dim=-1)
+        repeated_features = features.repeat_interleave(
+            repeats=self.num_parallel_samples, dim=0
+        )
+
+        repeated_enc_out = enc_out.repeat_interleave(
+            repeats=self.num_parallel_samples, dim=0
+        )
+
+        future_samples = []
+
+        # greedy decoding
+        for k in range(self.prediction_length):
+            # self._check_shapes(repeated_past_target, next_sample, next_features)
+            # sequence = torch.cat((repeated_past_target, next_sample), dim=1)
+
+            lagged_sequence = self.get_lagged_subsequences(
+                sequence=repeated_past_target,
+                subsequences_length=1 + k,
+                shift=1,
+            )
+
+            lags_shape = lagged_sequence.shape
+            reshaped_lagged_sequence = lagged_sequence.reshape(
+                lags_shape[0], lags_shape[1], -1
+            )
+
+            decoder_input = torch.cat(
+                (reshaped_lagged_sequence, repeated_features[:, : k + 1]), dim=-1
+            )
+
+            output = self.decoder(decoder_input, repeated_enc_out)
+
+            params = self.param_proj(output[:, -1:])
+            distr = self.output_distribution(params, scale=repeated_scale)
+            next_sample = distr.sample()
+
+            repeated_past_target = torch.cat(
+                (repeated_past_target, next_sample / repeated_scale), dim=1
+            )
+            future_samples.append(next_sample)
+
+        concat_future_samples = torch.cat(future_samples, dim=1)
+        return concat_future_samples.reshape(
+            (-1, self.num_parallel_samples, self.prediction_length) + self.target_shape,
+        )
+    
\ No newline at end of file

From b35a8b23dcdb10e434cd3a4f89c6e3f96f0da0c6 Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Sun, 15 Jan 2023 05:12:45 +0000
Subject: [PATCH 006/164] WIP config

---
 .../config_using_gluonTS.py                     | 17 ++++++++---------
 .../configuration_time_series_transformer.py    |  2 +-
 2 files changed, 9 insertions(+), 10 deletions(-)
 rename src/transformers/models/{time_series_transformer => informer}/config_using_gluonTS.py (98%)

diff --git a/src/transformers/models/time_series_transformer/config_using_gluonTS.py b/src/transformers/models/informer/config_using_gluonTS.py
similarity index 98%
rename from src/transformers/models/time_series_transformer/config_using_gluonTS.py
rename to src/transformers/models/informer/config_using_gluonTS.py
index dde188b9c79e..c407b3ac3ec5 100644
--- a/src/transformers/models/time_series_transformer/config_using_gluonTS.py
+++ b/src/transformers/models/informer/config_using_gluonTS.py
@@ -350,21 +350,21 @@ def forward(self, x, cross, x_mask=None, cross_mask=None):
 
 class InformerModel(nn.Module):
     @validated()
-    def __init__(
+    def __init__(  # add loss param
         self,
-        freq: str,
+        freq: str, # frequency
         context_length: int,
         prediction_length: int,
-        num_feat_dynamic_real: int,
-        num_feat_static_real: int,
-        num_feat_static_cat: int,
+        num_feat_dynamic_real: int,  # num_dynamic_real_features
+        num_feat_static_real: int,  # num_static_real_features
+        num_feat_static_cat: int,  # num_static_categorical_features
         cardinality: List[int],
         # Informer arguments
         nhead: int,
-        num_encoder_layers: int,
-        num_decoder_layers: int,
+        num_encoder_layers: int, # encoder_layers
+        num_decoder_layers: int, # decoder_layers
         dim_feedforward: int,
-        activation: str = "gelu",
+        activation: str = "gelu", # activation_function
         dropout: float = 0.1,
         attn: str = "prob",
         factor: int = 5,
@@ -720,4 +720,3 @@ def forward(
         return concat_future_samples.reshape(
             (-1, self.num_parallel_samples, self.prediction_length) + self.target_shape,
         )
-    
\ No newline at end of file
diff --git a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py
index 8d89d5cd7f19..258230654b0a 100644
--- a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py
+++ b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py
@@ -151,7 +151,7 @@ def __init__(
         decoder_attention_heads: int = 2,
         encoder_layers: int = 2,
         decoder_layers: int = 2,
-        is_encoder_decoder: bool = True,
+        is_encoder_decoder: bool = True, # Eli: remove from signature?
         activation_function: str = "gelu",
         dropout: float = 0.1,
         encoder_layerdrop: float = 0.1,

From 27ebb7318777e79f35dfb018f26f2497ab946a0f Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Mon, 16 Jan 2023 12:41:19 +0000
Subject: [PATCH 007/164] adding InformerConfig. need to remove FeatureEmbedder

---
 .../informer/check_instantiate_works.py       |    3 +
 .../models/informer/configuration_informer.py |  229 +-
 .../models/informer/modeling_informer.py      | 2360 ++++-------------
 .../configuration_time_series_transformer.py  |    4 +-
 4 files changed, 624 insertions(+), 1972 deletions(-)

diff --git a/src/transformers/models/informer/check_instantiate_works.py b/src/transformers/models/informer/check_instantiate_works.py
index f392af08b552..487bf2a9a21b 100644
--- a/src/transformers/models/informer/check_instantiate_works.py
+++ b/src/transformers/models/informer/check_instantiate_works.py
@@ -1,5 +1,8 @@
 from transformers import InformerModel, InformerConfig
+from gluonts.time_feature import get_lags_for_frequency
 
 if __name__ == '__main__':
+    freq = "h"
+    lags = get_lags_for_frequency(freq_str=freq)
     model = InformerModel(InformerConfig())
     print(model)
diff --git a/src/transformers/models/informer/configuration_informer.py b/src/transformers/models/informer/configuration_informer.py
index b08f1ee982bc..51ac98700310 100644
--- a/src/transformers/models/informer/configuration_informer.py
+++ b/src/transformers/models/informer/configuration_informer.py
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -22,206 +22,91 @@
 
 logger = logging.get_logger(__name__)
 
-INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    "elisim/informer": "https://huggingface.co/elisim/informer/resolve/main/config.json",
+TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "huggingface/time-series-transformer-tourism-monthly": (
+        "https://huggingface.co/huggingface/time-series-transformer-tourism-monthly/resolve/main/config.json"
+    ),
+    # See all TimeSeriesTransformer models at https://huggingface.co/models?filter=time_series_transformer
 }
 
 
-
 class InformerConfig(PretrainedConfig):
-    r"""
-    This is the configuration class to store the configuration of a [`InformerModel`]. It is used to
-    instantiate a Informer model according to the specified arguments, defining the model architecture.
-    Instantiating a configuration with the defaults will yield a similar configuration to that of the Time Series
-    Transformer
-    [huggingface/time-series-transformer-tourism-monthly](https://huggingface.co/huggingface/time-series-transformer-tourism-monthly)
-    architecture.
-
-    Configuration objects inherit from [`PretrainedConfig`] can be used to control the model outputs. Read the
-    documentation from [`PretrainedConfig`] for more information.
-
-    Args:
-        prediction_length (`int`):
-            The prediction length for the decoder. In other words, the prediction horizon of the model.
-        context_length (`int`, *optional*, defaults to `prediction_length`):
-            The context length for the encoder. If `None`, the context length will be the same as the
-            `prediction_length`.
-        distribution_output (`string`, *optional*, defaults to `"student_t"`):
-            The distribution emission head for the model. Could be either "student_t", "normal" or "negative_binomial".
-        loss (`string`, *optional*, defaults to `"nll"`):
-            The loss function for the model corresponding to the `distribution_output` head. For parametric
-            distributions it is the negative log likelihood (nll) - which currently is the only supported one.
-        input_size (`int`, *optional*, defaults to 1):
-            The size of the target variable which by default is 1 for univariate targets. Would be > 1 in case of
-            multivariate targets.
-        scaling (`bool`, *optional* defaults to `True`):
-            Whether to scale the input targets.
-        lags_sequence (`list[int]`, *optional*, defaults to `[1, 2, 3, 4, 5, 6, 7]`):
-            The lags of the input time series as covariates often dictated by the frequency. Default is `[1, 2, 3, 4,
-            5, 6, 7]`.
-        num_time_features (`int`, *optional*, defaults to 0):
-            The number of time features in the input time series.
-        num_dynamic_real_features (`int`, *optional*, defaults to 0):
-            The number of dynamic real valued features.
-        num_static_categorical_features (`int`, *optional*, defaults to 0):
-            The number of static categorical features.
-        num_static_real_features (`int`, *optional*, defaults to 0):
-            The number of static real valued features.
-        cardinality (`list[int]`, *optional*):
-            The cardinality (number of different values) for each of the static categorical features. Should be a list
-            of integers, having the same length as `num_static_categorical_features`. Cannot be `None` if
-            `num_static_categorical_features` is > 0.
-        embedding_dimension (`list[int]`, *optional*):
-            The dimension of the embedding for each of the static categorical features. Should be a list of integers,
-            having the same length as `num_static_categorical_features`. Cannot be `None` if
-            `num_static_categorical_features` is > 0.
-        encoder_layers (`int`, *optional*, defaults to 2):
-            Number of encoder layers.
-        decoder_layers (`int`, *optional*, defaults to 2):
-            Number of decoder layers.
-        encoder_attention_heads (`int`, *optional*, defaults to 2):
-            Number of attention heads for each attention layer in the Transformer encoder.
-        decoder_attention_heads (`int`, *optional*, defaults to 2):
-            Number of attention heads for each attention layer in the Transformer decoder.
-        encoder_ffn_dim (`int`, *optional*, defaults to 32):
-            Dimension of the "intermediate" (often named feed-forward) layer in encoder.
-        decoder_ffn_dim (`int`, *optional*, defaults to 32):
-            Dimension of the "intermediate" (often named feed-forward) layer in decoder.
-        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
-            The non-linear activation function (function or string) in the encoder and decoder. If string, `"gelu"` and
-            `"relu"` are supported.
-        dropout (`float`, *optional*, defaults to 0.1):
-            The dropout probability for all fully connected layers in the encoder, and decoder.
-        encoder_layerdrop (`float`, *optional*, defaults to 0.1):
-            The dropout probability for the attention and fully connected layers for each encoder layer.
-        decoder_layerdrop (`float`, *optional*, defaults to 0.1):
-            The dropout probability for the attention and fully connected layers for each decoder layer.
-        attention_dropout (`float`, *optional*, defaults to 0.1):
-            The dropout probability for the attention probabilities.
-        activation_dropout (`float`, *optional*, defaults to 0.1):
-            The dropout probability used between the two layers of the feed-forward networks.
-        num_parallel_samples (`int`, *optional*, defaults to 100):
-            The number of samples to generate in parallel for each time step of inference.
-        init_std (`float`, *optional*, defaults to 0.02):
-            The standard deviation of the truncated normal weight initialization distribution.
-        use_cache (`bool`, *optional*, defaults to `True`):
-            Whether to use the past key/values attentions (if applicable to the model) to speed up decoding.
-
-        Example:
-
-    ```python
-    >>> from transformers import InformerConfig, InformerModel
-
-    >>> # Initializing a default Informer configuration
-    >>> configuration = InformerConfig()
-
-    >>> # Randomly initializing a model (with random weights) from the configuration
-    >>> model = InformerModel(configuration)
-
-    >>> # Accessing the model configuration
-    >>> configuration = model.config
-    ```"""
-    model_type = "informer"
-    attribute_map = {
-        "hidden_size": "d_model",
-        "num_attention_heads": "encoder_attention_heads",
-        "num_hidden_layers": "encoder_layers",
-    }
-
     def __init__(
         self,
+        context_length: int,
+        prediction_length: int,
+        num_feat_dynamic_real: int,  # num_dynamic_real_features
+        num_feat_static_real: int,  # num_static_real_features
+        num_feat_static_cat: int,  # num_static_categorical_features
+        cardinality: List[int],
+        # Informer arguments
+        nhead: int,
+        num_encoder_layers: int, # encoder_layers
+        num_decoder_layers: int, # decoder_layers
+        dim_feedforward: int,
+        activation: str = "gelu", # activation_function
+        dropout: float = 0.1,
+        attn: str = "prob",
+        factor: int = 5,
+        distil: bool = True,
+        # univariate input
         input_size: int = 1,
-        prediction_length: Optional[int] = None,
-        context_length: Optional[int] = None,
-        distribution_output: str = "student_t",
-        loss: str = "nll",
-        lags_sequence: List[int] = [1, 2, 3, 4, 5, 6, 7],
-        scaling: bool = True,
-        num_dynamic_real_features: int = 0,
-        num_static_categorical_features: int = 0,
-        num_static_real_features: int = 0,
-        num_time_features: int = 0,
-        cardinality: Optional[List[int]] = None,
         embedding_dimension: Optional[List[int]] = None,
-        encoder_ffn_dim: int = 32,
-        decoder_ffn_dim: int = 32,
-        encoder_attention_heads: int = 2,
-        decoder_attention_heads: int = 2,
-        encoder_layers: int = 2,
-        decoder_layers: int = 2,
-        is_encoder_decoder: bool = True,
-        activation_function: str = "gelu",
-        dropout: float = 0.1,
-        encoder_layerdrop: float = 0.1,
-        decoder_layerdrop: float = 0.1,
-        attention_dropout: float = 0.1,
-        activation_dropout: float = 0.1,
+        distr_output: str = "student_t",
+        lags_seq: Optional[List[int]] = None, # used to be freq.
+        scaling: bool = True,
         num_parallel_samples: int = 100,
-        init_std: float = 0.02,
-        use_cache=True,
-        **kwargs
+        is_encoder_decoder: bool = True,
     ):
         # time series specific configuration
         self.prediction_length = prediction_length
         self.context_length = context_length or prediction_length
-        self.distribution_output = distribution_output
-        self.loss = loss
+        self.distr_output = distr_output # Eli: change to distribution_output
+        # self.loss = loss # Eli: From vanilla ts transformer
         self.input_size = input_size
-        self.num_time_features = num_time_features
-        self.lags_sequence = lags_sequence
-        self.scaling = scaling
-        self.num_dynamic_real_features = num_dynamic_real_features
-        self.num_static_real_features = num_static_real_features
-        self.num_static_categorical_features = num_static_categorical_features
-        if cardinality and num_static_categorical_features > 0:
-            if len(cardinality) != num_static_categorical_features:
+        # self.target_shape = distr_output.event_shape  # Eli: I think can be removed
+        # self.num_time_features = num_time_features # Eli: From vanilla ts transformer
+        self.lags_seq = lags_seq
+        # self.scaling = scaling # Eli: From vanilla ts transformer
+        self.num_feat_dynamic_real = num_feat_dynamic_real
+        self.num_feat_static_cat = num_feat_static_cat
+        self.num_feat_static_real = num_feat_static_real
+
+        # set cardinality
+        if cardinality and num_feat_static_cat > 0:
+            if len(cardinality) != num_feat_static_cat:
                 raise ValueError(
                     "The cardinality should be a list of the same length as `num_static_categorical_features`"
                 )
             self.cardinality = cardinality
         else:
             self.cardinality = [1]
-        if embedding_dimension and num_static_categorical_features > 0:
-            if len(embedding_dimension) != num_static_categorical_features:
+
+        # set embedding_dimension
+        if embedding_dimension and num_feat_static_cat > 0:
+            if len(embedding_dimension) != num_feat_static_cat:
                 raise ValueError(
                     "The embedding dimension should be a list of the same length as `num_static_categorical_features`"
                 )
             self.embedding_dimension = embedding_dimension
         else:
             self.embedding_dimension = [min(50, (cat + 1) // 2) for cat in self.cardinality]
+            
         self.num_parallel_samples = num_parallel_samples
 
-        # Transformer architecture configuration
-        self.d_model = input_size * len(lags_sequence) + self._number_of_features
-        self.encoder_attention_heads = encoder_attention_heads
-        self.decoder_attention_heads = decoder_attention_heads
-        self.encoder_ffn_dim = encoder_ffn_dim
-        self.decoder_ffn_dim = decoder_ffn_dim
-        self.encoder_layers = encoder_layers
-        self.decoder_layers = decoder_layers
-
-        self.dropout = dropout
-        self.attention_dropout = attention_dropout
-        self.activation_dropout = activation_dropout
-        self.encoder_layerdrop = encoder_layerdrop
-        self.decoder_layerdrop = decoder_layerdrop
-
-        self.activation_function = activation_function
-        self.init_std = init_std
+        # self.history_length = context_length + max(self.lags_seq)
+        self.embedder = FeatureEmbedder(
+            cardinalities=cardinality,
+            embedding_dims=self.embedding_dimension,
+        )
+        if scaling:
+            self.scaler = MeanScaler(dim=1, keepdim=True)
+        else:
+            self.scaler = NOPScaler(dim=1, keepdim=True)
 
-        self.output_attentions = False
-        self.output_hidden_states = False
+        # total feature size
+        d_model = self.input_size * len(self.lags_seq) + self._number_of_features
 
-        self.use_cache = use_cache
+        self.param_proj = distr_output.get_args_proj(d_model)
 
         super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
-
-    @property
-    def _number_of_features(self) -> int:
-        return (
-            sum(self.embedding_dimension)
-            + self.num_dynamic_real_features
-            + self.num_time_features
-            + max(1, self.num_static_real_features)  # there is at least one dummy static real feature
-            + self.input_size  # the log(scale)
-        )
diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 8d36a171da35..9616af397751 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -1,1483 +1,519 @@
-# coding=utf-8
-# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
-# Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" PyTorch Informer model."""
-
-import random
-from dataclasses import dataclass
-from typing import Callable, Dict, List, Optional, Tuple, Union
+from math import sqrt
+from typing import List, Optional
 
+import numpy as np
 import torch
-from torch import nn
-from torch.distributions import (
-    AffineTransform,
-    Distribution,
-    Independent,
-    NegativeBinomial,
-    Normal,
-    StudentT,
-    TransformedDistribution,
-)
+import torch.nn as nn
+import torch.nn.functional as F
+from gluonts.torch.distributions import DistributionOutput, StudentTOutput
+from gluonts.torch.modules.feature import FeatureEmbedder
+from gluonts.torch.modules.scaler import MeanScaler, NOPScaler
 
-from ...activations import ACT2FN
-from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions, ModelOutput
-from ...modeling_utils import PreTrainedModel
-from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
-from .configuration_informer import InformerConfig
 
-
-logger = logging.get_logger(__name__)
-
-_CONFIG_FOR_DOC = "InformerConfig"
-
-
-INFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
-    "elisim/informer",
-    # See all Informer models at https://huggingface.co/models?filter=informer
-]
-
-
-
-class AffineTransformed(TransformedDistribution):
-    def __init__(self, base_distribution: Distribution, loc=None, scale=None, event_dim=0):
-        self.scale = 1.0 if scale is None else scale
-        self.loc = 0.0 if loc is None else loc
-
-        super().__init__(base_distribution, [AffineTransform(loc=self.loc, scale=self.scale, event_dim=event_dim)])
+class TriangularCausalMask:
+    def __init__(self, B, L, device="cpu"):
+        mask_shape = [B, 1, L, L]
+        with torch.no_grad():
+            self._mask = torch.triu(
+                torch.ones(mask_shape, dtype=torch.bool), diagonal=1
+            ).to(device)
 
     @property
-    def mean(self):
-        """
-        Returns the mean of the distribution.
-        """
-        return self.base_dist.mean * self.scale + self.loc
+    def mask(self):
+        return self._mask
 
-    @property
-    def variance(self):
-        """
-        Returns the variance of the distribution.
-        """
-        return self.base_dist.variance * self.scale**2
+
+class ProbMask:
+    def __init__(self, B, H, L, index, scores, device="cpu"):
+        _mask = torch.ones(L, scores.shape[-1], dtype=torch.bool).to(device).triu(1)
+        _mask_ex = _mask[None, None, :].expand(B, H, L, scores.shape[-1])
+        indicator = _mask_ex[
+            torch.arange(B)[:, None, None], torch.arange(H)[None, :, None], index, :
+        ].to(device)
+        self._mask = indicator.view(scores.shape).to(device)
 
     @property
-    def stddev(self):
-        """
-        Returns the standard deviation of the distribution.
-        """
-        return self.variance.sqrt()
+    def mask(self):
+        return self._mask
 
 
-class ParameterProjection(nn.Module):
+class FullAttention(nn.Module):
     def __init__(
-        self, in_features: int, args_dim: Dict[str, int], domain_map: Callable[..., Tuple[torch.Tensor]], **kwargs
-    ) -> None:
-        super().__init__(**kwargs)
-        self.args_dim = args_dim
-        self.proj = nn.ModuleList([nn.Linear(in_features, dim) for dim in args_dim.values()])
-        self.domain_map = domain_map
-
-    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor]:
-        params_unbounded = [proj(x) for proj in self.proj]
-
-        return self.domain_map(*params_unbounded)
-
-
-class LambdaLayer(nn.Module):
-    def __init__(self, function):
-        super().__init__()
-        self.function = function
-
-    def forward(self, x, *args):
-        return self.function(x, *args)
-
-
-class DistributionOutput:
-    distribution_class: type
-    in_features: int
-    args_dim: Dict[str, int]
-
-    def __init__(self, dim: int = 1) -> None:
-        self.dim = dim
-        self.args_dim = {k: dim * self.args_dim[k] for k in self.args_dim}
-
-    def _base_distribution(self, distr_args):
-        if self.dim == 1:
-            return self.distribution_class(*distr_args)
-        else:
-            return Independent(self.distribution_class(*distr_args), 1)
-
-    def distribution(
         self,
-        distr_args,
-        loc: Optional[torch.Tensor] = None,
-        scale: Optional[torch.Tensor] = None,
-    ) -> Distribution:
-        distr = self._base_distribution(distr_args)
-        if loc is None and scale is None:
-            return distr
-        else:
-            return AffineTransformed(distr, loc=loc, scale=scale, event_dim=self.event_dim)
-
-    @property
-    def event_shape(self) -> Tuple:
-        r"""
-        Shape of each individual event contemplated by the distributions that this object constructs.
-        """
-        return () if self.dim == 1 else (self.dim,)
-
-    @property
-    def event_dim(self) -> int:
-        r"""
-        Number of event dimensions, i.e., length of the `event_shape` tuple, of the distributions that this object
-        constructs.
-        """
-        return len(self.event_shape)
-
-    @property
-    def value_in_support(self) -> float:
-        r"""
-        A float that will have a valid numeric value when computing the log-loss of the corresponding distribution. By
-        default 0.0. This value will be used when padding data series.
-        """
-        return 0.0
-
-    def get_parameter_projection(self, in_features: int) -> nn.Module:
-        r"""
-        Return the parameter projection layer that maps the input to the appropriate parameters of the distribution.
-        """
-        return ParameterProjection(
-            in_features=in_features,
-            args_dim=self.args_dim,
-            domain_map=LambdaLayer(self.domain_map),
-        )
-
-    def domain_map(self, *args: torch.Tensor):
-        r"""
-        Converts arguments to the right shape and domain. The domain depends on the type of distribution, while the
-        correct shape is obtained by reshaping the trailing axis in such a way that the returned tensors define a
-        distribution of the right event_shape.
-        """
-        raise NotImplementedError()
-
-    @classmethod
-    def squareplus(cls, x: torch.Tensor) -> torch.Tensor:
-        r"""
-        Helper to map inputs to the positive orthant by applying the square-plus operation. Reference:
-        https://twitter.com/jon_barron/status/1387167648669048833
-        """
-        return (x + torch.sqrt(torch.square(x) + 4.0)) / 2.0
-
-
-class StudentTOutput(DistributionOutput):
-    args_dim: Dict[str, int] = {"df": 1, "loc": 1, "scale": 1}
-    distribution_class: type = StudentT
-
-    @classmethod
-    def domain_map(cls, df: torch.Tensor, loc: torch.Tensor, scale: torch.Tensor):
-        scale = cls.squareplus(scale)
-        df = 2.0 + cls.squareplus(df)
-        return df.squeeze(-1), loc.squeeze(-1), scale.squeeze(-1)
-
-
-class NormalOutput(DistributionOutput):
-    args_dim: Dict[str, int] = {"loc": 1, "scale": 1}
-    distribution_class: type = Normal
-
-    @classmethod
-    def domain_map(cls, loc: torch.Tensor, scale: torch.Tensor):
-        scale = cls.squareplus(scale)
-        return loc.squeeze(-1), scale.squeeze(-1)
-
-
-class NegativeBinomialOutput(DistributionOutput):
-    args_dim: Dict[str, int] = {"total_count": 1, "logits": 1}
-    distribution_class: type = NegativeBinomial
-
-    @classmethod
-    def domain_map(cls, total_count: torch.Tensor, logits: torch.Tensor):
-        total_count = cls.squareplus(total_count)
-        return total_count.squeeze(-1), logits.squeeze(-1)
-
-    def _base_distribution(self, distr_args) -> Distribution:
-        total_count, logits = distr_args
-        if self.dim == 1:
-            return self.distribution_class(total_count=total_count, logits=logits)
-        else:
-            return Independent(self.distribution_class(total_count=total_count, logits=logits), 1)
-
-    # Overwrites the parent class method. We cannot scale using the affine
-    # transformation since negative binomial should return integers. Instead
-    # we scale the parameters.
-    def distribution(
-        self, distr_args, loc: Optional[torch.Tensor] = None, scale: Optional[torch.Tensor] = None
-    ) -> Distribution:
-        total_count, logits = distr_args
-
-        if scale is not None:
-            # See scaling property of Gamma.
-            logits += scale.log()
+        mask_flag=True,
+        factor=5,
+        scale=None,
+        attention_dropout=0.1,
+        output_attention=False,
+    ):
+        super(FullAttention, self).__init__()
+        self.scale = scale
+        self.mask_flag = mask_flag
+        self.output_attention = output_attention
+        self.dropout = nn.Dropout(attention_dropout)
 
-        return self._base_distribution((total_count, logits))
+    def forward(self, queries, keys, values, attn_mask):
+        B, L, H, E = queries.shape
+        _, S, _, D = values.shape
+        scale = self.scale or 1.0 / sqrt(E)
 
+        scores = torch.einsum("blhe,bshe->bhls", queries, keys)
+        if self.mask_flag:
+            if attn_mask is None:
+                attn_mask = TriangularCausalMask(B, L, device=queries.device)
 
-class FeatureEmbedder(nn.Module):
-    def __init__(self, cardinalities: List[int], embedding_dims: List[int]) -> None:
-        super().__init__()
+            scores.masked_fill_(attn_mask.mask, -np.inf)
 
-        self.num_features = len(cardinalities)
-        self.embedders = nn.ModuleList([nn.Embedding(c, d) for c, d in zip(cardinalities, embedding_dims)])
+        A = self.dropout(torch.softmax(scale * scores, dim=-1))
+        V = torch.einsum("bhls,bshd->blhd", A, values)
 
-    def forward(self, features: torch.Tensor) -> torch.Tensor:
-        if self.num_features > 1:
-            # we slice the last dimension, giving an array of length
-            # self.num_features with shape (N,T) or (N)
-            cat_feature_slices = torch.chunk(features, self.num_features, dim=-1)
+        if self.output_attention:
+            return (V.contiguous(), A)
         else:
-            cat_feature_slices = [features]
-
-        return torch.cat(
-            [
-                embed(cat_feature_slice.squeeze(-1))
-                for embed, cat_feature_slice in zip(self.embedders, cat_feature_slices)
-            ],
-            dim=-1,
-        )
+            return (V.contiguous(), None)
 
 
-class MeanScaler(nn.Module):
-    """
-    Computes a scaling factor as the weighted average absolute value along dimension `dim`, and scales the data
-    accordingly.
-
-    Args:
-        dim (`int`):
-            Dimension along which to compute the scale.
-        keepdim (`bool`, *optional*, defaults to `False`):
-            Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it.
-        minimum_scale (`float`, *optional*, defaults to 1e-10):
-            Default scale that is used for elements that are constantly zero along dimension `dim`.
-    """
-
-    def __init__(self, dim: int, keepdim: bool = False, minimum_scale: float = 1e-10):
-        super().__init__()
-        if not dim > 0:
-            raise ValueError("Cannot compute scale along dim = 0 (batch dimension), please provide dim > 0")
-        self.dim = dim
-        self.keepdim = keepdim
-        self.register_buffer("minimum_scale", torch.tensor(minimum_scale))
-
-    def forward(self, data: torch.Tensor, weights: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
-        # these will have shape (N, C)
-        total_weight = weights.sum(dim=self.dim)
-        weighted_sum = (data.abs() * weights).sum(dim=self.dim)
-
-        # first compute a global scale per-dimension
-        total_observed = total_weight.sum(dim=0)
-        denominator = torch.max(total_observed, torch.ones_like(total_observed))
-        default_scale = weighted_sum.sum(dim=0) / denominator
-
-        # then compute a per-item, per-dimension scale
-        denominator = torch.max(total_weight, torch.ones_like(total_weight))
-        scale = weighted_sum / denominator
-
-        # use per-batch scale when no element is observed
-        # or when the sequence contains only zeros
-        scale = (
-            torch.max(
-                self.minimum_scale,
-                torch.where(
-                    weighted_sum > torch.zeros_like(weighted_sum),
-                    scale,
-                    default_scale * torch.ones_like(total_weight),
-                ),
-            )
-            .detach()
-            .unsqueeze(dim=self.dim)
-        )
-
-        return data / scale, scale if self.keepdim else scale.squeeze(dim=self.dim)
-
-
-class NOPScaler(nn.Module):
-    """
-    Assigns a scaling factor equal to 1 along dimension `dim`, and therefore applies no scaling to the input data.
-
-    Args:
-        dim (`int`):
-            Dimension along which to compute the scale.
-        keepdim (`bool`, *optional*, defaults to `False`):
-            Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it.
-    """
-
-    def __init__(self, dim: int, keepdim: bool = False):
-        super().__init__()
-        self.dim = dim
-        self.keepdim = keepdim
-
-    def forward(self, data: torch.Tensor, observed_indicator: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
-        scale = torch.ones_like(data).mean(dim=self.dim, keepdim=self.keepdim)
-        return data, scale
-
-
-def weighted_average(input_tensor: torch.Tensor, weights: Optional[torch.Tensor] = None, dim=None) -> torch.Tensor:
-    """
-    Computes the weighted average of a given tensor across a given `dim`, masking values associated with weight zero,
-    meaning instead of `nan * 0 = nan` you will get `0 * 0 = 0`.
-
-    Args:
-        input_tensor (`torch.FloatTensor`):
-            Input tensor, of which the average must be computed.
-        weights (`torch.FloatTensor`, *optional*):
-            Weights tensor, of the same shape as `input_tensor`.
-        dim (`int`, *optional*):
-            The dim along which to average `input_tensor`.
-
-    Returns:
-        `torch.FloatTensor`: The tensor with values averaged along the specified `dim`.
-    """
-    if weights is not None:
-        weighted_tensor = torch.where(weights != 0, input_tensor * weights, torch.zeros_like(input_tensor))
-        sum_weights = torch.clamp(weights.sum(dim=dim) if dim else weights.sum(), min=1.0)
-        return (weighted_tensor.sum(dim=dim) if dim else weighted_tensor.sum()) / sum_weights
-    else:
-        return input_tensor.mean(dim=dim)
-
-
-class NegativeLogLikelihood:
-    """
-    Computes the negative log likelihood loss from input distribution with respect to target.
-    """
-
-    def __call__(self, input: torch.distributions.Distribution, target: torch.Tensor) -> torch.Tensor:
-        return -input.log_prob(target)
-
-
-# Copied from transformers.models.bart.modeling_bart._make_causal_mask
-def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_values_length: int = 0):
-    """
-    Make causal mask used for bi-directional self-attention.
-    """
-    bsz, tgt_len = input_ids_shape
-    mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min))
-    mask_cond = torch.arange(mask.size(-1))
-    mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
-    mask = mask.to(dtype)
-
-    if past_key_values_length > 0:
-        mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype), mask], dim=-1)
-    return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
-
-
-# Copied from transformers.models.bart.modeling_bart._expand_mask
-def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
-    """
-    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
-    """
-    bsz, src_len = mask.size()
-    tgt_len = tgt_len if tgt_len is not None else src_len
-
-    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
-
-    inverted_mask = 1.0 - expanded_mask
-
-    return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
-
-
-@dataclass
-class Seq2SeqTimeSeriesModelOutput(ModelOutput):
-    """
-    Base class for model encoder's outputs that also contains pre-computed hidden states that can speed up sequential
-    decoding.
-
-    Args:
-        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the output of the last layer of the decoder of the model.
-
-            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
-            hidden_size)` is output.
-        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
-            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
-            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
-
-            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
-            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
-        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
-            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the decoder at the output of each layer plus the optional initial embedding outputs.
-        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
-            self-attention heads.
-        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
-            weighted average in the cross-attention heads.
-        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-            Sequence of hidden-states at the output of the last layer of the encoder of the model.
-        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
-            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the encoder at the output of each layer plus the optional initial embedding outputs.
-        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
-            self-attention heads.
-        scale: (`torch.FloatTensor` of shape `(batch_size,)`, *optional*):
-            Scaling values of each time series' context window which is used to give the model inputs of the same
-            magnitude and then used to rescale to the original scale.
-        static_features: (`torch.FloatTensor` of shape `(batch_size, feature size)`, *optional*):
-            Static features of each time series' in a batch which are copied to the covariates at inference time.
-    """
-
-    last_hidden_state: torch.FloatTensor = None
-    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
-    decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
-    decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
-    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
-    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
-    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
-    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
-    scale: Optional[torch.FloatTensor] = None
-    static_features: Optional[torch.FloatTensor] = None
-
-
-@dataclass
-class Seq2SeqTimeSeriesPredictionOutput(ModelOutput):
-    """
-    Base class for model's predictions outputs that also contain the loss as well parameters of the chosen
-    distribution.
-
-    Args:
-        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when a `future_values` is provided):
-            Distributional loss.
-        params (`torch.FloatTensor` of shape `(batch_size, num_samples, num_params)`):
-            Parameters of the chosen distribution.
-        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
-            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
-            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
-
-            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
-            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
-        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
-            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
-        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
-            self-attention heads.
-        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
-            weighted average in the cross-attention heads.
-        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-            Sequence of hidden-states at the output of the last layer of the encoder of the model.
-        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
-            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
-        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
-            self-attention heads.
-        scale: (`torch.FloatTensor` of shape `(batch_size,)`, *optional*):
-            Scaling values of each time series' context window which is used to give the model inputs of the same
-            magnitude and then used to rescale to the original scale.
-        static_features: (`torch.FloatTensor` of shape `(batch_size, feature size)`, *optional*):
-            Static features of each time series' in a batch which are copied to the covariates at inference time.
-    """
-
-    loss: Optional[torch.FloatTensor] = None
-    params: Optional[Tuple[torch.FloatTensor]] = None
-    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
-    decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
-    decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
-    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
-    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
-    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
-    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
-    scale: Optional[torch.FloatTensor] = None
-    static_features: Optional[torch.FloatTensor] = None
-
-
-@dataclass
-class SampleTimeSeriesPredictionOutput(ModelOutput):
-    sequences: torch.FloatTensor = None
-
-
-# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->Informer
-class InformerAttention(nn.Module):
-    """Multi-headed attention from 'Attention Is All You Need' paper"""
-
+class ProbAttention(nn.Module):
     def __init__(
         self,
-        embed_dim: int,
-        num_heads: int,
-        dropout: float = 0.0,
-        is_decoder: bool = False,
-        bias: bool = True,
+        mask_flag=True,
+        factor=5,
+        scale=None,
+        attention_dropout=0.1,
+        output_attention=False,
     ):
-        super().__init__()
-        self.embed_dim = embed_dim
-        self.num_heads = num_heads
-        self.dropout = dropout
-        self.head_dim = embed_dim // num_heads
-
-        if (self.head_dim * num_heads) != self.embed_dim:
-            raise ValueError(
-                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
-                f" and `num_heads`: {num_heads})."
-            )
-        self.scaling = self.head_dim**-0.5
-        self.is_decoder = is_decoder
-
-        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
-        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
-        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
-        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
-
-    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
-        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        key_value_states: Optional[torch.Tensor] = None,
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        layer_head_mask: Optional[torch.Tensor] = None,
-        output_attentions: bool = False,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-        """Input shape: Batch x Time x Channel"""
-
-        # if key_value_states are provided this layer is used as a cross-attention layer
-        # for the decoder
-        is_cross_attention = key_value_states is not None
-
-        bsz, tgt_len, _ = hidden_states.size()
-
-        # get query proj
-        query_states = self.q_proj(hidden_states) * self.scaling
-        # get key, value proj
-        # `past_key_value[0].shape[2] == key_value_states.shape[1]`
-        # is checking that the `sequence_length` of the `past_key_value` is the same as
-        # the provided `key_value_states` to support prefix tuning
-        if (
-            is_cross_attention
-            and past_key_value is not None
-            and past_key_value[0].shape[2] == key_value_states.shape[1]
-        ):
-            # reuse k,v, cross_attentions
-            key_states = past_key_value[0]
-            value_states = past_key_value[1]
-        elif is_cross_attention:
-            # cross_attentions
-            key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
-            value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
-        elif past_key_value is not None:
-            # reuse k, v, self_attention
-            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
-            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
-            key_states = torch.cat([past_key_value[0], key_states], dim=2)
-            value_states = torch.cat([past_key_value[1], value_states], dim=2)
-        else:
-            # self_attention
-            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
-            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
-
-        if self.is_decoder:
-            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
-            # Further calls to cross_attention layer can then reuse all cross-attention
-            # key/value_states (first "if" case)
-            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
-            # all previous decoder key/value_states. Further calls to uni-directional self-attention
-            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
-            # if encoder bi-directional self-attention `past_key_value` is always `None`
-            past_key_value = (key_states, value_states)
-
-        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
-        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
-        key_states = key_states.view(*proj_shape)
-        value_states = value_states.view(*proj_shape)
-
-        src_len = key_states.size(1)
-        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
-
-        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
-            raise ValueError(
-                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
-                f" {attn_weights.size()}"
-            )
-
-        if attention_mask is not None:
-            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
-                raise ValueError(
-                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
-                )
-            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
-            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
-
-        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+        super(ProbAttention, self).__init__()
+        self.factor = factor
+        self.scale = scale
+        self.mask_flag = mask_flag
+        self.output_attention = output_attention
+        self.dropout = nn.Dropout(attention_dropout)
+
+    def _prob_QK(self, Q, K, sample_k, n_top):  # n_top: c*ln(L_q)
+        # Q [B, H, L, D]
+        B, H, L_K, E = K.shape
+        _, _, L_Q, _ = Q.shape
+
+        # calculate the sampled Q_K
+        K_expand = K.unsqueeze(-3).expand(B, H, L_Q, L_K, E)
+        index_sample = torch.randint(
+            L_K, (L_Q, sample_k)
+        )  # real U = U_part(factor*ln(L_k))*L_q
+        K_sample = K_expand[:, :, torch.arange(L_Q).unsqueeze(1), index_sample, :]
+        Q_K_sample = torch.matmul(Q.unsqueeze(-2), K_sample.transpose(-2, -1)).squeeze(
+            -2
+        )
 
-        if layer_head_mask is not None:
-            if layer_head_mask.size() != (self.num_heads,):
-                raise ValueError(
-                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
-                    f" {layer_head_mask.size()}"
-                )
-            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
-            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
-
-        if output_attentions:
-            # this operation is a bit awkward, but it's required to
-            # make sure that attn_weights keeps its gradient.
-            # In order to do so, attn_weights have to be reshaped
-            # twice and have to be reused in the following
-            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
-            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
+        # find the Top_k query with sparisty measurement
+        M = Q_K_sample.max(-1)[0] - torch.div(Q_K_sample.sum(-1), L_K)
+        M_top = M.topk(n_top, sorted=False)[1]
+
+        # use the reduced Q to calculate Q_K
+        Q_reduce = Q[
+            torch.arange(B)[:, None, None], torch.arange(H)[None, :, None], M_top, :
+        ]  # factor*ln(L_q)
+        Q_K = torch.matmul(Q_reduce, K.transpose(-2, -1))  # factor*ln(L_q)*L_k
+
+        return Q_K, M_top
+
+    def _get_initial_context(self, V, L_Q):
+        B, H, L_V, D = V.shape
+        if not self.mask_flag:
+            # V_sum = V.sum(dim=-2)
+            V_sum = V.mean(dim=-2)
+            contex = V_sum.unsqueeze(-2).expand(B, H, L_Q, V_sum.shape[-1]).clone()
+        else:  # use mask
+            assert L_Q == L_V  # requires that L_Q == L_V, i.e. for self-attention only
+            contex = V.cumsum(dim=-2)
+        return contex
+
+    def _update_context(self, context_in, V, scores, index, L_Q, attn_mask):
+        B, H, L_V, D = V.shape
+
+        if self.mask_flag:
+            attn_mask = ProbMask(B, H, L_Q, index, scores, device=V.device)
+            scores.masked_fill_(attn_mask.mask, -np.inf)
+
+        attn = torch.softmax(scores, dim=-1)  # nn.Softmax(dim=-1)(scores)
+
+        context_in[
+            torch.arange(B)[:, None, None], torch.arange(H)[None, :, None], index, :
+        ] = torch.matmul(attn, V).type_as(context_in)
+        if self.output_attention:
+            attns = (torch.ones([B, H, L_V, L_V]) / L_V).type_as(attn).to(attn.device)
+            attns[
+                torch.arange(B)[:, None, None], torch.arange(H)[None, :, None], index, :
+            ] = attn
+            return (context_in, attns)
         else:
-            attn_weights_reshaped = None
+            return (context_in, None)
 
-        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+    def forward(self, queries, keys, values, attn_mask):
+        B, L_Q, H, D = queries.shape
+        _, L_K, _, _ = keys.shape
 
-        attn_output = torch.bmm(attn_probs, value_states)
+        queries = queries.transpose(2, 1)
+        keys = keys.transpose(2, 1)
+        values = values.transpose(2, 1)
 
-        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
-            raise ValueError(
-                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
-                f" {attn_output.size()}"
-            )
+        U_part = self.factor * np.ceil(np.log1p(L_K)).astype("int").item()  # c*ln(L_k)
+        u = self.factor * np.ceil(np.log1p(L_Q)).astype("int").item()  # c*ln(L_q)
 
-        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
-        attn_output = attn_output.transpose(1, 2)
+        U_part = U_part if U_part < L_K else L_K
+        u = u if u < L_Q else L_Q
 
-        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
-        # partitioned aross GPUs when using tensor-parallelism.
-        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
+        scores_top, index = self._prob_QK(queries, keys, sample_k=U_part, n_top=u)
 
-        attn_output = self.out_proj(attn_output)
+        # add scale factor
+        scale = self.scale or 1.0 / sqrt(D)
+        if scale is not None:
+            scores_top = scores_top * scale
+        # get the context
+        context = self._get_initial_context(values, L_Q)
+        # update the context with selected top_k queries
+        context, attn = self._update_context(
+            context, values, scores_top, index, L_Q, attn_mask
+        )
 
-        return attn_output, attn_weights_reshaped, past_key_value
+        return context.transpose(2, 1).contiguous(), attn
 
 
-# Copied from transformers.models.bart.modeling_bart.BartEncoderLayer with Bart->Informer
-class InformerEncoderLayer(nn.Module):
-    def __init__(self, config: InformerConfig):
-        super().__init__()
-        self.embed_dim = config.d_model
-        self.self_attn = InformerAttention(
-            embed_dim=self.embed_dim,
-            num_heads=config.encoder_attention_heads,
-            dropout=config.attention_dropout,
+class AttentionLayer(nn.Module):
+    def __init__(
+        self, attention, d_model, n_heads, d_keys=None, d_values=None, mix=False
+    ):
+        super(AttentionLayer, self).__init__()
+
+        d_keys = d_keys or (d_model // n_heads)
+        d_values = d_values or (d_model // n_heads)
+
+        self.inner_attention = attention
+        self.query_projection = nn.Linear(d_model, d_keys * n_heads)
+        self.key_projection = nn.Linear(d_model, d_keys * n_heads)
+        self.value_projection = nn.Linear(d_model, d_values * n_heads)
+        self.out_projection = nn.Linear(d_values * n_heads, d_model)
+        self.n_heads = n_heads
+        self.mix = mix
+
+    def forward(self, queries, keys, values, attn_mask):
+        B, L, _ = queries.shape
+        _, S, _ = keys.shape
+        H = self.n_heads
+
+        queries = self.query_projection(queries).view(B, L, H, -1)
+        keys = self.key_projection(keys).view(B, S, H, -1)
+        values = self.value_projection(values).view(B, S, H, -1)
+
+        out, attn = self.inner_attention(queries, keys, values, attn_mask)
+        if self.mix:
+            out = out.transpose(2, 1).contiguous()
+        out = out.view(B, L, -1)
+
+        return self.out_projection(out), attn
+
+
+class ConvLayer(nn.Module):
+    def __init__(self, c_in):
+        super(ConvLayer, self).__init__()
+        self.downConv = nn.Conv1d(
+            in_channels=c_in,
+            out_channels=c_in,
+            kernel_size=3,
+            padding=1,
+            padding_mode="circular",
         )
-        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
-        self.dropout = config.dropout
-        self.activation_fn = ACT2FN[config.activation_function]
-        self.activation_dropout = config.activation_dropout
-        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
-        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
-        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
-
-    def forward(
-        self,
-        hidden_states: torch.FloatTensor,
-        attention_mask: torch.FloatTensor,
-        layer_head_mask: torch.FloatTensor,
-        output_attentions: Optional[bool] = False,
-    ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]:
-        """
-        Args:
-            hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
-            attention_mask (`torch.FloatTensor`): attention mask of size
-                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
-            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
-                `(encoder_attention_heads,)`.
-            output_attentions (`bool`, *optional*):
-                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
-                returned tensors for more detail.
-        """
-        residual = hidden_states
-        hidden_states, attn_weights, _ = self.self_attn(
-            hidden_states=hidden_states,
-            attention_mask=attention_mask,
-            layer_head_mask=layer_head_mask,
-            output_attentions=output_attentions,
+        self.norm = nn.BatchNorm1d(c_in)
+        self.activation = nn.ELU()
+        self.maxPool = nn.MaxPool1d(kernel_size=3, stride=2, padding=1)
+
+    def forward(self, x):
+        x = self.downConv(x.permute(0, 2, 1))
+        x = self.norm(x)
+        x = self.activation(x)
+        x = self.maxPool(x)
+        x = x.transpose(1, 2)
+        return x
+
+
+class EncoderLayer(nn.Module):
+    def __init__(self, attention, d_model, d_ff=None, dropout=0.1, activation="relu"):
+        super(EncoderLayer, self).__init__()
+        d_ff = d_ff or 4 * d_model
+        self.attention = attention
+        self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1)
+        self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1)
+        self.norm1 = nn.LayerNorm(d_model)
+        self.norm2 = nn.LayerNorm(d_model)
+        self.dropout = nn.Dropout(dropout)
+        self.activation = F.relu if activation == "relu" else F.gelu
+
+    def forward(self, x, attn_mask=None):
+        # x [B, L, D]
+        # x = x + self.dropout(self.attention(
+        #     x, x, x,
+        #     attn_mask = attn_mask
+        # ))
+        new_x, attn = self.attention(x, x, x, attn_mask=attn_mask)
+        x = x + self.dropout(new_x)
+
+        y = x = self.norm1(x)
+        y = self.dropout(self.activation(self.conv1(y.transpose(-1, 1))))
+        y = self.dropout(self.conv2(y).transpose(-1, 1))
+
+        return self.norm2(x + y), attn
+
+
+class Encoder(nn.Module):
+    def __init__(self, attn_layers, conv_layers=None, norm_layer=None):
+        super(Encoder, self).__init__()
+        self.attn_layers = nn.ModuleList(attn_layers)
+        self.conv_layers = (
+            nn.ModuleList(conv_layers) if conv_layers is not None else None
         )
-        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
-        hidden_states = residual + hidden_states
-        hidden_states = self.self_attn_layer_norm(hidden_states)
-
-        residual = hidden_states
-        hidden_states = self.activation_fn(self.fc1(hidden_states))
-        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
-        hidden_states = self.fc2(hidden_states)
-        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
-        hidden_states = residual + hidden_states
-        hidden_states = self.final_layer_norm(hidden_states)
-
-        if hidden_states.dtype == torch.float16 and (
-            torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
-        ):
-            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
-            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
-
-        outputs = (hidden_states,)
-
-        if output_attentions:
-            outputs += (attn_weights,)
-
-        return outputs
+        self.norm = norm_layer
+
+    def forward(self, x, attn_mask=None):
+        # x [B, L, D]
+        attns = []
+        if self.conv_layers is not None:
+            for attn_layer, conv_layer in zip(self.attn_layers, self.conv_layers):
+                x, attn = attn_layer(x, attn_mask=attn_mask)
+                x = conv_layer(x)
+                attns.append(attn)
+            x, attn = self.attn_layers[-1](x, attn_mask=attn_mask)
+            attns.append(attn)
+        else:
+            for attn_layer in self.attn_layers:
+                x, attn = attn_layer(x, attn_mask=attn_mask)
+                attns.append(attn)
 
+        if self.norm is not None:
+            x = self.norm(x)
 
-# Copied from transformers.models.bart.modeling_bart.BartDecoderLayer with Bart->Informer
-class InformerDecoderLayer(nn.Module):
-    def __init__(self, config: InformerConfig):
-        super().__init__()
-        self.embed_dim = config.d_model
+        return x, attns
 
-        self.self_attn = InformerAttention(
-            embed_dim=self.embed_dim,
-            num_heads=config.decoder_attention_heads,
-            dropout=config.attention_dropout,
-            is_decoder=True,
-        )
-        self.dropout = config.dropout
-        self.activation_fn = ACT2FN[config.activation_function]
-        self.activation_dropout = config.activation_dropout
-
-        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
-        self.encoder_attn = InformerAttention(
-            self.embed_dim,
-            config.decoder_attention_heads,
-            dropout=config.attention_dropout,
-            is_decoder=True,
-        )
-        self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
-        self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
-        self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
-        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
 
-    def forward(
+class DecoderLayer(nn.Module):
+    def __init__(
         self,
-        hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        encoder_hidden_states: Optional[torch.Tensor] = None,
-        encoder_attention_mask: Optional[torch.Tensor] = None,
-        layer_head_mask: Optional[torch.Tensor] = None,
-        cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
-        output_attentions: Optional[bool] = False,
-        use_cache: Optional[bool] = True,
-    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
-        """
-        Args:
-            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
-            attention_mask (`torch.FloatTensor`): attention mask of size
-                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
-            encoder_hidden_states (`torch.FloatTensor`):
-                cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
-            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
-                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
-            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
-                `(encoder_attention_heads,)`.
-            cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
-                size `(decoder_attention_heads,)`.
-            past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
-            output_attentions (`bool`, *optional*):
-                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
-                returned tensors for more detail.
-        """
-        residual = hidden_states
-
-        # Self Attention
-        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
-        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
-        # add present self-attn cache to positions 1,2 of present_key_value tuple
-        hidden_states, self_attn_weights, present_key_value = self.self_attn(
-            hidden_states=hidden_states,
-            past_key_value=self_attn_past_key_value,
-            attention_mask=attention_mask,
-            layer_head_mask=layer_head_mask,
-            output_attentions=output_attentions,
+        self_attention,
+        cross_attention,
+        d_model,
+        d_ff=None,
+        dropout=0.1,
+        activation="relu",
+    ):
+        super(DecoderLayer, self).__init__()
+        d_ff = d_ff or 4 * d_model
+        self.self_attention = self_attention
+        self.cross_attention = cross_attention
+        self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1)
+        self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1)
+        self.norm1 = nn.LayerNorm(d_model)
+        self.norm2 = nn.LayerNorm(d_model)
+        self.norm3 = nn.LayerNorm(d_model)
+        self.dropout = nn.Dropout(dropout)
+        self.activation = F.relu if activation == "relu" else F.gelu
+
+    def forward(self, x, cross, x_mask=None, cross_mask=None):
+        x = x + self.dropout(self.self_attention(x, x, x, attn_mask=x_mask)[0])
+        x = self.norm1(x)
+
+        x = x + self.dropout(
+            self.cross_attention(x, cross, cross, attn_mask=cross_mask)[0]
         )
-        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
-        hidden_states = residual + hidden_states
-        hidden_states = self.self_attn_layer_norm(hidden_states)
-
-        # Cross-Attention Block
-        cross_attn_present_key_value = None
-        cross_attn_weights = None
-        if encoder_hidden_states is not None:
-            residual = hidden_states
-
-            # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple
-            cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
-            hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn(
-                hidden_states=hidden_states,
-                key_value_states=encoder_hidden_states,
-                attention_mask=encoder_attention_mask,
-                layer_head_mask=cross_attn_layer_head_mask,
-                past_key_value=cross_attn_past_key_value,
-                output_attentions=output_attentions,
-            )
-            hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
-            hidden_states = residual + hidden_states
-            hidden_states = self.encoder_attn_layer_norm(hidden_states)
-
-            # add cross-attn to positions 3,4 of present_key_value tuple
-            present_key_value = present_key_value + cross_attn_present_key_value
-
-        # Fully Connected
-        residual = hidden_states
-        hidden_states = self.activation_fn(self.fc1(hidden_states))
-        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
-        hidden_states = self.fc2(hidden_states)
-        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
-        hidden_states = residual + hidden_states
-        hidden_states = self.final_layer_norm(hidden_states)
-
-        outputs = (hidden_states,)
-
-        if output_attentions:
-            outputs += (self_attn_weights, cross_attn_weights)
-
-        if use_cache:
-            outputs += (present_key_value,)
-
-        return outputs
-
-
-# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerPreTrainedModel with TimeSeriesTransformer->Informer
-class InformerPreTrainedModel(PreTrainedModel):
-    config_class = InformerConfig
-    base_model_prefix = "model"
-    main_input_name = "past_values"
-    supports_gradient_checkpointing = True
-
-    def _init_weights(self, module):
-        std = self.config.init_std
-        if isinstance(module, nn.Linear):
-            module.weight.data.normal_(mean=0.0, std=std)
-            if module.bias is not None:
-                module.bias.data.zero_()
-        elif isinstance(module, nn.Embedding):
-            module.weight.data.normal_(mean=0.0, std=std)
-            if module.padding_idx is not None:
-                module.weight.data[module.padding_idx].zero_()
-
-    def _set_gradient_checkpointing(self, module, value=False):
-        if isinstance(module, (InformerDecoder, InformerEncoder)):
-            module.gradient_checkpointing = value
-
-
-INFORMER_START_DOCSTRING = r"""
-    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
-    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
-    etc.)
-
-    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
-    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
-    and behavior.
 
-    Parameters:
-        config ([`InformerConfig`]):
-            Model configuration class with all the parameters of the model. Initializing with a config file does not
-            load the weights associated with the model, only the configuration. Check out the
-            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
-"""
+        y = x = self.norm2(x)
+        y = self.dropout(self.activation(self.conv1(y.transpose(-1, 1))))
+        y = self.dropout(self.conv2(y).transpose(-1, 1))
 
-INFORMER_INPUTS_DOCSTRING = r"""
-    Args:
-        past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
-            Past values of the time series, that serve as context in order to predict the future. These values may
-            contain lags, i.e. additional values from the past which are added in order to serve as "extra context".
-            The `past_values` is what the Transformer encoder gets as input (with optional additional features, such as
-            `static_categorical_features`, `static_real_features`, `past_time_features`).
+        return self.norm3(x + y)
 
-            The sequence length here is equal to `context_length` + `max(config.lags_sequence)`.
 
-            Missing values need to be replaced with zeros.
+class Decoder(nn.Module):
+    def __init__(self, layers, norm_layer=None):
+        super(Decoder, self).__init__()
+        self.layers = nn.ModuleList(layers)
+        self.norm = norm_layer
 
-        past_time_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_features)`, *optional*):
-            Optional time features, which the model internally will add to `past_values`. These could be things like
-            "month of year", "day of the month", etc. encoded as vectors (for instance as Fourier features). These
-            could also be so-called "age" features, which basically help the model know "at which point in life" a
-            time-series is. Age features have small values for distant past time steps and increase monotonically the
-            more we approach the current time step.
+    def forward(self, x, cross, x_mask=None, cross_mask=None):
+        for layer in self.layers:
+            x = layer(x, cross, x_mask=x_mask, cross_mask=cross_mask)
 
-            These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT, where
-            the position encodings are learned from scratch internally as parameters of the model, the Time Series
-            Transformer requires to provide additional time features.
+        if self.norm is not None:
+            x = self.norm(x)
 
-            The Informer only learns additional embeddings for `static_categorical_features`.
+        return x
 
-        past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected in
-            `[0, 1]`:
 
-            - 1 for values that are **observed**,
-            - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
-
-        static_categorical_features (`torch.LongTensor` of shape `(batch_size, number of static categorical features)`, *optional*):
-            Optional static categorical features for which the model will learn an embedding, which it will add to the
-            values of the time series.
-
-            Static categorical features are features which have the same value for all time steps (static over time).
-
-            A typical example of a static categorical feature is a time series ID.
-
-        static_real_features (`torch.FloatTensor` of shape `(batch_size, number of static real features)`, *optional*):
-            Optional static real features which the model will add to the values of the time series.
-
-            Static real features are features which have the same value for all time steps (static over time).
-
-            A typical example of a static real feature is promotion information.
-
-        future_values (`torch.FloatTensor` of shape `(batch_size, prediction_length)`):
-            Future values of the time series, that serve as labels for the model. The `future_values` is what the
-            Transformer needs to learn to output, given the `past_values`.
-
-            See the demo notebook and code snippets for details.
-
-            Missing values need to be replaced with zeros.
-
-        future_time_features (`torch.FloatTensor` of shape `(batch_size, prediction_length, num_features)`, *optional*):
-            Optional time features, which the model internally will add to `future_values`. These could be things like
-            "month of year", "day of the month", etc. encoded as vectors (for instance as Fourier features). These
-            could also be so-called "age" features, which basically help the model know "at which point in life" a
-            time-series is. Age features have small values for distant past time steps and increase monotonically the
-            more we approach the current time step.
-
-            These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT, where
-            the position encodings are learned from scratch internally as parameters of the model, the Time Series
-            Transformer requires to provide additional features.
-
-            The Informer only learns additional embeddings for `static_categorical_features`.
-
-        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Mask to avoid performing attention on certain token indices. Mask values selected in `[0, 1]`:
-
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
-
-            [What are attention masks?](../glossary#attention-mask)
-
-        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
-            Mask to avoid performing attention on certain token indices. By default, a causal mask will be used, to
-            make sure the model can only look at previous inputs in order to predict the future.
-
-        head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
-            Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
-
-            - 1 indicates the head is **not masked**,
-            - 0 indicates the head is **masked**.
-
-        decoder_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
-            Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:
-
-            - 1 indicates the head is **not masked**,
-            - 0 indicates the head is **masked**.
-
-        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
-            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
-
-            - 1 indicates the head is **not masked**,
-            - 0 indicates the head is **masked**.
-
-        encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
-            Tuple consists of `last_hidden_state`, `hidden_states` (*optional*) and `attentions` (*optional*)
-            `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` (*optional*) is a sequence of
-            hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
-        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
-            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
-            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
-
-            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
-            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
-
-            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
-            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
-            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
-        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
-            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
-            model's internal embedding lookup matrix.
-
-        use_cache (`bool`, *optional*):
-            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
-            `past_key_values`).
-        output_attentions (`bool`, *optional*):
-            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
-            tensors for more detail.
-        output_hidden_states (`bool`, *optional*):
-            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
-            more detail.
-        return_dict (`bool`, *optional*):
-            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-"""
-
-
-# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerEncoder with TimeSeriesTransformer->Informer
-class InformerEncoder(InformerPreTrainedModel):
-    """
-    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
-    [`InformerEncoderLayer`].
-
-    Args:
-        config: InformerConfig
-    """
-
-    def __init__(self, config: InformerConfig):
-        super().__init__(config)
-
-        self.dropout = config.dropout
-        self.layerdrop = config.encoder_layerdrop
-
-        embed_dim = config.d_model
-
-        self.layers = nn.ModuleList([InformerEncoderLayer(config) for _ in range(config.encoder_layers)])
-        self.layernorm_embedding = nn.LayerNorm(embed_dim)
-
-        self.gradient_checkpointing = False
-        # Initialize weights and apply final processing
-        self.post_init()
-
-    def forward(
+class InformerModel(nn.Module):
+    def __init__(  # add loss param
         self,
-        attention_mask: Optional[torch.Tensor] = None,
-        head_mask: Optional[torch.Tensor] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, BaseModelOutput]:
-        r"""
-        Args:
-            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
-                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
-                - 1 for tokens that are **not masked**,
-                - 0 for tokens that are **masked**.
-
-                [What are attention masks?](../glossary#attention-mask)
-            head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
-                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
-
-                - 1 indicates the head is **not masked**,
-                - 0 indicates the head is **masked**.
-
-            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
-                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
-                than the model's internal embedding lookup matrix.
-            output_attentions (`bool`, *optional*):
-                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
-                returned tensors for more detail.
-            output_hidden_states (`bool`, *optional*):
-                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
-                for more detail.
-            return_dict (`bool`, *optional*):
-                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-        """
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        hidden_states = inputs_embeds
-        hidden_states = self.layernorm_embedding(hidden_states)
-        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
-
-        # expand attention_mask
-        if attention_mask is not None:
-            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
-            attention_mask = _expand_mask(attention_mask, inputs_embeds.dtype)
-
-        encoder_states = () if output_hidden_states else None
-        all_attentions = () if output_attentions else None
-
-        # check if head_mask has a correct number of layers specified if desired
-        if head_mask is not None:
-            if head_mask.size()[0] != (len(self.layers)):
-                raise ValueError(
-                    f"The head_mask should be specified for {len(self.layers)} layers, but it is for"
-                    f" {head_mask.size()[0]}."
-                )
-
-        for idx, encoder_layer in enumerate(self.layers):
-            if output_hidden_states:
-                encoder_states = encoder_states + (hidden_states,)
-            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
-            dropout_probability = random.uniform(0, 1)
-            if self.training and (dropout_probability < self.layerdrop):  # skip the layer
-                layer_outputs = (None, None)
-            else:
-                if self.gradient_checkpointing and self.training:
-
-                    def create_custom_forward(module):
-                        def custom_forward(*inputs):
-                            return module(*inputs, output_attentions)
-
-                        return custom_forward
-
-                    layer_outputs = torch.utils.checkpoint.checkpoint(
-                        create_custom_forward(encoder_layer),
-                        hidden_states,
-                        attention_mask,
-                        (head_mask[idx] if head_mask is not None else None),
-                    )
-                else:
-                    layer_outputs = encoder_layer(
-                        hidden_states,
-                        attention_mask,
-                        layer_head_mask=(head_mask[idx] if head_mask is not None else None),
-                        output_attentions=output_attentions,
-                    )
-
-                hidden_states = layer_outputs[0]
-
-            if output_attentions:
-                all_attentions = all_attentions + (layer_outputs[1],)
-
-        if output_hidden_states:
-            encoder_states = encoder_states + (hidden_states,)
-
-        if not return_dict:
-            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
-        return BaseModelOutput(
-            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
-        )
-
-
-# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerDecoder with TimeSeriesTransformer->Informer
-class InformerDecoder(InformerPreTrainedModel):
-    """
-    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a
-    [`InformerDecoderLayer`]
-
-    Args:
-        config: InformerConfig
-    """
-
-    def __init__(self, config: InformerConfig):
-        super().__init__(config)
-        self.dropout = config.dropout
-        self.layerdrop = config.decoder_layerdrop
-
-        self.layers = nn.ModuleList([InformerDecoderLayer(config) for _ in range(config.decoder_layers)])
-        self.layernorm_embedding = nn.LayerNorm(config.d_model)
-
-        self.gradient_checkpointing = False
-        # Initialize weights and apply final processing
-        self.post_init()
-
-    def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
-        # create causal mask
-        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
-        combined_attention_mask = None
-        if input_shape[-1] > 1:
-            combined_attention_mask = _make_causal_mask(
-                input_shape, inputs_embeds.dtype, past_key_values_length=past_key_values_length
-            ).to(inputs_embeds.device)
-
-        if attention_mask is not None:
-            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
-            expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
-                inputs_embeds.device
-            )
-            combined_attention_mask = (
-                expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
-            )
+        freq: str, # frequency
+        context_length: int,
+        prediction_length: int,
+        num_feat_dynamic_real: int,  # num_dynamic_real_features
+        num_feat_static_real: int,  # num_static_real_features
+        num_feat_static_cat: int,  # num_static_categorical_features
+        cardinality: List[int],
+        # Informer arguments
+        nhead: int,
+        num_encoder_layers: int, # encoder_layers
+        num_decoder_layers: int, # decoder_layers
+        dim_feedforward: int,
+        activation: str = "gelu", # activation_function
+        dropout: float = 0.1,
+        attn: str = "prob",
+        factor: int = 5,
+        distil: bool = True,
+        # univariate input
+        input_size: int = 1,
+        embedding_dimension: Optional[List[int]] = None,
+        distr_output: DistributionOutput = StudentTOutput(),
+        lags_seq: Optional[List[int]] = None,
+        scaling: bool = True,
+        num_parallel_samples: int = 100,
+    ) -> None:
+        super().__init__()
 
-        return combined_attention_mask
+        self.input_size = input_size
 
-    def forward(
-        self,
-        attention_mask: Optional[torch.Tensor] = None,
-        encoder_hidden_states: Optional[torch.FloatTensor] = None,
-        encoder_attention_mask: Optional[torch.LongTensor] = None,
-        head_mask: Optional[torch.Tensor] = None,
-        cross_attn_head_mask: Optional[torch.Tensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
-        r"""
-        Args:
-            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
-                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
-                - 1 for tokens that are **not masked**,
-                - 0 for tokens that are **masked**.
-
-                [What are attention masks?](../glossary#attention-mask)
-            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
-                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
-                of the decoder.
-            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
-                Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
-                selected in `[0, 1]`:
-
-                - 1 for tokens that are **not masked**,
-                - 0 for tokens that are **masked**.
-
-                [What are attention masks?](../glossary#attention-mask)
-            head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
-                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
-
-                - 1 indicates the head is **not masked**,
-                - 0 indicates the head is **masked**.
-
-            cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
-                Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing
-                cross-attention on hidden heads. Mask values selected in `[0, 1]`:
-
-                - 1 indicates the head is **not masked**,
-                - 0 indicates the head is **masked**.
-
-            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-                Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
-                shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
-                shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
-
-                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
-                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
-
-                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
-                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
-                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
-
-            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
-                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
-                than the model's internal embedding lookup matrix.
-
-            output_attentions (`bool`, *optional*):
-                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
-                returned tensors for more detail.
-
-            output_hidden_states (`bool`, *optional*):
-                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
-                for more detail.
-
-            return_dict (`bool`, *optional*):
-                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-        """
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        self.target_shape = distr_output.event_shape
+        self.num_feat_dynamic_real = num_feat_dynamic_real
+        self.num_feat_static_cat = num_feat_static_cat
+        self.num_feat_static_real = num_feat_static_real
+        self.embedding_dimension = (
+            embedding_dimension
+            if embedding_dimension is not None or cardinality is None
+            else [min(50, (cat + 1) // 2) for cat in cardinality]
         )
-        use_cache = use_cache if use_cache is not None else self.config.use_cache
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        self.lags_seq = lags_seq or get_lags_for_frequency(freq_str=freq)
+        self.num_parallel_samples = num_parallel_samples
+        self.history_length = context_length + max(self.lags_seq)
+        self.embedder = FeatureEmbedder(
+            cardinalities=cardinality,
+            embedding_dims=self.embedding_dimension,
+        )
+        if scaling:
+            self.scaler = MeanScaler(dim=1, keepdim=True)
+        else:
+            self.scaler = NOPScaler(dim=1, keepdim=True)
 
-        input_shape = inputs_embeds.size()[:-1]
+        # total feature size
+        d_model = self.input_size * len(self.lags_seq) + self._number_of_features
 
-        # past_key_values_length
-        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
+        self.context_length = context_length
+        self.prediction_length = prediction_length
+        self.distr_output = distr_output
+        self.param_proj = distr_output.get_args_proj(d_model)
 
-        attention_mask = self._prepare_decoder_attention_mask(
-            attention_mask, input_shape, inputs_embeds, past_key_values_length
+        # Informer enc-decoder
+        Attn = ProbAttention if attn == "prob" else FullAttention
+        # Encoder
+        self.encoder = Encoder(
+            [
+                EncoderLayer(
+                    AttentionLayer(
+                        Attn(
+                            mask_flag=False,
+                            factor=factor,
+                            attention_dropout=dropout,
+                            output_attention=False,
+                        ),
+                        d_model,
+                        nhead,
+                        mix=False,
+                    ),
+                    d_model,
+                    d_ff=dim_feedforward,
+                    dropout=dropout,
+                    activation=activation,
+                )
+                for l in range(num_encoder_layers)
+            ],
+            [ConvLayer(d_model) for l in range(num_encoder_layers - 1)]
+            if distil
+            else None,
+            norm_layer=torch.nn.LayerNorm(d_model),
         )
 
-        # expand encoder attention mask
-        if encoder_hidden_states is not None and encoder_attention_mask is not None:
-            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
-            encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1])
-
-        hidden_states = inputs_embeds
-        hidden_states = self.layernorm_embedding(hidden_states)
-
-        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
-
-        # decoder layers
-        all_hidden_states = () if output_hidden_states else None
-        all_self_attns = () if output_attentions else None
-        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
-        next_decoder_cache = () if use_cache else None
-
-        # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
-        for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
-            if attn_mask is not None:
-                if attn_mask.size()[0] != (len(self.layers)):
-                    raise ValueError(
-                        f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
-                        f" {head_mask.size()[0]}."
-                    )
-
-        for idx, decoder_layer in enumerate(self.layers):
-            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
-            if output_hidden_states:
-                all_hidden_states += (hidden_states,)
-            dropout_probability = random.uniform(0, 1)
-            if self.training and (dropout_probability < self.layerdrop):
-                continue
-
-            past_key_value = past_key_values[idx] if past_key_values is not None else None
-
-            if self.gradient_checkpointing and self.training:
-
-                if use_cache:
-                    logger.warning(
-                        "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
-                    )
-                    use_cache = False
-
-                def create_custom_forward(module):
-                    def custom_forward(*inputs):
-                        # None for past_key_value
-                        return module(*inputs, output_attentions, use_cache)
-
-                    return custom_forward
-
-                layer_outputs = torch.utils.checkpoint.checkpoint(
-                    create_custom_forward(decoder_layer),
-                    hidden_states,
-                    attention_mask,
-                    encoder_hidden_states,
-                    encoder_attention_mask,
-                    head_mask[idx] if head_mask is not None else None,
-                    cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None,
-                    None,
-                )
-            else:
-
-                layer_outputs = decoder_layer(
-                    hidden_states,
-                    attention_mask=attention_mask,
-                    encoder_hidden_states=encoder_hidden_states,
-                    encoder_attention_mask=encoder_attention_mask,
-                    layer_head_mask=(head_mask[idx] if head_mask is not None else None),
-                    cross_attn_layer_head_mask=(
-                        cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None
+        # Masked Decoder
+        self.decoder = Decoder(
+            [
+                DecoderLayer(
+                    AttentionLayer(
+                        Attn(
+                            mask_flag=True,
+                            factor=factor,
+                            attention_dropout=dropout,
+                            output_attention=False,
+                        ),
+                        d_model,
+                        nhead,
+                        mix=True,
+                    ),
+                    AttentionLayer(
+                        FullAttention(
+                            mask_flag=False,
+                            factor=factor,
+                            attention_dropout=dropout,
+                            output_attention=False,
+                        ),
+                        d_model,
+                        nhead,
+                        mix=False,
                     ),
-                    past_key_value=past_key_value,
-                    output_attentions=output_attentions,
-                    use_cache=use_cache,
+                    d_model,
+                    d_ff=dim_feedforward,
+                    dropout=dropout,
+                    activation=activation,
                 )
-            hidden_states = layer_outputs[0]
-
-            if use_cache:
-                next_decoder_cache += (layer_outputs[3 if output_attentions else 1],)
-
-            if output_attentions:
-                all_self_attns += (layer_outputs[1],)
-
-                if encoder_hidden_states is not None:
-                    all_cross_attentions += (layer_outputs[2],)
-
-        # add hidden states from the last decoder layer
-        if output_hidden_states:
-            all_hidden_states += (hidden_states,)
-
-        next_cache = next_decoder_cache if use_cache else None
-        if not return_dict:
-            return tuple(
-                v
-                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions]
-                if v is not None
-            )
-        return BaseModelOutputWithPastAndCrossAttentions(
-            last_hidden_state=hidden_states,
-            past_key_values=next_cache,
-            hidden_states=all_hidden_states,
-            attentions=all_self_attns,
-            cross_attentions=all_cross_attentions,
+                for l in range(num_decoder_layers)
+            ],
+            norm_layer=torch.nn.LayerNorm(d_model),
         )
 
-
-@add_start_docstrings(
-    "The bare Informer Model outputting raw hidden-states without any specific head on top.",
-    INFORMER_START_DOCSTRING,
-)
-# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerModel with TimeSeriesTransformer->Informer,TIME_SERIES_TRANSFORMER->INFORMER
-class InformerModel(InformerPreTrainedModel):
-    def __init__(self, config: InformerConfig):
-        super().__init__(config)
-
-        if config.scaling:
-            self.scaler = MeanScaler(dim=1, keepdim=True)
-        else:
-            self.scaler = NOPScaler(dim=1, keepdim=True)
-
-        self.embedder = FeatureEmbedder(
-            cardinalities=config.cardinality,
-            embedding_dims=config.embedding_dimension,
+    @property
+    def _number_of_features(self) -> int:
+        return (
+            sum(self.embedding_dimension)
+            + self.num_feat_dynamic_real
+            + self.num_feat_static_real
+            + self.input_size  # the log(scale)
         )
 
-        # transformer encoder-decoder and mask initializer
-        self.encoder = InformerEncoder(config)
-        self.decoder = InformerDecoder(config)
-
-        # Initialize weights and apply final processing
-        self.post_init()
-
     @property
     def _past_length(self) -> int:
-        return self.config.context_length + max(self.config.lags_sequence)
+        return self.context_length + max(self.lags_seq)
 
     def get_lagged_subsequences(
         self, sequence: torch.Tensor, subsequences_length: int, shift: int = 0
     ) -> torch.Tensor:
         """
-        Returns lagged subsequences of a given sequence. Returns a tensor of shape (N, S, C, I),
-            where S = subsequences_length and I = len(indices), containing lagged subsequences. Specifically, lagged[i,
-            j, :, k] = sequence[i, -indices[k]-S+j, :].
-
-        Args:
-            sequence: Tensor
-                The sequence from which lagged subsequences should be extracted. Shape: (N, T, C).
-            subsequences_length : int
-                Length of the subsequences to be extracted.
-            shift: int
-                Shift the lags by this amount back.
+        Returns lagged subsequences of a given sequence.
+        Parameters
+        ----------
+        sequence : Tensor
+            the sequence from which lagged subsequences should be extracted.
+            Shape: (N, T, C).
+        subsequences_length : int
+            length of the subsequences to be extracted.
+        shift: int
+            shift the lags by this amount back.
+        Returns
+        --------
+        lagged : Tensor
+            a tensor of shape (N, S, C, I), where S = subsequences_length and
+            I = len(indices), containing lagged subsequences. Specifically,
+            lagged[i, j, :, k] = sequence[i, -indices[k]-S+j, :].
         """
         sequence_length = sequence.shape[1]
-        indices = [lag - shift for lag in self.config.lags_sequence]
+        indices = [lag - shift for lag in self.lags_seq]
 
-        try:
-            assert max(indices) + subsequences_length <= sequence_length, (
-                f"lags cannot go further than history length, found lag {max(indices)} "
-                f"while history length is only {sequence_length}"
-            )
-        except AssertionError as e:
-            e.args += (max(indices), sequence_length)
-            raise
+        assert max(indices) + subsequences_length <= sequence_length, (
+            f"lags cannot go further than history length, found lag {max(indices)} "
+            f"while history length is only {sequence_length}"
+        )
 
         lagged_values = []
         for lag_index in indices:
@@ -1486,470 +522,198 @@ def get_lagged_subsequences(
             lagged_values.append(sequence[:, begin_index:end_index, ...])
         return torch.stack(lagged_values, dim=-1)
 
+    def _check_shapes(
+        self,
+        prior_input: torch.Tensor,
+        inputs: torch.Tensor,
+        features: Optional[torch.Tensor],
+    ) -> None:
+        assert len(prior_input.shape) == len(inputs.shape)
+        assert (
+            len(prior_input.shape) == 2 and self.input_size == 1
+        ) or prior_input.shape[2] == self.input_size
+        assert (len(inputs.shape) == 2 and self.input_size == 1) or inputs.shape[
+            -1
+        ] == self.input_size
+        assert (
+            features is None or features.shape[2] == self._number_of_features
+        ), f"{features.shape[2]}, expected {self._number_of_features}"
+
     def create_network_inputs(
         self,
-        past_values: torch.Tensor,
-        past_time_features: torch.Tensor,
-        static_categorical_features: torch.Tensor,
-        static_real_features: torch.Tensor,
-        past_observed_mask: Optional[torch.Tensor] = None,
-        future_values: Optional[torch.Tensor] = None,
-        future_time_features: Optional[torch.Tensor] = None,
+        feat_static_cat: torch.Tensor,
+        feat_static_real: torch.Tensor,
+        past_time_feat: torch.Tensor,
+        past_target: torch.Tensor,
+        past_observed_values: torch.Tensor,
+        future_time_feat: Optional[torch.Tensor] = None,
+        future_target: Optional[torch.Tensor] = None,
     ):
         # time feature
         time_feat = (
             torch.cat(
                 (
-                    past_time_features[:, self._past_length - self.config.context_length :, ...],
-                    future_time_features,
+                    past_time_feat[:, self._past_length - self.context_length :, ...],
+                    future_time_feat,
                 ),
                 dim=1,
             )
-            if future_values is not None
-            else past_time_features[:, self._past_length - self.config.context_length :, ...]
+            if future_target is not None
+            else past_time_feat[:, self._past_length - self.context_length :, ...]
         )
 
         # target
-        if past_observed_mask is None:
-            past_observed_mask = torch.ones_like(past_values)
-
-        context = past_values[:, -self.config.context_length :]
-        observed_context = past_observed_mask[:, -self.config.context_length :]
+        context = past_target[:, -self.context_length :]
+        observed_context = past_observed_values[:, -self.context_length :]
         _, scale = self.scaler(context, observed_context)
 
         inputs = (
-            torch.cat((past_values, future_values), dim=1) / scale
-            if future_values is not None
-            else past_values / scale
+            torch.cat((past_target, future_target), dim=1) / scale
+            if future_target is not None
+            else past_target / scale
         )
 
         inputs_length = (
-            self._past_length + self.config.prediction_length if future_values is not None else self._past_length
+            self._past_length + self.prediction_length
+            if future_target is not None
+            else self._past_length
         )
-        try:
-            assert inputs.shape[1] == inputs_length, (
-                f"input length {inputs.shape[1]} and dynamic feature lengths {inputs_length} does not match",
-            )
-        except AssertionError as e:
-            e.args += (inputs.shape[1], inputs_length)
-            raise
+        assert inputs.shape[1] == inputs_length
 
         subsequences_length = (
-            self.config.context_length + self.config.prediction_length
-            if future_values is not None
-            else self.config.context_length
+            self.context_length + self.prediction_length
+            if future_target is not None
+            else self.context_length
         )
 
         # embeddings
-        embedded_cat = self.embedder(static_categorical_features)
-        # static features
-        log_scale = scale.log() if self.config.input_size == 1 else scale.squeeze(1).log()
-        static_feat = torch.cat((embedded_cat, static_real_features, log_scale), dim=1)
-        expanded_static_feat = static_feat.unsqueeze(1).expand(-1, time_feat.shape[1], -1)
-
-        # all features
-        features = torch.cat((expanded_static_feat, time_feat), dim=-1)
-
-        lagged_sequence = self.get_lagged_subsequences(sequence=inputs, subsequences_length=subsequences_length)
-
-        lags_shape = lagged_sequence.shape
-        reshaped_lagged_sequence = lagged_sequence.reshape(lags_shape[0], lags_shape[1], -1)
-
-        transformer_inputs = torch.cat((reshaped_lagged_sequence, features), dim=-1)
-
-        return transformer_inputs, scale, static_feat
-
-    def enc_dec_outputs(self, transformer_inputs):
-        enc_input = transformer_inputs[:, : self.config.context_length, ...]
-        dec_input = transformer_inputs[:, self.config.context_length :, ...]
-
-        encoder_outputs = self.encoder(inputs_embeds=enc_input)
-        decoder_outputs = self.decoder(
-            inputs_embeds=dec_input, encoder_hidden_states=encoder_outputs.last_hidden_state
+        embedded_cat = self.embedder(feat_static_cat)
+        log_scale = scale.log() if self.input_size == 1 else scale.squeeze(1).log()
+        static_feat = torch.cat(
+            (embedded_cat, feat_static_real, log_scale),
+            dim=1,
         )
-        return encoder_outputs, decoder_outputs
-
-    def get_encoder(self):
-        return self.encoder
-
-    def get_decoder(self):
-        return self.decoder
-
-    @add_start_docstrings_to_model_forward(INFORMER_INPUTS_DOCSTRING)
-    @replace_return_docstrings(output_type=Seq2SeqTimeSeriesModelOutput, config_class=_CONFIG_FOR_DOC)
-    def forward(
-        self,
-        past_values: torch.Tensor,
-        past_time_features: torch.Tensor,
-        past_observed_mask: torch.Tensor,
-        static_categorical_features: torch.Tensor,
-        static_real_features: torch.Tensor,
-        future_values: Optional[torch.Tensor] = None,
-        future_time_features: Optional[torch.Tensor] = None,
-        decoder_attention_mask: Optional[torch.LongTensor] = None,
-        head_mask: Optional[torch.Tensor] = None,
-        decoder_head_mask: Optional[torch.Tensor] = None,
-        cross_attn_head_mask: Optional[torch.Tensor] = None,
-        encoder_outputs: Optional[List[torch.FloatTensor]] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
-        output_hidden_states: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        use_cache: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Seq2SeqTimeSeriesModelOutput, Tuple]:
-        r"""
-        Returns:
-
-        Examples:
-
-        ```python
-        >>> from huggingface_hub import hf_hub_download
-        >>> import torch
-        >>> from transformers import InformerModel
-
-        >>> file = hf_hub_download(
-        ...     repo_id="kashif/tourism-monthly-batch", filename="train-batch.pt", repo_type="dataset"
-        ... )
-        >>> batch = torch.load(file)
-
-        >>> model = InformerModel.from_pretrained("huggingface/time-series-transformer-tourism-monthly")
-
-        >>> # during training, one provides both past and future values
-        >>> # as well as possible additional features
-        >>> outputs = model(
-        ...     past_values=batch["past_values"],
-        ...     past_time_features=batch["past_time_features"],
-        ...     past_observed_mask=batch["past_observed_mask"],
-        ...     static_categorical_features=batch["static_categorical_features"],
-        ...     static_real_features=batch["static_real_features"],
-        ...     future_values=batch["future_values"],
-        ...     future_time_features=batch["future_time_features"],
-        ... )
-
-        >>> last_hidden_state = outputs.last_hidden_state
-        ```"""
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        use_cache = use_cache if use_cache is not None else self.config.use_cache
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        transformer_inputs, scale, static_feat = self.create_network_inputs(
-            past_values=past_values,
-            past_time_features=past_time_features,
-            past_observed_mask=past_observed_mask,
-            static_categorical_features=static_categorical_features,
-            static_real_features=static_real_features,
-            future_values=future_values,
-            future_time_features=future_time_features,
+        expanded_static_feat = static_feat.unsqueeze(1).expand(
+            -1, time_feat.shape[1], -1
         )
 
-        if encoder_outputs is None:
-            enc_input = transformer_inputs[:, : self.config.context_length, ...]
-            encoder_outputs = self.encoder(
-                inputs_embeds=enc_input,
-                head_mask=head_mask,
-                output_attentions=output_attentions,
-                output_hidden_states=output_hidden_states,
-                return_dict=return_dict,
-            )
-        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
-        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
-            encoder_outputs = BaseModelOutput(
-                last_hidden_state=encoder_outputs[0],
-                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
-                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
-            )
+        features = torch.cat((expanded_static_feat, time_feat), dim=-1)
 
-        dec_input = transformer_inputs[:, self.config.context_length :, ...]
-        decoder_outputs = self.decoder(
-            inputs_embeds=dec_input,
-            attention_mask=decoder_attention_mask,
-            encoder_hidden_states=encoder_outputs[0],
-            head_mask=decoder_head_mask,
-            cross_attn_head_mask=cross_attn_head_mask,
-            past_key_values=past_key_values,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
+        # self._check_shapes(prior_input, inputs, features)
 
-        if not return_dict:
-            return decoder_outputs + encoder_outputs + (scale, static_feat)
-
-        return Seq2SeqTimeSeriesModelOutput(
-            last_hidden_state=decoder_outputs.last_hidden_state,
-            past_key_values=decoder_outputs.past_key_values,
-            decoder_hidden_states=decoder_outputs.hidden_states,
-            decoder_attentions=decoder_outputs.attentions,
-            cross_attentions=decoder_outputs.cross_attentions,
-            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
-            encoder_hidden_states=encoder_outputs.hidden_states,
-            encoder_attentions=encoder_outputs.attentions,
-            scale=scale,
-            static_features=static_feat,
+        # sequence = torch.cat((prior_input, inputs), dim=1)
+        lagged_sequence = self.get_lagged_subsequences(
+            sequence=inputs,
+            subsequences_length=subsequences_length,
         )
 
+        lags_shape = lagged_sequence.shape
+        reshaped_lagged_sequence = lagged_sequence.reshape(
+            lags_shape[0], lags_shape[1], -1
+        )
 
-@add_start_docstrings(
-    "The Informer Model with a distribution head on top for time-series forecasting.",
-    INFORMER_START_DOCSTRING,
-)
-# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerForPrediction with TimeSeriesTransformer->Informer,TIME_SERIES_TRANSFORMER->INFORMER
-class InformerForPrediction(InformerPreTrainedModel):
-    def __init__(self, config: InformerConfig):
-        super().__init__(config)
-        self.model = InformerModel(config)
-        if config.distribution_output == "student_t":
-            self.distribution_output = StudentTOutput(dim=config.input_size)
-        elif config.distribution_output == "normal":
-            self.distribution_output = NormalOutput(dim=config.input_size)
-        elif config.distribution_output == "negative_binomial":
-            self.distribution_output = NegativeBinomialOutput(dim=config.input_size)
-        else:
-            raise ValueError(f"Unknown distribution output {config.distribution_output}")
-
-        self.parameter_projection = self.distribution_output.get_parameter_projection(self.model.config.d_model)
-        self.target_shape = self.distribution_output.event_shape
-
-        if config.loss == "nll":
-            self.loss = NegativeLogLikelihood()
-        else:
-            raise ValueError(f"Unknown loss function {config.loss}")
+        transformer_inputs = torch.cat((reshaped_lagged_sequence, features), dim=-1)
 
-        # Initialize weights of distribution_output and apply final processing
-        self.post_init()
+        return transformer_inputs, scale, static_feat
 
-    def output_params(self, dec_output):
-        return self.parameter_projection(dec_output)
+    def output_params(self, transformer_inputs):
+        enc_input = transformer_inputs[:, : self.context_length, ...]
+        dec_input = transformer_inputs[:, self.context_length :, ...]
 
-    def get_encoder(self):
-        return self.model.get_encoder()
+        enc_out, _ = self.encoder(enc_input)
+        dec_output = self.decoder(dec_input, enc_out)
 
-    def get_decoder(self):
-        return self.model.get_decoder()
+        return self.param_proj(dec_output)
 
     @torch.jit.ignore
-    def output_distribution(self, params, scale=None, trailing_n=None) -> torch.distributions.Distribution:
+    def output_distribution(
+        self, params, scale=None, trailing_n=None
+    ) -> torch.distributions.Distribution:
         sliced_params = params
         if trailing_n is not None:
             sliced_params = [p[:, -trailing_n:] for p in params]
-        return self.distribution_output.distribution(sliced_params, scale=scale)
+        return self.distr_output.distribution(sliced_params, scale=scale)
 
-    @add_start_docstrings_to_model_forward(INFORMER_INPUTS_DOCSTRING)
-    @replace_return_docstrings(output_type=Seq2SeqTimeSeriesModelOutput, config_class=_CONFIG_FOR_DOC)
+    # for prediction
     def forward(
         self,
-        past_values: torch.Tensor,
-        past_time_features: torch.Tensor,
-        past_observed_mask: torch.Tensor,
-        static_categorical_features: torch.Tensor,
-        static_real_features: torch.Tensor,
-        future_values: Optional[torch.Tensor] = None,
-        future_time_features: Optional[torch.Tensor] = None,
-        future_observed_mask: Optional[torch.Tensor] = None,
-        decoder_attention_mask: Optional[torch.LongTensor] = None,
-        head_mask: Optional[torch.Tensor] = None,
-        decoder_head_mask: Optional[torch.Tensor] = None,
-        cross_attn_head_mask: Optional[torch.Tensor] = None,
-        encoder_outputs: Optional[List[torch.FloatTensor]] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
-        output_hidden_states: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        use_cache: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Seq2SeqTimeSeriesModelOutput, Tuple]:
-        r"""
-        Returns:
-
-        future_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Boolean mask to indicate which `future_values` were observed and which were missing. Mask values selected
-            in `[0, 1]`:
-
-            - 1 for values that are **observed**,
-            - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
-
-            This mask is used to filter out missing values for the final loss calculation.
-
-        Examples:
-
-        ```python
-        >>> from huggingface_hub import hf_hub_download
-        >>> import torch
-        >>> from transformers import InformerForPrediction
-
-        >>> file = hf_hub_download(
-        ...     repo_id="kashif/tourism-monthly-batch", filename="train-batch.pt", repo_type="dataset"
-        ... )
-        >>> batch = torch.load(file)
-
-        >>> model = InformerForPrediction.from_pretrained(
-        ...     "huggingface/time-series-transformer-tourism-monthly"
-        ... )
-
-        >>> # during training, one provides both past and future values
-        >>> # as well as possible additional features
-        >>> outputs = model(
-        ...     past_values=batch["past_values"],
-        ...     past_time_features=batch["past_time_features"],
-        ...     past_observed_mask=batch["past_observed_mask"],
-        ...     static_categorical_features=batch["static_categorical_features"],
-        ...     static_real_features=batch["static_real_features"],
-        ...     future_values=batch["future_values"],
-        ...     future_time_features=batch["future_time_features"],
-        ... )
-
-        >>> loss = outputs.loss
-        >>> loss.backward()
-
-        >>> # during inference, one only provides past values
-        >>> # as well as possible additional features
-        >>> # the model autoregressively generates future values
-        >>> outputs = model.generate(
-        ...     past_values=batch["past_values"],
-        ...     past_time_features=batch["past_time_features"],
-        ...     past_observed_mask=batch["past_observed_mask"],
-        ...     static_categorical_features=batch["static_categorical_features"],
-        ...     static_real_features=batch["static_real_features"],
-        ...     future_time_features=batch["future_time_features"],
-        ... )
-
-        >>> mean_prediction = outputs.sequences.mean(dim=1)
-        ```"""
-
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-        if future_values is not None:
-            use_cache = False
-
-        outputs = self.model(
-            past_values=past_values,
-            past_time_features=past_time_features,
-            past_observed_mask=past_observed_mask,
-            static_categorical_features=static_categorical_features,
-            static_real_features=static_real_features,
-            future_values=future_values,
-            future_time_features=future_time_features,
-            decoder_attention_mask=decoder_attention_mask,
-            head_mask=head_mask,
-            decoder_head_mask=decoder_head_mask,
-            cross_attn_head_mask=cross_attn_head_mask,
-            encoder_outputs=encoder_outputs,
-            past_key_values=past_key_values,
-            output_hidden_states=output_hidden_states,
-            output_attentions=output_attentions,
-            use_cache=use_cache,
-            return_dict=return_dict,
-        )
+        feat_static_cat: torch.Tensor,
+        feat_static_real: torch.Tensor,
+        past_time_feat: torch.Tensor,
+        past_target: torch.Tensor,
+        past_observed_values: torch.Tensor,
+        future_time_feat: torch.Tensor,
+        num_parallel_samples: Optional[int] = None,
+    ) -> torch.Tensor:
 
-        prediction_loss = None
-        params = None
-        if future_values is not None:
-            params = self.output_params(outputs[0])  # outputs.last_hidden_state
-            distribution = self.output_distribution(params, outputs[-2])  # outputs.scale
-
-            loss = self.loss(distribution, future_values)
-
-            if future_observed_mask is None:
-                future_observed_mask = torch.ones_like(future_values)
-
-            if len(self.target_shape) == 0:
-                loss_weights = future_observed_mask
-            else:
-                loss_weights, _ = future_observed_mask.min(dim=-1, keepdim=False)
-
-            prediction_loss = weighted_average(loss, weights=loss_weights)
-
-        if not return_dict:
-            outputs = ((params,) + outputs[1:]) if params is not None else outputs[1:]
-            return ((prediction_loss,) + outputs) if prediction_loss is not None else outputs
-
-        return Seq2SeqTimeSeriesPredictionOutput(
-            loss=prediction_loss,
-            params=params,
-            past_key_values=outputs.past_key_values,
-            decoder_hidden_states=outputs.decoder_hidden_states,
-            decoder_attentions=outputs.decoder_attentions,
-            cross_attentions=outputs.cross_attentions,
-            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
-            encoder_hidden_states=outputs.encoder_hidden_states,
-            encoder_attentions=outputs.encoder_attentions,
-            scale=outputs.scale,
-            static_features=outputs.static_features,
-        )
+        if num_parallel_samples is None:
+            num_parallel_samples = self.num_parallel_samples
 
-    @torch.no_grad()
-    def generate(
-        self,
-        static_categorical_features: torch.Tensor,
-        static_real_features: torch.Tensor,
-        past_time_features: torch.Tensor,
-        past_values: torch.Tensor,
-        past_observed_mask: torch.Tensor,
-        future_time_features: Optional[torch.Tensor],
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-    ) -> torch.Tensor:
-        outputs = self(
-            static_categorical_features=static_categorical_features,
-            static_real_features=static_real_features,
-            past_time_features=past_time_features,
-            past_values=past_values,
-            past_observed_mask=past_observed_mask,
-            future_time_features=future_time_features,
-            future_values=None,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=True,
-            use_cache=True,
+        encoder_inputs, scale, static_feat = self.create_network_inputs(
+            feat_static_cat,
+            feat_static_real,
+            past_time_feat,
+            past_target,
+            past_observed_values,
         )
 
-        decoder = self.model.get_decoder()
-        enc_last_hidden = outputs.encoder_last_hidden_state
-        scale = outputs.scale
-        static_feat = outputs.static_features
+        enc_out, _ = self.encoder(encoder_inputs)
 
-        num_parallel_samples = self.config.num_parallel_samples
-        repeated_scale = scale.repeat_interleave(repeats=num_parallel_samples, dim=0)
+        repeated_scale = scale.repeat_interleave(
+            repeats=self.num_parallel_samples, dim=0
+        )
 
-        repeated_past_values = past_values.repeat_interleave(repeats=num_parallel_samples, dim=0) / repeated_scale
+        repeated_past_target = (
+            past_target.repeat_interleave(repeats=self.num_parallel_samples, dim=0)
+            / repeated_scale
+        )
 
-        expanded_static_feat = static_feat.unsqueeze(1).expand(-1, future_time_features.shape[1], -1)
-        features = torch.cat((expanded_static_feat, future_time_features), dim=-1)
-        repeated_features = features.repeat_interleave(repeats=num_parallel_samples, dim=0)
+        expanded_static_feat = static_feat.unsqueeze(1).expand(
+            -1, future_time_feat.shape[1], -1
+        )
+        features = torch.cat((expanded_static_feat, future_time_feat), dim=-1)
+        repeated_features = features.repeat_interleave(
+            repeats=self.num_parallel_samples, dim=0
+        )
 
-        repeated_enc_last_hidden = enc_last_hidden.repeat_interleave(repeats=num_parallel_samples, dim=0)
+        repeated_enc_out = enc_out.repeat_interleave(
+            repeats=self.num_parallel_samples, dim=0
+        )
 
         future_samples = []
 
         # greedy decoding
-        for k in range(self.config.prediction_length):
-            lagged_sequence = self.model.get_lagged_subsequences(
-                sequence=repeated_past_values,
+        for k in range(self.prediction_length):
+            # self._check_shapes(repeated_past_target, next_sample, next_features)
+            # sequence = torch.cat((repeated_past_target, next_sample), dim=1)
+
+            lagged_sequence = self.get_lagged_subsequences(
+                sequence=repeated_past_target,
                 subsequences_length=1 + k,
                 shift=1,
             )
 
             lags_shape = lagged_sequence.shape
-            reshaped_lagged_sequence = lagged_sequence.reshape(lags_shape[0], lags_shape[1], -1)
+            reshaped_lagged_sequence = lagged_sequence.reshape(
+                lags_shape[0], lags_shape[1], -1
+            )
 
-            decoder_input = torch.cat((reshaped_lagged_sequence, repeated_features[:, : k + 1]), dim=-1)
+            decoder_input = torch.cat(
+                (reshaped_lagged_sequence, repeated_features[:, : k + 1]), dim=-1
+            )
 
-            dec_output = decoder(inputs_embeds=decoder_input, encoder_hidden_states=repeated_enc_last_hidden)
-            dec_last_hidden = dec_output.last_hidden_state
+            output = self.decoder(decoder_input, repeated_enc_out)
 
-            params = self.parameter_projection(dec_last_hidden[:, -1:])
+            params = self.param_proj(output[:, -1:])
             distr = self.output_distribution(params, scale=repeated_scale)
             next_sample = distr.sample()
 
-            repeated_past_values = torch.cat((repeated_past_values, next_sample / repeated_scale), dim=1)
+            repeated_past_target = torch.cat(
+                (repeated_past_target, next_sample / repeated_scale), dim=1
+            )
             future_samples.append(next_sample)
 
         concat_future_samples = torch.cat(future_samples, dim=1)
-
-        return SampleTimeSeriesPredictionOutput(
-            sequences=concat_future_samples.reshape(
-                (-1, num_parallel_samples, self.config.prediction_length) + self.target_shape,
-            )
+        return concat_future_samples.reshape(
+            (-1, self.num_parallel_samples, self.prediction_length) + self.target_shape,
         )
diff --git a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py
index 258230654b0a..68345408398e 100644
--- a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py
+++ b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py
@@ -137,7 +137,7 @@ def __init__(
         context_length: Optional[int] = None,
         distribution_output: str = "student_t",
         loss: str = "nll",
-        lags_sequence: List[int] = [1, 2, 3, 4, 5, 6, 7],
+        lags_sequence: List[int] = [1, 2, 3, 4, 5, 6, 7], # Eli: Remove the default here
         scaling: bool = True,
         num_dynamic_real_features: int = 0,
         num_static_categorical_features: int = 0,
@@ -151,7 +151,7 @@ def __init__(
         decoder_attention_heads: int = 2,
         encoder_layers: int = 2,
         decoder_layers: int = 2,
-        is_encoder_decoder: bool = True, # Eli: remove from signature?
+        is_encoder_decoder: bool = True,
         activation_function: str = "gelu",
         dropout: float = 0.1,
         encoder_layerdrop: float = 0.1,

From 9d4ae9908d950d2a630a866c3289649458ad63e7 Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Mon, 16 Jan 2023 13:12:44 +0000
Subject: [PATCH 008/164] done InformerConfig, but need to change the names

---
 .../models/informer/configuration_informer.py | 53 +++++++++++++------
 1 file changed, 37 insertions(+), 16 deletions(-)

diff --git a/src/transformers/models/informer/configuration_informer.py b/src/transformers/models/informer/configuration_informer.py
index 51ac98700310..913acf7f1f67 100644
--- a/src/transformers/models/informer/configuration_informer.py
+++ b/src/transformers/models/informer/configuration_informer.py
@@ -53,10 +53,11 @@ def __init__(
         input_size: int = 1,
         embedding_dimension: Optional[List[int]] = None,
         distr_output: str = "student_t",
-        lags_seq: Optional[List[int]] = None, # used to be freq.
+        lags_seq: Optional[List[int]] = None,  # used to be freq.
         scaling: bool = True,
         num_parallel_samples: int = 100,
         is_encoder_decoder: bool = True,
+        **kwargs
     ):
         # time series specific configuration
         self.prediction_length = prediction_length
@@ -67,7 +68,7 @@ def __init__(
         # self.target_shape = distr_output.event_shape  # Eli: I think can be removed
         # self.num_time_features = num_time_features # Eli: From vanilla ts transformer
         self.lags_seq = lags_seq
-        # self.scaling = scaling # Eli: From vanilla ts transformer
+        self.scaling = scaling
         self.num_feat_dynamic_real = num_feat_dynamic_real
         self.num_feat_static_cat = num_feat_static_cat
         self.num_feat_static_real = num_feat_static_real
@@ -91,22 +92,42 @@ def __init__(
             self.embedding_dimension = embedding_dimension
         else:
             self.embedding_dimension = [min(50, (cat + 1) // 2) for cat in self.cardinality]
-            
-        self.num_parallel_samples = num_parallel_samples
 
-        # self.history_length = context_length + max(self.lags_seq)
-        self.embedder = FeatureEmbedder(
-            cardinalities=cardinality,
-            embedding_dims=self.embedding_dimension,
-        )
-        if scaling:
-            self.scaler = MeanScaler(dim=1, keepdim=True)
-        else:
-            self.scaler = NOPScaler(dim=1, keepdim=True)
+        self.num_parallel_samples = num_parallel_samples
+        # self.history_length = context_length + max(self.lags_seq) # Eli: I think can be removed
 
-        # total feature size
-        d_model = self.input_size * len(self.lags_seq) + self._number_of_features
+        # Transformer architecture configuration
+        self.d_model = self.input_size * len(self.lags_seq) + self._number_of_features
+        self.nhead = nhead
+        self.num_encoder_layers = num_encoder_layers # encoder_layers
+        self.num_decoder_layers = num_decoder_layers # decoder_layers
+        self.dim_feedforward = dim_feedforward
+        self.activation = activation  # activation_function
+        self.dropout = dropout
+        self.attn = attn,
+        self.factor = factor
+        self.distil = distil
 
-        self.param_proj = distr_output.get_args_proj(d_model)
+        # self.param_proj = distr_output.get_args_proj(d_model)
 
         super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
+
+    @property
+    def _number_of_features(self) -> int:
+        return (
+            sum(self.embedding_dimension)
+            + self.num_feat_dynamic_real
+            + self.num_feat_static_real
+            + self.input_size  # the log(scale)
+        )
+
+
+    # @property
+    # def _number_of_features(self) -> int:
+    #     return (
+    #         sum(self.embedding_dimension)
+    #         + self.num_dynamic_real_features
+    #         + self.num_time_features
+    #         + max(1, self.num_static_real_features)  # there is at least one dummy static real feature
+    #         + self.input_size  # the log(scale)
+    #     )

From 1223ba5a5fac99e36b3f1ba5a70b35b01d63f846 Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Mon, 16 Jan 2023 13:42:55 +0000
Subject: [PATCH 009/164] Done informer model init. working on enc-dec

---
 .../models/informer/modeling_informer.py      | 367 ++++++++++++------
 1 file changed, 245 insertions(+), 122 deletions(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 9616af397751..395975549d53 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -1,3 +1,42 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch Informer model."""
+
+import random
+from dataclasses import dataclass
+from typing import Callable, Dict, List, Optional, Tuple, Union
+
+import torch
+from torch import nn
+from torch.distributions import (
+    AffineTransform,
+    Distribution,
+    Independent,
+    NegativeBinomial,
+    Normal,
+    StudentT,
+    TransformedDistribution,
+)
+
+from ...activations import ACT2FN
+from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions, ModelOutput
+from ...modeling_utils import PreTrainedModel
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
+from .configuration_informer import InformerConfig
+
 from math import sqrt
 from typing import List, Optional
 
@@ -5,9 +44,109 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from gluonts.torch.distributions import DistributionOutput, StudentTOutput
-from gluonts.torch.modules.feature import FeatureEmbedder
-from gluonts.torch.modules.scaler import MeanScaler, NOPScaler
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "InformerConfig"
+
+
+class FeatureEmbedder(nn.Module):
+    def __init__(self, cardinalities: List[int], embedding_dims: List[int]) -> None:
+        super().__init__()
+
+        self.num_features = len(cardinalities)
+        self.embedders = nn.ModuleList([nn.Embedding(c, d) for c, d in zip(cardinalities, embedding_dims)])
+
+    def forward(self, features: torch.Tensor) -> torch.Tensor:
+        if self.num_features > 1:
+            # we slice the last dimension, giving an array of length
+            # self.num_features with shape (N,T) or (N)
+            cat_feature_slices = torch.chunk(features, self.num_features, dim=-1)
+        else:
+            cat_feature_slices = [features]
+
+        return torch.cat(
+            [
+                embed(cat_feature_slice.squeeze(-1))
+                for embed, cat_feature_slice in zip(self.embedders, cat_feature_slices)
+            ],
+            dim=-1,
+        )
+
+
+class MeanScaler(nn.Module):
+    """
+    Computes a scaling factor as the weighted average absolute value along dimension `dim`, and scales the data
+    accordingly.
+
+    Args:
+        dim (`int`):
+            Dimension along which to compute the scale.
+        keepdim (`bool`, *optional*, defaults to `False`):
+            Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it.
+        minimum_scale (`float`, *optional*, defaults to 1e-10):
+            Default scale that is used for elements that are constantly zero along dimension `dim`.
+    """
+
+    def __init__(self, dim: int, keepdim: bool = False, minimum_scale: float = 1e-10):
+        super().__init__()
+        if not dim > 0:
+            raise ValueError("Cannot compute scale along dim = 0 (batch dimension), please provide dim > 0")
+        self.dim = dim
+        self.keepdim = keepdim
+        self.register_buffer("minimum_scale", torch.tensor(minimum_scale))
+
+    def forward(self, data: torch.Tensor, weights: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        # these will have shape (N, C)
+        total_weight = weights.sum(dim=self.dim)
+        weighted_sum = (data.abs() * weights).sum(dim=self.dim)
+
+        # first compute a global scale per-dimension
+        total_observed = total_weight.sum(dim=0)
+        denominator = torch.max(total_observed, torch.ones_like(total_observed))
+        default_scale = weighted_sum.sum(dim=0) / denominator
+
+        # then compute a per-item, per-dimension scale
+        denominator = torch.max(total_weight, torch.ones_like(total_weight))
+        scale = weighted_sum / denominator
+
+        # use per-batch scale when no element is observed
+        # or when the sequence contains only zeros
+        scale = (
+            torch.max(
+                self.minimum_scale,
+                torch.where(
+                    weighted_sum > torch.zeros_like(weighted_sum),
+                    scale,
+                    default_scale * torch.ones_like(total_weight),
+                ),
+            )
+            .detach()
+            .unsqueeze(dim=self.dim)
+        )
+
+        return data / scale, scale if self.keepdim else scale.squeeze(dim=self.dim)
+
+
+class NOPScaler(nn.Module):
+    """
+    Assigns a scaling factor equal to 1 along dimension `dim`, and therefore applies no scaling to the input data.
+
+    Args:
+        dim (`int`):
+            Dimension along which to compute the scale.
+        keepdim (`bool`, *optional*, defaults to `False`):
+            Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it.
+    """
+
+    def __init__(self, dim: int, keepdim: bool = False):
+        super().__init__()
+        self.dim = dim
+        self.keepdim = keepdim
+
+    def forward(self, data: torch.Tensor, observed_indicator: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        scale = torch.ones_like(data).mean(dim=self.dim, keepdim=self.keepdim)
+        return data, scale
 
 
 class TriangularCausalMask:
@@ -263,9 +402,9 @@ def forward(self, x, attn_mask=None):
         return self.norm2(x + y), attn
 
 
-class Encoder(nn.Module):
+class InformerEncoder(nn.Module):
     def __init__(self, attn_layers, conv_layers=None, norm_layer=None):
-        super(Encoder, self).__init__()
+        super(InformerEncoder, self).__init__()
         self.attn_layers = nn.ModuleList(attn_layers)
         self.conv_layers = (
             nn.ModuleList(conv_layers) if conv_layers is not None else None
@@ -330,7 +469,7 @@ def forward(self, x, cross, x_mask=None, cross_mask=None):
         return self.norm3(x + y)
 
 
-class Decoder(nn.Module):
+class InformerDecoder(nn.Module):
     def __init__(self, layers, norm_layer=None):
         super(Decoder, self).__init__()
         self.layers = nn.ModuleList(layers)
@@ -346,132 +485,116 @@ def forward(self, x, cross, x_mask=None, cross_mask=None):
         return x
 
 
-class InformerModel(nn.Module):
-    def __init__(  # add loss param
-        self,
-        freq: str, # frequency
-        context_length: int,
-        prediction_length: int,
-        num_feat_dynamic_real: int,  # num_dynamic_real_features
-        num_feat_static_real: int,  # num_static_real_features
-        num_feat_static_cat: int,  # num_static_categorical_features
-        cardinality: List[int],
-        # Informer arguments
-        nhead: int,
-        num_encoder_layers: int, # encoder_layers
-        num_decoder_layers: int, # decoder_layers
-        dim_feedforward: int,
-        activation: str = "gelu", # activation_function
-        dropout: float = 0.1,
-        attn: str = "prob",
-        factor: int = 5,
-        distil: bool = True,
-        # univariate input
-        input_size: int = 1,
-        embedding_dimension: Optional[List[int]] = None,
-        distr_output: DistributionOutput = StudentTOutput(),
-        lags_seq: Optional[List[int]] = None,
-        scaling: bool = True,
-        num_parallel_samples: int = 100,
-    ) -> None:
-        super().__init__()
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerPreTrainedModel with TimeSeriesTransformer->Informer
+class InformerPreTrainedModel(PreTrainedModel):
+    config_class = InformerConfig
+    base_model_prefix = "model"
+    main_input_name = "past_values"
+    supports_gradient_checkpointing = True
 
-        self.input_size = input_size
+    def _init_weights(self, module):
+        std = self.config.init_std
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
 
-        self.target_shape = distr_output.event_shape
-        self.num_feat_dynamic_real = num_feat_dynamic_real
-        self.num_feat_static_cat = num_feat_static_cat
-        self.num_feat_static_real = num_feat_static_real
-        self.embedding_dimension = (
-            embedding_dimension
-            if embedding_dimension is not None or cardinality is None
-            else [min(50, (cat + 1) // 2) for cat in cardinality]
-        )
-        self.lags_seq = lags_seq or get_lags_for_frequency(freq_str=freq)
-        self.num_parallel_samples = num_parallel_samples
-        self.history_length = context_length + max(self.lags_seq)
-        self.embedder = FeatureEmbedder(
-            cardinalities=cardinality,
-            embedding_dims=self.embedding_dimension,
-        )
-        if scaling:
+    def _set_gradient_checkpointing(self, module, value=False):
+        if isinstance(module, (InformerDecoder, InformerEncoder)):
+            module.gradient_checkpointing = value
+
+
+class InformerModel(InformerPreTrainedModel):
+    def __init__(self, config: InformerConfig):
+        super().__init__(config)
+
+        if config.scaling:
             self.scaler = MeanScaler(dim=1, keepdim=True)
         else:
             self.scaler = NOPScaler(dim=1, keepdim=True)
 
-        # total feature size
-        d_model = self.input_size * len(self.lags_seq) + self._number_of_features
+        self.embedder = FeatureEmbedder(
+            cardinalities=config.cardinality,
+            embedding_dims=config.embedding_dimension,
+        )
+
+        # Informer encoder-decoder and mask initializer
+        self.encoder = InformerEncoder(config)
+        self.decoder = InformerDecoder(config)
 
-        self.context_length = context_length
-        self.prediction_length = prediction_length
-        self.distr_output = distr_output
-        self.param_proj = distr_output.get_args_proj(d_model)
+        # Initialize weights and apply final processing
+        self.post_init()
 
         # Informer enc-decoder
-        Attn = ProbAttention if attn == "prob" else FullAttention
-        # Encoder
-        self.encoder = Encoder(
-            [
-                EncoderLayer(
-                    AttentionLayer(
-                        Attn(
-                            mask_flag=False,
-                            factor=factor,
-                            attention_dropout=dropout,
-                            output_attention=False,
-                        ),
-                        d_model,
-                        nhead,
-                        mix=False,
-                    ),
-                    d_model,
-                    d_ff=dim_feedforward,
-                    dropout=dropout,
-                    activation=activation,
-                )
-                for l in range(num_encoder_layers)
-            ],
-            [ConvLayer(d_model) for l in range(num_encoder_layers - 1)]
-            if distil
-            else None,
-            norm_layer=torch.nn.LayerNorm(d_model),
-        )
+        # Attn = ProbAttention if config.attn == "prob" else FullAttention
 
-        # Masked Decoder
-        self.decoder = Decoder(
-            [
-                DecoderLayer(
-                    AttentionLayer(
-                        Attn(
-                            mask_flag=True,
-                            factor=factor,
-                            attention_dropout=dropout,
-                            output_attention=False,
-                        ),
-                        d_model,
-                        nhead,
-                        mix=True,
-                    ),
-                    AttentionLayer(
-                        FullAttention(
-                            mask_flag=False,
-                            factor=factor,
-                            attention_dropout=dropout,
-                            output_attention=False,
-                        ),
-                        d_model,
-                        nhead,
-                        mix=False,
-                    ),
-                    d_model,
-                    d_ff=dim_feedforward,
-                    dropout=dropout,
-                    activation=activation,
-                )
-                for l in range(num_decoder_layers)
-            ],
-            norm_layer=torch.nn.LayerNorm(d_model),
-        )
+        # Encoder
+        # self.encoder = Encoder(
+        #     [
+        #         EncoderLayer(
+        #             AttentionLayer(
+        #                 Attn(
+        #                     mask_flag=False,
+        #                     factor=factor,
+        #                     attention_dropout=dropout,
+        #                     output_attention=False,
+        #                 ),
+        #                 d_model,
+        #                 nhead,
+        #                 mix=False,
+        #             ),
+        #             d_model,
+        #             d_ff=dim_feedforward,
+        #             dropout=dropout,
+        #             activation=activation,
+        #         )
+        #         for l in range(num_encoder_layers)
+        #     ],
+        #     [ConvLayer(d_model) for l in range(num_encoder_layers - 1)]
+        #     if distil
+        #     else None,
+        #     norm_layer=torch.nn.LayerNorm(d_model),
+        # )
+        #
+        # # Masked Decoder
+        # self.decoder = Decoder(
+        #     [
+        #         DecoderLayer(
+        #             AttentionLayer(
+        #                 Attn(
+        #                     mask_flag=True,
+        #                     factor=factor,
+        #                     attention_dropout=dropout,
+        #                     output_attention=False,
+        #                 ),
+        #                 d_model,
+        #                 nhead,
+        #                 mix=True,
+        #             ),
+        #             AttentionLayer(
+        #                 FullAttention(
+        #                     mask_flag=False,
+        #                     factor=factor,
+        #                     attention_dropout=dropout,
+        #                     output_attention=False,
+        #                 ),
+        #                 d_model,
+        #                 nhead,
+        #                 mix=False,
+        #             ),
+        #             d_model,
+        #             d_ff=dim_feedforward,
+        #             dropout=dropout,
+        #             activation=activation,
+        #         )
+        #         for l in range(num_decoder_layers)
+        #     ],
+        #     norm_layer=torch.nn.LayerNorm(d_model),
+        # )
 
     @property
     def _number_of_features(self) -> int:

From a7d38dc0fb0680e89b62e77870ce909e4d586bcc Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Tue, 17 Jan 2023 05:42:13 +0000
Subject: [PATCH 010/164] added things to address, after reading again enc-dec
 in the paper

---
 src/transformers/models/informer/config_using_gluonTS.py | 2 +-
 src/transformers/models/informer/modeling_informer.py    | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/transformers/models/informer/config_using_gluonTS.py b/src/transformers/models/informer/config_using_gluonTS.py
index c407b3ac3ec5..b047f6458885 100644
--- a/src/transformers/models/informer/config_using_gluonTS.py
+++ b/src/transformers/models/informer/config_using_gluonTS.py
@@ -224,7 +224,7 @@ def __init__(self, c_in):
             padding=1,
             padding_mode="circular",
         )
-        self.norm = nn.BatchNorm1d(c_in)
+        self.norm = nn.BatchNorm1d(c_in) # Question: why batchnorm here?
         self.activation = nn.ELU()
         self.maxPool = nn.MaxPool1d(kernel_size=3, stride=2, padding=1)
 
diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 395975549d53..8a82758db975 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -361,13 +361,13 @@ def __init__(self, c_in):
             padding=1,
             padding_mode="circular",
         )
-        self.norm = nn.BatchNorm1d(c_in)
+        self.norm = nn.BatchNorm1d(c_in) # Eli question: why batchnorm here?
         self.activation = nn.ELU()
         self.maxPool = nn.MaxPool1d(kernel_size=3, stride=2, padding=1)
 
     def forward(self, x):
         x = self.downConv(x.permute(0, 2, 1))
-        x = self.norm(x)
+        x = self.norm(x) # Eli: why? maybe because the impl...
         x = self.activation(x)
         x = self.maxPool(x)
         x = x.transpose(1, 2)
@@ -535,7 +535,7 @@ def __init__(self, config: InformerConfig):
         # Encoder
         # self.encoder = Encoder(
         #     [
-        #         EncoderLayer(
+        #         EncoderLayer( # Eli question: why I need EncoderLayers here?
         #             AttentionLayer(
         #                 Attn(
         #                     mask_flag=False,

From 8853e9327e1424afe08c962ecbd4e4ba53c05fcd Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Tue, 17 Jan 2023 13:49:29 +0000
Subject: [PATCH 011/164] done modeling - checking initialization work

---
 .../informer/check_instantiate_works.py       |   2 +-
 .../models/informer/configuration_informer.py |  56 ++++----
 .../models/informer/modeling_informer.py      | 127 +++++++++---------
 3 files changed, 91 insertions(+), 94 deletions(-)

diff --git a/src/transformers/models/informer/check_instantiate_works.py b/src/transformers/models/informer/check_instantiate_works.py
index 487bf2a9a21b..fbb6b151f79c 100644
--- a/src/transformers/models/informer/check_instantiate_works.py
+++ b/src/transformers/models/informer/check_instantiate_works.py
@@ -4,5 +4,5 @@
 if __name__ == '__main__':
     freq = "h"
     lags = get_lags_for_frequency(freq_str=freq)
-    model = InformerModel(InformerConfig())
+    model = InformerModel(InformerConfig(lags_seq=lags))
     print(model)
diff --git a/src/transformers/models/informer/configuration_informer.py b/src/transformers/models/informer/configuration_informer.py
index 913acf7f1f67..d7d6ecb0c153 100644
--- a/src/transformers/models/informer/configuration_informer.py
+++ b/src/transformers/models/informer/configuration_informer.py
@@ -32,37 +32,35 @@
 
 class InformerConfig(PretrainedConfig):
     def __init__(
-        self,
-        context_length: int,
-        prediction_length: int,
-        num_feat_dynamic_real: int,  # num_dynamic_real_features
-        num_feat_static_real: int,  # num_static_real_features
-        num_feat_static_cat: int,  # num_static_categorical_features
-        cardinality: List[int],
-        # Informer arguments
-        nhead: int,
-        num_encoder_layers: int, # encoder_layers
-        num_decoder_layers: int, # decoder_layers
-        dim_feedforward: int,
-        activation: str = "gelu", # activation_function
-        dropout: float = 0.1,
-        attn: str = "prob",
-        factor: int = 5,
-        distil: bool = True,
-        # univariate input
-        input_size: int = 1,
-        embedding_dimension: Optional[List[int]] = None,
-        distr_output: str = "student_t",
-        lags_seq: Optional[List[int]] = None,  # used to be freq.
-        scaling: bool = True,
-        num_parallel_samples: int = 100,
-        is_encoder_decoder: bool = True,
-        **kwargs
+            self,
+            input_size: int = 1,
+            prediction_length: Optional[int] = None,
+            context_length: Optional[int] = None,
+            distr_output: str = "student_t",
+            lags_seq: Optional[List[int]] = None,  # used to be freq.
+            scaling: bool = True,
+            num_feat_dynamic_real: int = 0,  # num_dynamic_real_features
+            num_feat_static_real: int = 0,  # num_static_real_features
+            num_feat_static_cat: int = 0,  # num_static_categorical_features
+            cardinality: Optional[List[int]] = None,
+            embedding_dimension: Optional[List[int]] = None,
+            dim_feedforward: int = 32,  # decoder_ffn_dim & encoder_ffn_dim
+            nhead: int = 2,  # Eli: not sure what the name
+            num_encoder_layers: int = 2,  # encoder_layers
+            num_decoder_layers: int = 2,  # decoder_layers
+            is_encoder_decoder: bool = True,
+            activation: str = "gelu",  # activation_function
+            dropout: float = 0.1,
+            attn: str = "prob",
+            factor: int = 5,
+            distil: bool = True,
+            num_parallel_samples: int = 100,
+            **kwargs
     ):
         # time series specific configuration
         self.prediction_length = prediction_length
         self.context_length = context_length or prediction_length
-        self.distr_output = distr_output # Eli: change to distribution_output
+        self.distr_output = distr_output  # Eli: change to distribution_output
         # self.loss = loss # Eli: From vanilla ts transformer
         self.input_size = input_size
         # self.target_shape = distr_output.event_shape  # Eli: I think can be removed
@@ -99,8 +97,8 @@ def __init__(
         # Transformer architecture configuration
         self.d_model = self.input_size * len(self.lags_seq) + self._number_of_features
         self.nhead = nhead
-        self.num_encoder_layers = num_encoder_layers # encoder_layers
-        self.num_decoder_layers = num_decoder_layers # decoder_layers
+        self.num_encoder_layers = num_encoder_layers  # encoder_layers
+        self.num_decoder_layers = num_decoder_layers  # decoder_layers
         self.dim_feedforward = dim_feedforward
         self.activation = activation  # activation_function
         self.dropout = dropout
diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 8a82758db975..2b80ab2e89a8 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -471,7 +471,7 @@ def forward(self, x, cross, x_mask=None, cross_mask=None):
 
 class InformerDecoder(nn.Module):
     def __init__(self, layers, norm_layer=None):
-        super(Decoder, self).__init__()
+        super(InformerDecoder, self).__init__()
         self.layers = nn.ModuleList(layers)
         self.norm = norm_layer
 
@@ -530,71 +530,70 @@ def __init__(self, config: InformerConfig):
         self.post_init()
 
         # Informer enc-decoder
-        # Attn = ProbAttention if config.attn == "prob" else FullAttention
+        Attn = ProbAttention if config.attn == "prob" else FullAttention
 
         # Encoder
-        # self.encoder = Encoder(
-        #     [
-        #         EncoderLayer( # Eli question: why I need EncoderLayers here?
-        #             AttentionLayer(
-        #                 Attn(
-        #                     mask_flag=False,
-        #                     factor=factor,
-        #                     attention_dropout=dropout,
-        #                     output_attention=False,
-        #                 ),
-        #                 d_model,
-        #                 nhead,
-        #                 mix=False,
-        #             ),
-        #             d_model,
-        #             d_ff=dim_feedforward,
-        #             dropout=dropout,
-        #             activation=activation,
-        #         )
-        #         for l in range(num_encoder_layers)
-        #     ],
-        #     [ConvLayer(d_model) for l in range(num_encoder_layers - 1)]
-        #     if distil
-        #     else None,
-        #     norm_layer=torch.nn.LayerNorm(d_model),
-        # )
-        #
-        # # Masked Decoder
-        # self.decoder = Decoder(
-        #     [
-        #         DecoderLayer(
-        #             AttentionLayer(
-        #                 Attn(
-        #                     mask_flag=True,
-        #                     factor=factor,
-        #                     attention_dropout=dropout,
-        #                     output_attention=False,
-        #                 ),
-        #                 d_model,
-        #                 nhead,
-        #                 mix=True,
-        #             ),
-        #             AttentionLayer(
-        #                 FullAttention(
-        #                     mask_flag=False,
-        #                     factor=factor,
-        #                     attention_dropout=dropout,
-        #                     output_attention=False,
-        #                 ),
-        #                 d_model,
-        #                 nhead,
-        #                 mix=False,
-        #             ),
-        #             d_model,
-        #             d_ff=dim_feedforward,
-        #             dropout=dropout,
-        #             activation=activation,
-        #         )
-        #         for l in range(num_decoder_layers)
-        #     ],
-        #     norm_layer=torch.nn.LayerNorm(d_model),
-        # )
+        self.encoder = InformerEncoder(
+            [
+                EncoderLayer( # Eli question: why I need EncoderLayers here?
+                    AttentionLayer(
+                        Attn(
+                            mask_flag=False,
+                            factor=config.factor,
+                            attention_dropout=config.dropout,
+                            output_attention=False,
+                        ),
+                        config.d_model,
+                        config.nhead,
+                        mix=False,
+                    ),
+                    config.d_model,
+                    d_ff=config.dim_feedforward,
+                    dropout=config.dropout,
+                    activation=config.activation,
+                )
+                for l in range(config.num_encoder_layers)
+            ],
+            [ConvLayer(config.d_model) for l in range(config.num_encoder_layers - 1)]
+            if config.distil else None,
+            norm_layer=torch.nn.LayerNorm(config.d_model),
+        )
+
+        # Masked Decoder
+        self.decoder = InformerDecoder(
+            [
+                DecoderLayer(
+                    AttentionLayer(
+                        Attn(
+                            mask_flag=True,
+                            factor=config.factor,
+                            attention_dropout=config.dropout,
+                            output_attention=False,
+                        ),
+                        config.d_model,
+                        config.nhead,
+                        mix=True,
+                    ),
+                    AttentionLayer(
+                        FullAttention(
+                            mask_flag=False,
+                            factor=config.factor,
+                            attention_dropout=config.dropout,
+                            output_attention=False,
+                        ),
+                        config.d_model,
+                        config.nhead,
+                        mix=False,
+                    ),
+                    config.d_model,
+                    d_ff=config.dim_feedforward,
+                    dropout=config.dropout,
+                    activation=config.activation,
+                )
+                for l in range(config.num_decoder_layers)
+            ],
+            norm_layer=torch.nn.LayerNorm(config.d_model),
+        )
 
     @property
     def _number_of_features(self) -> int:

From 422cdf5a4815aa984d6f3c138fdff8067afe7724 Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Tue, 27 Dec 2022 14:47:46 +0000
Subject: [PATCH 012/164] added informer to gitignore

---
 .gitignore | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index cf8183463613..bca127a3bce2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -163,4 +163,6 @@ tags
 *.lock
 
 # DS_Store (MacOS)
-.DS_Store
\ No newline at end of file
+.DS_Store
+
+Informer2020/
\ No newline at end of file

From c1001f05cabfd47a06f44e610f13fe1f826cae9a Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Thu, 12 Jan 2023 11:03:57 +0000
Subject: [PATCH 013/164] WIP informer2020

---
 docs/source/en/model_doc/informer.mdx         |   46 +
 src/transformers/__init__.py                  |   22 +
 src/transformers/models/__init__.py           |    1 +
 .../models/auto/configuration_auto.py         |    3 +
 src/transformers/models/auto/modeling_auto.py |    1 +
 src/transformers/models/informer/__init__.py  |   67 +
 .../models/informer/configuration_informer.py |  227 ++
 .../models/informer/modeling_informer.py      | 1955 +++++++++++++++++
 tests/models/informer/__init__.py             |    0
 .../models/informer/test_modeling_informer.py |  442 ++++
 10 files changed, 2764 insertions(+)
 create mode 100644 docs/source/en/model_doc/informer.mdx
 create mode 100644 src/transformers/models/informer/__init__.py
 create mode 100644 src/transformers/models/informer/configuration_informer.py
 create mode 100644 src/transformers/models/informer/modeling_informer.py
 create mode 100644 tests/models/informer/__init__.py
 create mode 100644 tests/models/informer/test_modeling_informer.py

diff --git a/docs/source/en/model_doc/informer.mdx b/docs/source/en/model_doc/informer.mdx
new file mode 100644
index 000000000000..6765b9768fc6
--- /dev/null
+++ b/docs/source/en/model_doc/informer.mdx
@@ -0,0 +1,46 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Informer
+
+## Overview
+
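+The Informer model was proposed in [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.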
+<INSERT SHORT SUMMARY HERE>
+
+The abstract from the paper is the following:
+
+*<INSERT PAPER ABSTRACT HERE>*
+
+Tips:
+
+<INSERT TIPS ABOUT MODEL HERE>
+
+This model was contributed by [elisim](https://huggingface.co/elisim).
+The original code can be found [here](https://github.com/zhouhaoyi/Informer2020).
+
+
+## InformerConfig
+
+[[autodoc]] InformerConfig
+
+
+## InformerModel
+
+[[autodoc]] InformerModel
+    - forward
+
+
+## InformerForPrediction
+
+[[autodoc]] InformerForPrediction
+    - forward
\ No newline at end of file
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 8e91bacb26f6..627605c7322a 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -404,6 +404,10 @@
         "TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
         "TimeSeriesTransformerConfig",
     ],
+    "models.informer": [
+        "INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "InformerConfig",
+    ],
     "models.timesformer": ["TIMESFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "TimesformerConfig"],
     "models.trajectory_transformer": [
         "TRAJECTORY_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
@@ -2222,6 +2226,14 @@
             "TimeSeriesTransformerPreTrainedModel",
         ]
     )
+    _import_structure["models.informer"].extend(
+        [
+            "INFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
+            "InformerForPrediction",
+            "InformerModel",
+            "InformerPreTrainedModel",
+        ]
+    )
     _import_structure["models.timesformer"].extend(
         [
             "TIMESFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -3779,6 +3791,10 @@
         TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
         TimeSeriesTransformerConfig,
     )
+    from .models.informer import (
+        INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        InformerConfig,
+    )
     from .models.timesformer import TIMESFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, TimesformerConfig
     from .models.trajectory_transformer import (
         TRAJECTORY_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
@@ -5282,6 +5298,12 @@
             TimeSeriesTransformerModel,
             TimeSeriesTransformerPreTrainedModel,
         )
+        from .models.informer import (
+            INFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
+            InformerForPrediction,
+            InformerModel,
+            InformerPreTrainedModel,
+        )
         from .models.timesformer import (
             TIMESFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
             TimesformerForVideoClassification,
diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py
index cf30880faa1c..ca4446744528 100644
--- a/src/transformers/models/__init__.py
+++ b/src/transformers/models/__init__.py
@@ -161,6 +161,7 @@
     tapas,
     tapex,
     time_series_transformer,
+    informer,
     timesformer,
     trajectory_transformer,
     transfo_xl,
diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
index cc3d48fe3be8..ded53b017aca 100755
--- a/src/transformers/models/auto/configuration_auto.py
+++ b/src/transformers/models/auto/configuration_auto.py
@@ -157,6 +157,7 @@
         ("table-transformer", "TableTransformerConfig"),
         ("tapas", "TapasConfig"),
         ("time_series_transformer", "TimeSeriesTransformerConfig"),
+        ("informer", "InformerConfig"),
         ("timesformer", "TimesformerConfig"),
         ("trajectory_transformer", "TrajectoryTransformerConfig"),
         ("transfo-xl", "TransfoXLConfig"),
@@ -310,6 +311,7 @@
         ("table-transformer", "TABLE_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("tapas", "TAPAS_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("time_series_transformer", "TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"),
+        ("informer", "INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("timesformer", "TIMESFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("transfo-xl", "TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("unispeech", "UNISPEECH_PRETRAINED_CONFIG_ARCHIVE_MAP"),
@@ -487,6 +489,7 @@
         ("tapas", "TAPAS"),
         ("tapex", "TAPEX"),
         ("time_series_transformer", "Time Series Transformer"),
+        ("informer", "Informer"),
         ("timesformer", "TimeSformer"),
         ("trajectory_transformer", "Trajectory Transformer"),
         ("transfo-xl", "Transformer-XL"),
diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
index 4465097dfeed..2d9f9effc622 100755
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@@ -153,6 +153,7 @@
         ("table-transformer", "TableTransformerModel"),
         ("tapas", "TapasModel"),
         ("time_series_transformer", "TimeSeriesTransformerModel"),
+        ("informer", "InformerModel"),
         ("timesformer", "TimesformerModel"),
         ("trajectory_transformer", "TrajectoryTransformerModel"),
         ("transfo-xl", "TransfoXLModel"),
diff --git a/src/transformers/models/informer/__init__.py b/src/transformers/models/informer/__init__.py
new file mode 100644
index 000000000000..927fad5e5e7f
--- /dev/null
+++ b/src/transformers/models/informer/__init__.py
@@ -0,0 +1,67 @@
+# flake8: noqa
+# There's no way to ignore "F401 '...' imported but unused" warnings in this
+# module, but to preserve other warnings. So, don't check this module at all.
+
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+# rely on isort to merge the imports
+from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
+
+
+_import_structure = {
+    "configuration_informer": [
+        "INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "InformerConfig",
+    ],
+}
+
+try:
+    if not is_torch_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    pass
+else:
+    _import_structure["modeling_informer"] = [
+        "INFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
+        "InformerForPrediction",
+        "InformerModel",
+        "InformerPreTrainedModel",
+    ]
+
+
+if TYPE_CHECKING:
+    from .configuration_informer import (
+        INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        InformerConfig,
+    )
+
+    try:
+        if not is_torch_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        pass
+    else:
+        from .modeling_informer import (
+            INFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
+            InformerForPrediction,
+            InformerModel,
+            InformerPreTrainedModel,
+        )
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
diff --git a/src/transformers/models/informer/configuration_informer.py b/src/transformers/models/informer/configuration_informer.py
new file mode 100644
index 000000000000..b08f1ee982bc
--- /dev/null
+++ b/src/transformers/models/informer/configuration_informer.py
@@ -0,0 +1,227 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Informer model configuration"""
+
+from typing import List, Optional
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "elisim/informer": "https://huggingface.co/elisim/informer/resolve/main/config.json",
+}
+
+
+
+class InformerConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of an [`InformerModel`]. It is used to
+    instantiate an Informer model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the Time Series
+    Transformer
+    [huggingface/time-series-transformer-tourism-monthly](https://huggingface.co/huggingface/time-series-transformer-tourism-monthly)
+    architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        prediction_length (`int`):
+            The prediction length for the decoder. In other words, the prediction horizon of the model.
+        context_length (`int`, *optional*, defaults to `prediction_length`):
+            The context length for the encoder. If `None`, the context length will be the same as the
+            `prediction_length`.
+        distribution_output (`string`, *optional*, defaults to `"student_t"`):
+            The distribution emission head for the model. Could be either "student_t", "normal" or "negative_binomial".
+        loss (`string`, *optional*, defaults to `"nll"`):
+            The loss function for the model corresponding to the `distribution_output` head. For parametric
+            distributions it is the negative log likelihood (nll) - which currently is the only supported one.
+        input_size (`int`, *optional*, defaults to 1):
+            The size of the target variable which by default is 1 for univariate targets. Would be > 1 in case of
+            multivariate targets.
+        scaling (`bool`, *optional*, defaults to `True`):
+            Whether to scale the input targets.
+        lags_sequence (`list[int]`, *optional*, defaults to `[1, 2, 3, 4, 5, 6, 7]`):
+            The lags of the input time series as covariates often dictated by the frequency. If `None`, a fresh
+            `[1, 2, 3, 4, 5, 6, 7]` list is created for this instance; a given sequence is copied into a new list.
+        num_time_features (`int`, *optional*, defaults to 0):
+            The number of time features in the input time series.
+        num_dynamic_real_features (`int`, *optional*, defaults to 0):
+            The number of dynamic real valued features.
+        num_static_categorical_features (`int`, *optional*, defaults to 0):
+            The number of static categorical features.
+        num_static_real_features (`int`, *optional*, defaults to 0):
+            The number of static real valued features.
+        cardinality (`list[int]`, *optional*):
+            The cardinality (number of different values) for each of the static categorical features. Should be a list
+            of integers, having the same length as `num_static_categorical_features`. Cannot be `None` if
+            `num_static_categorical_features` is > 0.
+        embedding_dimension (`list[int]`, *optional*):
+            The dimension of the embedding for each of the static categorical features. Should be a list of integers,
+            having the same length as `num_static_categorical_features`. Cannot be `None` if
+            `num_static_categorical_features` is > 0.
+        encoder_layers (`int`, *optional*, defaults to 2):
+            Number of encoder layers.
+        decoder_layers (`int`, *optional*, defaults to 2):
+            Number of decoder layers.
+        encoder_attention_heads (`int`, *optional*, defaults to 2):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        decoder_attention_heads (`int`, *optional*, defaults to 2):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        encoder_ffn_dim (`int`, *optional*, defaults to 32):
+            Dimension of the "intermediate" (often named feed-forward) layer in encoder.
+        decoder_ffn_dim (`int`, *optional*, defaults to 32):
+            Dimension of the "intermediate" (often named feed-forward) layer in decoder.
+        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and decoder. If string, `"gelu"` and
+            `"relu"` are supported.
+        dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the encoder, and decoder.
+        encoder_layerdrop (`float`, *optional*, defaults to 0.1):
+            The dropout probability for the attention and fully connected layers for each encoder layer.
+        decoder_layerdrop (`float`, *optional*, defaults to 0.1):
+            The dropout probability for the attention and fully connected layers for each decoder layer.
+        attention_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for the attention probabilities.
+        activation_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability used between the two layers of the feed-forward networks.
+        num_parallel_samples (`int`, *optional*, defaults to 100):
+            The number of samples to generate in parallel for each time step of inference.
+        init_std (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated normal weight initialization distribution.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether to use the past key/values attentions (if applicable to the model) to speed up decoding.
+
+    Example:
+
+    ```python
+    >>> from transformers import InformerConfig, InformerModel
+
+    >>> # Initializing a default Informer configuration
+    >>> configuration = InformerConfig()
+
+    >>> # Randomly initializing a model (with random weights) from the configuration
+    >>> model = InformerModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "informer"
+    attribute_map = {
+        "hidden_size": "d_model",
+        "num_attention_heads": "encoder_attention_heads",
+        "num_hidden_layers": "encoder_layers",
+    }
+
+    def __init__(
+        self,
+        input_size: int = 1,
+        prediction_length: Optional[int] = None,
+        context_length: Optional[int] = None,
+        distribution_output: str = "student_t",
+        loss: str = "nll",
+        lags_sequence: Optional[List[int]] = None,
+        scaling: bool = True,
+        num_dynamic_real_features: int = 0,
+        num_static_categorical_features: int = 0,
+        num_static_real_features: int = 0,
+        num_time_features: int = 0,
+        cardinality: Optional[List[int]] = None,
+        embedding_dimension: Optional[List[int]] = None,
+        encoder_ffn_dim: int = 32,
+        decoder_ffn_dim: int = 32,
+        encoder_attention_heads: int = 2,
+        decoder_attention_heads: int = 2,
+        encoder_layers: int = 2,
+        decoder_layers: int = 2,
+        is_encoder_decoder: bool = True,
+        activation_function: str = "gelu",
+        dropout: float = 0.1,
+        encoder_layerdrop: float = 0.1,
+        decoder_layerdrop: float = 0.1,
+        attention_dropout: float = 0.1,
+        activation_dropout: float = 0.1,
+        num_parallel_samples: int = 100,
+        init_std: float = 0.02,
+        use_cache: bool = True,
+        **kwargs
+    ):
+        # time series specific configuration
+        self.prediction_length = prediction_length
+        self.context_length = context_length or prediction_length
+        self.distribution_output = distribution_output
+        self.loss = loss
+        self.input_size = input_size
+        self.num_time_features = num_time_features
+        self.lags_sequence = [1, 2, 3, 4, 5, 6, 7] if lags_sequence is None else list(lags_sequence)
+        self.scaling = scaling
+        self.num_dynamic_real_features = num_dynamic_real_features
+        self.num_static_real_features = num_static_real_features
+        self.num_static_categorical_features = num_static_categorical_features
+        if cardinality and num_static_categorical_features > 0:
+            if len(cardinality) != num_static_categorical_features:
+                raise ValueError(
+                    "The cardinality should be a list of the same length as `num_static_categorical_features`"
+                )
+            self.cardinality = cardinality
+        else:
+            self.cardinality = [1]
+        if embedding_dimension and num_static_categorical_features > 0:
+            if len(embedding_dimension) != num_static_categorical_features:
+                raise ValueError(
+                    "The embedding dimension should be a list of the same length as `num_static_categorical_features`"
+                )
+            self.embedding_dimension = embedding_dimension
+        else:
+            self.embedding_dimension = [min(50, (cat + 1) // 2) for cat in self.cardinality]
+        self.num_parallel_samples = num_parallel_samples
+
+        # Transformer architecture configuration (d_model reads the attributes assigned above)
+        self.d_model = input_size * len(self.lags_sequence) + self._number_of_features
+        self.encoder_attention_heads = encoder_attention_heads
+        self.decoder_attention_heads = decoder_attention_heads
+        self.encoder_ffn_dim = encoder_ffn_dim
+        self.decoder_ffn_dim = decoder_ffn_dim
+        self.encoder_layers = encoder_layers
+        self.decoder_layers = decoder_layers
+
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.activation_dropout = activation_dropout
+        self.encoder_layerdrop = encoder_layerdrop
+        self.decoder_layerdrop = decoder_layerdrop
+
+        self.activation_function = activation_function
+        self.init_std = init_std
+
+        self.output_attentions = False
+        self.output_hidden_states = False
+
+        self.use_cache = use_cache
+
+        super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
+
+    @property
+    def _number_of_features(self) -> int:
+        return (
+            sum(self.embedding_dimension)
+            + self.num_dynamic_real_features
+            + self.num_time_features
+            + max(1, self.num_static_real_features)  # there is at least one dummy static real feature
+            + self.input_size  # the log(scale)
+        )
diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
new file mode 100644
index 000000000000..8d36a171da35
--- /dev/null
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -0,0 +1,1955 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch Informer model."""
+
+import random
+from dataclasses import dataclass
+from typing import Callable, Dict, List, Optional, Tuple, Union
+
+import torch
+from torch import nn
+from torch.distributions import (
+    AffineTransform,
+    Distribution,
+    Independent,
+    NegativeBinomial,
+    Normal,
+    StudentT,
+    TransformedDistribution,
+)
+
+from ...activations import ACT2FN
+from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions, ModelOutput
+from ...modeling_utils import PreTrainedModel
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
+from .configuration_informer import InformerConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "InformerConfig"
+
+
+INFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
+    "elisim/informer",
+    # See all Informer models at https://huggingface.co/models?filter=informer
+]
+
+
+
+class AffineTransformed(TransformedDistribution):
+    def __init__(self, base_distribution: Distribution, loc=None, scale=None, event_dim=0):
+        self.scale = 1.0 if scale is None else scale
+        self.loc = 0.0 if loc is None else loc
+
+        super().__init__(base_distribution, [AffineTransform(loc=self.loc, scale=self.scale, event_dim=event_dim)])
+
+    @property
+    def mean(self):
+        """
+        Returns the mean of the distribution.
+        """
+        return self.base_dist.mean * self.scale + self.loc
+
+    @property
+    def variance(self):
+        """
+        Returns the variance of the distribution.
+        """
+        return self.base_dist.variance * self.scale**2
+
+    @property
+    def stddev(self):
+        """
+        Returns the standard deviation of the distribution.
+        """
+        return self.variance.sqrt()
+
+
+class ParameterProjection(nn.Module):
+    def __init__(
+        self, in_features: int, args_dim: Dict[str, int], domain_map: Callable[..., Tuple[torch.Tensor]], **kwargs
+    ) -> None:
+        super().__init__(**kwargs)
+        self.args_dim = args_dim
+        self.proj = nn.ModuleList([nn.Linear(in_features, dim) for dim in args_dim.values()])
+        self.domain_map = domain_map
+
+    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor]:
+        params_unbounded = [proj(x) for proj in self.proj]
+
+        return self.domain_map(*params_unbounded)
+
+
+class LambdaLayer(nn.Module):
+    def __init__(self, function):
+        super().__init__()
+        self.function = function
+
+    def forward(self, x, *args):
+        return self.function(x, *args)
+
+
+class DistributionOutput:
+    distribution_class: type
+    in_features: int
+    args_dim: Dict[str, int]
+
+    def __init__(self, dim: int = 1) -> None:
+        self.dim = dim
+        self.args_dim = {k: dim * self.args_dim[k] for k in self.args_dim}
+
+    def _base_distribution(self, distr_args):
+        if self.dim == 1:
+            return self.distribution_class(*distr_args)
+        else:
+            return Independent(self.distribution_class(*distr_args), 1)
+
+    def distribution(
+        self,
+        distr_args,
+        loc: Optional[torch.Tensor] = None,
+        scale: Optional[torch.Tensor] = None,
+    ) -> Distribution:
+        distr = self._base_distribution(distr_args)
+        if loc is None and scale is None:
+            return distr
+        else:
+            return AffineTransformed(distr, loc=loc, scale=scale, event_dim=self.event_dim)
+
+    @property
+    def event_shape(self) -> Tuple:
+        r"""
+        Shape of each individual event contemplated by the distributions that this object constructs.
+        """
+        return () if self.dim == 1 else (self.dim,)
+
+    @property
+    def event_dim(self) -> int:
+        r"""
+        Number of event dimensions, i.e., length of the `event_shape` tuple, of the distributions that this object
+        constructs.
+        """
+        return len(self.event_shape)
+
+    @property
+    def value_in_support(self) -> float:
+        r"""
+        A float that will have a valid numeric value when computing the log-loss of the corresponding distribution. By
+        default 0.0. This value will be used when padding data series.
+        """
+        return 0.0
+
+    def get_parameter_projection(self, in_features: int) -> nn.Module:
+        r"""
+        Return the parameter projection layer that maps the input to the appropriate parameters of the distribution.
+        """
+        return ParameterProjection(
+            in_features=in_features,
+            args_dim=self.args_dim,
+            domain_map=LambdaLayer(self.domain_map),
+        )
+
+    def domain_map(self, *args: torch.Tensor):
+        r"""
+        Converts arguments to the right shape and domain. The domain depends on the type of distribution, while the
+        correct shape is obtained by reshaping the trailing axis in such a way that the returned tensors define a
+        distribution of the right event_shape.
+        """
+        raise NotImplementedError()
+
+    @classmethod
+    def squareplus(cls, x: torch.Tensor) -> torch.Tensor:
+        r"""
+        Helper to map inputs to the positive orthant by applying the square-plus operation. Reference:
+        https://twitter.com/jon_barron/status/1387167648669048833
+        """
+        return (x + torch.sqrt(torch.square(x) + 4.0)) / 2.0
+
+
+class StudentTOutput(DistributionOutput):
+    args_dim: Dict[str, int] = {"df": 1, "loc": 1, "scale": 1}
+    distribution_class: type = StudentT
+
+    @classmethod
+    def domain_map(cls, df: torch.Tensor, loc: torch.Tensor, scale: torch.Tensor):
+        scale = cls.squareplus(scale)
+        df = 2.0 + cls.squareplus(df)
+        return df.squeeze(-1), loc.squeeze(-1), scale.squeeze(-1)
+
+
+class NormalOutput(DistributionOutput):
+    args_dim: Dict[str, int] = {"loc": 1, "scale": 1}
+    distribution_class: type = Normal
+
+    @classmethod
+    def domain_map(cls, loc: torch.Tensor, scale: torch.Tensor):
+        scale = cls.squareplus(scale)
+        return loc.squeeze(-1), scale.squeeze(-1)
+
+
+class NegativeBinomialOutput(DistributionOutput):
+    args_dim: Dict[str, int] = {"total_count": 1, "logits": 1}
+    distribution_class: type = NegativeBinomial
+
+    @classmethod
+    def domain_map(cls, total_count: torch.Tensor, logits: torch.Tensor):
+        total_count = cls.squareplus(total_count)
+        return total_count.squeeze(-1), logits.squeeze(-1)
+
+    def _base_distribution(self, distr_args) -> Distribution:
+        total_count, logits = distr_args
+        if self.dim == 1:
+            return self.distribution_class(total_count=total_count, logits=logits)
+        else:
+            return Independent(self.distribution_class(total_count=total_count, logits=logits), 1)
+
+    # Overwrites the parent class method. We cannot scale using the affine
+    # transformation since negative binomial should return integers. Instead
+    # we scale the parameters. `loc` is accepted for signature parity but unused.
+    def distribution(
+        self, distr_args, loc: Optional[torch.Tensor] = None, scale: Optional[torch.Tensor] = None
+    ) -> Distribution:
+        total_count, logits = distr_args
+
+        if scale is not None:
+            # See scaling property of Gamma. Out-of-place add: never mutate the caller's `logits`.
+            logits = logits + scale.log()
+
+        return self._base_distribution((total_count, logits))
+
+
+class FeatureEmbedder(nn.Module):
+    def __init__(self, cardinalities: List[int], embedding_dims: List[int]) -> None:
+        super().__init__()
+
+        self.num_features = len(cardinalities)
+        self.embedders = nn.ModuleList([nn.Embedding(c, d) for c, d in zip(cardinalities, embedding_dims)])
+
+    def forward(self, features: torch.Tensor) -> torch.Tensor:
+        if self.num_features > 1:
+            # we slice the last dimension, giving an array of length
+            # self.num_features with shape (N,T) or (N)
+            cat_feature_slices = torch.chunk(features, self.num_features, dim=-1)
+        else:
+            cat_feature_slices = [features]
+
+        return torch.cat(
+            [
+                embed(cat_feature_slice.squeeze(-1))
+                for embed, cat_feature_slice in zip(self.embedders, cat_feature_slices)
+            ],
+            dim=-1,
+        )
+
+
+class MeanScaler(nn.Module):
+    """
+    Computes a scaling factor as the weighted average absolute value along dimension `dim`, and scales the data
+    accordingly.
+
+    Args:
+        dim (`int`):
+            Dimension along which to compute the scale.
+        keepdim (`bool`, *optional*, defaults to `False`):
+            Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it.
+        minimum_scale (`float`, *optional*, defaults to 1e-10):
+            Default scale that is used for elements that are constantly zero along dimension `dim`.
+    """
+
+    def __init__(self, dim: int, keepdim: bool = False, minimum_scale: float = 1e-10):
+        super().__init__()
+        if not dim > 0:
+            raise ValueError("Cannot compute scale along dim = 0 (batch dimension), please provide dim > 0")
+        self.dim = dim
+        self.keepdim = keepdim
+        self.register_buffer("minimum_scale", torch.tensor(minimum_scale))
+
+    def forward(self, data: torch.Tensor, weights: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        # these will have shape (N, C)
+        total_weight = weights.sum(dim=self.dim)
+        weighted_sum = (data.abs() * weights).sum(dim=self.dim)
+
+        # first compute a global scale per-dimension
+        total_observed = total_weight.sum(dim=0)
+        denominator = torch.max(total_observed, torch.ones_like(total_observed))
+        default_scale = weighted_sum.sum(dim=0) / denominator
+
+        # then compute a per-item, per-dimension scale
+        denominator = torch.max(total_weight, torch.ones_like(total_weight))
+        scale = weighted_sum / denominator
+
+        # use per-batch scale when no element is observed
+        # or when the sequence contains only zeros
+        scale = (
+            torch.max(
+                self.minimum_scale,
+                torch.where(
+                    weighted_sum > torch.zeros_like(weighted_sum),
+                    scale,
+                    default_scale * torch.ones_like(total_weight),
+                ),
+            )
+            .detach()
+            .unsqueeze(dim=self.dim)
+        )
+
+        return data / scale, scale if self.keepdim else scale.squeeze(dim=self.dim)
+
+
+class NOPScaler(nn.Module):
+    """
+    Assigns a scaling factor equal to 1 along dimension `dim`, and therefore applies no scaling to the input data.
+
+    Args:
+        dim (`int`):
+            Dimension along which to compute the scale.
+        keepdim (`bool`, *optional*, defaults to `False`):
+            Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it.
+    """
+
+    def __init__(self, dim: int, keepdim: bool = False):
+        super().__init__()
+        self.dim = dim
+        self.keepdim = keepdim
+
+    def forward(self, data: torch.Tensor, observed_indicator: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        scale = torch.ones_like(data).mean(dim=self.dim, keepdim=self.keepdim)
+        return data, scale
+
+
+def weighted_average(input_tensor: torch.Tensor, weights: Optional[torch.Tensor] = None, dim=None) -> torch.Tensor:
+    """
+    Computes the weighted average of a given tensor across a given `dim`, masking values associated with weight zero,
+    meaning instead of `nan * 0 = nan` you will get `0 * 0 = 0`.
+
+    Args:
+        input_tensor (`torch.FloatTensor`):
+            Input tensor, of which the average must be computed.
+        weights (`torch.FloatTensor`, *optional*):
+            Weights tensor, of the same shape as `input_tensor`.
+        dim (`int`, *optional*):
+            The dim along which to average `input_tensor`. `None` reduces over all elements; `0` is a valid dim.
+
+    Returns:
+        `torch.FloatTensor`: The tensor with values averaged along the specified `dim`.
+    """
+    if weights is not None:
+        weighted_tensor = torch.where(weights != 0, input_tensor * weights, torch.zeros_like(input_tensor))
+        sum_weights = torch.clamp(weights.sum(dim=dim) if dim is not None else weights.sum(), min=1.0)
+        return (weighted_tensor.sum(dim=dim) if dim is not None else weighted_tensor.sum()) / sum_weights
+    else:
+        return input_tensor.mean(dim=dim)
+
+
+class NegativeLogLikelihood:
+    """
+    Computes the negative log likelihood loss from input distribution with respect to target.
+    """
+
+    def __call__(self, input: torch.distributions.Distribution, target: torch.Tensor) -> torch.Tensor:
+        return -input.log_prob(target)
+
+
+# Copied from transformers.models.bart.modeling_bart._make_causal_mask
+def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_values_length: int = 0):
+    """
+    Make causal mask used for bi-directional self-attention.
+    """
+    bsz, tgt_len = input_ids_shape
+    mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min))
+    mask_cond = torch.arange(mask.size(-1))
+    mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
+    mask = mask.to(dtype)
+
+    if past_key_values_length > 0:
+        mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype), mask], dim=-1)
+    return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
+
+
+# Copied from transformers.models.bart.modeling_bart._expand_mask
+def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
+    """
+    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+    """
+    bsz, src_len = mask.size()
+    tgt_len = tgt_len if tgt_len is not None else src_len
+
+    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
+
+    inverted_mask = 1.0 - expanded_mask
+
+    return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
+
+
+@dataclass
+class Seq2SeqTimeSeriesModelOutput(ModelOutput):
+    """
+    Base class for model encoder's outputs that also contains pre-computed hidden states that can speed up sequential
+    decoding.
+
+    Args:
+        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+            Sequence of hidden-states at the output of the last layer of the decoder of the model.
+
+            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
+            hidden_size)` is output.
+        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
+            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+
+            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the decoder at the output of each layer plus the optional initial embedding outputs.
+        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
+            self-attention heads.
+        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
+            weighted average in the cross-attention heads.
+        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the encoder of the model.
+        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the encoder at the output of each layer plus the optional initial embedding outputs.
+        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
+            self-attention heads.
+        scale: (`torch.FloatTensor` of shape `(batch_size,)`, *optional*):
+            Scaling values of each time series' context window which is used to give the model inputs of the same
+            magnitude and then used to rescale to the original scale.
+        static_features: (`torch.FloatTensor` of shape `(batch_size, feature size)`, *optional*):
+            Static features of each time series in a batch which are copied to the covariates at inference time.
+    """
+
+    last_hidden_state: torch.FloatTensor = None
+    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+    decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
+    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    scale: Optional[torch.FloatTensor] = None
+    static_features: Optional[torch.FloatTensor] = None
+
+
+@dataclass
+class Seq2SeqTimeSeriesPredictionOutput(ModelOutput):
+    """
+    Base class for the model's prediction outputs that also contain the loss as well as the parameters of the
+    chosen distribution.
+
+    Args:
+        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when a `future_values` is provided):
+            Distributional loss.
+        params (`torch.FloatTensor` of shape `(batch_size, num_samples, num_params)`):
+            Parameters of the chosen distribution.
+        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
+            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+
+            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
+        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
+            self-attention heads.
+        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
+            weighted average in the cross-attention heads.
+        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the encoder of the model.
+        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
+        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
+            self-attention heads.
+        scale: (`torch.FloatTensor` of shape `(batch_size,)`, *optional*):
+            Scaling values of each time series' context window which is used to give the model inputs of the same
+            magnitude and then used to rescale to the original scale.
+        static_features: (`torch.FloatTensor` of shape `(batch_size, feature size)`, *optional*):
+            Static features of each time series in a batch which are copied to the covariates at inference time.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    params: Optional[Tuple[torch.FloatTensor]] = None
+    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+    decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
+    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    scale: Optional[torch.FloatTensor] = None
+    static_features: Optional[torch.FloatTensor] = None
+
+
+@dataclass
+class SampleTimeSeriesPredictionOutput(ModelOutput):
+    sequences: torch.FloatTensor = None
+
+
+# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->Informer
+class InformerAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        dropout: float = 0.0,
+        is_decoder: bool = False,
+        bias: bool = True,
+    ):
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.head_dim = embed_dim // num_heads
+
+        if (self.head_dim * num_heads) != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
+                f" and `num_heads`: {num_heads})."
+            )
+        self.scaling = self.head_dim**-0.5
+        self.is_decoder = is_decoder
+
+        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+
+    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        key_value_states: Optional[torch.Tensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        layer_head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """Input shape: Batch x Time x Channel"""
+
+        # if key_value_states are provided this layer is used as a cross-attention layer
+        # for the decoder
+        is_cross_attention = key_value_states is not None
+
+        bsz, tgt_len, _ = hidden_states.size()
+
+        # get query proj
+        query_states = self.q_proj(hidden_states) * self.scaling
+        # get key, value proj
+        # `past_key_value[0].shape[2] == key_value_states.shape[1]`
+        # is checking that the `sequence_length` of the `past_key_value` is the same as
+        # the provided `key_value_states` to support prefix tuning
+        if (
+            is_cross_attention
+            and past_key_value is not None
+            and past_key_value[0].shape[2] == key_value_states.shape[1]
+        ):
+            # reuse k,v, cross_attentions
+            key_states = past_key_value[0]
+            value_states = past_key_value[1]
+        elif is_cross_attention:
+            # cross_attentions
+            key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
+            value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
+        elif past_key_value is not None:
+            # reuse k, v, self_attention
+            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+            key_states = torch.cat([past_key_value[0], key_states], dim=2)
+            value_states = torch.cat([past_key_value[1], value_states], dim=2)
+        else:
+            # self_attention
+            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+
+        if self.is_decoder:
+            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
+            # Further calls to cross_attention layer can then reuse all cross-attention
+            # key/value_states (first "if" case)
+            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
+            # all previous decoder key/value_states. Further calls to uni-directional self-attention
+            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+            # if encoder bi-directional self-attention `past_key_value` is always `None`
+            past_key_value = (key_states, value_states)
+
+        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
+        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
+        key_states = key_states.view(*proj_shape)
+        value_states = value_states.view(*proj_shape)
+
+        src_len = key_states.size(1)
+        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
+
+        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
+            raise ValueError(
+                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
+                )
+            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
+            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+        if layer_head_mask is not None:
+            if layer_head_mask.size() != (self.num_heads,):
+                raise ValueError(
+                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
+                    f" {layer_head_mask.size()}"
+                )
+            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+        if output_attentions:
+            # this operation is a bit awkward, but it's required to
+            # make sure that attn_weights keeps its gradient.
+            # In order to do so, attn_weights have to be reshaped
+            # twice and have to be reused in the following
+            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
+        else:
+            attn_weights_reshaped = None
+
+        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+
+        attn_output = torch.bmm(attn_probs, value_states)
+
+        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
+        attn_output = attn_output.transpose(1, 2)
+
+        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
+        # partitioned across GPUs when using tensor-parallelism.
+        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
+
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output, attn_weights_reshaped, past_key_value
+
+
+# Copied from transformers.models.bart.modeling_bart.BartEncoderLayer with Bart->Informer
+class InformerEncoderLayer(nn.Module):
+    def __init__(self, config: InformerConfig):
+        super().__init__()
+        self.embed_dim = config.d_model
+        self.self_attn = InformerAttention(
+            embed_dim=self.embed_dim,
+            num_heads=config.encoder_attention_heads,
+            dropout=config.attention_dropout,
+        )
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.dropout = config.dropout  # applied after the attention block and after fc2
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.activation_dropout = config.activation_dropout  # applied between fc1 and fc2
+        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
+        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
+        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        attention_mask: torch.FloatTensor,
+        layer_head_mask: torch.FloatTensor,
+        output_attentions: Optional[bool] = False,
+    ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
+                `(encoder_attention_heads,)`.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+        """
+        residual = hidden_states
+        hidden_states, attn_weights, _ = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            layer_head_mask=layer_head_mask,
+            output_attentions=output_attentions,
+        )
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = residual + hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)  # LayerNorm comes after the residual add
+
+        residual = hidden_states
+        hidden_states = self.activation_fn(self.fc1(hidden_states))
+        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+        hidden_states = self.fc2(hidden_states)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = residual + hidden_states
+        hidden_states = self.final_layer_norm(hidden_states)
+
+        if hidden_states.dtype == torch.float16 and (
+            torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
+        ):
+            clamp_value = torch.finfo(hidden_states.dtype).max - 1000  # just below the fp16 maximum
+            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (attn_weights,)
+
+        return outputs
+
+
+# Copied from transformers.models.bart.modeling_bart.BartDecoderLayer with Bart->Informer
+class InformerDecoderLayer(nn.Module):
+    def __init__(self, config: InformerConfig):
+        super().__init__()
+        self.embed_dim = config.d_model
+
+        self.self_attn = InformerAttention(
+            embed_dim=self.embed_dim,
+            num_heads=config.decoder_attention_heads,
+            dropout=config.attention_dropout,
+            is_decoder=True,
+        )
+        self.dropout = config.dropout
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.activation_dropout = config.activation_dropout
+
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.encoder_attn = InformerAttention(
+            self.embed_dim,
+            config.decoder_attention_heads,
+            dropout=config.attention_dropout,
+            is_decoder=True,
+        )
+        self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
+        self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
+        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        layer_head_mask: Optional[torch.Tensor] = None,
+        cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = True,
+    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            encoder_hidden_states (`torch.FloatTensor`):
+                cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
+            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            layer_head_mask (`torch.FloatTensor`): mask for self-attention heads in a given layer of size
+                `(decoder_attention_heads,)`.
+            cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
+                size `(decoder_attention_heads,)`.
+            past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+        """
+        residual = hidden_states
+
+        # Self Attention
+        # the first two entries of past_key_value are the cached self-attention key/value states
+        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
+        # the self-attention cache returned here becomes the start of the present_key_value tuple
+        hidden_states, self_attn_weights, present_key_value = self.self_attn(
+            hidden_states=hidden_states,
+            past_key_value=self_attn_past_key_value,
+            attention_mask=attention_mask,
+            layer_head_mask=layer_head_mask,
+            output_attentions=output_attentions,
+        )
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = residual + hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+
+        # Cross-Attention Block
+        cross_attn_present_key_value = None
+        cross_attn_weights = None
+        if encoder_hidden_states is not None:  # cross-attention is skipped without encoder states
+            residual = hidden_states
+
+            # the last two entries of past_key_value are the cached cross-attention key/value states
+            cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
+            hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn(
+                hidden_states=hidden_states,
+                key_value_states=encoder_hidden_states,
+                attention_mask=encoder_attention_mask,
+                layer_head_mask=cross_attn_layer_head_mask,
+                past_key_value=cross_attn_past_key_value,
+                output_attentions=output_attentions,
+            )
+            hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+            hidden_states = residual + hidden_states
+            hidden_states = self.encoder_attn_layer_norm(hidden_states)
+
+            # append the cross-attention cache after the self-attention cache in present_key_value
+            present_key_value = present_key_value + cross_attn_present_key_value
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.activation_fn(self.fc1(hidden_states))
+        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+        hidden_states = self.fc2(hidden_states)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = residual + hidden_states
+        hidden_states = self.final_layer_norm(hidden_states)
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (self_attn_weights, cross_attn_weights)
+
+        if use_cache:
+            outputs += (present_key_value,)
+
+        return outputs
+
+
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerPreTrainedModel with TimeSeriesTransformer->Informer
+class InformerPreTrainedModel(PreTrainedModel):
+    config_class = InformerConfig
+    base_model_prefix = "model"
+    main_input_name = "past_values"
+    supports_gradient_checkpointing = True
+
+    def _init_weights(self, module):
+        std = self.config.init_std  # standard deviation of the normal initialisation below
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()  # keep the padding row at zero
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        if isinstance(module, (InformerDecoder, InformerEncoder)):  # both define `gradient_checkpointing`
+            module.gradient_checkpointing = value
+
+
+INFORMER_START_DOCSTRING = r"""
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning
+    heads etc.)
+
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+
+    Parameters:
+        config ([`InformerConfig`]):
+            Model configuration class with all the parameters of the model. Initializing with a config file does not
+            load the weights associated with the model, only the configuration. Check out the
+            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+INFORMER_INPUTS_DOCSTRING = r"""
+    Args:
+        past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
+            Past values of the time series, that serve as context in order to predict the future. These values may
+            contain lags, i.e. additional values from the past which are added in order to serve as "extra context".
+            The `past_values` is what the Transformer encoder gets as input (with optional additional features, such as
+            `static_categorical_features`, `static_real_features`, `past_time_features`).
+
+            The sequence length here is equal to `context_length` + `max(config.lags_sequence)`.
+
+            Missing values need to be replaced with zeros.
+
+        past_time_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_features)`, *optional*):
+            Optional time features, which the model internally will add to `past_values`. These could be things like
+            "month of year", "day of the month", etc. encoded as vectors (for instance as Fourier features). These
+            could also be so-called "age" features, which basically help the model know "at which point in life" a
+            time-series is. Age features have small values for distant past time steps and increase monotonically the
+            more we approach the current time step.
+
+            These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT, where
+            the position encodings are learned from scratch internally as parameters of the model, the Time Series
+            Transformer requires to provide additional time features.
+
+            The Informer only learns additional embeddings for `static_categorical_features`.
+
+        past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected in
+            `[0, 1]`:
+
+            - 1 for values that are **observed**,
+            - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
+
+        static_categorical_features (`torch.LongTensor` of shape `(batch_size, number of static categorical features)`, *optional*):
+            Optional static categorical features for which the model will learn an embedding, which it will add to the
+            values of the time series.
+
+            Static categorical features are features which have the same value for all time steps (static over time).
+
+            A typical example of a static categorical feature is a time series ID.
+
+        static_real_features (`torch.FloatTensor` of shape `(batch_size, number of static real features)`, *optional*):
+            Optional static real features which the model will add to the values of the time series.
+
+            Static real features are features which have the same value for all time steps (static over time).
+
+            A typical example of a static real feature is promotion information.
+
+        future_values (`torch.FloatTensor` of shape `(batch_size, prediction_length)`):
+            Future values of the time series, that serve as labels for the model. The `future_values` is what the
+            Transformer needs to learn to output, given the `past_values`.
+
+            See the demo notebook and code snippets for details.
+
+            Missing values need to be replaced with zeros.
+
+        future_time_features (`torch.FloatTensor` of shape `(batch_size, prediction_length, num_features)`, *optional*):
+            Optional time features, which the model internally will add to `future_values`. These could be things like
+            "month of year", "day of the month", etc. encoded as vectors (for instance as Fourier features). These
+            could also be so-called "age" features, which basically help the model know "at which point in life" a
+            time-series is. Age features have small values for distant past time steps and increase monotonically the
+            more we approach the current time step.
+
+            These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT, where
+            the position encodings are learned from scratch internally as parameters of the model, the Time Series
+            Transformer requires to provide additional features.
+
+            The Informer only learns additional embeddings for `static_categorical_features`.
+
+        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on certain token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+
+        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Mask to avoid performing attention on certain token indices. By default, a causal mask will be used, to
+            make sure the model can only look at previous inputs in order to predict the future.
+
+        head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+            Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        decoder_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+            Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        encoder_outputs (`tuple(tuple(torch.FloatTensor))`, *optional*):
+            Tuple consists of `last_hidden_state`, `hidden_states` (*optional*) and `attentions` (*optional*)
+            `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` (*optional*) is a sequence of
+            hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
+        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+            `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of shape
+            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+
+            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+
+            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerEncoder with TimeSeriesTransformer->Informer
+class InformerEncoder(InformerPreTrainedModel):
+    """
+    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
+    [`InformerEncoderLayer`].
+
+    Args:
+        config: InformerConfig
+    """
+
+    def __init__(self, config: InformerConfig):
+        super().__init__(config)
+
+        self.dropout = config.dropout
+        self.layerdrop = config.encoder_layerdrop
+
+        embed_dim = config.d_model
+
+        self.layers = nn.ModuleList([InformerEncoderLayer(config) for _ in range(config.encoder_layers)])
+        self.layernorm_embedding = nn.LayerNorm(embed_dim)
+
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def forward(
+        self,
+        attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutput]:
+        r"""
+        Args:
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
+
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+                Embedded inputs, used directly as the initial hidden states (layer normalization and dropout are
+                applied to them before the first layer). Despite the `None` default they must be provided, since
+                this method accepts no other input sequence.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        hidden_states = inputs_embeds
+        hidden_states = self.layernorm_embedding(hidden_states)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+        # expand attention_mask
+        if attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            attention_mask = _expand_mask(attention_mask, inputs_embeds.dtype)
+
+        encoder_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+
+        # check if head_mask has a correct number of layers specified if desired
+        if head_mask is not None:
+            if head_mask.size()[0] != (len(self.layers)):
+                raise ValueError(
+                    f"The head_mask should be specified for {len(self.layers)} layers, but it is for"
+                    f" {head_mask.size()[0]}."
+                )
+
+        for idx, encoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)
+            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
+            dropout_probability = random.uniform(0, 1)
+            if self.training and (dropout_probability < self.layerdrop):  # skip the layer
+                layer_outputs = (None, None)  # hidden_states stays unchanged; the attention entry is None
+            else:
+                if self.gradient_checkpointing and self.training:
+
+                    def create_custom_forward(module):
+                        def custom_forward(*inputs):
+                            return module(*inputs, output_attentions)  # flag passed as 4th positional arg
+
+                        return custom_forward
+
+                    layer_outputs = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(encoder_layer),
+                        hidden_states,
+                        attention_mask,
+                        (head_mask[idx] if head_mask is not None else None),
+                    )
+                else:
+                    layer_outputs = encoder_layer(
+                        hidden_states,
+                        attention_mask,
+                        layer_head_mask=(head_mask[idx] if head_mask is not None else None),
+                        output_attentions=output_attentions,
+                    )
+
+                hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+
+        if output_hidden_states:
+            encoder_states = encoder_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+        )
+
+
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerDecoder with TimeSeriesTransformer->Informer
+class InformerDecoder(InformerPreTrainedModel):
+    """
+    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a
+    [`InformerDecoderLayer`]
+
+    Args:
+        config: InformerConfig
+    """
+
+    def __init__(self, config: InformerConfig):
+        super().__init__(config)
+        self.dropout = config.dropout
+        self.layerdrop = config.decoder_layerdrop
+
+        self.layers = nn.ModuleList([InformerDecoderLayer(config) for _ in range(config.decoder_layers)])
+        self.layernorm_embedding = nn.LayerNorm(config.d_model)
+
+        self.gradient_checkpointing = False  # switched by `_set_gradient_checkpointing`
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
+        # build the causal mask and merge it with the optional caller-supplied mask
+        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+        combined_attention_mask = None
+        if input_shape[-1] > 1:  # no causal mask is built for a single-step input
+            combined_attention_mask = _make_causal_mask(
+                input_shape, inputs_embeds.dtype, past_key_values_length=past_key_values_length
+            ).to(inputs_embeds.device)
+
+        if attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
+                inputs_embeds.device
+            )
+            combined_attention_mask = (
+                expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
+            )
+
+        return combined_attention_mask  # None when neither mask was created
+
+    def forward(
+        self,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        cross_attn_head_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
+        r"""
+        Args:
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
+                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
+                of the decoder.
+            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
+                Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
+                selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
+
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+
+            cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+                Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing
+                cross-attention on hidden heads. Mask values selected in `[0, 1]`:
+
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+
+            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+                Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
+                shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
+                shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+
+                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
+                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+
+                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
+                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
+                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+                than the model's internal embedding lookup matrix.
+
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        input_shape = inputs_embeds.size()[:-1]
+
+        # past_key_values_length
+        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
+
+        attention_mask = self._prepare_decoder_attention_mask(
+            attention_mask, input_shape, inputs_embeds, past_key_values_length
+        )
+
+        # expand encoder attention mask
+        if encoder_hidden_states is not None and encoder_attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1])
+
+        hidden_states = inputs_embeds
+        hidden_states = self.layernorm_embedding(hidden_states)
+
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
+        next_decoder_cache = () if use_cache else None
+
+        # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
+        for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
+            if attn_mask is not None:
+                if attn_mask.size()[0] != (len(self.layers)):
+                    raise ValueError(
+                        f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
+                        f" {head_mask.size()[0]}."
+                    )
+
+        for idx, decoder_layer in enumerate(self.layers):
+            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            dropout_probability = random.uniform(0, 1)
+            if self.training and (dropout_probability < self.layerdrop):
+                continue
+
+            past_key_value = past_key_values[idx] if past_key_values is not None else None
+
+            if self.gradient_checkpointing and self.training:
+
+                if use_cache:
+                    logger.warning(
+                        "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                    )
+                    use_cache = False
+
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        # None for past_key_value
+                        return module(*inputs, output_attentions, use_cache)
+
+                    return custom_forward
+
+                layer_outputs = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(decoder_layer),
+                    hidden_states,
+                    attention_mask,
+                    encoder_hidden_states,
+                    encoder_attention_mask,
+                    head_mask[idx] if head_mask is not None else None,
+                    cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None,
+                    None,
+                )
+            else:
+
+                layer_outputs = decoder_layer(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    encoder_hidden_states=encoder_hidden_states,
+                    encoder_attention_mask=encoder_attention_mask,
+                    layer_head_mask=(head_mask[idx] if head_mask is not None else None),
+                    cross_attn_layer_head_mask=(
+                        cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None
+                    ),
+                    past_key_value=past_key_value,
+                    output_attentions=output_attentions,
+                    use_cache=use_cache,
+                )
+            hidden_states = layer_outputs[0]
+
+            if use_cache:
+                next_decoder_cache += (layer_outputs[3 if output_attentions else 1],)
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+                if encoder_hidden_states is not None:
+                    all_cross_attentions += (layer_outputs[2],)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        next_cache = next_decoder_cache if use_cache else None
+        if not return_dict:
+            return tuple(
+                v
+                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions]
+                if v is not None
+            )
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+            cross_attentions=all_cross_attentions,
+        )
+
+
+@add_start_docstrings(
+    "The bare Informer Model outputting raw hidden-states without any specific head on top.",
+    INFORMER_START_DOCSTRING,
+)
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerModel with TimeSeriesTransformer->Informer,TIME_SERIES_TRANSFORMER->INFORMER
+class InformerModel(InformerPreTrainedModel):
+    def __init__(self, config: InformerConfig):
+        """Build the scaler, the static-feature embedder, and the encoder/decoder stacks from `config`."""
+        super().__init__(config)
+
+        # Both scaler classes take the same constructor arguments, so only the class depends on
+        # the `config.scaling` flag.
+        scaler_class = MeanScaler if config.scaling else NOPScaler
+        self.scaler = scaler_class(dim=1, keepdim=True)
+
+        # Embedding module that is later applied to the static categorical features.
+        self.embedder = FeatureEmbedder(cardinalities=config.cardinality, embedding_dims=config.embedding_dimension)
+
+        # Encoder first, then decoder; both are configured from the same config object.
+        encoder = InformerEncoder(config)
+        decoder = InformerDecoder(config)
+        self.encoder, self.decoder = encoder, decoder
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @property
+    def _past_length(self) -> int:
+        return self.config.context_length + max(self.config.lags_sequence)  # context window + largest lag
+
+    def get_lagged_subsequences(
+        self, sequence: torch.Tensor, subsequences_length: int, shift: int = 0
+    ) -> torch.Tensor:
+        """
+        Returns lagged subsequences of a given sequence. Returns a tensor of shape (N, S, C, I), where S =
+        subsequences_length and I = len(indices), with indices[k] = config.lags_sequence[k] - shift. Specifically,
+        lagged[i, j, :, k] = sequence[i, -indices[k]-S+j, :].
+
+        Args:
+            sequence: Tensor
+                The sequence from which lagged subsequences should be extracted. Shape: (N, T, C).
+            subsequences_length : int
+                Length of the subsequences to be extracted.
+            shift: int
+                Shift the lags by this amount back.
+
+        Raises:
+            ValueError: if the largest shifted lag plus `subsequences_length` exceeds the sequence length T.
+        """
+        sequence_length = sequence.shape[1]
+        indices = [lag - shift for lag in self.config.lags_sequence]
+
+        # Explicit check rather than `assert`, so the validation still runs under `python -O`.
+        if max(indices) + subsequences_length > sequence_length:
+            raise ValueError(
+                f"lags cannot go further than history length, found lag {max(indices)} "
+                f"while history length is only {sequence_length}"
+            )
+
+        lagged_values = [
+            sequence[:, -lag_index - subsequences_length : (-lag_index if lag_index > 0 else None), ...]
+            for lag_index in indices
+        ]
+        return torch.stack(lagged_values, dim=-1)
+
+    def create_network_inputs(
+        self,
+        past_values: torch.Tensor,
+        past_time_features: torch.Tensor,
+        static_categorical_features: torch.Tensor,
+        static_real_features: torch.Tensor,
+        past_observed_mask: Optional[torch.Tensor] = None,
+        future_values: Optional[torch.Tensor] = None,
+        future_time_features: Optional[torch.Tensor] = None,
+    ):
+        """
+        Builds the transformer input from scaled lagged values, static features and time features.
+
+        Returns a tuple `(transformer_inputs, scale, static_feat)`. Raises `ValueError` when the length of the
+        (past + future) values differs from the length implied by the config.
+        """
+        # time feature: past features from index `_past_length - context_length` on, plus the future ones if given
+        past_context_time_feat = past_time_features[:, self._past_length - self.config.context_length :, ...]
+        time_feat = (
+            torch.cat((past_context_time_feat, future_time_features), dim=1)
+            if future_values is not None
+            else past_context_time_feat
+        )
+
+        # target
+        if past_observed_mask is None:
+            past_observed_mask = torch.ones_like(past_values)
+
+        context = past_values[:, -self.config.context_length :]
+        observed_context = past_observed_mask[:, -self.config.context_length :]
+        _, scale = self.scaler(context, observed_context)
+
+        # divide the past (and, when given, future) values by the scale computed on the context window
+        inputs = (
+            torch.cat((past_values, future_values), dim=1) / scale
+            if future_values is not None
+            else past_values / scale
+        )
+
+        inputs_length = (
+            self._past_length + self.config.prediction_length if future_values is not None else self._past_length
+        )
+        # Explicit check rather than `assert`, so the validation still runs under `python -O`.
+        if inputs.shape[1] != inputs_length:
+            raise ValueError(
+                f"input length {inputs.shape[1]} and dynamic feature lengths {inputs_length} does not match"
+            )
+
+        subsequences_length = (
+            self.config.context_length + self.config.prediction_length
+            if future_values is not None
+            else self.config.context_length
+        )
+
+        # embeddings
+        embedded_cat = self.embedder(static_categorical_features)
+        # static features
+        log_scale = scale.log() if self.config.input_size == 1 else scale.squeeze(1).log()
+        static_feat = torch.cat((embedded_cat, static_real_features, log_scale), dim=1)
+        expanded_static_feat = static_feat.unsqueeze(1).expand(-1, time_feat.shape[1], -1)
+
+        # all features
+        features = torch.cat((expanded_static_feat, time_feat), dim=-1)
+
+        lagged_sequence = self.get_lagged_subsequences(sequence=inputs, subsequences_length=subsequences_length)
+
+        lags_shape = lagged_sequence.shape
+        reshaped_lagged_sequence = lagged_sequence.reshape(lags_shape[0], lags_shape[1], -1)
+
+        transformer_inputs = torch.cat((reshaped_lagged_sequence, features), dim=-1)
+
+        return transformer_inputs, scale, static_feat
+
+    def enc_dec_outputs(self, transformer_inputs):
+        # The first `context_length` steps feed the encoder; whatever follows feeds the decoder.
+        split = self.config.context_length
+        encoder_outputs = self.encoder(inputs_embeds=transformer_inputs[:, :split, ...])
+        decoder_outputs = self.decoder(
+            inputs_embeds=transformer_inputs[:, split:, ...],
+            encoder_hidden_states=encoder_outputs.last_hidden_state,
+        )
+        return encoder_outputs, decoder_outputs
+
+    def get_encoder(self):
+        return self.encoder  # the InformerEncoder created in __init__
+
+    def get_decoder(self):
+        return self.decoder  # the InformerDecoder created in __init__
+
+    @add_start_docstrings_to_model_forward(INFORMER_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=Seq2SeqTimeSeriesModelOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        past_values: torch.Tensor,
+        past_time_features: torch.Tensor,
+        past_observed_mask: torch.Tensor,
+        static_categorical_features: torch.Tensor,
+        static_real_features: torch.Tensor,
+        future_values: Optional[torch.Tensor] = None,
+        future_time_features: Optional[torch.Tensor] = None,
+        decoder_attention_mask: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        decoder_head_mask: Optional[torch.Tensor] = None,
+        cross_attn_head_mask: Optional[torch.Tensor] = None,
+        encoder_outputs: Optional[List[torch.FloatTensor]] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        use_cache: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Seq2SeqTimeSeriesModelOutput, Tuple]:
+        r"""
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from huggingface_hub import hf_hub_download
+        >>> import torch
+        >>> from transformers import InformerModel
+
+        >>> file = hf_hub_download(
+        ...     repo_id="kashif/tourism-monthly-batch", filename="train-batch.pt", repo_type="dataset"
+        ... )
+        >>> batch = torch.load(file)
+
+        >>> model = InformerModel.from_pretrained("huggingface/time-series-transformer-tourism-monthly")
+
+        >>> # during training, one provides both past and future values
+        >>> # as well as possible additional features
+        >>> outputs = model(
+        ...     past_values=batch["past_values"],
+        ...     past_time_features=batch["past_time_features"],
+        ...     past_observed_mask=batch["past_observed_mask"],
+        ...     static_categorical_features=batch["static_categorical_features"],
+        ...     static_real_features=batch["static_real_features"],
+        ...     future_values=batch["future_values"],
+        ...     future_time_features=batch["future_time_features"],
+        ... )
+
+        >>> last_hidden_state = outputs.last_hidden_state
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        transformer_inputs, scale, static_feat = self.create_network_inputs(
+            past_values=past_values,
+            past_time_features=past_time_features,
+            past_observed_mask=past_observed_mask,
+            static_categorical_features=static_categorical_features,
+            static_real_features=static_real_features,
+            future_values=future_values,
+            future_time_features=future_time_features,
+        )
+
+        if encoder_outputs is None:
+            enc_input = transformer_inputs[:, : self.config.context_length, ...]  # first `context_length` steps
+            encoder_outputs = self.encoder(
+                inputs_embeds=enc_input,
+                head_mask=head_mask,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
+        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+            encoder_outputs = BaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+            )
+
+        dec_input = transformer_inputs[:, self.config.context_length :, ...]  # everything after the context window
+        decoder_outputs = self.decoder(
+            inputs_embeds=dec_input,
+            attention_mask=decoder_attention_mask,
+            encoder_hidden_states=encoder_outputs[0],  # the encoder's last hidden state
+            head_mask=decoder_head_mask,
+            cross_attn_head_mask=cross_attn_head_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        if not return_dict:
+            return decoder_outputs + encoder_outputs + (scale, static_feat)  # scale and static_feat come last
+
+        return Seq2SeqTimeSeriesModelOutput(
+            last_hidden_state=decoder_outputs.last_hidden_state,
+            past_key_values=decoder_outputs.past_key_values,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+            scale=scale,
+            static_features=static_feat,
+        )
+
+
+@add_start_docstrings(
+    "The Informer Model with a distribution head on top for time-series forecasting.",
+    INFORMER_START_DOCSTRING,
+)
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerForPrediction with TimeSeriesTransformer->Informer,TIME_SERIES_TRANSFORMER->INFORMER
+class InformerForPrediction(InformerPreTrainedModel):
+    def __init__(self, config: InformerConfig):
+        """Wrap an `InformerModel` with a distribution head and a loss, both selected by name from `config`."""
+        super().__init__(config)
+        self.model = InformerModel(config)
+        output_classes = {
+            "student_t": StudentTOutput,
+            "normal": NormalOutput,
+            "negative_binomial": NegativeBinomialOutput,
+        }
+        if config.distribution_output not in output_classes:
+            raise ValueError(f"Unknown distribution output {config.distribution_output}")
+        self.distribution_output = output_classes[config.distribution_output](dim=config.input_size)
+
+        self.parameter_projection = self.distribution_output.get_parameter_projection(self.model.config.d_model)
+        self.target_shape = self.distribution_output.event_shape
+
+        if config.loss != "nll":
+            raise ValueError(f"Unknown loss function {config.loss}")
+        self.loss = NegativeLogLikelihood()
+
+        # Initialize weights of distribution_output and apply final processing
+        self.post_init()
+
+    def output_params(self, dec_output):
+        return self.parameter_projection(dec_output)  # decoder output -> distribution parameters
+
+    def get_encoder(self):
+        return self.model.get_encoder()  # delegates to the wrapped InformerModel
+
+    def get_decoder(self):
+        return self.model.get_decoder()  # delegates to the wrapped InformerModel
+
+    @torch.jit.ignore
+    def output_distribution(self, params, scale=None, trailing_n=None) -> torch.distributions.Distribution:
+        # When `trailing_n` is given, keep only the last `trailing_n` entries along dim 1 of each parameter.
+        if trailing_n is not None:
+            params = [p[:, -trailing_n:] for p in params]
+        return self.distribution_output.distribution(params, scale=scale)
+
+    @add_start_docstrings_to_model_forward(INFORMER_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=Seq2SeqTimeSeriesPredictionOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        past_values: torch.Tensor,
+        past_time_features: torch.Tensor,
+        past_observed_mask: torch.Tensor,
+        static_categorical_features: torch.Tensor,
+        static_real_features: torch.Tensor,
+        future_values: Optional[torch.Tensor] = None,
+        future_time_features: Optional[torch.Tensor] = None,
+        future_observed_mask: Optional[torch.Tensor] = None,
+        decoder_attention_mask: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        decoder_head_mask: Optional[torch.Tensor] = None,
+        cross_attn_head_mask: Optional[torch.Tensor] = None,
+        encoder_outputs: Optional[List[torch.FloatTensor]] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        use_cache: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Seq2SeqTimeSeriesPredictionOutput, Tuple]:
+        r"""
+        future_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Boolean mask to indicate which `future_values` were observed and which were missing. Mask values selected
+            in `[0, 1]`:
+
+            - 1 for values that are **observed**,
+            - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
+
+            This mask is used to filter out missing values for the final loss calculation.
+
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from huggingface_hub import hf_hub_download
+        >>> import torch
+        >>> from transformers import InformerForPrediction
+
+        >>> file = hf_hub_download(
+        ...     repo_id="kashif/tourism-monthly-batch", filename="train-batch.pt", repo_type="dataset"
+        ... )
+        >>> batch = torch.load(file)
+
+        >>> model = InformerForPrediction.from_pretrained(
+        ...     "huggingface/time-series-transformer-tourism-monthly"
+        ... )
+
+        >>> # during training, one provides both past and future values
+        >>> # as well as possible additional features
+        >>> outputs = model(
+        ...     past_values=batch["past_values"],
+        ...     past_time_features=batch["past_time_features"],
+        ...     past_observed_mask=batch["past_observed_mask"],
+        ...     static_categorical_features=batch["static_categorical_features"],
+        ...     static_real_features=batch["static_real_features"],
+        ...     future_values=batch["future_values"],
+        ...     future_time_features=batch["future_time_features"],
+        ... )
+
+        >>> loss = outputs.loss
+        >>> loss.backward()
+
+        >>> # during inference, one only provides past values
+        >>> # as well as possible additional features
+        >>> # the model autoregressively generates future values
+        >>> outputs = model.generate(
+        ...     past_values=batch["past_values"],
+        ...     past_time_features=batch["past_time_features"],
+        ...     past_observed_mask=batch["past_observed_mask"],
+        ...     static_categorical_features=batch["static_categorical_features"],
+        ...     static_real_features=batch["static_real_features"],
+        ...     future_time_features=batch["future_time_features"],
+        ... )
+
+        >>> mean_prediction = outputs.sequences.mean(dim=1)
+        ```"""
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        if future_values is not None:
+            use_cache = False  # the decoding cache is switched off whenever future values are supplied
+
+        outputs = self.model(
+            past_values=past_values,
+            past_time_features=past_time_features,
+            past_observed_mask=past_observed_mask,
+            static_categorical_features=static_categorical_features,
+            static_real_features=static_real_features,
+            future_values=future_values,
+            future_time_features=future_time_features,
+            decoder_attention_mask=decoder_attention_mask,
+            head_mask=head_mask,
+            decoder_head_mask=decoder_head_mask,
+            cross_attn_head_mask=cross_attn_head_mask,
+            encoder_outputs=encoder_outputs,
+            past_key_values=past_key_values,
+            output_hidden_states=output_hidden_states,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            return_dict=return_dict,
+        )
+
+        prediction_loss = None
+        params = None
+        if future_values is not None:
+            params = self.output_params(outputs[0])  # outputs.last_hidden_state
+            distribution = self.output_distribution(params, outputs[-2])  # outputs.scale
+
+            loss = self.loss(distribution, future_values)
+
+            if future_observed_mask is None:
+                future_observed_mask = torch.ones_like(future_values)
+
+            if len(self.target_shape) == 0:
+                loss_weights = future_observed_mask
+            else:
+                loss_weights, _ = future_observed_mask.min(dim=-1, keepdim=False)  # 0 if any dim is missing
+
+            prediction_loss = weighted_average(loss, weights=loss_weights)
+
+        if not return_dict:
+            outputs = ((params,) + outputs[1:]) if params is not None else outputs[1:]
+            return ((prediction_loss,) + outputs) if prediction_loss is not None else outputs
+
+        return Seq2SeqTimeSeriesPredictionOutput(
+            loss=prediction_loss,
+            params=params,
+            past_key_values=outputs.past_key_values,
+            decoder_hidden_states=outputs.decoder_hidden_states,
+            decoder_attentions=outputs.decoder_attentions,
+            cross_attentions=outputs.cross_attentions,
+            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
+            encoder_hidden_states=outputs.encoder_hidden_states,
+            encoder_attentions=outputs.encoder_attentions,
+            scale=outputs.scale,
+            static_features=outputs.static_features,
+        )
+
+    @torch.no_grad()
+    def generate(
+        self,
+        static_categorical_features: torch.Tensor,
+        static_real_features: torch.Tensor,
+        past_time_features: torch.Tensor,
+        past_values: torch.Tensor,
+        past_observed_mask: torch.Tensor,
+        future_time_features: Optional[torch.Tensor],
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+    ) -> SampleTimeSeriesPredictionOutput:
+        """
+        Samples `config.num_parallel_samples` trajectories of `config.prediction_length` steps per series, decoding
+        one step at a time. Raises `ValueError` if `future_time_features` is `None`, as every step reads it.
+        """
+        if future_time_features is None:
+            raise ValueError("`future_time_features` must be provided to `generate`.")
+
+        outputs = self(
+            static_categorical_features=static_categorical_features,
+            static_real_features=static_real_features,
+            past_time_features=past_time_features,
+            past_values=past_values,
+            past_observed_mask=past_observed_mask,
+            future_time_features=future_time_features,
+            future_values=None,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+            use_cache=True,
+        )
+
+        decoder = self.model.get_decoder()
+        enc_last_hidden = outputs.encoder_last_hidden_state
+        scale = outputs.scale
+        static_feat = outputs.static_features
+
+        num_parallel_samples = self.config.num_parallel_samples
+        repeated_scale = scale.repeat_interleave(repeats=num_parallel_samples, dim=0)
+        repeated_past_values = past_values.repeat_interleave(repeats=num_parallel_samples, dim=0) / repeated_scale
+
+        expanded_static_feat = static_feat.unsqueeze(1).expand(-1, future_time_features.shape[1], -1)
+        features = torch.cat((expanded_static_feat, future_time_features), dim=-1)
+        repeated_features = features.repeat_interleave(repeats=num_parallel_samples, dim=0)
+        repeated_enc_last_hidden = enc_last_hidden.repeat_interleave(repeats=num_parallel_samples, dim=0)
+
+        future_samples = []
+        # autoregressive sampling: each step draws from the predicted distribution and feeds the draw back in
+        for k in range(self.config.prediction_length):
+            lagged_sequence = self.model.get_lagged_subsequences(
+                sequence=repeated_past_values,
+                subsequences_length=1 + k,
+                shift=1,
+            )
+            lags_shape = lagged_sequence.shape
+            reshaped_lagged_sequence = lagged_sequence.reshape(lags_shape[0], lags_shape[1], -1)
+            decoder_input = torch.cat((reshaped_lagged_sequence, repeated_features[:, : k + 1]), dim=-1)
+
+            dec_output = decoder(inputs_embeds=decoder_input, encoder_hidden_states=repeated_enc_last_hidden)
+            dec_last_hidden = dec_output.last_hidden_state
+            params = self.parameter_projection(dec_last_hidden[:, -1:])
+            distr = self.output_distribution(params, scale=repeated_scale)
+            next_sample = distr.sample()
+
+            repeated_past_values = torch.cat((repeated_past_values, next_sample / repeated_scale), dim=1)
+            future_samples.append(next_sample)
+
+        concat_future_samples = torch.cat(future_samples, dim=1)
+        return SampleTimeSeriesPredictionOutput(
+            sequences=concat_future_samples.reshape(
+                (-1, num_parallel_samples, self.config.prediction_length) + self.target_shape,
+            )
+        )
diff --git a/tests/models/informer/__init__.py b/tests/models/informer/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/models/informer/test_modeling_informer.py b/tests/models/informer/test_modeling_informer.py
new file mode 100644
index 000000000000..ecbea487e790
--- /dev/null
+++ b/tests/models/informer/test_modeling_informer.py
@@ -0,0 +1,442 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Testing suite for the PyTorch Informer model. """
+
+import inspect
+import tempfile
+import unittest
+
+from huggingface_hub import hf_hub_download
+from transformers import is_torch_available
+from transformers.testing_utils import is_flaky, require_torch, slow, torch_device
+
+from ...test_configuration_common import ConfigTester
+from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
+
+
+TOLERANCE = 1e-4
+
+if is_torch_available():
+    import torch
+
+    from transformers import (
+        InformerConfig,
+        InformerForPrediction,
+        InformerModel,
+    )
+    from transformers.models.informer.modeling_informer import (
+        InformerDecoder,
+        InformerEncoder,
+    )
+
+
+@require_torch
+class InformerModelTester:
+    """Builds a small `InformerConfig` plus random inputs for the Informer model tests."""
+
+    def __init__(
+        self,
+        parent,
+        batch_size=13,
+        prediction_length=7,
+        context_length=14,
+        cardinality=19,
+        embedding_dimension=5,
+        num_time_features=4,
+        is_training=True,
+        hidden_size=16,
+        num_hidden_layers=2,
+        num_attention_heads=4,
+        intermediate_size=4,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        lags_sequence=None,  # None -> [1, 2, 3, 4, 5]; avoids sharing one mutable default list
+    ):
+        self.parent = parent  # the unittest.TestCase whose assert* methods are used below
+        self.batch_size = batch_size
+        self.prediction_length = prediction_length
+        self.context_length = context_length
+        self.cardinality = cardinality
+        self.num_time_features = num_time_features
+        self.lags_sequence = [1, 2, 3, 4, 5] if lags_sequence is None else lags_sequence
+        self.embedding_dimension = embedding_dimension
+        self.is_training = is_training
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+
+        self.encoder_seq_length = context_length  # read via getattr in test_attention_outputs
+        self.decoder_seq_length = prediction_length
+
+    def get_config(self):
+        return InformerConfig(  # one static categorical feature; encoder and decoder share the layer settings
+            encoder_layers=self.num_hidden_layers,
+            decoder_layers=self.num_hidden_layers,
+            encoder_attention_heads=self.num_attention_heads,
+            decoder_attention_heads=self.num_attention_heads,
+            encoder_ffn_dim=self.intermediate_size,
+            decoder_ffn_dim=self.intermediate_size,
+            dropout=self.hidden_dropout_prob,
+            attention_dropout=self.attention_probs_dropout_prob,
+            prediction_length=self.prediction_length,
+            context_length=self.context_length,
+            lags_sequence=self.lags_sequence,
+            num_time_features=self.num_time_features,
+            num_static_categorical_features=1,
+            cardinality=[self.cardinality],
+            embedding_dimension=[self.embedding_dimension],
+        )
+
+    def prepare_informer_inputs_dict(self, config):
+        _past_length = config.context_length + max(config.lags_sequence)  # context window plus the largest lag
+
+        static_categorical_features = ids_tensor([self.batch_size, 1], config.cardinality[0])
+        static_real_features = floats_tensor([self.batch_size, 1])
+
+        past_time_features = floats_tensor([self.batch_size, _past_length, config.num_time_features])
+        past_values = floats_tensor([self.batch_size, _past_length])
+        past_observed_mask = floats_tensor([self.batch_size, _past_length])
+
+        # decoder inputs
+        future_time_features = floats_tensor([self.batch_size, config.prediction_length, config.num_time_features])
+        future_values = floats_tensor([self.batch_size, config.prediction_length])
+
+        inputs_dict = {
+            "past_values": past_values,
+            "static_categorical_features": static_categorical_features,
+            "static_real_features": static_real_features,
+            "past_time_features": past_time_features,
+            "past_observed_mask": past_observed_mask,
+            "future_time_features": future_time_features,
+            "future_values": future_values,
+        }
+        return inputs_dict
+
+    def prepare_config_and_inputs(self):
+        config = self.get_config()
+        return config, self.prepare_informer_inputs_dict(config)
+
+    def prepare_config_and_inputs_for_common(self):
+        return self.prepare_config_and_inputs()  # same (config, inputs_dict) pair
+
+    def check_encoder_decoder_model_standalone(self, config, inputs_dict):
+        model = InformerModel(config=config).to(torch_device).eval()
+        outputs = model(**inputs_dict)
+
+        encoder_last_hidden_state = outputs.encoder_last_hidden_state
+        last_hidden_state = outputs.last_hidden_state
+
+        with tempfile.TemporaryDirectory() as tmpdirname:  # round-trip the encoder through save/load
+            encoder = model.get_encoder()
+            encoder.save_pretrained(tmpdirname)
+            encoder = InformerEncoder.from_pretrained(tmpdirname).to(torch_device)
+
+        transformer_inputs, _, _ = model.create_network_inputs(**inputs_dict)
+        enc_input = transformer_inputs[:, : config.context_length, ...]  # first context_length steps
+        dec_input = transformer_inputs[:, config.context_length :, ...]  # remaining steps
+
+        encoder_last_hidden_state_2 = encoder(inputs_embeds=enc_input)[0]
+
+        self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3)
+
+        with tempfile.TemporaryDirectory() as tmpdirname:  # same round-trip for the decoder
+            decoder = model.get_decoder()
+            decoder.save_pretrained(tmpdirname)
+            decoder = InformerDecoder.from_pretrained(tmpdirname).to(torch_device)
+
+        last_hidden_state_2 = decoder(
+            inputs_embeds=dec_input,
+            encoder_hidden_states=encoder_last_hidden_state,
+        )[0]
+
+        self.parent.assertTrue((last_hidden_state_2 - last_hidden_state).abs().max().item() < 1e-3)
+
+
+@require_torch
+class InformerModelTest(ModelTesterMixin, unittest.TestCase):  # common-suite tests for the Informer classes
+    all_model_classes = (
+        (InformerModel, InformerForPrediction) if is_torch_available() else ()
+    )
+    all_generative_model_classes = (InformerForPrediction,) if is_torch_available() else ()
+    is_encoder_decoder = True
+    test_pruning = False  # NOTE(review): these flags are presumably read by ModelTesterMixin — verify
+    test_head_masking = False
+    test_missing_keys = False
+    test_torchscript = False
+    test_inputs_embeds = False
+    test_model_common_attributes = False
+
+    def setUp(self):
+        self.model_tester = InformerModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=InformerConfig, has_text_modality=False)
+
+    def test_config(self):
+        self.config_tester.run_common_tests()
+
+    def test_save_load_strict(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs()
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                model.save_pretrained(tmpdirname)
+                model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True)
+            self.assertEqual(info["missing_keys"], [])  # a save/load round-trip must not lose any weight
+
+    def test_encoder_decoder_model_standalone(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common()
+        self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs)
+
+    # Ignore since we have no tokens embeddings
+    def test_resize_tokens_embeddings(self):
+        pass
+
+    # The main input is not 'input_ids': it must equal the first forward() argument after `self`
+    def test_model_main_input_name(self):
+        model_signature = inspect.signature(getattr(InformerModel, "forward"))
+        # The main input is the name of the argument after `self`
+        observed_main_input_name = list(model_signature.parameters.keys())[1]
+        self.assertEqual(InformerModel.main_input_name, observed_main_input_name)
+
+    def test_forward_signature(self):
+        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            signature = inspect.signature(model.forward)
+            # signature.parameters is an OrderedDict => so arg_names order is deterministic
+            arg_names = [*signature.parameters.keys()]
+
+            expected_arg_names = [
+                "past_values",
+                "past_time_features",
+                "past_observed_mask",
+                "static_categorical_features",
+                "static_real_features",
+                "future_values",
+                "future_time_features",
+            ]
+
+            expected_arg_names.extend(
+                [
+                    "future_observed_mask",
+                    "decoder_attention_mask",
+                    "head_mask",
+                    "decoder_head_mask",
+                    "cross_attn_head_mask",
+                    "encoder_outputs",
+                    "past_key_values",
+                    "output_hidden_states",
+                    "output_attentions",
+                    "use_cache",
+                    "return_dict",
+                ]
+                if "future_observed_mask" in arg_names  # the two lists differ only by this leading entry
+                else [
+                    "decoder_attention_mask",
+                    "head_mask",
+                    "decoder_head_mask",
+                    "cross_attn_head_mask",
+                    "encoder_outputs",
+                    "past_key_values",
+                    "output_hidden_states",
+                    "output_attentions",
+                    "use_cache",
+                    "return_dict",
+                ]
+            )
+
+            self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names)  # prefix match only
+
+    def test_attention_outputs(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        config.return_dict = True
+
+        seq_len = getattr(self.model_tester, "seq_length", None)
+        decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len)
+        encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len)
+
+        for model_class in self.all_model_classes:
+            inputs_dict["output_attentions"] = True
+            inputs_dict["output_hidden_states"] = False
+            config.return_dict = True
+            model = model_class(config)
+            model.to(torch_device)
+            model.eval()
+            with torch.no_grad():
+                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
+            attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
+            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
+
+            # check that output_attentions also work using config
+            del inputs_dict["output_attentions"]
+            config.output_attentions = True
+            model = model_class(config)
+            model.to(torch_device)
+            model.eval()
+            with torch.no_grad():
+                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
+            attentions = outputs.encoder_attentions
+            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
+
+            self.assertListEqual(
+                list(attentions[0].shape[-3:]),
+                [self.model_tester.num_attention_heads, encoder_seq_length, encoder_seq_length],
+            )
+            out_len = len(outputs)
+
+            correct_outlen = 6  # baseline field count; each optional field found below adds one
+
+            if "last_hidden_state" in outputs:
+                correct_outlen += 1
+
+            if "past_key_values" in outputs:
+                correct_outlen += 1  # past_key_values have been returned
+
+            if "loss" in outputs:
+                correct_outlen += 1
+
+            if "params" in outputs:
+                correct_outlen += 1
+
+            self.assertEqual(out_len, correct_outlen)
+
+            # decoder attentions
+            decoder_attentions = outputs.decoder_attentions
+            self.assertIsInstance(decoder_attentions, (list, tuple))
+            self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
+            self.assertListEqual(
+                list(decoder_attentions[0].shape[-3:]),
+                [self.model_tester.num_attention_heads, decoder_seq_length, decoder_seq_length],
+            )
+
+            # cross attentions
+            cross_attentions = outputs.cross_attentions
+            self.assertIsInstance(cross_attentions, (list, tuple))
+            self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers)
+            self.assertListEqual(
+                list(cross_attentions[0].shape[-3:]),
+                [
+                    self.model_tester.num_attention_heads,
+                    decoder_seq_length,
+                    encoder_seq_length,
+                ],
+            )
+
+        # Check attention is always last and order is fine (runs once, after the loop, for the last model_class)
+        inputs_dict["output_attentions"] = True
+        inputs_dict["output_hidden_states"] = True
+        model = model_class(config)
+        model.to(torch_device)
+        model.eval()
+        with torch.no_grad():
+            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
+
+        self.assertEqual(out_len + 2, len(outputs))  # expects exactly two more fields than without hidden states
+
+        self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
+
+        self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
+        self.assertListEqual(
+            list(self_attentions[0].shape[-3:]),
+            [self.model_tester.num_attention_heads, encoder_seq_length, encoder_seq_length],
+        )
+
+    @is_flaky()
+    def test_retain_grad_hidden_states_attentions(self):
+        super().test_retain_grad_hidden_states_attentions()
+
+
+def prepare_batch(filename="train-batch.pt"):
+    """Download `filename` from the tourism-monthly batch dataset repo and load it onto `torch_device`."""
+    local_path = hf_hub_download(repo_id="kashif/tourism-monthly-batch", repo_type="dataset", filename=filename)
+    return torch.load(local_path, map_location=torch_device)
+
+
+@require_torch
+@slow
+class InformerModelIntegrationTests(unittest.TestCase):  # checkpoint-based checks against stored reference values
+    def test_inference_no_head(self):
+        # NOTE(review): the checkpoint id names a time-series-transformer model — confirm this is intended
+        model = InformerModel.from_pretrained("huggingface/time-series-transformer-tourism-monthly").to(
+            torch_device
+        )
+        batch = prepare_batch()
+
+        with torch.no_grad():
+            output = model(
+                past_values=batch["past_values"],
+                past_time_features=batch["past_time_features"],
+                past_observed_mask=batch["past_observed_mask"],
+                static_categorical_features=batch["static_categorical_features"],
+                static_real_features=batch["static_real_features"],
+                future_values=batch["future_values"],
+                future_time_features=batch["future_time_features"],
+            )[0]  # first element of the model output
+
+        expected_shape = torch.Size((64, model.config.prediction_length, model.config.d_model))
+        self.assertEqual(output.shape, expected_shape)
+
+        expected_slice = torch.tensor(
+            [[-0.3125, -1.2884, -1.1118], [-0.5801, -1.4907, -0.7782], [0.0849, -1.6557, -0.9755]], device=torch_device
+        )
+        self.assertTrue(torch.allclose(output[0, :3, :3], expected_slice, atol=TOLERANCE))
+
+    def test_inference_head(self):
+        model = InformerForPrediction.from_pretrained(
+            "huggingface/time-series-transformer-tourism-monthly"
+        ).to(torch_device)
+        batch = prepare_batch("val-batch.pt")
+        with torch.no_grad():
+            output = model(
+                past_values=batch["past_values"],
+                past_time_features=batch["past_time_features"],
+                past_observed_mask=batch["past_observed_mask"],
+                static_categorical_features=batch["static_categorical_features"],
+                static_real_features=batch["static_real_features"],
+                future_time_features=batch["future_time_features"],
+            )[1]  # second element of the model output (no future_values are passed here)
+        expected_shape = torch.Size((64, model.config.prediction_length, model.config.d_model))
+        self.assertEqual(output.shape, expected_shape)
+
+        expected_slice = torch.tensor(
+            [[0.9127, -0.2056, -0.5259], [1.0572, 1.4104, -0.1964], [0.1358, 2.0348, 0.5739]], device=torch_device
+        )
+        self.assertTrue(torch.allclose(output[0, :3, :3], expected_slice, atol=TOLERANCE))
+
+    def test_seq_to_seq_generation(self):
+        model = InformerForPrediction.from_pretrained(
+            "huggingface/time-series-transformer-tourism-monthly"
+        ).to(torch_device)
+        batch = prepare_batch("val-batch.pt")
+        with torch.no_grad():
+            outputs = model.generate(
+                static_categorical_features=batch["static_categorical_features"],
+                static_real_features=batch["static_real_features"],
+                past_time_features=batch["past_time_features"],
+                past_values=batch["past_values"],
+                future_time_features=batch["future_time_features"],
+                past_observed_mask=batch["past_observed_mask"],
+            )
+        expected_shape = torch.Size((64, model.config.num_parallel_samples, model.config.prediction_length))
+        self.assertEqual(outputs.sequences.shape, expected_shape)
+
+        expected_slice = torch.tensor([2289.5203, 2778.3054, 4648.1313], device=torch_device)
+        mean_prediction = outputs.sequences.mean(dim=1)  # average over the parallel-samples axis
+        self.assertTrue(torch.allclose(mean_prediction[0, -3:], expected_slice, rtol=1e-1))  # 10% relative tolerance

From 8648e08b9bf60cde765b243d0f2af8ac4f0c7ae7 Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Fri, 13 Jan 2023 10:53:54 +0000
Subject: [PATCH 014/164] added checking that instantiate works

---
 src/transformers/models/informer/check_instantiate_works.py | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 src/transformers/models/informer/check_instantiate_works.py

diff --git a/src/transformers/models/informer/check_instantiate_works.py b/src/transformers/models/informer/check_instantiate_works.py
new file mode 100644
index 000000000000..f392af08b552
--- /dev/null
+++ b/src/transformers/models/informer/check_instantiate_works.py
@@ -0,0 +1,5 @@
+from transformers import InformerConfig, InformerModel
+
+if __name__ == "__main__":
+    # Smoke check: a default-config InformerModel can be constructed and printed.
+    print(InformerModel(InformerConfig()))

From 5bfe844593b94f7c4df50407dc1b46b711fe2cdb Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Sat, 14 Jan 2023 11:56:07 +0000
Subject: [PATCH 015/164] added config using gluonTS by kashif

---
 .../config_using_gluonTS.py                   | 723 ++++++++++++++++++
 1 file changed, 723 insertions(+)
 create mode 100644 src/transformers/models/time_series_transformer/config_using_gluonTS.py

diff --git a/src/transformers/models/time_series_transformer/config_using_gluonTS.py b/src/transformers/models/time_series_transformer/config_using_gluonTS.py
new file mode 100644
index 000000000000..dde188b9c79e
--- /dev/null
+++ b/src/transformers/models/time_series_transformer/config_using_gluonTS.py
@@ -0,0 +1,723 @@
+from math import sqrt
+from typing import List, Optional
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from gluonts.core.component import validated
+from gluonts.time_feature import get_lags_for_frequency
+from gluonts.torch.distributions import DistributionOutput, StudentTOutput
+from gluonts.torch.modules.feature import FeatureEmbedder
+from gluonts.torch.modules.scaler import MeanScaler, NOPScaler
+
+
+class TriangularCausalMask:
+    """Boolean mask of shape [B, 1, L, L]; True strictly above the diagonal (the masked positions)."""
+
+    def __init__(self, B, L, device="cpu"):
+        with torch.no_grad():
+            upper = torch.ones([B, 1, L, L], dtype=torch.bool).triu(diagonal=1)
+            self._mask = upper.to(device)
+
+    @property
+    def mask(self):
+        return self._mask
+
+
+class ProbMask:
+    """Rows of an upper-triangular (diagonal=1) mask gathered at the query positions `index`, shaped like `scores`."""
+
+    def __init__(self, B, H, L, index, scores, device="cpu"):
+        causal = torch.ones(L, scores.shape[-1], dtype=torch.bool).to(device).triu(1)
+        batch_idx, head_idx = torch.arange(B)[:, None, None], torch.arange(H)[None, :, None]
+        picked = causal[None, None, :].expand(B, H, L, scores.shape[-1])[batch_idx, head_idx, index, :]
+        self._mask = picked.to(device).view(scores.shape).to(device)
+
+    @property
+    def mask(self):
+        return self._mask
+
+
+class FullAttention(nn.Module):
+    """Scaled dot-product attention over [B, L, H, E] queries, with an optional causal mask."""
+
+    def __init__(self, mask_flag=True, factor=5, scale=None, attention_dropout=0.1, output_attention=False):
+        super().__init__()
+        # `factor` is accepted but not used by this class.
+        self.scale = scale
+        self.mask_flag = mask_flag
+        self.output_attention = output_attention
+        self.dropout = nn.Dropout(attention_dropout)
+
+    def forward(self, queries, keys, values, attn_mask):
+        """Return `(context, attention)`; `attention` is None unless `output_attention` was set.
+
+        When `mask_flag` is set and `attn_mask` is None, a `TriangularCausalMask` is built.
+        """
+        batch, q_len, _, head_dim = queries.shape
+        # A falsy `self.scale` (None or 0) falls back to 1/sqrt(E).
+        scale = self.scale or 1.0 / sqrt(head_dim)
+
+        scores = torch.einsum("blhe,bshe->bhls", queries, keys)
+        if self.mask_flag:
+            if attn_mask is None:
+                attn_mask = TriangularCausalMask(batch, q_len, device=queries.device)
+            # Masked positions are filled with -inf in place before the softmax.
+            scores.masked_fill_(attn_mask.mask, -np.inf)
+
+        # Dropout is applied to the softmax output, i.e. to the attention weights.
+        weights = self.dropout(torch.softmax(scale * scores, dim=-1))
+        context = torch.einsum("bhls,bshd->blhd", weights, values).contiguous()
+
+        if self.output_attention:
+            return (context, weights)
+        return (context, None)
+
+
+class ProbAttention(nn.Module):  # attention that scores only the top-u queries picked by a sampled sparsity measure
+    def __init__(
+        self,
+        mask_flag=True,
+        factor=5,
+        scale=None,
+        attention_dropout=0.1,
+        output_attention=False,
+    ):
+        super(ProbAttention, self).__init__()
+        self.factor = factor  # multiplier c used for both the key sample size and the number of kept queries
+        self.scale = scale
+        self.mask_flag = mask_flag
+        self.output_attention = output_attention
+        self.dropout = nn.Dropout(attention_dropout)  # NOTE(review): not referenced by any method of this class
+
+    def _prob_QK(self, Q, K, sample_k, n_top):  # n_top: number of queries to keep (u in forward)
+        # Q [B, H, L_Q, E], K [B, H, L_K, E]; returns (scores of the kept queries, their indices)
+        B, H, L_K, E = K.shape
+        _, _, L_Q, _ = Q.shape
+
+        # score every query against `sample_k` randomly drawn keys
+        K_expand = K.unsqueeze(-3).expand(B, H, L_Q, L_K, E)
+        index_sample = torch.randint(
+            L_K, (L_Q, sample_k)
+        )  # key indices per query, drawn with replacement and shared across batch and heads
+        K_sample = K_expand[:, :, torch.arange(L_Q).unsqueeze(1), index_sample, :]
+        Q_K_sample = torch.matmul(Q.unsqueeze(-2), K_sample.transpose(-2, -1)).squeeze(
+            -2
+        )
+
+        # pick the n_top queries by sparsity measurement: max sampled score minus (sum of sampled scores / L_K)
+        M = Q_K_sample.max(-1)[0] - torch.div(Q_K_sample.sum(-1), L_K)
+        M_top = M.topk(n_top, sorted=False)[1]
+
+        # use the reduced Q to calculate Q_K against all keys
+        Q_reduce = Q[
+            torch.arange(B)[:, None, None], torch.arange(H)[None, :, None], M_top, :
+        ]  # [B, H, n_top, E]
+        Q_K = torch.matmul(Q_reduce, K.transpose(-2, -1))  # [B, H, n_top, L_K]
+
+        return Q_K, M_top
+
+    def _get_initial_context(self, V, L_Q):
+        B, H, L_V, D = V.shape
+        if not self.mask_flag:
+            # unmasked: every query position starts from the mean of V over the length axis
+            V_sum = V.mean(dim=-2)
+            contex = V_sum.unsqueeze(-2).expand(B, H, L_Q, V_sum.shape[-1]).clone()
+        else:  # use mask
+            assert L_Q == L_V  # requires that L_Q == L_V, i.e. for self-attention only
+            contex = V.cumsum(dim=-2)  # masked: running sum of V along the length axis
+        return contex
+
+    def _update_context(self, context_in, V, scores, index, L_Q, attn_mask):
+        B, H, L_V, D = V.shape
+
+        if self.mask_flag:
+            attn_mask = ProbMask(B, H, L_Q, index, scores, device=V.device)  # replaces the attn_mask argument
+            scores.masked_fill_(attn_mask.mask, -np.inf)
+
+        attn = torch.softmax(scores, dim=-1)  # nn.Softmax(dim=-1)(scores)
+
+        context_in[
+            torch.arange(B)[:, None, None], torch.arange(H)[None, :, None], index, :
+        ] = torch.matmul(attn, V).type_as(context_in)  # in-place: only the selected query rows are overwritten
+        if self.output_attention:
+            attns = (torch.ones([B, H, L_V, L_V]) / L_V).type_as(attn).to(attn.device)  # uniform 1/L_V rows
+            attns[
+                torch.arange(B)[:, None, None], torch.arange(H)[None, :, None], index, :
+            ] = attn
+            return (context_in, attns)
+        else:
+            return (context_in, None)
+
+    def forward(self, queries, keys, values, attn_mask):
+        B, L_Q, H, D = queries.shape
+        _, L_K, _, _ = keys.shape
+
+        queries = queries.transpose(2, 1)  # [B, L, H, D] -> [B, H, L, D]
+        keys = keys.transpose(2, 1)
+        values = values.transpose(2, 1)
+
+        U_part = self.factor * np.ceil(np.log1p(L_K)).astype("int").item()  # c * ceil(ln(L_k + 1))
+        u = self.factor * np.ceil(np.log1p(L_Q)).astype("int").item()  # c * ceil(ln(L_q + 1))
+
+        U_part = U_part if U_part < L_K else L_K  # clamp to the sequence lengths
+        u = u if u < L_Q else L_Q
+
+        scores_top, index = self._prob_QK(queries, keys, sample_k=U_part, n_top=u)
+
+        # add scale factor; a falsy self.scale falls back to 1/sqrt(D)
+        scale = self.scale or 1.0 / sqrt(D)
+        if scale is not None:  # always true here: `scale` is either a truthy self.scale or 1/sqrt(D)
+            scores_top = scores_top * scale
+        # get the context
+        context = self._get_initial_context(values, L_Q)
+        # update the context with selected top_k queries
+        context, attn = self._update_context(
+            context, values, scores_top, index, L_Q, attn_mask
+        )
+
+        return context.transpose(2, 1).contiguous(), attn  # back to [B, L_Q, H, D]
+
+
+class AttentionLayer(nn.Module):
+    """Multi-head wrapper: projects q/k/v, splits heads, runs `attention`, then projects back to d_model."""
+
+    def __init__(self, attention, d_model, n_heads, d_keys=None, d_values=None, mix=False):
+        super().__init__()
+        # Per-head widths default to d_model // n_heads when not given (or given as 0).
+        key_width = (d_keys or (d_model // n_heads)) * n_heads
+        value_width = (d_values or (d_model // n_heads)) * n_heads
+
+        self.inner_attention = attention
+        self.query_projection = nn.Linear(d_model, key_width)
+        self.key_projection = nn.Linear(d_model, key_width)
+        self.value_projection = nn.Linear(d_model, value_width)
+        self.out_projection = nn.Linear(value_width, d_model)
+        self.n_heads = n_heads
+        self.mix = mix
+
+    def forward(self, queries, keys, values, attn_mask):
+        """Return `(output, attn)`, where `attn` is whatever the inner attention returned second."""
+        batch, q_len, _ = queries.shape
+        _, kv_len, _ = keys.shape
+        heads = self.n_heads
+
+        q = self.query_projection(queries).view(batch, q_len, heads, -1)
+        k = self.key_projection(keys).view(batch, kv_len, heads, -1)
+        v = self.value_projection(values).view(batch, kv_len, heads, -1)
+        out, attn = self.inner_attention(q, k, v, attn_mask)
+        if self.mix:
+            # Swap axes 1 and 2 of the attention output before flattening.
+            out = out.transpose(2, 1).contiguous()
+
+        return self.out_projection(out.view(batch, q_len, -1)), attn
+
+
+class ConvLayer(nn.Module):
+    """Down-sampling block: circular Conv1d -> BatchNorm1d -> ELU -> stride-2 max-pool along the length axis."""
+
+    def __init__(self, c_in):
+        super().__init__()
+        # Attribute names are kept as-is because they become state-dict keys.
+        self.downConv = nn.Conv1d(c_in, c_in, kernel_size=3, padding=1, padding_mode="circular")
+        self.norm = nn.BatchNorm1d(c_in)
+        self.activation = nn.ELU()
+        self.maxPool = nn.MaxPool1d(kernel_size=3, stride=2, padding=1)
+
+    def forward(self, x):
+        """Map [B, L, C] to [B, L_out, C]; the pool (kernel 3, stride 2, padding 1) roughly halves L.
+
+        The input is moved to channels-first for the Conv1d/BatchNorm1d/MaxPool1d stack and back again.
+        """
+        channels_first = x.permute(0, 2, 1)
+        for stage in (self.downConv, self.norm, self.activation, self.maxPool):
+            channels_first = stage(channels_first)
+
+        return channels_first.transpose(1, 2)
+
+
+class EncoderLayer(nn.Module):
+    """Self-attention followed by a feed-forward of two kernel-size-1 Conv1d layers, each with LayerNorm."""
+
+    def __init__(self, attention, d_model, d_ff=None, dropout=0.1, activation="relu"):
+        super().__init__()
+        hidden = d_ff or 4 * d_model
+        self.attention = attention
+        self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=hidden, kernel_size=1)
+        self.conv2 = nn.Conv1d(in_channels=hidden, out_channels=d_model, kernel_size=1)
+        self.norm1 = nn.LayerNorm(d_model)
+        self.norm2 = nn.LayerNorm(d_model)
+        self.dropout = nn.Dropout(dropout)
+        # Anything other than "relu" selects GELU.
+        self.activation = F.gelu if activation != "relu" else F.relu
+
+    def forward(self, x, attn_mask=None):
+        """Return `(output, attn)` for `x` of shape [B, L, D]; `attn` comes from the attention module."""
+        attended, attn = self.attention(x, x, x, attn_mask=attn_mask)
+        residual = self.norm1(x + self.dropout(attended))
+
+        # Conv1d expects channels-first, so swap L and D around the feed-forward.
+        ff = self.conv1(residual.transpose(-1, 1))
+        ff = self.dropout(self.activation(ff))
+        ff = self.dropout(self.conv2(ff).transpose(-1, 1))
+
+        return self.norm2(residual + ff), attn
+
+
+class Encoder(nn.Module):
+    """Stack of attention layers, optionally interleaved with conv layers, plus an optional final norm."""
+
+    def __init__(self, attn_layers, conv_layers=None, norm_layer=None):
+        super().__init__()
+        self.attn_layers = nn.ModuleList(attn_layers)
+        self.conv_layers = None if conv_layers is None else nn.ModuleList(conv_layers)
+        self.norm = norm_layer
+
+    def forward(self, x, attn_mask=None):
+        """Return `(x, attns)` with one attention entry per attention-layer call.
+
+        With conv layers, layers are paired via `zip` and `attn_layers[-1]` is then always applied once more.
+        """
+        attns = []
+        if self.conv_layers is None:
+            schedule = [(layer, None) for layer in self.attn_layers]
+        else:
+            schedule = list(zip(self.attn_layers, self.conv_layers)) + [(self.attn_layers[-1], None)]
+
+        for attn_layer, conv_layer in schedule:
+            x, attn = attn_layer(x, attn_mask=attn_mask)
+            attns.append(attn)
+            if conv_layer is not None:
+                x = conv_layer(x)
+
+        hidden = x if self.norm is None else self.norm(x)
+        return hidden, attns
+
+
+class DecoderLayer(nn.Module):
+    """Decoder block: self-attention, cross-attention and a kernel-size-1 Conv1d feed-forward, each with LayerNorm."""
+
+    def __init__(self, self_attention, cross_attention, d_model, d_ff=None, dropout=0.1, activation="relu"):
+        super().__init__()
+        hidden = d_ff or 4 * d_model
+        self.self_attention = self_attention
+        self.cross_attention = cross_attention
+        self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=hidden, kernel_size=1)
+        self.conv2 = nn.Conv1d(in_channels=hidden, out_channels=d_model, kernel_size=1)
+        self.norm1 = nn.LayerNorm(d_model)
+        self.norm2 = nn.LayerNorm(d_model)
+        self.norm3 = nn.LayerNorm(d_model)
+        self.dropout = nn.Dropout(dropout)
+        # Anything other than "relu" selects GELU.
+        self.activation = F.gelu if activation != "relu" else F.relu
+
+    def forward(self, x, cross, x_mask=None, cross_mask=None):
+        """Return the updated decoder states; only element [0] of each attention result is used.
+
+        `cross` supplies the keys and values of the cross-attention (queries come from `x`).
+        """
+        # 1) self-attention with residual connection
+        self_out = self.self_attention(x, x, x, attn_mask=x_mask)[0]
+        x = self.norm1(x + self.dropout(self_out))
+
+        # 2) cross-attention with residual connection
+        cross_out = self.cross_attention(x, cross, cross, attn_mask=cross_mask)[0]
+        x = self.norm2(x + self.dropout(cross_out))
+
+        # 3) feed-forward; Conv1d expects channels-first, hence the transposes
+        ff = self.dropout(self.activation(self.conv1(x.transpose(-1, 1))))
+        ff = self.dropout(self.conv2(ff).transpose(-1, 1))
+
+        return self.norm3(x + ff)
+
+
+class Decoder(nn.Module):
+    """Apply the decoder layers in order against `cross`, then the optional final norm."""
+
+    def __init__(self, layers, norm_layer=None):
+        super().__init__()
+        self.layers = nn.ModuleList(layers)
+        self.norm = norm_layer
+
+    def forward(self, x, cross, x_mask=None, cross_mask=None):
+        hidden = x
+        for block in self.layers:
+            hidden = block(hidden, cross, x_mask=x_mask, cross_mask=cross_mask)
+
+        return hidden if self.norm is None else self.norm(hidden)
+
+
+class InformerModel(nn.Module):
+    @validated()
+    def __init__(
+        self,
+        freq: str,
+        context_length: int,
+        prediction_length: int,
+        num_feat_dynamic_real: int,
+        num_feat_static_real: int,
+        num_feat_static_cat: int,
+        cardinality: List[int],
+        # Informer arguments
+        nhead: int,
+        num_encoder_layers: int,
+        num_decoder_layers: int,
+        dim_feedforward: int,
+        activation: str = "gelu",
+        dropout: float = 0.1,
+        attn: str = "prob",
+        factor: int = 5,
+        distil: bool = True,
+        # univariate input
+        input_size: int = 1,
+        embedding_dimension: Optional[List[int]] = None,
+        distr_output: DistributionOutput = StudentTOutput(),
+        lags_seq: Optional[List[int]] = None,
+        scaling: bool = True,
+        num_parallel_samples: int = 100,
+    ) -> None:
+        super().__init__()
+
+        self.input_size = input_size
+
+        self.target_shape = distr_output.event_shape
+        self.num_feat_dynamic_real = num_feat_dynamic_real
+        self.num_feat_static_cat = num_feat_static_cat
+        self.num_feat_static_real = num_feat_static_real
+        self.embedding_dimension = (
+            embedding_dimension
+            if embedding_dimension is not None or cardinality is None
+            else [min(50, (cat + 1) // 2) for cat in cardinality]
+        )
+        self.lags_seq = lags_seq or get_lags_for_frequency(freq_str=freq)
+        self.num_parallel_samples = num_parallel_samples
+        self.history_length = context_length + max(self.lags_seq)
+        self.embedder = FeatureEmbedder(
+            cardinalities=cardinality,
+            embedding_dims=self.embedding_dimension,
+        )
+        if scaling:
+            self.scaler = MeanScaler(dim=1, keepdim=True)
+        else:
+            self.scaler = NOPScaler(dim=1, keepdim=True)
+
+        # total feature size
+        d_model = self.input_size * len(self.lags_seq) + self._number_of_features
+
+        self.context_length = context_length
+        self.prediction_length = prediction_length
+        self.distr_output = distr_output
+        self.param_proj = distr_output.get_args_proj(d_model)
+
+        # Informer enc-decoder
+        Attn = ProbAttention if attn == "prob" else FullAttention
+        # Encoder
+        self.encoder = Encoder(
+            [
+                EncoderLayer(
+                    AttentionLayer(
+                        Attn(
+                            mask_flag=False,
+                            factor=factor,
+                            attention_dropout=dropout,
+                            output_attention=False,
+                        ),
+                        d_model,
+                        nhead,
+                        mix=False,
+                    ),
+                    d_model,
+                    d_ff=dim_feedforward,
+                    dropout=dropout,
+                    activation=activation,
+                )
+                for l in range(num_encoder_layers)
+            ],
+            [ConvLayer(d_model) for l in range(num_encoder_layers - 1)]
+            if distil
+            else None,
+            norm_layer=torch.nn.LayerNorm(d_model),
+        )
+
+        # Masked Decoder
+        self.decoder = Decoder(
+            [
+                DecoderLayer(
+                    AttentionLayer(
+                        Attn(
+                            mask_flag=True,
+                            factor=factor,
+                            attention_dropout=dropout,
+                            output_attention=False,
+                        ),
+                        d_model,
+                        nhead,
+                        mix=True,
+                    ),
+                    AttentionLayer(
+                        FullAttention(
+                            mask_flag=False,
+                            factor=factor,
+                            attention_dropout=dropout,
+                            output_attention=False,
+                        ),
+                        d_model,
+                        nhead,
+                        mix=False,
+                    ),
+                    d_model,
+                    d_ff=dim_feedforward,
+                    dropout=dropout,
+                    activation=activation,
+                )
+                for l in range(num_decoder_layers)
+            ],
+            norm_layer=torch.nn.LayerNorm(d_model),
+        )
+
+    @property
+    def _number_of_features(self) -> int:
+        return (
+            sum(self.embedding_dimension)
+            + self.num_feat_dynamic_real
+            + self.num_feat_static_real
+            + self.input_size  # the log(scale)
+        )
+
+    @property
+    def _past_length(self) -> int:
+        return self.context_length + max(self.lags_seq)
+
+    def get_lagged_subsequences(
+        self, sequence: torch.Tensor, subsequences_length: int, shift: int = 0
+    ) -> torch.Tensor:
+        """
+        Returns lagged subsequences of a given sequence.
+        Parameters
+        ----------
+        sequence : Tensor
+            the sequence from which lagged subsequences should be extracted.
+            Shape: (N, T, C).
+        subsequences_length : int
+            length of the subsequences to be extracted.
+        shift: int
+            shift the lags by this amount back.
+        Returns
+        --------
+        lagged : Tensor
+            a tensor of shape (N, S, C, I), where S = subsequences_length and
+            I = len(indices), containing lagged subsequences. Specifically,
+            lagged[i, j, :, k] = sequence[i, -indices[k]-S+j, :].
+        """
+        sequence_length = sequence.shape[1]
+        indices = [lag - shift for lag in self.lags_seq]
+
+        assert max(indices) + subsequences_length <= sequence_length, (
+            f"lags cannot go further than history length, found lag {max(indices)} "
+            f"while history length is only {sequence_length}"
+        )
+
+        lagged_values = []
+        for lag_index in indices:
+            begin_index = -lag_index - subsequences_length
+            end_index = -lag_index if lag_index > 0 else None
+            lagged_values.append(sequence[:, begin_index:end_index, ...])
+        return torch.stack(lagged_values, dim=-1)
+
+    def _check_shapes(
+        self,
+        prior_input: torch.Tensor,
+        inputs: torch.Tensor,
+        features: Optional[torch.Tensor],
+    ) -> None:
+        assert len(prior_input.shape) == len(inputs.shape)
+        assert (
+            len(prior_input.shape) == 2 and self.input_size == 1
+        ) or prior_input.shape[2] == self.input_size
+        assert (len(inputs.shape) == 2 and self.input_size == 1) or inputs.shape[
+            -1
+        ] == self.input_size
+        assert (
+            features is None or features.shape[2] == self._number_of_features
+        ), f"{features.shape[2]}, expected {self._number_of_features}"
+
+    def create_network_inputs(
+        self,
+        feat_static_cat: torch.Tensor,
+        feat_static_real: torch.Tensor,
+        past_time_feat: torch.Tensor,
+        past_target: torch.Tensor,
+        past_observed_values: torch.Tensor,
+        future_time_feat: Optional[torch.Tensor] = None,
+        future_target: Optional[torch.Tensor] = None,
+    ):
+        # time feature
+        time_feat = (
+            torch.cat(
+                (
+                    past_time_feat[:, self._past_length - self.context_length :, ...],
+                    future_time_feat,
+                ),
+                dim=1,
+            )
+            if future_target is not None
+            else past_time_feat[:, self._past_length - self.context_length :, ...]
+        )
+
+        # target
+        context = past_target[:, -self.context_length :]
+        observed_context = past_observed_values[:, -self.context_length :]
+        _, scale = self.scaler(context, observed_context)
+
+        inputs = (
+            torch.cat((past_target, future_target), dim=1) / scale
+            if future_target is not None
+            else past_target / scale
+        )
+
+        inputs_length = (
+            self._past_length + self.prediction_length
+            if future_target is not None
+            else self._past_length
+        )
+        assert inputs.shape[1] == inputs_length
+
+        subsequences_length = (
+            self.context_length + self.prediction_length
+            if future_target is not None
+            else self.context_length
+        )
+
+        # embeddings
+        embedded_cat = self.embedder(feat_static_cat)
+        log_scale = scale.log() if self.input_size == 1 else scale.squeeze(1).log()
+        static_feat = torch.cat(
+            (embedded_cat, feat_static_real, log_scale),
+            dim=1,
+        )
+        expanded_static_feat = static_feat.unsqueeze(1).expand(
+            -1, time_feat.shape[1], -1
+        )
+
+        features = torch.cat((expanded_static_feat, time_feat), dim=-1)
+
+        # self._check_shapes(prior_input, inputs, features)
+
+        # sequence = torch.cat((prior_input, inputs), dim=1)
+        lagged_sequence = self.get_lagged_subsequences(
+            sequence=inputs,
+            subsequences_length=subsequences_length,
+        )
+
+        lags_shape = lagged_sequence.shape
+        reshaped_lagged_sequence = lagged_sequence.reshape(
+            lags_shape[0], lags_shape[1], -1
+        )
+
+        transformer_inputs = torch.cat((reshaped_lagged_sequence, features), dim=-1)
+
+        return transformer_inputs, scale, static_feat
+
+    def output_params(self, transformer_inputs):
+        enc_input = transformer_inputs[:, : self.context_length, ...]
+        dec_input = transformer_inputs[:, self.context_length :, ...]
+
+        enc_out, _ = self.encoder(enc_input)
+        dec_output = self.decoder(dec_input, enc_out)
+
+        return self.param_proj(dec_output)
+
+    @torch.jit.ignore
+    def output_distribution(
+        self, params, scale=None, trailing_n=None
+    ) -> torch.distributions.Distribution:
+        sliced_params = params
+        if trailing_n is not None:
+            sliced_params = [p[:, -trailing_n:] for p in params]
+        return self.distr_output.distribution(sliced_params, scale=scale)
+
+    # for prediction
+    def forward(
+        self,
+        feat_static_cat: torch.Tensor,
+        feat_static_real: torch.Tensor,
+        past_time_feat: torch.Tensor,
+        past_target: torch.Tensor,
+        past_observed_values: torch.Tensor,
+        future_time_feat: torch.Tensor,
+        num_parallel_samples: Optional[int] = None,
+    ) -> torch.Tensor:
+
+        if num_parallel_samples is None:
+            num_parallel_samples = self.num_parallel_samples
+
+        encoder_inputs, scale, static_feat = self.create_network_inputs(
+            feat_static_cat,
+            feat_static_real,
+            past_time_feat,
+            past_target,
+            past_observed_values,
+        )
+
+        enc_out, _ = self.encoder(encoder_inputs)
+
+        repeated_scale = scale.repeat_interleave(
+            repeats=self.num_parallel_samples, dim=0
+        )
+
+        repeated_past_target = (
+            past_target.repeat_interleave(repeats=self.num_parallel_samples, dim=0)
+            / repeated_scale
+        )
+
+        expanded_static_feat = static_feat.unsqueeze(1).expand(
+            -1, future_time_feat.shape[1], -1
+        )
+        features = torch.cat((expanded_static_feat, future_time_feat), dim=-1)
+        repeated_features = features.repeat_interleave(
+            repeats=self.num_parallel_samples, dim=0
+        )
+
+        repeated_enc_out = enc_out.repeat_interleave(
+            repeats=self.num_parallel_samples, dim=0
+        )
+
+        future_samples = []
+
+        # greedy decoding
+        for k in range(self.prediction_length):
+            # self._check_shapes(repeated_past_target, next_sample, next_features)
+            # sequence = torch.cat((repeated_past_target, next_sample), dim=1)
+
+            lagged_sequence = self.get_lagged_subsequences(
+                sequence=repeated_past_target,
+                subsequences_length=1 + k,
+                shift=1,
+            )
+
+            lags_shape = lagged_sequence.shape
+            reshaped_lagged_sequence = lagged_sequence.reshape(
+                lags_shape[0], lags_shape[1], -1
+            )
+
+            decoder_input = torch.cat(
+                (reshaped_lagged_sequence, repeated_features[:, : k + 1]), dim=-1
+            )
+
+            output = self.decoder(decoder_input, repeated_enc_out)
+
+            params = self.param_proj(output[:, -1:])
+            distr = self.output_distribution(params, scale=repeated_scale)
+            next_sample = distr.sample()
+
+            repeated_past_target = torch.cat(
+                (repeated_past_target, next_sample / repeated_scale), dim=1
+            )
+            future_samples.append(next_sample)
+
+        concat_future_samples = torch.cat(future_samples, dim=1)
+        return concat_future_samples.reshape(
+            (-1, self.num_parallel_samples, self.prediction_length) + self.target_shape,
+        )
+    
\ No newline at end of file

From b29cfa62ed6ef2f4eb81b367efac6ca8258366c9 Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Sun, 15 Jan 2023 05:12:45 +0000
Subject: [PATCH 016/164] WIP config

---
 .../config_using_gluonTS.py                     | 17 ++++++++---------
 .../configuration_time_series_transformer.py    |  2 +-
 2 files changed, 9 insertions(+), 10 deletions(-)
 rename src/transformers/models/{time_series_transformer => informer}/config_using_gluonTS.py (98%)

diff --git a/src/transformers/models/time_series_transformer/config_using_gluonTS.py b/src/transformers/models/informer/config_using_gluonTS.py
similarity index 98%
rename from src/transformers/models/time_series_transformer/config_using_gluonTS.py
rename to src/transformers/models/informer/config_using_gluonTS.py
index dde188b9c79e..c407b3ac3ec5 100644
--- a/src/transformers/models/time_series_transformer/config_using_gluonTS.py
+++ b/src/transformers/models/informer/config_using_gluonTS.py
@@ -350,21 +350,21 @@ def forward(self, x, cross, x_mask=None, cross_mask=None):
 
 class InformerModel(nn.Module):
     @validated()
-    def __init__(
+    def __init__(  # add loss param
         self,
-        freq: str,
+        freq: str, # frequency
         context_length: int,
         prediction_length: int,
-        num_feat_dynamic_real: int,
-        num_feat_static_real: int,
-        num_feat_static_cat: int,
+        num_feat_dynamic_real: int,  # num_dynamic_real_features
+        num_feat_static_real: int,  # num_static_real_features
+        num_feat_static_cat: int,  # num_static_categorical_features
         cardinality: List[int],
         # Informer arguments
         nhead: int,
-        num_encoder_layers: int,
-        num_decoder_layers: int,
+        num_encoder_layers: int, # encoder_layers
+        num_decoder_layers: int, # decoder_layers
         dim_feedforward: int,
-        activation: str = "gelu",
+        activation: str = "gelu", # activation_function
         dropout: float = 0.1,
         attn: str = "prob",
         factor: int = 5,
@@ -720,4 +720,3 @@ def forward(
         return concat_future_samples.reshape(
             (-1, self.num_parallel_samples, self.prediction_length) + self.target_shape,
         )
-    
\ No newline at end of file
diff --git a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py
index 8d89d5cd7f19..258230654b0a 100644
--- a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py
+++ b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py
@@ -151,7 +151,7 @@ def __init__(
         decoder_attention_heads: int = 2,
         encoder_layers: int = 2,
         decoder_layers: int = 2,
-        is_encoder_decoder: bool = True,
+        is_encoder_decoder: bool = True, # Eli: remove from signature?
         activation_function: str = "gelu",
         dropout: float = 0.1,
         encoder_layerdrop: float = 0.1,

From 215b0303b5e121d39c19c8297631c86be59dd7ae Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Mon, 16 Jan 2023 12:41:19 +0000
Subject: [PATCH 017/164] adding informeConfig. need to remove FeatureEmbedder

---
 .../informer/check_instantiate_works.py       |    3 +
 .../models/informer/configuration_informer.py |  229 +-
 .../models/informer/modeling_informer.py      | 2360 ++++-------------
 .../configuration_time_series_transformer.py  |    4 +-
 4 files changed, 624 insertions(+), 1972 deletions(-)

diff --git a/src/transformers/models/informer/check_instantiate_works.py b/src/transformers/models/informer/check_instantiate_works.py
index f392af08b552..487bf2a9a21b 100644
--- a/src/transformers/models/informer/check_instantiate_works.py
+++ b/src/transformers/models/informer/check_instantiate_works.py
@@ -1,5 +1,8 @@
 from transformers import InformerModel, InformerConfig
+from gluonts.time_feature import get_lags_for_frequency
 
 if __name__ == '__main__':
+    freq = "h"
+    lags = get_lags_for_frequency(freq_str=freq)
     model = InformerModel(InformerConfig())
     print(model)
diff --git a/src/transformers/models/informer/configuration_informer.py b/src/transformers/models/informer/configuration_informer.py
index b08f1ee982bc..51ac98700310 100644
--- a/src/transformers/models/informer/configuration_informer.py
+++ b/src/transformers/models/informer/configuration_informer.py
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -22,206 +22,91 @@
 
 logger = logging.get_logger(__name__)
 
-INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    "elisim/informer": "https://huggingface.co/elisim/informer/resolve/main/config.json",
+TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "huggingface/time-series-transformer-tourism-monthly": (
+        "https://huggingface.co/huggingface/time-series-transformer-tourism-monthly/resolve/main/config.json"
+    ),
+    # See all TimeSeriesTransformer models at https://huggingface.co/models?filter=time_series_transformer
 }
 
 
-
 class InformerConfig(PretrainedConfig):
-    r"""
-    This is the configuration class to store the configuration of a [`InformerModel`]. It is used to
-    instantiate a Informer model according to the specified arguments, defining the model architecture.
-    Instantiating a configuration with the defaults will yield a similar configuration to that of the Time Series
-    Transformer
-    [huggingface/time-series-transformer-tourism-monthly](https://huggingface.co/huggingface/time-series-transformer-tourism-monthly)
-    architecture.
-
-    Configuration objects inherit from [`PretrainedConfig`] can be used to control the model outputs. Read the
-    documentation from [`PretrainedConfig`] for more information.
-
-    Args:
-        prediction_length (`int`):
-            The prediction length for the decoder. In other words, the prediction horizon of the model.
-        context_length (`int`, *optional*, defaults to `prediction_length`):
-            The context length for the encoder. If `None`, the context length will be the same as the
-            `prediction_length`.
-        distribution_output (`string`, *optional*, defaults to `"student_t"`):
-            The distribution emission head for the model. Could be either "student_t", "normal" or "negative_binomial".
-        loss (`string`, *optional*, defaults to `"nll"`):
-            The loss function for the model corresponding to the `distribution_output` head. For parametric
-            distributions it is the negative log likelihood (nll) - which currently is the only supported one.
-        input_size (`int`, *optional*, defaults to 1):
-            The size of the target variable which by default is 1 for univariate targets. Would be > 1 in case of
-            multivariate targets.
-        scaling (`bool`, *optional* defaults to `True`):
-            Whether to scale the input targets.
-        lags_sequence (`list[int]`, *optional*, defaults to `[1, 2, 3, 4, 5, 6, 7]`):
-            The lags of the input time series as covariates often dictated by the frequency. Default is `[1, 2, 3, 4,
-            5, 6, 7]`.
-        num_time_features (`int`, *optional*, defaults to 0):
-            The number of time features in the input time series.
-        num_dynamic_real_features (`int`, *optional*, defaults to 0):
-            The number of dynamic real valued features.
-        num_static_categorical_features (`int`, *optional*, defaults to 0):
-            The number of static categorical features.
-        num_static_real_features (`int`, *optional*, defaults to 0):
-            The number of static real valued features.
-        cardinality (`list[int]`, *optional*):
-            The cardinality (number of different values) for each of the static categorical features. Should be a list
-            of integers, having the same length as `num_static_categorical_features`. Cannot be `None` if
-            `num_static_categorical_features` is > 0.
-        embedding_dimension (`list[int]`, *optional*):
-            The dimension of the embedding for each of the static categorical features. Should be a list of integers,
-            having the same length as `num_static_categorical_features`. Cannot be `None` if
-            `num_static_categorical_features` is > 0.
-        encoder_layers (`int`, *optional*, defaults to 2):
-            Number of encoder layers.
-        decoder_layers (`int`, *optional*, defaults to 2):
-            Number of decoder layers.
-        encoder_attention_heads (`int`, *optional*, defaults to 2):
-            Number of attention heads for each attention layer in the Transformer encoder.
-        decoder_attention_heads (`int`, *optional*, defaults to 2):
-            Number of attention heads for each attention layer in the Transformer decoder.
-        encoder_ffn_dim (`int`, *optional*, defaults to 32):
-            Dimension of the "intermediate" (often named feed-forward) layer in encoder.
-        decoder_ffn_dim (`int`, *optional*, defaults to 32):
-            Dimension of the "intermediate" (often named feed-forward) layer in decoder.
-        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
-            The non-linear activation function (function or string) in the encoder and decoder. If string, `"gelu"` and
-            `"relu"` are supported.
-        dropout (`float`, *optional*, defaults to 0.1):
-            The dropout probability for all fully connected layers in the encoder, and decoder.
-        encoder_layerdrop (`float`, *optional*, defaults to 0.1):
-            The dropout probability for the attention and fully connected layers for each encoder layer.
-        decoder_layerdrop (`float`, *optional*, defaults to 0.1):
-            The dropout probability for the attention and fully connected layers for each decoder layer.
-        attention_dropout (`float`, *optional*, defaults to 0.1):
-            The dropout probability for the attention probabilities.
-        activation_dropout (`float`, *optional*, defaults to 0.1):
-            The dropout probability used between the two layers of the feed-forward networks.
-        num_parallel_samples (`int`, *optional*, defaults to 100):
-            The number of samples to generate in parallel for each time step of inference.
-        init_std (`float`, *optional*, defaults to 0.02):
-            The standard deviation of the truncated normal weight initialization distribution.
-        use_cache (`bool`, *optional*, defaults to `True`):
-            Whether to use the past key/values attentions (if applicable to the model) to speed up decoding.
-
-        Example:
-
-    ```python
-    >>> from transformers import InformerConfig, InformerModel
-
-    >>> # Initializing a default Informer configuration
-    >>> configuration = InformerConfig()
-
-    >>> # Randomly initializing a model (with random weights) from the configuration
-    >>> model = InformerModel(configuration)
-
-    >>> # Accessing the model configuration
-    >>> configuration = model.config
-    ```"""
-    model_type = "informer"
-    attribute_map = {
-        "hidden_size": "d_model",
-        "num_attention_heads": "encoder_attention_heads",
-        "num_hidden_layers": "encoder_layers",
-    }
-
     def __init__(
         self,
+        context_length: int,
+        prediction_length: int,
+        num_feat_dynamic_real: int,  # num_dynamic_real_features
+        num_feat_static_real: int,  # num_static_real_features
+        num_feat_static_cat: int,  # num_static_categorical_features
+        cardinality: List[int],
+        # Informer arguments
+        nhead: int,
+        num_encoder_layers: int, # encoder_layers
+        num_decoder_layers: int, # decoder_layers
+        dim_feedforward: int,
+        activation: str = "gelu", # activation_function
+        dropout: float = 0.1,
+        attn: str = "prob",
+        factor: int = 5,
+        distil: bool = True,
+        # univariate input
         input_size: int = 1,
-        prediction_length: Optional[int] = None,
-        context_length: Optional[int] = None,
-        distribution_output: str = "student_t",
-        loss: str = "nll",
-        lags_sequence: List[int] = [1, 2, 3, 4, 5, 6, 7],
-        scaling: bool = True,
-        num_dynamic_real_features: int = 0,
-        num_static_categorical_features: int = 0,
-        num_static_real_features: int = 0,
-        num_time_features: int = 0,
-        cardinality: Optional[List[int]] = None,
         embedding_dimension: Optional[List[int]] = None,
-        encoder_ffn_dim: int = 32,
-        decoder_ffn_dim: int = 32,
-        encoder_attention_heads: int = 2,
-        decoder_attention_heads: int = 2,
-        encoder_layers: int = 2,
-        decoder_layers: int = 2,
-        is_encoder_decoder: bool = True,
-        activation_function: str = "gelu",
-        dropout: float = 0.1,
-        encoder_layerdrop: float = 0.1,
-        decoder_layerdrop: float = 0.1,
-        attention_dropout: float = 0.1,
-        activation_dropout: float = 0.1,
+        distr_output: str = "student_t",
+        lags_seq: Optional[List[int]] = None, # used to be freq.
+        scaling: bool = True,
         num_parallel_samples: int = 100,
-        init_std: float = 0.02,
-        use_cache=True,
-        **kwargs
+        is_encoder_decoder: bool = True,
     ):
         # time series specific configuration
         self.prediction_length = prediction_length
         self.context_length = context_length or prediction_length
-        self.distribution_output = distribution_output
-        self.loss = loss
+        self.distr_output = distr_output # Eli: change to distribution_output
+        # self.loss = loss # Eli: From vanilla ts transformer
         self.input_size = input_size
-        self.num_time_features = num_time_features
-        self.lags_sequence = lags_sequence
-        self.scaling = scaling
-        self.num_dynamic_real_features = num_dynamic_real_features
-        self.num_static_real_features = num_static_real_features
-        self.num_static_categorical_features = num_static_categorical_features
-        if cardinality and num_static_categorical_features > 0:
-            if len(cardinality) != num_static_categorical_features:
+        # self.target_shape = distr_output.event_shape  # Eli: I think can be removed
+        # self.num_time_features = num_time_features # Eli: From vanilla ts transformer
+        self.lags_seq = lags_seq
+        # self.scaling = scaling # Eli: From vanilla ts transformer
+        self.num_feat_dynamic_real = num_feat_dynamic_real
+        self.num_feat_static_cat = num_feat_static_cat
+        self.num_feat_static_real = num_feat_static_real
+
+        # set cardinality
+        if cardinality and num_feat_static_cat > 0:
+            if len(cardinality) != num_feat_static_cat:
                 raise ValueError(
                     "The cardinality should be a list of the same length as `num_static_categorical_features`"
                 )
             self.cardinality = cardinality
         else:
             self.cardinality = [1]
-        if embedding_dimension and num_static_categorical_features > 0:
-            if len(embedding_dimension) != num_static_categorical_features:
+
+        # set embedding_dimension
+        if embedding_dimension and num_feat_static_cat > 0:
+            if len(embedding_dimension) != num_feat_static_cat:
                 raise ValueError(
                     "The embedding dimension should be a list of the same length as `num_static_categorical_features`"
                 )
             self.embedding_dimension = embedding_dimension
         else:
             self.embedding_dimension = [min(50, (cat + 1) // 2) for cat in self.cardinality]
+            
         self.num_parallel_samples = num_parallel_samples
 
-        # Transformer architecture configuration
-        self.d_model = input_size * len(lags_sequence) + self._number_of_features
-        self.encoder_attention_heads = encoder_attention_heads
-        self.decoder_attention_heads = decoder_attention_heads
-        self.encoder_ffn_dim = encoder_ffn_dim
-        self.decoder_ffn_dim = decoder_ffn_dim
-        self.encoder_layers = encoder_layers
-        self.decoder_layers = decoder_layers
-
-        self.dropout = dropout
-        self.attention_dropout = attention_dropout
-        self.activation_dropout = activation_dropout
-        self.encoder_layerdrop = encoder_layerdrop
-        self.decoder_layerdrop = decoder_layerdrop
-
-        self.activation_function = activation_function
-        self.init_std = init_std
+        # self.history_length = context_length + max(self.lags_seq)
+        self.embedder = FeatureEmbedder(
+            cardinalities=cardinality,
+            embedding_dims=self.embedding_dimension,
+        )
+        if scaling:
+            self.scaler = MeanScaler(dim=1, keepdim=True)
+        else:
+            self.scaler = NOPScaler(dim=1, keepdim=True)
 
-        self.output_attentions = False
-        self.output_hidden_states = False
+        # total feature size
+        d_model = self.input_size * len(self.lags_seq) + self._number_of_features
 
-        self.use_cache = use_cache
+        self.param_proj = distr_output.get_args_proj(d_model)
 
         super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
-
-    @property
-    def _number_of_features(self) -> int:
-        return (
-            sum(self.embedding_dimension)
-            + self.num_dynamic_real_features
-            + self.num_time_features
-            + max(1, self.num_static_real_features)  # there is at least one dummy static real feature
-            + self.input_size  # the log(scale)
-        )
diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 8d36a171da35..9616af397751 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -1,1483 +1,519 @@
-# coding=utf-8
-# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
-# Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" PyTorch Informer model."""
-
-import random
-from dataclasses import dataclass
-from typing import Callable, Dict, List, Optional, Tuple, Union
+from math import sqrt
+from typing import List, Optional
 
+import numpy as np
 import torch
-from torch import nn
-from torch.distributions import (
-    AffineTransform,
-    Distribution,
-    Independent,
-    NegativeBinomial,
-    Normal,
-    StudentT,
-    TransformedDistribution,
-)
+import torch.nn as nn
+import torch.nn.functional as F
+from gluonts.torch.distributions import DistributionOutput, StudentTOutput
+from gluonts.torch.modules.feature import FeatureEmbedder
+from gluonts.torch.modules.scaler import MeanScaler, NOPScaler
 
-from ...activations import ACT2FN
-from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions, ModelOutput
-from ...modeling_utils import PreTrainedModel
-from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
-from .configuration_informer import InformerConfig
 
-
-logger = logging.get_logger(__name__)
-
-_CONFIG_FOR_DOC = "InformerConfig"
-
-
-INFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
-    "elisim/informer",
-    # See all Informer models at https://huggingface.co/models?filter=informer
-]
-
-
-
-class AffineTransformed(TransformedDistribution):
-    def __init__(self, base_distribution: Distribution, loc=None, scale=None, event_dim=0):
-        self.scale = 1.0 if scale is None else scale
-        self.loc = 0.0 if loc is None else loc
-
-        super().__init__(base_distribution, [AffineTransform(loc=self.loc, scale=self.scale, event_dim=event_dim)])
+class TriangularCausalMask:
+    def __init__(self, B, L, device="cpu"):
+        mask_shape = [B, 1, L, L]
+        with torch.no_grad():
+            self._mask = torch.triu(
+                torch.ones(mask_shape, dtype=torch.bool), diagonal=1
+            ).to(device)
 
     @property
-    def mean(self):
-        """
-        Returns the mean of the distribution.
-        """
-        return self.base_dist.mean * self.scale + self.loc
+    def mask(self):
+        return self._mask
 
-    @property
-    def variance(self):
-        """
-        Returns the variance of the distribution.
-        """
-        return self.base_dist.variance * self.scale**2
+
+class ProbMask:
+    def __init__(self, B, H, L, index, scores, device="cpu"):
+        _mask = torch.ones(L, scores.shape[-1], dtype=torch.bool).to(device).triu(1)
+        _mask_ex = _mask[None, None, :].expand(B, H, L, scores.shape[-1])
+        indicator = _mask_ex[
+            torch.arange(B)[:, None, None], torch.arange(H)[None, :, None], index, :
+        ].to(device)
+        self._mask = indicator.view(scores.shape).to(device)
 
     @property
-    def stddev(self):
-        """
-        Returns the standard deviation of the distribution.
-        """
-        return self.variance.sqrt()
+    def mask(self):
+        return self._mask
 
 
-class ParameterProjection(nn.Module):
+class FullAttention(nn.Module):
     def __init__(
-        self, in_features: int, args_dim: Dict[str, int], domain_map: Callable[..., Tuple[torch.Tensor]], **kwargs
-    ) -> None:
-        super().__init__(**kwargs)
-        self.args_dim = args_dim
-        self.proj = nn.ModuleList([nn.Linear(in_features, dim) for dim in args_dim.values()])
-        self.domain_map = domain_map
-
-    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor]:
-        params_unbounded = [proj(x) for proj in self.proj]
-
-        return self.domain_map(*params_unbounded)
-
-
-class LambdaLayer(nn.Module):
-    def __init__(self, function):
-        super().__init__()
-        self.function = function
-
-    def forward(self, x, *args):
-        return self.function(x, *args)
-
-
-class DistributionOutput:
-    distribution_class: type
-    in_features: int
-    args_dim: Dict[str, int]
-
-    def __init__(self, dim: int = 1) -> None:
-        self.dim = dim
-        self.args_dim = {k: dim * self.args_dim[k] for k in self.args_dim}
-
-    def _base_distribution(self, distr_args):
-        if self.dim == 1:
-            return self.distribution_class(*distr_args)
-        else:
-            return Independent(self.distribution_class(*distr_args), 1)
-
-    def distribution(
         self,
-        distr_args,
-        loc: Optional[torch.Tensor] = None,
-        scale: Optional[torch.Tensor] = None,
-    ) -> Distribution:
-        distr = self._base_distribution(distr_args)
-        if loc is None and scale is None:
-            return distr
-        else:
-            return AffineTransformed(distr, loc=loc, scale=scale, event_dim=self.event_dim)
-
-    @property
-    def event_shape(self) -> Tuple:
-        r"""
-        Shape of each individual event contemplated by the distributions that this object constructs.
-        """
-        return () if self.dim == 1 else (self.dim,)
-
-    @property
-    def event_dim(self) -> int:
-        r"""
-        Number of event dimensions, i.e., length of the `event_shape` tuple, of the distributions that this object
-        constructs.
-        """
-        return len(self.event_shape)
-
-    @property
-    def value_in_support(self) -> float:
-        r"""
-        A float that will have a valid numeric value when computing the log-loss of the corresponding distribution. By
-        default 0.0. This value will be used when padding data series.
-        """
-        return 0.0
-
-    def get_parameter_projection(self, in_features: int) -> nn.Module:
-        r"""
-        Return the parameter projection layer that maps the input to the appropriate parameters of the distribution.
-        """
-        return ParameterProjection(
-            in_features=in_features,
-            args_dim=self.args_dim,
-            domain_map=LambdaLayer(self.domain_map),
-        )
-
-    def domain_map(self, *args: torch.Tensor):
-        r"""
-        Converts arguments to the right shape and domain. The domain depends on the type of distribution, while the
-        correct shape is obtained by reshaping the trailing axis in such a way that the returned tensors define a
-        distribution of the right event_shape.
-        """
-        raise NotImplementedError()
-
-    @classmethod
-    def squareplus(cls, x: torch.Tensor) -> torch.Tensor:
-        r"""
-        Helper to map inputs to the positive orthant by applying the square-plus operation. Reference:
-        https://twitter.com/jon_barron/status/1387167648669048833
-        """
-        return (x + torch.sqrt(torch.square(x) + 4.0)) / 2.0
-
-
-class StudentTOutput(DistributionOutput):
-    args_dim: Dict[str, int] = {"df": 1, "loc": 1, "scale": 1}
-    distribution_class: type = StudentT
-
-    @classmethod
-    def domain_map(cls, df: torch.Tensor, loc: torch.Tensor, scale: torch.Tensor):
-        scale = cls.squareplus(scale)
-        df = 2.0 + cls.squareplus(df)
-        return df.squeeze(-1), loc.squeeze(-1), scale.squeeze(-1)
-
-
-class NormalOutput(DistributionOutput):
-    args_dim: Dict[str, int] = {"loc": 1, "scale": 1}
-    distribution_class: type = Normal
-
-    @classmethod
-    def domain_map(cls, loc: torch.Tensor, scale: torch.Tensor):
-        scale = cls.squareplus(scale)
-        return loc.squeeze(-1), scale.squeeze(-1)
-
-
-class NegativeBinomialOutput(DistributionOutput):
-    args_dim: Dict[str, int] = {"total_count": 1, "logits": 1}
-    distribution_class: type = NegativeBinomial
-
-    @classmethod
-    def domain_map(cls, total_count: torch.Tensor, logits: torch.Tensor):
-        total_count = cls.squareplus(total_count)
-        return total_count.squeeze(-1), logits.squeeze(-1)
-
-    def _base_distribution(self, distr_args) -> Distribution:
-        total_count, logits = distr_args
-        if self.dim == 1:
-            return self.distribution_class(total_count=total_count, logits=logits)
-        else:
-            return Independent(self.distribution_class(total_count=total_count, logits=logits), 1)
-
-    # Overwrites the parent class method. We cannot scale using the affine
-    # transformation since negative binomial should return integers. Instead
-    # we scale the parameters.
-    def distribution(
-        self, distr_args, loc: Optional[torch.Tensor] = None, scale: Optional[torch.Tensor] = None
-    ) -> Distribution:
-        total_count, logits = distr_args
-
-        if scale is not None:
-            # See scaling property of Gamma.
-            logits += scale.log()
+        mask_flag=True,
+        factor=5,
+        scale=None,
+        attention_dropout=0.1,
+        output_attention=False,
+    ):
+        super(FullAttention, self).__init__()
+        self.scale = scale
+        self.mask_flag = mask_flag
+        self.output_attention = output_attention
+        self.dropout = nn.Dropout(attention_dropout)
 
-        return self._base_distribution((total_count, logits))
+    def forward(self, queries, keys, values, attn_mask):
+        B, L, H, E = queries.shape
+        _, S, _, D = values.shape
+        scale = self.scale or 1.0 / sqrt(E)
 
+        scores = torch.einsum("blhe,bshe->bhls", queries, keys)
+        if self.mask_flag:
+            if attn_mask is None:
+                attn_mask = TriangularCausalMask(B, L, device=queries.device)
 
-class FeatureEmbedder(nn.Module):
-    def __init__(self, cardinalities: List[int], embedding_dims: List[int]) -> None:
-        super().__init__()
+            scores.masked_fill_(attn_mask.mask, -np.inf)
 
-        self.num_features = len(cardinalities)
-        self.embedders = nn.ModuleList([nn.Embedding(c, d) for c, d in zip(cardinalities, embedding_dims)])
+        A = self.dropout(torch.softmax(scale * scores, dim=-1))
+        V = torch.einsum("bhls,bshd->blhd", A, values)
 
-    def forward(self, features: torch.Tensor) -> torch.Tensor:
-        if self.num_features > 1:
-            # we slice the last dimension, giving an array of length
-            # self.num_features with shape (N,T) or (N)
-            cat_feature_slices = torch.chunk(features, self.num_features, dim=-1)
+        if self.output_attention:
+            return (V.contiguous(), A)
         else:
-            cat_feature_slices = [features]
-
-        return torch.cat(
-            [
-                embed(cat_feature_slice.squeeze(-1))
-                for embed, cat_feature_slice in zip(self.embedders, cat_feature_slices)
-            ],
-            dim=-1,
-        )
+            return (V.contiguous(), None)
 
 
-class MeanScaler(nn.Module):
-    """
-    Computes a scaling factor as the weighted average absolute value along dimension `dim`, and scales the data
-    accordingly.
-
-    Args:
-        dim (`int`):
-            Dimension along which to compute the scale.
-        keepdim (`bool`, *optional*, defaults to `False`):
-            Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it.
-        minimum_scale (`float`, *optional*, defaults to 1e-10):
-            Default scale that is used for elements that are constantly zero along dimension `dim`.
-    """
-
-    def __init__(self, dim: int, keepdim: bool = False, minimum_scale: float = 1e-10):
-        super().__init__()
-        if not dim > 0:
-            raise ValueError("Cannot compute scale along dim = 0 (batch dimension), please provide dim > 0")
-        self.dim = dim
-        self.keepdim = keepdim
-        self.register_buffer("minimum_scale", torch.tensor(minimum_scale))
-
-    def forward(self, data: torch.Tensor, weights: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
-        # these will have shape (N, C)
-        total_weight = weights.sum(dim=self.dim)
-        weighted_sum = (data.abs() * weights).sum(dim=self.dim)
-
-        # first compute a global scale per-dimension
-        total_observed = total_weight.sum(dim=0)
-        denominator = torch.max(total_observed, torch.ones_like(total_observed))
-        default_scale = weighted_sum.sum(dim=0) / denominator
-
-        # then compute a per-item, per-dimension scale
-        denominator = torch.max(total_weight, torch.ones_like(total_weight))
-        scale = weighted_sum / denominator
-
-        # use per-batch scale when no element is observed
-        # or when the sequence contains only zeros
-        scale = (
-            torch.max(
-                self.minimum_scale,
-                torch.where(
-                    weighted_sum > torch.zeros_like(weighted_sum),
-                    scale,
-                    default_scale * torch.ones_like(total_weight),
-                ),
-            )
-            .detach()
-            .unsqueeze(dim=self.dim)
-        )
-
-        return data / scale, scale if self.keepdim else scale.squeeze(dim=self.dim)
-
-
-class NOPScaler(nn.Module):
-    """
-    Assigns a scaling factor equal to 1 along dimension `dim`, and therefore applies no scaling to the input data.
-
-    Args:
-        dim (`int`):
-            Dimension along which to compute the scale.
-        keepdim (`bool`, *optional*, defaults to `False`):
-            Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it.
-    """
-
-    def __init__(self, dim: int, keepdim: bool = False):
-        super().__init__()
-        self.dim = dim
-        self.keepdim = keepdim
-
-    def forward(self, data: torch.Tensor, observed_indicator: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
-        scale = torch.ones_like(data).mean(dim=self.dim, keepdim=self.keepdim)
-        return data, scale
-
-
-def weighted_average(input_tensor: torch.Tensor, weights: Optional[torch.Tensor] = None, dim=None) -> torch.Tensor:
-    """
-    Computes the weighted average of a given tensor across a given `dim`, masking values associated with weight zero,
-    meaning instead of `nan * 0 = nan` you will get `0 * 0 = 0`.
-
-    Args:
-        input_tensor (`torch.FloatTensor`):
-            Input tensor, of which the average must be computed.
-        weights (`torch.FloatTensor`, *optional*):
-            Weights tensor, of the same shape as `input_tensor`.
-        dim (`int`, *optional*):
-            The dim along which to average `input_tensor`.
-
-    Returns:
-        `torch.FloatTensor`: The tensor with values averaged along the specified `dim`.
-    """
-    if weights is not None:
-        weighted_tensor = torch.where(weights != 0, input_tensor * weights, torch.zeros_like(input_tensor))
-        sum_weights = torch.clamp(weights.sum(dim=dim) if dim else weights.sum(), min=1.0)
-        return (weighted_tensor.sum(dim=dim) if dim else weighted_tensor.sum()) / sum_weights
-    else:
-        return input_tensor.mean(dim=dim)
-
-
-class NegativeLogLikelihood:
-    """
-    Computes the negative log likelihood loss from input distribution with respect to target.
-    """
-
-    def __call__(self, input: torch.distributions.Distribution, target: torch.Tensor) -> torch.Tensor:
-        return -input.log_prob(target)
-
-
-# Copied from transformers.models.bart.modeling_bart._make_causal_mask
-def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_values_length: int = 0):
-    """
-    Make causal mask used for bi-directional self-attention.
-    """
-    bsz, tgt_len = input_ids_shape
-    mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min))
-    mask_cond = torch.arange(mask.size(-1))
-    mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
-    mask = mask.to(dtype)
-
-    if past_key_values_length > 0:
-        mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype), mask], dim=-1)
-    return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
-
-
-# Copied from transformers.models.bart.modeling_bart._expand_mask
-def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
-    """
-    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
-    """
-    bsz, src_len = mask.size()
-    tgt_len = tgt_len if tgt_len is not None else src_len
-
-    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
-
-    inverted_mask = 1.0 - expanded_mask
-
-    return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
-
-
-@dataclass
-class Seq2SeqTimeSeriesModelOutput(ModelOutput):
-    """
-    Base class for model encoder's outputs that also contains pre-computed hidden states that can speed up sequential
-    decoding.
-
-    Args:
-        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the output of the last layer of the decoder of the model.
-
-            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
-            hidden_size)` is output.
-        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
-            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
-            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
-
-            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
-            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
-        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
-            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the decoder at the output of each layer plus the optional initial embedding outputs.
-        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
-            self-attention heads.
-        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
-            weighted average in the cross-attention heads.
-        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-            Sequence of hidden-states at the output of the last layer of the encoder of the model.
-        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
-            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the encoder at the output of each layer plus the optional initial embedding outputs.
-        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
-            self-attention heads.
-        scale: (`torch.FloatTensor` of shape `(batch_size,)`, *optional*):
-            Scaling values of each time series' context window which is used to give the model inputs of the same
-            magnitude and then used to rescale to the original scale.
-        static_features: (`torch.FloatTensor` of shape `(batch_size, feature size)`, *optional*):
-            Static features of each time series' in a batch which are copied to the covariates at inference time.
-    """
-
-    last_hidden_state: torch.FloatTensor = None
-    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
-    decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
-    decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
-    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
-    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
-    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
-    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
-    scale: Optional[torch.FloatTensor] = None
-    static_features: Optional[torch.FloatTensor] = None
-
-
-@dataclass
-class Seq2SeqTimeSeriesPredictionOutput(ModelOutput):
-    """
-    Base class for model's predictions outputs that also contain the loss as well parameters of the chosen
-    distribution.
-
-    Args:
-        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when a `future_values` is provided):
-            Distributional loss.
-        params (`torch.FloatTensor` of shape `(batch_size, num_samples, num_params)`):
-            Parameters of the chosen distribution.
-        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
-            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
-            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
-
-            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
-            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
-        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
-            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
-        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
-            self-attention heads.
-        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
-            weighted average in the cross-attention heads.
-        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-            Sequence of hidden-states at the output of the last layer of the encoder of the model.
-        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
-            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
-        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
-            self-attention heads.
-        scale: (`torch.FloatTensor` of shape `(batch_size,)`, *optional*):
-            Scaling values of each time series' context window which is used to give the model inputs of the same
-            magnitude and then used to rescale to the original scale.
-        static_features: (`torch.FloatTensor` of shape `(batch_size, feature size)`, *optional*):
-            Static features of each time series' in a batch which are copied to the covariates at inference time.
-    """
-
-    loss: Optional[torch.FloatTensor] = None
-    params: Optional[Tuple[torch.FloatTensor]] = None
-    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
-    decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
-    decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
-    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
-    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
-    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
-    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
-    scale: Optional[torch.FloatTensor] = None
-    static_features: Optional[torch.FloatTensor] = None
-
-
-@dataclass
-class SampleTimeSeriesPredictionOutput(ModelOutput):
-    sequences: torch.FloatTensor = None
-
-
-# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->Informer
-class InformerAttention(nn.Module):
-    """Multi-headed attention from 'Attention Is All You Need' paper"""
-
+class ProbAttention(nn.Module):
     def __init__(
         self,
-        embed_dim: int,
-        num_heads: int,
-        dropout: float = 0.0,
-        is_decoder: bool = False,
-        bias: bool = True,
+        mask_flag=True,
+        factor=5,
+        scale=None,
+        attention_dropout=0.1,
+        output_attention=False,
     ):
-        super().__init__()
-        self.embed_dim = embed_dim
-        self.num_heads = num_heads
-        self.dropout = dropout
-        self.head_dim = embed_dim // num_heads
-
-        if (self.head_dim * num_heads) != self.embed_dim:
-            raise ValueError(
-                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
-                f" and `num_heads`: {num_heads})."
-            )
-        self.scaling = self.head_dim**-0.5
-        self.is_decoder = is_decoder
-
-        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
-        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
-        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
-        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
-
-    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
-        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        key_value_states: Optional[torch.Tensor] = None,
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        layer_head_mask: Optional[torch.Tensor] = None,
-        output_attentions: bool = False,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-        """Input shape: Batch x Time x Channel"""
-
-        # if key_value_states are provided this layer is used as a cross-attention layer
-        # for the decoder
-        is_cross_attention = key_value_states is not None
-
-        bsz, tgt_len, _ = hidden_states.size()
-
-        # get query proj
-        query_states = self.q_proj(hidden_states) * self.scaling
-        # get key, value proj
-        # `past_key_value[0].shape[2] == key_value_states.shape[1]`
-        # is checking that the `sequence_length` of the `past_key_value` is the same as
-        # the provided `key_value_states` to support prefix tuning
-        if (
-            is_cross_attention
-            and past_key_value is not None
-            and past_key_value[0].shape[2] == key_value_states.shape[1]
-        ):
-            # reuse k,v, cross_attentions
-            key_states = past_key_value[0]
-            value_states = past_key_value[1]
-        elif is_cross_attention:
-            # cross_attentions
-            key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
-            value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
-        elif past_key_value is not None:
-            # reuse k, v, self_attention
-            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
-            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
-            key_states = torch.cat([past_key_value[0], key_states], dim=2)
-            value_states = torch.cat([past_key_value[1], value_states], dim=2)
-        else:
-            # self_attention
-            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
-            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
-
-        if self.is_decoder:
-            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
-            # Further calls to cross_attention layer can then reuse all cross-attention
-            # key/value_states (first "if" case)
-            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
-            # all previous decoder key/value_states. Further calls to uni-directional self-attention
-            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
-            # if encoder bi-directional self-attention `past_key_value` is always `None`
-            past_key_value = (key_states, value_states)
-
-        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
-        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
-        key_states = key_states.view(*proj_shape)
-        value_states = value_states.view(*proj_shape)
-
-        src_len = key_states.size(1)
-        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
-
-        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
-            raise ValueError(
-                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
-                f" {attn_weights.size()}"
-            )
-
-        if attention_mask is not None:
-            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
-                raise ValueError(
-                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
-                )
-            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
-            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
-
-        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+        super(ProbAttention, self).__init__()
+        self.factor = factor
+        self.scale = scale
+        self.mask_flag = mask_flag
+        self.output_attention = output_attention
+        self.dropout = nn.Dropout(attention_dropout)
+
+    def _prob_QK(self, Q, K, sample_k, n_top):  # n_top: c*ln(L_q)
+        # Q [B, H, L, D]
+        B, H, L_K, E = K.shape
+        _, _, L_Q, _ = Q.shape
+
+        # calculate the sampled Q_K
+        K_expand = K.unsqueeze(-3).expand(B, H, L_Q, L_K, E)
+        index_sample = torch.randint(
+            L_K, (L_Q, sample_k)
+        )  # real U = U_part(factor*ln(L_k))*L_q
+        K_sample = K_expand[:, :, torch.arange(L_Q).unsqueeze(1), index_sample, :]
+        Q_K_sample = torch.matmul(Q.unsqueeze(-2), K_sample.transpose(-2, -1)).squeeze(
+            -2
+        )
 
-        if layer_head_mask is not None:
-            if layer_head_mask.size() != (self.num_heads,):
-                raise ValueError(
-                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
-                    f" {layer_head_mask.size()}"
-                )
-            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
-            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
-
-        if output_attentions:
-            # this operation is a bit awkward, but it's required to
-            # make sure that attn_weights keeps its gradient.
-            # In order to do so, attn_weights have to be reshaped
-            # twice and have to be reused in the following
-            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
-            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
+        # find the Top_k query with sparsity measurement
+        M = Q_K_sample.max(-1)[0] - torch.div(Q_K_sample.sum(-1), L_K)
+        M_top = M.topk(n_top, sorted=False)[1]
+
+        # use the reduced Q to calculate Q_K
+        Q_reduce = Q[
+            torch.arange(B)[:, None, None], torch.arange(H)[None, :, None], M_top, :
+        ]  # factor*ln(L_q)
+        Q_K = torch.matmul(Q_reduce, K.transpose(-2, -1))  # factor*ln(L_q)*L_k
+
+        return Q_K, M_top
+
+    def _get_initial_context(self, V, L_Q):
+        B, H, L_V, D = V.shape
+        if not self.mask_flag:
+            # V_sum = V.sum(dim=-2)
+            V_sum = V.mean(dim=-2)
+            contex = V_sum.unsqueeze(-2).expand(B, H, L_Q, V_sum.shape[-1]).clone()
+        else:  # use mask
+            assert L_Q == L_V  # requires that L_Q == L_V, i.e. for self-attention only
+            contex = V.cumsum(dim=-2)
+        return contex
+
+    def _update_context(self, context_in, V, scores, index, L_Q, attn_mask):
+        B, H, L_V, D = V.shape
+
+        if self.mask_flag:
+            attn_mask = ProbMask(B, H, L_Q, index, scores, device=V.device)
+            scores.masked_fill_(attn_mask.mask, -np.inf)
+
+        attn = torch.softmax(scores, dim=-1)  # nn.Softmax(dim=-1)(scores)
+
+        context_in[
+            torch.arange(B)[:, None, None], torch.arange(H)[None, :, None], index, :
+        ] = torch.matmul(attn, V).type_as(context_in)
+        if self.output_attention:
+            attns = (torch.ones([B, H, L_V, L_V]) / L_V).type_as(attn).to(attn.device)
+            attns[
+                torch.arange(B)[:, None, None], torch.arange(H)[None, :, None], index, :
+            ] = attn
+            return (context_in, attns)
         else:
-            attn_weights_reshaped = None
+            return (context_in, None)
 
-        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+    def forward(self, queries, keys, values, attn_mask):
+        B, L_Q, H, D = queries.shape
+        _, L_K, _, _ = keys.shape
 
-        attn_output = torch.bmm(attn_probs, value_states)
+        queries = queries.transpose(2, 1)
+        keys = keys.transpose(2, 1)
+        values = values.transpose(2, 1)
 
-        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
-            raise ValueError(
-                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
-                f" {attn_output.size()}"
-            )
+        U_part = self.factor * np.ceil(np.log1p(L_K)).astype("int").item()  # c*ln(L_k)
+        u = self.factor * np.ceil(np.log1p(L_Q)).astype("int").item()  # c*ln(L_q)
 
-        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
-        attn_output = attn_output.transpose(1, 2)
+        U_part = U_part if U_part < L_K else L_K
+        u = u if u < L_Q else L_Q
 
-        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
-        # partitioned aross GPUs when using tensor-parallelism.
-        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
+        scores_top, index = self._prob_QK(queries, keys, sample_k=U_part, n_top=u)
 
-        attn_output = self.out_proj(attn_output)
+        # add scale factor
+        scale = self.scale or 1.0 / sqrt(D)
+        if scale is not None:
+            scores_top = scores_top * scale
+        # get the context
+        context = self._get_initial_context(values, L_Q)
+        # update the context with selected top_k queries
+        context, attn = self._update_context(
+            context, values, scores_top, index, L_Q, attn_mask
+        )
 
-        return attn_output, attn_weights_reshaped, past_key_value
+        return context.transpose(2, 1).contiguous(), attn
 
 
-# Copied from transformers.models.bart.modeling_bart.BartEncoderLayer with Bart->Informer
-class InformerEncoderLayer(nn.Module):
-    def __init__(self, config: InformerConfig):
-        super().__init__()
-        self.embed_dim = config.d_model
-        self.self_attn = InformerAttention(
-            embed_dim=self.embed_dim,
-            num_heads=config.encoder_attention_heads,
-            dropout=config.attention_dropout,
+class AttentionLayer(nn.Module):
+    def __init__(
+        self, attention, d_model, n_heads, d_keys=None, d_values=None, mix=False
+    ):
+        super(AttentionLayer, self).__init__()
+
+        d_keys = d_keys or (d_model // n_heads)
+        d_values = d_values or (d_model // n_heads)
+
+        self.inner_attention = attention
+        self.query_projection = nn.Linear(d_model, d_keys * n_heads)
+        self.key_projection = nn.Linear(d_model, d_keys * n_heads)
+        self.value_projection = nn.Linear(d_model, d_values * n_heads)
+        self.out_projection = nn.Linear(d_values * n_heads, d_model)
+        self.n_heads = n_heads
+        self.mix = mix
+
+    def forward(self, queries, keys, values, attn_mask):
+        B, L, _ = queries.shape
+        _, S, _ = keys.shape
+        H = self.n_heads
+
+        queries = self.query_projection(queries).view(B, L, H, -1)
+        keys = self.key_projection(keys).view(B, S, H, -1)
+        values = self.value_projection(values).view(B, S, H, -1)
+
+        out, attn = self.inner_attention(queries, keys, values, attn_mask)
+        if self.mix:
+            out = out.transpose(2, 1).contiguous()
+        out = out.view(B, L, -1)
+
+        return self.out_projection(out), attn
+
+
+class ConvLayer(nn.Module):
+    def __init__(self, c_in):
+        super(ConvLayer, self).__init__()
+        self.downConv = nn.Conv1d(
+            in_channels=c_in,
+            out_channels=c_in,
+            kernel_size=3,
+            padding=1,
+            padding_mode="circular",
         )
-        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
-        self.dropout = config.dropout
-        self.activation_fn = ACT2FN[config.activation_function]
-        self.activation_dropout = config.activation_dropout
-        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
-        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
-        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
-
-    def forward(
-        self,
-        hidden_states: torch.FloatTensor,
-        attention_mask: torch.FloatTensor,
-        layer_head_mask: torch.FloatTensor,
-        output_attentions: Optional[bool] = False,
-    ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]:
-        """
-        Args:
-            hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
-            attention_mask (`torch.FloatTensor`): attention mask of size
-                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
-            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
-                `(encoder_attention_heads,)`.
-            output_attentions (`bool`, *optional*):
-                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
-                returned tensors for more detail.
-        """
-        residual = hidden_states
-        hidden_states, attn_weights, _ = self.self_attn(
-            hidden_states=hidden_states,
-            attention_mask=attention_mask,
-            layer_head_mask=layer_head_mask,
-            output_attentions=output_attentions,
+        self.norm = nn.BatchNorm1d(c_in)
+        self.activation = nn.ELU()
+        self.maxPool = nn.MaxPool1d(kernel_size=3, stride=2, padding=1)
+
+    def forward(self, x):
+        x = self.downConv(x.permute(0, 2, 1))
+        x = self.norm(x)
+        x = self.activation(x)
+        x = self.maxPool(x)
+        x = x.transpose(1, 2)
+        return x
+
+
+class EncoderLayer(nn.Module):
+    def __init__(self, attention, d_model, d_ff=None, dropout=0.1, activation="relu"):
+        super(EncoderLayer, self).__init__()
+        d_ff = d_ff or 4 * d_model
+        self.attention = attention
+        self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1)
+        self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1)
+        self.norm1 = nn.LayerNorm(d_model)
+        self.norm2 = nn.LayerNorm(d_model)
+        self.dropout = nn.Dropout(dropout)
+        self.activation = F.relu if activation == "relu" else F.gelu
+
+    def forward(self, x, attn_mask=None):
+        # x [B, L, D]
+        # x = x + self.dropout(self.attention(
+        #     x, x, x,
+        #     attn_mask = attn_mask
+        # ))
+        new_x, attn = self.attention(x, x, x, attn_mask=attn_mask)
+        x = x + self.dropout(new_x)
+
+        y = x = self.norm1(x)
+        y = self.dropout(self.activation(self.conv1(y.transpose(-1, 1))))
+        y = self.dropout(self.conv2(y).transpose(-1, 1))
+
+        return self.norm2(x + y), attn
+
+
+class Encoder(nn.Module):
+    def __init__(self, attn_layers, conv_layers=None, norm_layer=None):
+        super(Encoder, self).__init__()
+        self.attn_layers = nn.ModuleList(attn_layers)
+        self.conv_layers = (
+            nn.ModuleList(conv_layers) if conv_layers is not None else None
         )
-        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
-        hidden_states = residual + hidden_states
-        hidden_states = self.self_attn_layer_norm(hidden_states)
-
-        residual = hidden_states
-        hidden_states = self.activation_fn(self.fc1(hidden_states))
-        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
-        hidden_states = self.fc2(hidden_states)
-        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
-        hidden_states = residual + hidden_states
-        hidden_states = self.final_layer_norm(hidden_states)
-
-        if hidden_states.dtype == torch.float16 and (
-            torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
-        ):
-            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
-            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
-
-        outputs = (hidden_states,)
-
-        if output_attentions:
-            outputs += (attn_weights,)
-
-        return outputs
+        self.norm = norm_layer
+
+    def forward(self, x, attn_mask=None):
+        # x [B, L, D]
+        attns = []
+        if self.conv_layers is not None:
+            for attn_layer, conv_layer in zip(self.attn_layers, self.conv_layers):
+                x, attn = attn_layer(x, attn_mask=attn_mask)
+                x = conv_layer(x)
+                attns.append(attn)
+            x, attn = self.attn_layers[-1](x, attn_mask=attn_mask)
+            attns.append(attn)
+        else:
+            for attn_layer in self.attn_layers:
+                x, attn = attn_layer(x, attn_mask=attn_mask)
+                attns.append(attn)
 
+        if self.norm is not None:
+            x = self.norm(x)
 
-# Copied from transformers.models.bart.modeling_bart.BartDecoderLayer with Bart->Informer
-class InformerDecoderLayer(nn.Module):
-    def __init__(self, config: InformerConfig):
-        super().__init__()
-        self.embed_dim = config.d_model
+        return x, attns
 
-        self.self_attn = InformerAttention(
-            embed_dim=self.embed_dim,
-            num_heads=config.decoder_attention_heads,
-            dropout=config.attention_dropout,
-            is_decoder=True,
-        )
-        self.dropout = config.dropout
-        self.activation_fn = ACT2FN[config.activation_function]
-        self.activation_dropout = config.activation_dropout
-
-        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
-        self.encoder_attn = InformerAttention(
-            self.embed_dim,
-            config.decoder_attention_heads,
-            dropout=config.attention_dropout,
-            is_decoder=True,
-        )
-        self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
-        self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
-        self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
-        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
 
-    def forward(
+class DecoderLayer(nn.Module):
+    def __init__(
         self,
-        hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        encoder_hidden_states: Optional[torch.Tensor] = None,
-        encoder_attention_mask: Optional[torch.Tensor] = None,
-        layer_head_mask: Optional[torch.Tensor] = None,
-        cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
-        output_attentions: Optional[bool] = False,
-        use_cache: Optional[bool] = True,
-    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
-        """
-        Args:
-            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
-            attention_mask (`torch.FloatTensor`): attention mask of size
-                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
-            encoder_hidden_states (`torch.FloatTensor`):
-                cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
-            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
-                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
-            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
-                `(encoder_attention_heads,)`.
-            cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
-                size `(decoder_attention_heads,)`.
-            past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
-            output_attentions (`bool`, *optional*):
-                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
-                returned tensors for more detail.
-        """
-        residual = hidden_states
-
-        # Self Attention
-        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
-        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
-        # add present self-attn cache to positions 1,2 of present_key_value tuple
-        hidden_states, self_attn_weights, present_key_value = self.self_attn(
-            hidden_states=hidden_states,
-            past_key_value=self_attn_past_key_value,
-            attention_mask=attention_mask,
-            layer_head_mask=layer_head_mask,
-            output_attentions=output_attentions,
+        self_attention,
+        cross_attention,
+        d_model,
+        d_ff=None,
+        dropout=0.1,
+        activation="relu",
+    ):
+        super(DecoderLayer, self).__init__()
+        d_ff = d_ff or 4 * d_model
+        self.self_attention = self_attention
+        self.cross_attention = cross_attention
+        self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1)
+        self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1)
+        self.norm1 = nn.LayerNorm(d_model)
+        self.norm2 = nn.LayerNorm(d_model)
+        self.norm3 = nn.LayerNorm(d_model)
+        self.dropout = nn.Dropout(dropout)
+        self.activation = F.relu if activation == "relu" else F.gelu
+
+    def forward(self, x, cross, x_mask=None, cross_mask=None):
+        x = x + self.dropout(self.self_attention(x, x, x, attn_mask=x_mask)[0])
+        x = self.norm1(x)
+
+        x = x + self.dropout(
+            self.cross_attention(x, cross, cross, attn_mask=cross_mask)[0]
         )
-        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
-        hidden_states = residual + hidden_states
-        hidden_states = self.self_attn_layer_norm(hidden_states)
-
-        # Cross-Attention Block
-        cross_attn_present_key_value = None
-        cross_attn_weights = None
-        if encoder_hidden_states is not None:
-            residual = hidden_states
-
-            # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple
-            cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
-            hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn(
-                hidden_states=hidden_states,
-                key_value_states=encoder_hidden_states,
-                attention_mask=encoder_attention_mask,
-                layer_head_mask=cross_attn_layer_head_mask,
-                past_key_value=cross_attn_past_key_value,
-                output_attentions=output_attentions,
-            )
-            hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
-            hidden_states = residual + hidden_states
-            hidden_states = self.encoder_attn_layer_norm(hidden_states)
-
-            # add cross-attn to positions 3,4 of present_key_value tuple
-            present_key_value = present_key_value + cross_attn_present_key_value
-
-        # Fully Connected
-        residual = hidden_states
-        hidden_states = self.activation_fn(self.fc1(hidden_states))
-        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
-        hidden_states = self.fc2(hidden_states)
-        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
-        hidden_states = residual + hidden_states
-        hidden_states = self.final_layer_norm(hidden_states)
-
-        outputs = (hidden_states,)
-
-        if output_attentions:
-            outputs += (self_attn_weights, cross_attn_weights)
-
-        if use_cache:
-            outputs += (present_key_value,)
-
-        return outputs
-
-
-# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerPreTrainedModel with TimeSeriesTransformer->Informer
-class InformerPreTrainedModel(PreTrainedModel):
-    config_class = InformerConfig
-    base_model_prefix = "model"
-    main_input_name = "past_values"
-    supports_gradient_checkpointing = True
-
-    def _init_weights(self, module):
-        std = self.config.init_std
-        if isinstance(module, nn.Linear):
-            module.weight.data.normal_(mean=0.0, std=std)
-            if module.bias is not None:
-                module.bias.data.zero_()
-        elif isinstance(module, nn.Embedding):
-            module.weight.data.normal_(mean=0.0, std=std)
-            if module.padding_idx is not None:
-                module.weight.data[module.padding_idx].zero_()
-
-    def _set_gradient_checkpointing(self, module, value=False):
-        if isinstance(module, (InformerDecoder, InformerEncoder)):
-            module.gradient_checkpointing = value
-
-
-INFORMER_START_DOCSTRING = r"""
-    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
-    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
-    etc.)
-
-    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
-    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
-    and behavior.
 
-    Parameters:
-        config ([`InformerConfig`]):
-            Model configuration class with all the parameters of the model. Initializing with a config file does not
-            load the weights associated with the model, only the configuration. Check out the
-            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
-"""
+        y = x = self.norm2(x)
+        y = self.dropout(self.activation(self.conv1(y.transpose(-1, 1))))
+        y = self.dropout(self.conv2(y).transpose(-1, 1))
 
-INFORMER_INPUTS_DOCSTRING = r"""
-    Args:
-        past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
-            Past values of the time series, that serve as context in order to predict the future. These values may
-            contain lags, i.e. additional values from the past which are added in order to serve as "extra context".
-            The `past_values` is what the Transformer encoder gets as input (with optional additional features, such as
-            `static_categorical_features`, `static_real_features`, `past_time_features`).
+        return self.norm3(x + y)
 
-            The sequence length here is equal to `context_length` + `max(config.lags_sequence)`.
 
-            Missing values need to be replaced with zeros.
+class Decoder(nn.Module):
+    def __init__(self, layers, norm_layer=None):
+        super(Decoder, self).__init__()
+        self.layers = nn.ModuleList(layers)
+        self.norm = norm_layer
 
-        past_time_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_features)`, *optional*):
-            Optional time features, which the model internally will add to `past_values`. These could be things like
-            "month of year", "day of the month", etc. encoded as vectors (for instance as Fourier features). These
-            could also be so-called "age" features, which basically help the model know "at which point in life" a
-            time-series is. Age features have small values for distant past time steps and increase monotonically the
-            more we approach the current time step.
+    def forward(self, x, cross, x_mask=None, cross_mask=None):
+        for layer in self.layers:
+            x = layer(x, cross, x_mask=x_mask, cross_mask=cross_mask)
 
-            These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT, where
-            the position encodings are learned from scratch internally as parameters of the model, the Time Series
-            Transformer requires to provide additional time features.
+        if self.norm is not None:
+            x = self.norm(x)
 
-            The Informer only learns additional embeddings for `static_categorical_features`.
+        return x
 
-        past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected in
-            `[0, 1]`:
 
-            - 1 for values that are **observed**,
-            - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
-
-        static_categorical_features (`torch.LongTensor` of shape `(batch_size, number of static categorical features)`, *optional*):
-            Optional static categorical features for which the model will learn an embedding, which it will add to the
-            values of the time series.
-
-            Static categorical features are features which have the same value for all time steps (static over time).
-
-            A typical example of a static categorical feature is a time series ID.
-
-        static_real_features (`torch.FloatTensor` of shape `(batch_size, number of static real features)`, *optional*):
-            Optional static real features which the model will add to the values of the time series.
-
-            Static real features are features which have the same value for all time steps (static over time).
-
-            A typical example of a static real feature is promotion information.
-
-        future_values (`torch.FloatTensor` of shape `(batch_size, prediction_length)`):
-            Future values of the time series, that serve as labels for the model. The `future_values` is what the
-            Transformer needs to learn to output, given the `past_values`.
-
-            See the demo notebook and code snippets for details.
-
-            Missing values need to be replaced with zeros.
-
-        future_time_features (`torch.FloatTensor` of shape `(batch_size, prediction_length, num_features)`, *optional*):
-            Optional time features, which the model internally will add to `future_values`. These could be things like
-            "month of year", "day of the month", etc. encoded as vectors (for instance as Fourier features). These
-            could also be so-called "age" features, which basically help the model know "at which point in life" a
-            time-series is. Age features have small values for distant past time steps and increase monotonically the
-            more we approach the current time step.
-
-            These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT, where
-            the position encodings are learned from scratch internally as parameters of the model, the Time Series
-            Transformer requires to provide additional features.
-
-            The Informer only learns additional embeddings for `static_categorical_features`.
-
-        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Mask to avoid performing attention on certain token indices. Mask values selected in `[0, 1]`:
-
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
-
-            [What are attention masks?](../glossary#attention-mask)
-
-        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
-            Mask to avoid performing attention on certain token indices. By default, a causal mask will be used, to
-            make sure the model can only look at previous inputs in order to predict the future.
-
-        head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
-            Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
-
-            - 1 indicates the head is **not masked**,
-            - 0 indicates the head is **masked**.
-
-        decoder_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
-            Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:
-
-            - 1 indicates the head is **not masked**,
-            - 0 indicates the head is **masked**.
-
-        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
-            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
-
-            - 1 indicates the head is **not masked**,
-            - 0 indicates the head is **masked**.
-
-        encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
-            Tuple consists of `last_hidden_state`, `hidden_states` (*optional*) and `attentions` (*optional*)
-            `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` (*optional*) is a sequence of
-            hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
-        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
-            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
-            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
-
-            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
-            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
-
-            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
-            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
-            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
-        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
-            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
-            model's internal embedding lookup matrix.
-
-        use_cache (`bool`, *optional*):
-            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
-            `past_key_values`).
-        output_attentions (`bool`, *optional*):
-            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
-            tensors for more detail.
-        output_hidden_states (`bool`, *optional*):
-            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
-            more detail.
-        return_dict (`bool`, *optional*):
-            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-"""
-
-
-# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerEncoder with TimeSeriesTransformer->Informer
-class InformerEncoder(InformerPreTrainedModel):
-    """
-    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
-    [`InformerEncoderLayer`].
-
-    Args:
-        config: InformerConfig
-    """
-
-    def __init__(self, config: InformerConfig):
-        super().__init__(config)
-
-        self.dropout = config.dropout
-        self.layerdrop = config.encoder_layerdrop
-
-        embed_dim = config.d_model
-
-        self.layers = nn.ModuleList([InformerEncoderLayer(config) for _ in range(config.encoder_layers)])
-        self.layernorm_embedding = nn.LayerNorm(embed_dim)
-
-        self.gradient_checkpointing = False
-        # Initialize weights and apply final processing
-        self.post_init()
-
-    def forward(
+class InformerModel(nn.Module):
+    def __init__(  # add loss param
         self,
-        attention_mask: Optional[torch.Tensor] = None,
-        head_mask: Optional[torch.Tensor] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, BaseModelOutput]:
-        r"""
-        Args:
-            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
-                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
-                - 1 for tokens that are **not masked**,
-                - 0 for tokens that are **masked**.
-
-                [What are attention masks?](../glossary#attention-mask)
-            head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
-                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
-
-                - 1 indicates the head is **not masked**,
-                - 0 indicates the head is **masked**.
-
-            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
-                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
-                than the model's internal embedding lookup matrix.
-            output_attentions (`bool`, *optional*):
-                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
-                returned tensors for more detail.
-            output_hidden_states (`bool`, *optional*):
-                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
-                for more detail.
-            return_dict (`bool`, *optional*):
-                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-        """
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        hidden_states = inputs_embeds
-        hidden_states = self.layernorm_embedding(hidden_states)
-        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
-
-        # expand attention_mask
-        if attention_mask is not None:
-            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
-            attention_mask = _expand_mask(attention_mask, inputs_embeds.dtype)
-
-        encoder_states = () if output_hidden_states else None
-        all_attentions = () if output_attentions else None
-
-        # check if head_mask has a correct number of layers specified if desired
-        if head_mask is not None:
-            if head_mask.size()[0] != (len(self.layers)):
-                raise ValueError(
-                    f"The head_mask should be specified for {len(self.layers)} layers, but it is for"
-                    f" {head_mask.size()[0]}."
-                )
-
-        for idx, encoder_layer in enumerate(self.layers):
-            if output_hidden_states:
-                encoder_states = encoder_states + (hidden_states,)
-            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
-            dropout_probability = random.uniform(0, 1)
-            if self.training and (dropout_probability < self.layerdrop):  # skip the layer
-                layer_outputs = (None, None)
-            else:
-                if self.gradient_checkpointing and self.training:
-
-                    def create_custom_forward(module):
-                        def custom_forward(*inputs):
-                            return module(*inputs, output_attentions)
-
-                        return custom_forward
-
-                    layer_outputs = torch.utils.checkpoint.checkpoint(
-                        create_custom_forward(encoder_layer),
-                        hidden_states,
-                        attention_mask,
-                        (head_mask[idx] if head_mask is not None else None),
-                    )
-                else:
-                    layer_outputs = encoder_layer(
-                        hidden_states,
-                        attention_mask,
-                        layer_head_mask=(head_mask[idx] if head_mask is not None else None),
-                        output_attentions=output_attentions,
-                    )
-
-                hidden_states = layer_outputs[0]
-
-            if output_attentions:
-                all_attentions = all_attentions + (layer_outputs[1],)
-
-        if output_hidden_states:
-            encoder_states = encoder_states + (hidden_states,)
-
-        if not return_dict:
-            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
-        return BaseModelOutput(
-            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
-        )
-
-
-# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerDecoder with TimeSeriesTransformer->Informer
-class InformerDecoder(InformerPreTrainedModel):
-    """
-    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a
-    [`InformerDecoderLayer`]
-
-    Args:
-        config: InformerConfig
-    """
-
-    def __init__(self, config: InformerConfig):
-        super().__init__(config)
-        self.dropout = config.dropout
-        self.layerdrop = config.decoder_layerdrop
-
-        self.layers = nn.ModuleList([InformerDecoderLayer(config) for _ in range(config.decoder_layers)])
-        self.layernorm_embedding = nn.LayerNorm(config.d_model)
-
-        self.gradient_checkpointing = False
-        # Initialize weights and apply final processing
-        self.post_init()
-
-    def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
-        # create causal mask
-        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
-        combined_attention_mask = None
-        if input_shape[-1] > 1:
-            combined_attention_mask = _make_causal_mask(
-                input_shape, inputs_embeds.dtype, past_key_values_length=past_key_values_length
-            ).to(inputs_embeds.device)
-
-        if attention_mask is not None:
-            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
-            expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
-                inputs_embeds.device
-            )
-            combined_attention_mask = (
-                expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
-            )
+        freq: str, # frequency
+        context_length: int,
+        prediction_length: int,
+        num_feat_dynamic_real: int,  # num_dynamic_real_features
+        num_feat_static_real: int,  # num_static_real_features
+        num_feat_static_cat: int,  # num_static_categorical_features
+        cardinality: List[int],
+        # Informer arguments
+        nhead: int,
+        num_encoder_layers: int, # encoder_layers
+        num_decoder_layers: int, # decoder_layers
+        dim_feedforward: int,
+        activation: str = "gelu", # activation_function
+        dropout: float = 0.1,
+        attn: str = "prob",
+        factor: int = 5,
+        distil: bool = True,
+        # univariate input
+        input_size: int = 1,
+        embedding_dimension: Optional[List[int]] = None,
+        distr_output: DistributionOutput = StudentTOutput(),
+        lags_seq: Optional[List[int]] = None,
+        scaling: bool = True,
+        num_parallel_samples: int = 100,
+    ) -> None:
+        super().__init__()
 
-        return combined_attention_mask
+        self.input_size = input_size
 
-    def forward(
-        self,
-        attention_mask: Optional[torch.Tensor] = None,
-        encoder_hidden_states: Optional[torch.FloatTensor] = None,
-        encoder_attention_mask: Optional[torch.LongTensor] = None,
-        head_mask: Optional[torch.Tensor] = None,
-        cross_attn_head_mask: Optional[torch.Tensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
-        r"""
-        Args:
-            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
-                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
-                - 1 for tokens that are **not masked**,
-                - 0 for tokens that are **masked**.
-
-                [What are attention masks?](../glossary#attention-mask)
-            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
-                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
-                of the decoder.
-            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
-                Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
-                selected in `[0, 1]`:
-
-                - 1 for tokens that are **not masked**,
-                - 0 for tokens that are **masked**.
-
-                [What are attention masks?](../glossary#attention-mask)
-            head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
-                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
-
-                - 1 indicates the head is **not masked**,
-                - 0 indicates the head is **masked**.
-
-            cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
-                Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing
-                cross-attention on hidden heads. Mask values selected in `[0, 1]`:
-
-                - 1 indicates the head is **not masked**,
-                - 0 indicates the head is **masked**.
-
-            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-                Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
-                shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
-                shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
-
-                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
-                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
-
-                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
-                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
-                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
-
-            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
-                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
-                than the model's internal embedding lookup matrix.
-
-            output_attentions (`bool`, *optional*):
-                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
-                returned tensors for more detail.
-
-            output_hidden_states (`bool`, *optional*):
-                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
-                for more detail.
-
-            return_dict (`bool`, *optional*):
-                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-        """
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        self.target_shape = distr_output.event_shape
+        self.num_feat_dynamic_real = num_feat_dynamic_real
+        self.num_feat_static_cat = num_feat_static_cat
+        self.num_feat_static_real = num_feat_static_real
+        self.embedding_dimension = (
+            embedding_dimension
+            if embedding_dimension is not None or cardinality is None
+            else [min(50, (cat + 1) // 2) for cat in cardinality]
         )
-        use_cache = use_cache if use_cache is not None else self.config.use_cache
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        self.lags_seq = lags_seq or get_lags_for_frequency(freq_str=freq)
+        self.num_parallel_samples = num_parallel_samples
+        self.history_length = context_length + max(self.lags_seq)
+        self.embedder = FeatureEmbedder(
+            cardinalities=cardinality,
+            embedding_dims=self.embedding_dimension,
+        )
+        if scaling:
+            self.scaler = MeanScaler(dim=1, keepdim=True)
+        else:
+            self.scaler = NOPScaler(dim=1, keepdim=True)
 
-        input_shape = inputs_embeds.size()[:-1]
+        # total feature size
+        d_model = self.input_size * len(self.lags_seq) + self._number_of_features
 
-        # past_key_values_length
-        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
+        self.context_length = context_length
+        self.prediction_length = prediction_length
+        self.distr_output = distr_output
+        self.param_proj = distr_output.get_args_proj(d_model)
 
-        attention_mask = self._prepare_decoder_attention_mask(
-            attention_mask, input_shape, inputs_embeds, past_key_values_length
+        # Informer enc-decoder
+        Attn = ProbAttention if attn == "prob" else FullAttention
+        # Encoder
+        self.encoder = Encoder(
+            [
+                EncoderLayer(
+                    AttentionLayer(
+                        Attn(
+                            mask_flag=False,
+                            factor=factor,
+                            attention_dropout=dropout,
+                            output_attention=False,
+                        ),
+                        d_model,
+                        nhead,
+                        mix=False,
+                    ),
+                    d_model,
+                    d_ff=dim_feedforward,
+                    dropout=dropout,
+                    activation=activation,
+                )
+                for l in range(num_encoder_layers)
+            ],
+            [ConvLayer(d_model) for l in range(num_encoder_layers - 1)]
+            if distil
+            else None,
+            norm_layer=torch.nn.LayerNorm(d_model),
         )
 
-        # expand encoder attention mask
-        if encoder_hidden_states is not None and encoder_attention_mask is not None:
-            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
-            encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1])
-
-        hidden_states = inputs_embeds
-        hidden_states = self.layernorm_embedding(hidden_states)
-
-        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
-
-        # decoder layers
-        all_hidden_states = () if output_hidden_states else None
-        all_self_attns = () if output_attentions else None
-        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
-        next_decoder_cache = () if use_cache else None
-
-        # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
-        for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
-            if attn_mask is not None:
-                if attn_mask.size()[0] != (len(self.layers)):
-                    raise ValueError(
-                        f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
-                        f" {head_mask.size()[0]}."
-                    )
-
-        for idx, decoder_layer in enumerate(self.layers):
-            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
-            if output_hidden_states:
-                all_hidden_states += (hidden_states,)
-            dropout_probability = random.uniform(0, 1)
-            if self.training and (dropout_probability < self.layerdrop):
-                continue
-
-            past_key_value = past_key_values[idx] if past_key_values is not None else None
-
-            if self.gradient_checkpointing and self.training:
-
-                if use_cache:
-                    logger.warning(
-                        "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
-                    )
-                    use_cache = False
-
-                def create_custom_forward(module):
-                    def custom_forward(*inputs):
-                        # None for past_key_value
-                        return module(*inputs, output_attentions, use_cache)
-
-                    return custom_forward
-
-                layer_outputs = torch.utils.checkpoint.checkpoint(
-                    create_custom_forward(decoder_layer),
-                    hidden_states,
-                    attention_mask,
-                    encoder_hidden_states,
-                    encoder_attention_mask,
-                    head_mask[idx] if head_mask is not None else None,
-                    cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None,
-                    None,
-                )
-            else:
-
-                layer_outputs = decoder_layer(
-                    hidden_states,
-                    attention_mask=attention_mask,
-                    encoder_hidden_states=encoder_hidden_states,
-                    encoder_attention_mask=encoder_attention_mask,
-                    layer_head_mask=(head_mask[idx] if head_mask is not None else None),
-                    cross_attn_layer_head_mask=(
-                        cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None
+        # Masked Decoder
+        self.decoder = Decoder(
+            [
+                DecoderLayer(
+                    AttentionLayer(
+                        Attn(
+                            mask_flag=True,
+                            factor=factor,
+                            attention_dropout=dropout,
+                            output_attention=False,
+                        ),
+                        d_model,
+                        nhead,
+                        mix=True,
+                    ),
+                    AttentionLayer(
+                        FullAttention(
+                            mask_flag=False,
+                            factor=factor,
+                            attention_dropout=dropout,
+                            output_attention=False,
+                        ),
+                        d_model,
+                        nhead,
+                        mix=False,
                     ),
-                    past_key_value=past_key_value,
-                    output_attentions=output_attentions,
-                    use_cache=use_cache,
+                    d_model,
+                    d_ff=dim_feedforward,
+                    dropout=dropout,
+                    activation=activation,
                 )
-            hidden_states = layer_outputs[0]
-
-            if use_cache:
-                next_decoder_cache += (layer_outputs[3 if output_attentions else 1],)
-
-            if output_attentions:
-                all_self_attns += (layer_outputs[1],)
-
-                if encoder_hidden_states is not None:
-                    all_cross_attentions += (layer_outputs[2],)
-
-        # add hidden states from the last decoder layer
-        if output_hidden_states:
-            all_hidden_states += (hidden_states,)
-
-        next_cache = next_decoder_cache if use_cache else None
-        if not return_dict:
-            return tuple(
-                v
-                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions]
-                if v is not None
-            )
-        return BaseModelOutputWithPastAndCrossAttentions(
-            last_hidden_state=hidden_states,
-            past_key_values=next_cache,
-            hidden_states=all_hidden_states,
-            attentions=all_self_attns,
-            cross_attentions=all_cross_attentions,
+                for l in range(num_decoder_layers)
+            ],
+            norm_layer=torch.nn.LayerNorm(d_model),
         )
 
-
-@add_start_docstrings(
-    "The bare Informer Model outputting raw hidden-states without any specific head on top.",
-    INFORMER_START_DOCSTRING,
-)
-# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerModel with TimeSeriesTransformer->Informer,TIME_SERIES_TRANSFORMER->INFORMER
-class InformerModel(InformerPreTrainedModel):
-    def __init__(self, config: InformerConfig):
-        super().__init__(config)
-
-        if config.scaling:
-            self.scaler = MeanScaler(dim=1, keepdim=True)
-        else:
-            self.scaler = NOPScaler(dim=1, keepdim=True)
-
-        self.embedder = FeatureEmbedder(
-            cardinalities=config.cardinality,
-            embedding_dims=config.embedding_dimension,
+    @property
+    def _number_of_features(self) -> int:
+        return (
+            sum(self.embedding_dimension)
+            + self.num_feat_dynamic_real
+            + self.num_feat_static_real
+            + self.input_size  # the log(scale)
         )
 
-        # transformer encoder-decoder and mask initializer
-        self.encoder = InformerEncoder(config)
-        self.decoder = InformerDecoder(config)
-
-        # Initialize weights and apply final processing
-        self.post_init()
-
     @property
     def _past_length(self) -> int:
-        return self.config.context_length + max(self.config.lags_sequence)
+        return self.context_length + max(self.lags_seq)
 
     def get_lagged_subsequences(
         self, sequence: torch.Tensor, subsequences_length: int, shift: int = 0
     ) -> torch.Tensor:
         """
-        Returns lagged subsequences of a given sequence. Returns a tensor of shape (N, S, C, I),
-            where S = subsequences_length and I = len(indices), containing lagged subsequences. Specifically, lagged[i,
-            j, :, k] = sequence[i, -indices[k]-S+j, :].
-
-        Args:
-            sequence: Tensor
-                The sequence from which lagged subsequences should be extracted. Shape: (N, T, C).
-            subsequences_length : int
-                Length of the subsequences to be extracted.
-            shift: int
-                Shift the lags by this amount back.
+        Returns lagged subsequences of a given sequence.
+        Parameters
+        ----------
+        sequence : Tensor
+            the sequence from which lagged subsequences should be extracted.
+            Shape: (N, T, C).
+        subsequences_length : int
+            length of the subsequences to be extracted.
+        shift: int
+            shift the lags by this amount back.
+        Returns
+        --------
+        lagged : Tensor
+            a tensor of shape (N, S, C, I), where S = subsequences_length and
+            I = len(indices), containing lagged subsequences. Specifically,
+            lagged[i, j, :, k] = sequence[i, -indices[k]-S+j, :].
         """
         sequence_length = sequence.shape[1]
-        indices = [lag - shift for lag in self.config.lags_sequence]
+        indices = [lag - shift for lag in self.lags_seq]
 
-        try:
-            assert max(indices) + subsequences_length <= sequence_length, (
-                f"lags cannot go further than history length, found lag {max(indices)} "
-                f"while history length is only {sequence_length}"
-            )
-        except AssertionError as e:
-            e.args += (max(indices), sequence_length)
-            raise
+        assert max(indices) + subsequences_length <= sequence_length, (
+            f"lags cannot go further than history length, found lag {max(indices)} "
+            f"while history length is only {sequence_length}"
+        )
 
         lagged_values = []
         for lag_index in indices:
@@ -1486,470 +522,198 @@ def get_lagged_subsequences(
             lagged_values.append(sequence[:, begin_index:end_index, ...])
         return torch.stack(lagged_values, dim=-1)
 
+    def _check_shapes(
+        self,
+        prior_input: torch.Tensor,
+        inputs: torch.Tensor,
+        features: Optional[torch.Tensor],
+    ) -> None:
+        assert len(prior_input.shape) == len(inputs.shape)
+        assert (
+            len(prior_input.shape) == 2 and self.input_size == 1
+        ) or prior_input.shape[2] == self.input_size
+        assert (len(inputs.shape) == 2 and self.input_size == 1) or inputs.shape[
+            -1
+        ] == self.input_size
+        assert (
+            features is None or features.shape[2] == self._number_of_features
+        ), f"{features.shape[2]}, expected {self._number_of_features}"
+
     def create_network_inputs(
         self,
-        past_values: torch.Tensor,
-        past_time_features: torch.Tensor,
-        static_categorical_features: torch.Tensor,
-        static_real_features: torch.Tensor,
-        past_observed_mask: Optional[torch.Tensor] = None,
-        future_values: Optional[torch.Tensor] = None,
-        future_time_features: Optional[torch.Tensor] = None,
+        feat_static_cat: torch.Tensor,
+        feat_static_real: torch.Tensor,
+        past_time_feat: torch.Tensor,
+        past_target: torch.Tensor,
+        past_observed_values: torch.Tensor,
+        future_time_feat: Optional[torch.Tensor] = None,
+        future_target: Optional[torch.Tensor] = None,
     ):
         # time feature
         time_feat = (
             torch.cat(
                 (
-                    past_time_features[:, self._past_length - self.config.context_length :, ...],
-                    future_time_features,
+                    past_time_feat[:, self._past_length - self.context_length :, ...],
+                    future_time_feat,
                 ),
                 dim=1,
             )
-            if future_values is not None
-            else past_time_features[:, self._past_length - self.config.context_length :, ...]
+            if future_target is not None
+            else past_time_feat[:, self._past_length - self.context_length :, ...]
         )
 
         # target
-        if past_observed_mask is None:
-            past_observed_mask = torch.ones_like(past_values)
-
-        context = past_values[:, -self.config.context_length :]
-        observed_context = past_observed_mask[:, -self.config.context_length :]
+        context = past_target[:, -self.context_length :]
+        observed_context = past_observed_values[:, -self.context_length :]
         _, scale = self.scaler(context, observed_context)
 
         inputs = (
-            torch.cat((past_values, future_values), dim=1) / scale
-            if future_values is not None
-            else past_values / scale
+            torch.cat((past_target, future_target), dim=1) / scale
+            if future_target is not None
+            else past_target / scale
         )
 
         inputs_length = (
-            self._past_length + self.config.prediction_length if future_values is not None else self._past_length
+            self._past_length + self.prediction_length
+            if future_target is not None
+            else self._past_length
         )
-        try:
-            assert inputs.shape[1] == inputs_length, (
-                f"input length {inputs.shape[1]} and dynamic feature lengths {inputs_length} does not match",
-            )
-        except AssertionError as e:
-            e.args += (inputs.shape[1], inputs_length)
-            raise
+        assert inputs.shape[1] == inputs_length
 
         subsequences_length = (
-            self.config.context_length + self.config.prediction_length
-            if future_values is not None
-            else self.config.context_length
+            self.context_length + self.prediction_length
+            if future_target is not None
+            else self.context_length
         )
 
         # embeddings
-        embedded_cat = self.embedder(static_categorical_features)
-        # static features
-        log_scale = scale.log() if self.config.input_size == 1 else scale.squeeze(1).log()
-        static_feat = torch.cat((embedded_cat, static_real_features, log_scale), dim=1)
-        expanded_static_feat = static_feat.unsqueeze(1).expand(-1, time_feat.shape[1], -1)
-
-        # all features
-        features = torch.cat((expanded_static_feat, time_feat), dim=-1)
-
-        lagged_sequence = self.get_lagged_subsequences(sequence=inputs, subsequences_length=subsequences_length)
-
-        lags_shape = lagged_sequence.shape
-        reshaped_lagged_sequence = lagged_sequence.reshape(lags_shape[0], lags_shape[1], -1)
-
-        transformer_inputs = torch.cat((reshaped_lagged_sequence, features), dim=-1)
-
-        return transformer_inputs, scale, static_feat
-
-    def enc_dec_outputs(self, transformer_inputs):
-        enc_input = transformer_inputs[:, : self.config.context_length, ...]
-        dec_input = transformer_inputs[:, self.config.context_length :, ...]
-
-        encoder_outputs = self.encoder(inputs_embeds=enc_input)
-        decoder_outputs = self.decoder(
-            inputs_embeds=dec_input, encoder_hidden_states=encoder_outputs.last_hidden_state
+        embedded_cat = self.embedder(feat_static_cat)
+        log_scale = scale.log() if self.input_size == 1 else scale.squeeze(1).log()
+        static_feat = torch.cat(
+            (embedded_cat, feat_static_real, log_scale),
+            dim=1,
         )
-        return encoder_outputs, decoder_outputs
-
-    def get_encoder(self):
-        return self.encoder
-
-    def get_decoder(self):
-        return self.decoder
-
-    @add_start_docstrings_to_model_forward(INFORMER_INPUTS_DOCSTRING)
-    @replace_return_docstrings(output_type=Seq2SeqTimeSeriesModelOutput, config_class=_CONFIG_FOR_DOC)
-    def forward(
-        self,
-        past_values: torch.Tensor,
-        past_time_features: torch.Tensor,
-        past_observed_mask: torch.Tensor,
-        static_categorical_features: torch.Tensor,
-        static_real_features: torch.Tensor,
-        future_values: Optional[torch.Tensor] = None,
-        future_time_features: Optional[torch.Tensor] = None,
-        decoder_attention_mask: Optional[torch.LongTensor] = None,
-        head_mask: Optional[torch.Tensor] = None,
-        decoder_head_mask: Optional[torch.Tensor] = None,
-        cross_attn_head_mask: Optional[torch.Tensor] = None,
-        encoder_outputs: Optional[List[torch.FloatTensor]] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
-        output_hidden_states: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        use_cache: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Seq2SeqTimeSeriesModelOutput, Tuple]:
-        r"""
-        Returns:
-
-        Examples:
-
-        ```python
-        >>> from huggingface_hub import hf_hub_download
-        >>> import torch
-        >>> from transformers import InformerModel
-
-        >>> file = hf_hub_download(
-        ...     repo_id="kashif/tourism-monthly-batch", filename="train-batch.pt", repo_type="dataset"
-        ... )
-        >>> batch = torch.load(file)
-
-        >>> model = InformerModel.from_pretrained("huggingface/time-series-transformer-tourism-monthly")
-
-        >>> # during training, one provides both past and future values
-        >>> # as well as possible additional features
-        >>> outputs = model(
-        ...     past_values=batch["past_values"],
-        ...     past_time_features=batch["past_time_features"],
-        ...     past_observed_mask=batch["past_observed_mask"],
-        ...     static_categorical_features=batch["static_categorical_features"],
-        ...     static_real_features=batch["static_real_features"],
-        ...     future_values=batch["future_values"],
-        ...     future_time_features=batch["future_time_features"],
-        ... )
-
-        >>> last_hidden_state = outputs.last_hidden_state
-        ```"""
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        use_cache = use_cache if use_cache is not None else self.config.use_cache
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        transformer_inputs, scale, static_feat = self.create_network_inputs(
-            past_values=past_values,
-            past_time_features=past_time_features,
-            past_observed_mask=past_observed_mask,
-            static_categorical_features=static_categorical_features,
-            static_real_features=static_real_features,
-            future_values=future_values,
-            future_time_features=future_time_features,
+        expanded_static_feat = static_feat.unsqueeze(1).expand(
+            -1, time_feat.shape[1], -1
         )
 
-        if encoder_outputs is None:
-            enc_input = transformer_inputs[:, : self.config.context_length, ...]
-            encoder_outputs = self.encoder(
-                inputs_embeds=enc_input,
-                head_mask=head_mask,
-                output_attentions=output_attentions,
-                output_hidden_states=output_hidden_states,
-                return_dict=return_dict,
-            )
-        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
-        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
-            encoder_outputs = BaseModelOutput(
-                last_hidden_state=encoder_outputs[0],
-                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
-                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
-            )
+        features = torch.cat((expanded_static_feat, time_feat), dim=-1)
 
-        dec_input = transformer_inputs[:, self.config.context_length :, ...]
-        decoder_outputs = self.decoder(
-            inputs_embeds=dec_input,
-            attention_mask=decoder_attention_mask,
-            encoder_hidden_states=encoder_outputs[0],
-            head_mask=decoder_head_mask,
-            cross_attn_head_mask=cross_attn_head_mask,
-            past_key_values=past_key_values,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
+        # self._check_shapes(prior_input, inputs, features)
 
-        if not return_dict:
-            return decoder_outputs + encoder_outputs + (scale, static_feat)
-
-        return Seq2SeqTimeSeriesModelOutput(
-            last_hidden_state=decoder_outputs.last_hidden_state,
-            past_key_values=decoder_outputs.past_key_values,
-            decoder_hidden_states=decoder_outputs.hidden_states,
-            decoder_attentions=decoder_outputs.attentions,
-            cross_attentions=decoder_outputs.cross_attentions,
-            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
-            encoder_hidden_states=encoder_outputs.hidden_states,
-            encoder_attentions=encoder_outputs.attentions,
-            scale=scale,
-            static_features=static_feat,
+        # sequence = torch.cat((prior_input, inputs), dim=1)
+        lagged_sequence = self.get_lagged_subsequences(
+            sequence=inputs,
+            subsequences_length=subsequences_length,
         )
 
+        lags_shape = lagged_sequence.shape
+        reshaped_lagged_sequence = lagged_sequence.reshape(
+            lags_shape[0], lags_shape[1], -1
+        )
 
-@add_start_docstrings(
-    "The Informer Model with a distribution head on top for time-series forecasting.",
-    INFORMER_START_DOCSTRING,
-)
-# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerForPrediction with TimeSeriesTransformer->Informer,TIME_SERIES_TRANSFORMER->INFORMER
-class InformerForPrediction(InformerPreTrainedModel):
-    def __init__(self, config: InformerConfig):
-        super().__init__(config)
-        self.model = InformerModel(config)
-        if config.distribution_output == "student_t":
-            self.distribution_output = StudentTOutput(dim=config.input_size)
-        elif config.distribution_output == "normal":
-            self.distribution_output = NormalOutput(dim=config.input_size)
-        elif config.distribution_output == "negative_binomial":
-            self.distribution_output = NegativeBinomialOutput(dim=config.input_size)
-        else:
-            raise ValueError(f"Unknown distribution output {config.distribution_output}")
-
-        self.parameter_projection = self.distribution_output.get_parameter_projection(self.model.config.d_model)
-        self.target_shape = self.distribution_output.event_shape
-
-        if config.loss == "nll":
-            self.loss = NegativeLogLikelihood()
-        else:
-            raise ValueError(f"Unknown loss function {config.loss}")
+        transformer_inputs = torch.cat((reshaped_lagged_sequence, features), dim=-1)
 
-        # Initialize weights of distribution_output and apply final processing
-        self.post_init()
+        return transformer_inputs, scale, static_feat
 
-    def output_params(self, dec_output):
-        return self.parameter_projection(dec_output)
+    def output_params(self, transformer_inputs):
+        enc_input = transformer_inputs[:, : self.context_length, ...]
+        dec_input = transformer_inputs[:, self.context_length :, ...]
 
-    def get_encoder(self):
-        return self.model.get_encoder()
+        enc_out, _ = self.encoder(enc_input)
+        dec_output = self.decoder(dec_input, enc_out)
 
-    def get_decoder(self):
-        return self.model.get_decoder()
+        return self.param_proj(dec_output)
 
     @torch.jit.ignore
-    def output_distribution(self, params, scale=None, trailing_n=None) -> torch.distributions.Distribution:
+    def output_distribution(
+        self, params, scale=None, trailing_n=None
+    ) -> torch.distributions.Distribution:
         sliced_params = params
         if trailing_n is not None:
             sliced_params = [p[:, -trailing_n:] for p in params]
-        return self.distribution_output.distribution(sliced_params, scale=scale)
+        return self.distr_output.distribution(sliced_params, scale=scale)
 
-    @add_start_docstrings_to_model_forward(INFORMER_INPUTS_DOCSTRING)
-    @replace_return_docstrings(output_type=Seq2SeqTimeSeriesModelOutput, config_class=_CONFIG_FOR_DOC)
+    # for prediction
     def forward(
         self,
-        past_values: torch.Tensor,
-        past_time_features: torch.Tensor,
-        past_observed_mask: torch.Tensor,
-        static_categorical_features: torch.Tensor,
-        static_real_features: torch.Tensor,
-        future_values: Optional[torch.Tensor] = None,
-        future_time_features: Optional[torch.Tensor] = None,
-        future_observed_mask: Optional[torch.Tensor] = None,
-        decoder_attention_mask: Optional[torch.LongTensor] = None,
-        head_mask: Optional[torch.Tensor] = None,
-        decoder_head_mask: Optional[torch.Tensor] = None,
-        cross_attn_head_mask: Optional[torch.Tensor] = None,
-        encoder_outputs: Optional[List[torch.FloatTensor]] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
-        output_hidden_states: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        use_cache: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Seq2SeqTimeSeriesModelOutput, Tuple]:
-        r"""
-        Returns:
-
-        future_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Boolean mask to indicate which `future_values` were observed and which were missing. Mask values selected
-            in `[0, 1]`:
-
-            - 1 for values that are **observed**,
-            - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
-
-            This mask is used to filter out missing values for the final loss calculation.
-
-        Examples:
-
-        ```python
-        >>> from huggingface_hub import hf_hub_download
-        >>> import torch
-        >>> from transformers import InformerForPrediction
-
-        >>> file = hf_hub_download(
-        ...     repo_id="kashif/tourism-monthly-batch", filename="train-batch.pt", repo_type="dataset"
-        ... )
-        >>> batch = torch.load(file)
-
-        >>> model = InformerForPrediction.from_pretrained(
-        ...     "huggingface/time-series-transformer-tourism-monthly"
-        ... )
-
-        >>> # during training, one provides both past and future values
-        >>> # as well as possible additional features
-        >>> outputs = model(
-        ...     past_values=batch["past_values"],
-        ...     past_time_features=batch["past_time_features"],
-        ...     past_observed_mask=batch["past_observed_mask"],
-        ...     static_categorical_features=batch["static_categorical_features"],
-        ...     static_real_features=batch["static_real_features"],
-        ...     future_values=batch["future_values"],
-        ...     future_time_features=batch["future_time_features"],
-        ... )
-
-        >>> loss = outputs.loss
-        >>> loss.backward()
-
-        >>> # during inference, one only provides past values
-        >>> # as well as possible additional features
-        >>> # the model autoregressively generates future values
-        >>> outputs = model.generate(
-        ...     past_values=batch["past_values"],
-        ...     past_time_features=batch["past_time_features"],
-        ...     past_observed_mask=batch["past_observed_mask"],
-        ...     static_categorical_features=batch["static_categorical_features"],
-        ...     static_real_features=batch["static_real_features"],
-        ...     future_time_features=batch["future_time_features"],
-        ... )
-
-        >>> mean_prediction = outputs.sequences.mean(dim=1)
-        ```"""
-
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-        if future_values is not None:
-            use_cache = False
-
-        outputs = self.model(
-            past_values=past_values,
-            past_time_features=past_time_features,
-            past_observed_mask=past_observed_mask,
-            static_categorical_features=static_categorical_features,
-            static_real_features=static_real_features,
-            future_values=future_values,
-            future_time_features=future_time_features,
-            decoder_attention_mask=decoder_attention_mask,
-            head_mask=head_mask,
-            decoder_head_mask=decoder_head_mask,
-            cross_attn_head_mask=cross_attn_head_mask,
-            encoder_outputs=encoder_outputs,
-            past_key_values=past_key_values,
-            output_hidden_states=output_hidden_states,
-            output_attentions=output_attentions,
-            use_cache=use_cache,
-            return_dict=return_dict,
-        )
+        feat_static_cat: torch.Tensor,
+        feat_static_real: torch.Tensor,
+        past_time_feat: torch.Tensor,
+        past_target: torch.Tensor,
+        past_observed_values: torch.Tensor,
+        future_time_feat: torch.Tensor,
+        num_parallel_samples: Optional[int] = None,
+    ) -> torch.Tensor:
 
-        prediction_loss = None
-        params = None
-        if future_values is not None:
-            params = self.output_params(outputs[0])  # outputs.last_hidden_state
-            distribution = self.output_distribution(params, outputs[-2])  # outputs.scale
-
-            loss = self.loss(distribution, future_values)
-
-            if future_observed_mask is None:
-                future_observed_mask = torch.ones_like(future_values)
-
-            if len(self.target_shape) == 0:
-                loss_weights = future_observed_mask
-            else:
-                loss_weights, _ = future_observed_mask.min(dim=-1, keepdim=False)
-
-            prediction_loss = weighted_average(loss, weights=loss_weights)
-
-        if not return_dict:
-            outputs = ((params,) + outputs[1:]) if params is not None else outputs[1:]
-            return ((prediction_loss,) + outputs) if prediction_loss is not None else outputs
-
-        return Seq2SeqTimeSeriesPredictionOutput(
-            loss=prediction_loss,
-            params=params,
-            past_key_values=outputs.past_key_values,
-            decoder_hidden_states=outputs.decoder_hidden_states,
-            decoder_attentions=outputs.decoder_attentions,
-            cross_attentions=outputs.cross_attentions,
-            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
-            encoder_hidden_states=outputs.encoder_hidden_states,
-            encoder_attentions=outputs.encoder_attentions,
-            scale=outputs.scale,
-            static_features=outputs.static_features,
-        )
+        if num_parallel_samples is None:
+            num_parallel_samples = self.num_parallel_samples
 
-    @torch.no_grad()
-    def generate(
-        self,
-        static_categorical_features: torch.Tensor,
-        static_real_features: torch.Tensor,
-        past_time_features: torch.Tensor,
-        past_values: torch.Tensor,
-        past_observed_mask: torch.Tensor,
-        future_time_features: Optional[torch.Tensor],
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-    ) -> torch.Tensor:
-        outputs = self(
-            static_categorical_features=static_categorical_features,
-            static_real_features=static_real_features,
-            past_time_features=past_time_features,
-            past_values=past_values,
-            past_observed_mask=past_observed_mask,
-            future_time_features=future_time_features,
-            future_values=None,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=True,
-            use_cache=True,
+        encoder_inputs, scale, static_feat = self.create_network_inputs(
+            feat_static_cat,
+            feat_static_real,
+            past_time_feat,
+            past_target,
+            past_observed_values,
         )
 
-        decoder = self.model.get_decoder()
-        enc_last_hidden = outputs.encoder_last_hidden_state
-        scale = outputs.scale
-        static_feat = outputs.static_features
+        enc_out, _ = self.encoder(encoder_inputs)
 
-        num_parallel_samples = self.config.num_parallel_samples
-        repeated_scale = scale.repeat_interleave(repeats=num_parallel_samples, dim=0)
+        repeated_scale = scale.repeat_interleave(
+            repeats=self.num_parallel_samples, dim=0
+        )
 
-        repeated_past_values = past_values.repeat_interleave(repeats=num_parallel_samples, dim=0) / repeated_scale
+        repeated_past_target = (
+            past_target.repeat_interleave(repeats=self.num_parallel_samples, dim=0)
+            / repeated_scale
+        )
 
-        expanded_static_feat = static_feat.unsqueeze(1).expand(-1, future_time_features.shape[1], -1)
-        features = torch.cat((expanded_static_feat, future_time_features), dim=-1)
-        repeated_features = features.repeat_interleave(repeats=num_parallel_samples, dim=0)
+        expanded_static_feat = static_feat.unsqueeze(1).expand(
+            -1, future_time_feat.shape[1], -1
+        )
+        features = torch.cat((expanded_static_feat, future_time_feat), dim=-1)
+        repeated_features = features.repeat_interleave(
+            repeats=self.num_parallel_samples, dim=0
+        )
 
-        repeated_enc_last_hidden = enc_last_hidden.repeat_interleave(repeats=num_parallel_samples, dim=0)
+        repeated_enc_out = enc_out.repeat_interleave(
+            repeats=self.num_parallel_samples, dim=0
+        )
 
         future_samples = []
 
         # greedy decoding
-        for k in range(self.config.prediction_length):
-            lagged_sequence = self.model.get_lagged_subsequences(
-                sequence=repeated_past_values,
+        for k in range(self.prediction_length):
+            # self._check_shapes(repeated_past_target, next_sample, next_features)
+            # sequence = torch.cat((repeated_past_target, next_sample), dim=1)
+
+            lagged_sequence = self.get_lagged_subsequences(
+                sequence=repeated_past_target,
                 subsequences_length=1 + k,
                 shift=1,
             )
 
             lags_shape = lagged_sequence.shape
-            reshaped_lagged_sequence = lagged_sequence.reshape(lags_shape[0], lags_shape[1], -1)
+            reshaped_lagged_sequence = lagged_sequence.reshape(
+                lags_shape[0], lags_shape[1], -1
+            )
 
-            decoder_input = torch.cat((reshaped_lagged_sequence, repeated_features[:, : k + 1]), dim=-1)
+            decoder_input = torch.cat(
+                (reshaped_lagged_sequence, repeated_features[:, : k + 1]), dim=-1
+            )
 
-            dec_output = decoder(inputs_embeds=decoder_input, encoder_hidden_states=repeated_enc_last_hidden)
-            dec_last_hidden = dec_output.last_hidden_state
+            output = self.decoder(decoder_input, repeated_enc_out)
 
-            params = self.parameter_projection(dec_last_hidden[:, -1:])
+            params = self.param_proj(output[:, -1:])
             distr = self.output_distribution(params, scale=repeated_scale)
             next_sample = distr.sample()
 
-            repeated_past_values = torch.cat((repeated_past_values, next_sample / repeated_scale), dim=1)
+            repeated_past_target = torch.cat(
+                (repeated_past_target, next_sample / repeated_scale), dim=1
+            )
             future_samples.append(next_sample)
 
         concat_future_samples = torch.cat(future_samples, dim=1)
-
-        return SampleTimeSeriesPredictionOutput(
-            sequences=concat_future_samples.reshape(
-                (-1, num_parallel_samples, self.config.prediction_length) + self.target_shape,
-            )
+        return concat_future_samples.reshape(
+            (-1, self.num_parallel_samples, self.prediction_length) + self.target_shape,
         )
diff --git a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py
index 258230654b0a..68345408398e 100644
--- a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py
+++ b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py
@@ -137,7 +137,7 @@ def __init__(
         context_length: Optional[int] = None,
         distribution_output: str = "student_t",
         loss: str = "nll",
-        lags_sequence: List[int] = [1, 2, 3, 4, 5, 6, 7],
+        lags_sequence: List[int] = [1, 2, 3, 4, 5, 6, 7], # Eli: Remove the default here
         scaling: bool = True,
         num_dynamic_real_features: int = 0,
         num_static_categorical_features: int = 0,
@@ -151,7 +151,7 @@ def __init__(
         decoder_attention_heads: int = 2,
         encoder_layers: int = 2,
         decoder_layers: int = 2,
-        is_encoder_decoder: bool = True, # Eli: remove from signature?
+        is_encoder_decoder: bool = True,
         activation_function: str = "gelu",
         dropout: float = 0.1,
         encoder_layerdrop: float = 0.1,

From d56a0cda2e7f8852309da7ac56d7f50c969f94e1 Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Mon, 16 Jan 2023 13:12:44 +0000
Subject: [PATCH 018/164] done InformerConfig, but need to change the names

---
 .../models/informer/configuration_informer.py | 53 +++++++++++++------
 1 file changed, 37 insertions(+), 16 deletions(-)

diff --git a/src/transformers/models/informer/configuration_informer.py b/src/transformers/models/informer/configuration_informer.py
index 51ac98700310..913acf7f1f67 100644
--- a/src/transformers/models/informer/configuration_informer.py
+++ b/src/transformers/models/informer/configuration_informer.py
@@ -53,10 +53,11 @@ def __init__(
         input_size: int = 1,
         embedding_dimension: Optional[List[int]] = None,
         distr_output: str = "student_t",
-        lags_seq: Optional[List[int]] = None, # used to be freq.
+        lags_seq: Optional[List[int]] = None,  # used to be freq.
         scaling: bool = True,
         num_parallel_samples: int = 100,
         is_encoder_decoder: bool = True,
+        **kwargs
     ):
         # time series specific configuration
         self.prediction_length = prediction_length
@@ -67,7 +68,7 @@ def __init__(
         # self.target_shape = distr_output.event_shape  # Eli: I think can be removed
         # self.num_time_features = num_time_features # Eli: From vanilla ts transformer
         self.lags_seq = lags_seq
-        # self.scaling = scaling # Eli: From vanilla ts transformer
+        self.scaling = scaling
         self.num_feat_dynamic_real = num_feat_dynamic_real
         self.num_feat_static_cat = num_feat_static_cat
         self.num_feat_static_real = num_feat_static_real
@@ -91,22 +92,42 @@ def __init__(
             self.embedding_dimension = embedding_dimension
         else:
             self.embedding_dimension = [min(50, (cat + 1) // 2) for cat in self.cardinality]
-            
-        self.num_parallel_samples = num_parallel_samples
 
-        # self.history_length = context_length + max(self.lags_seq)
-        self.embedder = FeatureEmbedder(
-            cardinalities=cardinality,
-            embedding_dims=self.embedding_dimension,
-        )
-        if scaling:
-            self.scaler = MeanScaler(dim=1, keepdim=True)
-        else:
-            self.scaler = NOPScaler(dim=1, keepdim=True)
+        self.num_parallel_samples = num_parallel_samples
+        # self.history_length = context_length + max(self.lags_seq) # Eli: I think can be removed
 
-        # total feature size
-        d_model = self.input_size * len(self.lags_seq) + self._number_of_features
+        # Transformer architecture configuration
+        self.d_model = self.input_size * len(self.lags_seq) + self._number_of_features
+        self.nhead = nhead
+        self.num_encoder_layers = num_encoder_layers # encoder_layers
+        self.num_decoder_layers = num_decoder_layers # decoder_layers
+        self.dim_feedforward = dim_feedforward
+        self.activation = activation  # activation_function
+        self.dropout = dropout
+        self.attn = attn,
+        self.factor = factor
+        self.distil = distil
 
-        self.param_proj = distr_output.get_args_proj(d_model)
+        # self.param_proj = distr_output.get_args_proj(d_model)
 
         super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
+
+    @property
+    def _number_of_features(self) -> int:
+        return (
+            sum(self.embedding_dimension)
+            + self.num_feat_dynamic_real
+            + self.num_feat_static_real
+            + self.input_size  # the log(scale)
+        )
+
+
+    # @property
+    # def _number_of_features(self) -> int:
+    #     return (
+    #         sum(self.embedding_dimension)
+    #         + self.num_dynamic_real_features
+    #         + self.num_time_features
+    #         + max(1, self.num_static_real_features)  # there is at least one dummy static real feature
+    #         + self.input_size  # the log(scale)
+    #     )

From f68192dcd359495a0eea1a6d102535acd4a0ed2c Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Mon, 16 Jan 2023 13:42:55 +0000
Subject: [PATCH 019/164] Done informer model init. working on enc-dec

---
 .../models/informer/modeling_informer.py      | 367 ++++++++++++------
 1 file changed, 245 insertions(+), 122 deletions(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 9616af397751..395975549d53 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -1,3 +1,42 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch Informer model."""
+
+import random
+from dataclasses import dataclass
+from typing import Callable, Dict, List, Optional, Tuple, Union
+
+import torch
+from torch import nn
+from torch.distributions import (
+    AffineTransform,
+    Distribution,
+    Independent,
+    NegativeBinomial,
+    Normal,
+    StudentT,
+    TransformedDistribution,
+)
+
+from ...activations import ACT2FN
+from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions, ModelOutput
+from ...modeling_utils import PreTrainedModel
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
+from .configuration_informer import InformerConfig
+
 from math import sqrt
 from typing import List, Optional
 
@@ -5,9 +44,109 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from gluonts.torch.distributions import DistributionOutput, StudentTOutput
-from gluonts.torch.modules.feature import FeatureEmbedder
-from gluonts.torch.modules.scaler import MeanScaler, NOPScaler
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "InformerConfig"
+
+
+class FeatureEmbedder(nn.Module):  # embeds several categorical features and concatenates the embeddings
+    def __init__(self, cardinalities: List[int], embedding_dims: List[int]) -> None:
+        super().__init__()
+        # one nn.Embedding(c, d) per (cardinality, embedding size) pair; zip() stops at the shorter list
+        self.num_features = len(cardinalities)
+        self.embedders = nn.ModuleList([nn.Embedding(c, d) for c, d in zip(cardinalities, embedding_dims)])
+
+    def forward(self, features: torch.Tensor) -> torch.Tensor:
+        if self.num_features > 1:
+            # split the last dimension into self.num_features slices, one per embedding table;
+            # assumes that dimension has exactly self.num_features entries (slices become (N,T) or (N)) -- TODO confirm
+            cat_feature_slices = torch.chunk(features, self.num_features, dim=-1)
+        else:
+            cat_feature_slices = [features]
+        # drop the trailing size-1 axis of each slice, look it up in its own table, join along the last dimension
+        return torch.cat(
+            [
+                embed(cat_feature_slice.squeeze(-1))
+                for embed, cat_feature_slice in zip(self.embedders, cat_feature_slices)
+            ],
+            dim=-1,
+        )
+
+
+class MeanScaler(nn.Module):
+    """
+    Computes a scaling factor as the weighted average absolute value along dimension `dim`, and scales the data
+    accordingly.
+
+    Args:
+        dim (`int`):
+            Dimension along which to compute the scale.
+        keepdim (`bool`, *optional*, defaults to `False`):
+            Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it.
+        minimum_scale (`float`, *optional*, defaults to 1e-10):
+            Lower bound on the returned scale; it applies when the per-item and the batch-level scale are both ~0.
+    """
+
+    def __init__(self, dim: int, keepdim: bool = False, minimum_scale: float = 1e-10):
+        super().__init__()
+        if not dim > 0:  # rejects dim 0 (the batch dimension per the message) and negative dims alike
+            raise ValueError("Cannot compute scale along dim = 0 (batch dimension), please provide dim > 0")
+        self.dim = dim
+        self.keepdim = keepdim
+        self.register_buffer("minimum_scale", torch.tensor(minimum_scale))  # kept as a tensor so torch.max in forward can consume it
+
+    def forward(self, data: torch.Tensor, weights: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        # reduce over `dim`; these will have shape (N, C)
+        total_weight = weights.sum(dim=self.dim)
+        weighted_sum = (data.abs() * weights).sum(dim=self.dim)
+
+        # first compute a global scale per-dimension (pooled over dim 0)
+        total_observed = total_weight.sum(dim=0)
+        denominator = torch.max(total_observed, torch.ones_like(total_observed))  # at least 1, so no division by zero
+        default_scale = weighted_sum.sum(dim=0) / denominator
+
+        # then compute a per-item, per-dimension scale
+        denominator = torch.max(total_weight, torch.ones_like(total_weight))  # same guard as above
+        scale = weighted_sum / denominator
+
+        # use per-batch scale when no element is observed
+        # or when the sequence contains only zeros
+        scale = (
+            torch.max(
+                self.minimum_scale,
+                torch.where(
+                    weighted_sum > torch.zeros_like(weighted_sum),
+                    scale,
+                    default_scale * torch.ones_like(total_weight),
+                ),
+            )
+            .detach()  # the scale is a constant for autograd
+            .unsqueeze(dim=self.dim)  # re-insert `dim` so the scale broadcasts against data
+        )
+        # data is always divided by the unsqueezed scale; only the returned scale drops `dim` when keepdim is False
+        return data / scale, scale if self.keepdim else scale.squeeze(dim=self.dim)
+
+
+class NOPScaler(nn.Module):
+    """
+    Assigns a scaling factor equal to 1 along dimension `dim`, and therefore applies no scaling to the input data.
+
+    Args:
+        dim (`int`):
+            Dimension along which to compute the scale.
+        keepdim (`bool`, *optional*, defaults to `False`):
+            Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it.
+    """
+
+    def __init__(self, dim: int, keepdim: bool = False):
+        super().__init__()
+        self.dim = dim
+        self.keepdim = keepdim
+
+    def forward(self, data: torch.Tensor, observed_indicator: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:  # observed_indicator is accepted but never read
+        scale = torch.ones_like(data).mean(dim=self.dim, keepdim=self.keepdim)  # mean of an all-ones tensor: ones with `dim` reduced
+        return data, scale  # data is returned unchanged
 
 
 class TriangularCausalMask:
@@ -263,9 +402,9 @@ def forward(self, x, attn_mask=None):
         return self.norm2(x + y), attn
 
 
-class Encoder(nn.Module):
+class InformerEncoder(nn.Module):
     def __init__(self, attn_layers, conv_layers=None, norm_layer=None):
-        super(Encoder, self).__init__()
+        super(InformerEncoder, self).__init__()
         self.attn_layers = nn.ModuleList(attn_layers)
         self.conv_layers = (
             nn.ModuleList(conv_layers) if conv_layers is not None else None
@@ -330,7 +469,7 @@ def forward(self, x, cross, x_mask=None, cross_mask=None):
         return self.norm3(x + y)
 
 
-class Decoder(nn.Module):
+class InformerDecoder(nn.Module):
     def __init__(self, layers, norm_layer=None):
         super(Decoder, self).__init__()
         self.layers = nn.ModuleList(layers)
@@ -346,132 +485,116 @@ def forward(self, x, cross, x_mask=None, cross_mask=None):
         return x
 
 
-class InformerModel(nn.Module):
-    def __init__(  # add loss param
-        self,
-        freq: str, # frequency
-        context_length: int,
-        prediction_length: int,
-        num_feat_dynamic_real: int,  # num_dynamic_real_features
-        num_feat_static_real: int,  # num_static_real_features
-        num_feat_static_cat: int,  # num_static_categorical_features
-        cardinality: List[int],
-        # Informer arguments
-        nhead: int,
-        num_encoder_layers: int, # encoder_layers
-        num_decoder_layers: int, # decoder_layers
-        dim_feedforward: int,
-        activation: str = "gelu", # activation_function
-        dropout: float = 0.1,
-        attn: str = "prob",
-        factor: int = 5,
-        distil: bool = True,
-        # univariate input
-        input_size: int = 1,
-        embedding_dimension: Optional[List[int]] = None,
-        distr_output: DistributionOutput = StudentTOutput(),
-        lags_seq: Optional[List[int]] = None,
-        scaling: bool = True,
-        num_parallel_samples: int = 100,
-    ) -> None:
-        super().__init__()
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerPreTrainedModel with TimeSeriesTransformer->Informer
+class InformerPreTrainedModel(PreTrainedModel):
+    config_class = InformerConfig
+    base_model_prefix = "model"
+    main_input_name = "past_values"
+    supports_gradient_checkpointing = True
 
-        self.input_size = input_size
+    def _init_weights(self, module):
+        std = self.config.init_std
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
 
-        self.target_shape = distr_output.event_shape
-        self.num_feat_dynamic_real = num_feat_dynamic_real
-        self.num_feat_static_cat = num_feat_static_cat
-        self.num_feat_static_real = num_feat_static_real
-        self.embedding_dimension = (
-            embedding_dimension
-            if embedding_dimension is not None or cardinality is None
-            else [min(50, (cat + 1) // 2) for cat in cardinality]
-        )
-        self.lags_seq = lags_seq or get_lags_for_frequency(freq_str=freq)
-        self.num_parallel_samples = num_parallel_samples
-        self.history_length = context_length + max(self.lags_seq)
-        self.embedder = FeatureEmbedder(
-            cardinalities=cardinality,
-            embedding_dims=self.embedding_dimension,
-        )
-        if scaling:
+    def _set_gradient_checkpointing(self, module, value=False):
+        if isinstance(module, (InformerDecoder, InformerEncoder)):
+            module.gradient_checkpointing = value
+
+
+class InformerModel(InformerPreTrainedModel):
+    def __init__(self, config: InformerConfig):
+        super().__init__(config)
+
+        if config.scaling:
             self.scaler = MeanScaler(dim=1, keepdim=True)
         else:
             self.scaler = NOPScaler(dim=1, keepdim=True)
 
-        # total feature size
-        d_model = self.input_size * len(self.lags_seq) + self._number_of_features
+        self.embedder = FeatureEmbedder(
+            cardinalities=config.cardinality,
+            embedding_dims=config.embedding_dimension,
+        )
+
+        # Informer encoder-decoder and mask initializer
+        self.encoder = InformerEncoder(config)
+        self.decoder = InformerDecoder(config)
 
-        self.context_length = context_length
-        self.prediction_length = prediction_length
-        self.distr_output = distr_output
-        self.param_proj = distr_output.get_args_proj(d_model)
+        # Initialize weights and apply final processing
+        self.post_init()
 
         # Informer enc-decoder
-        Attn = ProbAttention if attn == "prob" else FullAttention
-        # Encoder
-        self.encoder = Encoder(
-            [
-                EncoderLayer(
-                    AttentionLayer(
-                        Attn(
-                            mask_flag=False,
-                            factor=factor,
-                            attention_dropout=dropout,
-                            output_attention=False,
-                        ),
-                        d_model,
-                        nhead,
-                        mix=False,
-                    ),
-                    d_model,
-                    d_ff=dim_feedforward,
-                    dropout=dropout,
-                    activation=activation,
-                )
-                for l in range(num_encoder_layers)
-            ],
-            [ConvLayer(d_model) for l in range(num_encoder_layers - 1)]
-            if distil
-            else None,
-            norm_layer=torch.nn.LayerNorm(d_model),
-        )
+        # Attn = ProbAttention if config.attn == "prob" else FullAttention
 
-        # Masked Decoder
-        self.decoder = Decoder(
-            [
-                DecoderLayer(
-                    AttentionLayer(
-                        Attn(
-                            mask_flag=True,
-                            factor=factor,
-                            attention_dropout=dropout,
-                            output_attention=False,
-                        ),
-                        d_model,
-                        nhead,
-                        mix=True,
-                    ),
-                    AttentionLayer(
-                        FullAttention(
-                            mask_flag=False,
-                            factor=factor,
-                            attention_dropout=dropout,
-                            output_attention=False,
-                        ),
-                        d_model,
-                        nhead,
-                        mix=False,
-                    ),
-                    d_model,
-                    d_ff=dim_feedforward,
-                    dropout=dropout,
-                    activation=activation,
-                )
-                for l in range(num_decoder_layers)
-            ],
-            norm_layer=torch.nn.LayerNorm(d_model),
-        )
+        # Encoder
+        # self.encoder = Encoder(
+        #     [
+        #         EncoderLayer(
+        #             AttentionLayer(
+        #                 Attn(
+        #                     mask_flag=False,
+        #                     factor=factor,
+        #                     attention_dropout=dropout,
+        #                     output_attention=False,
+        #                 ),
+        #                 d_model,
+        #                 nhead,
+        #                 mix=False,
+        #             ),
+        #             d_model,
+        #             d_ff=dim_feedforward,
+        #             dropout=dropout,
+        #             activation=activation,
+        #         )
+        #         for l in range(num_encoder_layers)
+        #     ],
+        #     [ConvLayer(d_model) for l in range(num_encoder_layers - 1)]
+        #     if distil
+        #     else None,
+        #     norm_layer=torch.nn.LayerNorm(d_model),
+        # )
+        #
+        # # Masked Decoder
+        # self.decoder = Decoder(
+        #     [
+        #         DecoderLayer(
+        #             AttentionLayer(
+        #                 Attn(
+        #                     mask_flag=True,
+        #                     factor=factor,
+        #                     attention_dropout=dropout,
+        #                     output_attention=False,
+        #                 ),
+        #                 d_model,
+        #                 nhead,
+        #                 mix=True,
+        #             ),
+        #             AttentionLayer(
+        #                 FullAttention(
+        #                     mask_flag=False,
+        #                     factor=factor,
+        #                     attention_dropout=dropout,
+        #                     output_attention=False,
+        #                 ),
+        #                 d_model,
+        #                 nhead,
+        #                 mix=False,
+        #             ),
+        #             d_model,
+        #             d_ff=dim_feedforward,
+        #             dropout=dropout,
+        #             activation=activation,
+        #         )
+        #         for l in range(num_decoder_layers)
+        #     ],
+        #     norm_layer=torch.nn.LayerNorm(d_model),
+        # )
 
     @property
     def _number_of_features(self) -> int:

From ffb78333dcd3c998fc817c564939c4c25e990a90 Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Tue, 17 Jan 2023 05:42:13 +0000
Subject: [PATCH 020/164] added things to address after re-reading enc-dec
 in the paper

---
 src/transformers/models/informer/config_using_gluonTS.py | 2 +-
 src/transformers/models/informer/modeling_informer.py    | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/transformers/models/informer/config_using_gluonTS.py b/src/transformers/models/informer/config_using_gluonTS.py
index c407b3ac3ec5..b047f6458885 100644
--- a/src/transformers/models/informer/config_using_gluonTS.py
+++ b/src/transformers/models/informer/config_using_gluonTS.py
@@ -224,7 +224,7 @@ def __init__(self, c_in):
             padding=1,
             padding_mode="circular",
         )
-        self.norm = nn.BatchNorm1d(c_in)
+        self.norm = nn.BatchNorm1d(c_in) # Question: why batchnorm here?
         self.activation = nn.ELU()
         self.maxPool = nn.MaxPool1d(kernel_size=3, stride=2, padding=1)
 
diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 395975549d53..8a82758db975 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -361,13 +361,13 @@ def __init__(self, c_in):
             padding=1,
             padding_mode="circular",
         )
-        self.norm = nn.BatchNorm1d(c_in)
+        self.norm = nn.BatchNorm1d(c_in) # Eli question: why batchnorm here?
         self.activation = nn.ELU()
         self.maxPool = nn.MaxPool1d(kernel_size=3, stride=2, padding=1)
 
     def forward(self, x):
         x = self.downConv(x.permute(0, 2, 1))
-        x = self.norm(x)
+        x = self.norm(x) # Eli: why? maybe because the impl...
         x = self.activation(x)
         x = self.maxPool(x)
         x = x.transpose(1, 2)
@@ -535,7 +535,7 @@ def __init__(self, config: InformerConfig):
         # Encoder
         # self.encoder = Encoder(
         #     [
-        #         EncoderLayer(
+        #         EncoderLayer( # Eli question: why I need EncoderLayers here?
         #             AttentionLayer(
         #                 Attn(
         #                     mask_flag=False,

From 4286dd3adc4fd1a0ad4628231bb3d0482356806c Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Tue, 17 Jan 2023 13:49:29 +0000
Subject: [PATCH 021/164] done modeling - checking that initialization works

---
 .../informer/check_instantiate_works.py       |   2 +-
 .../models/informer/configuration_informer.py |  56 ++++----
 .../models/informer/modeling_informer.py      | 127 +++++++++---------
 3 files changed, 91 insertions(+), 94 deletions(-)

diff --git a/src/transformers/models/informer/check_instantiate_works.py b/src/transformers/models/informer/check_instantiate_works.py
index 487bf2a9a21b..fbb6b151f79c 100644
--- a/src/transformers/models/informer/check_instantiate_works.py
+++ b/src/transformers/models/informer/check_instantiate_works.py
@@ -4,5 +4,5 @@
 if __name__ == '__main__':
     freq = "h"
     lags = get_lags_for_frequency(freq_str=freq)
-    model = InformerModel(InformerConfig())
+    model = InformerModel(InformerConfig(lags_seq=lags))
     print(model)
diff --git a/src/transformers/models/informer/configuration_informer.py b/src/transformers/models/informer/configuration_informer.py
index 913acf7f1f67..d7d6ecb0c153 100644
--- a/src/transformers/models/informer/configuration_informer.py
+++ b/src/transformers/models/informer/configuration_informer.py
@@ -32,37 +32,35 @@
 
 class InformerConfig(PretrainedConfig):
     def __init__(
-        self,
-        context_length: int,
-        prediction_length: int,
-        num_feat_dynamic_real: int,  # num_dynamic_real_features
-        num_feat_static_real: int,  # num_static_real_features
-        num_feat_static_cat: int,  # num_static_categorical_features
-        cardinality: List[int],
-        # Informer arguments
-        nhead: int,
-        num_encoder_layers: int, # encoder_layers
-        num_decoder_layers: int, # decoder_layers
-        dim_feedforward: int,
-        activation: str = "gelu", # activation_function
-        dropout: float = 0.1,
-        attn: str = "prob",
-        factor: int = 5,
-        distil: bool = True,
-        # univariate input
-        input_size: int = 1,
-        embedding_dimension: Optional[List[int]] = None,
-        distr_output: str = "student_t",
-        lags_seq: Optional[List[int]] = None,  # used to be freq.
-        scaling: bool = True,
-        num_parallel_samples: int = 100,
-        is_encoder_decoder: bool = True,
-        **kwargs
+            self,
+            input_size: int = 1,
+            prediction_length: Optional[int] = None,
+            context_length: Optional[int] = None,
+            distr_output: str = "student_t",
+            lags_seq: Optional[List[int]] = None,  # used to be freq.
+            scaling: bool = True,
+            num_feat_dynamic_real: int = 0,  # num_dynamic_real_features
+            num_feat_static_real: int = 0,  # num_static_real_features
+            num_feat_static_cat: int = 0,  # num_static_categorical_features
+            cardinality: Optional[List[int]] = None,
+            embedding_dimension: Optional[List[int]] = None,
+            dim_feedforward: int = 32,  # decoder_ffn_dim & encoder_ffn_dim
+            nhead: int = 2,  # Eli: not sure what the name
+            num_encoder_layers: int = 2,  # encoder_layers
+            num_decoder_layers: int = 2,  # decoder_layers
+            is_encoder_decoder: bool = True,
+            activation: str = "gelu",  # activation_function
+            dropout: float = 0.1,
+            attn: str = "prob",
+            factor: int = 5,
+            distil: bool = True,
+            num_parallel_samples: int = 100,
+            **kwargs
     ):
         # time series specific configuration
         self.prediction_length = prediction_length
         self.context_length = context_length or prediction_length
-        self.distr_output = distr_output # Eli: change to distribution_output
+        self.distr_output = distr_output  # Eli: change to distribution_output
         # self.loss = loss # Eli: From vanilla ts transformer
         self.input_size = input_size
         # self.target_shape = distr_output.event_shape  # Eli: I think can be removed
@@ -99,8 +97,8 @@ def __init__(
         # Transformer architecture configuration
         self.d_model = self.input_size * len(self.lags_seq) + self._number_of_features
         self.nhead = nhead
-        self.num_encoder_layers = num_encoder_layers # encoder_layers
-        self.num_decoder_layers = num_decoder_layers # decoder_layers
+        self.num_encoder_layers = num_encoder_layers  # encoder_layers
+        self.num_decoder_layers = num_decoder_layers  # decoder_layers
         self.dim_feedforward = dim_feedforward
         self.activation = activation  # activation_function
         self.dropout = dropout
diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 8a82758db975..2b80ab2e89a8 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -471,7 +471,7 @@ def forward(self, x, cross, x_mask=None, cross_mask=None):
 
 class InformerDecoder(nn.Module):
     def __init__(self, layers, norm_layer=None):
-        super(Decoder, self).__init__()
+        super(InformerDecoder, self).__init__()
         self.layers = nn.ModuleList(layers)
         self.norm = norm_layer
 
@@ -530,71 +530,70 @@ def __init__(self, config: InformerConfig):
         self.post_init()
 
         # Informer enc-decoder
-        # Attn = ProbAttention if config.attn == "prob" else FullAttention
+        Attn = ProbAttention if config.attn == "prob" else FullAttention
 
         # Encoder
-        # self.encoder = Encoder(
-        #     [
-        #         EncoderLayer( # Eli question: why I need EncoderLayers here?
-        #             AttentionLayer(
-        #                 Attn(
-        #                     mask_flag=False,
-        #                     factor=factor,
-        #                     attention_dropout=dropout,
-        #                     output_attention=False,
-        #                 ),
-        #                 d_model,
-        #                 nhead,
-        #                 mix=False,
-        #             ),
-        #             d_model,
-        #             d_ff=dim_feedforward,
-        #             dropout=dropout,
-        #             activation=activation,
-        #         )
-        #         for l in range(num_encoder_layers)
-        #     ],
-        #     [ConvLayer(d_model) for l in range(num_encoder_layers - 1)]
-        #     if distil
-        #     else None,
-        #     norm_layer=torch.nn.LayerNorm(d_model),
-        # )
-        #
-        # # Masked Decoder
-        # self.decoder = Decoder(
-        #     [
-        #         DecoderLayer(
-        #             AttentionLayer(
-        #                 Attn(
-        #                     mask_flag=True,
-        #                     factor=factor,
-        #                     attention_dropout=dropout,
-        #                     output_attention=False,
-        #                 ),
-        #                 d_model,
-        #                 nhead,
-        #                 mix=True,
-        #             ),
-        #             AttentionLayer(
-        #                 FullAttention(
-        #                     mask_flag=False,
-        #                     factor=factor,
-        #                     attention_dropout=dropout,
-        #                     output_attention=False,
-        #                 ),
-        #                 d_model,
-        #                 nhead,
-        #                 mix=False,
-        #             ),
-        #             d_model,
-        #             d_ff=dim_feedforward,
-        #             dropout=dropout,
-        #             activation=activation,
-        #         )
-        #         for l in range(num_decoder_layers)
-        #     ],
-        #     norm_layer=torch.nn.LayerNorm(d_model),
-        # )
+        self.encoder = InformerEncoder(
+            [
+                EncoderLayer( # Eli question: why I need EncoderLayers here?
+                    AttentionLayer(
+                        Attn(
+                            mask_flag=False,
+                            factor=config.factor,
+                            attention_dropout=config.dropout,
+                            output_attention=False,
+                        ),
+                        config.d_model,
+                        config.nhead,
+                        mix=False,
+                    ),
+                    config.d_model,
+                    d_ff=config.dim_feedforward,
+                    dropout=config.dropout,
+                    activation=config.activation,
+                )
+                for l in range(config.num_encoder_layers)
+            ],
+            [ConvLayer(config.d_model) for l in range(config.num_encoder_layers - 1)]
+            if config.distil else None,
+            norm_layer=torch.nn.LayerNorm(config.d_model),
+        )
+
+        # Masked Decoder
+        self.decoder = InformerDecoder(
+            [
+                DecoderLayer(
+                    AttentionLayer(
+                        Attn(
+                            mask_flag=True,
+                            factor=config.factor,
+                            attention_dropout=config.dropout,
+                            output_attention=False,
+                        ),
+                        config.d_model,
+                        config.nhead,
+                        mix=True,
+                    ),
+                    AttentionLayer(
+                        FullAttention(
+                            mask_flag=False,
+                            factor=config.factor,
+                            attention_dropout=config.dropout,
+                            output_attention=False,
+                        ),
+                        config.d_model,
+                        config.nhead,
+                        mix=False,
+                    ),
+                    config.d_model,
+                    d_ff=config.dim_feedforward,
+                    dropout=config.dropout,
+                    activation=config.activation,
+                )
+                for l in range(config.num_decoder_layers)
+            ],
+            norm_layer=torch.nn.LayerNorm(config.d_model),
+        )
 
     @property
     def _number_of_features(self) -> int:

From d278b80840ff0350d82649d5ca753d1f6a811f5a Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Tue, 17 Jan 2023 14:18:07 +0000
Subject: [PATCH 022/164] moved enc-dec init to InformerEncoder/Decoder init

---
 .../models/informer/modeling_informer.py      | 143 +++++++++---------
 1 file changed, 68 insertions(+), 75 deletions(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 2b80ab2e89a8..92c01a402220 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -403,13 +403,36 @@ def forward(self, x, attn_mask=None):
 
 
 class InformerEncoder(nn.Module):
-    def __init__(self, attn_layers, conv_layers=None, norm_layer=None):
+    def __init__(self, config: InformerConfig):
         super(InformerEncoder, self).__init__()
-        self.attn_layers = nn.ModuleList(attn_layers)
-        self.conv_layers = (
-            nn.ModuleList(conv_layers) if conv_layers is not None else None
-        )
-        self.norm = norm_layer
+        # Build the whole encoder stack from `config` instead of receiving pre-built layers from the caller.
+        Attn = ProbAttention if config.attn == "prob" else FullAttention  # NOTE(review): configuration_informer.py assigns `self.attn = attn,` (a 1-tuple), so this comparison is always False there -- fix the config
+        self.attn_layers = nn.ModuleList([
+            EncoderLayer(  # Eli question: why I need EncoderLayers here?
+                    AttentionLayer(
+                        Attn(
+                            mask_flag=False,
+                            factor=config.factor,
+                            attention_dropout=config.dropout,
+                            output_attention=False,
+                        ),
+                        config.d_model,
+                        config.nhead,
+                        mix=False,
+                    ),
+                    config.d_model,
+                    d_ff=config.dim_feedforward,
+                    dropout=config.dropout,
+                    activation=config.activation,
+                ) for _ in range(config.num_encoder_layers)
+        ])
+        # Distilling is optional: when enabled, num_encoder_layers - 1 ConvLayers are created, otherwise none.
+        if config.distil:  # `distil` is a bool; the former `is not None` test was true even for distil=False
+            self.conv_layers = nn.ModuleList([ConvLayer(config.d_model) for _ in range(config.num_encoder_layers - 1)])
+        else:
+            self.conv_layers = None
+        # Layer norm over the model dimension, stored for use by forward.
+        self.norm = torch.nn.LayerNorm(config.d_model)
 
     def forward(self, x, attn_mask=None):
         # x [B, L, D]
@@ -470,10 +493,46 @@ def forward(self, x, cross, x_mask=None, cross_mask=None):
 
 
 class InformerDecoder(nn.Module):
-    def __init__(self, layers, norm_layer=None):
+    def __init__(self, config: InformerConfig):
         super(InformerDecoder, self).__init__()
-        self.layers = nn.ModuleList(layers)
-        self.norm = norm_layer
+        # Build the whole decoder stack from `config` instead of receiving pre-built layers from the caller.
+        Attn = ProbAttention if config.attn == "prob" else FullAttention  # NOTE(review): configuration_informer.py assigns `self.attn = attn,` (a 1-tuple), so this comparison is always False there -- fix the config
+
+        # Masked Decoder: num_decoder_layers DecoderLayers, each given two AttentionLayers
+        self.layers = nn.ModuleList(
+            [
+                DecoderLayer(
+                    AttentionLayer(  # first block: config-selected attention with mask_flag=True (presumably self-attention -- confirm against DecoderLayer)
+                        Attn(
+                            mask_flag=True,
+                            factor=config.factor,
+                            attention_dropout=config.dropout,
+                            output_attention=False,
+                        ),
+                        config.d_model,
+                        config.nhead,
+                        mix=True,
+                    ),
+                    AttentionLayer(  # second block: always FullAttention, unmasked (presumably cross-attention -- confirm against DecoderLayer)
+                        FullAttention(
+                            mask_flag=False,
+                            factor=config.factor,
+                            attention_dropout=config.dropout,
+                            output_attention=False,
+                        ),
+                        config.d_model,
+                        config.nhead,
+                        mix=False,
+                    ),
+                    config.d_model,
+                    d_ff=config.dim_feedforward,
+                    dropout=config.dropout,
+                    activation=config.activation,
+                )
+                for _ in range(config.num_decoder_layers)
+            ],
+        )
+        self.norm = torch.nn.LayerNorm(config.d_model)  # layer norm over the model dimension, stored for use by forward
 
     def forward(self, x, cross, x_mask=None, cross_mask=None):
         for layer in self.layers:
@@ -529,72 +588,6 @@ def __init__(self, config: InformerConfig):
         # Initialize weights and apply final processing
         self.post_init()
 
-        # Informer enc-decoder
-        Attn = ProbAttention if config.attn == "prob" else FullAttention
-
-        # Encoder
-        self.encoder = InformerEncoder(
-            [
-                EncoderLayer( # Eli question: why I need EncoderLayers here?
-                    AttentionLayer(
-                        Attn(
-                            mask_flag=False,
-                            factor=config.factor,
-                            attention_dropout=config.dropout,
-                            output_attention=False,
-                        ),
-                        config.d_model,
-                        config.nhead,
-                        mix=False,
-                    ),
-                    config.d_model,
-                    d_ff=config.dim_feedforward,
-                    dropout=config.dropout,
-                    activation=config.activation,
-                )
-                for l in range(config.num_encoder_layers)
-            ],
-            [ConvLayer(config.d_model) for l in range(config.num_encoder_layers - 1)]
-            if config.distil else None,
-            norm_layer=torch.nn.LayerNorm(config.d_model),
-        )
-
-        # Masked Decoder
-        self.decoder = InformerDecoder(
-            [
-                DecoderLayer(
-                    AttentionLayer(
-                        Attn(
-                            mask_flag=True,
-                            factor=config.factor,
-                            attention_dropout=config.dropout,
-                            output_attention=False,
-                        ),
-                        config.d_model,
-                        config.nhead,
-                        mix=True,
-                    ),
-                    AttentionLayer(
-                        FullAttention(
-                            mask_flag=False,
-                            factor=config.factor,
-                            attention_dropout=config.dropout,
-                            output_attention=False,
-                        ),
-                        config.d_model,
-                        config.nhead,
-                        mix=False,
-                    ),
-                    config.d_model,
-                    d_ff=config.dim_feedforward,
-                    dropout=config.dropout,
-                    activation=config.activation,
-                )
-                for l in range(config.num_decoder_layers)
-            ],
-            norm_layer=torch.nn.LayerNorm(config.d_model),
-        )
-
     @property
     def _number_of_features(self) -> int:
         return (

From 9e668905da84149b948fbdcd09f63cffccd3c76b Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Tue, 17 Jan 2023 14:43:15 +0000
Subject: [PATCH 023/164] added 'init_std' to config, now model init works!

---
 src/transformers/models/informer/configuration_informer.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/transformers/models/informer/configuration_informer.py b/src/transformers/models/informer/configuration_informer.py
index d7d6ecb0c153..d8dcb137c601 100644
--- a/src/transformers/models/informer/configuration_informer.py
+++ b/src/transformers/models/informer/configuration_informer.py
@@ -55,6 +55,8 @@ def __init__(
             factor: int = 5,
             distil: bool = True,
             num_parallel_samples: int = 100,
+            init_std: float = 0.02,
+            use_cache=True,
             **kwargs
     ):
         # time series specific configuration
@@ -105,6 +107,8 @@ def __init__(
         self.attn = attn,
         self.factor = factor
         self.distil = distil
+        self.init_std = init_std
+        self.use_cache = use_cache
 
         # self.param_proj = distr_output.get_args_proj(d_model)
 

From d9d1a671a00f69e4542968f910e63723e685a2ec Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Sat, 21 Jan 2023 12:56:24 +0000
Subject: [PATCH 024/164] WIP conversion script, and added code sources

---
 ...nal_colab_pytorch_checkpoint_to_pytorch.py | 141 ++++++++++++++++++
 .../models/informer/modeling_informer.py      |  28 ++--
 2 files changed, 158 insertions(+), 11 deletions(-)
 create mode 100644 src/transformers/models/informer/convert_informer_original_colab_pytorch_checkpoint_to_pytorch.py

diff --git a/src/transformers/models/informer/convert_informer_original_colab_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/informer/convert_informer_original_colab_pytorch_checkpoint_to_pytorch.py
new file mode 100644
index 000000000000..30f24bfbfc9d
--- /dev/null
+++ b/src/transformers/models/informer/convert_informer_original_colab_pytorch_checkpoint_to_pytorch.py
@@ -0,0 +1,141 @@
+# coding=utf-8
+# Copyright 2020 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Convert Informer checkpoint."""
+
+
+import argparse
+import os
+from pathlib import Path
+
+import torch
+from torch import nn
+
+from transformers import (
+    BartConfig,
+    BartForConditionalGeneration,
+    BartForSequenceClassification,
+    BartModel,
+    BartTokenizer,
+)
+from transformers.utils import logging
+
+logging.set_verbosity_info()
+logger = logging.get_logger(__name__)
+
+SAMPLE_TEXT = " Hello world! cécé herlolip"
+
+mnli_rename_keys = [
+    ("model.classification_heads.mnli.dense.weight", "classification_head.dense.weight"),
+    ("model.classification_heads.mnli.dense.bias", "classification_head.dense.bias"),
+    ("model.classification_heads.mnli.out_proj.weight", "classification_head.out_proj.weight"),
+    ("model.classification_heads.mnli.out_proj.bias", "classification_head.out_proj.bias"),
+]
+
+
+def remove_ignore_keys_(state_dict):
+    ignore_keys = [
+        "encoder.version",
+        "decoder.version",
+        "model.encoder.version",
+        "model.decoder.version",
+        "_float_tensor",
+    ]
+    for k in ignore_keys:
+        state_dict.pop(k, None)
+
+
+def rename_key(dct, old, new):
+    val = dct.pop(old)
+    dct[new] = val
+
+
+def load_xsum_checkpoint(checkpoint_path):
+    """Checkpoint path should end in model.pt"""
+    sd = torch.load(checkpoint_path, map_location="cpu")
+    hub_interface = torch.hub.load("pytorch/fairseq", "bart.large.cnn").eval()
+    hub_interface.model.load_state_dict(sd["model"])
+    return hub_interface
+
+
+def make_linear_from_emb(emb):
+    vocab_size, emb_size = emb.weight.shape
+    lin_layer = nn.Linear(vocab_size, emb_size, bias=False)
+    lin_layer.weight.data = emb.weight.data
+    return lin_layer
+
+
+@torch.no_grad()
+def convert_bart_checkpoint(checkpoint_path, pytorch_dump_folder_path, hf_checkpoint_name=None):
+    """
+    Copy/paste/tweak model's weights to our BERT structure.
+    """
+    if not os.path.exists(checkpoint_path):
+        bart = torch.hub.load("pytorch/fairseq", checkpoint_path).eval()
+    else:
+        bart = load_xsum_checkpoint(checkpoint_path)
+
+    bart.model.upgrade_state_dict(bart.model.state_dict())
+    if hf_checkpoint_name is None:
+        hf_checkpoint_name = checkpoint_path.replace(".", "-")
+    config = BartConfig.from_pretrained(hf_checkpoint_name)
+    tokens = bart.encode(SAMPLE_TEXT).unsqueeze(0)
+    tokens2 = BartTokenizer.from_pretrained(hf_checkpoint_name).encode(SAMPLE_TEXT, return_tensors="pt").unsqueeze(0)
+    assert torch.eq(tokens, tokens2).all()
+
+    if checkpoint_path == "bart.large.mnli":
+        state_dict = bart.state_dict()
+        remove_ignore_keys_(state_dict)
+        state_dict["model.shared.weight"] = state_dict["model.decoder.embed_tokens.weight"]
+        for src, dest in mnli_rename_keys:
+            rename_key(state_dict, src, dest)
+        model = BartForSequenceClassification(config).eval()
+        model.load_state_dict(state_dict)
+        fairseq_output = bart.predict("mnli", tokens, return_logits=True)
+        new_model_outputs = model(tokens)[0]  # logits
+    else:  # no classification heads to worry about
+        state_dict = bart.model.state_dict()
+        remove_ignore_keys_(state_dict)
+        state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"]
+        fairseq_output = bart.extract_features(tokens)
+        if hf_checkpoint_name == "facebook/bart-large":
+            model = BartModel(config).eval()
+            model.load_state_dict(state_dict)
+            new_model_outputs = model(tokens).model[0]
+        else:
+            model = BartForConditionalGeneration(config).eval()  # an existing summarization ckpt
+            model.model.load_state_dict(state_dict)
+            if hasattr(model, "lm_head"):
+                model.lm_head = make_linear_from_emb(model.model.shared)
+            new_model_outputs = model.model(tokens)[0]
+
+    # Check results
+    assert fairseq_output.shape == new_model_outputs.shape
+    assert (fairseq_output == new_model_outputs).all().item()
+    Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
+    model.save_pretrained(pytorch_dump_folder_path)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    # Required parameters
+    parser.add_argument(
+        "fairseq_path", type=str, help="bart.large, bart.large.cnn or a path to a model.pt on local filesystem."
+    )
+    parser.add_argument("pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
+    parser.add_argument(
+        "--hf_config", default=None, type=str, help="Which huggingface architecture to use: bart-large-xsum"
+    )
+    args = parser.parse_args()
+    convert_bart_checkpoint(args.fairseq_path, args.pytorch_dump_folder_path, hf_checkpoint_name=args.hf_config)
\ No newline at end of file
diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 92c01a402220..ef3a3370bc59 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -50,6 +50,8 @@
 _CONFIG_FOR_DOC = "InformerConfig"
 
 
+# Eli: FeatureEmbedder, MeanScaler and NOPScaler are from GluonTS (see the exact source below)
+# source: https://github.com/awslabs/gluonts/blob/dev/src/gluonts/torch/modules/feature.py
 class FeatureEmbedder(nn.Module):
     def __init__(self, cardinalities: List[int], embedding_dims: List[int]) -> None:
         super().__init__()
@@ -74,6 +76,7 @@ def forward(self, features: torch.Tensor) -> torch.Tensor:
         )
 
 
+# source: https://github.com/awslabs/gluonts/blob/dev/src/gluonts/torch/modules/scaler.py
 class MeanScaler(nn.Module):
     """
     Computes a scaling factor as the weighted average absolute value along dimension `dim`, and scales the data
@@ -128,6 +131,7 @@ def forward(self, data: torch.Tensor, weights: torch.Tensor) -> Tuple[torch.Tens
         return data / scale, scale if self.keepdim else scale.squeeze(dim=self.dim)
 
 
+# source: https://github.com/awslabs/gluonts/blob/dev/src/gluonts/torch/modules/scaler.py
 class NOPScaler(nn.Module):
     """
     Assigns a scaling factor equal to 1 along dimension `dim`, and therefore applies no scaling to the input data.
@@ -148,7 +152,9 @@ def forward(self, data: torch.Tensor, observed_indicator: torch.Tensor) -> Tuple
         scale = torch.ones_like(data).mean(dim=self.dim, keepdim=self.keepdim)
         return data, scale
 
-
+# Eli: TriangularCausalMask, ProbMask, FullAttention, ProbAttention and AttentionLayer
+# are from the original Informer repository (see the exact source below)
+# source: https://github.com/zhouhaoyi/Informer2020/blob/main/utils/masking.py
 class TriangularCausalMask:
     def __init__(self, B, L, device="cpu"):
         mask_shape = [B, 1, L, L]
@@ -162,6 +168,7 @@ def mask(self):
         return self._mask
 
 
+# source: https://github.com/zhouhaoyi/Informer2020/blob/main/utils/masking.py
 class ProbMask:
     def __init__(self, B, H, L, index, scores, device="cpu"):
         _mask = torch.ones(L, scores.shape[-1], dtype=torch.bool).to(device).triu(1)
@@ -176,6 +183,7 @@ def mask(self):
         return self._mask
 
 
+# source: https://github.com/zhouhaoyi/Informer2020/blob/main/models/attn.py
 class FullAttention(nn.Module):
     def __init__(
         self,
@@ -194,7 +202,7 @@ def __init__(
     def forward(self, queries, keys, values, attn_mask):
         B, L, H, E = queries.shape
         _, S, _, D = values.shape
-        scale = self.scale or 1.0 / sqrt(E)
+        scale = self.scale or 1. / sqrt(E)
 
         scores = torch.einsum("blhe,bshe->bhls", queries, keys)
         if self.mask_flag:
@@ -212,6 +220,7 @@ def forward(self, queries, keys, values, attn_mask):
             return (V.contiguous(), None)
 
 
+# source: https://github.com/zhouhaoyi/Informer2020/blob/main/models/attn.py
 class ProbAttention(nn.Module):
     def __init__(
         self,
@@ -235,22 +244,16 @@ def _prob_QK(self, Q, K, sample_k, n_top):  # n_top: c*ln(L_q)
 
         # calculate the sampled Q_K
         K_expand = K.unsqueeze(-3).expand(B, H, L_Q, L_K, E)
-        index_sample = torch.randint(
-            L_K, (L_Q, sample_k)
-        )  # real U = U_part(factor*ln(L_k))*L_q
+        index_sample = torch.randint(L_K, (L_Q, sample_k))  # real U = U_part(factor*ln(L_k))*L_q
         K_sample = K_expand[:, :, torch.arange(L_Q).unsqueeze(1), index_sample, :]
-        Q_K_sample = torch.matmul(Q.unsqueeze(-2), K_sample.transpose(-2, -1)).squeeze(
-            -2
-        )
+        Q_K_sample = torch.matmul(Q.unsqueeze(-2), K_sample.transpose(-2, -1)).squeeze(-2)
 
         # find the Top_k query with sparisty measurement
         M = Q_K_sample.max(-1)[0] - torch.div(Q_K_sample.sum(-1), L_K)
         M_top = M.topk(n_top, sorted=False)[1]
 
         # use the reduced Q to calculate Q_K
-        Q_reduce = Q[
-            torch.arange(B)[:, None, None], torch.arange(H)[None, :, None], M_top, :
-        ]  # factor*ln(L_q)
+        Q_reduce = Q[torch.arange(B)[:, None, None], torch.arange(H)[None, :, None], M_top, :]  # factor*ln(L_q)
         Q_K = torch.matmul(Q_reduce, K.transpose(-2, -1))  # factor*ln(L_q)*L_k
 
         return Q_K, M_top
@@ -317,6 +320,7 @@ def forward(self, queries, keys, values, attn_mask):
         return context.transpose(2, 1).contiguous(), attn
 
 
+# source: https://github.com/zhouhaoyi/Informer2020/blob/main/models/attn.py
 class AttentionLayer(nn.Module):
     def __init__(
         self, attention, d_model, n_heads, d_keys=None, d_values=None, mix=False
@@ -351,6 +355,7 @@ def forward(self, queries, keys, values, attn_mask):
         return self.out_projection(out), attn
 
 
+# source: https://github.com/zhouhaoyi/Informer2020/blob/main/models/encoder.py
 class ConvLayer(nn.Module):
     def __init__(self, c_in):
         super(ConvLayer, self).__init__()
@@ -374,6 +379,7 @@ def forward(self, x):
         return x
 
 
+# source: https://github.com/zhouhaoyi/Informer2020/blob/main/models/encoder.py
 class EncoderLayer(nn.Module):
     def __init__(self, attention, d_model, d_ff=None, dropout=0.1, activation="relu"):
         super(EncoderLayer, self).__init__()

From 52de756863311d3d60284a02dc1fa5e13ce83e50 Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Mon, 23 Jan 2023 08:28:44 +0000
Subject: [PATCH 025/164] WIP conversion script: loading original informer pth
 works

---
 ...nal_colab_pytorch_checkpoint_to_pytorch.py | 218 ++++++++++++------
 1 file changed, 148 insertions(+), 70 deletions(-)

diff --git a/src/transformers/models/informer/convert_informer_original_colab_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/informer/convert_informer_original_colab_pytorch_checkpoint_to_pytorch.py
index 30f24bfbfc9d..983cf575cb72 100644
--- a/src/transformers/models/informer/convert_informer_original_colab_pytorch_checkpoint_to_pytorch.py
+++ b/src/transformers/models/informer/convert_informer_original_colab_pytorch_checkpoint_to_pytorch.py
@@ -14,6 +14,17 @@
 # limitations under the License.
 """Convert Informer checkpoint."""
 
+"""
+Assumption: 
+
+The Informer2020 repository is git-cloned from the "hf" branch of
+https://github.com/elisim/Informer2020/tree/hf
+
+There, I created an Informer checkpoint from the official Colab notebook.
+
+See also: https://github.com/elisim/Informer2020/blob/hf/create_checkpoint_from_offical_colab.ipynb
+"""
+
 
 import argparse
 import os
@@ -22,19 +33,20 @@
 import torch
 from torch import nn
 
-from transformers import (
-    BartConfig,
-    BartForConditionalGeneration,
-    BartForSequenceClassification,
-    BartModel,
-    BartTokenizer,
-)
+from transformers import InformerConfig, InformerModel
 from transformers.utils import logging
 
+
+import sys
+if not 'Informer2020' in sys.path:
+    sys.path += ['Informer2020']
+
+
+from Informer2020.exp.exp_informer import Exp_Informer
+
 logging.set_verbosity_info()
 logger = logging.get_logger(__name__)
 
-SAMPLE_TEXT = " Hello world! cécé herlolip"
 
 mnli_rename_keys = [
     ("model.classification_heads.mnli.dense.weight", "classification_head.dense.weight"),
@@ -44,6 +56,79 @@
 ]
 
 
+def _create_informer_args():
+    class dotdict(dict):
+        """dot.notation access to dictionary attributes"""
+        __getattr__ = dict.get
+        __setattr__ = dict.__setitem__
+        __delattr__ = dict.__delitem__
+
+    args = dotdict()
+
+    ### BoilerCode
+    args.model = 'informer'  # model of experiment, options: [informer, informerstack, informerlight(TBD)]
+    args.data = 'ETTh1'  # data
+    args.root_path = './ETDataset/ETT-small/'  # root path of data file
+    args.data_path = 'ETTh1.csv'  # data file
+    args.checkpoints = './informer_checkpoints'  # location of model checkpoints
+
+    ### TS
+    args.features = 'M'  # forecasting task, options:[M, S, MS]
+    # M:multivariate predict multivariate, S:univariate predict univariate, MS:multivariate predict univariate
+    args.target = 'OT'  # target feature in S or MS task
+    args.freq = 'h'  # freq for time features encoding,
+    # options:[s:secondly, t:minutely, h:hourly, d:daily, b:business days, w:weekly, m:monthly],
+    # you can also use more detailed freq like 15min or 3h
+
+    ### Encoder Decoder
+    args.seq_len = 96  # input sequence length of Informer encoder
+    args.label_len = 48  # start token length of Informer decoder
+    args.pred_len = 24  # prediction sequence length
+    # Informer decoder input: concat[start token series(label_len), zero padding series(pred_len)]
+
+    args.enc_in = 7  # encoder input size
+    args.dec_in = 7  # decoder input size
+    args.c_out = 7  # output size
+    args.factor = 5  # probsparse attn factor
+    args.d_model = 512  # dimension of model
+    args.n_heads = 8  # num of heads
+    args.e_layers = 2  # num of encoder layers
+    args.d_layers = 1  # num of decoder layers
+    args.d_ff = 2048  # dimension of fcn in model
+    args.dropout = 0.05  # dropout
+    args.attn = 'prob'  # attention used in encoder, options:[prob, full]
+    args.embed = 'timeF'  # time features encoding, options:[timeF, fixed, learned]
+    args.activation = 'gelu'  # activation
+    args.distil = True  # whether to use distilling in encoder
+    args.output_attention = False  # whether to output attention in encoder
+    args.mix = True
+    args.padding = 0
+
+    ### Training
+    args.batch_size = 32
+    args.learning_rate = 0.0001
+    args.loss = 'mse'
+    args.lradj = 'type1'
+    args.use_amp = False  # whether to use automatic mixed precision training
+
+    args.num_workers = 0
+    args.itr = 1
+    args.train_epochs = 6
+    args.patience = 3
+    args.des = 'exp'
+
+    args.use_gpu = False  # True if torch.cuda.is_available() else False
+    args.gpu = 0
+
+    args.use_multi_gpu = False
+    args.devices = '0,1,2,3'
+
+    args.detail_freq = args.freq  # the actual freq
+    args.freq = args.freq[-1:]  # Not important
+
+    return args
+
+
 def remove_ignore_keys_(state_dict):
     ignore_keys = [
         "encoder.version",
@@ -61,12 +146,12 @@ def rename_key(dct, old, new):
     dct[new] = val
 
 
-def load_xsum_checkpoint(checkpoint_path):
-    """Checkpoint path should end in model.pt"""
+def load_informer_checkpoint(checkpoint_path):
+    """Checkpoint path should end in model.pth"""
+    exp = Exp_Informer(args=_create_informer_args())
     sd = torch.load(checkpoint_path, map_location="cpu")
-    hub_interface = torch.hub.load("pytorch/fairseq", "bart.large.cnn").eval()
-    hub_interface.model.load_state_dict(sd["model"])
-    return hub_interface
+    exp.model.load_state_dict(sd)
+    return exp.model
 
 
 def make_linear_from_emb(emb):
@@ -76,66 +161,59 @@ def make_linear_from_emb(emb):
     return lin_layer
 
 
-@torch.no_grad()
-def convert_bart_checkpoint(checkpoint_path, pytorch_dump_folder_path, hf_checkpoint_name=None):
-    """
-    Copy/paste/tweak model's weights to our BERT structure.
-    """
-    if not os.path.exists(checkpoint_path):
-        bart = torch.hub.load("pytorch/fairseq", checkpoint_path).eval()
-    else:
-        bart = load_xsum_checkpoint(checkpoint_path)
-
-    bart.model.upgrade_state_dict(bart.model.state_dict())
-    if hf_checkpoint_name is None:
-        hf_checkpoint_name = checkpoint_path.replace(".", "-")
-    config = BartConfig.from_pretrained(hf_checkpoint_name)
-    tokens = bart.encode(SAMPLE_TEXT).unsqueeze(0)
-    tokens2 = BartTokenizer.from_pretrained(hf_checkpoint_name).encode(SAMPLE_TEXT, return_tensors="pt").unsqueeze(0)
-    assert torch.eq(tokens, tokens2).all()
-
-    if checkpoint_path == "bart.large.mnli":
-        state_dict = bart.state_dict()
-        remove_ignore_keys_(state_dict)
-        state_dict["model.shared.weight"] = state_dict["model.decoder.embed_tokens.weight"]
-        for src, dest in mnli_rename_keys:
-            rename_key(state_dict, src, dest)
-        model = BartForSequenceClassification(config).eval()
-        model.load_state_dict(state_dict)
-        fairseq_output = bart.predict("mnli", tokens, return_logits=True)
-        new_model_outputs = model(tokens)[0]  # logits
-    else:  # no classification heads to worry about
-        state_dict = bart.model.state_dict()
-        remove_ignore_keys_(state_dict)
-        state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"]
-        fairseq_output = bart.extract_features(tokens)
-        if hf_checkpoint_name == "facebook/bart-large":
-            model = BartModel(config).eval()
-            model.load_state_dict(state_dict)
-            new_model_outputs = model(tokens).model[0]
-        else:
-            model = BartForConditionalGeneration(config).eval()  # an existing summarization ckpt
-            model.model.load_state_dict(state_dict)
-            if hasattr(model, "lm_head"):
-                model.lm_head = make_linear_from_emb(model.model.shared)
-            new_model_outputs = model.model(tokens)[0]
-
-    # Check results
-    assert fairseq_output.shape == new_model_outputs.shape
-    assert (fairseq_output == new_model_outputs).all().item()
-    Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
-    model.save_pretrained(pytorch_dump_folder_path)
+# @torch.no_grad()
+# def convert_informer_checkpoint(checkpoint_path, pytorch_dump_folder_path, hf_checkpoint_name=None):
+#     """
+#     Copy/paste/tweak model's weights to our BERT structure.
+#     """
+#     informer = load_informer_checkpoint(checkpoint_path)
+#
+#     informer.model.upgrade_state_dict(informer.model.state_dict())
+#     if hf_checkpoint_name is None:
+#         hf_checkpoint_name = checkpoint_path.replace(".", "-")
+#     config = BartConfig.from_pretrained(hf_checkpoint_name)
+#
+#     if checkpoint_path == "bart.large.mnli":
+#         state_dict = bart.state_dict()
+#         remove_ignore_keys_(state_dict)
+#         state_dict["model.shared.weight"] = state_dict["model.decoder.embed_tokens.weight"]
+#         for src, dest in mnli_rename_keys:
+#             rename_key(state_dict, src, dest)
+#         model = BartForSequenceClassification(config).eval()
+#         model.load_state_dict(state_dict)
+#         fairseq_output = bart.predict("mnli", tokens, return_logits=True)
+#         new_model_outputs = model(tokens)[0]  # logits
+#     else:  # no classification heads to worry about
+#         state_dict = bart.model.state_dict()
+#         remove_ignore_keys_(state_dict)
+#         state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"]
+#         fairseq_output = bart.extract_features(tokens)
+#         if hf_checkpoint_name == "facebook/bart-large":
+#             model = BartModel(config).eval()
+#             model.load_state_dict(state_dict)
+#             new_model_outputs = model(tokens).model[0]
+#         else:
+#             model = BartForConditionalGeneration(config).eval()  # an existing summarization ckpt
+#             model.model.load_state_dict(state_dict)
+#             if hasattr(model, "lm_head"):
+#                 model.lm_head = make_linear_from_emb(model.model.shared)
+#             new_model_outputs = model.model(tokens)[0]
+#
+#     # Check results
+#     assert fairseq_output.shape == new_model_outputs.shape
+#     assert (fairseq_output == new_model_outputs).all().item()
+#     Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
+#     model.save_pretrained(pytorch_dump_folder_path)
 
 
 if __name__ == "__main__":
+    informer_checkpoint_default_path = "./Informer2020/informer_checkpoints/informer_ETTh1_ftM_sl96_ll48_pl24_dm512_nh8_el2_dl1_df2048_atprob_fc5_ebtimeF_dtTrue_mxTrue_exp_0/checkpoint.pth"
+
     parser = argparse.ArgumentParser()
-    # Required parameters
-    parser.add_argument(
-        "fairseq_path", type=str, help="bart.large, bart.large.cnn or a path to a model.pt on local filesystem."
-    )
+    parser.add_argument("informer_path", default=None, type=str, help="a path to a model.pth on local filesystem.")
     parser.add_argument("pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
-    parser.add_argument(
-        "--hf_config", default=None, type=str, help="Which huggingface architecture to use: bart-large-xsum"
-    )
     args = parser.parse_args()
-    convert_bart_checkpoint(args.fairseq_path, args.pytorch_dump_folder_path, hf_checkpoint_name=args.hf_config)
\ No newline at end of file
+
+    # convert_informer_checkpoint(args.informer_path, args.pytorch_dump_folder_path)
+    informer = load_informer_checkpoint(informer_checkpoint_default_path)
+    print(informer)

From cb9a287eec253abdfcbde397a5974f6bc19a04b5 Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Mon, 23 Jan 2023 11:18:46 +0000
Subject: [PATCH 026/164] WIP conversion script: change defaults in the config

---
 .../models/informer/configuration_informer.py            | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/transformers/models/informer/configuration_informer.py b/src/transformers/models/informer/configuration_informer.py
index d8dcb137c601..66827f4757ce 100644
--- a/src/transformers/models/informer/configuration_informer.py
+++ b/src/transformers/models/informer/configuration_informer.py
@@ -45,12 +45,12 @@ def __init__(
             cardinality: Optional[List[int]] = None,
             embedding_dimension: Optional[List[int]] = None,
             dim_feedforward: int = 32,  # decoder_ffn_dim & encoder_ffn_dim
-            nhead: int = 2,  # Eli: not sure what the name
+            nhead: int = 8,  # Eli: how many attention heads?
             num_encoder_layers: int = 2,  # encoder_layers
-            num_decoder_layers: int = 2,  # decoder_layers
+            num_decoder_layers: int = 1,  # decoder_layers
             is_encoder_decoder: bool = True,
             activation: str = "gelu",  # activation_function
-            dropout: float = 0.1,
+            dropout: float = 0.05,
             attn: str = "prob",
             factor: int = 5,
             distil: bool = True,
@@ -104,7 +104,7 @@ def __init__(
         self.dim_feedforward = dim_feedforward
         self.activation = activation  # activation_function
         self.dropout = dropout
-        self.attn = attn,
+        self.attn = attn
         self.factor = factor
         self.distil = distil
         self.init_std = init_std
@@ -123,7 +123,6 @@ def _number_of_features(self) -> int:
             + self.input_size  # the log(scale)
         )
 
-
     # @property
     # def _number_of_features(self) -> int:
     #     return (

From 2b8d7e42b8f1800d6f28eb8930cda37a20420886 Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Tue, 24 Jan 2023 13:37:14 +0000
Subject: [PATCH 027/164] WIP conversion script: supporting Informer input
 embedding

---
 .../models/informer/configuration_informer.py |   2 +
 .../models/informer/modeling_informer.py      | 211 ++++++++++++++----
 2 files changed, 175 insertions(+), 38 deletions(-)

diff --git a/src/transformers/models/informer/configuration_informer.py b/src/transformers/models/informer/configuration_informer.py
index 66827f4757ce..e78f5661ae80 100644
--- a/src/transformers/models/informer/configuration_informer.py
+++ b/src/transformers/models/informer/configuration_informer.py
@@ -55,6 +55,7 @@ def __init__(
             factor: int = 5,
             distil: bool = True,
             num_parallel_samples: int = 100,
+            time_features_embedding_type: str = "timeF",  # This can be set to timeF, fixed, learned
             init_std: float = 0.02,
             use_cache=True,
             **kwargs
@@ -109,6 +110,7 @@ def __init__(
         self.distil = distil
         self.init_std = init_std
         self.use_cache = use_cache
+        self.time_features_embedding_type = time_features_embedding_type
 
         # self.param_proj = distr_output.get_args_proj(d_model)
 
diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index ef3a3370bc59..151eb894bc08 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -40,6 +40,7 @@
 from math import sqrt
 from typing import List, Optional
 
+import math
 import numpy as np
 import torch
 import torch.nn as nn
@@ -152,8 +153,137 @@ def forward(self, data: torch.Tensor, observed_indicator: torch.Tensor) -> Tuple
         scale = torch.ones_like(data).mean(dim=self.dim, keepdim=self.keepdim)
         return data, scale
 
+# Eli: all the Embedding classes are from the original informer repository
+# source: https://github.com/zhouhaoyi/Informer2020/blob/main/models/embed.py
+class PositionalEmbedding(nn.Module):
+    def __init__(self, d_model, max_len=5000):
+        super(PositionalEmbedding, self).__init__()
+        # Compute the positional encodings once in log space.
+        pe = torch.zeros(max_len, d_model).float()
+        pe.require_grad = False
+
+        position = torch.arange(0, max_len).float().unsqueeze(1)
+        div_term = (torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)).exp()
+
+        pe[:, 0::2] = torch.sin(position * div_term)
+        pe[:, 1::2] = torch.cos(position * div_term)
+
+        pe = pe.unsqueeze(0)
+        self.register_buffer('pe', pe)
+
+    def forward(self, x):
+        return self.pe[:, :x.size(1)]
+
+
+class TokenEmbedding(nn.Module):
+    def __init__(self, c_in, d_model):
+        super(TokenEmbedding, self).__init__()
+        padding = 1 if torch.__version__ >= '1.5.0' else 2
+        self.tokenConv = nn.Conv1d(in_channels=c_in, out_channels=d_model,
+                                   kernel_size=3, padding=padding, padding_mode='circular')
+        for m in self.modules():
+            if isinstance(m, nn.Conv1d):
+                nn.init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='leaky_relu')
+
+    def forward(self, x):
+        x = self.tokenConv(x.permute(0, 2, 1)).transpose(1, 2)
+        return x
+
+
+class FixedEmbedding(nn.Module):
+    def __init__(self, c_in, d_model):
+        super(FixedEmbedding, self).__init__()
+
+        w = torch.zeros(c_in, d_model).float()
+        w.require_grad = False
+
+        position = torch.arange(0, c_in).float().unsqueeze(1)
+        div_term = (torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)).exp()
+
+        w[:, 0::2] = torch.sin(position * div_term)
+        w[:, 1::2] = torch.cos(position * div_term)
+
+        self.emb = nn.Embedding(c_in, d_model)
+        self.emb.weight = nn.Parameter(w, requires_grad=False)
+
+    def forward(self, x):
+        return self.emb(x).detach()
+
+
+class TemporalEmbedding(nn.Module):
+    def __init__(self, d_model, embed_type='fixed', freq='h'):
+        super(TemporalEmbedding, self).__init__()
+
+        minute_size = 4
+        hour_size = 24
+        weekday_size = 7
+        day_size = 32
+        month_size = 13
+
+        Embed = FixedEmbedding if embed_type == 'fixed' else nn.Embedding
+        if freq == 't':
+            self.minute_embed = Embed(minute_size, d_model)
+        self.hour_embed = Embed(hour_size, d_model)
+        self.weekday_embed = Embed(weekday_size, d_model)
+        self.day_embed = Embed(day_size, d_model)
+        self.month_embed = Embed(month_size, d_model)
+
+    def forward(self, x):
+        x = x.long()
+
+        minute_x = self.minute_embed(x[:, :, 4]) if hasattr(self, 'minute_embed') else 0.
+        hour_x = self.hour_embed(x[:, :, 3])
+        weekday_x = self.weekday_embed(x[:, :, 2])
+        day_x = self.day_embed(x[:, :, 1])
+        month_x = self.month_embed(x[:, :, 0])
+
+        return hour_x + weekday_x + day_x + month_x + minute_x
+
+
+class TimeFeatureEmbedding(nn.Module):
+    def __init__(self, d_model, embed_type='timeF', freq='h'):
+        super(TimeFeatureEmbedding, self).__init__()
+
+        freq_map = {'h': 4, 't': 5, 's': 6, 'm': 1, 'a': 1, 'w': 2, 'd': 3, 'b': 3}
+        d_inp = freq_map[freq]
+        self.embed = nn.Linear(d_inp, d_model)
+
+    def forward(self, x):
+        return self.embed(x)
+
+
+class DataEmbedding(nn.Module):
+    # def __init__(self, c_in, d_model, embed_type='fixed', freq='h', dropout=0.1):
+    #     super(DataEmbedding, self).__init__()
+    #
+    #     self.value_embedding = TokenEmbedding(c_in=c_in, d_model=d_model)
+    #     self.position_embedding = PositionalEmbedding(d_model=d_model)
+    #     self.temporal_embedding = TemporalEmbedding(d_model=d_model, embed_type=embed_type,
+    #                                                 freq=freq) if embed_type != 'timeF' else TimeFeatureEmbedding(
+    #         d_model=d_model, embed_type=embed_type, freq=freq)
+    #
+    #     self.dropout = nn.Dropout(p=dropout)
+
+    def __init__(self, config: InformerConfig):
+        super(DataEmbedding, self).__init__()
+
+        self.value_embedding = TokenEmbedding(c_in=c_in, d_model=config.d_model)
+        self.position_embedding = PositionalEmbedding(d_model=config.d_model)
+        self.temporal_embedding = TemporalEmbedding(d_model=config.d_model,
+                                                    embed_type=config.time_features_embedding_type,
+                                                    freq=freq) if config.time_features_embedding_type != 'timeF' \
+            else TimeFeatureEmbedding(d_model=config.d_model, embed_type=config.embed_type, freq=freq)
+
+        self.dropout = nn.Dropout(p=config.dropout)
+
+    def forward(self, x, x_mark):
+        x = self.value_embedding(x) + self.position_embedding(x) + self.temporal_embedding(x_mark)
+
+        return self.dropout(x)
+
 # Eli: TriangularCausalMask, ProbMask, FullAttention, ProbAttention and AttentionLayer
 # are from the original Informer repository (see the exact source below)
+
 # source: https://github.com/zhouhaoyi/Informer2020/blob/main/utils/masking.py
 class TriangularCausalMask:
     def __init__(self, B, L, device="cpu"):
@@ -408,13 +538,51 @@ def forward(self, x, attn_mask=None):
         return self.norm2(x + y), attn
 
 
+# source: https://github.com/zhouhaoyi/Informer2020/blob/main/models/decoder.py
+class DecoderLayer(nn.Module):
+    def __init__(
+        self,
+        self_attention,
+        cross_attention,
+        d_model,
+        d_ff=None,
+        dropout=0.1,
+        activation="relu",
+    ):
+        super(DecoderLayer, self).__init__()
+        d_ff = d_ff or 4 * d_model
+        self.self_attention = self_attention
+        self.cross_attention = cross_attention
+        self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1)
+        self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1)
+        self.norm1 = nn.LayerNorm(d_model)
+        self.norm2 = nn.LayerNorm(d_model)
+        self.norm3 = nn.LayerNorm(d_model)
+        self.dropout = nn.Dropout(dropout)
+        self.activation = F.relu if activation == "relu" else F.gelu
+
+    def forward(self, x, cross, x_mask=None, cross_mask=None):
+        x = x + self.dropout(self.self_attention(x, x, x, attn_mask=x_mask)[0])
+        x = self.norm1(x)
+
+        x = x + self.dropout(
+            self.cross_attention(x, cross, cross, attn_mask=cross_mask)[0]
+        )
+
+        y = x = self.norm2(x)
+        y = self.dropout(self.activation(self.conv1(y.transpose(-1, 1))))
+        y = self.dropout(self.conv2(y).transpose(-1, 1))
+
+        return self.norm3(x + y)
+
+
 class InformerEncoder(nn.Module):
     def __init__(self, config: InformerConfig):
         super(InformerEncoder, self).__init__()
 
         Attn = ProbAttention if config.attn == "prob" else FullAttention
         self.attn_layers = nn.ModuleList([
-            EncoderLayer(  # Eli question: why I need EncoderLayers here?
+            EncoderLayer(
                     AttentionLayer(
                         Attn(
                             mask_flag=False,
@@ -461,43 +629,6 @@ def forward(self, x, attn_mask=None):
         return x, attns
 
 
-class DecoderLayer(nn.Module):
-    def __init__(
-        self,
-        self_attention,
-        cross_attention,
-        d_model,
-        d_ff=None,
-        dropout=0.1,
-        activation="relu",
-    ):
-        super(DecoderLayer, self).__init__()
-        d_ff = d_ff or 4 * d_model
-        self.self_attention = self_attention
-        self.cross_attention = cross_attention
-        self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1)
-        self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1)
-        self.norm1 = nn.LayerNorm(d_model)
-        self.norm2 = nn.LayerNorm(d_model)
-        self.norm3 = nn.LayerNorm(d_model)
-        self.dropout = nn.Dropout(dropout)
-        self.activation = F.relu if activation == "relu" else F.gelu
-
-    def forward(self, x, cross, x_mask=None, cross_mask=None):
-        x = x + self.dropout(self.self_attention(x, x, x, attn_mask=x_mask)[0])
-        x = self.norm1(x)
-
-        x = x + self.dropout(
-            self.cross_attention(x, cross, cross, attn_mask=cross_mask)[0]
-        )
-
-        y = x = self.norm2(x)
-        y = self.dropout(self.activation(self.conv1(y.transpose(-1, 1))))
-        y = self.dropout(self.conv2(y).transpose(-1, 1))
-
-        return self.norm3(x + y)
-
-
 class InformerDecoder(nn.Module):
     def __init__(self, config: InformerConfig):
         super(InformerDecoder, self).__init__()
@@ -587,6 +718,10 @@ def __init__(self, config: InformerConfig):
             embedding_dims=config.embedding_dimension,
         )
 
+        # Informer time features embeddings
+        self.enc_embedding = DataEmbedding(config)
+        self.dec_embedding = DataEmbedding(config)
+
         # Informer encoder-decoder and mask initializer
         self.encoder = InformerEncoder(config)
         self.decoder = InformerDecoder(config)

From 3fa969a96305f83e305adcc865611dbf24a55778 Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Tue, 24 Jan 2023 14:45:38 +0000
Subject: [PATCH 028/164] WIP conversion script: added parameters for the
 informer embed

---
 .../models/informer/configuration_informer.py | 14 +++++--
 .../models/informer/modeling_informer.py      | 41 +++++++------------
 2 files changed, 26 insertions(+), 29 deletions(-)

diff --git a/src/transformers/models/informer/configuration_informer.py b/src/transformers/models/informer/configuration_informer.py
index e78f5661ae80..c2d602e16afc 100644
--- a/src/transformers/models/informer/configuration_informer.py
+++ b/src/transformers/models/informer/configuration_informer.py
@@ -55,8 +55,12 @@ def __init__(
             factor: int = 5,
             distil: bool = True,
             num_parallel_samples: int = 100,
-            time_features_embedding_type: str = "timeF",  # This can be set to timeF, fixed, learned
             init_std: float = 0.02,
+            embedding_type: str = "timeF",  # This can be set to timeF, fixed, learned
+            d_model: int = 512,  # because of the informer embedding
+            enc_in: int = 7,
+            dec_in: int = 7,
+            freq: str = "h",
             use_cache=True,
             **kwargs
     ):
@@ -98,7 +102,8 @@ def __init__(
         # self.history_length = context_length + max(self.lags_seq) # Eli: I think can be removed
 
         # Transformer architecture configuration
-        self.d_model = self.input_size * len(self.lags_seq) + self._number_of_features
+        # self.d_model = self.input_size * len(self.lags_seq) + self._number_of_features
+        self.d_model = d_model
         self.nhead = nhead
         self.num_encoder_layers = num_encoder_layers  # encoder_layers
         self.num_decoder_layers = num_decoder_layers  # decoder_layers
@@ -110,7 +115,10 @@ def __init__(
         self.distil = distil
         self.init_std = init_std
         self.use_cache = use_cache
-        self.time_features_embedding_type = time_features_embedding_type
+        self.embedding_type = embedding_type
+        self.enc_in = enc_in
+        self.dec_in = dec_in
+        self.freq = freq
 
         # self.param_proj = distr_output.get_args_proj(d_model)
 
diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 151eb894bc08..a831af175a47 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -43,7 +43,6 @@
 import math
 import numpy as np
 import torch
-import torch.nn as nn
 import torch.nn.functional as F
 
 logger = logging.get_logger(__name__)
@@ -253,28 +252,14 @@ def forward(self, x):
 
 
 class DataEmbedding(nn.Module):
-    # def __init__(self, c_in, d_model, embed_type='fixed', freq='h', dropout=0.1):
-    #     super(DataEmbedding, self).__init__()
-    #
-    #     self.value_embedding = TokenEmbedding(c_in=c_in, d_model=d_model)
-    #     self.position_embedding = PositionalEmbedding(d_model=d_model)
-    #     self.temporal_embedding = TemporalEmbedding(d_model=d_model, embed_type=embed_type,
-    #                                                 freq=freq) if embed_type != 'timeF' else TimeFeatureEmbedding(
-    #         d_model=d_model, embed_type=embed_type, freq=freq)
-    #
-    #     self.dropout = nn.Dropout(p=dropout)
-
-    def __init__(self, config: InformerConfig):
+    def __init__(self, c_in, d_model, embed_type='fixed', freq='h', dropout=0.1):
         super(DataEmbedding, self).__init__()
 
-        self.value_embedding = TokenEmbedding(c_in=c_in, d_model=config.d_model)
-        self.position_embedding = PositionalEmbedding(d_model=config.d_model)
-        self.temporal_embedding = TemporalEmbedding(d_model=config.d_model,
-                                                    embed_type=config.time_features_embedding_type,
-                                                    freq=freq) if config.time_features_embedding_type != 'timeF' \
-            else TimeFeatureEmbedding(d_model=config.d_model, embed_type=config.embed_type, freq=freq)
+        self.value_embedding = TokenEmbedding(c_in=c_in, d_model=d_model)
+        self.position_embedding = PositionalEmbedding(d_model=d_model)
+        self.temporal_embedding = TemporalEmbedding(d_model=d_model, embed_type=embed_type, freq=freq) if embed_type!='timeF' else TimeFeatureEmbedding(d_model=d_model, embed_type=embed_type, freq=freq)
 
-        self.dropout = nn.Dropout(p=config.dropout)
+        self.dropout = nn.Dropout(p=dropout)
 
     def forward(self, x, x_mark):
         x = self.value_embedding(x) + self.position_embedding(x) + self.temporal_embedding(x_mark)
@@ -713,14 +698,18 @@ def __init__(self, config: InformerConfig):
         else:
             self.scaler = NOPScaler(dim=1, keepdim=True)
 
-        self.embedder = FeatureEmbedder(
-            cardinalities=config.cardinality,
-            embedding_dims=config.embedding_dimension,
-        )
+        # Eli: it's not clear if the model will use the embedding of the paper,
+        # or the embedding from glounTS. Let's wait for HF review :)
+
+        # self.embedder = FeatureEmbedder(
+        #     cardinalities=config.cardinality,
+        #     embedding_dims=config.embedding_dimension,
+        # )
 
         # Informer time features embeddings
-        self.enc_embedding = DataEmbedding(config)
-        self.dec_embedding = DataEmbedding(config)
+        # source: https://github.com/zhouhaoyi/Informer2020/blob/main/models/model.py#L23
+        self.enc_embedding = DataEmbedding(config.enc_in, config.d_model, config.embedding_type, config.freq, config.dropout)
+        self.dec_embedding = DataEmbedding(config.enc_in, config.d_model, config.embedding_type, config.freq, config.dropout)
 
         # Informer encoder-decoder and mask initializer
         self.encoder = InformerEncoder(config)

From 49170d873ecb4013104f9dc38f3701b76fedf159 Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Wed, 25 Jan 2023 11:27:44 +0000
Subject: [PATCH 029/164] WIP conversion script: change dim_feedforward=2048

---
 src/transformers/models/informer/configuration_informer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/informer/configuration_informer.py b/src/transformers/models/informer/configuration_informer.py
index c2d602e16afc..af408d8e99b3 100644
--- a/src/transformers/models/informer/configuration_informer.py
+++ b/src/transformers/models/informer/configuration_informer.py
@@ -44,7 +44,7 @@ def __init__(
             num_feat_static_cat: int = 0,  # num_static_categorical_features
             cardinality: Optional[List[int]] = None,
             embedding_dimension: Optional[List[int]] = None,
-            dim_feedforward: int = 32,  # decoder_ffn_dim & encoder_ffn_dim
+            dim_feedforward: int = 2048,  # decoder_ffn_dim & encoder_ffn_dim
             nhead: int = 8,  # Eli: how much attention heads?
             num_encoder_layers: int = 2,  # encoder_layers
             num_decoder_layers: int = 1,  # decoder_layers

From 585e40b5112d984d5cedc746bd256889ed0b94ef Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Wed, 25 Jan 2023 12:30:19 +0000
Subject: [PATCH 030/164] WIP conversion script: remove unused args for loading
 checkpoint

---
 ...nal_colab_pytorch_checkpoint_to_pytorch.py | 34 +++++++++++--------
 1 file changed, 19 insertions(+), 15 deletions(-)

diff --git a/src/transformers/models/informer/convert_informer_original_colab_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/informer/convert_informer_original_colab_pytorch_checkpoint_to_pytorch.py
index 983cf575cb72..5dc032e6203f 100644
--- a/src/transformers/models/informer/convert_informer_original_colab_pytorch_checkpoint_to_pytorch.py
+++ b/src/transformers/models/informer/convert_informer_original_colab_pytorch_checkpoint_to_pytorch.py
@@ -24,8 +24,6 @@
 
 See also: https://github.com/elisim/Informer2020/blob/hf/create_checkpoint_from_offical_colab.ipynb
 """
-
-
 import argparse
 import os
 from pathlib import Path
@@ -57,6 +55,12 @@
 
 
 def _create_informer_args():
+    """
+    Arguments are taken from the offical colab example:
+    https://colab.research.google.com/drive/1_X7O2BkFLvqyCdZzDZvV2MB0aAvYALLC
+
+    I only comment arguments that are not needed for the model creation (e.g. data_path, use_gpu)
+    """
     class dotdict(dict):
         """dot.notation access to dictionary attributes"""
         __getattr__ = dict.get
@@ -67,10 +71,10 @@ class dotdict(dict):
 
     ### BoilerCode
     args.model = 'informer'  # model of experiment, options: [informer, informerstack, informerlight(TBD)]
-    args.data = 'ETTh1'  # data
-    args.root_path = './ETDataset/ETT-small/'  # root path of data file
-    args.data_path = 'ETTh1.csv'  # data file
-    args.checkpoints = './informer_checkpoints'  # location of model checkpoints
+    # args.data = 'ETTh1'  # data
+    # args.root_path = './ETDataset/ETT-small/'  # root path of data file
+    # args.data_path = 'ETTh1.csv'  # data file
+    # args.checkpoints = './informer_checkpoints'  # location of model checkpoints
 
     ### TS
     args.features = 'M'  # forecasting task, options:[M, S, MS]
@@ -117,11 +121,11 @@ class dotdict(dict):
     args.patience = 3
     args.des = 'exp'
 
-    args.use_gpu = False  # True if torch.cuda.is_available() else False
-    args.gpu = 0
-
-    args.use_multi_gpu = False
-    args.devices = '0,1,2,3'
+    # args.use_gpu = False  # True if torch.cuda.is_available() else False
+    # args.gpu = 0
+    #
+    # args.use_multi_gpu = False
+    # args.devices = '0,1,2,3'
 
     args.detail_freq = args.freq  # the actual freq
     args.freq = args.freq[-1:]  # Not important
@@ -209,10 +213,10 @@ def make_linear_from_emb(emb):
 if __name__ == "__main__":
     informer_checkpoint_default_path = "./Informer2020/informer_checkpoints/informer_ETTh1_ftM_sl96_ll48_pl24_dm512_nh8_el2_dl1_df2048_atprob_fc5_ebtimeF_dtTrue_mxTrue_exp_0/checkpoint.pth"
 
-    parser = argparse.ArgumentParser()
-    parser.add_argument("informer_path", default=None, type=str, help="a path to a model.pth on local filesystem.")
-    parser.add_argument("pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
-    args = parser.parse_args()
+    # parser = argparse.ArgumentParser()
+    # parser.add_argument("informer_path", default=None, type=str, help="a path to a model.pth on local filesystem.")
+    # parser.add_argument("pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
+    # args = parser.parse_args()
 
     # convert_informer_checkpoint(args.informer_path, args.pytorch_dump_folder_path)
     informer = load_informer_checkpoint(informer_checkpoint_default_path)

From 824f48eb8e5aa5953111cb90d4277016c5f12de0 Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Wed, 25 Jan 2023 12:43:32 +0000
Subject: [PATCH 031/164] just cleaning up

---
 .../models/informer/modeling_informer.py      | 79 ++++++++-----------
 1 file changed, 33 insertions(+), 46 deletions(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index a831af175a47..1b7c1be4720e 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -718,18 +718,9 @@ def __init__(self, config: InformerConfig):
         # Initialize weights and apply final processing
         self.post_init()
 
-    @property
-    def _number_of_features(self) -> int:
-        return (
-            sum(self.embedding_dimension)
-            + self.num_feat_dynamic_real
-            + self.num_feat_static_real
-            + self.input_size  # the log(scale)
-        )
-
     @property
     def _past_length(self) -> int:
-        return self.context_length + max(self.lags_seq)
+        return self.config.context_length + max(self.config.lags_seq)
 
     def get_lagged_subsequences(
         self, sequence: torch.Tensor, subsequences_length: int, shift: int = 0
@@ -767,44 +758,27 @@ def get_lagged_subsequences(
             lagged_values.append(sequence[:, begin_index:end_index, ...])
         return torch.stack(lagged_values, dim=-1)
 
-    def _check_shapes(
-        self,
-        prior_input: torch.Tensor,
-        inputs: torch.Tensor,
-        features: Optional[torch.Tensor],
-    ) -> None:
-        assert len(prior_input.shape) == len(inputs.shape)
-        assert (
-            len(prior_input.shape) == 2 and self.input_size == 1
-        ) or prior_input.shape[2] == self.input_size
-        assert (len(inputs.shape) == 2 and self.input_size == 1) or inputs.shape[
-            -1
-        ] == self.input_size
-        assert (
-            features is None or features.shape[2] == self._number_of_features
-        ), f"{features.shape[2]}, expected {self._number_of_features}"
-
     def create_network_inputs(
         self,
         feat_static_cat: torch.Tensor,
         feat_static_real: torch.Tensor,
-        past_time_feat: torch.Tensor,
+        past_time_features: torch.Tensor,
         past_target: torch.Tensor,
         past_observed_values: torch.Tensor,
-        future_time_feat: Optional[torch.Tensor] = None,
+        future_time_features: Optional[torch.Tensor] = None,
         future_target: Optional[torch.Tensor] = None,
     ):
         # time feature
         time_feat = (
             torch.cat(
                 (
-                    past_time_feat[:, self._past_length - self.context_length :, ...],
-                    future_time_feat,
+                    past_time_features[:, self._past_length - self.config.context_length:, ...],
+                    future_time_features,
                 ),
                 dim=1,
             )
             if future_target is not None
-            else past_time_feat[:, self._past_length - self.context_length :, ...]
+            else past_time_features[:, self._past_length - self.context_length:, ...]
         )
 
         # target
@@ -844,8 +818,6 @@ def create_network_inputs(
 
         features = torch.cat((expanded_static_feat, time_feat), dim=-1)
 
-        # self._check_shapes(prior_input, inputs, features)
-
         # sequence = torch.cat((prior_input, inputs), dim=1)
         lagged_sequence = self.get_lagged_subsequences(
             sequence=inputs,
@@ -861,23 +833,20 @@ def create_network_inputs(
 
         return transformer_inputs, scale, static_feat
 
-    def output_params(self, transformer_inputs):
-        enc_input = transformer_inputs[:, : self.context_length, ...]
-        dec_input = transformer_inputs[:, self.context_length :, ...]
+    def enc_dec_outputs(self, transformer_inputs):
+        enc_input = transformer_inputs[:, : self.config.context_length, ...]
+        dec_input = transformer_inputs[:, self.config.context_length :, ...]
 
         enc_out, _ = self.encoder(enc_input)
         dec_output = self.decoder(dec_input, enc_out)
 
         return self.param_proj(dec_output)
 
-    @torch.jit.ignore
-    def output_distribution(
-        self, params, scale=None, trailing_n=None
-    ) -> torch.distributions.Distribution:
-        sliced_params = params
-        if trailing_n is not None:
-            sliced_params = [p[:, -trailing_n:] for p in params]
-        return self.distr_output.distribution(sliced_params, scale=scale)
+    def get_encoder(self):
+        return self.encoder
+
+    def get_decoder(self):
+        return self.decoder
 
     # for prediction
     def forward(
@@ -929,7 +898,6 @@ def forward(
 
         # greedy decoding
         for k in range(self.prediction_length):
-            # self._check_shapes(repeated_past_target, next_sample, next_features)
             # sequence = torch.cat((repeated_past_target, next_sample), dim=1)
 
             lagged_sequence = self.get_lagged_subsequences(
@@ -962,3 +930,22 @@ def forward(
         return concat_future_samples.reshape(
             (-1, self.num_parallel_samples, self.prediction_length) + self.target_shape,
         )
+
+
+class InformerForPrediction(InformerPreTrainedModel):
+    def __init__(self):
+        pass
+
+    @torch.jit.ignore
+    def output_distribution(self, params, scale=None, trailing_n=None) -> torch.distributions.Distribution:
+        sliced_params = params
+        if trailing_n is not None:
+            sliced_params = [p[:, -trailing_n:] for p in params]
+        return self.distr_output.distribution(sliced_params, scale=scale)
+
+
+
+class InformerForPointPrediction(InformerPreTrainedModel):
+    pass
+
+

From 878bd79e17356c37de448a7b3bd869ec93546104 Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Thu, 26 Jan 2023 13:33:18 +0000
Subject: [PATCH 032/164] DataEmbedding removed, after thinking with Kashif

---
 .../models/informer/configuration_informer.py |   8 --
 .../models/informer/modeling_informer.py      | 130 +-----------------
 2 files changed, 4 insertions(+), 134 deletions(-)

diff --git a/src/transformers/models/informer/configuration_informer.py b/src/transformers/models/informer/configuration_informer.py
index af408d8e99b3..858afd2c5986 100644
--- a/src/transformers/models/informer/configuration_informer.py
+++ b/src/transformers/models/informer/configuration_informer.py
@@ -56,11 +56,7 @@ def __init__(
             distil: bool = True,
             num_parallel_samples: int = 100,
             init_std: float = 0.02,
-            embedding_type: str = "timeF",  # This can be set to timeF, fixed, learned
             d_model: int = 512,  # because of the informer embedding
-            enc_in: int = 7,
-            dec_in: int = 7,
-            freq: str = "h",
             use_cache=True,
             **kwargs
     ):
@@ -115,10 +111,6 @@ def __init__(
         self.distil = distil
         self.init_std = init_std
         self.use_cache = use_cache
-        self.embedding_type = embedding_type
-        self.enc_in = enc_in
-        self.dec_in = dec_in
-        self.freq = freq
 
         # self.param_proj = distr_output.get_args_proj(d_model)
 
diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 1b7c1be4720e..d5ed2adf2422 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -152,120 +152,6 @@ def forward(self, data: torch.Tensor, observed_indicator: torch.Tensor) -> Tuple
         scale = torch.ones_like(data).mean(dim=self.dim, keepdim=self.keepdim)
         return data, scale
 
-# Eli: all the Embedding classes are from the original informer repository
-# source: https://github.com/zhouhaoyi/Informer2020/blob/main/models/embed.py
-class PositionalEmbedding(nn.Module):
-    def __init__(self, d_model, max_len=5000):
-        super(PositionalEmbedding, self).__init__()
-        # Compute the positional encodings once in log space.
-        pe = torch.zeros(max_len, d_model).float()
-        pe.require_grad = False
-
-        position = torch.arange(0, max_len).float().unsqueeze(1)
-        div_term = (torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)).exp()
-
-        pe[:, 0::2] = torch.sin(position * div_term)
-        pe[:, 1::2] = torch.cos(position * div_term)
-
-        pe = pe.unsqueeze(0)
-        self.register_buffer('pe', pe)
-
-    def forward(self, x):
-        return self.pe[:, :x.size(1)]
-
-
-class TokenEmbedding(nn.Module):
-    def __init__(self, c_in, d_model):
-        super(TokenEmbedding, self).__init__()
-        padding = 1 if torch.__version__ >= '1.5.0' else 2
-        self.tokenConv = nn.Conv1d(in_channels=c_in, out_channels=d_model,
-                                   kernel_size=3, padding=padding, padding_mode='circular')
-        for m in self.modules():
-            if isinstance(m, nn.Conv1d):
-                nn.init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='leaky_relu')
-
-    def forward(self, x):
-        x = self.tokenConv(x.permute(0, 2, 1)).transpose(1, 2)
-        return x
-
-
-class FixedEmbedding(nn.Module):
-    def __init__(self, c_in, d_model):
-        super(FixedEmbedding, self).__init__()
-
-        w = torch.zeros(c_in, d_model).float()
-        w.require_grad = False
-
-        position = torch.arange(0, c_in).float().unsqueeze(1)
-        div_term = (torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)).exp()
-
-        w[:, 0::2] = torch.sin(position * div_term)
-        w[:, 1::2] = torch.cos(position * div_term)
-
-        self.emb = nn.Embedding(c_in, d_model)
-        self.emb.weight = nn.Parameter(w, requires_grad=False)
-
-    def forward(self, x):
-        return self.emb(x).detach()
-
-
-class TemporalEmbedding(nn.Module):
-    def __init__(self, d_model, embed_type='fixed', freq='h'):
-        super(TemporalEmbedding, self).__init__()
-
-        minute_size = 4
-        hour_size = 24
-        weekday_size = 7
-        day_size = 32
-        month_size = 13
-
-        Embed = FixedEmbedding if embed_type == 'fixed' else nn.Embedding
-        if freq == 't':
-            self.minute_embed = Embed(minute_size, d_model)
-        self.hour_embed = Embed(hour_size, d_model)
-        self.weekday_embed = Embed(weekday_size, d_model)
-        self.day_embed = Embed(day_size, d_model)
-        self.month_embed = Embed(month_size, d_model)
-
-    def forward(self, x):
-        x = x.long()
-
-        minute_x = self.minute_embed(x[:, :, 4]) if hasattr(self, 'minute_embed') else 0.
-        hour_x = self.hour_embed(x[:, :, 3])
-        weekday_x = self.weekday_embed(x[:, :, 2])
-        day_x = self.day_embed(x[:, :, 1])
-        month_x = self.month_embed(x[:, :, 0])
-
-        return hour_x + weekday_x + day_x + month_x + minute_x
-
-
-class TimeFeatureEmbedding(nn.Module):
-    def __init__(self, d_model, embed_type='timeF', freq='h'):
-        super(TimeFeatureEmbedding, self).__init__()
-
-        freq_map = {'h': 4, 't': 5, 's': 6, 'm': 1, 'a': 1, 'w': 2, 'd': 3, 'b': 3}
-        d_inp = freq_map[freq]
-        self.embed = nn.Linear(d_inp, d_model)
-
-    def forward(self, x):
-        return self.embed(x)
-
-
-class DataEmbedding(nn.Module):
-    def __init__(self, c_in, d_model, embed_type='fixed', freq='h', dropout=0.1):
-        super(DataEmbedding, self).__init__()
-
-        self.value_embedding = TokenEmbedding(c_in=c_in, d_model=d_model)
-        self.position_embedding = PositionalEmbedding(d_model=d_model)
-        self.temporal_embedding = TemporalEmbedding(d_model=d_model, embed_type=embed_type, freq=freq) if embed_type!='timeF' else TimeFeatureEmbedding(d_model=d_model, embed_type=embed_type, freq=freq)
-
-        self.dropout = nn.Dropout(p=dropout)
-
-    def forward(self, x, x_mark):
-        x = self.value_embedding(x) + self.position_embedding(x) + self.temporal_embedding(x_mark)
-
-        return self.dropout(x)
-
 # Eli: TriangularCausalMask, ProbMask, FullAttention, ProbAttention and AttentionLayer
 # are from the original Informer repository (see the exact source below)
 
@@ -698,18 +584,10 @@ def __init__(self, config: InformerConfig):
         else:
             self.scaler = NOPScaler(dim=1, keepdim=True)
 
-        # Eli: it's not clear if the model will use the embedding of the paper,
-        # or the embedding from glounTS. Let's wait for HF review :)
-
-        # self.embedder = FeatureEmbedder(
-        #     cardinalities=config.cardinality,
-        #     embedding_dims=config.embedding_dimension,
-        # )
-
-        # Informer time features embeddings
-        # source: https://github.com/zhouhaoyi/Informer2020/blob/main/models/model.py#L23
-        self.enc_embedding = DataEmbedding(config.enc_in, config.d_model, config.embedding_type, config.freq, config.dropout)
-        self.dec_embedding = DataEmbedding(config.enc_in, config.d_model, config.embedding_type, config.freq, config.dropout)
+        self.embedder = FeatureEmbedder(
+            cardinalities=config.cardinality,
+            embedding_dims=config.embedding_dimension,
+        )
 
         # Informer encoder-decoder and mask initializer
         self.encoder = InformerEncoder(config)

From 1ce431017150fa998abc0299a27554c133180e53 Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Thu, 26 Jan 2023 15:31:33 +0000
Subject: [PATCH 033/164] working on forward pass

---
 .../informer/check_instantiate_works.py       | 43 +++++++++++++++++--
 .../models/informer/modeling_informer.py      | 15 +++----
 2 files changed, 46 insertions(+), 12 deletions(-)

diff --git a/src/transformers/models/informer/check_instantiate_works.py b/src/transformers/models/informer/check_instantiate_works.py
index fbb6b151f79c..6fb3619d25cb 100644
--- a/src/transformers/models/informer/check_instantiate_works.py
+++ b/src/transformers/models/informer/check_instantiate_works.py
@@ -1,8 +1,43 @@
-from transformers import InformerModel, InformerConfig
-from gluonts.time_feature import get_lags_for_frequency
+from transformers import InformerModel, InformerConfig, TimeSeriesTransformerModel
+from gluonts.time_feature import get_lags_for_frequency, time_features_from_frequency_str
+
+from huggingface_hub import hf_hub_download
+import torch
+
 
 if __name__ == '__main__':
-    freq = "h"
+    freq = "1M"
+    prediction_length = 24
     lags = get_lags_for_frequency(freq_str=freq)
-    model = InformerModel(InformerConfig(lags_seq=lags))
+    time_features = time_features_from_frequency_str(freq)
+
+    config = InformerConfig(prediction_length=prediction_length,
+                            context_length=prediction_length*3,
+                            lags_seq=lags,
+                            num_time_features=len(time_features) + 1,
+                            num_static_categorical_features=1,
+                            cardinality=[366],
+                            embedding_dimension=[2],
+                            encoder_layers=4,
+                            decoder_layers=4)
+
+    model: InformerModel = InformerModel(config)
     print(model)
+    file = hf_hub_download(
+        repo_id="kashif/tourism-monthly-batch", filename="train-batch.pt", repo_type="dataset"
+    )
+    batch = torch.load(file)
+
+    # during training, one provides both past and future values
+    # as well as possible additional features
+    outputs = model(
+        past_values=batch["past_values"],
+        past_time_features=batch["past_time_features"],
+        past_observed_mask=batch["past_observed_mask"],
+        static_categorical_features=batch["static_categorical_features"],
+        static_real_features=batch["static_real_features"],
+        future_values=batch["future_values"],
+        future_time_features=batch["future_time_features"],
+    )
+
+    last_hidden_state = outputs.last_hidden_state
diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index d5ed2adf2422..a2678edc4385 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -726,14 +726,13 @@ def get_encoder(self):
     def get_decoder(self):
         return self.decoder
 
-    # for prediction
     def forward(
         self,
-        feat_static_cat: torch.Tensor,
-        feat_static_real: torch.Tensor,
+        past_values: torch.Tensor,
+        static_categorical_features: torch.Tensor,
+        static_real_features: torch.Tensor,
         past_time_feat: torch.Tensor,
         past_target: torch.Tensor,
-        past_observed_values: torch.Tensor,
         future_time_feat: torch.Tensor,
         num_parallel_samples: Optional[int] = None,
     ) -> torch.Tensor:
@@ -742,11 +741,11 @@ def forward(
             num_parallel_samples = self.num_parallel_samples
 
         encoder_inputs, scale, static_feat = self.create_network_inputs(
-            feat_static_cat,
-            feat_static_real,
+            static_categorical_features,
+            static_real_features,
             past_time_feat,
             past_target,
-            past_observed_values,
+            past_values,
         )
 
         enc_out, _ = self.encoder(encoder_inputs)
@@ -808,6 +807,7 @@ def forward(
         return concat_future_samples.reshape(
             (-1, self.num_parallel_samples, self.prediction_length) + self.target_shape,
         )
+    # for prediction
 
 
 class InformerForPrediction(InformerPreTrainedModel):
@@ -822,7 +822,6 @@ def output_distribution(self, params, scale=None, trailing_n=None) -> torch.dist
         return self.distr_output.distribution(sliced_params, scale=scale)
 
 
-
 class InformerForPointPrediction(InformerPreTrainedModel):
     pass
 

From 4568c0c57fd2ba2da884beeb721ed09eddc84499 Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Fri, 27 Jan 2023 16:13:31 +0000
Subject: [PATCH 034/164] WIP forward pass: trying to establish working batch
 for forward pass

---
 .../informer/check_instantiate_works.py       |  44 +-
 .../models/informer/configuration_informer.py |  14 +-
 .../models/informer/modeling_informer.py      | 722 +++++++++++++++---
 3 files changed, 656 insertions(+), 124 deletions(-)

diff --git a/src/transformers/models/informer/check_instantiate_works.py b/src/transformers/models/informer/check_instantiate_works.py
index 6fb3619d25cb..3d95e9d28f1c 100644
--- a/src/transformers/models/informer/check_instantiate_works.py
+++ b/src/transformers/models/informer/check_instantiate_works.py
@@ -1,4 +1,5 @@
-from transformers import InformerModel, InformerConfig, TimeSeriesTransformerModel
+from transformers import InformerModel, InformerConfig, TimeSeriesTransformerForPrediction, TimeSeriesTransformerModel, \
+    TimeSeriesTransformerConfig
 from gluonts.time_feature import get_lags_for_frequency, time_features_from_frequency_str
 
 from huggingface_hub import hf_hub_download
@@ -11,18 +12,33 @@
     lags = get_lags_for_frequency(freq_str=freq)
     time_features = time_features_from_frequency_str(freq)
 
-    config = InformerConfig(prediction_length=prediction_length,
-                            context_length=prediction_length*3,
-                            lags_seq=lags,
-                            num_time_features=len(time_features) + 1,
-                            num_static_categorical_features=1,
-                            cardinality=[366],
-                            embedding_dimension=[2],
-                            encoder_layers=4,
-                            decoder_layers=4)
-
-    model: InformerModel = InformerModel(config)
-    print(model)
+    # config = InformerConfig(prediction_length=prediction_length,
+    #                         context_length=prediction_length*3,
+    #                         lags_seq=lags,
+    #                         num_time_features=len(time_features) + 1,
+    #                         num_static_categorical_features=1,
+    #                         cardinality=[366],
+    #                         embedding_dimension=[2],
+    #                         encoder_layers=4,
+    #                         decoder_layers=4)
+    # model = InformerModel(config)
+
+    config = TimeSeriesTransformerConfig(
+        prediction_length=prediction_length,
+        context_length=prediction_length * 3,  # context length
+        lags_sequence=lags,
+        num_time_features=len(time_features) + 1,  # we'll add 2 time features ("month of year" and "age", see further)
+        num_static_categorical_features=1,  # we have a single static categorical feature, namely time series ID
+        cardinality=[366],  # it has 366 possible values
+        embedding_dimension=[2],  # the model will learn an embedding of size 2 for each of the 366 possible values
+        encoder_layers=4,
+        decoder_layers=4,
+    )
+    model = TimeSeriesTransformerModel(config)
+    # model.eval()
+
+    # model = TimeSeriesTransformerModel.from_pretrained("huggingface/time-series-transformer-tourism-monthly")
+
     file = hf_hub_download(
         repo_id="kashif/tourism-monthly-batch", filename="train-batch.pt", repo_type="dataset"
     )
@@ -40,4 +56,4 @@
         future_time_features=batch["future_time_features"],
     )
 
-    last_hidden_state = outputs.last_hidden_state
+    print(outputs.last_hidden_state.shape)
diff --git a/src/transformers/models/informer/configuration_informer.py b/src/transformers/models/informer/configuration_informer.py
index 858afd2c5986..6e8e7c90d588 100644
--- a/src/transformers/models/informer/configuration_informer.py
+++ b/src/transformers/models/informer/configuration_informer.py
@@ -36,7 +36,7 @@ def __init__(
             input_size: int = 1,
             prediction_length: Optional[int] = None,
             context_length: Optional[int] = None,
-            distr_output: str = "student_t",
+            distribution_output: str = "student_t",
             lags_seq: Optional[List[int]] = None,  # used to be freq.
             scaling: bool = True,
             num_feat_dynamic_real: int = 0,  # num_dynamic_real_features
@@ -44,10 +44,10 @@ def __init__(
             num_feat_static_cat: int = 0,  # num_static_categorical_features
             cardinality: Optional[List[int]] = None,
             embedding_dimension: Optional[List[int]] = None,
-            dim_feedforward: int = 2048,  # decoder_ffn_dim & encoder_ffn_dim
-            nhead: int = 8,  # Eli: how much attention heads?
+            dim_feedforward: int = 32,  # decoder_ffn_dim & encoder_ffn_dim
+            nhead: int = 2,  # Eli: how much attention heads?
             num_encoder_layers: int = 2,  # encoder_layers
-            num_decoder_layers: int = 1,  # decoder_layers
+            num_decoder_layers: int = 2,  # decoder_layers
             is_encoder_decoder: bool = True,
             activation: str = "gelu",  # activation_function
             dropout: float = 0.05,
@@ -56,14 +56,13 @@ def __init__(
             distil: bool = True,
             num_parallel_samples: int = 100,
             init_std: float = 0.02,
-            d_model: int = 512,  # because of the informer embedding
             use_cache=True,
             **kwargs
     ):
         # time series specific configuration
         self.prediction_length = prediction_length
         self.context_length = context_length or prediction_length
-        self.distr_output = distr_output  # Eli: change to distribution_output
+        self.distribution_output = distribution_output
         # self.loss = loss # Eli: From vanilla ts transformer
         self.input_size = input_size
         # self.target_shape = distr_output.event_shape  # Eli: I think can be removed
@@ -98,8 +97,7 @@ def __init__(
         # self.history_length = context_length + max(self.lags_seq) # Eli: I think can be removed
 
         # Transformer architecture configuration
-        # self.d_model = self.input_size * len(self.lags_seq) + self._number_of_features
-        self.d_model = d_model
+        self.d_model = self.input_size * len(self.lags_seq) + self._number_of_features
         self.nhead = nhead
         self.num_encoder_layers = num_encoder_layers  # encoder_layers
         self.num_decoder_layers = num_decoder_layers  # decoder_layers
diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index a2678edc4385..0c0ce3d53454 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -50,6 +50,285 @@
 _CONFIG_FOR_DOC = "InformerConfig"
 
 
+class NegativeLogLikelihood:
+    """
+    Computes the negative log likelihood loss from input distribution with respect to target.
+    """
+
+    def __call__(self, input: torch.distributions.Distribution, target: torch.Tensor) -> torch.Tensor:
+        return -input.log_prob(target)
+
+
+class DistributionOutput:
+    distribution_class: type
+    in_features: int
+    args_dim: Dict[str, int]
+
+    def __init__(self, dim: int = 1) -> None:
+        self.dim = dim
+        self.args_dim = {k: dim * self.args_dim[k] for k in self.args_dim}
+
+    def _base_distribution(self, distr_args):
+        if self.dim == 1:
+            return self.distribution_class(*distr_args)
+        else:
+            return Independent(self.distribution_class(*distr_args), 1)
+
+    def distribution(
+        self,
+        distr_args,
+        loc: Optional[torch.Tensor] = None,
+        scale: Optional[torch.Tensor] = None,
+    ) -> Distribution:
+        distr = self._base_distribution(distr_args)
+        if loc is None and scale is None:
+            return distr
+        else:
+            return AffineTransformed(distr, loc=loc, scale=scale, event_dim=self.event_dim)
+
+    @property
+    def event_shape(self) -> Tuple:
+        r"""
+        Shape of each individual event contemplated by the distributions that this object constructs.
+        """
+        return () if self.dim == 1 else (self.dim,)
+
+    @property
+    def event_dim(self) -> int:
+        r"""
+        Number of event dimensions, i.e., length of the `event_shape` tuple, of the distributions that this object
+        constructs.
+        """
+        return len(self.event_shape)
+
+    @property
+    def value_in_support(self) -> float:
+        r"""
+        A float that will have a valid numeric value when computing the log-loss of the corresponding distribution. By
+        default 0.0. This value will be used when padding data series.
+        """
+        return 0.0
+
+    def get_parameter_projection(self, in_features: int) -> nn.Module:
+        r"""
+        Return the parameter projection layer that maps the input to the appropriate parameters of the distribution.
+        """
+        return ParameterProjection(
+            in_features=in_features,
+            args_dim=self.args_dim,
+            domain_map=LambdaLayer(self.domain_map),
+        )
+
+    def domain_map(self, *args: torch.Tensor):
+        r"""
+        Converts arguments to the right shape and domain. The domain depends on the type of distribution, while the
+        correct shape is obtained by reshaping the trailing axis in such a way that the returned tensors define a
+        distribution of the right event_shape.
+        """
+        raise NotImplementedError()
+
+    @classmethod
+    def squareplus(cls, x: torch.Tensor) -> torch.Tensor:
+        r"""
+        Helper to map inputs to the positive orthant by applying the square-plus operation. Reference:
+        https://twitter.com/jon_barron/status/1387167648669048833
+        """
+        return (x + torch.sqrt(torch.square(x) + 4.0)) / 2.0
+
+
+class StudentTOutput(DistributionOutput):
+    args_dim: Dict[str, int] = {"df": 1, "loc": 1, "scale": 1}
+    distribution_class: type = StudentT
+
+    @classmethod
+    def domain_map(cls, df: torch.Tensor, loc: torch.Tensor, scale: torch.Tensor):
+        scale = cls.squareplus(scale)
+        df = 2.0 + cls.squareplus(df)
+        return df.squeeze(-1), loc.squeeze(-1), scale.squeeze(-1)
+
+
+class NormalOutput(DistributionOutput):
+    args_dim: Dict[str, int] = {"loc": 1, "scale": 1}
+    distribution_class: type = Normal
+
+    @classmethod
+    def domain_map(cls, loc: torch.Tensor, scale: torch.Tensor):
+        scale = cls.squareplus(scale)
+        return loc.squeeze(-1), scale.squeeze(-1)
+
+
+class NegativeBinomialOutput(DistributionOutput):
+    args_dim: Dict[str, int] = {"total_count": 1, "logits": 1}
+    distribution_class: type = NegativeBinomial
+
+    @classmethod
+    def domain_map(cls, total_count: torch.Tensor, logits: torch.Tensor):
+        total_count = cls.squareplus(total_count)
+        return total_count.squeeze(-1), logits.squeeze(-1)
+
+    def _base_distribution(self, distr_args) -> Distribution:
+        total_count, logits = distr_args
+        if self.dim == 1:
+            return self.distribution_class(total_count=total_count, logits=logits)
+        else:
+            return Independent(self.distribution_class(total_count=total_count, logits=logits), 1)
+
+    # Overwrites the parent class method. We cannot scale using the affine
+    # transformation since negative binomial should return integers. Instead
+    # we scale the parameters.
+    def distribution(
+        self, distr_args, loc: Optional[torch.Tensor] = None, scale: Optional[torch.Tensor] = None
+    ) -> Distribution:
+        total_count, logits = distr_args
+
+        if scale is not None:
+            # See scaling property of Gamma.
+            logits += scale.log()
+
+        return self._base_distribution((total_count, logits))
+
+
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer
+@dataclass
+class Seq2SeqTimeSeriesModelOutput(ModelOutput):
+    """
+    Base class for model encoder's outputs that also contains pre-computed hidden states that can speed up sequential
+    decoding.
+
+    Args:
+        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+            Sequence of hidden-states at the output of the last layer of the decoder of the model.
+
+            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
+            hidden_size)` is output.
+        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
+            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+
+            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the decoder at the output of each layer plus the optional initial embedding outputs.
+        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
+            self-attention heads.
+        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
+            weighted average in the cross-attention heads.
+        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the encoder of the model.
+        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the encoder at the output of each layer plus the optional initial embedding outputs.
+        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
+            self-attention heads.
+        scale: (`torch.FloatTensor` of shape `(batch_size,)`, *optional*):
+            Scaling values of each time series' context window which is used to give the model inputs of the same
+            magnitude and then used to rescale to the original scale.
+        static_features: (`torch.FloatTensor` of shape `(batch_size, feature size)`, *optional*):
+            Static features of each time series' in a batch which are copied to the covariates at inference time.
+    """
+
+    last_hidden_state: torch.FloatTensor = None
+    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+    decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
+    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    scale: Optional[torch.FloatTensor] = None
+    static_features: Optional[torch.FloatTensor] = None
+
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer
+@dataclass
+class Seq2SeqTimeSeriesPredictionOutput(ModelOutput):
+    """
+    Base class for model's predictions outputs that also contain the loss as well parameters of the chosen
+    distribution.
+
+    Args:
+        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when a `future_values` is provided):
+            Distributional loss.
+        params (`torch.FloatTensor` of shape `(batch_size, num_samples, num_params)`):
+            Parameters of the chosen distribution.
+        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
+            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+
+            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
+        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
+            self-attention heads.
+        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
+            weighted average in the cross-attention heads.
+        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the encoder of the model.
+        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
+        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
+            self-attention heads.
+        scale: (`torch.FloatTensor` of shape `(batch_size,)`, *optional*):
+            Scaling values of each time series' context window which is used to give the model inputs of the same
+            magnitude and then used to rescale to the original scale.
+        static_features: (`torch.FloatTensor` of shape `(batch_size, feature size)`, *optional*):
+            Static features of each time series' in a batch which are copied to the covariates at inference time.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    params: Optional[Tuple[torch.FloatTensor]] = None
+    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+    decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
+    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    scale: Optional[torch.FloatTensor] = None
+    static_features: Optional[torch.FloatTensor] = None
+
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer
+@dataclass
+class SampleTimeSeriesPredictionOutput(ModelOutput):
+    sequences: torch.FloatTensor = None
+
 # Eli: FeatureEmbedder, MeanScaler and NOPScaler are from GlounTS (see the exact source below)
 # source: https://github.com/awslabs/gluonts/blob/dev/src/gluonts/torch/modules/feature.py
 class FeatureEmbedder(nn.Module):
@@ -638,74 +917,72 @@ def get_lagged_subsequences(
 
     def create_network_inputs(
         self,
-        feat_static_cat: torch.Tensor,
-        feat_static_real: torch.Tensor,
+        past_values: torch.Tensor,
         past_time_features: torch.Tensor,
-        past_target: torch.Tensor,
-        past_observed_values: torch.Tensor,
+        static_categorical_features: torch.Tensor,
+        static_real_features: torch.Tensor,
+        past_observed_mask: Optional[torch.Tensor] = None,
+        future_values: Optional[torch.Tensor] = None,
         future_time_features: Optional[torch.Tensor] = None,
-        future_target: Optional[torch.Tensor] = None,
     ):
         # time feature
         time_feat = (
             torch.cat(
                 (
-                    past_time_features[:, self._past_length - self.config.context_length:, ...],
+                    past_time_features[:, self._past_length - self.config.context_length :, ...],
                     future_time_features,
                 ),
                 dim=1,
             )
-            if future_target is not None
-            else past_time_features[:, self._past_length - self.context_length:, ...]
+            if future_values is not None
+            else past_time_features[:, self._past_length - self.config.context_length :, ...]
         )
 
         # target
-        context = past_target[:, -self.context_length :]
-        observed_context = past_observed_values[:, -self.context_length :]
+        if past_observed_mask is None:
+            past_observed_mask = torch.ones_like(past_values)
+
+        context = past_values[:, -self.config.context_length :]
+        observed_context = past_observed_mask[:, -self.config.context_length :]
         _, scale = self.scaler(context, observed_context)
 
         inputs = (
-            torch.cat((past_target, future_target), dim=1) / scale
-            if future_target is not None
-            else past_target / scale
+            torch.cat((past_values, future_values), dim=1) / scale
+            if future_values is not None
+            else past_values / scale
         )
 
         inputs_length = (
-            self._past_length + self.prediction_length
-            if future_target is not None
-            else self._past_length
+            self._past_length + self.config.prediction_length if future_values is not None else self._past_length
         )
-        assert inputs.shape[1] == inputs_length
+        try:
+            assert inputs.shape[1] == inputs_length, (
+                f"input length {inputs.shape[1]} and dynamic feature lengths {inputs_length} does not match",
+            )
+        except AssertionError as e:
+            e.args += (inputs.shape[1], inputs_length)
+            raise
 
         subsequences_length = (
-            self.context_length + self.prediction_length
-            if future_target is not None
-            else self.context_length
+            self.config.context_length + self.config.prediction_length
+            if future_values is not None
+            else self.config.context_length
         )
 
         # embeddings
-        embedded_cat = self.embedder(feat_static_cat)
-        log_scale = scale.log() if self.input_size == 1 else scale.squeeze(1).log()
-        static_feat = torch.cat(
-            (embedded_cat, feat_static_real, log_scale),
-            dim=1,
-        )
-        expanded_static_feat = static_feat.unsqueeze(1).expand(
-            -1, time_feat.shape[1], -1
-        )
+        embedded_cat = self.embedder(static_categorical_features)
+        # static features
+        log_scale = scale.log() if self.config.input_size == 1 else scale.squeeze(1).log()
+        static_feat = torch.cat((embedded_cat, static_real_features, log_scale), dim=1)
+        expanded_static_feat = static_feat.unsqueeze(1).expand(-1, time_feat.shape[1], -1)
 
+        # all features
         features = torch.cat((expanded_static_feat, time_feat), dim=-1)
 
-        # sequence = torch.cat((prior_input, inputs), dim=1)
-        lagged_sequence = self.get_lagged_subsequences(
-            sequence=inputs,
-            subsequences_length=subsequences_length,
-        )
+        lagged_sequence = self.get_lagged_subsequences(sequence=inputs, subsequences_length=subsequences_length)
 
         lags_shape = lagged_sequence.shape
-        reshaped_lagged_sequence = lagged_sequence.reshape(
-            lags_shape[0], lags_shape[1], -1
-        )
+        reshaped_lagged_sequence = lagged_sequence.reshape(lags_shape[0], lags_shape[1], -1)
 
         transformer_inputs = torch.cat((reshaped_lagged_sequence, features), dim=-1)
 
@@ -726,103 +1003,344 @@ def get_encoder(self):
     def get_decoder(self):
         return self.decoder
 
+    def forward(
+            self,
+            past_values: torch.Tensor,
+            past_time_features: torch.Tensor,
+            past_observed_mask: torch.Tensor,
+            static_categorical_features: torch.Tensor,
+            static_real_features: torch.Tensor,
+            future_values: Optional[torch.Tensor] = None,
+            future_time_features: Optional[torch.Tensor] = None,
+            decoder_attention_mask: Optional[torch.LongTensor] = None,
+            head_mask: Optional[torch.Tensor] = None,
+            decoder_head_mask: Optional[torch.Tensor] = None,
+            cross_attn_head_mask: Optional[torch.Tensor] = None,
+            encoder_outputs: Optional[List[torch.FloatTensor]] = None,
+            past_key_values: Optional[List[torch.FloatTensor]] = None,
+            output_hidden_states: Optional[bool] = None,
+            output_attentions: Optional[bool] = None,
+            use_cache: Optional[bool] = None,
+            return_dict: Optional[bool] = None,
+    ) -> Union[Seq2SeqTimeSeriesModelOutput, Tuple]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        transformer_inputs, scale, static_feat = self.create_network_inputs(
+            past_values=past_values,
+            past_time_features=past_time_features,
+            past_observed_mask=past_observed_mask,
+            static_categorical_features=static_categorical_features,
+            static_real_features=static_real_features,
+            future_values=future_values,
+            future_time_features=future_time_features,
+        )
+
+        if encoder_outputs is None:
+            enc_input = transformer_inputs[:, : self.config.context_length, ...]
+            encoder_outputs = self.encoder(
+                inputs_embeds=enc_input,
+                head_mask=head_mask,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
+        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+            encoder_outputs = BaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+            )
+
+        dec_input = transformer_inputs[:, self.config.context_length:, ...]
+        decoder_outputs = self.decoder(
+            inputs_embeds=dec_input,
+            attention_mask=decoder_attention_mask,
+            encoder_hidden_states=encoder_outputs[0],
+            head_mask=decoder_head_mask,
+            cross_attn_head_mask=cross_attn_head_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        if not return_dict:
+            return decoder_outputs + encoder_outputs + (scale, static_feat)
+
+        return Seq2SeqTimeSeriesModelOutput(
+            last_hidden_state=decoder_outputs.last_hidden_state,
+            past_key_values=decoder_outputs.past_key_values,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+            scale=scale,
+            static_features=static_feat,
+        )
+
+
+class InformerForPrediction(InformerPreTrainedModel):
+    def __init__(self, config: InformerConfig):
+        super().__init__(config)
+        self.model = InformerModel(config)
+        if config.distribution_output == "student_t":
+            self.distribution_output = StudentTOutput(dim=config.input_size)
+        elif config.distribution_output == "normal":
+            self.distribution_output = NormalOutput(dim=config.input_size)
+        elif config.distribution_output == "negative_binomial":
+            self.distribution_output = NegativeBinomialOutput(dim=config.input_size)
+        else:
+            raise ValueError(f"Unknown distribution output {config.distribution_output}")
+
+        self.parameter_projection = self.distribution_output.get_parameter_projection(self.model.config.d_model)
+        self.target_shape = self.distribution_output.event_shape
+
+        if config.loss == "nll":
+            self.loss = NegativeLogLikelihood()
+        else:
+            raise ValueError(f"Unknown loss function {config.loss}")
+
+        # Initialize weights of distribution_output and apply final processing
+        self.post_init()
+
+    def output_params(self, dec_output):
+        return self.parameter_projection(dec_output)
+
+    def get_encoder(self):
+        return self.model.get_encoder()
+
+    def get_decoder(self):
+        return self.model.get_decoder()
+
+    @torch.jit.ignore
+    def output_distribution(self, params, scale=None, trailing_n=None) -> torch.distributions.Distribution:
+        sliced_params = params
+        if trailing_n is not None:
+            sliced_params = [p[:, -trailing_n:] for p in params]
+        return self.distribution_output.distribution(sliced_params, scale=scale)
+
     def forward(
         self,
         past_values: torch.Tensor,
+        past_time_features: torch.Tensor,
+        past_observed_mask: torch.Tensor,
         static_categorical_features: torch.Tensor,
         static_real_features: torch.Tensor,
-        past_time_feat: torch.Tensor,
-        past_target: torch.Tensor,
-        future_time_feat: torch.Tensor,
-        num_parallel_samples: Optional[int] = None,
-    ) -> torch.Tensor:
+        future_values: Optional[torch.Tensor] = None,
+        future_time_features: Optional[torch.Tensor] = None,
+        future_observed_mask: Optional[torch.Tensor] = None,
+        decoder_attention_mask: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        decoder_head_mask: Optional[torch.Tensor] = None,
+        cross_attn_head_mask: Optional[torch.Tensor] = None,
+        encoder_outputs: Optional[List[torch.FloatTensor]] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        use_cache: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Seq2SeqTimeSeriesModelOutput, Tuple]:
+        r"""
+        Returns:
+
+        future_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Boolean mask to indicate which `future_values` were observed and which were missing. Mask values selected
+            in `[0, 1]`:
+
+            - 1 for values that are **observed**,
+            - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
+
+            This mask is used to filter out missing values for the final loss calculation.
+
+        Examples:
+
+        ```python
+        >>> from huggingface_hub import hf_hub_download
+        >>> import torch
+        >>> from transformers import TimeSeriesTransformerForPrediction
+
+        >>> file = hf_hub_download(
+        ...     repo_id="kashif/tourism-monthly-batch", filename="train-batch.pt", repo_type="dataset"
+        ... )
+        >>> batch = torch.load(file)
+
+        >>> model = TimeSeriesTransformerForPrediction.from_pretrained(
+        ...     "huggingface/time-series-transformer-tourism-monthly"
+        ... )
+
+        >>> # during training, one provides both past and future values
+        >>> # as well as possible additional features
+        >>> outputs = model(
+        ...     past_values=batch["past_values"],
+        ...     past_time_features=batch["past_time_features"],
+        ...     past_observed_mask=batch["past_observed_mask"],
+        ...     static_categorical_features=batch["static_categorical_features"],
+        ...     static_real_features=batch["static_real_features"],
+        ...     future_values=batch["future_values"],
+        ...     future_time_features=batch["future_time_features"],
+        ... )
+
+        >>> loss = outputs.loss
+        >>> loss.backward()
+
+        >>> # during inference, one only provides past values
+        >>> # as well as possible additional features
+        >>> # the model autoregressively generates future values
+        >>> outputs = model.generate(
+        ...     past_values=batch["past_values"],
+        ...     past_time_features=batch["past_time_features"],
+        ...     past_observed_mask=batch["past_observed_mask"],
+        ...     static_categorical_features=batch["static_categorical_features"],
+        ...     static_real_features=batch["static_real_features"],
+        ...     future_time_features=batch["future_time_features"],
+        ... )
+
+        >>> mean_prediction = outputs.sequences.mean(dim=1)
+        ```"""
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        if future_values is not None:
+            use_cache = False
+
+        outputs = self.model(
+            past_values=past_values,
+            past_time_features=past_time_features,
+            past_observed_mask=past_observed_mask,
+            static_categorical_features=static_categorical_features,
+            static_real_features=static_real_features,
+            future_values=future_values,
+            future_time_features=future_time_features,
+            decoder_attention_mask=decoder_attention_mask,
+            head_mask=head_mask,
+            decoder_head_mask=decoder_head_mask,
+            cross_attn_head_mask=cross_attn_head_mask,
+            encoder_outputs=encoder_outputs,
+            past_key_values=past_key_values,
+            output_hidden_states=output_hidden_states,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            return_dict=return_dict,
+        )
 
-        if num_parallel_samples is None:
-            num_parallel_samples = self.num_parallel_samples
+        prediction_loss = None
+        params = None
+        if future_values is not None:
+            params = self.output_params(outputs[0])  # outputs.last_hidden_state
+            distribution = self.output_distribution(params, outputs[-2])  # outputs.scale
+
+            loss = self.loss(distribution, future_values)
+
+            if future_observed_mask is None:
+                future_observed_mask = torch.ones_like(future_values)
+
+            if len(self.target_shape) == 0:
+                loss_weights = future_observed_mask
+            else:
+                loss_weights, _ = future_observed_mask.min(dim=-1, keepdim=False)
+
+            prediction_loss = weighted_average(loss, weights=loss_weights)
+
+        if not return_dict:
+            outputs = ((params,) + outputs[1:]) if params is not None else outputs[1:]
+            return ((prediction_loss,) + outputs) if prediction_loss is not None else outputs
+
+        return Seq2SeqTimeSeriesPredictionOutput(
+            loss=prediction_loss,
+            params=params,
+            past_key_values=outputs.past_key_values,
+            decoder_hidden_states=outputs.decoder_hidden_states,
+            decoder_attentions=outputs.decoder_attentions,
+            cross_attentions=outputs.cross_attentions,
+            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
+            encoder_hidden_states=outputs.encoder_hidden_states,
+            encoder_attentions=outputs.encoder_attentions,
+            scale=outputs.scale,
+            static_features=outputs.static_features,
+        )
 
-        encoder_inputs, scale, static_feat = self.create_network_inputs(
-            static_categorical_features,
-            static_real_features,
-            past_time_feat,
-            past_target,
-            past_values,
+    @torch.no_grad()
+    def generate(
+        self,
+        static_categorical_features: torch.Tensor,
+        static_real_features: torch.Tensor,
+        past_time_features: torch.Tensor,
+        past_values: torch.Tensor,
+        past_observed_mask: torch.Tensor,
+        future_time_features: Optional[torch.Tensor],
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+    ) -> torch.Tensor:
+        outputs = self(
+            static_categorical_features=static_categorical_features,
+            static_real_features=static_real_features,
+            past_time_features=past_time_features,
+            past_values=past_values,
+            past_observed_mask=past_observed_mask,
+            future_time_features=future_time_features,
+            future_values=None,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+            use_cache=True,
         )
 
-        enc_out, _ = self.encoder(encoder_inputs)
+        decoder = self.model.get_decoder()
+        enc_last_hidden = outputs.encoder_last_hidden_state
+        scale = outputs.scale
+        static_feat = outputs.static_features
 
-        repeated_scale = scale.repeat_interleave(
-            repeats=self.num_parallel_samples, dim=0
-        )
+        num_parallel_samples = self.config.num_parallel_samples
+        repeated_scale = scale.repeat_interleave(repeats=num_parallel_samples, dim=0)
 
-        repeated_past_target = (
-            past_target.repeat_interleave(repeats=self.num_parallel_samples, dim=0)
-            / repeated_scale
-        )
+        repeated_past_values = past_values.repeat_interleave(repeats=num_parallel_samples, dim=0) / repeated_scale
 
-        expanded_static_feat = static_feat.unsqueeze(1).expand(
-            -1, future_time_feat.shape[1], -1
-        )
-        features = torch.cat((expanded_static_feat, future_time_feat), dim=-1)
-        repeated_features = features.repeat_interleave(
-            repeats=self.num_parallel_samples, dim=0
-        )
+        expanded_static_feat = static_feat.unsqueeze(1).expand(-1, future_time_features.shape[1], -1)
+        features = torch.cat((expanded_static_feat, future_time_features), dim=-1)
+        repeated_features = features.repeat_interleave(repeats=num_parallel_samples, dim=0)
 
-        repeated_enc_out = enc_out.repeat_interleave(
-            repeats=self.num_parallel_samples, dim=0
-        )
+        repeated_enc_last_hidden = enc_last_hidden.repeat_interleave(repeats=num_parallel_samples, dim=0)
 
         future_samples = []
 
         # greedy decoding
-        for k in range(self.prediction_length):
-            # sequence = torch.cat((repeated_past_target, next_sample), dim=1)
-
-            lagged_sequence = self.get_lagged_subsequences(
-                sequence=repeated_past_target,
+        for k in range(self.config.prediction_length):
+            lagged_sequence = self.model.get_lagged_subsequences(
+                sequence=repeated_past_values,
                 subsequences_length=1 + k,
                 shift=1,
             )
 
             lags_shape = lagged_sequence.shape
-            reshaped_lagged_sequence = lagged_sequence.reshape(
-                lags_shape[0], lags_shape[1], -1
-            )
+            reshaped_lagged_sequence = lagged_sequence.reshape(lags_shape[0], lags_shape[1], -1)
 
-            decoder_input = torch.cat(
-                (reshaped_lagged_sequence, repeated_features[:, : k + 1]), dim=-1
-            )
+            decoder_input = torch.cat((reshaped_lagged_sequence, repeated_features[:, : k + 1]), dim=-1)
 
-            output = self.decoder(decoder_input, repeated_enc_out)
+            dec_output = decoder(inputs_embeds=decoder_input, encoder_hidden_states=repeated_enc_last_hidden)
+            dec_last_hidden = dec_output.last_hidden_state
 
-            params = self.param_proj(output[:, -1:])
+            params = self.parameter_projection(dec_last_hidden[:, -1:])
             distr = self.output_distribution(params, scale=repeated_scale)
             next_sample = distr.sample()
 
-            repeated_past_target = torch.cat(
-                (repeated_past_target, next_sample / repeated_scale), dim=1
-            )
+            repeated_past_values = torch.cat((repeated_past_values, next_sample / repeated_scale), dim=1)
             future_samples.append(next_sample)
 
         concat_future_samples = torch.cat(future_samples, dim=1)
-        return concat_future_samples.reshape(
-            (-1, self.num_parallel_samples, self.prediction_length) + self.target_shape,
-        )
-    # for prediction
-
-
-class InformerForPrediction(InformerPreTrainedModel):
-    def __init__(self):
-        pass
-
-    @torch.jit.ignore
-    def output_distribution(self, params, scale=None, trailing_n=None) -> torch.distributions.Distribution:
-        sliced_params = params
-        if trailing_n is not None:
-            sliced_params = [p[:, -trailing_n:] for p in params]
-        return self.distr_output.distribution(sliced_params, scale=scale)
 
+        return SampleTimeSeriesPredictionOutput(
+            sequences=concat_future_samples.reshape(
+                (-1, num_parallel_samples, self.config.prediction_length) + self.target_shape,
+            )
+        )
 
-class InformerForPointPrediction(InformerPreTrainedModel):
-    pass
 
 

From a31fc5a61c33f29a4ae92df7cca9f616959540a1 Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Sat, 28 Jan 2023 06:32:53 +0000
Subject: [PATCH 035/164] cleaning and finalizing

---
 .../models/informer/config_using_gluonTS.py   | 722 ------------------
 .../models/informer/configuration_informer.py | 213 ++++--
 2 files changed, 159 insertions(+), 776 deletions(-)
 delete mode 100644 src/transformers/models/informer/config_using_gluonTS.py

diff --git a/src/transformers/models/informer/config_using_gluonTS.py b/src/transformers/models/informer/config_using_gluonTS.py
deleted file mode 100644
index b047f6458885..000000000000
--- a/src/transformers/models/informer/config_using_gluonTS.py
+++ /dev/null
@@ -1,722 +0,0 @@
-from math import sqrt
-from typing import List, Optional
-
-import numpy as np
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from gluonts.core.component import validated
-from gluonts.time_feature import get_lags_for_frequency
-from gluonts.torch.distributions import DistributionOutput, StudentTOutput
-from gluonts.torch.modules.feature import FeatureEmbedder
-from gluonts.torch.modules.scaler import MeanScaler, NOPScaler
-
-
-class TriangularCausalMask:
-    def __init__(self, B, L, device="cpu"):
-        mask_shape = [B, 1, L, L]
-        with torch.no_grad():
-            self._mask = torch.triu(
-                torch.ones(mask_shape, dtype=torch.bool), diagonal=1
-            ).to(device)
-
-    @property
-    def mask(self):
-        return self._mask
-
-
-class ProbMask:
-    def __init__(self, B, H, L, index, scores, device="cpu"):
-        _mask = torch.ones(L, scores.shape[-1], dtype=torch.bool).to(device).triu(1)
-        _mask_ex = _mask[None, None, :].expand(B, H, L, scores.shape[-1])
-        indicator = _mask_ex[
-            torch.arange(B)[:, None, None], torch.arange(H)[None, :, None], index, :
-        ].to(device)
-        self._mask = indicator.view(scores.shape).to(device)
-
-    @property
-    def mask(self):
-        return self._mask
-
-
-class FullAttention(nn.Module):
-    def __init__(
-        self,
-        mask_flag=True,
-        factor=5,
-        scale=None,
-        attention_dropout=0.1,
-        output_attention=False,
-    ):
-        super(FullAttention, self).__init__()
-        self.scale = scale
-        self.mask_flag = mask_flag
-        self.output_attention = output_attention
-        self.dropout = nn.Dropout(attention_dropout)
-
-    def forward(self, queries, keys, values, attn_mask):
-        B, L, H, E = queries.shape
-        _, S, _, D = values.shape
-        scale = self.scale or 1.0 / sqrt(E)
-
-        scores = torch.einsum("blhe,bshe->bhls", queries, keys)
-        if self.mask_flag:
-            if attn_mask is None:
-                attn_mask = TriangularCausalMask(B, L, device=queries.device)
-
-            scores.masked_fill_(attn_mask.mask, -np.inf)
-
-        A = self.dropout(torch.softmax(scale * scores, dim=-1))
-        V = torch.einsum("bhls,bshd->blhd", A, values)
-
-        if self.output_attention:
-            return (V.contiguous(), A)
-        else:
-            return (V.contiguous(), None)
-
-
-class ProbAttention(nn.Module):
-    def __init__(
-        self,
-        mask_flag=True,
-        factor=5,
-        scale=None,
-        attention_dropout=0.1,
-        output_attention=False,
-    ):
-        super(ProbAttention, self).__init__()
-        self.factor = factor
-        self.scale = scale
-        self.mask_flag = mask_flag
-        self.output_attention = output_attention
-        self.dropout = nn.Dropout(attention_dropout)
-
-    def _prob_QK(self, Q, K, sample_k, n_top):  # n_top: c*ln(L_q)
-        # Q [B, H, L, D]
-        B, H, L_K, E = K.shape
-        _, _, L_Q, _ = Q.shape
-
-        # calculate the sampled Q_K
-        K_expand = K.unsqueeze(-3).expand(B, H, L_Q, L_K, E)
-        index_sample = torch.randint(
-            L_K, (L_Q, sample_k)
-        )  # real U = U_part(factor*ln(L_k))*L_q
-        K_sample = K_expand[:, :, torch.arange(L_Q).unsqueeze(1), index_sample, :]
-        Q_K_sample = torch.matmul(Q.unsqueeze(-2), K_sample.transpose(-2, -1)).squeeze(
-            -2
-        )
-
-        # find the Top_k query with sparisty measurement
-        M = Q_K_sample.max(-1)[0] - torch.div(Q_K_sample.sum(-1), L_K)
-        M_top = M.topk(n_top, sorted=False)[1]
-
-        # use the reduced Q to calculate Q_K
-        Q_reduce = Q[
-            torch.arange(B)[:, None, None], torch.arange(H)[None, :, None], M_top, :
-        ]  # factor*ln(L_q)
-        Q_K = torch.matmul(Q_reduce, K.transpose(-2, -1))  # factor*ln(L_q)*L_k
-
-        return Q_K, M_top
-
-    def _get_initial_context(self, V, L_Q):
-        B, H, L_V, D = V.shape
-        if not self.mask_flag:
-            # V_sum = V.sum(dim=-2)
-            V_sum = V.mean(dim=-2)
-            contex = V_sum.unsqueeze(-2).expand(B, H, L_Q, V_sum.shape[-1]).clone()
-        else:  # use mask
-            assert L_Q == L_V  # requires that L_Q == L_V, i.e. for self-attention only
-            contex = V.cumsum(dim=-2)
-        return contex
-
-    def _update_context(self, context_in, V, scores, index, L_Q, attn_mask):
-        B, H, L_V, D = V.shape
-
-        if self.mask_flag:
-            attn_mask = ProbMask(B, H, L_Q, index, scores, device=V.device)
-            scores.masked_fill_(attn_mask.mask, -np.inf)
-
-        attn = torch.softmax(scores, dim=-1)  # nn.Softmax(dim=-1)(scores)
-
-        context_in[
-            torch.arange(B)[:, None, None], torch.arange(H)[None, :, None], index, :
-        ] = torch.matmul(attn, V).type_as(context_in)
-        if self.output_attention:
-            attns = (torch.ones([B, H, L_V, L_V]) / L_V).type_as(attn).to(attn.device)
-            attns[
-                torch.arange(B)[:, None, None], torch.arange(H)[None, :, None], index, :
-            ] = attn
-            return (context_in, attns)
-        else:
-            return (context_in, None)
-
-    def forward(self, queries, keys, values, attn_mask):
-        B, L_Q, H, D = queries.shape
-        _, L_K, _, _ = keys.shape
-
-        queries = queries.transpose(2, 1)
-        keys = keys.transpose(2, 1)
-        values = values.transpose(2, 1)
-
-        U_part = self.factor * np.ceil(np.log1p(L_K)).astype("int").item()  # c*ln(L_k)
-        u = self.factor * np.ceil(np.log1p(L_Q)).astype("int").item()  # c*ln(L_q)
-
-        U_part = U_part if U_part < L_K else L_K
-        u = u if u < L_Q else L_Q
-
-        scores_top, index = self._prob_QK(queries, keys, sample_k=U_part, n_top=u)
-
-        # add scale factor
-        scale = self.scale or 1.0 / sqrt(D)
-        if scale is not None:
-            scores_top = scores_top * scale
-        # get the context
-        context = self._get_initial_context(values, L_Q)
-        # update the context with selected top_k queries
-        context, attn = self._update_context(
-            context, values, scores_top, index, L_Q, attn_mask
-        )
-
-        return context.transpose(2, 1).contiguous(), attn
-
-
-class AttentionLayer(nn.Module):
-    def __init__(
-        self, attention, d_model, n_heads, d_keys=None, d_values=None, mix=False
-    ):
-        super(AttentionLayer, self).__init__()
-
-        d_keys = d_keys or (d_model // n_heads)
-        d_values = d_values or (d_model // n_heads)
-
-        self.inner_attention = attention
-        self.query_projection = nn.Linear(d_model, d_keys * n_heads)
-        self.key_projection = nn.Linear(d_model, d_keys * n_heads)
-        self.value_projection = nn.Linear(d_model, d_values * n_heads)
-        self.out_projection = nn.Linear(d_values * n_heads, d_model)
-        self.n_heads = n_heads
-        self.mix = mix
-
-    def forward(self, queries, keys, values, attn_mask):
-        B, L, _ = queries.shape
-        _, S, _ = keys.shape
-        H = self.n_heads
-
-        queries = self.query_projection(queries).view(B, L, H, -1)
-        keys = self.key_projection(keys).view(B, S, H, -1)
-        values = self.value_projection(values).view(B, S, H, -1)
-
-        out, attn = self.inner_attention(queries, keys, values, attn_mask)
-        if self.mix:
-            out = out.transpose(2, 1).contiguous()
-        out = out.view(B, L, -1)
-
-        return self.out_projection(out), attn
-
-
-class ConvLayer(nn.Module):
-    def __init__(self, c_in):
-        super(ConvLayer, self).__init__()
-        self.downConv = nn.Conv1d(
-            in_channels=c_in,
-            out_channels=c_in,
-            kernel_size=3,
-            padding=1,
-            padding_mode="circular",
-        )
-        self.norm = nn.BatchNorm1d(c_in) # Question: why batchnorm here?
-        self.activation = nn.ELU()
-        self.maxPool = nn.MaxPool1d(kernel_size=3, stride=2, padding=1)
-
-    def forward(self, x):
-        x = self.downConv(x.permute(0, 2, 1))
-        x = self.norm(x)
-        x = self.activation(x)
-        x = self.maxPool(x)
-        x = x.transpose(1, 2)
-        return x
-
-
-class EncoderLayer(nn.Module):
-    def __init__(self, attention, d_model, d_ff=None, dropout=0.1, activation="relu"):
-        super(EncoderLayer, self).__init__()
-        d_ff = d_ff or 4 * d_model
-        self.attention = attention
-        self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1)
-        self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1)
-        self.norm1 = nn.LayerNorm(d_model)
-        self.norm2 = nn.LayerNorm(d_model)
-        self.dropout = nn.Dropout(dropout)
-        self.activation = F.relu if activation == "relu" else F.gelu
-
-    def forward(self, x, attn_mask=None):
-        # x [B, L, D]
-        # x = x + self.dropout(self.attention(
-        #     x, x, x,
-        #     attn_mask = attn_mask
-        # ))
-        new_x, attn = self.attention(x, x, x, attn_mask=attn_mask)
-        x = x + self.dropout(new_x)
-
-        y = x = self.norm1(x)
-        y = self.dropout(self.activation(self.conv1(y.transpose(-1, 1))))
-        y = self.dropout(self.conv2(y).transpose(-1, 1))
-
-        return self.norm2(x + y), attn
-
-
-class Encoder(nn.Module):
-    def __init__(self, attn_layers, conv_layers=None, norm_layer=None):
-        super(Encoder, self).__init__()
-        self.attn_layers = nn.ModuleList(attn_layers)
-        self.conv_layers = (
-            nn.ModuleList(conv_layers) if conv_layers is not None else None
-        )
-        self.norm = norm_layer
-
-    def forward(self, x, attn_mask=None):
-        # x [B, L, D]
-        attns = []
-        if self.conv_layers is not None:
-            for attn_layer, conv_layer in zip(self.attn_layers, self.conv_layers):
-                x, attn = attn_layer(x, attn_mask=attn_mask)
-                x = conv_layer(x)
-                attns.append(attn)
-            x, attn = self.attn_layers[-1](x, attn_mask=attn_mask)
-            attns.append(attn)
-        else:
-            for attn_layer in self.attn_layers:
-                x, attn = attn_layer(x, attn_mask=attn_mask)
-                attns.append(attn)
-
-        if self.norm is not None:
-            x = self.norm(x)
-
-        return x, attns
-
-
-class DecoderLayer(nn.Module):
-    def __init__(
-        self,
-        self_attention,
-        cross_attention,
-        d_model,
-        d_ff=None,
-        dropout=0.1,
-        activation="relu",
-    ):
-        super(DecoderLayer, self).__init__()
-        d_ff = d_ff or 4 * d_model
-        self.self_attention = self_attention
-        self.cross_attention = cross_attention
-        self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1)
-        self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1)
-        self.norm1 = nn.LayerNorm(d_model)
-        self.norm2 = nn.LayerNorm(d_model)
-        self.norm3 = nn.LayerNorm(d_model)
-        self.dropout = nn.Dropout(dropout)
-        self.activation = F.relu if activation == "relu" else F.gelu
-
-    def forward(self, x, cross, x_mask=None, cross_mask=None):
-        x = x + self.dropout(self.self_attention(x, x, x, attn_mask=x_mask)[0])
-        x = self.norm1(x)
-
-        x = x + self.dropout(
-            self.cross_attention(x, cross, cross, attn_mask=cross_mask)[0]
-        )
-
-        y = x = self.norm2(x)
-        y = self.dropout(self.activation(self.conv1(y.transpose(-1, 1))))
-        y = self.dropout(self.conv2(y).transpose(-1, 1))
-
-        return self.norm3(x + y)
-
-
-class Decoder(nn.Module):
-    def __init__(self, layers, norm_layer=None):
-        super(Decoder, self).__init__()
-        self.layers = nn.ModuleList(layers)
-        self.norm = norm_layer
-
-    def forward(self, x, cross, x_mask=None, cross_mask=None):
-        for layer in self.layers:
-            x = layer(x, cross, x_mask=x_mask, cross_mask=cross_mask)
-
-        if self.norm is not None:
-            x = self.norm(x)
-
-        return x
-
-
-class InformerModel(nn.Module):
-    @validated()
-    def __init__(  # add loss param
-        self,
-        freq: str, # frequency
-        context_length: int,
-        prediction_length: int,
-        num_feat_dynamic_real: int,  # num_dynamic_real_features
-        num_feat_static_real: int,  # num_static_real_features
-        num_feat_static_cat: int,  # num_static_categorical_features
-        cardinality: List[int],
-        # Informer arguments
-        nhead: int,
-        num_encoder_layers: int, # encoder_layers
-        num_decoder_layers: int, # decoder_layers
-        dim_feedforward: int,
-        activation: str = "gelu", # activation_function
-        dropout: float = 0.1,
-        attn: str = "prob",
-        factor: int = 5,
-        distil: bool = True,
-        # univariate input
-        input_size: int = 1,
-        embedding_dimension: Optional[List[int]] = None,
-        distr_output: DistributionOutput = StudentTOutput(),
-        lags_seq: Optional[List[int]] = None,
-        scaling: bool = True,
-        num_parallel_samples: int = 100,
-    ) -> None:
-        super().__init__()
-
-        self.input_size = input_size
-
-        self.target_shape = distr_output.event_shape
-        self.num_feat_dynamic_real = num_feat_dynamic_real
-        self.num_feat_static_cat = num_feat_static_cat
-        self.num_feat_static_real = num_feat_static_real
-        self.embedding_dimension = (
-            embedding_dimension
-            if embedding_dimension is not None or cardinality is None
-            else [min(50, (cat + 1) // 2) for cat in cardinality]
-        )
-        self.lags_seq = lags_seq or get_lags_for_frequency(freq_str=freq)
-        self.num_parallel_samples = num_parallel_samples
-        self.history_length = context_length + max(self.lags_seq)
-        self.embedder = FeatureEmbedder(
-            cardinalities=cardinality,
-            embedding_dims=self.embedding_dimension,
-        )
-        if scaling:
-            self.scaler = MeanScaler(dim=1, keepdim=True)
-        else:
-            self.scaler = NOPScaler(dim=1, keepdim=True)
-
-        # total feature size
-        d_model = self.input_size * len(self.lags_seq) + self._number_of_features
-
-        self.context_length = context_length
-        self.prediction_length = prediction_length
-        self.distr_output = distr_output
-        self.param_proj = distr_output.get_args_proj(d_model)
-
-        # Informer enc-decoder
-        Attn = ProbAttention if attn == "prob" else FullAttention
-        # Encoder
-        self.encoder = Encoder(
-            [
-                EncoderLayer(
-                    AttentionLayer(
-                        Attn(
-                            mask_flag=False,
-                            factor=factor,
-                            attention_dropout=dropout,
-                            output_attention=False,
-                        ),
-                        d_model,
-                        nhead,
-                        mix=False,
-                    ),
-                    d_model,
-                    d_ff=dim_feedforward,
-                    dropout=dropout,
-                    activation=activation,
-                )
-                for l in range(num_encoder_layers)
-            ],
-            [ConvLayer(d_model) for l in range(num_encoder_layers - 1)]
-            if distil
-            else None,
-            norm_layer=torch.nn.LayerNorm(d_model),
-        )
-
-        # Masked Decoder
-        self.decoder = Decoder(
-            [
-                DecoderLayer(
-                    AttentionLayer(
-                        Attn(
-                            mask_flag=True,
-                            factor=factor,
-                            attention_dropout=dropout,
-                            output_attention=False,
-                        ),
-                        d_model,
-                        nhead,
-                        mix=True,
-                    ),
-                    AttentionLayer(
-                        FullAttention(
-                            mask_flag=False,
-                            factor=factor,
-                            attention_dropout=dropout,
-                            output_attention=False,
-                        ),
-                        d_model,
-                        nhead,
-                        mix=False,
-                    ),
-                    d_model,
-                    d_ff=dim_feedforward,
-                    dropout=dropout,
-                    activation=activation,
-                )
-                for l in range(num_decoder_layers)
-            ],
-            norm_layer=torch.nn.LayerNorm(d_model),
-        )
-
-    @property
-    def _number_of_features(self) -> int:
-        return (
-            sum(self.embedding_dimension)
-            + self.num_feat_dynamic_real
-            + self.num_feat_static_real
-            + self.input_size  # the log(scale)
-        )
-
-    @property
-    def _past_length(self) -> int:
-        return self.context_length + max(self.lags_seq)
-
-    def get_lagged_subsequences(
-        self, sequence: torch.Tensor, subsequences_length: int, shift: int = 0
-    ) -> torch.Tensor:
-        """
-        Returns lagged subsequences of a given sequence.
-        Parameters
-        ----------
-        sequence : Tensor
-            the sequence from which lagged subsequences should be extracted.
-            Shape: (N, T, C).
-        subsequences_length : int
-            length of the subsequences to be extracted.
-        shift: int
-            shift the lags by this amount back.
-        Returns
-        --------
-        lagged : Tensor
-            a tensor of shape (N, S, C, I), where S = subsequences_length and
-            I = len(indices), containing lagged subsequences. Specifically,
-            lagged[i, j, :, k] = sequence[i, -indices[k]-S+j, :].
-        """
-        sequence_length = sequence.shape[1]
-        indices = [lag - shift for lag in self.lags_seq]
-
-        assert max(indices) + subsequences_length <= sequence_length, (
-            f"lags cannot go further than history length, found lag {max(indices)} "
-            f"while history length is only {sequence_length}"
-        )
-
-        lagged_values = []
-        for lag_index in indices:
-            begin_index = -lag_index - subsequences_length
-            end_index = -lag_index if lag_index > 0 else None
-            lagged_values.append(sequence[:, begin_index:end_index, ...])
-        return torch.stack(lagged_values, dim=-1)
-
-    def _check_shapes(
-        self,
-        prior_input: torch.Tensor,
-        inputs: torch.Tensor,
-        features: Optional[torch.Tensor],
-    ) -> None:
-        assert len(prior_input.shape) == len(inputs.shape)
-        assert (
-            len(prior_input.shape) == 2 and self.input_size == 1
-        ) or prior_input.shape[2] == self.input_size
-        assert (len(inputs.shape) == 2 and self.input_size == 1) or inputs.shape[
-            -1
-        ] == self.input_size
-        assert (
-            features is None or features.shape[2] == self._number_of_features
-        ), f"{features.shape[2]}, expected {self._number_of_features}"
-
-    def create_network_inputs(
-        self,
-        feat_static_cat: torch.Tensor,
-        feat_static_real: torch.Tensor,
-        past_time_feat: torch.Tensor,
-        past_target: torch.Tensor,
-        past_observed_values: torch.Tensor,
-        future_time_feat: Optional[torch.Tensor] = None,
-        future_target: Optional[torch.Tensor] = None,
-    ):
-        # time feature
-        time_feat = (
-            torch.cat(
-                (
-                    past_time_feat[:, self._past_length - self.context_length :, ...],
-                    future_time_feat,
-                ),
-                dim=1,
-            )
-            if future_target is not None
-            else past_time_feat[:, self._past_length - self.context_length :, ...]
-        )
-
-        # target
-        context = past_target[:, -self.context_length :]
-        observed_context = past_observed_values[:, -self.context_length :]
-        _, scale = self.scaler(context, observed_context)
-
-        inputs = (
-            torch.cat((past_target, future_target), dim=1) / scale
-            if future_target is not None
-            else past_target / scale
-        )
-
-        inputs_length = (
-            self._past_length + self.prediction_length
-            if future_target is not None
-            else self._past_length
-        )
-        assert inputs.shape[1] == inputs_length
-
-        subsequences_length = (
-            self.context_length + self.prediction_length
-            if future_target is not None
-            else self.context_length
-        )
-
-        # embeddings
-        embedded_cat = self.embedder(feat_static_cat)
-        log_scale = scale.log() if self.input_size == 1 else scale.squeeze(1).log()
-        static_feat = torch.cat(
-            (embedded_cat, feat_static_real, log_scale),
-            dim=1,
-        )
-        expanded_static_feat = static_feat.unsqueeze(1).expand(
-            -1, time_feat.shape[1], -1
-        )
-
-        features = torch.cat((expanded_static_feat, time_feat), dim=-1)
-
-        # self._check_shapes(prior_input, inputs, features)
-
-        # sequence = torch.cat((prior_input, inputs), dim=1)
-        lagged_sequence = self.get_lagged_subsequences(
-            sequence=inputs,
-            subsequences_length=subsequences_length,
-        )
-
-        lags_shape = lagged_sequence.shape
-        reshaped_lagged_sequence = lagged_sequence.reshape(
-            lags_shape[0], lags_shape[1], -1
-        )
-
-        transformer_inputs = torch.cat((reshaped_lagged_sequence, features), dim=-1)
-
-        return transformer_inputs, scale, static_feat
-
-    def output_params(self, transformer_inputs):
-        enc_input = transformer_inputs[:, : self.context_length, ...]
-        dec_input = transformer_inputs[:, self.context_length :, ...]
-
-        enc_out, _ = self.encoder(enc_input)
-        dec_output = self.decoder(dec_input, enc_out)
-
-        return self.param_proj(dec_output)
-
-    @torch.jit.ignore
-    def output_distribution(
-        self, params, scale=None, trailing_n=None
-    ) -> torch.distributions.Distribution:
-        sliced_params = params
-        if trailing_n is not None:
-            sliced_params = [p[:, -trailing_n:] for p in params]
-        return self.distr_output.distribution(sliced_params, scale=scale)
-
-    # for prediction
-    def forward(
-        self,
-        feat_static_cat: torch.Tensor,
-        feat_static_real: torch.Tensor,
-        past_time_feat: torch.Tensor,
-        past_target: torch.Tensor,
-        past_observed_values: torch.Tensor,
-        future_time_feat: torch.Tensor,
-        num_parallel_samples: Optional[int] = None,
-    ) -> torch.Tensor:
-
-        if num_parallel_samples is None:
-            num_parallel_samples = self.num_parallel_samples
-
-        encoder_inputs, scale, static_feat = self.create_network_inputs(
-            feat_static_cat,
-            feat_static_real,
-            past_time_feat,
-            past_target,
-            past_observed_values,
-        )
-
-        enc_out, _ = self.encoder(encoder_inputs)
-
-        repeated_scale = scale.repeat_interleave(
-            repeats=self.num_parallel_samples, dim=0
-        )
-
-        repeated_past_target = (
-            past_target.repeat_interleave(repeats=self.num_parallel_samples, dim=0)
-            / repeated_scale
-        )
-
-        expanded_static_feat = static_feat.unsqueeze(1).expand(
-            -1, future_time_feat.shape[1], -1
-        )
-        features = torch.cat((expanded_static_feat, future_time_feat), dim=-1)
-        repeated_features = features.repeat_interleave(
-            repeats=self.num_parallel_samples, dim=0
-        )
-
-        repeated_enc_out = enc_out.repeat_interleave(
-            repeats=self.num_parallel_samples, dim=0
-        )
-
-        future_samples = []
-
-        # greedy decoding
-        for k in range(self.prediction_length):
-            # self._check_shapes(repeated_past_target, next_sample, next_features)
-            # sequence = torch.cat((repeated_past_target, next_sample), dim=1)
-
-            lagged_sequence = self.get_lagged_subsequences(
-                sequence=repeated_past_target,
-                subsequences_length=1 + k,
-                shift=1,
-            )
-
-            lags_shape = lagged_sequence.shape
-            reshaped_lagged_sequence = lagged_sequence.reshape(
-                lags_shape[0], lags_shape[1], -1
-            )
-
-            decoder_input = torch.cat(
-                (reshaped_lagged_sequence, repeated_features[:, : k + 1]), dim=-1
-            )
-
-            output = self.decoder(decoder_input, repeated_enc_out)
-
-            params = self.param_proj(output[:, -1:])
-            distr = self.output_distribution(params, scale=repeated_scale)
-            next_sample = distr.sample()
-
-            repeated_past_target = torch.cat(
-                (repeated_past_target, next_sample / repeated_scale), dim=1
-            )
-            future_samples.append(next_sample)
-
-        concat_future_samples = torch.cat(future_samples, dim=1)
-        return concat_future_samples.reshape(
-            (-1, self.num_parallel_samples, self.prediction_length) + self.target_shape,
-        )
diff --git a/src/transformers/models/informer/configuration_informer.py b/src/transformers/models/informer/configuration_informer.py
index 6e8e7c90d588..d77ab4ce66d1 100644
--- a/src/transformers/models/informer/configuration_informer.py
+++ b/src/transformers/models/informer/configuration_informer.py
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" Informer model configuration"""
+"""Informer model configuration"""
 
 from typing import List, Optional
 
@@ -22,41 +22,147 @@
 
 logger = logging.get_logger(__name__)
 
-TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    "huggingface/time-series-transformer-tourism-monthly": (
-        "https://huggingface.co/huggingface/time-series-transformer-tourism-monthly/resolve/main/config.json"
-    ),
-    # See all TimeSeriesTransformer models at https://huggingface.co/models?filter=time_series_transformer
+INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "elisim/informer": "https://huggingface.co/elisim/informer/resolve/main/config.json",
 }
 
 
+
 class InformerConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of an [`InformerModel`]. It is used to
+    instantiate an Informer model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the Time Series
+    Transformer
+    [huggingface/time-series-transformer-tourism-monthly](https://huggingface.co/huggingface/time-series-transformer-tourism-monthly)
+    architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        prediction_length (`int`):
+            The prediction length for the decoder. In other words, the prediction horizon of the model.
+        context_length (`int`, *optional*, defaults to `prediction_length`):
+            The context length for the encoder. If `None`, the context length will be the same as the
+            `prediction_length`.
+        distribution_output (`string`, *optional*, defaults to `"student_t"`):
+            The distribution emission head for the model. Could be either "student_t", "normal" or "negative_binomial".
+        loss (`string`, *optional*, defaults to `"nll"`):
+            The loss function for the model corresponding to the `distribution_output` head. For parametric
+            distributions it is the negative log likelihood (nll) - which currently is the only supported one.
+        input_size (`int`, *optional*, defaults to 1):
+            The size of the target variable which by default is 1 for univariate targets. Would be > 1 in case of
+            multivariate targets.
+        scaling (`bool`, *optional*, defaults to `True`):
+            Whether to scale the input targets.
+        lags_sequence (`list[int]`, *optional*, defaults to `[1, 2, 3, 4, 5, 6, 7]`):
+            The lags of the input time series as covariates often dictated by the frequency. Default is `[1, 2, 3, 4,
+            5, 6, 7]`.
+        num_time_features (`int`, *optional*, defaults to 0):
+            The number of time features in the input time series.
+        num_dynamic_real_features (`int`, *optional*, defaults to 0):
+            The number of dynamic real valued features.
+        num_static_categorical_features (`int`, *optional*, defaults to 0):
+            The number of static categorical features.
+        num_static_real_features (`int`, *optional*, defaults to 0):
+            The number of static real valued features.
+        cardinality (`list[int]`, *optional*):
+            The cardinality (number of different values) for each of the static categorical features. Should be a list
+            of integers, having the same length as `num_static_categorical_features`. Cannot be `None` if
+            `num_static_categorical_features` is > 0.
+        embedding_dimension (`list[int]`, *optional*):
+            The dimension of the embedding for each of the static categorical features. Should be a list of integers,
+            having the same length as `num_static_categorical_features`. Cannot be `None` if
+            `num_static_categorical_features` is > 0.
+        encoder_layers (`int`, *optional*, defaults to 2):
+            Number of encoder layers.
+        decoder_layers (`int`, *optional*, defaults to 2):
+            Number of decoder layers.
+        encoder_attention_heads (`int`, *optional*, defaults to 2):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        decoder_attention_heads (`int`, *optional*, defaults to 2):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        encoder_ffn_dim (`int`, *optional*, defaults to 32):
+            Dimension of the "intermediate" (often named feed-forward) layer in encoder.
+        decoder_ffn_dim (`int`, *optional*, defaults to 32):
+            Dimension of the "intermediate" (often named feed-forward) layer in decoder.
+        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and decoder. If string, `"gelu"` and
+            `"relu"` are supported.
+        dropout (`float`, *optional*, defaults to 0.05):
+            The dropout probability for all fully connected layers in the encoder, and decoder.
+        encoder_layerdrop (`float`, *optional*, defaults to 0.1):
+            The dropout probability for the attention and fully connected layers for each encoder layer.
+        decoder_layerdrop (`float`, *optional*, defaults to 0.1):
+            The dropout probability for the attention and fully connected layers for each decoder layer.
+        attention_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for the attention probabilities.
+        activation_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability used between the two layers of the feed-forward networks.
+        num_parallel_samples (`int`, *optional*, defaults to 100):
+            The number of samples to generate in parallel for each time step of inference.
+        init_std (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated normal weight initialization distribution.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether to use the past key/values attentions (if applicable to the model) to speed up decoding.
+
+        Example:
+
+    ```python
+    >>> from transformers import InformerConfig, InformerModel
+
+    >>> # Initializing a default Informer configuration
+    >>> configuration = InformerConfig()
+
+    >>> # Randomly initializing a model (with random weights) from the configuration
+    >>> model = InformerModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "informer"
+    attribute_map = {
+        "hidden_size": "d_model",
+        "num_attention_heads": "encoder_attention_heads",
+        "num_hidden_layers": "encoder_layers",
+    }
+
     def __init__(
             self,
             input_size: int = 1,
             prediction_length: Optional[int] = None,
             context_length: Optional[int] = None,
             distribution_output: str = "student_t",
-            lags_seq: Optional[List[int]] = None,  # used to be freq.
+            loss: str = "nll",
+            lags_sequence: List[int] = None,
             scaling: bool = True,
-            num_feat_dynamic_real: int = 0,  # num_dynamic_real_features
-            num_feat_static_real: int = 0,  # num_static_real_features
-            num_feat_static_cat: int = 0,  # num_static_categorical_features
+            num_dynamic_real_features: int = 0,
+            num_static_real_features: int = 0,
+            num_static_categorical_features: int = 0,
+            num_time_features: int = 0,
             cardinality: Optional[List[int]] = None,
             embedding_dimension: Optional[List[int]] = None,
-            dim_feedforward: int = 32,  # decoder_ffn_dim & encoder_ffn_dim
-            nhead: int = 2,  # Eli: how much attention heads?
-            num_encoder_layers: int = 2,  # encoder_layers
-            num_decoder_layers: int = 2,  # decoder_layers
+            encoder_ffn_dim: int = 32,
+            decoder_ffn_dim: int = 32,
+            encoder_attention_heads: int = 2,
+            decoder_attention_heads: int = 2,
+            encoder_layers: int = 2,
+            decoder_layers: int = 2,
             is_encoder_decoder: bool = True,
-            activation: str = "gelu",  # activation_function
+            activation_function: str = "gelu",
             dropout: float = 0.05,
-            attn: str = "prob",
-            factor: int = 5,
-            distil: bool = True,
+            encoder_layerdrop: float = 0.1,
+            decoder_layerdrop: float = 0.1,
+            attention_dropout: float = 0.1,
+            activation_dropout: float = 0.1,
             num_parallel_samples: int = 100,
             init_std: float = 0.02,
             use_cache=True,
+            # Informer arguments
+            attn: str = "prob",
+            factor: int = 5,
+            distil: bool = True,
             **kwargs
     ):
         # time series specific configuration
@@ -65,17 +171,14 @@ def __init__(
         self.distribution_output = distribution_output
         # self.loss = loss # Eli: From vanilla ts transformer
         self.input_size = input_size
-        # self.target_shape = distr_output.event_shape  # Eli: I think can be removed
-        # self.num_time_features = num_time_features # Eli: From vanilla ts transformer
-        self.lags_seq = lags_seq
+        self.num_time_features = num_time_features
+        self.lags_sequence = lags_sequence
         self.scaling = scaling
-        self.num_feat_dynamic_real = num_feat_dynamic_real
-        self.num_feat_static_cat = num_feat_static_cat
-        self.num_feat_static_real = num_feat_static_real
-
-        # set cardinality
-        if cardinality and num_feat_static_cat > 0:
-            if len(cardinality) != num_feat_static_cat:
+        self.num_dynamic_real_features = num_dynamic_real_features
+        self.num_static_real_features = num_static_real_features
+        self.num_static_categorical_features = num_static_categorical_features
+        if cardinality and num_static_categorical_features > 0:
+            if len(cardinality) != num_static_categorical_features:
                 raise ValueError(
                     "The cardinality should be a list of the same length as `num_static_categorical_features`"
                 )
@@ -84,8 +187,8 @@ def __init__(
             self.cardinality = [1]
 
         # set embedding_dimension
-        if embedding_dimension and num_feat_static_cat > 0:
-            if len(embedding_dimension) != num_feat_static_cat:
+        if embedding_dimension and num_static_categorical_features > 0:
+            if len(embedding_dimension) != num_static_categorical_features:
                 raise ValueError(
                     "The embedding dimension should be a list of the same length as `num_static_categorical_features`"
                 )
@@ -94,23 +197,34 @@ def __init__(
             self.embedding_dimension = [min(50, (cat + 1) // 2) for cat in self.cardinality]
 
         self.num_parallel_samples = num_parallel_samples
-        # self.history_length = context_length + max(self.lags_seq) # Eli: I think can be removed
 
         # Transformer architecture configuration
-        self.d_model = self.input_size * len(self.lags_seq) + self._number_of_features
-        self.nhead = nhead
-        self.num_encoder_layers = num_encoder_layers  # encoder_layers
-        self.num_decoder_layers = num_decoder_layers  # decoder_layers
-        self.dim_feedforward = dim_feedforward
-        self.activation = activation  # activation_function
+        self.d_model = input_size * len(lags_sequence) + self._number_of_features
+        self.encoder_attention_heads = encoder_attention_heads
+        self.decoder_attention_heads = decoder_attention_heads
+        self.encoder_ffn_dim = encoder_ffn_dim
+        self.decoder_ffn_dim = decoder_ffn_dim
+        self.encoder_layers = encoder_layers
+        self.decoder_layers = decoder_layers
+
         self.dropout = dropout
-        self.attn = attn
-        self.factor = factor
-        self.distil = distil
+        self.attention_dropout = attention_dropout
+        self.activation_dropout = activation_dropout
+        self.encoder_layerdrop = encoder_layerdrop
+        self.decoder_layerdrop = decoder_layerdrop
+
+        self.activation_function = activation_function
         self.init_std = init_std
+
+        self.output_attentions = False
+        self.output_hidden_states = False
+
         self.use_cache = use_cache
 
-        # self.param_proj = distr_output.get_args_proj(d_model)
+        # Informer
+        self.attn = attn
+        self.factor = factor
+        self.distil = distil
 
         super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
 
@@ -118,17 +232,8 @@ def __init__(
     def _number_of_features(self) -> int:
         return (
             sum(self.embedding_dimension)
-            + self.num_feat_dynamic_real
-            + self.num_feat_static_real
+            + self.num_dynamic_real_features
+            + self.num_time_features
+            + max(1, self.num_static_real_features)  # there is at least one dummy static real feature
             + self.input_size  # the log(scale)
         )
-
-    # @property
-    # def _number_of_features(self) -> int:
-    #     return (
-    #         sum(self.embedding_dimension)
-    #         + self.num_dynamic_real_features
-    #         + self.num_time_features
-    #         + max(1, self.num_static_real_features)  # there is at least one dummy static real feature
-    #         + self.input_size  # the log(scale)
-    #     )

From 27045b850c997050b516edd1f8b1104f1b6cf90b Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Sat, 28 Jan 2023 07:07:51 +0000
Subject: [PATCH 036/164] adding HF names and docs

---
 .../models/informer/modeling_informer.py      | 379 ++++++++++++------
 1 file changed, 251 insertions(+), 128 deletions(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 0c0ce3d53454..71700b91e965 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -1,6 +1,6 @@
 # coding=utf-8
-# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
-# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -40,9 +40,7 @@
 from math import sqrt
 from typing import List, Optional
 
-import math
 import numpy as np
-import torch
 import torch.nn.functional as F
 
 logger = logging.get_logger(__name__)
@@ -50,13 +48,64 @@
 _CONFIG_FOR_DOC = "InformerConfig"
 
 
-class NegativeLogLikelihood:
-    """
-    Computes the negative log likelihood loss from input distribution with respect to target.
-    """
+INFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
+    "elisim/informer",
+    # See all Informer models at https://huggingface.co/models?filter=informer
+]
 
-    def __call__(self, input: torch.distributions.Distribution, target: torch.Tensor) -> torch.Tensor:
-        return -input.log_prob(target)
+
+
+class AffineTransformed(TransformedDistribution):
+    def __init__(self, base_distribution: Distribution, loc=None, scale=None, event_dim=0):
+        self.scale = 1.0 if scale is None else scale
+        self.loc = 0.0 if loc is None else loc
+
+        super().__init__(base_distribution, [AffineTransform(loc=self.loc, scale=self.scale, event_dim=event_dim)])
+
+    @property
+    def mean(self):
+        """
+        Returns the mean of the distribution.
+        """
+        return self.base_dist.mean * self.scale + self.loc
+
+    @property
+    def variance(self):
+        """
+        Returns the variance of the distribution.
+        """
+        return self.base_dist.variance * self.scale**2
+
+    @property
+    def stddev(self):
+        """
+        Returns the standard deviation of the distribution.
+        """
+        return self.variance.sqrt()
+
+
+class ParameterProjection(nn.Module):
+    def __init__(
+        self, in_features: int, args_dim: Dict[str, int], domain_map: Callable[..., Tuple[torch.Tensor]], **kwargs
+    ) -> None:
+        super().__init__(**kwargs)
+        self.args_dim = args_dim
+        self.proj = nn.ModuleList([nn.Linear(in_features, dim) for dim in args_dim.values()])
+        self.domain_map = domain_map
+
+    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor]:
+        params_unbounded = [proj(x) for proj in self.proj]
+
+        return self.domain_map(*params_unbounded)
+
+
+class LambdaLayer(nn.Module):
+    def __init__(self, function):
+        super().__init__()
+        self.function = function
+
+    def forward(self, x, *args):
+        return self.function(x, *args)
 
 
 class DistributionOutput:
@@ -188,6 +237,173 @@ def distribution(
         return self._base_distribution((total_count, logits))
 
 
+# Eli: FeatureEmbedder, MeanScaler and NOPScaler are from GluonTS (see the exact source below)
+# source: https://github.com/awslabs/gluonts/blob/dev/src/gluonts/torch/modules/feature.py
+class FeatureEmbedder(nn.Module):
+    def __init__(self, cardinalities: List[int], embedding_dims: List[int]) -> None:
+        super().__init__()
+
+        self.num_features = len(cardinalities)
+        self.embedders = nn.ModuleList([nn.Embedding(c, d) for c, d in zip(cardinalities, embedding_dims)])
+
+    def forward(self, features: torch.Tensor) -> torch.Tensor:
+        if self.num_features > 1:
+            # we slice the last dimension, giving an array of length
+            # self.num_features with shape (N,T) or (N)
+            cat_feature_slices = torch.chunk(features, self.num_features, dim=-1)
+        else:
+            cat_feature_slices = [features]
+
+        return torch.cat(
+            [
+                embed(cat_feature_slice.squeeze(-1))
+                for embed, cat_feature_slice in zip(self.embedders, cat_feature_slices)
+            ],
+            dim=-1,
+        )
+
+
+# source: https://github.com/awslabs/gluonts/blob/dev/src/gluonts/torch/modules/scaler.py
+class MeanScaler(nn.Module):
+    """
+    Computes a scaling factor as the weighted average absolute value along dimension `dim`, and scales the data
+    accordingly.
+
+    Args:
+        dim (`int`):
+            Dimension along which to compute the scale.
+        keepdim (`bool`, *optional*, defaults to `False`):
+            Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it.
+        minimum_scale (`float`, *optional*, defaults to 1e-10):
+            Default scale that is used for elements that are constantly zero along dimension `dim`.
+    """
+
+    def __init__(self, dim: int, keepdim: bool = False, minimum_scale: float = 1e-10):
+        super().__init__()
+        if not dim > 0:
+            raise ValueError("Cannot compute scale along dim = 0 (batch dimension), please provide dim > 0")
+        self.dim = dim
+        self.keepdim = keepdim
+        self.register_buffer("minimum_scale", torch.tensor(minimum_scale))
+
+    def forward(self, data: torch.Tensor, weights: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        # these will have shape (N, C)
+        total_weight = weights.sum(dim=self.dim)
+        weighted_sum = (data.abs() * weights).sum(dim=self.dim)
+
+        # first compute a global scale per-dimension
+        total_observed = total_weight.sum(dim=0)
+        denominator = torch.max(total_observed, torch.ones_like(total_observed))
+        default_scale = weighted_sum.sum(dim=0) / denominator
+
+        # then compute a per-item, per-dimension scale
+        denominator = torch.max(total_weight, torch.ones_like(total_weight))
+        scale = weighted_sum / denominator
+
+        # use per-batch scale when no element is observed
+        # or when the sequence contains only zeros
+        scale = (
+            torch.max(
+                self.minimum_scale,
+                torch.where(
+                    weighted_sum > torch.zeros_like(weighted_sum),
+                    scale,
+                    default_scale * torch.ones_like(total_weight),
+                ),
+            )
+            .detach()
+            .unsqueeze(dim=self.dim)
+        )
+
+        return data / scale, scale if self.keepdim else scale.squeeze(dim=self.dim)
+
+
+# source: https://github.com/awslabs/gluonts/blob/dev/src/gluonts/torch/modules/scaler.py
+class NOPScaler(nn.Module):
+    """
+    Assigns a scaling factor equal to 1 along dimension `dim`, and therefore applies no scaling to the input data.
+
+    Args:
+        dim (`int`):
+            Dimension along which to compute the scale.
+        keepdim (`bool`, *optional*, defaults to `False`):
+            Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it.
+    """
+
+    def __init__(self, dim: int, keepdim: bool = False):
+        super().__init__()
+        self.dim = dim
+        self.keepdim = keepdim
+
+    def forward(self, data: torch.Tensor, observed_indicator: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        scale = torch.ones_like(data).mean(dim=self.dim, keepdim=self.keepdim)
+        return data, scale
+
+
+def weighted_average(input_tensor: torch.Tensor, weights: Optional[torch.Tensor] = None, dim=None) -> torch.Tensor:
+    """
+    Computes the weighted average of a given tensor across a given `dim`, masking values associated with weight zero,
+    meaning instead of `nan * 0 = nan` you will get `0 * 0 = 0`.
+
+    Args:
+        input_tensor (`torch.FloatTensor`):
+            Input tensor, of which the average must be computed.
+        weights (`torch.FloatTensor`, *optional*):
+            Weights tensor, of the same shape as `input_tensor`.
+        dim (`int`, *optional*):
+            The dim along which to average `input_tensor`.
+
+    Returns:
+        `torch.FloatTensor`: The tensor with values averaged along the specified `dim`.
+    """
+    if weights is not None:
+        weighted_tensor = torch.where(weights != 0, input_tensor * weights, torch.zeros_like(input_tensor))
+        sum_weights = torch.clamp(weights.sum(dim=dim) if dim else weights.sum(), min=1.0)
+        return (weighted_tensor.sum(dim=dim) if dim else weighted_tensor.sum()) / sum_weights
+    else:
+        return input_tensor.mean(dim=dim)
+
+
+class NegativeLogLikelihood:
+    """
+    Computes the negative log likelihood loss from input distribution with respect to target.
+    """
+
+    def __call__(self, input: torch.distributions.Distribution, target: torch.Tensor) -> torch.Tensor:
+        return -input.log_prob(target)
+
+
+# Copied from transformers.models.bart.modeling_bart._make_causal_mask
+def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_values_length: int = 0):
+    """
+    Make causal mask used for bi-directional self-attention.
+    """
+    bsz, tgt_len = input_ids_shape
+    mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min))
+    mask_cond = torch.arange(mask.size(-1))
+    mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
+    mask = mask.to(dtype)
+
+    if past_key_values_length > 0:
+        mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype), mask], dim=-1)
+    return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
+
+
+# Copied from transformers.models.bart.modeling_bart._expand_mask
+def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
+    """
+    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+    """
+    bsz, src_len = mask.size()
+    tgt_len = tgt_len if tgt_len is not None else src_len
+
+    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
+
+    inverted_mask = 1.0 - expanded_mask
+
+    return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
+
+
 # Copied from transformers.models.time_series_transformer.modeling_time_series_transformer
 @dataclass
 class Seq2SeqTimeSeriesModelOutput(ModelOutput):
@@ -329,107 +545,6 @@ class Seq2SeqTimeSeriesPredictionOutput(ModelOutput):
 class SampleTimeSeriesPredictionOutput(ModelOutput):
     sequences: torch.FloatTensor = None
 
-# Eli: FeatureEmbedder, MeanScaler and NOPScaler are from GlounTS (see the exact source below)
-# source: https://github.com/awslabs/gluonts/blob/dev/src/gluonts/torch/modules/feature.py
-class FeatureEmbedder(nn.Module):
-    def __init__(self, cardinalities: List[int], embedding_dims: List[int]) -> None:
-        super().__init__()
-
-        self.num_features = len(cardinalities)
-        self.embedders = nn.ModuleList([nn.Embedding(c, d) for c, d in zip(cardinalities, embedding_dims)])
-
-    def forward(self, features: torch.Tensor) -> torch.Tensor:
-        if self.num_features > 1:
-            # we slice the last dimension, giving an array of length
-            # self.num_features with shape (N,T) or (N)
-            cat_feature_slices = torch.chunk(features, self.num_features, dim=-1)
-        else:
-            cat_feature_slices = [features]
-
-        return torch.cat(
-            [
-                embed(cat_feature_slice.squeeze(-1))
-                for embed, cat_feature_slice in zip(self.embedders, cat_feature_slices)
-            ],
-            dim=-1,
-        )
-
-
-# source: https://github.com/awslabs/gluonts/blob/dev/src/gluonts/torch/modules/scaler.py
-class MeanScaler(nn.Module):
-    """
-    Computes a scaling factor as the weighted average absolute value along dimension `dim`, and scales the data
-    accordingly.
-
-    Args:
-        dim (`int`):
-            Dimension along which to compute the scale.
-        keepdim (`bool`, *optional*, defaults to `False`):
-            Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it.
-        minimum_scale (`float`, *optional*, defaults to 1e-10):
-            Default scale that is used for elements that are constantly zero along dimension `dim`.
-    """
-
-    def __init__(self, dim: int, keepdim: bool = False, minimum_scale: float = 1e-10):
-        super().__init__()
-        if not dim > 0:
-            raise ValueError("Cannot compute scale along dim = 0 (batch dimension), please provide dim > 0")
-        self.dim = dim
-        self.keepdim = keepdim
-        self.register_buffer("minimum_scale", torch.tensor(minimum_scale))
-
-    def forward(self, data: torch.Tensor, weights: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
-        # these will have shape (N, C)
-        total_weight = weights.sum(dim=self.dim)
-        weighted_sum = (data.abs() * weights).sum(dim=self.dim)
-
-        # first compute a global scale per-dimension
-        total_observed = total_weight.sum(dim=0)
-        denominator = torch.max(total_observed, torch.ones_like(total_observed))
-        default_scale = weighted_sum.sum(dim=0) / denominator
-
-        # then compute a per-item, per-dimension scale
-        denominator = torch.max(total_weight, torch.ones_like(total_weight))
-        scale = weighted_sum / denominator
-
-        # use per-batch scale when no element is observed
-        # or when the sequence contains only zeros
-        scale = (
-            torch.max(
-                self.minimum_scale,
-                torch.where(
-                    weighted_sum > torch.zeros_like(weighted_sum),
-                    scale,
-                    default_scale * torch.ones_like(total_weight),
-                ),
-            )
-            .detach()
-            .unsqueeze(dim=self.dim)
-        )
-
-        return data / scale, scale if self.keepdim else scale.squeeze(dim=self.dim)
-
-
-# source: https://github.com/awslabs/gluonts/blob/dev/src/gluonts/torch/modules/scaler.py
-class NOPScaler(nn.Module):
-    """
-    Assigns a scaling factor equal to 1 along dimension `dim`, and therefore applies no scaling to the input data.
-
-    Args:
-        dim (`int`):
-            Dimension along which to compute the scale.
-        keepdim (`bool`, *optional*, defaults to `False`):
-            Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it.
-    """
-
-    def __init__(self, dim: int, keepdim: bool = False):
-        super().__init__()
-        self.dim = dim
-        self.keepdim = keepdim
-
-    def forward(self, data: torch.Tensor, observed_indicator: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
-        scale = torch.ones_like(data).mean(dim=self.dim, keepdim=self.keepdim)
-        return data, scale
 
 # Eli: TriangularCausalMask, ProbMask, FullAttention, ProbAttention and AttentionLayer
 # are from the original Informer repository (see the exact source below)
@@ -730,6 +845,7 @@ class InformerEncoder(nn.Module):
     def __init__(self, config: InformerConfig):
         super(InformerEncoder, self).__init__()
 
+        self.activation_fn = ACT2FN[config.activation_function]
         Attn = ProbAttention if config.attn == "prob" else FullAttention
         self.attn_layers = nn.ModuleList([
             EncoderLayer(
@@ -737,18 +853,18 @@ def __init__(self, config: InformerConfig):
                         Attn(
                             mask_flag=False,
                             factor=config.factor,
-                            attention_dropout=config.dropout,
+                            attention_dropout=config.attention_dropout,
                             output_attention=False,
                         ),
                         config.d_model,
-                        config.nhead,
+                        config.encoder_attention_heads,
                         mix=False,
                     ),
                     config.d_model,
-                    d_ff=config.dim_feedforward,
-                    dropout=config.dropout,
-                    activation=config.activation,
-                ) for _ in range(config.num_encoder_layers)
+                    d_ff=config.encoder_ffn_dim,
+                    dropout=config.attention_dropout,
+                    activation=self.activation_fn,
+                ) for _ in range(config.encoder_layers)
         ])
 
         if config.distil is not None:
@@ -783,6 +899,7 @@ class InformerDecoder(nn.Module):
     def __init__(self, config: InformerConfig):
         super(InformerDecoder, self).__init__()
 
+        self.activation_fn = ACT2FN[config.activation_function]
         Attn = ProbAttention if config.attn == "prob" else FullAttention
 
         # Masked Decoder
@@ -793,7 +910,7 @@ def __init__(self, config: InformerConfig):
                         Attn(
                             mask_flag=True,
                             factor=config.factor,
-                            attention_dropout=config.dropout,
+                            attention_dropout=config.attention_dropout,
                             output_attention=False,
                         ),
                         config.d_model,
@@ -804,17 +921,17 @@ def __init__(self, config: InformerConfig):
                         FullAttention(
                             mask_flag=False,
                             factor=config.factor,
-                            attention_dropout=config.dropout,
+                            attention_dropout=config.attention_dropout,
                             output_attention=False,
                         ),
                         config.d_model,
-                        config.nhead,
+                        config.decoder_attention_heads,
                         mix=False,
                     ),
                     config.d_model,
-                    d_ff=config.dim_feedforward,
+                    d_ff=config.decoder_ffn_dim,
                     dropout=config.dropout,
-                    activation=config.activation,
+                    activation=self.activation_fn,
                 )
                 for _ in range(config.num_decoder_layers)
             ],
@@ -877,7 +994,7 @@ def __init__(self, config: InformerConfig):
 
     @property
     def _past_length(self) -> int:
-        return self.config.context_length + max(self.config.lags_seq)
+        return self.config.context_length + max(self.config.lags_sequence)
 
     def get_lagged_subsequences(
         self, sequence: torch.Tensor, subsequences_length: int, shift: int = 0
@@ -901,12 +1018,16 @@ def get_lagged_subsequences(
             lagged[i, j, :, k] = sequence[i, -indices[k]-S+j, :].
         """
         sequence_length = sequence.shape[1]
-        indices = [lag - shift for lag in self.lags_seq]
+        indices = [lag - shift for lag in self.config.lags_sequence]
 
-        assert max(indices) + subsequences_length <= sequence_length, (
-            f"lags cannot go further than history length, found lag {max(indices)} "
-            f"while history length is only {sequence_length}"
-        )
+        try:
+            assert max(indices) + subsequences_length <= sequence_length, (
+                f"lags cannot go further than history length, found lag {max(indices)} "
+                f"while history length is only {sequence_length}"
+            )
+        except AssertionError as e:
+            e.args += (max(indices), sequence_length)
+            raise
 
         lagged_values = []
         for lag_index in indices:
@@ -1003,6 +1124,8 @@ def get_encoder(self):
     def get_decoder(self):
         return self.decoder
 
+    @add_start_docstrings_to_model_forward(INFORMER_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=Seq2SeqTimeSeriesModelOutput, config_class=_CONFIG_FOR_DOC)
     def forward(
             self,
             past_values: torch.Tensor,

From c8a6cae3c0c58bf7ff4629932844c236fa6f3fa7 Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Sat, 28 Jan 2023 07:26:14 +0000
Subject: [PATCH 037/164] init after cleaning works

---
 .../informer/check_instantiate_works.py       | 83 ++++++++++---------
 .../models/informer/modeling_informer.py      |  8 +-
 2 files changed, 45 insertions(+), 46 deletions(-)

diff --git a/src/transformers/models/informer/check_instantiate_works.py b/src/transformers/models/informer/check_instantiate_works.py
index 3d95e9d28f1c..7a7bc764d0f4 100644
--- a/src/transformers/models/informer/check_instantiate_works.py
+++ b/src/transformers/models/informer/check_instantiate_works.py
@@ -12,48 +12,49 @@
     lags = get_lags_for_frequency(freq_str=freq)
     time_features = time_features_from_frequency_str(freq)
 
-    # config = InformerConfig(prediction_length=prediction_length,
-    #                         context_length=prediction_length*3,
-    #                         lags_seq=lags,
-    #                         num_time_features=len(time_features) + 1,
-    #                         num_static_categorical_features=1,
-    #                         cardinality=[366],
-    #                         embedding_dimension=[2],
-    #                         encoder_layers=4,
-    #                         decoder_layers=4)
-    # model = InformerModel(config)
-
-    config = TimeSeriesTransformerConfig(
-        prediction_length=prediction_length,
-        context_length=prediction_length * 3,  # context length
-        lags_sequence=lags,
-        num_time_features=len(time_features) + 1,  # we'll add 2 time features ("month of year" and "age", see further)
-        num_static_categorical_features=1,  # we have a single static categorical feature, namely time series ID
-        cardinality=[366],  # it has 366 possible values
-        embedding_dimension=[2],  # the model will learn an embedding of size 2 for each of the 366 possible values
-        encoder_layers=4,
-        decoder_layers=4,
-    )
-    model = TimeSeriesTransformerModel(config)
+    config = InformerConfig(prediction_length=prediction_length,
+                            context_length=prediction_length*3,
+                            lags_sequence=lags,
+                            num_time_features=len(time_features) + 1,
+                            num_static_categorical_features=1,
+                            cardinality=[366],
+                            embedding_dimension=[2],
+                            encoder_layers=4,
+                            decoder_layers=4)
+    model = InformerModel(config)
+    print(model)
+
+    # config = TimeSeriesTransformerConfig(
+    #     prediction_length=prediction_length,
+    #     context_length=prediction_length * 3,  # context length
+    #     lags_sequence=lags,
+    #     num_time_features=len(time_features) + 1,  # we'll add 2 time features ("month of year" and "age", see further)
+    #     num_static_categorical_features=1,  # we have a single static categorical feature, namely time series ID
+    #     cardinality=[366],  # it has 366 possible values
+    #     embedding_dimension=[2],  # the model will learn an embedding of size 2 for each of the 366 possible values
+    #     encoder_layers=4,
+    #     decoder_layers=4,
+    # )
+    # model = TimeSeriesTransformerModel(config)
     # model.eval()
 
     # model = TimeSeriesTransformerModel.from_pretrained("huggingface/time-series-transformer-tourism-monthly")
 
-    file = hf_hub_download(
-        repo_id="kashif/tourism-monthly-batch", filename="train-batch.pt", repo_type="dataset"
-    )
-    batch = torch.load(file)
-
-    # during training, one provides both past and future values
-    # as well as possible additional features
-    outputs = model(
-        past_values=batch["past_values"],
-        past_time_features=batch["past_time_features"],
-        past_observed_mask=batch["past_observed_mask"],
-        static_categorical_features=batch["static_categorical_features"],
-        static_real_features=batch["static_real_features"],
-        future_values=batch["future_values"],
-        future_time_features=batch["future_time_features"],
-    )
-
-    print(outputs.last_hidden_state.shape)
+    # file = hf_hub_download(
+    #     repo_id="kashif/tourism-monthly-batch", filename="train-batch.pt", repo_type="dataset"
+    # )
+    # batch = torch.load(file)
+    #
+    # # during training, one provides both past and future values
+    # # as well as possible additional features
+    # outputs = model(
+    #     past_values=batch["past_values"],
+    #     past_time_features=batch["past_time_features"],
+    #     past_observed_mask=batch["past_observed_mask"],
+    #     static_categorical_features=batch["static_categorical_features"],
+    #     static_real_features=batch["static_real_features"],
+    #     future_values=batch["future_values"],
+    #     future_time_features=batch["future_time_features"],
+    # )
+    #
+    # print(outputs.last_hidden_state.shape)
diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 71700b91e965..4334fc0c77c3 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -868,7 +868,7 @@ def __init__(self, config: InformerConfig):
         ])
 
         if config.distil is not None:
-            self.conv_layers = nn.ModuleList([ConvLayer(config.d_model) for _ in range(config.num_encoder_layers - 1)])
+            self.conv_layers = nn.ModuleList([ConvLayer(config.d_model) for _ in range(config.encoder_layers - 1)])
         else:
             self.conv_layers = None
 
@@ -914,7 +914,7 @@ def __init__(self, config: InformerConfig):
                             output_attention=False,
                         ),
                         config.d_model,
-                        config.nhead,
+                        config.decoder_attention_heads,
                         mix=True,
                     ),
                     AttentionLayer(
@@ -933,7 +933,7 @@ def __init__(self, config: InformerConfig):
                     dropout=config.dropout,
                     activation=self.activation_fn,
                 )
-                for _ in range(config.num_decoder_layers)
+                for _ in range(config.decoder_layers)
             ],
         )
         self.norm = torch.nn.LayerNorm(config.d_model)
@@ -1124,8 +1124,6 @@ def get_encoder(self):
     def get_decoder(self):
         return self.decoder
 
-    @add_start_docstrings_to_model_forward(INFORMER_INPUTS_DOCSTRING)
-    @replace_return_docstrings(output_type=Seq2SeqTimeSeriesModelOutput, config_class=_CONFIG_FOR_DOC)
     def forward(
             self,
             past_values: torch.Tensor,

From 0e8ffb612b4e55040123d26df86f0490a4105c3e Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Sat, 28 Jan 2023 07:58:17 +0000
Subject: [PATCH 038/164] WIP in tests

---
 .../models/informer/check_instantiate_works.py       | 12 +++++++-----
 .../models/informer/configuration_informer.py        |  2 +-
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/transformers/models/informer/check_instantiate_works.py b/src/transformers/models/informer/check_instantiate_works.py
index 7a7bc764d0f4..48c963395972 100644
--- a/src/transformers/models/informer/check_instantiate_works.py
+++ b/src/transformers/models/informer/check_instantiate_works.py
@@ -5,7 +5,9 @@
 from huggingface_hub import hf_hub_download
 import torch
 
-
+"""
+Establish one batch for forward pass in the Informer
+"""
 if __name__ == '__main__':
     freq = "1M"
     prediction_length = 24
@@ -19,8 +21,8 @@
                             num_static_categorical_features=1,
                             cardinality=[366],
                             embedding_dimension=[2],
-                            encoder_layers=4,
-                            decoder_layers=4)
+                            encoder_layers=1,
+                            decoder_layers=1)
     model = InformerModel(config)
     print(model)
 
@@ -37,9 +39,9 @@
     # )
     # model = TimeSeriesTransformerModel(config)
     # model.eval()
-
+    #
     # model = TimeSeriesTransformerModel.from_pretrained("huggingface/time-series-transformer-tourism-monthly")
-
+    #
     # file = hf_hub_download(
     #     repo_id="kashif/tourism-monthly-batch", filename="train-batch.pt", repo_type="dataset"
     # )
diff --git a/src/transformers/models/informer/configuration_informer.py b/src/transformers/models/informer/configuration_informer.py
index d77ab4ce66d1..a4e9f2b60d9d 100644
--- a/src/transformers/models/informer/configuration_informer.py
+++ b/src/transformers/models/informer/configuration_informer.py
@@ -169,7 +169,7 @@ def __init__(
         self.prediction_length = prediction_length
         self.context_length = context_length or prediction_length
         self.distribution_output = distribution_output
-        # self.loss = loss # Eli: From vanilla ts transformer
+        self.loss = loss # Eli: From vanilla ts transformer
         self.input_size = input_size
         self.num_time_features = num_time_features
         self.lags_sequence = lags_sequence

From 83d39df0b1b6224647369d9ece7a6832f0a14a17 Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Sat, 28 Jan 2023 09:41:21 +0000
Subject: [PATCH 039/164] added docs for the informer specific args

---
 .../models/informer/configuration_informer.py            | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/transformers/models/informer/configuration_informer.py b/src/transformers/models/informer/configuration_informer.py
index a4e9f2b60d9d..5111d415d63a 100644
--- a/src/transformers/models/informer/configuration_informer.py
+++ b/src/transformers/models/informer/configuration_informer.py
@@ -31,7 +31,7 @@
 class InformerConfig(PretrainedConfig):
     r"""
     This is the configuration class to store the configuration of a [`InformerModel`]. It is used to
-    instantiate a Informer model according to the specified arguments, defining the model architecture.
+    instantiate an Informer model according to the specified arguments, defining the model architecture.
     Instantiating a configuration with the defaults will yield a similar configuration to that of the Time Series
     Transformer
     [huggingface/time-series-transformer-tourism-monthly](https://huggingface.co/huggingface/time-series-transformer-tourism-monthly)
@@ -106,6 +106,13 @@ class InformerConfig(PretrainedConfig):
             The standard deviation of the truncated normal weight initialization distribution.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether to use the past key/values attentions (if applicable to the model) to speed up decoding.
+        attn (`str`, defaults to `"prob"`):
+            Attention used in encoder. This can be set to `"prob"` (Informer) or `"full"` (Transformer).
+        factor (`int`, defaults to 5):
+            ProbSparse attention factor.
+        distil (`bool`, defaults to `True`):
+            Whether to use distilling in encoder.
+
 
         Example:
 

From fdffeb819c60d4e085dfed3fe2b6ec2f5287b94d Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Mon, 30 Jan 2023 12:04:13 +0100
Subject: [PATCH 040/164] fix style

---
 src/transformers/__init__.py                  |  41 +++----
 src/transformers/models/__init__.py           |   2 +-
 .../models/auto/configuration_auto.py         |   6 +-
 src/transformers/models/auto/modeling_auto.py |   2 +-
 src/transformers/models/informer/__init__.py  |   5 +-
 .../models/informer/configuration_informer.py |  80 ++++++------
 .../models/informer/modeling_informer.py      | 116 ++++++++----------
 .../models/informer/test_modeling_informer.py |  31 ++---
 8 files changed, 122 insertions(+), 161 deletions(-)

diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 0900c050c18b..695055930371 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -290,6 +290,10 @@
     "models.hubert": ["HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "HubertConfig"],
     "models.ibert": ["IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "IBertConfig"],
     "models.imagegpt": ["IMAGEGPT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ImageGPTConfig"],
+    "models.informer": [
+        "INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "InformerConfig",
+    ],
     "models.jukebox": [
         "JUKEBOX_PRETRAINED_CONFIG_ARCHIVE_MAP",
         "JukeboxConfig",
@@ -414,10 +418,6 @@
         "TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
         "TimeSeriesTransformerConfig",
     ],
-    "models.informer": [
-        "INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
-        "InformerConfig",
-    ],
     "models.timesformer": ["TIMESFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "TimesformerConfig"],
     "models.trajectory_transformer": [
         "TRAJECTORY_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
@@ -1621,6 +1621,14 @@
             "load_tf_weights_in_imagegpt",
         ]
     )
+    _import_structure["models.informer"].extend(
+        [
+            "INFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
+            "InformerForPrediction",
+            "InformerModel",
+            "InformerPreTrainedModel",
+        ]
+    )
     _import_structure["models.jukebox"].extend(
         [
             "JUKEBOX_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -2275,14 +2283,6 @@
             "TimeSeriesTransformerPreTrainedModel",
         ]
     )
-    _import_structure["models.informer"].extend(
-        [
-            "INFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
-            "InformerForPrediction",
-            "InformerModel",
-            "InformerPreTrainedModel",
-        ]
-    )
     _import_structure["models.timesformer"].extend(
         [
             "TIMESFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -3741,6 +3741,7 @@
     from .models.hubert import HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, HubertConfig
     from .models.ibert import IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, IBertConfig
     from .models.imagegpt import IMAGEGPT_PRETRAINED_CONFIG_ARCHIVE_MAP, ImageGPTConfig
+    from .models.informer import INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, InformerConfig
     from .models.jukebox import (
         JUKEBOX_PRETRAINED_CONFIG_ARCHIVE_MAP,
         JukeboxConfig,
@@ -3855,10 +3856,6 @@
         TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
         TimeSeriesTransformerConfig,
     )
-    from .models.informer import (
-        INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
-        InformerConfig,
-    )
     from .models.timesformer import TIMESFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, TimesformerConfig
     from .models.trajectory_transformer import (
         TRAJECTORY_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
@@ -4865,6 +4862,12 @@
             ImageGPTPreTrainedModel,
             load_tf_weights_in_imagegpt,
         )
+        from .models.informer import (
+            INFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
+            InformerForPrediction,
+            InformerModel,
+            InformerPreTrainedModel,
+        )
         from .models.jukebox import (
             JUKEBOX_PRETRAINED_MODEL_ARCHIVE_LIST,
             JukeboxModel,
@@ -5393,12 +5396,6 @@
             TimeSeriesTransformerModel,
             TimeSeriesTransformerPreTrainedModel,
         )
-        from .models.informer import (
-            INFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
-            InformerForPrediction,
-            InformerModel,
-            InformerPreTrainedModel,
-        )
         from .models.timesformer import (
             TIMESFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
             TimesformerForVideoClassification,
diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py
index 2da3bc253081..a83e7a2318f7 100644
--- a/src/transformers/models/__init__.py
+++ b/src/transformers/models/__init__.py
@@ -90,6 +90,7 @@
     hubert,
     ibert,
     imagegpt,
+    informer,
     jukebox,
     layoutlm,
     layoutlmv2,
@@ -165,7 +166,6 @@
     tapas,
     tapex,
     time_series_transformer,
-    informer,
     timesformer,
     trajectory_transformer,
     transfo_xl,
diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
index e1235b9bffbd..61111ba75fbf 100755
--- a/src/transformers/models/auto/configuration_auto.py
+++ b/src/transformers/models/auto/configuration_auto.py
@@ -93,6 +93,7 @@
         ("hubert", "HubertConfig"),
         ("ibert", "IBertConfig"),
         ("imagegpt", "ImageGPTConfig"),
+        ("informer", "InformerConfig"),
         ("jukebox", "JukeboxConfig"),
         ("layoutlm", "LayoutLMConfig"),
         ("layoutlmv2", "LayoutLMv2Config"),
@@ -161,7 +162,6 @@
         ("table-transformer", "TableTransformerConfig"),
         ("tapas", "TapasConfig"),
         ("time_series_transformer", "TimeSeriesTransformerConfig"),
-        ("informer", "InformerConfig"),
         ("timesformer", "TimesformerConfig"),
         ("trajectory_transformer", "TrajectoryTransformerConfig"),
         ("transfo-xl", "TransfoXLConfig"),
@@ -258,6 +258,7 @@
         ("hubert", "HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("ibert", "IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("imagegpt", "IMAGEGPT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
+        ("informer", "INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("jukebox", "JUKEBOX_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("layoutlm", "LAYOUTLM_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("layoutlmv2", "LAYOUTLMV2_PRETRAINED_CONFIG_ARCHIVE_MAP"),
@@ -319,7 +320,6 @@
         ("table-transformer", "TABLE_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("tapas", "TAPAS_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("time_series_transformer", "TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"),
-        ("informer", "INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("timesformer", "TIMESFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("transfo-xl", "TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("unispeech", "UNISPEECH_PRETRAINED_CONFIG_ARCHIVE_MAP"),
@@ -424,6 +424,7 @@
         ("hubert", "Hubert"),
         ("ibert", "I-BERT"),
         ("imagegpt", "ImageGPT"),
+        ("informer", "Informer"),
         ("jukebox", "Jukebox"),
         ("layoutlm", "LayoutLM"),
         ("layoutlmv2", "LayoutLMv2"),
@@ -500,7 +501,6 @@
         ("tapas", "TAPAS"),
         ("tapex", "TAPEX"),
         ("time_series_transformer", "Time Series Transformer"),
-        ("informer", "Informer"),
         ("timesformer", "TimeSformer"),
         ("trajectory_transformer", "Trajectory Transformer"),
         ("transfo-xl", "Transformer-XL"),
diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
index b365d58acfbd..16bd7f3dc5b9 100755
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@@ -92,6 +92,7 @@
         ("hubert", "HubertModel"),
         ("ibert", "IBertModel"),
         ("imagegpt", "ImageGPTModel"),
+        ("informer", "InformerModel"),
         ("jukebox", "JukeboxModel"),
         ("layoutlm", "LayoutLMModel"),
         ("layoutlmv2", "LayoutLMv2Model"),
@@ -157,7 +158,6 @@
         ("table-transformer", "TableTransformerModel"),
         ("tapas", "TapasModel"),
         ("time_series_transformer", "TimeSeriesTransformerModel"),
-        ("informer", "InformerModel"),
         ("timesformer", "TimesformerModel"),
         ("trajectory_transformer", "TrajectoryTransformerModel"),
         ("transfo-xl", "TransfoXLModel"),
diff --git a/src/transformers/models/informer/__init__.py b/src/transformers/models/informer/__init__.py
index 927fad5e5e7f..47e7a9c115bf 100644
--- a/src/transformers/models/informer/__init__.py
+++ b/src/transformers/models/informer/__init__.py
@@ -43,10 +43,7 @@
 
 
 if TYPE_CHECKING:
-    from .configuration_informer import (
-        INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
-        InformerConfig,
-    )
+    from .configuration_informer import INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, InformerConfig
 
     try:
         if not is_torch_available():
diff --git a/src/transformers/models/informer/configuration_informer.py b/src/transformers/models/informer/configuration_informer.py
index 5111d415d63a..a303b8586167 100644
--- a/src/transformers/models/informer/configuration_informer.py
+++ b/src/transformers/models/informer/configuration_informer.py
@@ -27,13 +27,11 @@
 }
 
 
-
 class InformerConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a [`InformerModel`]. It is used to
-    instantiate an Informer model according to the specified arguments, defining the model architecture.
-    Instantiating a configuration with the defaults will yield a similar configuration to that of the Time Series
-    Transformer
+    This is the configuration class to store the configuration of a [`InformerModel`]. It is used to instantiate an
+    Informer model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the Time Series Transformer
     [huggingface/time-series-transformer-tourism-monthly](https://huggingface.co/huggingface/time-series-transformer-tourism-monthly)
     architecture.
 
@@ -136,47 +134,47 @@ class InformerConfig(PretrainedConfig):
     }
 
     def __init__(
-            self,
-            input_size: int = 1,
-            prediction_length: Optional[int] = None,
-            context_length: Optional[int] = None,
-            distribution_output: str = "student_t",
-            loss: str = "nll",
-            lags_sequence: List[int] = None,
-            scaling: bool = True,
-            num_dynamic_real_features: int = 0,
-            num_static_real_features: int = 0,
-            num_static_categorical_features: int = 0,
-            num_time_features: int = 0,
-            cardinality: Optional[List[int]] = None,
-            embedding_dimension: Optional[List[int]] = None,
-            encoder_ffn_dim: int = 32,
-            decoder_ffn_dim: int = 32,
-            encoder_attention_heads: int = 2,
-            decoder_attention_heads: int = 2,
-            encoder_layers: int = 2,
-            decoder_layers: int = 2,
-            is_encoder_decoder: bool = True,
-            activation_function: str = "gelu",
-            dropout: float = 0.05,
-            encoder_layerdrop: float = 0.1,
-            decoder_layerdrop: float = 0.1,
-            attention_dropout: float = 0.1,
-            activation_dropout: float = 0.1,
-            num_parallel_samples: int = 100,
-            init_std: float = 0.02,
-            use_cache=True,
-            # Informer arguments
-            attn: str = "prob",
-            factor: int = 5,
-            distil: bool = True,
-            **kwargs
+        self,
+        input_size: int = 1,
+        prediction_length: Optional[int] = None,
+        context_length: Optional[int] = None,
+        distribution_output: str = "student_t",
+        loss: str = "nll",
+        lags_sequence: List[int] = None,
+        scaling: bool = True,
+        num_dynamic_real_features: int = 0,
+        num_static_real_features: int = 0,
+        num_static_categorical_features: int = 0,
+        num_time_features: int = 0,
+        cardinality: Optional[List[int]] = None,
+        embedding_dimension: Optional[List[int]] = None,
+        encoder_ffn_dim: int = 32,
+        decoder_ffn_dim: int = 32,
+        encoder_attention_heads: int = 2,
+        decoder_attention_heads: int = 2,
+        encoder_layers: int = 2,
+        decoder_layers: int = 2,
+        is_encoder_decoder: bool = True,
+        activation_function: str = "gelu",
+        dropout: float = 0.05,
+        encoder_layerdrop: float = 0.1,
+        decoder_layerdrop: float = 0.1,
+        attention_dropout: float = 0.1,
+        activation_dropout: float = 0.1,
+        num_parallel_samples: int = 100,
+        init_std: float = 0.02,
+        use_cache=True,
+        # Informer arguments
+        attn: str = "prob",
+        factor: int = 5,
+        distil: bool = True,
+        **kwargs
     ):
         # time series specific configuration
         self.prediction_length = prediction_length
         self.context_length = context_length or prediction_length
         self.distribution_output = distribution_output
-        self.loss = loss # Eli: From vanilla ts transformer
+        self.loss = loss  # Eli: From vanilla ts transformer
         self.input_size = input_size
         self.num_time_features = num_time_features
         self.lags_sequence = lags_sequence
diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 4334fc0c77c3..b513e5c67d30 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -17,9 +17,12 @@
 
 import random
 from dataclasses import dataclass
+from math import sqrt
 from typing import Callable, Dict, List, Optional, Tuple, Union
 
+import numpy as np
 import torch
+import torch.nn.functional as F
 from torch import nn
 from torch.distributions import (
     AffineTransform,
@@ -37,11 +40,6 @@
 from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
 from .configuration_informer import InformerConfig
 
-from math import sqrt
-from typing import List, Optional
-
-import numpy as np
-import torch.nn.functional as F
 
 logger = logging.get_logger(__name__)
 
@@ -54,7 +52,6 @@
 ]
 
 
-
 class AffineTransformed(TransformedDistribution):
     def __init__(self, base_distribution: Distribution, loc=None, scale=None, event_dim=0):
         self.scale = 1.0 if scale is None else scale
@@ -472,6 +469,7 @@ class Seq2SeqTimeSeriesModelOutput(ModelOutput):
     scale: Optional[torch.FloatTensor] = None
     static_features: Optional[torch.FloatTensor] = None
 
+
 # Copied from transformers.models.time_series_transformer.modeling_time_series_transformer
 @dataclass
 class Seq2SeqTimeSeriesPredictionOutput(ModelOutput):
@@ -540,6 +538,7 @@ class Seq2SeqTimeSeriesPredictionOutput(ModelOutput):
     scale: Optional[torch.FloatTensor] = None
     static_features: Optional[torch.FloatTensor] = None
 
+
 # Copied from transformers.models.time_series_transformer.modeling_time_series_transformer
 @dataclass
 class SampleTimeSeriesPredictionOutput(ModelOutput):
@@ -554,9 +553,7 @@ class TriangularCausalMask:
     def __init__(self, B, L, device="cpu"):
         mask_shape = [B, 1, L, L]
         with torch.no_grad():
-            self._mask = torch.triu(
-                torch.ones(mask_shape, dtype=torch.bool), diagonal=1
-            ).to(device)
+            self._mask = torch.triu(torch.ones(mask_shape, dtype=torch.bool), diagonal=1).to(device)
 
     @property
     def mask(self):
@@ -568,9 +565,7 @@ class ProbMask:
     def __init__(self, B, H, L, index, scores, device="cpu"):
         _mask = torch.ones(L, scores.shape[-1], dtype=torch.bool).to(device).triu(1)
         _mask_ex = _mask[None, None, :].expand(B, H, L, scores.shape[-1])
-        indicator = _mask_ex[
-            torch.arange(B)[:, None, None], torch.arange(H)[None, :, None], index, :
-        ].to(device)
+        indicator = _mask_ex[torch.arange(B)[:, None, None], torch.arange(H)[None, :, None], index, :].to(device)
         self._mask = indicator.view(scores.shape).to(device)
 
     @property
@@ -597,7 +592,7 @@ def __init__(
     def forward(self, queries, keys, values, attn_mask):
         B, L, H, E = queries.shape
         _, S, _, D = values.shape
-        scale = self.scale or 1. / sqrt(E)
+        scale = self.scale or 1.0 / sqrt(E)
 
         scores = torch.einsum("blhe,bshe->bhls", queries, keys)
         if self.mask_flag:
@@ -673,14 +668,12 @@ def _update_context(self, context_in, V, scores, index, L_Q, attn_mask):
 
         attn = torch.softmax(scores, dim=-1)  # nn.Softmax(dim=-1)(scores)
 
-        context_in[
-            torch.arange(B)[:, None, None], torch.arange(H)[None, :, None], index, :
-        ] = torch.matmul(attn, V).type_as(context_in)
+        context_in[torch.arange(B)[:, None, None], torch.arange(H)[None, :, None], index, :] = torch.matmul(
+            attn, V
+        ).type_as(context_in)
         if self.output_attention:
             attns = (torch.ones([B, H, L_V, L_V]) / L_V).type_as(attn).to(attn.device)
-            attns[
-                torch.arange(B)[:, None, None], torch.arange(H)[None, :, None], index, :
-            ] = attn
+            attns[torch.arange(B)[:, None, None], torch.arange(H)[None, :, None], index, :] = attn
             return (context_in, attns)
         else:
             return (context_in, None)
@@ -708,18 +701,14 @@ def forward(self, queries, keys, values, attn_mask):
         # get the context
         context = self._get_initial_context(values, L_Q)
         # update the context with selected top_k queries
-        context, attn = self._update_context(
-            context, values, scores_top, index, L_Q, attn_mask
-        )
+        context, attn = self._update_context(context, values, scores_top, index, L_Q, attn_mask)
 
         return context.transpose(2, 1).contiguous(), attn
 
 
 # source: https://github.com/zhouhaoyi/Informer2020/blob/main/models/attn.py
 class AttentionLayer(nn.Module):
-    def __init__(
-        self, attention, d_model, n_heads, d_keys=None, d_values=None, mix=False
-    ):
+    def __init__(self, attention, d_model, n_heads, d_keys=None, d_values=None, mix=False):
         super(AttentionLayer, self).__init__()
 
         d_keys = d_keys or (d_model // n_heads)
@@ -761,13 +750,13 @@ def __init__(self, c_in):
             padding=1,
             padding_mode="circular",
         )
-        self.norm = nn.BatchNorm1d(c_in) # Eli question: why batchnorm here?
+        self.norm = nn.BatchNorm1d(c_in)  # Eli question: why batchnorm here?
         self.activation = nn.ELU()
         self.maxPool = nn.MaxPool1d(kernel_size=3, stride=2, padding=1)
 
     def forward(self, x):
         x = self.downConv(x.permute(0, 2, 1))
-        x = self.norm(x) # Eli: why? maybe because the impl...
+        x = self.norm(x)  # Eli: why? maybe because the impl...
         x = self.activation(x)
         x = self.maxPool(x)
         x = x.transpose(1, 2)
@@ -830,9 +819,7 @@ def forward(self, x, cross, x_mask=None, cross_mask=None):
         x = x + self.dropout(self.self_attention(x, x, x, attn_mask=x_mask)[0])
         x = self.norm1(x)
 
-        x = x + self.dropout(
-            self.cross_attention(x, cross, cross, attn_mask=cross_mask)[0]
-        )
+        x = x + self.dropout(self.cross_attention(x, cross, cross, attn_mask=cross_mask)[0])
 
         y = x = self.norm2(x)
         y = self.dropout(self.activation(self.conv1(y.transpose(-1, 1))))
@@ -847,8 +834,9 @@ def __init__(self, config: InformerConfig):
 
         self.activation_fn = ACT2FN[config.activation_function]
         Attn = ProbAttention if config.attn == "prob" else FullAttention
-        self.attn_layers = nn.ModuleList([
-            EncoderLayer(
+        self.attn_layers = nn.ModuleList(
+            [
+                EncoderLayer(
                     AttentionLayer(
                         Attn(
                             mask_flag=False,
@@ -864,8 +852,10 @@ def __init__(self, config: InformerConfig):
                     d_ff=config.encoder_ffn_dim,
                     dropout=config.attention_dropout,
                     activation=self.activation_fn,
-                ) for _ in range(config.encoder_layers)
-        ])
+                )
+                for _ in range(config.encoder_layers)
+            ]
+        )
 
         if config.distil is not None:
             self.conv_layers = nn.ModuleList([ConvLayer(config.d_model) for _ in range(config.encoder_layers - 1)])
@@ -1000,22 +990,15 @@ def get_lagged_subsequences(
         self, sequence: torch.Tensor, subsequences_length: int, shift: int = 0
     ) -> torch.Tensor:
         """
-        Returns lagged subsequences of a given sequence.
-        Parameters
-        ----------
-        sequence : Tensor
-            the sequence from which lagged subsequences should be extracted.
-            Shape: (N, T, C).
+        Returns lagged subsequences of a given sequence. Parameters ---------- sequence : Tensor
+            the sequence from which lagged subsequences should be extracted. Shape: (N, T, C).
         subsequences_length : int
             length of the subsequences to be extracted.
         shift: int
             shift the lags by this amount back.
-        Returns
-        --------
-        lagged : Tensor
-            a tensor of shape (N, S, C, I), where S = subsequences_length and
-            I = len(indices), containing lagged subsequences. Specifically,
-            lagged[i, j, :, k] = sequence[i, -indices[k]-S+j, :].
+        Returns -------- lagged : Tensor
+            a tensor of shape (N, S, C, I), where S = subsequences_length and I = len(indices), containing lagged
+            subsequences. Specifically, lagged[i, j, :, k] = sequence[i, -indices[k]-S+j, :].
         """
         sequence_length = sequence.shape[1]
         indices = [lag - shift for lag in self.config.lags_sequence]
@@ -1125,24 +1108,24 @@ def get_decoder(self):
         return self.decoder
 
     def forward(
-            self,
-            past_values: torch.Tensor,
-            past_time_features: torch.Tensor,
-            past_observed_mask: torch.Tensor,
-            static_categorical_features: torch.Tensor,
-            static_real_features: torch.Tensor,
-            future_values: Optional[torch.Tensor] = None,
-            future_time_features: Optional[torch.Tensor] = None,
-            decoder_attention_mask: Optional[torch.LongTensor] = None,
-            head_mask: Optional[torch.Tensor] = None,
-            decoder_head_mask: Optional[torch.Tensor] = None,
-            cross_attn_head_mask: Optional[torch.Tensor] = None,
-            encoder_outputs: Optional[List[torch.FloatTensor]] = None,
-            past_key_values: Optional[List[torch.FloatTensor]] = None,
-            output_hidden_states: Optional[bool] = None,
-            output_attentions: Optional[bool] = None,
-            use_cache: Optional[bool] = None,
-            return_dict: Optional[bool] = None,
+        self,
+        past_values: torch.Tensor,
+        past_time_features: torch.Tensor,
+        past_observed_mask: torch.Tensor,
+        static_categorical_features: torch.Tensor,
+        static_real_features: torch.Tensor,
+        future_values: Optional[torch.Tensor] = None,
+        future_time_features: Optional[torch.Tensor] = None,
+        decoder_attention_mask: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        decoder_head_mask: Optional[torch.Tensor] = None,
+        cross_attn_head_mask: Optional[torch.Tensor] = None,
+        encoder_outputs: Optional[List[torch.FloatTensor]] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        use_cache: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
     ) -> Union[Seq2SeqTimeSeriesModelOutput, Tuple]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -1178,7 +1161,7 @@ def forward(
                 attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
             )
 
-        dec_input = transformer_inputs[:, self.config.context_length:, ...]
+        dec_input = transformer_inputs[:, self.config.context_length :, ...]
         decoder_outputs = self.decoder(
             inputs_embeds=dec_input,
             attention_mask=decoder_attention_mask,
@@ -1462,6 +1445,3 @@ def generate(
                 (-1, num_parallel_samples, self.config.prediction_length) + self.target_shape,
             )
         )
-
-
-
diff --git a/tests/models/informer/test_modeling_informer.py b/tests/models/informer/test_modeling_informer.py
index ecbea487e790..4a10708fa072 100644
--- a/tests/models/informer/test_modeling_informer.py
+++ b/tests/models/informer/test_modeling_informer.py
@@ -31,15 +31,8 @@
 if is_torch_available():
     import torch
 
-    from transformers import (
-        InformerConfig,
-        InformerForPrediction,
-        InformerModel,
-    )
-    from transformers.models.informer.modeling_informer import (
-        InformerDecoder,
-        InformerEncoder,
-    )
+    from transformers import InformerConfig, InformerForPrediction, InformerModel
+    from transformers.models.informer.modeling_informer import InformerDecoder, InformerEncoder
 
 
 @require_torch
@@ -171,9 +164,7 @@ def check_encoder_decoder_model_standalone(self, config, inputs_dict):
 
 @require_torch
 class InformerModelTest(ModelTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (InformerModel, InformerForPrediction) if is_torch_available() else ()
-    )
+    all_model_classes = (InformerModel, InformerForPrediction) if is_torch_available() else ()
     all_generative_model_classes = (InformerForPrediction,) if is_torch_available() else ()
     is_encoder_decoder = True
     test_pruning = False
@@ -374,9 +365,7 @@ def prepare_batch(filename="train-batch.pt"):
 @slow
 class InformerModelIntegrationTests(unittest.TestCase):
     def test_inference_no_head(self):
-        model = InformerModel.from_pretrained("huggingface/time-series-transformer-tourism-monthly").to(
-            torch_device
-        )
+        model = InformerModel.from_pretrained("huggingface/time-series-transformer-tourism-monthly").to(torch_device)
         batch = prepare_batch()
 
         with torch.no_grad():
@@ -399,9 +388,9 @@ def test_inference_no_head(self):
         self.assertTrue(torch.allclose(output[0, :3, :3], expected_slice, atol=TOLERANCE))
 
     def test_inference_head(self):
-        model = InformerForPrediction.from_pretrained(
-            "huggingface/time-series-transformer-tourism-monthly"
-        ).to(torch_device)
+        model = InformerForPrediction.from_pretrained("huggingface/time-series-transformer-tourism-monthly").to(
+            torch_device
+        )
         batch = prepare_batch("val-batch.pt")
         with torch.no_grad():
             output = model(
@@ -421,9 +410,9 @@ def test_inference_head(self):
         self.assertTrue(torch.allclose(output[0, :3, :3], expected_slice, atol=TOLERANCE))
 
     def test_seq_to_seq_generation(self):
-        model = InformerForPrediction.from_pretrained(
-            "huggingface/time-series-transformer-tourism-monthly"
-        ).to(torch_device)
+        model = InformerForPrediction.from_pretrained("huggingface/time-series-transformer-tourism-monthly").to(
+            torch_device
+        )
         batch = prepare_batch("val-batch.pt")
         with torch.no_grad():
             outputs = model.generate(

From aab1e08f9b3783573258bbff4e524638ff361313 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Mon, 30 Jan 2023 12:04:51 +0100
Subject: [PATCH 041/164] undo change

---
 .../configuration_time_series_transformer.py                    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py
index 68345408398e..8d89d5cd7f19 100644
--- a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py
+++ b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py
@@ -137,7 +137,7 @@ def __init__(
         context_length: Optional[int] = None,
         distribution_output: str = "student_t",
         loss: str = "nll",
-        lags_sequence: List[int] = [1, 2, 3, 4, 5, 6, 7], # Eli: Remove the default here
+        lags_sequence: List[int] = [1, 2, 3, 4, 5, 6, 7],
         scaling: bool = True,
         num_dynamic_real_features: int = 0,
         num_static_categorical_features: int = 0,

From 4e73184c0e33fc36786098d76fff59774c89965d Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Mon, 30 Jan 2023 11:27:16 +0000
Subject: [PATCH 042/164] cleaning informer, now need to work only enc-dec

---
 .../models/informer/modeling_informer.py      | 273 +++++++++++++++---
 1 file changed, 235 insertions(+), 38 deletions(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index b513e5c67d30..e0a49c98fa1e 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -52,6 +52,7 @@
 ]
 
 
+
 class AffineTransformed(TransformedDistribution):
     def __init__(self, base_distribution: Distribution, loc=None, scale=None, event_dim=0):
         self.scale = 1.0 if scale is None else scale
@@ -828,6 +829,177 @@ def forward(self, x, cross, x_mask=None, cross_mask=None):
         return self.norm3(x + y)
 
 
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerPreTrainedModel with TimeSeriesTransformer->Informer
+class InformerPreTrainedModel(PreTrainedModel):
+    config_class = InformerConfig
+    base_model_prefix = "model"
+    main_input_name = "past_values"
+    supports_gradient_checkpointing = True
+
+    def _init_weights(self, module):
+        std = self.config.init_std
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        if isinstance(module, (InformerDecoder, InformerEncoder)):
+            module.gradient_checkpointing = value
+
+
+INFORMER_START_DOCSTRING = r"""
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+
+    Parameters:
+        config ([`InformerConfig`]):
+            Model configuration class with all the parameters of the model. Initializing with a config file does not
+            load the weights associated with the model, only the configuration. Check out the
+            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+INFORMER_INPUTS_DOCSTRING = r"""
+    Args:
+        past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
+            Past values of the time series, that serve as context in order to predict the future. These values may
+            contain lags, i.e. additional values from the past which are added in order to serve as "extra context".
+            The `past_values` is what the Transformer encoder gets as input (with optional additional features, such as
+            `static_categorical_features`, `static_real_features`, `past_time_features`).
+
+            The sequence length here is equal to `context_length` + `max(config.lags_sequence)`.
+
+            Missing values need to be replaced with zeros.
+
+        past_time_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_features)`, *optional*):
+            Optional time features, which the model internally will add to `past_values`. These could be things like
+            "month of year", "day of the month", etc. encoded as vectors (for instance as Fourier features). These
+            could also be so-called "age" features, which basically help the model know "at which point in life" a
+            time-series is. Age features have small values for distant past time steps and increase monotonically the
+            more we approach the current time step.
+
+            These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT, where
+            the position encodings are learned from scratch internally as parameters of the model, the Time Series
+            Transformer requires to provide additional time features.
+
+            The Informer only learns additional embeddings for `static_categorical_features`.
+
+        past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected in
+            `[0, 1]`:
+
+            - 1 for values that are **observed**,
+            - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
+
+        static_categorical_features (`torch.LongTensor` of shape `(batch_size, number of static categorical features)`, *optional*):
+            Optional static categorical features for which the model will learn an embedding, which it will add to the
+            values of the time series.
+
+            Static categorical features are features which have the same value for all time steps (static over time).
+
+            A typical example of a static categorical feature is a time series ID.
+
+        static_real_features (`torch.FloatTensor` of shape `(batch_size, number of static real features)`, *optional*):
+            Optional static real features which the model will add to the values of the time series.
+
+            Static real features are features which have the same value for all time steps (static over time).
+
+            A typical example of a static real feature is promotion information.
+
+        future_values (`torch.FloatTensor` of shape `(batch_size, prediction_length)`):
+            Future values of the time series, that serve as labels for the model. The `future_values` is what the
+            Transformer needs to learn to output, given the `past_values`.
+
+            See the demo notebook and code snippets for details.
+
+            Missing values need to be replaced with zeros.
+
+        future_time_features (`torch.FloatTensor` of shape `(batch_size, prediction_length, num_features)`, *optional*):
+            Optional time features, which the model internally will add to `future_values`. These could be things like
+            "month of year", "day of the month", etc. encoded as vectors (for instance as Fourier features). These
+            could also be so-called "age" features, which basically help the model know "at which point in life" a
+            time-series is. Age features have small values for distant past time steps and increase monotonically the
+            more we approach the current time step.
+
+            These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT, where
+            the position encodings are learned from scratch internally as parameters of the model, the Informer
+            requires you to provide additional time features.
+
+            The Informer only learns additional embeddings for `static_categorical_features`.
+
+        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on certain token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+
+        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Mask to avoid performing attention on certain token indices. By default, a causal mask will be used, to
+            make sure the model can only look at previous inputs in order to predict the future.
+
+        head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+            Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        decoder_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+            Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        encoder_outputs (`tuple(tuple(torch.FloatTensor))`, *optional*):
+            Tuple consists of `last_hidden_state`, `hidden_states` (*optional*) and `attentions` (*optional*).
+            `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` (*optional*) is a sequence of
+            hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
+        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+            `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of shape
+            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+
+            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+
+            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
 class InformerEncoder(nn.Module):
     def __init__(self, config: InformerConfig):
         super(InformerEncoder, self).__init__()
@@ -938,29 +1110,11 @@ def forward(self, x, cross, x_mask=None, cross_mask=None):
         return x
 
 
-# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerPreTrainedModel with TimeSeriesTransformer->Informer
-class InformerPreTrainedModel(PreTrainedModel):
-    config_class = InformerConfig
-    base_model_prefix = "model"
-    main_input_name = "past_values"
-    supports_gradient_checkpointing = True
-
-    def _init_weights(self, module):
-        std = self.config.init_std
-        if isinstance(module, nn.Linear):
-            module.weight.data.normal_(mean=0.0, std=std)
-            if module.bias is not None:
-                module.bias.data.zero_()
-        elif isinstance(module, nn.Embedding):
-            module.weight.data.normal_(mean=0.0, std=std)
-            if module.padding_idx is not None:
-                module.weight.data[module.padding_idx].zero_()
-
-    def _set_gradient_checkpointing(self, module, value=False):
-        if isinstance(module, (InformerDecoder, InformerEncoder)):
-            module.gradient_checkpointing = value
-
-
+@add_start_docstrings(
+    "The bare Informer Model outputting raw hidden-states without any specific head on top.",
+    INFORMER_START_DOCSTRING,
+)
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerModel with TimeSeriesTransformer->Informer,TIME_SERIES_TRANSFORMER->INFORMER
 class InformerModel(InformerPreTrainedModel):
     def __init__(self, config: InformerConfig):
         super().__init__(config)
@@ -990,15 +1144,17 @@ def get_lagged_subsequences(
         self, sequence: torch.Tensor, subsequences_length: int, shift: int = 0
     ) -> torch.Tensor:
         """
-        Returns lagged subsequences of a given sequence. Parameters ---------- sequence : Tensor
-            the sequence from which lagged subsequences should be extracted. Shape: (N, T, C).
-        subsequences_length : int
-            length of the subsequences to be extracted.
-        shift: int
-            shift the lags by this amount back.
-        Returns -------- lagged : Tensor
-            a tensor of shape (N, S, C, I), where S = subsequences_length and I = len(indices), containing lagged
-            subsequences. Specifically, lagged[i, j, :, k] = sequence[i, -indices[k]-S+j, :].
+        Returns lagged subsequences of a given sequence, as a tensor of shape (N, S, C, I), where
+        S = subsequences_length and I = len(indices). Specifically,
+        lagged[i, j, :, k] = sequence[i, -indices[k]-S+j, :].
+
+        Args:
+            sequence: Tensor
+                The sequence from which lagged subsequences should be extracted. Shape: (N, T, C).
+            subsequences_length : int
+                Length of the subsequences to be extracted.
+            shift: int
+                Shift the lags by this amount back.
         """
         sequence_length = sequence.shape[1]
         indices = [lag - shift for lag in self.config.lags_sequence]
@@ -1096,10 +1252,11 @@ def enc_dec_outputs(self, transformer_inputs):
         enc_input = transformer_inputs[:, : self.config.context_length, ...]
         dec_input = transformer_inputs[:, self.config.context_length :, ...]
 
-        enc_out, _ = self.encoder(enc_input)
-        dec_output = self.decoder(dec_input, enc_out)
-
-        return self.param_proj(dec_output)
+        encoder_outputs = self.encoder(inputs_embeds=enc_input)
+        decoder_outputs = self.decoder(
+            inputs_embeds=dec_input, encoder_hidden_states=encoder_outputs.last_hidden_state
+        )
+        return encoder_outputs, decoder_outputs
 
     def get_encoder(self):
         return self.encoder
@@ -1107,6 +1264,8 @@ def get_encoder(self):
     def get_decoder(self):
         return self.decoder
 
+    @add_start_docstrings_to_model_forward(INFORMER_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=Seq2SeqTimeSeriesModelOutput, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
         past_values: torch.Tensor,
@@ -1127,6 +1286,37 @@ def forward(
         use_cache: Optional[bool] = None,
         return_dict: Optional[bool] = None,
     ) -> Union[Seq2SeqTimeSeriesModelOutput, Tuple]:
+        r"""
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from huggingface_hub import hf_hub_download
+        >>> import torch
+        >>> from transformers import InformerModel
+
+        >>> file = hf_hub_download(
+        ...     repo_id="kashif/tourism-monthly-batch", filename="train-batch.pt", repo_type="dataset"
+        ... )
+        >>> batch = torch.load(file)
+
+        >>> model = InformerModel.from_pretrained("huggingface/time-series-transformer-tourism-monthly")
+
+        >>> # during training, one provides both past and future values
+        >>> # as well as possible additional features
+        >>> outputs = model(
+        ...     past_values=batch["past_values"],
+        ...     past_time_features=batch["past_time_features"],
+        ...     past_observed_mask=batch["past_observed_mask"],
+        ...     static_categorical_features=batch["static_categorical_features"],
+        ...     static_real_features=batch["static_real_features"],
+        ...     future_values=batch["future_values"],
+        ...     future_time_features=batch["future_time_features"],
+        ... )
+
+        >>> last_hidden_state = outputs.last_hidden_state
+        ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1192,6 +1382,11 @@ def forward(
         )
 
 
+@add_start_docstrings(
+    "The Informer Model with a distribution head on top for time-series forecasting.",
+    INFORMER_START_DOCSTRING,
+)
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerForPrediction with TimeSeriesTransformer->Informer,TIME_SERIES_TRANSFORMER->INFORMER
 class InformerForPrediction(InformerPreTrainedModel):
     def __init__(self, config: InformerConfig):
         super().__init__(config)
@@ -1232,6 +1427,8 @@ def output_distribution(self, params, scale=None, trailing_n=None) -> torch.dist
             sliced_params = [p[:, -trailing_n:] for p in params]
         return self.distribution_output.distribution(sliced_params, scale=scale)
 
+    @add_start_docstrings_to_model_forward(INFORMER_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=Seq2SeqTimeSeriesModelOutput, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
         past_values: torch.Tensor,
@@ -1270,14 +1467,14 @@ def forward(
         ```python
         >>> from huggingface_hub import hf_hub_download
         >>> import torch
-        >>> from transformers import TimeSeriesTransformerForPrediction
+        >>> from transformers import InformerForPrediction
 
         >>> file = hf_hub_download(
         ...     repo_id="kashif/tourism-monthly-batch", filename="train-batch.pt", repo_type="dataset"
         ... )
         >>> batch = torch.load(file)
 
-        >>> model = TimeSeriesTransformerForPrediction.from_pretrained(
+        >>> model = InformerForPrediction.from_pretrained(
         ...     "huggingface/time-series-transformer-tourism-monthly"
         ... )
 

From f12c0b0a1e974bebfa5d5e5221ecdfd70b335b47 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Mon, 30 Jan 2023 14:18:51 +0100
Subject: [PATCH 043/164] initial enc-dec classes

---
 .../models/informer/modeling_informer.py      | 644 ++++++++++++++++--
 1 file changed, 579 insertions(+), 65 deletions(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index e0a49c98fa1e..518726672163 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -52,7 +52,6 @@
 ]
 
 
-
 class AffineTransformed(TransformedDistribution):
     def __init__(self, base_distribution: Distribution, loc=None, scale=None, event_dim=0):
         self.scale = 1.0 if scale is None else scale
@@ -546,6 +545,322 @@ class SampleTimeSeriesPredictionOutput(ModelOutput):
     sequences: torch.FloatTensor = None
 
 
+# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->Informer
+class InformerAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need'; serves as self- or cross-attention."""
+
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        dropout: float = 0.0,
+        is_decoder: bool = False,
+        bias: bool = True,
+    ):
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.head_dim = embed_dim // num_heads
+
+        if (self.head_dim * num_heads) != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
+                f" and `num_heads`: {num_heads})."
+            )
+        self.scaling = self.head_dim**-0.5  # 1 / sqrt(head_dim), applied to the projected queries
+        self.is_decoder = is_decoder
+
+        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+
+    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        key_value_states: Optional[torch.Tensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        layer_head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """Input shape: Batch x Time x Channel. Returns `(attn_output, attn_weights or None, past_key_value)`."""
+
+        # `key_value_states` is only passed when this layer acts as cross-attention; keys and
+        # values are then projected from it instead of from `hidden_states`
+        is_cross_attention = key_value_states is not None
+
+        bsz, tgt_len, _ = hidden_states.size()
+
+        # project the queries and pre-scale them by 1 / sqrt(head_dim)
+        query_states = self.q_proj(hidden_states) * self.scaling
+        # get key, value proj
+        # `past_key_value[0].shape[2] == key_value_states.shape[1]`
+        # is checking that the `sequence_length` of the `past_key_value` is the same as
+        # the provided `key_value_states` to support prefix tuning
+        if (
+            is_cross_attention
+            and past_key_value is not None
+            and past_key_value[0].shape[2] == key_value_states.shape[1]
+        ):
+            # reuse k,v, cross_attentions
+            key_states = past_key_value[0]
+            value_states = past_key_value[1]
+        elif is_cross_attention:
+            # cross_attentions
+            key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
+            value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
+        elif past_key_value is not None:
+            # reuse k, v, self_attention: append the new keys/values along the time axis
+            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+            key_states = torch.cat([past_key_value[0], key_states], dim=2)
+            value_states = torch.cat([past_key_value[1], value_states], dim=2)
+        else:
+            # self_attention
+            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+
+        if self.is_decoder:
+            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
+            # Further calls to cross_attention layer can then reuse all cross-attention
+            # key/value_states (first "if" case)
+            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
+            # all previous decoder key/value_states. Further calls to uni-directional self-attention
+            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+            # if encoder bi-directional self-attention `past_key_value` is always `None`
+            past_key_value = (key_states, value_states)
+
+        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
+        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
+        key_states = key_states.view(*proj_shape)
+        value_states = value_states.view(*proj_shape)
+
+        src_len = key_states.size(1)
+        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
+
+        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
+            raise ValueError(
+                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
+                )
+            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
+            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+        if layer_head_mask is not None:
+            if layer_head_mask.size() != (self.num_heads,):
+                raise ValueError(
+                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
+                    f" {layer_head_mask.size()}"
+                )
+            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+        if output_attentions:
+            # this operation is a bit awkward, but it's required to
+            # make sure that attn_weights keeps its gradient.
+            # In order to do so, attn_weights have to be reshaped
+            # twice and have to be reused in the following
+            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
+        else:
+            attn_weights_reshaped = None
+
+        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+
+        attn_output = torch.bmm(attn_probs, value_states)
+
+        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
+        attn_output = attn_output.transpose(1, 2)
+
+        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
+        # partitioned across GPUs when using tensor-parallelism.
+        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
+
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output, attn_weights_reshaped, past_key_value
+
+
+class ProbSparseAttention(nn.Module):
+    """ProbSparse attention (WIP): currently plain multi-headed attention, `factor` is stored but unused."""
+
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        dropout: float = 0.0,
+        is_decoder: bool = False,
+        factor: int = 5,
+        bias: bool = True,
+    ):
+        super().__init__()
+        self.factor = factor
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.head_dim = embed_dim // num_heads
+
+        if (self.head_dim * num_heads) != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
+                f" and `num_heads`: {num_heads})."
+            )
+        self.scaling = self.head_dim**-0.5  # 1 / sqrt(head_dim), applied to the projected queries
+        self.is_decoder = is_decoder
+
+        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+
+    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        key_value_states: Optional[torch.Tensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        layer_head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """Input shape: Batch x Time x Channel. Returns `(attn_output, attn_weights or None, past_key_value)`."""
+
+        # `key_value_states` is only passed when this layer acts as cross-attention; keys and
+        # values are then projected from it instead of from `hidden_states`
+        is_cross_attention = key_value_states is not None
+
+        bsz, tgt_len, _ = hidden_states.size()
+
+        # project the queries and pre-scale them by 1 / sqrt(head_dim)
+        query_states = self.q_proj(hidden_states) * self.scaling
+        # get key, value proj
+        # `past_key_value[0].shape[2] == key_value_states.shape[1]`
+        # is checking that the `sequence_length` of the `past_key_value` is the same as
+        # the provided `key_value_states` to support prefix tuning
+        if (
+            is_cross_attention
+            and past_key_value is not None
+            and past_key_value[0].shape[2] == key_value_states.shape[1]
+        ):
+            # reuse k,v, cross_attentions
+            key_states = past_key_value[0]
+            value_states = past_key_value[1]
+        elif is_cross_attention:
+            # cross_attentions
+            key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
+            value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
+        elif past_key_value is not None:
+            # reuse k, v, self_attention: append the new keys/values along the time axis
+            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+            key_states = torch.cat([past_key_value[0], key_states], dim=2)
+            value_states = torch.cat([past_key_value[1], value_states], dim=2)
+        else:
+            # self_attention
+            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+
+        if self.is_decoder:
+            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
+            # Further calls to cross_attention layer can then reuse all cross-attention
+            # key/value_states (first "if" case)
+            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
+            # all previous decoder key/value_states. Further calls to uni-directional self-attention
+            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+            # if encoder bi-directional self-attention `past_key_value` is always `None`
+            past_key_value = (key_states, value_states)
+
+        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
+        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
+        key_states = key_states.view(*proj_shape)
+        value_states = value_states.view(*proj_shape)
+
+        # TODO: calculate the sampled Q_K (the ProbSparse query selection, presumably
+        # controlled by `self.factor`). That step is not implemented yet, so the scores
+        # below are computed for every query/key pair, exactly as in dense scaled
+        # dot-product attention.
+
+        src_len = key_states.size(1)
+        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
+
+        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
+            raise ValueError(
+                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
+                )
+            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
+            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+        if layer_head_mask is not None:
+            if layer_head_mask.size() != (self.num_heads,):
+                raise ValueError(
+                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
+                    f" {layer_head_mask.size()}"
+                )
+            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+        if output_attentions:
+            # this operation is a bit awkward, but it's required to
+            # make sure that attn_weights keeps its gradient.
+            # In order to do so, attn_weights have to be reshaped
+            # twice and have to be reused in the following
+            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
+        else:
+            attn_weights_reshaped = None
+
+        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+
+        attn_output = torch.bmm(attn_probs, value_states)
+
+        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
+        attn_output = attn_output.transpose(1, 2)
+
+        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
+        # partitioned across GPUs when using tensor-parallelism.
+        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
+
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output, attn_weights_reshaped, past_key_value
+
+
 # Eli: TriangularCausalMask, ProbMask, FullAttention, ProbAttention and AttentionLayer
 # are from the original Informer repository (see the exact source below)
 
@@ -764,69 +1079,270 @@ def forward(self, x):
         return x
 
 
-# source: https://github.com/zhouhaoyi/Informer2020/blob/main/models/encoder.py
-class EncoderLayer(nn.Module):
-    def __init__(self, attention, d_model, d_ff=None, dropout=0.1, activation="relu"):
-        super(EncoderLayer, self).__init__()
-        d_ff = d_ff or 4 * d_model
-        self.attention = attention
-        self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1)
-        self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1)
-        self.norm1 = nn.LayerNorm(d_model)
-        self.norm2 = nn.LayerNorm(d_model)
-        self.dropout = nn.Dropout(dropout)
-        self.activation = F.relu if activation == "relu" else F.gelu
-
-    def forward(self, x, attn_mask=None):
-        # x [B, L, D]
-        # x = x + self.dropout(self.attention(
-        #     x, x, x,
-        #     attn_mask = attn_mask
-        # ))
-        new_x, attn = self.attention(x, x, x, attn_mask=attn_mask)
-        x = x + self.dropout(new_x)
-
-        y = x = self.norm1(x)
-        y = self.dropout(self.activation(self.conv1(y.transpose(-1, 1))))
-        y = self.dropout(self.conv2(y).transpose(-1, 1))
-
-        return self.norm2(x + y), attn
-
+class InformerEncoderLayer(nn.Module):
+    def __init__(self, config: InformerConfig):
+        super().__init__()
+        self.embed_dim = config.d_model
+        if config.attn == "prob":
+            self.self_attn = ProbSparseAttention(
+                embed_dim=self.embed_dim,
+                num_heads=config.encoder_attention_heads,
+                dropout=config.attention_dropout,
+                factor=config.factor,
+            )
+        else:
+            self.self_attn = InformerAttention(
+                embed_dim=self.embed_dim,
+                num_heads=config.encoder_attention_heads,
+                dropout=config.attention_dropout,
+            )
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.dropout = config.dropout
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.activation_dropout = config.activation_dropout
+        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
+        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
+        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
 
-# source: https://github.com/zhouhaoyi/Informer2020/blob/main/models/decoder.py
-class DecoderLayer(nn.Module):
-    def __init__(
+    def forward(
         self,
-        self_attention,
-        cross_attention,
-        d_model,
-        d_ff=None,
-        dropout=0.1,
-        activation="relu",
-    ):
-        super(DecoderLayer, self).__init__()
-        d_ff = d_ff or 4 * d_model
-        self.self_attention = self_attention
-        self.cross_attention = cross_attention
-        self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1)
-        self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1)
-        self.norm1 = nn.LayerNorm(d_model)
-        self.norm2 = nn.LayerNorm(d_model)
-        self.norm3 = nn.LayerNorm(d_model)
-        self.dropout = nn.Dropout(dropout)
-        self.activation = F.relu if activation == "relu" else F.gelu
-
-    def forward(self, x, cross, x_mask=None, cross_mask=None):
-        x = x + self.dropout(self.self_attention(x, x, x, attn_mask=x_mask)[0])
-        x = self.norm1(x)
-
-        x = x + self.dropout(self.cross_attention(x, cross, cross, attn_mask=cross_mask)[0])
-
-        y = x = self.norm2(x)
-        y = self.dropout(self.activation(self.conv1(y.transpose(-1, 1))))
-        y = self.dropout(self.conv2(y).transpose(-1, 1))
+        hidden_states: torch.FloatTensor,
+        attention_mask: torch.FloatTensor,
+        layer_head_mask: torch.FloatTensor,
+        output_attentions: Optional[bool] = False,
+    ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
+                `(encoder_attention_heads,)`.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+        """
+        residual = hidden_states
+        hidden_states, attn_weights, _ = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            layer_head_mask=layer_head_mask,
+            output_attentions=output_attentions,
+        )
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = residual + hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+
+        residual = hidden_states
+        hidden_states = self.activation_fn(self.fc1(hidden_states))
+        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+        hidden_states = self.fc2(hidden_states)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = residual + hidden_states
+        hidden_states = self.final_layer_norm(hidden_states)
+
+        if hidden_states.dtype == torch.float16 and (
+            torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
+        ):
+            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
+            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (attn_weights,)
+
+        return outputs
+
+
+# # source: https://github.com/zhouhaoyi/Informer2020/blob/main/models/encoder.py
+# class EncoderLayer(nn.Module):
+#     def __init__(self, attention, d_model, d_ff=None, dropout=0.1, activation="relu"):
+#         super(EncoderLayer, self).__init__()
+#         d_ff = d_ff or 4 * d_model
+#         self.attention = attention
+#         self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1)
+#         self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1)
+#         self.norm1 = nn.LayerNorm(d_model)
+#         self.norm2 = nn.LayerNorm(d_model)
+#         self.dropout = nn.Dropout(dropout)
+#         self.activation = F.relu if activation == "relu" else F.gelu
+
+#     def forward(self, x, attn_mask=None):
+#         # x [B, L, D]
+#         # x = x + self.dropout(self.attention(
+#         #     x, x, x,
+#         #     attn_mask = attn_mask
+#         # ))
+#         new_x, attn = self.attention(x, x, x, attn_mask=attn_mask)
+#         x = x + self.dropout(new_x)
+
+#         y = x = self.norm1(x)
+#         y = self.dropout(self.activation(self.conv1(y.transpose(-1, 1))))
+#         y = self.dropout(self.conv2(y).transpose(-1, 1))
+
+#         return self.norm2(x + y), attn
+
+
+class InformerDecoderLayer(nn.Module):
+    def __init__(self, config: InformerConfig):
+        super().__init__()
+        self.embed_dim = config.d_model
+
+        if config.attn == "prob":
+            self.self_attn = ProbSparseAttention(
+                embed_dim=self.embed_dim,
+                num_heads=config.encoder_attention_heads,
+                dropout=config.attention_dropout,
+                factor=config.factor,
+            )
+        else:
+            self.self_attn = InformerAttention(
+                embed_dim=self.embed_dim,
+                num_heads=config.decoder_attention_heads,
+                dropout=config.attention_dropout,
+                is_decoder=True,
+            )
+        self.dropout = config.dropout
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.activation_dropout = config.activation_dropout
+
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.encoder_attn = InformerAttention(
+            self.embed_dim,
+            config.decoder_attention_heads,
+            dropout=config.attention_dropout,
+            is_decoder=True,
+        )
+        self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
+        self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
+        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
 
-        return self.norm3(x + y)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        layer_head_mask: Optional[torch.Tensor] = None,
+        cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = True,
+    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            encoder_hidden_states (`torch.FloatTensor`):
+                cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
+            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
+                `(decoder_attention_heads,)`.
+            cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
+                size `(decoder_attention_heads,)`.
+            past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+        """
+        residual = hidden_states
+
+        # Self Attention
+        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
+        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
+        # add present self-attn cache to positions 1,2 of present_key_value tuple
+        hidden_states, self_attn_weights, present_key_value = self.self_attn(
+            hidden_states=hidden_states,
+            past_key_value=self_attn_past_key_value,
+            attention_mask=attention_mask,
+            layer_head_mask=layer_head_mask,
+            output_attentions=output_attentions,
+        )
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = residual + hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+
+        # Cross-Attention Block
+        cross_attn_present_key_value = None
+        cross_attn_weights = None
+        if encoder_hidden_states is not None:
+            residual = hidden_states
+
+            # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple
+            cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
+            hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn(
+                hidden_states=hidden_states,
+                key_value_states=encoder_hidden_states,
+                attention_mask=encoder_attention_mask,
+                layer_head_mask=cross_attn_layer_head_mask,
+                past_key_value=cross_attn_past_key_value,
+                output_attentions=output_attentions,
+            )
+            hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+            hidden_states = residual + hidden_states
+            hidden_states = self.encoder_attn_layer_norm(hidden_states)
+
+            # add cross-attn to positions 3,4 of present_key_value tuple
+            present_key_value = present_key_value + cross_attn_present_key_value
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.activation_fn(self.fc1(hidden_states))
+        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+        hidden_states = self.fc2(hidden_states)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = residual + hidden_states
+        hidden_states = self.final_layer_norm(hidden_states)
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (self_attn_weights, cross_attn_weights)
+
+        if use_cache:
+            outputs += (present_key_value,)
+
+        return outputs
+
+
+# # source: https://github.com/zhouhaoyi/Informer2020/blob/main/models/decoder.py
+# class DecoderLayer(nn.Module):
+#     def __init__(
+#         self,
+#         self_attention,
+#         cross_attention,
+#         d_model,
+#         d_ff=None,
+#         dropout=0.1,
+#         activation="relu",
+#     ):
+#         super(DecoderLayer, self).__init__()
+#         d_ff = d_ff or 4 * d_model
+#         self.self_attention = self_attention
+#         self.cross_attention = cross_attention
+#         self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1)
+#         self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1)
+#         self.norm1 = nn.LayerNorm(d_model)
+#         self.norm2 = nn.LayerNorm(d_model)
+#         self.norm3 = nn.LayerNorm(d_model)
+#         self.dropout = nn.Dropout(dropout)
+#         self.activation = F.relu if activation == "relu" else F.gelu
+
+#     def forward(self, x, cross, x_mask=None, cross_mask=None):
+#         x = x + self.dropout(self.self_attention(x, x, x, attn_mask=x_mask)[0])
+#         x = self.norm1(x)
+
+#         x = x + self.dropout(self.cross_attention(x, cross, cross, attn_mask=cross_mask)[0])
+
+#         y = x = self.norm2(x)
+#         y = self.dropout(self.activation(self.conv1(y.transpose(-1, 1))))
+#         y = self.dropout(self.conv2(y).transpose(-1, 1))
+
+#         return self.norm3(x + y)
 
 
 # Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerPreTrainedModel with TimeSeriesTransformer->Informer
@@ -1059,7 +1575,7 @@ def forward(self, x, attn_mask=None):
 
 class InformerDecoder(nn.Module):
     def __init__(self, config: InformerConfig):
-        super(InformerDecoder, self).__init__()
+        super().__init__(config)
 
         self.activation_fn = ACT2FN[config.activation_function]
         Attn = ProbAttention if config.attn == "prob" else FullAttention
@@ -1474,9 +1990,7 @@ def forward(
         ... )
         >>> batch = torch.load(file)
 
-        >>> model = InformerForPrediction.from_pretrained(
-        ...     "huggingface/time-series-transformer-tourism-monthly"
-        ... )
+        >>> model = InformerForPrediction.from_pretrained("huggingface/time-series-transformer-tourism-monthly")
 
         >>> # during training, one provides both past and future values
         >>> # as well as possible additional features

From 48337b0036d045bb4004b7c8a0d6f4fd91fee36e Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Mon, 30 Jan 2023 14:32:10 +0100
Subject: [PATCH 044/164] added encoder and decoder

---
 .../models/informer/modeling_informer.py      | 425 ++++++++++++++----
 1 file changed, 338 insertions(+), 87 deletions(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 518726672163..24251d6d99fc 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -1516,114 +1516,364 @@ def _set_gradient_checkpointing(self, module, value=False):
 """
 
 
-class InformerEncoder(nn.Module):
+class InformerEncoder(InformerPreTrainedModel):
     def __init__(self, config: InformerConfig):
-        super(InformerEncoder, self).__init__()
+        super().__init__(config)
 
-        self.activation_fn = ACT2FN[config.activation_function]
-        Attn = ProbAttention if config.attn == "prob" else FullAttention
-        self.attn_layers = nn.ModuleList(
-            [
-                EncoderLayer(
-                    AttentionLayer(
-                        Attn(
-                            mask_flag=False,
-                            factor=config.factor,
-                            attention_dropout=config.attention_dropout,
-                            output_attention=False,
-                        ),
-                        config.d_model,
-                        config.encoder_attention_heads,
-                        mix=False,
-                    ),
-                    config.d_model,
-                    d_ff=config.encoder_ffn_dim,
-                    dropout=config.attention_dropout,
-                    activation=self.activation_fn,
-                )
-                for _ in range(config.encoder_layers)
-            ]
-        )
+        self.dropout = config.dropout
+        self.layerdrop = config.encoder_layerdrop
+        self.gradient_checkpointing = False
+
+        embed_dim = config.d_model
+
+        self.layers = nn.ModuleList([InformerEncoderLayer(config) for _ in range(config.encoder_layers)])
+        self.layernorm_embedding = nn.LayerNorm(embed_dim)
 
         if config.distil is not None:
             self.conv_layers = nn.ModuleList([ConvLayer(config.d_model) for _ in range(config.encoder_layers - 1)])
         else:
             self.conv_layers = None
 
-        self.norm = torch.nn.LayerNorm(config.d_model)
-
-    def forward(self, x, attn_mask=None):
-        # x [B, L, D]
-        attns = []
-        if self.conv_layers is not None:
-            for attn_layer, conv_layer in zip(self.attn_layers, self.conv_layers):
-                x, attn = attn_layer(x, attn_mask=attn_mask)
-                x = conv_layer(x)
-                attns.append(attn)
-            x, attn = self.attn_layers[-1](x, attn_mask=attn_mask)
-            attns.append(attn)
-        else:
-            for attn_layer in self.attn_layers:
-                x, attn = attn_layer(x, attn_mask=attn_mask)
-                attns.append(attn)
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def forward(
+        self,
+        attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutput]:
+        r"""
+        Args:
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
+
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+                Embedded representation of the input sequence. This encoder has no `input_ids` argument and no
+                internal embedding lookup: the tensor passed here is fed directly to `layernorm_embedding` and then
+                to the encoder layers.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        hidden_states = inputs_embeds
+        hidden_states = self.layernorm_embedding(hidden_states)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+        # expand attention_mask
+        if attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            attention_mask = _expand_mask(attention_mask, inputs_embeds.dtype)
+
+        encoder_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+
+        # check if head_mask has a correct number of layers specified if desired
+        if head_mask is not None:
+            if head_mask.size()[0] != (len(self.layers)):
+                raise ValueError(
+                    f"The head_mask should be specified for {len(self.layers)} layers, but it is for"
+                    f" {head_mask.size()[0]}."
+                )
+
+        for idx, encoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)
+            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
+            dropout_probability = random.uniform(0, 1)
+            if self.training and (dropout_probability < self.layerdrop):  # skip the layer
+                layer_outputs = (None, None)
+            else:
+                if self.gradient_checkpointing and self.training:
+
+                    def create_custom_forward(module):
+                        def custom_forward(*inputs):
+                            return module(*inputs, output_attentions)
+
+                        return custom_forward
 
-        if self.norm is not None:
-            x = self.norm(x)
+                    layer_outputs = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(encoder_layer),
+                        hidden_states,
+                        attention_mask,
+                        (head_mask[idx] if head_mask is not None else None),
+                    )
+                else:
+                    layer_outputs = encoder_layer(
+                        hidden_states,
+                        attention_mask,
+                        layer_head_mask=(head_mask[idx] if head_mask is not None else None),
+                        output_attentions=output_attentions,
+                    )
 
-        return x, attns
+                hidden_states = layer_outputs[0]
 
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
 
-class InformerDecoder(nn.Module):
+        if output_hidden_states:
+            encoder_states = encoder_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+        )
+
+
+class InformerDecoder(InformerPreTrainedModel):
     def __init__(self, config: InformerConfig):
         super().__init__(config)
 
-        self.activation_fn = ACT2FN[config.activation_function]
-        Attn = ProbAttention if config.attn == "prob" else FullAttention
+        self.dropout = config.dropout
+        self.layerdrop = config.decoder_layerdrop
 
-        # Masked Decoder
-        self.layers = nn.ModuleList(
-            [
-                DecoderLayer(
-                    AttentionLayer(
-                        Attn(
-                            mask_flag=True,
-                            factor=config.factor,
-                            attention_dropout=config.attention_dropout,
-                            output_attention=False,
-                        ),
-                        config.d_model,
-                        config.decoder_attention_heads,
-                        mix=True,
-                    ),
-                    AttentionLayer(
-                        FullAttention(
-                            mask_flag=False,
-                            factor=config.factor,
-                            attention_dropout=config.attention_dropout,
-                            output_attention=False,
-                        ),
-                        config.d_model,
-                        config.decoder_attention_heads,
-                        mix=False,
+        self.layers = nn.ModuleList([InformerDecoderLayer(config) for _ in range(config.decoder_layers)])
+        self.layernorm_embedding = nn.LayerNorm(config.d_model)
+
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
+        # create causal mask
+        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+        combined_attention_mask = None
+        if input_shape[-1] > 1:
+            combined_attention_mask = _make_causal_mask(
+                input_shape, inputs_embeds.dtype, past_key_values_length=past_key_values_length
+            ).to(inputs_embeds.device)
+
+        if attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
+                inputs_embeds.device
+            )
+            combined_attention_mask = (
+                expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
+            )
+
+        return combined_attention_mask
+
+    def forward(
+        self,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        cross_attn_head_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
+        r"""
+        Args:
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
+                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
+                of the decoder.
+            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
+                Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
+                selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
+
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+
+            cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+                Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing
+                cross-attention on hidden heads. Mask values selected in `[0, 1]`:
+
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+
+            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+                Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
+                shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
+                shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+
+                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
+                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+
+                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
+                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
+                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+                Embedded representation of the decoder input sequence. This decoder has no `input_ids` argument and
+                no internal embedding lookup: the tensor passed here is fed directly to `layernorm_embedding` and
+                then to the decoder layers.
+
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        input_shape = inputs_embeds.size()[:-1]
+
+        # past_key_values_length
+        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
+
+        attention_mask = self._prepare_decoder_attention_mask(
+            attention_mask, input_shape, inputs_embeds, past_key_values_length
+        )
+
+        # expand encoder attention mask
+        if encoder_hidden_states is not None and encoder_attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1])
+
+        hidden_states = inputs_embeds
+        hidden_states = self.layernorm_embedding(hidden_states)
+
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
+        next_decoder_cache = () if use_cache else None
+
+        # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
+        for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
+            if attn_mask is not None:
+                if attn_mask.size()[0] != (len(self.layers)):
+                    raise ValueError(
+                        f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
+                        f" {head_mask.size()[0]}."
+                    )
+
+        for idx, decoder_layer in enumerate(self.layers):
+            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            dropout_probability = random.uniform(0, 1)
+            if self.training and (dropout_probability < self.layerdrop):
+                continue
+
+            past_key_value = past_key_values[idx] if past_key_values is not None else None
+
+            if self.gradient_checkpointing and self.training:
+
+                if use_cache:
+                    logger.warning(
+                        "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                    )
+                    use_cache = False
+
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        # None for past_key_value
+                        return module(*inputs, output_attentions, use_cache)
+
+                    return custom_forward
+
+                layer_outputs = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(decoder_layer),
+                    hidden_states,
+                    attention_mask,
+                    encoder_hidden_states,
+                    encoder_attention_mask,
+                    head_mask[idx] if head_mask is not None else None,
+                    cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None,
+                    None,
+                )
+            else:
+
+                layer_outputs = decoder_layer(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    encoder_hidden_states=encoder_hidden_states,
+                    encoder_attention_mask=encoder_attention_mask,
+                    layer_head_mask=(head_mask[idx] if head_mask is not None else None),
+                    cross_attn_layer_head_mask=(
+                        cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None
                     ),
-                    config.d_model,
-                    d_ff=config.decoder_ffn_dim,
-                    dropout=config.dropout,
-                    activation=self.activation_fn,
+                    past_key_value=past_key_value,
+                    output_attentions=output_attentions,
+                    use_cache=use_cache,
                 )
-                for _ in range(config.decoder_layers)
-            ],
-        )
-        self.norm = torch.nn.LayerNorm(config.d_model)
+            hidden_states = layer_outputs[0]
 
-    def forward(self, x, cross, x_mask=None, cross_mask=None):
-        for layer in self.layers:
-            x = layer(x, cross, x_mask=x_mask, cross_mask=cross_mask)
+            if use_cache:
+                next_decoder_cache += (layer_outputs[3 if output_attentions else 1],)
 
-        if self.norm is not None:
-            x = self.norm(x)
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
 
-        return x
+                if encoder_hidden_states is not None:
+                    all_cross_attentions += (layer_outputs[2],)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        next_cache = next_decoder_cache if use_cache else None
+        if not return_dict:
+            return tuple(
+                v
+                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions]
+                if v is not None
+            )
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+            cross_attentions=all_cross_attentions,
+        )
 
 
 @add_start_docstrings(
@@ -1906,6 +2156,7 @@ def forward(
 class InformerForPrediction(InformerPreTrainedModel):
     def __init__(self, config: InformerConfig):
         super().__init__(config)
+
         self.model = InformerModel(config)
         if config.distribution_output == "student_t":
             self.distribution_output = StudentTOutput(dim=config.input_size)

From 537404910f06a06d001a1ad748195cdf3c216d8b Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Mon, 30 Jan 2023 14:51:06 +0100
Subject: [PATCH 045/164] added todo

---
 src/transformers/models/informer/modeling_informer.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 24251d6d99fc..2358c4b305b5 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -796,10 +796,11 @@ def forward(
         key_states = key_states.view(*proj_shape)
         value_states = value_states.view(*proj_shape)
 
+        # TODO
         # calculate the sampled Q_K
-        import pdb
+        # import pdb
 
-        pdb.set_trace()
+        # pdb.set_trace()
 
         src_len = key_states.size(1)
         attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))

From dd3b46a20ee6d00669457bdda84652a6d20d205a Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Mon, 30 Jan 2023 15:22:26 +0100
Subject: [PATCH 046/164] add todos for conv_layers

---
 src/transformers/models/informer/modeling_informer.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 2358c4b305b5..b33865dec351 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -1623,6 +1623,7 @@ def custom_forward(*inputs):
                         attention_mask,
                         (head_mask[idx] if head_mask is not None else None),
                     )
+                    # TODO support for checkpointing conv_layers
                 else:
                     layer_outputs = encoder_layer(
                         hidden_states,
@@ -1630,6 +1631,7 @@ def custom_forward(*inputs):
                         layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                         output_attentions=output_attentions,
                     )
+                    # TODO support for conv_layers
 
                 hidden_states = layer_outputs[0]
 

From 6bf419e6aef9bf928bec8ed0081bdcafa9f4d742 Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Tue, 31 Jan 2023 12:43:32 +0000
Subject: [PATCH 047/164] added decoder docs from vanilla

---
 src/transformers/models/informer/modeling_informer.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index b33865dec351..048ad23e7488 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -1647,8 +1647,16 @@ def custom_forward(*inputs):
             last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
         )
 
-
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerDecoder with TimeSeriesTransformer->Informer
 class InformerDecoder(InformerPreTrainedModel):
+    """
+    Informer decoder consisting of *config.decoder_layers* layers. Each layer is a
+    [`InformerDecoderLayer`]
+
+    Args:
+        config: InformerConfig
+    """
+    
     def __init__(self, config: InformerConfig):
         super().__init__(config)
 

From ed21221d6016e389764bffb07ad5fc649692f149 Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Tue, 31 Jan 2023 12:49:12 +0000
Subject: [PATCH 048/164] added encoder docs from vanilla

---
 .../models/informer/modeling_informer.py             | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 048ad23e7488..db3057db0aaf 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -1516,8 +1516,16 @@ def _set_gradient_checkpointing(self, module, value=False):
             Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
 """
 
-
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerEncoder with TimeSeriesTransformer->Informer
 class InformerEncoder(InformerPreTrainedModel):
+    """
+    Informer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
+    [`InformerEncoderLayer`].
+
+    Args:
+        config: InformerConfig
+    """
+
     def __init__(self, config: InformerConfig):
         super().__init__(config)
 
@@ -1656,7 +1664,7 @@ class InformerDecoder(InformerPreTrainedModel):
     Args:
         config: InformerConfig
     """
-    
+
     def __init__(self, config: InformerConfig):
         super().__init__(config)
 

From e4c7875d3f419549845567a768c004c849277e20 Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Tue, 31 Jan 2023 12:56:31 +0000
Subject: [PATCH 049/164] remove encoder decoder from the original informer

---
 .../models/informer/modeling_informer.py      | 63 -------------------
 1 file changed, 63 deletions(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index db3057db0aaf..b9897a2c4357 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -1156,33 +1156,6 @@ def forward(
         return outputs
 
 
-# # source: https://github.com/zhouhaoyi/Informer2020/blob/main/models/encoder.py
-# class EncoderLayer(nn.Module):
-#     def __init__(self, attention, d_model, d_ff=None, dropout=0.1, activation="relu"):
-#         super(EncoderLayer, self).__init__()
-#         d_ff = d_ff or 4 * d_model
-#         self.attention = attention
-#         self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1)
-#         self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1)
-#         self.norm1 = nn.LayerNorm(d_model)
-#         self.norm2 = nn.LayerNorm(d_model)
-#         self.dropout = nn.Dropout(dropout)
-#         self.activation = F.relu if activation == "relu" else F.gelu
-
-#     def forward(self, x, attn_mask=None):
-#         # x [B, L, D]
-#         # x = x + self.dropout(self.attention(
-#         #     x, x, x,
-#         #     attn_mask = attn_mask
-#         # ))
-#         new_x, attn = self.attention(x, x, x, attn_mask=attn_mask)
-#         x = x + self.dropout(new_x)
-
-#         y = x = self.norm1(x)
-#         y = self.dropout(self.activation(self.conv1(y.transpose(-1, 1))))
-#         y = self.dropout(self.conv2(y).transpose(-1, 1))
-
-#         return self.norm2(x + y), attn
 
 
 class InformerDecoderLayer(nn.Module):
@@ -1310,42 +1283,6 @@ def forward(
         return outputs
 
 
-# # source: https://github.com/zhouhaoyi/Informer2020/blob/main/models/decoder.py
-# class DecoderLayer(nn.Module):
-#     def __init__(
-#         self,
-#         self_attention,
-#         cross_attention,
-#         d_model,
-#         d_ff=None,
-#         dropout=0.1,
-#         activation="relu",
-#     ):
-#         super(DecoderLayer, self).__init__()
-#         d_ff = d_ff or 4 * d_model
-#         self.self_attention = self_attention
-#         self.cross_attention = cross_attention
-#         self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1)
-#         self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1)
-#         self.norm1 = nn.LayerNorm(d_model)
-#         self.norm2 = nn.LayerNorm(d_model)
-#         self.norm3 = nn.LayerNorm(d_model)
-#         self.dropout = nn.Dropout(dropout)
-#         self.activation = F.relu if activation == "relu" else F.gelu
-
-#     def forward(self, x, cross, x_mask=None, cross_mask=None):
-#         x = x + self.dropout(self.self_attention(x, x, x, attn_mask=x_mask)[0])
-#         x = self.norm1(x)
-
-#         x = x + self.dropout(self.cross_attention(x, cross, cross, attn_mask=cross_mask)[0])
-
-#         y = x = self.norm2(x)
-#         y = self.dropout(self.activation(self.conv1(y.transpose(-1, 1))))
-#         y = self.dropout(self.conv2(y).transpose(-1, 1))
-
-#         return self.norm3(x + y)
-
-
 # Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerPreTrainedModel with TimeSeriesTransformer->Informer
 class InformerPreTrainedModel(PreTrainedModel):
     config_class = InformerConfig

From 645810a866358bbc4527c42ed9cfcd9b1d5fa07a Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Tue, 31 Jan 2023 13:14:15 +0000
Subject: [PATCH 050/164] removed AttentionLayer from the original paper

---
 .../models/informer/modeling_informer.py      | 36 +------------------
 1 file changed, 1 insertion(+), 35 deletions(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index b9897a2c4357..e70999a865b3 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -1023,39 +1023,6 @@ def forward(self, queries, keys, values, attn_mask):
         return context.transpose(2, 1).contiguous(), attn
 
 
-# source: https://github.com/zhouhaoyi/Informer2020/blob/main/models/attn.py
-class AttentionLayer(nn.Module):
-    def __init__(self, attention, d_model, n_heads, d_keys=None, d_values=None, mix=False):
-        super(AttentionLayer, self).__init__()
-
-        d_keys = d_keys or (d_model // n_heads)
-        d_values = d_values or (d_model // n_heads)
-
-        self.inner_attention = attention
-        self.query_projection = nn.Linear(d_model, d_keys * n_heads)
-        self.key_projection = nn.Linear(d_model, d_keys * n_heads)
-        self.value_projection = nn.Linear(d_model, d_values * n_heads)
-        self.out_projection = nn.Linear(d_values * n_heads, d_model)
-        self.n_heads = n_heads
-        self.mix = mix
-
-    def forward(self, queries, keys, values, attn_mask):
-        B, L, _ = queries.shape
-        _, S, _ = keys.shape
-        H = self.n_heads
-
-        queries = self.query_projection(queries).view(B, L, H, -1)
-        keys = self.key_projection(keys).view(B, S, H, -1)
-        values = self.value_projection(values).view(B, S, H, -1)
-
-        out, attn = self.inner_attention(queries, keys, values, attn_mask)
-        if self.mix:
-            out = out.transpose(2, 1).contiguous()
-        out = out.view(B, L, -1)
-
-        return self.out_projection(out), attn
-
-
 # source: https://github.com/zhouhaoyi/Informer2020/blob/main/models/encoder.py
 class ConvLayer(nn.Module):
     def __init__(self, c_in):
@@ -1156,8 +1123,6 @@ def forward(
         return outputs
 
 
-
-
 class InformerDecoderLayer(nn.Module):
     def __init__(self, config: InformerConfig):
         super().__init__()
@@ -1592,6 +1557,7 @@ def custom_forward(*inputs):
             last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
         )
 
+
 # Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerDecoder with TimeSeriesTransformer->Informer
 class InformerDecoder(InformerPreTrainedModel):
     """

From 2529b7c7be29129aa2f83f6a0622a29f700b7767 Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Tue, 31 Jan 2023 13:25:06 +0000
Subject: [PATCH 051/164] removed TriangularCausalMask, same as
 decoder_attention_mask

---
 .../models/informer/modeling_informer.py             | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index e70999a865b3..82ec7fa3afbe 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -865,18 +865,6 @@ def forward(
 # Eli: TriangularCausalMask, ProbMask, FullAttention, ProbAttention and AttentionLayer
 # are from the original Informer repository (see the exact source below)
 
-# source: https://github.com/zhouhaoyi/Informer2020/blob/main/utils/masking.py
-class TriangularCausalMask:
-    def __init__(self, B, L, device="cpu"):
-        mask_shape = [B, 1, L, L]
-        with torch.no_grad():
-            self._mask = torch.triu(torch.ones(mask_shape, dtype=torch.bool), diagonal=1).to(device)
-
-    @property
-    def mask(self):
-        return self._mask
-
-
 # source: https://github.com/zhouhaoyi/Informer2020/blob/main/utils/masking.py
 class ProbMask:
     def __init__(self, B, H, L, index, scores, device="cpu"):

From 225af5661ab0b073c02c557ccc2a808775bb64df Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Tue, 31 Jan 2023 15:30:18 +0100
Subject: [PATCH 052/164] initial sparse attention

---
 .../models/informer/modeling_informer.py      | 22 ++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 82ec7fa3afbe..b406ed68bc41 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -796,11 +796,27 @@ def forward(
         key_states = key_states.view(*proj_shape)
         value_states = value_states.view(*proj_shape)
 
-        # TODO
+        # c*ln(L_k)
+        L_K = key_states.size(1)
+        U_part = min(self.factor * np.ceil(np.log1p(L_K)).astype("int").item(), L_K)
+
+        # c*ln(L_q)
+        L_Q = query_states.size(1)
+        u = min(self.factor * np.ceil(np.log1p(L_Q)).astype("int").item(), L_Q)
+
         # calculate the sampled Q_K
-        # import pdb
+        K_expand = key_states.unsqueeze(2).expand(-1, L_Q, L_K, -1)
+        index_sample = torch.randint(0, L_K, (L_Q, U_part))
+        # real U = U_part(factor*ln(L_k))*L_q
+        K_sample = K_expand[:, torch.arange(L_Q).unsqueeze(1), index_sample, :]
+        Q_K_sample = torch.bmm(query_states, key_states.transpose(1, 2))
 
-        # pdb.set_trace()
+        # find the Top_k query with sparisty measurement
+        M = Q_K_sample.max(dim=-1)[0] - torch.div(Q_K_sample.sum(dim=-1), L_K)
+        M_top = M.topk(u, sorted=False)[1]
+
+        # use the reduced Q to calculate Q_K
+        # TODO
 
         src_len = key_states.size(1)
         attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))

From f31ec9828356df235f7846415fbdcefeda07fb32 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Wed, 1 Feb 2023 12:34:34 +0100
Subject: [PATCH 053/164] use conv_layers

---
 .../models/informer/modeling_informer.py      | 24 +++++++++++--------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index b406ed68bc41..cfb178c665c5 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -804,23 +804,25 @@ def forward(
         L_Q = query_states.size(1)
         u = min(self.factor * np.ceil(np.log1p(L_Q)).astype("int").item(), L_Q)
 
+        # __prob_QK
         # calculate the sampled Q_K
         K_expand = key_states.unsqueeze(2).expand(-1, L_Q, L_K, -1)
         index_sample = torch.randint(0, L_K, (L_Q, U_part))
+
         # real U = U_part(factor*ln(L_k))*L_q
-        K_sample = K_expand[:, torch.arange(L_Q).unsqueeze(1), index_sample, :]
-        Q_K_sample = torch.bmm(query_states, key_states.transpose(1, 2))
+        K_sample = K_expand[:, :, index_sample, :]
+        Q_K_sample = torch.bmm(query_states, K_sample.transpose(1, 2))
 
         # find the Top_k query with sparisty measurement
         M = Q_K_sample.max(dim=-1)[0] - torch.div(Q_K_sample.sum(dim=-1), L_K)
         M_top = M.topk(u, sorted=False)[1]
 
         # use the reduced Q to calculate Q_K
-        # TODO
+        # factor*ln(L_q)
+        Q_reduce = query_states[:, M_top, :]
+        attn_weights = torch.bmm(Q_reduce, key_states.transpose(1, 2))
 
         src_len = key_states.size(1)
-        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
-
         if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
             raise ValueError(
                 f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
@@ -881,6 +883,7 @@ def forward(
 # Eli: TriangularCausalMask, ProbMask, FullAttention, ProbAttention and AttentionLayer
 # are from the original Informer repository (see the exact source below)
 
+
 # source: https://github.com/zhouhaoyi/Informer2020/blob/main/utils/masking.py
 class ProbMask:
     def __init__(self, B, H, L, index, scores, device="cpu"):
@@ -1422,6 +1425,7 @@ def _set_gradient_checkpointing(self, module, value=False):
             Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
 """
 
+
 # Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerEncoder with TimeSeriesTransformer->Informer
 class InformerEncoder(InformerPreTrainedModel):
     """
@@ -1446,8 +1450,9 @@ def __init__(self, config: InformerConfig):
 
         if config.distil is not None:
             self.conv_layers = nn.ModuleList([ConvLayer(config.d_model) for _ in range(config.encoder_layers - 1)])
+            self.conv_layers.append(None)
         else:
-            self.conv_layers = None
+            self.conv_layers = [None] * config.encoder_layers
 
         # Initialize weights and apply final processing
         self.post_init()
@@ -1515,7 +1520,7 @@ def forward(
                     f" {head_mask.size()[0]}."
                 )
 
-        for idx, encoder_layer in enumerate(self.layers):
+        for idx, encoder_layer, conv_layer in enumerate(zip(self.layers, self.conv_layers)):
             if output_hidden_states:
                 encoder_states = encoder_states + (hidden_states,)
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
@@ -1545,7 +1550,8 @@ def custom_forward(*inputs):
                         layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                         output_attentions=output_attentions,
                     )
-                    # TODO support for conv_layers
+                    if conv_layer is not None:
+                        hidden_states = conv_layer(hidden_states)
 
                 hidden_states = layer_outputs[0]
 
@@ -1732,7 +1738,6 @@ def forward(
             past_key_value = past_key_values[idx] if past_key_values is not None else None
 
             if self.gradient_checkpointing and self.training:
-
                 if use_cache:
                     logger.warning(
                         "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
@@ -1757,7 +1762,6 @@ def custom_forward(*inputs):
                     None,
                 )
             else:
-
                 layer_outputs = decoder_layer(
                     hidden_states,
                     attention_mask=attention_mask,

From 778c2a282ecb4d49aebf46ff36bfee2a74317965 Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Wed, 1 Feb 2023 13:55:23 +0000
Subject: [PATCH 054/164] fixed test_config test

---
 .../models/informer/configuration_informer.py             | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/transformers/models/informer/configuration_informer.py b/src/transformers/models/informer/configuration_informer.py
index a303b8586167..9e628755f012 100644
--- a/src/transformers/models/informer/configuration_informer.py
+++ b/src/transformers/models/informer/configuration_informer.py
@@ -174,14 +174,16 @@ def __init__(
         self.prediction_length = prediction_length
         self.context_length = context_length or prediction_length
         self.distribution_output = distribution_output
-        self.loss = loss  # Eli: From vanilla ts transformer
+        self.loss = loss
         self.input_size = input_size
         self.num_time_features = num_time_features
-        self.lags_sequence = lags_sequence
+        self.lags_sequence = lags_sequence if lags_sequence is not None else [1, 2, 3, 4, 5, 6, 7]
         self.scaling = scaling
         self.num_dynamic_real_features = num_dynamic_real_features
         self.num_static_real_features = num_static_real_features
         self.num_static_categorical_features = num_static_categorical_features
+
+        # set cardinality
         if cardinality and num_static_categorical_features > 0:
             if len(cardinality) != num_static_categorical_features:
                 raise ValueError(
@@ -204,7 +206,7 @@ def __init__(
         self.num_parallel_samples = num_parallel_samples
 
         # Transformer architecture configuration
-        self.d_model = input_size * len(lags_sequence) + self._number_of_features
+        self.d_model = input_size * len(self.lags_sequence) + self._number_of_features
         self.encoder_attention_heads = encoder_attention_heads
         self.decoder_attention_heads = decoder_attention_heads
         self.encoder_ffn_dim = encoder_ffn_dim

From 7363444e6a98b454d4484ca07e70079403487097 Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Wed, 1 Feb 2023 14:03:19 +0000
Subject: [PATCH 055/164] fix parenthesis when iterating zip(layers,
 conv_layers)

---
 src/transformers/models/informer/modeling_informer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index cfb178c665c5..ad3acca10bbe 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -1520,7 +1520,7 @@ def forward(
                     f" {head_mask.size()[0]}."
                 )
 
-        for idx, encoder_layer, conv_layer in enumerate(zip(self.layers, self.conv_layers)):
+        for idx, (encoder_layer, conv_layer) in enumerate(zip(self.layers, self.conv_layers)):
             if output_hidden_states:
                 encoder_states = encoder_states + (hidden_states,)
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)

From 86486d2eb362b1e7c2ac500db75fd0861eb8aff3 Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Wed, 1 Feb 2023 14:18:14 +0000
Subject: [PATCH 056/164] error found in prob attention, added sizes as
 comments

---
 src/transformers/models/informer/modeling_informer.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index ad3acca10bbe..1bb253fdb70c 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -806,12 +806,13 @@ def forward(
 
         # __prob_QK
         # calculate the sampled Q_K
-        K_expand = key_states.unsqueeze(2).expand(-1, L_Q, L_K, -1)
-        index_sample = torch.randint(0, L_K, (L_Q, U_part))
+        K_expand = key_states.unsqueeze(2).expand(-1, L_Q, L_K, -1) # torch.Size([52, 14, 14, 4])
+        index_sample = torch.randint(0, L_K, (L_Q, U_part)) # torch.Size([14, 14])
 
         # real U = U_part(factor*ln(L_k))*L_q
-        K_sample = K_expand[:, :, index_sample, :]
-        Q_K_sample = torch.bmm(query_states, K_sample.transpose(1, 2))
+        K_sample = K_expand[:, :, index_sample, :] # torch.Size([52, 14, 14, 14, 4])
+        Q_K_sample = torch.bmm(query_states, K_sample.transpose(1, 2)) # error
+        # torch.Size([52, 14, 4]) x torch.Size([52, 14, 14, 14, 4])
 
         # find the Top_k query with sparisty measurement
         M = Q_K_sample.max(dim=-1)[0] - torch.div(Q_K_sample.sum(dim=-1), L_K)

From 606ca81a5d74a1e86d399e4835ac392121a54d03 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Wed, 1 Feb 2023 15:29:28 +0100
Subject: [PATCH 057/164] fix sizes

---
 .../models/informer/modeling_informer.py          | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 1bb253fdb70c..4762b68bc29a 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -806,13 +806,13 @@ def forward(
 
         # __prob_QK
         # calculate the sampled Q_K
-        K_expand = key_states.unsqueeze(2).expand(-1, L_Q, L_K, -1) # torch.Size([52, 14, 14, 4])
-        index_sample = torch.randint(0, L_K, (L_Q, U_part)) # torch.Size([14, 14])
+        # K_expand = key_states.unsqueeze(2).expand(-1, L_Q, L_K, -1) # torch.Size([52, 14, 14, 4])
+        index_sample = torch.randint(0, L_K, (U_part,)) # torch.Size([14])
 
         # real U = U_part(factor*ln(L_k))*L_q
-        K_sample = K_expand[:, :, index_sample, :] # torch.Size([52, 14, 14, 14, 4])
-        Q_K_sample = torch.bmm(query_states, K_sample.transpose(1, 2)) # error
-        # torch.Size([52, 14, 4]) x torch.Size([52, 14, 14, 14, 4])
+        K_sample = key_states[:, index_sample, :] # torch.Size([52, 14, 4])
+        Q_K_sample = torch.bmm(query_states, K_sample.transpose(1, 2))
+        # torch.Size([52, 14, 4]) x torch.Size([52, 4, 14])
 
         # find the Top_k query with sparisty measurement
         M = Q_K_sample.max(dim=-1)[0] - torch.div(Q_K_sample.sum(dim=-1), L_K)
@@ -820,7 +820,10 @@ def forward(
 
         # use the reduced Q to calculate Q_K
         # factor*ln(L_q)
-        Q_reduce = query_states[:, M_top, :]
+        # Q_reduce = query_states[:, M_top, :]
+        Q_reduce = torch.gather(
+            input=query_states, dim=0, index=torch.tile(M_top[:, :, None], (1, 1, query_states.shape[2]))
+        )
         attn_weights = torch.bmm(Q_reduce, key_states.transpose(1, 2))
 
         src_len = key_states.size(1)

From b416de8f24f4df44cb7cc154219a2da9e187368a Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Fri, 3 Feb 2023 12:05:21 +0000
Subject: [PATCH 058/164] added proposal for q_reduce indexing, and remove
 unused

---
 .../models/informer/modeling_informer.py      | 45 ++-----------------
 1 file changed, 4 insertions(+), 41 deletions(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 4762b68bc29a..17f1c0dc4043 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -824,6 +824,10 @@ def forward(
         Q_reduce = torch.gather(
             input=query_states, dim=0, index=torch.tile(M_top[:, :, None], (1, 1, query_states.shape[2]))
         )
+        # Eli: might be more nice
+        # dim_for_slice = torch.arange(query_states.size(0)).unsqueeze(-1)
+        # Q_reduce = query_states[dim_for_slice, M_top]
+
         attn_weights = torch.bmm(Q_reduce, key_states.transpose(1, 2))
 
         src_len = key_states.size(1)
@@ -884,9 +888,6 @@ def forward(
         return attn_output, attn_weights_reshaped, past_key_value
 
 
-# Eli: TriangularCausalMask, ProbMask, FullAttention, ProbAttention and AttentionLayer
-# are from the original Informer repository (see the exact source below)
-
 
 # source: https://github.com/zhouhaoyi/Informer2020/blob/main/utils/masking.py
 class ProbMask:
@@ -900,44 +901,6 @@ def __init__(self, B, H, L, index, scores, device="cpu"):
     def mask(self):
         return self._mask
 
-
-# source: https://github.com/zhouhaoyi/Informer2020/blob/main/models/attn.py
-class FullAttention(nn.Module):
-    def __init__(
-        self,
-        mask_flag=True,
-        factor=5,
-        scale=None,
-        attention_dropout=0.1,
-        output_attention=False,
-    ):
-        super(FullAttention, self).__init__()
-        self.scale = scale
-        self.mask_flag = mask_flag
-        self.output_attention = output_attention
-        self.dropout = nn.Dropout(attention_dropout)
-
-    def forward(self, queries, keys, values, attn_mask):
-        B, L, H, E = queries.shape
-        _, S, _, D = values.shape
-        scale = self.scale or 1.0 / sqrt(E)
-
-        scores = torch.einsum("blhe,bshe->bhls", queries, keys)
-        if self.mask_flag:
-            if attn_mask is None:
-                attn_mask = TriangularCausalMask(B, L, device=queries.device)
-
-            scores.masked_fill_(attn_mask.mask, -np.inf)
-
-        A = self.dropout(torch.softmax(scale * scores, dim=-1))
-        V = torch.einsum("bhls,bshd->blhd", A, values)
-
-        if self.output_attention:
-            return (V.contiguous(), A)
-        else:
-            return (V.contiguous(), None)
-
-
 # source: https://github.com/zhouhaoyi/Informer2020/blob/main/models/attn.py
 class ProbAttention(nn.Module):
     def __init__(

From 3bfbb23d99d79cd55e88e863cc23dac63ee605a4 Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Fri, 3 Feb 2023 12:09:31 +0000
Subject: [PATCH 059/164] WIP ProbMask, and changed factor=2 for testing

---
 .../models/informer/configuration_informer.py |  2 +-
 .../models/informer/modeling_informer.py      | 20 +++++++++++++++++++
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/src/transformers/models/informer/configuration_informer.py b/src/transformers/models/informer/configuration_informer.py
index 9e628755f012..ca95a51e4308 100644
--- a/src/transformers/models/informer/configuration_informer.py
+++ b/src/transformers/models/informer/configuration_informer.py
@@ -166,7 +166,7 @@ def __init__(
         use_cache=True,
         # Informer arguments
         attn: str = "prob",
-        factor: int = 5,
+        factor: int = 2,
         distil: bool = True,
         **kwargs
     ):
diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 17f1c0dc4043..edd38ad9e670 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -1578,6 +1578,26 @@ def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_em
 
         return combined_attention_mask
 
+    def _prepare_decoder_prob_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
+            # create prob mask
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            combined_attention_mask = None
+            if input_shape[-1] > 1:
+                combined_attention_mask = _make_causal_mask( # TODO _make_prob_mask
+                    input_shape, inputs_embeds.dtype, past_key_values_length=past_key_values_length
+                ).to(inputs_embeds.device)
+
+            if attention_mask is not None:
+                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+                expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
+                    inputs_embeds.device
+                )
+                combined_attention_mask = (
+                    expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
+                )
+
+            return combined_attention_mask
+
     def forward(
         self,
         attention_mask: Optional[torch.Tensor] = None,

From 11a081e09e92771e51a5d2758d53a9afb59547f0 Mon Sep 17 00:00:00 2001
From: elisim <elisimhayev@gmail.com>
Date: Tue, 7 Feb 2023 18:46:22 +0700
Subject: [PATCH 060/164] remove unused libs for this PR for creating the env

---
 setup.py | 43 +++++++++++++++++++++----------------------
 1 file changed, 21 insertions(+), 22 deletions(-)

diff --git a/setup.py b/setup.py
index aad145b145f2..603760581faa 100644
--- a/setup.py
+++ b/setup.py
@@ -123,7 +123,6 @@
     "jax>=0.2.8,!=0.3.2,<=0.3.6",
     "jaxlib>=0.1.65,<=0.3.6",
     "jieba",
-    "kenlm",
     "keras-nlp>=0.3.1",
     "nltk",
     "natten>=0.14.4",
@@ -159,9 +158,9 @@
     "sigopt",
     "librosa",
     "starlette",
-    "tensorflow-cpu>=2.4,<2.12",
-    "tensorflow>=2.4,<2.12",
-    "tensorflow-text",
+    # "tensorflow-cpu>=2.4,<2.12",
+    # "tensorflow>=2.4,<2.12",
+    # "tensorflow-text",
     "tf2onnx",
     "timeout-decorator",
     "timm",
@@ -176,7 +175,7 @@
     "uvicorn",
     "beautifulsoup4",
     "sudachipy>=0.6.6",
-    "sudachidict_core>=20220729",
+    # "sudachidict_core>=20220729",
     "rhoknp>=1.1.0",
 ]
 
@@ -246,11 +245,11 @@ def run(self):
 
 extras = {}
 
-extras["ja"] = deps_list("fugashi", "ipadic", "unidic_lite", "unidic", "sudachipy", "sudachidict_core", "rhoknp")
+extras["ja"] = deps_list("fugashi", "ipadic", "unidic_lite", "unidic", "sudachipy", "rhoknp")
 extras["sklearn"] = deps_list("scikit-learn")
 
-extras["tf"] = deps_list("tensorflow", "onnxconverter-common", "tf2onnx", "tensorflow-text", "keras-nlp")
-extras["tf-cpu"] = deps_list("tensorflow-cpu", "onnxconverter-common", "tf2onnx", "tensorflow-text", "keras-nlp")
+extras["tf"] = deps_list("tf2onnx")
+extras["tf-cpu"] = deps_list("tf2onnx")
 
 extras["torch"] = deps_list("torch")
 extras["accelerate"] = deps_list("accelerate")
@@ -278,7 +277,7 @@ def run(self):
 extras["integrations"] = extras["optuna"] + extras["ray"] + extras["sigopt"]
 
 extras["serving"] = deps_list("pydantic", "uvicorn", "fastapi", "starlette")
-extras["audio"] = deps_list("librosa", "pyctcdecode", "phonemizer", "kenlm")
+extras["audio"] = deps_list("librosa", "pyctcdecode", "phonemizer")
 # `pip install ".[speech]"` is deprecated and `pip install ".[torch-speech]"` should be used instead
 extras["speech"] = deps_list("torchaudio") + extras["audio"]
 extras["torch-speech"] = deps_list("torchaudio") + extras["audio"]
@@ -363,19 +362,19 @@ def run(self):
     + extras["modelcreation"]
     + extras["onnxruntime"]
 )
-extras["dev-tensorflow"] = (
-    extras["testing"]
-    + extras["tf"]
-    + extras["sentencepiece"]
-    + extras["tokenizers"]
-    + extras["vision"]
-    + extras["quality"]
-    + extras["docs_specific"]
-    + extras["sklearn"]
-    + extras["modelcreation"]
-    + extras["onnx"]
-    + extras["tf-speech"]
-)
+# extras["dev-tensorflow"] = (
+#     extras["testing"]
+#     + extras["tf"]
+#     + extras["sentencepiece"]
+#     + extras["tokenizers"]
+#     + extras["vision"]
+#     + extras["quality"]
+#     + extras["docs_specific"]
+#     + extras["sklearn"]
+#     + extras["modelcreation"]
+#     + extras["onnx"]
+#     + extras["tf-speech"]
+# )
 extras["dev"] = (
     extras["all"]
     + extras["testing"]

From bb70b1b8291030df6906fdf7fc3a40a415ed9456 Mon Sep 17 00:00:00 2001
From: elisim <elisimhayev@gmail.com>
Date: Fri, 10 Feb 2023 15:45:01 +0700
Subject: [PATCH 061/164] fix checking the attn_weights.size() after bmm

---
 .../models/informer/modeling_informer.py             | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index edd38ad9e670..12152a730790 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -806,15 +806,13 @@ def forward(
 
         # __prob_QK
         # calculate the sampled Q_K
-        # K_expand = key_states.unsqueeze(2).expand(-1, L_Q, L_K, -1) # torch.Size([52, 14, 14, 4])
-        index_sample = torch.randint(0, L_K, (U_part,)) # torch.Size([14])
+        index_sample = torch.randint(low=0, high=L_K, size=(U_part,)) # torch.Size([U_part])
 
         # real U = U_part(factor*ln(L_k))*L_q
-        K_sample = key_states[:, index_sample, :] # torch.Size([52, 14, 4])
-        Q_K_sample = torch.bmm(query_states, K_sample.transpose(1, 2))
-        # torch.Size([52, 14, 4]) x torch.Size([52, 4, 14])
+        K_sample = key_states[:, index_sample, :]  # torch.Size([bsz * self.num_heads, U_part, channel])
+        Q_K_sample = torch.bmm(query_states, K_sample.transpose(1, 2)) # torch.Size([bsz * self.num_heads, L_Q, U_part])
 
-        # find the Top_k query with sparisty measurement
+        # find the Top_k query with sparsity measurement
         M = Q_K_sample.max(dim=-1)[0] - torch.div(Q_K_sample.sum(dim=-1), L_K)
         M_top = M.topk(u, sorted=False)[1]
 
@@ -831,7 +829,7 @@ def forward(
         attn_weights = torch.bmm(Q_reduce, key_states.transpose(1, 2))
 
         src_len = key_states.size(1)
-        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
+        if attn_weights.size() != (bsz * self.num_heads, u, src_len):
             raise ValueError(
                 f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
                 f" {attn_weights.size()}"

From 588205b76eee986d4cef3d86c272a72bd993ed93 Mon Sep 17 00:00:00 2001
From: elisim <elisimhayev@gmail.com>
Date: Fri, 10 Feb 2023 15:57:18 +0700
Subject: [PATCH 062/164] Q_reduce: changed from torch.gather to simple slicing

---
 src/transformers/models/informer/modeling_informer.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 12152a730790..b74ce9b82764 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -819,12 +819,8 @@ def forward(
         # use the reduced Q to calculate Q_K
         # factor*ln(L_q)
         # Q_reduce = query_states[:, M_top, :]
-        Q_reduce = torch.gather(
-            input=query_states, dim=0, index=torch.tile(M_top[:, :, None], (1, 1, query_states.shape[2]))
-        )
-        # Eli: might be more nice
-        # dim_for_slice = torch.arange(query_states.size(0)).unsqueeze(-1)
-        # Q_reduce = query_states[dim_for_slice, M_top]
+        dim_for_slice = torch.arange(query_states.size(0)).unsqueeze(-1)
+        Q_reduce = query_states[dim_for_slice, M_top]
 
         attn_weights = torch.bmm(Q_reduce, key_states.transpose(1, 2))
 

From b3f595f98d43ef3efb97b2a37733f00784c15331 Mon Sep 17 00:00:00 2001
From: elisim <elisimhayev@gmail.com>
Date: Fri, 10 Feb 2023 18:23:00 +0700
Subject: [PATCH 063/164] WIP calculate final attn_output

---
 .../models/informer/modeling_informer.py      | 26 +++++++++++++++----
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index b74ce9b82764..182a0878782a 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -827,11 +827,12 @@ def forward(
         src_len = key_states.size(1)
         if attn_weights.size() != (bsz * self.num_heads, u, src_len):
             raise ValueError(
-                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
+                f"Attention weights should be of size {(bsz * self.num_heads, u, src_len)}, but is"
                 f" {attn_weights.size()}"
             )
 
         if attention_mask is not None:
+            # TODO: change tgt_len to u
             if attention_mask.size() != (bsz, 1, tgt_len, src_len):
                 raise ValueError(
                     f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
@@ -842,6 +843,7 @@ def forward(
         attn_weights = nn.functional.softmax(attn_weights, dim=-1)
 
         if layer_head_mask is not None:
+            # TODO: change tgt_len to u
             if layer_head_mask.size() != (self.num_heads,):
                 raise ValueError(
                     f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
@@ -855,14 +857,28 @@ def forward(
             # make sure that attn_weights keeps its gradient.
             # In order to do so, attn_weights have to be reshaped
             # twice and have to be reused in the following
-            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
-            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
+            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, u, src_len)
+            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, u, src_len)
         else:
             attn_weights_reshaped = None
 
-        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+        # The authors didn't use attention dropout.
+        # Not removing this yet, waiting for Kashif approval
+        # attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+        # attn_output = torch.bmm(attn_probs, value_states)
+        attn_output = torch.bmm(attn_weights, value_states)
 
-        attn_output = torch.bmm(attn_probs, value_states)
+        # Build final output
+        # reimplemented from the original:
+        # https://github.com/zhouhaoyi/Informer2020/blob/ac59c7447135473fb2aafeafe94395f884d5c7a5/models/attn.py#L70
+        if attention_mask is not None:
+            v_aggregated = value_states.mean(dim=-2)
+        else:
+            v_aggregated = value_states.cumsum(dim=-2)
+
+        # TODO: combine v_aggregated with attn_output to create the new attn_output
+        # https://github.com/zhouhaoyi/Informer2020/blob/ac59c7447135473fb2aafeafe94395f884d5c7a5/models/attn.py#L90
+        attn_output = v_aggregated
 
         if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
             raise ValueError(

From ceef0ad09856c55835542b0b46d907a8827b61ee Mon Sep 17 00:00:00 2001
From: elisim <elisimhayev@gmail.com>
Date: Sat, 11 Feb 2023 18:46:50 +0700
Subject: [PATCH 064/164] finish adding v_aggregated, attn_output ready

---
 .../models/informer/modeling_informer.py             | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 182a0878782a..14e2e38de62e 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -871,13 +871,17 @@ def forward(
         # Build final output
         # reimplemented from the original:
         # https://github.com/zhouhaoyi/Informer2020/blob/ac59c7447135473fb2aafeafe94395f884d5c7a5/models/attn.py#L70
-        if attention_mask is not None:
-            v_aggregated = value_states.mean(dim=-2)
+        if attention_mask is None:
+            v_aggregated = value_states.mean(dim=1)
+            v_aggregated = v_aggregated.unsqueeze(dim=1).expand(bsz * self.num_heads, L_Q, v_aggregated.size(-1))
         else:
-            v_aggregated = value_states.cumsum(dim=-2)
+            v_aggregated = value_states.cumsum(dim=1)
 
-        # TODO: combine v_aggregated with attn_output to create the new attn_output
         # https://github.com/zhouhaoyi/Informer2020/blob/ac59c7447135473fb2aafeafe94395f884d5c7a5/models/attn.py#L90
+        dim_for_slice = torch.arange(v_aggregated.size(0)).unsqueeze(-1)
+        v_aggregated[dim_for_slice, M_top, :] = attn_output
+
+        # Rename final output
         attn_output = v_aggregated
 
         if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):

From db95fd8b3b1f4d82549eb66b8f0618360844d649 Mon Sep 17 00:00:00 2001
From: elisim <elisimhayev@gmail.com>
Date: Sat, 11 Feb 2023 18:50:57 +0700
Subject: [PATCH 065/164] changed tgt_len to u in attention_mask, need to fix
 the size error

---
 src/transformers/models/informer/modeling_informer.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 14e2e38de62e..16aa4a7bf31f 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -832,13 +832,12 @@ def forward(
             )
 
         if attention_mask is not None:
-            # TODO: change tgt_len to u
-            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
+            if attention_mask.size() != (bsz, 1, u, src_len):
                 raise ValueError(
-                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
+                    f"Attention mask should be of size {(bsz, 1, u, src_len)}, but is {attention_mask.size()}"
                 )
-            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
-            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+            attn_weights = attn_weights.view(bsz, self.num_heads, u, src_len) + attention_mask
+            attn_weights = attn_weights.view(bsz * self.num_heads, u, src_len)
 
         attn_weights = nn.functional.softmax(attn_weights, dim=-1)
 

From 3946b0cef0f3aa56caec621ff6c9d05214535f31 Mon Sep 17 00:00:00 2001
From: elisim <elisimhayev@gmail.com>
Date: Sun, 12 Feb 2023 09:39:09 +0200
Subject: [PATCH 066/164] comment attention_mask for encoder, and fix if cond
 for v_agg

---
 .../models/informer/modeling_informer.py      | 39 ++++++-------------
 1 file changed, 11 insertions(+), 28 deletions(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 16aa4a7bf31f..24ee71f35e50 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -831,13 +831,16 @@ def forward(
                 f" {attn_weights.size()}"
             )
 
-        if attention_mask is not None:
-            if attention_mask.size() != (bsz, 1, u, src_len):
-                raise ValueError(
-                    f"Attention mask should be of size {(bsz, 1, u, src_len)}, but is {attention_mask.size()}"
-                )
-            attn_weights = attn_weights.view(bsz, self.num_heads, u, src_len) + attention_mask
-            attn_weights = attn_weights.view(bsz * self.num_heads, u, src_len)
+        # Original impl don't apply attention_mask to the input of the encoder, only for the decoder
+        # For the decoder, it creates a casual mask sliced with M_top
+        # if attention_mask is not None:
+        #     if attention_mask.size() != (bsz, 1, tgt_len, src_len):
+        #         raise ValueError(
+        #             f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
+        #         )
+        #
+        #     attn_weights = attn_weights.view(bsz, self.num_heads, u, src_len) + attention_mask
+        #     attn_weights = attn_weights.view(bsz * self.num_heads, u, src_len)
 
         attn_weights = nn.functional.softmax(attn_weights, dim=-1)
 
@@ -870,7 +873,7 @@ def forward(
         # Build final output
         # reimplemented from the original:
         # https://github.com/zhouhaoyi/Informer2020/blob/ac59c7447135473fb2aafeafe94395f884d5c7a5/models/attn.py#L70
-        if attention_mask is None:
+        if not self.is_decoder:
             v_aggregated = value_states.mean(dim=1)
             v_aggregated = v_aggregated.unsqueeze(dim=1).expand(bsz * self.num_heads, L_Q, v_aggregated.size(-1))
         else:
@@ -1591,26 +1594,6 @@ def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_em
 
         return combined_attention_mask
 
-    def _prepare_decoder_prob_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
-            # create prob mask
-            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
-            combined_attention_mask = None
-            if input_shape[-1] > 1:
-                combined_attention_mask = _make_causal_mask( # TODO _make_prob_mask
-                    input_shape, inputs_embeds.dtype, past_key_values_length=past_key_values_length
-                ).to(inputs_embeds.device)
-
-            if attention_mask is not None:
-                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
-                expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
-                    inputs_embeds.device
-                )
-                combined_attention_mask = (
-                    expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
-                )
-
-            return combined_attention_mask
-
     def forward(
         self,
         attention_mask: Optional[torch.Tensor] = None,

From d2db148739ede984a950aa0710a45ffeffe052ce Mon Sep 17 00:00:00 2001
From: elisim <elisimhayev@gmail.com>
Date: Sun, 12 Feb 2023 20:30:58 +0700
Subject: [PATCH 067/164] added ProbMask support (wip), removed old original
 code

---
 .../models/informer/modeling_informer.py      | 123 ++----------------
 1 file changed, 13 insertions(+), 110 deletions(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 24ee71f35e50..2cd65fb1cf71 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -831,16 +831,12 @@ def forward(
                 f" {attn_weights.size()}"
             )
 
-        # Original impl don't apply attention_mask to the input of the encoder, only for the decoder
-        # For the decoder, it creates a casual mask sliced with M_top
-        # if attention_mask is not None:
-        #     if attention_mask.size() != (bsz, 1, tgt_len, src_len):
-        #         raise ValueError(
-        #             f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
-        #         )
-        #
-        #     attn_weights = attn_weights.view(bsz, self.num_heads, u, src_len) + attention_mask
-        #     attn_weights = attn_weights.view(bsz * self.num_heads, u, src_len)
+        # Original impl don't apply attention_mask to the encoder, only for the decoder
+        # For the decoder, it creates a casual mask sliced with M_top (ProbMask)
+        if self.is_decoder:
+            prob_mask = ProbMask(B=bsz, H=self.num_heads, L=L_Q , index=M_top, scores=attn_weights)  # size = (bsz, 1, u, src_len)
+            attn_weights = attn_weights.view(bsz, self.num_heads, u, src_len) + prob_mask
+            attn_weights = attn_weights.view(bsz * self.num_heads, u, src_len)
 
         attn_weights = nn.functional.softmax(attn_weights, dim=-1)
 
@@ -904,7 +900,6 @@ def forward(
         return attn_output, attn_weights_reshaped, past_key_value
 
 
-
 # source: https://github.com/zhouhaoyi/Informer2020/blob/main/utils/masking.py
 class ProbMask:
     def __init__(self, B, H, L, index, scores, device="cpu"):
@@ -917,102 +912,6 @@ def __init__(self, B, H, L, index, scores, device="cpu"):
     def mask(self):
         return self._mask
 
-# source: https://github.com/zhouhaoyi/Informer2020/blob/main/models/attn.py
-class ProbAttention(nn.Module):
-    def __init__(
-        self,
-        mask_flag=True,
-        factor=5,
-        scale=None,
-        attention_dropout=0.1,
-        output_attention=False,
-    ):
-        super(ProbAttention, self).__init__()
-        self.factor = factor
-        self.scale = scale
-        self.mask_flag = mask_flag
-        self.output_attention = output_attention
-        self.dropout = nn.Dropout(attention_dropout)
-
-    def _prob_QK(self, Q, K, sample_k, n_top):  # n_top: c*ln(L_q)
-        # Q [B, H, L, D]
-        B, H, L_K, E = K.shape
-        _, _, L_Q, _ = Q.shape
-
-        # calculate the sampled Q_K
-        K_expand = K.unsqueeze(-3).expand(B, H, L_Q, L_K, E)
-        index_sample = torch.randint(L_K, (L_Q, sample_k))  # real U = U_part(factor*ln(L_k))*L_q
-        K_sample = K_expand[:, :, torch.arange(L_Q).unsqueeze(1), index_sample, :]
-        Q_K_sample = torch.matmul(Q.unsqueeze(-2), K_sample.transpose(-2, -1)).squeeze(-2)
-
-        # find the Top_k query with sparisty measurement
-        M = Q_K_sample.max(-1)[0] - torch.div(Q_K_sample.sum(-1), L_K)
-        M_top = M.topk(n_top, sorted=False)[1]
-
-        # use the reduced Q to calculate Q_K
-        Q_reduce = Q[torch.arange(B)[:, None, None], torch.arange(H)[None, :, None], M_top, :]  # factor*ln(L_q)
-        Q_K = torch.matmul(Q_reduce, K.transpose(-2, -1))  # factor*ln(L_q)*L_k
-
-        return Q_K, M_top
-
-    def _get_initial_context(self, V, L_Q):
-        B, H, L_V, D = V.shape
-        if not self.mask_flag:
-            # V_sum = V.sum(dim=-2)
-            V_sum = V.mean(dim=-2)
-            contex = V_sum.unsqueeze(-2).expand(B, H, L_Q, V_sum.shape[-1]).clone()
-        else:  # use mask
-            assert L_Q == L_V  # requires that L_Q == L_V, i.e. for self-attention only
-            contex = V.cumsum(dim=-2)
-        return contex
-
-    def _update_context(self, context_in, V, scores, index, L_Q, attn_mask):
-        B, H, L_V, D = V.shape
-
-        if self.mask_flag:
-            attn_mask = ProbMask(B, H, L_Q, index, scores, device=V.device)
-            scores.masked_fill_(attn_mask.mask, -np.inf)
-
-        attn = torch.softmax(scores, dim=-1)  # nn.Softmax(dim=-1)(scores)
-
-        context_in[torch.arange(B)[:, None, None], torch.arange(H)[None, :, None], index, :] = torch.matmul(
-            attn, V
-        ).type_as(context_in)
-        if self.output_attention:
-            attns = (torch.ones([B, H, L_V, L_V]) / L_V).type_as(attn).to(attn.device)
-            attns[torch.arange(B)[:, None, None], torch.arange(H)[None, :, None], index, :] = attn
-            return (context_in, attns)
-        else:
-            return (context_in, None)
-
-    def forward(self, queries, keys, values, attn_mask):
-        B, L_Q, H, D = queries.shape
-        _, L_K, _, _ = keys.shape
-
-        queries = queries.transpose(2, 1)
-        keys = keys.transpose(2, 1)
-        values = values.transpose(2, 1)
-
-        U_part = self.factor * np.ceil(np.log1p(L_K)).astype("int").item()  # c*ln(L_k)
-        u = self.factor * np.ceil(np.log1p(L_Q)).astype("int").item()  # c*ln(L_q)
-
-        U_part = U_part if U_part < L_K else L_K
-        u = u if u < L_Q else L_Q
-
-        scores_top, index = self._prob_QK(queries, keys, sample_k=U_part, n_top=u)
-
-        # add scale factor
-        scale = self.scale or 1.0 / sqrt(D)
-        if scale is not None:
-            scores_top = scores_top * scale
-        # get the context
-        context = self._get_initial_context(values, L_Q)
-        # update the context with selected top_k queries
-        context, attn = self._update_context(context, values, scores_top, index, L_Q, attn_mask)
-
-        return context.transpose(2, 1).contiguous(), attn
-
-
 # source: https://github.com/zhouhaoyi/Informer2020/blob/main/models/encoder.py
 class ConvLayer(nn.Module):
     def __init__(self, c_in):
@@ -1124,6 +1023,7 @@ def __init__(self, config: InformerConfig):
                 num_heads=config.encoder_attention_heads,
                 dropout=config.attention_dropout,
                 factor=config.factor,
+                is_decoder=True
             )
         else:
             self.self_attn = InformerAttention(
@@ -1569,6 +1469,7 @@ def __init__(self, config: InformerConfig):
 
         self.layers = nn.ModuleList([InformerDecoderLayer(config) for _ in range(config.decoder_layers)])
         self.layernorm_embedding = nn.LayerNorm(config.d_model)
+        self.attn = config.attn
 
         self.gradient_checkpointing = False
         # Initialize weights and apply final processing
@@ -1681,9 +1582,11 @@ def forward(
         # past_key_values_length
         past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
 
-        attention_mask = self._prepare_decoder_attention_mask(
-            attention_mask, input_shape, inputs_embeds, past_key_values_length
-        )
+        # create casual mask only if it's full attention (and not ProbAttention)
+        if self.attn != 'prob':
+            attention_mask = self._prepare_decoder_attention_mask(
+                attention_mask, input_shape, inputs_embeds, past_key_values_length
+            )
 
         # expand encoder attention mask
         if encoder_hidden_states is not None and encoder_attention_mask is not None:

From 4477d429dac5f84fc227d82f1f4e215a7fef8655 Mon Sep 17 00:00:00 2001
From: elisim <elisimhayev@gmail.com>
Date: Sun, 12 Feb 2023 21:17:02 +0700
Subject: [PATCH 068/164] =?UTF-8?q?finished=20ProbMask=20=F0=9F=98=83?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../models/informer/modeling_informer.py      | 32 +++++++++----------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 2cd65fb1cf71..e75c79dad2ec 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -834,21 +834,19 @@ def forward(
         # Original impl don't apply attention_mask to the encoder, only for the decoder
         # For the decoder, it creates a casual mask sliced with M_top (ProbMask)
         if self.is_decoder:
-            prob_mask = ProbMask(B=bsz, H=self.num_heads, L=L_Q , index=M_top, scores=attn_weights)  # size = (bsz, 1, u, src_len)
-            attn_weights = attn_weights.view(bsz, self.num_heads, u, src_len) + prob_mask
-            attn_weights = attn_weights.view(bsz * self.num_heads, u, src_len)
+            prob_mask = _prepare_decoder_prob_attention_mask(L=L_Q, M_top=M_top, scores=attn_weights)
+            attn_weights.masked_fill_(prob_mask, -np.inf)
 
         attn_weights = nn.functional.softmax(attn_weights, dim=-1)
 
         if layer_head_mask is not None:
-            # TODO: change tgt_len to u
             if layer_head_mask.size() != (self.num_heads,):
                 raise ValueError(
                     f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
                     f" {layer_head_mask.size()}"
                 )
-            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
-            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, u, src_len)
+            attn_weights = attn_weights.view(bsz * self.num_heads, u, src_len)
 
         if output_attentions:
             # this operation is a bit awkward, but it's required to
@@ -900,17 +898,19 @@ def forward(
         return attn_output, attn_weights_reshaped, past_key_value
 
 
-# source: https://github.com/zhouhaoyi/Informer2020/blob/main/utils/masking.py
-class ProbMask:
-    def __init__(self, B, H, L, index, scores, device="cpu"):
-        _mask = torch.ones(L, scores.shape[-1], dtype=torch.bool).to(device).triu(1)
-        _mask_ex = _mask[None, None, :].expand(B, H, L, scores.shape[-1])
-        indicator = _mask_ex[torch.arange(B)[:, None, None], torch.arange(H)[None, :, None], index, :].to(device)
-        self._mask = indicator.view(scores.shape).to(device)
+def _prepare_decoder_prob_attention_mask(L, M_top, scores):
+    # create triangular matrix
+    triangular_matrix = torch.ones(L, scores.shape[-1], dtype=torch.bool).to(scores.device).triu(1)
+
+    # add batch*num_heads dim to the triangular_matrix
+    triangular_mask = triangular_matrix[None, :].expand(scores.size(0), L, scores.shape[-1])
+
+    # slice the triangular_mask with M_top
+    dim_for_slice = torch.arange(triangular_mask.size(0)).unsqueeze(-1)
+    prob_mask = triangular_mask[dim_for_slice, M_top].to(scores.device)
+
+    return prob_mask.to(scores.device)
 
-    @property
-    def mask(self):
-        return self._mask
 
 # source: https://github.com/zhouhaoyi/Informer2020/blob/main/models/encoder.py
 class ConvLayer(nn.Module):

From fa131dcea6ee73f29014384489e8382b0298c658 Mon Sep 17 00:00:00 2001
From: elisim <elisimhayev@gmail.com>
Date: Mon, 13 Feb 2023 12:30:19 +0700
Subject: [PATCH 069/164] Revert "remove unused libs for this PR for creating
 the env"

This reverts commit 11a081e09e92771e51a5d2758d53a9afb59547f0.
---
 setup.py | 43 ++++++++++++++++++++++---------------------
 1 file changed, 22 insertions(+), 21 deletions(-)

diff --git a/setup.py b/setup.py
index fd2b8542c84f..1d6df3bb5c1a 100644
--- a/setup.py
+++ b/setup.py
@@ -123,6 +123,7 @@
     "jax>=0.2.8,!=0.3.2,<=0.3.6",
     "jaxlib>=0.1.65,<=0.3.6",
     "jieba",
+    "kenlm",
     "keras-nlp>=0.3.1",
     "nltk",
     "natten>=0.14.4",
@@ -158,9 +159,9 @@
     "sigopt",
     "librosa",
     "starlette",
-    # "tensorflow-cpu>=2.4,<2.12",
-    # "tensorflow>=2.4,<2.12",
-    # "tensorflow-text",
+    "tensorflow-cpu>=2.4,<2.12",
+    "tensorflow>=2.4,<2.12",
+    "tensorflow-text",
     "tf2onnx",
     "timeout-decorator",
     "timm",
@@ -175,7 +176,7 @@
     "uvicorn",
     "beautifulsoup4",
     "sudachipy>=0.6.6",
-    # "sudachidict_core>=20220729",
+    "sudachidict_core>=20220729",
     "rhoknp>=1.1.0",
 ]
 
@@ -245,11 +246,11 @@ def run(self):
 
 extras = {}
 
-extras["ja"] = deps_list("fugashi", "ipadic", "unidic_lite", "unidic", "sudachipy", "rhoknp")
+extras["ja"] = deps_list("fugashi", "ipadic", "unidic_lite", "unidic", "sudachipy", "sudachidict_core", "rhoknp")
 extras["sklearn"] = deps_list("scikit-learn")
 
-extras["tf"] = deps_list("tf2onnx")
-extras["tf-cpu"] = deps_list("tf2onnx")
+extras["tf"] = deps_list("tensorflow", "onnxconverter-common", "tf2onnx", "tensorflow-text", "keras-nlp")
+extras["tf-cpu"] = deps_list("tensorflow-cpu", "onnxconverter-common", "tf2onnx", "tensorflow-text", "keras-nlp")
 
 extras["torch"] = deps_list("torch")
 extras["accelerate"] = deps_list("accelerate")
@@ -277,7 +278,7 @@ def run(self):
 extras["integrations"] = extras["optuna"] + extras["ray"] + extras["sigopt"]
 
 extras["serving"] = deps_list("pydantic", "uvicorn", "fastapi", "starlette")
-extras["audio"] = deps_list("librosa", "pyctcdecode", "phonemizer")
+extras["audio"] = deps_list("librosa", "pyctcdecode", "phonemizer", "kenlm")
 # `pip install ".[speech]"` is deprecated and `pip install ".[torch-speech]"` should be used instead
 extras["speech"] = deps_list("torchaudio") + extras["audio"]
 extras["torch-speech"] = deps_list("torchaudio") + extras["audio"]
@@ -362,19 +363,19 @@ def run(self):
     + extras["modelcreation"]
     + extras["onnxruntime"]
 )
-# extras["dev-tensorflow"] = (
-#     extras["testing"]
-#     + extras["tf"]
-#     + extras["sentencepiece"]
-#     + extras["tokenizers"]
-#     + extras["vision"]
-#     + extras["quality"]
-#     + extras["docs_specific"]
-#     + extras["sklearn"]
-#     + extras["modelcreation"]
-#     + extras["onnx"]
-#     + extras["tf-speech"]
-# )
+extras["dev-tensorflow"] = (
+    extras["testing"]
+    + extras["tf"]
+    + extras["sentencepiece"]
+    + extras["tokenizers"]
+    + extras["vision"]
+    + extras["quality"]
+    + extras["docs_specific"]
+    + extras["sklearn"]
+    + extras["modelcreation"]
+    + extras["onnx"]
+    + extras["tf-speech"]
+)
 extras["dev"] = (
     extras["all"]
     + extras["testing"]

From 19a34086bd8bd995226e398b8d4bde41ae10f79a Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Wed, 15 Feb 2023 11:57:57 +0100
Subject: [PATCH 070/164] fixes

---
 .../models/informer/modeling_informer.py      | 119 ++++++++----------
 1 file changed, 55 insertions(+), 64 deletions(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index e75c79dad2ec..9867007ea2e8 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -804,24 +804,30 @@ def forward(
         L_Q = query_states.size(1)
         u = min(self.factor * np.ceil(np.log1p(L_Q)).astype("int").item(), L_Q)
 
-        # __prob_QK
-        # calculate the sampled Q_K
-        index_sample = torch.randint(low=0, high=L_K, size=(U_part,)) # torch.Size([U_part])
+        if L_K > 0:
+            index_sample = torch.randint(0, L_K, (U_part,))  # torch.Size([14])
 
-        # real U = U_part(factor*ln(L_k))*L_q
-        K_sample = key_states[:, index_sample, :]  # torch.Size([bsz * self.num_heads, U_part, channel])
-        Q_K_sample = torch.bmm(query_states, K_sample.transpose(1, 2)) # torch.Size([bsz * self.num_heads, L_Q, U_part])
-
-        # find the Top_k query with sparsity measurement
-        M = Q_K_sample.max(dim=-1)[0] - torch.div(Q_K_sample.sum(dim=-1), L_K)
-        M_top = M.topk(u, sorted=False)[1]
-
-        # use the reduced Q to calculate Q_K
-        # factor*ln(L_q)
-        # Q_reduce = query_states[:, M_top, :]
-        dim_for_slice = torch.arange(query_states.size(0)).unsqueeze(-1)
-        Q_reduce = query_states[dim_for_slice, M_top]
+            # real U = U_part(factor*ln(L_k))*L_q
+            K_sample = key_states[:, index_sample, :]  # torch.Size([52, 14, 4])
+        else:
+            K_sample = key_states
+        Q_K_sample = torch.bmm(query_states, K_sample.transpose(1, 2))
+        # torch.Size([52, 14, 4]) x torch.Size([52, 4, 14])
+
+        # find the Top_k query with sparisty measurement
+        if u > 0:
+            M = Q_K_sample.max(dim=-1)[0] - torch.div(Q_K_sample.sum(dim=-1), L_K)
+            M_top = M.topk(u, sorted=False)[1]
+
+            # use the reduced Q to calculate Q_K
+            # factor*ln(L_q)
+            dim_for_slice = torch.arange(query_states.size(0)).unsqueeze(-1)
+            Q_reduce = query_states[dim_for_slice, M_top]
+        else:
+            Q_reduce = query_states
+            M_top = None
 
+        # score_top
         attn_weights = torch.bmm(Q_reduce, key_states.transpose(1, 2))
 
         src_len = key_states.size(1)
@@ -831,11 +837,23 @@ def forward(
                 f" {attn_weights.size()}"
             )
 
-        # Original impl don't apply attention_mask to the encoder, only for the decoder
-        # For the decoder, it creates a casual mask sliced with M_top (ProbMask)
-        if self.is_decoder:
-            prob_mask = _prepare_decoder_prob_attention_mask(L=L_Q, M_top=M_top, scores=attn_weights)
-            attn_weights.masked_fill_(prob_mask, -np.inf)
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
+                )
+            prob_mask = attention_mask.expand(bsz, self.num_heads, tgt_len, src_len).reshape(
+                bsz * self.num_heads, tgt_len, src_len
+            )
+
+            if M_top is not None:
+                dim_for_slice = torch.arange(prob_mask.size(0)).unsqueeze(-1)
+                prob_mask = prob_mask[dim_for_slice, M_top, :]
+
+            attn_weights = attn_weights.view(bsz, self.num_heads, u, src_len) + prob_mask.view(
+                bsz, self.num_heads, u, src_len
+            )
+            attn_weights = attn_weights.view(bsz * self.num_heads, u, src_len)
 
         attn_weights = nn.functional.softmax(attn_weights, dim=-1)
 
@@ -858,31 +876,21 @@ def forward(
         else:
             attn_weights_reshaped = None
 
-        # The authors didn't use attention dropout.
-        # Not removing this yet, waiting for Kashif approval
-        # attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
-        # attn_output = torch.bmm(attn_probs, value_states)
-        attn_output = torch.bmm(attn_weights, value_states)
-
-        # Build final output
-        # reimplemented from the original:
-        # https://github.com/zhouhaoyi/Informer2020/blob/ac59c7447135473fb2aafeafe94395f884d5c7a5/models/attn.py#L70
-        if not self.is_decoder:
-            v_aggregated = value_states.mean(dim=1)
-            v_aggregated = v_aggregated.unsqueeze(dim=1).expand(bsz * self.num_heads, L_Q, v_aggregated.size(-1))
-        else:
-            v_aggregated = value_states.cumsum(dim=1)
+        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
 
-        # https://github.com/zhouhaoyi/Informer2020/blob/ac59c7447135473fb2aafeafe94395f884d5c7a5/models/attn.py#L90
-        dim_for_slice = torch.arange(v_aggregated.size(0)).unsqueeze(-1)
-        v_aggregated[dim_for_slice, M_top, :] = attn_output
+        # get initial context
+        context = value_states.cumsum(dim=-2)
+        attn_output = torch.bmm(attn_probs, value_states)
 
-        # Rename final output
-        attn_output = v_aggregated
+        if M_top is not None:
+            # update context: copy the attention output to the context at M_top index
+            dim_for_slice = torch.arange(context.size(0)).unsqueeze(-1)
+            context[dim_for_slice, M_top, :] = attn_output
+            attn_output = context
 
         if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
             raise ValueError(
-                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
+                f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is"
                 f" {attn_output.size()}"
             )
 
@@ -898,20 +906,6 @@ def forward(
         return attn_output, attn_weights_reshaped, past_key_value
 
 
-def _prepare_decoder_prob_attention_mask(L, M_top, scores):
-    # create triangular matrix
-    triangular_matrix = torch.ones(L, scores.shape[-1], dtype=torch.bool).to(scores.device).triu(1)
-
-    # add batch*num_heads dim to the triangular_matrix
-    triangular_mask = triangular_matrix[None, :].expand(scores.size(0), L, scores.shape[-1])
-
-    # slice the triangular_mask with M_top
-    dim_for_slice = torch.arange(triangular_mask.size(0)).unsqueeze(-1)
-    prob_mask = triangular_mask[dim_for_slice, M_top].to(scores.device)
-
-    return prob_mask.to(scores.device)
-
-
 # source: https://github.com/zhouhaoyi/Informer2020/blob/main/models/encoder.py
 class ConvLayer(nn.Module):
     def __init__(self, c_in):
@@ -923,13 +917,13 @@ def __init__(self, c_in):
             padding=1,
             padding_mode="circular",
         )
-        self.norm = nn.BatchNorm1d(c_in)  # Eli question: why batchnorm here?
+        self.norm = nn.BatchNorm1d(c_in)
         self.activation = nn.ELU()
         self.maxPool = nn.MaxPool1d(kernel_size=3, stride=2, padding=1)
 
     def forward(self, x):
         x = self.downConv(x.permute(0, 2, 1))
-        x = self.norm(x)  # Eli: why? maybe because the impl...
+        x = self.norm(x)
         x = self.activation(x)
         x = self.maxPool(x)
         x = x.transpose(1, 2)
@@ -1023,7 +1017,7 @@ def __init__(self, config: InformerConfig):
                 num_heads=config.encoder_attention_heads,
                 dropout=config.attention_dropout,
                 factor=config.factor,
-                is_decoder=True
+                is_decoder=True,
             )
         else:
             self.self_attn = InformerAttention(
@@ -1469,7 +1463,6 @@ def __init__(self, config: InformerConfig):
 
         self.layers = nn.ModuleList([InformerDecoderLayer(config) for _ in range(config.decoder_layers)])
         self.layernorm_embedding = nn.LayerNorm(config.d_model)
-        self.attn = config.attn
 
         self.gradient_checkpointing = False
         # Initialize weights and apply final processing
@@ -1582,11 +1575,9 @@ def forward(
         # past_key_values_length
         past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
 
-        # create casual mask only if it's full attention (and not ProbAttention)
-        if self.attn != 'prob':
-            attention_mask = self._prepare_decoder_attention_mask(
-                attention_mask, input_shape, inputs_embeds, past_key_values_length
-            )
+        attention_mask = self._prepare_decoder_attention_mask(
+            attention_mask, input_shape, inputs_embeds, past_key_values_length
+        )
 
         # expand encoder attention mask
         if encoder_hidden_states is not None and encoder_attention_mask is not None:

From 84a4ba35440279672cc4fbdfe7ff61e399a3675d Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Wed, 15 Feb 2023 12:08:18 +0100
Subject: [PATCH 071/164] make style

---
 src/transformers/models/informer/configuration_informer.py | 2 +-
 src/transformers/models/informer/modeling_informer.py      | 2 --
 tests/models/informer/test_modeling_informer.py            | 1 +
 3 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/transformers/models/informer/configuration_informer.py b/src/transformers/models/informer/configuration_informer.py
index ca95a51e4308..4524b4463449 100644
--- a/src/transformers/models/informer/configuration_informer.py
+++ b/src/transformers/models/informer/configuration_informer.py
@@ -168,7 +168,7 @@ def __init__(
         attn: str = "prob",
         factor: int = 2,
         distil: bool = True,
-        **kwargs
+        **kwargs,
     ):
         # time series specific configuration
         self.prediction_length = prediction_length
diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 9867007ea2e8..e0a40d86887e 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -17,12 +17,10 @@
 
 import random
 from dataclasses import dataclass
-from math import sqrt
 from typing import Callable, Dict, List, Optional, Tuple, Union
 
 import numpy as np
 import torch
-import torch.nn.functional as F
 from torch import nn
 from torch.distributions import (
     AffineTransform,
diff --git a/tests/models/informer/test_modeling_informer.py b/tests/models/informer/test_modeling_informer.py
index 4a10708fa072..f33f8b137a49 100644
--- a/tests/models/informer/test_modeling_informer.py
+++ b/tests/models/informer/test_modeling_informer.py
@@ -19,6 +19,7 @@
 import unittest
 
 from huggingface_hub import hf_hub_download
+
 from transformers import is_torch_available
 from transformers.testing_utils import is_flaky, require_torch, slow, torch_device
 

From a48e661784e28d28c2b892580f724b0a0b960e13 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Wed, 15 Feb 2023 12:34:23 +0100
Subject: [PATCH 072/164] fix initial tests

---
 .../models/informer/test_modeling_informer.py | 21 ++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/tests/models/informer/test_modeling_informer.py b/tests/models/informer/test_modeling_informer.py
index f33f8b137a49..0ca207f81091 100644
--- a/tests/models/informer/test_modeling_informer.py
+++ b/tests/models/informer/test_modeling_informer.py
@@ -18,6 +18,8 @@
 import tempfile
 import unittest
 
+import numpy as np
+
 from huggingface_hub import hf_hub_download
 
 from transformers import is_torch_available
@@ -56,6 +58,7 @@ def __init__(
         hidden_dropout_prob=0.1,
         attention_probs_dropout_prob=0.1,
         lags_sequence=[1, 2, 3, 4, 5],
+        factor=2,
     ):
         self.parent = parent
         self.batch_size = batch_size
@@ -74,8 +77,10 @@ def __init__(
         self.hidden_dropout_prob = hidden_dropout_prob
         self.attention_probs_dropout_prob = attention_probs_dropout_prob
 
-        self.encoder_seq_length = context_length
-        self.decoder_seq_length = prediction_length
+        self.encoder_seq_length = min(factor * np.ceil(np.log1p(context_length)).astype("int").item(), context_length)
+        self.decoder_seq_length = min(
+            factor * np.ceil(np.log1p(prediction_length)).astype("int").item(), prediction_length
+        )
 
     def get_config(self):
         return InformerConfig(
@@ -264,6 +269,8 @@ def test_attention_outputs(self):
         seq_len = getattr(self.model_tester, "seq_length", None)
         decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len)
         encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len)
+        context_length = getattr(self.model_tester, "context_length", seq_len)
+        prediction_length = getattr(self.model_tester, "prediction_length", seq_len)
 
         for model_class in self.all_model_classes:
             inputs_dict["output_attentions"] = True
@@ -290,7 +297,7 @@ def test_attention_outputs(self):
 
             self.assertListEqual(
                 list(attentions[0].shape[-3:]),
-                [self.model_tester.num_attention_heads, encoder_seq_length, encoder_seq_length],
+                [self.model_tester.num_attention_heads, encoder_seq_length, context_length],
             )
             out_len = len(outputs)
 
@@ -316,7 +323,7 @@ def test_attention_outputs(self):
             self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
             self.assertListEqual(
                 list(decoder_attentions[0].shape[-3:]),
-                [self.model_tester.num_attention_heads, decoder_seq_length, decoder_seq_length],
+                [self.model_tester.num_attention_heads, decoder_seq_length, prediction_length],
             )
 
             # cross attentions
@@ -327,8 +334,8 @@ def test_attention_outputs(self):
                 list(cross_attentions[0].shape[-3:]),
                 [
                     self.model_tester.num_attention_heads,
-                    decoder_seq_length,
-                    encoder_seq_length,
+                    prediction_length,
+                    context_length,
                 ],
             )
 
@@ -348,7 +355,7 @@ def test_attention_outputs(self):
         self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
         self.assertListEqual(
             list(self_attentions[0].shape[-3:]),
-            [self.model_tester.num_attention_heads, encoder_seq_length, encoder_seq_length],
+            [self.model_tester.num_attention_heads, encoder_seq_length, context_length],
         )
 
     @is_flaky()

From a3b6ddb67eeb643a541b308166eb2cd9ac03fbbd Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Wed, 15 Feb 2023 14:26:23 +0100
Subject: [PATCH 073/164] fix more tests

---
 .../models/informer/modeling_informer.py      |  4 ++
 .../models/informer/test_modeling_informer.py | 63 ++++++++++++++++++-
 2 files changed, 66 insertions(+), 1 deletion(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index e0a40d86887e..2811845aeb39 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -1143,6 +1143,10 @@ def _init_weights(self, module):
             module.weight.data.normal_(mean=0.0, std=std)
             if module.bias is not None:
                 module.bias.data.zero_()
+        if isinstance(module, nn.Conv1d):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
         elif isinstance(module, nn.Embedding):
             module.weight.data.normal_(mean=0.0, std=std)
             if module.padding_idx is not None:
diff --git a/tests/models/informer/test_modeling_informer.py b/tests/models/informer/test_modeling_informer.py
index 0ca207f81091..897e2607979c 100644
--- a/tests/models/informer/test_modeling_informer.py
+++ b/tests/models/informer/test_modeling_informer.py
@@ -58,7 +58,7 @@ def __init__(
         hidden_dropout_prob=0.1,
         attention_probs_dropout_prob=0.1,
         lags_sequence=[1, 2, 3, 4, 5],
-        factor=2,
+        factor=10,
     ):
         self.parent = parent
         self.batch_size = batch_size
@@ -81,6 +81,7 @@ def __init__(
         self.decoder_seq_length = min(
             factor * np.ceil(np.log1p(prediction_length)).astype("int").item(), prediction_length
         )
+        self.factor = factor
 
     def get_config(self):
         return InformerConfig(
@@ -99,6 +100,7 @@ def get_config(self):
             num_static_categorical_features=1,
             cardinality=[self.cardinality],
             embedding_dimension=[self.embedding_dimension],
+            factor=self.factor,
         )
 
     def prepare_informer_inputs_dict(self, config):
@@ -201,10 +203,69 @@ def test_encoder_decoder_model_standalone(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common()
         self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs)
 
+    def test_hidden_states_output(self):
+        def check_hidden_states_output(inputs_dict, config, model_class):
+            model = model_class(config)
+            model.to(torch_device)
+            model.eval()
+
+            with torch.no_grad():
+                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
+
+            hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states
+
+            expected_num_layers = getattr(
+                self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
+            )
+            self.assertEqual(len(hidden_states), expected_num_layers)
+
+            if hasattr(self.model_tester, "encoder_seq_length"):
+                seq_length = self.model_tester.context_length
+                if hasattr(self.model_tester, "chunk_length") and self.model_tester.chunk_length > 1:
+                    seq_length = seq_length * self.model_tester.chunk_length
+            else:
+                seq_length = self.model_tester.seq_length
+
+            self.assertListEqual(
+                list(hidden_states[0].shape[-2:]),
+                [seq_length, self.model_tester.hidden_size],
+            )
+
+            if config.is_encoder_decoder:
+                hidden_states = outputs.decoder_hidden_states
+
+                self.assertIsInstance(hidden_states, (list, tuple))
+                self.assertEqual(len(hidden_states), expected_num_layers)
+                seq_len = getattr(self.model_tester, "seq_length", None)
+                decoder_seq_length = getattr(self.model_tester, "prediction_length", seq_len)
+
+                self.assertListEqual(
+                    list(hidden_states[0].shape[-2:]),
+                    [decoder_seq_length, self.model_tester.hidden_size],
+                )
+
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            inputs_dict["output_hidden_states"] = True
+            check_hidden_states_output(inputs_dict, config, model_class)
+
+            # check that output_hidden_states also work using config
+            del inputs_dict["output_hidden_states"]
+            config.output_hidden_states = True
+
+            check_hidden_states_output(inputs_dict, config, model_class)
+
     # Ignore since we have no tokens embeddings
     def test_resize_tokens_embeddings(self):
         pass
 
+    def test_model_outputs_equivalence(self):
+        pass
+
+    def test_determinism(self):
+        pass
+
     # # Input is 'static_categorical_features' not 'input_ids'
     def test_model_main_input_name(self):
         model_signature = inspect.signature(getattr(InformerModel, "forward"))

From a14171222fe50b13a94bc0913e70979a9cc05ab9 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Wed, 15 Feb 2023 15:00:40 +0100
Subject: [PATCH 074/164] dry

---
 src/transformers/models/informer/modeling_informer.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 2811845aeb39..df7685ca46f6 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -1139,11 +1139,7 @@ class InformerPreTrainedModel(PreTrainedModel):
 
     def _init_weights(self, module):
         std = self.config.init_std
-        if isinstance(module, nn.Linear):
-            module.weight.data.normal_(mean=0.0, std=std)
-            if module.bias is not None:
-                module.bias.data.zero_()
-        if isinstance(module, nn.Conv1d):
+        if isinstance(module, (nn.Linear, nn.Conv1d)):
             module.weight.data.normal_(mean=0.0, std=std)
             if module.bias is not None:
                 module.bias.data.zero_()

From 299d17f56b412354abff656f559b7da6c83ae9a5 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Wed, 15 Feb 2023 15:13:10 +0100
Subject: [PATCH 075/164] make style

---
 src/transformers/models/informer/configuration_informer.py | 2 +-
 tests/models/informer/test_modeling_informer.py            | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/transformers/models/informer/configuration_informer.py b/src/transformers/models/informer/configuration_informer.py
index 4524b4463449..6a2af31341d6 100644
--- a/src/transformers/models/informer/configuration_informer.py
+++ b/src/transformers/models/informer/configuration_informer.py
@@ -106,7 +106,7 @@ class InformerConfig(PretrainedConfig):
             Whether to use the past key/values attentions (if applicable to the model) to speed up decoding.
         attn (`str`, defaults to `prob`):
             Attention used in encoder. This can be set to prob (informer) or full (transformer)
-        factor (`int`, defaults to 5):
+        factor (`int`, defaults to 2):
             ProbSparse attention factor
         distil (`bool`, defualts to `True`):
             Whether to use distilling in encoder
diff --git a/tests/models/informer/test_modeling_informer.py b/tests/models/informer/test_modeling_informer.py
index 897e2607979c..3d27fb2ca607 100644
--- a/tests/models/informer/test_modeling_informer.py
+++ b/tests/models/informer/test_modeling_informer.py
@@ -19,7 +19,6 @@
 import unittest
 
 import numpy as np
-
 from huggingface_hub import hf_hub_download
 
 from transformers import is_torch_available

From 36df283d783be0088afabff5080c7b1be622cd0b Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Wed, 15 Feb 2023 15:13:52 +0100
Subject: [PATCH 076/164] remove unused files

---
 .../informer/check_instantiate_works.py       |  62 -----
 ...nal_colab_pytorch_checkpoint_to_pytorch.py | 223 ------------------
 2 files changed, 285 deletions(-)
 delete mode 100644 src/transformers/models/informer/check_instantiate_works.py
 delete mode 100644 src/transformers/models/informer/convert_informer_original_colab_pytorch_checkpoint_to_pytorch.py

diff --git a/src/transformers/models/informer/check_instantiate_works.py b/src/transformers/models/informer/check_instantiate_works.py
deleted file mode 100644
index 48c963395972..000000000000
--- a/src/transformers/models/informer/check_instantiate_works.py
+++ /dev/null
@@ -1,62 +0,0 @@
-from transformers import InformerModel, InformerConfig, TimeSeriesTransformerForPrediction, TimeSeriesTransformerModel, \
-    TimeSeriesTransformerConfig
-from gluonts.time_feature import get_lags_for_frequency, time_features_from_frequency_str
-
-from huggingface_hub import hf_hub_download
-import torch
-
-"""
-Establish one batch for forward pass in the Informer
-"""
-if __name__ == '__main__':
-    freq = "1M"
-    prediction_length = 24
-    lags = get_lags_for_frequency(freq_str=freq)
-    time_features = time_features_from_frequency_str(freq)
-
-    config = InformerConfig(prediction_length=prediction_length,
-                            context_length=prediction_length*3,
-                            lags_sequence=lags,
-                            num_time_features=len(time_features) + 1,
-                            num_static_categorical_features=1,
-                            cardinality=[366],
-                            embedding_dimension=[2],
-                            encoder_layers=1,
-                            decoder_layers=1)
-    model = InformerModel(config)
-    print(model)
-
-    # config = TimeSeriesTransformerConfig(
-    #     prediction_length=prediction_length,
-    #     context_length=prediction_length * 3,  # context length
-    #     lags_sequence=lags,
-    #     num_time_features=len(time_features) + 1,  # we'll add 2 time features ("month of year" and "age", see further)
-    #     num_static_categorical_features=1,  # we have a single static categorical feature, namely time series ID
-    #     cardinality=[366],  # it has 366 possible values
-    #     embedding_dimension=[2],  # the model will learn an embedding of size 2 for each of the 366 possible values
-    #     encoder_layers=4,
-    #     decoder_layers=4,
-    # )
-    # model = TimeSeriesTransformerModel(config)
-    # model.eval()
-    #
-    # model = TimeSeriesTransformerModel.from_pretrained("huggingface/time-series-transformer-tourism-monthly")
-    #
-    # file = hf_hub_download(
-    #     repo_id="kashif/tourism-monthly-batch", filename="train-batch.pt", repo_type="dataset"
-    # )
-    # batch = torch.load(file)
-    #
-    # # during training, one provides both past and future values
-    # # as well as possible additional features
-    # outputs = model(
-    #     past_values=batch["past_values"],
-    #     past_time_features=batch["past_time_features"],
-    #     past_observed_mask=batch["past_observed_mask"],
-    #     static_categorical_features=batch["static_categorical_features"],
-    #     static_real_features=batch["static_real_features"],
-    #     future_values=batch["future_values"],
-    #     future_time_features=batch["future_time_features"],
-    # )
-    #
-    # print(outputs.last_hidden_state.shape)
diff --git a/src/transformers/models/informer/convert_informer_original_colab_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/informer/convert_informer_original_colab_pytorch_checkpoint_to_pytorch.py
deleted file mode 100644
index 5dc032e6203f..000000000000
--- a/src/transformers/models/informer/convert_informer_original_colab_pytorch_checkpoint_to_pytorch.py
+++ /dev/null
@@ -1,223 +0,0 @@
-# coding=utf-8
-# Copyright 2020 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Convert Informer checkpoint."""
-
-"""
-Assumption: 
-
-Informer2020 repository is git-cloned from
-https://github.com/elisim/Informer2020/tree/hf
-
-"hf" branch. There, I created a Informer's checkpoint from the official colab notebook.
-
-See also: https://github.com/elisim/Informer2020/blob/hf/create_checkpoint_from_offical_colab.ipynb
-"""
-import argparse
-import os
-from pathlib import Path
-
-import torch
-from torch import nn
-
-from transformers import InformerConfig, InformerModel
-from transformers.utils import logging
-
-
-import sys
-if not 'Informer2020' in sys.path:
-    sys.path += ['Informer2020']
-
-
-from Informer2020.exp.exp_informer import Exp_Informer
-
-logging.set_verbosity_info()
-logger = logging.get_logger(__name__)
-
-
-mnli_rename_keys = [
-    ("model.classification_heads.mnli.dense.weight", "classification_head.dense.weight"),
-    ("model.classification_heads.mnli.dense.bias", "classification_head.dense.bias"),
-    ("model.classification_heads.mnli.out_proj.weight", "classification_head.out_proj.weight"),
-    ("model.classification_heads.mnli.out_proj.bias", "classification_head.out_proj.bias"),
-]
-
-
-def _create_informer_args():
-    """
-    Arguments are taken from the offical colab example:
-    https://colab.research.google.com/drive/1_X7O2BkFLvqyCdZzDZvV2MB0aAvYALLC
-
-    I only comment arguments that are not needed for the model creation (e.g. data_path, use_gpu)
-    """
-    class dotdict(dict):
-        """dot.notation access to dictionary attributes"""
-        __getattr__ = dict.get
-        __setattr__ = dict.__setitem__
-        __delattr__ = dict.__delitem__
-
-    args = dotdict()
-
-    ### BoilerCode
-    args.model = 'informer'  # model of experiment, options: [informer, informerstack, informerlight(TBD)]
-    # args.data = 'ETTh1'  # data
-    # args.root_path = './ETDataset/ETT-small/'  # root path of data file
-    # args.data_path = 'ETTh1.csv'  # data file
-    # args.checkpoints = './informer_checkpoints'  # location of model checkpoints
-
-    ### TS
-    args.features = 'M'  # forecasting task, options:[M, S, MS]
-    # M:multivariate predict multivariate, S:univariate predict univariate, MS:multivariate predict univariate
-    args.target = 'OT'  # target feature in S or MS task
-    args.freq = 'h'  # freq for time features encoding,
-    # options:[s:secondly, t:minutely, h:hourly, d:daily, b:business days, w:weekly, m:monthly],
-    # you can also use more detailed freq like 15min or 3h
-
-    ### Encoder Decoder
-    args.seq_len = 96  # input sequence length of Informer encoder
-    args.label_len = 48  # start token length of Informer decoder
-    args.pred_len = 24  # prediction sequence length
-    # Informer decoder input: concat[start token series(label_len), zero padding series(pred_len)]
-
-    args.enc_in = 7  # encoder input size
-    args.dec_in = 7  # decoder input size
-    args.c_out = 7  # output size
-    args.factor = 5  # probsparse attn factor
-    args.d_model = 512  # dimension of model
-    args.n_heads = 8  # num of heads
-    args.e_layers = 2  # num of encoder layers
-    args.d_layers = 1  # num of decoder layers
-    args.d_ff = 2048  # dimension of fcn in model
-    args.dropout = 0.05  # dropout
-    args.attn = 'prob'  # attention used in encoder, options:[prob, full]
-    args.embed = 'timeF'  # time features encoding, options:[timeF, fixed, learned]
-    args.activation = 'gelu'  # activation
-    args.distil = True  # whether to use distilling in encoder
-    args.output_attention = False  # whether to output attention in ecoder
-    args.mix = True
-    args.padding = 0
-
-    ### Training
-    args.batch_size = 32
-    args.learning_rate = 0.0001
-    args.loss = 'mse'
-    args.lradj = 'type1'
-    args.use_amp = False  # whether to use automatic mixed precision training
-
-    args.num_workers = 0
-    args.itr = 1
-    args.train_epochs = 6
-    args.patience = 3
-    args.des = 'exp'
-
-    # args.use_gpu = False  # True if torch.cuda.is_available() else False
-    # args.gpu = 0
-    #
-    # args.use_multi_gpu = False
-    # args.devices = '0,1,2,3'
-
-    args.detail_freq = args.freq  # the actual freq
-    args.freq = args.freq[-1:]  # Not important
-
-    return args
-
-
-def remove_ignore_keys_(state_dict):
-    ignore_keys = [
-        "encoder.version",
-        "decoder.version",
-        "model.encoder.version",
-        "model.decoder.version",
-        "_float_tensor",
-    ]
-    for k in ignore_keys:
-        state_dict.pop(k, None)
-
-
-def rename_key(dct, old, new):
-    val = dct.pop(old)
-    dct[new] = val
-
-
-def load_informer_checkpoint(checkpoint_path):
-    """Checkpoint path should end in model.pth"""
-    exp = Exp_Informer(args=_create_informer_args())
-    sd = torch.load(checkpoint_path, map_location="cpu")
-    exp.model.load_state_dict(sd)
-    return exp.model
-
-
-def make_linear_from_emb(emb):
-    vocab_size, emb_size = emb.weight.shape
-    lin_layer = nn.Linear(vocab_size, emb_size, bias=False)
-    lin_layer.weight.data = emb.weight.data
-    return lin_layer
-
-
-# @torch.no_grad()
-# def convert_informer_checkpoint(checkpoint_path, pytorch_dump_folder_path, hf_checkpoint_name=None):
-#     """
-#     Copy/paste/tweak model's weights to our BERT structure.
-#     """
-#     informer = load_informer_checkpoint(checkpoint_path)
-#
-#     informer.model.upgrade_state_dict(informer.model.state_dict())
-#     if hf_checkpoint_name is None:
-#         hf_checkpoint_name = checkpoint_path.replace(".", "-")
-#     config = BartConfig.from_pretrained(hf_checkpoint_name)
-#
-#     if checkpoint_path == "bart.large.mnli":
-#         state_dict = bart.state_dict()
-#         remove_ignore_keys_(state_dict)
-#         state_dict["model.shared.weight"] = state_dict["model.decoder.embed_tokens.weight"]
-#         for src, dest in mnli_rename_keys:
-#             rename_key(state_dict, src, dest)
-#         model = BartForSequenceClassification(config).eval()
-#         model.load_state_dict(state_dict)
-#         fairseq_output = bart.predict("mnli", tokens, return_logits=True)
-#         new_model_outputs = model(tokens)[0]  # logits
-#     else:  # no classification heads to worry about
-#         state_dict = bart.model.state_dict()
-#         remove_ignore_keys_(state_dict)
-#         state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"]
-#         fairseq_output = bart.extract_features(tokens)
-#         if hf_checkpoint_name == "facebook/bart-large":
-#             model = BartModel(config).eval()
-#             model.load_state_dict(state_dict)
-#             new_model_outputs = model(tokens).model[0]
-#         else:
-#             model = BartForConditionalGeneration(config).eval()  # an existing summarization ckpt
-#             model.model.load_state_dict(state_dict)
-#             if hasattr(model, "lm_head"):
-#                 model.lm_head = make_linear_from_emb(model.model.shared)
-#             new_model_outputs = model.model(tokens)[0]
-#
-#     # Check results
-#     assert fairseq_output.shape == new_model_outputs.shape
-#     assert (fairseq_output == new_model_outputs).all().item()
-#     Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
-#     model.save_pretrained(pytorch_dump_folder_path)
-
-
-if __name__ == "__main__":
-    informer_checkpoint_default_path = "./Informer2020/informer_checkpoints/informer_ETTh1_ftM_sl96_ll48_pl24_dm512_nh8_el2_dl1_df2048_atprob_fc5_ebtimeF_dtTrue_mxTrue_exp_0/checkpoint.pth"
-
-    # parser = argparse.ArgumentParser()
-    # parser.add_argument("informer_path", default=None, type=str, help="a path to a model.pth on local filesystem.")
-    # parser.add_argument("pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
-    # args = parser.parse_args()
-
-    # convert_informer_checkpoint(args.informer_path, args.pytorch_dump_folder_path)
-    informer = load_informer_checkpoint(informer_checkpoint_default_path)
-    print(informer)

From 0b4220a5264edf0676cb731797d34eec72914173 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Wed, 15 Feb 2023 15:14:29 +0100
Subject: [PATCH 077/164] style

---
 src/transformers/models/informer/modeling_informer.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index df7685ca46f6..225e48bbcaac 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -1446,8 +1446,7 @@ def custom_forward(*inputs):
 # Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerDecoder with TimeSeriesTransformer->Informer
 class InformerDecoder(InformerPreTrainedModel):
     """
-    Informer decoder consisting of *config.decoder_layers* layers. Each layer is a
-    [`InformerDecoderLayer`]
+    Informer decoder consisting of *config.decoder_layers* layers. Each layer is a [`InformerDecoderLayer`]
 
     Args:
         config: InformerConfig

From e9e3c6e60900a73070367fe5a0dbbc2efa7c1094 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Wed, 15 Feb 2023 18:20:15 +0100
Subject: [PATCH 078/164] added integration tests

---
 .../models/informer/configuration_informer.py |  6 +++---
 .../models/informer/test_modeling_informer.py | 21 ++++++++++---------
 2 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/src/transformers/models/informer/configuration_informer.py b/src/transformers/models/informer/configuration_informer.py
index 6a2af31341d6..90c2a0e6d4d2 100644
--- a/src/transformers/models/informer/configuration_informer.py
+++ b/src/transformers/models/informer/configuration_informer.py
@@ -107,9 +107,9 @@ class InformerConfig(PretrainedConfig):
         attn (`str`, defaults to `prob`):
             Attention used in encoder. This can be set to prob (informer) or full (transformer)
         factor (`int`, defaults to 2):
-            ProbSparse attention factor
-        distil (`bool`, defualts to `True`):
-            Whether to use distilling in encoder
+            ProbSparse attention factor.
+        distil (`bool`, defaults to `True`):
+            Whether to use distilling in encoder.
 
 
         Example:
diff --git a/tests/models/informer/test_modeling_informer.py b/tests/models/informer/test_modeling_informer.py
index 3d27fb2ca607..72e24f15a9b8 100644
--- a/tests/models/informer/test_modeling_informer.py
+++ b/tests/models/informer/test_modeling_informer.py
@@ -433,9 +433,10 @@ def prepare_batch(filename="train-batch.pt"):
 @slow
 class InformerModelIntegrationTests(unittest.TestCase):
     def test_inference_no_head(self):
-        model = InformerModel.from_pretrained("huggingface/time-series-transformer-tourism-monthly").to(torch_device)
+        model = InformerModel.from_pretrained("kashif/informer-tourism-monthly").to(torch_device)
         batch = prepare_batch()
 
+        torch.manual_seed(0)
         with torch.no_grad():
             output = model(
                 past_values=batch["past_values"],
@@ -451,15 +452,15 @@ def test_inference_no_head(self):
         self.assertEqual(output.shape, expected_shape)
 
         expected_slice = torch.tensor(
-            [[-0.3125, -1.2884, -1.1118], [-0.5801, -1.4907, -0.7782], [0.0849, -1.6557, -0.9755]], device=torch_device
+            [[-1.4829, 0.7390, -1.3606], [-1.9992, 0.3949, -1.3191], [-1.1011, 0.2860, -1.5074]], device=torch_device
         )
         self.assertTrue(torch.allclose(output[0, :3, :3], expected_slice, atol=TOLERANCE))
 
     def test_inference_head(self):
-        model = InformerForPrediction.from_pretrained("huggingface/time-series-transformer-tourism-monthly").to(
-            torch_device
-        )
+        model = InformerForPrediction.from_pretrained("kashif/informer-tourism-monthly").to(torch_device)
         batch = prepare_batch("val-batch.pt")
+
+        torch.manual_seed(0)
         with torch.no_grad():
             output = model(
                 past_values=batch["past_values"],
@@ -473,15 +474,15 @@ def test_inference_head(self):
         self.assertEqual(output.shape, expected_shape)
 
         expected_slice = torch.tensor(
-            [[0.9127, -0.2056, -0.5259], [1.0572, 1.4104, -0.1964], [0.1358, 2.0348, 0.5739]], device=torch_device
+            [[0.4427, 0.6329, 0.1136], [0.5492, 2.3569, 0.6203], [0.0812, 2.6220, 1.5276]], device=torch_device
         )
         self.assertTrue(torch.allclose(output[0, :3, :3], expected_slice, atol=TOLERANCE))
 
     def test_seq_to_seq_generation(self):
-        model = InformerForPrediction.from_pretrained("huggingface/time-series-transformer-tourism-monthly").to(
-            torch_device
-        )
+        model = InformerForPrediction.from_pretrained("kashif/informer-tourism-monthly").to(torch_device)
         batch = prepare_batch("val-batch.pt")
+
+        torch.manual_seed(0)
         with torch.no_grad():
             outputs = model.generate(
                 static_categorical_features=batch["static_categorical_features"],
@@ -494,6 +495,6 @@ def test_seq_to_seq_generation(self):
         expected_shape = torch.Size((64, model.config.num_parallel_samples, model.config.prediction_length))
         self.assertEqual(outputs.sequences.shape, expected_shape)
 
-        expected_slice = torch.tensor([2289.5203, 2778.3054, 4648.1313], device=torch_device)
+        expected_slice = torch.tensor([3877.3796, 4988.0166, 7795.9473], device=torch_device)
         mean_prediction = outputs.sequences.mean(dim=1)
         self.assertTrue(torch.allclose(mean_prediction[0, -3:], expected_slice, rtol=1e-1))

From 83657df455b693f5c81b540231e0a853126a5989 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Wed, 15 Feb 2023 18:30:15 +0100
Subject: [PATCH 079/164] fix num_static_real_features

---
 src/transformers/models/informer/configuration_informer.py | 2 +-
 src/transformers/models/informer/modeling_informer.py      | 2 +-
 tests/models/informer/test_modeling_informer.py            | 1 +
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/informer/configuration_informer.py b/src/transformers/models/informer/configuration_informer.py
index 90c2a0e6d4d2..2f488e2f49a4 100644
--- a/src/transformers/models/informer/configuration_informer.py
+++ b/src/transformers/models/informer/configuration_informer.py
@@ -241,6 +241,6 @@ def _number_of_features(self) -> int:
             sum(self.embedding_dimension)
             + self.num_dynamic_real_features
             + self.num_time_features
-            + max(1, self.num_static_real_features)  # there is at least one dummy static real feature
+            + self.num_static_real_features
             + self.input_size  # the log(scale)
         )
diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 225e48bbcaac..75f1d471b9cf 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -45,7 +45,7 @@
 
 
 INFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
-    "elisim/informer",
+    "kashif/informer-tourism-monthly",
     # See all Informer models at https://huggingface.co/models?filter=informer
 ]
 
diff --git a/tests/models/informer/test_modeling_informer.py b/tests/models/informer/test_modeling_informer.py
index 72e24f15a9b8..fc88614ed163 100644
--- a/tests/models/informer/test_modeling_informer.py
+++ b/tests/models/informer/test_modeling_informer.py
@@ -97,6 +97,7 @@ def get_config(self):
             lags_sequence=self.lags_sequence,
             num_time_features=self.num_time_features,
             num_static_categorical_features=1,
+            num_static_real_features=1,
             cardinality=[self.cardinality],
             embedding_dimension=[self.embedding_dimension],
             factor=self.factor,

From c893ad8f6b8fddbf985365bf61dc0097ad6498fe Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Wed, 15 Feb 2023 19:51:37 +0100
Subject: [PATCH 080/164] fix header

---
 src/transformers/models/informer/__init__.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/transformers/models/informer/__init__.py b/src/transformers/models/informer/__init__.py
index 47e7a9c115bf..478ad56a72ba 100644
--- a/src/transformers/models/informer/__init__.py
+++ b/src/transformers/models/informer/__init__.py
@@ -1,7 +1,3 @@
-# flake8: noqa
-# There's no way to ignore "F401 '...' imported but unused" warnings in this
-# module, but to preserve other warnings. So, don't check this module at all.
-
 # Copyright 2023 The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");

From 79a21e0f8973e82fefcfe9f0479f5d7a92ca379f Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Wed, 15 Feb 2023 19:53:47 +0100
Subject: [PATCH 081/164] remove unused function

---
 src/transformers/models/informer/modeling_informer.py | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 75f1d471b9cf..23c732ad36fe 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -1818,16 +1818,6 @@ def create_network_inputs(
 
         return transformer_inputs, scale, static_feat
 
-    def enc_dec_outputs(self, transformer_inputs):
-        enc_input = transformer_inputs[:, : self.config.context_length, ...]
-        dec_input = transformer_inputs[:, self.config.context_length :, ...]
-
-        encoder_outputs = self.encoder(inputs_embeds=enc_input)
-        decoder_outputs = self.decoder(
-            inputs_embeds=dec_input, encoder_hidden_states=encoder_outputs.last_hidden_state
-        )
-        return encoder_outputs, decoder_outputs
-
     def get_encoder(self):
         return self.encoder
 

From db1a0e4ab20303bbc239a7f848a546665b7e674a Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Wed, 15 Feb 2023 19:59:20 +0100
Subject: [PATCH 082/164] fix example

---
 src/transformers/models/informer/modeling_informer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 23c732ad36fe..b9e8cb37696c 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -1861,7 +1861,7 @@ def forward(
         ... )
         >>> batch = torch.load(file)
 
-        >>> model = InformerModel.from_pretrained("huggingface/time-series-transformer-tourism-monthly")
+        >>> model = InformerModel.from_pretrained("kashif/informer-tourism-monthly")
 
         >>> # during training, one provides both past and future values
         >>> # as well as possible additional features
@@ -2035,7 +2035,7 @@ def forward(
         ... )
         >>> batch = torch.load(file)
 
-        >>> model = InformerForPrediction.from_pretrained("huggingface/time-series-transformer-tourism-monthly")
+        >>> model = InformerForPrediction.from_pretrained("kashif/informer-tourism-monthly")
 
         >>> # during training, one provides both past and future values
         >>> # as well as possible additional features

From ad96fdd4c312155a28338cd8d7326f0c324f1480 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Wed, 15 Feb 2023 20:02:53 +0100
Subject: [PATCH 083/164] fix docs

---
 .../models/informer/modeling_informer.py      | 24 +++++++++----------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index b9e8cb37696c..1e6ee94285c4 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -1236,6 +1236,15 @@ def _set_gradient_checkpointing(self, module, value=False):
             Transformer requires to provide additional features.
 
             The Informer only learns additional embeddings for `static_categorical_features`.
+        
+        future_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Boolean mask to indicate which `future_values` were observed and which were missing. Mask values selected
+            in `[0, 1]`:
+
+            - 1 for values that are **observed**,
+            - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
+
+            This mask is used to filter out missing values for the final loss calculation.
 
         attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
             Mask to avoid performing attention on certain token indices. Mask values selected in `[0, 1]`:
@@ -1271,6 +1280,7 @@ def _set_gradient_checkpointing(self, module, value=False):
             Tuple consists of `last_hidden_state`, `hidden_states` (*optional*) and `attentions` (*optional*)
             `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` (*optional*) is a sequence of
             hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
+        
         past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
             Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
             `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
@@ -1282,6 +1292,7 @@ def _set_gradient_checkpointing(self, module, value=False):
             If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
             don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
             `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        
         inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
             Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
             is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
@@ -1847,8 +1858,6 @@ def forward(
         return_dict: Optional[bool] = None,
     ) -> Union[Seq2SeqTimeSeriesModelOutput, Tuple]:
         r"""
-        Returns:
-
         Examples:
 
         ```python
@@ -2012,17 +2021,6 @@ def forward(
         return_dict: Optional[bool] = None,
     ) -> Union[Seq2SeqTimeSeriesModelOutput, Tuple]:
         r"""
-        Returns:
-
-        future_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Boolean mask to indicate which `future_values` were observed and which were missing. Mask values selected
-            in `[0, 1]`:
-
-            - 1 for values that are **observed**,
-            - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
-
-            This mask is used to filter out missing values for the final loss calculation.
-
         Examples:
 
         ```python

From ea921d94020ac69644c9cd7a4b849b3cb45e0cd3 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Wed, 15 Feb 2023 21:01:13 +0100
Subject: [PATCH 084/164] Update
 src/transformers/models/informer/configuration_informer.py

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
---
 src/transformers/models/informer/configuration_informer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/informer/configuration_informer.py b/src/transformers/models/informer/configuration_informer.py
index 2f488e2f49a4..7cec2f7171b8 100644
--- a/src/transformers/models/informer/configuration_informer.py
+++ b/src/transformers/models/informer/configuration_informer.py
@@ -112,7 +112,7 @@ class InformerConfig(PretrainedConfig):
             Whether to use distilling in encoder.
 
 
-        Example:
+Example:
 
     ```python
     >>> from transformers import InformerConfig, InformerModel

From 6ae25e6f6e77a0359dc150847c8e7286b9802b32 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Wed, 15 Feb 2023 21:01:23 +0100
Subject: [PATCH 085/164] Update
 src/transformers/models/informer/modeling_informer.py

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
---
 src/transformers/models/informer/modeling_informer.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 1e6ee94285c4..a94d4f9a7e5b 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -1,6 +1,5 @@
 # coding=utf-8
-# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
-# Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# Copyright 2023 Amazon and The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

From fd046ba75fc4382b9faa1e650864496042a97fe3 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Thu, 16 Feb 2023 08:49:13 +0100
Subject: [PATCH 086/164] Update
 src/transformers/models/informer/configuration_informer.py

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
---
 src/transformers/models/informer/configuration_informer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/informer/configuration_informer.py b/src/transformers/models/informer/configuration_informer.py
index 7cec2f7171b8..57b543add0de 100644
--- a/src/transformers/models/informer/configuration_informer.py
+++ b/src/transformers/models/informer/configuration_informer.py
@@ -31,8 +31,8 @@ class InformerConfig(PretrainedConfig):
     r"""
     This is the configuration class to store the configuration of a [`InformerModel`]. It is used to instantiate an
     Informer model according to the specified arguments, defining the model architecture. Instantiating a configuration
-    with the defaults will yield a similar configuration to that of the Time Series Transformer
-    [huggingface/time-series-transformer-tourism-monthly](https://huggingface.co/huggingface/time-series-transformer-tourism-monthly)
+    with the defaults will yield a similar configuration to that of the Informer
+    [huggingface/informer-tourism-monthly](https://huggingface.co/huggingface/informer-tourism-monthly)
     architecture.
 
     Configuration objects inherit from [`PretrainedConfig`] can be used to control the model outputs. Read the

From 18751ae0d0249e22e194f7be2ecf0758db7a5398 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Thu, 16 Feb 2023 08:49:26 +0100
Subject: [PATCH 087/164] Update
 src/transformers/models/informer/configuration_informer.py

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
---
 src/transformers/models/informer/configuration_informer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/informer/configuration_informer.py b/src/transformers/models/informer/configuration_informer.py
index 57b543add0de..192cf8b70e41 100644
--- a/src/transformers/models/informer/configuration_informer.py
+++ b/src/transformers/models/informer/configuration_informer.py
@@ -29,7 +29,7 @@
 
 class InformerConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a [`InformerModel`]. It is used to instantiate an
+    This is the configuration class to store the configuration of an [`InformerModel`]. It is used to instantiate an
     Informer model according to the specified arguments, defining the model architecture. Instantiating a configuration
     with the defaults will yield a similar configuration to that of the Informer
     [huggingface/informer-tourism-monthly](https://huggingface.co/huggingface/informer-tourism-monthly)

From 51f890da34a2fe4df3f36b58d786424d6c845e1a Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Thu, 16 Feb 2023 08:49:35 +0100
Subject: [PATCH 088/164] Update
 src/transformers/models/informer/configuration_informer.py

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
---
 src/transformers/models/informer/configuration_informer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/informer/configuration_informer.py b/src/transformers/models/informer/configuration_informer.py
index 192cf8b70e41..ce21fe50641d 100644
--- a/src/transformers/models/informer/configuration_informer.py
+++ b/src/transformers/models/informer/configuration_informer.py
@@ -112,7 +112,7 @@ class InformerConfig(PretrainedConfig):
             Whether to use distilling in encoder.
 
 
-Example:
+       Example:
 
     ```python
     >>> from transformers import InformerConfig, InformerModel

From dc49ff796dfa67878e0e23b4eea9e19654d7e9e0 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Thu, 16 Feb 2023 08:50:26 +0100
Subject: [PATCH 089/164] Update
 src/transformers/models/informer/configuration_informer.py

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
---
 src/transformers/models/informer/configuration_informer.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/transformers/models/informer/configuration_informer.py b/src/transformers/models/informer/configuration_informer.py
index ce21fe50641d..14cb4e3f763c 100644
--- a/src/transformers/models/informer/configuration_informer.py
+++ b/src/transformers/models/informer/configuration_informer.py
@@ -104,11 +104,11 @@ class InformerConfig(PretrainedConfig):
             The standard deviation of the truncated normal weight initialization distribution.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether to use the past key/values attentions (if applicable to the model) to speed up decoding.
-        attn (`str`, defaults to `prob`):
+        attention_type (`str`, *optional*, defaults to `prob`):
             Attention used in encoder. This can be set to prob (informer) or full (transformer)
-        factor (`int`, defaults to 2):
+        attention_factor (`int`, *optional*, defaults to 2):
             ProbSparse attention factor.
-        distil (`bool`, defaults to `True`):
+        distil (`bool`, *optional*, defaults to `True`):
             Whether to use distilling in encoder.
 
 

From 125ee055f4740a00f79bdb5be7e253a1643ee194 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Thu, 16 Feb 2023 10:41:46 +0100
Subject: [PATCH 090/164] fixes for reviewer

---
 .../models/informer/configuration_informer.py | 22 ++++++++++---------
 .../models/informer/modeling_informer.py      | 16 +++++++++-----
 .../models/informer/test_modeling_informer.py | 20 ++++++++++-------
 3 files changed, 34 insertions(+), 24 deletions(-)

diff --git a/src/transformers/models/informer/configuration_informer.py b/src/transformers/models/informer/configuration_informer.py
index 14cb4e3f763c..869e4599033f 100644
--- a/src/transformers/models/informer/configuration_informer.py
+++ b/src/transformers/models/informer/configuration_informer.py
@@ -23,7 +23,10 @@
 logger = logging.get_logger(__name__)
 
 INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    "elisim/informer": "https://huggingface.co/elisim/informer/resolve/main/config.json",
+    "kashif/informer-tourism-monthly": (
+        "https://huggingface.co/kashif/informer-tourism-monthly/resolve/main/config.json"
+    ),
+    # See all Informer models at https://huggingface.co/models?filter=informer
 }
 
 
@@ -104,15 +107,14 @@ class InformerConfig(PretrainedConfig):
             The standard deviation of the truncated normal weight initialization distribution.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether to use the past key/values attentions (if applicable to the model) to speed up decoding.
-        attention_type (`str`, *optional*, defaults to `prob`):
-            Attention used in encoder. This can be set to prob (informer) or full (transformer)
+        attention_type (`str`, *optional*, defaults to "prob"):
+            Attention used in encoder. This can be set to "prob" (Informer's ProbAttention) or "full" (transformer).
         attention_factor (`int`, *optional*, defaults to 2):
             ProbSparse attention factor.
         distil (`bool`, *optional*, defaults to `True`):
             Whether to use distilling in encoder.
 
-
-       Example:
+    Example:
 
     ```python
     >>> from transformers import InformerConfig, InformerModel
@@ -135,11 +137,11 @@ class InformerConfig(PretrainedConfig):
 
     def __init__(
         self,
-        input_size: int = 1,
         prediction_length: Optional[int] = None,
         context_length: Optional[int] = None,
         distribution_output: str = "student_t",
         loss: str = "nll",
+        input_size: int = 1,
         lags_sequence: List[int] = None,
         scaling: bool = True,
         num_dynamic_real_features: int = 0,
@@ -165,8 +167,8 @@ def __init__(
         init_std: float = 0.02,
         use_cache=True,
         # Informer arguments
-        attn: str = "prob",
-        factor: int = 2,
+        attention_type: str = "prob",
+        attention_factor: int = 2,
         distil: bool = True,
         **kwargs,
     ):
@@ -229,8 +231,8 @@ def __init__(
         self.use_cache = use_cache
 
         # Informer
-        self.attn = attn
-        self.factor = factor
+        self.attention_type = attention_type
+        self.attention_factor = attention_factor
         self.distil = distil
 
         super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index a94d4f9a7e5b..dc0b68a7ea4a 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -706,11 +706,11 @@ def __init__(
         num_heads: int,
         dropout: float = 0.0,
         is_decoder: bool = False,
-        factor: int = 5,
+        attention_factor: int = 5,
         bias: bool = True,
     ):
         super().__init__()
-        self.factor = factor
+        self.factor = attention_factor
         self.embed_dim = embed_dim
         self.num_heads = num_heads
         self.dropout = dropout
@@ -931,12 +931,12 @@ class InformerEncoderLayer(nn.Module):
     def __init__(self, config: InformerConfig):
         super().__init__()
         self.embed_dim = config.d_model
-        if config.attn == "prob":
+        if config.attention_type == "prob":
             self.self_attn = ProbSparseAttention(
                 embed_dim=self.embed_dim,
                 num_heads=config.encoder_attention_heads,
                 dropout=config.attention_dropout,
-                factor=config.factor,
+                attention_factor=config.attention_factor,
             )
         else:
             self.self_attn = InformerAttention(
@@ -1008,12 +1008,12 @@ def __init__(self, config: InformerConfig):
         super().__init__()
         self.embed_dim = config.d_model
 
-        if config.attn == "prob":
+        if config.attention_type == "prob":
             self.self_attn = ProbSparseAttention(
                 embed_dim=self.embed_dim,
                 num_heads=config.encoder_attention_heads,
                 dropout=config.attention_dropout,
-                factor=config.factor,
+                attention_factor=config.attention_factor,
                 is_decoder=True,
             )
         else:
@@ -1857,6 +1857,8 @@ def forward(
         return_dict: Optional[bool] = None,
     ) -> Union[Seq2SeqTimeSeriesModelOutput, Tuple]:
         r"""
+        Returns:
+
         Examples:
 
         ```python
@@ -2020,6 +2022,8 @@ def forward(
         return_dict: Optional[bool] = None,
     ) -> Union[Seq2SeqTimeSeriesModelOutput, Tuple]:
         r"""
+        Returns:
+
         Examples:
 
         ```python
diff --git a/tests/models/informer/test_modeling_informer.py b/tests/models/informer/test_modeling_informer.py
index fc88614ed163..0a37022f48d5 100644
--- a/tests/models/informer/test_modeling_informer.py
+++ b/tests/models/informer/test_modeling_informer.py
@@ -57,7 +57,7 @@ def __init__(
         hidden_dropout_prob=0.1,
         attention_probs_dropout_prob=0.1,
         lags_sequence=[1, 2, 3, 4, 5],
-        factor=10,
+        attention_factor=10,
     ):
         self.parent = parent
         self.batch_size = batch_size
@@ -76,14 +76,17 @@ def __init__(
         self.hidden_dropout_prob = hidden_dropout_prob
         self.attention_probs_dropout_prob = attention_probs_dropout_prob
 
-        self.encoder_seq_length = min(factor * np.ceil(np.log1p(context_length)).astype("int").item(), context_length)
+        self.encoder_seq_length = min(
+            attention_factor * np.ceil(np.log1p(context_length)).astype("int").item(), context_length
+        )
         self.decoder_seq_length = min(
-            factor * np.ceil(np.log1p(prediction_length)).astype("int").item(), prediction_length
+            attention_factor * np.ceil(np.log1p(prediction_length)).astype("int").item(), prediction_length
         )
-        self.factor = factor
+        self.attention_factor = attention_factor
 
     def get_config(self):
         return InformerConfig(
+            prediction_length=self.prediction_length,
             encoder_layers=self.num_hidden_layers,
             decoder_layers=self.num_hidden_layers,
             encoder_attention_heads=self.num_attention_heads,
@@ -92,7 +95,6 @@ def get_config(self):
             decoder_ffn_dim=self.intermediate_size,
             dropout=self.hidden_dropout_prob,
             attention_dropout=self.attention_probs_dropout_prob,
-            prediction_length=self.prediction_length,
             context_length=self.context_length,
             lags_sequence=self.lags_sequence,
             num_time_features=self.num_time_features,
@@ -100,7 +102,7 @@ def get_config(self):
             num_static_real_features=1,
             cardinality=[self.cardinality],
             embedding_dimension=[self.embedding_dimension],
-            factor=self.factor,
+            attention_factor=self.attention_factor,
         )
 
     def prepare_informer_inputs_dict(self, config):
@@ -184,13 +186,15 @@ class InformerModelTest(ModelTesterMixin, unittest.TestCase):
 
     def setUp(self):
         self.model_tester = InformerModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=InformerConfig, has_text_modality=False)
+        self.config_tester = ConfigTester(
+            self, config_class=InformerConfig, has_text_modality=False, prediction_length=12
+        )
 
     def test_config(self):
         self.config_tester.run_common_tests()
 
     def test_save_load_strict(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs()
+        config, _ = self.model_tester.prepare_config_and_inputs()
         for model_class in self.all_model_classes:
             model = model_class(config)
 

From a64162a00f4e19687c1092e4d42203f10eb51eff Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Thu, 16 Feb 2023 11:23:17 +0100
Subject: [PATCH 091/164] use prediction_length from model

---
 tests/models/informer/test_modeling_informer.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/models/informer/test_modeling_informer.py b/tests/models/informer/test_modeling_informer.py
index 0a37022f48d5..699eedf3a11f 100644
--- a/tests/models/informer/test_modeling_informer.py
+++ b/tests/models/informer/test_modeling_informer.py
@@ -187,7 +187,10 @@ class InformerModelTest(ModelTesterMixin, unittest.TestCase):
     def setUp(self):
         self.model_tester = InformerModelTester(self)
         self.config_tester = ConfigTester(
-            self, config_class=InformerConfig, has_text_modality=False, prediction_length=12
+            self,
+            config_class=InformerConfig,
+            has_text_modality=False,
+            prediction_length=self.model_tester.prediction_length,
         )
 
     def test_config(self):

From 19e32796d8fbab75238628cce32a21afcc12b1a9 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Thu, 16 Feb 2023 11:24:26 +0100
Subject: [PATCH 092/164] fix style

---
 src/transformers/models/informer/configuration_informer.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/transformers/models/informer/configuration_informer.py b/src/transformers/models/informer/configuration_informer.py
index 869e4599033f..0735f3bf4f8c 100644
--- a/src/transformers/models/informer/configuration_informer.py
+++ b/src/transformers/models/informer/configuration_informer.py
@@ -35,8 +35,7 @@ class InformerConfig(PretrainedConfig):
     This is the configuration class to store the configuration of an [`InformerModel`]. It is used to instantiate an
     Informer model according to the specified arguments, defining the model architecture. Instantiating a configuration
     with the defaults will yield a similar configuration to that of the Informer
-    [huggingface/informer-tourism-monthly](https://huggingface.co/huggingface/informer-tourism-monthly)
-    architecture.
+    [huggingface/informer-tourism-monthly](https://huggingface.co/huggingface/informer-tourism-monthly) architecture.
 
     Configuration objects inherit from [`PretrainedConfig`] can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.

From b316a5af13633edd664b24ed5e53ee615e9e3a1e Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Thu, 16 Feb 2023 12:17:53 +0100
Subject: [PATCH 093/164] fixed informer.mdx

---
 docs/source/en/model_doc/informer.mdx         |  15 +-
 .../models/informer/modeling_informer.py      | 816 ++++++++++--------
 src/transformers/utils/dummy_pt_objects.py    |  24 +
 3 files changed, 510 insertions(+), 345 deletions(-)

diff --git a/docs/source/en/model_doc/informer.mdx b/docs/source/en/model_doc/informer.mdx
index 6765b9768fc6..d54813f4c11e 100644
--- a/docs/source/en/model_doc/informer.mdx
+++ b/docs/source/en/model_doc/informer.mdx
@@ -14,19 +14,16 @@ specific language governing permissions and limitations under the License.
 
 ## Overview
 
-The Informer model was proposed in [<INSERT PAPER NAME HERE>](<INSERT PAPER LINK HERE>) by <INSERT AUTHORS HERE>.
-<INSERT SHORT SUMMARY HERE>
+The Informer model was proposed in [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
 
-The abstract from the paper is the following:
-
-*<INSERT PAPER ABSTRACT HERE>*
+This method introduces a Probabilistic Attention mechanism to select the "active" queries rather than the "lazy" queries and provides a sparse Transformer, thus mitigating the quadratic compute and memory requirements of vanilla attention.
 
-Tips:
+The abstract from the paper is the following:
 
-<INSERT TIPS ABOUT MODEL HERE>
+*Many real-world applications require the prediction of long sequence time-series, such as electricity consumption planning. Long sequence time-series forecasting (LSTF) demands a high prediction capacity of the model, which is the ability to capture precise long-range dependency coupling between output and input efficiently. Recent studies have shown the potential of Transformer to increase the prediction capacity. However, there are several severe issues with Transformer that prevent it from being directly applicable to LSTF, including quadratic time complexity, high memory usage, and inherent limitation of the encoder-decoder architecture. To address these issues, we design an efficient transformer-based model for LSTF, named Informer, with three distinctive characteristics: (i) a ProbSparse self-attention mechanism, which achieves O(L logL) in time complexity and memory usage, and has comparable performance on sequences' dependency alignment. (ii) the self-attention distilling highlights dominating attention by halving cascading layer input, and efficiently handles extreme long input sequences. (iii) the generative style decoder, while conceptually simple, predicts the long time-series sequences at one forward operation rather than a step-by-step way, which drastically improves the inference speed of long-sequence predictions. Extensive experiments on four large-scale datasets demonstrate that Informer significantly outperforms existing methods and provides a new solution to the LSTF problem.*
 
-This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/<INSERT YOUR HF USERNAME HERE>).
-The original code can be found [here](<INSERT LINK TO GITHUB REPO HERE>).
+This model was contributed by [elisim](https://huggingface.co/elisim), [kashif](https://huggingface.co/kashif) and [nielsr](https://huggingface.co/nielsr).
+The original code can be found [here](https://github.com/zhouhaoyi/Informer2020).
 
 
 ## InformerConfig
diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index dc0b68a7ea4a..4d6c24ea3748 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -231,7 +231,7 @@ def distribution(
         return self._base_distribution((total_count, logits))
 
 
-# Eli: FeatureEmbedder, MeanScaler and NOPScaler are from GlounTS (see the exact source below)
+# FeatureEmbedder, MeanScaler and NOPScaler are from GluonTS (see the exact source below)
 # source: https://github.com/awslabs/gluonts/blob/dev/src/gluonts/torch/modules/feature.py
 class FeatureEmbedder(nn.Module):
     def __init__(self, cardinalities: List[int], embedding_dims: List[int]) -> None:
@@ -399,6 +399,402 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int]
 
 
 # Copied from transformers.models.time_series_transformer.modeling_time_series_transformer
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch Time Series Transformer model."""
+
+import random
+from dataclasses import dataclass
+from typing import Callable, Dict, List, Optional, Tuple, Union
+
+import torch
+from torch import nn
+from torch.distributions import (
+    AffineTransform,
+    Distribution,
+    Independent,
+    NegativeBinomial,
+    Normal,
+    StudentT,
+    TransformedDistribution,
+)
+
+from ...activations import ACT2FN
+from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions, ModelOutput
+from ...modeling_utils import PreTrainedModel
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
+from .configuration_time_series_transformer import TimeSeriesTransformerConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "TimeSeriesTransformerConfig"
+
+
+TIME_SERIES_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
+    "huggingface/time-series-transformer-tourism-monthly",
+    # See all TimeSeriesTransformer models at https://huggingface.co/models?filter=time_series_transformer
+]
+
+
+class AffineTransformed(TransformedDistribution):
+    def __init__(self, base_distribution: Distribution, loc=None, scale=None, event_dim=0):
+        self.scale = 1.0 if scale is None else scale
+        self.loc = 0.0 if loc is None else loc
+
+        super().__init__(base_distribution, [AffineTransform(loc=self.loc, scale=self.scale, event_dim=event_dim)])
+
+    @property
+    def mean(self):
+        """
+        Returns the mean of the distribution.
+        """
+        return self.base_dist.mean * self.scale + self.loc
+
+    @property
+    def variance(self):
+        """
+        Returns the variance of the distribution.
+        """
+        return self.base_dist.variance * self.scale**2
+
+    @property
+    def stddev(self):
+        """
+        Returns the standard deviation of the distribution.
+        """
+        return self.variance.sqrt()
+
+
+class ParameterProjection(nn.Module):
+    def __init__(
+        self, in_features: int, args_dim: Dict[str, int], domain_map: Callable[..., Tuple[torch.Tensor]], **kwargs
+    ) -> None:
+        super().__init__(**kwargs)
+        self.args_dim = args_dim
+        self.proj = nn.ModuleList([nn.Linear(in_features, dim) for dim in args_dim.values()])
+        self.domain_map = domain_map
+
+    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor]:
+        params_unbounded = [proj(x) for proj in self.proj]
+
+        return self.domain_map(*params_unbounded)
+
+
+class LambdaLayer(nn.Module):
+    def __init__(self, function):
+        super().__init__()
+        self.function = function
+
+    def forward(self, x, *args):
+        return self.function(x, *args)
+
+
+class DistributionOutput:
+    distribution_class: type
+    in_features: int
+    args_dim: Dict[str, int]
+
+    def __init__(self, dim: int = 1) -> None:
+        self.dim = dim
+        self.args_dim = {k: dim * self.args_dim[k] for k in self.args_dim}
+
+    def _base_distribution(self, distr_args):
+        if self.dim == 1:
+            return self.distribution_class(*distr_args)
+        else:
+            return Independent(self.distribution_class(*distr_args), 1)
+
+    def distribution(
+        self,
+        distr_args,
+        loc: Optional[torch.Tensor] = None,
+        scale: Optional[torch.Tensor] = None,
+    ) -> Distribution:
+        distr = self._base_distribution(distr_args)
+        if loc is None and scale is None:
+            return distr
+        else:
+            return AffineTransformed(distr, loc=loc, scale=scale, event_dim=self.event_dim)
+
+    @property
+    def event_shape(self) -> Tuple:
+        r"""
+        Shape of each individual event contemplated by the distributions that this object constructs.
+        """
+        return () if self.dim == 1 else (self.dim,)
+
+    @property
+    def event_dim(self) -> int:
+        r"""
+        Number of event dimensions, i.e., length of the `event_shape` tuple, of the distributions that this object
+        constructs.
+        """
+        return len(self.event_shape)
+
+    @property
+    def value_in_support(self) -> float:
+        r"""
+        A float that will have a valid numeric value when computing the log-loss of the corresponding distribution. By
+        default 0.0. This value will be used when padding data series.
+        """
+        return 0.0
+
+    def get_parameter_projection(self, in_features: int) -> nn.Module:
+        r"""
+        Return the parameter projection layer that maps the input to the appropriate parameters of the distribution.
+        """
+        return ParameterProjection(
+            in_features=in_features,
+            args_dim=self.args_dim,
+            domain_map=LambdaLayer(self.domain_map),
+        )
+
+    def domain_map(self, *args: torch.Tensor):
+        r"""
+        Converts arguments to the right shape and domain. The domain depends on the type of distribution, while the
+        correct shape is obtained by reshaping the trailing axis in such a way that the returned tensors define a
+        distribution of the right event_shape.
+        """
+        raise NotImplementedError()
+
+    @classmethod
+    def squareplus(cls, x: torch.Tensor) -> torch.Tensor:
+        r"""
+        Helper to map inputs to the positive orthant by applying the square-plus operation. Reference:
+        https://twitter.com/jon_barron/status/1387167648669048833
+        """
+        return (x + torch.sqrt(torch.square(x) + 4.0)) / 2.0
+
+
+class StudentTOutput(DistributionOutput):
+    args_dim: Dict[str, int] = {"df": 1, "loc": 1, "scale": 1}
+    distribution_class: type = StudentT
+
+    @classmethod
+    def domain_map(cls, df: torch.Tensor, loc: torch.Tensor, scale: torch.Tensor):
+        scale = cls.squareplus(scale)
+        df = 2.0 + cls.squareplus(df)
+        return df.squeeze(-1), loc.squeeze(-1), scale.squeeze(-1)
+
+
+class NormalOutput(DistributionOutput):
+    args_dim: Dict[str, int] = {"loc": 1, "scale": 1}
+    distribution_class: type = Normal
+
+    @classmethod
+    def domain_map(cls, loc: torch.Tensor, scale: torch.Tensor):
+        scale = cls.squareplus(scale)
+        return loc.squeeze(-1), scale.squeeze(-1)
+
+
+class NegativeBinomialOutput(DistributionOutput):
+    args_dim: Dict[str, int] = {"total_count": 1, "logits": 1}
+    distribution_class: type = NegativeBinomial
+
+    @classmethod
+    def domain_map(cls, total_count: torch.Tensor, logits: torch.Tensor):
+        total_count = cls.squareplus(total_count)
+        return total_count.squeeze(-1), logits.squeeze(-1)
+
+    def _base_distribution(self, distr_args) -> Distribution:
+        total_count, logits = distr_args
+        if self.dim == 1:
+            return self.distribution_class(total_count=total_count, logits=logits)
+        else:
+            return Independent(self.distribution_class(total_count=total_count, logits=logits), 1)
+
+    # Overwrites the parent class method. We cannot scale using the affine
+    # transformation since negative binomial should return integers. Instead
+    # we scale the parameters.
+    def distribution(
+        self, distr_args, loc: Optional[torch.Tensor] = None, scale: Optional[torch.Tensor] = None
+    ) -> Distribution:
+        total_count, logits = distr_args
+
+        if scale is not None:
+            # See scaling property of Gamma.
+            logits += scale.log()
+
+        return self._base_distribution((total_count, logits))
+
+
+class FeatureEmbedder(nn.Module):
+    def __init__(self, cardinalities: List[int], embedding_dims: List[int]) -> None:
+        super().__init__()
+
+        self.num_features = len(cardinalities)
+        self.embedders = nn.ModuleList([nn.Embedding(c, d) for c, d in zip(cardinalities, embedding_dims)])
+
+    def forward(self, features: torch.Tensor) -> torch.Tensor:
+        if self.num_features > 1:
+            # we slice the last dimension, giving an array of length
+            # self.num_features with shape (N,T) or (N)
+            cat_feature_slices = torch.chunk(features, self.num_features, dim=-1)
+        else:
+            cat_feature_slices = [features]
+
+        return torch.cat(
+            [
+                embed(cat_feature_slice.squeeze(-1))
+                for embed, cat_feature_slice in zip(self.embedders, cat_feature_slices)
+            ],
+            dim=-1,
+        )
+
+
+class MeanScaler(nn.Module):
+    """
+    Computes a scaling factor as the weighted average absolute value along dimension `dim`, and scales the data
+    accordingly.
+
+    Args:
+        dim (`int`):
+            Dimension along which to compute the scale.
+        keepdim (`bool`, *optional*, defaults to `False`):
+            Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it.
+        minimum_scale (`float`, *optional*, defaults to 1e-10):
+            Default scale that is used for elements that are constantly zero along dimension `dim`.
+    """
+
+    def __init__(self, dim: int, keepdim: bool = False, minimum_scale: float = 1e-10):
+        super().__init__()
+        if not dim > 0:
+            raise ValueError("Cannot compute scale along dim = 0 (batch dimension), please provide dim > 0")
+        self.dim = dim
+        self.keepdim = keepdim
+        self.register_buffer("minimum_scale", torch.tensor(minimum_scale))
+
+    def forward(self, data: torch.Tensor, weights: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        # these will have shape (N, C)
+        total_weight = weights.sum(dim=self.dim)
+        weighted_sum = (data.abs() * weights).sum(dim=self.dim)
+
+        # first compute a global scale per-dimension
+        total_observed = total_weight.sum(dim=0)
+        denominator = torch.max(total_observed, torch.ones_like(total_observed))
+        default_scale = weighted_sum.sum(dim=0) / denominator
+
+        # then compute a per-item, per-dimension scale
+        denominator = torch.max(total_weight, torch.ones_like(total_weight))
+        scale = weighted_sum / denominator
+
+        # use per-batch scale when no element is observed
+        # or when the sequence contains only zeros
+        scale = (
+            torch.max(
+                self.minimum_scale,
+                torch.where(
+                    weighted_sum > torch.zeros_like(weighted_sum),
+                    scale,
+                    default_scale * torch.ones_like(total_weight),
+                ),
+            )
+            .detach()
+            .unsqueeze(dim=self.dim)
+        )
+
+        return data / scale, scale if self.keepdim else scale.squeeze(dim=self.dim)
+
+
+class NOPScaler(nn.Module):
+    """
+    Assigns a scaling factor equal to 1 along dimension `dim`, and therefore applies no scaling to the input data.
+
+    Args:
+        dim (`int`):
+            Dimension along which to compute the scale.
+        keepdim (`bool`, *optional*, defaults to `False`):
+            Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it.
+    """
+
+    def __init__(self, dim: int, keepdim: bool = False):
+        super().__init__()
+        self.dim = dim
+        self.keepdim = keepdim
+
+    def forward(self, data: torch.Tensor, observed_indicator: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        scale = torch.ones_like(data).mean(dim=self.dim, keepdim=self.keepdim)
+        return data, scale
+
+
+def weighted_average(input_tensor: torch.Tensor, weights: Optional[torch.Tensor] = None, dim=None) -> torch.Tensor:
+    """
+    Computes the weighted average of a given tensor across a given `dim`, masking values associated with weight zero,
+    meaning instead of `nan * 0 = nan` you will get `0 * 0 = 0`.
+
+    Args:
+        input_tensor (`torch.FloatTensor`):
+            Input tensor, of which the average must be computed.
+        weights (`torch.FloatTensor`, *optional*):
+            Weights tensor, of the same shape as `input_tensor`.
+        dim (`int`, *optional*):
+            The dim along which to average `input_tensor`.
+
+    Returns:
+        `torch.FloatTensor`: The tensor with values averaged along the specified `dim`.
+    """
+    if weights is not None:
+        weighted_tensor = torch.where(weights != 0, input_tensor * weights, torch.zeros_like(input_tensor))
+        sum_weights = torch.clamp(weights.sum(dim=dim) if dim else weights.sum(), min=1.0)
+        return (weighted_tensor.sum(dim=dim) if dim else weighted_tensor.sum()) / sum_weights
+    else:
+        return input_tensor.mean(dim=dim)
+
+
+class NegativeLogLikelihood:
+    """
+    Computes the negative log likelihood loss from input distribution with respect to target.
+    """
+
+    def __call__(self, input: torch.distributions.Distribution, target: torch.Tensor) -> torch.Tensor:
+        return -input.log_prob(target)
+
+
+# Copied from transformers.models.bart.modeling_bart._make_causal_mask
+def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_values_length: int = 0):
+    """
+    Make causal mask used for bi-directional self-attention.
+    """
+    bsz, tgt_len = input_ids_shape
+    mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min))
+    mask_cond = torch.arange(mask.size(-1))
+    mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
+    mask = mask.to(dtype)
+
+    if past_key_values_length > 0:
+        mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype), mask], dim=-1)
+    return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
+
+
+# Copied from transformers.models.bart.modeling_bart._expand_mask
+def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
+    """
+    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+    """
+    bsz, src_len = mask.size()
+    tgt_len = tgt_len if tgt_len is not None else src_len
+
+    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
+
+    inverted_mask = 1.0 - expanded_mask
+
+    return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
+
+
 @dataclass
 class Seq2SeqTimeSeriesModelOutput(ModelOutput):
     """
@@ -467,7 +863,6 @@ class Seq2SeqTimeSeriesModelOutput(ModelOutput):
     static_features: Optional[torch.FloatTensor] = None
 
 
-# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer
 @dataclass
 class Seq2SeqTimeSeriesPredictionOutput(ModelOutput):
     """
@@ -536,14 +931,13 @@ class Seq2SeqTimeSeriesPredictionOutput(ModelOutput):
     static_features: Optional[torch.FloatTensor] = None
 
 
-# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer
 @dataclass
 class SampleTimeSeriesPredictionOutput(ModelOutput):
     sequences: torch.FloatTensor = None
 
 
-# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->Informer
-class InformerAttention(nn.Module):
+# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->TimeSeriesTransformer
+class TimeSeriesTransformerAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 
     def __init__(
@@ -689,213 +1083,7 @@ def forward(
         attn_output = attn_output.transpose(1, 2)
 
         # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
-        # partitioned across GPUs when using tensor-parallelism.
-        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
-
-        attn_output = self.out_proj(attn_output)
-
-        return attn_output, attn_weights_reshaped, past_key_value
-
-
-class ProbSparseAttention(nn.Module):
-    """ProbSparse Attention"""
-
-    def __init__(
-        self,
-        embed_dim: int,
-        num_heads: int,
-        dropout: float = 0.0,
-        is_decoder: bool = False,
-        attention_factor: int = 5,
-        bias: bool = True,
-    ):
-        super().__init__()
-        self.factor = attention_factor
-        self.embed_dim = embed_dim
-        self.num_heads = num_heads
-        self.dropout = dropout
-        self.head_dim = embed_dim // num_heads
-
-        if (self.head_dim * num_heads) != self.embed_dim:
-            raise ValueError(
-                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
-                f" and `num_heads`: {num_heads})."
-            )
-        self.scaling = self.head_dim**-0.5
-        self.is_decoder = is_decoder
-
-        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
-        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
-        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
-        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
-
-    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
-        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        key_value_states: Optional[torch.Tensor] = None,
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        layer_head_mask: Optional[torch.Tensor] = None,
-        output_attentions: bool = False,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-        """Input shape: Batch x Time x Channel"""
-
-        # if key_value_states are provided this layer is used as a cross-attention layer
-        # for the decoder
-        is_cross_attention = key_value_states is not None
-
-        bsz, tgt_len, _ = hidden_states.size()
-
-        # get query proj
-        query_states = self.q_proj(hidden_states) * self.scaling
-        # get key, value proj
-        # `past_key_value[0].shape[2] == key_value_states.shape[1]`
-        # is checking that the `sequence_length` of the `past_key_value` is the same as
-        # the provided `key_value_states` to support prefix tuning
-        if (
-            is_cross_attention
-            and past_key_value is not None
-            and past_key_value[0].shape[2] == key_value_states.shape[1]
-        ):
-            # reuse k,v, cross_attentions
-            key_states = past_key_value[0]
-            value_states = past_key_value[1]
-        elif is_cross_attention:
-            # cross_attentions
-            key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
-            value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
-        elif past_key_value is not None:
-            # reuse k, v, self_attention
-            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
-            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
-            key_states = torch.cat([past_key_value[0], key_states], dim=2)
-            value_states = torch.cat([past_key_value[1], value_states], dim=2)
-        else:
-            # self_attention
-            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
-            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
-
-        if self.is_decoder:
-            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
-            # Further calls to cross_attention layer can then reuse all cross-attention
-            # key/value_states (first "if" case)
-            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
-            # all previous decoder key/value_states. Further calls to uni-directional self-attention
-            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
-            # if encoder bi-directional self-attention `past_key_value` is always `None`
-            past_key_value = (key_states, value_states)
-
-        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
-        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
-        key_states = key_states.view(*proj_shape)
-        value_states = value_states.view(*proj_shape)
-
-        # c*ln(L_k)
-        L_K = key_states.size(1)
-        U_part = min(self.factor * np.ceil(np.log1p(L_K)).astype("int").item(), L_K)
-
-        # c*ln(L_q)
-        L_Q = query_states.size(1)
-        u = min(self.factor * np.ceil(np.log1p(L_Q)).astype("int").item(), L_Q)
-
-        if L_K > 0:
-            index_sample = torch.randint(0, L_K, (U_part,))  # torch.Size([14])
-
-            # real U = U_part(factor*ln(L_k))*L_q
-            K_sample = key_states[:, index_sample, :]  # torch.Size([52, 14, 4])
-        else:
-            K_sample = key_states
-        Q_K_sample = torch.bmm(query_states, K_sample.transpose(1, 2))
-        # torch.Size([52, 14, 4]) x torch.Size([52, 4, 14])
-
-        # find the Top_k query with sparisty measurement
-        if u > 0:
-            M = Q_K_sample.max(dim=-1)[0] - torch.div(Q_K_sample.sum(dim=-1), L_K)
-            M_top = M.topk(u, sorted=False)[1]
-
-            # use the reduced Q to calculate Q_K
-            # factor*ln(L_q)
-            dim_for_slice = torch.arange(query_states.size(0)).unsqueeze(-1)
-            Q_reduce = query_states[dim_for_slice, M_top]
-        else:
-            Q_reduce = query_states
-            M_top = None
-
-        # score_top
-        attn_weights = torch.bmm(Q_reduce, key_states.transpose(1, 2))
-
-        src_len = key_states.size(1)
-        if attn_weights.size() != (bsz * self.num_heads, u, src_len):
-            raise ValueError(
-                f"Attention weights should be of size {(bsz * self.num_heads, u, src_len)}, but is"
-                f" {attn_weights.size()}"
-            )
-
-        if attention_mask is not None:
-            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
-                raise ValueError(
-                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
-                )
-            prob_mask = attention_mask.expand(bsz, self.num_heads, tgt_len, src_len).reshape(
-                bsz * self.num_heads, tgt_len, src_len
-            )
-
-            if M_top is not None:
-                dim_for_slice = torch.arange(prob_mask.size(0)).unsqueeze(-1)
-                prob_mask = prob_mask[dim_for_slice, M_top, :]
-
-            attn_weights = attn_weights.view(bsz, self.num_heads, u, src_len) + prob_mask.view(
-                bsz, self.num_heads, u, src_len
-            )
-            attn_weights = attn_weights.view(bsz * self.num_heads, u, src_len)
-
-        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
-
-        if layer_head_mask is not None:
-            if layer_head_mask.size() != (self.num_heads,):
-                raise ValueError(
-                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
-                    f" {layer_head_mask.size()}"
-                )
-            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, u, src_len)
-            attn_weights = attn_weights.view(bsz * self.num_heads, u, src_len)
-
-        if output_attentions:
-            # this operation is a bit awkward, but it's required to
-            # make sure that attn_weights keeps its gradient.
-            # In order to do so, attn_weights have to be reshaped
-            # twice and have to be reused in the following
-            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, u, src_len)
-            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, u, src_len)
-        else:
-            attn_weights_reshaped = None
-
-        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
-
-        # get initial context
-        context = value_states.cumsum(dim=-2)
-        attn_output = torch.bmm(attn_probs, value_states)
-
-        if M_top is not None:
-            # update context: copy the attention output to the context at M_top index
-            dim_for_slice = torch.arange(context.size(0)).unsqueeze(-1)
-            context[dim_for_slice, M_top, :] = attn_output
-            attn_output = context
-
-        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
-            raise ValueError(
-                f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is"
-                f" {attn_output.size()}"
-            )
-
-        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
-        attn_output = attn_output.transpose(1, 2)
-
-        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
-        # partitioned across GPUs when using tensor-parallelism.
+        # partitioned across GPUs when using tensor-parallelism.
         attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
 
         attn_output = self.out_proj(attn_output)
@@ -903,47 +1091,16 @@ def forward(
         return attn_output, attn_weights_reshaped, past_key_value
 
 
-# source: https://github.com/zhouhaoyi/Informer2020/blob/main/models/encoder.py
-class ConvLayer(nn.Module):
-    def __init__(self, c_in):
-        super(ConvLayer, self).__init__()
-        self.downConv = nn.Conv1d(
-            in_channels=c_in,
-            out_channels=c_in,
-            kernel_size=3,
-            padding=1,
-            padding_mode="circular",
-        )
-        self.norm = nn.BatchNorm1d(c_in)
-        self.activation = nn.ELU()
-        self.maxPool = nn.MaxPool1d(kernel_size=3, stride=2, padding=1)
-
-    def forward(self, x):
-        x = self.downConv(x.permute(0, 2, 1))
-        x = self.norm(x)
-        x = self.activation(x)
-        x = self.maxPool(x)
-        x = x.transpose(1, 2)
-        return x
-
-
-class InformerEncoderLayer(nn.Module):
-    def __init__(self, config: InformerConfig):
+# Copied from transformers.models.bart.modeling_bart.BartEncoderLayer with Bart->TimeSeriesTransformer
+class TimeSeriesTransformerEncoderLayer(nn.Module):
+    def __init__(self, config: TimeSeriesTransformerConfig):
         super().__init__()
         self.embed_dim = config.d_model
-        if config.attention_type == "prob":
-            self.self_attn = ProbSparseAttention(
-                embed_dim=self.embed_dim,
-                num_heads=config.encoder_attention_heads,
-                dropout=config.attention_dropout,
-                attention_factor=config.attention_factor,
-            )
-        else:
-            self.self_attn = InformerAttention(
-                embed_dim=self.embed_dim,
-                num_heads=config.encoder_attention_heads,
-                dropout=config.attention_dropout,
-            )
+        self.self_attn = TimeSeriesTransformerAttention(
+            embed_dim=self.embed_dim,
+            num_heads=config.encoder_attention_heads,
+            dropout=config.attention_dropout,
+        )
         self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
         self.dropout = config.dropout
         self.activation_fn = ACT2FN[config.activation_function]
@@ -1003,32 +1160,24 @@ def forward(
         return outputs
 
 
-class InformerDecoderLayer(nn.Module):
-    def __init__(self, config: InformerConfig):
+# Copied from transformers.models.bart.modeling_bart.BartDecoderLayer with Bart->TimeSeriesTransformer
+class TimeSeriesTransformerDecoderLayer(nn.Module):
+    def __init__(self, config: TimeSeriesTransformerConfig):
         super().__init__()
         self.embed_dim = config.d_model
 
-        if config.attention_type == "prob":
-            self.self_attn = ProbSparseAttention(
-                embed_dim=self.embed_dim,
-                num_heads=config.encoder_attention_heads,
-                dropout=config.attention_dropout,
-                attention_factor=config.attention_factor,
-                is_decoder=True,
-            )
-        else:
-            self.self_attn = InformerAttention(
-                embed_dim=self.embed_dim,
-                num_heads=config.decoder_attention_heads,
-                dropout=config.attention_dropout,
-                is_decoder=True,
-            )
+        self.self_attn = TimeSeriesTransformerAttention(
+            embed_dim=self.embed_dim,
+            num_heads=config.decoder_attention_heads,
+            dropout=config.attention_dropout,
+            is_decoder=True,
+        )
         self.dropout = config.dropout
         self.activation_fn = ACT2FN[config.activation_function]
         self.activation_dropout = config.activation_dropout
 
         self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
-        self.encoder_attn = InformerAttention(
+        self.encoder_attn = TimeSeriesTransformerAttention(
             self.embed_dim,
             config.decoder_attention_heads,
             dropout=config.attention_dropout,
@@ -1129,16 +1278,15 @@ def forward(
         return outputs
 
 
-# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerPreTrainedModel with TimeSeriesTransformer->Informer
-class InformerPreTrainedModel(PreTrainedModel):
-    config_class = InformerConfig
+class TimeSeriesTransformerPreTrainedModel(PreTrainedModel):
+    config_class = TimeSeriesTransformerConfig
     base_model_prefix = "model"
     main_input_name = "past_values"
     supports_gradient_checkpointing = True
 
     def _init_weights(self, module):
         std = self.config.init_std
-        if isinstance(module, (nn.Linear, nn.Conv1d)):
+        if isinstance(module, nn.Linear):
             module.weight.data.normal_(mean=0.0, std=std)
             if module.bias is not None:
                 module.bias.data.zero_()
@@ -1148,11 +1296,11 @@ def _init_weights(self, module):
                 module.weight.data[module.padding_idx].zero_()
 
     def _set_gradient_checkpointing(self, module, value=False):
-        if isinstance(module, (InformerDecoder, InformerEncoder)):
+        if isinstance(module, (TimeSeriesTransformerDecoder, TimeSeriesTransformerEncoder)):
             module.gradient_checkpointing = value
 
 
-INFORMER_START_DOCSTRING = r"""
+TIME_SERIES_TRANSFORMER_START_DOCSTRING = r"""
     This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
     library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
     etc.)
@@ -1162,13 +1310,13 @@ def _set_gradient_checkpointing(self, module, value=False):
     and behavior.
 
     Parameters:
-        config ([`InformerConfig`]):
+        config ([`TimeSeriesTransformerConfig`]):
             Model configuration class with all the parameters of the model. Initializing with a config file does not
             load the weights associated with the model, only the configuration. Check out the
             [`~PreTrainedModel.from_pretrained`] method to load the model weights.
 """
 
-INFORMER_INPUTS_DOCSTRING = r"""
+TIME_SERIES_TRANSFORMER_INPUTS_DOCSTRING = r"""
     Args:
         past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
             Past values of the time series, that serve as context in order to predict the future. These values may
@@ -1191,7 +1339,7 @@ def _set_gradient_checkpointing(self, module, value=False):
             the position encodings are learned from scratch internally as parameters of the model, the Time Series
             Transformer requires to provide additional time features.
 
-            The Informer only learns additional embeddings for `static_categorical_features`.
+            The Time Series Transformer only learns additional embeddings for `static_categorical_features`.
 
         past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected in
@@ -1234,16 +1382,7 @@ def _set_gradient_checkpointing(self, module, value=False):
             the position encodings are learned from scratch internally as parameters of the model, the Time Series
             Transformer requires to provide additional features.
 
-            The Informer only learns additional embeddings for `static_categorical_features`.
-        
-        future_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Boolean mask to indicate which `future_values` were observed and which were missing. Mask values selected
-            in `[0, 1]`:
-
-            - 1 for values that are **observed**,
-            - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
-
-            This mask is used to filter out missing values for the final loss calculation.
+            The Time Series Transformer only learns additional embeddings for `static_categorical_features`.
 
         attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
             Mask to avoid performing attention on certain token indices. Mask values selected in `[0, 1]`:
@@ -1279,7 +1418,6 @@ def _set_gradient_checkpointing(self, module, value=False):
             Tuple consists of `last_hidden_state`, `hidden_states` (*optional*) and `attentions` (*optional*)
             `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` (*optional*) is a sequence of
             hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
-        
         past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
             Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
             `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
@@ -1291,7 +1429,6 @@ def _set_gradient_checkpointing(self, module, value=False):
             If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
             don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
             `decoder_input_ids` of shape `(batch_size, sequence_length)`.
-        
         inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
             Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
             is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
@@ -1311,34 +1448,27 @@ def _set_gradient_checkpointing(self, module, value=False):
 """
 
 
-# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerEncoder with TimeSeriesTransformer->Informer
-class InformerEncoder(InformerPreTrainedModel):
+class TimeSeriesTransformerEncoder(TimeSeriesTransformerPreTrainedModel):
     """
-    Informer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
-    [`InformerEncoderLayer`].
+    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
+    [`TimeSeriesTransformerEncoderLayer`].
 
     Args:
-        config: InformerConfig
+        config: TimeSeriesTransformerConfig
     """
 
-    def __init__(self, config: InformerConfig):
+    def __init__(self, config: TimeSeriesTransformerConfig):
         super().__init__(config)
 
         self.dropout = config.dropout
         self.layerdrop = config.encoder_layerdrop
-        self.gradient_checkpointing = False
 
         embed_dim = config.d_model
 
-        self.layers = nn.ModuleList([InformerEncoderLayer(config) for _ in range(config.encoder_layers)])
+        self.layers = nn.ModuleList([TimeSeriesTransformerEncoderLayer(config) for _ in range(config.encoder_layers)])
         self.layernorm_embedding = nn.LayerNorm(embed_dim)
 
-        if config.distil is not None:
-            self.conv_layers = nn.ModuleList([ConvLayer(config.d_model) for _ in range(config.encoder_layers - 1)])
-            self.conv_layers.append(None)
-        else:
-            self.conv_layers = [None] * config.encoder_layers
-
+        self.gradient_checkpointing = False
         # Initialize weights and apply final processing
         self.post_init()
 
@@ -1405,7 +1535,7 @@ def forward(
                     f" {head_mask.size()[0]}."
                 )
 
-        for idx, (encoder_layer, conv_layer) in enumerate(zip(self.layers, self.conv_layers)):
+        for idx, encoder_layer in enumerate(self.layers):
             if output_hidden_states:
                 encoder_states = encoder_states + (hidden_states,)
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
@@ -1427,7 +1557,6 @@ def custom_forward(*inputs):
                         attention_mask,
                         (head_mask[idx] if head_mask is not None else None),
                     )
-                    # TODO support for checkpointing conv_layers
                 else:
                     layer_outputs = encoder_layer(
                         hidden_states,
@@ -1435,8 +1564,6 @@ def custom_forward(*inputs):
                         layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                         output_attentions=output_attentions,
                     )
-                    if conv_layer is not None:
-                        hidden_states = conv_layer(hidden_states)
 
                 hidden_states = layer_outputs[0]
 
@@ -1453,22 +1580,21 @@ def custom_forward(*inputs):
         )
 
 
-# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerDecoder with TimeSeriesTransformer->Informer
-class InformerDecoder(InformerPreTrainedModel):
+class TimeSeriesTransformerDecoder(TimeSeriesTransformerPreTrainedModel):
     """
-    Informer decoder consisting of *config.decoder_layers* layers. Each layer is a [`InformerDecoderLayer`]
+    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a
+    [`TimeSeriesTransformerDecoderLayer`]
 
     Args:
-        config: InformerConfig
+        config: TimeSeriesTransformerConfig
     """
 
-    def __init__(self, config: InformerConfig):
+    def __init__(self, config: TimeSeriesTransformerConfig):
         super().__init__(config)
-
         self.dropout = config.dropout
         self.layerdrop = config.decoder_layerdrop
 
-        self.layers = nn.ModuleList([InformerDecoderLayer(config) for _ in range(config.decoder_layers)])
+        self.layers = nn.ModuleList([TimeSeriesTransformerDecoderLayer(config) for _ in range(config.decoder_layers)])
         self.layernorm_embedding = nn.LayerNorm(config.d_model)
 
         self.gradient_checkpointing = False
@@ -1691,12 +1817,11 @@ def custom_forward(*inputs):
 
 
 @add_start_docstrings(
-    "The bare Informer Model outputting raw hidden-states without any specific head on top.",
-    INFORMER_START_DOCSTRING,
+    "The bare Time Series Transformer Model outputting raw hidden-states without any specific head on top.",
+    TIME_SERIES_TRANSFORMER_START_DOCSTRING,
 )
-# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerModel with TimeSeriesTransformer->Informer,TIME_SERIES_TRANSFORMER->INFORMER
-class InformerModel(InformerPreTrainedModel):
-    def __init__(self, config: InformerConfig):
+class TimeSeriesTransformerModel(TimeSeriesTransformerPreTrainedModel):
+    def __init__(self, config: TimeSeriesTransformerConfig):
         super().__init__(config)
 
         if config.scaling:
@@ -1709,9 +1834,9 @@ def __init__(self, config: InformerConfig):
             embedding_dims=config.embedding_dimension,
         )
 
-        # Informer encoder-decoder and mask initializer
-        self.encoder = InformerEncoder(config)
-        self.decoder = InformerDecoder(config)
+        # transformer encoder-decoder and mask initializer
+        self.encoder = TimeSeriesTransformerEncoder(config)
+        self.decoder = TimeSeriesTransformerDecoder(config)
 
         # Initialize weights and apply final processing
         self.post_init()
@@ -1828,13 +1953,23 @@ def create_network_inputs(
 
         return transformer_inputs, scale, static_feat
 
+    def enc_dec_outputs(self, transformer_inputs):
+        enc_input = transformer_inputs[:, : self.config.context_length, ...]
+        dec_input = transformer_inputs[:, self.config.context_length :, ...]
+
+        encoder_outputs = self.encoder(inputs_embeds=enc_input)
+        decoder_outputs = self.decoder(
+            inputs_embeds=dec_input, encoder_hidden_states=encoder_outputs.last_hidden_state
+        )
+        return encoder_outputs, decoder_outputs
+
     def get_encoder(self):
         return self.encoder
 
     def get_decoder(self):
         return self.decoder
 
-    @add_start_docstrings_to_model_forward(INFORMER_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(TIME_SERIES_TRANSFORMER_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=Seq2SeqTimeSeriesModelOutput, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
@@ -1864,14 +1999,14 @@ def forward(
         ```python
         >>> from huggingface_hub import hf_hub_download
         >>> import torch
-        >>> from transformers import InformerModel
+        >>> from transformers import TimeSeriesTransformerModel
 
         >>> file = hf_hub_download(
         ...     repo_id="kashif/tourism-monthly-batch", filename="train-batch.pt", repo_type="dataset"
         ... )
         >>> batch = torch.load(file)
 
-        >>> model = InformerModel.from_pretrained("kashif/informer-tourism-monthly")
+        >>> model = TimeSeriesTransformerModel.from_pretrained("huggingface/time-series-transformer-tourism-monthly")
 
         >>> # during training, one provides both past and future values
         >>> # as well as possible additional features
@@ -1953,15 +2088,13 @@ def forward(
 
 
 @add_start_docstrings(
-    "The Informer Model with a distribution head on top for time-series forecasting.",
-    INFORMER_START_DOCSTRING,
+    "The Time Series Transformer Model with a distribution head on top for time-series forecasting.",
+    TIME_SERIES_TRANSFORMER_START_DOCSTRING,
 )
-# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerForPrediction with TimeSeriesTransformer->Informer,TIME_SERIES_TRANSFORMER->INFORMER
-class InformerForPrediction(InformerPreTrainedModel):
-    def __init__(self, config: InformerConfig):
+class TimeSeriesTransformerForPrediction(TimeSeriesTransformerPreTrainedModel):
+    def __init__(self, config: TimeSeriesTransformerConfig):
         super().__init__(config)
-
-        self.model = InformerModel(config)
+        self.model = TimeSeriesTransformerModel(config)
         if config.distribution_output == "student_t":
             self.distribution_output = StudentTOutput(dim=config.input_size)
         elif config.distribution_output == "normal":
@@ -1998,7 +2131,7 @@ def output_distribution(self, params, scale=None, trailing_n=None) -> torch.dist
             sliced_params = [p[:, -trailing_n:] for p in params]
         return self.distribution_output.distribution(sliced_params, scale=scale)
 
-    @add_start_docstrings_to_model_forward(INFORMER_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(TIME_SERIES_TRANSFORMER_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=Seq2SeqTimeSeriesModelOutput, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
@@ -2024,19 +2157,30 @@ def forward(
         r"""
         Returns:
 
+        future_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Boolean mask to indicate which `future_values` were observed and which were missing. Mask values selected
+            in `[0, 1]`:
+
+            - 1 for values that are **observed**,
+            - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
+
+            This mask is used to filter out missing values for the final loss calculation.
+
         Examples:
 
         ```python
         >>> from huggingface_hub import hf_hub_download
         >>> import torch
-        >>> from transformers import InformerForPrediction
+        >>> from transformers import TimeSeriesTransformerForPrediction
 
         >>> file = hf_hub_download(
         ...     repo_id="kashif/tourism-monthly-batch", filename="train-batch.pt", repo_type="dataset"
         ... )
         >>> batch = torch.load(file)
 
-        >>> model = InformerForPrediction.from_pretrained("kashif/informer-tourism-monthly")
+        >>> model = TimeSeriesTransformerForPrediction.from_pretrained(
+        ...     "huggingface/time-series-transformer-tourism-monthly"
+        ... )
 
         >>> # during training, one provides both past and future values
         >>> # as well as possible additional features
diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py
index b10e2ef7b7b7..f97d4d36ad6c 100644
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -3154,6 +3154,30 @@ def load_tf_weights_in_imagegpt(*args, **kwargs):
     requires_backends(load_tf_weights_in_imagegpt, ["torch"])
 
 
+INFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None
+
+
+class InformerForPrediction(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class InformerModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class InformerPreTrainedModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
 JUKEBOX_PRETRAINED_MODEL_ARCHIVE_LIST = None
 
 

From 35e4549774e5d841376c58d30d9973705c0ea8c8 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Thu, 16 Feb 2023 12:26:49 +0100
Subject: [PATCH 094/164] added to index

---
 docs/source/en/index.mdx | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx
index 5622765da172..5e4ab203c5a9 100644
--- a/docs/source/en/index.mdx
+++ b/docs/source/en/index.mdx
@@ -122,6 +122,7 @@ The documentation is organized into five sections:
 1. **[Hubert](model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
 1. **[I-BERT](model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
 1. **[ImageGPT](model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
+1. **[Informer](model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
 1. **[Jukebox](model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever.
 1. **[LayoutLM](model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
 1. **[LayoutLMv2](model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
@@ -305,6 +306,7 @@ Flax), PyTorch, and/or TensorFlow.
 |            Hubert             |       ❌       |       ❌       |       ✅        |         ✅         |      ❌      |
 |            I-BERT             |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 |           ImageGPT            |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
+|           Informer            |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 |            Jukebox            |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
 |           LayoutLM            |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 |          LayoutLMv2           |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |

From 31f8bd382273ec48e9a4d718bf2716e995270164 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Thu, 16 Feb 2023 12:31:57 +0100
Subject: [PATCH 095/164] updated readme

---
 README.md         | 1 +
 README_es.md      | 1 +
 README_hd.md      | 1 +
 README_ja.md      | 1 +
 README_ko.md      | 1 +
 README_zh-hans.md | 1 +
 README_zh-hant.md | 1 +
 7 files changed, 7 insertions(+)

diff --git a/README.md b/README.md
index c7d864be9225..98f76e3ac38b 100644
--- a/README.md
+++ b/README.md
@@ -343,6 +343,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h
 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
+1. **[Informer](https://huggingface.co/docs/transformers/main/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
 1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever.
 1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
 1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
diff --git a/README_es.md b/README_es.md
index 4a35888cf17c..ca2502b991cb 100644
--- a/README_es.md
+++ b/README_es.md
@@ -336,6 +336,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt
 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
+1. **[Informer](https://huggingface.co/docs/transformers/main/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
 1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever.
 1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
 1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
diff --git a/README_hd.md b/README_hd.md
index 48c2a486feda..0aa6f208f8fd 100644
--- a/README_hd.md
+++ b/README_hd.md
@@ -308,6 +308,7 @@ conda install -c huggingface transformers
 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (फेसबुक से) साथ में पेपर [ह्यूबर्ट: सेल्फ सुपरवाइज्ड स्पीच रिप्रेजेंटेशन लर्निंग बाय मास्क्ड प्रेडिक्शन ऑफ हिडन यूनिट्स](https ://arxiv.org/abs/2106.07447) वेई-निंग सू, बेंजामिन बोल्टे, याओ-हंग ह्यूबर्ट त्साई, कुशाल लखोटिया, रुस्लान सालाखुतदीनोव, अब्देलरहमान मोहम्मद द्वारा।
 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (बर्कले से) साथ में कागज [I-BERT: Integer-only BERT Quantization](https:// arxiv.org/abs/2101.01321) सेहून किम, अमीर घोलमी, ज़ेवेई याओ, माइकल डब्ल्यू महोनी, कर्ट केटज़र द्वारा।
 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
+1. **[Informer](https://huggingface.co/docs/transformers/main/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
 1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever.
 1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
 1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
diff --git a/README_ja.md b/README_ja.md
index 5d12a0a887fc..613a2020f5df 100644
--- a/README_ja.md
+++ b/README_ja.md
@@ -370,6 +370,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ
 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (Facebook から) Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed から公開された研究論文: [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447)
 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (Berkeley から) Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer から公開された研究論文: [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321)
 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (OpenAI から) Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever から公開された研究論文: [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/)
+1. **[Informer](https://huggingface.co/docs/transformers/main/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
 1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (OpenAI から) Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever から公開された研究論文: [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf)
 1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (Microsoft Research Asia から) Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou から公開された研究論文: [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318)
 1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (Microsoft Research Asia から) Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou から公開された研究論文: [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740)
diff --git a/README_ko.md b/README_ko.md
index db602db08629..7fa58b7638fe 100644
--- a/README_ko.md
+++ b/README_ko.md
@@ -285,6 +285,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는
 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (Facebook 에서) Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed 의 [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 논문과 함께 발표했습니다.
 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (Berkeley 에서) Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer 의 [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) 논문과 함께 발표했습니다.
 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (OpenAI 에서) Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever 의 [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) 논문과 함께 발표했습니다.
+1. **[Informer](https://huggingface.co/docs/transformers/main/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
 1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (OpenAI 에서) Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever 의 [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) 논문과 함께 발표했습니다.
 1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (Microsoft Research Asia 에서) Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou 의 [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) 논문과 함께 발표했습니다.
 1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (Microsoft Research Asia 에서) Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou 의 [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) 논문과 함께 발표했습니다.
diff --git a/README_zh-hans.md b/README_zh-hans.md
index 3e7b904f00c2..b639710fed99 100644
--- a/README_zh-hans.md
+++ b/README_zh-hans.md
@@ -309,6 +309,7 @@ conda install -c huggingface transformers
 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (来自 Facebook) 伴随论文 [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 由 Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed 发布。
 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (来自 Berkeley) 伴随论文 [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) 由 Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer 发布。
 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (来自 OpenAI) 伴随论文 [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) 由 Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever 发布。
+1. **[Informer](https://huggingface.co/docs/transformers/main/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
 1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever.
 1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (来自 Microsoft Research Asia) 伴随论文 [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) 由 Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou 发布。
 1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (来自 Microsoft Research Asia) 伴随论文 [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) 由 Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou 发布。
diff --git a/README_zh-hant.md b/README_zh-hant.md
index 5eecef72f1ce..75802ed6e12a 100644
--- a/README_zh-hant.md
+++ b/README_zh-hant.md
@@ -321,6 +321,7 @@ conda install -c huggingface transformers
 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
+1. **[Informer](https://huggingface.co/docs/transformers/main/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
 1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever.
 1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
 1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.

From c26ba0e9661da865f2cedf9698debb83174ac3d3 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Thu, 16 Feb 2023 12:50:34 +0100
Subject: [PATCH 096/164] undo

---
 .../models/informer/modeling_informer.py      | 816 ++++++++----------
 1 file changed, 336 insertions(+), 480 deletions(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 4d6c24ea3748..dc0b68a7ea4a 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -231,7 +231,7 @@ def distribution(
         return self._base_distribution((total_count, logits))
 
 
-# FeatureEmbedder, MeanScaler and NOPScaler are from GlounTS (see the exact source below)
+# Eli: FeatureEmbedder, MeanScaler and NOPScaler are from GlounTS (see the exact source below)
 # source: https://github.com/awslabs/gluonts/blob/dev/src/gluonts/torch/modules/feature.py
 class FeatureEmbedder(nn.Module):
     def __init__(self, cardinalities: List[int], embedding_dims: List[int]) -> None:
@@ -399,402 +399,6 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int]
 
 
 # Copied from transformers.models.time_series_transformer.modeling_time_series_transformer
-# coding=utf-8
-# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
-# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" PyTorch Time Series Transformer model."""
-
-import random
-from dataclasses import dataclass
-from typing import Callable, Dict, List, Optional, Tuple, Union
-
-import torch
-from torch import nn
-from torch.distributions import (
-    AffineTransform,
-    Distribution,
-    Independent,
-    NegativeBinomial,
-    Normal,
-    StudentT,
-    TransformedDistribution,
-)
-
-from ...activations import ACT2FN
-from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions, ModelOutput
-from ...modeling_utils import PreTrainedModel
-from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
-from .configuration_time_series_transformer import TimeSeriesTransformerConfig
-
-
-logger = logging.get_logger(__name__)
-
-_CONFIG_FOR_DOC = "TimeSeriesTransformerConfig"
-
-
-TIME_SERIES_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
-    "huggingface/time-series-transformer-tourism-monthly",
-    # See all TimeSeriesTransformer models at https://huggingface.co/models?filter=time_series_transformer
-]
-
-
-class AffineTransformed(TransformedDistribution):
-    def __init__(self, base_distribution: Distribution, loc=None, scale=None, event_dim=0):
-        self.scale = 1.0 if scale is None else scale
-        self.loc = 0.0 if loc is None else loc
-
-        super().__init__(base_distribution, [AffineTransform(loc=self.loc, scale=self.scale, event_dim=event_dim)])
-
-    @property
-    def mean(self):
-        """
-        Returns the mean of the distribution.
-        """
-        return self.base_dist.mean * self.scale + self.loc
-
-    @property
-    def variance(self):
-        """
-        Returns the variance of the distribution.
-        """
-        return self.base_dist.variance * self.scale**2
-
-    @property
-    def stddev(self):
-        """
-        Returns the standard deviation of the distribution.
-        """
-        return self.variance.sqrt()
-
-
-class ParameterProjection(nn.Module):
-    def __init__(
-        self, in_features: int, args_dim: Dict[str, int], domain_map: Callable[..., Tuple[torch.Tensor]], **kwargs
-    ) -> None:
-        super().__init__(**kwargs)
-        self.args_dim = args_dim
-        self.proj = nn.ModuleList([nn.Linear(in_features, dim) for dim in args_dim.values()])
-        self.domain_map = domain_map
-
-    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor]:
-        params_unbounded = [proj(x) for proj in self.proj]
-
-        return self.domain_map(*params_unbounded)
-
-
-class LambdaLayer(nn.Module):
-    def __init__(self, function):
-        super().__init__()
-        self.function = function
-
-    def forward(self, x, *args):
-        return self.function(x, *args)
-
-
-class DistributionOutput:
-    distribution_class: type
-    in_features: int
-    args_dim: Dict[str, int]
-
-    def __init__(self, dim: int = 1) -> None:
-        self.dim = dim
-        self.args_dim = {k: dim * self.args_dim[k] for k in self.args_dim}
-
-    def _base_distribution(self, distr_args):
-        if self.dim == 1:
-            return self.distribution_class(*distr_args)
-        else:
-            return Independent(self.distribution_class(*distr_args), 1)
-
-    def distribution(
-        self,
-        distr_args,
-        loc: Optional[torch.Tensor] = None,
-        scale: Optional[torch.Tensor] = None,
-    ) -> Distribution:
-        distr = self._base_distribution(distr_args)
-        if loc is None and scale is None:
-            return distr
-        else:
-            return AffineTransformed(distr, loc=loc, scale=scale, event_dim=self.event_dim)
-
-    @property
-    def event_shape(self) -> Tuple:
-        r"""
-        Shape of each individual event contemplated by the distributions that this object constructs.
-        """
-        return () if self.dim == 1 else (self.dim,)
-
-    @property
-    def event_dim(self) -> int:
-        r"""
-        Number of event dimensions, i.e., length of the `event_shape` tuple, of the distributions that this object
-        constructs.
-        """
-        return len(self.event_shape)
-
-    @property
-    def value_in_support(self) -> float:
-        r"""
-        A float that will have a valid numeric value when computing the log-loss of the corresponding distribution. By
-        default 0.0. This value will be used when padding data series.
-        """
-        return 0.0
-
-    def get_parameter_projection(self, in_features: int) -> nn.Module:
-        r"""
-        Return the parameter projection layer that maps the input to the appropriate parameters of the distribution.
-        """
-        return ParameterProjection(
-            in_features=in_features,
-            args_dim=self.args_dim,
-            domain_map=LambdaLayer(self.domain_map),
-        )
-
-    def domain_map(self, *args: torch.Tensor):
-        r"""
-        Converts arguments to the right shape and domain. The domain depends on the type of distribution, while the
-        correct shape is obtained by reshaping the trailing axis in such a way that the returned tensors define a
-        distribution of the right event_shape.
-        """
-        raise NotImplementedError()
-
-    @classmethod
-    def squareplus(cls, x: torch.Tensor) -> torch.Tensor:
-        r"""
-        Helper to map inputs to the positive orthant by applying the square-plus operation. Reference:
-        https://twitter.com/jon_barron/status/1387167648669048833
-        """
-        return (x + torch.sqrt(torch.square(x) + 4.0)) / 2.0
-
-
-class StudentTOutput(DistributionOutput):
-    args_dim: Dict[str, int] = {"df": 1, "loc": 1, "scale": 1}
-    distribution_class: type = StudentT
-
-    @classmethod
-    def domain_map(cls, df: torch.Tensor, loc: torch.Tensor, scale: torch.Tensor):
-        scale = cls.squareplus(scale)
-        df = 2.0 + cls.squareplus(df)
-        return df.squeeze(-1), loc.squeeze(-1), scale.squeeze(-1)
-
-
-class NormalOutput(DistributionOutput):
-    args_dim: Dict[str, int] = {"loc": 1, "scale": 1}
-    distribution_class: type = Normal
-
-    @classmethod
-    def domain_map(cls, loc: torch.Tensor, scale: torch.Tensor):
-        scale = cls.squareplus(scale)
-        return loc.squeeze(-1), scale.squeeze(-1)
-
-
-class NegativeBinomialOutput(DistributionOutput):
-    args_dim: Dict[str, int] = {"total_count": 1, "logits": 1}
-    distribution_class: type = NegativeBinomial
-
-    @classmethod
-    def domain_map(cls, total_count: torch.Tensor, logits: torch.Tensor):
-        total_count = cls.squareplus(total_count)
-        return total_count.squeeze(-1), logits.squeeze(-1)
-
-    def _base_distribution(self, distr_args) -> Distribution:
-        total_count, logits = distr_args
-        if self.dim == 1:
-            return self.distribution_class(total_count=total_count, logits=logits)
-        else:
-            return Independent(self.distribution_class(total_count=total_count, logits=logits), 1)
-
-    # Overwrites the parent class method. We cannot scale using the affine
-    # transformation since negative binomial should return integers. Instead
-    # we scale the parameters.
-    def distribution(
-        self, distr_args, loc: Optional[torch.Tensor] = None, scale: Optional[torch.Tensor] = None
-    ) -> Distribution:
-        total_count, logits = distr_args
-
-        if scale is not None:
-            # See scaling property of Gamma.
-            logits += scale.log()
-
-        return self._base_distribution((total_count, logits))
-
-
-class FeatureEmbedder(nn.Module):
-    def __init__(self, cardinalities: List[int], embedding_dims: List[int]) -> None:
-        super().__init__()
-
-        self.num_features = len(cardinalities)
-        self.embedders = nn.ModuleList([nn.Embedding(c, d) for c, d in zip(cardinalities, embedding_dims)])
-
-    def forward(self, features: torch.Tensor) -> torch.Tensor:
-        if self.num_features > 1:
-            # we slice the last dimension, giving an array of length
-            # self.num_features with shape (N,T) or (N)
-            cat_feature_slices = torch.chunk(features, self.num_features, dim=-1)
-        else:
-            cat_feature_slices = [features]
-
-        return torch.cat(
-            [
-                embed(cat_feature_slice.squeeze(-1))
-                for embed, cat_feature_slice in zip(self.embedders, cat_feature_slices)
-            ],
-            dim=-1,
-        )
-
-
-class MeanScaler(nn.Module):
-    """
-    Computes a scaling factor as the weighted average absolute value along dimension `dim`, and scales the data
-    accordingly.
-
-    Args:
-        dim (`int`):
-            Dimension along which to compute the scale.
-        keepdim (`bool`, *optional*, defaults to `False`):
-            Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it.
-        minimum_scale (`float`, *optional*, defaults to 1e-10):
-            Default scale that is used for elements that are constantly zero along dimension `dim`.
-    """
-
-    def __init__(self, dim: int, keepdim: bool = False, minimum_scale: float = 1e-10):
-        super().__init__()
-        if not dim > 0:
-            raise ValueError("Cannot compute scale along dim = 0 (batch dimension), please provide dim > 0")
-        self.dim = dim
-        self.keepdim = keepdim
-        self.register_buffer("minimum_scale", torch.tensor(minimum_scale))
-
-    def forward(self, data: torch.Tensor, weights: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
-        # these will have shape (N, C)
-        total_weight = weights.sum(dim=self.dim)
-        weighted_sum = (data.abs() * weights).sum(dim=self.dim)
-
-        # first compute a global scale per-dimension
-        total_observed = total_weight.sum(dim=0)
-        denominator = torch.max(total_observed, torch.ones_like(total_observed))
-        default_scale = weighted_sum.sum(dim=0) / denominator
-
-        # then compute a per-item, per-dimension scale
-        denominator = torch.max(total_weight, torch.ones_like(total_weight))
-        scale = weighted_sum / denominator
-
-        # use per-batch scale when no element is observed
-        # or when the sequence contains only zeros
-        scale = (
-            torch.max(
-                self.minimum_scale,
-                torch.where(
-                    weighted_sum > torch.zeros_like(weighted_sum),
-                    scale,
-                    default_scale * torch.ones_like(total_weight),
-                ),
-            )
-            .detach()
-            .unsqueeze(dim=self.dim)
-        )
-
-        return data / scale, scale if self.keepdim else scale.squeeze(dim=self.dim)
-
-
-class NOPScaler(nn.Module):
-    """
-    Assigns a scaling factor equal to 1 along dimension `dim`, and therefore applies no scaling to the input data.
-
-    Args:
-        dim (`int`):
-            Dimension along which to compute the scale.
-        keepdim (`bool`, *optional*, defaults to `False`):
-            Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it.
-    """
-
-    def __init__(self, dim: int, keepdim: bool = False):
-        super().__init__()
-        self.dim = dim
-        self.keepdim = keepdim
-
-    def forward(self, data: torch.Tensor, observed_indicator: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
-        scale = torch.ones_like(data).mean(dim=self.dim, keepdim=self.keepdim)
-        return data, scale
-
-
-def weighted_average(input_tensor: torch.Tensor, weights: Optional[torch.Tensor] = None, dim=None) -> torch.Tensor:
-    """
-    Computes the weighted average of a given tensor across a given `dim`, masking values associated with weight zero,
-    meaning instead of `nan * 0 = nan` you will get `0 * 0 = 0`.
-
-    Args:
-        input_tensor (`torch.FloatTensor`):
-            Input tensor, of which the average must be computed.
-        weights (`torch.FloatTensor`, *optional*):
-            Weights tensor, of the same shape as `input_tensor`.
-        dim (`int`, *optional*):
-            The dim along which to average `input_tensor`.
-
-    Returns:
-        `torch.FloatTensor`: The tensor with values averaged along the specified `dim`.
-    """
-    if weights is not None:
-        weighted_tensor = torch.where(weights != 0, input_tensor * weights, torch.zeros_like(input_tensor))
-        sum_weights = torch.clamp(weights.sum(dim=dim) if dim else weights.sum(), min=1.0)
-        return (weighted_tensor.sum(dim=dim) if dim else weighted_tensor.sum()) / sum_weights
-    else:
-        return input_tensor.mean(dim=dim)
-
-
-class NegativeLogLikelihood:
-    """
-    Computes the negative log likelihood loss from input distribution with respect to target.
-    """
-
-    def __call__(self, input: torch.distributions.Distribution, target: torch.Tensor) -> torch.Tensor:
-        return -input.log_prob(target)
-
-
-# Copied from transformers.models.bart.modeling_bart._make_causal_mask
-def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_values_length: int = 0):
-    """
-    Make causal mask used for bi-directional self-attention.
-    """
-    bsz, tgt_len = input_ids_shape
-    mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min))
-    mask_cond = torch.arange(mask.size(-1))
-    mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
-    mask = mask.to(dtype)
-
-    if past_key_values_length > 0:
-        mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype), mask], dim=-1)
-    return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
-
-
-# Copied from transformers.models.bart.modeling_bart._expand_mask
-def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
-    """
-    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
-    """
-    bsz, src_len = mask.size()
-    tgt_len = tgt_len if tgt_len is not None else src_len
-
-    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
-
-    inverted_mask = 1.0 - expanded_mask
-
-    return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
-
-
 @dataclass
 class Seq2SeqTimeSeriesModelOutput(ModelOutput):
     """
@@ -863,6 +467,7 @@ class Seq2SeqTimeSeriesModelOutput(ModelOutput):
     static_features: Optional[torch.FloatTensor] = None
 
 
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer
 @dataclass
 class Seq2SeqTimeSeriesPredictionOutput(ModelOutput):
     """
@@ -931,13 +536,14 @@ class Seq2SeqTimeSeriesPredictionOutput(ModelOutput):
     static_features: Optional[torch.FloatTensor] = None
 
 
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer
 @dataclass
 class SampleTimeSeriesPredictionOutput(ModelOutput):
     sequences: torch.FloatTensor = None
 
 
-# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->TimeSeriesTransformer
-class TimeSeriesTransformerAttention(nn.Module):
+# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->Informer
+class InformerAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 
     def __init__(
@@ -1083,7 +689,7 @@ def forward(
         attn_output = attn_output.transpose(1, 2)
 
         # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
-        # partitioned aross GPUs when using tensor-parallelism.
+        # partitioned across GPUs when using tensor-parallelism.
         attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
 
         attn_output = self.out_proj(attn_output)
@@ -1091,16 +697,253 @@ def forward(
         return attn_output, attn_weights_reshaped, past_key_value
 
 
-# Copied from transformers.models.bart.modeling_bart.BartEncoderLayer with Bart->TimeSeriesTransformer
-class TimeSeriesTransformerEncoderLayer(nn.Module):
-    def __init__(self, config: TimeSeriesTransformerConfig):
+class ProbSparseAttention(nn.Module):
+    """ProbSparse Attention"""
+
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        dropout: float = 0.0,
+        is_decoder: bool = False,
+        attention_factor: int = 5,
+        bias: bool = True,
+    ):
         super().__init__()
-        self.embed_dim = config.d_model
-        self.self_attn = TimeSeriesTransformerAttention(
-            embed_dim=self.embed_dim,
-            num_heads=config.encoder_attention_heads,
-            dropout=config.attention_dropout,
+        self.factor = attention_factor
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.head_dim = embed_dim // num_heads
+
+        if (self.head_dim * num_heads) != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
+                f" and `num_heads`: {num_heads})."
+            )
+        self.scaling = self.head_dim**-0.5
+        self.is_decoder = is_decoder
+
+        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+
+    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        key_value_states: Optional[torch.Tensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        layer_head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """Input shape: Batch x Time x Channel"""
+
+        # if key_value_states are provided this layer is used as a cross-attention layer
+        # for the decoder
+        is_cross_attention = key_value_states is not None
+
+        bsz, tgt_len, _ = hidden_states.size()
+
+        # get query proj
+        query_states = self.q_proj(hidden_states) * self.scaling
+        # get key, value proj
+        # `past_key_value[0].shape[2] == key_value_states.shape[1]`
+        # is checking that the `sequence_length` of the `past_key_value` is the same as
+        # the provided `key_value_states` to support prefix tuning
+        if (
+            is_cross_attention
+            and past_key_value is not None
+            and past_key_value[0].shape[2] == key_value_states.shape[1]
+        ):
+            # reuse k,v, cross_attentions
+            key_states = past_key_value[0]
+            value_states = past_key_value[1]
+        elif is_cross_attention:
+            # cross_attentions
+            key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
+            value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
+        elif past_key_value is not None:
+            # reuse k, v, self_attention
+            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+            key_states = torch.cat([past_key_value[0], key_states], dim=2)
+            value_states = torch.cat([past_key_value[1], value_states], dim=2)
+        else:
+            # self_attention
+            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+
+        if self.is_decoder:
+            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
+            # Further calls to cross_attention layer can then reuse all cross-attention
+            # key/value_states (first "if" case)
+            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
+            # all previous decoder key/value_states. Further calls to uni-directional self-attention
+            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+            # if encoder bi-directional self-attention `past_key_value` is always `None`
+            past_key_value = (key_states, value_states)
+
+        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
+        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
+        key_states = key_states.view(*proj_shape)
+        value_states = value_states.view(*proj_shape)
+
+        # c*ln(L_k)
+        L_K = key_states.size(1)
+        U_part = min(self.factor * np.ceil(np.log1p(L_K)).astype("int").item(), L_K)
+
+        # c*ln(L_q)
+        L_Q = query_states.size(1)
+        u = min(self.factor * np.ceil(np.log1p(L_Q)).astype("int").item(), L_Q)
+
+        if L_K > 0:
+            index_sample = torch.randint(0, L_K, (U_part,))  # torch.Size([14])
+
+            # real U = U_part(factor*ln(L_k))*L_q
+            K_sample = key_states[:, index_sample, :]  # torch.Size([52, 14, 4])
+        else:
+            K_sample = key_states
+        Q_K_sample = torch.bmm(query_states, K_sample.transpose(1, 2))
+        # torch.Size([52, 14, 4]) x torch.Size([52, 4, 14])
+
+        # find the Top_k query with sparisty measurement
+        if u > 0:
+            M = Q_K_sample.max(dim=-1)[0] - torch.div(Q_K_sample.sum(dim=-1), L_K)
+            M_top = M.topk(u, sorted=False)[1]
+
+            # use the reduced Q to calculate Q_K
+            # factor*ln(L_q)
+            dim_for_slice = torch.arange(query_states.size(0)).unsqueeze(-1)
+            Q_reduce = query_states[dim_for_slice, M_top]
+        else:
+            Q_reduce = query_states
+            M_top = None
+
+        # score_top
+        attn_weights = torch.bmm(Q_reduce, key_states.transpose(1, 2))
+
+        src_len = key_states.size(1)
+        if attn_weights.size() != (bsz * self.num_heads, u, src_len):
+            raise ValueError(
+                f"Attention weights should be of size {(bsz * self.num_heads, u, src_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
+                )
+            prob_mask = attention_mask.expand(bsz, self.num_heads, tgt_len, src_len).reshape(
+                bsz * self.num_heads, tgt_len, src_len
+            )
+
+            if M_top is not None:
+                dim_for_slice = torch.arange(prob_mask.size(0)).unsqueeze(-1)
+                prob_mask = prob_mask[dim_for_slice, M_top, :]
+
+            attn_weights = attn_weights.view(bsz, self.num_heads, u, src_len) + prob_mask.view(
+                bsz, self.num_heads, u, src_len
+            )
+            attn_weights = attn_weights.view(bsz * self.num_heads, u, src_len)
+
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+        if layer_head_mask is not None:
+            if layer_head_mask.size() != (self.num_heads,):
+                raise ValueError(
+                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
+                    f" {layer_head_mask.size()}"
+                )
+            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, u, src_len)
+            attn_weights = attn_weights.view(bsz * self.num_heads, u, src_len)
+
+        if output_attentions:
+            # this operation is a bit awkward, but it's required to
+            # make sure that attn_weights keeps its gradient.
+            # In order to do so, attn_weights have to be reshaped
+            # twice and have to be reused in the following
+            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, u, src_len)
+            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, u, src_len)
+        else:
+            attn_weights_reshaped = None
+
+        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+
+        # get initial context
+        context = value_states.cumsum(dim=-2)
+        attn_output = torch.bmm(attn_probs, value_states)
+
+        if M_top is not None:
+            # update context: copy the attention output to the context at M_top index
+            dim_for_slice = torch.arange(context.size(0)).unsqueeze(-1)
+            context[dim_for_slice, M_top, :] = attn_output
+            attn_output = context
+
+        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
+        attn_output = attn_output.transpose(1, 2)
+
+        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
+        # partitioned across GPUs when using tensor-parallelism.
+        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
+
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output, attn_weights_reshaped, past_key_value
+
+
+# source: https://github.com/zhouhaoyi/Informer2020/blob/main/models/encoder.py
+class ConvLayer(nn.Module):
+    def __init__(self, c_in):
+        super(ConvLayer, self).__init__()
+        self.downConv = nn.Conv1d(
+            in_channels=c_in,
+            out_channels=c_in,
+            kernel_size=3,
+            padding=1,
+            padding_mode="circular",
         )
+        self.norm = nn.BatchNorm1d(c_in)
+        self.activation = nn.ELU()
+        self.maxPool = nn.MaxPool1d(kernel_size=3, stride=2, padding=1)
+
+    def forward(self, x):
+        x = self.downConv(x.permute(0, 2, 1))
+        x = self.norm(x)
+        x = self.activation(x)
+        x = self.maxPool(x)
+        x = x.transpose(1, 2)
+        return x
+
+
+class InformerEncoderLayer(nn.Module):
+    def __init__(self, config: InformerConfig):
+        super().__init__()
+        self.embed_dim = config.d_model
+        if config.attention_type == "prob":
+            self.self_attn = ProbSparseAttention(
+                embed_dim=self.embed_dim,
+                num_heads=config.encoder_attention_heads,
+                dropout=config.attention_dropout,
+                attention_factor=config.attention_factor,
+            )
+        else:
+            self.self_attn = InformerAttention(
+                embed_dim=self.embed_dim,
+                num_heads=config.encoder_attention_heads,
+                dropout=config.attention_dropout,
+            )
         self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
         self.dropout = config.dropout
         self.activation_fn = ACT2FN[config.activation_function]
@@ -1160,24 +1003,32 @@ def forward(
         return outputs
 
 
-# Copied from transformers.models.bart.modeling_bart.BartDecoderLayer with Bart->TimeSeriesTransformer
-class TimeSeriesTransformerDecoderLayer(nn.Module):
-    def __init__(self, config: TimeSeriesTransformerConfig):
+class InformerDecoderLayer(nn.Module):
+    def __init__(self, config: InformerConfig):
         super().__init__()
         self.embed_dim = config.d_model
 
-        self.self_attn = TimeSeriesTransformerAttention(
-            embed_dim=self.embed_dim,
-            num_heads=config.decoder_attention_heads,
-            dropout=config.attention_dropout,
-            is_decoder=True,
-        )
+        if config.attention_type == "prob":
+            self.self_attn = ProbSparseAttention(
+                embed_dim=self.embed_dim,
+                num_heads=config.encoder_attention_heads,
+                dropout=config.attention_dropout,
+                attention_factor=config.attention_factor,
+                is_decoder=True,
+            )
+        else:
+            self.self_attn = InformerAttention(
+                embed_dim=self.embed_dim,
+                num_heads=config.decoder_attention_heads,
+                dropout=config.attention_dropout,
+                is_decoder=True,
+            )
         self.dropout = config.dropout
         self.activation_fn = ACT2FN[config.activation_function]
         self.activation_dropout = config.activation_dropout
 
         self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
-        self.encoder_attn = TimeSeriesTransformerAttention(
+        self.encoder_attn = InformerAttention(
             self.embed_dim,
             config.decoder_attention_heads,
             dropout=config.attention_dropout,
@@ -1278,15 +1129,16 @@ def forward(
         return outputs
 
 
-class TimeSeriesTransformerPreTrainedModel(PreTrainedModel):
-    config_class = TimeSeriesTransformerConfig
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerPreTrainedModel with TimeSeriesTransformer->Informer
+class InformerPreTrainedModel(PreTrainedModel):
+    config_class = InformerConfig
     base_model_prefix = "model"
     main_input_name = "past_values"
     supports_gradient_checkpointing = True
 
     def _init_weights(self, module):
         std = self.config.init_std
-        if isinstance(module, nn.Linear):
+        if isinstance(module, (nn.Linear, nn.Conv1d)):
             module.weight.data.normal_(mean=0.0, std=std)
             if module.bias is not None:
                 module.bias.data.zero_()
@@ -1296,11 +1148,11 @@ def _init_weights(self, module):
                 module.weight.data[module.padding_idx].zero_()
 
     def _set_gradient_checkpointing(self, module, value=False):
-        if isinstance(module, (TimeSeriesTransformerDecoder, TimeSeriesTransformerEncoder)):
+        if isinstance(module, (InformerDecoder, InformerEncoder)):
             module.gradient_checkpointing = value
 
 
-TIME_SERIES_TRANSFORMER_START_DOCSTRING = r"""
+INFORMER_START_DOCSTRING = r"""
     This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
     library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
     etc.)
@@ -1310,13 +1162,13 @@ def _set_gradient_checkpointing(self, module, value=False):
     and behavior.
 
     Parameters:
-        config ([`TimeSeriesTransformerConfig`]):
+        config ([`InformerConfig`]):
             Model configuration class with all the parameters of the model. Initializing with a config file does not
             load the weights associated with the model, only the configuration. Check out the
             [`~PreTrainedModel.from_pretrained`] method to load the model weights.
 """
 
-TIME_SERIES_TRANSFORMER_INPUTS_DOCSTRING = r"""
+INFORMER_INPUTS_DOCSTRING = r"""
     Args:
         past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
             Past values of the time series, that serve as context in order to predict the future. These values may
@@ -1339,7 +1191,7 @@ def _set_gradient_checkpointing(self, module, value=False):
             the position encodings are learned from scratch internally as parameters of the model, the Time Series
             Transformer requires to provide additional time features.
 
-            The Time Series Transformer only learns additional embeddings for `static_categorical_features`.
+            The Informer only learns additional embeddings for `static_categorical_features`.
 
         past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected in
@@ -1382,7 +1234,16 @@ def _set_gradient_checkpointing(self, module, value=False):
             the position encodings are learned from scratch internally as parameters of the model, the Time Series
             Transformer requires to provide additional features.
 
-            The Time Series Transformer only learns additional embeddings for `static_categorical_features`.
+            The Informer only learns additional embeddings for `static_categorical_features`.
+        
+        future_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Boolean mask to indicate which `future_values` were observed and which were missing. Mask values selected
+            in `[0, 1]`:
+
+            - 1 for values that are **observed**,
+            - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
+
+            This mask is used to filter out missing values for the final loss calculation.
 
         attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
             Mask to avoid performing attention on certain token indices. Mask values selected in `[0, 1]`:
@@ -1418,6 +1279,7 @@ def _set_gradient_checkpointing(self, module, value=False):
             Tuple consists of `last_hidden_state`, `hidden_states` (*optional*) and `attentions` (*optional*)
             `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` (*optional*) is a sequence of
             hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
+        
         past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
             Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
             `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
@@ -1429,6 +1291,7 @@ def _set_gradient_checkpointing(self, module, value=False):
             If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
             don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
             `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        
         inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
             Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
             is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
@@ -1448,27 +1311,34 @@ def _set_gradient_checkpointing(self, module, value=False):
 """
 
 
-class TimeSeriesTransformerEncoder(TimeSeriesTransformerPreTrainedModel):
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerEncoder with TimeSeriesTransformer->Informer
+class InformerEncoder(InformerPreTrainedModel):
     """
-    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
-    [`TimeSeriesTransformerEncoderLayer`].
+    Informer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
+    [`InformerEncoderLayer`].
 
     Args:
-        config: TimeSeriesTransformerConfig
+        config: InformerConfig
     """
 
-    def __init__(self, config: TimeSeriesTransformerConfig):
+    def __init__(self, config: InformerConfig):
         super().__init__(config)
 
         self.dropout = config.dropout
         self.layerdrop = config.encoder_layerdrop
+        self.gradient_checkpointing = False
 
         embed_dim = config.d_model
 
-        self.layers = nn.ModuleList([TimeSeriesTransformerEncoderLayer(config) for _ in range(config.encoder_layers)])
+        self.layers = nn.ModuleList([InformerEncoderLayer(config) for _ in range(config.encoder_layers)])
         self.layernorm_embedding = nn.LayerNorm(embed_dim)
 
-        self.gradient_checkpointing = False
+        if config.distil is not None:
+            self.conv_layers = nn.ModuleList([ConvLayer(config.d_model) for _ in range(config.encoder_layers - 1)])
+            self.conv_layers.append(None)
+        else:
+            self.conv_layers = [None] * config.encoder_layers
+
         # Initialize weights and apply final processing
         self.post_init()
 
@@ -1535,7 +1405,7 @@ def forward(
                     f" {head_mask.size()[0]}."
                 )
 
-        for idx, encoder_layer in enumerate(self.layers):
+        for idx, (encoder_layer, conv_layer) in enumerate(zip(self.layers, self.conv_layers)):
             if output_hidden_states:
                 encoder_states = encoder_states + (hidden_states,)
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
@@ -1557,6 +1427,7 @@ def custom_forward(*inputs):
                         attention_mask,
                         (head_mask[idx] if head_mask is not None else None),
                     )
+                    # TODO support for checkpointing conv_layers
                 else:
                     layer_outputs = encoder_layer(
                         hidden_states,
@@ -1564,6 +1435,8 @@ def custom_forward(*inputs):
                         layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                         output_attentions=output_attentions,
                     )
+                    if conv_layer is not None:
+                        hidden_states = conv_layer(hidden_states)
 
                 hidden_states = layer_outputs[0]
 
@@ -1580,21 +1453,22 @@ def custom_forward(*inputs):
         )
 
 
-class TimeSeriesTransformerDecoder(TimeSeriesTransformerPreTrainedModel):
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerDecoder with TimeSeriesTransformer->Informer
+class InformerDecoder(InformerPreTrainedModel):
     """
-    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a
-    [`TimeSeriesTransformerDecoderLayer`]
+    Informer decoder consisting of *config.decoder_layers* layers. Each layer is a [`InformerDecoderLayer`]
 
     Args:
-        config: TimeSeriesTransformerConfig
+        config: InformerConfig
     """
 
-    def __init__(self, config: TimeSeriesTransformerConfig):
+    def __init__(self, config: InformerConfig):
         super().__init__(config)
+
         self.dropout = config.dropout
         self.layerdrop = config.decoder_layerdrop
 
-        self.layers = nn.ModuleList([TimeSeriesTransformerDecoderLayer(config) for _ in range(config.decoder_layers)])
+        self.layers = nn.ModuleList([InformerDecoderLayer(config) for _ in range(config.decoder_layers)])
         self.layernorm_embedding = nn.LayerNorm(config.d_model)
 
         self.gradient_checkpointing = False
@@ -1817,11 +1691,12 @@ def custom_forward(*inputs):
 
 
 @add_start_docstrings(
-    "The bare Time Series Transformer Model outputting raw hidden-states without any specific head on top.",
-    TIME_SERIES_TRANSFORMER_START_DOCSTRING,
+    "The bare Informer Model outputting raw hidden-states without any specific head on top.",
+    INFORMER_START_DOCSTRING,
 )
-class TimeSeriesTransformerModel(TimeSeriesTransformerPreTrainedModel):
-    def __init__(self, config: TimeSeriesTransformerConfig):
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerModel with TimeSeriesTransformer->Informer,TIME_SERIES_TRANSFORMER->INFORMER
+class InformerModel(InformerPreTrainedModel):
+    def __init__(self, config: InformerConfig):
         super().__init__(config)
 
         if config.scaling:
@@ -1834,9 +1709,9 @@ def __init__(self, config: TimeSeriesTransformerConfig):
             embedding_dims=config.embedding_dimension,
         )
 
-        # transformer encoder-decoder and mask initializer
-        self.encoder = TimeSeriesTransformerEncoder(config)
-        self.decoder = TimeSeriesTransformerDecoder(config)
+        # Informer encoder-decoder and mask initializer
+        self.encoder = InformerEncoder(config)
+        self.decoder = InformerDecoder(config)
 
         # Initialize weights and apply final processing
         self.post_init()
@@ -1953,23 +1828,13 @@ def create_network_inputs(
 
         return transformer_inputs, scale, static_feat
 
-    def enc_dec_outputs(self, transformer_inputs):
-        enc_input = transformer_inputs[:, : self.config.context_length, ...]
-        dec_input = transformer_inputs[:, self.config.context_length :, ...]
-
-        encoder_outputs = self.encoder(inputs_embeds=enc_input)
-        decoder_outputs = self.decoder(
-            inputs_embeds=dec_input, encoder_hidden_states=encoder_outputs.last_hidden_state
-        )
-        return encoder_outputs, decoder_outputs
-
     def get_encoder(self):
         return self.encoder
 
     def get_decoder(self):
         return self.decoder
 
-    @add_start_docstrings_to_model_forward(TIME_SERIES_TRANSFORMER_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(INFORMER_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=Seq2SeqTimeSeriesModelOutput, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
@@ -1999,14 +1864,14 @@ def forward(
         ```python
         >>> from huggingface_hub import hf_hub_download
         >>> import torch
-        >>> from transformers import TimeSeriesTransformerModel
+        >>> from transformers import InformerModel
 
         >>> file = hf_hub_download(
         ...     repo_id="kashif/tourism-monthly-batch", filename="train-batch.pt", repo_type="dataset"
         ... )
         >>> batch = torch.load(file)
 
-        >>> model = TimeSeriesTransformerModel.from_pretrained("huggingface/time-series-transformer-tourism-monthly")
+        >>> model = InformerModel.from_pretrained("kashif/informer-tourism-monthly")
 
         >>> # during training, one provides both past and future values
         >>> # as well as possible additional features
@@ -2088,13 +1953,15 @@ def forward(
 
 
 @add_start_docstrings(
-    "The Time Series Transformer Model with a distribution head on top for time-series forecasting.",
-    TIME_SERIES_TRANSFORMER_START_DOCSTRING,
+    "The Informer Model with a distribution head on top for time-series forecasting.",
+    INFORMER_START_DOCSTRING,
 )
-class TimeSeriesTransformerForPrediction(TimeSeriesTransformerPreTrainedModel):
-    def __init__(self, config: TimeSeriesTransformerConfig):
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerForPrediction with TimeSeriesTransformer->Informer,TIME_SERIES_TRANSFORMER->INFORMER
+class InformerForPrediction(InformerPreTrainedModel):
+    def __init__(self, config: InformerConfig):
         super().__init__(config)
-        self.model = TimeSeriesTransformerModel(config)
+
+        self.model = InformerModel(config)
         if config.distribution_output == "student_t":
             self.distribution_output = StudentTOutput(dim=config.input_size)
         elif config.distribution_output == "normal":
@@ -2131,7 +1998,7 @@ def output_distribution(self, params, scale=None, trailing_n=None) -> torch.dist
             sliced_params = [p[:, -trailing_n:] for p in params]
         return self.distribution_output.distribution(sliced_params, scale=scale)
 
-    @add_start_docstrings_to_model_forward(TIME_SERIES_TRANSFORMER_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(INFORMER_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=Seq2SeqTimeSeriesModelOutput, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
@@ -2157,30 +2024,19 @@ def forward(
         r"""
         Returns:
 
-        future_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Boolean mask to indicate which `future_values` were observed and which were missing. Mask values selected
-            in `[0, 1]`:
-
-            - 1 for values that are **observed**,
-            - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
-
-            This mask is used to filter out missing values for the final loss calculation.
-
         Examples:
 
         ```python
         >>> from huggingface_hub import hf_hub_download
         >>> import torch
-        >>> from transformers import TimeSeriesTransformerForPrediction
+        >>> from transformers import InformerForPrediction
 
         >>> file = hf_hub_download(
         ...     repo_id="kashif/tourism-monthly-batch", filename="train-batch.pt", repo_type="dataset"
         ... )
         >>> batch = torch.load(file)
 
-        >>> model = TimeSeriesTransformerForPrediction.from_pretrained(
-        ...     "huggingface/time-series-transformer-tourism-monthly"
-        ... )
+        >>> model = InformerForPrediction.from_pretrained("kashif/informer-tourism-monthly")
 
         >>> # during training, one provides both past and future values
         >>> # as well as possible additional features

From 8379d0c03bb1509a90f1f7cfb83add0096bd6cde Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Thu, 16 Feb 2023 12:58:44 +0100
Subject: [PATCH 097/164]  make fix-copies

---
 .../models/informer/modeling_informer.py      | 40 ++++++++++++-------
 1 file changed, 26 insertions(+), 14 deletions(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index dc0b68a7ea4a..77cc03ffd54f 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -49,6 +49,7 @@
 ]
 
 
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.AffineTransformed
 class AffineTransformed(TransformedDistribution):
     def __init__(self, base_distribution: Distribution, loc=None, scale=None, event_dim=0):
         self.scale = 1.0 if scale is None else scale
@@ -78,6 +79,7 @@ def stddev(self):
         return self.variance.sqrt()
 
 
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.ParameterProjection
 class ParameterProjection(nn.Module):
     def __init__(
         self, in_features: int, args_dim: Dict[str, int], domain_map: Callable[..., Tuple[torch.Tensor]], **kwargs
@@ -93,6 +95,7 @@ def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor]:
         return self.domain_map(*params_unbounded)
 
 
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.LambdaLayer
 class LambdaLayer(nn.Module):
     def __init__(self, function):
         super().__init__()
@@ -102,6 +105,7 @@ def forward(self, x, *args):
         return self.function(x, *args)
 
 
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.DistributionOutput
 class DistributionOutput:
     distribution_class: type
     in_features: int
@@ -179,6 +183,7 @@ def squareplus(cls, x: torch.Tensor) -> torch.Tensor:
         return (x + torch.sqrt(torch.square(x) + 4.0)) / 2.0
 
 
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.StudentTOutput
 class StudentTOutput(DistributionOutput):
     args_dim: Dict[str, int] = {"df": 1, "loc": 1, "scale": 1}
     distribution_class: type = StudentT
@@ -190,6 +195,7 @@ def domain_map(cls, df: torch.Tensor, loc: torch.Tensor, scale: torch.Tensor):
         return df.squeeze(-1), loc.squeeze(-1), scale.squeeze(-1)
 
 
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.NormalOutput
 class NormalOutput(DistributionOutput):
     args_dim: Dict[str, int] = {"loc": 1, "scale": 1}
     distribution_class: type = Normal
@@ -200,6 +206,7 @@ def domain_map(cls, loc: torch.Tensor, scale: torch.Tensor):
         return loc.squeeze(-1), scale.squeeze(-1)
 
 
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.NegativeBinomialOutput
 class NegativeBinomialOutput(DistributionOutput):
     args_dim: Dict[str, int] = {"total_count": 1, "logits": 1}
     distribution_class: type = NegativeBinomial
@@ -231,8 +238,7 @@ def distribution(
         return self._base_distribution((total_count, logits))
 
 
-# Eli: FeatureEmbedder, MeanScaler and NOPScaler are from GlounTS (see the exact source below)
-# source: https://github.com/awslabs/gluonts/blob/dev/src/gluonts/torch/modules/feature.py
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.FeatureEmbedder
 class FeatureEmbedder(nn.Module):
     def __init__(self, cardinalities: List[int], embedding_dims: List[int]) -> None:
         super().__init__()
@@ -257,7 +263,7 @@ def forward(self, features: torch.Tensor) -> torch.Tensor:
         )
 
 
-# source: https://github.com/awslabs/gluonts/blob/dev/src/gluonts/torch/modules/scaler.py
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.MeanScaler
 class MeanScaler(nn.Module):
     """
     Computes a scaling factor as the weighted average absolute value along dimension `dim`, and scales the data
@@ -312,7 +318,7 @@ def forward(self, data: torch.Tensor, weights: torch.Tensor) -> Tuple[torch.Tens
         return data / scale, scale if self.keepdim else scale.squeeze(dim=self.dim)
 
 
-# source: https://github.com/awslabs/gluonts/blob/dev/src/gluonts/torch/modules/scaler.py
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.NOPScaler
 class NOPScaler(nn.Module):
     """
     Assigns a scaling factor equal to 1 along dimension `dim`, and therefore applies no scaling to the input data.
@@ -334,6 +340,7 @@ def forward(self, data: torch.Tensor, observed_indicator: torch.Tensor) -> Tuple
         return data, scale
 
 
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.weighted_average
 def weighted_average(input_tensor: torch.Tensor, weights: Optional[torch.Tensor] = None, dim=None) -> torch.Tensor:
     """
     Computes the weighted average of a given tensor across a given `dim`, masking values associated with weight zero,
@@ -358,6 +365,7 @@ def weighted_average(input_tensor: torch.Tensor, weights: Optional[torch.Tensor]
         return input_tensor.mean(dim=dim)
 
 
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.NegativeLogLikelihood
 class NegativeLogLikelihood:
     """
     Computes the negative log likelihood loss from input distribution with respect to target.
@@ -398,8 +406,8 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int]
     return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
 
 
-# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer
 @dataclass
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.Seq2SeqTimeSeriesModelOutput
 class Seq2SeqTimeSeriesModelOutput(ModelOutput):
     """
     Base class for model encoder's outputs that also contains pre-computed hidden states that can speed up sequential
@@ -467,8 +475,8 @@ class Seq2SeqTimeSeriesModelOutput(ModelOutput):
     static_features: Optional[torch.FloatTensor] = None
 
 
-# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer
 @dataclass
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.Seq2SeqTimeSeriesPredictionOutput
 class Seq2SeqTimeSeriesPredictionOutput(ModelOutput):
     """
     Base class for model's predictions outputs that also contain the loss as well parameters of the chosen
@@ -536,8 +544,8 @@ class Seq2SeqTimeSeriesPredictionOutput(ModelOutput):
     static_features: Optional[torch.FloatTensor] = None
 
 
-# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer
 @dataclass
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.SampleTimeSeriesPredictionOutput
 class SampleTimeSeriesPredictionOutput(ModelOutput):
     sequences: torch.FloatTensor = None
 
@@ -689,7 +697,7 @@ def forward(
         attn_output = attn_output.transpose(1, 2)
 
         # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
-        # partitioned across GPUs when using tensor-parallelism.
+        # partitioned across GPUs when using tensor-parallelism.
         attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
 
         attn_output = self.out_proj(attn_output)
@@ -1129,7 +1137,6 @@ def forward(
         return outputs
 
 
-# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerPreTrainedModel with TimeSeriesTransformer->Informer
 class InformerPreTrainedModel(PreTrainedModel):
     config_class = InformerConfig
     base_model_prefix = "model"
@@ -1311,7 +1318,6 @@ def _set_gradient_checkpointing(self, module, value=False):
 """
 
 
-# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerEncoder with TimeSeriesTransformer->Informer
 class InformerEncoder(InformerPreTrainedModel):
     """
     Informer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
@@ -1453,7 +1459,6 @@ def custom_forward(*inputs):
         )
 
 
-# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerDecoder with TimeSeriesTransformer->Informer
 class InformerDecoder(InformerPreTrainedModel):
     """
     Informer decoder consisting of *config.decoder_layers* layers. Each layer is a [`InformerDecoderLayer`]
@@ -1694,7 +1699,6 @@ def custom_forward(*inputs):
     "The bare Informer Model outputting raw hidden-states without any specific head on top.",
     INFORMER_START_DOCSTRING,
 )
-# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerModel with TimeSeriesTransformer->Informer,TIME_SERIES_TRANSFORMER->INFORMER
 class InformerModel(InformerPreTrainedModel):
     def __init__(self, config: InformerConfig):
         super().__init__(config)
@@ -1960,7 +1964,6 @@ def forward(
 class InformerForPrediction(InformerPreTrainedModel):
     def __init__(self, config: InformerConfig):
         super().__init__(config)
-
         self.model = InformerModel(config)
         if config.distribution_output == "student_t":
             self.distribution_output = StudentTOutput(dim=config.input_size)
@@ -2024,6 +2027,15 @@ def forward(
         r"""
         Returns:
 
+        future_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Boolean mask to indicate which `future_values` were observed and which were missing. Mask values selected
+            in `[0, 1]`:
+
+            - 1 for values that are **observed**,
+            - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
+
+            This mask is used to filter out missing values for the final loss calculation.
+
         Examples:
 
         ```python
@@ -2036,7 +2048,7 @@ def forward(
         ... )
         >>> batch = torch.load(file)
 
-        >>> model = InformerForPrediction.from_pretrained("kashif/informer-tourism-monthly")
+        >>> model = InformerForPrediction.from_pretrained("huggingface/time-series-transformer-tourism-monthly")
 
         >>> # during training, one provides both past and future values
         >>> # as well as possible additional features

From a8846f5da836df8da5d41b391b00e4095322e019 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Thu, 16 Feb 2023 13:02:21 +0100
Subject: [PATCH 098/164] typo

---
 src/transformers/models/informer/modeling_informer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 77cc03ffd54f..f695288db556 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -819,7 +819,7 @@ def forward(
         Q_K_sample = torch.bmm(query_states, K_sample.transpose(1, 2))
         # torch.Size([52, 14, 4]) x torch.Size([52, 4, 14])
 
-        # find the Top_k query with sparisty measurement
+        # find the Top_k query with sparsity measurement
         if u > 0:
             M = Q_K_sample.max(dim=-1)[0] - torch.div(Q_K_sample.sum(dim=-1), L_K)
             M_top = M.topk(u, sorted=False)[1]

From 217260d52741a2cd2f11662b48d0c63822ef1321 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Thu, 16 Feb 2023 13:08:19 +0100
Subject: [PATCH 099/164] fix copy

---
 src/transformers/models/informer/modeling_informer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index f695288db556..108d8d066806 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -1960,7 +1960,7 @@ def forward(
     "The Informer Model with a distribution head on top for time-series forecasting.",
     INFORMER_START_DOCSTRING,
 )
-# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerForPrediction with TimeSeriesTransformer->Informer,TIME_SERIES_TRANSFORMER->INFORMER
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerForPrediction with TimeSeriesTransformer->Informer,TIME_SERIES_TRANSFORMER->INFORMER,time-series-transformer->informer
 class InformerForPrediction(InformerPreTrainedModel):
     def __init__(self, config: InformerConfig):
         super().__init__(config)
@@ -2048,7 +2048,7 @@ def forward(
         ... )
         >>> batch = torch.load(file)
 
-        >>> model = InformerForPrediction.from_pretrained("huggingface/time-series-transformer-tourism-monthly")
+        >>> model = InformerForPrediction.from_pretrained("huggingface/informer-tourism-monthly")
 
         >>> # during training, one provides both past and future values
         >>> # as well as possible additional features

From 1421ccd0954646b4cd2d9589e1c0daaf9cb61d82 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Fri, 17 Feb 2023 09:21:26 +0100
Subject: [PATCH 100/164] added Informer to toctree

---
 docs/source/en/_toctree.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index a2afbfaf42eb..836b2fc48f97 100755
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -588,6 +588,8 @@
       sections:
       - local: model_doc/time_series_transformer
         title: Time Series Transformer
+      - local: model_doc/informer
+        title: Informer
       title: Time series models
     - isExpanded: false
       sections:

From 2b0a26d497d96b950f76c2152a336d940a8565cd Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Fri, 17 Feb 2023 09:24:16 +0100
Subject: [PATCH 101/164] in order

---
 docs/source/en/_toctree.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index 836b2fc48f97..9868027f2815 100755
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -586,10 +586,10 @@
       title: Reinforcement learning models
     - isExpanded: false
       sections:
-      - local: model_doc/time_series_transformer
-        title: Time Series Transformer
       - local: model_doc/informer
         title: Informer
+      - local: model_doc/time_series_transformer
+        title: Time Series Transformer
       title: Time series models
     - isExpanded: false
       sections:

From 10ed99538497e7571530cc99e2af14911b646f7d Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Fri, 17 Feb 2023 09:42:05 +0100
Subject: [PATCH 102/164] fixed comments

---
 .../models/informer/modeling_informer.py            | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 108d8d066806..3e5de3c46801 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -810,14 +810,13 @@ def forward(
         u = min(self.factor * np.ceil(np.log1p(L_Q)).astype("int").item(), L_Q)
 
         if L_K > 0:
-            index_sample = torch.randint(0, L_K, (U_part,))  # torch.Size([14])
+            index_sample = torch.randint(0, L_K, (U_part,))
 
             # real U = U_part(factor*ln(L_k))*L_q
-            K_sample = key_states[:, index_sample, :]  # torch.Size([52, 14, 4])
+            K_sample = key_states[:, index_sample, :]
         else:
             K_sample = key_states
         Q_K_sample = torch.bmm(query_states, K_sample.transpose(1, 2))
-        # torch.Size([52, 14, 4]) x torch.Size([52, 4, 14])
 
         # find the Top_k query with sparsity measurement
         if u > 0:
@@ -882,10 +881,14 @@ def forward(
             attn_weights_reshaped = None
 
         attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+        attn_output = torch.bmm(attn_probs, value_states)
 
         # get initial context
-        context = value_states.cumsum(dim=-2)
-        attn_output = torch.bmm(attn_probs, value_states)
+        if self.is_decoder:
+            context = value_states.cumsum(dim=-2)
+        else:
+            V_sum = value_states.mean(dim=-2)
+            context = V_sum.unsqueeze(dim=1).expand(bsz * self.num_heads, L_Q, V_sum.size(-1)).clone()
 
         if M_top is not None:
             # update context: copy the attention output to the context at M_top index

From ea07cd9e0d47e0944a5b01026aaa952e9e2b9f1b Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Fri, 17 Feb 2023 09:54:33 +0100
Subject: [PATCH 103/164] remove unneeded new lines in docs

---
 .../models/informer/modeling_informer.py      | 22 +++----------------
 1 file changed, 3 insertions(+), 19 deletions(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 3e5de3c46801..78b741db7c0c 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -1189,7 +1189,6 @@ def _set_gradient_checkpointing(self, module, value=False):
             The sequence length here is equal to `context_length` + `max(config.lags_sequence)`.
 
             Missing values need to be replaced with zeros.
-
         past_time_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_features)`, *optional*):
             Optional time features, which the model internally will add to `past_values`. These could be things like
             "month of year", "day of the month", etc. encoded as vectors (for instance as Fourier features). These
@@ -1202,7 +1201,6 @@ def _set_gradient_checkpointing(self, module, value=False):
             Transformer requires to provide additional time features.
 
             The Informer only learns additional embeddings for `static_categorical_features`.
-
         past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected in
             `[0, 1]`:
@@ -1217,14 +1215,12 @@ def _set_gradient_checkpointing(self, module, value=False):
             Static categorical features are features which have the same value for all time steps (static over time).
 
             A typical example of a static categorical feature is a time series ID.
-
         static_real_features (`torch.FloatTensor` of shape `(batch_size, number of static real features)`, *optional*):
             Optional static real features which the model will add to the values of the time series.
 
             Static real features are features which have the same value for all time steps (static over time).
 
             A typical example of a static real feature is promotion information.
-
         future_values (`torch.FloatTensor` of shape `(batch_size, prediction_length)`):
             Future values of the time series, that serve as labels for the model. The `future_values` is what the
             Transformer needs to learn to output, given the `past_values`.
@@ -1232,7 +1228,6 @@ def _set_gradient_checkpointing(self, module, value=False):
             See the demo notebook and code snippets for details.
 
             Missing values need to be replaced with zeros.
-
         future_time_features (`torch.FloatTensor` of shape `(batch_size, prediction_length, num_features)`, *optional*):
             Optional time features, which the model internally will add to `future_values`. These could be things like
             "month of year", "day of the month", etc. encoded as vectors (for instance as Fourier features). These
@@ -1244,8 +1239,7 @@ def _set_gradient_checkpointing(self, module, value=False):
             the position encodings are learned from scratch internally as parameters of the model, the Time Series
             Transformer requires to provide additional features.
 
-            The Informer only learns additional embeddings for `static_categorical_features`.
-        
+            The Informer only learns additional embeddings for `static_categorical_features`.        
         future_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Boolean mask to indicate which `future_values` were observed and which were missing. Mask values selected
             in `[0, 1]`:
@@ -1254,7 +1248,6 @@ def _set_gradient_checkpointing(self, module, value=False):
             - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
 
             This mask is used to filter out missing values for the final loss calculation.
-
         attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
             Mask to avoid performing attention on certain token indices. Mask values selected in `[0, 1]`:
 
@@ -1262,11 +1255,9 @@ def _set_gradient_checkpointing(self, module, value=False):
             - 0 for tokens that are **masked**.
 
             [What are attention masks?](../glossary#attention-mask)
-
         decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
             Mask to avoid performing attention on certain token indices. By default, a causal mask will be used, to
             make sure the model can only look at previous inputs in order to predict the future.
-
         head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
             Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
 
@@ -1288,8 +1279,7 @@ def _set_gradient_checkpointing(self, module, value=False):
         encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
             Tuple consists of `last_hidden_state`, `hidden_states` (*optional*) and `attentions` (*optional*)
             `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` (*optional*) is a sequence of
-            hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
-        
+            hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.        
         past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
             Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
             `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
@@ -1300,13 +1290,11 @@ def _set_gradient_checkpointing(self, module, value=False):
 
             If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
             don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
-            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
-        
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`.        
         inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
             Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
             is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
             model's internal embedding lookup matrix.
-
         use_cache (`bool`, *optional*):
             If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
             `past_key_values`).
@@ -1561,20 +1549,16 @@ def forward(
                 If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
                 that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
                 all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
-
             inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                 Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                 This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                 than the model's internal embedding lookup matrix.
-
             output_attentions (`bool`, *optional*):
                 Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                 returned tensors for more detail.
-
             output_hidden_states (`bool`, *optional*):
                 Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                 for more detail.
-
             return_dict (`bool`, *optional*):
                 Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
         """

From 8299745d8f9adc209d3026b18faa72b3abca8caa Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Fri, 17 Feb 2023 14:53:48 +0100
Subject: [PATCH 104/164] make static real and cat optional

---
 .../models/informer/configuration_informer.py |  2 +-
 .../models/informer/modeling_informer.py      | 38 ++++++++++---------
 2 files changed, 22 insertions(+), 18 deletions(-)

diff --git a/src/transformers/models/informer/configuration_informer.py b/src/transformers/models/informer/configuration_informer.py
index 0735f3bf4f8c..01a71ac3fc08 100644
--- a/src/transformers/models/informer/configuration_informer.py
+++ b/src/transformers/models/informer/configuration_informer.py
@@ -192,7 +192,7 @@ def __init__(
                 )
             self.cardinality = cardinality
         else:
-            self.cardinality = [1]
+            self.cardinality = [0]
 
         # set embedding_dimension
         if embedding_dimension and num_static_categorical_features > 0:
diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 78b741db7c0c..32ebe93a8391 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -1695,10 +1695,11 @@ def __init__(self, config: InformerConfig):
         else:
             self.scaler = NOPScaler(dim=1, keepdim=True)
 
-        self.embedder = FeatureEmbedder(
-            cardinalities=config.cardinality,
-            embedding_dims=config.embedding_dimension,
-        )
+        if config.num_static_categorical_features > 0:
+            self.embedder = FeatureEmbedder(
+                cardinalities=config.cardinality,
+                embedding_dims=config.embedding_dimension,
+            )
 
         # Informer encoder-decoder and mask initializer
         self.encoder = InformerEncoder(config)
@@ -1750,8 +1751,8 @@ def create_network_inputs(
         self,
         past_values: torch.Tensor,
         past_time_features: torch.Tensor,
-        static_categorical_features: torch.Tensor,
-        static_real_features: torch.Tensor,
+        static_categorical_features: Optional[torch.Tensor] = None,
+        static_real_features: Optional[torch.Tensor] = None,
         past_observed_mask: Optional[torch.Tensor] = None,
         future_values: Optional[torch.Tensor] = None,
         future_time_features: Optional[torch.Tensor] = None,
@@ -1800,11 +1801,14 @@ def create_network_inputs(
             else self.config.context_length
         )
 
-        # embeddings
-        embedded_cat = self.embedder(static_categorical_features)
         # static features
-        log_scale = scale.log() if self.config.input_size == 1 else scale.squeeze(1).log()
-        static_feat = torch.cat((embedded_cat, static_real_features, log_scale), dim=1)
+        static_feat = scale.log() if self.config.input_size == 1 else scale.squeeze(1).log()
+        if static_real_features is not None:
+            static_feat = torch.cat((static_real_features, static_feat), dim=1)
+        if static_categorical_features is not None:
+            # embeddings
+            embedded_cat = self.embedder(static_categorical_features)
+            static_feat = torch.cat((embedded_cat, static_feat), dim=1)
         expanded_static_feat = static_feat.unsqueeze(1).expand(-1, time_feat.shape[1], -1)
 
         # all features
@@ -1832,8 +1836,8 @@ def forward(
         past_values: torch.Tensor,
         past_time_features: torch.Tensor,
         past_observed_mask: torch.Tensor,
-        static_categorical_features: torch.Tensor,
-        static_real_features: torch.Tensor,
+        static_categorical_features: Optional[torch.Tensor] = None,
+        static_real_features: Optional[torch.Tensor] = None,
         future_values: Optional[torch.Tensor] = None,
         future_time_features: Optional[torch.Tensor] = None,
         decoder_attention_mask: Optional[torch.LongTensor] = None,
@@ -1995,8 +1999,8 @@ def forward(
         past_values: torch.Tensor,
         past_time_features: torch.Tensor,
         past_observed_mask: torch.Tensor,
-        static_categorical_features: torch.Tensor,
-        static_real_features: torch.Tensor,
+        static_categorical_features: Optional[torch.Tensor] = None,
+        static_real_features: Optional[torch.Tensor] = None,
         future_values: Optional[torch.Tensor] = None,
         future_time_features: Optional[torch.Tensor] = None,
         future_observed_mask: Optional[torch.Tensor] = None,
@@ -2130,12 +2134,12 @@ def forward(
     @torch.no_grad()
     def generate(
         self,
-        static_categorical_features: torch.Tensor,
-        static_real_features: torch.Tensor,
         past_time_features: torch.Tensor,
         past_values: torch.Tensor,
         past_observed_mask: torch.Tensor,
-        future_time_features: Optional[torch.Tensor],
+        static_categorical_features: Optional[torch.Tensor] = None,
+        static_real_features: Optional[torch.Tensor] = None,
+        future_time_features: Optional[torch.Tensor] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
     ) -> torch.Tensor:

From 48663628c66e0943d7097d2212350240ed334b8b Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Fri, 17 Feb 2023 16:00:17 +0100
Subject: [PATCH 105/164] fix use of distil conv layers

---
 src/transformers/models/informer/modeling_informer.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 32ebe93a8391..97508f562ee3 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -1433,7 +1433,8 @@ def custom_forward(*inputs):
                         output_attentions=output_attentions,
                     )
                     if conv_layer is not None:
-                        hidden_states = conv_layer(hidden_states)
+                        output = conv_layer(layer_outputs[0])
+                        layer_outputs = (output,) + layer_outputs[1:]
 
                 hidden_states = layer_outputs[0]
 

From 8218d555b94161bc4a9c6f5fb8a07b0fb3c7081e Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Fri, 17 Feb 2023 18:39:29 +0100
Subject: [PATCH 106/164] fixed integration test

---
 .../models/informer/test_modeling_informer.py | 24 +++++++++++--------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/tests/models/informer/test_modeling_informer.py b/tests/models/informer/test_modeling_informer.py
index 699eedf3a11f..0ee8bd671f9c 100644
--- a/tests/models/informer/test_modeling_informer.py
+++ b/tests/models/informer/test_modeling_informer.py
@@ -58,6 +58,7 @@ def __init__(
         attention_probs_dropout_prob=0.1,
         lags_sequence=[1, 2, 3, 4, 5],
         attention_factor=10,
+        distil=False,
     ):
         self.parent = parent
         self.batch_size = batch_size
@@ -83,6 +84,7 @@ def __init__(
             attention_factor * np.ceil(np.log1p(prediction_length)).astype("int").item(), prediction_length
         )
         self.attention_factor = attention_factor
+        self.distil = distil
 
     def get_config(self):
         return InformerConfig(
@@ -103,6 +105,7 @@ def get_config(self):
             cardinality=[self.cardinality],
             embedding_dimension=[self.embedding_dimension],
             attention_factor=self.attention_factor,
+            distil=self.distil,
         )
 
     def prepare_informer_inputs_dict(self, config):
@@ -402,8 +405,8 @@ def test_attention_outputs(self):
                 list(cross_attentions[0].shape[-3:]),
                 [
                     self.model_tester.num_attention_heads,
-                    prediction_length,
-                    context_length,
+                    decoder_seq_length,
+                    decoder_seq_length,
                 ],
             )
 
@@ -454,13 +457,12 @@ def test_inference_no_head(self):
                 static_real_features=batch["static_real_features"],
                 future_values=batch["future_values"],
                 future_time_features=batch["future_time_features"],
-            )[0]
-
-        expected_shape = torch.Size((64, model.config.prediction_length, model.config.d_model))
+            ).last_hidden_state
+        expected_shape = torch.Size((64, model.config.context_length, model.config.d_model))
         self.assertEqual(output.shape, expected_shape)
 
         expected_slice = torch.tensor(
-            [[-1.4829, 0.7390, -1.3606], [-1.9992, 0.3949, -1.3191], [-1.1011, 0.2860, -1.5074]], device=torch_device
+            [[-0.6678, 0.4203, 0.0956], [-0.8622, 0.2728, 0.0858], [-0.5118, 0.2205, -0.0191]], device=torch_device
         )
         self.assertTrue(torch.allclose(output[0, :3, :3], expected_slice, atol=TOLERANCE))
 
@@ -477,12 +479,14 @@ def test_inference_head(self):
                 static_categorical_features=batch["static_categorical_features"],
                 static_real_features=batch["static_real_features"],
                 future_time_features=batch["future_time_features"],
-            )[1]
-        expected_shape = torch.Size((64, model.config.prediction_length, model.config.d_model))
+            ).encoder_last_hidden_state
+
+        # encoder distils the context length to 1/8th of the original length
+        expected_shape = torch.Size((64, model.config.context_length // 8, model.config.d_model))
         self.assertEqual(output.shape, expected_shape)
 
         expected_slice = torch.tensor(
-            [[0.4427, 0.6329, 0.1136], [0.5492, 2.3569, 0.6203], [0.0812, 2.6220, 1.5276]], device=torch_device
+            [[-0.2993, 1.8141, -0.4122], [-0.3320, 2.0362, -0.7312], [-0.3640, 2.4771, -0.7129]], device=torch_device
         )
         self.assertTrue(torch.allclose(output[0, :3, :3], expected_slice, atol=TOLERANCE))
 
@@ -503,6 +507,6 @@ def test_seq_to_seq_generation(self):
         expected_shape = torch.Size((64, model.config.num_parallel_samples, model.config.prediction_length))
         self.assertEqual(outputs.sequences.shape, expected_shape)
 
-        expected_slice = torch.tensor([3877.3796, 4988.0166, 7795.9473], device=torch_device)
+        expected_slice = torch.tensor([2726.9468, 3130.4065, 4020.5728], device=torch_device)
         mean_prediction = outputs.sequences.mean(dim=1)
         self.assertTrue(torch.allclose(mean_prediction[0, -3:], expected_slice, rtol=1e-1))

From 21e3d43aca2d24999370639f051788975243c016 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Tue, 21 Feb 2023 11:43:52 +0100
Subject: [PATCH 107/164] added checkpoint for convlayer

---
 src/transformers/models/informer/modeling_informer.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 97508f562ee3..1f3125e2d366 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -1424,7 +1424,9 @@ def custom_forward(*inputs):
                         attention_mask,
                         (head_mask[idx] if head_mask is not None else None),
                     )
-                    # TODO support for checkpointing conv_layers
+                    if conv_layer is not None:
+                        output = torch.utils.checkpoint.checkpoint(conv_layer, layer_outputs[0])
+                        layer_outputs = (output,) + layer_outputs[1:]
                 else:
                     layer_outputs = encoder_layer(
                         hidden_states,

From c1a58ebb3c024c664cba6b473dc4c7293676596c Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Wed, 22 Feb 2023 09:06:24 +0100
Subject: [PATCH 108/164] make fix-copies

---
 .../models/informer/modeling_informer.py      | 220 +++++++++++++-----
 1 file changed, 159 insertions(+), 61 deletions(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 1f3125e2d366..4e5a5617afac 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -274,48 +274,49 @@ class MeanScaler(nn.Module):
             Dimension along which to compute the scale.
         keepdim (`bool`, *optional*, defaults to `False`):
             Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it.
+        default_scale (`float`, *optional*, defaults to `None`):
+            Default scale that is used for elements that are constantly zero. If `None`, we use the scale of the batch.
         minimum_scale (`float`, *optional*, defaults to 1e-10):
-            Default scale that is used for elements that are constantly zero along dimension `dim`.
+            Default minimum possible scale that is used for any item.
     """
 
-    def __init__(self, dim: int, keepdim: bool = False, minimum_scale: float = 1e-10):
+    def __init__(
+        self, dim: int = -1, keepdim: bool = True, default_scale: Optional[float] = None, minimum_scale: float = 1e-10
+    ):
         super().__init__()
-        if not dim > 0:
-            raise ValueError("Cannot compute scale along dim = 0 (batch dimension), please provide dim > 0")
         self.dim = dim
         self.keepdim = keepdim
-        self.register_buffer("minimum_scale", torch.tensor(minimum_scale))
-
-    def forward(self, data: torch.Tensor, weights: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
-        # these will have shape (N, C)
-        total_weight = weights.sum(dim=self.dim)
-        weighted_sum = (data.abs() * weights).sum(dim=self.dim)
-
-        # first compute a global scale per-dimension
-        total_observed = total_weight.sum(dim=0)
-        denominator = torch.max(total_observed, torch.ones_like(total_observed))
-        default_scale = weighted_sum.sum(dim=0) / denominator
-
-        # then compute a per-item, per-dimension scale
-        denominator = torch.max(total_weight, torch.ones_like(total_weight))
-        scale = weighted_sum / denominator
-
-        # use per-batch scale when no element is observed
-        # or when the sequence contains only zeros
-        scale = (
-            torch.max(
-                self.minimum_scale,
-                torch.where(
-                    weighted_sum > torch.zeros_like(weighted_sum),
-                    scale,
-                    default_scale * torch.ones_like(total_weight),
-                ),
-            )
-            .detach()
-            .unsqueeze(dim=self.dim)
-        )
+        self.minimum_scale = minimum_scale
+        self.default_scale = default_scale
+
+    @torch.no_grad()
+    def forward(self, data: torch.Tensor, observed_indicator: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        # shape: (N, [C], T=1)
+        ts_sum = (data * observed_indicator).abs().sum(self.dim, keepdim=True)
+        num_observed = observed_indicator.sum(self.dim, keepdim=True)
+
+        scale = ts_sum / torch.clamp(num_observed, min=1)
+
+        # If `default_scale` is provided, we use it, otherwise we use the scale
+        # of the batch.
+        if self.default_scale is None:
+            batch_sum = ts_sum.sum(dim=0)
+            batch_observations = torch.clamp(num_observed.sum(0), min=1)
+            default_scale = torch.squeeze(batch_sum / batch_observations)
+        else:
+            default_scale = self.default_scale * torch.ones_like(scale)
 
-        return data / scale, scale if self.keepdim else scale.squeeze(dim=self.dim)
+        # apply default scale where there are no observations
+        scale = torch.where(num_observed > 0, scale, default_scale)
+
+        # ensure the scale is at least `self.minimum_scale`
+        scale = torch.clamp(scale, min=self.minimum_scale)
+        scaled_data = data / scale
+
+        if not self.keepdim:
+            scale = scale.squeeze(dim=self.dim)
+
+        return scaled_data, torch.zeros_like(scale), scale
 
 
 # Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.NOPScaler
@@ -335,9 +336,12 @@ def __init__(self, dim: int, keepdim: bool = False):
         self.dim = dim
         self.keepdim = keepdim
 
-    def forward(self, data: torch.Tensor, observed_indicator: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
-        scale = torch.ones_like(data).mean(dim=self.dim, keepdim=self.keepdim)
-        return data, scale
+    def forward(
+        self, data: torch.Tensor, observed_indicator: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        scale = torch.ones_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim)
+        loc = torch.zeros_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim)
+        return data, loc, scale
 
 
 # Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.weighted_average
@@ -456,9 +460,12 @@ class Seq2SeqTimeSeriesModelOutput(ModelOutput):
 
             Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
             self-attention heads.
-        scale: (`torch.FloatTensor` of shape `(batch_size,)`, *optional*):
+        loc (`torch.FloatTensor` of shape `(batch_size,)`, *optional*):
+            Shift values of each time series' context window which is used to give the model inputs of the same
+            magnitude and then used to shift back to the original magnitude.
+        scale (`torch.FloatTensor` of shape `(batch_size,)`, *optional*):
             Scaling values of each time series' context window which is used to give the model inputs of the same
-            magnitude and then used to rescale to the original scale.
+            magnitude and then used to rescale back to the original magnitude.
         static_features: (`torch.FloatTensor` of shape `(batch_size, feature size)`, *optional*):
             Static features of each time series' in a batch which are copied to the covariates at inference time.
     """
@@ -471,6 +478,7 @@ class Seq2SeqTimeSeriesModelOutput(ModelOutput):
     encoder_last_hidden_state: Optional[torch.FloatTensor] = None
     encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
     encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    loc: Optional[torch.FloatTensor] = None
     scale: Optional[torch.FloatTensor] = None
     static_features: Optional[torch.FloatTensor] = None
 
@@ -524,9 +532,12 @@ class Seq2SeqTimeSeriesPredictionOutput(ModelOutput):
 
             Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
             self-attention heads.
-        scale: (`torch.FloatTensor` of shape `(batch_size,)`, *optional*):
+        loc (`torch.FloatTensor` of shape `(batch_size,)`, *optional*):
+            Shift values of each time series' context window which is used to give the model inputs of the same
+            magnitude and then used to shift back to the original magnitude.
+        scale (`torch.FloatTensor` of shape `(batch_size,)`, *optional*):
             Scaling values of each time series' context window which is used to give the model inputs of the same
-            magnitude and then used to rescale to the original scale.
+            magnitude and then used to rescale back to the original magnitude.
         static_features: (`torch.FloatTensor` of shape `(batch_size, feature size)`, *optional*):
             Static features of each time series' in a batch which are copied to the covariates at inference time.
     """
@@ -540,6 +551,7 @@ class Seq2SeqTimeSeriesPredictionOutput(ModelOutput):
     encoder_last_hidden_state: Optional[torch.FloatTensor] = None
     encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
     encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    loc: Optional[torch.FloatTensor] = None
     scale: Optional[torch.FloatTensor] = None
     static_features: Optional[torch.FloatTensor] = None
 
@@ -1989,11 +2001,11 @@ def get_decoder(self):
         return self.model.get_decoder()
 
     @torch.jit.ignore
-    def output_distribution(self, params, scale=None, trailing_n=None) -> torch.distributions.Distribution:
+    def output_distribution(self, params, loc=None, scale=None, trailing_n=None) -> torch.distributions.Distribution:
         sliced_params = params
         if trailing_n is not None:
             sliced_params = [p[:, -trailing_n:] for p in params]
-        return self.distribution_output.distribution(sliced_params, scale=scale)
+        return self.distribution_output.distribution(sliced_params, loc=loc, scale=scale)
 
     @add_start_docstrings_to_model_forward(INFORMER_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=Seq2SeqTimeSeriesModelOutput, config_class=_CONFIG_FOR_DOC)
@@ -2021,15 +2033,6 @@ def forward(
         r"""
         Returns:
 
-        future_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Boolean mask to indicate which `future_values` were observed and which were missing. Mask values selected
-            in `[0, 1]`:
-
-            - 1 for values that are **observed**,
-            - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
-
-            This mask is used to filter out missing values for the final loss calculation.
-
         Examples:
 
         ```python
@@ -2102,7 +2105,8 @@ def forward(
         params = None
         if future_values is not None:
             params = self.output_params(outputs[0])  # outputs.last_hidden_state
-            distribution = self.output_distribution(params, outputs[-2])  # outputs.scale
+            # loc is 3rd last and scale is 2nd last output
+            distribution = self.output_distribution(params, loc=outputs[-3], scale=outputs[-2])
 
             loss = self.loss(distribution, future_values)
 
@@ -2130,6 +2134,7 @@ def forward(
             encoder_last_hidden_state=outputs.encoder_last_hidden_state,
             encoder_hidden_states=outputs.encoder_hidden_states,
             encoder_attentions=outputs.encoder_attentions,
+            loc=outputs.loc,
             scale=outputs.scale,
             static_features=outputs.static_features,
         )
@@ -2137,15 +2142,102 @@ def forward(
     @torch.no_grad()
     def generate(
         self,
-        past_time_features: torch.Tensor,
         past_values: torch.Tensor,
-        past_observed_mask: torch.Tensor,
+        past_time_features: torch.Tensor,
+        future_time_features: torch.Tensor,
+        past_observed_mask: Optional[torch.Tensor] = None,
         static_categorical_features: Optional[torch.Tensor] = None,
         static_real_features: Optional[torch.Tensor] = None,
-        future_time_features: Optional[torch.Tensor] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
-    ) -> torch.Tensor:
+    ) -> SampleTimeSeriesPredictionOutput:
+        r"""
+        Greedily generate sequences of sample predictions from a model with a probability distribution head.
+
+        Parameters:
+            past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, input_size)`):
+                Past values of the time series, that serve as context in order to predict the future. The sequence size
+                of this tensor must be larger than the `context_length` of the model, since the model will use the
+                larger size to construct lag features, i.e. additional values from the past which are added in order to
+                serve as "extra context".
+
+                The `sequence_length` here is equal to `config.context_length` + `max(config.lags_sequence)`, which if
+                no `lags_sequence` is configured, is equal to `config.context_length` + 7 (as by default, the largest
+                look-back index in `config.lags_sequence` is 7). The property `_past_length` returns the actual length
+                of the past.
+
+                The `past_values` is what the Transformer encoder gets as input (with optional additional features,
+                such as `static_categorical_features`, `static_real_features`, `past_time_features` and lags).
+
+                Optionally, missing values need to be replaced with zeros and indicated via the `past_observed_mask`.
+
+                For multivariate time series, the `input_size` > 1 dimension is required and corresponds to the number
+                of variates in the time series per time step.
+            past_time_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_features)`):
+                Required time features, which the model internally will add to `past_values`. These could be things
+                like "month of year", "day of the month", etc. encoded as vectors (for instance as Fourier features).
+                These could also be so-called "age" features, which basically help the model know "at which point in
+                life" a time-series is. Age features have small values for distant past time steps and increase
+                monotonically the more we approach the current time step. Holiday features are also a good example of
+                time features.
+
+                These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT,
+                where the position encodings are learned from scratch internally as parameters of the model, the Time
+                Series Transformer requires to provide additional time features. The Time Series Transformer only
+                learns additional embeddings for `static_categorical_features`.
+
+                Additional dynamic real covariates can be concatenated to this tensor, with the caveat that these
+                features must be known at prediction time.
+
+                The `num_features` here is equal to `config.num_time_features` + `config.num_dynamic_real_features`.
+            future_time_features (`torch.FloatTensor` of shape `(batch_size, prediction_length, num_features)`):
+                Required time features for the prediction window, which the model internally will add to sampled
+                predictions. These could be things like "month of year", "day of the month", etc. encoded as vectors
+                (for instance as Fourier features). These could also be so-called "age" features, which basically help
+                the model know "at which point in life" a time-series is. Age features have small values for distant
+                past time steps and increase monotonically the more we approach the current time step. Holiday features
+                are also a good example of time features.
+
+                These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT,
+                where the position encodings are learned from scratch internally as parameters of the model, the Time
+                Series Transformer requires to provide additional time features. The Time Series Transformer only
+                learns additional embeddings for `static_categorical_features`.
+
+                Additional dynamic real covariates can be concatenated to this tensor, with the caveat that these
+                features must be known at prediction time.
+
+                The `num_features` here is equal to `config.num_time_features` + `config.num_dynamic_real_features`.
+            past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, input_size)`, *optional*):
+                Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
+                in `[0, 1]`:
+
+                - 1 for values that are **observed**,
+                - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
+
+            static_categorical_features (`torch.LongTensor` of shape `(batch_size, number of static categorical features)`, *optional*):
+                Optional static categorical features for which the model will learn an embedding, which it will add to
+                the values of the time series.
+
+                Static categorical features are features which have the same value for all time steps (static over
+                time).
+
+                A typical example of a static categorical feature is a time series ID.
+            static_real_features (`torch.FloatTensor` of shape `(batch_size, number of static real features)`, *optional*):
+                Optional static real features which the model will add to the values of the time series.
+
+                Static real features are features which have the same value for all time steps (static over time).
+
+                A typical example of a static real feature is promotion information.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers.
+
+        Return:
+            [`SampleTimeSeriesPredictionOutput`] where the outputs `sequences` tensor will have shape `(batch_size,
+            number of samples, prediction_length)` or `(batch_size, number of samples, prediction_length, input_size)`
+            for multivariate predictions.
+        """
         outputs = self(
             static_categorical_features=static_categorical_features,
             static_real_features=static_real_features,
@@ -2162,13 +2254,17 @@ def generate(
 
         decoder = self.model.get_decoder()
         enc_last_hidden = outputs.encoder_last_hidden_state
+        loc = outputs.loc
         scale = outputs.scale
         static_feat = outputs.static_features
 
         num_parallel_samples = self.config.num_parallel_samples
+        repeated_loc = loc.repeat_interleave(repeats=num_parallel_samples, dim=0)
         repeated_scale = scale.repeat_interleave(repeats=num_parallel_samples, dim=0)
 
-        repeated_past_values = past_values.repeat_interleave(repeats=num_parallel_samples, dim=0) / repeated_scale
+        repeated_past_values = (
+            past_values.repeat_interleave(repeats=num_parallel_samples, dim=0) - repeated_loc
+        ) / repeated_scale
 
         expanded_static_feat = static_feat.unsqueeze(1).expand(-1, future_time_features.shape[1], -1)
         features = torch.cat((expanded_static_feat, future_time_features), dim=-1)
@@ -2195,10 +2291,12 @@ def generate(
             dec_last_hidden = dec_output.last_hidden_state
 
             params = self.parameter_projection(dec_last_hidden[:, -1:])
-            distr = self.output_distribution(params, scale=repeated_scale)
+            distr = self.output_distribution(params, loc=repeated_loc, scale=repeated_scale)
             next_sample = distr.sample()
 
-            repeated_past_values = torch.cat((repeated_past_values, next_sample / repeated_scale), dim=1)
+            repeated_past_values = torch.cat(
+                (repeated_past_values, (next_sample - repeated_loc) / repeated_scale), dim=1
+            )
             future_samples.append(next_sample)
 
         concat_future_samples = torch.cat(future_samples, dim=1)

From 56578bfef58b703c951fc08345f859b299aa2985 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Wed, 22 Feb 2023 10:08:54 +0100
Subject: [PATCH 109/164] updated from time series model

---
 .../models/informer/configuration_informer.py |  15 +-
 .../models/informer/modeling_informer.py      | 200 ++++++++++++++----
 2 files changed, 167 insertions(+), 48 deletions(-)

diff --git a/src/transformers/models/informer/configuration_informer.py b/src/transformers/models/informer/configuration_informer.py
index 01a71ac3fc08..b25f7a1ec105 100644
--- a/src/transformers/models/informer/configuration_informer.py
+++ b/src/transformers/models/informer/configuration_informer.py
@@ -54,8 +54,9 @@ class InformerConfig(PretrainedConfig):
         input_size (`int`, *optional*, defaults to 1):
             The size of the target variable which by default is 1 for univariate targets. Would be > 1 in case of
             multivariate targets.
-        scaling (`bool`, *optional* defaults to `True`):
-            Whether to scale the input targets.
+        scaling (`string` or `bool`, *optional*, defaults to `"mean"`):
+            Whether to scale the input targets via "mean" scaler, "std" scaler or no scaler if `None`. If `True`, the
+            scaler is set to "mean".
         lags_sequence (`list[int]`, *optional*, defaults to `[1, 2, 3, 4, 5, 6, 7]`):
             The lags of the input time series as covariates often dictated by the frequency. Default is `[1, 2, 3, 4,
             5, 6, 7]`.
@@ -75,6 +76,8 @@ class InformerConfig(PretrainedConfig):
             The dimension of the embedding for each of the static categorical features. Should be a list of integers,
             having the same length as `num_static_categorical_features`. Cannot be `None` if
             `num_static_categorical_features` is > 0.
+        d_model (`int`, *optional*, defaults to 64):
+            Dimensionality of the transformer layers.
         encoder_layers (`int`, *optional*, defaults to 2):
             Number of encoder layers.
         decoder_layers (`int`, *optional*, defaults to 2):
@@ -119,7 +122,7 @@ class InformerConfig(PretrainedConfig):
     >>> from transformers import InformerConfig, InformerModel
 
     >>> # Initializing a default Informer configuration
-    >>> configuration = InformerConfig()
+    >>> configuration = InformerConfig(prediction_length=7)
 
     >>> # Randomly initializing a model (with random weights) from the configuration
     >>> model = InformerModel(configuration)
@@ -149,6 +152,7 @@ def __init__(
         num_time_features: int = 0,
         cardinality: Optional[List[int]] = None,
         embedding_dimension: Optional[List[int]] = None,
+        d_model: int = 64,
         encoder_ffn_dim: int = 32,
         decoder_ffn_dim: int = 32,
         encoder_attention_heads: int = 2,
@@ -207,7 +211,8 @@ def __init__(
         self.num_parallel_samples = num_parallel_samples
 
         # Transformer architecture configuration
-        self.d_model = input_size * len(self.lags_sequence) + self._number_of_features
+        self.feature_size = input_size * len(self.lags_sequence) + self._number_of_features
+        self.d_model = d_model
         self.encoder_attention_heads = encoder_attention_heads
         self.decoder_attention_heads = decoder_attention_heads
         self.encoder_ffn_dim = encoder_ffn_dim
@@ -243,5 +248,5 @@ def _number_of_features(self) -> int:
             + self.num_dynamic_real_features
             + self.num_time_features
             + self.num_static_real_features
-            + self.input_size  # the log(scale)
+            + self.input_size * 2  # the log1p(abs(loc)) and log(scale) features
         )
diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 4e5a5617afac..1d350754f884 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -263,6 +263,40 @@ def forward(self, features: torch.Tensor) -> torch.Tensor:
         )
 
 
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.StdScaler
+class StdScaler(nn.Module):
+    """
+    Standardize features by calculating the mean and scaling along some given dimension `dim`, and then normalizes it
+    by subtracting from the mean and dividing by the standard deviation.
+
+    Args:
+        dim (`int`):
+            Dimension along which to calculate the mean and standard deviation.
+        keepdim (`bool`, *optional*, defaults to `False`):
+            Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it.
+        minimum_scale (`float`, *optional*, defaults to 1e-5):
+            Default scale that is used for elements that are constantly zero along dimension `dim`.
+    """
+
+    def __init__(self, dim: int, keepdim: bool = False, minimum_scale: float = 1e-5):
+        super().__init__()
+        if not dim > 0:
+            raise ValueError("Cannot compute scale along dim = 0 (batch dimension), please provide dim > 0")
+        self.dim = dim
+        self.keepdim = keepdim
+        self.minimum_scale = minimum_scale
+
+    @torch.no_grad()
+    def forward(self, data: torch.Tensor, weights: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        denominator = weights.sum(self.dim, keepdim=self.keepdim)
+        denominator = denominator.clamp_min(1.0)
+        loc = (data * weights).sum(self.dim, keepdim=self.keepdim) / denominator
+
+        variance = (((data - loc) * weights) ** 2).sum(self.dim, keepdim=self.keepdim) / denominator
+        scale = torch.sqrt(variance + self.minimum_scale)
+        return (data - loc) / scale, loc, scale
+
+
 # Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.MeanScaler
 class MeanScaler(nn.Module):
     """
@@ -410,6 +444,51 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int]
     return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
 
 
+# Copied from transformers.models.marian.modeling_marian.MarianSinusoidalPositionalEmbedding with Marian->TimeSeries
+class TimeSeriesSinusoidalPositionalEmbedding(nn.Embedding):
+    """This module produces sinusoidal positional embeddings of any length."""
+
+    def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None) -> None:
+        super().__init__(num_positions, embedding_dim)
+        self.weight = self._init_weight(self.weight)
+
+    @staticmethod
+    def _init_weight(out: nn.Parameter) -> nn.Parameter:
+        """
+        Identical to the XLM create_sinusoidal_embeddings except features are not interleaved. The cos features are in
+        the 2nd half of the vector. [dim // 2:]
+        """
+        n_pos, dim = out.shape
+        position_enc = np.array(
+            [[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)]
+        )
+        out.requires_grad = False  # set early to avoid an error in pytorch-1.8+
+        sentinel = dim // 2 if dim % 2 == 0 else (dim // 2) + 1
+        out[:, 0:sentinel] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))
+        out[:, sentinel:] = torch.FloatTensor(np.cos(position_enc[:, 1::2]))
+        out.detach_()
+        return out
+
+    @torch.no_grad()
+    def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0) -> torch.Tensor:
+        """`input_ids_shape` is expected to be [bsz x seqlen]."""
+        bsz, seq_len = input_ids_shape[:2]
+        positions = torch.arange(
+            past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device
+        )
+        return super().forward(positions)
+
+
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.ValueEmbedding
+class ValueEmbedding(nn.Module):
+    def __init__(self, feature_size, d_model):
+        super(ValueEmbedding, self).__init__()
+        self.value_projection = nn.Linear(in_features=feature_size, out_features=d_model, bias=False)
+
+    def forward(self, x):
+        return self.value_projection(x)
+
+
 @dataclass
 # Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.Seq2SeqTimeSeriesModelOutput
 class Seq2SeqTimeSeriesModelOutput(ModelOutput):
@@ -1184,7 +1263,7 @@ def _set_gradient_checkpointing(self, module, value=False):
     and behavior.
 
     Parameters:
-        config ([`InformerConfig`]):
+        config ([`InformerConfig`]):
             Model configuration class with all the parameters of the model. Initializing with a config file does not
             load the weights associated with the model, only the configuration. Check out the
             [`~PreTrainedModel.from_pretrained`] method to load the model weights.
@@ -1192,28 +1271,41 @@ def _set_gradient_checkpointing(self, module, value=False):
 
 INFORMER_INPUTS_DOCSTRING = r"""
     Args:
-        past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
-            Past values of the time series, that serve as context in order to predict the future. These values may
-            contain lags, i.e. additional values from the past which are added in order to serve as "extra context".
+        past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, input_size)`):
+            Past values of the time series, that serve as context in order to predict the future. The sequence size of
+            this tensor must be larger than the `context_length` of the model, since the model will use the larger size
+            to construct lag features, i.e. additional values from the past which are added in order to serve as "extra
+            context".
+
+            The `sequence_length` here is equal to `config.context_length` + `max(config.lags_sequence)`, which if no
+            `lags_sequence` is configured, is equal to `config.context_length` + 7 (as by default, the largest
+            look-back index in `config.lags_sequence` is 7). The property `_past_length` returns the actual length of
+            the past.
+
             The `past_values` is what the Transformer encoder gets as input (with optional additional features, such as
-            `static_categorical_features`, `static_real_features`, `past_time_features`).
+            `static_categorical_features`, `static_real_features`, `past_time_features` and lags).
 
-            The sequence length here is equal to `context_length` + `max(config.lags_sequence)`.
+            Optionally, missing values need to be replaced with zeros and indicated via the `past_observed_mask`.
 
-            Missing values need to be replaced with zeros.
-        past_time_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_features)`, *optional*):
-            Optional time features, which the model internally will add to `past_values`. These could be things like
+            For multivariate time series, the `input_size` > 1 dimension is required and corresponds to the number of
+            variates in the time series per time step.
+        past_time_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_features)`):
+            Required time features, which the model internally will add to `past_values`. These could be things like
             "month of year", "day of the month", etc. encoded as vectors (for instance as Fourier features). These
             could also be so-called "age" features, which basically help the model know "at which point in life" a
             time-series is. Age features have small values for distant past time steps and increase monotonically the
-            more we approach the current time step.
+            more we approach the current time step. Holiday features are also a good example of time features.
 
             These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT, where
             the position encodings are learned from scratch internally as parameters of the model, the Time Series
-            Transformer requires to provide additional time features.
+            Transformer requires to provide additional time features. The Time Series Transformer only learns
+            additional embeddings for `static_categorical_features`.
+
+            Additional dynamic real covariates can be concatenated to this tensor, with the caveat that these features
+            must be known at prediction time.
 
-            The Informer only learns additional embeddings for `static_categorical_features`.
-        past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            The `num_features` here is equal to `config.num_time_features` + `config.num_dynamic_real_features`.
+        past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, input_size)`, *optional*):
             Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected in
             `[0, 1]`:
 
@@ -1233,26 +1325,37 @@ def _set_gradient_checkpointing(self, module, value=False):
             Static real features are features which have the same value for all time steps (static over time).
 
             A typical example of a static real feature is promotion information.
-        future_values (`torch.FloatTensor` of shape `(batch_size, prediction_length)`):
+        future_values (`torch.FloatTensor` of shape `(batch_size, prediction_length)` or `(batch_size, prediction_length, input_size)`, *optional*):
             Future values of the time series, that serve as labels for the model. The `future_values` is what the
-            Transformer needs to learn to output, given the `past_values`.
+            Transformer needs during training to learn to output, given the `past_values`.
+
+            The sequence length here is equal to `prediction_length`.
 
             See the demo notebook and code snippets for details.
 
-            Missing values need to be replaced with zeros.
-        future_time_features (`torch.FloatTensor` of shape `(batch_size, prediction_length, num_features)`, *optional*):
-            Optional time features, which the model internally will add to `future_values`. These could be things like
-            "month of year", "day of the month", etc. encoded as vectors (for instance as Fourier features). These
-            could also be so-called "age" features, which basically help the model know "at which point in life" a
-            time-series is. Age features have small values for distant past time steps and increase monotonically the
-            more we approach the current time step.
+            Optionally, during training any missing values need to be replaced with zeros and indicated via the
+            `future_observed_mask`.
+
+            For multivariate time series, the `input_size` > 1 dimension is required and corresponds to the number of
+            variates in the time series per time step.
+        future_time_features (`torch.FloatTensor` of shape `(batch_size, prediction_length, num_features)`):
+            Required time features for the prediction window, which the model internally will add to `future_values`.
+            These could be things like "month of year", "day of the month", etc. encoded as vectors (for instance as
+            Fourier features). These could also be so-called "age" features, which basically help the model know "at
+            which point in life" a time-series is. Age features have small values for distant past time steps and
+            increase monotonically the more we approach the current time step. Holiday features are also a good example
+            of time features.
 
             These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT, where
             the position encodings are learned from scratch internally as parameters of the model, the Time Series
-            Transformer requires to provide additional features.
+            Transformer requires to provide additional time features. The Time Series Transformer only learns
+            additional embeddings for `static_categorical_features`.
+
+            Additional dynamic real covariates can be concatenated to this tensor, with the caveat that these features
+            must be known at prediction time.
 
-            The Informer only learns additional embeddings for `static_categorical_features`.        
-        future_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            The `num_features` here is equal to `config.num_time_features` + `config.num_dynamic_real_features`.
+        future_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, input_size)`, *optional*):
             Boolean mask to indicate which `future_values` were observed and which were missing. Mask values selected
             in `[0, 1]`:
 
@@ -1291,7 +1394,7 @@ def _set_gradient_checkpointing(self, module, value=False):
         encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
             Tuple consists of `last_hidden_state`, `hidden_states` (*optional*) and `attentions` (*optional*)
             `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` (*optional*) is a sequence of
-            hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.        
+            hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
         past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
             Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
             `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
@@ -1302,7 +1405,7 @@ def _set_gradient_checkpointing(self, module, value=False):
 
             If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
             don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
-            `decoder_input_ids` of shape `(batch_size, sequence_length)`.        
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
         inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
             Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
             is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
@@ -1337,10 +1440,12 @@ def __init__(self, config: InformerConfig):
         self.layerdrop = config.encoder_layerdrop
         self.gradient_checkpointing = False
 
-        embed_dim = config.d_model
-
+        self.value_embedding = ValueEmbedding(feature_size=config.feature_size, d_model=config.d_model)
+        self.embed_positions = TimeSeriesSinusoidalPositionalEmbedding(
+            config.context_length + config.prediction_length, config.d_model
+        )
         self.layers = nn.ModuleList([InformerEncoderLayer(config) for _ in range(config.encoder_layers)])
-        self.layernorm_embedding = nn.LayerNorm(embed_dim)
+        self.layernorm_embedding = nn.LayerNorm(config.d_model)
 
         if config.distil is not None:
             self.conv_layers = nn.ModuleList([ConvLayer(config.d_model) for _ in range(config.encoder_layers - 1)])
@@ -1394,8 +1499,10 @@ def forward(
         )
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
-        hidden_states = inputs_embeds
-        hidden_states = self.layernorm_embedding(hidden_states)
+        hidden_states = self.value_embedding(inputs_embeds)
+        embed_pos = self.embed_positions(inputs_embeds.size())
+
+        hidden_states = self.layernorm_embedding(hidden_states + embed_pos)
         hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
 
         # expand attention_mask
@@ -1478,11 +1585,15 @@ def __init__(self, config: InformerConfig):
 
         self.dropout = config.dropout
         self.layerdrop = config.decoder_layerdrop
+        self.gradient_checkpointing = False
 
+        self.value_embedding = ValueEmbedding(feature_size=config.feature_size, d_model=config.d_model)
+        self.embed_positions = TimeSeriesSinusoidalPositionalEmbedding(
+            config.context_length + config.prediction_length, config.d_model
+        )
         self.layers = nn.ModuleList([InformerDecoderLayer(config) for _ in range(config.decoder_layers)])
         self.layernorm_embedding = nn.LayerNorm(config.d_model)
 
-        self.gradient_checkpointing = False
         # Initialize weights and apply final processing
         self.post_init()
 
@@ -1598,9 +1709,9 @@ def forward(
             # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
             encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1])
 
-        hidden_states = inputs_embeds
-        hidden_states = self.layernorm_embedding(hidden_states)
-
+        hidden_states = self.value_embedding(inputs_embeds)
+        embed_pos = self.embed_positions(inputs_embeds.size(), past_key_values_length=self.config.context_length)
+        hidden_states = self.layernorm_embedding(hidden_states + embed_pos)
         hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
 
         # decoder layers
@@ -1701,6 +1812,7 @@ def custom_forward(*inputs):
     "The bare Informer Model outputting raw hidden-states without any specific head on top.",
     INFORMER_START_DOCSTRING,
 )
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerModel with TimeSeriesTransformer->Informer,TIME_SERIES_TRANSFORMER->INFORMER,time-series-transformer->informer
 class InformerModel(InformerPreTrainedModel):
     def __init__(self, config: InformerConfig):
         super().__init__(config)
@@ -1791,12 +1903,12 @@ def create_network_inputs(
 
         context = past_values[:, -self.config.context_length :]
         observed_context = past_observed_mask[:, -self.config.context_length :]
-        _, scale = self.scaler(context, observed_context)
+        _, loc, scale = self.scaler(context, observed_context)
 
         inputs = (
-            torch.cat((past_values, future_values), dim=1) / scale
+            (torch.cat((past_values, future_values), dim=1) - loc) / scale
             if future_values is not None
-            else past_values / scale
+            else (past_values - loc) / scale
         )
 
         inputs_length = (
@@ -1817,11 +1929,13 @@ def create_network_inputs(
         )
 
         # static features
-        static_feat = scale.log() if self.config.input_size == 1 else scale.squeeze(1).log()
+        log_abs_loc = loc.abs().log1p() if self.config.input_size == 1 else loc.squeeze(1).abs().log1p()
+        log_scale = scale.log() if self.config.input_size == 1 else scale.squeeze(1).log()
+        static_feat = torch.cat((log_abs_loc, log_scale), dim=1)
+
         if static_real_features is not None:
             static_feat = torch.cat((static_real_features, static_feat), dim=1)
         if static_categorical_features is not None:
-            # embeddings
             embedded_cat = self.embedder(static_categorical_features)
             static_feat = torch.cat((embedded_cat, static_feat), dim=1)
         expanded_static_feat = static_feat.unsqueeze(1).expand(-1, time_feat.shape[1], -1)
@@ -1829,14 +1943,14 @@ def create_network_inputs(
         # all features
         features = torch.cat((expanded_static_feat, time_feat), dim=-1)
 
+        # lagged features
         lagged_sequence = self.get_lagged_subsequences(sequence=inputs, subsequences_length=subsequences_length)
-
         lags_shape = lagged_sequence.shape
         reshaped_lagged_sequence = lagged_sequence.reshape(lags_shape[0], lags_shape[1], -1)
 
         transformer_inputs = torch.cat((reshaped_lagged_sequence, features), dim=-1)
 
-        return transformer_inputs, scale, static_feat
+        return transformer_inputs, loc, scale, static_feat
 
     def get_encoder(self):
         return self.encoder

From 32e94d1ac26604afd77b4e6c7b684562d7ebd35f Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Wed, 22 Feb 2023 10:10:17 +0100
Subject: [PATCH 110/164] make fix-copies

---
 .../models/informer/modeling_informer.py            | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 1d350754f884..761b9d364c31 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -1817,8 +1817,10 @@ class InformerModel(InformerPreTrainedModel):
     def __init__(self, config: InformerConfig):
         super().__init__(config)
 
-        if config.scaling:
+        if config.scaling == "mean" or config.scaling is True:  # bare truthiness would also match "std"
             self.scaler = MeanScaler(dim=1, keepdim=True)
+        elif config.scaling == "std":
+            self.scaler = StdScaler(dim=1, keepdim=True)
         else:
             self.scaler = NOPScaler(dim=1, keepdim=True)
 
@@ -1828,7 +1830,7 @@ def __init__(self, config: InformerConfig):
                 embedding_dims=config.embedding_dimension,
             )
 
-        # Informer encoder-decoder and mask initializer
+        # transformer encoder-decoder and mask initializer
         self.encoder = InformerEncoder(config)
         self.decoder = InformerDecoder(config)
 
@@ -1995,7 +1997,7 @@ def forward(
         ... )
         >>> batch = torch.load(file)
 
-        >>> model = InformerModel.from_pretrained("kashif/informer-tourism-monthly")
+        >>> model = InformerModel.from_pretrained("huggingface/informer-tourism-monthly")
 
         >>> # during training, one provides both past and future values
         >>> # as well as possible additional features
@@ -2018,7 +2020,7 @@ def forward(
         use_cache = use_cache if use_cache is not None else self.config.use_cache
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
-        transformer_inputs, scale, static_feat = self.create_network_inputs(
+        transformer_inputs, loc, scale, static_feat = self.create_network_inputs(
             past_values=past_values,
             past_time_features=past_time_features,
             past_observed_mask=past_observed_mask,
@@ -2060,7 +2062,7 @@ def forward(
         )
 
         if not return_dict:
-            return decoder_outputs + encoder_outputs + (scale, static_feat)
+            return decoder_outputs + encoder_outputs + (loc, scale, static_feat)
 
         return Seq2SeqTimeSeriesModelOutput(
             last_hidden_state=decoder_outputs.last_hidden_state,
@@ -2071,6 +2073,7 @@ def forward(
             encoder_last_hidden_state=encoder_outputs.last_hidden_state,
             encoder_hidden_states=encoder_outputs.hidden_states,
             encoder_attentions=encoder_outputs.attentions,
+            loc=loc,
             scale=scale,
             static_features=static_feat,
         )

From 29ba424d6e60d3395ee7af81feae7a596f30704b Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Wed, 22 Feb 2023 10:18:45 +0100
Subject: [PATCH 111/164] copy decoder

---
 src/transformers/models/informer/modeling_informer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 761b9d364c31..11b6178cac65 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -1572,6 +1572,7 @@ def custom_forward(*inputs):
         )
 
 
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerDecoder with TimeSeriesTransformer->Informer,TimeSeriesTransformerConfig->InformerConfig,TIME_SERIES_TRANSFORMER->INFORMER,time-series-transformer->informer,Transformer->Informer
 class InformerDecoder(InformerPreTrainedModel):
     """
     Informer decoder consisting of *config.decoder_layers* layers. Each layer is a [`InformerDecoderLayer`]
@@ -1582,10 +1583,8 @@ class InformerDecoder(InformerPreTrainedModel):
 
     def __init__(self, config: InformerConfig):
         super().__init__(config)
-
         self.dropout = config.dropout
         self.layerdrop = config.decoder_layerdrop
-        self.gradient_checkpointing = False
 
         self.value_embedding = ValueEmbedding(feature_size=config.feature_size, d_model=config.d_model)
         self.embed_positions = TimeSeriesSinusoidalPositionalEmbedding(
@@ -1594,6 +1593,7 @@ def __init__(self, config: InformerConfig):
         self.layers = nn.ModuleList([InformerDecoderLayer(config) for _ in range(config.decoder_layers)])
         self.layernorm_embedding = nn.LayerNorm(config.d_model)
 
+        self.gradient_checkpointing = False
         # Initialize weights and apply final processing
         self.post_init()
 

From 19abd0b447288da9106c62b4e637e54c2930439f Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Wed, 22 Feb 2023 10:22:16 +0100
Subject: [PATCH 112/164] fix unit tests

---
 tests/models/informer/test_modeling_informer.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/models/informer/test_modeling_informer.py b/tests/models/informer/test_modeling_informer.py
index 0ee8bd671f9c..3582320020e4 100644
--- a/tests/models/informer/test_modeling_informer.py
+++ b/tests/models/informer/test_modeling_informer.py
@@ -89,6 +89,7 @@ def __init__(
     def get_config(self):
         return InformerConfig(
             prediction_length=self.prediction_length,
+            d_model=self.hidden_size,
             encoder_layers=self.num_hidden_layers,
             decoder_layers=self.num_hidden_layers,
             encoder_attention_heads=self.num_attention_heads,
@@ -154,7 +155,7 @@ def check_encoder_decoder_model_standalone(self, config, inputs_dict):
             encoder.save_pretrained(tmpdirname)
             encoder = InformerEncoder.from_pretrained(tmpdirname).to(torch_device)
 
-        transformer_inputs, _, _ = model.create_network_inputs(**inputs_dict)
+        transformer_inputs, _, _, _ = model.create_network_inputs(**inputs_dict)
         enc_input = transformer_inputs[:, : config.context_length, ...]
         dec_input = transformer_inputs[:, config.context_length :, ...]
 
@@ -372,7 +373,7 @@ def test_attention_outputs(self):
             )
             out_len = len(outputs)
 
-            correct_outlen = 6
+            correct_outlen = 7
 
             if "last_hidden_state" in outputs:
                 correct_outlen += 1

From 39ce9fd7b469bdd2d8e81c9763df9738ea3a52b0 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Wed, 22 Feb 2023 10:56:45 +0100
Subject: [PATCH 113/164] updated scaling config

---
 src/transformers/models/informer/configuration_informer.py | 4 ++--
 src/transformers/models/informer/modeling_informer.py      | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/transformers/models/informer/configuration_informer.py b/src/transformers/models/informer/configuration_informer.py
index b25f7a1ec105..e416d48451a1 100644
--- a/src/transformers/models/informer/configuration_informer.py
+++ b/src/transformers/models/informer/configuration_informer.py
@@ -14,7 +14,7 @@
 # limitations under the License.
 """Informer model configuration"""
 
-from typing import List, Optional
+from typing import List, Optional, Union
 
 from ...configuration_utils import PretrainedConfig
 from ...utils import logging
@@ -145,7 +145,7 @@ def __init__(
         loss: str = "nll",
         input_size: int = 1,
         lags_sequence: List[int] = None,
-        scaling: bool = True,
+        scaling: Optional[Union[str, bool]] = "mean",
         num_dynamic_real_features: int = 0,
         num_static_real_features: int = 0,
         num_static_categorical_features: int = 0,
diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 11b6178cac65..1c1c546793e3 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -1426,8 +1426,8 @@ def _set_gradient_checkpointing(self, module, value=False):
 
 class InformerEncoder(InformerPreTrainedModel):
     """
-    Informer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
-    [`InformerEncoderLayer`].
+    Informer encoder consisting of *config.encoder_layers* self attention layers with distillation layers. Each
+    attention layer is an [`InformerEncoderLayer`].
 
     Args:
         config: InformerConfig

From 68e68c017eca7d88a0a363cd6fe3e3a3bf0abc00 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Wed, 22 Feb 2023 11:41:24 +0100
Subject: [PATCH 114/164] fix integration tests

---
 tests/models/informer/test_modeling_informer.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/tests/models/informer/test_modeling_informer.py b/tests/models/informer/test_modeling_informer.py
index 3582320020e4..3538effaeeee 100644
--- a/tests/models/informer/test_modeling_informer.py
+++ b/tests/models/informer/test_modeling_informer.py
@@ -455,7 +455,6 @@ def test_inference_no_head(self):
                 past_time_features=batch["past_time_features"],
                 past_observed_mask=batch["past_observed_mask"],
                 static_categorical_features=batch["static_categorical_features"],
-                static_real_features=batch["static_real_features"],
                 future_values=batch["future_values"],
                 future_time_features=batch["future_time_features"],
             ).last_hidden_state
@@ -463,7 +462,12 @@ def test_inference_no_head(self):
         self.assertEqual(output.shape, expected_shape)
 
         expected_slice = torch.tensor(
-            [[-0.6678, 0.4203, 0.0956], [-0.8622, 0.2728, 0.0858], [-0.5118, 0.2205, -0.0191]], device=torch_device
+            [
+                [4.6951e-01, 7.2927e-01, 8.9877e-01],
+                [4.8908e-01, 3.8113e-01, 9.5783e-01],
+                [7.7888e-04, 3.7370e-01, 1.0078e00],
+            ],
+            device=torch_device,
         )
         self.assertTrue(torch.allclose(output[0, :3, :3], expected_slice, atol=TOLERANCE))
 
@@ -478,7 +482,6 @@ def test_inference_head(self):
                 past_time_features=batch["past_time_features"],
                 past_observed_mask=batch["past_observed_mask"],
                 static_categorical_features=batch["static_categorical_features"],
-                static_real_features=batch["static_real_features"],
                 future_time_features=batch["future_time_features"],
             ).encoder_last_hidden_state
 
@@ -487,7 +490,7 @@ def test_inference_head(self):
         self.assertEqual(output.shape, expected_shape)
 
         expected_slice = torch.tensor(
-            [[-0.2993, 1.8141, -0.4122], [-0.3320, 2.0362, -0.7312], [-0.3640, 2.4771, -0.7129]], device=torch_device
+            [[0.4247, 0.9017, 0.8062], [0.3082, 0.7525, 0.6986], [0.6724, -0.6343, 1.2863]], device=torch_device
         )
         self.assertTrue(torch.allclose(output[0, :3, :3], expected_slice, atol=TOLERANCE))
 
@@ -499,7 +502,6 @@ def test_seq_to_seq_generation(self):
         with torch.no_grad():
             outputs = model.generate(
                 static_categorical_features=batch["static_categorical_features"],
-                static_real_features=batch["static_real_features"],
                 past_time_features=batch["past_time_features"],
                 past_values=batch["past_values"],
                 future_time_features=batch["future_time_features"],
@@ -508,6 +510,6 @@ def test_seq_to_seq_generation(self):
         expected_shape = torch.Size((64, model.config.num_parallel_samples, model.config.prediction_length))
         self.assertEqual(outputs.sequences.shape, expected_shape)
 
-        expected_slice = torch.tensor([2726.9468, 3130.4065, 4020.5728], device=torch_device)
+        expected_slice = torch.tensor([3400.8005, 4289.2637, 7101.9209], device=torch_device)
         mean_prediction = outputs.sequences.mean(dim=1)
         self.assertTrue(torch.allclose(mean_prediction[0, -3:], expected_slice, rtol=1e-1))

From 8ee1b1054695c277ca35468034b663878848e386 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Wed, 22 Feb 2023 11:45:56 +0100
Subject: [PATCH 115/164] IGNORE_NON_TESTED

---
 utils/check_repo.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/utils/check_repo.py b/utils/check_repo.py
index 53717645cf65..4e1902e02343 100644
--- a/utils/check_repo.py
+++ b/utils/check_repo.py
@@ -70,6 +70,8 @@
     "TableTransformerDecoder",  # Building part of bigger (tested) model.
     "TimeSeriesTransformerEncoder",  # Building part of bigger (tested) model.
     "TimeSeriesTransformerDecoder",  # Building part of bigger (tested) model.
+    "InformerEncoder",  # Building part of bigger (tested) model.
+    "InformerDecoder",  # Building part of bigger (tested) model.
     "JukeboxVQVAE",  # Building part of bigger (tested) model.
     "JukeboxPrior",  # Building part of bigger (tested) model.
     "DeformableDetrEncoder",  # Building part of bigger (tested) model.

From 05856aae4ffe6fe02c5c62b489b0c6369e432789 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Wed, 22 Feb 2023 11:54:46 +0100
Subject: [PATCH 116/164] IGNORE_NON_AUTO_CONFIGURED

---
 .../configuration_time_series_transformer.py                    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py
index 5b631f21d8e4..dcfc5ee517b7 100644
--- a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py
+++ b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py
@@ -118,7 +118,7 @@ class TimeSeriesTransformerConfig(PretrainedConfig):
     >>> from transformers import TimeSeriesTransformerConfig, TimeSeriesTransformerModel
 
     >>> # Initializing a default Time Series Transformer configuration
-    >>> configuration = TimeSeriesTransformerConfig()
+    >>> configuration = TimeSeriesTransformerConfig(prediction_length=7)
 
     >>> # Randomly initializing a model (with random weights) from the configuration
     >>> model = TimeSeriesTransformerModel(configuration)

From 50fc144d3899123946cf0d86a28c9cf1046782b0 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Wed, 22 Feb 2023 11:55:07 +0100
Subject: [PATCH 117/164] IGNORE_NON_AUTO_CONFIGURED

---
 utils/check_repo.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/utils/check_repo.py b/utils/check_repo.py
index 4e1902e02343..7d29563e74fa 100644
--- a/utils/check_repo.py
+++ b/utils/check_repo.py
@@ -209,6 +209,7 @@
     "EsmForProteinFolding",
     "GPTSanJapaneseModel",
     "TimeSeriesTransformerForPrediction",
+    "InformerForPrediction",
     "JukeboxVQVAE",
     "JukeboxPrior",
     "PegasusXEncoder",

From e7845e8b6834571c9950dcb48742f742d60f451f Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Wed, 22 Feb 2023 12:13:15 +0100
Subject: [PATCH 118/164] updated check configs

---
 utils/check_config_attributes.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/utils/check_config_attributes.py b/utils/check_config_attributes.py
index 93948cc2b9da..589a94ba6d07 100644
--- a/utils/check_config_attributes.py
+++ b/utils/check_config_attributes.py
@@ -69,6 +69,10 @@
     "RetriBertConfig": ["layer_norm_eps"],
     # having default values other than `1e-5` - we can't fix them without breaking
     "TrajectoryTransformerConfig": ["layer_norm_eps"],
+    # used internally to calculate the feature size
+    "InformerConfig": ["num_static_real_features", "num_time_features"],
+    # used internally to calculate the feature size
+    "TimeSeriesTransformerConfig": ["num_static_real_features", "num_time_features"],
 }
 
 # TODO (ydshieh): Check the failing cases, try to fix them or move some cases to the above block once we are sure
@@ -97,7 +101,6 @@
         "SwitchTransformersConfig": True,
         "TableTransformerConfig": True,
         "TapasConfig": True,
-        "TimeSeriesTransformerConfig": True,
         "TrajectoryTransformerConfig": True,
         "TransfoXLConfig": True,
         "UniSpeechConfig": True,

From b030bead7e784a9f7460a960a1030a40d6878754 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Wed, 22 Feb 2023 14:50:06 +0100
Subject: [PATCH 119/164] fix formatting

---
 src/transformers/__init__.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 324d72a09794..0b5d6d60a882 100644
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -311,10 +311,7 @@
     "models.hubert": ["HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "HubertConfig"],
     "models.ibert": ["IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "IBertConfig"],
     "models.imagegpt": ["IMAGEGPT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ImageGPTConfig"],
-    "models.informer": [
-        "INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
-        "InformerConfig",
-    ],
+    "models.informer": ["INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "InformerConfig"],
     "models.jukebox": [
         "JUKEBOX_PRETRAINED_CONFIG_ARCHIVE_MAP",
         "JukeboxConfig",

From 308268b83127c1db0c3ca6fc96f8fe9a06c5c2a9 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Thu, 23 Feb 2023 15:38:25 +0100
Subject: [PATCH 120/164] undo change from time series

---
 src/transformers/models/informer/configuration_informer.py    | 4 ++--
 .../configuration_time_series_transformer.py                  | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/transformers/models/informer/configuration_informer.py b/src/transformers/models/informer/configuration_informer.py
index e416d48451a1..ce0e05db5176 100644
--- a/src/transformers/models/informer/configuration_informer.py
+++ b/src/transformers/models/informer/configuration_informer.py
@@ -121,8 +121,8 @@ class InformerConfig(PretrainedConfig):
     ```python
     >>> from transformers import InformerConfig, InformerModel
 
-    >>> # Initializing a default Informer configuration
-    >>> configuration = InformerConfig(prediction_length=7)
+    >>> # Initializing an Informer configuration with 12 time steps for prediction
+    >>> configuration = InformerConfig(prediction_length=12)
 
     >>> # Randomly initializing a model (with random weights) from the configuration
     >>> model = InformerModel(configuration)
diff --git a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py
index dcfc5ee517b7..5b631f21d8e4 100644
--- a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py
+++ b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py
@@ -118,7 +118,7 @@ class TimeSeriesTransformerConfig(PretrainedConfig):
     >>> from transformers import TimeSeriesTransformerConfig, TimeSeriesTransformerModel
 
     >>> # Initializing a default Time Series Transformer configuration
-    >>> configuration = TimeSeriesTransformerConfig(prediction_length=7)
+    >>> configuration = TimeSeriesTransformerConfig()
 
     >>> # Randomly initializing a model (with random weights) from the configuration
     >>> model = TimeSeriesTransformerModel(configuration)

From f064080a0bbf38f0a1591a7b8833012acf1462e9 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Thu, 23 Feb 2023 16:45:02 +0100
Subject: [PATCH 121/164] prediction_length should not be None

---
 src/transformers/models/informer/configuration_informer.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/transformers/models/informer/configuration_informer.py b/src/transformers/models/informer/configuration_informer.py
index ce0e05db5176..b3e2955540f7 100644
--- a/src/transformers/models/informer/configuration_informer.py
+++ b/src/transformers/models/informer/configuration_informer.py
@@ -41,8 +41,9 @@ class InformerConfig(PretrainedConfig):
     documentation from [`PretrainedConfig`] for more information.
 
     Args:
-        prediction_length (`int`):
-            The prediction length for the decoder. In other words, the prediction horizon of the model.
+        prediction_length (`int`, defaults to 1):
+            The prediction length for the decoder. In other words, the prediction horizon of the model. This value is
+            typically set from the dataset one is training on and we recommend to change it appropriately.
         context_length (`int`, *optional*, defaults to `prediction_length`):
             The context length for the encoder. If `None`, the context length will be the same as the
             `prediction_length`.
@@ -139,7 +140,7 @@ class InformerConfig(PretrainedConfig):
 
     def __init__(
         self,
-        prediction_length: Optional[int] = None,
+        prediction_length: int = 1,
         context_length: Optional[int] = None,
         distribution_output: str = "student_t",
         loss: str = "nll",

From 5086c4c283f624920fc62bd6241f17f19b83ed66 Mon Sep 17 00:00:00 2001
From: elisim <elisimhayev@gmail.com>
Date: Fri, 24 Feb 2023 13:08:30 +0700
Subject: [PATCH 122/164] align with the blog: prettify ProbSparse and change
 attention_factor to sampling_factor

---
 .../models/informer/configuration_informer.py | 11 ++++----
 .../models/informer/modeling_informer.py      | 28 +++++++++----------
 .../models/informer/test_modeling_informer.py | 10 +++----
 3 files changed, 25 insertions(+), 24 deletions(-)

diff --git a/src/transformers/models/informer/configuration_informer.py b/src/transformers/models/informer/configuration_informer.py
index b3e2955540f7..caf23047c481 100644
--- a/src/transformers/models/informer/configuration_informer.py
+++ b/src/transformers/models/informer/configuration_informer.py
@@ -111,9 +111,10 @@ class InformerConfig(PretrainedConfig):
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether to use the past key/values attentions (if applicable to the model) to speed up decoding.
         attention_type (`str`, *optional*, defaults to "prob"):
-            Attention used in encoder. This can be set to "prob" (Informer's ProbAttention) or "full" (transformer).
-        attention_factor (`int`, *optional*, defaults to 2):
-            ProbSparse attention factor.
+            Attention used in encoder. This can be set to "prob" (Informer's ProbAttention) or "full"
+            (vanilla transformer's canonical self-attention).
+        sampling_factor (`int`, *optional*, defaults to 5):
+            ProbSparse sampling factor.  It is used to control the reduced query matrix (Q_reduce) input length.
         distil (`bool`, *optional*, defaults to `True`):
             Whether to use distilling in encoder.
 
@@ -172,7 +173,7 @@ def __init__(
         use_cache=True,
         # Informer arguments
         attention_type: str = "prob",
-        attention_factor: int = 2,
+        sampling_factor: int = 5,
         distil: bool = True,
         **kwargs,
     ):
@@ -237,7 +238,7 @@ def __init__(
 
         # Informer
         self.attention_type = attention_type
-        self.attention_factor = attention_factor
+        self.sampling_factor = sampling_factor
         self.distil = distil
 
         super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 1c1c546793e3..4e11ed9e1f6c 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -805,11 +805,11 @@ def __init__(
         num_heads: int,
         dropout: float = 0.0,
         is_decoder: bool = False,
-        attention_factor: int = 5,
+        sampling_factor: int = 5,
         bias: bool = True,
     ):
         super().__init__()
-        self.factor = attention_factor
+        self.factor = sampling_factor
         self.embed_dim = embed_dim
         self.num_heads = num_heads
         self.dropout = dropout
@@ -892,21 +892,21 @@ def forward(
         key_states = key_states.view(*proj_shape)
         value_states = value_states.view(*proj_shape)
 
-        # c*ln(L_k)
         L_K = key_states.size(1)
-        U_part = min(self.factor * np.ceil(np.log1p(L_K)).astype("int").item(), L_K)
+        log_L_K = np.ceil(np.log1p(L_K)).astype("int").item()
 
-        # c*ln(L_q)
         L_Q = query_states.size(1)
-        u = min(self.factor * np.ceil(np.log1p(L_Q)).astype("int").item(), L_Q)
+        log_L_Q = np.ceil(np.log1p(L_Q)).astype("int").item()
+
+        U_part = min(self.factor * log_L_K, L_K)
+        u = min(self.factor * log_L_Q, L_Q)
 
         if L_K > 0:
             index_sample = torch.randint(0, L_K, (U_part,))
-
-            # real U = U_part(factor*ln(L_k))*L_q
             K_sample = key_states[:, index_sample, :]
         else:
             K_sample = key_states
+
         Q_K_sample = torch.bmm(query_states, K_sample.transpose(1, 2))
 
         # find the Top_k query with sparsity measurement
@@ -914,15 +914,14 @@ def forward(
             M = Q_K_sample.max(dim=-1)[0] - torch.div(Q_K_sample.sum(dim=-1), L_K)
             M_top = M.topk(u, sorted=False)[1]
 
-            # use the reduced Q to calculate Q_K
-            # factor*ln(L_q)
+            # calculate Q_reduce: query_states[:, M_top]
             dim_for_slice = torch.arange(query_states.size(0)).unsqueeze(-1)
             Q_reduce = query_states[dim_for_slice, M_top]
         else:
             Q_reduce = query_states
             M_top = None
 
-        # score_top
+        # Use Q_reduce to calculate attention weights
         attn_weights = torch.bmm(Q_reduce, key_states.transpose(1, 2))
 
         src_len = key_states.size(1)
@@ -974,7 +973,8 @@ def forward(
         attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
         attn_output = torch.bmm(attn_probs, value_states)
 
-        # get initial context
+        # calculate contex for updating the attn_output, based on:
+        # https://github.com/zhouhaoyi/Informer2020/blob/ac59c7447135473fb2aafeafe94395f884d5c7a5/models/attn.py#L74
         if self.is_decoder:
             context = value_states.cumsum(dim=-2)
         else:
@@ -1038,7 +1038,7 @@ def __init__(self, config: InformerConfig):
                 embed_dim=self.embed_dim,
                 num_heads=config.encoder_attention_heads,
                 dropout=config.attention_dropout,
-                attention_factor=config.attention_factor,
+                sampling_factor=config.sampling_factor,
             )
         else:
             self.self_attn = InformerAttention(
@@ -1115,7 +1115,7 @@ def __init__(self, config: InformerConfig):
                 embed_dim=self.embed_dim,
                 num_heads=config.encoder_attention_heads,
                 dropout=config.attention_dropout,
-                attention_factor=config.attention_factor,
+                sampling_factor=config.sampling_factor,
                 is_decoder=True,
             )
         else:
diff --git a/tests/models/informer/test_modeling_informer.py b/tests/models/informer/test_modeling_informer.py
index 3538effaeeee..c7ca42141411 100644
--- a/tests/models/informer/test_modeling_informer.py
+++ b/tests/models/informer/test_modeling_informer.py
@@ -57,7 +57,7 @@ def __init__(
         hidden_dropout_prob=0.1,
         attention_probs_dropout_prob=0.1,
         lags_sequence=[1, 2, 3, 4, 5],
-        attention_factor=10,
+        sampling_factor=10,
         distil=False,
     ):
         self.parent = parent
@@ -78,12 +78,12 @@ def __init__(
         self.attention_probs_dropout_prob = attention_probs_dropout_prob
 
         self.encoder_seq_length = min(
-            attention_factor * np.ceil(np.log1p(context_length)).astype("int").item(), context_length
+            sampling_factor * np.ceil(np.log1p(context_length)).astype("int").item(), context_length
         )
         self.decoder_seq_length = min(
-            attention_factor * np.ceil(np.log1p(prediction_length)).astype("int").item(), prediction_length
+            sampling_factor * np.ceil(np.log1p(prediction_length)).astype("int").item(), prediction_length
         )
-        self.attention_factor = attention_factor
+        self.sampling_factor = sampling_factor
         self.distil = distil
 
     def get_config(self):
@@ -105,7 +105,7 @@ def get_config(self):
             num_static_real_features=1,
             cardinality=[self.cardinality],
             embedding_dimension=[self.embedding_dimension],
-            attention_factor=self.attention_factor,
+            sampling_factor=self.sampling_factor,
             distil=self.distil,
         )
 

From d47c556fa266c807f4b95cfa5b22fad3e4d36786 Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Fri, 24 Feb 2023 09:46:22 +0000
Subject: [PATCH 123/164] make style

---
 src/transformers/models/informer/configuration_informer.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/transformers/models/informer/configuration_informer.py b/src/transformers/models/informer/configuration_informer.py
index caf23047c481..41300e89699e 100644
--- a/src/transformers/models/informer/configuration_informer.py
+++ b/src/transformers/models/informer/configuration_informer.py
@@ -111,10 +111,10 @@ class InformerConfig(PretrainedConfig):
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether to use the past key/values attentions (if applicable to the model) to speed up decoding.
         attention_type (`str`, *optional*, defaults to "prob"):
-            Attention used in encoder. This can be set to "prob" (Informer's ProbAttention) or "full"
-            (vanilla transformer's canonical self-attention).
+            Attention used in encoder. This can be set to "prob" (Informer's ProbAttention) or "full" (vanilla
+            transformer's canonical self-attention).
         sampling_factor (`int`, *optional*, defaults to 5):
-            ProbSparse sampling factor.  It is used to control the reduced query matrix (Q_reduce) input length.
+            ProbSparse sampling factor. It is used to control the reduced query matrix (Q_reduce) input length.
         distil (`bool`, *optional*, defaults to `True`):
             Whether to use distilling in encoder.
 

From 5462fbdf94ad3337f4831e5f82743bb0eda2ebc2 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Fri, 24 Feb 2023 13:49:23 +0100
Subject: [PATCH 124/164] make fix-copies

---
 .../models/informer/configuration_informer.py          | 10 +++++-----
 src/transformers/models/informer/modeling_informer.py  |  4 ++++
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/transformers/models/informer/configuration_informer.py b/src/transformers/models/informer/configuration_informer.py
index 41300e89699e..17832c7c4a6d 100644
--- a/src/transformers/models/informer/configuration_informer.py
+++ b/src/transformers/models/informer/configuration_informer.py
@@ -41,9 +41,9 @@ class InformerConfig(PretrainedConfig):
     documentation from [`PretrainedConfig`] for more information.
 
     Args:
-        prediction_length (`int`, defaults to 1):
+        prediction_length (`int`):
             The prediction length for the decoder. In other words, the prediction horizon of the model. This value is
-            typically set from the dataset one is training on and we recommend to change it appropriately.
+            typically dictated by the dataset and we recommend to set it appropriately.
         context_length (`int`, *optional*, defaults to `prediction_length`):
             The context length for the encoder. If `None`, the context length will be the same as the
             `prediction_length`.
@@ -59,8 +59,8 @@ class InformerConfig(PretrainedConfig):
             Whether to scale the input targets via "mean" scaler, "std" scaler or no scaler if `None`. If `True`, the
             scaler is set to "mean".
         lags_sequence (`list[int]`, *optional*, defaults to `[1, 2, 3, 4, 5, 6, 7]`):
-            The lags of the input time series as covariates often dictated by the frequency. Default is `[1, 2, 3, 4,
-            5, 6, 7]`.
+            The lags of the input time series as covariates often dictated by the frequency of the data. Default is
+            `[1, 2, 3, 4, 5, 6, 7]` but we recommend to change it based on the dataset appropriately.
         num_time_features (`int`, *optional*, defaults to 0):
             The number of time features in the input time series.
         num_dynamic_real_features (`int`, *optional*, defaults to 0):
@@ -141,7 +141,7 @@ class InformerConfig(PretrainedConfig):
 
     def __init__(
         self,
-        prediction_length: int = 1,
+        prediction_length: Optional[int] = None,
         context_length: Optional[int] = None,
         distribution_output: str = "student_t",
         loss: str = "nll",
diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 4e11ed9e1f6c..c59a16b08c09 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -1439,6 +1439,8 @@ def __init__(self, config: InformerConfig):
         self.dropout = config.dropout
         self.layerdrop = config.encoder_layerdrop
         self.gradient_checkpointing = False
+        if config.prediction_length is None:
+            raise ValueError("The `prediction_length` config needs to be specified.")
 
         self.value_embedding = ValueEmbedding(feature_size=config.feature_size, d_model=config.d_model)
         self.embed_positions = TimeSeriesSinusoidalPositionalEmbedding(
@@ -1585,6 +1587,8 @@ def __init__(self, config: InformerConfig):
         super().__init__(config)
         self.dropout = config.dropout
         self.layerdrop = config.decoder_layerdrop
+        if config.prediction_length is None:
+            raise ValueError("The `prediction_length` config needs to be specified.")
 
         self.value_embedding = ValueEmbedding(feature_size=config.feature_size, d_model=config.d_model)
         self.embed_positions = TimeSeriesSinusoidalPositionalEmbedding(

From 4988b2fce74ad1e571747fadae3f3093d2ea97b7 Mon Sep 17 00:00:00 2001
From: elisim <elisimhayev@gmail.com>
Date: Sat, 25 Feb 2023 17:19:01 +0700
Subject: [PATCH 125/164] niels CR: update contributed by

---
 docs/source/en/model_doc/informer.mdx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/en/model_doc/informer.mdx b/docs/source/en/model_doc/informer.mdx
index d54813f4c11e..a4801f5dd36a 100644
--- a/docs/source/en/model_doc/informer.mdx
+++ b/docs/source/en/model_doc/informer.mdx
@@ -22,7 +22,7 @@ The abstract from the paper is the following:
 
 *Many real-world applications require the prediction of long sequence time-series, such as electricity consumption planning. Long sequence time-series forecasting (LSTF) demands a high prediction capacity of the model, which is the ability to capture precise long-range dependency coupling between output and input efficiently. Recent studies have shown the potential of Transformer to increase the prediction capacity. However, there are several severe issues with Transformer that prevent it from being directly applicable to LSTF, including quadratic time complexity, high memory usage, and inherent limitation of the encoder-decoder architecture. To address these issues, we design an efficient transformer-based model for LSTF, named Informer, with three distinctive characteristics: (i) a ProbSparse self-attention mechanism, which achieves O(L logL) in time complexity and memory usage, and has comparable performance on sequences' dependency alignment. (ii) the self-attention distilling highlights dominating attention by halving cascading layer input, and efficiently handles extreme long input sequences. (iii) the generative style decoder, while conceptually simple, predicts the long time-series sequences at one forward operation rather than a step-by-step way, which drastically improves the inference speed of long-sequence predictions. Extensive experiments on four large-scale datasets demonstrate that Informer significantly outperforms existing methods and provides a new solution to the LSTF problem.*
 
-This model was contributed by [elisim](https://huggingface.co/elisim), [kashif](https://huggingface.co/kashif) and [nielsr](https://huggingface.co/nielsr).
+This model was contributed by [elisim](https://huggingface.co/elisim) and [kashif](https://huggingface.co/kashif).
 The original code can be found [here](https://github.com/zhouhaoyi/Informer2020).
 
 

From c7be3cf028170da14b4e9f5b7987a5009bd685eb Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Sat, 25 Feb 2023 12:20:27 +0200
Subject: [PATCH 126/164] niels CR: update configuration_informer.py

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
---
 src/transformers/models/informer/configuration_informer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/informer/configuration_informer.py b/src/transformers/models/informer/configuration_informer.py
index 17832c7c4a6d..e63437654cb1 100644
--- a/src/transformers/models/informer/configuration_informer.py
+++ b/src/transformers/models/informer/configuration_informer.py
@@ -23,8 +23,8 @@
 logger = logging.get_logger(__name__)
 
 INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    "kashif/informer-tourism-monthly": (
-        "https://huggingface.co/kashif/informer-tourism-monthly/resolve/main/config.json"
+    "huggingface/informer-tourism-monthly": (
+        "https://huggingface.co/huggingface/informer-tourism-monthly/resolve/main/config.json"
     ),
     # See all Informer models at https://huggingface.co/models?filter=informer
 }

From bb03fc6c8c8ac6452236f017f12ba6dea486d620 Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Sat, 25 Feb 2023 12:21:12 +0200
Subject: [PATCH 127/164] niels CR: update kashif -> huggingface

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
---
 src/transformers/models/informer/modeling_informer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index c59a16b08c09..b8500243cbba 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -44,7 +44,7 @@
 
 
 INFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
-    "kashif/informer-tourism-monthly",
+    "huggingface/informer-tourism-monthly",
     # See all Informer models at https://huggingface.co/models?filter=informer
 ]
 

From 1e78712245c0ee031d6c3ce853d81872b3c94659 Mon Sep 17 00:00:00 2001
From: elisim <elisimhayev@gmail.com>
Date: Sat, 25 Feb 2023 17:26:54 +0700
Subject: [PATCH 128/164] niels CR: `sampling_factor` only relevant when
 `attention_type`=prob

---
 src/transformers/models/informer/configuration_informer.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/transformers/models/informer/configuration_informer.py b/src/transformers/models/informer/configuration_informer.py
index 17832c7c4a6d..b0f6b7accb2f 100644
--- a/src/transformers/models/informer/configuration_informer.py
+++ b/src/transformers/models/informer/configuration_informer.py
@@ -114,7 +114,8 @@ class InformerConfig(PretrainedConfig):
             Attention used in encoder. This can be set to "prob" (Informer's ProbAttention) or "full" (vanilla
             transformer's canonical self-attention).
         sampling_factor (`int`, *optional*, defaults to 5):
-            ProbSparse sampling factor. It is used to control the reduced query matrix (Q_reduce) input length.
+            ProbSparse sampling factor (only makes affect when `attention_type`="prob").
+            It is used to control the reduced query matrix (Q_reduce) input length.
         distil (`bool`, *optional*, defaults to `True`):
             Whether to use distilling in encoder.
 

From 2afa495b0bdefba5e44a3b23f52b680d78d1ad21 Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Sat, 25 Feb 2023 10:38:37 +0000
Subject: [PATCH 129/164] make style

---
 src/transformers/models/informer/configuration_informer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/informer/configuration_informer.py b/src/transformers/models/informer/configuration_informer.py
index 6154524b5d7c..d5950275b988 100644
--- a/src/transformers/models/informer/configuration_informer.py
+++ b/src/transformers/models/informer/configuration_informer.py
@@ -114,8 +114,8 @@ class InformerConfig(PretrainedConfig):
             Attention used in encoder. This can be set to "prob" (Informer's ProbAttention) or "full" (vanilla
             transformer's canonical self-attention).
         sampling_factor (`int`, *optional*, defaults to 5):
-            ProbSparse sampling factor (only makes affect when `attention_type`="prob").
-            It is used to control the reduced query matrix (Q_reduce) input length.
+            ProbSparse sampling factor (only takes effect when `attention_type`="prob"). It is used to control the
+            reduced query matrix (Q_reduce) input length.
         distil (`bool`, *optional*, defaults to `True`):
             Whether to use distilling in encoder.
 

From 193938ffbe0cd3178c704f8a1499226a92ee2cf1 Mon Sep 17 00:00:00 2001
From: elisim <elisimhayev@gmail.com>
Date: Sat, 25 Feb 2023 17:47:43 +0700
Subject: [PATCH 130/164] fixed U_part: added multiplication by `L_Q`

---
 src/transformers/models/informer/modeling_informer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index b8500243cbba..f05e7f0b8300 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -898,7 +898,7 @@ def forward(
         L_Q = query_states.size(1)
         log_L_Q = np.ceil(np.log1p(L_Q)).astype("int").item()
 
-        U_part = min(self.factor * log_L_K, L_K)
+        U_part = min(self.factor * L_Q * log_L_K, L_K)
         u = min(self.factor * log_L_Q, L_Q)
 
         if L_K > 0:

From f064beb38713d2c7be95584695abd06ae52fd112 Mon Sep 17 00:00:00 2001
From: elisim <elisimhayev@gmail.com>
Date: Sat, 25 Feb 2023 19:20:24 +0700
Subject: [PATCH 131/164] fixed bug: remove `is not None` from `if
 config.distil`

---
 src/transformers/models/informer/modeling_informer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index f05e7f0b8300..4c6907489369 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -1449,7 +1449,7 @@ def __init__(self, config: InformerConfig):
         self.layers = nn.ModuleList([InformerEncoderLayer(config) for _ in range(config.encoder_layers)])
         self.layernorm_embedding = nn.LayerNorm(config.d_model)
 
-        if config.distil is not None:
+        if config.distil:
             self.conv_layers = nn.ModuleList([ConvLayer(config.d_model) for _ in range(config.encoder_layers - 1)])
             self.conv_layers.append(None)
         else:

From 3db65bd4ae13f1b5fc23f01daef0ab3f14a99fc7 Mon Sep 17 00:00:00 2001
From: elisim <elisimhayev@gmail.com>
Date: Sat, 25 Feb 2023 20:28:14 +0700
Subject: [PATCH 132/164] fixed test: `decoder_seq_length` to
 `encoder_seq_length` in cross_attentions check

---
 tests/models/informer/test_modeling_informer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/models/informer/test_modeling_informer.py b/tests/models/informer/test_modeling_informer.py
index c7ca42141411..47e127465b0c 100644
--- a/tests/models/informer/test_modeling_informer.py
+++ b/tests/models/informer/test_modeling_informer.py
@@ -407,7 +407,7 @@ def test_attention_outputs(self):
                 [
                     self.model_tester.num_attention_heads,
                     decoder_seq_length,
-                    decoder_seq_length,
+                    encoder_seq_length,
                 ],
             )
 

From c4c6133701f53a7d16910b63ea4314101c02c81f Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Mon, 27 Feb 2023 09:14:13 +0100
Subject: [PATCH 133/164] fix integration tests

---
 tests/models/informer/test_modeling_informer.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/tests/models/informer/test_modeling_informer.py b/tests/models/informer/test_modeling_informer.py
index 47e127465b0c..637fca78cf00 100644
--- a/tests/models/informer/test_modeling_informer.py
+++ b/tests/models/informer/test_modeling_informer.py
@@ -462,11 +462,7 @@ def test_inference_no_head(self):
         self.assertEqual(output.shape, expected_shape)
 
         expected_slice = torch.tensor(
-            [
-                [4.6951e-01, 7.2927e-01, 8.9877e-01],
-                [4.8908e-01, 3.8113e-01, 9.5783e-01],
-                [7.7888e-04, 3.7370e-01, 1.0078e00],
-            ],
+            [[0.4699, 0.7295, 0.8967], [0.4858, 0.3810, 0.9641], [-0.0233, 0.3608, 1.0303]],
             device=torch_device,
         )
         self.assertTrue(torch.allclose(output[0, :3, :3], expected_slice, atol=TOLERANCE))
@@ -490,7 +486,7 @@ def test_inference_head(self):
         self.assertEqual(output.shape, expected_shape)
 
         expected_slice = torch.tensor(
-            [[0.4247, 0.9017, 0.8062], [0.3082, 0.7525, 0.6986], [0.6724, -0.6343, 1.2863]], device=torch_device
+            [[0.4170, 0.9067, 0.8153], [0.3004, 0.7574, 0.7066], [0.6803, -0.6323, 1.2802]], device=torch_device
         )
         self.assertTrue(torch.allclose(output[0, :3, :3], expected_slice, atol=TOLERANCE))
 

From 329647ec417c6afb01867a247dbe53e236cfc1a3 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Mon, 27 Feb 2023 09:31:38 +0100
Subject: [PATCH 134/164] updated model hub

---
 tests/models/informer/test_modeling_informer.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/models/informer/test_modeling_informer.py b/tests/models/informer/test_modeling_informer.py
index 637fca78cf00..91e6fb74f6f1 100644
--- a/tests/models/informer/test_modeling_informer.py
+++ b/tests/models/informer/test_modeling_informer.py
@@ -445,7 +445,7 @@ def prepare_batch(filename="train-batch.pt"):
 @slow
 class InformerModelIntegrationTests(unittest.TestCase):
     def test_inference_no_head(self):
-        model = InformerModel.from_pretrained("kashif/informer-tourism-monthly").to(torch_device)
+        model = InformerModel.from_pretrained("huggingface/informer-tourism-monthly").to(torch_device)
         batch = prepare_batch()
 
         torch.manual_seed(0)
@@ -468,7 +468,7 @@ def test_inference_no_head(self):
         self.assertTrue(torch.allclose(output[0, :3, :3], expected_slice, atol=TOLERANCE))
 
     def test_inference_head(self):
-        model = InformerForPrediction.from_pretrained("kashif/informer-tourism-monthly").to(torch_device)
+        model = InformerForPrediction.from_pretrained("huggingface/informer-tourism-monthly").to(torch_device)
         batch = prepare_batch("val-batch.pt")
 
         torch.manual_seed(0)
@@ -491,7 +491,7 @@ def test_inference_head(self):
         self.assertTrue(torch.allclose(output[0, :3, :3], expected_slice, atol=TOLERANCE))
 
     def test_seq_to_seq_generation(self):
-        model = InformerForPrediction.from_pretrained("kashif/informer-tourism-monthly").to(torch_device)
+        model = InformerForPrediction.from_pretrained("huggingface/informer-tourism-monthly").to(torch_device)
         batch = prepare_batch("val-batch.pt")
 
         torch.manual_seed(0)

From 582c867fc1cd59ab44b177298777afca46d56df4 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Mon, 27 Feb 2023 12:02:25 +0100
Subject: [PATCH 135/164] do not shift as in training

---
 src/transformers/models/informer/modeling_informer.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 4c6907489369..0b4974e48a4d 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -2398,9 +2398,7 @@ def generate(
         # greedy decoding
         for k in range(self.config.prediction_length):
             lagged_sequence = self.model.get_lagged_subsequences(
-                sequence=repeated_past_values,
-                subsequences_length=1 + k,
-                shift=1,
+                sequence=repeated_past_values, subsequences_length=1 + k
             )
 
             lags_shape = lagged_sequence.shape

From 25867e2c669079e1e3786cb16038a7831ed30329 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Mon, 27 Feb 2023 16:54:37 +0100
Subject: [PATCH 136/164] undo

---
 src/transformers/models/informer/modeling_informer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 0b4974e48a4d..5f38a964429e 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -1846,7 +1846,7 @@ def _past_length(self) -> int:
         return self.config.context_length + max(self.config.lags_sequence)
 
     def get_lagged_subsequences(
-        self, sequence: torch.Tensor, subsequences_length: int, shift: int = 0
+        self, sequence: torch.Tensor, subsequences_length: int, shift: int = 1
     ) -> torch.Tensor:
         """
         Returns lagged subsequences of a given sequence. Returns a tensor of shape (N, S, C, I),

From 7e8420e7988e5360daeef09e7310aa7b426be07f Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Tue, 28 Feb 2023 13:12:30 +0100
Subject: [PATCH 137/164] fix make-copies

---
 src/transformers/models/informer/modeling_informer.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 5f38a964429e..4c6907489369 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -1846,7 +1846,7 @@ def _past_length(self) -> int:
         return self.config.context_length + max(self.config.lags_sequence)
 
     def get_lagged_subsequences(
-        self, sequence: torch.Tensor, subsequences_length: int, shift: int = 1
+        self, sequence: torch.Tensor, subsequences_length: int, shift: int = 0
     ) -> torch.Tensor:
         """
         Returns lagged subsequences of a given sequence. Returns a tensor of shape (N, S, C, I),
@@ -2398,7 +2398,9 @@ def generate(
         # greedy decoding
         for k in range(self.config.prediction_length):
             lagged_sequence = self.model.get_lagged_subsequences(
-                sequence=repeated_past_values, subsequences_length=1 + k
+                sequence=repeated_past_values,
+                subsequences_length=1 + k,
+                shift=1,
             )
 
             lags_shape = lagged_sequence.shape

From 3074c7341d4900b6ed1d187339467fdf4646e0c4 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Wed, 1 Mar 2023 12:27:55 +0100
Subject: [PATCH 138/164] make fix-copies

---
 src/transformers/models/informer/modeling_informer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 4c6907489369..6782585629e8 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -1745,7 +1745,7 @@ def forward(
 
             if self.gradient_checkpointing and self.training:
                 if use_cache:
-                    logger.warning(
+                    logger.warning_once(
                         "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                     )
                     use_cache = False

From b4cbddfa05e3bd739b79569cd3c3b89e316f2451 Mon Sep 17 00:00:00 2001
From: elisim <elisimhayev@gmail.com>
Date: Wed, 1 Mar 2023 19:04:14 +0700
Subject: [PATCH 139/164] added `if prediction_length is None`

---
 src/transformers/models/informer/configuration_informer.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/transformers/models/informer/configuration_informer.py b/src/transformers/models/informer/configuration_informer.py
index d5950275b988..1dc95f0d49f9 100644
--- a/src/transformers/models/informer/configuration_informer.py
+++ b/src/transformers/models/informer/configuration_informer.py
@@ -179,6 +179,9 @@ def __init__(
         **kwargs,
     ):
         # time series specific configuration
+        if prediction_length is None:
+            raise ValueError("The `prediction_length` should be set.")
+
         self.prediction_length = prediction_length
         self.context_length = context_length or prediction_length
         self.distribution_output = distribution_output

From b934cb8fc2268e383df95217401f0681d3eb1928 Mon Sep 17 00:00:00 2001
From: elisim <elisimhayev@gmail.com>
Date: Wed, 1 Mar 2023 19:28:53 +0700
Subject: [PATCH 140/164] changed `ProbSparseAttention` to
 `InformerProbSparseAttention`

---
 .../models/informer/modeling_informer.py             | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 6782585629e8..9822f293265e 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -796,9 +796,11 @@ def forward(
         return attn_output, attn_weights_reshaped, past_key_value
 
 
-class ProbSparseAttention(nn.Module):
-    """ProbSparse Attention"""
-
+class InformerProbSparseAttention(nn.Module):
+    """Probabilistic Attention mechanism to select the "active"
+     queries rather than the "lazy" queries and provides a sparse
+     Transformer thus mitigating the quadratic compute and memory requirements of
+    vanilla attention"""
     def __init__(
         self,
         embed_dim: int,
@@ -1034,7 +1036,7 @@ def __init__(self, config: InformerConfig):
         super().__init__()
         self.embed_dim = config.d_model
         if config.attention_type == "prob":
-            self.self_attn = ProbSparseAttention(
+            self.self_attn = InformerProbSparseAttention(
                 embed_dim=self.embed_dim,
                 num_heads=config.encoder_attention_heads,
                 dropout=config.attention_dropout,
@@ -1111,7 +1113,7 @@ def __init__(self, config: InformerConfig):
         self.embed_dim = config.d_model
 
         if config.attention_type == "prob":
-            self.self_attn = ProbSparseAttention(
+            self.self_attn = InformerProbSparseAttention(
                 embed_dim=self.embed_dim,
                 num_heads=config.encoder_attention_heads,
                 dropout=config.attention_dropout,

From 426d14f67df84ff43f8232d70d6f551ec5ab0695 Mon Sep 17 00:00:00 2001
From: elisim <elisimhayev@gmail.com>
Date: Wed, 1 Mar 2023 19:42:11 +0700
Subject: [PATCH 141/164] changed `V_sum` -> `v_mean_dim_time`

---
 src/transformers/models/informer/modeling_informer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 9822f293265e..cfbc429e1ab1 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -980,8 +980,8 @@ def forward(
         if self.is_decoder:
             context = value_states.cumsum(dim=-2)
         else:
-            V_sum = value_states.mean(dim=-2)
-            context = V_sum.unsqueeze(dim=1).expand(bsz * self.num_heads, L_Q, V_sum.size(-1)).clone()
+            v_mean_dim_time = value_states.mean(dim=-2)
+            context = v_mean_dim_time.unsqueeze(dim=1).expand(bsz * self.num_heads, L_Q, v_mean_dim_time.size(-1)).clone()
 
         if M_top is not None:
             # update context: copy the attention output to the context at M_top index

From 73d18404b87654bfdab3c55f57bb5aaa556ee7d8 Mon Sep 17 00:00:00 2001
From: elisim <elisimhayev@gmail.com>
Date: Wed, 1 Mar 2023 19:47:27 +0700
Subject: [PATCH 142/164] changed `ConvLayer` to `InformerConvLayer` and fixed
 `super()`

---
 src/transformers/models/informer/modeling_informer.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index cfbc429e1ab1..3a0aef9bd0c7 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -1008,9 +1008,9 @@ def forward(
 
 
 # source: https://github.com/zhouhaoyi/Informer2020/blob/main/models/encoder.py
-class ConvLayer(nn.Module):
+class InformerConvLayer(nn.Module):
     def __init__(self, c_in):
-        super(ConvLayer, self).__init__()
+        super().__init__()
         self.downConv = nn.Conv1d(
             in_channels=c_in,
             out_channels=c_in,
@@ -1452,7 +1452,7 @@ def __init__(self, config: InformerConfig):
         self.layernorm_embedding = nn.LayerNorm(config.d_model)
 
         if config.distil:
-            self.conv_layers = nn.ModuleList([ConvLayer(config.d_model) for _ in range(config.encoder_layers - 1)])
+            self.conv_layers = nn.ModuleList([InformerConvLayer(config.d_model) for _ in range(config.encoder_layers - 1)])
             self.conv_layers.append(None)
         else:
             self.conv_layers = [None] * config.encoder_layers

From 3bc616394e2c88e83ab02f97a3357cffda1f38b8 Mon Sep 17 00:00:00 2001
From: elisim <elisimhayev@gmail.com>
Date: Wed, 1 Mar 2023 19:51:55 +0700
Subject: [PATCH 143/164] TimeSeriesTansformer->Informer in decoder's Copied
 from

---
 src/transformers/models/informer/modeling_informer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 3a0aef9bd0c7..334124d0c062 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -1576,7 +1576,7 @@ def custom_forward(*inputs):
         )
 
 
-# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerDecoder with TimeSeriesTransformer->Informer,TimeSeriesTransformerConfig->InformerConfig,TIME_SERIES_TRANSFORMER->INFORMER,time-series-transformer->informer,Transformer->Informer
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerDecoder with TimeSeriesTransformer->Informer,TimeSeriesTransformerConfig->InformerConfig,TIME_SERIES_TRANSFORMER->INFORMER,time-series-transformer->informer,TimeSeriesTansformer->Informer
 class InformerDecoder(InformerPreTrainedModel):
     """
     Informer decoder consisting of *config.decoder_layers* layers. Each layer is a [`InformerDecoderLayer`]

From ba6a8c3f27bddf8af4357db85941e08edadc6613 Mon Sep 17 00:00:00 2001
From: elisim <elisimhayev@gmail.com>
Date: Wed, 1 Mar 2023 20:11:30 +0700
Subject: [PATCH 144/164] more descriptive in ProbSparse

---
 .../models/informer/modeling_informer.py      | 50 +++++++++----------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 334124d0c062..8f539050ebaa 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -894,37 +894,37 @@ def forward(
         key_states = key_states.view(*proj_shape)
         value_states = value_states.view(*proj_shape)
 
-        L_K = key_states.size(1)
-        log_L_K = np.ceil(np.log1p(L_K)).astype("int").item()
+        key_states_time_length = key_states.size(1)  # L_K
+        log_key_states_time_length = np.ceil(np.log1p(key_states_time_length)).astype("int").item()  # log_L_K
 
-        L_Q = query_states.size(1)
-        log_L_Q = np.ceil(np.log1p(L_Q)).astype("int").item()
+        query_states_time_length = query_states.size(1)  # L_Q
+        log_query_states_time_length = np.ceil(np.log1p(query_states_time_length)).astype("int").item() # log_L_Q
 
-        U_part = min(self.factor * L_Q * log_L_K, L_K)
-        u = min(self.factor * log_L_Q, L_Q)
+        u_part = min(self.factor * query_states_time_length * log_key_states_time_length, key_states_time_length)
+        u = min(self.factor * log_query_states_time_length, query_states_time_length)
 
-        if L_K > 0:
-            index_sample = torch.randint(0, L_K, (U_part,))
-            K_sample = key_states[:, index_sample, :]
+        if key_states_time_length > 0:
+            index_sample = torch.randint(0, key_states_time_length, (u_part,))
+            k_sample = key_states[:, index_sample, :]
         else:
-            K_sample = key_states
+            k_sample = key_states
 
-        Q_K_sample = torch.bmm(query_states, K_sample.transpose(1, 2))
+        queries_keys_sample = torch.bmm(query_states, k_sample.transpose(1, 2))  # Q_K_sampled
 
         # find the Top_k query with sparsity measurement
         if u > 0:
-            M = Q_K_sample.max(dim=-1)[0] - torch.div(Q_K_sample.sum(dim=-1), L_K)
-            M_top = M.topk(u, sorted=False)[1]
+            sparsity_measurement = queries_keys_sample.max(dim=-1)[0] - torch.div(queries_keys_sample.sum(dim=-1), key_states_time_length)  # M
+            top_u_sparsity_measurement = sparsity_measurement.topk(u, sorted=False)[1]  # M_top
 
-            # calculate Q_reduce: query_states[:, M_top]
+            # calculate q_reduce: query_states[:, top_u_sparsity_measurement]
             dim_for_slice = torch.arange(query_states.size(0)).unsqueeze(-1)
-            Q_reduce = query_states[dim_for_slice, M_top]
+            q_reduce = query_states[dim_for_slice, top_u_sparsity_measurement]
         else:
-            Q_reduce = query_states
-            M_top = None
+            q_reduce = query_states
+            top_u_sparsity_measurement = None
 
-        # Use Q_reduce to calculate attention weights
-        attn_weights = torch.bmm(Q_reduce, key_states.transpose(1, 2))
+        # Use q_reduce to calculate attention weights
+        attn_weights = torch.bmm(q_reduce, key_states.transpose(1, 2))
 
         src_len = key_states.size(1)
         if attn_weights.size() != (bsz * self.num_heads, u, src_len):
@@ -942,9 +942,9 @@ def forward(
                 bsz * self.num_heads, tgt_len, src_len
             )
 
-            if M_top is not None:
+            if top_u_sparsity_measurement is not None:
                 dim_for_slice = torch.arange(prob_mask.size(0)).unsqueeze(-1)
-                prob_mask = prob_mask[dim_for_slice, M_top, :]
+                prob_mask = prob_mask[dim_for_slice, top_u_sparsity_measurement, :]
 
             attn_weights = attn_weights.view(bsz, self.num_heads, u, src_len) + prob_mask.view(
                 bsz, self.num_heads, u, src_len
@@ -981,12 +981,12 @@ def forward(
             context = value_states.cumsum(dim=-2)
         else:
             v_mean_dim_time = value_states.mean(dim=-2)
-            context = v_mean_dim_time.unsqueeze(dim=1).expand(bsz * self.num_heads, L_Q, v_mean_dim_time.size(-1)).clone()
+            context = v_mean_dim_time.unsqueeze(dim=1).expand(bsz * self.num_heads, query_states_time_length, v_mean_dim_time.size(-1)).clone()
 
-        if M_top is not None:
-            # update context: copy the attention output to the context at M_top index
+        if top_u_sparsity_measurement is not None:
+            # update context: copy the attention output to the context at top_u_sparsity_measurement index
             dim_for_slice = torch.arange(context.size(0)).unsqueeze(-1)
-            context[dim_for_slice, M_top, :] = attn_output
+            context[dim_for_slice, top_u_sparsity_measurement, :] = attn_output
             attn_output = context
 
         if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):

From f4fde7a5ccefb7fa94bc9c74520a7cc02d18cde8 Mon Sep 17 00:00:00 2001
From: Eli Simhayev <elisimhayev@gmail.com>
Date: Wed, 1 Mar 2023 13:22:56 +0000
Subject: [PATCH 145/164] make style

---
 .../models/informer/modeling_informer.py      | 21 +++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 8f539050ebaa..d924756fd62b 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -798,9 +798,10 @@ def forward(
 
 class InformerProbSparseAttention(nn.Module):
     """Probabilistic Attention mechanism to select the "active"
-     queries rather than the "lazy" queries and provides a sparse
-     Transformer thus mitigating the quadratic compute and memory requirements of
+     queries rather than the "lazy" queries and provides a sparse Transformer thus mitigating the quadratic compute and
+     memory requirements of
     vanilla attention"""
+
     def __init__(
         self,
         embed_dim: int,
@@ -898,7 +899,7 @@ def forward(
         log_key_states_time_length = np.ceil(np.log1p(key_states_time_length)).astype("int").item()  # log_L_K
 
         query_states_time_length = query_states.size(1)  # L_Q
-        log_query_states_time_length = np.ceil(np.log1p(query_states_time_length)).astype("int").item() # log_L_Q
+        log_query_states_time_length = np.ceil(np.log1p(query_states_time_length)).astype("int").item()  # log_L_Q
 
         u_part = min(self.factor * query_states_time_length * log_key_states_time_length, key_states_time_length)
         u = min(self.factor * log_query_states_time_length, query_states_time_length)
@@ -913,7 +914,9 @@ def forward(
 
         # find the Top_k query with sparsity measurement
         if u > 0:
-            sparsity_measurement = queries_keys_sample.max(dim=-1)[0] - torch.div(queries_keys_sample.sum(dim=-1), key_states_time_length)  # M
+            sparsity_measurement = queries_keys_sample.max(dim=-1)[0] - torch.div(
+                queries_keys_sample.sum(dim=-1), key_states_time_length
+            )  # M
             top_u_sparsity_measurement = sparsity_measurement.topk(u, sorted=False)[1]  # M_top
 
             # calculate q_reduce: query_states[:, top_u_sparsity_measurement]
@@ -981,7 +984,11 @@ def forward(
             context = value_states.cumsum(dim=-2)
         else:
             v_mean_dim_time = value_states.mean(dim=-2)
-            context = v_mean_dim_time.unsqueeze(dim=1).expand(bsz * self.num_heads, query_states_time_length, v_mean_dim_time.size(-1)).clone()
+            context = (
+                v_mean_dim_time.unsqueeze(dim=1)
+                .expand(bsz * self.num_heads, query_states_time_length, v_mean_dim_time.size(-1))
+                .clone()
+            )
 
         if top_u_sparsity_measurement is not None:
             # update context: copy the attention output to the context at top_u_sparsity_measurement index
@@ -1452,7 +1459,9 @@ def __init__(self, config: InformerConfig):
         self.layernorm_embedding = nn.LayerNorm(config.d_model)
 
         if config.distil:
-            self.conv_layers = nn.ModuleList([InformerConvLayer(config.d_model) for _ in range(config.encoder_layers - 1)])
+            self.conv_layers = nn.ModuleList(
+                [InformerConvLayer(config.d_model) for _ in range(config.encoder_layers - 1)]
+            )
             self.conv_layers.append(None)
         else:
             self.conv_layers = [None] * config.encoder_layers

From 65811d12c71afab94f4635431fb626a483c56f60 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Wed, 1 Mar 2023 15:07:08 +0100
Subject: [PATCH 146/164] fix copied from

---
 src/transformers/models/informer/modeling_informer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index d924756fd62b..46366e03105f 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -1585,7 +1585,7 @@ def custom_forward(*inputs):
         )
 
 
-# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerDecoder with TimeSeriesTransformer->Informer,TimeSeriesTransformerConfig->InformerConfig,TIME_SERIES_TRANSFORMER->INFORMER,time-series-transformer->informer,TimeSeriesTansformer->Informer
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerDecoder with TimeSeriesTransformer->Informer,TimeSeriesTransformerConfig->InformerConfig,time-series-transformer->informer,Transformer->Informer
 class InformerDecoder(InformerPreTrainedModel):
     """
     Informer decoder consisting of *config.decoder_layers* layers. Each layer is a [`InformerDecoderLayer`]

From 18fd0f8f8a8e65ca6349eb27f69ae9afe252d1d3 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Wed, 1 Mar 2023 15:07:20 +0100
Subject: [PATCH 147/164] Revert "added `if prediction_length is None`"

This reverts commit b4cbddfa05e3bd739b79569cd3c3b89e316f2451.
---
 src/transformers/models/informer/configuration_informer.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/transformers/models/informer/configuration_informer.py b/src/transformers/models/informer/configuration_informer.py
index 1dc95f0d49f9..d5950275b988 100644
--- a/src/transformers/models/informer/configuration_informer.py
+++ b/src/transformers/models/informer/configuration_informer.py
@@ -179,9 +179,6 @@ def __init__(
         **kwargs,
     ):
         # time series specific configuration
-        if prediction_length is None:
-            raise ValueError("The `prediction_length` should be set.")
-
         self.prediction_length = prediction_length
         self.context_length = context_length or prediction_length
         self.distribution_output = distribution_output

From f9e3bee68337ad419231702ae9b258a299a4dd5c Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Wed, 1 Mar 2023 15:09:14 +0100
Subject: [PATCH 148/164] fixed indent

---
 src/transformers/models/informer/modeling_informer.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 46366e03105f..5caef3250d71 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -798,9 +798,8 @@ def forward(
 
 class InformerProbSparseAttention(nn.Module):
     """Probabilistic Attention mechanism to select the "active"
-     queries rather than the "lazy" queries and provides a sparse Transformer thus mitigating the quadratic compute and
-     memory requirements of
-    vanilla attention"""
+    queries rather than the "lazy" queries and provides a sparse Transformer thus mitigating the quadratic compute and
+    memory requirements of vanilla attention"""
 
     def __init__(
         self,

From e13dfab24bba015d4a77b531a420df94bc50f299 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Thu, 2 Mar 2023 10:58:38 +0100
Subject: [PATCH 149/164] use InformerSinusoidalPositionalEmbedding

---
 src/transformers/models/informer/modeling_informer.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 5caef3250d71..76d41793abf6 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -444,8 +444,8 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int]
     return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
 
 
-# Copied from transformers.models.marian.modeling_marian.MarianSinusoidalPositionalEmbedding with Marian->TimeSeries
-class TimeSeriesSinusoidalPositionalEmbedding(nn.Embedding):
+# Copied from transformers.models.marian.modeling_marian.MarianSinusoidalPositionalEmbedding with Marian->Informer
+class InformerSinusoidalPositionalEmbedding(nn.Embedding):
     """This module produces sinusoidal positional embeddings of any length."""
 
     def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None) -> None:
@@ -1451,7 +1451,7 @@ def __init__(self, config: InformerConfig):
             raise ValueError("The `prediction_length` config needs to be specified.")
 
         self.value_embedding = ValueEmbedding(feature_size=config.feature_size, d_model=config.d_model)
-        self.embed_positions = TimeSeriesSinusoidalPositionalEmbedding(
+        self.embed_positions = InformerSinusoidalPositionalEmbedding(
             config.context_length + config.prediction_length, config.d_model
         )
         self.layers = nn.ModuleList([InformerEncoderLayer(config) for _ in range(config.encoder_layers)])
@@ -1601,7 +1601,7 @@ def __init__(self, config: InformerConfig):
             raise ValueError("The `prediction_length` config needs to be specified.")
 
         self.value_embedding = ValueEmbedding(feature_size=config.feature_size, d_model=config.d_model)
-        self.embed_positions = TimeSeriesSinusoidalPositionalEmbedding(
+        self.embed_positions = InformerSinusoidalPositionalEmbedding(
             config.context_length + config.prediction_length, config.d_model
         )
         self.layers = nn.ModuleList([InformerDecoderLayer(config) for _ in range(config.decoder_layers)])

From b873c383c21dedd3b3965af4abb71d98ef5c12da Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Thu, 2 Mar 2023 15:19:46 +0100
Subject: [PATCH 150/164] make fix-style

---
 src/transformers/models/informer/modeling_informer.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 76d41793abf6..088e5d62755e 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -733,8 +733,8 @@ def forward(
 
         proj_shape = (bsz * self.num_heads, -1, self.head_dim)
         query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
-        key_states = key_states.view(*proj_shape)
-        value_states = value_states.view(*proj_shape)
+        key_states = key_states.reshape(*proj_shape)
+        value_states = value_states.reshape(*proj_shape)
 
         src_len = key_states.size(1)
         attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
@@ -780,7 +780,7 @@ def forward(
 
         if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
             raise ValueError(
-                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
+                f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is"
                 f" {attn_output.size()}"
             )
 
@@ -788,7 +788,7 @@ def forward(
         attn_output = attn_output.transpose(1, 2)
 
         # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
-        # partitioned aross GPUs when using tensor-parallelism.
+        # partitioned across GPUs when using tensor-parallelism.
         attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
 
         attn_output = self.out_proj(attn_output)
@@ -1601,7 +1601,7 @@ def __init__(self, config: InformerConfig):
             raise ValueError("The `prediction_length` config needs to be specified.")
 
         self.value_embedding = ValueEmbedding(feature_size=config.feature_size, d_model=config.d_model)
-        self.embed_positions = InformerSinusoidalPositionalEmbedding(
+        self.embed_positions = TimeSeriesSinusoidalPositionalEmbedding(
             config.context_length + config.prediction_length, config.d_model
         )
         self.layers = nn.ModuleList([InformerDecoderLayer(config) for _ in range(config.decoder_layers)])

From f5579648e0f0779f8b63a2d2b6fbaad02e0eed82 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Thu, 2 Mar 2023 15:49:19 +0100
Subject: [PATCH 151/164] fix from #21860

---
 src/transformers/models/informer/modeling_informer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 088e5d62755e..a44108d59dc6 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -891,8 +891,8 @@ def forward(
 
         proj_shape = (bsz * self.num_heads, -1, self.head_dim)
         query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
-        key_states = key_states.view(*proj_shape)
-        value_states = value_states.view(*proj_shape)
+        key_states = key_states.reshape(*proj_shape)
+        value_states = value_states.reshape(*proj_shape)
 
         key_states_time_length = key_states.size(1)  # L_K
         log_key_states_time_length = np.ceil(np.log1p(key_states_time_length)).astype("int").item()  # log_L_K

From d58f3ab732cb643b2c0759062785cd51a0c6c5df Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Thu, 2 Mar 2023 16:14:22 +0100
Subject: [PATCH 152/164] fix name

---
 src/transformers/models/informer/modeling_informer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index a44108d59dc6..e1dbd225f53f 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -1601,7 +1601,7 @@ def __init__(self, config: InformerConfig):
             raise ValueError("The `prediction_length` config needs to be specified.")
 
         self.value_embedding = ValueEmbedding(feature_size=config.feature_size, d_model=config.d_model)
-        self.embed_positions = TimeSeriesSinusoidalPositionalEmbedding(
+        self.embed_positions = InformerSinusoidalPositionalEmbedding(
             config.context_length + config.prediction_length, config.d_model
         )
         self.layers = nn.ModuleList([InformerDecoderLayer(config) for _ in range(config.decoder_layers)])

From 3bf819d0aecb7c1f398fbcebe0ce4c656bb5840d Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Thu, 2 Mar 2023 20:48:10 +0100
Subject: [PATCH 153/164] make fix-copies

---
 .../models/informer/modeling_informer.py      | 37 +++++++------------
 1 file changed, 14 insertions(+), 23 deletions(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index e1dbd225f53f..347937dbe5c5 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -1584,7 +1584,7 @@ def custom_forward(*inputs):
         )
 
 
-# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerDecoder with TimeSeriesTransformer->Informer,TimeSeriesTransformerConfig->InformerConfig,time-series-transformer->informer,Transformer->Informer
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerDecoder with TimeSeriesTransformer->Informer,TimeSeriesTransformerConfig->InformerConfig,time-series-transformer->informer,Transformer->Informer,TimeSeries->Informer
 class InformerDecoder(InformerPreTrainedModel):
     """
     Informer decoder consisting of *config.decoder_layers* layers. Each layer is a [`InformerDecoderLayer`]
@@ -1874,14 +1874,11 @@ def get_lagged_subsequences(
         sequence_length = sequence.shape[1]
         indices = [lag - shift for lag in self.config.lags_sequence]
 
-        try:
-            assert max(indices) + subsequences_length <= sequence_length, (
+        if max(indices) + subsequences_length > sequence_length:
+            raise ValueError(
                 f"lags cannot go further than history length, found lag {max(indices)} "
                 f"while history length is only {sequence_length}"
             )
-        except AssertionError as e:
-            e.args += (max(indices), sequence_length)
-            raise
 
         lagged_values = []
         for lag_index in indices:
@@ -1927,23 +1924,6 @@ def create_network_inputs(
             else (past_values - loc) / scale
         )
 
-        inputs_length = (
-            self._past_length + self.config.prediction_length if future_values is not None else self._past_length
-        )
-        try:
-            assert inputs.shape[1] == inputs_length, (
-                f"input length {inputs.shape[1]} and dynamic feature lengths {inputs_length} does not match",
-            )
-        except AssertionError as e:
-            e.args += (inputs.shape[1], inputs_length)
-            raise
-
-        subsequences_length = (
-            self.config.context_length + self.config.prediction_length
-            if future_values is not None
-            else self.config.context_length
-        )
-
         # static features
         log_abs_loc = loc.abs().log1p() if self.config.input_size == 1 else loc.squeeze(1).abs().log1p()
         log_scale = scale.log() if self.config.input_size == 1 else scale.squeeze(1).log()
@@ -1960,10 +1940,21 @@ def create_network_inputs(
         features = torch.cat((expanded_static_feat, time_feat), dim=-1)
 
         # lagged features
+        subsequences_length = (
+            self.config.context_length + self.config.prediction_length
+            if future_values is not None
+            else self.config.context_length
+        )
         lagged_sequence = self.get_lagged_subsequences(sequence=inputs, subsequences_length=subsequences_length)
         lags_shape = lagged_sequence.shape
         reshaped_lagged_sequence = lagged_sequence.reshape(lags_shape[0], lags_shape[1], -1)
 
+        if reshaped_lagged_sequence.shape[1] != time_feat.shape[1]:
+            raise ValueError(
+                f"input length {reshaped_lagged_sequence.shape[1]} and time feature lengths {time_feat.shape[1]} does not match"
+            )
+
+        # transformer inputs
         transformer_inputs = torch.cat((reshaped_lagged_sequence, features), dim=-1)
 
         return transformer_inputs, loc, scale, static_feat

From 370373832548ea91c1734334a45a3c0034f94c80 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Mon, 6 Mar 2023 13:24:41 +0100
Subject: [PATCH 154/164] use time series utils

---
 src/transformers/__init__.py                  |  11 +
 .../models/informer/modeling_informer.py      | 396 +-----------------
 .../modeling_time_series_transformer.py       | 381 +----------------
 src/transformers/time_series_utils.py         | 382 +++++++++++++++++
 4 files changed, 426 insertions(+), 744 deletions(-)
 create mode 100644 src/transformers/time_series_utils.py

diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index b2b7a8fb4778..cff526f24fd4 100644
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -2718,6 +2718,17 @@
     ]
     _import_structure["pytorch_utils"] = ["Conv1D", "apply_chunking_to_forward", "prune_layer"]
     _import_structure["sagemaker"] = []
+    _import_structure["time_series_utils"] = [
+        "FeatureEmbedder",
+        "MeanScaler",
+        "NegativeBinomialOutput",
+        "NegativeLogLikelihood",
+        "NOPScaler",
+        "NormalOutput",
+        "StdScaler",
+        "StudentTOutput",
+        "weighted_average",
+    ]
     _import_structure["trainer"] = ["Trainer"]
     _import_structure["trainer_pt_utils"] = ["torch_distributed_zero_first"]
     _import_structure["trainer_seq2seq"] = ["Seq2SeqTrainer"]
diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 347937dbe5c5..587be255e0e0 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -16,24 +16,26 @@
 
 import random
 from dataclasses import dataclass
-from typing import Callable, Dict, List, Optional, Tuple, Union
+from typing import List, Optional, Tuple, Union
 
 import numpy as np
 import torch
 from torch import nn
-from torch.distributions import (
-    AffineTransform,
-    Distribution,
-    Independent,
-    NegativeBinomial,
-    Normal,
-    StudentT,
-    TransformedDistribution,
-)
 
 from ...activations import ACT2FN
 from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions, ModelOutput
 from ...modeling_utils import PreTrainedModel
+from ...time_series_utils import (
+    FeatureEmbedder,
+    MeanScaler,
+    NegativeBinomialOutput,
+    NegativeLogLikelihood,
+    NOPScaler,
+    NormalOutput,
+    StdScaler,
+    StudentTOutput,
+    weighted_average,
+)
 from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
 from .configuration_informer import InformerConfig
 
@@ -49,370 +51,6 @@
 ]
 
 
-# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.AffineTransformed
-class AffineTransformed(TransformedDistribution):
-    def __init__(self, base_distribution: Distribution, loc=None, scale=None, event_dim=0):
-        self.scale = 1.0 if scale is None else scale
-        self.loc = 0.0 if loc is None else loc
-
-        super().__init__(base_distribution, [AffineTransform(loc=self.loc, scale=self.scale, event_dim=event_dim)])
-
-    @property
-    def mean(self):
-        """
-        Returns the mean of the distribution.
-        """
-        return self.base_dist.mean * self.scale + self.loc
-
-    @property
-    def variance(self):
-        """
-        Returns the variance of the distribution.
-        """
-        return self.base_dist.variance * self.scale**2
-
-    @property
-    def stddev(self):
-        """
-        Returns the standard deviation of the distribution.
-        """
-        return self.variance.sqrt()
-
-
-# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.ParameterProjection
-class ParameterProjection(nn.Module):
-    def __init__(
-        self, in_features: int, args_dim: Dict[str, int], domain_map: Callable[..., Tuple[torch.Tensor]], **kwargs
-    ) -> None:
-        super().__init__(**kwargs)
-        self.args_dim = args_dim
-        self.proj = nn.ModuleList([nn.Linear(in_features, dim) for dim in args_dim.values()])
-        self.domain_map = domain_map
-
-    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor]:
-        params_unbounded = [proj(x) for proj in self.proj]
-
-        return self.domain_map(*params_unbounded)
-
-
-# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.LambdaLayer
-class LambdaLayer(nn.Module):
-    def __init__(self, function):
-        super().__init__()
-        self.function = function
-
-    def forward(self, x, *args):
-        return self.function(x, *args)
-
-
-# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.DistributionOutput
-class DistributionOutput:
-    distribution_class: type
-    in_features: int
-    args_dim: Dict[str, int]
-
-    def __init__(self, dim: int = 1) -> None:
-        self.dim = dim
-        self.args_dim = {k: dim * self.args_dim[k] for k in self.args_dim}
-
-    def _base_distribution(self, distr_args):
-        if self.dim == 1:
-            return self.distribution_class(*distr_args)
-        else:
-            return Independent(self.distribution_class(*distr_args), 1)
-
-    def distribution(
-        self,
-        distr_args,
-        loc: Optional[torch.Tensor] = None,
-        scale: Optional[torch.Tensor] = None,
-    ) -> Distribution:
-        distr = self._base_distribution(distr_args)
-        if loc is None and scale is None:
-            return distr
-        else:
-            return AffineTransformed(distr, loc=loc, scale=scale, event_dim=self.event_dim)
-
-    @property
-    def event_shape(self) -> Tuple:
-        r"""
-        Shape of each individual event contemplated by the distributions that this object constructs.
-        """
-        return () if self.dim == 1 else (self.dim,)
-
-    @property
-    def event_dim(self) -> int:
-        r"""
-        Number of event dimensions, i.e., length of the `event_shape` tuple, of the distributions that this object
-        constructs.
-        """
-        return len(self.event_shape)
-
-    @property
-    def value_in_support(self) -> float:
-        r"""
-        A float that will have a valid numeric value when computing the log-loss of the corresponding distribution. By
-        default 0.0. This value will be used when padding data series.
-        """
-        return 0.0
-
-    def get_parameter_projection(self, in_features: int) -> nn.Module:
-        r"""
-        Return the parameter projection layer that maps the input to the appropriate parameters of the distribution.
-        """
-        return ParameterProjection(
-            in_features=in_features,
-            args_dim=self.args_dim,
-            domain_map=LambdaLayer(self.domain_map),
-        )
-
-    def domain_map(self, *args: torch.Tensor):
-        r"""
-        Converts arguments to the right shape and domain. The domain depends on the type of distribution, while the
-        correct shape is obtained by reshaping the trailing axis in such a way that the returned tensors define a
-        distribution of the right event_shape.
-        """
-        raise NotImplementedError()
-
-    @classmethod
-    def squareplus(cls, x: torch.Tensor) -> torch.Tensor:
-        r"""
-        Helper to map inputs to the positive orthant by applying the square-plus operation. Reference:
-        https://twitter.com/jon_barron/status/1387167648669048833
-        """
-        return (x + torch.sqrt(torch.square(x) + 4.0)) / 2.0
-
-
-# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.StudentTOutput
-class StudentTOutput(DistributionOutput):
-    args_dim: Dict[str, int] = {"df": 1, "loc": 1, "scale": 1}
-    distribution_class: type = StudentT
-
-    @classmethod
-    def domain_map(cls, df: torch.Tensor, loc: torch.Tensor, scale: torch.Tensor):
-        scale = cls.squareplus(scale)
-        df = 2.0 + cls.squareplus(df)
-        return df.squeeze(-1), loc.squeeze(-1), scale.squeeze(-1)
-
-
-# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.NormalOutput
-class NormalOutput(DistributionOutput):
-    args_dim: Dict[str, int] = {"loc": 1, "scale": 1}
-    distribution_class: type = Normal
-
-    @classmethod
-    def domain_map(cls, loc: torch.Tensor, scale: torch.Tensor):
-        scale = cls.squareplus(scale)
-        return loc.squeeze(-1), scale.squeeze(-1)
-
-
-# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.NegativeBinomialOutput
-class NegativeBinomialOutput(DistributionOutput):
-    args_dim: Dict[str, int] = {"total_count": 1, "logits": 1}
-    distribution_class: type = NegativeBinomial
-
-    @classmethod
-    def domain_map(cls, total_count: torch.Tensor, logits: torch.Tensor):
-        total_count = cls.squareplus(total_count)
-        return total_count.squeeze(-1), logits.squeeze(-1)
-
-    def _base_distribution(self, distr_args) -> Distribution:
-        total_count, logits = distr_args
-        if self.dim == 1:
-            return self.distribution_class(total_count=total_count, logits=logits)
-        else:
-            return Independent(self.distribution_class(total_count=total_count, logits=logits), 1)
-
-    # Overwrites the parent class method. We cannot scale using the affine
-    # transformation since negative binomial should return integers. Instead
-    # we scale the parameters.
-    def distribution(
-        self, distr_args, loc: Optional[torch.Tensor] = None, scale: Optional[torch.Tensor] = None
-    ) -> Distribution:
-        total_count, logits = distr_args
-
-        if scale is not None:
-            # See scaling property of Gamma.
-            logits += scale.log()
-
-        return self._base_distribution((total_count, logits))
-
-
-# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.FeatureEmbedder
-class FeatureEmbedder(nn.Module):
-    def __init__(self, cardinalities: List[int], embedding_dims: List[int]) -> None:
-        super().__init__()
-
-        self.num_features = len(cardinalities)
-        self.embedders = nn.ModuleList([nn.Embedding(c, d) for c, d in zip(cardinalities, embedding_dims)])
-
-    def forward(self, features: torch.Tensor) -> torch.Tensor:
-        if self.num_features > 1:
-            # we slice the last dimension, giving an array of length
-            # self.num_features with shape (N,T) or (N)
-            cat_feature_slices = torch.chunk(features, self.num_features, dim=-1)
-        else:
-            cat_feature_slices = [features]
-
-        return torch.cat(
-            [
-                embed(cat_feature_slice.squeeze(-1))
-                for embed, cat_feature_slice in zip(self.embedders, cat_feature_slices)
-            ],
-            dim=-1,
-        )
-
-
-# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.StdScaler
-class StdScaler(nn.Module):
-    """
-    Standardize features by calculating the mean and scaling along some given dimension `dim`, and then normalizes it
-    by subtracting from the mean and dividing by the standard deviation.
-
-    Args:
-        dim (`int`):
-            Dimension along which to calculate the mean and standard deviation.
-        keepdim (`bool`, *optional*, defaults to `False`):
-            Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it.
-        minimum_scale (`float`, *optional*, defaults to 1e-5):
-            Default scale that is used for elements that are constantly zero along dimension `dim`.
-    """
-
-    def __init__(self, dim: int, keepdim: bool = False, minimum_scale: float = 1e-5):
-        super().__init__()
-        if not dim > 0:
-            raise ValueError("Cannot compute scale along dim = 0 (batch dimension), please provide dim > 0")
-        self.dim = dim
-        self.keepdim = keepdim
-        self.minimum_scale = minimum_scale
-
-    @torch.no_grad()
-    def forward(self, data: torch.Tensor, weights: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        denominator = weights.sum(self.dim, keepdim=self.keepdim)
-        denominator = denominator.clamp_min(1.0)
-        loc = (data * weights).sum(self.dim, keepdim=self.keepdim) / denominator
-
-        variance = (((data - loc) * weights) ** 2).sum(self.dim, keepdim=self.keepdim) / denominator
-        scale = torch.sqrt(variance + self.minimum_scale)
-        return (data - loc) / scale, loc, scale
-
-
-# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.MeanScaler
-class MeanScaler(nn.Module):
-    """
-    Computes a scaling factor as the weighted average absolute value along dimension `dim`, and scales the data
-    accordingly.
-
-    Args:
-        dim (`int`):
-            Dimension along which to compute the scale.
-        keepdim (`bool`, *optional*, defaults to `False`):
-            Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it.
-        default_scale (`float`, *optional*, defaults to `None`):
-            Default scale that is used for elements that are constantly zero. If `None`, we use the scale of the batch.
-        minimum_scale (`float`, *optional*, defaults to 1e-10):
-            Default minimum possible scale that is used for any item.
-    """
-
-    def __init__(
-        self, dim: int = -1, keepdim: bool = True, default_scale: Optional[float] = None, minimum_scale: float = 1e-10
-    ):
-        super().__init__()
-        self.dim = dim
-        self.keepdim = keepdim
-        self.minimum_scale = minimum_scale
-        self.default_scale = default_scale
-
-    @torch.no_grad()
-    def forward(self, data: torch.Tensor, observed_indicator: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
-        # shape: (N, [C], T=1)
-        ts_sum = (data * observed_indicator).abs().sum(self.dim, keepdim=True)
-        num_observed = observed_indicator.sum(self.dim, keepdim=True)
-
-        scale = ts_sum / torch.clamp(num_observed, min=1)
-
-        # If `default_scale` is provided, we use it, otherwise we use the scale
-        # of the batch.
-        if self.default_scale is None:
-            batch_sum = ts_sum.sum(dim=0)
-            batch_observations = torch.clamp(num_observed.sum(0), min=1)
-            default_scale = torch.squeeze(batch_sum / batch_observations)
-        else:
-            default_scale = self.default_scale * torch.ones_like(scale)
-
-        # apply default scale where there are no observations
-        scale = torch.where(num_observed > 0, scale, default_scale)
-
-        # ensure the scale is at least `self.minimum_scale`
-        scale = torch.clamp(scale, min=self.minimum_scale)
-        scaled_data = data / scale
-
-        if not self.keepdim:
-            scale = scale.squeeze(dim=self.dim)
-
-        return scaled_data, torch.zeros_like(scale), scale
-
-
-# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.NOPScaler
-class NOPScaler(nn.Module):
-    """
-    Assigns a scaling factor equal to 1 along dimension `dim`, and therefore applies no scaling to the input data.
-
-    Args:
-        dim (`int`):
-            Dimension along which to compute the scale.
-        keepdim (`bool`, *optional*, defaults to `False`):
-            Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it.
-    """
-
-    def __init__(self, dim: int, keepdim: bool = False):
-        super().__init__()
-        self.dim = dim
-        self.keepdim = keepdim
-
-    def forward(
-        self, data: torch.Tensor, observed_indicator: torch.Tensor
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        scale = torch.ones_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim)
-        loc = torch.zeros_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim)
-        return data, loc, scale
-
-
-# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.weighted_average
-def weighted_average(input_tensor: torch.Tensor, weights: Optional[torch.Tensor] = None, dim=None) -> torch.Tensor:
-    """
-    Computes the weighted average of a given tensor across a given `dim`, masking values associated with weight zero,
-    meaning instead of `nan * 0 = nan` you will get `0 * 0 = 0`.
-
-    Args:
-        input_tensor (`torch.FloatTensor`):
-            Input tensor, of which the average must be computed.
-        weights (`torch.FloatTensor`, *optional*):
-            Weights tensor, of the same shape as `input_tensor`.
-        dim (`int`, *optional*):
-            The dim along which to average `input_tensor`.
-
-    Returns:
-        `torch.FloatTensor`: The tensor with values averaged along the specified `dim`.
-    """
-    if weights is not None:
-        weighted_tensor = torch.where(weights != 0, input_tensor * weights, torch.zeros_like(input_tensor))
-        sum_weights = torch.clamp(weights.sum(dim=dim) if dim else weights.sum(), min=1.0)
-        return (weighted_tensor.sum(dim=dim) if dim else weighted_tensor.sum()) / sum_weights
-    else:
-        return input_tensor.mean(dim=dim)
-
-
-# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.NegativeLogLikelihood
-class NegativeLogLikelihood:
-    """
-    Computes the negative log likelihood loss from input distribution with respect to target.
-    """
-
-    def __call__(self, input: torch.distributions.Distribution, target: torch.Tensor) -> torch.Tensor:
-        return -input.log_prob(target)
-
-
 # Copied from transformers.models.bart.modeling_bart._make_causal_mask
 def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_values_length: int = 0):
     """
@@ -479,10 +117,10 @@ def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0)
         return super().forward(positions)
 
 
-# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.ValueEmbedding
-class ValueEmbedding(nn.Module):
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesValueEmbedding with TimeSeries->Info
+class InformerValueEmbedding(nn.Module):
     def __init__(self, feature_size, d_model):
-        super(ValueEmbedding, self).__init__()
+        super().__init__()
         self.value_projection = nn.Linear(in_features=feature_size, out_features=d_model, bias=False)
 
     def forward(self, x):
@@ -1450,7 +1088,7 @@ def __init__(self, config: InformerConfig):
         if config.prediction_length is None:
             raise ValueError("The `prediction_length` config needs to be specified.")
 
-        self.value_embedding = ValueEmbedding(feature_size=config.feature_size, d_model=config.d_model)
+        self.value_embedding = InformerValueEmbedding(feature_size=config.feature_size, d_model=config.d_model)
         self.embed_positions = InformerSinusoidalPositionalEmbedding(
             config.context_length + config.prediction_length, config.d_model
         )
@@ -1600,7 +1238,7 @@ def __init__(self, config: InformerConfig):
         if config.prediction_length is None:
             raise ValueError("The `prediction_length` config needs to be specified.")
 
-        self.value_embedding = ValueEmbedding(feature_size=config.feature_size, d_model=config.d_model)
+        self.value_embedding = InformerValueEmbedding(feature_size=config.feature_size, d_model=config.d_model)
         self.embed_positions = InformerSinusoidalPositionalEmbedding(
             config.context_length + config.prediction_length, config.d_model
         )
diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py
index 4708bde705bd..9796d9378bce 100644
--- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py
+++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py
@@ -17,24 +17,26 @@
 
 import random
 from dataclasses import dataclass
-from typing import Callable, Dict, List, Optional, Tuple, Union
+from typing import List, Optional, Tuple, Union
 
 import numpy as np
 import torch
 from torch import nn
-from torch.distributions import (
-    AffineTransform,
-    Distribution,
-    Independent,
-    NegativeBinomial,
-    Normal,
-    StudentT,
-    TransformedDistribution,
-)
 
 from ...activations import ACT2FN
 from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions, ModelOutput
 from ...modeling_utils import PreTrainedModel
+from ...time_series_utils import (
+    FeatureEmbedder,
+    MeanScaler,
+    NegativeBinomialOutput,
+    NegativeLogLikelihood,
+    NOPScaler,
+    NormalOutput,
+    StdScaler,
+    StudentTOutput,
+    weighted_average,
+)
 from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
 from .configuration_time_series_transformer import TimeSeriesTransformerConfig
 
@@ -50,357 +52,6 @@
 ]
 
 
-class AffineTransformed(TransformedDistribution):
-    def __init__(self, base_distribution: Distribution, loc=None, scale=None, event_dim=0):
-        self.scale = 1.0 if scale is None else scale
-        self.loc = 0.0 if loc is None else loc
-
-        super().__init__(base_distribution, [AffineTransform(loc=self.loc, scale=self.scale, event_dim=event_dim)])
-
-    @property
-    def mean(self):
-        """
-        Returns the mean of the distribution.
-        """
-        return self.base_dist.mean * self.scale + self.loc
-
-    @property
-    def variance(self):
-        """
-        Returns the variance of the distribution.
-        """
-        return self.base_dist.variance * self.scale**2
-
-    @property
-    def stddev(self):
-        """
-        Returns the standard deviation of the distribution.
-        """
-        return self.variance.sqrt()
-
-
-class ParameterProjection(nn.Module):
-    def __init__(
-        self, in_features: int, args_dim: Dict[str, int], domain_map: Callable[..., Tuple[torch.Tensor]], **kwargs
-    ) -> None:
-        super().__init__(**kwargs)
-        self.args_dim = args_dim
-        self.proj = nn.ModuleList([nn.Linear(in_features, dim) for dim in args_dim.values()])
-        self.domain_map = domain_map
-
-    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor]:
-        params_unbounded = [proj(x) for proj in self.proj]
-
-        return self.domain_map(*params_unbounded)
-
-
-class LambdaLayer(nn.Module):
-    def __init__(self, function):
-        super().__init__()
-        self.function = function
-
-    def forward(self, x, *args):
-        return self.function(x, *args)
-
-
-class DistributionOutput:
-    distribution_class: type
-    in_features: int
-    args_dim: Dict[str, int]
-
-    def __init__(self, dim: int = 1) -> None:
-        self.dim = dim
-        self.args_dim = {k: dim * self.args_dim[k] for k in self.args_dim}
-
-    def _base_distribution(self, distr_args):
-        if self.dim == 1:
-            return self.distribution_class(*distr_args)
-        else:
-            return Independent(self.distribution_class(*distr_args), 1)
-
-    def distribution(
-        self,
-        distr_args,
-        loc: Optional[torch.Tensor] = None,
-        scale: Optional[torch.Tensor] = None,
-    ) -> Distribution:
-        distr = self._base_distribution(distr_args)
-        if loc is None and scale is None:
-            return distr
-        else:
-            return AffineTransformed(distr, loc=loc, scale=scale, event_dim=self.event_dim)
-
-    @property
-    def event_shape(self) -> Tuple:
-        r"""
-        Shape of each individual event contemplated by the distributions that this object constructs.
-        """
-        return () if self.dim == 1 else (self.dim,)
-
-    @property
-    def event_dim(self) -> int:
-        r"""
-        Number of event dimensions, i.e., length of the `event_shape` tuple, of the distributions that this object
-        constructs.
-        """
-        return len(self.event_shape)
-
-    @property
-    def value_in_support(self) -> float:
-        r"""
-        A float that will have a valid numeric value when computing the log-loss of the corresponding distribution. By
-        default 0.0. This value will be used when padding data series.
-        """
-        return 0.0
-
-    def get_parameter_projection(self, in_features: int) -> nn.Module:
-        r"""
-        Return the parameter projection layer that maps the input to the appropriate parameters of the distribution.
-        """
-        return ParameterProjection(
-            in_features=in_features,
-            args_dim=self.args_dim,
-            domain_map=LambdaLayer(self.domain_map),
-        )
-
-    def domain_map(self, *args: torch.Tensor):
-        r"""
-        Converts arguments to the right shape and domain. The domain depends on the type of distribution, while the
-        correct shape is obtained by reshaping the trailing axis in such a way that the returned tensors define a
-        distribution of the right event_shape.
-        """
-        raise NotImplementedError()
-
-    @classmethod
-    def squareplus(cls, x: torch.Tensor) -> torch.Tensor:
-        r"""
-        Helper to map inputs to the positive orthant by applying the square-plus operation. Reference:
-        https://twitter.com/jon_barron/status/1387167648669048833
-        """
-        return (x + torch.sqrt(torch.square(x) + 4.0)) / 2.0
-
-
-class StudentTOutput(DistributionOutput):
-    args_dim: Dict[str, int] = {"df": 1, "loc": 1, "scale": 1}
-    distribution_class: type = StudentT
-
-    @classmethod
-    def domain_map(cls, df: torch.Tensor, loc: torch.Tensor, scale: torch.Tensor):
-        scale = cls.squareplus(scale)
-        df = 2.0 + cls.squareplus(df)
-        return df.squeeze(-1), loc.squeeze(-1), scale.squeeze(-1)
-
-
-class NormalOutput(DistributionOutput):
-    args_dim: Dict[str, int] = {"loc": 1, "scale": 1}
-    distribution_class: type = Normal
-
-    @classmethod
-    def domain_map(cls, loc: torch.Tensor, scale: torch.Tensor):
-        scale = cls.squareplus(scale)
-        return loc.squeeze(-1), scale.squeeze(-1)
-
-
-class NegativeBinomialOutput(DistributionOutput):
-    args_dim: Dict[str, int] = {"total_count": 1, "logits": 1}
-    distribution_class: type = NegativeBinomial
-
-    @classmethod
-    def domain_map(cls, total_count: torch.Tensor, logits: torch.Tensor):
-        total_count = cls.squareplus(total_count)
-        return total_count.squeeze(-1), logits.squeeze(-1)
-
-    def _base_distribution(self, distr_args) -> Distribution:
-        total_count, logits = distr_args
-        if self.dim == 1:
-            return self.distribution_class(total_count=total_count, logits=logits)
-        else:
-            return Independent(self.distribution_class(total_count=total_count, logits=logits), 1)
-
-    # Overwrites the parent class method. We cannot scale using the affine
-    # transformation since negative binomial should return integers. Instead
-    # we scale the parameters.
-    def distribution(
-        self, distr_args, loc: Optional[torch.Tensor] = None, scale: Optional[torch.Tensor] = None
-    ) -> Distribution:
-        total_count, logits = distr_args
-
-        if scale is not None:
-            # See scaling property of Gamma.
-            logits += scale.log()
-
-        return self._base_distribution((total_count, logits))
-
-
-class FeatureEmbedder(nn.Module):
-    def __init__(self, cardinalities: List[int], embedding_dims: List[int]) -> None:
-        super().__init__()
-
-        self.num_features = len(cardinalities)
-        self.embedders = nn.ModuleList([nn.Embedding(c, d) for c, d in zip(cardinalities, embedding_dims)])
-
-    def forward(self, features: torch.Tensor) -> torch.Tensor:
-        if self.num_features > 1:
-            # we slice the last dimension, giving an array of length
-            # self.num_features with shape (N,T) or (N)
-            cat_feature_slices = torch.chunk(features, self.num_features, dim=-1)
-        else:
-            cat_feature_slices = [features]
-
-        return torch.cat(
-            [
-                embed(cat_feature_slice.squeeze(-1))
-                for embed, cat_feature_slice in zip(self.embedders, cat_feature_slices)
-            ],
-            dim=-1,
-        )
-
-
-class StdScaler(nn.Module):
-    """
-    Standardize features by calculating the mean and scaling along some given dimension `dim`, and then normalizes it
-    by subtracting from the mean and dividing by the standard deviation.
-
-    Args:
-        dim (`int`):
-            Dimension along which to calculate the mean and standard deviation.
-        keepdim (`bool`, *optional*, defaults to `False`):
-            Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it.
-        minimum_scale (`float`, *optional*, defaults to 1e-5):
-            Default scale that is used for elements that are constantly zero along dimension `dim`.
-    """
-
-    def __init__(self, dim: int, keepdim: bool = False, minimum_scale: float = 1e-5):
-        super().__init__()
-        if not dim > 0:
-            raise ValueError("Cannot compute scale along dim = 0 (batch dimension), please provide dim > 0")
-        self.dim = dim
-        self.keepdim = keepdim
-        self.minimum_scale = minimum_scale
-
-    @torch.no_grad()
-    def forward(self, data: torch.Tensor, weights: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        denominator = weights.sum(self.dim, keepdim=self.keepdim)
-        denominator = denominator.clamp_min(1.0)
-        loc = (data * weights).sum(self.dim, keepdim=self.keepdim) / denominator
-
-        variance = (((data - loc) * weights) ** 2).sum(self.dim, keepdim=self.keepdim) / denominator
-        scale = torch.sqrt(variance + self.minimum_scale)
-        return (data - loc) / scale, loc, scale
-
-
-class MeanScaler(nn.Module):
-    """
-    Computes a scaling factor as the weighted average absolute value along dimension `dim`, and scales the data
-    accordingly.
-
-    Args:
-        dim (`int`):
-            Dimension along which to compute the scale.
-        keepdim (`bool`, *optional*, defaults to `False`):
-            Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it.
-        default_scale (`float`, *optional*, defaults to `None`):
-            Default scale that is used for elements that are constantly zero. If `None`, we use the scale of the batch.
-        minimum_scale (`float`, *optional*, defaults to 1e-10):
-            Default minimum possible scale that is used for any item.
-    """
-
-    def __init__(
-        self, dim: int = -1, keepdim: bool = True, default_scale: Optional[float] = None, minimum_scale: float = 1e-10
-    ):
-        super().__init__()
-        self.dim = dim
-        self.keepdim = keepdim
-        self.minimum_scale = minimum_scale
-        self.default_scale = default_scale
-
-    @torch.no_grad()
-    def forward(self, data: torch.Tensor, observed_indicator: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
-        # shape: (N, [C], T=1)
-        ts_sum = (data * observed_indicator).abs().sum(self.dim, keepdim=True)
-        num_observed = observed_indicator.sum(self.dim, keepdim=True)
-
-        scale = ts_sum / torch.clamp(num_observed, min=1)
-
-        # If `default_scale` is provided, we use it, otherwise we use the scale
-        # of the batch.
-        if self.default_scale is None:
-            batch_sum = ts_sum.sum(dim=0)
-            batch_observations = torch.clamp(num_observed.sum(0), min=1)
-            default_scale = torch.squeeze(batch_sum / batch_observations)
-        else:
-            default_scale = self.default_scale * torch.ones_like(scale)
-
-        # apply default scale where there are no observations
-        scale = torch.where(num_observed > 0, scale, default_scale)
-
-        # ensure the scale is at least `self.minimum_scale`
-        scale = torch.clamp(scale, min=self.minimum_scale)
-        scaled_data = data / scale
-
-        if not self.keepdim:
-            scale = scale.squeeze(dim=self.dim)
-
-        return scaled_data, torch.zeros_like(scale), scale
-
-
-class NOPScaler(nn.Module):
-    """
-    Assigns a scaling factor equal to 1 along dimension `dim`, and therefore applies no scaling to the input data.
-
-    Args:
-        dim (`int`):
-            Dimension along which to compute the scale.
-        keepdim (`bool`, *optional*, defaults to `False`):
-            Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it.
-    """
-
-    def __init__(self, dim: int, keepdim: bool = False):
-        super().__init__()
-        self.dim = dim
-        self.keepdim = keepdim
-
-    def forward(
-        self, data: torch.Tensor, observed_indicator: torch.Tensor
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        scale = torch.ones_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim)
-        loc = torch.zeros_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim)
-        return data, loc, scale
-
-
-def weighted_average(input_tensor: torch.Tensor, weights: Optional[torch.Tensor] = None, dim=None) -> torch.Tensor:
-    """
-    Computes the weighted average of a given tensor across a given `dim`, masking values associated with weight zero,
-    meaning instead of `nan * 0 = nan` you will get `0 * 0 = 0`.
-
-    Args:
-        input_tensor (`torch.FloatTensor`):
-            Input tensor, of which the average must be computed.
-        weights (`torch.FloatTensor`, *optional*):
-            Weights tensor, of the same shape as `input_tensor`.
-        dim (`int`, *optional*):
-            The dim along which to average `input_tensor`.
-
-    Returns:
-        `torch.FloatTensor`: The tensor with values averaged along the specified `dim`.
-    """
-    if weights is not None:
-        weighted_tensor = torch.where(weights != 0, input_tensor * weights, torch.zeros_like(input_tensor))
-        sum_weights = torch.clamp(weights.sum(dim=dim) if dim else weights.sum(), min=1.0)
-        return (weighted_tensor.sum(dim=dim) if dim else weighted_tensor.sum()) / sum_weights
-    else:
-        return input_tensor.mean(dim=dim)
-
-
-class NegativeLogLikelihood:
-    """
-    Computes the negative log likelihood loss from input distribution with respect to target.
-    """
-
-    def __call__(self, input: torch.distributions.Distribution, target: torch.Tensor) -> torch.Tensor:
-        return -input.log_prob(target)
-
-
 # Copied from transformers.models.bart.modeling_bart._make_causal_mask
 def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_values_length: int = 0):
     """
@@ -467,9 +118,9 @@ def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0)
         return super().forward(positions)
 
 
-class ValueEmbedding(nn.Module):
+class TimeSeriesValueEmbedding(nn.Module):
     def __init__(self, feature_size, d_model):
-        super(ValueEmbedding, self).__init__()
+        super().__init__()
         self.value_projection = nn.Linear(in_features=feature_size, out_features=d_model, bias=False)
 
     def forward(self, x):
@@ -1179,7 +830,7 @@ def __init__(self, config: TimeSeriesTransformerConfig):
         if config.prediction_length is None:
             raise ValueError("The `prediction_length` config needs to be specified.")
 
-        self.value_embedding = ValueEmbedding(feature_size=config.feature_size, d_model=config.d_model)
+        self.value_embedding = TimeSeriesValueEmbedding(feature_size=config.feature_size, d_model=config.d_model)
         self.embed_positions = TimeSeriesSinusoidalPositionalEmbedding(
             config.context_length + config.prediction_length, config.d_model
         )
@@ -1316,7 +967,7 @@ def __init__(self, config: TimeSeriesTransformerConfig):
         if config.prediction_length is None:
             raise ValueError("The `prediction_length` config needs to be specified.")
 
-        self.value_embedding = ValueEmbedding(feature_size=config.feature_size, d_model=config.d_model)
+        self.value_embedding = TimeSeriesValueEmbedding(feature_size=config.feature_size, d_model=config.d_model)
         self.embed_positions = TimeSeriesSinusoidalPositionalEmbedding(
             config.context_length + config.prediction_length, config.d_model
         )
diff --git a/src/transformers/time_series_utils.py b/src/transformers/time_series_utils.py
new file mode 100644
index 000000000000..be0331701592
--- /dev/null
+++ b/src/transformers/time_series_utils.py
@@ -0,0 +1,382 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team.
+# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Time series distributional output classes and utilities.
+"""
+from typing import Callable, Dict, List, Optional, Tuple
+
+import torch
+from torch import nn
+from torch.distributions import (
+    AffineTransform,
+    Distribution,
+    Independent,
+    NegativeBinomial,
+    Normal,
+    StudentT,
+    TransformedDistribution,
+)
+
+
+class AffineTransformed(TransformedDistribution):
+    def __init__(self, base_distribution: Distribution, loc=None, scale=None, event_dim=0):
+        self.scale = 1.0 if scale is None else scale
+        self.loc = 0.0 if loc is None else loc
+
+        super().__init__(base_distribution, [AffineTransform(loc=self.loc, scale=self.scale, event_dim=event_dim)])
+
+    @property
+    def mean(self):
+        """
+        Returns the mean of the distribution.
+        """
+        return self.base_dist.mean * self.scale + self.loc
+
+    @property
+    def variance(self):
+        """
+        Returns the variance of the distribution.
+        """
+        return self.base_dist.variance * self.scale**2
+
+    @property
+    def stddev(self):
+        """
+        Returns the standard deviation of the distribution.
+        """
+        return self.variance.sqrt()
+
+
+class ParameterProjection(nn.Module):
+    def __init__(
+        self, in_features: int, args_dim: Dict[str, int], domain_map: Callable[..., Tuple[torch.Tensor]], **kwargs
+    ) -> None:
+        super().__init__(**kwargs)
+        self.args_dim = args_dim
+        self.proj = nn.ModuleList([nn.Linear(in_features, dim) for dim in args_dim.values()])
+        self.domain_map = domain_map
+
+    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor]:
+        params_unbounded = [proj(x) for proj in self.proj]
+
+        return self.domain_map(*params_unbounded)
+
+
+class LambdaLayer(nn.Module):
+    def __init__(self, function):
+        super().__init__()
+        self.function = function
+
+    def forward(self, x, *args):
+        return self.function(x, *args)
+
+
+class DistributionOutput:
+    distribution_class: type
+    in_features: int
+    args_dim: Dict[str, int]
+
+    def __init__(self, dim: int = 1) -> None:
+        self.dim = dim
+        self.args_dim = {k: dim * self.args_dim[k] for k in self.args_dim}
+
+    def _base_distribution(self, distr_args):
+        if self.dim == 1:
+            return self.distribution_class(*distr_args)
+        else:
+            return Independent(self.distribution_class(*distr_args), 1)
+
+    def distribution(
+        self,
+        distr_args,
+        loc: Optional[torch.Tensor] = None,
+        scale: Optional[torch.Tensor] = None,
+    ) -> Distribution:
+        distr = self._base_distribution(distr_args)
+        if loc is None and scale is None:
+            return distr
+        else:
+            return AffineTransformed(distr, loc=loc, scale=scale, event_dim=self.event_dim)
+
+    @property
+    def event_shape(self) -> Tuple:
+        r"""
+        Shape of each individual event contemplated by the distributions that this object constructs.
+        """
+        return () if self.dim == 1 else (self.dim,)
+
+    @property
+    def event_dim(self) -> int:
+        r"""
+        Number of event dimensions, i.e., length of the `event_shape` tuple, of the distributions that this object
+        constructs.
+        """
+        return len(self.event_shape)
+
+    @property
+    def value_in_support(self) -> float:
+        r"""
+        A float that will have a valid numeric value when computing the log-loss of the corresponding distribution. By
+        default 0.0. This value will be used when padding data series.
+        """
+        return 0.0
+
+    def get_parameter_projection(self, in_features: int) -> nn.Module:
+        r"""
+        Return the parameter projection layer that maps the input to the appropriate parameters of the distribution.
+        """
+        return ParameterProjection(
+            in_features=in_features,
+            args_dim=self.args_dim,
+            domain_map=LambdaLayer(self.domain_map),
+        )
+
+    def domain_map(self, *args: torch.Tensor):
+        r"""
+        Converts arguments to the right shape and domain. The domain depends on the type of distribution, while the
+        correct shape is obtained by reshaping the trailing axis in such a way that the returned tensors define a
+        distribution of the right event_shape.
+        """
+        raise NotImplementedError()
+
+    @staticmethod
+    def squareplus(x: torch.Tensor) -> torch.Tensor:
+        r"""
+        Helper to map inputs to the positive orthant by applying the square-plus operation. Reference:
+        https://twitter.com/jon_barron/status/1387167648669048833
+        """
+        return (x + torch.sqrt(torch.square(x) + 4.0)) / 2.0
+
+
+class StudentTOutput(DistributionOutput):
+    args_dim: Dict[str, int] = {"df": 1, "loc": 1, "scale": 1}
+    distribution_class: type = StudentT
+
+    @classmethod
+    def domain_map(cls, df: torch.Tensor, loc: torch.Tensor, scale: torch.Tensor):
+        scale = cls.squareplus(scale)
+        df = 2.0 + cls.squareplus(df)
+        return df.squeeze(-1), loc.squeeze(-1), scale.squeeze(-1)
+
+
+class NormalOutput(DistributionOutput):
+    args_dim: Dict[str, int] = {"loc": 1, "scale": 1}
+    distribution_class: type = Normal
+
+    @classmethod
+    def domain_map(cls, loc: torch.Tensor, scale: torch.Tensor):
+        scale = cls.squareplus(scale)
+        return loc.squeeze(-1), scale.squeeze(-1)
+
+
+class NegativeBinomialOutput(DistributionOutput):
+    args_dim: Dict[str, int] = {"total_count": 1, "logits": 1}
+    distribution_class: type = NegativeBinomial
+
+    @classmethod
+    def domain_map(cls, total_count: torch.Tensor, logits: torch.Tensor):
+        total_count = cls.squareplus(total_count)
+        return total_count.squeeze(-1), logits.squeeze(-1)
+
+    def _base_distribution(self, distr_args) -> Distribution:
+        total_count, logits = distr_args
+        if self.dim == 1:
+            return self.distribution_class(total_count=total_count, logits=logits)
+        else:
+            return Independent(self.distribution_class(total_count=total_count, logits=logits), 1)
+
+    # Overwrites the parent class method. We cannot scale using the affine
+    # transformation since negative binomial should return integers. Instead
+    # we scale the parameters.
+    def distribution(
+        self, distr_args, loc: Optional[torch.Tensor] = None, scale: Optional[torch.Tensor] = None
+    ) -> Distribution:
+        total_count, logits = distr_args
+
+        if scale is not None:
+            # See scaling property of Gamma.
+            logits += scale.log()
+
+        return self._base_distribution((total_count, logits))
+
+
+class FeatureEmbedder(nn.Module):
+    def __init__(self, cardinalities: List[int], embedding_dims: List[int]) -> None:
+        super().__init__()
+
+        self.num_features = len(cardinalities)
+        self.embedders = nn.ModuleList([nn.Embedding(c, d) for c, d in zip(cardinalities, embedding_dims)])
+
+    def forward(self, features: torch.Tensor) -> torch.Tensor:
+        if self.num_features > 1:
+            # we slice the last dimension, giving an array of length
+            # self.num_features with shape (N,T) or (N)
+            cat_feature_slices = torch.chunk(features, self.num_features, dim=-1)
+        else:
+            cat_feature_slices = [features]
+
+        return torch.cat(
+            [
+                embed(cat_feature_slice.squeeze(-1))
+                for embed, cat_feature_slice in zip(self.embedders, cat_feature_slices)
+            ],
+            dim=-1,
+        )
+
+
+class StdScaler(nn.Module):
+    """
+    Standardizes features by computing the weighted mean and standard deviation along dimension `dim`, and then
+    normalizes the data by subtracting the mean and dividing by the standard deviation.
+
+    Args:
+        dim (`int`):
+            Dimension along which to calculate the mean and standard deviation. Must be greater than 0.
+        keepdim (`bool`, *optional*, defaults to `False`):
+            Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it.
+        minimum_scale (`float`, *optional*, defaults to 1e-5):
+            Constant added to the variance before the square root is taken, so the scale is never zero.
+    """
+
+    def __init__(self, dim: int, keepdim: bool = False, minimum_scale: float = 1e-5):
+        super().__init__()
+        if not dim > 0:
+            raise ValueError("Cannot compute scale along dim = 0 (batch dimension), please provide dim > 0")
+        self.dim = dim
+        self.keepdim = keepdim
+        self.minimum_scale = minimum_scale
+
+    @torch.no_grad()
+    def forward(self, data: torch.Tensor, weights: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        denominator = weights.sum(self.dim, keepdim=self.keepdim)
+        denominator = denominator.clamp_min(1.0)
+        loc = (data * weights).sum(self.dim, keepdim=self.keepdim) / denominator
+
+        variance = (((data - loc) * weights) ** 2).sum(self.dim, keepdim=self.keepdim) / denominator
+        scale = torch.sqrt(variance + self.minimum_scale)
+        return (data - loc) / scale, loc, scale
+
+
+class MeanScaler(nn.Module):
+    """
+    Computes a scaling factor as the weighted average absolute value along dimension `dim`, and scales the data
+    accordingly. `forward` returns `(scaled_data, loc, scale)`, where `loc` is a zero tensor shaped like `scale`.
+
+    Args:
+        dim (`int`, *optional*, defaults to -1):
+            Dimension along which to compute the scale.
+        keepdim (`bool`, *optional*, defaults to `True`):
+            Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it.
+        default_scale (`float`, *optional*, defaults to `None`):
+            Default scale that is used for elements that are constantly zero. If `None`, we use the scale of the batch.
+        minimum_scale (`float`, *optional*, defaults to 1e-10):
+            Default minimum possible scale that is used for any item.
+    """
+
+    def __init__(
+        self, dim: int = -1, keepdim: bool = True, default_scale: Optional[float] = None, minimum_scale: float = 1e-10
+    ):
+        super().__init__()
+        self.dim = dim
+        self.keepdim = keepdim
+        self.minimum_scale = minimum_scale
+        self.default_scale = default_scale
+
+    @torch.no_grad()
+    def forward(self, data: torch.Tensor, observed_indicator: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        # shape: (N, [C], T=1)
+        ts_sum = (data * observed_indicator).abs().sum(self.dim, keepdim=True)
+        num_observed = observed_indicator.sum(self.dim, keepdim=True)
+
+        scale = ts_sum / torch.clamp(num_observed, min=1)
+
+        # If `default_scale` is provided, we use it, otherwise we use the scale
+        # of the batch.
+        if self.default_scale is None:
+            batch_sum = ts_sum.sum(dim=0)
+            batch_observations = torch.clamp(num_observed.sum(0), min=1)
+            default_scale = torch.squeeze(batch_sum / batch_observations)
+        else:
+            default_scale = self.default_scale * torch.ones_like(scale)
+
+        # apply default scale where there are no observations
+        scale = torch.where(num_observed > 0, scale, default_scale)
+
+        # ensure the scale is at least `self.minimum_scale`
+        scale = torch.clamp(scale, min=self.minimum_scale)
+        scaled_data = data / scale
+
+        if not self.keepdim:
+            scale = scale.squeeze(dim=self.dim)
+
+        return scaled_data, torch.zeros_like(scale), scale
+
+
+class NOPScaler(nn.Module):
+    """
+    Assigns a scaling factor equal to 1 along dimension `dim`, and therefore applies no scaling to the input data.
+
+    Args:
+        dim (`int`):
+            Dimension along which to compute the scale.
+        keepdim (`bool`, *optional*, defaults to `False`):
+            Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it.
+    """
+
+    def __init__(self, dim: int, keepdim: bool = False):
+        super().__init__()
+        self.dim = dim
+        self.keepdim = keepdim
+
+    def forward(
+        self, data: torch.Tensor, observed_indicator: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        scale = torch.ones_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim)
+        loc = torch.zeros_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim)
+        return data, loc, scale
+
+
+def weighted_average(input_tensor: torch.Tensor, weights: Optional[torch.Tensor] = None, dim=None) -> torch.Tensor:
+    """
+    Computes the weighted average of a given tensor across a given `dim`, masking values associated with weight zero,
+    meaning instead of `nan * 0 = nan` you will get `0 * 0 = 0`.
+
+    Args:
+        input_tensor (`torch.FloatTensor`):
+            Input tensor, of which the average must be computed.
+        weights (`torch.FloatTensor`, *optional*):
+            Weights tensor, of the same shape as `input_tensor`. The sum of weights is clamped to at least 1.0.
+        dim (`int`, *optional*):
+            The dim along which to average `input_tensor`. If `None`, all elements are averaged (`dim=0` is honored).
+
+    Returns:
+        `torch.FloatTensor`: The tensor with values averaged along the specified `dim`.
+    """
+    if weights is not None:
+        weighted_tensor = torch.where(weights != 0, input_tensor * weights, torch.zeros_like(input_tensor))
+        sum_weights = torch.clamp(weights.sum(dim=dim) if dim is not None else weights.sum(), min=1.0)
+        return (weighted_tensor.sum(dim=dim) if dim is not None else weighted_tensor.sum()) / sum_weights
+    else:
+        return input_tensor.mean(dim=dim)
+
+
+class NegativeLogLikelihood:
+    """
+    Computes the negative log likelihood loss from input distribution with respect to target.
+    """
+
+    def __call__(self, input: torch.distributions.Distribution, target: torch.Tensor) -> torch.Tensor:
+        return -input.log_prob(target)

From 240ac6a328da63b3660e3b5013f9e949a9ae833f Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Mon, 6 Mar 2023 13:37:31 +0100
Subject: [PATCH 155/164] fix dec num_heads

---
 src/transformers/models/informer/modeling_informer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 587be255e0e0..9846aacf5f99 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -759,7 +759,7 @@ def __init__(self, config: InformerConfig):
         if config.attention_type == "prob":
             self.self_attn = InformerProbSparseAttention(
                 embed_dim=self.embed_dim,
-                num_heads=config.encoder_attention_heads,
+                num_heads=config.decoder_attention_heads,
                 dropout=config.attention_dropout,
                 sampling_factor=config.sampling_factor,
                 is_decoder=True,

From c638f24fb97bf036a3960c7944a03081e3c099a4 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Mon, 6 Mar 2023 14:03:58 +0100
Subject: [PATCH 156/164] docstring

---
 src/transformers/time_series_utils.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/src/transformers/time_series_utils.py b/src/transformers/time_series_utils.py
index be0331701592..3c6afdd8b039 100644
--- a/src/transformers/time_series_utils.py
+++ b/src/transformers/time_series_utils.py
@@ -162,6 +162,10 @@ def squareplus(x: torch.Tensor) -> torch.Tensor:
 
 
 class StudentTOutput(DistributionOutput):
+    """
+    Student-T distribution output class.
+    """
+
     args_dim: Dict[str, int] = {"df": 1, "loc": 1, "scale": 1}
     distribution_class: type = StudentT
 
@@ -173,6 +177,10 @@ def domain_map(cls, df: torch.Tensor, loc: torch.Tensor, scale: torch.Tensor):
 
 
 class NormalOutput(DistributionOutput):
+    """
+    Normal distribution output class.
+    """
+
     args_dim: Dict[str, int] = {"loc": 1, "scale": 1}
     distribution_class: type = Normal
 
@@ -183,6 +191,10 @@ def domain_map(cls, loc: torch.Tensor, scale: torch.Tensor):
 
 
 class NegativeBinomialOutput(DistributionOutput):
+    """
+    Negative Binomial distribution output class.
+    """
+
     args_dim: Dict[str, int] = {"total_count": 1, "logits": 1}
     distribution_class: type = NegativeBinomial
 
@@ -214,6 +226,16 @@ def distribution(
 
 
 class FeatureEmbedder(nn.Module):
+    """
+    Embed a sequence of categorical features.
+
+    Args:
+        cardinalities (`list[int]`):
+            List of cardinalities of the categorical features.
+        embedding_dims (`list[int]`):
+            List of embedding dimensions of the categorical features.
+    """
+
     def __init__(self, cardinalities: List[int], embedding_dims: List[int]) -> None:
         super().__init__()
 

From 960821718b062b8a07371d417aa802f80cc7bf1d Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Mon, 6 Mar 2023 14:17:49 +0100
Subject: [PATCH 157/164] added time series util doc

---
 docs/source/en/_toctree.yml                   |  2 +
 docs/source/en/internal/time_series_utils.mdx | 44 +++++++++++++++++++
 2 files changed, 46 insertions(+)
 create mode 100644 docs/source/en/internal/time_series_utils.mdx

diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index 04723cbef29e..9d7fd41aa8fe 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -640,5 +640,7 @@
       title: Utilities for Audio processing
     - local: internal/file_utils
       title: General Utilities
+    - local: internal/time_series_utils
+      title: Utilities for Time Series
     title: Internal Helpers
   title: API
diff --git a/docs/source/en/internal/time_series_utils.mdx b/docs/source/en/internal/time_series_utils.mdx
new file mode 100644
index 000000000000..4fb9d5225c4e
--- /dev/null
+++ b/docs/source/en/internal/time_series_utils.mdx
@@ -0,0 +1,44 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Time Series Utilities
+
+This page lists all the utility functions and classes that can be used for Time Series based models.
+
+Most of those are only useful if you are studying the code of the time series models or you wish to add to the collection of distributional output classes.
+
+## Distributional Output
+
+[[autodoc]] time_series_utils.NormalOutput
+
+[[autodoc]] time_series_utils.StudentTOutput
+
+[[autodoc]] time_series_utils.NegativeBinomialOutput
+
+## Loss
+
+[[autodoc]] time_series_utils.weighted_average
+
+
+[[autodoc]] time_series_utils.NegativeLogLikelihood
+
+## Input Scalers
+
+[[autodoc]] time_series_utils.NOPScaler
+
+[[autodoc]] time_series_utils.MeanScaler
+
+[[autodoc]] time_series_utils.StdScaler
+
+## Features
+
+[[autodoc]] time_series_utils.FeatureEmbedder

From 0bc09de9781ae2fcccdbda52c9d98a92838fcb73 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Mon, 6 Mar 2023 14:42:36 +0100
Subject: [PATCH 158/164] _import_structure

---
 src/transformers/__init__.py | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index cff526f24fd4..236868fb6468 100644
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -2718,17 +2718,7 @@
     ]
     _import_structure["pytorch_utils"] = ["Conv1D", "apply_chunking_to_forward", "prune_layer"]
     _import_structure["sagemaker"] = []
-    _import_structure["time_series_utils"] = [
-        "FeatureEmbedder",
-        "MeanScaler",
-        "NegativeBinomialOutput",
-        "NegativeLogLikelihood",
-        "NOPScaler",
-        "NormalOutput",
-        "StdScaler",
-        "StudentTOutput",
-        "weighted_average",
-    ]
+    _import_structure["time_series_utils"] = []
     _import_structure["trainer"] = ["Trainer"]
     _import_structure["trainer_pt_utils"] = ["torch_distributed_zero_first"]
     _import_structure["trainer_seq2seq"] = ["Seq2SeqTrainer"]

From bd724fccf55b99d49409538e441d09854ec8a801 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Mon, 6 Mar 2023 15:07:37 +0100
Subject: [PATCH 159/164] formatting

---
 docs/source/en/internal/time_series_utils.mdx         | 1 -
 src/transformers/models/informer/modeling_informer.py | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/docs/source/en/internal/time_series_utils.mdx b/docs/source/en/internal/time_series_utils.mdx
index 4fb9d5225c4e..8d87fd11adca 100644
--- a/docs/source/en/internal/time_series_utils.mdx
+++ b/docs/source/en/internal/time_series_utils.mdx
@@ -28,7 +28,6 @@ Most of those are only useful if you are studying the code of the time series mo
 
 [[autodoc]] time_series_utils.weighted_average
 
-
 [[autodoc]] time_series_utils.NegativeLogLikelihood
 
 ## Input Scalers
diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 9846aacf5f99..b725ff9f55e2 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -615,7 +615,7 @@ def forward(
         attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
         attn_output = torch.bmm(attn_probs, value_states)
 
-        # calculate contex for updating the attn_output, based on:
+        # calculate context for updating the attn_output, based on:
         # https://github.com/zhouhaoyi/Informer2020/blob/ac59c7447135473fb2aafeafe94395f884d5c7a5/models/attn.py#L74
         if self.is_decoder:
             context = value_states.cumsum(dim=-2)

From db6e0bdd2f4f8ae481df444f9b7cf346164b90df Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Mon, 6 Mar 2023 20:55:28 +0100
Subject: [PATCH 160/164] changes from review

---
 docs/source/en/internal/time_series_utils.mdx |  12 -
 docs/source/en/main_classes/output.mdx        |  12 +
 src/transformers/modeling_outputs.py          | 157 ++++++++
 .../models/informer/modeling_informer.py      | 380 +++++++++---------
 .../modeling_time_series_transformer.py       | 366 +++++++++--------
 src/transformers/time_series_utils.py         | 170 --------
 6 files changed, 561 insertions(+), 536 deletions(-)

diff --git a/docs/source/en/internal/time_series_utils.mdx b/docs/source/en/internal/time_series_utils.mdx
index 8d87fd11adca..6f869f91397e 100644
--- a/docs/source/en/internal/time_series_utils.mdx
+++ b/docs/source/en/internal/time_series_utils.mdx
@@ -29,15 +29,3 @@ Most of those are only useful if you are studying the code of the time series mo
 [[autodoc]] time_series_utils.weighted_average
 
 [[autodoc]] time_series_utils.NegativeLogLikelihood
-
-## Input Scalers
-
-[[autodoc]] time_series_utils.NOPScaler
-
-[[autodoc]] time_series_utils.MeanScaler
-
-[[autodoc]] time_series_utils.StdScaler
-
-## Features
-
-[[autodoc]] time_series_utils.FeatureEmbedder
diff --git a/docs/source/en/main_classes/output.mdx b/docs/source/en/main_classes/output.mdx
index ced38976e845..7af660451615 100644
--- a/docs/source/en/main_classes/output.mdx
+++ b/docs/source/en/main_classes/output.mdx
@@ -164,6 +164,18 @@ documented on their corresponding model page.
 
 [[autodoc]] modeling_outputs.XVectorOutput
 
+## Seq2SeqTSModelOutput
+
+[[autodoc]] modeling_outputs.Seq2SeqTSModelOutput
+
+## Seq2SeqTSPredictionOutput
+
+[[autodoc]] modeling_outputs.Seq2SeqTSPredictionOutput
+
+## SampleTSPredictionOutput
+
+[[autodoc]] modeling_outputs.SampleTSPredictionOutput
+
 ## TFBaseModelOutput
 
 [[autodoc]] modeling_tf_outputs.TFBaseModelOutput
diff --git a/src/transformers/modeling_outputs.py b/src/transformers/modeling_outputs.py
index 0177fa88c67b..300ffca89dfe 100755
--- a/src/transformers/modeling_outputs.py
+++ b/src/transformers/modeling_outputs.py
@@ -1464,3 +1464,160 @@ class Seq2SeqSpectrogramOutput(ModelOutput):
     encoder_last_hidden_state: Optional[torch.FloatTensor] = None
     encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
     encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+
+
+@dataclass
+class Seq2SeqTSModelOutput(ModelOutput):
+    """
+    Base class for time series model's encoder outputs that also contains pre-computed hidden states that can speed up
+    sequential decoding.
+
+    Args:
+        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+            Sequence of hidden-states at the output of the last layer of the decoder of the model.
+
+            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
+            hidden_size)` is output.
+        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+            `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of shape
+            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+
+            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the decoder at the output of each layer plus the optional initial embedding outputs.
+        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attention weights of the decoder, after the attention softmax, used to compute the weighted average in the
+            self-attention heads.
+        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attention weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
+            weighted average in the cross-attention heads.
+        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the encoder of the model.
+        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the encoder at the output of each layer plus the optional initial embedding outputs.
+        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attention weights of the encoder, after the attention softmax, used to compute the weighted average in the
+            self-attention heads.
+        loc (`torch.FloatTensor` of shape `(batch_size,)`, *optional*):
+            Shift values of each time series' context window which are used to give the model inputs of the same
+            magnitude and then used to shift back to the original magnitude.
+        scale (`torch.FloatTensor` of shape `(batch_size,)`, *optional*):
+            Scaling values of each time series' context window which are used to give the model inputs of the same
+            magnitude and then used to rescale back to the original magnitude.
+        static_features (`torch.FloatTensor` of shape `(batch_size, feature size)`, *optional*):
+            Static features of each time series in a batch which are copied to the covariates at inference time.
+    """
+
+    last_hidden_state: torch.FloatTensor = None
+    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+    decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
+    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    loc: Optional[torch.FloatTensor] = None
+    scale: Optional[torch.FloatTensor] = None
+    static_features: Optional[torch.FloatTensor] = None
+
+
+@dataclass
+class Seq2SeqTSPredictionOutput(ModelOutput):
+    """
+    Base class for time series model's decoder outputs that also contain the loss as well as the parameters of the
+    chosen distribution.
+
+    Args:
+        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `future_values` is provided):
+            Distributional loss.
+        params (`torch.FloatTensor` of shape `(batch_size, num_samples, num_params)`):
+            Parameters of the chosen distribution.
+        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+            `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of shape
+            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+
+            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
+        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attention weights of the decoder, after the attention softmax, used to compute the weighted average in the
+            self-attention heads.
+        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attention weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
+            weighted average in the cross-attention heads.
+        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the encoder of the model.
+        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
+        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attention weights of the encoder, after the attention softmax, used to compute the weighted average in the
+            self-attention heads.
+        loc (`torch.FloatTensor` of shape `(batch_size,)`, *optional*):
+            Shift values of each time series' context window which are used to give the model inputs of the same
+            magnitude and then used to shift back to the original magnitude.
+        scale (`torch.FloatTensor` of shape `(batch_size,)`, *optional*):
+            Scaling values of each time series' context window which are used to give the model inputs of the same
+            magnitude and then used to rescale back to the original magnitude.
+        static_features (`torch.FloatTensor` of shape `(batch_size, feature size)`, *optional*):
+            Static features of each time series in a batch which are copied to the covariates at inference time.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    params: Optional[Tuple[torch.FloatTensor]] = None
+    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+    decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
+    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    loc: Optional[torch.FloatTensor] = None
+    scale: Optional[torch.FloatTensor] = None
+    static_features: Optional[torch.FloatTensor] = None
+
+
+@dataclass
+class SampleTSPredictionOutput(ModelOutput):
+    """
+    Base class for time series model's prediction outputs that contain the sampled values from the chosen distribution.
+
+    Args:
+        sequences (`torch.FloatTensor` of shape `(batch_size, num_samples, prediction_length)` or `(batch_size, num_samples, prediction_length, input_size)`):
+            Sampled values from the chosen distribution.
+    """
+
+    sequences: torch.FloatTensor = None
diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index b725ff9f55e2..0d571c3cd7c2 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -23,19 +23,16 @@
 from torch import nn
 
 from ...activations import ACT2FN
-from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions, ModelOutput
-from ...modeling_utils import PreTrainedModel
-from ...time_series_utils import (
-    FeatureEmbedder,
-    MeanScaler,
-    NegativeBinomialOutput,
-    NegativeLogLikelihood,
-    NOPScaler,
-    NormalOutput,
-    StdScaler,
-    StudentTOutput,
-    weighted_average,
+from ...modeling_outputs import (
+    BaseModelOutput,
+    BaseModelOutputWithPastAndCrossAttentions,
+    ModelOutput,
+    Seq2SeqTSModelOutput,
+    Seq2SeqTSPredictionOutput,
+    SampleTSPredictionOutput,
 )
+from ...modeling_utils import PreTrainedModel
+from ...time_series_utils import NegativeBinomialOutput, NegativeLogLikelihood, NormalOutput, StudentTOutput
 from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
 from .configuration_informer import InformerConfig
 
@@ -51,6 +48,181 @@
 ]
 
 
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesFeatureEmbedder with TimeSeries->Informer
+class InformerFeatureEmbedder(nn.Module):
+    """
+    Embed a sequence of categorical features, one `nn.Embedding` per feature, and concatenate the embeddings.
+
+    Args:
+        cardinalities (`list[int]`):
+            List of cardinalities of the categorical features.
+        embedding_dims (`list[int]`):
+            List of embedding dimensions of the categorical features, paired by position with `cardinalities`.
+    """
+
+    def __init__(self, cardinalities: List[int], embedding_dims: List[int]) -> None:
+        super().__init__()
+
+        self.num_features = len(cardinalities)
+        self.embedders = nn.ModuleList([nn.Embedding(c, d) for c, d in zip(cardinalities, embedding_dims)])
+
+    def forward(self, features: torch.Tensor) -> torch.Tensor:
+        if self.num_features > 1:
+            # we slice the last dimension, giving a tuple of length
+            # self.num_features with shape (N,T) or (N) once squeezed below
+            cat_feature_slices = torch.chunk(features, self.num_features, dim=-1)
+        else:
+            cat_feature_slices = [features]
+
+        return torch.cat(
+            [
+                embed(cat_feature_slice.squeeze(-1))
+                for embed, cat_feature_slice in zip(self.embedders, cat_feature_slices)
+            ],
+            dim=-1,
+        )
+
+
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesStdScaler with TimeSeries->Informer
+class InformerStdScaler(nn.Module):
+    """
+    Standardize features by calculating the mean and scaling along some given dimension `dim`, and then normalizes it
+    by subtracting the mean and dividing by the standard deviation.
+
+    Args:
+        dim (`int`):
+            Dimension along which to calculate the mean and standard deviation. Must be > 0.
+        keepdim (`bool`, *optional*, defaults to `False`):
+            Whether to retain dimension `dim` (of length 1) in the returned `loc` and `scale` tensors, or suppress it.
+        minimum_scale (`float`, *optional*, defaults to 1e-5):
+            Value added to the variance before taking the square root, which keeps the scale away from zero.
+    """
+
+    def __init__(self, dim: int, keepdim: bool = False, minimum_scale: float = 1e-5):
+        super().__init__()
+        if not dim > 0:
+            raise ValueError("Cannot compute scale along dim = 0 (batch dimension), please provide dim > 0")
+        self.dim = dim
+        self.keepdim = keepdim
+        self.minimum_scale = minimum_scale
+
+    @torch.no_grad()
+    def forward(self, data: torch.Tensor, weights: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        # always reduce with `keepdim=True` so `loc`/`scale` broadcast against `data`; squeeze only what is returned
+        denominator = weights.sum(self.dim, keepdim=True).clamp_min(1.0)
+        loc = (data * weights).sum(self.dim, keepdim=True) / denominator
+        variance = (((data - loc) * weights) ** 2).sum(self.dim, keepdim=True) / denominator
+        scale = torch.sqrt(variance + self.minimum_scale)
+        scaled = (data - loc) / scale
+        return (scaled, loc, scale) if self.keepdim else (scaled, loc.squeeze(self.dim), scale.squeeze(self.dim))
+
+
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesMeanScaler with TimeSeries->Informer
+class InformerMeanScaler(nn.Module):
+    """
+    Computes a scaling factor as the weighted average absolute value along `dim`, and scales the data accordingly.
+
+    Args:
+        dim (`int`, *optional*, defaults to -1):
+            Dimension along which to compute the scale.
+        keepdim (`bool`, *optional*, defaults to `True`):
+            Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it.
+        default_scale (`float`, *optional*, defaults to `None`):
+            Scale that is used for series without any observed value. If `None`, we use the scale of the batch.
+        minimum_scale (`float`, *optional*, defaults to 1e-10):
+            Default minimum possible scale that is used for any item.
+    """
+
+    def __init__(
+        self, dim: int = -1, keepdim: bool = True, default_scale: Optional[float] = None, minimum_scale: float = 1e-10
+    ):
+        super().__init__()
+        self.dim = dim
+        self.keepdim = keepdim
+        self.minimum_scale = minimum_scale
+        self.default_scale = default_scale
+
+    @torch.no_grad()
+    def forward(
+        self, data: torch.Tensor, observed_indicator: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        # shape: (N, [C], T=1)
+        ts_sum = (data * observed_indicator).abs().sum(self.dim, keepdim=True)
+        num_observed = observed_indicator.sum(self.dim, keepdim=True)
+
+        scale = ts_sum / torch.clamp(num_observed, min=1)
+
+        # If `default_scale` is provided, we use it, otherwise we use the scale of the batch.
+        if self.default_scale is None:
+            batch_sum = ts_sum.sum(dim=0)
+            batch_observations = torch.clamp(num_observed.sum(0), min=1)
+            default_scale = torch.squeeze(batch_sum / batch_observations)
+        else:
+            default_scale = self.default_scale * torch.ones_like(scale)
+
+        # apply default scale where there are no observations
+        scale = torch.where(num_observed > 0, scale, default_scale)
+
+        # ensure the scale is at least `self.minimum_scale`
+        scale = torch.clamp(scale, min=self.minimum_scale)
+        scaled_data = data / scale
+
+        if not self.keepdim:
+            scale = scale.squeeze(dim=self.dim)
+
+        return scaled_data, torch.zeros_like(scale), scale
+
+
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesNOPScaler with TimeSeries->Informer
+class InformerNOPScaler(nn.Module):
+    """
+    Returns the input data unchanged, together with an all-zeros `loc` and an all-ones `scale` reduced along `dim`.
+
+    Args:
+        dim (`int`):
+            Dimension along which to compute the scale.
+        keepdim (`bool`, *optional*, defaults to `False`):
+            Whether to retain dimension `dim` (of length 1) in the `loc` and `scale` tensors, or suppress it.
+    """
+
+    def __init__(self, dim: int, keepdim: bool = False):
+        super().__init__()
+        self.dim = dim
+        self.keepdim = keepdim
+
+    def forward(
+        self, data: torch.Tensor, observed_indicator: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        scale = torch.ones_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim)
+        loc = torch.zeros_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim)
+        return data, loc, scale  # `observed_indicator` is not used; `data` is returned as-is
+
+
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.weighted_average
+def weighted_average(input_tensor: torch.Tensor, weights: Optional[torch.Tensor] = None, dim=None) -> torch.Tensor:
+    """
+    Computes the weighted average of a given tensor across a given `dim`, masking values associated with weight zero,
+    meaning instead of `nan * 0 = nan` you will get `0 * 0 = 0`.
+
+    Args:
+        input_tensor (`torch.FloatTensor`):
+            Input tensor, of which the average must be computed.
+        weights (`torch.FloatTensor`, *optional*):
+            Weights tensor, of the same shape as `input_tensor`. The sum of weights is clamped to at least 1.0.
+        dim (`int`, *optional*):
+            The dim along which to average `input_tensor`. If `None`, the average is taken over all elements.
+
+    Returns:
+        `torch.FloatTensor`: The tensor with values averaged along the specified `dim`.
+    """
+    if weights is not None:
+        weighted_tensor = torch.where(weights != 0, input_tensor * weights, torch.zeros_like(input_tensor))
+        sum_weights = torch.clamp(weights.sum(dim=dim) if dim is not None else weights.sum(), min=1.0)
+        return (weighted_tensor.sum(dim=dim) if dim is not None else weighted_tensor.sum()) / sum_weights
+    else:
+        return input_tensor.mean(dim=dim)
+
+
 # Copied from transformers.models.bart.modeling_bart._make_causal_mask
 def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_values_length: int = 0):
     """
@@ -127,158 +299,6 @@ def forward(self, x):
         return self.value_projection(x)
 
 
-@dataclass
-# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.Seq2SeqTimeSeriesModelOutput
-class Seq2SeqTimeSeriesModelOutput(ModelOutput):
-    """
-    Base class for model encoder's outputs that also contains pre-computed hidden states that can speed up sequential
-    decoding.
-
-    Args:
-        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the output of the last layer of the decoder of the model.
-
-            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
-            hidden_size)` is output.
-        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
-            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
-            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
-
-            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
-            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
-        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
-            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the decoder at the output of each layer plus the optional initial embedding outputs.
-        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
-            self-attention heads.
-        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
-            weighted average in the cross-attention heads.
-        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-            Sequence of hidden-states at the output of the last layer of the encoder of the model.
-        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
-            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the encoder at the output of each layer plus the optional initial embedding outputs.
-        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
-            self-attention heads.
-        loc (`torch.FloatTensor` of shape `(batch_size,)`, *optional*):
-            Shift values of each time series' context window which is used to give the model inputs of the same
-            magnitude and then used to shift back to the original magnitude.
-        scale (`torch.FloatTensor` of shape `(batch_size,)`, *optional*):
-            Scaling values of each time series' context window which is used to give the model inputs of the same
-            magnitude and then used to rescale back to the original magnitude.
-        static_features: (`torch.FloatTensor` of shape `(batch_size, feature size)`, *optional*):
-            Static features of each time series' in a batch which are copied to the covariates at inference time.
-    """
-
-    last_hidden_state: torch.FloatTensor = None
-    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
-    decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
-    decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
-    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
-    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
-    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
-    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
-    loc: Optional[torch.FloatTensor] = None
-    scale: Optional[torch.FloatTensor] = None
-    static_features: Optional[torch.FloatTensor] = None
-
-
-@dataclass
-# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.Seq2SeqTimeSeriesPredictionOutput
-class Seq2SeqTimeSeriesPredictionOutput(ModelOutput):
-    """
-    Base class for model's predictions outputs that also contain the loss as well parameters of the chosen
-    distribution.
-
-    Args:
-        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when a `future_values` is provided):
-            Distributional loss.
-        params (`torch.FloatTensor` of shape `(batch_size, num_samples, num_params)`):
-            Parameters of the chosen distribution.
-        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
-            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
-            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
-
-            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
-            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
-        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
-            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
-        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
-            self-attention heads.
-        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
-            weighted average in the cross-attention heads.
-        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-            Sequence of hidden-states at the output of the last layer of the encoder of the model.
-        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
-            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
-        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
-            self-attention heads.
-        loc (`torch.FloatTensor` of shape `(batch_size,)`, *optional*):
-            Shift values of each time series' context window which is used to give the model inputs of the same
-            magnitude and then used to shift back to the original magnitude.
-        scale (`torch.FloatTensor` of shape `(batch_size,)`, *optional*):
-            Scaling values of each time series' context window which is used to give the model inputs of the same
-            magnitude and then used to rescale back to the original magnitude.
-        static_features: (`torch.FloatTensor` of shape `(batch_size, feature size)`, *optional*):
-            Static features of each time series' in a batch which are copied to the covariates at inference time.
-    """
-
-    loss: Optional[torch.FloatTensor] = None
-    params: Optional[Tuple[torch.FloatTensor]] = None
-    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
-    decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
-    decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
-    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
-    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
-    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
-    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
-    loc: Optional[torch.FloatTensor] = None
-    scale: Optional[torch.FloatTensor] = None
-    static_features: Optional[torch.FloatTensor] = None
-
-
-@dataclass
-# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.SampleTimeSeriesPredictionOutput
-class SampleTimeSeriesPredictionOutput(ModelOutput):
-    sequences: torch.FloatTensor = None
-
-
 # Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->Informer
 class InformerAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
@@ -1464,20 +1484,20 @@ def custom_forward(*inputs):
     "The bare Informer Model outputting raw hidden-states without any specific head on top.",
     INFORMER_START_DOCSTRING,
 )
-# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerModel with TimeSeriesTransformer->Informer,TIME_SERIES_TRANSFORMER->INFORMER,time-series-transformer->informer
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerModel with TimeSeriesTransformer->Informer,TIME_SERIES_TRANSFORMER->INFORMER,time-series-transformer->informer,TimeSeries->Informer
 class InformerModel(InformerPreTrainedModel):
     def __init__(self, config: InformerConfig):
         super().__init__(config)
 
         if config.scaling == "mean" or config.scaling:
-            self.scaler = MeanScaler(dim=1, keepdim=True)
+            self.scaler = InformerMeanScaler(dim=1, keepdim=True)
         elif config.scaling == "std":
-            self.scaler = StdScaler(dim=1, keepdim=True)
+            self.scaler = InformerStdScaler(dim=1, keepdim=True)
         else:
-            self.scaler = NOPScaler(dim=1, keepdim=True)
+            self.scaler = InformerNOPScaler(dim=1, keepdim=True)
 
         if config.num_static_categorical_features > 0:
-            self.embedder = FeatureEmbedder(
+            self.embedder = InformerFeatureEmbedder(
                 cardinalities=config.cardinality,
                 embedding_dims=config.embedding_dimension,
             )
@@ -1604,7 +1624,7 @@ def get_decoder(self):
         return self.decoder
 
     @add_start_docstrings_to_model_forward(INFORMER_INPUTS_DOCSTRING)
-    @replace_return_docstrings(output_type=Seq2SeqTimeSeriesModelOutput, config_class=_CONFIG_FOR_DOC)
+    @replace_return_docstrings(output_type=Seq2SeqTSModelOutput, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
         past_values: torch.Tensor,
@@ -1624,7 +1644,7 @@ def forward(
         output_attentions: Optional[bool] = None,
         use_cache: Optional[bool] = None,
         return_dict: Optional[bool] = None,
-    ) -> Union[Seq2SeqTimeSeriesModelOutput, Tuple]:
+    ) -> Union[Seq2SeqTSModelOutput, Tuple]:
         r"""
         Returns:
 
@@ -1707,7 +1727,7 @@ def forward(
         if not return_dict:
             return decoder_outputs + encoder_outputs + (loc, scale, static_feat)
 
-        return Seq2SeqTimeSeriesModelOutput(
+        return Seq2SeqTSModelOutput(
             last_hidden_state=decoder_outputs.last_hidden_state,
             past_key_values=decoder_outputs.past_key_values,
             decoder_hidden_states=decoder_outputs.hidden_states,
@@ -1768,7 +1788,7 @@ def output_distribution(self, params, loc=None, scale=None, trailing_n=None) ->
         return self.distribution_output.distribution(sliced_params, loc=loc, scale=scale)
 
     @add_start_docstrings_to_model_forward(INFORMER_INPUTS_DOCSTRING)
-    @replace_return_docstrings(output_type=Seq2SeqTimeSeriesModelOutput, config_class=_CONFIG_FOR_DOC)
+    @replace_return_docstrings(output_type=Seq2SeqTSModelOutput, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
         past_values: torch.Tensor,
@@ -1789,7 +1809,7 @@ def forward(
         output_attentions: Optional[bool] = None,
         use_cache: Optional[bool] = None,
         return_dict: Optional[bool] = None,
-    ) -> Union[Seq2SeqTimeSeriesModelOutput, Tuple]:
+    ) -> Union[Seq2SeqTSModelOutput, Tuple]:
         r"""
         Returns:
 
@@ -1884,7 +1904,7 @@ def forward(
             outputs = ((params,) + outputs[1:]) if params is not None else outputs[1:]
             return ((prediction_loss,) + outputs) if prediction_loss is not None else outputs
 
-        return Seq2SeqTimeSeriesPredictionOutput(
+        return Seq2SeqTSPredictionOutput(
             loss=prediction_loss,
             params=params,
             past_key_values=outputs.past_key_values,
@@ -1910,7 +1930,7 @@ def generate(
         static_real_features: Optional[torch.Tensor] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
-    ) -> SampleTimeSeriesPredictionOutput:
+    ) -> SampleTSPredictionOutput:
         r"""
         Greedily generate sequences of sample predictions from a model with a probability distribution head.
 
@@ -1994,9 +2014,9 @@ def generate(
                 Whether or not to return the hidden states of all layers.
 
         Return:
-            [`SampleTimeSeriesPredictionOutput`] where the outputs `sequences` tensor will have shape `(batch_size,
-            number of samples, prediction_length)` or `(batch_size, number of samples, prediction_length, input_size)`
-            for multivariate predictions.
+            [`SampleTSPredictionOutput`] where the outputs `sequences` tensor will have shape `(batch_size, number of
+            samples, prediction_length)` or `(batch_size, number of samples, prediction_length, input_size)` for
+            multivariate predictions.
         """
         outputs = self(
             static_categorical_features=static_categorical_features,
@@ -2061,7 +2081,7 @@ def generate(
 
         concat_future_samples = torch.cat(future_samples, dim=1)
 
-        return SampleTimeSeriesPredictionOutput(
+        return SampleTSPredictionOutput(
             sequences=concat_future_samples.reshape(
                 (-1, num_parallel_samples, self.config.prediction_length) + self.target_shape,
             )
diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py
index 9796d9378bce..745682c74f44 100644
--- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py
+++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py
@@ -24,19 +24,16 @@
 from torch import nn
 
 from ...activations import ACT2FN
-from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions, ModelOutput
-from ...modeling_utils import PreTrainedModel
-from ...time_series_utils import (
-    FeatureEmbedder,
-    MeanScaler,
-    NegativeBinomialOutput,
-    NegativeLogLikelihood,
-    NOPScaler,
-    NormalOutput,
-    StdScaler,
-    StudentTOutput,
-    weighted_average,
+from ...modeling_outputs import (
+    BaseModelOutput,
+    BaseModelOutputWithPastAndCrossAttentions,
+    ModelOutput,
+    Seq2SeqTSModelOutput,
+    Seq2SeqTSPredictionOutput,
+    SampleTSPredictionOutput,
 )
+from ...modeling_utils import PreTrainedModel
+from ...time_series_utils import NegativeBinomialOutput, NegativeLogLikelihood, NormalOutput, StudentTOutput
 from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
 from .configuration_time_series_transformer import TimeSeriesTransformerConfig
 
@@ -52,6 +49,176 @@
 ]
 
 
+class TimeSeriesFeatureEmbedder(nn.Module):
+    """
+    Embeds every categorical feature with its own embedding table and concatenates the results along the last
+    dimension.
+
+    Args:
+        cardinalities (`list[int]`):
+            Number of categories of each categorical feature.
+        embedding_dims (`list[int]`):
+            Embedding size of each categorical feature, paired by position with `cardinalities`.
+    """
+
+    def __init__(self, cardinalities: List[int], embedding_dims: List[int]) -> None:
+        super().__init__()
+        tables = [nn.Embedding(cardinality, dim) for cardinality, dim in zip(cardinalities, embedding_dims)]
+        self.num_features = len(cardinalities)
+        self.embedders = nn.ModuleList(tables)
+
+    def forward(self, features: torch.Tensor) -> torch.Tensor:
+        """Looks up every categorical feature and concatenates the embeddings along the last axis."""
+        if self.num_features <= 1:
+            # a single feature needs no splitting
+            per_feature = [features]
+        else:
+            # split the last axis into `self.num_features` chunks, one per feature
+            per_feature = torch.chunk(features, self.num_features, dim=-1)
+
+        # squeeze the last axis of every chunk before its lookup, then join
+        # the per-feature embeddings along the last axis
+        embedded = [embedder(ids.squeeze(-1)) for embedder, ids in zip(self.embedders, per_feature)]
+
+        return torch.cat(embedded, dim=-1)
+
+
+class TimeSeriesStdScaler(nn.Module):
+    """
+    Standardizes features by computing the weighted mean and standard deviation along dimension `dim`, and then
+    normalizes the data by subtracting the mean and dividing by the standard deviation.
+
+    Args:
+        dim (`int`):
+            Dimension along which to calculate the mean and standard deviation.
+        keepdim (`bool`, *optional*, defaults to `False`):
+            Controls whether to retain dimension `dim` (of length 1) in the loc and scale tensors, or suppress it.
+        minimum_scale (`float`, *optional*, defaults to 1e-5):
+            Small constant added to the variance before taking the square root, so that the scale is never zero.
+    """
+
+    def __init__(self, dim: int, keepdim: bool = False, minimum_scale: float = 1e-5):
+        super().__init__()
+        if not dim > 0:
+            raise ValueError("Cannot compute scale along dim = 0 (batch dimension), please provide dim > 0")
+        self.dim = dim
+        self.keepdim = keepdim
+        self.minimum_scale = minimum_scale
+
+    @torch.no_grad()
+    def forward(self, data: torch.Tensor, weights: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        denominator = weights.sum(self.dim, keepdim=True).clamp_min(1.0)  # keep `dim` for broadcasting
+        loc = (data * weights).sum(self.dim, keepdim=True) / denominator
+        variance = (((data - loc) * weights) ** 2).sum(self.dim, keepdim=True) / denominator
+        scale = torch.sqrt(variance + self.minimum_scale)
+        if self.keepdim:
+            return (data - loc) / scale, loc, scale
+        return (data - loc) / scale, loc.squeeze(self.dim), scale.squeeze(self.dim)
+
+
+class TimeSeriesMeanScaler(nn.Module):
+    """
+    Computes a scaling factor as the weighted average absolute value along dimension `dim`, and scales the data
+    accordingly.
+
+    Args:
+        dim (`int`, *optional*, defaults to -1):
+            Dimension along which to compute the scale.
+        keepdim (`bool`, *optional*, defaults to `True`):
+            Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it.
+        default_scale (`float`, *optional*, defaults to `None`):
+            Scale that is used for series without any observed value. If `None`, we use the scale of the batch.
+        minimum_scale (`float`, *optional*, defaults to 1e-10):
+            Default minimum possible scale that is used for any item.
+    """
+
+    def __init__(
+        self, dim: int = -1, keepdim: bool = True, default_scale: Optional[float] = None, minimum_scale: float = 1e-10
+    ):
+        super().__init__()
+        self.dim = dim
+        self.keepdim = keepdim
+        self.minimum_scale = minimum_scale
+        self.default_scale = default_scale
+
+    @torch.no_grad()
+    def forward(
+        self, data: torch.Tensor, observed_indicator: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        # both reductions keep `dim` (with size 1) so that they broadcast against `data`
+        ts_sum = (data * observed_indicator).abs().sum(self.dim, keepdim=True)
+        num_observed = observed_indicator.sum(self.dim, keepdim=True)
+        scale = ts_sum / torch.clamp(num_observed, min=1)
+
+        # If `default_scale` is provided, we use it, otherwise we use the scale
+        # of the batch.
+        if self.default_scale is None:
+            batch_sum = ts_sum.sum(dim=0)
+            batch_observations = torch.clamp(num_observed.sum(0), min=1)
+            default_scale = torch.squeeze(batch_sum / batch_observations)
+        else:
+            default_scale = self.default_scale * torch.ones_like(scale)
+
+        # apply default scale where there are no observations
+        scale = torch.where(num_observed > 0, scale, default_scale)
+
+        # ensure the scale is at least `self.minimum_scale`
+        scale = torch.clamp(scale, min=self.minimum_scale)
+        scaled_data = data / scale
+        if not self.keepdim:
+            scale = scale.squeeze(dim=self.dim)
+        # mean scaling does not shift the data, so the returned loc is all zeros
+        return scaled_data, torch.zeros_like(scale), scale
+
+
+class TimeSeriesNOPScaler(nn.Module):
+    """
+    Leaves the input data unchanged and reports a location of 0 and a scale of 1, both reduced along dimension `dim`.
+
+    Args:
+        dim (`int`):
+            Dimension along which the returned loc and scale tensors are reduced.
+        keepdim (`bool`, *optional*, defaults to `False`):
+            Controls whether to retain dimension `dim` (of length 1) in the loc and scale tensors, or suppress it.
+    """
+
+    def __init__(self, dim: int, keepdim: bool = False):
+        super().__init__()
+        # reduction settings read by `forward`
+        self.dim, self.keepdim = dim, keepdim
+
+    def forward(
+        self, data: torch.Tensor, observed_indicator: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        loc = torch.mean(torch.zeros_like(data, requires_grad=False), dim=self.dim, keepdim=self.keepdim)
+        scale = torch.mean(torch.ones_like(data, requires_grad=False), dim=self.dim, keepdim=self.keepdim)
+        return data, loc, scale
+
+
+def weighted_average(input_tensor: torch.Tensor, weights: Optional[torch.Tensor] = None, dim=None) -> torch.Tensor:
+    """
+    Computes the weighted average of a given tensor across a given `dim`, masking values associated with weight zero,
+    meaning instead of `nan * 0 = nan` you will get `0 * 0 = 0`. The sum of the weights is clamped to at least 1.0.
+
+    Args:
+        input_tensor (`torch.FloatTensor`):
+            Input tensor, of which the average must be computed.
+        weights (`torch.FloatTensor`, *optional*):
+            Weights tensor, of the same shape as `input_tensor`. If `None`, the unweighted mean is returned.
+        dim (`int`, *optional*):
+            The dim along which to average `input_tensor`. If `None`, all elements are averaged.
+
+    Returns:
+        `torch.FloatTensor`: The tensor with values averaged along the specified `dim`.
+    """
+    if weights is None:
+        return input_tensor.mean(dim=dim)
+    weighted_tensor = torch.where(weights != 0, input_tensor * weights, torch.zeros_like(input_tensor))
+    # `dim=0` is falsy, so test for `None` explicitly instead of relying on truthiness
+    sum_weights = torch.clamp(weights.sum(dim=dim) if dim is not None else weights.sum(), min=1.0)
+    return (weighted_tensor.sum(dim=dim) if dim is not None else weighted_tensor.sum()) / sum_weights
+
+
 # Copied from transformers.models.bart.modeling_bart._make_causal_mask
 def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_values_length: int = 0):
     """
@@ -127,155 +294,6 @@ def forward(self, x):
         return self.value_projection(x)
 
 
-@dataclass
-class Seq2SeqTimeSeriesModelOutput(ModelOutput):
-    """
-    Base class for model encoder's outputs that also contains pre-computed hidden states that can speed up sequential
-    decoding.
-
-    Args:
-        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the output of the last layer of the decoder of the model.
-
-            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
-            hidden_size)` is output.
-        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
-            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
-            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
-
-            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
-            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
-        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
-            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the decoder at the output of each layer plus the optional initial embedding outputs.
-        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
-            self-attention heads.
-        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
-            weighted average in the cross-attention heads.
-        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-            Sequence of hidden-states at the output of the last layer of the encoder of the model.
-        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
-            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the encoder at the output of each layer plus the optional initial embedding outputs.
-        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
-            self-attention heads.
-        loc (`torch.FloatTensor` of shape `(batch_size,)`, *optional*):
-            Shift values of each time series' context window which is used to give the model inputs of the same
-            magnitude and then used to shift back to the original magnitude.
-        scale (`torch.FloatTensor` of shape `(batch_size,)`, *optional*):
-            Scaling values of each time series' context window which is used to give the model inputs of the same
-            magnitude and then used to rescale back to the original magnitude.
-        static_features: (`torch.FloatTensor` of shape `(batch_size, feature size)`, *optional*):
-            Static features of each time series' in a batch which are copied to the covariates at inference time.
-    """
-
-    last_hidden_state: torch.FloatTensor = None
-    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
-    decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
-    decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
-    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
-    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
-    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
-    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
-    loc: Optional[torch.FloatTensor] = None
-    scale: Optional[torch.FloatTensor] = None
-    static_features: Optional[torch.FloatTensor] = None
-
-
-@dataclass
-class Seq2SeqTimeSeriesPredictionOutput(ModelOutput):
-    """
-    Base class for model's predictions outputs that also contain the loss as well parameters of the chosen
-    distribution.
-
-    Args:
-        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when a `future_values` is provided):
-            Distributional loss.
-        params (`torch.FloatTensor` of shape `(batch_size, num_samples, num_params)`):
-            Parameters of the chosen distribution.
-        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
-            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
-            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
-
-            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
-            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
-        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
-            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
-        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
-            self-attention heads.
-        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
-            weighted average in the cross-attention heads.
-        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-            Sequence of hidden-states at the output of the last layer of the encoder of the model.
-        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
-            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
-        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
-            self-attention heads.
-        loc (`torch.FloatTensor` of shape `(batch_size,)`, *optional*):
-            Shift values of each time series' context window which is used to give the model inputs of the same
-            magnitude and then used to shift back to the original magnitude.
-        scale (`torch.FloatTensor` of shape `(batch_size,)`, *optional*):
-            Scaling values of each time series' context window which is used to give the model inputs of the same
-            magnitude and then used to rescale back to the original magnitude.
-        static_features: (`torch.FloatTensor` of shape `(batch_size, feature size)`, *optional*):
-            Static features of each time series' in a batch which are copied to the covariates at inference time.
-    """
-
-    loss: Optional[torch.FloatTensor] = None
-    params: Optional[Tuple[torch.FloatTensor]] = None
-    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
-    decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
-    decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
-    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
-    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
-    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
-    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
-    loc: Optional[torch.FloatTensor] = None
-    scale: Optional[torch.FloatTensor] = None
-    static_features: Optional[torch.FloatTensor] = None
-
-
-@dataclass
-class SampleTimeSeriesPredictionOutput(ModelOutput):
-    sequences: torch.FloatTensor = None
-
-
 # Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->TimeSeriesTransformer
 class TimeSeriesTransformerAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
@@ -1198,14 +1216,14 @@ def __init__(self, config: TimeSeriesTransformerConfig):
         super().__init__(config)
 
         if config.scaling == "mean" or config.scaling:
-            self.scaler = MeanScaler(dim=1, keepdim=True)
+            self.scaler = TimeSeriesMeanScaler(dim=1, keepdim=True)
         elif config.scaling == "std":
-            self.scaler = StdScaler(dim=1, keepdim=True)
+            self.scaler = TimeSeriesStdScaler(dim=1, keepdim=True)
         else:
-            self.scaler = NOPScaler(dim=1, keepdim=True)
+            self.scaler = TimeSeriesNOPScaler(dim=1, keepdim=True)
 
         if config.num_static_categorical_features > 0:
-            self.embedder = FeatureEmbedder(
+            self.embedder = TimeSeriesFeatureEmbedder(
                 cardinalities=config.cardinality,
                 embedding_dims=config.embedding_dimension,
             )
@@ -1332,7 +1350,7 @@ def get_decoder(self):
         return self.decoder
 
     @add_start_docstrings_to_model_forward(TIME_SERIES_TRANSFORMER_INPUTS_DOCSTRING)
-    @replace_return_docstrings(output_type=Seq2SeqTimeSeriesModelOutput, config_class=_CONFIG_FOR_DOC)
+    @replace_return_docstrings(output_type=Seq2SeqTSModelOutput, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
         past_values: torch.Tensor,
@@ -1352,7 +1370,7 @@ def forward(
         output_attentions: Optional[bool] = None,
         use_cache: Optional[bool] = None,
         return_dict: Optional[bool] = None,
-    ) -> Union[Seq2SeqTimeSeriesModelOutput, Tuple]:
+    ) -> Union[Seq2SeqTSModelOutput, Tuple]:
         r"""
         Returns:
 
@@ -1435,7 +1453,7 @@ def forward(
         if not return_dict:
             return decoder_outputs + encoder_outputs + (loc, scale, static_feat)
 
-        return Seq2SeqTimeSeriesModelOutput(
+        return Seq2SeqTSModelOutput(
             last_hidden_state=decoder_outputs.last_hidden_state,
             past_key_values=decoder_outputs.past_key_values,
             decoder_hidden_states=decoder_outputs.hidden_states,
@@ -1495,7 +1513,7 @@ def output_distribution(self, params, loc=None, scale=None, trailing_n=None) ->
         return self.distribution_output.distribution(sliced_params, loc=loc, scale=scale)
 
     @add_start_docstrings_to_model_forward(TIME_SERIES_TRANSFORMER_INPUTS_DOCSTRING)
-    @replace_return_docstrings(output_type=Seq2SeqTimeSeriesModelOutput, config_class=_CONFIG_FOR_DOC)
+    @replace_return_docstrings(output_type=Seq2SeqTSModelOutput, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
         past_values: torch.Tensor,
@@ -1516,7 +1534,7 @@ def forward(
         output_attentions: Optional[bool] = None,
         use_cache: Optional[bool] = None,
         return_dict: Optional[bool] = None,
-    ) -> Union[Seq2SeqTimeSeriesModelOutput, Tuple]:
+    ) -> Union[Seq2SeqTSModelOutput, Tuple]:
         r"""
         Returns:
 
@@ -1613,7 +1631,7 @@ def forward(
             outputs = ((params,) + outputs[1:]) if params is not None else outputs[1:]
             return ((prediction_loss,) + outputs) if prediction_loss is not None else outputs
 
-        return Seq2SeqTimeSeriesPredictionOutput(
+        return Seq2SeqTSPredictionOutput(
             loss=prediction_loss,
             params=params,
             past_key_values=outputs.past_key_values,
@@ -1639,7 +1657,7 @@ def generate(
         static_real_features: Optional[torch.Tensor] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
-    ) -> SampleTimeSeriesPredictionOutput:
+    ) -> SampleTSPredictionOutput:
         r"""
         Greedily generate sequences of sample predictions from a model with a probability distribution head.
 
@@ -1723,7 +1741,7 @@ def generate(
                 Whether or not to return the hidden states of all layers.
 
         Return:
-            [`SampleTimeSeriesPredictionOutput`] where the outputs `sequences` tensor will have shape `(batch_size,
+            [`SampleTSPredictionOutput`] where the outputs `sequences` tensor will have shape `(batch_size,
             number of samples, prediction_length)` or `(batch_size, number of samples, prediction_length, input_size)`
             for multivariate predictions.
         """
@@ -1790,7 +1808,7 @@ def generate(
 
         concat_future_samples = torch.cat(future_samples, dim=1)
 
-        return SampleTimeSeriesPredictionOutput(
+        return SampleTSPredictionOutput(
             sequences=concat_future_samples.reshape(
                 (-1, num_parallel_samples, self.config.prediction_length) + self.target_shape,
             )
diff --git a/src/transformers/time_series_utils.py b/src/transformers/time_series_utils.py
index 3c6afdd8b039..50eee8fe6d38 100644
--- a/src/transformers/time_series_utils.py
+++ b/src/transformers/time_series_utils.py
@@ -225,176 +225,6 @@ def distribution(
         return self._base_distribution((total_count, logits))
 
 
-class FeatureEmbedder(nn.Module):
-    """
-    Embed a sequence of categorical features.
-
-    Args:
-        cardinalities (`list[int]`):
-            List of cardinalities of the categorical features.
-        embedding_dims (`list[int]`):
-            List of embedding dimensions of the categorical features.
-    """
-
-    def __init__(self, cardinalities: List[int], embedding_dims: List[int]) -> None:
-        super().__init__()
-
-        self.num_features = len(cardinalities)
-        self.embedders = nn.ModuleList([nn.Embedding(c, d) for c, d in zip(cardinalities, embedding_dims)])
-
-    def forward(self, features: torch.Tensor) -> torch.Tensor:
-        if self.num_features > 1:
-            # we slice the last dimension, giving an array of length
-            # self.num_features with shape (N,T) or (N)
-            cat_feature_slices = torch.chunk(features, self.num_features, dim=-1)
-        else:
-            cat_feature_slices = [features]
-
-        return torch.cat(
-            [
-                embed(cat_feature_slice.squeeze(-1))
-                for embed, cat_feature_slice in zip(self.embedders, cat_feature_slices)
-            ],
-            dim=-1,
-        )
-
-
-class StdScaler(nn.Module):
-    """
-    Standardize features by calculating the mean and scaling along some given dimension `dim`, and then normalizes it
-    by subtracting from the mean and dividing by the standard deviation.
-
-    Args:
-        dim (`int`):
-            Dimension along which to calculate the mean and standard deviation.
-        keepdim (`bool`, *optional*, defaults to `False`):
-            Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it.
-        minimum_scale (`float`, *optional*, defaults to 1e-5):
-            Default scale that is used for elements that are constantly zero along dimension `dim`.
-    """
-
-    def __init__(self, dim: int, keepdim: bool = False, minimum_scale: float = 1e-5):
-        super().__init__()
-        if not dim > 0:
-            raise ValueError("Cannot compute scale along dim = 0 (batch dimension), please provide dim > 0")
-        self.dim = dim
-        self.keepdim = keepdim
-        self.minimum_scale = minimum_scale
-
-    @torch.no_grad()
-    def forward(self, data: torch.Tensor, weights: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        denominator = weights.sum(self.dim, keepdim=self.keepdim)
-        denominator = denominator.clamp_min(1.0)
-        loc = (data * weights).sum(self.dim, keepdim=self.keepdim) / denominator
-
-        variance = (((data - loc) * weights) ** 2).sum(self.dim, keepdim=self.keepdim) / denominator
-        scale = torch.sqrt(variance + self.minimum_scale)
-        return (data - loc) / scale, loc, scale
-
-
-class MeanScaler(nn.Module):
-    """
-    Computes a scaling factor as the weighted average absolute value along dimension `dim`, and scales the data
-    accordingly.
-
-    Args:
-        dim (`int`):
-            Dimension along which to compute the scale.
-        keepdim (`bool`, *optional*, defaults to `False`):
-            Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it.
-        default_scale (`float`, *optional*, defaults to `None`):
-            Default scale that is used for elements that are constantly zero. If `None`, we use the scale of the batch.
-        minimum_scale (`float`, *optional*, defaults to 1e-10):
-            Default minimum possible scale that is used for any item.
-    """
-
-    def __init__(
-        self, dim: int = -1, keepdim: bool = True, default_scale: Optional[float] = None, minimum_scale: float = 1e-10
-    ):
-        super().__init__()
-        self.dim = dim
-        self.keepdim = keepdim
-        self.minimum_scale = minimum_scale
-        self.default_scale = default_scale
-
-    @torch.no_grad()
-    def forward(self, data: torch.Tensor, observed_indicator: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
-        # shape: (N, [C], T=1)
-        ts_sum = (data * observed_indicator).abs().sum(self.dim, keepdim=True)
-        num_observed = observed_indicator.sum(self.dim, keepdim=True)
-
-        scale = ts_sum / torch.clamp(num_observed, min=1)
-
-        # If `default_scale` is provided, we use it, otherwise we use the scale
-        # of the batch.
-        if self.default_scale is None:
-            batch_sum = ts_sum.sum(dim=0)
-            batch_observations = torch.clamp(num_observed.sum(0), min=1)
-            default_scale = torch.squeeze(batch_sum / batch_observations)
-        else:
-            default_scale = self.default_scale * torch.ones_like(scale)
-
-        # apply default scale where there are no observations
-        scale = torch.where(num_observed > 0, scale, default_scale)
-
-        # ensure the scale is at least `self.minimum_scale`
-        scale = torch.clamp(scale, min=self.minimum_scale)
-        scaled_data = data / scale
-
-        if not self.keepdim:
-            scale = scale.squeeze(dim=self.dim)
-
-        return scaled_data, torch.zeros_like(scale), scale
-
-
-class NOPScaler(nn.Module):
-    """
-    Assigns a scaling factor equal to 1 along dimension `dim`, and therefore applies no scaling to the input data.
-
-    Args:
-        dim (`int`):
-            Dimension along which to compute the scale.
-        keepdim (`bool`, *optional*, defaults to `False`):
-            Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it.
-    """
-
-    def __init__(self, dim: int, keepdim: bool = False):
-        super().__init__()
-        self.dim = dim
-        self.keepdim = keepdim
-
-    def forward(
-        self, data: torch.Tensor, observed_indicator: torch.Tensor
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        scale = torch.ones_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim)
-        loc = torch.zeros_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim)
-        return data, loc, scale
-
-
-def weighted_average(input_tensor: torch.Tensor, weights: Optional[torch.Tensor] = None, dim=None) -> torch.Tensor:
-    """
-    Computes the weighted average of a given tensor across a given `dim`, masking values associated with weight zero,
-    meaning instead of `nan * 0 = nan` you will get `0 * 0 = 0`.
-
-    Args:
-        input_tensor (`torch.FloatTensor`):
-            Input tensor, of which the average must be computed.
-        weights (`torch.FloatTensor`, *optional*):
-            Weights tensor, of the same shape as `input_tensor`.
-        dim (`int`, *optional*):
-            The dim along which to average `input_tensor`.
-
-    Returns:
-        `torch.FloatTensor`: The tensor with values averaged along the specified `dim`.
-    """
-    if weights is not None:
-        weighted_tensor = torch.where(weights != 0, input_tensor * weights, torch.zeros_like(input_tensor))
-        sum_weights = torch.clamp(weights.sum(dim=dim) if dim else weights.sum(), min=1.0)
-        return (weighted_tensor.sum(dim=dim) if dim else weighted_tensor.sum()) / sum_weights
-    else:
-        return input_tensor.mean(dim=dim)
-
-
 class NegativeLogLikelihood:
     """
     Computes the negative log likelihood loss from input distribution with respect to target.

From b14aa34051c70d0820b184599c555d630abfb4f4 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Mon, 6 Mar 2023 21:04:18 +0100
Subject: [PATCH 161/164] make style

---
 src/transformers/modeling_outputs.py          | 19 ++++++++++---------
 .../models/informer/modeling_informer.py      |  4 +---
 .../modeling_time_series_transformer.py       | 10 ++++------
 src/transformers/time_series_utils.py         |  2 +-
 4 files changed, 16 insertions(+), 19 deletions(-)

diff --git a/src/transformers/modeling_outputs.py b/src/transformers/modeling_outputs.py
index 300ffca89dfe..4f7540d0ff9e 100755
--- a/src/transformers/modeling_outputs.py
+++ b/src/transformers/modeling_outputs.py
@@ -1469,8 +1469,8 @@ class Seq2SeqSpectrogramOutput(ModelOutput):
 @dataclass
 class Seq2SeqTSModelOutput(ModelOutput):
     """
-    Base class for time series model's encoder outputs that also contains pre-computed hidden states that can speed up sequential
-    decoding.
+    Base class for time series model's encoder outputs that also contain pre-computed hidden states that can speed up
+    sequential decoding.
 
     Args:
         last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
@@ -1515,10 +1515,10 @@ class Seq2SeqTSModelOutput(ModelOutput):
 
             Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
             self-attention heads.
-        loc (`torch.FloatTensor` of shape `(batch_size,)`, *optional*):
+        loc (`torch.FloatTensor` of shape `(batch_size,)` or `(batch_size, input_size)`, *optional*):
             Shift values of each time series' context window which is used to give the model inputs of the same
             magnitude and then used to shift back to the original magnitude.
-        scale (`torch.FloatTensor` of shape `(batch_size,)`, *optional*):
+        scale (`torch.FloatTensor` of shape `(batch_size,)` or `(batch_size, input_size)`, *optional*):
             Scaling values of each time series' context window which is used to give the model inputs of the same
             magnitude and then used to rescale back to the original magnitude.
         static_features: (`torch.FloatTensor` of shape `(batch_size, feature size)`, *optional*):
@@ -1541,8 +1541,8 @@ class Seq2SeqTSModelOutput(ModelOutput):
 @dataclass
 class Seq2SeqTSPredictionOutput(ModelOutput):
     """
-    Base class for time series model's decoder outputs that also contain the loss as well as the parameters of the chosen
-    distribution.
+    Base class for time series model's decoder outputs that also contain the loss as well as the parameters of the
+    chosen distribution.
 
     Args:
         loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when a `future_values` is provided):
@@ -1586,10 +1586,10 @@ class Seq2SeqTSPredictionOutput(ModelOutput):
 
             Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
             self-attention heads.
-        loc (`torch.FloatTensor` of shape `(batch_size,)`, *optional*):
+        loc (`torch.FloatTensor` of shape `(batch_size,)` or `(batch_size, input_size)`, *optional*):
             Shift values of each time series' context window which is used to give the model inputs of the same
             magnitude and then used to shift back to the original magnitude.
-        scale (`torch.FloatTensor` of shape `(batch_size,)`, *optional*):
+        scale (`torch.FloatTensor` of shape `(batch_size,)` or `(batch_size, input_size)`, *optional*):
             Scaling values of each time series' context window which is used to give the model inputs of the same
             magnitude and then used to rescale back to the original magnitude.
         static_features: (`torch.FloatTensor` of shape `(batch_size, feature size)`, *optional*):
@@ -1613,7 +1613,8 @@ class Seq2SeqTSPredictionOutput(ModelOutput):
 @dataclass
 class SampleTSPredictionOutput(ModelOutput):
     """
-    Base class for time series model's predictions outputs that contains the sampled values from the chosen distribution.
+    Base class for time series model's prediction outputs that contain the sampled values from the chosen
+    distribution.
 
     Args:
         sequences (`torch.FloatTensor` of shape `(batch_size, num_samples, prediction_length)` or `(batch_size, num_samples, prediction_length, input_size)`):
diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 0d571c3cd7c2..69604e1cf6fd 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -15,7 +15,6 @@
 """ PyTorch Informer model."""
 
 import random
-from dataclasses import dataclass
 from typing import List, Optional, Tuple, Union
 
 import numpy as np
@@ -26,10 +25,9 @@
 from ...modeling_outputs import (
     BaseModelOutput,
     BaseModelOutputWithPastAndCrossAttentions,
-    ModelOutput,
+    SampleTSPredictionOutput,
     Seq2SeqTSModelOutput,
     Seq2SeqTSPredictionOutput,
-    SampleTSPredictionOutput,
 )
 from ...modeling_utils import PreTrainedModel
 from ...time_series_utils import NegativeBinomialOutput, NegativeLogLikelihood, NormalOutput, StudentTOutput
diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py
index 745682c74f44..7d6977995fdd 100644
--- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py
+++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py
@@ -16,7 +16,6 @@
 """ PyTorch Time Series Transformer model."""
 
 import random
-from dataclasses import dataclass
 from typing import List, Optional, Tuple, Union
 
 import numpy as np
@@ -27,10 +26,9 @@
 from ...modeling_outputs import (
     BaseModelOutput,
     BaseModelOutputWithPastAndCrossAttentions,
-    ModelOutput,
+    SampleTSPredictionOutput,
     Seq2SeqTSModelOutput,
     Seq2SeqTSPredictionOutput,
-    SampleTSPredictionOutput,
 )
 from ...modeling_utils import PreTrainedModel
 from ...time_series_utils import NegativeBinomialOutput, NegativeLogLikelihood, NormalOutput, StudentTOutput
@@ -1741,9 +1739,9 @@ def generate(
                 Whether or not to return the hidden states of all layers.
 
         Return:
-            [`SampleTSPredictionOutput`] where the outputs `sequences` tensor will have shape `(batch_size,
-            number of samples, prediction_length)` or `(batch_size, number of samples, prediction_length, input_size)`
-            for multivariate predictions.
+            [`SampleTSPredictionOutput`] where the outputs `sequences` tensor will have shape `(batch_size, number of
+            samples, prediction_length)` or `(batch_size, number of samples, prediction_length, input_size)` for
+            multivariate predictions.
         """
         outputs = self(
             static_categorical_features=static_categorical_features,
diff --git a/src/transformers/time_series_utils.py b/src/transformers/time_series_utils.py
index 50eee8fe6d38..c4fac91162d4 100644
--- a/src/transformers/time_series_utils.py
+++ b/src/transformers/time_series_utils.py
@@ -16,7 +16,7 @@
 """
 Time series distributional output classes and utilities.
 """
-from typing import Callable, Dict, List, Optional, Tuple
+from typing import Callable, Dict, Optional, Tuple
 
 import torch
 from torch import nn

From e36f6c04fb161aa6f7681a5bd7426e5ba84d76c3 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Mon, 6 Mar 2023 21:18:02 +0100
Subject: [PATCH 162/164] fix docs

---
 docs/source/en/main_classes/output.mdx | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/docs/source/en/main_classes/output.mdx b/docs/source/en/main_classes/output.mdx
index 7af660451615..ca4e8dfc0ace 100644
--- a/docs/source/en/main_classes/output.mdx
+++ b/docs/source/en/main_classes/output.mdx
@@ -164,17 +164,17 @@ documented on their corresponding model page.
 
 [[autodoc]] modeling_outputs.XVectorOutput
 
-## Seq2SeqTimeSeriesModelOutput
+## Seq2SeqTSModelOutput
 
-[[autodoc]] modeling_outputs.Seq2SeqTimeSeriesModelOutput
+[[autodoc]] modeling_outputs.Seq2SeqTSModelOutput
 
-## Seq2SeqTimeSeriesPredictionOutput
+## Seq2SeqTSPredictionOutput
 
-[[autodoc]] modeling_outputs.Seq2SeqTimeSeriesPredictionOutput
+[[autodoc]] modeling_outputs.Seq2SeqTSPredictionOutput
 
-## SampleTimeSeriesPredictionOutput
+## SampleTSPredictionOutput
 
-[[autodoc]] modeling_outputs.SampleTimeSeriesPredictionOutput
+[[autodoc]] modeling_outputs.SampleTSPredictionOutput
 
 ## TFBaseModelOutput
 

From 0ac82bb46c8b3f46807009074c1c905a7a42c447 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Mon, 6 Mar 2023 22:22:01 +0100
Subject: [PATCH 163/164] fix doc

---
 docs/source/en/internal/time_series_utils.mdx | 2 --
 1 file changed, 2 deletions(-)

diff --git a/docs/source/en/internal/time_series_utils.mdx b/docs/source/en/internal/time_series_utils.mdx
index 6f869f91397e..749627ff4fb8 100644
--- a/docs/source/en/internal/time_series_utils.mdx
+++ b/docs/source/en/internal/time_series_utils.mdx
@@ -26,6 +26,4 @@ Most of those are only useful if you are studying the code of the time series mo
 
 ##  Loss
 
-[[autodoc]] time_series_utils.weighted_average
-
 [[autodoc]] time_series_utils.NegativeLogLikelihood

From e5eff8a780d046e79ced4483f6b2a56ff9f530dc Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Mon, 6 Mar 2023 23:11:22 +0100
Subject: [PATCH 164/164] removed NegativeLogLikelihood

---
 docs/source/en/internal/time_series_utils.mdx        |  4 ----
 .../models/informer/modeling_informer.py             | 12 ++++++++++--
 .../modeling_time_series_transformer.py              | 11 +++++++++--
 src/transformers/time_series_utils.py                |  9 ---------
 4 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/docs/source/en/internal/time_series_utils.mdx b/docs/source/en/internal/time_series_utils.mdx
index 749627ff4fb8..7ee9b3ecef0e 100644
--- a/docs/source/en/internal/time_series_utils.mdx
+++ b/docs/source/en/internal/time_series_utils.mdx
@@ -23,7 +23,3 @@ Most of those are only useful if you are studying the code of the time series mo
 [[autodoc]] time_series_utils.StudentTOutput
 
 [[autodoc]] time_series_utils.NegativeBinomialOutput
-
-##  Loss
-
-[[autodoc]] time_series_utils.NegativeLogLikelihood
diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 69604e1cf6fd..f6422fb179ab 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -30,7 +30,7 @@
     Seq2SeqTSPredictionOutput,
 )
 from ...modeling_utils import PreTrainedModel
-from ...time_series_utils import NegativeBinomialOutput, NegativeLogLikelihood, NormalOutput, StudentTOutput
+from ...time_series_utils import NegativeBinomialOutput, NormalOutput, StudentTOutput
 from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
 from .configuration_informer import InformerConfig
 
@@ -221,6 +221,14 @@ def weighted_average(input_tensor: torch.Tensor, weights: Optional[torch.Tensor]
         return input_tensor.mean(dim=dim)
 
 
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.nll
+def nll(input: torch.distributions.Distribution, target: torch.Tensor) -> torch.Tensor:
+    """
+    Computes the negative log likelihood loss from input distribution with respect to target.
+    """
+    return -input.log_prob(target)
+
+
 # Copied from transformers.models.bart.modeling_bart._make_causal_mask
 def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_values_length: int = 0):
     """
@@ -1762,7 +1770,7 @@ def __init__(self, config: InformerConfig):
         self.target_shape = self.distribution_output.event_shape
 
         if config.loss == "nll":
-            self.loss = NegativeLogLikelihood()
+            self.loss = nll
         else:
             raise ValueError(f"Unknown loss function {config.loss}")
 
diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py
index 7d6977995fdd..86071d1fb8ed 100644
--- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py
+++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py
@@ -31,7 +31,7 @@
     Seq2SeqTSPredictionOutput,
 )
 from ...modeling_utils import PreTrainedModel
-from ...time_series_utils import NegativeBinomialOutput, NegativeLogLikelihood, NormalOutput, StudentTOutput
+from ...time_series_utils import NegativeBinomialOutput, NormalOutput, StudentTOutput
 from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
 from .configuration_time_series_transformer import TimeSeriesTransformerConfig
 
@@ -193,6 +193,13 @@ def forward(
         return data, loc, scale
 
 
+def nll(input: torch.distributions.Distribution, target: torch.Tensor) -> torch.Tensor:
+    """
+    Computes the negative log likelihood loss from input distribution with respect to target.
+    """
+    return -input.log_prob(target)
+
+
 def weighted_average(input_tensor: torch.Tensor, weights: Optional[torch.Tensor] = None, dim=None) -> torch.Tensor:
     """
     Computes the weighted average of a given tensor across a given `dim`, masking values associated with weight zero,
@@ -1487,7 +1494,7 @@ def __init__(self, config: TimeSeriesTransformerConfig):
         self.target_shape = self.distribution_output.event_shape
 
         if config.loss == "nll":
-            self.loss = NegativeLogLikelihood()
+            self.loss = nll
         else:
             raise ValueError(f"Unknown loss function {config.loss}")
 
diff --git a/src/transformers/time_series_utils.py b/src/transformers/time_series_utils.py
index c4fac91162d4..b07451253e87 100644
--- a/src/transformers/time_series_utils.py
+++ b/src/transformers/time_series_utils.py
@@ -223,12 +223,3 @@ def distribution(
             logits += scale.log()
 
         return self._base_distribution((total_count, logits))
-
-
-class NegativeLogLikelihood:
-    """
-    Computes the negative log likelihood loss from input distribution with respect to target.
-    """
-
-    def __call__(self, input: torch.distributions.Distribution, target: torch.Tensor) -> torch.Tensor:
-        return -input.log_prob(target)