Merge pull request #184 from winedarksea/dev
0.5.7
winedarksea authored May 23, 2023
2 parents de2d2a1 + 6037679 commit 27999a2
Showing 44 changed files with 796 additions and 197 deletions.
README.md (2 changes: 1 addition & 1 deletion)
@@ -4,7 +4,7 @@

AutoTS is a time series package for Python designed for rapidly deploying high-accuracy forecasts at scale.

In 2023, AutoTS has won the M6 forecasting competition, delivering the highest performance investment decisions across 12 months of stock market forecasting.
In 2023, AutoTS has won in the M6 forecasting competition, delivering the highest performance investment decisions across 12 months of stock market forecasting.

There are dozens of forecasting models usable in the `sklearn` style of `.fit()` and `.predict()`.
These include naive, statistical, machine learning, and deep learning models.
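The README lines above describe models usable in the sklearn style of .fit() and .predict(); a minimal sketch of that workflow follows (the bundled load_daily dataset, its column names, and the settings chosen are assumptions drawn from the package's examples, not part of this commit):

    from autots import AutoTS, load_daily

    # long-format sample data; columns assumed to be 'datetime', 'series_id', 'value'
    df = load_daily(long=True)

    model = AutoTS(
        forecast_length=21,      # periods ahead to forecast
        frequency='infer',       # or a pandas offset alias such as 'D'
        model_list='superfast',  # small alias list keeps the sketch quick
        max_generations=5,
    )
    model = model.fit(df, date_col='datetime', value_col='value', id_col='series_id')
    prediction = model.predict()
    point_forecasts = prediction.forecast  # wide DataFrame of point forecasts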
TODO.md (10 changes: 6 additions & 4 deletions)
@@ -12,10 +12,12 @@
* The most recent data will generally be the most important
* Forecasts are desired for the future immediately following the most recent data.

# 0.5.6 🌌 🌌 🌌
* fixes for annoying things broken by pandas 2.0, without them giving deprecation warnings
* also for gluonts who also love breaking their API
* ensembling tuning
# 0.5.7 🛂🛂🛂
* slight changes to holiday_flag to allow list in some cases
* DatepartRegressionTransformer now accepts holiday country input as regressor
* added RegressionFilter
* changed bounded behavior of AlignLastValue
* small bug fixes

### New Model Checklist:
* Add to ModelMonster in auto_model.py
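For the first changelog item above, a hedged sketch of calling the holiday flag helper with the new list input (the exact holiday_flag signature is assumed from earlier releases, and the list form is only "allowed in some cases" per the note):

    import pandas as pd
    from autots.tools.holiday import holiday_flag

    idx = pd.date_range("2023-01-01", periods=365, freq="D")
    flags_us = holiday_flag(idx, country="US")              # long-standing single-country usage
    flags_multi = holiday_flag(idx, country=["US", "CA"])   # list input, new in 0.5.7 per the notes above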
autots/evaluator/auto_model.py (5 changes: 4 additions & 1 deletion)
@@ -699,7 +699,9 @@ def ModelPrediction(
pass
print(error_msg)

transformer_object = GeneralTransformer(**transformation_dict, n_jobs=n_jobs)
transformer_object = GeneralTransformer(
**transformation_dict, n_jobs=n_jobs, holiday_country=holiday_country
)
df_train_transformed = transformer_object._fit(df_train)

# make sure regressor has same length. This could be a problem if wrong size regressor is passed.
@@ -736,6 +738,7 @@ def ModelPrediction(

transformationStartTime = datetime.datetime.now()
# Inverse the transformations, NULL FILLED IN UPPER/LOWER ONLY
# forecast inverse MUST come before upper and lower bounds inverse
df_forecast.forecast = pd.DataFrame(
transformer_object.inverse_transform(df_forecast.forecast)
)
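The auto_model.py hunks above forward holiday_country into GeneralTransformer and pin the inverse-transform order; a rough sketch of that flow follows (the toy data and transformation dict are placeholders, and 'DatepartRegression' is assumed to be the transformer alias that consumes holiday_country):

    import numpy as np
    import pandas as pd
    from autots.tools.transform import GeneralTransformer

    # toy wide-format training frame: a DatetimeIndex with one column per series
    df_train = pd.DataFrame(
        np.random.rand(120, 2),
        index=pd.date_range("2023-01-01", periods=120, freq="D"),
        columns=["series_a", "series_b"],
    )
    transformation_dict = {
        "fillna": "ffill",
        "transformations": {"0": "DatepartRegression"},
        "transformation_params": {"0": {}},
    }
    transformer = GeneralTransformer(
        **transformation_dict, n_jobs=1, holiday_country="US"  # holiday_country now forwarded, per the hunk above
    )
    df_train_transformed = transformer.fit_transform(df_train)
    # a model would be fit and predicted on the transformed scale here;
    # per the new comment in the diff, the point forecast must then be
    # inverse-transformed before the upper and lower bounds are.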
autots/evaluator/auto_ts.py (43 changes: 29 additions & 14 deletions)
@@ -6,6 +6,7 @@
import json
import sys
import time
import traceback as tb

from autots.tools.shaping import (
long_to_wide,
@@ -50,6 +51,7 @@ class AutoTS(object):
Args:
forecast_length (int): number of periods over which to evaluate forecast. Can be overridden later in .predict().
When you don't have much historical data, using a small forecast length for .fit and the full desired forecast length for .predict is usually the best possible approach given limitations.
frequency (str): 'infer' or a specific pandas datetime offset. Can be used to force rollup of data (i.e. daily input, but frequency 'M' will roll up to monthly).
prediction_interval (float): 0-1, uncertainty range for upper and lower forecasts. Adjust range, but rarely matches actual containment.
max_generations (int): number of genetic algorithms generations to run.
@@ -87,8 +89,9 @@ class AutoTS(object):
transformer_list (list): list of transformers to use, or dict of transformer:probability. Note this does not apply to initial templates.
can accept string aliases: "all", "fast", "superfast"
transformer_max_depth (int): maximum number of sequential transformers to generate for new Random Transformers. Fewer will be faster.
models_mode (str): option to adjust parameter options for newly generated models. Currently includes:
'default', 'deep' (searches more params, likely slower), and 'regressor' (forces 'User' regressor mode in regressor capable models)
models_mode (str): option to adjust parameter options for newly generated models. Only sporadically utilized. Currently includes:
'default'/'random', 'deep' (searches more params, likely slower), and 'regressor' (forces 'User' regressor mode in regressor capable models),
'gradient_boosting', 'neuralnets' (~Regression class models only)
num_validations (int): number of cross validations to perform. 0 for just train/test on best split.
Possible confusion: num_validations is the number of validations to perform *after* the first eval segment, so the total number of evals/validations will be this + 1.
Also "auto" and "max" aliases available. Max maxes out at 50.
@@ -427,7 +430,7 @@ def __init__(
)
self.initial_results = TemplateEvalObject()
self.best_model_name = ""
self.best_model_params = ""
self.best_model_params = {}
self.best_model_transformation_params = ""
self.traceback = True if verbose > 1 else False
self.future_regressor_train = None
@@ -928,7 +931,11 @@ def fit(

# preclean data
if self.preclean is not None:
self.preclean_transformer = GeneralTransformer(**self.preclean)
self.preclean_transformer = GeneralTransformer(
**self.preclean,
n_jobs=self.n_jobs,
holiday_country=self.holiday_country,
)
df_wide_numeric = self.preclean_transformer.fit_transform(df_wide_numeric)

self.df_wide_numeric = df_wide_numeric
@@ -1153,7 +1160,9 @@ def fit(
result_file=result_file,
)
except Exception as e:
print(f"Ensembling Error: {repr(e)}")
print(
f"Ensembling Error: {repr(e)}: {''.join(tb.format_exception(None, e, e.__traceback__))}"
)

# drop any duplicates in results
self.initial_results.model_results = (
@@ -1283,7 +1292,9 @@ def fit(
first_validation=False,
)
except Exception as e:
print(f"Ensembling Error: {repr(e)}")
print(
f"Post-Validation Ensembling Error: {repr(e)}: {''.join(tb.format_exception(None, e, e.__traceback__))}"
)
time.sleep(5)

error_msg_template = """No models available from validation.
@@ -1554,7 +1565,9 @@ def fit(
].copy()
except Exception as e:
if self.verbose >= 0:
print(f"Ensembling Error: {repr(e)}")
print(
f"Horizontal/Mosaic Ensembling Error: {repr(e)}: {''.join(tb.format_exception(None, e, e.__traceback__))}"
)
hens_model_results = TemplateEvalObject().model_results.copy()

# rerun validation_results aggregation with new models added
@@ -1624,7 +1637,9 @@ def fit(
self.ensemble_check = int(self.best_model_ensemble > 0)

# set flags to check if regressors or ensemble used in final model.
self.used_regressor_check = self._regr_param_check(self.best_model_params)
self.used_regressor_check = self._regr_param_check(
self.best_model_params.copy()
)
self.regressor_used = self.used_regressor_check
# clean up any remaining print statements
sys.stdout.flush()
@@ -1866,7 +1881,7 @@ def predict(
for interval in prediction_interval:
df_forecast = model_forecast(
model_name=self.best_model_name,
model_param_dict=self.best_model_params,
model_param_dict=self.best_model_params.copy(),
model_transform_dict=self.best_model_transformation_params,
df_train=self.df_wide_numeric,
forecast_length=forecast_length,
@@ -1916,7 +1931,7 @@ def predict(
else:
df_forecast = model_forecast(
model_name=self.best_model_name,
model_param_dict=self.best_model_params,
model_param_dict=self.best_model_params.copy(),
model_transform_dict=self.best_model_transformation_params,
df_train=self.df_wide_numeric,
forecast_length=forecast_length,
@@ -2333,7 +2348,7 @@ def back_forecast(
result = back_forecast(
df=input_df,
model_name=self.best_model_name,
model_param_dict=self.best_model_params,
model_param_dict=self.best_model_params.copy(),
model_transform_dict=self.best_model_transformation_params,
future_regressor_train=self.future_regressor_train,
n_splits=n_splits,
@@ -2356,7 +2371,7 @@ def horizontal_to_df(self):
raise ValueError("No best_model. AutoTS .fit() needs to be run.")
if self.best_model['Ensemble'].iloc[0] != 2:
raise ValueError("Only works on horizontal ensemble type models.")
ModelParameters = self.best_model_params
ModelParameters = self.best_model_params.copy()
series = ModelParameters['series']
series = pd.DataFrame.from_dict(series, orient="index").reset_index(drop=False)
if series.shape[1] > 2:
@@ -2398,7 +2413,7 @@ def mosaic_to_df(self):
raise ValueError("No best_model. AutoTS .fit() needs to be run.")
if self.best_model_ensemble != 2:
raise ValueError("Only works on horizontal ensemble type models.")
ModelParameters = self.best_model_params
ModelParameters = self.best_model_params.copy()
if str(ModelParameters['model_name']).lower() != 'mosaic':
raise ValueError("Only works on mosaic ensembles.")
series = pd.DataFrame.from_dict(ModelParameters['series'])
@@ -2709,7 +2724,7 @@ def plot_horizontal_model_count(
elif self.best_model_ensemble != 2:
raise ValueError("this plot only works on horizontal-style ensembles.")

if str(self.best_model_params['model_name']).lower() == "mosaic":
if str(self.best_model_params.get('model_name', None)).lower() == "mosaic":
series = self.mosaic_to_df()
transformers = series.stack().value_counts()
else:
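Two recurring patterns in the auto_ts.py hunks above are worth isolating: error handlers now print the full traceback via the traceback module rather than just repr(e), and best_model_params is .copy()-ed before being handed to downstream calls so the stored template dict cannot be mutated. A standalone sketch of both, with an invented failing step:

    import traceback as tb

    best_model_params = {"model_name": "Ensemble", "series": {"series_a": "model_1"}}

    def run_step(model_param_dict):
        # downstream code may pop or rewrite keys; a copy keeps the original intact
        model_param_dict.pop("series", None)
        raise RuntimeError("demo failure")

    try:
        run_step(best_model_params.copy())
    except Exception as e:
        # format_exception returns a list of lines; joined, it gives the full stack in one printable string
        print(
            f"Ensembling Error: {repr(e)}: {''.join(tb.format_exception(None, e, e.__traceback__))}"
        )

    assert "series" in best_model_params  # original dict untouched thanks to .copy()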
autots/evaluator/metrics.py (6 changes: 3 additions & 3 deletions)
@@ -504,7 +504,7 @@ def full_metric_evaluation(
u_weights[-1, :] = first_weight * 0.5

# over/under estimate mask
ovm = filled_full_mae_errors > 0
ovm = full_errors > 0

# note a number of these are created from my own imagination (winedarksea)
# those are also subject to change as they are tested and refined
@@ -517,9 +517,9 @@
'made': mean_absolute_differential_error(lA, lF, 1, scaler=scaler),
# aggregate error
'mage': mage, # Gandalf approved
'underestimate': np.sum(filled_full_mae_errors[~ovm], axis=0),
'underestimate': np.nansum(full_errors[~ovm], axis=0),
'mle': msle(full_errors, full_mae_errors, log_errors, nan_flag=nan_flag),
'overestimate': np.sum(filled_full_mae_errors[ovm], axis=0),
'overestimate': np.nansum(full_errors[ovm], axis=0),
'imle': msle(
-full_errors,
full_mae_errors,
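The metrics.py change above derives the over/under-estimate split from the signed errors and swaps np.sum for np.nansum so missing actuals no longer drag the totals to NaN. A small numpy sketch of the same idea with made-up arrays (kept two-dimensional with np.where so the per-series axis=0 sums survive, which is a slight simplification of the library's code):

    import numpy as np

    actuals = np.array([[10.0, 5.0], [np.nan, 6.0], [12.0, 4.0]])   # one missing observation
    forecasts = np.array([[11.0, 4.0], [9.0, 7.0], [10.0, 4.5]])

    full_errors = forecasts - actuals   # signed errors; NaN where the actual is missing
    ovm = full_errors > 0               # over-estimate mask (NaN compares as False)

    overestimate = np.nansum(np.where(ovm, full_errors, 0.0), axis=0)
    underestimate = np.nansum(np.where(~ovm, full_errors, 0.0), axis=0)
    print(overestimate, underestimate)   # [1.  1.5] and [-2. -1.] for these toy arrays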
autots/models/base.py (11 changes: 10 additions & 1 deletion)
@@ -437,7 +437,16 @@ def plot(
start_date=start_date,
)
if title is None:
title = f"{series} with model {str(self.model_name)[0:80]}"
title_prelim = str(self.model_name)[0:80]
if title_prelim == "Ensemble":
ensemble_type = self.model_parameters.get('model_name', "unknown")
if ensemble_type == "Horizontal":
title_prelim = self.model_parameters['series'].get(
series, "Horizontal"
)
else:
title_prelim = ensemble_type
title = f"{series} with model {title_prelim}"
if vline is None:
return plot_df.plot(title=title, **kwargs)
else:
autots/models/cassandra.py (40 changes: 28 additions & 12 deletions)
@@ -13,7 +13,12 @@
import pandas as pd

# using transformer version of Anomaly/Holiday to use a lower level import than evaluator
from autots.tools.seasonal import create_seasonality_feature, seasonal_int
from autots.tools.seasonal import (
create_seasonality_feature,
seasonal_int,
datepart_components,
date_part_methods,
)
from autots.tools.transform import (
GeneralTransformer,
RandomTransform,
@@ -139,6 +144,7 @@ def __init__(
random_seed: int = 2022,
verbose: int = 0,
n_jobs: int = "auto",
**kwargs,
):
if preprocessing_transformation is None:
preprocessing_transformation = {}
@@ -1750,8 +1756,9 @@ def get_new_params(self, method='fast'):
'UnivariateMotif',
'UnobservedComponents',
"KalmanStateSpace",
'RRVAR',
],
[0.05, 0.05, 0.1, 0.05, 0.05, 0.15, 0.05, 0.05, 0.05, 0.05],
[0.05, 0.05, 0.1, 0.05, 0.05, 0.15, 0.05, 0.05, 0.05, 0.05, 0.05],
k=1,
)[0]
trend_model = {'Model': model_str}
@@ -1853,7 +1860,7 @@ def get_new_params(self, method='fast'):
[0.6, 0.2, 0.1, 0.05, 0.02, 0.03],
)[0]
recency_weighting = random.choices(
[None, 0.05, 0.1, 0.25], [0.7, 0.1, 0.1, 0.1]
[None, 0.05, 0.1, 0.25, 0.5], [0.7, 0.1, 0.1, 0.1, 0.05]
)[0]
if linear_model in ['lstsq']:
linear_model = {
@@ -1888,21 +1895,30 @@
ar_interaction_seasonality = random.choices(
[None, 7, 'dayofweek', 'common_fourier'], [0.4, 0.2, 0.2, 0.2]
)[0]
seasonalities = random.choices(
[
[7, 365.25],
["dayofweek", 365.25],
["month", "dayofweek", "weekdayofmonth"],
['weekdayofmonth', 'common_fourier'],
"other",
],
[0.1, 0.1, 0.1, 0.05, 0.1],
)[0]
if seasonalities == "other":
predefined = random.choices([True, False], [0.5, 0.5])[0]
if predefined:
seasonalities = random.choice(date_part_methods)
else:
comp_opts = datepart_components + [7, 365.25, 12]
seasonalities = random.choices(comp_opts, k=2)
return {
"preprocessing_transformation": RandomTransform(
transformer_list=filters, transformer_max_depth=2, allow_none=True
),
"scaling": scaling,
# "past_impacts_intervention": self.past_impacts_intervention,
"seasonalities": random.choices(
[
[7, 365.25],
["dayofweek", 365.25],
["month", "dayofweek", "weekdayofmonth"],
['weekdayofmonth', 'common_fourier'],
],
[0.1, 0.1, 0.1, 0.05],
)[0],
"seasonalities": seasonalities,
"ar_lags": ar_lags,
"ar_interaction_seasonality": ar_interaction_seasonality,
"anomaly_detector_params": anomaly_detector_params,
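get_new_params above draws hyperparameters with weighted random.choices, and the new "other" branch either takes one predefined date-part method or assembles two components at random. A toy version of that sampling pattern (the two option lists here are stand-ins; the real datepart_components and date_part_methods come from autots.tools.seasonal):

    import random

    date_part_methods = ["simple", "expanded", "recurring"]          # stand-in names
    datepart_components = ["dayofweek", "month", "weekdayofmonth"]   # stand-in names

    seasonalities = random.choices(
        [[7, 365.25], ["dayofweek", 365.25], "other"],
        [0.45, 0.45, 0.10],
        k=1,
    )[0]
    if seasonalities == "other":
        if random.random() < 0.5:
            seasonalities = random.choice(date_part_methods)   # one predefined method string
        else:
            # two components drawn with replacement, mirroring random.choices(..., k=2) in the diff
            seasonalities = random.choices(datepart_components + [7, 365.25, 12], k=2)
    print(seasonalities)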
autots/models/ensemble.py (12 changes: 12 additions & 0 deletions)
@@ -1136,6 +1136,10 @@ def HorizontalTemplateGenerator(
)
nomen = 'Horizontal'
metric = 'Score-max'
if len(mods_per_series) < per_series.shape[1]:
raise ValueError(
"ERROR in Horizontal Generation insufficient series created, horizontal-max"
)
best5_params = {
'Model': 'Ensemble',
'ModelParameters': json.dumps(
@@ -1229,6 +1233,10 @@ def HorizontalTemplateGenerator(
)
nomen = 'Horizontal'
metric = 'Score'
if len(mods_per_series) < per_series.shape[1]:
raise ValueError(
"ERROR in Horizontal Generation insufficient series created, horizontal"
)
best5_params = {
'Model': 'Ensemble',
'ModelParameters': json.dumps(
@@ -1284,6 +1292,10 @@ def HorizontalTemplateGenerator(
)
nomen = 'Horizontal'
metric = 'Score-min'
if len(mods_per_series) < per_series.shape[1]:
raise ValueError(
"ERROR in Horizontal Generation insufficient series created, horizontal-min"
)
best5_params = {
'Model': 'Ensemble',
'ModelParameters': json.dumps(
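All three new guards in ensemble.py enforce the same invariant: horizontal ensembling must have assigned a model to every series before the template is written. A minimal standalone version of that check (the per_series frame and the idxmin selection are illustrative, not the generator's actual selection logic):

    import pandas as pd

    # rows: candidate models, columns: series; values: an error score per pair
    per_series = pd.DataFrame(
        [[0.2, 0.5, 0.1], [0.3, 0.4, 0.6]],
        index=["model_a", "model_b"],
        columns=["series_1", "series_2", "series_3"],
    )
    mods_per_series = per_series.idxmin(axis=0)   # best-scoring model for each series

    if len(mods_per_series) < per_series.shape[1]:
        raise ValueError(
            "ERROR in Horizontal Generation insufficient series created, horizontal"
        )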
autots/models/model_list.py (4 changes: 3 additions & 1 deletion)
@@ -12,7 +12,7 @@
'GluonTS',
'SeasonalNaive',
'UnobservedComponents',
'VARMAX',
# 'VARMAX',
'VECM',
'DynamicFactor',
'MotifSimulation',
@@ -147,6 +147,7 @@
}
# models that should be fast given many CPU cores
fast_parallel = {**parallel, **fast}
fast_parallel_no_arima = {i: fast_parallel[i] for i in fast_parallel if i != 'ARIMA'}
# models that are explicitly not production ready
experimental = [
'MotifSimulation',
@@ -332,6 +333,7 @@
"superfast": superfast,
"parallel": parallel,
"fast_parallel": fast_parallel,
"fast_parallel_no_arima": fast_parallel_no_arima,
"probabilistic": probabilistic,
"multivariate": multivariate,
"univariate": univariate,
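The model_list.py change filters one entry out of an existing alias dict with a comprehension and registers the result under a new alias. A short sketch of the pattern and of how such an alias would then be requested (the toy dict and the AutoTS call are illustrative):

    fast_parallel = {"ARIMA": 0.4, "Theta": 0.3, "ETS": 0.3}   # stand-in for the real weighted dict
    fast_parallel_no_arima = {i: fast_parallel[i] for i in fast_parallel if i != "ARIMA"}
    print(fast_parallel_no_arima)   # {'Theta': 0.3, 'ETS': 0.3}

    # registered aliases can then be passed by name, e.g.:
    # model = AutoTS(forecast_length=14, model_list="fast_parallel_no_arima")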
(Diffs for the remaining changed files are not shown.)
