Merge pull request #36 from IBM/time_series

mplpl · web-flow · commit 1ecad40127ff · 2023-05-16T11:56:58.000+02:00
Add timeseries wrapper
diff --git a/docs/source/predictive.rst b/docs/source/predictive.rst
@@ -57,42 +57,50 @@ Association Rules
    :undoc-members:
    :show-inheritance:
 
-Classification base module
---------------------------------------------------
+Bisecting KMeans
+-----------------------------------------------------
 
-.. automodule:: nzpyida.analytics.predictive.classification
+.. automodule:: nzpyida.analytics.predictive.bisecting_kmeans
    :members:
    :undoc-members:
    :show-inheritance:
 
-Regression base module
-----------------------------------------------
+Two Step Clustering
+-----------------------------------------------------
 
-.. automodule:: nzpyida.analytics.predictive.regression
+.. automodule:: nzpyida.analytics.predictive.two_step_clustering
    :members:
    :undoc-members:
    :show-inheritance:
 
-Predictive Modeling base module
---------------------------------------------------------
+Time Series Forecasting
+-----------------------------------------------------
 
-.. automodule:: nzpyida.analytics.predictive.predictive_modeling
+.. automodule:: nzpyida.analytics.predictive.timeseries
    :members:
    :undoc-members:
    :show-inheritance:
 
-Bisecting KMeans
------------------------------------------------------
+Classification base module
+--------------------------------------------------
 
-.. automodule:: nzpyida.analytics.predictive.bisecting_kmeans
+.. automodule:: nzpyida.analytics.predictive.classification
    :members:
    :undoc-members:
    :show-inheritance:
 
-Two Step Clustering
------------------------------------------------------
+Regression base module
+----------------------------------------------
 
-.. automodule:: nzpyida.analytics.predictive.two_step_clustering
+.. automodule:: nzpyida.analytics.predictive.regression
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+Predictive Modeling base module
+--------------------------------------------------------
+
+.. automodule:: nzpyida.analytics.predictive.predictive_modeling
    :members:
    :undoc-members:
    :show-inheritance:
diff --git a/nzpyida/analytics/__init__.py b/nzpyida/analytics/__init__.py
@@ -17,6 +17,7 @@
 from .predictive.bisecting_kmeans import BisectingKMeans
 from .predictive.regression_trees import DecisionTreeRegressor
 from .predictive.two_step_clustering import TwoStepClustering
+from .predictive.timeseries import TimeSeries
 from .exploration.distribution import bitable, moments, histogram, outliers
 from .exploration.distribution import quantile, unitable
 from .transform.discretization import EFDisc, EMDisc, EWDisc
diff --git a/nzpyida/analytics/predictive/timeseries.py b/nzpyida/analytics/predictive/timeseries.py
@@ -0,0 +1,185 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#-----------------------------------------------------------------------------
+# Copyright (c) 2023, IBM Corp.
+# All rights reserved.
+#
+# Distributed under the terms of the BSD Simplified License.
+#
+# The full license is in the LICENSE file, distributed with this software.
+#-----------------------------------------------------------------------------
+"""
+A time series model is built by analyzing series of timed numeric values, and is 
+applied immediately for predicting future values. The model itself is stored but 
+not really needed any more (except for understanding the predicted values).
+
+If specified, a table <outtable> is additionally created with the following columns: 
+<by>, <time>, forecast, standarderror. The table contains the forecast values for 
+future time points of the time series identified by <by>. For each prediction, 
+the standarderror value indicates a confidence interval around the forecast value.
+
+If specified, a table <seasadjtable> is additionally created with the following columns: 
+<by>, <time>, adjusted. The values in column <target> of the input table are seasonally 
+adjusted and then copied into this table, with the values of columns <by> and <time>.
+"""
+from typing import List
+from nzpyida.frame import IdaDataFrame
+from nzpyida.base import IdaDataBase
+from nzpyida.analytics.predictive.predictive_modeling import PredictiveModeling
+from nzpyida.analytics.utils import call_proc_df_in_out
+from nzpyida.analytics.model_manager import ModelManager
+
+class TimeSeries(PredictiveModeling):
+    """
+    Time Series Model
+    """
+    def __init__(self, idadb: IdaDataBase, model_name: str):
+        """
+        Creates Time Series
+        """
+        super().__init__(idadb, model_name)
+        self.fit_proc = "TIMESERIES"
+        self.has_print_proc = True
+    
+    def fit_predict(self, in_df: IdaDataFrame, time_column: str, target_column: str, by_column: str=None,
+            out_table: str=None, description_table: str=None, algorithm: str='ExponentialSmoothing', 
+            interpolation_method: str='linear', from_time=None, to_time=None, forecast_horizon: str=None,
+            forecast_times: str=None, trend: str=None, seasonality: str=None, period: float=None, 
+            unit: str=None, p: int=None, d: int=None, q: int=None, sp: int=None, sd: int=None, sq: int=None, 
+            saesonally_adjusted_table: str=None ) -> IdaDataFrame:
+        """
+        Predicts future values of series of timed numeric values
+
+        Parameters
+        ----------
+        in_df : IdaDataFrame
+            the input data frame
+        
+        time_column : str
+            the input data frame column which defines an order on the numeric values
+        
+        target_column : str
+            the input data frame column which contains the numeric values
+
+        by_column : str
+            the input data frame column which uniquely identifies a series of values.
+            If not specified, all numeric values belong to only one time series.
+        
+        out_table : str
+            the output table containing predicted future values. This parameter 
+            is not allowed for algorithm = SpectralAnalysis. If not specified, 
+            no output table is written out
+        
+        description_table : str
+            the optional input data frame containing the name and descriptions of the 
+            time series. The table must contain the following columns: <by_column>, 'NAME'=str, 
+            'DESCRIPTION'=str. If not specified, the series do not have a name or a description
+        
+        algorithm : str
+            the time series algorithm to use. Allowed values are: ExponentialSmoothing, 
+            ARIMA, SeasonalTrendDecomposition, SpectralAnalysis
+
+        interpolation_method : str
+            the interpolation method. Allowed values are: linear, cubicspline, exponentialspline
+
+        from_time : same as type of <time column>
+            the value of column time to start the analysis from. If not specified, the analysis 
+            starts from the first value of the time series in the input table
+
+        to_time : same as type of <time column>
+            the value of column time to stop the analysis at. If not specified, the analysis 
+            stops at the last value of the time series in the input table
+
+        forecast_horizon : str
+            the value of column time until which to predict. This parameter is not allowed for 
+            algorithm=SpectralAnalysis. If not specified, the algorithm determines itself 
+            until which time it predicts values
+
+        forecast_times : str
+            list of semicolon-separated values of column time to predict at. This parameter 
+            is not allowed for algorithm=SpectralAnalysis. If not specified, the times to predict 
+            values at is determined by the algorithm
+
+        trend : str
+            the trend type for algorithm=ExponentialSmoothing. Allowed values are: N (none), 
+            A (additive), DA (damped additive), M (multiplicative), DM (damped multiplicative). 
+            If not specified, the trend type is determined by the algorithm
+
+        seasonality : str
+            the seasonality type for algorithm=ExponentialSmoothing. Allowed values are: N (none), 
+            A (additive), M (multiplicative). If not specified, the seasonality type is 
+            determined by the algorithm
+
+        period : float
+            the seasonality period. This parameter is not allowed for algorithm=SpectralAnalysis. 
+            If not specified, the seasonality period is determined by the algorithm. If set to 0, 
+            no seasonality period will be considered by the algorithm
+        
+        unit : str
+            the seasonality period unit. This parameter is not allowed for algorithm=SpectralAnalysis. 
+            This parameter must be specified if the parameter period is specified and the <time_column>
+            is of type date, time or timestamp. Otherwise, it must not be specified. Allowed values are: 
+            ms, s, min, h, d, wk, qtr, q, a, y
+
+        p : int
+            the parameter p for algorithm=ARIMA, either equal to or below specified value. 
+            If not specified, the algorithm will determine its best value automatically
+
+        d : int
+            the parameter d for algorithm=ARIMA, either equal to or below specified value. 
+            If not specified, the algorithm will determine its best value automatically
+        
+        q : int
+            the parameter q for algorithm=ARIMA, either equal to or below specified value. 
+            If not specified, the algorithm will determine its best value automatically
+
+        sp : int
+            the seasonal parameter SP for algorithm=ARIMA, either equal to or below specified value. 
+            If not specified, the algorithm will determine its best value automatically
+
+        sd : int
+            the seasonal parameter SD for algorithm=ARIMA, either equal to or below specified value. 
+            If not specified, the algorithm will determine its best value automatically
+
+        sq : int
+            the seasonal parameter SQ for algorithm=ARIMA, either equal to or below specified value. 
+            If not specified, the algorithm will determine its best value automatically
+
+        saesonally_adjusted_table : str
+            the output table containing seasonally adjusted values. This parameter is not allowed 
+            for algorithm=SpectralAnalysis or algorithm=ARIMA. If not specified, no output table 
+            is written out
+        """
+
+        params = {
+            'model': self.model_name,
+            'time': time_column,
+            'target': target_column,
+            'by': by_column,
+            'desctable': description_table,
+            'algorithm': algorithm,
+            'interpolationmethod': interpolation_method,
+            'from': from_time,
+            'to': to_time,
+            'forecasthorizon': forecast_horizon,
+            'forecasttimes': forecast_times,
+            'trend': trend,
+            'seasonality': seasonality,
+            'period': period,
+            'unit': unit,
+            'p': p,
+            'd': d,
+            'q': q,
+            'SP': sp,
+            'SD': sd,
+            'SQ': sq,
+            'seasadjtable': saesonally_adjusted_table,
+        }
+
+        if not isinstance(in_df, IdaDataFrame):
+            raise TypeError("Argument in_df should be an IdaDataFrame")
+
+        ModelManager(self.idadb).drop_model(self.model_name)
+
+        return call_proc_df_in_out(proc=self.fit_proc, in_df=in_df, params=params,
+            out_table=out_table)[0]
diff --git a/nzpyida/analytics/tests/test_timeseries.py b/nzpyida/analytics/tests/test_timeseries.py
@@ -0,0 +1,67 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#-----------------------------------------------------------------------------
+# Copyright (c) 2023, IBM Corp.
+# All rights reserved.
+#
+# Distributed under the terms of the BSD Simplified License.
+#
+# The full license is in the LICENSE file, distributed with this software.
+#-----------------------------------------------------------------------------
+
+from nzpyida.analytics.predictive.timeseries import TimeSeries
+from nzpyida.base import IdaDataBase
+from nzpyida.frame import IdaDataFrame
+from nzpyida.analytics.model_manager import ModelManager
+import pytest
+from nzpyida.analytics.tests.conftest import MOD_NAME, TAB_NAME_TRAIN, OUT_TABLE_PRED
+import pandas as pd
+from math import sin
+
+
+@pytest.fixture(scope='module')
+def mm(idadb: IdaDataBase):
+    return ModelManager(idadb)
+
+@pytest.fixture
+def clean_up(idadb, mm):
+    if mm.model_exists(MOD_NAME):
+        mm.drop_model(MOD_NAME)
+    if idadb.exists_table(OUT_TABLE_PRED):
+        idadb.drop_table(OUT_TABLE_PRED)
+    yield
+    if mm.model_exists(MOD_NAME):
+        mm.drop_model(MOD_NAME)
+    if idadb.exists_table(OUT_TABLE_PRED):
+        idadb.drop_table(OUT_TABLE_PRED)
+
+
+@pytest.fixture
+def idf_train(idadb: IdaDataBase):
+    if idadb.exists_table(TAB_NAME_TRAIN):
+        idadb.drop_table(TAB_NAME_TRAIN)
+
+    time_series = [sin(x)+x for x in range(200)]
+    df = pd.DataFrame.from_dict({
+        "TIME": range(200),
+        "VALUE": time_series
+    })
+    yield idadb.as_idadataframe(df, TAB_NAME_TRAIN)
+
+    if idadb.exists_table(TAB_NAME_TRAIN):
+        idadb.drop_table(TAB_NAME_TRAIN)
+
+
+def test_timeseries(idadb: IdaDataBase, mm: ModelManager, idf_train: IdaDataFrame, clean_up):
+    model = TimeSeries(idadb, MOD_NAME)
+    assert model
+    assert not mm.model_exists(MOD_NAME) 
+
+    outtab = model.fit_predict(idf_train, time_column="TIME", target_column="VALUE", out_table=OUT_TABLE_PRED, 
+                       forecast_horizon='399')
+
+    assert mm.model_exists(MOD_NAME) 
+    assert outtab
+    assert len(outtab) == 200
+    assert round(outtab.head(10).iloc[-1]["VALUE"]) == round(sin(210)+210)
+    assert round(outtab.tail().iloc[-1]["VALUE"]) == round(sin(399)+399)