From 8ae6c1f378ee6b8137eae7e3587f87d3529f1965 Mon Sep 17 00:00:00 2001 From: Brad Ochocki Date: Tue, 9 May 2023 15:46:49 -0700 Subject: [PATCH 1/5] simplify Dockerfile ## Overview This PR uses updated versions of Python and `prophet` to greatly simplify the Python environment setup in the Dockerfile. The code has been tested by creating a local Docker container, and sample outputs were written to the following tables in `moz-fx-data-bq-data-science.bochocki`: - `tmp_desktop_kpi_forecast` - `tmp_desktop_kpi_forecast_confidences` - `tmp_mobile_kpi_forecast` - `tmp_mobile_kpi_forecast_confidences` ## Additional Changes - `.gitignore`: ignore additional filetypes - `kpi_forecasting.py`: set confidence intervals `target` from `config` instead of relying on hardcoded `"desktop"`. This `target` is overwritten in `write_confidence_intervals_to_bigquery` [here](https://github.com/mozilla/docker-etl/blob/4cfbec915375343023944d1ca23f527251a5ada8/jobs/kpi-forecasting/kpi-forecasting/Utils/DBWriter.py#L116), but I think this change makes it clear that we're not unintentionally using "desktop" labels on "mobile" forecasts. - `PosteriorSampling.py`: minor refactoring required to resolve errors and deprecation warnings that are now being raised by pandas as a result of package upgrades. - `README.md`: update examples - `requirements.txt`: updated packages to get easier-install versions of `prophet` and `statsforecast`. 
--- jobs/kpi-forecasting/.gitignore | 5 +- jobs/kpi-forecasting/Dockerfile | 12 +- jobs/kpi-forecasting/README.md | 4 +- .../Utils/PosteriorSampling.py | 15 +- .../kpi-forecasting/kpi_forecasting.py | 2 +- jobs/kpi-forecasting/requirements.txt | 187 ++++++++++-------- 6 files changed, 118 insertions(+), 107 deletions(-) diff --git a/jobs/kpi-forecasting/.gitignore b/jobs/kpi-forecasting/.gitignore index 849c49a5..e7e9bf4b 100644 --- a/jobs/kpi-forecasting/.gitignore +++ b/jobs/kpi-forecasting/.gitignore @@ -1,3 +1,6 @@ +.cache .idea -.vscode +.local .python-version +.python_history +.vscode diff --git a/jobs/kpi-forecasting/Dockerfile b/jobs/kpi-forecasting/Dockerfile index dca65ad1..f571ac30 100644 --- a/jobs/kpi-forecasting/Dockerfile +++ b/jobs/kpi-forecasting/Dockerfile @@ -1,5 +1,5 @@ -FROM python:3.8 -MAINTAINER Perry McManis +FROM python:3.10 +LABEL maintainer="Brad Ochocki " # https://github.com/mozilla-services/Dockerflow/blob/master/docs/building-container.md ARG USER_ID="10001" @@ -12,19 +12,11 @@ RUN groupadd --gid ${USER_ID} ${GROUP_ID} && \ WORKDIR ${HOME} -RUN apt install gcc -RUN apt install g++ - RUN pip install --upgrade pip -RUN pip install pystan==2.19.1.1 -RUN python3 -m pip install prophet --no-cache-dir - COPY requirements.txt requirements.txt RUN pip install -r requirements.txt -RUN pip install git+https://github.com/Nixtla/statsforecast.git - COPY . . 
# Drop root and change ownership of the application folder to the user diff --git a/jobs/kpi-forecasting/README.md b/jobs/kpi-forecasting/README.md index dbfb1504..919d8a18 100644 --- a/jobs/kpi-forecasting/README.md +++ b/jobs/kpi-forecasting/README.md @@ -21,9 +21,9 @@ pip install -r requirements.txt Run the scripts with: ```sh -python kpi_forecasting.py -c yaml/desktop.yaml +python ~/kpi-forecasting/kpi_forecasting.py -c ~/kpi-forecasting/yaml/desktop_non_cumulative.yaml -python kpi_forecasting.py -c yaml/mobile.yaml +python ~/kpi-forecasting/kpi_forecasting.py -c ~/kpi-forecasting/yaml/mobile_non_cumulative.yaml ``` ### On SQL Queries And Preprocessing diff --git a/jobs/kpi-forecasting/kpi-forecasting/Utils/PosteriorSampling.py b/jobs/kpi-forecasting/kpi-forecasting/Utils/PosteriorSampling.py index e5348e6f..b00bd911 100644 --- a/jobs/kpi-forecasting/kpi-forecasting/Utils/PosteriorSampling.py +++ b/jobs/kpi-forecasting/kpi-forecasting/Utils/PosteriorSampling.py @@ -31,10 +31,9 @@ def get_confidence_intervals( uncertainty_samples["ds"] > np.datetime64(final_observed_sample_date) ] .groupby("{}".format(aggregation_unit_of_time)) - .sum() + .sum(numeric_only=True) ) - print(samples_df_grouped.tail()) # start the aggregated dataframe with the mean of the uncertainty samples uncertainty_samples_aggregated = samples_df_grouped.mean(axis=1).reset_index() @@ -71,6 +70,8 @@ def get_confidence_intervals( columns={"y": "value"} ).sort_values(by="{}".format(aggregation_unit_of_time)) + observed_aggregated = observed_aggregated.astype({"value": np.float64}) + # check if whether there are overlap in actual and forecast at the group level if ( aggregation_unit_of_time == "ds_month" @@ -83,10 +84,12 @@ def get_confidence_intervals( ).dayofyear != 1 ): - uncertainty_samples_aggregated.at[0, 1:] = ( - uncertainty_samples_aggregated.iloc[0, 1:] - + observed_aggregated.iloc[-1].value - ) + # add observed samples from current time period to uncertainty samples for + # the remainder 
of the period. + uncertainty_samples_aggregated.iloc[0, 1:] += observed_aggregated["value"].iloc[ + -1 + ] + observed_aggregated = observed_aggregated.loc[ observed_aggregated[aggregation_unit_of_time] < observed_aggregated[aggregation_unit_of_time].max() diff --git a/jobs/kpi-forecasting/kpi-forecasting/kpi_forecasting.py b/jobs/kpi-forecasting/kpi-forecasting/kpi_forecasting.py index 16a2a628..c86fd0de 100644 --- a/jobs/kpi-forecasting/kpi-forecasting/kpi_forecasting.py +++ b/jobs/kpi-forecasting/kpi-forecasting/kpi_forecasting.py @@ -50,7 +50,7 @@ def main() -> None: aggregation_unit_of_time=config["confidences"], asofdate=predictions["ds"].max(), final_observed_sample_date=dataset["ds"].max(), - target="desktop", + target=config["target"], ) write_predictions_to_bigquery(predictions, config) diff --git a/jobs/kpi-forecasting/requirements.txt b/jobs/kpi-forecasting/requirements.txt index 47e4f79a..5f34a3db 100644 --- a/jobs/kpi-forecasting/requirements.txt +++ b/jobs/kpi-forecasting/requirements.txt @@ -1,94 +1,107 @@ +adagio==0.2.4 +ansi2html==1.8.0 +antlr4-python3-runtime==4.11.1 appdirs==1.4.4 -attrs==20.3.0 -bcrypt==3.2.0 -beautifulsoup4==4.10.0 -BigQuery-Python==1.15.0 -black==22.3.0 -cachetools==4.2.4 -certifi==2021.10.8 -cffi==1.15.0 -charset-normalizer==2.0.12 -click==8.0.4 -cmdstanpy==0.9.68 +asttokens==2.2.1 +backcall==0.2.0 +blinker==1.6.2 +cachetools==5.3.0 +certifi==2023.5.7 +charset-normalizer==3.1.0 +click==8.1.3 +cmdstanpy==1.1.0 +comm==0.1.3 +contourpy==1.0.7 convertdate==2.4.0 -cryptography==36.0.1 cycler==0.11.0 -Cython==0.29.28 -ephem==4.1.3 -flake8==3.8.4 -google-api-core==1.31.5 -google-api-python-client==2.38.0 -google-auth-httplib2==0.1.0 -google-auth-oauthlib==0.5.0 -google-auth==1.35.0 -google-cloud-bigquery-storage==1.0.0 -google-cloud-bigquery==1.27.2 -google-cloud-core==1.7.2 -google-cloud-storage==1.31.0 -google-crc32c==1.3.0 -google-resumable-media==1.3.3 -google==3.0.0 -googleapis-common-protos==1.55.0 -grpcio==1.44.0 
-hijri-converter==2.2.3 -holidays==0.16 -httplib2==0.20.4 -idna==3.3 -iniconfig==1.1.1 -Jinja2==2.11.2 -joblib==1.2.0 -kiwisolver==1.3.2 -korean-lunar-calendar==0.2.1 +dash==2.9.3 +dash-core-components==2.0.0 +dash-html-components==2.0.0 +dash-table==5.0.0 +db-dtypes==1.1.1 +debugpy==1.6.7 +decorator==5.1.1 +ephem==4.1.4 +executing==1.2.0 +Flask==2.3.2 +fonttools==4.39.3 +fs==2.4.16 +fugue==0.8.3 +fugue-sql-antlr==0.1.6 +google-api-core==2.11.0 +google-auth==2.17.3 +google-cloud-bigquery==3.10.0 +google-cloud-core==2.3.2 +google-crc32c==1.5.0 +google-resumable-media==2.5.0 +googleapis-common-protos==1.59.0 +grpcio==1.54.0 +grpcio-status==1.54.0 +hijri-converter==2.3.1 +holidays==0.24 +idna==3.4 +ipykernel==6.23.0 +ipython==8.13.2 +itsdangerous==2.1.2 +jedi==0.18.2 +Jinja2==3.1.2 +jupyter-dash==0.4.2 +jupyter_client==8.2.0 +jupyter_core==5.3.0 +kiwisolver==1.4.4 +korean-lunar-calendar==0.3.1 +llvmlite==0.40.0 LunarCalendar==0.0.9 -MarkupSafe==1.1.1 -matplotlib==3.3.2 -mccabe==0.6.1 -more-itertools==8.6.0 -mypy-extensions==0.4.3 -numpy -oauthlib==3.2.0 -packaging==21.3 -pandas-gbq==0.13.2 -pandas==1.3.5 -paramiko==2.9.2 -pathspec==0.9.0 -Pillow==9.0.1 -plotly==4.9.0 -pluggy==0.13.1 -protobuf==3.19.4 -py==1.10.0 -pyarrow==7.0.0 -pyasn1-modules==0.2.8 -pyasn1==0.4.8 -pycodestyle==2.6.0 -pycparser==2.21 -pydata-google-auth==1.3.0 -pyflakes==2.2.0 -PyMeeus==0.5.11 -PyNaCl==1.5.0 -pyparsing==2.4.7 -pytest-black==0.3.11 -pytest-flake8==1.0.6 -pytest==6.0.2 +MarkupSafe==2.1.2 +matplotlib==3.7.1 +matplotlib-inline==0.1.6 +nest-asyncio==1.5.6 +numba==0.57.0 +numpy==1.24.3 +orjson==3.8.12 +packaging==23.1 +pandas==1.5.3 +parso==0.8.3 +patsy==0.5.3 +pexpect==4.8.0 +pickleshare==0.7.5 +Pillow==9.5.0 +platformdirs==3.5.0 +plotly==5.14.1 +plotly-resampler==0.8.3.2 +prompt-toolkit==3.0.38 +prophet==1.1.2 +proto-plus==1.22.2 +protobuf==4.23.0 +psutil==5.9.5 +ptyprocess==0.7.0 +pure-eval==0.2.2 +pyarrow==12.0.0 +pyasn1==0.5.0 +pyasn1-modules==0.3.0 +Pygments==2.15.1 +PyMeeus==0.5.12 
+pyparsing==3.0.9 python-dateutil==2.8.2 -pytz==2021.3 +pytz==2023.3 PyYAML==6.0 -regex==2020.11.13 -requests-oauthlib==1.3.1 -requests==2.27.1 -retrying==1.3.3 -rsa==4.8 -setuptools-git==1.2 +pyzmq==25.0.2 +qpd==0.4.1 +requests==2.30.0 +retrying==1.3.4 +rsa==4.9 +scipy==1.10.1 six==1.16.0 -soupsieve==2.3.1 -statsforecast==1.1.0 -statsmodels==0.13.2 -storage==0.0.4.3 -threadpoolctl==3.1.0 -toml==0.10.2 -tqdm==4.63.0 -typed-ast==1.5.4 -typing-extensions==3.10.0.0 -ujson==5.1.0 -uritemplate==4.1.1 -urllib3==1.26.8 +sqlglot==12.2.0 +stack-data==0.6.2 +statsforecast==1.5.0 +statsmodels==0.14.0 +tenacity==8.2.2 +tornado==6.3.1 +tqdm==4.65.0 +trace-updater==0.0.9.1 +traitlets==5.9.0 +triad==0.8.7 +urllib3==2.0.2 +wcwidth==0.2.6 +Werkzeug==2.3.4 \ No newline at end of file From d230e99ecd1ddab5aeae5eb3806fe85b89c76f51 Mon Sep 17 00:00:00 2001 From: Brad Ochocki Date: Tue, 9 May 2023 16:07:11 -0700 Subject: [PATCH 2/5] black format --- jobs/kpi-forecasting/kpi-forecasting/Utils/AutoArimaFit.py | 1 - 1 file changed, 1 deletion(-) diff --git a/jobs/kpi-forecasting/kpi-forecasting/Utils/AutoArimaFit.py b/jobs/kpi-forecasting/kpi-forecasting/Utils/AutoArimaFit.py index 165b3923..dae7f100 100644 --- a/jobs/kpi-forecasting/kpi-forecasting/Utils/AutoArimaFit.py +++ b/jobs/kpi-forecasting/kpi-forecasting/Utils/AutoArimaFit.py @@ -9,7 +9,6 @@ def run_forecast_arima(dataset: pd.DataFrame, config: dict) -> pd.DataFrame: - fit_parameters = config[ "forecast_parameters" ].copy() # you must force a copy here or it assigns a reference to From 27229dd77021ec9360031a7e270ef4ef65148c0d Mon Sep 17 00:00:00 2001 From: Brad Ochocki Date: Tue, 9 May 2023 16:11:10 -0700 Subject: [PATCH 3/5] change `MAINTAINER` label --- jobs/kpi-forecasting/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jobs/kpi-forecasting/Dockerfile b/jobs/kpi-forecasting/Dockerfile index f571ac30..ec1c3819 100644 --- a/jobs/kpi-forecasting/Dockerfile +++ b/jobs/kpi-forecasting/Dockerfile @@ -1,5 
+1,5 @@ FROM python:3.10 -LABEL maintainer="Brad Ochocki " +MAINTAINER "Brad Ochocki " # https://github.com/mozilla-services/Dockerflow/blob/master/docs/building-container.md ARG USER_ID="10001" From 02b5f2a1b31b823732bc910712e40a689ac5e1cd Mon Sep 17 00:00:00 2001 From: Brad Ochocki Date: Tue, 9 May 2023 16:13:49 -0700 Subject: [PATCH 4/5] Revert "change `MAINTAINER` label" This reverts commit 27229dd77021ec9360031a7e270ef4ef65148c0d. --- jobs/kpi-forecasting/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jobs/kpi-forecasting/Dockerfile b/jobs/kpi-forecasting/Dockerfile index ec1c3819..f571ac30 100644 --- a/jobs/kpi-forecasting/Dockerfile +++ b/jobs/kpi-forecasting/Dockerfile @@ -1,5 +1,5 @@ FROM python:3.10 -MAINTAINER "Brad Ochocki " +LABEL maintainer="Brad Ochocki " # https://github.com/mozilla-services/Dockerflow/blob/master/docs/building-container.md ARG USER_ID="10001" From f627ae71746877f73cd6ae2553274760283eb1bd Mon Sep 17 00:00:00 2001 From: Brad Ochocki Date: Tue, 9 May 2023 16:17:46 -0700 Subject: [PATCH 5/5] include pytest-black --- jobs/kpi-forecasting/requirements.txt | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/jobs/kpi-forecasting/requirements.txt b/jobs/kpi-forecasting/requirements.txt index 5f34a3db..39fabd15 100644 --- a/jobs/kpi-forecasting/requirements.txt +++ b/jobs/kpi-forecasting/requirements.txt @@ -4,6 +4,7 @@ antlr4-python3-runtime==4.11.1 appdirs==1.4.4 asttokens==2.2.1 backcall==0.2.0 +black==23.3.0 blinker==1.6.2 cachetools==5.3.0 certifi==2023.5.7 @@ -22,6 +23,7 @@ db-dtypes==1.1.1 debugpy==1.6.7 decorator==5.1.1 ephem==4.1.4 +exceptiongroup==1.1.1 executing==1.2.0 Flask==2.3.2 fonttools==4.39.3 @@ -40,6 +42,7 @@ grpcio-status==1.54.0 hijri-converter==2.3.1 holidays==0.24 idna==3.4 +iniconfig==2.0.0 ipykernel==6.23.0 ipython==8.13.2 itsdangerous==2.1.2 @@ -55,6 +58,7 @@ LunarCalendar==0.0.9 MarkupSafe==2.1.2 matplotlib==3.7.1 matplotlib-inline==0.1.6 +mypy-extensions==1.0.0 
nest-asyncio==1.5.6 numba==0.57.0 numpy==1.24.3 @@ -62,6 +66,7 @@ orjson==3.8.12 packaging==23.1 pandas==1.5.3 parso==0.8.3 +pathspec==0.11.1 patsy==0.5.3 pexpect==4.8.0 pickleshare==0.7.5 @@ -69,6 +74,7 @@ Pillow==9.5.0 platformdirs==3.5.0 plotly==5.14.1 plotly-resampler==0.8.3.2 +pluggy==1.0.0 prompt-toolkit==3.0.38 prophet==1.1.2 proto-plus==1.22.2 @@ -82,6 +88,8 @@ pyasn1-modules==0.3.0 Pygments==2.15.1 PyMeeus==0.5.12 pyparsing==3.0.9 +pytest==7.3.1 +pytest-black==0.3.12 python-dateutil==2.8.2 pytz==2023.3 PyYAML==6.0 @@ -97,6 +105,8 @@ stack-data==0.6.2 statsforecast==1.5.0 statsmodels==0.14.0 tenacity==8.2.2 +toml==0.10.2 +tomli==2.0.1 tornado==6.3.1 tqdm==4.65.0 trace-updater==0.0.9.1