From 6dcb4aa223ab3512378141b29b21d9ae66df23aa Mon Sep 17 00:00:00 2001 From: Chen Date: Wed, 13 Jul 2022 19:07:55 +0800 Subject: [PATCH 01/13] data-splitter-example --- .../data _splitter_example.md | 150 ++++++++++++++++++ 1 file changed, 150 insertions(+) create mode 100644 docs/tutorials/data_manipulation/data _splitter_example.md diff --git a/docs/tutorials/data_manipulation/data _splitter_example.md b/docs/tutorials/data_manipulation/data _splitter_example.md new file mode 100644 index 0000000000..a49273512a --- /dev/null +++ b/docs/tutorials/data_manipulation/data _splitter_example.md @@ -0,0 +1,150 @@ +# Data Splitter Usage + +In this notebook, we are going to show how to use the `split` method existed in our GluonTS project. + +In the `split` method: +- you need to provide the `dataset` for the method to split. +- you also need to provide `offset` or `date`, but not both simultaneously. These two arguments are provided for the method to know how to slices training and test data based on a fixed integer offset or a ``pandas.Period``. +As a result, the `split` method returns the splited dataset, consisting of the training data `training_dataset` and the TestTemplate objectives `test_template` which knows how to generate test data `test_pairs` using the memeber function `generate_instances`. 
+ + +## Data loading and processing + + +```python +from gluonts.dataset import DataEntry +from gluonts.dataset.field_names import FieldName +from gluonts.dataset.pandas import PandasDataset +from gluonts.dataset.repository.datasets import get_dataset, dataset_recipes +from gluonts.dataset.split.splitter import OffsetSplitter, DateSplitter, split +from gluonts.dataset.util import to_pandas +``` + + +```python +%matplotlib inline +import mxnet as mx +from mxnet import gluon +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +import matplotlib.dates as mdates +import json +``` + +### Provided datasets + + +```python +url = u"https://raw.githubusercontent.com/numenta/NAB/master/data/realTweets/Twitter_volume_AMZN.csv" +df = pd.read_csv(url, header=0) +whole_dataset = PandasDataset(df, timestamp="timestamp", target="value") +``` + + +```python +plt.figure(figsize=(30, 8)) +to_pandas(list(whole_dataset)[0]).plot(color='royalblue') +plt.grid(which="both") +plt.legend(["whole dataset"], loc="upper left") +plt.show() +``` + +### Aggregate and get smaller datasets + + +```python +df["timestamp"] = pd.to_datetime(df["timestamp"]) +df2 = df.set_index("timestamp").resample("1H").sum().reset_index() +sub_dataset = PandasDataset(df2, timestamp="timestamp", target="value") +``` + + +```python +plt.figure(figsize=(20, 3)) +to_pandas(list(sub_dataset)[0]).plot(color='royalblue') +plt.grid(which="both") +plt.legend(["sub dataset"], loc="upper left") +plt.show() +``` + +## Data split + +we will take training data up to date `2015-04-07 00:00:00`, then generate several test instances from there onwards + + +```python +def get_start_end(entry): + start = entry['start'] + end = entry['start'] + len(entry['target']) * entry['start'].freq + return start, end +``` + + +```python +date = pd.Period("2015-04-07 00:00:00", freq='1H') +prediction_length=3 * 24 +training_dataset, test_template = split(sub_dataset, date=date) +test_pairs = test_template.generate_instances( 
+ prediction_length=prediction_length, + windows=3, + ) +``` + + +```python +for original_entry, train_entry in zip(sub_dataset, training_dataset): + start, end = get_start_end(train_entry) + plt.figure(figsize=(20,3)) + to_pandas(original_entry).plot(color='royalblue') + plt.grid(which="both") + plt.axvspan(start, end, facecolor='red', alpha=.2) + plt.legend(["sub dataset"], loc="upper left") + +for original_entry in sub_dataset: + for test_input, test_label in test_pairs: + start_input, end_input = get_start_end(test_input) + start_label, end_label = get_start_end(test_label) + plt.figure(figsize=(20,3)) + to_pandas(original_entry).plot(color='royalblue') + plt.grid(which="both") + plt.axvspan(start_input, end_input, facecolor='green', alpha=.2) + plt.axvspan(start_label, end_label, facecolor='blue', alpha=.2) + plt.legend(["sub dataset"], loc="upper left") +``` + +we will take training data up to date `2015-03-27 00:00:00`, then generate several test instances from date `2015-04-07 00:00:00` onwards + + +```python +TRAINING_END_DATE = pd.Period("2015-03-27 00:00:00", freq='1H') +TEST_START_DATE = pd.Period("2015-04-07 00:00:00", freq='1H') +training_dataset, _ = split(sub_dataset, date=TRAINING_END_DATE) +_, test_template = split(sub_dataset, date=TEST_START_DATE) +test_pairs = test_template.generate_instances( + prediction_length=prediction_length, + windows=3, + ) +``` + + +```python +for original_entry, train_entry in zip(sub_dataset, training_dataset): + start, end = get_start_end(train_entry) + plt.figure(figsize=(20,3)) + to_pandas(original_entry).plot(color='royalblue') + plt.grid(which="both") + plt.axvspan(start, end, facecolor='red', alpha=.2) + plt.legend(["sub dataset"], loc="upper left") + +for original_entry in sub_dataset: + for test_input, test_label in test_pairs: + start_input, end_input = get_start_end(test_input) + start_label, end_label = get_start_end(test_label) + plt.figure(figsize=(20,3)) + 
to_pandas(original_entry).plot(color='royalblue') + plt.grid(which="both") + plt.axvspan(start_input, end_input, facecolor='green', alpha=.2) + plt.axvspan(start_label, end_label, facecolor='blue', alpha=.2) + plt.legend(["sub dataset"], loc="upper left") +``` From 3b15093abce07b495cf0c33be78e09b1b7720a48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=A8=80=E5=A5=87=E5=96=B5=E5=96=B5?= <52767165+npnv@users.noreply.github.com> Date: Thu, 14 Jul 2022 16:30:51 +0800 Subject: [PATCH 02/13] Update docs/tutorials/data_manipulation/data _splitter_example.md Co-authored-by: Jasper --- docs/tutorials/data_manipulation/data _splitter_example.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tutorials/data_manipulation/data _splitter_example.md b/docs/tutorials/data_manipulation/data _splitter_example.md index a49273512a..0b0ed9352d 100644 --- a/docs/tutorials/data_manipulation/data _splitter_example.md +++ b/docs/tutorials/data_manipulation/data _splitter_example.md @@ -76,7 +76,7 @@ we will take training data up to date `2015-04-07 00:00:00`, then generate sever ```python def get_start_end(entry): start = entry['start'] - end = entry['start'] + len(entry['target']) * entry['start'].freq + end = entry['start'] + len(entry['target']) return start, end ``` From 310aa1a38744c0e49e93d162bb77d0c89bc95ef6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=A8=80=E5=A5=87=E5=96=B5=E5=96=B5?= <52767165+npnv@users.noreply.github.com> Date: Thu, 14 Jul 2022 20:32:39 +0800 Subject: [PATCH 03/13] Update docs/tutorials/data_manipulation/data _splitter_example.md Co-authored-by: Lorenzo Stella --- docs/tutorials/data_manipulation/data _splitter_example.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tutorials/data_manipulation/data _splitter_example.md b/docs/tutorials/data_manipulation/data _splitter_example.md index 0b0ed9352d..54f3c63224 100644 --- a/docs/tutorials/data_manipulation/data _splitter_example.md +++ 
b/docs/tutorials/data_manipulation/data _splitter_example.md @@ -1,4 +1,4 @@ -# Data Splitter Usage +# Splitting datasets into training and test In this notebook, we are going to show how to use the `split` method existed in our GluonTS project. From 3fd8abc90314049800363aee6593a44849477402 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=A8=80=E5=A5=87=E5=96=B5=E5=96=B5?= <52767165+npnv@users.noreply.github.com> Date: Thu, 14 Jul 2022 23:16:41 +0800 Subject: [PATCH 04/13] Update docs/tutorials/data_manipulation/data _splitter_example.md Co-authored-by: Lorenzo Stella --- .../data _splitter_example.md | 150 ------------------ .../dataset_splitting_example.md.template | 118 ++++++++++++++ 2 files changed, 118 insertions(+), 150 deletions(-) delete mode 100644 docs/tutorials/data_manipulation/data _splitter_example.md create mode 100644 docs/tutorials/data_manipulation/dataset_splitting_example.md.template diff --git a/docs/tutorials/data_manipulation/data _splitter_example.md b/docs/tutorials/data_manipulation/data _splitter_example.md deleted file mode 100644 index 54f3c63224..0000000000 --- a/docs/tutorials/data_manipulation/data _splitter_example.md +++ /dev/null @@ -1,150 +0,0 @@ -# Splitting datasets into training and test - -In this notebook, we are going to show how to use the `split` method existed in our GluonTS project. - -In the `split` method: -- you need to provide the `dataset` for the method to split. -- you also need to provide `offset` or `date`, but not both simultaneously. These two arguments are provided for the method to know how to slices training and test data based on a fixed integer offset or a ``pandas.Period``. -As a result, the `split` method returns the splited dataset, consisting of the training data `training_dataset` and the TestTemplate objectives `test_template` which knows how to generate test data `test_pairs` using the memeber function `generate_instances`. 
- - -## Data loading and processing - - -```python -from gluonts.dataset import DataEntry -from gluonts.dataset.field_names import FieldName -from gluonts.dataset.pandas import PandasDataset -from gluonts.dataset.repository.datasets import get_dataset, dataset_recipes -from gluonts.dataset.split.splitter import OffsetSplitter, DateSplitter, split -from gluonts.dataset.util import to_pandas -``` - - -```python -%matplotlib inline -import mxnet as mx -from mxnet import gluon -import numpy as np -import pandas as pd -import matplotlib.pyplot as plt -import matplotlib.dates as mdates -import json -``` - -### Provided datasets - - -```python -url = u"https://raw.githubusercontent.com/numenta/NAB/master/data/realTweets/Twitter_volume_AMZN.csv" -df = pd.read_csv(url, header=0) -whole_dataset = PandasDataset(df, timestamp="timestamp", target="value") -``` - - -```python -plt.figure(figsize=(30, 8)) -to_pandas(list(whole_dataset)[0]).plot(color='royalblue') -plt.grid(which="both") -plt.legend(["whole dataset"], loc="upper left") -plt.show() -``` - -### Aggregate and get smaller datasets - - -```python -df["timestamp"] = pd.to_datetime(df["timestamp"]) -df2 = df.set_index("timestamp").resample("1H").sum().reset_index() -sub_dataset = PandasDataset(df2, timestamp="timestamp", target="value") -``` - - -```python -plt.figure(figsize=(20, 3)) -to_pandas(list(sub_dataset)[0]).plot(color='royalblue') -plt.grid(which="both") -plt.legend(["sub dataset"], loc="upper left") -plt.show() -``` - -## Data split - -we will take training data up to date `2015-04-07 00:00:00`, then generate several test instances from there onwards - - -```python -def get_start_end(entry): - start = entry['start'] - end = entry['start'] + len(entry['target']) - return start, end -``` - - -```python -date = pd.Period("2015-04-07 00:00:00", freq='1H') -prediction_length=3 * 24 -training_dataset, test_template = split(sub_dataset, date=date) -test_pairs = test_template.generate_instances( - 
prediction_length=prediction_length, - windows=3, - ) -``` - - -```python -for original_entry, train_entry in zip(sub_dataset, training_dataset): - start, end = get_start_end(train_entry) - plt.figure(figsize=(20,3)) - to_pandas(original_entry).plot(color='royalblue') - plt.grid(which="both") - plt.axvspan(start, end, facecolor='red', alpha=.2) - plt.legend(["sub dataset"], loc="upper left") - -for original_entry in sub_dataset: - for test_input, test_label in test_pairs: - start_input, end_input = get_start_end(test_input) - start_label, end_label = get_start_end(test_label) - plt.figure(figsize=(20,3)) - to_pandas(original_entry).plot(color='royalblue') - plt.grid(which="both") - plt.axvspan(start_input, end_input, facecolor='green', alpha=.2) - plt.axvspan(start_label, end_label, facecolor='blue', alpha=.2) - plt.legend(["sub dataset"], loc="upper left") -``` - -we will take training data up to date `2015-03-27 00:00:00`, then generate several test instances from date `2015-04-07 00:00:00` onwards - - -```python -TRAINING_END_DATE = pd.Period("2015-03-27 00:00:00", freq='1H') -TEST_START_DATE = pd.Period("2015-04-07 00:00:00", freq='1H') -training_dataset, _ = split(sub_dataset, date=TRAINING_END_DATE) -_, test_template = split(sub_dataset, date=TEST_START_DATE) -test_pairs = test_template.generate_instances( - prediction_length=prediction_length, - windows=3, - ) -``` - - -```python -for original_entry, train_entry in zip(sub_dataset, training_dataset): - start, end = get_start_end(train_entry) - plt.figure(figsize=(20,3)) - to_pandas(original_entry).plot(color='royalblue') - plt.grid(which="both") - plt.axvspan(start, end, facecolor='red', alpha=.2) - plt.legend(["sub dataset"], loc="upper left") - -for original_entry in sub_dataset: - for test_input, test_label in test_pairs: - start_input, end_input = get_start_end(test_input) - start_label, end_label = get_start_end(test_label) - plt.figure(figsize=(20,3)) - to_pandas(original_entry).plot(color='royalblue') 
- plt.grid(which="both") - plt.axvspan(start_input, end_input, facecolor='green', alpha=.2) - plt.axvspan(start_label, end_label, facecolor='blue', alpha=.2) - plt.legend(["sub dataset"], loc="upper left") -``` diff --git a/docs/tutorials/data_manipulation/dataset_splitting_example.md.template b/docs/tutorials/data_manipulation/dataset_splitting_example.md.template new file mode 100644 index 0000000000..3cec1a456c --- /dev/null +++ b/docs/tutorials/data_manipulation/dataset_splitting_example.md.template @@ -0,0 +1,118 @@ +# Splitting datasets into training and test + +In this notebook, we are going to show that given a dataset how can we split it when necessary. Specifically, we'll use the `split` function that already existed in our GluonTS project. + +In the `split` function: +- you need to provide the `dataset` for the method to split. +- you also need to provide `offset` or `date`, but not both simultaneously. These two arguments are provided for the method to know how to slices training and test data based on a fixed integer offset or a ``pandas.Period``. +As a result, the `split` method returns the splited dataset, consisting of the training data `training_dataset` and the TestTemplate objectives `test_template` which knows how to generate test data `test_pairs` using the memeber function `generate_instances`. + + +## Data loading and processing + + +```python +from gluonts.dataset.field_names import FieldName +from gluonts.dataset.pandas import PandasDataset +from gluonts.dataset.split.splitter import split +from gluonts.dataset.util import to_pandas +``` + + +```python +%matplotlib inline +import pandas as pd +import matplotlib.pyplot as plt +``` + +Config plotting attributes. 
+ + +```python +plt.rcParams["axes.grid"] = True +plt.rcParams["figure.figsize"] = (20,3) +``` + +### Get some datasets + +Load data from an online csv into `pandas.DataFrame`, which is a long time series, with the frequency being 5 minutes and the length being 15831, so we resample the time series into an hour bins and sum the values of the timestamps falling into a bin, making the time series look nicer and more organized. + + +```python +url = "https://raw.githubusercontent.com/numenta/NAB/master/data/realTweets/Twitter_volume_AMZN.csv" +df = pd.read_csv(url, header=0) +df["timestamp"] = pd.to_datetime(df["timestamp"]) +df = df.set_index("timestamp") +df_resample = df.resample("1H").sum() +dataset = PandasDataset(df_resample.reset_index(), timestamp="timestamp", target="value") +``` + +Plot these two dataframe + + +```python +df.plot() +plt.legend(["whole dataset"], loc="upper left") + +df_resample.plot() +plt.legend(["sub dataset"], loc="upper left") +plt.show() +``` + +## Specific splitting examples + +Define two methods: +- In the `highlight_entry` method, we calculate the start and end of a given `entry` and highlight the region with the given `color`. +- In the `plot_dataset_splitting` method, we plot the `original_dataset` and highlight the corresponding splitted `training_dataset` and `test_pairs`. 
+ + +```python +def highlight_entry(entry, color): + start = entry[FieldName.START] + end = entry[FieldName.START] + len(entry[FieldName.TARGET]) + plt.axvspan(start, end, facecolor=color, alpha=.2) + + +def plot_dataset_splitting(original_dataset, training_dataset, test_pairs): + for original_entry, train_entry in zip(original_dataset, training_dataset): + to_pandas(original_entry).plot() + highlight_entry(train_entry, "red") + plt.legend(["sub dataset", "training dataset"], loc="upper left") + plt.show() + + for original_entry in original_dataset: + for test_input, test_label in test_pairs: + to_pandas(original_entry).plot() + highlight_entry(test_input, "green") + highlight_entry(test_label, "blue") + plt.legend(["sub dataset", "test input", "test label"], loc="upper left") + plt.show() +``` + +For the first example, we will take training data up to specific date, and then directly generate several test instances from the same date onwards + + +```python +prediction_length = 3 * 24 +training_dataset, test_template = split(dataset, date=pd.Period("2015-04-07 00:00:00", freq='1H')) +test_pairs = test_template.generate_instances( + prediction_length=prediction_length, + windows=3, + ) + +plot_dataset_splitting(dataset, training_dataset, test_pairs) +``` + +However, we don't necessarily need to align the date where training data ends with the date where the testing part starts. So, for the second example, we will take training data up to date `2015-03-28 00:00:00`, then generate several test instances from date `2015-04-07 00:00:00` onwards by applying the `split` function two times. 
+ + +```python +training_dataset, _ = split(dataset, date=pd.Period("2015-03-28 00:00:00", freq='1H')) +_, test_template = split(dataset, date=pd.Period("2015-04-07 00:00:00", freq='1H')) +test_pairs = test_template.generate_instances( + prediction_length=prediction_length, + windows=3, + ) + +plot_dataset_splitting(dataset, training_dataset, test_pairs) +``` From 8c48b2912b2af53684781d9786a5622545f3d69e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=A8=80=E5=A5=87=E5=96=B5=E5=96=B5?= <52767165+npnv@users.noreply.github.com> Date: Mon, 18 Jul 2022 16:28:18 +0800 Subject: [PATCH 05/13] Update docs/tutorials/data_manipulation/dataset_splitting_example.md.template Co-authored-by: Lorenzo Stella --- .../dataset_splitting_example.md.template | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/docs/tutorials/data_manipulation/dataset_splitting_example.md.template b/docs/tutorials/data_manipulation/dataset_splitting_example.md.template index 3cec1a456c..843f37a617 100644 --- a/docs/tutorials/data_manipulation/dataset_splitting_example.md.template +++ b/docs/tutorials/data_manipulation/dataset_splitting_example.md.template @@ -1,11 +1,14 @@ # Splitting datasets into training and test -In this notebook, we are going to show that given a dataset how can we split it when necessary. Specifically, we'll use the `split` function that already existed in our GluonTS project. +In this notebook, we are going to show how to split the entries of a dataset, along time, to construct training and test subsets of data (or training/validation/test). Specifically, we'll use the `split` function: -In the `split` function: -- you need to provide the `dataset` for the method to split. -- you also need to provide `offset` or `date`, but not both simultaneously. These two arguments are provided for the method to know how to slices training and test data based on a fixed integer offset or a ``pandas.Period``. 
-As a result, the `split` method returns the splited dataset, consisting of the training data `training_dataset` and the TestTemplate objectives `test_template` which knows how to generate test data `test_pairs` using the memeber function `generate_instances`. +PUT IMPORT LINE FOR `split` HERE + +This needs to be given: +- the `dataset` that we want to split; +- an `offset` or a `date`, but not both simultaneously. These two arguments are provided for the function to know how to slice training and test data, based on a fixed integer offset or a ``pandas.Period``, respectively. + +As a result, the `split` method returns the splited dataset, consisting of the training data `training_dataset` and a "test template" that knows how to generate input/output test pairs. ## Data loading and processing From c3c3a524536c6d1a75f8ec710e6e3ea2076f7a21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=A8=80=E5=A5=87=E5=96=B5=E5=96=B5?= <52767165+npnv@users.noreply.github.com> Date: Mon, 18 Jul 2022 16:28:39 +0800 Subject: [PATCH 06/13] Update docs/tutorials/data_manipulation/dataset_splitting_example.md.template Co-authored-by: Lorenzo Stella --- .../data_manipulation/dataset_splitting_example.md.template | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/tutorials/data_manipulation/dataset_splitting_example.md.template b/docs/tutorials/data_manipulation/dataset_splitting_example.md.template index 843f37a617..9a27f8cca6 100644 --- a/docs/tutorials/data_manipulation/dataset_splitting_example.md.template +++ b/docs/tutorials/data_manipulation/dataset_splitting_example.md.template @@ -15,7 +15,6 @@ As a result, the `split` method returns the splited dataset, consisting of the t ```python -from gluonts.dataset.field_names import FieldName from gluonts.dataset.pandas import PandasDataset from gluonts.dataset.split.splitter import split from gluonts.dataset.util import to_pandas From ecbd796e2e8b2a9fb4b558e5c7e866047c5b9d25 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?=E7=A8=80=E5=A5=87=E5=96=B5=E5=96=B5?= <52767165+npnv@users.noreply.github.com> Date: Mon, 18 Jul 2022 16:30:59 +0800 Subject: [PATCH 07/13] Update docs/tutorials/data_manipulation/dataset_splitting_example.md.template Co-authored-by: Lorenzo Stella --- .../dataset_splitting_example.md.template | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/docs/tutorials/data_manipulation/dataset_splitting_example.md.template b/docs/tutorials/data_manipulation/dataset_splitting_example.md.template index 9a27f8cca6..0a1c950854 100644 --- a/docs/tutorials/data_manipulation/dataset_splitting_example.md.template +++ b/docs/tutorials/data_manipulation/dataset_splitting_example.md.template @@ -49,17 +49,10 @@ df_resample = df.resample("1H").sum() dataset = PandasDataset(df_resample.reset_index(), timestamp="timestamp", target="value") ``` -Plot these two dataframe - +This is what the data looks like: ```python -df.plot() -plt.legend(["whole dataset"], loc="upper left") - df_resample.plot() -plt.legend(["sub dataset"], loc="upper left") -plt.show() -``` ## Specific splitting examples From c9b39e80f294fcbe2a8295d70b001921ea95c217 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=A8=80=E5=A5=87=E5=96=B5=E5=96=B5?= <52767165+npnv@users.noreply.github.com> Date: Mon, 18 Jul 2022 16:31:25 +0800 Subject: [PATCH 08/13] Update docs/tutorials/data_manipulation/dataset_splitting_example.md.template Co-authored-by: Lorenzo Stella --- .../data_manipulation/dataset_splitting_example.md.template | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/tutorials/data_manipulation/dataset_splitting_example.md.template b/docs/tutorials/data_manipulation/dataset_splitting_example.md.template index 0a1c950854..1029ab96e2 100644 --- a/docs/tutorials/data_manipulation/dataset_splitting_example.md.template +++ b/docs/tutorials/data_manipulation/dataset_splitting_example.md.template @@ -56,9 +56,7 @@ df_resample.plot() ## Specific splitting examples -Define two 
methods: -- In the `highlight_entry` method, we calculate the start and end of a given `entry` and highlight the region with the given `color`. -- In the `plot_dataset_splitting` method, we plot the `original_dataset` and highlight the corresponding splitted `training_dataset` and `test_pairs`. +Let's define a few helper functions to visualize data splits. ```python From 0ab7258e88afee3186b79b47a2e09f15c0cd5ba1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=A8=80=E5=A5=87=E5=96=B5=E5=96=B5?= <52767165+npnv@users.noreply.github.com> Date: Mon, 18 Jul 2022 16:32:50 +0800 Subject: [PATCH 09/13] Update docs/tutorials/data_manipulation/dataset_splitting_example.md.template Co-authored-by: Lorenzo Stella --- .../dataset_splitting_example.md.template | 62 +++++++++++-------- 1 file changed, 37 insertions(+), 25 deletions(-) diff --git a/docs/tutorials/data_manipulation/dataset_splitting_example.md.template b/docs/tutorials/data_manipulation/dataset_splitting_example.md.template index 1029ab96e2..5eb8bf70d8 100644 --- a/docs/tutorials/data_manipulation/dataset_splitting_example.md.template +++ b/docs/tutorials/data_manipulation/dataset_splitting_example.md.template @@ -2,7 +2,10 @@ In this notebook, we are going to show how to split the entries of a dataset, along time, to construct training and test subsets of data (or training/validation/test). Specifically, we'll use the `split` function: -PUT IMPORT LINE FOR `split` HERE + +```python +from gluonts.dataset.split.splitter import split +``` This needs to be given: - the `dataset` that we want to split; @@ -10,49 +13,43 @@ This needs to be given: As a result, the `split` method returns the splited dataset, consisting of the training data `training_dataset` and a "test template" that knows how to generate input/output test pairs. 
- ## Data loading and processing -```python -from gluonts.dataset.pandas import PandasDataset -from gluonts.dataset.split.splitter import split -from gluonts.dataset.util import to_pandas -``` - - ```python %matplotlib inline import pandas as pd import matplotlib.pyplot as plt -``` - -Config plotting attributes. - - -```python plt.rcParams["axes.grid"] = True plt.rcParams["figure.figsize"] = (20,3) ``` ### Get some datasets -Load data from an online csv into `pandas.DataFrame`, which is a long time series, with the frequency being 5 minutes and the length being 15831, so we resample the time series into an hour bins and sum the values of the timestamps falling into a bin, making the time series look nicer and more organized. +For our examples, we will use data from the following `csv` file, which is originally sampled every 5 minutes, but we resample at hourly frequency. Note that this makes for a dataset consisting of a single time series, but everything we show here applies to any dataset, regardless of how many series it contains. ```python url = "https://raw.githubusercontent.com/numenta/NAB/master/data/realTweets/Twitter_volume_AMZN.csv" -df = pd.read_csv(url, header=0) -df["timestamp"] = pd.to_datetime(df["timestamp"]) -df = df.set_index("timestamp") -df_resample = df.resample("1H").sum() -dataset = PandasDataset(df_resample.reset_index(), timestamp="timestamp", target="value") +df = pd.read_csv(url, header=0, index_col="timestamp", parse_dates=True).resample("1H").sum() ``` This is what the data looks like: + +```python +df.plot() +plt.legend(["dataset"], loc="upper left") +plt.show() +``` + +Turn the dataframe into a GluonTS dataset using `PandasDataset` + + ```python -df_resample.plot() +from gluonts.dataset.pandas import PandasDataset +dataset = PandasDataset(df, target="value") +``` ## Specific splitting examples @@ -60,9 +57,10 @@ Let's define a few helper functions to visualize data splits. 
```python +from gluonts.dataset.util import to_pandas def highlight_entry(entry, color): - start = entry[FieldName.START] - end = entry[FieldName.START] + len(entry[FieldName.TARGET]) + start = entry["start"] + end = entry["start"] + len(entry["target"]) plt.axvspan(start, end, facecolor=color, alpha=.2) @@ -82,7 +80,7 @@ def plot_dataset_splitting(original_dataset, training_dataset, test_pairs): plt.show() ``` -For the first example, we will take training data up to specific date, and then directly generate several test instances from the same date onwards +For the first example, we will take training data up to a specific date, and then directly generate several test instances from the same date onwards. ```python @@ -109,3 +107,17 @@ test_pairs = test_template.generate_instances( plot_dataset_splitting(dataset, training_dataset, test_pairs) ``` + +Also, the test instances don't necessarily need to be placed back to back in time. So, for the third example, we will pass the `distance` argument to the `generate_instances` function to make the test instances overlap with each other. 
+ + +```python +training_dataset, test_template = split(dataset, date=pd.Period("2015-04-07 00:00:00", freq='1H')) +test_pairs = test_template.generate_instances( + prediction_length=prediction_length, + windows=3, + distance=24, + ) + +plot_dataset_splitting(dataset, training_dataset, test_pairs) +``` From 7cb672851b1238a6875919826a2d7592325c51a6 Mon Sep 17 00:00:00 2001 From: Chen Date: Wed, 3 Aug 2022 17:56:08 +0800 Subject: [PATCH 10/13] update test instance --- .../hp_tuning_with_optuna.md.template | 2 +- src/gluonts/dataset/split/splitter.py | 46 +++++++++++++++---- test/dataset/split/test_split.py | 12 +++-- 3 files changed, 45 insertions(+), 15 deletions(-) diff --git a/docs/tutorials/advanced_topics/hp_tuning_with_optuna.md.template b/docs/tutorials/advanced_topics/hp_tuning_with_optuna.md.template index 471c05b720..36a20eb53f 100644 --- a/docs/tutorials/advanced_topics/hp_tuning_with_optuna.md.template +++ b/docs/tutorials/advanced_topics/hp_tuning_with_optuna.md.template @@ -104,7 +104,7 @@ class DeepARTuningObjective: self.metric_type = metric_type self.train, test_template = split(dataset, offset=-self.prediction_length) - validation = list(test_template.generate_instances(prediction_length=prediction_length)) + validation = test_template.generate_instances(prediction_length=prediction_length) self.validation_input = [entry[0] for entry in validation] self.validation_label = [dataentry_to_dataframe(entry[1]) for entry in validation] diff --git a/src/gluonts/dataset/split/splitter.py b/src/gluonts/dataset/split/splitter.py index c90f78f246..c59d83ee08 100644 --- a/src/gluonts/dataset/split/splitter.py +++ b/src/gluonts/dataset/split/splitter.py @@ -44,6 +44,7 @@ from abc import ABC, abstractmethod from dataclasses import dataclass +from dis import dis from typing import cast, Generator, Iterable, List, Optional, Tuple import numpy as np @@ -250,6 +251,34 @@ def _check_split_length( assert train_end + prediction_length <= test_end, msg +@dataclass +class 
TestIterable: + """ + An iterable class used for wrapping test data. + + Parameters + ---------- + dataset: + Whole dataset used for testing. + splitter: + A specific splitter that knows how to slices training and + test data. + kwargs: + Parameters used for generating specific test instances. + See `TestTemplate.generate_instances` + """ + + dataset: Dataset + splitter: AbstractBaseSplitter + kwargs: dict + + def __iter__(self): + yield from self.splitter._generate_test_slices( + self.dataset, + **self.kwargs, + ) + + @dataclass class TestTemplate: """ @@ -273,7 +302,7 @@ def generate_instances( windows=1, distance=None, max_history=None, - ) -> Generator[Tuple[DataEntry, DataEntry], None, None]: + ) -> TestIterable: """ Generate an iterator of test dataset, which includes input part and label part. @@ -292,14 +321,13 @@ def generate_instances( If given, all entries in the *test*-set have a max-length of `max_history`. This can be used to produce smaller file-sizes. """ - - yield from self.splitter._generate_test_slices( - self.dataset, - prediction_length=prediction_length, - windows=windows, - distance=distance, - max_history=max_history, - ) + kwargs = { + "prediction_length": prediction_length, + "windows": windows, + "distance": distance, + "max_history": max_history, + } + return TestIterable(self.dataset, self.splitter, kwargs) @dataclass diff --git a/test/dataset/split/test_split.py b/test/dataset/split/test_split.py index fe4f593e62..69baac1f63 100644 --- a/test/dataset/split/test_split.py +++ b/test/dataset/split/test_split.py @@ -160,11 +160,13 @@ def test_split(date, offset, windows, distance, max_history): train = list(train) validation = list( - test_template.generate_instances( - prediction_length=prediction_length, - windows=windows, - distance=distance, - max_history=max_history, + iter( + test_template.generate_instances( + prediction_length=prediction_length, + windows=windows, + distance=distance, + max_history=max_history, + ) ) ) From 
20bb2f52f98cfa0ebf8428ded392ca0e60655e45 Mon Sep 17 00:00:00 2001 From: Lorenzo Stella Date: Wed, 3 Aug 2022 13:10:24 +0200 Subject: [PATCH 11/13] remove unused import --- src/gluonts/dataset/split/splitter.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/gluonts/dataset/split/splitter.py b/src/gluonts/dataset/split/splitter.py index c59d83ee08..698199aae6 100644 --- a/src/gluonts/dataset/split/splitter.py +++ b/src/gluonts/dataset/split/splitter.py @@ -44,7 +44,6 @@ from abc import ABC, abstractmethod from dataclasses import dataclass -from dis import dis from typing import cast, Generator, Iterable, List, Optional, Tuple import numpy as np From 6d2e3cabf57aa313a95cb262b9ae89e219e39650 Mon Sep 17 00:00:00 2001 From: Lorenzo Stella Date: Wed, 3 Aug 2022 13:48:40 +0200 Subject: [PATCH 12/13] undo change --- test/dataset/split/test_split.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/test/dataset/split/test_split.py b/test/dataset/split/test_split.py index 69baac1f63..f6c221c630 100644 --- a/test/dataset/split/test_split.py +++ b/test/dataset/split/test_split.py @@ -160,13 +160,12 @@ def test_split(date, offset, windows, distance, max_history): train = list(train) validation = list( - iter( - test_template.generate_instances( - prediction_length=prediction_length, - windows=windows, - distance=distance, - max_history=max_history, - ) + test_template.generate_instances( + prediction_length=prediction_length, + windows=windows, + distance=distance, + max_history=max_history, + ) ) ) From 117db36403aed3ab4168ac00c6d60e30a2b73561 Mon Sep 17 00:00:00 2001 From: Lorenzo Stella Date: Wed, 3 Aug 2022 13:49:42 +0200 Subject: [PATCH 13/13] fixup --- test/dataset/split/test_split.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/dataset/split/test_split.py b/test/dataset/split/test_split.py index f6c221c630..fe4f593e62 100644 --- a/test/dataset/split/test_split.py +++ b/test/dataset/split/test_split.py @@ -166,7 +166,6 @@ def 
test_split(date, offset, windows, distance, max_history): distance=distance, max_history=max_history, ) - ) ) assert len(train) == len(dataset)