From b28dcd083652ef1bc1f255494b76af34fa0ca620 Mon Sep 17 00:00:00 2001
From: clmrie
Date: Fri, 20 Dec 2024 22:19:27 +0100
Subject: [PATCH 1/7] UP my solution

---
 sklearn_questions.py | 98 ++++++++++++++++++++++++++++++++++++--------
 1 file changed, 82 insertions(+), 16 deletions(-)

diff --git a/sklearn_questions.py b/sklearn_questions.py
index fa02e0d..ad91ed2 100644
--- a/sklearn_questions.py
+++ b/sklearn_questions.py
@@ -50,18 +50,20 @@
 import numpy as np
 import pandas as pd
 
+import pandas.api.types as pdtypes
+
 from sklearn.base import BaseEstimator
 from sklearn.base import ClassifierMixin
 from sklearn.model_selection import BaseCrossValidator
+from sklearn.utils.multiclass import unique_labels
+from sklearn.utils.validation import validate_data, check_is_fitted
 
-from sklearn.utils.validation import check_X_y, check_is_fitted
-from sklearn.utils.validation import check_array
-from sklearn.utils.multiclass import check_classification_targets
-from sklearn.metrics.pairwise import pairwise_distances
+from collections import Counter
 
 
-class KNearestNeighbors(BaseEstimator, ClassifierMixin):
+class KNearestNeighbors(ClassifierMixin, BaseEstimator):
     """KNearestNeighbors classifier."""
 
     def __init__(self, n_neighbors=1):  # noqa: D107
@@ -82,6 +84,10 @@ def fit(self, X, y):
         self : instance of KNearestNeighbors
             The current instance of the classifier
         """
+        X, y = validate_data(self, X, y)
+        self.classes_ = unique_labels(y)
+        self.X_ = X
+        self.y_ = y
         return self
 
     def predict(self, X):
@@ -97,7 +103,21 @@ def predict(self, X):
         y : ndarray, shape (n_test_samples,)
             Predicted class labels for each test data sample.
         """
-        y_pred = np.zeros(X.shape[0])
+        check_is_fitted(self)
+        X = validate_data(self, X, reset=False)
+
+        y_pred = np.full(X.shape[0], self.y_[0])
+        for id in range(X.shape[0]):
+            x = X[id]
+            liste_y = []
+
+            list_dis = np.sum((self.X_ - x) ** 2, axis=1)
+            list_Id_min = np.argpartition(list_dis,
+                                          self.n_neighbors)[:self.n_neighbors]
+            for Id_min in list_Id_min:
+                liste_y += [self.y_[Id_min]]
+
+            y_pred[id] = Counter(liste_y).most_common(1)[0][0]
         return y_pred
 
     def score(self, X, y):
@@ -115,7 +135,12 @@ def score(self, X, y):
         score : float
             Accuracy of the model computed for the (X, y) pairs.
         """
-        return 0.
+        y_pred = self.predict(X)
+        Accu = 0
+        for id in range(X.shape[0]):
+            if y[id] == y_pred[id]:
+                Accu += 1
+        return Accu / X.shape[0]
 
 
 class MonthlySplit(BaseCrossValidator):
@@ -155,7 +180,24 @@ def get_n_splits(self, X, y=None, groups=None):
         n_splits : int
             The number of splits.
         """
-        return 0
+        if self.time_col == 'index':
+            if not isinstance(X.index, pd.DatetimeIndex):
+                raise ValueError('datetime')
+            df_tri = X.sort_index()
+            liste_mois = df_tri.index.month
+
+        else:
+            if not pdtypes.is_datetime64_dtype(X[self.time_col]):
+                raise ValueError('datetime')
+            df_tri = X.sort_values(by=self.time_col)
+            df_tri.index = df_tri[self.time_col]
+            liste_mois = df_tri.index.month
+
+        n_splits = 0
+        for id in range(1, len(liste_mois)):
+            if liste_mois[id] != liste_mois[id - 1]:
+                n_splits += 1
+        return n_splits
 
     def split(self, X, y, groups=None):
         """Generate indices to split data into training and test set.
 
         Parameters
         ----------
         X : array-like of shape (n_samples, n_features)
             Training data, where `n_samples` is the number of samples
             and `n_features` is the number of features.
         y : array-like of shape (n_samples,)
             Always ignored, exists for compatibility.
         groups : array-like of shape (n_samples,)
             Always ignored, exists for compatibility.
 
         Yields
         ------
         idx_train : ndarray
             The training set indices for that split.
         idx_test : ndarray
             The testing set indices for that split.
         """
-
-        n_samples = X.shape[0]
         n_splits = self.get_n_splits(X, y, groups)
-        for i in range(n_splits):
-            idx_train = range(n_samples)
-            idx_test = range(n_samples)
-            yield (
-                idx_train, idx_test
-            )
+
+        if self.time_col == 'index':
+            liste_mois = [sorted(X.index)[0]]
+
+        else:
+            liste_mois = [sorted(X['date'])[0]]
+
+        for mois in range(n_splits):
+            liste_mois += [liste_mois[-1] + pd.DateOffset(months=1)]
+
+        for split in range(n_splits):
+            mois_train = liste_mois[split]
+            mois_test = liste_mois[split + 1]
+            idx_train = []
+            idx_test = []
+
+            for Idx in range(len(X)):
+                if self.time_col == 'index':
+                    date = X.index[Idx]
+                else:
+                    date = X.iloc[Idx]['date']
+
+                if (date.month == mois_train.month and
+                        date.year == mois_train.year):
+                    idx_train.append(Idx)
+
+                elif (date.month == mois_test.month and
+                        date.year == mois_test.year):
+                    idx_test.append(Idx)
+
+            yield (idx_train, idx_test)
+    
\ No newline at end of file
From 4ef52c11a08595a49e355eb36f34a3997baa7e7d Mon Sep 17 00:00:00 2001
From: clmrie
Date: Fri, 20 Dec 2024 22:46:01 +0100
Subject: [PATCH 2/7] UP my solution

---
 sklearn_questions.py | 267 ++++++++++++++++++++-----------------------
 1 file changed, 121 insertions(+), 146 deletions(-)

diff --git a/sklearn_questions.py b/sklearn_questions.py
index ad91ed2..a061416 100644
--- a/sklearn_questions.py
+++ b/sklearn_questions.py
@@ -1,52 +1,5 @@
-"""Assignment - making a sklearn estimator and cv splitter.
-
-The goal of this assignment is to implement by yourself:
-
-- a scikit-learn estimator for the KNearestNeighbors for classification
-  tasks and check that it is working properly.
-- a scikit-learn CV splitter where the splits are based on a Pandas
-  DateTimeIndex.
-
-Detailed instructions for question 1:
-The nearest neighbor classifier predicts for a point X_i the target y_k of
-the training sample X_k which is the closest to X_i. We measure proximity with
-the Euclidean distance. The model will be evaluated with the accuracy (average
-number of samples correctly classified). You need to implement the `fit`,
-`predict` and `score` methods for this class. The code you write should pass
-the test we implemented. You can run the tests by calling at the root of the
-repo `pytest test_sklearn_questions.py`. Note that to be fully valid, a
-scikit-learn estimator needs to check that the input given to `fit` and
-`predict` are correct using the `check_*` functions imported in the file.
-You can find more information on how they should be used in the following doc:
-https://scikit-learn.org/stable/developers/develop.html#rolling-your-own-estimator.
-Make sure to use them to pass `test_nearest_neighbor_check_estimator`.
-
-
-Detailed instructions for question 2:
-The data to split should contain the index or one column in
-datetime format. Then the aim is to split the data between train and test
-sets when for each pair of successive months, we learn on the first and
-predict on the following. For example if you have data distributed from
-november 2020 to march 2021, you have 4 splits. The first split
-will allow to learn on november data and predict on december data, the
-second split to learn december and predict on january etc.
-
-We also ask you to respect the pep8 convention: https://pep8.org. This will be
-enforced with `flake8`. You can check that there are no flake8 errors by
-calling `flake8` at the root of the repo.
-
-Finally, you need to write docstrings for the methods you code and for the
-class. The docstring will be checked using `pydocstyle` that you can also
-call at the root of the repo.
-
-Hints
------
-- You can use the function:
-
-from sklearn.metrics.pairwise import pairwise_distances
-
-to compute distances between 2 sets of samples.
-"""
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
 
 import numpy as np
 import pandas as pd
@@ -59,24 +12,24 @@
 from sklearn.utils.multiclass import unique_labels
 from sklearn.utils.validation import validate_data, check_is_fitted
 
-
 from collections import Counter
 
 
 class KNearestNeighbors(ClassifierMixin, BaseEstimator):
+
     """KNearestNeighbors classifier."""
 
-    def __init__(self, n_neighbors=1):  # noqa: D107
-        self.n_neighbors = n_neighbors
+    def __init__(self, num_neighbors=1):  # noqa: D107
+        self.num_neighbors = num_neighbors
 
-    def fit(self, X, y):
+    def fit(self, features, labels):
         """Fitting function.
 
         Parameters
         ----------
-        X : ndarray, shape (n_samples, n_features)
+        features : ndarray, shape (n_samples, n_features)
             Data to train the model.
-        y : ndarray, shape (n_samples,)
+        labels : ndarray, shape (n_samples,)
             Labels associated with the training data.
 
         Returns
         ----------
         self : instance of KNearestNeighbors
             The current instance of the classifier
         """
-        X, y = validate_data(self, X, y)
-        self.classes_ = unique_labels(y)
-        self.X_ = X
-        self.y_ = y
+
+        (features, labels) = validate_data(self, features, labels)
+        self.classes_ = unique_labels(labels)
+        self.training_features_ = features
+        self.training_labels_ = labels
         return self
 
-    def predict(self, X):
+    def predict(self, features):
         """Predict function.
 
         Parameters
         ----------
-        X : ndarray, shape (n_test_samples, n_features)
+        features : ndarray, shape (n_test_samples, n_features)
             Data to predict on.
 
         Returns
         ----------
-        y : ndarray, shape (n_test_samples,)
+        predictions : ndarray, shape (n_test_samples,)
             Predicted class labels for each test data sample.
         """
-        check_is_fitted(self)
-        X = validate_data(self, X, reset=False)
-
-        y_pred = np.full(X.shape[0], self.y_[0])
-        for id in range(X.shape[0]):
-            x = X[id]
-            liste_y = []
-
-            list_dis = np.sum((self.X_ - x) ** 2, axis=1)
-            list_Id_min = np.argpartition(list_dis,
-                                          self.n_neighbors)[:self.n_neighbors]
-            for Id_min in list_Id_min:
-                liste_y += [self.y_[Id_min]]
-
-            y_pred[id] = Counter(liste_y).most_common(1)[0][0]
-        return y_pred
-
-    def score(self, X, y):
+
+        check_is_fitted(self)
+        features = validate_data(self, features, reset=False)
+
+        predictions = np.full(features.shape[0], self.training_labels_[0])
+        for idx in range(features.shape[0]):
+            feature = features[idx]
+            neighbor_labels = []
+
+            distances = np.sum(
+                (self.training_features_ - feature) ** 2, axis=1
+            )
+            nearest_indices = np.argpartition(
+                distances, self.num_neighbors
+            )[: self.num_neighbors]
+            for neighbor_idx in nearest_indices:
+                neighbor_labels += [self.training_labels_[neighbor_idx]]
+
+            predictions[idx] = Counter(neighbor_labels).most_common(1)[0][0]
+        return predictions
+
+    def score(self, features, labels):
         """Calculate the score of the prediction.
 
         Parameters
         ----------
-        X : ndarray, shape (n_samples, n_features)
+        features : ndarray, shape (n_samples, n_features)
             Data to score on.
-        y : ndarray, shape (n_samples,)
-            target values.
+        labels : ndarray, shape (n_samples,)
+            Target values.
 
         Returns
         ----------
-        score : float
-            Accuracy of the model computed for the (X, y) pairs.
+        accuracy : float
+            Accuracy of the model computed for the (features, labels) pairs.
         """
-        y_pred = self.predict(X)
-        Accu = 0
-        for id in range(X.shape[0]):
-            if y[id] == y_pred[id]:
-                Accu += 1
-        return Accu / X.shape[0]
+
+        predictions = self.predict(features)
+        correct_predictions = 0
+        for idx in range(features.shape[0]):
+            if labels[idx] == predictions[idx]:
+                correct_predictions += 1
+        return correct_predictions / features.shape[0]
 
 
 class MonthlySplit(BaseCrossValidator):
+
     """CrossValidator based on monthly split.
 
-    Split data based on the given `time_col` (or default to index). Each split
-    corresponds to one month of data for the training and the next month of
-    data for the test.
+    Split data based on the given `time_column` (or default to index).
+    Each split corresponds to one month of data for the training
+    and the next month of data for the test.
 
     Parameters
     ----------
-    time_col : str, defaults to 'index'
+    time_column : str, defaults to 'index'
         Column of the input DataFrame that will be used to split the data.
         This column should be of type datetime. If split is called with a
         DataFrame for which this column is not a datetime, it will raise
         a ValueError.
-        To use the index as column just set `time_col` to `'index'`.
+        To use the index as column just set `time_column` to `'index'`.
     """
 
-    def __init__(self, time_col='index'):  # noqa: D107
-        self.time_col = time_col
+    def __init__(self, time_column="index"):  # noqa: D107
+        self.time_column = time_column
 
-    def get_n_splits(self, X, y=None, groups=None):
+    def get_n_splits(
+        self,
+        data,
+        labels=None,
+        groups=None,
+    ):
         """Return the number of splitting iterations in the cross-validator.
 
         Parameters
         ----------
-        X : array-like of shape (n_samples, n_features)
+        data : array-like of shape (n_samples, n_features)
             Training data, where `n_samples` is the number of samples
             and `n_features` is the number of features.
-        y : array-like of shape (n_samples,)
+        labels : array-like of shape (n_samples,)
             Always ignored, exists for compatibility.
         groups : array-like of shape (n_samples,)
             Always ignored, exists for compatibility.
 
         Returns
         -------
-        n_splits : int
+        num_splits : int
             The number of splits.
         """
-        if self.time_col == 'index':
-            if not isinstance(X.index, pd.DatetimeIndex):
-                raise ValueError('datetime')
-            df_tri = X.sort_index()
-            liste_mois = df_tri.index.month
+        if self.time_column == "index":
+            if not isinstance(data.index, pd.DatetimeIndex):
+                raise ValueError("datetime")
+            sorted_data = data.sort_index()
+            months = sorted_data.index.month
 
         else:
-            if not pdtypes.is_datetime64_dtype(X[self.time_col]):
-                raise ValueError('datetime')
-            df_tri = X.sort_values(by=self.time_col)
-            df_tri.index = df_tri[self.time_col]
-            liste_mois = df_tri.index.month
-
-        n_splits = 0
-        for id in range(1, len(liste_mois)):
-            if liste_mois[id] != liste_mois[id - 1]:
-                n_splits += 1
-        return n_splits
-
-    def split(self, X, y, groups=None):
+
+            if not pdtypes.is_datetime64_dtype(data[self.time_column]):
+                raise ValueError("datetime")
+            sorted_data = data.sort_values(by=self.time_column)
+            sorted_data.index = sorted_data[self.time_column]
+            months = sorted_data.index.month
+
+        num_splits = 0
+        for idx in range(1, len(months)):
+            if months[idx] != months[idx - 1]:
+                num_splits += 1
+        return num_splits
+
+    def split(
+        self,
+        data,
+        labels,
+        groups=None,
+    ):
         """Generate indices to split data into training and test set.
 
         Parameters
         ----------
-        X : array-like of shape (n_samples, n_features)
+        data : array-like of shape (n_samples, n_features)
             Training data, where `n_samples` is the number of samples
             and `n_features` is the number of features.
-        y : array-like of shape (n_samples,)
+        labels : array-like of shape (n_samples,)
             Always ignored, exists for compatibility.
         groups : array-like of shape (n_samples,)
             Always ignored, exists for compatibility.
 
         Yields
         ------
-        idx_train : ndarray
+        train_indices : ndarray
             The training set indices for that split.
-        idx_test : ndarray
+        test_indices : ndarray
             The testing set indices for that split.
         """
-        n_splits = self.get_n_splits(X, y, groups)
-
-        if self.time_col == 'index':
-            liste_mois = [sorted(X.index)[0]]
-
-        else:
-            liste_mois = [sorted(X['date'])[0]]
-
-        for mois in range(n_splits):
-            liste_mois += [liste_mois[-1] + pd.DateOffset(months=1)]
-
-        for split in range(n_splits):
-            mois_train = liste_mois[split]
-            mois_test = liste_mois[split + 1]
-            idx_train = []
-            idx_test = []
-
-            for Idx in range(len(X)):
-                if self.time_col == 'index':
-                    date = X.index[Idx]
-                else:
-                    date = X.iloc[Idx]['date']
-
-                if (date.month == mois_train.month and
-                        date.year == mois_train.year):
-                    idx_train.append(Idx)
-
-                elif (date.month == mois_test.month and
-                        date.year == mois_test.year):
-                    idx_test.append(Idx)
-
-            yield (idx_train, idx_test)
-    
\ No newline at end of file
+        num_splits = self.get_n_splits(data, labels, groups)
+
+        if self.time_column == "index":
+            months_list = [sorted(data.index)[0]]
+
+        else:
+            months_list = [sorted(data["date"])[0]]
+
+        for _ in range(num_splits):
+            months_list += [months_list[-1] + pd.DateOffset(months=1)]
+
+        for split_idx in range(num_splits):
+            train_month = months_list[split_idx]
+            test_month = months_list[split_idx + 1]
+            train_indices = []
+            test_indices = []
+
+            for data_idx in range(len(data)):
+                if self.time_column == "index":
+                    current_date = data.index[data_idx]
+                else:
+                    current_date = data.iloc[data_idx]["date"]
+
+                if (
+                    current_date.month == train_month.month
+                    and current_date.year == train_month.year
+                ):
+                    train_indices.append(data_idx)
+                elif (
+                    current_date.month == test_month.month
+                    and current_date.year == test_month.year
+                ):
+
+                    test_indices.append(data_idx)
+
+            yield (train_indices, test_indices)
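
Review note (patch 2): the splitter still walks forward from the earliest timestamp with `pd.DateOffset(months=1)`, which silently assumes every intermediate month is present in the data, and `get_n_splits` compares bare `.month` values, so the same month in two different years compares equal across a gap. Grouping on year-month periods avoids both pitfalls. A minimal sketch with illustrative toy dates:

    import pandas as pd

    dates = pd.DatetimeIndex(["2020-11-03", "2020-12-15",
                              "2021-01-20", "2021-01-21"])
    periods = dates.to_period("M")           # year-month labels, e.g. 2020-11
    months = periods.unique().sort_values()  # 2020-11, 2020-12, 2021-01
    n_splits = len(months) - 1               # one split per consecutive pair
    for train_month, test_month in zip(months[:-1], months[1:]):
        idx_train = (periods == train_month).nonzero()[0]
        idx_test = (periods == test_month).nonzero()[0]

Patch 5 below moves to exactly this `to_period("M")` representation.
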
From c6b921c9ca38d43a0c3066d923253a6ec7080f72 Mon Sep 17 00:00:00 2001
From: clmrie
Date: Thu, 2 Jan 2025 15:40:10 +0100
Subject: [PATCH 3/7] UP my solution

---
 sklearn_questions.py | 51 ++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 49 insertions(+), 2 deletions(-)

diff --git a/sklearn_questions.py b/sklearn_questions.py
index a061416..3b5b2b0 100644
--- a/sklearn_questions.py
+++ b/sklearn_questions.py
@@ -1,5 +1,52 @@
-#!/usr/bin/python
-# -*- coding: utf-8 -*-
+"""Assignment - making a sklearn estimator and cv splitter.
+
+The goal of this assignment is to implement by yourself:
+
+- a scikit-learn estimator for the KNearestNeighbors for classification
+  tasks and check that it is working properly.
+- a scikit-learn CV splitter where the splits are based on a Pandas
+  DateTimeIndex.
+
+Detailed instructions for question 1:
+The nearest neighbor classifier predicts for a point X_i the target y_k of
+the training sample X_k which is the closest to X_i. We measure proximity with
+the Euclidean distance. The model will be evaluated with the accuracy (average
+number of samples correctly classified). You need to implement the `fit`,
+`predict` and `score` methods for this class. The code you write should pass
+the test we implemented. You can run the tests by calling at the root of the
+repo `pytest test_sklearn_questions.py`. Note that to be fully valid, a
+scikit-learn estimator needs to check that the input given to `fit` and
+`predict` are correct using the `check_*` functions imported in the file.
+You can find more information on how they should be used in the following doc:
+https://scikit-learn.org/stable/developers/develop.html#rolling-your-own-estimator.
+Make sure to use them to pass `test_nearest_neighbor_check_estimator`.
+
+
+Detailed instructions for question 2:
+The data to split should contain the index or one column in
+datetime format. Then the aim is to split the data between train and test
+sets when for each pair of successive months, we learn on the first and
+predict on the following. For example if you have data distributed from
+november 2020 to march 2021, you have 4 splits. The first split
+will allow to learn on november data and predict on december data, the
+second split to learn december and predict on january etc.
+
+We also ask you to respect the pep8 convention: https://pep8.org. This will be
+enforced with `flake8`. You can check that there are no flake8 errors by
+calling `flake8` at the root of the repo.
+
+Finally, you need to write docstrings for the methods you code and for the
+class. The docstring will be checked using `pydocstyle` that you can also
+call at the root of the repo.
+
+Hints
+-----
+- You can use the function:
+
+from sklearn.metrics.pairwise import pairwise_distances
+
+to compute distances between 2 sets of samples.
+"""
 
 import numpy as np
 import pandas as pd

From de70ae189092f159056b9055b2e27a36e0c6a82e Mon Sep 17 00:00:00 2001
From: clmrie
Date: Thu, 2 Jan 2025 15:43:53 +0100
Subject: [PATCH 4/7] UP my solution

---
 .DS_Store | Bin 0 -> 6148 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 .DS_Store

diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6
GIT binary patch
literal 6148
[base85-encoded binary payload omitted]
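
Review note (patches 3-4): with the assignment docstring restored, the remaining work is making the estimator pass scikit-learn's API checks; the rewrite in patch 5 below returns to the conventional `X`/`y` names and validation helpers (`check_X_y`, `check_classification_targets`, `pairwise_distances`). Conformance can be smoke-tested directly. A minimal sketch, assuming the module is importable and a recent scikit-learn (which expects an estimator instance, not a class):

    from sklearn.utils.estimator_checks import check_estimator

    from sklearn_questions import KNearestNeighbors

    # Runs scikit-learn's battery of estimator API checks and raises on
    # the first violation; presumably what the assignment's
    # test_nearest_neighbor_check_estimator exercises.
    check_estimator(KNearestNeighbors(n_neighbors=1))
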
From: clmrie
Date: Thu, 2 Jan 2025 15:50:53 +0100
Subject: [PATCH 5/7] UP my solution

---
 sklearn_questions.py | 241 +++++++++++++++++++++----------------------
 1 file changed, 115 insertions(+), 126 deletions(-)

diff --git a/sklearn_questions.py b/sklearn_questions.py
index 3b5b2b0..de7bb3c 100644
--- a/sklearn_questions.py
+++ b/sklearn_questions.py
@@ -50,227 +50,216 @@
 import numpy as np
 import pandas as pd
 
-import pandas.api.types as pdtypes
-
 from sklearn.base import BaseEstimator
 from sklearn.base import ClassifierMixin
 from sklearn.model_selection import BaseCrossValidator
-from sklearn.utils.multiclass import unique_labels
-from sklearn.utils.validation import validate_data, check_is_fitted
-from collections import Counter
+from sklearn.utils.validation import (check_X_y, check_is_fitted,
+                                      validate_data)
+from sklearn.utils.multiclass import check_classification_targets
+from sklearn.metrics.pairwise import pairwise_distances
 
 
 class KNearestNeighbors(ClassifierMixin, BaseEstimator):
-
-    """KNearestNeighbors classifier."""
+    """KNearestNeighbors classifier.
+
+    This class implements a K-Nearest Neighbors classifier for classification
+    tasks. The classifier predicts the label of a test point based on the
+    majority class of its nearest neighbors in the training dataset.
+
+    Parameters
+    ----------
+    n_neighbors : int, default=1
+        Number of neighbors to use for classification.
+    """
 
-    def __init__(self, num_neighbors=1):  # noqa: D107
-        self.num_neighbors = num_neighbors
+    def __init__(self, n_neighbors=1):  # noqa: D107
+        """Initialize the classifier with the specified number of neighbors."""
+        self.n_neighbors = n_neighbors
 
-    def fit(self, features, labels):
+    def fit(self, X, y):
         """Fitting function.
 
+        This method stores the training data and labels for later use
+        during prediction.
+
         Parameters
         ----------
-        features : ndarray, shape (n_samples, n_features)
+        X : ndarray, shape (n_samples, n_features)
             Data to train the model.
-        labels : ndarray, shape (n_samples,)
+        y : ndarray, shape (n_samples,)
             Labels associated with the training data.
 
         Returns
         ----------
         self : instance of KNearestNeighbors
-            The current instance of the classifier
+            The fitted instance of the classifier
         """
-
-        (features, labels) = validate_data(self, features, labels)
-        self.classes_ = unique_labels(labels)
-        self.training_features_ = features
-        self.training_labels_ = labels
+        X, y = check_X_y(X, y)
+        self.X_train_ = X
+        self.y_train_ = y
+        self.n_features_in_ = X.shape[1]
+        check_classification_targets(y)
+        self.classes_ = np.unique(y)
         return self
 
-    def predict(self, features):
+    def predict(self, X):
        """Predict function.
 
         Parameters
         ----------
-        features : ndarray, shape (n_test_samples, n_features)
+        X : ndarray, shape (n_test_samples, n_features)
             Data to predict on.
 
         Returns
         ----------
-        predictions : ndarray, shape (n_test_samples,)
+        y : ndarray, shape (n_test_samples,)
             Predicted class labels for each test data sample.
         """
-
         check_is_fitted(self)
-        features = validate_data(self, features, reset=False)
-
-        predictions = np.full(features.shape[0], self.training_labels_[0])
-        for idx in range(features.shape[0]):
-            feature = features[idx]
-            neighbor_labels = []
-
-            distances = np.sum(
-                (self.training_features_ - feature) ** 2, axis=1
-            )
-            nearest_indices = np.argpartition(
-                distances, self.num_neighbors
-            )[: self.num_neighbors]
-            for neighbor_idx in nearest_indices:
-                neighbor_labels += [self.training_labels_[neighbor_idx]]
-
-            predictions[idx] = Counter(neighbor_labels).most_common(1)[0][0]
-        return predictions
-
-    def score(self, features, labels):
+        X = validate_data(self, X, reset=False)
+        distances = pairwise_distances(X, self.X_train_)
+        nearest_neighbors = np.argsort(distances, axis=1)[:, :self.n_neighbors]
+        unique_classes, y_indices = np.unique(self.y_train_,
+                                              return_inverse=True)
+        neighbor_labels = y_indices[nearest_neighbors]
+        y_pred = np.array([unique_classes[np.bincount(labels).argmax()]
+                           for labels in neighbor_labels])
+        return y_pred
+
+    def score(self, X, y):
         """Calculate the score of the prediction.
 
         Parameters
         ----------
-        features : ndarray, shape (n_samples, n_features)
+        X : ndarray, shape (n_samples, n_features)
             Data to score on.
-        labels : ndarray, shape (n_samples,)
-            Target values.
+        y : ndarray, shape (n_samples,)
+            target values.
 
         Returns
         ----------
-        accuracy : float
-            Accuracy of the model computed for the (features, labels) pairs.
+        score : float
+            Accuracy of the model computed as the
+            fraction of correctly predicted labels.
         """
-
-        predictions = self.predict(features)
-        correct_predictions = 0
-        for idx in range(features.shape[0]):
-            if labels[idx] == predictions[idx]:
-                correct_predictions += 1
-        return correct_predictions / features.shape[0]
+        y_pred = self.predict(X)
+        return np.mean(y_pred == y)
 
 
 class MonthlySplit(BaseCrossValidator):
-
     """CrossValidator based on monthly split.
 
-    Split data based on the given `time_column` (or default to index).
-    Each split corresponds to one month of data for the training
-    and the next month of data for the test.
+    Split data based on the given `time_col` (or default to index). Each split
+    corresponds to one month of data for the training and the next month of
+    data for the test.
 
     Parameters
     ----------
-    time_column : str, defaults to 'index'
+    time_col : str, defaults to 'index'
         Column of the input DataFrame that will be used to split the data.
         This column should be of type datetime. If split is called with a
         DataFrame for which this column is not a datetime, it will raise
         a ValueError.
-        To use the index as column just set `time_column` to `'index'`.
+        To use the index as column just set `time_col` to `'index'`.
     """
 
-    def __init__(self, time_column="index"):  # noqa: D107
-        self.time_column = time_column
+    def __init__(self, time_col='index'):  # noqa: D107
+        self.time_col = time_col
 
-    def get_n_splits(
-        self,
-        data,
-        labels=None,
-        groups=None,
-    ):
+    def get_n_splits(self, X, y=None, groups=None):
         """Return the number of splitting iterations in the cross-validator.
 
         Parameters
         ----------
-        data : array-like of shape (n_samples, n_features)
+        X : array-like of shape (n_samples, n_features)
             Training data, where `n_samples` is the number of samples
             and `n_features` is the number of features.
-        labels : array-like of shape (n_samples,)
+        y : array-like of shape (n_samples,)
             Always ignored, exists for compatibility.
         groups : array-like of shape (n_samples,)
             Always ignored, exists for compatibility.
 
         Returns
         -------
-        num_splits : int
-            The number of splits.
+        n_splits : int
+            The number of splits based on unique months in the data.
         """
-
-        if self.time_column == "index":
-            if not isinstance(data.index, pd.DatetimeIndex):
-                raise ValueError("datetime")
-            sorted_data = data.sort_index()
-            months = sorted_data.index.month
+        if isinstance(X, pd.Series):
+            times = X.index
+        elif isinstance(X, pd.DataFrame):
+            times = X.index if self.time_col == 'index' else X[self.time_col]
         else:
+            raise ValueError("X should be a pandas DataFrame or Series.")
 
-            if not pdtypes.is_datetime64_dtype(data[self.time_column]):
-                raise ValueError("datetime")
-            sorted_data = data.sort_values(by=self.time_column)
-            sorted_data.index = sorted_data[self.time_column]
-            months = sorted_data.index.month
-
-        num_splits = 0
-        for idx in range(1, len(months)):
-            if months[idx] != months[idx - 1]:
-                num_splits += 1
-        return num_splits
-
-    def split(
-        self,
-        data,
-        labels,
-        groups=None,
-    ):
+        if not pd.api.types.is_datetime64_any_dtype(times):
+            raise ValueError("time_col must be a datetime column.")
+        periods = pd.Series(times).dt.to_period("M")
+        return len(periods.unique()) - 1
+
+    def split(self, X, y, groups=None):
         """Generate indices to split data into training and test set.
 
         Parameters
         ----------
-        data : array-like of shape (n_samples, n_features)
+        X : array-like of shape (n_samples, n_features)
             Training data, where `n_samples` is the number of samples
             and `n_features` is the number of features.
-        labels : array-like of shape (n_samples,)
+        y : array-like of shape (n_samples,)
             Always ignored, exists for compatibility.
         groups : array-like of shape (n_samples,)
             Always ignored, exists for compatibility.
 
         Yields
         ------
-        train_indices : ndarray
+        idx_train : ndarray
             The training set indices for that split.
-        test_indices : ndarray
+        idx_test : ndarray
             The testing set indices for that split.
         """
+        # Determine time column
+        if isinstance(X, pd.DataFrame):
+            if self.time_col == 'index':
+                times = X.index
+            else:
+                times = X[self.time_col]
+        elif isinstance(X, pd.Series):
+            times = X.index
+        else:
+            raise ValueError("X should be a pandas DataFrame or Series.")
+
+        # Ensure time column is datetime
+        if not pd.api.types.is_datetime64_any_dtype(times):
+            raise ValueError("time_col must be a datetime column.")
 
-        num_splits = self.get_n_splits(data, labels, groups)
+        # Create a copy of the data
+        X_copy = X.copy()
+        y_copy = y.copy() if y is not None else None
 
-        if self.time_column == "index":
-            months_list = [sorted(data.index)[0]]
+        # Sort the copy of the data by time
+        if isinstance(X_copy, pd.DataFrame) and self.time_col != 'index':
+            sorted_data = X_copy.sort_values(by=self.time_col)
         else:
+            sorted_data = X_copy.sort_index()
 
-            months_list = [sorted(data["date"])[0]]
+        # Extract the sorted indices
+        sorted_indices = sorted_data.index
 
-        for _ in range(num_splits):
-            months_list += [months_list[-1] + pd.DateOffset(months=1)]
+        # Map sorted indices to original indices
+        times = pd.Series(times.values, index=sorted_indices).sort_index()
 
-        for split_idx in range(num_splits):
-            train_month = months_list[split_idx]
-            test_month = months_list[split_idx + 1]
-            train_indices = []
-            test_indices = []
+        # Sort y_copy if it exists
+        if y_copy is not None:
+            y_copy = y_copy.loc[sorted_indices]
 
-            for data_idx in range(len(data)):
-                if self.time_column == "index":
-                    current_date = data.index[data_idx]
-                else:
-                    current_date = data.iloc[data_idx]["date"]
+        # Group by unique months
+        periods = times.dt.to_period("M")
+        unique_periods = sorted(periods.unique())
 
-                if (
-                    current_date.month == train_month.month
-                    and current_date.year == train_month.year
-                ):
-                    train_indices.append(data_idx)
-                elif (
-                    current_date.month == test_month.month
-                    and current_date.year == test_month.year
-                ):
+        n_splits = self.get_n_splits(X_copy, y_copy, groups)
 
-                    test_indices.append(data_idx)
+        for i in range(n_splits):
+            idx_train = np.where(periods == unique_periods[i])[0]
+            idx_test = np.where(periods == unique_periods[i + 1])[0]
 
-            yield (train_indices, test_indices)
+            yield idx_train, idx_test
\ No newline at end of file

From fb2223d300449fc7e05407e112bfdb3bcfae68d4 Mon Sep 17 00:00:00 2001
From: clmrie
Date: Thu, 2 Jan 2025 15:59:56 +0100
Subject: [PATCH 6/7] UP my solution

---
 sklearn_questions.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn_questions.py b/sklearn_questions.py
index de7bb3c..0dd2d6f 100644
--- a/sklearn_questions.py
+++ b/sklearn_questions.py
@@ -262,4 +262,4 @@ def split(self, X, y, groups=None):
             idx_train = np.where(periods == unique_periods[i])[0]
             idx_test = np.where(periods == unique_periods[i + 1])[0]
 
-            yield idx_train, idx_test
\ No newline at end of file
+            yield idx_train, idx_test
From e368040a75be0d5bc8720c97de04f0e3db1093a9 Mon Sep 17 00:00:00 2001
From: clmrie
Date: Thu, 2 Jan 2025 16:10:55 +0100
Subject: [PATCH 7/7] Remove .DS_Store file

---
 .DS_Store | Bin 6148 -> 0 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 .DS_Store

diff --git a/.DS_Store b/.DS_Store
deleted file mode 100644
index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 6148
[base85-encoded binary payload omitted]
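
Review note (series): taken together, the patches leave an estimator and a splitter that plug into the usual scikit-learn machinery. An end-to-end sketch on synthetic data (illustrative values, again assuming the final module is importable with a recent scikit-learn release, since the module imports `validate_data`):

    import numpy as np
    import pandas as pd
    from sklearn.model_selection import cross_val_score

    from sklearn_questions import KNearestNeighbors, MonthlySplit

    rng = np.random.default_rng(0)
    index = pd.date_range("2020-11-01", periods=150, freq="D")  # Nov-Mar
    X = pd.DataFrame(rng.normal(size=(150, 2)), index=index,
                     columns=["f1", "f2"])
    y = (X["f1"] > 0).astype(int)  # toy labels tied to the first feature

    scores = cross_val_score(KNearestNeighbors(n_neighbors=3), X, y,
                             cv=MonthlySplit(time_col="index"))
    print(scores)  # one accuracy per month-to-month split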