From b28dcd083652ef1bc1f255494b76af34fa0ca620 Mon Sep 17 00:00:00 2001
From: clmrie
Date: Fri, 20 Dec 2024 22:19:27 +0100
Subject: [PATCH 1/7] UP my solution

---
 sklearn_questions.py | 98 ++++++++++++++++++++++++++++++++++++--------
 1 file changed, 82 insertions(+), 16 deletions(-)

diff --git a/sklearn_questions.py b/sklearn_questions.py
index fa02e0d..ad91ed2 100644
--- a/sklearn_questions.py
+++ b/sklearn_questions.py
@@ -50,18 +50,20 @@
 import numpy as np
 import pandas as pd
 
+import pandas.api.types as pdtypes
+
 from sklearn.base import BaseEstimator
 from sklearn.base import ClassifierMixin
 from sklearn.model_selection import BaseCrossValidator
+from sklearn.utils.multiclass import unique_labels
+from sklearn.utils.validation import validate_data, check_is_fitted
 
-from sklearn.utils.validation import check_X_y, check_is_fitted
-from sklearn.utils.validation import check_array
-from sklearn.utils.multiclass import check_classification_targets
-from sklearn.metrics.pairwise import pairwise_distances
+from collections import Counter
 
 
-class KNearestNeighbors(BaseEstimator, ClassifierMixin):
+class KNearestNeighbors(ClassifierMixin, BaseEstimator):
     """KNearestNeighbors classifier."""
 
     def __init__(self, n_neighbors=1):  # noqa: D107
@@ -82,6 +84,10 @@ def fit(self, X, y):
         self : instance of KNearestNeighbors
             The current instance of the classifier
         """
+        X, y = validate_data(self, X, y)
+        self.classes_ = unique_labels(y)
+        self.X_ = X
+        self.y_ = y
         return self
 
     def predict(self, X):
@@ -97,7 +103,21 @@ def predict(self, X):
         y : ndarray, shape (n_test_samples,)
             Predicted class labels for each test data sample.
         """
-        y_pred = np.zeros(X.shape[0])
+        check_is_fitted(self)
+        X = validate_data(self, X, reset=False)
+
+        y_pred = np.full(X.shape[0], self.y_[0])
+        for id in range(X.shape[0]):
+            x = X[id]
+            liste_y = []
+
+            list_dis = np.sum((self.X_ - x) ** 2, axis=1)
+            list_Id_min = np.argpartition(list_dis,
+                                          self.n_neighbors)[:self.n_neighbors]
+            for Id_min in list_Id_min:
+                liste_y += [self.y_[Id_min]]
+
+            y_pred[id] = Counter(liste_y).most_common(1)[0][0]
         return y_pred
 
     def score(self, X, y):
@@ -115,7 +135,12 @@ def score(self, X, y):
         score : float
             Accuracy of the model computed for the (X, y) pairs.
         """
-        return 0.
+        y_pred = self.predict(X)
+        Accu = 0
+        for id in range(X.shape[0]):
+            if y[id] == y_pred[id]:
+                Accu += 1
+        return Accu / X.shape[0]
 
 
 class MonthlySplit(BaseCrossValidator):
@@ -155,7 +180,24 @@ def get_n_splits(self, X, y=None, groups=None):
         n_splits : int
             The number of splits.
         """
-        return 0
+        if self.time_col == 'index':
+            if not isinstance(X.index, pd.DatetimeIndex):
+                raise ValueError('datetime')
+            df_tri = X.sort_index()
+            liste_mois = df_tri.index.month
+
+        else:
+            if not pdtypes.is_datetime64_dtype(X[self.time_col]):
+                raise ValueError('datetime')
+            df_tri = X.sort_values(by=self.time_col)
+            df_tri.index = df_tri[self.time_col]
+            liste_mois = df_tri.index.month
+
+        n_splits = 0
+        for id in range(1, len(liste_mois)):
+            if liste_mois[id] != liste_mois[id - 1]:
+                n_splits += 1
+        return n_splits
 
     def split(self, X, y, groups=None):
         """Generate indices to split data into training and test set.
 
         Parameters
         ----------
         X : array-like of shape (n_samples, n_features)
             Training data, where `n_samples` is the number of samples
             and `n_features` is the number of features.
         y : array-like of shape (n_samples,)
             Always ignored, exists for compatibility.
         groups : array-like of shape (n_samples,)
             Always ignored, exists for compatibility.
 
         Yields
         ------
         idx_train : ndarray
             The training set indices for that split.
         idx_test : ndarray
             The testing set indices for that split.
         """
-
-        n_samples = X.shape[0]
         n_splits = self.get_n_splits(X, y, groups)
-        for i in range(n_splits):
-            idx_train = range(n_samples)
-            idx_test = range(n_samples)
-            yield (
-                idx_train, idx_test
-            )
+
+        if self.time_col == 'index':
+            liste_mois = [sorted(X.index)[0]]
+
+        else:
+            liste_mois = [sorted(X['date'])[0]]
+
+        for mois in range(n_splits):
+            liste_mois += [liste_mois[-1] + pd.DateOffset(months=1)]
+
+        for split in range(n_splits):
+            mois_train = liste_mois[split]
+            mois_test = liste_mois[split + 1]
+            idx_train = []
+            idx_test = []
+
+            for Idx in range(len(X)):
+                if self.time_col == 'index':
+                    date = X.index[Idx]
+                else:
+                    date = X.iloc[Idx]['date']
+
+                if (date.month == mois_train.month and
+                        date.year == mois_train.year):
+                    idx_train.append(Idx)
+
+                elif (date.month == mois_test.month and
+                        date.year == mois_test.year):
+                    idx_test.append(Idx)
+
+            yield (idx_train, idx_test)
+    
\ No newline at end of file
From 4ef52c11a08595a49e355eb36f34a3997baa7e7d Mon Sep 17 00:00:00 2001
From: clmrie
Date: Fri, 20 Dec 2024 22:46:01 +0100
Subject: [PATCH 2/7] UP my solution

---
 sklearn_questions.py | 267 ++++++++++++++++++++-----------------------
 1 file changed, 121 insertions(+), 146 deletions(-)

diff --git a/sklearn_questions.py b/sklearn_questions.py
index ad91ed2..a061416 100644
--- a/sklearn_questions.py
+++ b/sklearn_questions.py
@@ -1,52 +1,5 @@
-"""Assignment - making a sklearn estimator and cv splitter.
-
-The goal of this assignment is to implement by yourself:
-
-- a scikit-learn estimator for the KNearestNeighbors for classification
-  tasks and check that it is working properly.
-- a scikit-learn CV splitter where the splits are based on a Pandas
-  DateTimeIndex.
-
-Detailed instructions for question 1:
-The nearest neighbor classifier predicts for a point X_i the target y_k of
-the training sample X_k which is the closest to X_i. We measure proximity with
-the Euclidean distance. The model will be evaluated with the accuracy (average
-number of samples correctly classified). You need to implement the `fit`,
-`predict` and `score` methods for this class. The code you write should pass
-the test we implemented. You can run the tests by calling at the root of the
-repo `pytest test_sklearn_questions.py`. Note that to be fully valid, a
-scikit-learn estimator needs to check that the input given to `fit` and
-`predict` are correct using the `check_*` functions imported in the file.
-You can find more information on how they should be used in the following doc:
-https://scikit-learn.org/stable/developers/develop.html#rolling-your-own-estimator.
-Make sure to use them to pass `test_nearest_neighbor_check_estimator`.
-
-
-Detailed instructions for question 2:
-The data to split should contain the index or one column in
-datetime format. Then the aim is to split the data between train and test
-sets when for each pair of successive months, we learn on the first and
-predict on the following. For example if you have data distributed from
-november 2020 to march 2021, you have 4 splits. The first split
-will allow to learn on november data and predict on december data, the
-second split to learn december and predict on january etc.
-
-We also ask you to respect the pep8 convention: https://pep8.org. This will be
-enforced with `flake8`. You can check that there are no flake8 errors by
-calling `flake8` at the root of the repo.
-
-Finally, you need to write docstrings for the methods you code and for the
-class. The docstring will be checked using `pydocstyle` that you can also
-call at the root of the repo.
-
-Hints
------
-- You can use the function:
-
-from sklearn.metrics.pairwise import pairwise_distances
-
-to compute distances between 2 sets of samples.
-"""
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
 
 import numpy as np
 import pandas as pd
@@ -59,24 +12,24 @@
 from sklearn.utils.multiclass import unique_labels
 from sklearn.utils.validation import validate_data, check_is_fitted
 
-
 from collections import Counter
 
 
 class KNearestNeighbors(ClassifierMixin, BaseEstimator):
+
     """KNearestNeighbors classifier."""
 
-    def __init__(self, n_neighbors=1):  # noqa: D107
-        self.n_neighbors = n_neighbors
+    def __init__(self, num_neighbors=1):  # noqa: D107
+        self.num_neighbors = num_neighbors
 
-    def fit(self, X, y):
+    def fit(self, features, labels):
         """Fitting function.
 
         Parameters
         ----------
-        X : ndarray, shape (n_samples, n_features)
+        features : ndarray, shape (n_samples, n_features)
             Data to train the model.
-        y : ndarray, shape (n_samples,)
+        labels : ndarray, shape (n_samples,)
             Labels associated with the training data.
 
         Returns
         ----------
         self : instance of KNearestNeighbors
             The current instance of the classifier
         """
-        X, y = validate_data(self, X, y)
-        self.classes_ = unique_labels(y)
-        self.X_ = X
-        self.y_ = y
+
+        (features, labels) = validate_data(self, features, labels)
+        self.classes_ = unique_labels(labels)
+        self.training_features_ = features
+        self.training_labels_ = labels
         return self
 
-    def predict(self, X):
+    def predict(self, features):
         """Predict function.
 
         Parameters
         ----------
-        X : ndarray, shape (n_test_samples, n_features)
+        features : ndarray, shape (n_test_samples, n_features)
             Data to predict on.
 
         Returns
         ----------
-        y : ndarray, shape (n_test_samples,)
+        predictions : ndarray, shape (n_test_samples,)
             Predicted class labels for each test data sample.
         """
-        check_is_fitted(self)
-        X = validate_data(self, X, reset=False)
-
-        y_pred = np.full(X.shape[0], self.y_[0])
-        for id in range(X.shape[0]):
-            x = X[id]
-            liste_y = []
-
-            list_dis = np.sum((self.X_ - x) ** 2, axis=1)
-            list_Id_min = np.argpartition(list_dis,
-                                          self.n_neighbors)[:self.n_neighbors]
-            for Id_min in list_Id_min:
-                liste_y += [self.y_[Id_min]]
-
-            y_pred[id] = Counter(liste_y).most_common(1)[0][0]
-        return y_pred
-
-    def score(self, X, y):
+
+        check_is_fitted(self)
+        features = validate_data(self, features, reset=False)
+
+        predictions = np.full(features.shape[0], self.training_labels_[0])
+        for idx in range(features.shape[0]):
+            feature = features[idx]
+            neighbor_labels = []
+
+            distances = np.sum(
+                (self.training_features_ - feature) ** 2, axis=1
+            )
+            nearest_indices = np.argpartition(
+                distances, self.num_neighbors
+            )[: self.num_neighbors]
+            for neighbor_idx in nearest_indices:
+                neighbor_labels += [self.training_labels_[neighbor_idx]]
+
+            predictions[idx] = Counter(neighbor_labels).most_common(1)[0][0]
+        return predictions
+
+    def score(self, features, labels):
         """Calculate the score of the prediction.
 
         Parameters
         ----------
-        X : ndarray, shape (n_samples, n_features)
+        features : ndarray, shape (n_samples, n_features)
             Data to score on.
-        y : ndarray, shape (n_samples,)
-            target values.
+        labels : ndarray, shape (n_samples,)
+            Target values.
 
         Returns
         ----------
-        score : float
-            Accuracy of the model computed for the (X, y) pairs.
+        accuracy : float
+            Accuracy of the model computed for the (features, labels) pairs.
         """
-        y_pred = self.predict(X)
-        Accu = 0
-        for id in range(X.shape[0]):
-            if y[id] == y_pred[id]:
-                Accu += 1
-        return Accu / X.shape[0]
+
+        predictions = self.predict(features)
+        correct_predictions = 0
+        for idx in range(features.shape[0]):
+            if labels[idx] == predictions[idx]:
+                correct_predictions += 1
+        return correct_predictions / features.shape[0]
 
 
 class MonthlySplit(BaseCrossValidator):
+
     """CrossValidator based on monthly split.
 
-    Split data based on the given `time_col` (or default to index). Each split
-    corresponds to one month of data for the training and the next month of
-    data for the test.
+    Split data based on the given `time_column` (or default to index).
+    Each split corresponds to one month of data for the training
+    and the next month of data for the test.
 
     Parameters
     ----------
-    time_col : str, defaults to 'index'
+    time_column : str, defaults to 'index'
         Column of the input DataFrame that will be used to split the data.
         This column should be of type datetime. If split is called with a
         DataFrame for which this column is not a datetime, it will raise
         a ValueError.
-        To use the index as column just set `time_col` to `'index'`.
+        To use the index as column just set `time_column` to `'index'`.
     """
 
-    def __init__(self, time_col='index'):  # noqa: D107
-        self.time_col = time_col
+    def __init__(self, time_column="index"):  # noqa: D107
+        self.time_column = time_column
 
-    def get_n_splits(self, X, y=None, groups=None):
+    def get_n_splits(
+        self,
+        data,
+        labels=None,
+        groups=None,
+    ):
         """Return the number of splitting iterations in the cross-validator.
 
         Parameters
         ----------
-        X : array-like of shape (n_samples, n_features)
+        data : array-like of shape (n_samples, n_features)
             Training data, where `n_samples` is the number of samples
             and `n_features` is the number of features.
-        y : array-like of shape (n_samples,)
+        labels : array-like of shape (n_samples,)
             Always ignored, exists for compatibility.
         groups : array-like of shape (n_samples,)
             Always ignored, exists for compatibility.
 
         Returns
         -------
-        n_splits : int
+        num_splits : int
             The number of splits.
         """
-        if self.time_col == 'index':
-            if not isinstance(X.index, pd.DatetimeIndex):
-                raise ValueError('datetime')
-            df_tri = X.sort_index()
-            liste_mois = df_tri.index.month
+        if self.time_column == "index":
+            if not isinstance(data.index, pd.DatetimeIndex):
+                raise ValueError("datetime")
+            sorted_data = data.sort_index()
+            months = sorted_data.index.month
 
         else:
-            if not pdtypes.is_datetime64_dtype(X[self.time_col]):
-                raise ValueError('datetime')
-            df_tri = X.sort_values(by=self.time_col)
-            df_tri.index = df_tri[self.time_col]
-            liste_mois = df_tri.index.month
-
-        n_splits = 0
-        for id in range(1, len(liste_mois)):
-            if liste_mois[id] != liste_mois[id - 1]:
-                n_splits += 1
-        return n_splits
-
-    def split(self, X, y, groups=None):
+
+            if not pdtypes.is_datetime64_dtype(data[self.time_column]):
+                raise ValueError("datetime")
+            sorted_data = data.sort_values(by=self.time_column)
+            sorted_data.index = sorted_data[self.time_column]
+            months = sorted_data.index.month
+
+        num_splits = 0
+        for idx in range(1, len(months)):
+            if months[idx] != months[idx - 1]:
+                num_splits += 1
+        return num_splits
+
+    def split(
+        self,
+        data,
+        labels,
+        groups=None,
+    ):
         """Generate indices to split data into training and test set.
 
         Parameters
         ----------
-        X : array-like of shape (n_samples, n_features)
+        data : array-like of shape (n_samples, n_features)
             Training data, where `n_samples` is the number of samples
             and `n_features` is the number of features.
-        y : array-like of shape (n_samples,)
+        labels : array-like of shape (n_samples,)
             Always ignored, exists for compatibility.
         groups : array-like of shape (n_samples,)
             Always ignored, exists for compatibility.
 
         Yields
         ------
-        idx_train : ndarray
+        train_indices : ndarray
             The training set indices for that split.
-        idx_test : ndarray
+        test_indices : ndarray
             The testing set indices for that split.
         """
-        n_splits = self.get_n_splits(X, y, groups)
-
-        if self.time_col == 'index':
-            liste_mois = [sorted(X.index)[0]]
-
-        else:
-            liste_mois = [sorted(X['date'])[0]]
-
-        for mois in range(n_splits):
-            liste_mois += [liste_mois[-1] + pd.DateOffset(months=1)]
-
-        for split in range(n_splits):
-            mois_train = liste_mois[split]
-            mois_test = liste_mois[split + 1]
-            idx_train = []
-            idx_test = []
-
-            for Idx in range(len(X)):
-                if self.time_col == 'index':
-                    date = X.index[Idx]
-                else:
-                    date = X.iloc[Idx]['date']
-
-                if (date.month == mois_train.month and
-                        date.year == mois_train.year):
-                    idx_train.append(Idx)
-
-                elif (date.month == mois_test.month and
-                        date.year == mois_test.year):
-                    idx_test.append(Idx)
-
-            yield (idx_train, idx_test)
-    
\ No newline at end of file
+        num_splits = self.get_n_splits(data, labels, groups)
+
+        if self.time_column == "index":
+            months_list = [sorted(data.index)[0]]
+
+        else:
+            months_list = [sorted(data["date"])[0]]
+
+        for _ in range(num_splits):
+            months_list += [months_list[-1] + pd.DateOffset(months=1)]
+
+        for split_idx in range(num_splits):
+            train_month = months_list[split_idx]
+            test_month = months_list[split_idx + 1]
+            train_indices = []
+            test_indices = []
+
+            for data_idx in range(len(data)):
+                if self.time_column == "index":
+                    current_date = data.index[data_idx]
+                else:
+                    current_date = data.iloc[data_idx]["date"]
+
+                if (
+                    current_date.month == train_month.month
+                    and current_date.year == train_month.year
+                ):
+                    train_indices.append(data_idx)
+                elif (
+                    current_date.month == test_month.month
+                    and current_date.year == test_month.year
+                ):
+
+                    test_indices.append(data_idx)
+
+            yield (train_indices, test_indices)
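
Review note (patch 2): the splitter still walks forward from the earliest timestamp with `pd.DateOffset(months=1)`, which silently assumes every intermediate month is present in the data, and `get_n_splits` compares bare `.month` values, so the same month in two different years compares equal across a gap. Grouping on year-month periods avoids both pitfalls. A minimal sketch with illustrative toy dates:

    import pandas as pd

    dates = pd.DatetimeIndex(["2020-11-03", "2020-12-15",
                              "2021-01-20", "2021-01-21"])
    periods = dates.to_period("M")           # year-month labels, e.g. 2020-11
    months = periods.unique().sort_values()  # 2020-11, 2020-12, 2021-01
    n_splits = len(months) - 1               # one split per consecutive pair
    for train_month, test_month in zip(months[:-1], months[1:]):
        idx_train = (periods == train_month).nonzero()[0]
        idx_test = (periods == test_month).nonzero()[0]

Patch 5 below moves to exactly this `to_period("M")` representation.
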
From c6b921c9ca38d43a0c3066d923253a6ec7080f72 Mon Sep 17 00:00:00 2001
From: clmrie
Date: Thu, 2 Jan 2025 15:40:10 +0100
Subject: [PATCH 3/7] UP my solution

---
 sklearn_questions.py | 51 ++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 49 insertions(+), 2 deletions(-)

diff --git a/sklearn_questions.py b/sklearn_questions.py
index a061416..3b5b2b0 100644
--- a/sklearn_questions.py
+++ b/sklearn_questions.py
@@ -1,5 +1,52 @@
-#!/usr/bin/python
-# -*- coding: utf-8 -*-
+"""Assignment - making a sklearn estimator and cv splitter.
+
+The goal of this assignment is to implement by yourself:
+
+- a scikit-learn estimator for the KNearestNeighbors for classification
+  tasks and check that it is working properly.
+- a scikit-learn CV splitter where the splits are based on a Pandas
+  DateTimeIndex.
+
+Detailed instructions for question 1:
+The nearest neighbor classifier predicts for a point X_i the target y_k of
+the training sample X_k which is the closest to X_i. We measure proximity with
+the Euclidean distance. The model will be evaluated with the accuracy (average
+number of samples correctly classified). You need to implement the `fit`,
+`predict` and `score` methods for this class. The code you write should pass
+the test we implemented. You can run the tests by calling at the root of the
+repo `pytest test_sklearn_questions.py`. Note that to be fully valid, a
+scikit-learn estimator needs to check that the input given to `fit` and
+`predict` are correct using the `check_*` functions imported in the file.
+You can find more information on how they should be used in the following doc:
+https://scikit-learn.org/stable/developers/develop.html#rolling-your-own-estimator.
+Make sure to use them to pass `test_nearest_neighbor_check_estimator`.
+
+
+Detailed instructions for question 2:
+The data to split should contain the index or one column in
+datetime format. Then the aim is to split the data between train and test
+sets when for each pair of successive months, we learn on the first and
+predict on the following. For example if you have data distributed from
+november 2020 to march 2021, you have 4 splits. The first split
+will allow to learn on november data and predict on december data, the
+second split to learn december and predict on january etc.
+
+We also ask you to respect the pep8 convention: https://pep8.org. This will be
+enforced with `flake8`. You can check that there are no flake8 errors by
+calling `flake8` at the root of the repo.
+
+Finally, you need to write docstrings for the methods you code and for the
+class. The docstring will be checked using `pydocstyle` that you can also
+call at the root of the repo.
+
+Hints
+-----
+- You can use the function:
+
+from sklearn.metrics.pairwise import pairwise_distances
+
+to compute distances between 2 sets of samples.
+"""
 
 import numpy as np
 import pandas as pd

From de70ae189092f159056b9055b2e27a36e0c6a82e Mon Sep 17 00:00:00 2001
From: clmrie
Date: Thu, 2 Jan 2025 15:43:53 +0100
Subject: [PATCH 4/7] UP my solution

---
 .DS_Store | Bin 0 -> 6148 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 .DS_Store

diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6
GIT binary patch
literal 6148
[base85-encoded binary payload omitted]
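
Review note (patches 3-4): with the assignment docstring restored, the remaining work is making the estimator pass scikit-learn's API checks; the rewrite in patch 5 below returns to the conventional `X`/`y` names and validation helpers (`check_X_y`, `check_classification_targets`, `pairwise_distances`). Conformance can be smoke-tested directly. A minimal sketch, assuming the module is importable and a recent scikit-learn (which expects an estimator instance, not a class):

    from sklearn.utils.estimator_checks import check_estimator

    from sklearn_questions import KNearestNeighbors

    # Runs scikit-learn's battery of estimator API checks and raises on
    # the first violation; presumably what the assignment's
    # test_nearest_neighbor_check_estimator exercises.
    check_estimator(KNearestNeighbors(n_neighbors=1))
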
From: clmrie
Date: Thu, 2 Jan 2025 15:50:53 +0100
Subject: [PATCH 5/7] UP my solution

---
 sklearn_questions.py | 241 +++++++++++++++++++++----------------------
 1 file changed, 115 insertions(+), 126 deletions(-)

diff --git a/sklearn_questions.py b/sklearn_questions.py
index 3b5b2b0..de7bb3c 100644
--- a/sklearn_questions.py
+++ b/sklearn_questions.py
@@ -50,227 +50,216 @@
 import numpy as np
 import pandas as pd
 
-import pandas.api.types as pdtypes
-
 from sklearn.base import BaseEstimator
 from sklearn.base import ClassifierMixin
 from sklearn.model_selection import BaseCrossValidator
-from sklearn.utils.multiclass import unique_labels
-from sklearn.utils.validation import validate_data, check_is_fitted
-from collections import Counter
+from sklearn.utils.validation import (check_X_y, check_is_fitted,
+                                      validate_data)
+from sklearn.utils.multiclass import check_classification_targets
+from sklearn.metrics.pairwise import pairwise_distances
 
 
 class KNearestNeighbors(ClassifierMixin, BaseEstimator):
-
-    """KNearestNeighbors classifier."""
+    """KNearestNeighbors classifier.
+
+    This class implements a K-Nearest Neighbors classifier for classification
+    tasks. The classifier predicts the label of a test point based on the
+    majority class of its nearest neighbors in the training dataset.
+
+    Parameters
+    ----------
+    n_neighbors : int, default=1
+        Number of neighbors to use for classification.
+    """
 
-    def __init__(self, num_neighbors=1):  # noqa: D107
-        self.num_neighbors = num_neighbors
+    def __init__(self, n_neighbors=1):  # noqa: D107
+        """Initialize the classifier with the specified number of neighbors."""
+        self.n_neighbors = n_neighbors
 
-    def fit(self, features, labels):
+    def fit(self, X, y):
         """Fitting function.
 
+        This method stores the training data and labels for later use
+        during prediction.
+
         Parameters
         ----------
-        features : ndarray, shape (n_samples, n_features)
+        X : ndarray, shape (n_samples, n_features)
             Data to train the model.
-        labels : ndarray, shape (n_samples,)
+        y : ndarray, shape (n_samples,)
             Labels associated with the training data.
 
         Returns
         ----------
         self : instance of KNearestNeighbors
-            The current instance of the classifier
+            The fitted instance of the classifier
         """
-
-        (features, labels) = validate_data(self, features, labels)
-        self.classes_ = unique_labels(labels)
-        self.training_features_ = features
-        self.training_labels_ = labels
+        X, y = check_X_y(X, y)
+        self.X_train_ = X
+        self.y_train_ = y
+        self.n_features_in_ = X.shape[1]
+        check_classification_targets(y)
+        self.classes_ = np.unique(y)
         return self
 
-    def predict(self, features):
+    def predict(self, X):
        """Predict function.
 
         Parameters
         ----------
-        features : ndarray, shape (n_test_samples, n_features)
+        X : ndarray, shape (n_test_samples, n_features)
             Data to predict on.
 
         Returns
         ----------
-        predictions : ndarray, shape (n_test_samples,)
+        y : ndarray, shape (n_test_samples,)
             Predicted class labels for each test data sample.
         """
-
         check_is_fitted(self)
-        features = validate_data(self, features, reset=False)
-
-        predictions = np.full(features.shape[0], self.training_labels_[0])
-        for idx in range(features.shape[0]):
-            feature = features[idx]
-            neighbor_labels = []
-
-            distances = np.sum(
-                (self.training_features_ - feature) ** 2, axis=1
-            )
-            nearest_indices = np.argpartition(
-                distances, self.num_neighbors
-            )[: self.num_neighbors]
-            for neighbor_idx in nearest_indices:
-                neighbor_labels += [self.training_labels_[neighbor_idx]]
-
-            predictions[idx] = Counter(neighbor_labels).most_common(1)[0][0]
-        return predictions
-
-    def score(self, features, labels):
+        X = validate_data(self, X, reset=False)
+        distances = pairwise_distances(X, self.X_train_)
+        nearest_neighbors = np.argsort(distances, axis=1)[:, :self.n_neighbors]
+        unique_classes, y_indices = np.unique(self.y_train_,
+                                              return_inverse=True)
+        neighbor_labels = y_indices[nearest_neighbors]
+        y_pred = np.array([unique_classes[np.bincount(labels).argmax()]
+                           for labels in neighbor_labels])
+        return y_pred
+
+    def score(self, X, y):
         """Calculate the score of the prediction.
 
         Parameters
         ----------
-        features : ndarray, shape (n_samples, n_features)
+        X : ndarray, shape (n_samples, n_features)
             Data to score on.
-        labels : ndarray, shape (n_samples,)
-            Target values.
+        y : ndarray, shape (n_samples,)
+            target values.
 
         Returns
         ----------
-        accuracy : float
-            Accuracy of the model computed for the (features, labels) pairs.
+        score : float
+            Accuracy of the model computed as the
+            fraction of correctly predicted labels.
         """
-
-        predictions = self.predict(features)
-        correct_predictions = 0
-        for idx in range(features.shape[0]):
-            if labels[idx] == predictions[idx]:
-                correct_predictions += 1
-        return correct_predictions / features.shape[0]
+        y_pred = self.predict(X)
+        return np.mean(y_pred == y)
 
 
 class MonthlySplit(BaseCrossValidator):
-
     """CrossValidator based on monthly split.
 
-    Split data based on the given `time_column` (or default to index).
-    Each split corresponds to one month of data for the training
-    and the next month of data for the test.
+    Split data based on the given `time_col` (or default to index). Each split
+    corresponds to one month of data for the training and the next month of
+    data for the test.
 
     Parameters
     ----------
-    time_column : str, defaults to 'index'
+    time_col : str, defaults to 'index'
         Column of the input DataFrame that will be used to split the data.
         This column should be of type datetime. If split is called with a
         DataFrame for which this column is not a datetime, it will raise
         a ValueError.
-        To use the index as column just set `time_column` to `'index'`.
+        To use the index as column just set `time_col` to `'index'`.
     """
 
-    def __init__(self, time_column="index"):  # noqa: D107
-        self.time_column = time_column
+    def __init__(self, time_col='index'):  # noqa: D107
+        self.time_col = time_col
 
-    def get_n_splits(
-        self,
-        data,
-        labels=None,
-        groups=None,
-    ):
+    def get_n_splits(self, X, y=None, groups=None):
         """Return the number of splitting iterations in the cross-validator.
 
         Parameters
         ----------
-        data : array-like of shape (n_samples, n_features)
+        X : array-like of shape (n_samples, n_features)
             Training data, where `n_samples` is the number of samples
             and `n_features` is the number of features.
-        labels : array-like of shape (n_samples,)
+        y : array-like of shape (n_samples,)
             Always ignored, exists for compatibility.
         groups : array-like of shape (n_samples,)
             Always ignored, exists for compatibility.
 
         Returns
         -------
-        num_splits : int
-            The number of splits.
+        n_splits : int
+            The number of splits based on unique months in the data.
         """
-
-        if self.time_column == "index":
-            if not isinstance(data.index, pd.DatetimeIndex):
-                raise ValueError("datetime")
-            sorted_data = data.sort_index()
-            months = sorted_data.index.month
+        if isinstance(X, pd.Series):
+            times = X.index
+        elif isinstance(X, pd.DataFrame):
+            times = X.index if self.time_col == 'index' else X[self.time_col]
         else:
+            raise ValueError("X should be a pandas DataFrame or Series.")
 
-            if not pdtypes.is_datetime64_dtype(data[self.time_column]):
-                raise ValueError("datetime")
-            sorted_data = data.sort_values(by=self.time_column)
-            sorted_data.index = sorted_data[self.time_column]
-            months = sorted_data.index.month
-
-        num_splits = 0
-        for idx in range(1, len(months)):
-            if months[idx] != months[idx - 1]:
-                num_splits += 1
-        return num_splits
-
-    def split(
-        self,
-        data,
-        labels,
-        groups=None,
-    ):
+        if not pd.api.types.is_datetime64_any_dtype(times):
+            raise ValueError("time_col must be a datetime column.")
+        periods = pd.Series(times).dt.to_period("M")
+        return len(periods.unique()) - 1
+
+    def split(self, X, y, groups=None):
         """Generate indices to split data into training and test set.
 
         Parameters
         ----------
-        data : array-like of shape (n_samples, n_features)
+        X : array-like of shape (n_samples, n_features)
             Training data, where `n_samples` is the number of samples
             and `n_features` is the number of features.
-        labels : array-like of shape (n_samples,)
+        y : array-like of shape (n_samples,)
             Always ignored, exists for compatibility.
         groups : array-like of shape (n_samples,)
             Always ignored, exists for compatibility.
 
         Yields
         ------
-        train_indices : ndarray
+        idx_train : ndarray
             The training set indices for that split.
-        test_indices : ndarray
+        idx_test : ndarray
             The testing set indices for that split.
         """
+        # Determine time column
+        if isinstance(X, pd.DataFrame):
+            if self.time_col == 'index':
+                times = X.index
+            else:
+                times = X[self.time_col]
+        elif isinstance(X, pd.Series):
+            times = X.index
+        else:
+            raise ValueError("X should be a pandas DataFrame or Series.")
+
+        # Ensure time column is datetime
+        if not pd.api.types.is_datetime64_any_dtype(times):
+            raise ValueError("time_col must be a datetime column.")
 
-        num_splits = self.get_n_splits(data, labels, groups)
+        # Create a copy of the data
+        X_copy = X.copy()
+        y_copy = y.copy() if y is not None else None
 
-        if self.time_column == "index":
-            months_list = [sorted(data.index)[0]]
+        # Sort the copy of the data by time
+        if isinstance(X_copy, pd.DataFrame) and self.time_col != 'index':
+            sorted_data = X_copy.sort_values(by=self.time_col)
         else:
+            sorted_data = X_copy.sort_index()
 
-            months_list = [sorted(data["date"])[0]]
+        # Extract the sorted indices
+        sorted_indices = sorted_data.index
 
-        for _ in range(num_splits):
-            months_list += [months_list[-1] + pd.DateOffset(months=1)]
+        # Map sorted indices to original indices
+        times = pd.Series(times.values, index=sorted_indices).sort_index()
 
-        for split_idx in range(num_splits):
-            train_month = months_list[split_idx]
-            test_month = months_list[split_idx + 1]
-            train_indices = []
-            test_indices = []
+        # Sort y_copy if it exists
+        if y_copy is not None:
+            y_copy = y_copy.loc[sorted_indices]
 
-            for data_idx in range(len(data)):
-                if self.time_column == "index":
-                    current_date = data.index[data_idx]
-                else:
-                    current_date = data.iloc[data_idx]["date"]
+        # Group by unique months
+        periods = times.dt.to_period("M")
+        unique_periods = sorted(periods.unique())
 
-                if (
-                    current_date.month == train_month.month
-                    and current_date.year == train_month.year
-                ):
-                    train_indices.append(data_idx)
-                elif (
-                    current_date.month == test_month.month
-                    and current_date.year == test_month.year
-                ):
+        n_splits = self.get_n_splits(X_copy, y_copy, groups)
 
-                    test_indices.append(data_idx)
+        for i in range(n_splits):
+            idx_train = np.where(periods == unique_periods[i])[0]
+            idx_test = np.where(periods == unique_periods[i + 1])[0]
 
-            yield (train_indices, test_indices)
+            yield idx_train, idx_test
\ No newline at end of file

From fb2223d300449fc7e05407e112bfdb3bcfae68d4 Mon Sep 17 00:00:00 2001
From: clmrie
Date: Thu, 2 Jan 2025 15:59:56 +0100
Subject: [PATCH 6/7] UP my solution

---
 sklearn_questions.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn_questions.py b/sklearn_questions.py
index de7bb3c..0dd2d6f 100644
--- a/sklearn_questions.py
+++ b/sklearn_questions.py
@@ -262,4 +262,4 @@ def split(self, X, y, groups=None):
             idx_train = np.where(periods == unique_periods[i])[0]
             idx_test = np.where(periods == unique_periods[i + 1])[0]
 
-            yield idx_train, idx_test
\ No newline at end of file
+            yield idx_train, idx_test
From e368040a75be0d5bc8720c97de04f0e3db1093a9 Mon Sep 17 00:00:00 2001
From: clmrie
Date: Thu, 2 Jan 2025 16:10:55 +0100
Subject: [PATCH 7/7] Remove .DS_Store file

---
 .DS_Store | Bin 6148 -> 0 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 .DS_Store

diff --git a/.DS_Store b/.DS_Store
deleted file mode 100644
index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 6148
[base85-encoded binary payload omitted]
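
Review note (series): taken together, the patches leave an estimator and a splitter that plug into the usual scikit-learn machinery. An end-to-end sketch on synthetic data (illustrative values, again assuming the final module is importable with a recent scikit-learn release, since the module imports `validate_data`):

    import numpy as np
    import pandas as pd
    from sklearn.model_selection import cross_val_score

    from sklearn_questions import KNearestNeighbors, MonthlySplit

    rng = np.random.default_rng(0)
    index = pd.date_range("2020-11-01", periods=150, freq="D")  # Nov-Mar
    X = pd.DataFrame(rng.normal(size=(150, 2)), index=index,
                     columns=["f1", "f2"])
    y = (X["f1"] > 0).astype(int)  # toy labels tied to the first feature

    scores = cross_val_score(KNearestNeighbors(n_neighbors=3), X, y,
                             cv=MonthlySplit(time_col="index"))
    print(scores)  # one accuracy per month-to-month split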