x-datascience-datacamp · marija-brkic · Dec 20, 2024
diff --git a/sklearn_questions.py b/sklearn_questions.py
@@ -52,16 +52,18 @@
 
 from sklearn.base import BaseEstimator
 from sklearn.base import ClassifierMixin
+from collections import Counter  # typing.Counter is a deprecated alias
 
 from sklearn.model_selection import BaseCrossValidator
 
-from sklearn.utils.validation import check_X_y, check_is_fitted
-from sklearn.utils.validation import check_array
+from sklearn.utils.validation import check_is_fitted
+from sklearn.utils.validation import validate_data
 from sklearn.utils.multiclass import check_classification_targets
+from sklearn.utils.multiclass import unique_labels
 from sklearn.metrics.pairwise import pairwise_distances
 
 
-class KNearestNeighbors(BaseEstimator, ClassifierMixin):
+class KNearestNeighbors(ClassifierMixin, BaseEstimator):
     """KNearestNeighbors classifier."""
 
     def __init__(self, n_neighbors=1):  # noqa: D107
@@ -76,12 +78,15 @@ def fit(self, X, y):
             Data to train the model.
         y : ndarray, shape (n_samples,)
             Labels associated with the training data.
 
         Returns
         ----------
         self : instance of KNearestNeighbors
             The current instance of the classifier
         """
+        self.X_train_, self.y_train_ = validate_data(self, X, y)
+        check_classification_targets(self.y_train_)  # reject continuous y
+        self.classes_ = unique_labels(self.y_train_)
         return self
 
     def predict(self, X):
@@ -97,7 +102,16 @@ def predict(self, X):
         y : ndarray, shape (n_test_samples,)
             Predicted class labels for each test data sample.
         """
-        y_pred = np.zeros(X.shape[0])
+        check_is_fitted(self, ['X_train_', 'y_train_'])
+
+        X = validate_data(self, X, reset=False)
+        distances = pairwise_distances(self.X_train_, X, metric="euclidean")
+        nearest_indices = np.argsort(distances, axis=0)[:self.n_neighbors]
+        nearest_labels = self.y_train_[nearest_indices]
+        y_pred = np.array(
+            [Counter(col).most_common(1)[0][0] for col in nearest_labels.T],
+            dtype=self.y_train_.dtype,  # keep full-width string labels
+        )
         return y_pred
 
     def score(self, X, y):
@@ -115,7 +129,10 @@ def score(self, X, y):
         score : float
             Accuracy of the model computed for the (X, y) pairs.
         """
-        return 0.
+        check_classification_targets(y)
+        hits = self.predict(X) == y  # element-wise match against truth
+        n_correct = np.sum(hits)
+        return n_correct / len(y)
 
 
 class MonthlySplit(BaseCrossValidator):
@@ -155,7 +172,20 @@ def get_n_splits(self, X, y=None, groups=None):
         n_splits : int
             The number of splits.
         """
-        return 0
+        if self.time_col == 'index':
+            time_values = X.index
+        else:
+            time_values = X[self.time_col]
+
+        # Period('M') labels, one per row, in the original row order.
+        months = pd.DatetimeIndex(pd.to_datetime(time_values)).to_period('M')
+
+        # NOTE: split() reads these cached attributes.
+        self.months = months
+        self.unique_months = sorted(pd.unique(months))
+
+        # A single (or no) month yields no train/test pair, never -1.
+        return max(len(self.unique_months) - 1, 0)
 
     def split(self, X, y, groups=None):
         """Generate indices to split data into training and test set.
@@ -177,12 +207,18 @@ def split(self, X, y, groups=None):
         idx_test : ndarray
             The testing set indices for that split.
         """
-
-        n_samples = X.shape[0]
         n_splits = self.get_n_splits(X, y, groups)
+        if n_splits < 1:
+            raise ValueError(
+                "not enough values in datetime to split")
+
         for i in range(n_splits):
-            idx_train = range(n_samples)
-            idx_test = range(n_samples)
-            yield (
-                idx_train, idx_test
-            )
+            train_month = self.unique_months[i]
+            test_month = self.unique_months[i + 1]
+
+            # Vectorised positional lookup: ndarray indices, as documented,
+            # instead of a per-row Python loop over Period objects.
+            idx_train = np.flatnonzero(self.months == train_month)
+            idx_test = np.flatnonzero(self.months == test_month)
+
+            yield idx_train, idx_test