Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 50 additions & 14 deletions sklearn_questions.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,16 +52,18 @@

from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
from typing import Counter

from sklearn.model_selection import BaseCrossValidator

from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.utils.validation import check_array
from sklearn.utils.validation import check_is_fitted
from sklearn.utils.validation import validate_data
from sklearn.utils.multiclass import check_classification_targets
from sklearn.utils.multiclass import unique_labels
from sklearn.metrics.pairwise import pairwise_distances


class KNearestNeighbors(BaseEstimator, ClassifierMixin):
class KNearestNeighbors(ClassifierMixin, BaseEstimator):
"""KNearestNeighbors classifier."""

def __init__(self, n_neighbors=1): # noqa: D107
Expand All @@ -76,12 +78,15 @@ def fit(self, X, y):
Data to train the model.
y : ndarray, shape (n_samples,)
Labels associated with the training data.

Returns
----------
self : instance of KNearestNeighbors
The current instance of the classifier
"""
X, y = validate_data(self, X, y)
self.X_train_ = X
self.y_train_ = y
self.classes_ = unique_labels(y)
return self

def predict(self, X):
Expand All @@ -97,7 +102,16 @@ def predict(self, X):
y : ndarray, shape (n_test_samples,)
Predicted class labels for each test data sample.
"""
y_pred = np.zeros(X.shape[0])
check_is_fitted(self, ['X_train_', 'y_train_'])

X = validate_data(self, X, reset=False)
distances = pairwise_distances(self.X_train_, X, metric="euclidean")
nearest_indices = np.argsort(distances, axis=0)[:self.n_neighbors]
nearest_labels = self.y_train_[nearest_indices]
y_pred = np.apply_along_axis(
lambda labels: Counter(labels).most_common(1)[0][0],
axis=0, arr=nearest_labels
)
return y_pred

def score(self, X, y):
Expand All @@ -115,7 +129,10 @@ def score(self, X, y):
score : float
Accuracy of the model computed for the (X, y) pairs.
"""
return 0.
check_classification_targets(y)
y_pred = self.predict(X)
accuracy = np.sum(y_pred == y)/len(y)
return accuracy


class MonthlySplit(BaseCrossValidator):
Expand Down Expand Up @@ -155,7 +172,20 @@ def get_n_splits(self, X, y=None, groups=None):
n_splits : int
The number of splits.
"""
return 0
X2 = X.copy()

if self.time_col != 'index':
X2 = X2.set_index(self.time_col)

if not isinstance(X.index, pd.DatetimeIndex):
X2.index = pd.to_datetime(X2.index)

time_data = pd.to_datetime(X2.index)

self.months = time_data.to_period('M')
self.unique_months = sorted(pd.unique(self.months))

return len(self.unique_months) - 1

def split(self, X, y, groups=None):
"""Generate indices to split data into training and test set.
Expand All @@ -177,12 +207,18 @@ def split(self, X, y, groups=None):
idx_test : ndarray
The testing set indices for that split.
"""

n_samples = X.shape[0]
n_splits = self.get_n_splits(X, y, groups)
if n_splits < 1:
raise ValueError(
"not enough values in datetime to split")

for i in range(n_splits):
idx_train = range(n_samples)
idx_test = range(n_samples)
yield (
idx_train, idx_test
)
train_month = self.unique_months[i]
test_month = self.unique_months[i + 1]

idx_train = [idx for idx, month in enumerate(
self.months) if month == train_month]
idx_test = [idx for idx, month in enumerate(
self.months) if month == test_month]

yield idx_train, idx_test
Loading