From 0985aaffe7ad49d99ec8a6a2d99d6704638b3744 Mon Sep 17 00:00:00 2001 From: "mikolaj.mierzejewski" Date: Fri, 18 Aug 2023 16:18:27 +0200 Subject: [PATCH] add frequency encoder --- examples/08_frequence_encoder.ipynb | 408 ++++++++++++++++++++++++++++ skrub/_frequency_encoder.py | 35 +++ 2 files changed, 443 insertions(+) create mode 100644 examples/08_frequence_encoder.ipynb create mode 100644 skrub/_frequency_encoder.py diff --git a/examples/08_frequence_encoder.ipynb b/examples/08_frequence_encoder.ipynb new file mode 100644 index 000000000..a74ac8c4c --- /dev/null +++ b/examples/08_frequence_encoder.ipynb @@ -0,0 +1,408 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "from skrub.datasets import fetch_road_safety" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "road_safety_dataset = fetch_road_safety()" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Vehicle_Type\n", + "9.0 271120\n", + "1.0 20281\n", + "19.0 19612\n", + "3.0 9966\n", + "5.0 8268\n", + "11.0 7758\n", + "8.0 7474\n", + "21.0 6785\n", + "4.0 2446\n", + "2.0 2392\n", + "20.0 2318\n", + "90.0 1740\n", + "10.0 906\n", + "98.0 790\n", + "17.0 650\n", + "97.0 301\n", + "22.0 226\n", + "16.0 119\n", + "18.0 20\n", + "23.0 9\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "road_safety_dataset.X.Vehicle_Type.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { 
+ "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "employee_salaries.X.Vehicle_Type.value_counts().sort_values().plot(kind=\"bar\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's say we want to group together the values basing on their frequencies so in the end we will end with 6 values instead of 20. That will reduce the number of features when we want to perform other operations - like one-hot encoding and clustering. " + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Vehicle_Type\n", + "23.0 9\n", + "18.0 20\n", + "16.0 119\n", + "22.0 226\n", + "97.0 301\n", + "17.0 650\n", + "98.0 790\n", + "10.0 906\n", + "90.0 1740\n", + "20.0 2318\n", + "2.0 2392\n", + "4.0 2446\n", + "21.0 6785\n", + "8.0 7474\n", + "11.0 7758\n", + "5.0 8268\n", + "3.0 9966\n", + "19.0 19612\n", + "1.0 20281\n", + "9.0 271120\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "road_safety_dataset.X.Vehicle_Type.value_counts().sort_values()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's say we want to prepare ranges for the bins basing on their counts. We see that there are vehicle types that could be grouped into similiar categories based on the frequencies. Our bins would look like this:\n", + "[9, 650, 1740, 6785, 19612, 271120]\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "from _frequency_encoder import FrequencyEncoder" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "# you need to put np.inf at the end of the list. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "fe = FrequencyEncoder(column = \"Vehicle_Type\", bins = [9, 650, 1740, 6785, 19612, 271120, np.inf])" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
FrequencyEncoder(bins=[9, 650, 1740, 6785, 19612, 271120, inf],\n",
+       "                 column='Vehicle_Type')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "FrequencyEncoder(bins=[9, 650, 1740, 6785, 19612, 271120, inf],\n", + " column='Vehicle_Type')" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fe.fit(X=road_safety_dataset.X)" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 [19612.0, 271120.0)\n", + "1 [271120.0, inf)\n", + "2 [271120.0, inf)\n", + "3 [271120.0, inf)\n", + "4 [19612.0, 271120.0)\n", + " ... \n", + "363238 [271120.0, inf)\n", + "363239 [271120.0, inf)\n", + "363240 [271120.0, inf)\n", + "363241 [271120.0, inf)\n", + "363242 [271120.0, inf)\n", + "Name: Vehicle_Type, Length: 363243, dtype: category\n", + "Categories (6, interval[float64, left]): [[9.0, 650.0) < [650.0, 1740.0) < [1740.0, 6785.0) < [6785.0, 19612.0) < [19612.0, 271120.0) < [271120.0, inf)]" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fe.transform(road_safety_dataset.X)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 [19612.0, 271120.0)\n", + "1 [271120.0, inf)\n", + "2 [271120.0, inf)\n", + "3 [271120.0, inf)\n", + "4 [19612.0, 271120.0)\n", + " ... 
\n", + "363238 [271120.0, inf)\n", + "363239 [271120.0, inf)\n", + "363240 [271120.0, inf)\n", + "363241 [271120.0, inf)\n", + "363242 [271120.0, inf)\n", + "Name: Vehicle_Type, Length: 363243, dtype: category\n", + "Categories (6, interval[float64, left]): [[9.0, 650.0) < [650.0, 1740.0) < [1740.0, 6785.0) < [6785.0, 19612.0) < [19612.0, 271120.0) < [271120.0, inf)]" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fe.transform(employee_salaries.X)" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Vehicle_Type\n", + "[271120.0, inf) 271120\n", + "[6785.0, 19612.0) 40251\n", + "[19612.0, 271120.0) 39893\n", + "[1740.0, 6785.0) 8896\n", + "[650.0, 1740.0) 2346\n", + "[9.0, 650.0) 675\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fe.transform(road_safety_dataset.X).value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Voila, now you have values encoded according to chosen ranges basing on frequencies.\n", + "Soon we will have automatic way to compute this basing on the quantiles. " + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
"""
Implements the FrequencyEncoder, a transformer that encodes a feature
using the frequency of its values.
"""

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.exceptions import NotFittedError


class FrequencyEncoder(TransformerMixin, BaseEstimator):
    """Encode one column by binning the frequency of each of its values.

    During ``fit`` the number of occurrences of every distinct value of
    ``column`` is counted, and each count is assigned to one of the
    half-open intervals ``[bins[i], bins[i + 1])`` with ``pandas.cut``.
    During ``transform`` every value of the column is replaced by the
    interval its training frequency fell into, so values with similar
    frequencies end up in the same category.

    Parameters
    ----------
    column : hashable
        Label of the column of ``X`` to encode.
    bins : int, sequence of scalars, or pandas.IntervalIndex
        Bin specification forwarded unchanged to ``pandas.cut``. When
        explicit edges are given, the intervals are closed on the left and
        open on the right, so the last edge must be strictly greater than
        the largest count (for instance ``np.inf``) for the most frequent
        value to be assigned a bin.

    Attributes
    ----------
    uniques_to_map : pandas.Series or None
        ``None`` until ``fit`` is called. Afterwards, a categorical series
        indexed by the distinct values seen in ``column`` whose entries are
        the frequency intervals those values were assigned to.
    """

    def __init__(self, column, bins):
        self.column = column
        self.bins = bins
        # Fitted state; kept under this name (and initialised to None) so
        # existing code reading the attribute keeps working.
        self.uniques_to_map = None

    def fit(self, X: pd.DataFrame, y=None):
        """Learn the value -> frequency-interval mapping from ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Training data; must contain ``self.column``.
        y : ignored
            Present only for scikit-learn API compatibility.

        Returns
        -------
        FrequencyEncoder
            The fitted encoder (``self``).

        Raises
        ------
        KeyError
            If ``self.column`` is not a column of ``X``.
        """
        # value_counts() skips missing values by default, so NaN never
        # becomes a key of the learned mapping.
        frequencies = X[self.column].value_counts()
        # right=False -> intervals are [left, right). Counts falling outside
        # every interval are mapped to NaN by pandas.cut.
        self.uniques_to_map = pd.cut(frequencies, self.bins, right=False)
        return self

    def transform(self, X: pd.DataFrame) -> pd.Series:
        """Replace each value of ``self.column`` by its frequency interval.

        Parameters
        ----------
        X : pandas.DataFrame
            Data to encode; must contain ``self.column``.

        Returns
        -------
        pandas.Series
            Series aligned with ``X`` holding, for every row, the interval
            learned in ``fit`` for that row's value. Values that were not
            seen during ``fit``, or whose count fell outside ``bins``, are
            NaN.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called yet.
        KeyError
            If ``self.column`` is not a column of ``X``.
        """
        if self.uniques_to_map is None:
            # Fail early with an explicit message instead of letting
            # Series.map(None) raise an opaque TypeError.
            raise NotFittedError(
                "This FrequencyEncoder instance is not fitted yet. "
                "Call 'fit' before using 'transform'."
            )
        return X[self.column].map(self.uniques_to_map)