diff --git a/docs/examples/Home Team Win-Loss Modeling/Home Team Win-Loss Data Prep.ipynb b/docs/examples/Home Team Win-Loss Modeling/Home Team Win-Loss Data Prep.ipynb new file mode 100644 index 00000000..2d02deb6 --- /dev/null +++ b/docs/examples/Home Team Win-Loss Modeling/Home Team Win-Loss Data Prep.ipynb @@ -0,0 +1,2081 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# NBA Win-Loss Modeling Data Prep" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "This Jupyter notebook walks you through the process of engineering features from Team, Player, and Game data for the purpose of creating an NBA Home Team Win-Loss Binary Classification Model. For the intuition behind the model, see https://medium.com/@theresearchlab/create-an-nba-win-loss-model-w-68-precision-d1c6a21f0ded" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from nba_api.stats.static import teams, players\n", + "from nba_api.stats.endpoints import cumestatsteamgames, cumestatsteam, gamerotation\n", + "import pandas as pd\n", + "import numpy as np\n", + "import json\n", + "import difflib\n", + "import time\n", + "import requests" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Implement a retry wrapper as a Python decorator function. This prevents transient HTTP timeouts during API calls from stopping the notebook run. 
# Retry Wrapper
def retry(func, retries=3, delay=30, exceptions=None):
    """Wrap ``func`` so that failing calls are retried before giving up.

    Works as a bare decorator (``@retry``) or as a direct call, e.g.
    ``retry(func, retries=5, delay=10)``.

    Parameters
    ----------
    func : callable
        The function to protect.
    retries : int, default 3
        Maximum number of attempts. At least one attempt is always made.
    delay : float, default 30
        Seconds to sleep between two attempts.
    exceptions : exception type or iterable of exception types, optional
        Exceptions that trigger a retry. Defaults to
        ``requests.exceptions.RequestException``.

    Returns
    -------
    callable
        A wrapper with the same call signature and metadata as ``func``.

    Raises
    ------
    Exception
        The last caught exception is re-raised once every attempt has
        failed, instead of silently returning ``None``.
    """
    import functools  # local import keeps this cell self-contained

    if exceptions is None:
        # Resolved lazily so that `requests` is only needed for the default.
        exceptions = (requests.exceptions.RequestException,)
    elif not isinstance(exceptions, type):
        exceptions = tuple(exceptions)

    @functools.wraps(func)
    def retry_wrapper(*args, **kwargs):
        attempts = 0
        while True:
            try:
                return func(*args, **kwargs)
            except exceptions as e:
                attempts += 1
                print(e)
                if attempts >= retries:
                    # Out of attempts: surface the real error to the caller.
                    raise
                # Only wait when another attempt is actually coming.
                time.sleep(delay)

    return retry_wrapper
# Get Season Schedule Function

def getSeasonScheduleFrame(seasons,seasonType):
    """Build one schedule DataFrame covering every team for the given seasons.

    Parameters
    ----------
    seasons : iterable of int
        Season start years, e.g. ``[2020, 2021]``; each is converted to the
        ``"2020-21"`` style string before being sent to the API.
    seasonType : str
        Passed through as ``season_type_all_star`` to ``CumeStatsTeamGames``.

    Returns
    -------
    pandas.DataFrame
        The de-duplicated API rows plus the derived columns ``SEASON``,
        ``GAME_DATE``, ``HOME_TEAM_NICKNAME``, ``HOME_TEAM_ID``,
        ``AWAY_TEAM_NICKNAME`` and ``AWAY_TEAM_ID``, with a fresh 0..n-1 index.

    Notes
    -----
    The string helpers below treat ``MATCHUP`` as
    ``"<10-character date><away nickname> at<home nickname>"``: everything
    before ``' at'`` is date + away team, everything after is the home team.
    """

    # Get date from string (first 10 characters of the part before ' at')
    def getGameDate(matchup):
        return matchup.partition(' at')[0][:10]

    # Get Home team from string (everything after ' at')
    def getHomeTeam(matchup):
        return matchup.partition(' at')[2]

    # Get Away team from string (what follows the 10-character date)
    def getAwayTeam(matchup):
        return matchup.partition(' at')[0][10:]

    # Match nickname from schedule to team table to find ID
    def getTeamIDFromNickname(nickname):
        # The parsed nicknames carry a leading space; strip it so that the
        # fuzzy match compares like with like.
        matches = difflib.get_close_matches(nickname.strip(),teamLookup['nickname'],1)
        if not matches:
            raise IndexError("No team nickname matches " + repr(nickname))
        # Select the 'id' column by name rather than by position.
        return teamLookup.loc[teamLookup['nickname'] == matches[0],'id'].iloc[0]

    @retry
    def getRegularSeasonSchedule(season,teamID,seasonType):
        season = str(season) + "-" + str(season+1)[-2:] # Convert year to season format ie. 2020 -> 2020-21
        teamGames = cumestatsteamgames.CumeStatsTeamGames(league_id = '00',season = season ,
                                                          season_type_all_star=seasonType,
                                                          team_id = teamID).get_normalized_json()

        teamGames = pd.DataFrame(json.loads(teamGames)['CumeStatsTeamGames'])
        teamGames['SEASON'] = season
        return teamGames

    # Get team lookup table
    teamLookup = pd.DataFrame(teams.get_teams())

    # Get teams schedule for each team for each season.
    # Frames are collected in a list and concatenated once: DataFrame.append
    # was removed in pandas 2.0 and re-copied the whole frame on every call.
    teamFrames = []

    for season in seasons:
        for teamID in teamLookup['id']:
            time.sleep(1)  # pause between consecutive API requests
            teamGames = getRegularSeasonSchedule(season,teamID,seasonType)
            if teamGames is None:
                # A retry wrapper that gave up may hand back None; skip it
                # loudly instead of corrupting the concatenation.
                print("No schedule returned for team " + str(teamID) + ", season " + str(season))
                continue
            teamFrames.append(teamGames)

    if teamFrames:
        scheduleFrame = pd.concat(teamFrames)
    else:
        # Nothing fetched: keep the columns used below so the derived-column
        # steps still run and an empty schedule is returned.
        scheduleFrame = pd.DataFrame(columns=['MATCHUP','GAME_ID','SEASON'])

    scheduleFrame['GAME_DATE'] = pd.to_datetime(scheduleFrame['MATCHUP'].map(getGameDate))
    scheduleFrame['HOME_TEAM_NICKNAME'] = scheduleFrame['MATCHUP'].map(getHomeTeam)
    scheduleFrame['HOME_TEAM_ID'] = scheduleFrame['HOME_TEAM_NICKNAME'].map(getTeamIDFromNickname)
    scheduleFrame['AWAY_TEAM_NICKNAME'] = scheduleFrame['MATCHUP'].map(getAwayTeam)
    scheduleFrame['AWAY_TEAM_ID'] = scheduleFrame['AWAY_TEAM_NICKNAME'].map(getTeamIDFromNickname)
    scheduleFrame = scheduleFrame.drop_duplicates() # There's a row for both teams, only need 1
    scheduleFrame = scheduleFrame.reset_index(drop=True)

    return scheduleFrame
# Get Single Game aggregation columns

def getSingleGameMetrics(gameID,homeTeamID,awayTeamID,awayTeamNickname,seasonYear,gameDate,seasonType="Regular Season"):
    """Fetch the two-row team box score for one game and add derived metrics.

    Parameters
    ----------
    gameID : str
        Game identifier passed to ``CumeStatsTeam`` as ``game_ids``.
    homeTeamID : int
        Team the stats are requested for; its totals land in row 0.
    awayTeamID : int
        Written into ``TEAM_ID`` of row 1.
    awayTeamNickname : str
        Written into ``NICKNAME`` of row 1.
    seasonYear : str
        Season string sent to the API and stored in ``SEASON``.
    gameDate : datetime-like
        Stored as-is in ``GAME_DATE``.
    seasonType : str, default "Regular Season"
        Sent as ``season_type_all_star``. The default keeps the previous
        hard-coded behaviour; pass the value used to build the schedule
        when working with another season type.

    Returns
    -------
    pandas.DataFrame
        The ``TotalTeamStats`` rows with ``OFFENSIVE_EFFICIENCY``,
        ``SCORING_MARGIN``, ``SEASON``, ``GAME_DATE`` and ``GAME_ID`` added.
    """

    @retry
    def getGameStats(teamID,gameID,seasonYear):
        gameStats = cumestatsteam.CumeStatsTeam(game_ids=gameID,league_id ="00",
                                               season=seasonYear,season_type_all_star=seasonType,
                                               team_id = teamID).get_normalized_json()

        gameStats = pd.DataFrame(json.loads(gameStats)['TotalTeamStats'])

        return gameStats

    def getOffensiveEfficiency(row):
        # (FG + AST) / (FGA - OFF_REB + AST + TOTAL_TURNOVERS)
        made = data.at[row,'FG'] + data.at[row,'AST']
        used = data.at[row,'FGA'] - data.at[row,'OFF_REB'] + data.at[row,'AST'] + data.at[row,'TOTAL_TURNOVERS']
        return made/used

    data = getGameStats(homeTeamID,gameID,seasonYear)

    # Row 1 is the opponent aggregate; relabel it with the away team's identity.
    data.at[1,'NICKNAME'] = awayTeamNickname
    data.at[1,'TEAM_ID'] = awayTeamID

    # Same two formulas for both rows; the margin is own points minus the
    # other row's points. Row 1 is handled first, as before.
    for row, otherRow in ((1,0),(0,1)):
        data.at[row,'OFFENSIVE_EFFICIENCY'] = getOffensiveEfficiency(row)
        data.at[row,'SCORING_MARGIN'] = data.at[row,'PTS'] - data.at[otherRow,'PTS']

    data['SEASON'] = seasonYear
    data['GAME_DATE'] = gameDate
    data['GAME_ID'] = gameID

    return data
gameDF.sort_values(by='GAME_DATE').groupby(['TEAM_ID','SEASON'])['W_HOME'].cumsum()\n", + " gameDF['HOME_WIN_PCTG'] = gameDF['HOME_WINS']/gameDF['HOME_GAMES_PLAYED']\n", + " return gameDF.drop(['HOME_GAMES_PLAYED','HOME_WINS'],axis=1)\n", + "\n", + " def getAwayWinPctg(gameDF):\n", + " gameDF['AWAY_GAMES_PLAYED'] = gameDF.sort_values(by='GAME_DATE').groupby(['TEAM_ID','SEASON'])['AWAY_FLAG'].cumsum()\n", + " gameDF['AWAY_WINS'] = gameDF.sort_values(by='GAME_DATE').groupby(['TEAM_ID','SEASON'])['W_ROAD'].cumsum()\n", + " gameDF['AWAY_WIN_PCTG'] = gameDF['AWAY_WINS']/gameDF['AWAY_GAMES_PLAYED']\n", + " return gameDF.drop(['AWAY_GAMES_PLAYED','AWAY_WINS'],axis=1)\n", + "\n", + " def getRollingOE(gameDF):\n", + " gameDF['ROLLING_OE'] = gameDF.sort_values(by='GAME_DATE').groupby(['TEAM_ID','SEASON'])['OFFENSIVE_EFFICIENCY'].transform(lambda x: x.rolling(3, 1).mean())\n", + "\n", + " def getRollingScoringMargin(gameDF):\n", + " gameDF['ROLLING_SCORING_MARGIN'] = gameDF.sort_values(by='GAME_DATE').groupby(['TEAM_ID','SEASON'])['SCORING_MARGIN'].transform(lambda x: x.rolling(3, 1).mean())\n", + "\n", + " def getRestDays(gameDF):\n", + " gameDF['LAST_GAME_DATE'] = gameDF.sort_values(by='GAME_DATE').groupby(['TEAM_ID','SEASON'])['GAME_DATE'].shift(1)\n", + " gameDF['NUM_REST_DAYS'] = (gameDF['GAME_DATE'] - gameDF['LAST_GAME_DATE'])/np.timedelta64(1,'D') \n", + " return gameDF.drop('LAST_GAME_DATE',axis=1)\n", + " \n", + " start = time.perf_counter_ns()\n", + "\n", + " i = int(len(gameLogs)/2) #Can use a previously completed gameLog dataset\n", + "\n", + " while i\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CITYNICKNAMETEAM_IDWLW_HOMEL_HOMEW_ROADL_ROADTEAM_TURNOVERS...BLKPTSAVG_REBAVG_PTSDQOFFENSIVE_EFFICIENCYSCORING_MARGINSEASONGAME_DATEGAME_ID
0ClevelandCavaliers16106127391010002...311749.0117.000.5887857.02020-212021-03-170022000620
1OPPONENTSCeltics16106127380100010...911048.0110.000.567308-7.02020-212021-03-170022000620
\n", + "

2 rows × 41 columns

\n", + "" + ], + "text/plain": [ + " CITY NICKNAME TEAM_ID W L W_HOME L_HOME W_ROAD L_ROAD \\\n", + "0 Cleveland Cavaliers 1610612739 1 0 1 0 0 0 \n", + "1 OPPONENTS Celtics 1610612738 0 1 0 0 0 1 \n", + "\n", + " TEAM_TURNOVERS ... BLK PTS AVG_REB AVG_PTS DQ OFFENSIVE_EFFICIENCY \\\n", + "0 2 ... 3 117 49.0 117.0 0 0.588785 \n", + "1 0 ... 9 110 48.0 110.0 0 0.567308 \n", + "\n", + " SCORING_MARGIN SEASON GAME_DATE GAME_ID \n", + "0 7.0 2020-21 2021-03-17 0022000620 \n", + "1 -7.0 2020-21 2021-03-17 0022000620 \n", + "\n", + "[2 rows x 41 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Example Output of Single Game Metrics\n", + "getSingleGameMetrics(scheduleFrame.at[104,'GAME_ID'],scheduleFrame.at[104,'HOME_TEAM_ID'],\n", + " scheduleFrame.at[104,'AWAY_TEAM_ID'],scheduleFrame.at[104,'AWAY_TEAM_NICKNAME'],\n", + " scheduleFrame.at[104,'SEASON'],scheduleFrame.at[104,'GAME_DATE'])" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 0.027592645000000002\n", + "100 3.761211635\n", + "200 7.446148228333333\n", + "300 10.91657379\n", + "400 14.178322275\n", + "500 17.562468318333334\n", + "600 22.57064343333333\n", + "700 28.885029056666667\n", + "800 32.53894844333333\n", + "900 36.77510828333333\n", + "1000 41.57496655333333\n", + "1100 45.66484153166667\n", + "1200 48.904151811666665\n", + "1300 52.46264781666666\n", + "1400 55.85839063666666\n", + "1500 60.04402927833333\n", + "1600 64.00290142\n", + "1700 67.65499090166666\n", + "1800 71.13305232833333\n", + "1900 75.12400890166667\n", + "2000 78.45767208000001\n", + "2100 82.388200135\n", + "2200 85.89361625166666\n", + "2300 90.73073644\n", + "2400 92.95489135999999\n", + "2500 95.15283247333333\n", + "2600 97.83685436833333\n", + "2700 101.011523535\n", + "2800 104.15464245833333\n", + "2900 108.15016159333332\n" + ] + } + ], + 
"source": [ + "#Create the gameLogs DataFrame\n", + "gameLogs = pd.DataFrame()\n", + "gameLogs = getGameLogs(gameLogs,scheduleFrame)\n", + "gameLogs.to_csv('gameLogs.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CITYNICKNAMETEAM_IDWLW_HOMEL_HOMEW_ROADL_ROADTEAM_TURNOVERS...GAME_DATEGAME_IDHOME_FLAGAWAY_FLAGHOME_WIN_PCTGAWAY_WIN_PCTGTOTAL_WIN_PCTGROLLING_SCORING_MARGINROLLING_OENUM_REST_DAYS
4696AtlantaHawks16106127371010000...2022-10-190022200005101.000000NaN1.00000010.0000000.600000NaN
4694AtlantaHawks16106127371010000...2022-10-210022200020101.000000NaN1.00000010.0000000.5682932.0
4692AtlantaHawks16106127370101000...2022-10-230022200038100.666667NaN0.6666671.0000000.5601482.0
4691OPPONENTSHawks16106127371000100...2022-10-260022200057010.6666671.0000000.750000-0.6666670.5543513.0
4689OPPONENTSHawks16106127371000101...2022-10-280022200070010.6666671.0000000.8000004.0000000.6048232.0
4687OPPONENTSHawks16106127370100011...2022-10-290022200085010.6666670.6666670.6666677.0000000.6106711.0
4685OPPONENTSHawks16106127370100010...2022-10-310022200099010.6666670.5000000.571429-4.6666670.5980402.0
4683OPPONENTSHawks16106127371000102...2022-11-020022200110010.6666670.6000000.625000-8.3333330.5576832.0
4680AtlantaHawks16106127371010000...2022-11-050022200134100.7500000.6000000.666667-4.6666670.5599423.0
4678AtlantaHawks16106127371010000...2022-11-070022200149100.8000000.6000000.70000011.6666670.5668952.0
4676AtlantaHawks16106127370101000...2022-11-090022200162100.6666670.6000000.6363645.3333330.5747692.0
4674AtlantaHawks16106127371010000...2022-11-100022200173100.7142860.6000000.6666677.3333330.5549661.0
4673OPPONENTSHawks16106127370100011...2022-11-120022200188010.7142860.5000000.615385-3.0000000.5388552.0
4671OPPONENTSHawks16106127371000100...2022-11-140022200204010.7142860.5714290.6428574.0000000.5432062.0
4668AtlantaHawks16106127370101000...2022-11-160022200214100.6250000.5714290.600000-7.3333330.5531022.0
4666AtlantaHawks16106127371010001...2022-11-190022200235100.6666670.5714290.625000-2.6666670.5739663.0
4665OPPONENTSHawks16106127370100011...2022-11-210022200248010.6666670.5000000.588235-11.6666670.5479062.0
4662AtlantaHawks16106127371010000...2022-11-230022200263100.7000000.5000000.611111-0.3333330.5492102.0
4661OPPONENTSHawks16106127370100011...2022-11-250022200278010.7000000.4444440.578947-3.0000000.5258042.0
4658AtlantaHawks16106127370101003...2022-11-270022200293100.6363640.4444440.550000-1.6666670.5202032.0
4657OPPONENTSHawks16106127370100011...2022-11-280022200299010.6363640.4000000.523810-5.6666670.5269121.0
4655OPPONENTSHawks16106127371000100...2022-11-300022200313010.6363640.4545450.5454552.0000000.5735642.0
4652AtlantaHawks16106127371010000...2022-12-020022200327100.6666670.4545450.5652177.3333330.6212862.0
4650AtlantaHawks16106127370101000...2022-12-050022200354100.6153850.4545450.5416676.0000000.6122643.0
4649OPPONENTSHawks16106127370100010...2022-12-070022200365010.6153850.4166670.520000-7.6666670.5541552.0
4647OPPONENTSHawks16106127370100010...2022-12-090022200380010.6153850.3846150.500000-11.6666670.5230052.0
4644AtlantaHawks16106127371010001...2022-12-110022200400100.6428570.3846150.518519-9.0000000.5324972.0
4643OPPONENTSHawks16106127370100011...2022-12-120022200404010.6428570.3571430.500000-9.3333330.5324141.0
4641OPPONENTSHawks16106127370100010...2022-12-140022200416010.6428570.3333330.482759-11.6666670.5518592.0
4639OPPONENTSHawks16106127371000102...2022-12-160022200428010.6428570.3750000.500000-5.6666670.5638892.0
4636AtlantaHawks16106127371010000...2022-12-190022200454100.6666670.3750000.5161293.0000000.6076553.0
4634AtlantaHawks16106127370101000...2022-12-210022200468100.6250000.3750000.5000006.0000000.5729682.0
4632AtlantaHawks16106127371010000...2022-12-230022200481100.6470590.3750000.5151528.0000000.5888152.0
4631OPPONENTSHawks16106127370100010...2022-12-270022200508010.6470590.3529410.5000002.6666670.5780204.0
4628AtlantaHawks16106127370101001...2022-12-280022200517100.6111110.3529410.4857143.0000000.5976281.0
4626AtlantaHawks16106127370101000...2022-12-300022200530100.5789470.3529410.472222-8.3333330.5746182.0
4625OPPONENTSHawks16106127370100010...2023-01-020022200558010.5789470.3333330.459459-4.0000000.5781493.0
4623OPPONENTSHawks16106127371000101...2023-01-040022200575010.5789470.3684210.473684-2.6666670.5806562.0
4621OPPONENTSHawks16106127370100010...2023-01-060022200590010.5789470.3500000.461538-5.0000000.5691512.0
\n", + "

39 rows × 49 columns

\n", + "
" + ], + "text/plain": [ + " CITY NICKNAME TEAM_ID W L W_HOME L_HOME W_ROAD L_ROAD \\\n", + "4696 Atlanta Hawks 1610612737 1 0 1 0 0 0 \n", + "4694 Atlanta Hawks 1610612737 1 0 1 0 0 0 \n", + "4692 Atlanta Hawks 1610612737 0 1 0 1 0 0 \n", + "4691 OPPONENTS Hawks 1610612737 1 0 0 0 1 0 \n", + "4689 OPPONENTS Hawks 1610612737 1 0 0 0 1 0 \n", + "4687 OPPONENTS Hawks 1610612737 0 1 0 0 0 1 \n", + "4685 OPPONENTS Hawks 1610612737 0 1 0 0 0 1 \n", + "4683 OPPONENTS Hawks 1610612737 1 0 0 0 1 0 \n", + "4680 Atlanta Hawks 1610612737 1 0 1 0 0 0 \n", + "4678 Atlanta Hawks 1610612737 1 0 1 0 0 0 \n", + "4676 Atlanta Hawks 1610612737 0 1 0 1 0 0 \n", + "4674 Atlanta Hawks 1610612737 1 0 1 0 0 0 \n", + "4673 OPPONENTS Hawks 1610612737 0 1 0 0 0 1 \n", + "4671 OPPONENTS Hawks 1610612737 1 0 0 0 1 0 \n", + "4668 Atlanta Hawks 1610612737 0 1 0 1 0 0 \n", + "4666 Atlanta Hawks 1610612737 1 0 1 0 0 0 \n", + "4665 OPPONENTS Hawks 1610612737 0 1 0 0 0 1 \n", + "4662 Atlanta Hawks 1610612737 1 0 1 0 0 0 \n", + "4661 OPPONENTS Hawks 1610612737 0 1 0 0 0 1 \n", + "4658 Atlanta Hawks 1610612737 0 1 0 1 0 0 \n", + "4657 OPPONENTS Hawks 1610612737 0 1 0 0 0 1 \n", + "4655 OPPONENTS Hawks 1610612737 1 0 0 0 1 0 \n", + "4652 Atlanta Hawks 1610612737 1 0 1 0 0 0 \n", + "4650 Atlanta Hawks 1610612737 0 1 0 1 0 0 \n", + "4649 OPPONENTS Hawks 1610612737 0 1 0 0 0 1 \n", + "4647 OPPONENTS Hawks 1610612737 0 1 0 0 0 1 \n", + "4644 Atlanta Hawks 1610612737 1 0 1 0 0 0 \n", + "4643 OPPONENTS Hawks 1610612737 0 1 0 0 0 1 \n", + "4641 OPPONENTS Hawks 1610612737 0 1 0 0 0 1 \n", + "4639 OPPONENTS Hawks 1610612737 1 0 0 0 1 0 \n", + "4636 Atlanta Hawks 1610612737 1 0 1 0 0 0 \n", + "4634 Atlanta Hawks 1610612737 0 1 0 1 0 0 \n", + "4632 Atlanta Hawks 1610612737 1 0 1 0 0 0 \n", + "4631 OPPONENTS Hawks 1610612737 0 1 0 0 0 1 \n", + "4628 Atlanta Hawks 1610612737 0 1 0 1 0 0 \n", + "4626 Atlanta Hawks 1610612737 0 1 0 1 0 0 \n", + "4625 OPPONENTS Hawks 1610612737 0 1 0 0 0 1 \n", + "4623 OPPONENTS Hawks 
1610612737 1 0 0 0 1 0 \n", + "4621 OPPONENTS Hawks 1610612737 0 1 0 0 0 1 \n", + "\n", + " TEAM_TURNOVERS ... GAME_DATE GAME_ID HOME_FLAG AWAY_FLAG \\\n", + "4696 0 ... 2022-10-19 0022200005 1 0 \n", + "4694 0 ... 2022-10-21 0022200020 1 0 \n", + "4692 0 ... 2022-10-23 0022200038 1 0 \n", + "4691 0 ... 2022-10-26 0022200057 0 1 \n", + "4689 1 ... 2022-10-28 0022200070 0 1 \n", + "4687 1 ... 2022-10-29 0022200085 0 1 \n", + "4685 0 ... 2022-10-31 0022200099 0 1 \n", + "4683 2 ... 2022-11-02 0022200110 0 1 \n", + "4680 0 ... 2022-11-05 0022200134 1 0 \n", + "4678 0 ... 2022-11-07 0022200149 1 0 \n", + "4676 0 ... 2022-11-09 0022200162 1 0 \n", + "4674 0 ... 2022-11-10 0022200173 1 0 \n", + "4673 1 ... 2022-11-12 0022200188 0 1 \n", + "4671 0 ... 2022-11-14 0022200204 0 1 \n", + "4668 0 ... 2022-11-16 0022200214 1 0 \n", + "4666 1 ... 2022-11-19 0022200235 1 0 \n", + "4665 1 ... 2022-11-21 0022200248 0 1 \n", + "4662 0 ... 2022-11-23 0022200263 1 0 \n", + "4661 1 ... 2022-11-25 0022200278 0 1 \n", + "4658 3 ... 2022-11-27 0022200293 1 0 \n", + "4657 1 ... 2022-11-28 0022200299 0 1 \n", + "4655 0 ... 2022-11-30 0022200313 0 1 \n", + "4652 0 ... 2022-12-02 0022200327 1 0 \n", + "4650 0 ... 2022-12-05 0022200354 1 0 \n", + "4649 0 ... 2022-12-07 0022200365 0 1 \n", + "4647 0 ... 2022-12-09 0022200380 0 1 \n", + "4644 1 ... 2022-12-11 0022200400 1 0 \n", + "4643 1 ... 2022-12-12 0022200404 0 1 \n", + "4641 0 ... 2022-12-14 0022200416 0 1 \n", + "4639 2 ... 2022-12-16 0022200428 0 1 \n", + "4636 0 ... 2022-12-19 0022200454 1 0 \n", + "4634 0 ... 2022-12-21 0022200468 1 0 \n", + "4632 0 ... 2022-12-23 0022200481 1 0 \n", + "4631 0 ... 2022-12-27 0022200508 0 1 \n", + "4628 1 ... 2022-12-28 0022200517 1 0 \n", + "4626 0 ... 2022-12-30 0022200530 1 0 \n", + "4625 0 ... 2023-01-02 0022200558 0 1 \n", + "4623 1 ... 2023-01-04 0022200575 0 1 \n", + "4621 0 ... 
2023-01-06 0022200590 0 1 \n", + "\n", + " HOME_WIN_PCTG AWAY_WIN_PCTG TOTAL_WIN_PCTG ROLLING_SCORING_MARGIN \\\n", + "4696 1.000000 NaN 1.000000 10.000000 \n", + "4694 1.000000 NaN 1.000000 10.000000 \n", + "4692 0.666667 NaN 0.666667 1.000000 \n", + "4691 0.666667 1.000000 0.750000 -0.666667 \n", + "4689 0.666667 1.000000 0.800000 4.000000 \n", + "4687 0.666667 0.666667 0.666667 7.000000 \n", + "4685 0.666667 0.500000 0.571429 -4.666667 \n", + "4683 0.666667 0.600000 0.625000 -8.333333 \n", + "4680 0.750000 0.600000 0.666667 -4.666667 \n", + "4678 0.800000 0.600000 0.700000 11.666667 \n", + "4676 0.666667 0.600000 0.636364 5.333333 \n", + "4674 0.714286 0.600000 0.666667 7.333333 \n", + "4673 0.714286 0.500000 0.615385 -3.000000 \n", + "4671 0.714286 0.571429 0.642857 4.000000 \n", + "4668 0.625000 0.571429 0.600000 -7.333333 \n", + "4666 0.666667 0.571429 0.625000 -2.666667 \n", + "4665 0.666667 0.500000 0.588235 -11.666667 \n", + "4662 0.700000 0.500000 0.611111 -0.333333 \n", + "4661 0.700000 0.444444 0.578947 -3.000000 \n", + "4658 0.636364 0.444444 0.550000 -1.666667 \n", + "4657 0.636364 0.400000 0.523810 -5.666667 \n", + "4655 0.636364 0.454545 0.545455 2.000000 \n", + "4652 0.666667 0.454545 0.565217 7.333333 \n", + "4650 0.615385 0.454545 0.541667 6.000000 \n", + "4649 0.615385 0.416667 0.520000 -7.666667 \n", + "4647 0.615385 0.384615 0.500000 -11.666667 \n", + "4644 0.642857 0.384615 0.518519 -9.000000 \n", + "4643 0.642857 0.357143 0.500000 -9.333333 \n", + "4641 0.642857 0.333333 0.482759 -11.666667 \n", + "4639 0.642857 0.375000 0.500000 -5.666667 \n", + "4636 0.666667 0.375000 0.516129 3.000000 \n", + "4634 0.625000 0.375000 0.500000 6.000000 \n", + "4632 0.647059 0.375000 0.515152 8.000000 \n", + "4631 0.647059 0.352941 0.500000 2.666667 \n", + "4628 0.611111 0.352941 0.485714 3.000000 \n", + "4626 0.578947 0.352941 0.472222 -8.333333 \n", + "4625 0.578947 0.333333 0.459459 -4.000000 \n", + "4623 0.578947 0.368421 0.473684 -2.666667 \n", + "4621 
0.578947 0.350000 0.461538 -5.000000 \n", + "\n", + " ROLLING_OE NUM_REST_DAYS \n", + "4696 0.600000 NaN \n", + "4694 0.568293 2.0 \n", + "4692 0.560148 2.0 \n", + "4691 0.554351 3.0 \n", + "4689 0.604823 2.0 \n", + "4687 0.610671 1.0 \n", + "4685 0.598040 2.0 \n", + "4683 0.557683 2.0 \n", + "4680 0.559942 3.0 \n", + "4678 0.566895 2.0 \n", + "4676 0.574769 2.0 \n", + "4674 0.554966 1.0 \n", + "4673 0.538855 2.0 \n", + "4671 0.543206 2.0 \n", + "4668 0.553102 2.0 \n", + "4666 0.573966 3.0 \n", + "4665 0.547906 2.0 \n", + "4662 0.549210 2.0 \n", + "4661 0.525804 2.0 \n", + "4658 0.520203 2.0 \n", + "4657 0.526912 1.0 \n", + "4655 0.573564 2.0 \n", + "4652 0.621286 2.0 \n", + "4650 0.612264 3.0 \n", + "4649 0.554155 2.0 \n", + "4647 0.523005 2.0 \n", + "4644 0.532497 2.0 \n", + "4643 0.532414 1.0 \n", + "4641 0.551859 2.0 \n", + "4639 0.563889 2.0 \n", + "4636 0.607655 3.0 \n", + "4634 0.572968 2.0 \n", + "4632 0.588815 2.0 \n", + "4631 0.578020 4.0 \n", + "4628 0.597628 1.0 \n", + "4626 0.574618 2.0 \n", + "4625 0.578149 3.0 \n", + "4623 0.580656 2.0 \n", + "4621 0.569151 2.0 \n", + "\n", + "[39 rows x 49 columns]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Example Output of Game Logs\n", + "gameLogs[(gameLogs['TEAM_ID'] == 1610612737 ) & (gameLogs['SEASON'] == '2022-23')].sort_values('GAME_DATE')" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Feature Engineered Dataset \n", + "This function produces the dataset with all current feature variables for modeling and additional columns to perform a training set/validation set split. 
def getGameLogFeatureSet(gameDF):
    """Build the one-row-per-game modeling frame from per-team game logs.

    Every team/season's running statistics are lagged by one game (ordered by
    GAME_DATE) so that a row only carries values known before that game was
    played. Rows with ``CITY == 'OPPONENTS'`` are treated as the away side and
    all other rows as the home side; the two sides of each game are then
    joined on GAME_ID and SEASON.

    Parameters
    ----------
    gameDF : pandas.DataFrame
        Game logs containing at least GAME_ID, SEASON, TEAM_ID, CITY,
        GAME_DATE, W, NUM_REST_DAYS, OFFENSIVE_EFFICIENCY, HOME_WIN_PCTG,
        AWAY_WIN_PCTG, TOTAL_WIN_PCTG, ROLLING_SCORING_MARGIN and ROLLING_OE.
        The index must be unique because the lagged values are aligned back
        on it. The frame is left unmodified.

    Returns
    -------
    pandas.DataFrame
        One row per game that has both a home and an away record, with the
        HOME_-prefixed features, HOME_W, SEASON and the AWAY_-prefixed
        features. GAME_ID and both TEAM_ID columns are dropped. The first
        game of each team/season has NaN in its lagged columns.

    Raises
    ------
    KeyError
        If any of the required columns is missing from ``gameDF``.
    """
    teamSeasonKeys = ['TEAM_ID', 'SEASON']
    gameKeys = ['GAME_ID', 'SEASON']

    # Source statistic -> name of its "value after the previous game" column.
    laggedColumns = {
        'OFFENSIVE_EFFICIENCY': 'LAST_GAME_OE',
        'HOME_WIN_PCTG': 'LAST_GAME_HOME_WIN_PCTG',
        'AWAY_WIN_PCTG': 'LAST_GAME_AWAY_WIN_PCTG',
        'TOTAL_WIN_PCTG': 'LAST_GAME_TOTAL_WIN_PCTG',
        'ROLLING_SCORING_MARGIN': 'LAST_GAME_ROLLING_SCORING_MARGIN',
        'ROLLING_OE': 'LAST_GAME_ROLLING_OE',
    }

    # Order matters: it fixes the column order of the returned frame.
    featureColumns = [
        'LAST_GAME_OE',
        'LAST_GAME_HOME_WIN_PCTG',
        'NUM_REST_DAYS',
        'LAST_GAME_AWAY_WIN_PCTG',
        'LAST_GAME_TOTAL_WIN_PCTG',
        'LAST_GAME_ROLLING_SCORING_MARGIN',
        'LAST_GAME_ROLLING_OE',
    ]

    def shiftGameLogRecords(gameDF):
        """Return a copy of gameDF with the LAST_GAME_* columns added."""
        # Sort and group once, then shift every statistic together.
        previousGame = (
            gameDF.sort_values('GAME_DATE')
            .groupby(teamSeasonKeys)[list(laggedColumns)]
            .shift(1)
        )
        # Work on a copy so the caller's frame is not altered; the assignment
        # realigns the date-sorted values to the original row order by index.
        shiftedDF = gameDF.copy()
        for sourceCol, laggedCol in laggedColumns.items():
            shiftedDF[laggedCol] = previousGame[sourceCol]
        return shiftedDF

    def getTeamFrame(gameDF, prefix, isAway, extraColumns=()):
        """Select one side of every game and prefix its non-key columns."""
        if isAway:
            rowMask = gameDF['CITY'] == 'OPPONENTS'
        else:
            rowMask = gameDF['CITY'] != 'OPPONENTS'
        keptColumns = featureColumns + list(extraColumns) + ['TEAM_ID'] + gameKeys
        teamFrame = gameDF.loc[rowMask, keptColumns]
        # GAME_ID and SEASON stay unprefixed because they are the merge keys.
        renamed = {col: prefix + col for col in keptColumns if col not in gameKeys}
        return teamFrame.rename(columns=renamed)

    # Use the argument throughout (previously the notebook-level `gameLogs`
    # was read here, so the parameter was effectively ignored).
    shiftedLogs = shiftGameLogRecords(gameDF)
    homeTeamFrame = getTeamFrame(shiftedLogs, 'HOME_', isAway=False, extraColumns=['W'])
    awayTeamFrame = getTeamFrame(shiftedLogs, 'AWAY_', isAway=True)

    mergedFrame = pd.merge(homeTeamFrame, awayTeamFrame, how="inner", on=gameKeys)
    return mergedFrame.drop(['GAME_ID', 'AWAY_TEAM_ID', 'HOME_TEAM_ID'], axis=1)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
HOME_LAST_GAME_OEHOME_LAST_GAME_HOME_WIN_PCTGHOME_NUM_REST_DAYSHOME_LAST_GAME_AWAY_WIN_PCTGHOME_LAST_GAME_TOTAL_WIN_PCTGHOME_LAST_GAME_ROLLING_SCORING_MARGINHOME_LAST_GAME_ROLLING_OEHOME_WSEASONAWAY_LAST_GAME_OEAWAY_LAST_GAME_HOME_WIN_PCTGAWAY_NUM_REST_DAYSAWAY_LAST_GAME_AWAY_WIN_PCTGAWAY_LAST_GAME_TOTAL_WIN_PCTGAWAY_LAST_GAME_ROLLING_SCORING_MARGINAWAY_LAST_GAME_ROLLING_OE
00.5555560.6857143.00.4444440.5633809.3333330.57140512020-210.6129030.2500002.00.2285710.239437-2.0000000.605315
10.5739130.6764711.00.4444440.557143-0.6666670.58947212020-210.5000000.3055562.00.3030300.304348-18.0000000.512228
20.5847460.6666672.00.4444440.5507258.6666670.64059212020-210.6124030.5000002.00.4285710.4637680.6666670.567718
30.6097560.6562504.00.4444440.54411811.3333330.65332712020-210.5395680.5000002.00.4411760.4705880.6666670.570866
40.5123970.3548391.00.5588240.46153811.0000000.59649812020-210.7272730.6562501.00.4571430.55223917.3333330.633081
...................................................
29000.5785120.3333334.00.2500000.285714-9.0000000.56527012022-230.5370370.6666672.00.3000000.5000000.0000000.622617
29010.5401460.6000002.00.5000000.5625003.3333330.55624812022-230.5112780.1428572.00.3000000.235294-5.0000000.555121
29020.4909090.2500002.00.3333330.300000-17.6666670.48829702022-230.5689660.4000001.00.4000000.400000-12.6666670.565603
29030.5312501.0000002.00.5000000.666667-0.3333330.54738312022-230.5338981.0000003.00.0000000.333333-9.6666670.549413
29040.5130430.3076923.00.2142860.259259-14.3333330.55645802022-230.5772360.3076923.00.1875000.241379-8.0000000.512440
\n", + "

2905 rows × 16 columns

\n", + "
" + ], + "text/plain": [ + " HOME_LAST_GAME_OE HOME_LAST_GAME_HOME_WIN_PCTG HOME_NUM_REST_DAYS \\\n", + "0 0.555556 0.685714 3.0 \n", + "1 0.573913 0.676471 1.0 \n", + "2 0.584746 0.666667 2.0 \n", + "3 0.609756 0.656250 4.0 \n", + "4 0.512397 0.354839 1.0 \n", + "... ... ... ... \n", + "2900 0.578512 0.333333 4.0 \n", + "2901 0.540146 0.600000 2.0 \n", + "2902 0.490909 0.250000 2.0 \n", + "2903 0.531250 1.000000 2.0 \n", + "2904 0.513043 0.307692 3.0 \n", + "\n", + " HOME_LAST_GAME_AWAY_WIN_PCTG HOME_LAST_GAME_TOTAL_WIN_PCTG \\\n", + "0 0.444444 0.563380 \n", + "1 0.444444 0.557143 \n", + "2 0.444444 0.550725 \n", + "3 0.444444 0.544118 \n", + "4 0.558824 0.461538 \n", + "... ... ... \n", + "2900 0.250000 0.285714 \n", + "2901 0.500000 0.562500 \n", + "2902 0.333333 0.300000 \n", + "2903 0.500000 0.666667 \n", + "2904 0.214286 0.259259 \n", + "\n", + " HOME_LAST_GAME_ROLLING_SCORING_MARGIN HOME_LAST_GAME_ROLLING_OE \\\n", + "0 9.333333 0.571405 \n", + "1 -0.666667 0.589472 \n", + "2 8.666667 0.640592 \n", + "3 11.333333 0.653327 \n", + "4 11.000000 0.596498 \n", + "... ... ... \n", + "2900 -9.000000 0.565270 \n", + "2901 3.333333 0.556248 \n", + "2902 -17.666667 0.488297 \n", + "2903 -0.333333 0.547383 \n", + "2904 -14.333333 0.556458 \n", + "\n", + " HOME_W SEASON AWAY_LAST_GAME_OE AWAY_LAST_GAME_HOME_WIN_PCTG \\\n", + "0 1 2020-21 0.612903 0.250000 \n", + "1 1 2020-21 0.500000 0.305556 \n", + "2 1 2020-21 0.612403 0.500000 \n", + "3 1 2020-21 0.539568 0.500000 \n", + "4 1 2020-21 0.727273 0.656250 \n", + "... ... ... ... ... \n", + "2900 1 2022-23 0.537037 0.666667 \n", + "2901 1 2022-23 0.511278 0.142857 \n", + "2902 0 2022-23 0.568966 0.400000 \n", + "2903 1 2022-23 0.533898 1.000000 \n", + "2904 0 2022-23 0.577236 0.307692 \n", + "\n", + " AWAY_NUM_REST_DAYS AWAY_LAST_GAME_AWAY_WIN_PCTG \\\n", + "0 2.0 0.228571 \n", + "1 2.0 0.303030 \n", + "2 2.0 0.428571 \n", + "3 2.0 0.441176 \n", + "4 1.0 0.457143 \n", + "... ... ... 
\n", + "2900 2.0 0.300000 \n", + "2901 2.0 0.300000 \n", + "2902 1.0 0.400000 \n", + "2903 3.0 0.000000 \n", + "2904 3.0 0.187500 \n", + "\n", + " AWAY_LAST_GAME_TOTAL_WIN_PCTG AWAY_LAST_GAME_ROLLING_SCORING_MARGIN \\\n", + "0 0.239437 -2.000000 \n", + "1 0.304348 -18.000000 \n", + "2 0.463768 0.666667 \n", + "3 0.470588 0.666667 \n", + "4 0.552239 17.333333 \n", + "... ... ... \n", + "2900 0.500000 0.000000 \n", + "2901 0.235294 -5.000000 \n", + "2902 0.400000 -12.666667 \n", + "2903 0.333333 -9.666667 \n", + "2904 0.241379 -8.000000 \n", + "\n", + " AWAY_LAST_GAME_ROLLING_OE \n", + "0 0.605315 \n", + "1 0.512228 \n", + "2 0.567718 \n", + "3 0.570866 \n", + "4 0.633081 \n", + "... ... \n", + "2900 0.622617 \n", + "2901 0.555121 \n", + "2902 0.565603 \n", + "2903 0.549413 \n", + "2904 0.512440 \n", + "\n", + "[2905 rows x 16 columns]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Final Data Set before Train,Test, Validation Split\n", + "modelData" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "modelData.to_csv('nbaHomeWinLossModelDataset.csv')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/docs/examples/Home Team Win-Loss Modeling/Home Team Win-Loss Modeling.ipynb b/docs/examples/Home Team Win-Loss Modeling/Home Team Win-Loss Modeling.ipynb new file mode 100644 index 00000000..f7b2950f --- /dev/null +++ b/docs/examples/Home Team Win-Loss Modeling/Home Team Win-Loss Modeling.ipynb @@ -0,0 +1,530 @@ +{ + "cells": [ + { + "attachments": {}, + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "# Binary Classification - Home Team Win-Loss Modeling\n", + "This short notebook imports the logistic regression model class from scikit-learn, splits the data into training, test, and validation sets, and evaluates the model's performance on out-of-sample games." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.model_selection import train_test_split, cross_val_score\n", + "from sklearn import preprocessing \n", + "from sklearn.metrics import classification_report" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
HOME_LAST_GAME_OEHOME_LAST_GAME_HOME_WIN_PCTGHOME_NUM_REST_DAYSHOME_LAST_GAME_AWAY_WIN_PCTGHOME_LAST_GAME_TOTAL_WIN_PCTGHOME_LAST_GAME_ROLLING_SCORING_MARGINHOME_LAST_GAME_ROLLING_OEHOME_WSEASONAWAY_LAST_GAME_OEAWAY_LAST_GAME_HOME_WIN_PCTGAWAY_NUM_REST_DAYSAWAY_LAST_GAME_AWAY_WIN_PCTGAWAY_LAST_GAME_TOTAL_WIN_PCTGAWAY_LAST_GAME_ROLLING_SCORING_MARGINAWAY_LAST_GAME_ROLLING_OE
00.5555560.6857143.00.4444440.5633809.3333330.57140512020-210.6129030.2500002.00.2285710.239437-2.0000000.605315
10.5739130.6764711.00.4444440.557143-0.6666670.58947212020-210.5000000.3055562.00.3030300.304348-18.0000000.512228
20.5847460.6666672.00.4444440.5507258.6666670.64059212020-210.6124030.5000002.00.4285710.4637680.6666670.567718
30.6097560.6562504.00.4444440.54411811.3333330.65332712020-210.5395680.5000002.00.4411760.4705880.6666670.570866
40.5123970.3548391.00.5588240.46153811.0000000.59649812020-210.7272730.6562501.00.4571430.55223917.3333330.633081
50.6229510.6451612.00.4571430.545455-0.6666670.56199612020-210.6666670.7352941.00.7096770.72307713.3333330.644613
60.5490200.6333332.00.4571430.538462-18.3333330.50712412020-210.5966390.5000001.00.6250000.56250016.6666670.599989
70.5140190.6206901.00.4571430.531250-26.6666670.48027312020-210.5217390.3870971.00.4375000.412698-7.0000000.568524
80.6695650.7741942.00.5483870.66129012.3333330.56954912020-210.4583330.6206902.00.4705880.539683-17.0000000.489968
90.5967740.7666672.00.5483870.655738-4.6666670.54913812020-210.4684680.6206902.00.4848480.5483872.6666670.540221
\n", + "
" + ], + "text/plain": [ + " HOME_LAST_GAME_OE HOME_LAST_GAME_HOME_WIN_PCTG HOME_NUM_REST_DAYS \\\n", + "0 0.555556 0.685714 3.0 \n", + "1 0.573913 0.676471 1.0 \n", + "2 0.584746 0.666667 2.0 \n", + "3 0.609756 0.656250 4.0 \n", + "4 0.512397 0.354839 1.0 \n", + "5 0.622951 0.645161 2.0 \n", + "6 0.549020 0.633333 2.0 \n", + "7 0.514019 0.620690 1.0 \n", + "8 0.669565 0.774194 2.0 \n", + "9 0.596774 0.766667 2.0 \n", + "\n", + " HOME_LAST_GAME_AWAY_WIN_PCTG HOME_LAST_GAME_TOTAL_WIN_PCTG \\\n", + "0 0.444444 0.563380 \n", + "1 0.444444 0.557143 \n", + "2 0.444444 0.550725 \n", + "3 0.444444 0.544118 \n", + "4 0.558824 0.461538 \n", + "5 0.457143 0.545455 \n", + "6 0.457143 0.538462 \n", + "7 0.457143 0.531250 \n", + "8 0.548387 0.661290 \n", + "9 0.548387 0.655738 \n", + "\n", + " HOME_LAST_GAME_ROLLING_SCORING_MARGIN HOME_LAST_GAME_ROLLING_OE HOME_W \\\n", + "0 9.333333 0.571405 1 \n", + "1 -0.666667 0.589472 1 \n", + "2 8.666667 0.640592 1 \n", + "3 11.333333 0.653327 1 \n", + "4 11.000000 0.596498 1 \n", + "5 -0.666667 0.561996 1 \n", + "6 -18.333333 0.507124 1 \n", + "7 -26.666667 0.480273 1 \n", + "8 12.333333 0.569549 1 \n", + "9 -4.666667 0.549138 1 \n", + "\n", + " SEASON AWAY_LAST_GAME_OE AWAY_LAST_GAME_HOME_WIN_PCTG \\\n", + "0 2020-21 0.612903 0.250000 \n", + "1 2020-21 0.500000 0.305556 \n", + "2 2020-21 0.612403 0.500000 \n", + "3 2020-21 0.539568 0.500000 \n", + "4 2020-21 0.727273 0.656250 \n", + "5 2020-21 0.666667 0.735294 \n", + "6 2020-21 0.596639 0.500000 \n", + "7 2020-21 0.521739 0.387097 \n", + "8 2020-21 0.458333 0.620690 \n", + "9 2020-21 0.468468 0.620690 \n", + "\n", + " AWAY_NUM_REST_DAYS AWAY_LAST_GAME_AWAY_WIN_PCTG \\\n", + "0 2.0 0.228571 \n", + "1 2.0 0.303030 \n", + "2 2.0 0.428571 \n", + "3 2.0 0.441176 \n", + "4 1.0 0.457143 \n", + "5 1.0 0.709677 \n", + "6 1.0 0.625000 \n", + "7 1.0 0.437500 \n", + "8 2.0 0.470588 \n", + "9 2.0 0.484848 \n", + "\n", + " AWAY_LAST_GAME_TOTAL_WIN_PCTG AWAY_LAST_GAME_ROLLING_SCORING_MARGIN \\\n", + 
# %% Load the engineered feature set
# Fixed seed so the shuffle and the train/test split give the same rows on
# every Restart & Run All.
SEED = 42

data = pd.read_csv('nbaHomeWinLossModelDataset.csv').drop(['Unnamed: 0'],axis=1)
# Drop games with any missing feature value; the scaler and the model below
# cannot handle NaN inputs.
data = data.dropna()
data.head(10)

# %% Hold out the 2022-23 season as the validation set
validation = data[data['SEASON'] == '2022-23']
modelData = data[data['SEASON'] != '2022-23'].sample(frac=1, random_state=SEED)

# %% Train/test split and feature scaling
X = modelData.drop(['HOME_W','SEASON'],axis=1)
y = modelData['HOME_W']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.33,random_state=SEED)

# Standard Scaling Prediction Variables
# The scaler is fitted on the training split only and then reused unchanged
# for the test and validation features, so every set is standardized with the
# same statistics the model was trained on.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
scaled_data_train = scaler.transform(X_train)
scaled_data_test = scaler.transform(X_test)

# %% Logistic Regression
model = LogisticRegression()
model.fit(scaled_data_train,y_train)
model.score(scaled_data_test,y_test)

# %% Cross-validated macro F1
# NOTE(review): this cross-validates fresh clones of the model inside the
# held-out test split rather than scoring the fitted model; the printed value
# is the macro-averaged F1, not accuracy.
F1Score = cross_val_score(model,scaled_data_test,y_test,cv=12,scoring='f1_macro')
print("Logistic Model F1 Accuracy: %0.2f (+/- %0.2f)"%(F1Score.mean(), F1Score.std() *2))

# %% Test Set Review
y_pred = model.predict(scaled_data_test)
print(classification_report(y_test,y_pred))

# %% Validation Set review
# Transform with the training-fitted scaler instead of fitting a new one on
# the validation season.
scaled_val_data = scaler.transform(validation.drop(['HOME_W','SEASON'],axis=1))

# %% How the model performs on unseen data
y_pred = model.predict(scaled_val_data)
print(classification_report(validation['HOME_W'],y_pred))
{ + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}