{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# 1. KBO 타자 OPS 예측\n", "## 1.2. 탐색적 데이터 분석" ] },
 { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [
  "# Load the required libraries\n",
  "from matplotlib import font_manager, rc\n",
  "import matplotlib\n",
  "import matplotlib.pyplot as plt\n",
  "import seaborn as sns\n",
  "import pandas as pd\n",
  "import numpy as np\n",
  "import platform\n",
  "\n",
  "# Select a font family that contains Hangul glyphs so the Korean plot titles and labels render.\n",
  "system_name = platform.system()\n",
  "if system_name == 'Windows':  # Windows: use the Malgun Gothic font file\n",
  "    font_name = font_manager.FontProperties(fname=\"c:/Windows/Fonts/malgun.ttf\").get_name()\n",
  "    rc('font', family=font_name)\n",
  "elif system_name == 'Darwin':  # macOS\n",
  "    rc('font', family='AppleGothic')\n",
  "else:\n",
  "    # Other platforms (e.g. Linux) have no AppleGothic; use the first installed font from a\n",
  "    # short list of Korean-capable families, otherwise keep matplotlib's default font.\n",
  "    installed_fonts = {entry.name for entry in font_manager.fontManager.ttflist}\n",
  "    korean_fonts = ['NanumGothic', 'NanumBarunGothic', 'Noto Sans CJK KR', 'UnDotum']\n",
  "    available = [name for name in korean_fonts if name in installed_fonts]\n",
  "    if available:\n",
  "        rc('font', family=available[0])\n",
  "    else:\n",
  "        print('No Korean font found; Hangul text in the plots may not render.')\n",
  "\n",
  "# Setting that makes the minus sign display correctly on the plots.\n",
  "matplotlib.rcParams['axes.unicode_minus'] = False"
 ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "### 1.2.1. 프리시즌 데이터 분석" ] },
 { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(1393, 29)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
batter_idbatter_nameyearteamavgGABRH2B...GDPSLGOBPEheight/weightyear_bornpositioncareerstarting_salaryOPS
00가르시아2018LG0.350720171...10.5500.4091177cm/93kg1985년 04월 12일내야수(우투우타)쿠바 Ciego de Avila Maximo Gomez Baez(대)NaN0.959
11강경학2011한화0.00042200...00.0000.5000180cm/72kg1992년 08월 11일내야수(우투좌타)광주대성초-광주동성중-광주동성고10000만원0.500
21강경학2014한화-40200...0NaNNaN0180cm/72kg1992년 08월 11일내야수(우투좌타)광주대성초-광주동성중-광주동성고10000만원NaN
31강경학2015한화0.1301023330...00.1300.2862180cm/72kg1992년 08월 11일내야수(우투좌타)광주대성초-광주동성중-광주동성고10000만원0.416
41강경학2016한화0.1881432461...00.2810.2120180cm/72kg1992년 08월 11일내야수(우투좌타)광주대성초-광주동성중-광주동성고10000만원0.493
\n", "

5 rows × 29 columns

\n", "
" ], "text/plain": [ " batter_id batter_name year team avg G AB R H 2B ... GDP SLG \\\n", "0 0 가르시아 2018 LG 0.350 7 20 1 7 1 ... 1 0.550 \n", "1 1 강경학 2011 한화 0.000 4 2 2 0 0 ... 0 0.000 \n", "2 1 강경학 2014 한화 - 4 0 2 0 0 ... 0 NaN \n", "3 1 강경학 2015 한화 0.130 10 23 3 3 0 ... 0 0.130 \n", "4 1 강경학 2016 한화 0.188 14 32 4 6 1 ... 0 0.281 \n", "\n", " OBP E height/weight year_born position \\\n", "0 0.409 1 177cm/93kg 1985년 04월 12일 내야수(우투우타) \n", "1 0.500 0 180cm/72kg 1992년 08월 11일 내야수(우투좌타) \n", "2 NaN 0 180cm/72kg 1992년 08월 11일 내야수(우투좌타) \n", "3 0.286 2 180cm/72kg 1992년 08월 11일 내야수(우투좌타) \n", "4 0.212 0 180cm/72kg 1992년 08월 11일 내야수(우투좌타) \n", "\n", " career starting_salary OPS \n", "0 쿠바 Ciego de Avila Maximo Gomez Baez(대) NaN 0.959 \n", "1 광주대성초-광주동성중-광주동성고 10000만원 0.500 \n", "2 광주대성초-광주동성중-광주동성고 10000만원 NaN \n", "3 광주대성초-광주동성중-광주동성고 10000만원 0.416 \n", "4 광주대성초-광주동성중-광주동성고 10000만원 0.493 \n", "\n", "[5 rows x 29 columns]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
  "# Read the pre-season and the regular-season batting tables from ./input\n",
  "preseason_df = pd.read_csv(\"./input/Pre_Season_Batter.csv\")\n",
  "regular_season_df = pd.read_csv(\"./input/Regular_Season_Batter.csv\")\n",
  "\n",
  "# Print the (rows, columns) of the pre-season table, then preview its first rows\n",
  "print(preseason_df.shape)\n",
  "display(preseason_df.head())"
 ] },
 { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
batter_idyearGABRH2B3BHRTB...SBCSBBHBPSOGDPSLGOBPEOPS
count1393.0000001393.0000001393.0000001393.0000001393.0000001393.0000001393.0000001393.0000001393.0000001393.000000...1393.0000001393.0000001393.0000001393.0000001393.0000001393.0000001364.0000001368.0000001393.0000001364.000000
mean173.4343142013.0143588.70567119.2017232.6798285.0215360.9547740.1198850.3919607.391960...0.6295760.2914571.8779610.3302233.7142860.4472360.3610120.3179120.3819100.676924
std94.7168514.1667575.56268613.3959462.6372124.2325841.1969040.3799760.7485576.538787...1.1468540.5955222.0533920.6422043.1808840.7233640.2698920.1514890.7295210.386933
min0.0000002002.0000001.0000000.0000000.0000000.0000000.0000000.0000000.0000000.000000...0.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.000000
25%99.0000002010.0000006.0000009.0000001.0000002.0000000.0000000.0000000.0000002.000000...0.0000000.0000000.0000000.0000001.0000000.0000000.2170000.2500000.0000000.472000
50%178.0000002014.0000009.00000018.0000002.0000004.0000001.0000000.0000000.0000006.000000...0.0000000.0000001.0000000.0000003.0000000.0000000.3445000.3330000.0000000.675000
75%254.0000002017.00000011.00000028.0000004.0000008.0000002.0000000.0000001.00000011.000000...1.0000000.0000003.0000001.0000005.0000001.0000000.4780000.4000001.0000000.867000
max344.0000002018.000000119.000000183.00000035.00000051.00000011.0000004.0000005.00000068.000000...9.0000004.00000021.0000004.00000036.0000005.0000004.0000001.0000005.0000005.000000
\n", "

8 rows × 21 columns

\n", "
" ], "text/plain": [ " batter_id year G AB R \\\n", "count 1393.000000 1393.000000 1393.000000 1393.000000 1393.000000 \n", "mean 173.434314 2013.014358 8.705671 19.201723 2.679828 \n", "std 94.716851 4.166757 5.562686 13.395946 2.637212 \n", "min 0.000000 2002.000000 1.000000 0.000000 0.000000 \n", "25% 99.000000 2010.000000 6.000000 9.000000 1.000000 \n", "50% 178.000000 2014.000000 9.000000 18.000000 2.000000 \n", "75% 254.000000 2017.000000 11.000000 28.000000 4.000000 \n", "max 344.000000 2018.000000 119.000000 183.000000 35.000000 \n", "\n", " H 2B 3B HR TB ... \\\n", "count 1393.000000 1393.000000 1393.000000 1393.000000 1393.000000 ... \n", "mean 5.021536 0.954774 0.119885 0.391960 7.391960 ... \n", "std 4.232584 1.196904 0.379976 0.748557 6.538787 ... \n", "min 0.000000 0.000000 0.000000 0.000000 0.000000 ... \n", "25% 2.000000 0.000000 0.000000 0.000000 2.000000 ... \n", "50% 4.000000 1.000000 0.000000 0.000000 6.000000 ... \n", "75% 8.000000 2.000000 0.000000 1.000000 11.000000 ... \n", "max 51.000000 11.000000 4.000000 5.000000 68.000000 ... 
\n", "\n", " SB CS BB HBP SO \\\n", "count 1393.000000 1393.000000 1393.000000 1393.000000 1393.000000 \n", "mean 0.629576 0.291457 1.877961 0.330223 3.714286 \n", "std 1.146854 0.595522 2.053392 0.642204 3.180884 \n", "min 0.000000 0.000000 0.000000 0.000000 0.000000 \n", "25% 0.000000 0.000000 0.000000 0.000000 1.000000 \n", "50% 0.000000 0.000000 1.000000 0.000000 3.000000 \n", "75% 1.000000 0.000000 3.000000 1.000000 5.000000 \n", "max 9.000000 4.000000 21.000000 4.000000 36.000000 \n", "\n", " GDP SLG OBP E OPS \n", "count 1393.000000 1364.000000 1368.000000 1393.000000 1364.000000 \n", "mean 0.447236 0.361012 0.317912 0.381910 0.676924 \n", "std 0.723364 0.269892 0.151489 0.729521 0.386933 \n", "min 0.000000 0.000000 0.000000 0.000000 0.000000 \n", "25% 0.000000 0.217000 0.250000 0.000000 0.472000 \n", "50% 0.000000 0.344500 0.333000 0.000000 0.675000 \n", "75% 1.000000 0.478000 0.400000 1.000000 0.867000 \n", "max 5.000000 4.000000 1.000000 5.000000 5.000000 \n", "\n", "[8 rows x 21 columns]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
  "# Summary statistics of the numeric pre-season columns\n",
  "display(preseason_df.describe())"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [
  "# Histogram of every numeric pre-season column.\n",
  "# The panels are drawn directly with matplotlib instead of preseason_df.hist(): in the\n",
  "# environment this notebook was last run in, DataFrame.hist() raised\n",
  "# \"AttributeError: 'AxesSubplot' object has no attribute 'rowNum'\" inside pandas' subplot helper.\n",
  "numeric_cols = preseason_df.select_dtypes(include='number').columns\n",
  "n_cols = 5\n",
  "n_rows = max(1, int(np.ceil(len(numeric_cols) / n_cols)))\n",
  "\n",
  "# squeeze=False keeps `axes` two-dimensional even when there is a single row\n",
  "fig, axes = plt.subplots(n_rows, n_cols, figsize=(10, 9), squeeze=False)\n",
  "for ax, col in zip(axes.ravel(), numeric_cols):\n",
  "    ax.hist(preseason_df[col].dropna(), bins=10)\n",
  "    ax.set_title(col)\n",
  "    ax.grid(True)\n",
  "for ax in axes.ravel()[len(numeric_cols):]:  # hide the unused panels of the grid\n",
  "    ax.set_visible(False)\n",
  "\n",
  "plt.tight_layout()  # adjust the spacing between the panels\n",
  "plt.show()"
 ] },
 { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
year20022003200420052006200720082009201020112012201320142015201620172018
regular43.0054.0068.0073.0085.0098.00115.00124.00130.00151.0174.0194.00186.00207.00213.00217.00227.0
preseason12.0019.0028.0037.0036.0043.0061.0066.0072.0075.087.0104.00117.00134.00153.00167.00182.0
ratio0.280.350.410.510.420.440.530.530.550.50.50.540.630.650.720.770.8
\n", "
" ], "text/plain": [ "year 2002 2003 2004 2005 2006 2007 2008 2009 2010 \\\n", "regular 43.00 54.00 68.00 73.00 85.00 98.00 115.00 124.00 130.00 \n", "preseason 12.00 19.00 28.00 37.00 36.00 43.00 61.00 66.00 72.00 \n", "ratio 0.28 0.35 0.41 0.51 0.42 0.44 0.53 0.53 0.55 \n", "\n", "year 2011 2012 2013 2014 2015 2016 2017 2018 \n", "regular 151.0 174.0 194.00 186.00 207.00 213.00 217.00 227.0 \n", "preseason 75.0 87.0 104.00 117.00 134.00 153.00 167.00 182.0 \n", "ratio 0.5 0.5 0.54 0.63 0.65 0.72 0.77 0.8 " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "# Number of batter_id records per year in the regular-season table\n",
  "regular_count = regular_season_df.groupby('year')['batter_id'].count().rename('regular')\n",
  "\n",
  "# Number of batter_id records per year in the pre-season table\n",
  "preseason_count = preseason_df.groupby('year')['batter_id'].count().rename('preseason')\n",
  "\n",
  "# Pre-season count divided by regular-season count, rounded to two decimals\n",
  "count_ratio = np.round(preseason_count/regular_count, 2).rename('ratio')\n",
  "\n",
  "# One row per series, one column per year; show 2002 onward\n",
  "yearly_counts = pd.concat([regular_count, preseason_count, count_ratio], axis = 1)\n",
  "yearly_counts.transpose().loc[:,2002:]"
 ] },
 { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 가르시아2018\n", "1 강경학2011\n", "2 강경학2014\n", "3 강경학2015\n", "4 강경학2016\n", " ... 
\n", "1388 황재균2014\n", "1389 황재균2015\n", "1390 황재균2016\n", "1391 황재균2018\n", "1392 황진수2014\n", "Name: new_idx, Length: 1393, dtype: object" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "# Build a join key by concatenating the batter name and the year\n",
  "# NOTE(review): different players sharing a name in the same year would get the same key;\n",
  "# batter_id exists in both tables - confirm whether it should be part of the key.\n",
  "regular_season_df['new_idx'] = regular_season_df['batter_name'] + regular_season_df['year'].apply(str)\n",
  "preseason_df['new_idx'] = preseason_df['batter_name'] + preseason_df['year'].apply(str)\n",
  "preseason_df['new_idx']"
 ] },
 { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(1358, 30) (1358, 30)\n" ] }, { "data": { "text/plain": [ "1358" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "# Keys that occur in both tables\n",
  "intersection_idx = list(set(regular_season_df['new_idx']).intersection(preseason_df['new_idx']))\n",
  "\n",
  "# Keep only the rows whose key is shared, ordered by key so both frames line up row by row.\n",
  "# isin() does a hashed lookup instead of scanning the whole list once per row.\n",
  "regular_season_new = (regular_season_df.loc[regular_season_df['new_idx'].isin(intersection_idx)]\n",
  "                      .sort_values(by = 'new_idx').reset_index(drop=True))\n",
  "\n",
  "preseason_new = (preseason_df.loc[preseason_df['new_idx'].isin(intersection_idx)]\n",
  "                 .sort_values(by = 'new_idx').reset_index(drop=True))\n",
  "\n",
  "# Verification: both shapes should match and the number of positions with equal keys\n",
  "# should equal the number of rows\n",
  "print(regular_season_new.shape, preseason_new.shape)\n",
  "sum(regular_season_new['new_idx'] == preseason_new['new_idx'])"
 ] },
 { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\HOME\\anaconda3\\envs\\store_amount_prediction\\lib\\site-packages\\seaborn\\_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. 
From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.\n", " FutureWarning\n" ] }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [
  "# Correlation between regular-season OPS and pre-season OPS\n",
  "correlation = regular_season_new['OPS'].corr(preseason_new['OPS'])\n",
  "\n",
  "# x and y are passed by keyword: seaborn warned that positional x/y stop being accepted\n",
  "# from version 0.12, where `data` is the only valid positional argument.\n",
  "sns.scatterplot(x=regular_season_new['OPS'], y=preseason_new['OPS'])\n",
  "plt.title('correlation(상관계수): '+str(np.round(correlation,2)), fontsize=20)\n",
  "plt.xlabel(\"정규시즌 OPS\",fontsize=12)\n",
  "plt.ylabel(\"프리시즌 OPS\",fontsize=12)\n",
  "plt.show()"
 ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "### 1.2.2. 정규시즌 데이터 분석" ] },
 { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(2454, 29)" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
batter_idbatter_nameyearteamavgGABRH2B...GDPSLGOBPEheight/weightyear_bornpositioncareerstarting_salaryOPS
00가르시아2018LG0.3395018327629...30.5190.3839177cm/93kg1985년 04월 12일내야수(우투우타)쿠바 Ciego de Avila Maximo Gomez Baez(대)NaN0.902
11강경학2011한화0.00021000...00.0000.0001180cm/72kg1992년 08월 11일내야수(우투좌타)광주대성초-광주동성중-광주동성고10000만원0.000
21강경학2014한화0.221418611192...10.3490.3376180cm/72kg1992년 08월 11일내야수(우투좌타)광주대성초-광주동성중-광주동성고10000만원0.686
31강경학2015한화0.25712031150807...30.3250.34815180cm/72kg1992년 08월 11일내야수(우투좌타)광주대성초-광주동성중-광주동성고10000만원0.673
41강경학2016한화0.1584610116163...50.2570.2327180cm/72kg1992년 08월 11일내야수(우투좌타)광주대성초-광주동성중-광주동성고10000만원0.489
\n", "

5 rows × 29 columns

\n", "
" ], "text/plain": [ " batter_id batter_name year team avg G AB R H 2B ... GDP \\\n", "0 0 가르시아 2018 LG 0.339 50 183 27 62 9 ... 3 \n", "1 1 강경학 2011 한화 0.000 2 1 0 0 0 ... 0 \n", "2 1 강경학 2014 한화 0.221 41 86 11 19 2 ... 1 \n", "3 1 강경학 2015 한화 0.257 120 311 50 80 7 ... 3 \n", "4 1 강경학 2016 한화 0.158 46 101 16 16 3 ... 5 \n", "\n", " SLG OBP E height/weight year_born position \\\n", "0 0.519 0.383 9 177cm/93kg 1985년 04월 12일 내야수(우투우타) \n", "1 0.000 0.000 1 180cm/72kg 1992년 08월 11일 내야수(우투좌타) \n", "2 0.349 0.337 6 180cm/72kg 1992년 08월 11일 내야수(우투좌타) \n", "3 0.325 0.348 15 180cm/72kg 1992년 08월 11일 내야수(우투좌타) \n", "4 0.257 0.232 7 180cm/72kg 1992년 08월 11일 내야수(우투좌타) \n", "\n", " career starting_salary OPS \n", "0 쿠바 Ciego de Avila Maximo Gomez Baez(대) NaN 0.902 \n", "1 광주대성초-광주동성중-광주동성고 10000만원 0.000 \n", "2 광주대성초-광주동성중-광주동성고 10000만원 0.686 \n", "3 광주대성초-광주동성중-광주동성고 10000만원 0.673 \n", "4 광주대성초-광주동성중-광주동성고 10000만원 0.489 \n", "\n", "[5 rows x 29 columns]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
batter_idyearavgGABRH2B3BHR...SBCSBBHBPSOGDPSLGOBPEOPS
count2454.0000002454.0000002428.0000002454.0000002454.0000002454.0000002454.0000002454.0000002454.0000002454.000000...2454.0000002454.0000002454.0000002454.0000002454.0000002454.0000002428.0000002430.0000002454.0000002428.000000
mean178.0794622011.6145070.23755972.535045201.51467029.91238855.9881839.8634880.9576205.504075...5.2901392.33577820.9437653.42461338.5969854.6035040.3438260.3066843.6764470.649939
std97.5579474.9928330.09844045.093871169.53702928.77875952.2538449.8713141.6471937.989380...9.0885803.19404521.2061134.13261431.8014664.7135310.1633350.1117784.5852480.261634
min0.0000001993.0000000.0000001.0000000.0000000.0000000.0000000.0000000.0000000.000000...0.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.000000
25%101.2500002008.0000000.20300028.00000038.2500005.0000008.0000001.0000000.0000000.000000...0.0000000.0000003.0000000.00000010.0000001.0000000.2674540.2727270.0000000.546000
50%183.0000002013.0000000.25500079.000000163.00000021.00000040.0000007.0000000.0000002.000000...2.0000001.00000014.0000002.00000033.0000003.0000000.3601240.3285922.0000000.688637
75%265.0000002016.0000000.291000115.000000357.50000049.000000100.00000016.0000001.0000008.000000...6.0000003.00000034.0000005.00000060.0000007.0000000.4360000.3670005.0000000.797234
max344.0000002018.0000001.000000144.000000600.000000135.000000201.00000047.00000017.00000053.000000...84.00000021.000000108.00000027.000000161.00000024.0000003.0000001.00000030.0000004.000000
\n", "

8 rows × 22 columns

\n", "
" ], "text/plain": [ " batter_id year avg G AB \\\n", "count 2454.000000 2454.000000 2428.000000 2454.000000 2454.000000 \n", "mean 178.079462 2011.614507 0.237559 72.535045 201.514670 \n", "std 97.557947 4.992833 0.098440 45.093871 169.537029 \n", "min 0.000000 1993.000000 0.000000 1.000000 0.000000 \n", "25% 101.250000 2008.000000 0.203000 28.000000 38.250000 \n", "50% 183.000000 2013.000000 0.255000 79.000000 163.000000 \n", "75% 265.000000 2016.000000 0.291000 115.000000 357.500000 \n", "max 344.000000 2018.000000 1.000000 144.000000 600.000000 \n", "\n", " R H 2B 3B HR ... \\\n", "count 2454.000000 2454.000000 2454.000000 2454.000000 2454.000000 ... \n", "mean 29.912388 55.988183 9.863488 0.957620 5.504075 ... \n", "std 28.778759 52.253844 9.871314 1.647193 7.989380 ... \n", "min 0.000000 0.000000 0.000000 0.000000 0.000000 ... \n", "25% 5.000000 8.000000 1.000000 0.000000 0.000000 ... \n", "50% 21.000000 40.000000 7.000000 0.000000 2.000000 ... \n", "75% 49.000000 100.000000 16.000000 1.000000 8.000000 ... \n", "max 135.000000 201.000000 47.000000 17.000000 53.000000 ... 
\n", "\n", " SB CS BB HBP SO \\\n", "count 2454.000000 2454.000000 2454.000000 2454.000000 2454.000000 \n", "mean 5.290139 2.335778 20.943765 3.424613 38.596985 \n", "std 9.088580 3.194045 21.206113 4.132614 31.801466 \n", "min 0.000000 0.000000 0.000000 0.000000 0.000000 \n", "25% 0.000000 0.000000 3.000000 0.000000 10.000000 \n", "50% 2.000000 1.000000 14.000000 2.000000 33.000000 \n", "75% 6.000000 3.000000 34.000000 5.000000 60.000000 \n", "max 84.000000 21.000000 108.000000 27.000000 161.000000 \n", "\n", " GDP SLG OBP E OPS \n", "count 2454.000000 2428.000000 2430.000000 2454.000000 2428.000000 \n", "mean 4.603504 0.343826 0.306684 3.676447 0.649939 \n", "std 4.713531 0.163335 0.111778 4.585248 0.261634 \n", "min 0.000000 0.000000 0.000000 0.000000 0.000000 \n", "25% 1.000000 0.267454 0.272727 0.000000 0.546000 \n", "50% 3.000000 0.360124 0.328592 2.000000 0.688637 \n", "75% 7.000000 0.436000 0.367000 5.000000 0.797234 \n", "max 24.000000 3.000000 1.000000 30.000000 4.000000 \n", "\n", "[8 rows x 22 columns]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
  "# Re-read the regular-season table from disk. This rebinds regular_season_df to a fresh\n",
  "# frame, so columns added to the previous frame (e.g. new_idx) are not present afterwards.\n",
  "regular_season_df = pd.read_csv(\"./input/Regular_Season_Batter.csv\")\n",
  "\n",
  "# Show the shape, the first rows and the summary statistics as three separate outputs\n",
  "display(\n",
  "    regular_season_df.shape,\n",
  "    regular_season_df.head(),\n",
  "    regular_season_df.describe(),\n",
  ")"
 ] },
 { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [
  "# Histograms of the numeric regular-season columns on a 10x9 inch figure\n",
  "hist_figsize = (10, 9)\n",
  "regular_season_df.hist(figsize=hist_figsize)\n",
  "plt.tight_layout()  # adjust the spacing between the panels\n",
  "plt.show()"
 ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "연도별 OPS 중앙값 그래프" ] },
 { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [
  "# Two panels in one row: OPS box plot per year (left) and median OPS per year (right)\n",
  "fig, (box_ax, median_ax) = plt.subplots(1, 2, figsize=(15, 6))\n",
  "\n",
  "# Left panel: box plot without outlier markers, year labels turned vertical\n",
  "g = sns.boxplot(x=\"year\", y=\"OPS\", data=regular_season_df, showfliers=False, ax=box_ax)\n",
  "g.set_title('연도별 OPS 상자그림', size = 20)\n",
  "plt.setp(g.get_xticklabels(), rotation=90)\n",
  "\n",
  "# Right panel: median OPS of each year as a marked line\n",
  "yearly_median_ops = regular_season_df.groupby('year')['OPS'].median()\n",
  "median_ax.plot(yearly_median_ops, marker='o')\n",
  "median_ax.grid(axis='y', linestyle='-', alpha=0.4)\n",
  "median_ax.set_title('연도별 OPS 중앙값', size = 20)\n",
  "plt.show()"
 ] },
 { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
year1993199419951996199719981999200020012002...2009201020112012201320142015201620172018
col_0
count121781014203243...124130151174194186207213217227
\n", "

1 rows × 26 columns

\n", "
" ], "text/plain": [ "year 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 ... 2009 \\\n", "col_0 ... \n", "count 1 2 1 7 8 10 14 20 32 43 ... 124 \n", "\n", "year 2010 2011 2012 2013 2014 2015 2016 2017 2018 \n", "col_0 \n", "count 130 151 174 194 186 207 213 217 227 \n", "\n", "[1 rows x 26 columns]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "# Number of regular-season rows per year, laid out as a single row with one column per year\n",
  "rows_per_year = pd.crosstab(index=regular_season_df['year'], columns='count')\n",
  "rows_per_year.transpose()"
 ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "연도별 팀 OPS" ] },
 { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "# 연도별 팀의 OPS 중앙값 계산\n", "med_OPS_team = regular_season_df.pivot_table(index=['team'], columns='year',\n", " values='OPS', aggfunc='median')\n", "\n", "# 2005년 이후에 결측치가 존재하지 않는 팀만 확인\n", "team_idx = med_OPS_team.loc[:,2005:].isna().sum(axis=1) <= 0\n", "\n", "plt.plot(med_OPS_team.loc[team_idx,2005:].T, marker = 'o', markersize=4)\n", "plt.grid(axis='y', linestyle='-', alpha=0.4)\n", "plt.legend(med_OPS_team.loc[team_idx,2005:].T.columns, \n", " loc='center left', bbox_to_anchor=(1, 0.5)) # 그래프 범례를 그래프 밖에 위치\n", "plt.title('연도별 팀 OPS')\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "177cm/93kg 177.0 93.0\n" ] } ], "source": [ "import re\n", "\n", "regular_season_df['weight'] = regular_season_df['height/weight'].apply(\n", " lambda x: int(re.findall('\\d+',x.split('/')[1])[0]) if pd.notnull(x) else x)\n", "\n", "regular_season_df['height'] = regular_season_df['height/weight'].apply(\n", " lambda x: int(re.findall('\\d+',x.split('/')[0])[0]) if pd.notnull(x) else x)\n", "\n", "print(regular_season_df['height/weight'][0], regular_season_df['height'][0],\n", " regular_season_df['weight'][0])" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\HOME\\anaconda3\\envs\\store_amount_prediction\\lib\\site-packages\\seaborn\\_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.\n", " FutureWarning\n", "C:\\Users\\HOME\\anaconda3\\envs\\store_amount_prediction\\lib\\site-packages\\seaborn\\_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. 
From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.\n", " FutureWarning\n" ] }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "# 몸무게/키 계산\n", "regular_season_df['weight_per_height'] = regular_season_df['weight'] / \\\n", " regular_season_df['height']\n", "plt.figure(figsize=(15, 5)) # 그래프 크기 조정\n", "plt.subplot(1, 2, 1) # 1행 2열의 첫번째(1행, 1열) 그래프\n", "\n", "# 정규시즌과 프리시즌의 상관관계 계산\n", "correlation = regular_season_df['weight_per_height'].corr(regular_season_df['OBP'])\n", "sns.scatterplot(regular_season_df['weight_per_height'], regular_season_df['OBP'])\n", "plt.title(\"'몸무게/키'와 OBP correlation(상관관계): \" + str(np.round(correlation, 2)), \\\n", " fontsize=15)\n", "plt.ylabel('정규시즌 OBP',fontsize=12)\n", "plt.xlabel('몸무게/키', fontsize=12)\n", "plt.subplot(1, 2, 2)\n", "\n", "# 정규시즌과 프리시즌의 상관관계 계산\n", "correlation = regular_season_df['weight_per_height'].corr(regular_season_df['SLG'])\n", "sns.scatterplot(regular_season_df['weight_per_height'], regular_season_df['SLG'])\n", "plt.title(\"'몸무게/키'와 SLG correlation(상관관계): \" + str(np.round(correlation, 2)), \\\n", " fontsize=15)\n", "plt.ylabel('정규시즌 SLG', fontsize=12)\n", "plt.xlabel('몸무게/키', fontsize=12)\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "내야수(우투우타) 643\n", "외야수(우투우타) 230\n", "외야수(좌투좌타) 201\n", "포수(우투우타) 189\n", "외야수(우투좌타) 184\n", "내야수(우투좌타) 141\n", "내야수(좌투좌타) 36\n", "포수(우투좌타) 14\n", "외야수(우투양타) 7\n", "내야수(우투양타) 7\n", "Name: position, dtype: int64" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "regular_season_df['position'].value_counts()" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "내야수(우투우타) 내야수 우타\n" ] } ], "source": [ "# position\n", "regular_season_df['pos']=regular_season_df['position'].apply(\n", " lambda x: x.split('(')[0] if pd.notnull(x) else x)\n", "\n", "# 우타, 좌타, 양타\n", "regular_season_df['hit_way'] = 
regular_season_df['position'].apply(\n", " lambda x: x[-3:-1] if pd.notnull(x) else x)\n", "print(regular_season_df['position'][0], regular_season_df['pos'][0], \n", " regular_season_df['hit_way'][0])" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAA3QAAAFKCAYAAABVfx1PAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAAsTAAALEwEAmpwYAAA170lEQVR4nO3deZgcZbn38e892TcISwiBEIMEUDYRArIYQBREEFEBl1cFVAjgEhHxRFEQBYUIeDTn6MGIx10QkIMLiKyBAAKGfZVETEIgZCMb2ZO53z+6J04mk5ks091TM9/Pdc01XfXUcnfqmlT/uup5KjITSZIkSVLx1NW6AEmSJEnSpjHQSZIkSVJBGegkSZIkqaAMdJIkSZJUUAY6qQIiYt+I6LkRyx+0kdsfHhEDNr4ySZI2XkT0j4g3beQ6J23Esm+MiJPX07ZLRHy4ybw3RMTxG1mP5051SAY6qTK+D2zfdGZEvCsirmpm+WubWXZyk+mfR8QR5cnPAXtudpWSJG2YfYGvNNcQEQdFxM+babqiyXJXNzqPERFHRMTV5ckhwHGN2rpFxNCIGAocAJzcMB0RWwE7Ayc2U4vnTnU6XWtdgNQeRMRg4KZWFjs+M2eUl/8W8D6gGzAUmFRe5ouZeXcL2+ha/iEiegH/WZ6/zQaW+v2ImA+8Cfj5Bq4jSVKrIuI04DLg1Uaz52XmO1pZtXf5py0NKNfSYFWj6T8AM4BBEfEu4MnMnNXCtjx3qkMz0ElAZk4HhkfE94D7MvNGgIj4IHBQZv5Hk+UvBC6MiJ2AGzLzbRu4q93497eDK/j3ieWEZpYdHBETG00PBU4D7gN+1NqOIuIoSt+m9qN0NX4+8M3MnFBuPwK4AXga6AIsAj6XmS9GxBbA9yh9A9qrvI39MnPlhrxJSVJhXZWZF23kOvsDB0ZEV+DzwKnl+Ts0s2xDuALoD0xsZhky8xXgI+WQeTLQF3gcuDQzXy2fw7YHjqAU7hoCnedOdToGOmltiymdNBr0A15vYfkdgcEbsf2PA10i4mDgUaCh71yPZpadnpnDGybKt7N8DjgJOBAYt76dRMT7gYuAkzJzcnne7sANEfGZhhMTpfD6/nL7+4CfAO8Evgv8LTNPL7f1o/TtqCRJa5RDzCeBuyjdpXI55btPImJKM6tcBjxYfn0Q8K5Gbe+LiMeB8zLzjoj4f5RuwzwDmFd+/Xvg0PLyj2Xm15ts33OnOh370ElrW0wpxDXoV563PocCA8v/4bcoIs4BJgAfofQt4UBK3/zNB+qbWWWniHi84YfSLZ6XZeZplL5pbMkY4JSGExJAZv4DOAe4ZD3r3A3sUn69JaXbSRvWXZSZ2co+JUnFd1pEjG/08+VGbW8v90k7GdaEuRuBC4AzgcMj4svlK3Xr82pmTsnMKax9ayfAHzNz38y8ozy9O3B7Zr6SmUuBv7D2l6jvjYiJEXFko3meO9XpeIVOAiLiEEoha0B5+oxy0wAgI+ITwJmZ+VCjdboDnwK+CnwZOH092w7gS8B7gGMzc3lEfB64GfhQZj4XERc1XS8zm7tq1+Bu4JX17G8boG9mPtlM832Ubo1puk4X4AuUTswAo4HrIuKdwLcy86kWapEkdRw/b+GWyycoBZt55XPb74Bxmfl7gIj4APB1SgOcvNjM+lOAK0qrrnFTC7X8BPhjRBxL6cvPA4BL
G7X/uRzU1vDcqc7IQCcBmfkApRG8AIiI84BVmfn9FlYbA/ya0ihet0bEiQ0ntSbqKN1y8b7MXF7e330RcQzrfju5loj4Mc2cRMqeXs/8rsD6vhFMSn33Grw9IsYDq4G/UjoZkZnTyreFfgD4RUTcnZlfaqlWSVKHt6jx1StKX1SuUe4r9o1Gsz7fpP0S1n+law6lrgiNl58eEcMpDWbSB3ghMxeUm5dQ6ju3Ds+d6mwMdNJGKn8reRmlzt7nZmZGxIeAv0ZE38z8RePlM3M1pccYEBFXZOZ55fkvt7avzDxzPTVczdq3hjZeZ2ZErIqIPTLz2SbNbwceajS9ph9AM9tJ4MaI+BNwT0QclZm3t1azJKnQ3l9+VEAdpdsHe1G6QtXs4CUR8Q7+PWJzU3sA3Zss34XSnS3HUwpKdUAAD1C6dbOppNR37oNAXfkcXAc8DJzf3E49d6qzMdCp0yt/+3d1k9kNt16e1mT+aZn5eET8DTi/4d74zFxQHjp561Z2dxJwXtOZmTl0PbVdTaMrh43sBPyqhf18FfhVRHwwM6eWt7UnpWD5qVZqJCL2Bp4uv79VlPoRttSXUJJUfDcA4ymFqHpKV6UWZOayRs9yW0v5UT37Nte2nkFRzqPUD25EZq4oL1cHnAX8GPhok+XPKG//8EbLB6U+ez8C/l8z+/XcqU7FQKdOLzMnsp6TUQvr3NTMvNdpeUTMTTGM0mhbUzZmpcy8JiIWA7+OiB6Uvs2cBZxefr+tORU4KiLmUTqp31S+LVWS1EFV6DzWVD2l4f6b6kbzA4R1B1ZSur0RKF0Fi4hlNLn614jnTnUq4eA7Utsr31t/WtOTSUQsB9bXSfqczLyvyfLjge1Y+979Br8pDw8tSVJFla/QndZ0EJJW1pnS9A6U8i2Xoyk9fzUp3W7ZBbgfuCAzFzaz/PnAMZSCXR2lCxLPAl/JzDnN7Hc8njvViRjopAqIiL7Aksxs7ttGSZIKpfwogu6ZuWQj1mmuP5qkNmagkyRJkqSC8sHikiRJklRQFQl0ETEgIr4dERc3mb9PRNwWERMi4rryg5klSZIkSZugUqNcXglMBno3mZ/A8Zm5PCIup9Qh9vrWNrbtttvm0KFD27xISVL78sgjj8zJzAG1rqMoPD9KUuexvnNkRQJdZp5SHg3pmCbzG4/uN48Wns0RESOBkQBDhgxh4sQNGS1WklRkETG11jUUydChQz0/SlInsb5zZE360EXEocCewF/Xt0xmjsvM4Zk5fMAAv6yVJEmSpKaq+mDxiAhKzx7pBpySmatbWUWSJEmStB5VDXTAWcCMzPxFlfcrSZIkSR1OVW65jIgx5REtjwfOjIjx5Z9zq7F/SZIkSeqIKnaFLjPHA+PLr0eXZx9bqf1JkiRJUmfjg8UlSZIkqaAMdJIkVVFEDIiIb0fExU3m7xMRt0XEhIi4rtxVQZKkFhnoJEmqriuB5ZRGfG4sgeMzcwQwFTih2oVJkorHQCdJUhVl5inAvc3Mfyozl5cn5wGLq1qYJKmQDHSSJLUjEXEosCfw1/W0j4yIiRExcfbs2dUtTpLU7lT7OXQdxtixY5k8eXJF9zF9+nQABg8eXNH9DBs2jFGjRlV0H5KklkVEAKMp3Yp5Smaubm65zBwHjAMYPnx4Vq/CDVON8yN4jpSkBga6dmzp0qW1LkGSVD1nATMy8xe1LqQIPEdKUomBbhNV49u6hn2MHTu24vuSJNVGRIwBLgCOB/pHxCfLTX/MzO/VrrJNU62rWZ4jJanEQCdJUpVl5nhgfPn16PLsY2tVjySpuBwURZIkSZIKykAnSZIkSQVloJMkSZKkgrIPnbQRHI5bkiRJ7YmBTmqHHI5bkiRJG8JAJ20Eh+OWJElSe2IfOkmSJEkqKAOdJEmSJBWUgU6SJEmSCspAJ0mSJEkF5aAokiRJ2mzVeLRPtR7rAz7aR8VhoJMkSVIh+FgfaV0GOkmSJG22alzN8rE+0rrsQydJkiRJBWWg
kyRJkqSCMtBJkiRJUkEZ6CRJkiSpoAx0kiRJklRQjnIpSRugGs9Xguo9Y8nnK0mS1DEY6CSpHfEZS5IkaWMY6CRpA1TrapbPWJIkSRvDPnSSJEmSVFAGOkmSJEkqKAOdJEmSJBWUgU6SJEmSCspAJ0mSJEkFZaCTJEmSpIIy0EmSJElSQRnoJEmSJKmgDHSSJEmSVFAGOkmSJEkqqK6V2GhEDADOAeoz84JG8/sCPwF2BF4DTsnMhZWoQZIkSZI6ukpdobsSWA50azL/i8CfMvMw4Hbg7ArtX5IkSZI6vIoEusw8Bbi3maYjgevLr38PHFyJ/UuSJElSZ1DtPnQ9MnNl+fVcYKv1LRgRIyNiYkRMnD17dnWqkySpwiJiQER8OyIubjK/b0RcExH3RsRNEbFFrWqUJBVHtQNdfUQ07HMrYL1JLTPHZebwzBw+YMCA6lQnSVLl2S1BktRmqh3oHgJOKL8+EbijyvuXJKmmNrdbgnewSJIaq0qgi4gxEdEduBQYGRHjgf2Bn1Vj/5IkFcAGdUvwDhZJUmMVeWwBQGaOB8aXX48uz54DvKdS+5QkqcDqI6IuM+tppVuCJEkNfLC4JEntg90SJEkbrWJX6CRJUusiYgxwAaVuCb+KiC8Ak4HP1rQwSdpAY8eOZfLkyRXdx/Tp0wEYPHhwRfcDMGzYMEaNGlXx/bQVA50kSVVmtwRJ2jhLly6tdQntloFOkiRJ0iarxtWshn2MHTu24vsqGvvQSZIkSVJBGegkSZIkqaAMdJIkSZJUUAY6SZIkSSooA50kSZIkFZSBTpIkSZIKykAnSZIkSQVloJMkSZKkgjLQSZIkSVJBGegkSZIkqaAMdJIkSZJUUAY6SZIkSSooA50kSZIkFZSBTpIkSZIKykAnSZIkSQVloJMkSZKkgjLQSZIkSVJBGegkSZIkqaAMdJIkSZJUUAY6SZIkSSooA50kSZIkFZSBTpIkSZIKykAnSZIkSQVloJMkSZKkgjLQSZIkSVJBGegkSZIkqaAMdJIkSZJUUAY6SZIkSSooA50kSZIkFZSBTpIkSZIKykAnSZIkSQXVtdYFSJIkqbLGjh3L5MmTa13GZps0aRIAo0aNqnElbWPYsGEd5r2odgx0kiRVUURcDBxG6Rw8MjOfKc/vDvwUGAosAj6amQvact8d5UM9dKwP9tX4UD958mReePpRhvRdXdH9VFr3laWby5ZN+XuNK9l8017vUusS1EEY6NRh+EGl/fIbSKkkIkYAAzPz8IjYC7gcOLbc/H5gamZ+IiJOB04HrmzL/U+ePJnHnnqW+t5bt+VmayJWJACP/PPVGleyeeqWvFa1fQ3pu5qvD3+9avtTyy6Z2LfWJaiDMNCpw+go3z6C30BKHdjRwDUAmfl0RDROVrOBrcqvtwVeqUQB9b23Ztke763EprUJej7751qXoA7ML7vbr7b8sttApw7Fbx/bH7+BlNayHaXg1mBVRNRlZj1wH3BBRDwD1AOHNLeBiBgJjAQYMmRIhcuVVGSTJ0/msWceg/61rqQN1Jd+PfbyY7Wtoy3Mb9vNGegkSaqeBfz7KhxAfTnMAXwHuCIzb4mIfYFxwEebbiAzx5XbGD58eFa2XEmF1x/qj6hvdTFVT934tn3QQMUCXa06fXtpuf2yH5UkMQE4CZgQEXsA0xu1vQFo6BA2C9ipyrVJkgqoIoGulp2+7fDdPlWz07cktWM3A8dGxARKX2qeGRFjgAvKPz+KiDqgG/Dl2pUpSSqKSl2hq2mnbzt8tz92+pYkKN9eeXaT2aPLv/8BvLO6FUmSiq5Sgc5O35IkSWozO4y6la79d1wzvWzK35n1y0+x9fsuoe++J6yz/Myfn8ryaY9S13ML+h/1JXrt/g7quvfh9cf/j3m3XFLN0qWKqlSgs9O3JEmS2tSS5+9i4YQfA1C/YgkAC+75Ea///bdrlul/1Hl06bsty196AoBtP/R9um//Jubf8Z+snPNPolvP6hcuVVCl
Ap2dviVJUpvoWhe8a5+hfP64/XhiyizO/82ENW0H7TaIc48/gB237svkV+dz6Y0P8vzLpX7b3bvW8dn37Mcx+w5ly949eGjSDL7wv3fRrUsd551wACPePJgte/fgxZnz+f6fH+GRF2fW6i12Cl223IEdv/BXXn/8D3Tdcnu6D34LK2Y8w5zrzqV+yWtsf8bvgODVn3xovdtY9doUVsx4dq15qxe8wuoFpR483Xfcmx5v2J85158LuZqebzyYnkMPYO5NX2Pxk3+s5NuTaqZtx8z8t5uB7uVO31cAoyNiTHmEywuAyyPibuA67PQtSZJa8OOz382FHzqE7fv3WWv+Fr26c/kpRzB74RLO++V4+vbsxpWnvoMudQHA+ScexIcO2Z3fTHiOs8bdzv89VBo9unePbmzdtydjbnqYL/3ibnr36Mblpx5BXUTV31tn1GuXQ1j44C9Z9OAv6Tlkf/rufxIAs35zFrN+c1aL6/Z72ycY/NW/M/DT19BjyH7rtPc/chQrZ01m6fN3AtB98FsA6Drgjez4xbvY4Zw76PPWD7bxO5JqqyJX6Oz0LUmSGgzaqg9/Pv9E/jRxMtv378PebxjAcy/N5cu/vId5i5fxmy8cBxF87PvND6D123uf5ZmX5nLz105ca/6737ozvXt0439ufZynX5rDtfc/z1c+8Db2ecMAXnt9Gcfu90Z+cseT/HL8M2utt2DJcr78y3vWTE/856ucdPDu9O7RldeXrWz7fwCtZfEzt7Js0r0sn/YoW44YuaZfXP2SeS2uN+f/vgKrV9Kl30C2OvrLDPjwfzH9ysOhfhUA3XfYk547H8TcP1ywZp0ufbcFoK5bL2bfcC793/F5tj726yybfD+rF3lFVh2DDxaX2psu3en/zi/QZ8/3UNe7P8te/Buzr/ksdOnGVu/+Cr12O5y6XluwcvaLzL/9CpZPndhymyS1EwfvtgOX3PA39n7DAD79zn344EG78tM7n+KzV99BsP6rY3c+Na3Z+YO36QfAtDkLAXh57iIAtu/fhx237kuXujr69ezOn776QXp178rv7n+en9zx5No17b4Dx+73Rh584RXDXJXk6hWl36tKv4kuG7TeipceL796hh5v2J8tDjqFut79qX99DgB99v0guWo5S567bc06qxfNAmDRQ79i1bzpLHnmr/QceiBd+g8y0KnDqNQtl5I20dbHXUi/4R9h4UO/YtavTuf1R38PQF333nTpszXzbvk2c373Beq692bbD30foq7FNklqL257YgoTnnt5zRWzHbbuC8D8xcuZt3jZRm+ve9fS/3H1WRo7rWEEtVX1yTb9egEwcMvefO23E3h48gzOeve+7DWkfMUmgnPeuz9jP/VOnn1pLl/99b2b89bUBup69aeu91bNtnXpvyP9DjqFHkP2p/dex9Jnr+NYMWvSmjAH0PvN72TZ1IlkebAUgCXP/JWsX8WWh51Nj6EH0OetH2T14rmsnPlCxd+PVC1eoZPa2OZ0+u66zVD67PNeFtx7FYse+NlabfVLFzDn+i+umV425WH6Df8w0b13i225/PWKvVdJ2hgrVpYGvF6xajXAmr5um2rW/NIH9x237sc/XnltzRW76XMX0bVLads3PPgCT06dTd+e3Xj3vjuz0zb9eHraHC44+WCO2/+N/PdfHuWX9zxDOp52zW33iZ8AwavjTlq3sX41fd7yfrZ8x+fJFUtYPu0R5t1+5Zrmrtu+kS59tmHFy0+ttdqq16Yy5/ov0f/IUQx487tYOfMFZl/z2bVCn1R0BjqpQnrtcghz/3wRPQa/hS1HjKTv/iexcMK4cofv5j/E9NhxH6KuC3U9t2CHUbcS3Xqx6O/XsPDeq9Zarucuh9Jnn+NZ+s8H1glsLbVJUnvUv3cPiNKVuo1xx1NTOfuYffnMMfty7X3P85FD38QLr7zGc9PnMn3uIl5fuoJTjtiT+vrkI29/M8tWrOLxKbMYNqg/7ztgGH959EUenvwqb9px
GwCemz63Em9PZasXvMK0b+3daMaKtaZfHXfy+tdd+Cqv/nj9g5msmvPi2ttuZOk/7mLpP+7a+IKlgjDQtVMfeNuunPaOvRi4ZW+mzVnED299jHueeanFoZYv+vAhHD982Drb+vSPbuXxf82qwbvo3Dal03ddufN21y0GMufG0fQ78GP0P+KzLPvnA6x4+UmIOvq/64v0O+gUlk+dyNzfNxoktqU2SWrHfnTmUQTw0f9sflCU9XlpziK+9tv7OOvot3Dlae/guelzufiGvwGwaOkKvvCzuzjvfQfwg08fybQ5izjvF+OZMW8x73nrzgC8Z7838p793rhme/t/+Zdt9p4kqVoMdO3QkG378fWTDubGB1/gtsen8PHD9+DSjx3GUd+8jq5d6tYMtbx85Sq+fMKBXH7qEbzrouv48W1P8Lv7/7FmO+e8d3+27deLp6bOruG76bw2pdP3ms7bj1zHiulPsLhnP/rs9R66brUTK15+kq2P/yZ99jme+Xf+oHxL5r/vEWqpTZJqaca8xWuFpRWr6tea/n8bGOSaC1y3PzGF25+Y0uzyj/9rFh//wc3rzP/LY//iL4/9a4P2KUntnYGuQjZniOYlK1axbMUqXp2/mCenzmbmgiUsWraCFatWs3j5yvUOtTxj3mJmzFsMwF47bct+Ow/kP351D6vr/WDfntT16g8RzV6pWzrpXuqXLWKLg09jYf1q+h34MepXLmX5S4/Rbbvd6Lvv+1n81M0s/9eDdB/0ZgBWzHi2xbbOYOzYsUyePLnWZbSJSZNKz8kaNWpUjSvZfMOGDesQ70OSpPbMQFdhmzJE85yFS7ng2vu49GOH8Zlj3srylasZ9dM7WLm6fu1ttzDU8mff81b+OXM+dz/d/FDPqp2WOn3nsoXMvvZzbHX0aAZ89Iesem0ac647h9ULXqHHXscB0Gfv4+iz93Fr1pn2rb3ptt2u623rDCZPnsxjzzwG/WtdSRso/5k/9vJjta1jc82vdQGSGps+fTqLF3Xhkol9a12KyqYu6kKf6dNrXYY6AANdhTUM0fzYv2bx6Xfus9YQzeszeJt+XHjyIdz+xBSu/9s/+PChb2LMJw7npMv/yLzFy6iLYNRx+/GxEXvw6Isz1xlqeY/B23DgroO46Hf3V/S9qXmb0+kbYPm0R3n16g+vM3/J0zcz7el1bx1qra3T6A/1R9S3upiqo268j8yQJKkaDHQVtilDNL9jr53o16s7/3vXU7w4cwGLlq7g3fvuzH5v3I47n5rW6lDLJxw4jOUrV3PnU1Pb/P1IkqTiGTx4MMtWzeDrwx39uL24ZGJfeg4eXOsy1AEY6GqkpSGa//nqfAA+NmIP/vTIPzl+/11Ytbqef85csEFDLR+59xAefXEmS5avqsp7kSQVw/Tp06lbsoCez27caJKqnLolc5k+3fO1pE1noKuRloZofuAfr/C9P/2dDx3yJo7Zb2denvs6F1xzH1NmLWh1qOWdt9uSrfv24ulpL1TlfUiSJEmqHQNdhWzuEM2/ufc5fnPvc+vMb22o5X/NWuBzdCRJzRo8eDAzl3dl2R7vrXUpKuv57J8ZPHj7WpchFUqXui4cOexIzj7kbJ6a8RTf+Os31lnmw/t+mFEjRvHt27/NLc/fUoMqq8dAJ0mSJKkw/vsD/83u2+1Oj649eGrGU+u0D9piEJ868FM1qKw2DHTqMBySuX1yWGZJktTY9v225/en/Z5bnruFgX0HstegvXh+1vOcf8v5zF86n//98P8SEXzy2k82u/7vHv8dz818jhs/eeM6bd3qunHR0RfxwJQHOHr3oyv9VtoFx5WWJEmSVHUHDjmQax+/lmsfu5a37PAWTtjzBADO/cO5fPEPX1zveuP/OZ6Zr89stu38d53P6lzN1Q9dXZGa2yOv0KnDcEjm9slhmSVJUnPumnQXD0x5gCdeeYJTDziVQVsMAmD+svmbtL1PHfgp9hi4B2dcfwa9u/Vuw0rbtw4X6BySuX1y
WGZJkiQ1tnx16fFdK1evBEqDnWyOY998LIO2GMRfzvjLmnlfO+prdO/anZuevmmztt2edbhAJ0mSJKm4tuy5JUFs9JW68/54Ht26dANg2z7bcsX7ruDqB6/mrsl3VaDK9qPDBTqHZG6fHJZZkiRJG+IH7/8BBJx2zWkbtd6UeVPWvF60fBEAMxfNZOGyhW1YXfvT4QKdJBVRt7punHPYORyy8yFs0WMLprw2hf++/7957OXHADhwpwMZNWIUg7YYxIuvvcjld1/OC7NfAOCK913BXtvvRZfownMzn+PKe65k6ryptXw7kiSt16uLXuXQ/zp0zfSK1SvWmj7t2tM2aDuN12ltHx2Zo1xKUjvQq3svtuq9FVeOv5Kv3vxVenXvxXeO/Q51UUe/Hv349rHfZs7iOZx/y/n06d6HS4+7lC5R6mvw0NSHGP3n0Yy5awz77LAPnz30szV+N5IkqVq8QidJbWRznquzcNlCzr/l/DXTj05/lA/s/QF6devFUbsdRe/uvRn34DienfksNzxxA1864kvsNWgvnnjlCa5/4nq6RBd222436rOe6fN97p8kSZ2FgU6S2tiBQw5kzF1j2OvVvTj1gFM5Yc8T+MXEX3DuH86FaH39tw15G+/e/d08PO1hFq9YzI5b7gjAS/NfAuCVha8AMLDvQABO3OdEzj38XAAenPogV/3tqgq8K0mS1B4Z6CSpjW3qc3Xqoo6zDzmbj7z1Izz+8uNceOuFAGtG7KrPegAyE4DVuRqAO164g2defYZh2w5j1IhRXPTui9a62idJ6pymT58OC6BuvL2s2pX5MD3b7m4aA50ktbFNfa7OV478Cse86RiueuAqfvvob0lKwW3267MB2GGLHZg0Z9KaK3YvL3gZgAXLFrBg2QKen/U8hww9hIOHHtym70eSJLVfLQa6iNgOmJNZ+lo4IvYGdgP+kJk+JVqSNkJLz9XZZZtdOG6P47jtH7fxyPRH2H273QF4ftbz3D35bs446AxGHjyS65+4npPfcjKT5kzi+VnP85Yd3sIeA/fg2ZnPMmiLQQzfaThPz3i6yu+s8/H8KKkIBg8ezOyYTf0R9bUuRY3Uja9j8I6D22x7rV2h+wswHCAi3gZ8HXgYOAY4o82qkKROoKXn6uyyzS4AHL370Ry9+9Fr5h/6X4cyfcF0vnnbNzn9badz2XGX8fys5xlz1xgAlq5cynFvPo4zDjqDJSuWcP+/7ueH9/+wKu+nk/P8KElqF1oLdAuyobMGnA98JDMXR8TtFa5Lkgpnc56rc9sLt3HbC7ett/3OSXdy56Q715n/wuwX+PhvP75pBWtzeH6UJLULrfWQXBER/SPiJODezFxcnt+3wnVJktSebfL5MSIujoh7IuL+iNizSdsnI+LBcts7K1G4JKljae0K3YXAH4HJwOkAETEE8CFHkqTObJPOjxExAhiYmYdHxF7A5cCx5bY9gRHAIQ198yRJak2LgS4zHwYOazJvGnByJYuSJKk924zz49HANeXln46IrRu1fRqYCtwVEbOAz2TmnKYbiIiRwEiAIUOGbPJ7kCR1DK2NcnkEcAWwBJgNnNncyUWSpM5kM86P25WXb7AqIurKV+R2BW7NzCPKt3J+A/h80w1k5jhgHMDw4cOzabu0PtNe78IlE4vda2bmklJvoYG9i38Re9rrXdit1kWoQ2jtlstvA0dl5ryI2B/4LvCpypclSRvOB6e2Q/Pb9qGp7dCmnh8XAFs1mq5vdHvlKuCW8uubgbPbqlhp2LBhtS6hTayYNAmAnkN3rXElm283Os5xUW21FuhWZOY8gMx8JCLeUIWaJElq7zb1/DgBOAmYEBF7sHafu79R6k/3Q+AI4Mm2K1ed3ahRo2pdQptoeB9jx46tcSVS+9FaoHtjRHyn0fSwhunMPL9yZUnShvPBqe1PWz80tR3a1PPjzcCxETEBWAScGRFjgAuAHwE/i4iTKV3J844YSVKrWgt0pwAJDAIGA+cAr1W4
JkmS2rtNOj+Wb69seivl6PLvFTjomCRpI7UW6B4FfgZ0B14EjqTUmfusCtclSVJ75vlRktQutBbovgv8IjP/1DCjPPLWpcAXK1mYJEntmOdHSVK70NqQcG9qfLICyMwbgH0qV5IkSe2e50dJUrvQ2hW61euZH21diCRJBVLY82Pdktfo+eyfa13GZotlCwHInlvUuJLNU7fkNWD7WpchqcBaC3RzI2LfzHy8YUZ5mOUFrW04Ii4GDivvY2RmPtOo7ZPAmZROiBdm5p2bULskSbWyyefHWupIz7yaNGkRALvuUvQwtH2HOi6Sqq+1QHce8PuI+D/gOWB34IPAR1taKSJGAAMz8/CI2Au4nNKzdYiIPYERwCGNHqYqSVKRbNL5sdY6yrPIwOeRSVKDFvvQZeZLlMLXP4BhwBTgHZn5YivbPRq4pryNp4GtG7V9GpgK3BUR10XEtptWuiRJtbEZ50dJktpUa1foyMzlwI0bud3tKA3f3GBVRNSVr8jtCtyamUeURwT7BvD5phuIiJHASIAhQ4Zs5O4lSaqsTTw/SpLUploNdJtoAbBVo+n6RrdXrgJuKb++mXUfsApAZo4DxgEMHz48K1SnOphpr3fhkol9a13GZpu5pHTxfGDv4t+VPO31LuxW6yIkSZI6qEoFugnAScCEcifx6Y3a/kapP90PgSOAJytUgzqZjtSpfMWkSQD0HLprjSvZfLvRsY6NJElSe1KpQHczcGxETAAWAWdGxBjgAuBHwM8i4mRKV/I+VaEa1MnY2V+SJEmdTUUCXfn2yqa3Uo4u/14BnFyJ/UqSJElSZ9LiKJeSJEmSpParUrdc1lTdktfo+eyfa13GZotlCwHInlvUuJLNV7fkNaDoD3+VJEmS2pcOF+g60uALkyYtAmDXXTpCENq+Qx0bSZIkqT3ocIHOgTEkSZIkdRb2oZMkSZKkgjLQSZIkSVJBGegkSZIkqaAMdJIkSZJUUAY6SZIkSSooA50kSZIkFZSBTpIkSZIKykAnSZIkSQVloJMkSZKkgjLQSZIkSVJBGegkSZIkqaAMdJIkSZJUUAY6SZIkSSooA50kSZIkFZSBTpIkSZIKqmutC5CkNjEf6sZ3gO+oXi//7lvTKjbffGDHWhchSVLHZ6CTVHjDhg2rdQltZtKkSQDsuuOuNa5kM+3YsY6LJEntlYFOUuGNGjWq1iW0mYb3Mnbs2BpXokqJiIuBwyidg0dm5jNN2gcC/wK2zsxlNShRklQgHeD+JEmSiiEiRgADM/Nw4Ezg8mYW+wowp6qFSZIKy0AnSVL1HA1cA5CZTwNbN26MiP2ABF6sfmmSpCIy0EmSVD3bAbMbTa+KiDqAiOgDXAZc1NIGImJkREyMiImzZ89uaVFJUidgoJMkqXoWAFs1mq7PzPry6+8BYzJzYUsbyMxxmTk8M4cPGDCgUnVKkgrCQCdJUvVMAE4CiIg9gOnl19sB+wNnRMS1wB7Az2tUoySpQBzlUpKk6rkZODYiJgCLgDMjYgxwQWYOb1goIsYDp9WkQklSoRjoJEmqkvLtlWc3mT26meWOqEpBkqTC85ZLSZIkSSooA50kSZIkFZSBTpIkSZIKykAnSZIkSQVloJMkSZKkgjLQSZIkSVJBGegkSZIkqaAMdJIkSZJUUAY6SZIkSSooA50kSZIkFZSBTpIkSZIKykAnSZIkSQVVsUAXERdHxD0RcX9E7NlM+8CIWBIRPStVgyRJkiR1ZBUJdBExAhiYmYcDZwKXN7PYV4A5ldi/JEmSJHUGlbpCdzRwDUBmPg1s3bgxIvYDEnixQvuXJEmSpA6vUoFuO2B2o+lVEVEHEBF9gMuAi1raQESMjIiJETFx9uzZLS0qSZIkSZ1SpQLdAmCrRtP1mVlffv09YExmLmxpA5k5LjOHZ+bwAQMGVKhMSZIkSSqurhXa7gTgJGBCROwBTAeIiO2A/YEtI+IMYA/g58BHKlSHJEmSqmDs2LFMnjy5ovuYNGkSAKNG
jarofgCGDRtWlf1Im6tSge5m4NiImAAsAs6MiDHABZk5vGGhiBgPnFahGiRJktSB9OrVq9YlSO1ORQJd+fbKs5vMHt3MckdUYv+SJEmqLq9mSbXhg8UlSZIkqaAMdJIkSZJUUAY6SZIkSSooA50kSZIkFZSBTpIkSZIKykAnSZIkSQVloJMkSZKkgjLQSZIkSVJBGegkSZIkqaAMdJIkSZJUUAY6SZIkSSooA50kSZIkFZSBTpIkSZIKykAnSZIkSQVloJMkqYoi4uKIuCci7o+IPRvN3ycibouICRFxXUR0r2WdkqRiMNBJklQlETECGJiZhwNnApc3ak7g+MwcAUwFTqhBiZKkgjHQSZJUPUcD1wBk5tPA1g0NmflUZi4vT84DFje3gYgYGRETI2Li7NmzK12vJKmdM9BJklQ92wGNU9iqiFjrXBwRhwJ7An9tbgOZOS4zh2fm8AEDBlSuUklSIXStdQGSJHUiC4CtGk3XZ2Y9QEQEMBroBpySmatrUJ8kqWC8QidJUvVMAE4CiIg9gOmN2s4CZmTmxYY5SdKGMtBJklQ9NwPdI2ICcAUwOiLGlEe0PB44MyLGl3/OrWmlkqRC8JZLSZKqpHx75dlNZo8u/z62yuVIkjoAA50kSWozY8eOZfLkyRXfz6RJkwAYNWpURfczbNiwiu9DkjaHgU6SJBVOr169al2CJLULBjpJktRmvJolSdXloCiSJEmSVFAGOkmSJEkqKAOdJEmSJBWUgU6SJEmSCspAJ0mSJEkFZaCTJEmSpILysQXSRvCBuZIkSWpPDHRSO+QDcyVJkrQhDHTSRvBqliRJktoT+9BJkiRJUkEZ6CRJkiSpoAx0kiRJklRQ9qHbRNUY7dCRDiVJkiS1xEDXjjnSoSRJkqSWGOg2kVe0JEmSJNWafegkSZIkqaAqFugi4uKIuCci7o+IPRvN3ycibouICRFxXUR0r1QNkiRJktSRVSTQRcQIYGBmHg6cCVzeqDmB4zNzBDAVOKESNUiSJElSR1epPnRHA9cAZObTEbF1Q0NmPtVouXnA4grVIEmSJEkdWqVuudwOmN1oelVErLWviDgU2BP4a3MbiIiRETExIibOnj27uUUkSZIkqVOrVKBbAGzVaLo+M+sBouQrwJHAKZm5urkNZOa4zByemcMHDBhQoTIlSZIkqbgqFegmACcBRMQewPRGbWcBMzLz4vWFOUmSJElS6yoV6G4GukfEBOAKYHREjCmPaHk8cGZEjC//nFuhGiRJkiSpQ6vIoCjl2yvPbjJ7dPn3sZXYpyRJkiR1Nj5YXJIkSZIKykAnSZIkSQVloJMkSZKkgjLQSZIkSVJBGegkSZIkqaAMdJIkSZJUUAY6SZKqKCIujoh7IuL+iNiz0fy+EXFNRNwbETdFxBa1rFOSVAwGOkmSqiQiRgADM/Nw4Ezg8kbNXwT+lJmHAbez7vNcJUlaR0UeLC5JHc3YsWOZPHlyxfczadIkAEaNGlXR/QwbNqzi+1CzjgauAcjMpyNi60ZtRwKXlV//HriqyrVJ6ojmQ934Cl/DeR1YVdldVFVXoG8Ftz8f2LHtNmegk6R2pFevXrUuQZW1HTC70fSqiKjLzHqgR2auLM+fC2zV3AYiYiQwEmDIkCGVrFVSwQ0bNqwq+5k+fTpLly6tyr6qoVevXgzecXDldrBj2x4bA50kbQCvZqmNLGDtoFZfDnMA9Y3C3VasHfzWyMxxwDiA4cOHZyWLlVRsnrs6B/vQSZJUPROAkwAiYg9geqO2h4ATyq9PBO6obmmSpCIy0EmSVD03A90jYgJwBTA6IsZERHfgUmBkRIwH9gd+VrsyJUlF4S2XkiRVSfl2yqajV44u/54DvKe6FUmSis4rdJIkSZJUUAY6SZIkSSooA50kSZIkFZSBTpIkSZIKykAnSZIkSQVloJMkSZKkgjLQSZIkSVJBRWbWuoZWRcRsYGqt66iRbSk9m0idj8e+8+rMx/4NmTmg1kUURSc/
P0Ln/lvpzDzunVdnP/bNniMLEeg6s4iYmJnDa12Hqs9j33l57KUN499K5+Rx77w89s3zlktJkiRJKigDnSRJkiQVlIGu/RtX6wJUMx77zstjL20Y/1Y6J4975+Wxb4Z96CRJkiSpoLxCV2MR0Sci3tnKMidUqx5VV0T0i4h3tLKMx1+S1OlFxIERMaiVZY6vVj2qjojYfwOX67TH3kBXJRHx84gY3Gh6cET8HNgK+ER53g4RcXNE3BURV0VE1/Lin29huyMj4jOVrF1tIyIuLB/bCRHxnfLsbYCPlds9/pLUBjrzB7uOICKuj4jxETG3/Ht8RAwBjgV2KS9zR5Ofr5RX/2zNCtdmiYiDI+Lu8vG8KSK2Kzd9u8lyHvsmura+iKrou8A3MnNiRHwJOBX4KdAvIs4CHs7MRxsWjohhwHGll3FbZk6uSdVqVUQcB/TPzCPL02Mj4n3Ak40W8/h3AhHxQWB9IfzHmXl9k+VHAl0z80cVL05qZyJiBPD1RrPqgEsz865y+x1NVrkjMy+j9MHuT9WpUm0tM08GiIjHgQ9l5qzydNPl3lX14lRJlwMnZObc8t/+t4CzYM3f+n9l5h/AY9+Uga59GZCZE8uvfwr8T/n3SuB5YC5ARHwUOAyYD3yc0gnuwojYGrgvM39a5brVujcDdzaavg3YnbUDnce/E8jMGyPiLuDdmfk7gIg4GbgzM19rvKyhXeJySn8rCwAiYkvg7og4IDNXgx/sOqqI+BDwKnBlRJyamfXNLLNvo8lpTf8PVeEsycy55dePA19oaGj6d+6xX5uBrrbeBJzWaHp1o9cLgX7l18syc3yjtoeB/8vMZY3mfSkiegODUXt0O/DNiLiPUgA7Ezi/yTIe/86jG3A88Lvy9HuBCQ2NhnZpjaXAgRFxP5DAAcDyhjAHfrDraCJiC+A/gEGU/m/8BHBLRDScM8eUz6WU2xv8GXgN2C4ibgJ+mZk3VqdqtZFbIuKbwEPAh4DvNTQ0vUKHx34tBrraWgRMAYaVp7tGRF35W6jdgKnl+d3LJ6wZwF7AV2HdWw8aRMR3M/O2ypWtjZWZT0TEfwK/BwK4MDOfioihjRbz+Hcei4G+jab7luc1MLRLJR+hdPvkZyj93/kMcGKTZfxg17EsAe7OzDsj4hxKd6v8lX9/6Tk6M++LiDsy85Jm1p+Vme+vTqlqS5n5/fLnop2Bcxt9OTMzM09tsqzHvhEDXXUdERELKH14mwa8DIwHGi4j30jpW/jfAJcCF5Xn9wKOAe7NzDtpdOteRHycUv+an1ehfm2GzLwnInaidLzub2YRj38nEBFnAx8GtoiIxyh9SK0H/hQRNwDPYWiXAMjMmcCFrSzjB7sOJDNX8e/z3NuBqzPzFVj3/8SIeC+l/0O7Ubqr5S/Vq1QV0gM4F+gdpQMelMYYWIvHfm0Guur5BbArpX/z14FlTRfIzKsi4kRKg2F8JzOfKDctKHfyVgFFRH/gPUAXSienHuUrLT1p1IfO4985ZOb/UPrGuaVAbmhXpxYRRwFfazRrIKUPb682mndZeVk/2HUgEfEW4CBKx3QX4IsRUQesarLopZRuy6wvt82gdIuuiu1/gLMy8wWAiOgL3BUR92TmkvIyHvsmDHRVkpl3A3c3TEejRxg0We73lG7LU8exktItJAn8EVgBLC/PW9F4QY+/JEFm3k6p7zGw/i82ImIlfrDraGYBT1E6np+idL5cRqlv+ecaFirfsbKO9d3ZoMJISn/P61/AY78OA13xzaR05UftVGYuBv7QXFuTPnSbwuNfIM1cdWiYf1qjycsy89aqFSUVmB/sOp7MnEEpmK/D49opfAb4z4joU54OSo90WtLCOp1eZGata5AkrUc5BHYx5Kkz29hbjyPi1sw8prJVSVL7YKCTJEntWkQMAuoy8+Va1yJJ7Y2BTpIkSZIKqq7WBUiSJEmSNo2BTpIkSZIKykAnSZIkSQVloJMkSVLVRMTQiLi2ybx9y6OZEhG7
rO95vZLW5XPopBopP4PuR8ACYCdKz935GDAaOJrS8+UezsxzImIY8BNKf7P3ZObXa1K0JEkVkJmPA4+XJz8BPAhMr1U9UpF4hU6qrTcDZ2bm24FngI8CQ4HDMvMQoFtEHA8cB/w6M0cAF9aqWEmS2ki/iPh1RDwaET+IiCMi4rKIOA44DfhuRJzbdKWIuCoihpdfXx0Rp5dfXxERB0XEsRFxZ0Q8FBHfioieETExyk8lj4jTI+LsKr5PqeIMdFJtPZyZC8uvHwKGADfnv58ncgfwJkpX5wZFxPeA3atfpiRJberNwJnA/sDhQH+AzLwZ+DnwH5n5vWbWuwk4JiK6AH2BI8vz96F0Hv17Zr4TOAQ4EVhB6VzasNyHgV+2+buRashbLqXa2jsiembmMkpX4W4v/76x3H4k8AcgM/OSiOgD3AYcWpNqJUlqGxMzczFARPyDcqDbAHcBXwBGUDofvjsidgb+lZkZEcdFxN6UglxvoDvwQ+DbEbEAeLJhv1JHYaCTamsG8OuI2J7Sye0PEXFgRPwNWA7ckZl3RMSny7eVLAN+VcuCJUlqA/WNXmeTttVAj+ZWyswVETEXOAX4MtAL+BZwTXmRz2Xm8IjoC5xaXuel8i2X5wHnt91bkNoHA51UW7Mz8yONZ2Tm15oulJk/BX5ataokSaqdu4CfRcTgzPxhM+1/Bk7LzLkR8SdKge70cttDETEReASY1midayn1WX+xkoVLtRD/7qojqZrKo1xe1jTQSZKkthUR/wNcl5l317oWqa0Z6CRJktTulLsjXNtk9scy8+WN3M4DlB8D1Fa1Se2JgU6SJEmSCsrHFkiSJElSQRnoJEmSJKmgDHSSJEmSVFAGOkmSJEkqKAOdJEmSJBXU/wcxLY1nYbmmGQAAAABJRU5ErkJggg==\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "plt.figure(figsize=(15,5)) # 그래프 크기 조정\n", "plt.subplot(1,2,1) # 1행 2열의 첫번째(1행, 1열) 그래프\n", "ax = sns.boxplot(x='pos', y='OPS', data = regular_season_df, showfliers=False)\n", "\n", "# position 별 OPS 중앙값\n", "medians = regular_season_df.groupby(['pos'])['OPS'].median().to_dict()\n", "\n", "# position별 관측치 수\n", "nobs = regular_season_df['pos'].value_counts().to_dict()\n", "\n", "# 키 값을 'n: 값' 형식으로 변환\n", "for key in nobs: nobs[key] = \"n: \" + str(nobs[key])\n", "\n", "# 그래프의 Xticks text 값 얻기\n", "xticks_labels = [item.get_text() for item in ax.get_xticklabels()]\n", "\n", "# tick은 tick의 위치, label은 그에 해당하는 text 값\n", "for label in ax.get_xticklabels():\n", " ax.text(xticks_labels.index(label.get_text()), \n", " medians[label.get_text()] + 0.03, nobs[label.get_text()],\n", " horizontalalignment='center', size='large', color='w', weight='semibold')\n", " \n", "ax.set_title('포지션별 OPS')\n", "\n", "plt.subplot(1,2,2) # 1행 2열의 두 번째(1행, 2열) 그래프\n", "ax = sns.boxplot(x='hit_way', y='OPS', data = regular_season_df, showfliers=False)\n", "\n", "# 타자 방향별 OPS 중앙값\n", "medians = regular_season_df.groupby(['hit_way'])['OPS'].median().to_dict()\n", "# 타자 방향 관측치 수\n", "nobs = regular_season_df['hit_way'].value_counts().to_dict()\n", "# 키 값을 'n: 값' 형식으로 변환\n", "for key in nobs: nobs[key] = \"n: \" + str(nobs[key])\n", "\n", "# 그래프의 Xticks text 값 얻기\n", "xticks_labels = [item.get_text() for item in ax.get_xticklabels()]\n", "\n", "# tick은 tick의 위치, label은 그에 해당하는 text 값\n", "for label in ax.get_xticklabels():\n", " ax.text(xticks_labels.index(label.get_text()), medians[label.get_text()] + 0.03,\n", " nobs[label.get_text()], horizontalalignment='center', size='large',\n", " color='w', weight='semibold')\n", "ax.set_title('타석방향별 OPS')\n", "\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 쿠바 Ciego de Avila 
Maximo Gomez Baez(대)\n", "1 광주대성초-광주동성중-광주동성고\n", "2 광주대성초-광주동성중-광주동성고\n", "3 광주대성초-광주동성중-광주동성고\n", "4 광주대성초-광주동성중-광주동성고\n", "Name: career, dtype: object" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "regular_season_df['career'].head()" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['쿠바', '도미니카', '네덜란드', '캐나다', '미국']" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# First token of each career string, treating both '-' and ' ' as separators.\n", "# The .str accessor leaves a missing career as NaN instead of raising on .replace().\n", "foreign_country = (regular_season_df['career']\n", "                   .str.replace('-', ' ', regex=False)\n", "                   .str.split(' ')\n", "                   .str[0])\n", "\n", "# A first token containing '초' is excluded (presumably a Korean elementary-school\n", "# name, as in the career strings shown above); missing tokens are excluded as well.\n", "is_excluded = foreign_country.str.contains('초', regex=False, na=True)\n", "\n", "# unique() keeps first-appearance order, so the list is the same on every run\n", "# (the order obtained by iterating a set of strings is not guaranteed to be).\n", "foreign_country_list = foreign_country[~is_excluded].unique().tolist()\n", "foreign_country_list" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
country
0foreign
1korean
2korean
3korean
4korean
\n", "
" ], "text/plain": [ " country\n", "0 foreign\n", "1 korean\n", "2 korean\n", "3 korean\n", "4 korean" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "regular_season_df['country'] = foreign_country\n", "regular_season_df['country'] = regular_season_df['country'].apply(\n", " lambda x: x if pd.isnull(x)\n", " else ('foreign' if x in foreign_country_list else 'korean'))\n", "regular_season_df[['country']].head()" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "plt.figure(figsize=(15,5)) # 그래프 크기 조정\n", "ax = sns.boxplot(x='country', y='OPS', data = regular_season_df, showfliers=False)\n", "\n", "# 내외국인 별 OPS 중앙값 dict\n", "medians = regular_season_df.groupby(['country'])['OPS'].median().to_dict()\n", "# 내외국인 관측치 수 dict\n", "nobs = regular_season_df['country'].value_counts().to_dict()\n", "# 키 값을 'n: 값' 형식으로 변환 \n", "for key in nobs: nobs[key] = \"n: \" + str(nobs[key])\n", "\n", "# 그래프의 Xticks text 값 얻기\n", "xticks_labels = [item.get_text() for item in ax.get_xticklabels()]\n", " \n", "for label in ax.get_xticklabels(): # tick은 tick의 위치, label은 그에 해당하는 text 값 \n", " ax.text(xticks_labels.index(label.get_text()), medians[label.get_text()] + 0.03, \\\n", " nobs[label.get_text()], # x 좌표, y 좌표, 해당 text\n", " horizontalalignment='center', size='large', color='w', weight='semibold') \n", "ax.set_title('국적별 OPS')\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "10000만원 177\n", "6000만원 117\n", "3000만원 105\n", "9000만원 97\n", "5000만원 91\n", "8000만원 89\n", "30000만원 74\n", "4000만원 62\n", "12000만원 62\n", "18000만원 54\n", "7000만원 53\n", "11000만원 49\n", "13000만원 48\n", "20000만원 46\n", "25000만원 45\n", "15000만원 41\n", "16000만원 28\n", "14000만원 26\n", "28000만원 20\n", "43000만원 17\n", "45000만원 16\n", "27000만원 15\n", "21000만원 13\n", "23000만원 12\n", "33000만원 10\n", "6500만원 10\n", "100000달러 4\n", "300000달러 3\n", "50000달러 2\n", "17000만원 1\n", "Name: starting_salary, dtype: int64" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "regular_season_df['starting_salary'].value_counts()" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\HOME\\anaconda3\\envs\\store_amount_prediction\\lib\\site-packages\\seaborn\\distributions.py:2551: 
FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).\n", " warnings.warn(msg, FutureWarning)\n", "C:\\Users\\HOME\\anaconda3\\envs\\store_amount_prediction\\lib\\site-packages\\seaborn\\_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.\n", " FutureWarning\n" ] }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "# 결측치라면 그대로 0으로 두고 ‘만원’이 포함되어 있다면 숫자만 뽑아서 초봉으로 넣어준다. 그외 만 원 단위가 아닌 초봉은 결측치로 처리한다.\n", "import re\n", "regular_season_df['starting_salary'] = regular_season_df['starting_salary'].apply(\n", " lambda x: x if pd.isnull(x)\n", " else(int(re.findall('\\d+',x)[0]) if '만원' in x else np.nan))\n", "\n", "plt.figure(figsize=(15,5)) # 그래프 크기 조정\n", "plt.subplot(1,2,1) # 1행 2열의 첫 번째(1행, 1열) 그래프\n", "b=sns.distplot(regular_season_df['starting_salary']. \\\n", " loc[regular_season_df['starting_salary'].notnull()], hist=True)\n", "b.set_xlabel(\"starting salary\",fontsize=12)\n", "b.set_title('초봉의 분포', fontsize=20)\n", "\n", "plt.subplot(1,2,2) # 1행 2열의 두 번째(1행, 2열) 그래프\n", "\n", "# 정규시즌과 프리시즌의 상관관계 계산\n", "correlation = regular_season_df['starting_salary'].corr(regular_season_df['OPS'])\n", "b = sns.scatterplot(regular_season_df['starting_salary'], regular_season_df['OPS'])\n", "b.axes.set_title('correlation(상관계수): '+str(np.round(correlation,2)), fontsize=20)\n", "b.set_ylabel(\"정규시즌 OPS\",fontsize=12)\n", "b.set_xlabel(\"초봉\",fontsize=12)\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 1.2.3. 일별 데이터 분석" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(112273, 20)" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
batter_idbatter_namedateopposing_teamavg1ABRH2B3BHRRBISBCSBBHBPSOGDPavg2year
00가르시아3.24NC0.33331100000010100.3332018
10가르시아3.25NC0.00040000000000100.1432018
20가르시아3.27넥센0.20050100000000000.1672018
30가르시아3.28넥센0.20051100010000000.1762018
40가르시아3.29넥센0.25040100030000010.1902018
\n", "
" ], "text/plain": [ " batter_id batter_name date opposing_team avg1 AB R H 2B 3B HR \\\n", "0 0 가르시아 3.24 NC 0.333 3 1 1 0 0 0 \n", "1 0 가르시아 3.25 NC 0.000 4 0 0 0 0 0 \n", "2 0 가르시아 3.27 넥센 0.200 5 0 1 0 0 0 \n", "3 0 가르시아 3.28 넥센 0.200 5 1 1 0 0 0 \n", "4 0 가르시아 3.29 넥센 0.250 4 0 1 0 0 0 \n", "\n", " RBI SB CS BB HBP SO GDP avg2 year \n", "0 0 0 0 1 0 1 0 0.333 2018 \n", "1 0 0 0 0 0 1 0 0.143 2018 \n", "2 0 0 0 0 0 0 0 0.167 2018 \n", "3 1 0 0 0 0 0 0 0.176 2018 \n", "4 3 0 0 0 0 0 1 0.190 2018 " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Load the day-by-day batting records, then show the frame's shape and its first rows\n", "day_by_day_df = pd.read_csv('./input/Regular_Season_Batter_Day_by_Day_b4.csv')\n", "display(day_by_day_df.shape)\n", "display(day_by_day_df.head())" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
yearmonthavg2
02001100.356400
1200140.205217
2200150.297157
3200160.306926
4200170.293171
............
129201850.274083
130201860.280630
131201870.280817
132201880.283923
133201890.277841
\n", "

134 rows × 3 columns

\n", "
" ], "text/plain": [ " year month avg2\n", "0 2001 10 0.356400\n", "1 2001 4 0.205217\n", "2 2001 5 0.297157\n", "3 2001 6 0.306926\n", "4 2001 7 0.293171\n", ".. ... ... ...\n", "129 2018 5 0.274083\n", "130 2018 6 0.280630\n", "131 2018 7 0.280817\n", "132 2018 8 0.283923\n", "133 2018 9 0.277841\n", "\n", "[134 rows x 3 columns]" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Derive the month from `date`: keep the text that precedes the first '.'.\n", "# NOTE: month stays a string, so it sorts lexicographically ('10' comes before '3').\n", "day_by_day_df['month'] = day_by_day_df['date'].map(lambda day: str(day).partition('.')[0])\n", "\n", "# Mean cumulative batting average (avg2) for every (year, month) pair\n", "monthly_avg2 = day_by_day_df.groupby(['year', 'month'])['avg2'].mean()\n", "agg_df = monthly_avg2.reset_index()\n", "agg_df" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
year200120022003200420052006200720082009201020112012201320142015201620172018
month
100.3564000.2690650.2165830.203636NaN0.2609850.2498880.2496380.033333NaN0.2435260.2469490.2578410.2735370.2740420.2825470.2802890.277482
3NaNNaNNaNNaNNaN0.2617140.2617140.271982NaN0.239861NaNNaN0.2312360.2105980.2144850.2578570.1619790.238015
40.2052170.3197920.2502960.2596630.2353170.2671060.2157030.2615310.2525460.2629530.2471330.2341990.2679940.2599180.2551750.2667110.2594300.263953
50.2971570.2679900.2414910.2379540.2535270.2642830.2373290.2625350.2808420.2729340.2508770.2478440.2683550.2738990.2613070.2752400.2743740.274083
60.3069260.2758670.2522900.2488000.2499130.2643920.2606000.2707660.2787810.2747910.2632640.2545770.2705330.2834800.2689990.2763070.2790600.280630
70.2931710.2666500.2442300.2519730.2563960.2624640.2591710.2648700.2750540.2655010.2648290.2615130.2628120.2756770.2726850.2831920.2845650.280817
80.3034890.2704810.2523190.2494600.2435700.2653690.2702580.2651730.2717960.2710750.2620480.2580690.2681220.2820250.2723770.2831050.2832830.283923
90.3086360.2483330.2437800.2039530.2370580.2587940.2510220.2529420.2644680.2653120.2585000.2512320.2605710.2724110.2716290.2765130.2732130.277841
\n", "
" ], "text/plain": [ "year 2001 2002 2003 2004 2005 2006 2007 \\\n", "month \n", "10 0.356400 0.269065 0.216583 0.203636 NaN 0.260985 0.249888 \n", "3 NaN NaN NaN NaN NaN 0.261714 0.261714 \n", "4 0.205217 0.319792 0.250296 0.259663 0.235317 0.267106 0.215703 \n", "5 0.297157 0.267990 0.241491 0.237954 0.253527 0.264283 0.237329 \n", "6 0.306926 0.275867 0.252290 0.248800 0.249913 0.264392 0.260600 \n", "7 0.293171 0.266650 0.244230 0.251973 0.256396 0.262464 0.259171 \n", "8 0.303489 0.270481 0.252319 0.249460 0.243570 0.265369 0.270258 \n", "9 0.308636 0.248333 0.243780 0.203953 0.237058 0.258794 0.251022 \n", "\n", "year 2008 2009 2010 2011 2012 2013 2014 \\\n", "month \n", "10 0.249638 0.033333 NaN 0.243526 0.246949 0.257841 0.273537 \n", "3 0.271982 NaN 0.239861 NaN NaN 0.231236 0.210598 \n", "4 0.261531 0.252546 0.262953 0.247133 0.234199 0.267994 0.259918 \n", "5 0.262535 0.280842 0.272934 0.250877 0.247844 0.268355 0.273899 \n", "6 0.270766 0.278781 0.274791 0.263264 0.254577 0.270533 0.283480 \n", "7 0.264870 0.275054 0.265501 0.264829 0.261513 0.262812 0.275677 \n", "8 0.265173 0.271796 0.271075 0.262048 0.258069 0.268122 0.282025 \n", "9 0.252942 0.264468 0.265312 0.258500 0.251232 0.260571 0.272411 \n", "\n", "year 2015 2016 2017 2018 \n", "month \n", "10 0.274042 0.282547 0.280289 0.277482 \n", "3 0.214485 0.257857 0.161979 0.238015 \n", "4 0.255175 0.266711 0.259430 0.263953 \n", "5 0.261307 0.275240 0.274374 0.274083 \n", "6 0.268999 0.276307 0.279060 0.280630 \n", "7 0.272685 0.283192 0.284565 0.280817 \n", "8 0.272377 0.283105 0.283283 0.283923 \n", "9 0.271629 0.276513 0.273213 0.277841 " ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Reshape to months (rows) x years (columns) holding the mean avg2.\n", "# The table is pivoted straight from day_by_day_df (aggfunc='mean' reproduces the\n", "# per-(year, month) mean) rather than from agg_df itself, so re-running this cell\n", "# does not fail on an already-pivoted agg_df.\n", "agg_df = day_by_day_df.pivot_table(index='month', columns='year', values='avg2', aggfunc='mean')\n", "agg_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "연도별 월 평균 타율" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, 
"outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
year20112012201320142015201620172018
month
40.2471330.2341990.2679940.2599180.2551750.2667110.2594300.263953
50.2508770.2478440.2683550.2738990.2613070.2752400.2743740.274083
60.2632640.2545770.2705330.2834800.2689990.2763070.2790600.280630
70.2648290.2615130.2628120.2756770.2726850.2831920.2845650.280817
80.2620480.2580690.2681220.2820250.2723770.2831050.2832830.283923
90.2585000.2512320.2605710.2724110.2716290.2765130.2732130.277841
\n", "
" ], "text/plain": [ "year 2011 2012 2013 2014 2015 2016 2017 \\\n", "month \n", "4 0.247133 0.234199 0.267994 0.259918 0.255175 0.266711 0.259430 \n", "5 0.250877 0.247844 0.268355 0.273899 0.261307 0.275240 0.274374 \n", "6 0.263264 0.254577 0.270533 0.283480 0.268999 0.276307 0.279060 \n", "7 0.264829 0.261513 0.262812 0.275677 0.272685 0.283192 0.284565 \n", "8 0.262048 0.258069 0.268122 0.282025 0.272377 0.283105 0.283283 \n", "9 0.258500 0.251232 0.260571 0.272411 0.271629 0.276513 0.273213 \n", "\n", "year 2018 \n", "month \n", "4 0.263953 \n", "5 0.274083 \n", "6 0.280630 \n", "7 0.280817 \n", "8 0.283923 \n", "9 0.277841 " ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "# Keep only April-September of 2011 onwards, selected by label rather than by position.\n", "# The month labels may be strings (then '10' and '3' sort ahead of '4'), so they are\n", "# converted to int before filtering instead of relying on the row order.\n", "month_numbers = agg_df.index.astype(int)\n", "recent_df = agg_df.loc[(month_numbers >= 4) & (month_numbers <= 9), 2011:]\n", "\n", "display(recent_df)\n", "plt.plot(recent_df, marker = 'o', markersize=4)\n", "plt.grid(axis='y', linestyle='-', alpha=0.4)\n", "plt.legend(recent_df.columns, loc='center left', bbox_to_anchor=(1, 0.5)) # legend placed outside the axes\n", "plt.title('연도별 월 평균 타율')\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1.3. 데이터 전처리" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
batter_idbatter_nameyearteamavgGABRH2B...positioncareerstarting_salaryOPSweightheightweight_per_heightposhit_waycountry
000002600000...80201076268028028028028020
\n", "

1 rows × 35 columns

\n", "
" ], "text/plain": [ " batter_id batter_name year team avg G AB R H 2B ... position \\\n", "0 0 0 0 0 26 0 0 0 0 0 ... 802 \n", "\n", " career starting_salary OPS weight height weight_per_height pos \\\n", "0 0 1076 26 802 802 802 802 \n", "\n", " hit_way country \n", "0 802 0 \n", "\n", "[1 rows x 35 columns]" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Number of missing values in each column, laid out as a single wide row\n", "missing_per_column = regular_season_df.isna().sum()\n", "missing_per_column.to_frame().T" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
batter_idyearavgGABRH2B3BHR...SOGDPSLGOBPEstarting_salaryOPSweightheightweight_per_height
0020180.339501832762908...2530.5190000.3830009NaN0.90200093.0177.00.525424
1213820050.127396398200...1510.1587300.2567573NaN0.415487NaNNaNNaN
1313820060.139373665200...1400.1944440.3260874NaN0.520531NaNNaNNaN
1413820070.0008430000...210.0000000.0000000NaN0.000000NaNNaNNaN
1513820080.0002100000...000.0000000.0000000NaN0.000000NaNNaNNaN
\n", "

5 rows × 26 columns

\n", "
" ], "text/plain": [ " batter_id year avg G AB R H 2B 3B HR ... SO GDP \\\n", "0 0 2018 0.339 50 183 27 62 9 0 8 ... 25 3 \n", "12 138 2005 0.127 39 63 9 8 2 0 0 ... 15 1 \n", "13 138 2006 0.139 37 36 6 5 2 0 0 ... 14 0 \n", "14 138 2007 0.000 8 4 3 0 0 0 0 ... 2 1 \n", "15 138 2008 0.000 2 1 0 0 0 0 0 ... 0 0 \n", "\n", " SLG OBP E starting_salary OPS weight height \\\n", "0 0.519000 0.383000 9 NaN 0.902000 93.0 177.0 \n", "12 0.158730 0.256757 3 NaN 0.415487 NaN NaN \n", "13 0.194444 0.326087 4 NaN 0.520531 NaN NaN \n", "14 0.000000 0.000000 0 NaN 0.000000 NaN NaN \n", "15 0.000000 0.000000 0 NaN 0.000000 NaN NaN \n", "\n", " weight_per_height \n", "0 0.525424 \n", "12 NaN \n", "13 NaN \n", "14 NaN \n", "15 NaN \n", "\n", "[5 rows x 26 columns]" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 수치형 타입의 변수 저장\n", "numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64'] # 모든 numeric(수치형) 타입\n", "num_cols = regular_season_df.select_dtypes(include=numerics).columns\n", "\n", "# 수치형 타입 변수 중 결측치가 하나라도 존재하는 행 출력\n", "# isna().sum(axis=1) -> 행 기준(각 행)의 결측치 개수\n", "# df.loc[]를 통해 결측치가 1개 이상인 행을 추출\n", "regular_season_df.loc[regular_season_df[num_cols].isna().sum(axis=1) > 0,num_cols].head()" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
batter_idbatter_nameyearteamavgGABRH2B...positioncareerstarting_salaryOPSweightheightweight_per_heightposhit_waycountry
00가르시아2018LG0.3395018327629...내야수(우투우타)쿠바 Ciego de Avila Maximo Gomez Baez(대)0.00.90293.0177.00.525424내야수우타foreign
11강경학2011한화0.00021000...내야수(우투좌타)광주대성초-광주동성중-광주동성고10000.00.00072.0180.00.400000내야수좌타korean
21강경학2014한화0.221418611192...내야수(우투좌타)광주대성초-광주동성중-광주동성고10000.00.68672.0180.00.400000내야수좌타korean
31강경학2015한화0.25712031150807...내야수(우투좌타)광주대성초-광주동성중-광주동성고10000.00.67372.0180.00.400000내야수좌타korean
41강경학2016한화0.1584610116163...내야수(우투좌타)광주대성초-광주동성중-광주동성고10000.00.48972.0180.00.400000내야수좌타korean
..................................................................
2449344황진수2014롯데0.00055000...내야수(우투양타)석천초-대헌중-공주고4000.00.00082.0181.00.453039내야수양타korean
2450344황진수2015롯데0.00022000...내야수(우투양타)석천초-대헌중-공주고4000.00.00082.0181.00.453039내야수양타korean
2451344황진수2016롯데0.0001110200...내야수(우투양타)석천초-대헌중-공주고4000.00.00082.0181.00.453039내야수양타korean
2452344황진수2017롯데0.2916011718346...내야수(우투양타)석천초-대헌중-공주고4000.00.76182.0181.00.453039내야수양타korean
2453344황진수2018롯데0.1671824641...내야수(우투양타)석천초-대헌중-공주고4000.00.56482.0181.00.453039내야수양타korean
\n", "

2454 rows × 35 columns

\n", "
" ], "text/plain": [ " batter_id batter_name year team avg G AB R H 2B ... \\\n", "0 0 가르시아 2018 LG 0.339 50 183 27 62 9 ... \n", "1 1 강경학 2011 한화 0.000 2 1 0 0 0 ... \n", "2 1 강경학 2014 한화 0.221 41 86 11 19 2 ... \n", "3 1 강경학 2015 한화 0.257 120 311 50 80 7 ... \n", "4 1 강경학 2016 한화 0.158 46 101 16 16 3 ... \n", "... ... ... ... ... ... ... ... .. .. .. ... \n", "2449 344 황진수 2014 롯데 0.000 5 5 0 0 0 ... \n", "2450 344 황진수 2015 롯데 0.000 2 2 0 0 0 ... \n", "2451 344 황진수 2016 롯데 0.000 11 10 2 0 0 ... \n", "2452 344 황진수 2017 롯데 0.291 60 117 18 34 6 ... \n", "2453 344 황진수 2018 롯데 0.167 18 24 6 4 1 ... \n", "\n", " position career starting_salary \\\n", "0 내야수(우투우타) 쿠바 Ciego de Avila Maximo Gomez Baez(대) 0.0 \n", "1 내야수(우투좌타) 광주대성초-광주동성중-광주동성고 10000.0 \n", "2 내야수(우투좌타) 광주대성초-광주동성중-광주동성고 10000.0 \n", "3 내야수(우투좌타) 광주대성초-광주동성중-광주동성고 10000.0 \n", "4 내야수(우투좌타) 광주대성초-광주동성중-광주동성고 10000.0 \n", "... ... ... ... \n", "2449 내야수(우투양타) 석천초-대헌중-공주고 4000.0 \n", "2450 내야수(우투양타) 석천초-대헌중-공주고 4000.0 \n", "2451 내야수(우투양타) 석천초-대헌중-공주고 4000.0 \n", "2452 내야수(우투양타) 석천초-대헌중-공주고 4000.0 \n", "2453 내야수(우투양타) 석천초-대헌중-공주고 4000.0 \n", "\n", " OPS weight height weight_per_height pos hit_way country \n", "0 0.902 93.0 177.0 0.525424 내야수 우타 foreign \n", "1 0.000 72.0 180.0 0.400000 내야수 좌타 korean \n", "2 0.686 72.0 180.0 0.400000 내야수 좌타 korean \n", "3 0.673 72.0 180.0 0.400000 내야수 좌타 korean \n", "4 0.489 72.0 180.0 0.400000 내야수 좌타 korean \n", "... ... ... ... ... ... ... ... 
\n", "2449 0.000 82.0 181.0 0.453039 내야수 양타 korean \n", "2450 0.000 82.0 181.0 0.453039 내야수 양타 korean \n", "2451 0.000 82.0 181.0 0.453039 내야수 양타 korean \n", "2452 0.761 82.0 181.0 0.453039 내야수 양타 korean \n", "2453 0.564 82.0 181.0 0.453039 내야수 양타 korean \n", "\n", "[2454 rows x 35 columns]" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Replace missing values with 0 in the numeric columns of the regular-season data\n", "regular_numeric_cols = regular_season_df.select_dtypes(include=numerics).columns\n", "regular_season_df[regular_numeric_cols] = regular_season_df[regular_numeric_cols].fillna(0)\n", "regular_season_df" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
batter_idbatter_namedateopposing_teamavg1ABRH2B3B...RBISBCSBBHBPSOGDPavg2yearmonth
00가르시아3.24NC0.33331100...00010100.33320183
10가르시아3.25NC0.00040000...00000100.14320183
20가르시아3.27넥센0.20050100...00000000.16720183
30가르시아3.28넥센0.20051100...10000000.17620183
40가르시아3.29넥센0.25040100...30000010.19020183
..................................................................
112268344황진수6.23LG-00000...00010000.15820186
112269344황진수6.26넥센0.00010000...00000100.15020186
112270344황진수6.27넥센0.50021110...00000100.18220186
112271344황진수6.28넥센-00000...00000000.18220186
112272344황진수6.30한화0.00020000...00000000.16720186
\n", "

112273 rows × 21 columns

\n", "
" ], "text/plain": [ " batter_id batter_name date opposing_team avg1 AB R H 2B 3B \\\n", "0 0 가르시아 3.24 NC 0.333 3 1 1 0 0 \n", "1 0 가르시아 3.25 NC 0.000 4 0 0 0 0 \n", "2 0 가르시아 3.27 넥센 0.200 5 0 1 0 0 \n", "3 0 가르시아 3.28 넥센 0.200 5 1 1 0 0 \n", "4 0 가르시아 3.29 넥센 0.250 4 0 1 0 0 \n", "... ... ... ... ... ... .. .. .. .. .. \n", "112268 344 황진수 6.23 LG - 0 0 0 0 0 \n", "112269 344 황진수 6.26 넥센 0.000 1 0 0 0 0 \n", "112270 344 황진수 6.27 넥센 0.500 2 1 1 1 0 \n", "112271 344 황진수 6.28 넥센 - 0 0 0 0 0 \n", "112272 344 황진수 6.30 한화 0.000 2 0 0 0 0 \n", "\n", " ... RBI SB CS BB HBP SO GDP avg2 year month \n", "0 ... 0 0 0 1 0 1 0 0.333 2018 3 \n", "1 ... 0 0 0 0 0 1 0 0.143 2018 3 \n", "2 ... 0 0 0 0 0 0 0 0.167 2018 3 \n", "3 ... 1 0 0 0 0 0 0 0.176 2018 3 \n", "4 ... 3 0 0 0 0 0 1 0.190 2018 3 \n", "... ... ... .. .. .. ... .. ... ... ... ... \n", "112268 ... 0 0 0 1 0 0 0 0.158 2018 6 \n", "112269 ... 0 0 0 0 0 1 0 0.150 2018 6 \n", "112270 ... 0 0 0 0 0 1 0 0.182 2018 6 \n", "112271 ... 0 0 0 0 0 0 0 0.182 2018 6 \n", "112272 ... 0 0 0 0 0 0 0 0.167 2018 6 \n", "\n", "[112273 rows x 21 columns]" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Replace missing values with 0 in the numeric columns of the day-by-day data\n", "daily_numeric_cols = day_by_day_df.select_dtypes(include=numerics).columns\n", "day_by_day_df[daily_numeric_cols] = day_by_day_df[daily_numeric_cols].fillna(0)\n", "day_by_day_df" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
batter_idbatter_nameyearteamavgGABRH2B...SLGOBPEheight/weightyear_bornpositioncareerstarting_salaryOPSnew_idx
00가르시아2018LG0.350720171...0.5500.4091177cm/93kg1985년 04월 12일내야수(우투우타)쿠바 Ciego de Avila Maximo Gomez Baez(대)NaN0.959가르시아2018
11강경학2011한화0.00042200...0.0000.5000180cm/72kg1992년 08월 11일내야수(우투좌타)광주대성초-광주동성중-광주동성고10000만원0.500강경학2011
21강경학2014한화-40200...0.0000.0000180cm/72kg1992년 08월 11일내야수(우투좌타)광주대성초-광주동성중-광주동성고10000만원0.000강경학2014
31강경학2015한화0.1301023330...0.1300.2862180cm/72kg1992년 08월 11일내야수(우투좌타)광주대성초-광주동성중-광주동성고10000만원0.416강경학2015
41강경학2016한화0.1881432461...0.2810.2120180cm/72kg1992년 08월 11일내야수(우투좌타)광주대성초-광주동성중-광주동성고10000만원0.493강경학2016
..................................................................
1388342황재균2014롯데0.40710273112...0.5930.4481183cm/96kg1987년 07월 28일내야수(우투우타)사당초-이수중-경기고-현대-우리-히어로즈-넥센-롯데-샌프란시스코6000만원1.041황재균2014
1389342황재균2015롯데0.33311308103...0.4330.3890183cm/96kg1987년 07월 28일내야수(우투우타)사당초-이수중-경기고-현대-우리-히어로즈-넥센-롯데-샌프란시스코6000만원0.822황재균2015
1390342황재균2016롯데0.31016428133...0.4290.3701183cm/96kg1987년 07월 28일내야수(우투우타)사당초-이수중-경기고-현대-우리-히어로즈-넥센-롯데-샌프란시스코6000만원0.799황재균2016
1391342황재균2018KT0.250616341...0.5000.3333183cm/96kg1987년 07월 28일내야수(우투우타)사당초-이수중-경기고-현대-우리-히어로즈-넥센-롯데-샌프란시스코6000만원0.833황재균2018
1392344황진수2014롯데0.00011100...0.0000.0000181cm/82kg1989년 02월 15일내야수(우투양타)석천초-대헌중-공주고4000만원0.000황진수2014
\n", "

1393 rows × 30 columns

\n", "
" ], "text/plain": [ " batter_id batter_name year team avg G AB R H 2B ... SLG \\\n", "0 0 가르시아 2018 LG 0.350 7 20 1 7 1 ... 0.550 \n", "1 1 강경학 2011 한화 0.000 4 2 2 0 0 ... 0.000 \n", "2 1 강경학 2014 한화 - 4 0 2 0 0 ... 0.000 \n", "3 1 강경학 2015 한화 0.130 10 23 3 3 0 ... 0.130 \n", "4 1 강경학 2016 한화 0.188 14 32 4 6 1 ... 0.281 \n", "... ... ... ... ... ... .. .. .. .. .. ... ... \n", "1388 342 황재균 2014 롯데 0.407 10 27 3 11 2 ... 0.593 \n", "1389 342 황재균 2015 롯데 0.333 11 30 8 10 3 ... 0.433 \n", "1390 342 황재균 2016 롯데 0.310 16 42 8 13 3 ... 0.429 \n", "1391 342 황재균 2018 KT 0.250 6 16 3 4 1 ... 0.500 \n", "1392 344 황진수 2014 롯데 0.000 1 1 1 0 0 ... 0.000 \n", "\n", " OBP E height/weight year_born position \\\n", "0 0.409 1 177cm/93kg 1985년 04월 12일 내야수(우투우타) \n", "1 0.500 0 180cm/72kg 1992년 08월 11일 내야수(우투좌타) \n", "2 0.000 0 180cm/72kg 1992년 08월 11일 내야수(우투좌타) \n", "3 0.286 2 180cm/72kg 1992년 08월 11일 내야수(우투좌타) \n", "4 0.212 0 180cm/72kg 1992년 08월 11일 내야수(우투좌타) \n", "... ... .. ... ... ... \n", "1388 0.448 1 183cm/96kg 1987년 07월 28일 내야수(우투우타) \n", "1389 0.389 0 183cm/96kg 1987년 07월 28일 내야수(우투우타) \n", "1390 0.370 1 183cm/96kg 1987년 07월 28일 내야수(우투우타) \n", "1391 0.333 3 183cm/96kg 1987년 07월 28일 내야수(우투우타) \n", "1392 0.000 0 181cm/82kg 1989년 02월 15일 내야수(우투양타) \n", "\n", " career starting_salary OPS new_idx \n", "0 쿠바 Ciego de Avila Maximo Gomez Baez(대) NaN 0.959 가르시아2018 \n", "1 광주대성초-광주동성중-광주동성고 10000만원 0.500 강경학2011 \n", "2 광주대성초-광주동성중-광주동성고 10000만원 0.000 강경학2014 \n", "3 광주대성초-광주동성중-광주동성고 10000만원 0.416 강경학2015 \n", "4 광주대성초-광주동성중-광주동성고 10000만원 0.493 강경학2016 \n", "... ... ... ... ... 
\n", "1388 사당초-이수중-경기고-현대-우리-히어로즈-넥센-롯데-샌프란시스코 6000만원 1.041 황재균2014 \n", "1389 사당초-이수중-경기고-현대-우리-히어로즈-넥센-롯데-샌프란시스코 6000만원 0.822 황재균2015 \n", "1390 사당초-이수중-경기고-현대-우리-히어로즈-넥센-롯데-샌프란시스코 6000만원 0.799 황재균2016 \n", "1391 사당초-이수중-경기고-현대-우리-히어로즈-넥센-롯데-샌프란시스코 6000만원 0.833 황재균2018 \n", "1392 석천초-대헌중-공주고 4000만원 0.000 황진수2014 \n", "\n", "[1393 rows x 30 columns]" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Replace missing values with 0 in the numeric columns of the preseason data\n", "preseason_numeric_cols = preseason_df.select_dtypes(include=numerics).columns\n", "preseason_df[preseason_numeric_cols] = preseason_df[preseason_numeric_cols].fillna(0)\n", "preseason_df" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
batter_nameteamheight/weightyear_bornpositioncareerposhit_waycountry
12백승룡한화NaN1982년 08월 16일NaN사직초(부산극동리틀)-사직중-경남상고-경성대-한화-넥센NaNNaNkorean
13백승룡한화NaN1982년 08월 16일NaN사직초(부산극동리틀)-사직중-경남상고-경성대-한화-넥센NaNNaNkorean
14백승룡한화NaN1982년 08월 16일NaN사직초(부산극동리틀)-사직중-경남상고-경성대-한화-넥센NaNNaNkorean
15백승룡한화NaN1982년 08월 16일NaN사직초(부산극동리틀)-사직중-경남상고-경성대-한화-넥센NaNNaNkorean
16백승룡한화NaN1982년 08월 16일NaN사직초(부산극동리틀)-사직중-경남상고-경성대-한화-넥센NaNNaNkorean
\n", "
" ], "text/plain": [ " batter_name team height/weight year_born position \\\n", "12 백승룡 한화 NaN 1982년 08월 16일 NaN \n", "13 백승룡 한화 NaN 1982년 08월 16일 NaN \n", "14 백승룡 한화 NaN 1982년 08월 16일 NaN \n", "15 백승룡 한화 NaN 1982년 08월 16일 NaN \n", "16 백승룡 한화 NaN 1982년 08월 16일 NaN \n", "\n", " career pos hit_way country \n", "12 사직초(부산극동리틀)-사직중-경남상고-경성대-한화-넥센 NaN NaN korean \n", "13 사직초(부산극동리틀)-사직중-경남상고-경성대-한화-넥센 NaN NaN korean \n", "14 사직초(부산극동리틀)-사직중-경남상고-경성대-한화-넥센 NaN NaN korean \n", "15 사직초(부산극동리틀)-사직중-경남상고-경성대-한화-넥센 NaN NaN korean \n", "16 사직초(부산극동리틀)-사직중-경남상고-경성대-한화-넥센 NaN NaN korean " ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 수치형이 아닌 변수 추출\n", "not_num_cols = [x for x in regular_season_df.columns if x not in num_cols]\n", "\n", "# 수치형이 아닌 변수 중 결측치가 하나라도 존재하는 행 출력\n", "# isna().sum(axis=1) -> 행 기준(각 행)의 결측치 개수\n", "# df.loc[]를 통해 결측치가 1개 이상인 행을 추출\n", "regular_season_df.loc[regular_season_df[not_num_cols].isna().sum(axis=1) > 0, not_num_cols].head()" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
batter_idbatter_nameyearteamavgGABRH2B...positioncareerstarting_salaryOPSweightheightweight_per_heightposhit_waycountry
00가르시아2018LG0.3395018327629...내야수(우투우타)쿠바 Ciego de Avila Maximo Gomez Baez(대)0.00.90293.0177.00.525424내야수우타foreign
11강경학2011한화0.00021000...내야수(우투좌타)광주대성초-광주동성중-광주동성고10000.00.00072.0180.00.400000내야수좌타korean
21강경학2014한화0.221418611192...내야수(우투좌타)광주대성초-광주동성중-광주동성고10000.00.68672.0180.00.400000내야수좌타korean
31강경학2015한화0.25712031150807...내야수(우투좌타)광주대성초-광주동성중-광주동성고10000.00.67372.0180.00.400000내야수좌타korean
41강경학2016한화0.1584610116163...내야수(우투좌타)광주대성초-광주동성중-광주동성고10000.00.48972.0180.00.400000내야수좌타korean
..................................................................
2442344황진수2014롯데0.00055000...내야수(우투양타)석천초-대헌중-공주고4000.00.00082.0181.00.453039내야수양타korean
2443344황진수2015롯데0.00022000...내야수(우투양타)석천초-대헌중-공주고4000.00.00082.0181.00.453039내야수양타korean
2444344황진수2016롯데0.0001110200...내야수(우투양타)석천초-대헌중-공주고4000.00.00082.0181.00.453039내야수양타korean
2445344황진수2017롯데0.2916011718346...내야수(우투양타)석천초-대헌중-공주고4000.00.76182.0181.00.453039내야수양타korean
2446344황진수2018롯데0.1671824641...내야수(우투양타)석천초-대헌중-공주고4000.00.56482.0181.00.453039내야수양타korean
\n", "

2447 rows × 35 columns

\n", "
" ], "text/plain": [ " batter_id batter_name year team avg G AB R H 2B ... \\\n", "0 0 가르시아 2018 LG 0.339 50 183 27 62 9 ... \n", "1 1 강경학 2011 한화 0.000 2 1 0 0 0 ... \n", "2 1 강경학 2014 한화 0.221 41 86 11 19 2 ... \n", "3 1 강경학 2015 한화 0.257 120 311 50 80 7 ... \n", "4 1 강경학 2016 한화 0.158 46 101 16 16 3 ... \n", "... ... ... ... ... ... ... ... .. .. .. ... \n", "2442 344 황진수 2014 롯데 0.000 5 5 0 0 0 ... \n", "2443 344 황진수 2015 롯데 0.000 2 2 0 0 0 ... \n", "2444 344 황진수 2016 롯데 0.000 11 10 2 0 0 ... \n", "2445 344 황진수 2017 롯데 0.291 60 117 18 34 6 ... \n", "2446 344 황진수 2018 롯데 0.167 18 24 6 4 1 ... \n", "\n", " position career starting_salary \\\n", "0 내야수(우투우타) 쿠바 Ciego de Avila Maximo Gomez Baez(대) 0.0 \n", "1 내야수(우투좌타) 광주대성초-광주동성중-광주동성고 10000.0 \n", "2 내야수(우투좌타) 광주대성초-광주동성중-광주동성고 10000.0 \n", "3 내야수(우투좌타) 광주대성초-광주동성중-광주동성고 10000.0 \n", "4 내야수(우투좌타) 광주대성초-광주동성중-광주동성고 10000.0 \n", "... ... ... ... \n", "2442 내야수(우투양타) 석천초-대헌중-공주고 4000.0 \n", "2443 내야수(우투양타) 석천초-대헌중-공주고 4000.0 \n", "2444 내야수(우투양타) 석천초-대헌중-공주고 4000.0 \n", "2445 내야수(우투양타) 석천초-대헌중-공주고 4000.0 \n", "2446 내야수(우투양타) 석천초-대헌중-공주고 4000.0 \n", "\n", " OPS weight height weight_per_height pos hit_way country \n", "0 0.902 93.0 177.0 0.525424 내야수 우타 foreign \n", "1 0.000 72.0 180.0 0.400000 내야수 좌타 korean \n", "2 0.686 72.0 180.0 0.400000 내야수 좌타 korean \n", "3 0.673 72.0 180.0 0.400000 내야수 좌타 korean \n", "4 0.489 72.0 180.0 0.400000 내야수 좌타 korean \n", "... ... ... ... ... ... ... ... 
\n", "2442 0.000 82.0 181.0 0.453039 내야수 양타 korean \n", "2443 0.000 82.0 181.0 0.453039 내야수 양타 korean \n", "2444 0.000 82.0 181.0 0.453039 내야수 양타 korean \n", "2445 0.761 82.0 181.0 0.453039 내야수 양타 korean \n", "2446 0.564 82.0 181.0 0.453039 내야수 양타 korean \n", "\n", "[2447 rows x 35 columns]" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 삭제할 데이터 추출\n", "drop_idx = regular_season_df.loc[\n", " # 안타가 1개 이상이면서 장타율이 0인 경우\n", " ((regular_season_df['H'] > 0) & (regular_season_df['SLG']==0)) |\n", " \n", " # 안타가 1개 이상 혹은 볼넷이 1개 이상 혹은 몸에 맞은 볼이 1개 이상이면서\n", " # 출루율이 0인 경우\n", " (((regular_season_df['H'] > 0) |\n", " (regular_season_df['BB'] > 0) |\n", " (regular_season_df['HBP'] > 0)) &\n", " (regular_season_df['OBP'] == 0))\n", "].index \n", "\n", "# 데이터 삭제\n", "regular_season_df = regular_season_df.drop(drop_idx).reset_index(drop=True)\n", "regular_season_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 1.3.2. 규정 타수 정의" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "plt.figure(figsize=(6, 3)) # 크기 조정\n", "plt.plot('AB', 'OPS', data=regular_season_df, linestyle='none', marker='o', \n", " markersize=2, color='blue', alpha=0.4)\n", "plt.xlabel('AB', fontsize=14)\n", "plt.ylabel('OPS', fontsize=14)\n", "plt.xticks(list(range(min(regular_season_df['AB']), max(regular_season_df['AB']), 30)),\n", " rotation=90)\n", "plt.vlines(30, ymin=min(regular_season_df['OPS']), ymax=max(regular_season_df['OPS']),\n", " linestyles='dashed', colors='r')\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
batter_nameAByearOPS
2329테임즈47220151.293656
97강정호41820141.200156
1318유재신3320181.192000
416김원섭2520050.116923
1543이여상2220130.090909
681문규현1820070.109000
578김회성1720100.105000
1902정병곤1520180.130000
1874정경운1520180.130000
2384현재윤1520141.229167
\n", "
" ], "text/plain": [ " batter_name AB year OPS\n", "2329 테임즈 472 2015 1.293656\n", "97 강정호 418 2014 1.200156\n", "1318 유재신 33 2018 1.192000\n", "416 김원섭 25 2005 0.116923\n", "1543 이여상 22 2013 0.090909\n", "681 문규현 18 2007 0.109000\n", "578 김회성 17 2010 0.105000\n", "1902 정병곤 15 2018 0.130000\n", "1874 정경운 15 2018 0.130000\n", "2384 현재윤 15 2014 1.229167" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# OPS 이상치 탐색을 위한 수치 정의\n", "Q1 = regular_season_df['OPS'].quantile(0.25)\n", "Q3 = regular_season_df['OPS'].quantile(0.75)\n", "IQR = Q3 - Q1\n", "\n", "# 실제 OPS 이상치 탐색\n", "regular_season_df.loc[(regular_season_df['OPS'] < (Q1 - 1.5 * IQR)) |\n", " (regular_season_df['OPS'] > (Q3 + 1.5 * IQR))].sort_values(\n", " by=['AB'], axis=0, ascending=False)[['batter_name','AB','year','OPS']].head(10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "7월 일별 경기수 합" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [
  "# Tick positions 7.01 .. 7.31 (31 evenly spaced values), rounded to two decimals\n",
  "major_ticks = list(np.round(np.linspace(7.01, 7.31, 31), 2)) \n",
  "\n",
  "july = (day_by_day_df['date'] >= 7) & (day_by_day_df['date'] < 8) # mask for rows whose date value lies in [7, 8), i.e. July\n",
  "plt.plot(major_ticks, day_by_day_df['date'].loc[july].value_counts().sort_index(), marker='o')\n",
  "plt.grid(linestyle='-', alpha=0.4)\n",
  "plt.xticks(major_ticks,rotation=90)\n",
  "plt.show()"
 ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 1.3.3. 시간 변수" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": [
  "def lag_function(df, var_name, past, min_ab=30):\n",
  "    \"\"\"Add a 'lag{past}_{var_name}' column with each row's value from `past` years earlier.\n",
  "\n",
  "    Rows are matched on ('batter_name', 'year'). When several rows share a key,\n",
  "    the first one in row order supplies the value. The lagged value is set to NaN\n",
  "    when the matched earlier season had fewer than `min_ab` at-bats ('AB').\n",
  "    Asking for the lag of 'AB' itself now returns that column as well; it used to\n",
  "    be dropped together with the internal helper column.\n",
  "\n",
  "    Parameters\n",
  "    ----------\n",
  "    df : DataFrame with 'batter_name', 'year', 'AB' and `var_name` columns.\n",
  "        Modified in place: the index is reset and the lag column is added.\n",
  "    var_name : name of the column to lag.\n",
  "    past : how many years back to look (integer).\n",
  "    min_ab : minimum 'AB' the earlier season needs for its value to be kept.\n",
  "\n",
  "    Returns\n",
  "    -------\n",
  "    The same DataFrame object that was passed in.\n",
  "    \"\"\"\n",
  "    # NOTE(review): players are identified by name only; two players with the\n",
  "    # same name would be treated as one - confirm names are unique in the data.\n",
  "    df.reset_index(drop=True, inplace=True)\n",
  "    lag_col = 'lag' + str(past) + '_' + var_name\n",
  "    keys = ['batter_name', 'year']\n",
  "\n",
  "    # One row per (player, season). Private column names keep the merge free of\n",
  "    # clashes, including the case var_name == 'AB'.\n",
  "    earlier = df[keys].copy()\n",
  "    earlier['_lag_ab'] = df['AB']\n",
  "    earlier['_lag_value'] = df[var_name]\n",
  "    earlier = earlier.drop_duplicates(subset=keys, keep='first')\n",
  "    # Shift the season forward so that season (Y - past) lines up with season Y.\n",
  "    earlier['year'] = earlier['year'] + past\n",
  "\n",
  "    # Left merge against unique keys: same length and order as df.\n",
  "    matched = df[keys].merge(earlier, how='left', on=keys)\n",
  "\n",
  "    # Discard only values whose earlier season is known to be below the at-bat\n",
  "    # threshold; rows without a match are NaN already.\n",
  "    keep = ~(matched['_lag_ab'] < min_ab).to_numpy()\n",
  "    df[lag_col] = np.nan\n",
  "    df.loc[keep, lag_col] = matched['_lag_value'].to_numpy()[keep]\n",
  "\n",
  "    return df"
 ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Series([], Name: AB, dtype: int64)" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "val1 = 
regular_season_df.loc[(regular_season_df['batter_name'] == regular_season_df['batter_name'][0]) & (regular_season_df['year'] == regular_season_df['year'][0] - 1), 'AB']\n", "val1" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['avg',\n", " 'G',\n", " 'AB',\n", " 'R',\n", " 'H',\n", " '2B',\n", " '3B',\n", " 'HR',\n", " 'TB',\n", " 'RBI',\n", " 'SB',\n", " 'CS',\n", " 'BB',\n", " 'HBP',\n", " 'SO',\n", " 'GDP',\n", " 'OBP',\n", " 'E',\n", " 'starting_salary',\n", " 'weight',\n", " 'height',\n", " 'weight_per_height']" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "numeric_cols = list(regular_season_df.select_dtypes(include=numerics).drop(['batter_id','year','OPS','SLG'], axis =1).columns)\n", "numeric_cols" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
avgGABRH2B3BHRTBRBI...SOGDPOBPEstarting_salaryweightheightweight_per_heightyearbatter_name
00.3395018327629089534...2530.38390.093.0177.00.5254242018가르시아
20.22141861119231307...2810.337610000.072.0180.00.4000002014강경학
30.257120311508074210127...5830.3481510000.072.0180.00.4000002015강경학
40.158461011616321267...3050.232710000.072.0180.00.4000002016강경학
50.21459841718210224...1910.290410000.072.0180.00.4000002017강경학
\n", "

5 rows × 24 columns

\n", "
" ], "text/plain": [ " avg G AB R H 2B 3B HR TB RBI ... SO GDP OBP E \\\n", "0 0.339 50 183 27 62 9 0 8 95 34 ... 25 3 0.383 9 \n", "2 0.221 41 86 11 19 2 3 1 30 7 ... 28 1 0.337 6 \n", "3 0.257 120 311 50 80 7 4 2 101 27 ... 58 3 0.348 15 \n", "4 0.158 46 101 16 16 3 2 1 26 7 ... 30 5 0.232 7 \n", "5 0.214 59 84 17 18 2 1 0 22 4 ... 19 1 0.290 4 \n", "\n", " starting_salary weight height weight_per_height year batter_name \n", "0 0.0 93.0 177.0 0.525424 2018 가르시아 \n", "2 10000.0 72.0 180.0 0.400000 2014 강경학 \n", "3 10000.0 72.0 180.0 0.400000 2015 강경학 \n", "4 10000.0 72.0 180.0 0.400000 2016 강경학 \n", "5 10000.0 72.0 180.0 0.400000 2017 강경학 \n", "\n", "[5 rows x 24 columns]" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 상관관계를 탐색할 변수 선택\n", "numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']\n", "numeric_cols = list(regular_season_df.select_dtypes(include=numerics).drop(['batter_id','year','OPS','SLG'], axis =1).columns)\n", "regular_season_temp = regular_season_df[numeric_cols + ['year', 'batter_name']].copy()\n", "regular_season_temp = regular_season_temp.loc[regular_season_temp['AB'] >= 30]\n", "regular_season_temp.head()" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
OBPyearbatter_namelag1_avglag1_Glag1_Rlag1_Hlag1_2Blag1_3Blag1_HR...lag1_BBlag1_HBPlag1_SOlag1_GDPlag1_OBPlag1_Elag1_starting_salarylag1_weightlag1_heightlag1_weight_per_height
00.3832018가르시아NaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
10.3372014강경학NaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
20.3482015강경학0.22141.011.019.02.03.01.0...13.02.028.01.00.3376.010000.072.0180.00.4
30.2322016강경학0.257120.050.080.07.04.02.0...40.05.058.03.00.34815.010000.072.0180.00.4
40.2902017강경학0.15846.016.016.03.02.01.0...8.02.030.05.00.2327.010000.072.0180.00.4
\n", "

5 rows × 24 columns

\n", "
" ], "text/plain": [ " OBP year batter_name lag1_avg lag1_G lag1_R lag1_H lag1_2B \\\n", "0 0.383 2018 가르시아 NaN NaN NaN NaN NaN \n", "1 0.337 2014 강경학 NaN NaN NaN NaN NaN \n", "2 0.348 2015 강경학 0.221 41.0 11.0 19.0 2.0 \n", "3 0.232 2016 강경학 0.257 120.0 50.0 80.0 7.0 \n", "4 0.290 2017 강경학 0.158 46.0 16.0 16.0 3.0 \n", "\n", " lag1_3B lag1_HR ... lag1_BB lag1_HBP lag1_SO lag1_GDP lag1_OBP \\\n", "0 NaN NaN ... NaN NaN NaN NaN NaN \n", "1 NaN NaN ... NaN NaN NaN NaN NaN \n", "2 3.0 1.0 ... 13.0 2.0 28.0 1.0 0.337 \n", "3 4.0 2.0 ... 40.0 5.0 58.0 3.0 0.348 \n", "4 2.0 1.0 ... 8.0 2.0 30.0 5.0 0.232 \n", "\n", " lag1_E lag1_starting_salary lag1_weight lag1_height \\\n", "0 NaN NaN NaN NaN \n", "1 NaN NaN NaN NaN \n", "2 6.0 10000.0 72.0 180.0 \n", "3 15.0 10000.0 72.0 180.0 \n", "4 7.0 10000.0 72.0 180.0 \n", "\n", " lag1_weight_per_height \n", "0 NaN \n", "1 NaN \n", "2 0.4 \n", "3 0.4 \n", "4 0.4 \n", "\n", "[5 rows x 24 columns]" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 시간변수 생성 함수를 통한 지표별 1년 전 성적 추출\n", "for col in numeric_cols:\n", " regular_season_temp = lag_function(regular_season_temp, col, 1)\n", "\n", "numeric_cols.remove('OBP')\n", "regular_season_temp.drop(numeric_cols, axis = 1, inplace= True)\n", "regular_season_temp.head()" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Text(0.5, 1.0, 'Diagonal Correlation HeatMap')" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [
  "# Correlations between the numeric columns only: non-numeric columns such as\n",
  "# 'batter_name' must not be handed to DataFrame.corr().\n",
  "corr = regular_season_temp.select_dtypes(exclude=[\"object\",\"bool\"]).corr()\n",
  "\n",
  "# Order rows and columns by their correlation with OBP, highest first\n",
  "corr_matrix = corr.sort_values(by='OBP', axis=0, ascending=False)\n",
  "corr_matrix = corr_matrix[corr_matrix.index]\n",
  "\n",
  "# Draw the heat map\n",
  "f, ax = plt.subplots(figsize=(12, 12))\n",
  "\n",
  "# Mask the upper triangle (diagonal included) so each pair is shown once.\n",
  "# The builtin bool replaces the removed np.bool alias.\n",
  "mask = np.zeros_like(corr_matrix, dtype=bool)\n",
  "mask[np.triu_indices_from(mask)] = True\n",
  "\n",
  "g = sns.heatmap(corr_matrix, cmap='RdYlGn_r', vmax=1, mask=mask, center=0, annot=True,\n",
  "                fmt='.2f', square=True, linewidths=.5, cbar_kws={\"shrink\": .5}, ax=ax)\n",
  "plt.title(\"Diagonal Correlation HeatMap\")"
 ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
batter_nameyearSF_1
0가르시아20180.032787
1강경학20110.000000
2강경학2014-0.000000
3강경학20150.009646
4강경학20160.009901
............
2442황진수20140.000000
2443황진수20150.000000
2444황진수20160.000000
2445황진수20170.008547
2446황진수2018-0.000000
\n", "

2447 rows × 3 columns

\n", "
" ], "text/plain": [ " batter_name year SF_1\n", "0 가르시아 2018 0.032787\n", "1 강경학 2011 0.000000\n", "2 강경학 2014 -0.000000\n", "3 강경학 2015 0.009646\n", "4 강경학 2016 0.009901\n", "... ... ... ...\n", "2442 황진수 2014 0.000000\n", "2443 황진수 2015 0.000000\n", "2444 황진수 2016 0.000000\n", "2445 황진수 2017 0.008547\n", "2446 황진수 2018 -0.000000\n", "\n", "[2447 rows x 3 columns]" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "# Estimate sacrifice flies (SF) by rearranging the on-base-percentage formula:\n",
  "#   SF = (H + BB + HBP) / OBP - (AB + BB + HBP)\n",
  "regular_season_df['SF'] = \\\n",
  "    regular_season_df[['H','BB','HBP']].sum(axis=1) / regular_season_df['OBP'] - \\\n",
  "    regular_season_df[['AB','BB','HBP']].sum(axis=1)\n",
  "# Assign the result back instead of calling fillna(inplace=True) on the column\n",
  "# selection: that form is chained assignment and is not guaranteed to update\n",
  "# the frame. Missing estimates become 0, then the estimate is rounded to a\n",
  "# whole number.\n",
  "regular_season_df['SF'] = regular_season_df['SF'].fillna(0).round(0)\n",
  "\n",
  "# Sacrifice flies per at-bat; keep only the columns needed afterwards\n",
  "regular_season_df['SF_1'] = regular_season_df['SF'] / regular_season_df['AB']\n",
  "regular_season_df_SF = regular_season_df[['batter_name','year','SF_1']]\n",
  "regular_season_df_SF"
 ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\HOME\\anaconda3\\envs\\store_amount_prediction\\lib\\site-packages\\ipykernel_launcher.py:2: FutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.\n", " \n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
batter_nameyearABOBP
0가르시아2018850.418367
1강경학201110.000000
2강경학201401.000000
3강경학20151560.342541
4강경학2016810.222222
...............
1381황진수201240.400000
1382황진수201300.000000
1383황진수201690.000000
1384황진수2017710.316456
1385황진수2018240.230769
\n", "

1386 rows × 4 columns

\n", "
" ], "text/plain": [ " batter_name year AB OBP\n", "0 가르시아 2018 85 0.418367\n", "1 강경학 2011 1 0.000000\n", "2 강경학 2014 0 1.000000\n", "3 강경학 2015 156 0.342541\n", "4 강경학 2016 81 0.222222\n", "... ... ... ... ...\n", "1381 황진수 2012 4 0.400000\n", "1382 황진수 2013 0 0.000000\n", "1383 황진수 2016 9 0.000000\n", "1384 황진수 2017 71 0.316456\n", "1385 황진수 2018 24 0.230769\n", "\n", "[1386 rows x 4 columns]" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "# First-half totals (rows dated up to 7.18) per player and season, taken from day_by_day_df.\n",
  "# Several columns are selected with a list: the bare-tuple form is deprecated.\n",
  "sum_hf_yr_OBP = day_by_day_df.loc[day_by_day_df['date'] <= 7.18].groupby(['batter_name','year'])[['AB','H','BB','HBP']].sum().reset_index()\n",
  "\n",
  "# Attach the per-at-bat sacrifice-fly rate derived from the regular-season data\n",
  "sum_hf_yr_OBP = sum_hf_yr_OBP.merge(regular_season_df_SF, how = 'left', on=['batter_name', 'year'])\n",
  "\n",
  "# Estimated first-half sacrifice flies = rate x first-half at-bats, rounded\n",
  "sum_hf_yr_OBP['SF'] = (sum_hf_yr_OBP['SF_1']*sum_hf_yr_OBP['AB']).round(0)\n",
  "sum_hf_yr_OBP = sum_hf_yr_OBP.drop('SF_1', axis = 1)\n",
  "\n",
  "# First-half on-base percentage\n",
  "sum_hf_yr_OBP['OBP'] = sum_hf_yr_OBP[['H', 'BB', 'HBP']].sum(axis = 1) / \\\n",
  "    sum_hf_yr_OBP[['AB', 'BB', 'HBP','SF']].sum(axis = 1)\n",
  "# Missing OBP values become 0. The result is assigned back because\n",
  "# fillna(inplace=True) on a column selection is chained assignment.\n",
  "sum_hf_yr_OBP['OBP'] = sum_hf_yr_OBP['OBP'].fillna(0)\n",
  "\n",
  "# Keep only the columns used in the analysis\n",
  "sum_hf_yr_OBP = sum_hf_yr_OBP[['batter_name','year','AB','OBP']]\n",
  "sum_hf_yr_OBP"
 ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 1.3.4. 추가 변수 생성" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "나이별 평균 성적" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "# 나이 변수 생성\n", "regular_season_df['age'] = regular_season_df['year'] - \\\n", " regular_season_df['year_born'].apply(lambda x: int(x[:4]))\n", "\n", "# 나이, 평균 출루율, 출루율 중위값으로 구성된 데이터프레임 구축\n", "temp_df = regular_season_df.loc[regular_season_df['AB'] >= 30].groupby('age').agg(\n", " {'OBP':['mean','median']}).reset_index()\n", "temp_df.columns = temp_df.columns.droplevel()\n", "temp_df.columns = ['age', 'mean_OBP', 'median_OBP']\n", "\n", "# 나이에 따른 출루율 추이 시각화\n", "plt.figure(figsize=(12,8))\n", "plt.plot('age', 'mean_OBP', data=temp_df, marker='o', markerfacecolor='blue',\n", " markersize=12, color='skyblue', linewidth=4)\n", "plt.xticks(temp_df['age']) # 나이 표시\n", "plt.grid(linestyle='-', alpha=0.4)\n", "plt.ylabel('평균OBP')\n", "plt.xlabel('나이')\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
batter_nameyearABOBPagelag1_OBPlag2_OBPlag3_OBP
0가르시아2018850.41836733NaNNaNNaN
1강경학201110.00000019NaNNaNNaN
2강경학201401.00000022NaNNaNNaN
3강경학20151560.34254123NaNNaNNaN
4강경학2016810.222222240.342541NaNNaN
...........................
1381황진수201240.40000023NaNNaNNaN
1382황진수201300.00000024NaNNaNNaN
1383황진수201690.00000027NaNNaNNaN
1384황진수2017710.31645628NaNNaNNaN
1385황진수2018240.230769290.316456NaNNaN
\n", "

1386 rows × 8 columns

\n", "
" ], "text/plain": [ " batter_name year AB OBP age lag1_OBP lag2_OBP lag3_OBP\n", "0 가르시아 2018 85 0.418367 33 NaN NaN NaN\n", "1 강경학 2011 1 0.000000 19 NaN NaN NaN\n", "2 강경학 2014 0 1.000000 22 NaN NaN NaN\n", "3 강경학 2015 156 0.342541 23 NaN NaN NaN\n", "4 강경학 2016 81 0.222222 24 0.342541 NaN NaN\n", "... ... ... ... ... ... ... ... ...\n", "1381 황진수 2012 4 0.400000 23 NaN NaN NaN\n", "1382 황진수 2013 0 0.000000 24 NaN NaN NaN\n", "1383 황진수 2016 9 0.000000 27 NaN NaN NaN\n", "1384 황진수 2017 71 0.316456 28 NaN NaN NaN\n", "1385 황진수 2018 24 0.230769 29 0.316456 NaN NaN\n", "\n", "[1386 rows x 8 columns]" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 나이를 포함한 변수 선택\n", "sum_hf_yr_OBP = sum_hf_yr_OBP.merge(regular_season_df[['batter_name','year','age']],\n", " how = 'left', on=['batter_name','year'])\n", "\n", "# 총 3년 전 성적까지 변수를 생성\n", "sum_hf_yr_OBP = lag_function(sum_hf_yr_OBP, \"OBP\", 1)\n", "sum_hf_yr_OBP = lag_function(sum_hf_yr_OBP, \"OBP\", 2)\n", "sum_hf_yr_OBP = lag_function(sum_hf_yr_OBP, \"OBP\", 3)\n", "sum_hf_yr_OBP" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 1.3.5. 
데이터 사후 처리" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "lag1_OBP 0.41\n", "lag2_OBP 0.54\n", "lag3_OBP 0.61\n", "dtype: float64" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "round(sum_hf_yr_OBP[['lag1_OBP','lag2_OBP','lag3_OBP']].isna().sum() / \\\n", " sum_hf_yr_OBP.shape[0], 2)" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\HOME\\anaconda3\\envs\\store_amount_prediction\\lib\\site-packages\\ipykernel_launcher.py:4: FutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.\n", " after removing the cwd from sys.path.\n", "C:\\Users\\HOME\\anaconda3\\envs\\store_amount_prediction\\lib\\site-packages\\ipykernel_launcher.py:8: FutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.\n", " \n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
batter_nameyearABOBPagelag1_OBPlag2_OBPlag3_OBPmean_OBP
0가르시아2018850.41836733NaNNaNNaN0.383495
1강경학201110.00000019NaNNaNNaN0.337880
2강경학201401.00000022NaNNaNNaN0.337880
3강경학20151560.34254123NaNNaNNaN0.337880
4강경학2016810.222222240.342541NaNNaN0.337880
..............................
1347황진수201240.40000023NaNNaNNaN0.358779
1348황진수201300.00000024NaNNaNNaN0.358779
1349황진수201690.00000027NaNNaNNaN0.358779
1350황진수2017710.31645628NaNNaNNaN0.358779
1351황진수2018240.230769290.316456NaNNaN0.358779
\n", "

1352 rows × 9 columns

\n", "
" ], "text/plain": [ " batter_name year AB OBP age lag1_OBP lag2_OBP lag3_OBP \\\n", "0 가르시아 2018 85 0.418367 33 NaN NaN NaN \n", "1 강경학 2011 1 0.000000 19 NaN NaN NaN \n", "2 강경학 2014 0 1.000000 22 NaN NaN NaN \n", "3 강경학 2015 156 0.342541 23 NaN NaN NaN \n", "4 강경학 2016 81 0.222222 24 0.342541 NaN NaN \n", "... ... ... ... ... ... ... ... ... \n", "1347 황진수 2012 4 0.400000 23 NaN NaN NaN \n", "1348 황진수 2013 0 0.000000 24 NaN NaN NaN \n", "1349 황진수 2016 9 0.000000 27 NaN NaN NaN \n", "1350 황진수 2017 71 0.316456 28 NaN NaN NaN \n", "1351 황진수 2018 24 0.230769 29 0.316456 NaN NaN \n", "\n", " mean_OBP \n", "0 0.383495 \n", "1 0.337880 \n", "2 0.337880 \n", "3 0.337880 \n", "4 0.337880 \n", "... ... \n", "1347 0.358779 \n", "1348 0.358779 \n", "1349 0.358779 \n", "1350 0.358779 \n", "1351 0.358779 \n", "\n", "[1352 rows x 9 columns]" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "# Formulas used:\n",
  "#   SF  = (H+BB+HBP) / OBP - (AB+BB+HBP)\n",
  "#   OBP = (H+BB+HBP) / (AB+BB+HBP+SF)\n",
  "# Mean OBP is computed from summed counts; only seasons with AB >= 30 are included.\n",
  "# Columns are selected with a list: the bare-tuple form is deprecated.\n",
  "\n",
  "# 1. Mean OBP per player\n",
  "player_OBP_mean = regular_season_df.loc[regular_season_df['AB'] >= 30].groupby('batter_name')[['AB','H','BB','HBP','SF']].sum().reset_index()\n",
  "player_OBP_mean['mean_OBP'] = player_OBP_mean[['H', 'BB', 'HBP']].sum(axis=1) / player_OBP_mean[['AB','BB','HBP','SF']].sum(axis=1)\n",
  "\n",
  "# 2. Mean OBP per season\n",
  "season_OBP_mean = regular_season_df.loc[regular_season_df['AB'] >= 30].groupby('year')[['AB','H','BB','HBP','SF']].sum().reset_index()\n",
  "season_OBP_mean['mean_OBP'] = season_OBP_mean[['H', 'BB', 'HBP']].sum(axis=1) / season_OBP_mean[['AB','BB','HBP','SF']].sum(axis=1)\n",
  "season_OBP_mean = season_OBP_mean[['year', 'mean_OBP']]\n",
  "\n",
  "# Add each player's mean OBP, then drop the rows for which none is available\n",
  "sum_hf_yr_OBP = sum_hf_yr_OBP.merge(player_OBP_mean[['batter_name', 'mean_OBP']], how ='left', on=\"batter_name\")\n",
  "sum_hf_yr_OBP = sum_hf_yr_OBP.loc[~sum_hf_yr_OBP['mean_OBP'].isna()].reset_index(drop=True)\n",
  "sum_hf_yr_OBP"
 ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [], "source": [
  "def lag_na_fill(data_set, var_name, past, season_var_mean_data):\n",
  "    \"\"\"Fill missing 'lag{past}_{var_name}' values with a blended estimate.\n",
  "\n",
  "    A missing value is replaced by the average of\n",
  "      * the row's own 'mean_{var_name}' value, and\n",
  "      * the 'mean_{var_name}' of the season `past` years before the row's 'year',\n",
  "        looked up in `season_var_mean_data`.\n",
  "    When that earlier season is absent from `season_var_mean_data`, the value is\n",
  "    left missing instead of raising IndexError.\n",
  "\n",
  "    Parameters\n",
  "    ----------\n",
  "    data_set : DataFrame with 'year', 'mean_{var_name}' and the lag column.\n",
  "        Modified in place.\n",
  "    var_name : base variable name the lag and mean columns are built from.\n",
  "    past : lag in years.\n",
  "    season_var_mean_data : DataFrame with 'year' and 'mean_{var_name}' columns.\n",
  "\n",
  "    Returns\n",
  "    -------\n",
  "    data_set, with the lag column filled where an estimate exists.\n",
  "    \"\"\"\n",
  "    lag_col = 'lag' + str(past) + '_' + var_name\n",
  "    mean_col = 'mean_' + var_name\n",
  "\n",
  "    # year -> season mean; the first row wins if a year appears more than once\n",
  "    season_mean = season_var_mean_data.drop_duplicates(subset='year', keep='first').set_index('year')[mean_col]\n",
  "    earlier_season_mean = (data_set['year'] - past).map(season_mean)\n",
  "\n",
  "    estimate = (data_set[mean_col] + earlier_season_mean) / 2\n",
  "    missing = data_set[lag_col].isna()\n",
  "    data_set.loc[missing, lag_col] = estimate[missing]\n",
  "    return data_set"
 ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
batter_nameyearABOBPagelag1_OBPlag2_OBPlag3_OBPmean_OBP
0가르시아2018850.418367330.3699820.3759100.3731190.383495
1강경학201110.000000190.3474340.3486030.3442590.337880
2강경학201401.000000220.3466820.3375110.3431310.337880
3강경학20151560.342541230.3534250.3466820.3375110.337880
4강경학2016810.222222240.3425410.3534250.3466820.337880
..............................
1347황진수201240.400000230.3535800.3578830.3590520.358779
1348황진수201300.000000240.3479600.3535800.3578830.358779
1349황진수201690.000000270.3607600.3638740.3571310.358779
1350황진수2017710.316456280.3635520.3607600.3638740.358779
1351황진수2018240.230769290.3164560.3635520.3607600.358779
\n", "

1352 rows × 9 columns

\n", "
" ], "text/plain": [ " batter_name year AB OBP age lag1_OBP lag2_OBP lag3_OBP \\\n", "0 가르시아 2018 85 0.418367 33 0.369982 0.375910 0.373119 \n", "1 강경학 2011 1 0.000000 19 0.347434 0.348603 0.344259 \n", "2 강경학 2014 0 1.000000 22 0.346682 0.337511 0.343131 \n", "3 강경학 2015 156 0.342541 23 0.353425 0.346682 0.337511 \n", "4 강경학 2016 81 0.222222 24 0.342541 0.353425 0.346682 \n", "... ... ... ... ... ... ... ... ... \n", "1347 황진수 2012 4 0.400000 23 0.353580 0.357883 0.359052 \n", "1348 황진수 2013 0 0.000000 24 0.347960 0.353580 0.357883 \n", "1349 황진수 2016 9 0.000000 27 0.360760 0.363874 0.357131 \n", "1350 황진수 2017 71 0.316456 28 0.363552 0.360760 0.363874 \n", "1351 황진수 2018 24 0.230769 29 0.316456 0.363552 0.360760 \n", "\n", " mean_OBP \n", "0 0.383495 \n", "1 0.337880 \n", "2 0.337880 \n", "3 0.337880 \n", "4 0.337880 \n", "... ... \n", "1347 0.358779 \n", "1348 0.358779 \n", "1349 0.358779 \n", "1350 0.358779 \n", "1351 0.358779 \n", "\n", "[1352 rows x 9 columns]" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 생성한 함수를 이용해 결측치 처리 진행\n", "sum_hf_yr_OBP = lag_na_fill(sum_hf_yr_OBP, \"OBP\", 1, season_OBP_mean) # 1년 전 성적 대체\n", "sum_hf_yr_OBP = lag_na_fill(sum_hf_yr_OBP, \"OBP\", 2, season_OBP_mean) # 2년 전 성적 대체\n", "sum_hf_yr_OBP = lag_na_fill(sum_hf_yr_OBP, \"OBP\", 3, season_OBP_mean) # 3년 전 성적 대체\n", "sum_hf_yr_OBP" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 1.3.6. SLG 데이터 전처리" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Text(0.5, 1.0, 'Diagonal Correlation HeatMap')" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "# 상관관계를 탐색할 변수 선택\n", "numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']\n", "numeric_cols = list(regular_season_df.select_dtypes(include=numerics).drop(['batter_id','year','OPS','OBP'], axis =1).columns)\n", "regular_season_temp = regular_season_df[numeric_cols + ['year', 'batter_name']].copy()\n", "regular_season_temp = regular_season_temp.loc[regular_season_temp['AB']>=30]\n", "\n", "# 시간변수 생성 함수를 통한 지표별 1년 전 성적 추출\n", "for col in numeric_cols:\n", " regular_season_temp = lag_function(regular_season_temp, col, 1)\n", "\n", "numeric_cols.remove('SLG')\n", "regular_season_temp.drop(numeric_cols, axis = 1, inplace=True)\n", "\n", "# 상관관계 도출\n", "corr_matrix = regular_season_temp.corr()\n", "corr_matrix = corr_matrix.sort_values(by = 'SLG', axis = 0, ascending=False)\n", "corr_matrix = corr_matrix[corr_matrix.index]\n", "\n", "# 상관관계의 시각적 표현\n", "f, ax = plt.subplots(figsize=(12, 12))\n", "corr = regular_season_temp.select_dtypes(exclude=[\"object\",\"bool\"]).corr()\n", "\n", "# 대각 행렬을 기준으로 한쪽만 나타나게 설정해줍니다.\n", "mask = np.zeros_like(corr_matrix, dtype=np.bool)\n", "mask[np.triu_indices_from(mask)] = True\n", "\n", "cmap = sns.diverging_palette(220, 10, as_cmap=True)\n", "g = sns.heatmap(corr_matrix, cmap='RdYlGn_r', vmax=1, mask=mask, center=0, annot=True,\n", " fmt='.2f', square=True, linewidths=.5, cbar_kws={\"shrink\": .5})\n", "plt.title(\"Diagonal Correlation HeatMap\")" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\HOME\\anaconda3\\envs\\store_amount_prediction\\lib\\site-packages\\ipykernel_launcher.py:2: FutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.\n", " \n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
batter_nameyearABSLGage
0가르시아2018850.55294133
1강경학201110.00000019
2강경학201400.00000022
3강경학20151560.33333323
4강경학2016810.22222224
\n", "
" ], "text/plain": [ " batter_name year AB SLG age\n", "0 가르시아 2018 85 0.552941 33\n", "1 강경학 2011 1 0.000000 19\n", "2 강경학 2014 0 0.000000 22\n", "3 강경학 2015 156 0.333333 23\n", "4 강경학 2016 81 0.222222 24" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# day_by_day에서 연도별 선수의 시즌 전반기 장타율(SLG)과 관련된 성적 합 구하기\n", "sum_hf_yr_SLG = day_by_day_df.loc[day_by_day_df['date'] <= 7.18].groupby(['batter_name','year'])['AB','H','2B','3B', 'HR'].sum().reset_index()\n", "\n", "# 전반기 장타율 계산\n", "sum_hf_yr_SLG['SLG'] = \\\n", " (sum_hf_yr_SLG['H'] - sum_hf_yr_SLG[['2B', '3B', 'HR']].sum(axis=1) +\n", " sum_hf_yr_SLG['2B']*2 + sum_hf_yr_SLG['3B']*3 + sum_hf_yr_SLG['HR']*4\n", " ) / sum_hf_yr_SLG['AB']\n", "\n", "# SLG 결측치를 0으로 처리 \n", "sum_hf_yr_SLG['SLG'].fillna(0, inplace=True)\n", "\n", "# 필요한 칼럼만 불러오고 나이 계산\n", "sum_hf_yr_SLG = sum_hf_yr_SLG[['batter_name','year','AB','SLG']]\n", "sum_hf_yr_SLG = sum_hf_yr_SLG.merge(regular_season_df[['batter_name','year','age']],\n", " how='left', on=['batter_name','year'])\n", "sum_hf_yr_SLG.head()" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
batter_nameyearABSLGagelag1_SLGlag2_SLGlag3_SLG
0가르시아2018850.55294133NaNNaNNaN
1강경학201110.00000019NaNNaNNaN
2강경학201400.00000022NaNNaNNaN
3강경학20151560.33333323NaNNaNNaN
4강경학2016810.222222240.333333NaNNaN
\n", "
" ], "text/plain": [ " batter_name year AB SLG age lag1_SLG lag2_SLG lag3_SLG\n", "0 가르시아 2018 85 0.552941 33 NaN NaN NaN\n", "1 강경학 2011 1 0.000000 19 NaN NaN NaN\n", "2 강경학 2014 0 0.000000 22 NaN NaN NaN\n", "3 강경학 2015 156 0.333333 23 NaN NaN NaN\n", "4 강경학 2016 81 0.222222 24 0.333333 NaN NaN" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "lag1_SLG 0.41\n", "lag2_SLG 0.54\n", "lag3_SLG 0.61\n", "dtype: float64" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 총 3년 전 성적까지 변수를 생성\n", "sum_hf_yr_SLG = lag_function(sum_hf_yr_SLG, \"SLG\", 1)\n", "sum_hf_yr_SLG = lag_function(sum_hf_yr_SLG, \"SLG\", 2)\n", "sum_hf_yr_SLG = lag_function(sum_hf_yr_SLG, \"SLG\", 3)\n", "display(sum_hf_yr_SLG.head())\n", "\n", "round(sum_hf_yr_SLG[['lag1_SLG', 'lag2_SLG', 'lag3_SLG']].isna().sum()/\\\n", " sum_hf_yr_SLG.shape[0], 2)" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\HOME\\anaconda3\\envs\\store_amount_prediction\\lib\\site-packages\\ipykernel_launcher.py:2: FutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.\n", " \n", "C:\\Users\\HOME\\anaconda3\\envs\\store_amount_prediction\\lib\\site-packages\\ipykernel_launcher.py:9: FutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.\n", " if __name__ == '__main__':\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
batter_nameyearABSLGagelag1_SLGlag2_SLGlag3_SLGmean_SLG
0가르시아2018850.552941330.4818550.4814980.4766270.519126
1강경학201110.000000190.3729020.3808820.3617160.332527
2강경학201400.000000220.3629310.3493440.3596160.332527
3강경학20151560.333333230.3894150.3629310.3493440.332527
4강경학2016810.222222240.3333330.3894150.3629310.332527
\n", "
" ], "text/plain": [ " batter_name year AB SLG age lag1_SLG lag2_SLG lag3_SLG \\\n", "0 가르시아 2018 85 0.552941 33 0.481855 0.481498 0.476627 \n", "1 강경학 2011 1 0.000000 19 0.372902 0.380882 0.361716 \n", "2 강경학 2014 0 0.000000 22 0.362931 0.349344 0.359616 \n", "3 강경학 2015 156 0.333333 23 0.389415 0.362931 0.349344 \n", "4 강경학 2016 81 0.222222 24 0.333333 0.389415 0.362931 \n", "\n", " mean_SLG \n", "0 0.519126 \n", "1 0.332527 \n", "2 0.332527 \n", "3 0.332527 \n", "4 0.332527 " ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "lag1_SLG 0.0\n", "lag2_SLG 0.0\n", "lag3_SLG 0.0\n", "dtype: float64" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 선수별 SLG 평균 데이터(player_SLG_mean)를 만듭니다\n", "player_SLG_mean = regular_season_df.loc[regular_season_df['AB'] >= 30].groupby('batter_name')['AB','H','2B','3B','HR'].sum().reset_index()\n", "player_SLG_mean['mean_SLG'] = \\\n", " (player_SLG_mean['H'] - player_SLG_mean[['2B','3B','HR']].sum(axis = 1) +\n", " player_SLG_mean['2B']*2 + player_SLG_mean['3B']*3 + player_SLG_mean['HR']*4\n", " ) / player_SLG_mean['AB']\n", "\n", "# 시즌별 SLG 평균 데이터(season_SLG_mean)를 만듭니다\n", "season_SLG_mean = regular_season_df.loc[regular_season_df['AB'] >= 30].groupby('year')['AB','H','2B','3B','HR'].sum().reset_index()\n", "season_SLG_mean['mean_SLG'] = \\\n", " (season_SLG_mean['H'] - season_SLG_mean[['2B','3B','HR']].sum(axis = 1) + \n", " season_SLG_mean['2B']*2 + season_SLG_mean['3B']*3 + season_SLG_mean['HR']*4\n", " ) / season_SLG_mean['AB']\n", "\n", "# 선수 평균의 SLG(player_OBP_mean)를 새로운 변수로 더합니다.\n", "sum_hf_yr_SLG = sum_hf_yr_SLG.merge(player_SLG_mean[['batter_name', 'mean_SLG']], how='left', on=\"batter_name\")\n", "\n", "# 선수 평균의 성적이 결측치이면 데이터에서 제거합니다.\n", "sum_hf_yr_SLG = \\\n", " sum_hf_yr_SLG.loc[~sum_hf_yr_SLG['mean_SLG'].isna()].reset_index(drop=True)\n", "\n", "# 결측치 처리\n", "sum_hf_yr_SLG = lag_na_fill(sum_hf_yr_SLG, \"SLG\", 1, season_SLG_mean) #1년전 성적 
대체\n", "sum_hf_yr_SLG = lag_na_fill(sum_hf_yr_SLG, \"SLG\", 2, season_SLG_mean) #2년전 성적 대체\n", "sum_hf_yr_SLG = lag_na_fill(sum_hf_yr_SLG, \"SLG\", 3, season_SLG_mean) #3년전 성적 대체\n", "\n", "display(sum_hf_yr_SLG.head())\n", "round(sum_hf_yr_SLG[['lag1_SLG', 'lag2_SLG', 'lag3_SLG']].isna().sum()/\\\n", " sum_hf_yr_SLG.shape[0], 2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1.4. 모델링\n", "### 1.4.1. 데이터 분할" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(872, 9) (150, 9) (872, 9) (150, 9)\n" ] } ], "source": [ "# 30타수 이상의 데이터만 학습\n", "sum_hf_yr_OBP= sum_hf_yr_OBP.loc[sum_hf_yr_OBP['AB']>=30]\n", "sum_hf_yr_SLG = sum_hf_yr_SLG.loc[sum_hf_yr_SLG['AB']>=30] \n", "\n", "# 2018년 데이터를 test 데이터 2018년 이전은 train 데이터로 나눈다.\n", "OBP_train = sum_hf_yr_OBP.loc[sum_hf_yr_OBP['year'] != 2018]\n", "OBP_test = sum_hf_yr_OBP.loc[sum_hf_yr_OBP['year'] == 2018]\n", "\n", "SLG_train = sum_hf_yr_SLG.loc[sum_hf_yr_SLG['year'] != 2018]\n", "SLG_test = sum_hf_yr_SLG.loc[sum_hf_yr_SLG['year'] == 2018]\n", "print(OBP_train.shape, OBP_test.shape, SLG_train.shape, SLG_test.shape)" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [], "source": [ "def wrmse(v,w,p):\n", " # Weighted RMSE: sqrt( sum(w * (v - p)**2) / sum(w) )\n", " # v: actual values\n", " # w: weights (at-bats)\n", " # p: predicted values\n", " # Inputs are converted to plain arrays so that Series carrying different indexes\n", " # are combined by position instead of being re-aligned by label.\n", " v = np.asarray(v, dtype=float)\n", " w = np.asarray(w, dtype=float)\n", " p = np.asarray(p, dtype=float)\n", " # The square root is taken once, over the weighted mean squared error;\n", " # summing per-row square roots makes the score grow with the number of rows.\n", " return np.sqrt(np.sum((v - p)**2 * w) / np.sum(w))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 1.4.2. 모델 선택" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
agelag1_SLGlag2_SLGlag3_SLGmean_SLG
3230.3894150.3629310.3493440.332527
4240.3333330.3894150.3629310.332527
5250.2222220.3333330.3894150.332527
7200.4370480.4469160.4456390.466540
8210.2857140.4370480.4469160.466540
\n", "
" ], "text/plain": [ " age lag1_SLG lag2_SLG lag3_SLG mean_SLG\n", "3 23 0.389415 0.362931 0.349344 0.332527\n", "4 24 0.333333 0.389415 0.362931 0.332527\n", "5 25 0.222222 0.333333 0.389415 0.332527\n", "7 20 0.437048 0.446916 0.445639 0.466540\n", "8 21 0.285714 0.437048 0.446916 0.466540" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "SLG_train.iloc[:,-5:].head()" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import Ridge, Lasso\n", "from sklearn.model_selection import GridSearchCV\n", "\n", "# log 단위(1e+01)로 1.e-04 ~ 1.e+01 사이의 구간에 대해 parameter를 탐색한다. \n", "lasso_params = {'alpha' : np.logspace(-4, 1, 6)} \n", "ridge_params = {'alpha' : np.logspace(-4, 1, 6)} \n", "\n", "# GridSearchCV를 이용하여 dict에 Lasso, Ridege OBP 모델을 저장한다.\n", "OBP_linear_models = {\n", " 'Lasso': GridSearchCV(Lasso(), param_grid=lasso_params).fit(OBP_train.iloc[:,-5:], OBP_train['OBP']).best_estimator_,\n", " 'Ridge': GridSearchCV(Ridge(), param_grid=ridge_params).fit(OBP_train.iloc[:,-5:], OBP_train['OBP']).best_estimator_,}\n", "\n", "# GridSearchCV를 이용하여 dict에 Lasso, Ridge SLG 모델을 저장한다\n", "SLG_linear_models = {\n", " 'Lasso': GridSearchCV(Lasso(), param_grid=lasso_params).fit(SLG_train.iloc[:,-5:], SLG_train['SLG']).best_estimator_,\n", " 'Ridge': GridSearchCV(Ridge(), param_grid=ridge_params).fit(SLG_train.iloc[:,-5:], SLG_train['SLG']).best_estimator_,}" ] }, { "cell_type": "code", "execution_count": 60, "metadata": { "collapsed": true }, "outputs": [ { "ename": "KeyboardInterrupt", "evalue": "", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", "\u001b[1;32m~\\AppData\\Local\\Temp\\ipykernel_2780\\1190445439.py\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 17\u001b[0m \u001b[1;31m# 
GridSearchCV를 이용하여 dict에 OBP Randomforest 모델을 저장한다.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 18\u001b[0m SLG_RF_models = {'RF': GridSearchCV(RandomForestRegressor(random_state=42), param_grid=RF_params, n_jobs=-1\n\u001b[1;32m---> 19\u001b[1;33m ).fit(SLG_train.iloc[:,-5:], SLG_train['SLG']).best_estimator_}\n\u001b[0m\u001b[0;32m 20\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 21\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34mf\"걸린시간 : {np.round(time.time() - start,3)}초\"\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;31m# 현재시간 – 시작시간(단위 초)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\anaconda3\\envs\\store_amount_prediction\\lib\\site-packages\\sklearn\\model_selection\\_search.py\u001b[0m in \u001b[0;36mfit\u001b[1;34m(self, X, y, groups, **fit_params)\u001b[0m\n\u001b[0;32m 889\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mresults\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 890\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 891\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_run_search\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mevaluate_candidates\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 892\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 893\u001b[0m \u001b[1;31m# multimetric is determined here because in the case of a callable\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\anaconda3\\envs\\store_amount_prediction\\lib\\site-packages\\sklearn\\model_selection\\_search.py\u001b[0m in \u001b[0;36m_run_search\u001b[1;34m(self, evaluate_candidates)\u001b[0m\n\u001b[0;32m 1390\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_run_search\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m 
\u001b[0mevaluate_candidates\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1391\u001b[0m \u001b[1;34m\"\"\"Search all candidates in param_grid\"\"\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1392\u001b[1;33m \u001b[0mevaluate_candidates\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mParameterGrid\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mparam_grid\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1393\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1394\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\anaconda3\\envs\\store_amount_prediction\\lib\\site-packages\\sklearn\\model_selection\\_search.py\u001b[0m in \u001b[0;36mevaluate_candidates\u001b[1;34m(candidate_params, cv, more_results)\u001b[0m\n\u001b[0;32m 849\u001b[0m )\n\u001b[0;32m 850\u001b[0m for (cand_idx, parameters), (split_idx, (train, test)) in product(\n\u001b[1;32m--> 851\u001b[1;33m \u001b[0menumerate\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcandidate_params\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0menumerate\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcv\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mgroups\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 852\u001b[0m )\n\u001b[0;32m 853\u001b[0m )\n", "\u001b[1;32m~\\anaconda3\\envs\\store_amount_prediction\\lib\\site-packages\\joblib\\parallel.py\u001b[0m in \u001b[0;36m__call__\u001b[1;34m(self, iterable)\u001b[0m\n\u001b[0;32m 1054\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1055\u001b[0m \u001b[1;32mwith\u001b[0m 
\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_backend\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mretrieval_context\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1056\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mretrieve\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1057\u001b[0m \u001b[1;31m# Make sure that we get a last message telling us we are done\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1058\u001b[0m \u001b[0melapsed_time\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m-\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_start_time\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\anaconda3\\envs\\store_amount_prediction\\lib\\site-packages\\joblib\\parallel.py\u001b[0m in \u001b[0;36mretrieve\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 933\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 934\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_backend\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'supports_timeout'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;32mFalse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 935\u001b[1;33m 
\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_output\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mextend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mjob\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 936\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 937\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_output\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mextend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mjob\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\anaconda3\\envs\\store_amount_prediction\\lib\\site-packages\\joblib\\_parallel_backends.py\u001b[0m in \u001b[0;36mwrap_future_result\u001b[1;34m(future, timeout)\u001b[0m\n\u001b[0;32m 540\u001b[0m AsyncResults.get from multiprocessing.\"\"\"\n\u001b[0;32m 541\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 542\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mfuture\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mresult\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 543\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mCfTimeoutError\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 544\u001b[0m \u001b[1;32mraise\u001b[0m \u001b[0mTimeoutError\u001b[0m \u001b[1;32mfrom\u001b[0m 
\u001b[0me\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\anaconda3\\envs\\store_amount_prediction\\lib\\concurrent\\futures\\_base.py\u001b[0m in \u001b[0;36mresult\u001b[1;34m(self, timeout)\u001b[0m\n\u001b[0;32m 428\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__get_result\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 429\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 430\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_condition\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwait\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 431\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 432\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_state\u001b[0m \u001b[1;32min\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mCANCELLED\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mCANCELLED_AND_NOTIFIED\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\anaconda3\\envs\\store_amount_prediction\\lib\\threading.py\u001b[0m in \u001b[0;36mwait\u001b[1;34m(self, timeout)\u001b[0m\n\u001b[0;32m 294\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m \u001b[1;31m# restore state no matter what (e.g., KeyboardInterrupt)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 295\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mtimeout\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 296\u001b[1;33m \u001b[0mwaiter\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0macquire\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 297\u001b[0m 
\u001b[0mgotit\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 298\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;31mKeyboardInterrupt\u001b[0m: " ] } ], "source": [ "import time\n", "from sklearn.ensemble import RandomForestRegressor \n", "start = time.time() # record the start time\n", "\n", "# Hyper-parameter grid for the random forest.\n", "# max_features=1.0 uses every feature, which is what 'auto' meant for regressors;\n", "# the 'auto' alias was deprecated in scikit-learn 1.1 and removed in 1.3.\n", "RF_params = {\n", " 'n_estimators': [50,100,150,200,300,500,1000],\n", " 'max_features': [1.0, 'sqrt'],\n", " 'max_depth' : [1,2,3,5,6,10],\n", " 'min_samples_leaf': [1, 2, 4],\n", " 'min_samples_split': [2, 3, 5, 10]}\n", "\n", "# Grid-search the OBP random forest and keep the best estimator in a dict.\n", "OBP_RF_models = {'RF': GridSearchCV(RandomForestRegressor(random_state=42), param_grid=RF_params, n_jobs=-1\n", " ).fit(OBP_train.iloc[:,-5:], OBP_train['OBP']).best_estimator_}\n", "\n", "# Grid-search the SLG random forest and keep the best estimator in a dict.\n", "SLG_RF_models = {'RF': GridSearchCV(RandomForestRegressor(random_state=42), param_grid=RF_params, n_jobs=-1\n", " ).fit(SLG_train.iloc[:,-5:], SLG_train['SLG']).best_estimator_}\n", "\n", "print(f\"걸린시간 : {np.round(time.time() - start, 3)}초\") # elapsed time in seconds" ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "걸린시간 : 1115.496초\n" ] } ], "source": [ "import xgboost as xgb \n", "start = time.time() # 시작 시간 저장\n", "\n", "# xgboost parmeter space를 정의한다.\n", "XGB_params = {\n", " 'min_child_weight': [1,3, 5,10],\n", " 'gamma': [0.3,0.5, 1, 1.5, 2, 5],\n", " 'subsample': [0.6, 0.8, 1.0],\n", " 'colsample_bytree': [0.6, 0.8, 1.0],\n", " 'max_depth': [3, 4, 5,7,10]}\n", "# GridSearchCV를 통해 parameter를 탐색하게 정의한다.\n", "XGB_OBP_gridsearch = GridSearchCV(xgb.XGBRegressor(random_state=42), param_grid=XGB_params, n_jobs=-1) \n", "XGB_SLG_gridsearch = GridSearchCV(xgb.XGBRegressor(random_state=42), 
param_grid=XGB_params, n_jobs=-1)\n", "\n", "# 모델 학습\n", "XGB_OBP_gridsearch.fit(OBP_train.iloc[:,-5:], OBP_train['OBP'])\n", "XGB_SLG_gridsearch.fit(SLG_train.iloc[:,-5:], SLG_train['SLG'])\n", "\n", "print(f\"걸린시간 : {np.round(time.time() - start,3)}초\") # 현재시간 – 시작시간(단위 초)" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "# 테스트 데이터셋(2018년)의 선수들의 OBP를 예측\n", "Lasso_OBP = OBP_linear_models['Lasso'].predict(OBP_test.iloc[:,-5:])\n", "Ridge_OBP = OBP_linear_models['Ridge'].predict(OBP_test.iloc[:,-5:])\n", "RF_OBP = OBP_RF_models['RF'].predict(OBP_test.iloc[:,-5:])\n", "XGB_OBP = XGB_OBP_gridsearch.predict(OBP_test.iloc[:,-5:])\n", "\n", "# test 데이터의 WRMSE 계산\n", "wrmse_score = [wrmse(OBP_test['OBP'], OBP_test['AB'], Lasso_OBP),\n", " wrmse(OBP_test['OBP'], OBP_test['AB'], Ridge_OBP),\n", " wrmse(OBP_test['OBP'], OBP_test['AB'], RF_OBP),\n", " wrmse(OBP_test['OBP'], OBP_test['AB'], XGB_OBP)]\n", "\n", "x_lab = ['Lasso', 'Ridge', 'RF', 'XGB']\n", "\n", "plt.bar(x_lab, wrmse_score)\n", "plt.title('WRMSE of OBP', fontsize=20)\n", "plt.xlabel('model', fontsize=18)\n", "plt.ylabel('', fontsize=18)\n", "plt.ylim(0,0.5)\n", "\n", "# 막대그래프 위에 값을 표시해준다.\n", "for i, v in enumerate(wrmse_score):\n", " plt.text(i-0.1, v+0.01, str(np.round(v, 3))) # x 좌표, y 좌표, 텍스트를 표현한다.\n", " \n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "# 테스트 데이터셋(2018년)의 선수들의 SLG를 예측\n", "Lasso_SLG = SLG_linear_models['Lasso'].predict(SLG_test.iloc[:,-5:])\n", "Ridge_SLG = SLG_linear_models['Ridge'].predict(SLG_test.iloc[:,-5:])\n", "RF_SLG = SLG_RF_models['RF'].predict(SLG_test.iloc[:,-5:])\n", "XGB_SLG = XGB_SLG_gridsearch.predict(SLG_test.iloc[:,-5:])\n", "\n", "# test데이터 WRMSE 계산\n", "wrmse_score_SLG = [wrmse(SLG_test['SLG'], SLG_test['AB'], Lasso_SLG),\n", " wrmse(SLG_test['SLG'], SLG_test['AB'], Ridge_SLG), \n", " wrmse(SLG_test['SLG'], SLG_test['AB'], RF_SLG),\n", " wrmse(SLG_test['SLG'], SLG_test['AB'], XGB_SLG)]\n", "\n", "x_lab = ['Lasso', 'Ridge', 'RF', 'XGB']\n", "\n", "plt.bar(x_lab, wrmse_score_SLG)\n", "plt.title('WRMSE of SLG', fontsize=20)\n", "plt.xlabel('model', fontsize=18)\n", "plt.ylabel('', fontsize=18)\n", "plt.ylim(0, 0.9)\n", "\n", "# 막대그래프 위에 값을 표시해준다.\n", "for i, v in enumerate(wrmse_score_SLG):\n", " plt.text(i-0.1, v + 0.01, str(np.round(v,3))) # x 좌표, y 좌표, 텍스트를 표현한다.\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 1.4.3. 결과 해석과 평가" ] }, { "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "plt.figure(figsize=(15,6)) # 그래프의 크기 지정\n", "\n", "plt.subplot(1,2,1) \n", "plt.barh(OBP_train.iloc[:,-5:].columns, OBP_RF_models['RF'].feature_importances_) \n", "plt.title('Feature importance of RF in OBP')\n", "\n", "plt.subplot(1,2,2) \n", "plt.barh(SLG_train.iloc[:,-5:].columns,SLG_RF_models['RF'].feature_importances_)\n", "plt.title('Feature importance of RF in SLG')\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Alpha : 0.0001\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
agelag1_OBPlag2_OBPlag3_OBPmean_OBP
coefficient0.0031950.0182490.00.00.864913
\n", "
" ], "text/plain": [ " age lag1_OBP lag2_OBP lag3_OBP mean_OBP\n", "coefficient 0.003195 0.018249 0.0 0.0 0.864913" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Alpha : 0.0001\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
agelag1_SLGlag2_SLGlag3_SLGmean_SLG
coefficient0.00490.0812090.0-0.00.836453
\n", "
" ], "text/plain": [ " age lag1_SLG lag2_SLG lag3_SLG mean_SLG\n", "coefficient 0.0049 0.081209 0.0 -0.0 0.836453" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "print('Alpha : ', OBP_linear_models['Lasso'].alpha) # Lasso + GridSearchCV -> alpha값 출력\n", "# Lasso model의 선형 계수 값 출력\n", "display(pd.DataFrame(OBP_linear_models['Lasso'].coef_.reshape(-1, 5), columns=OBP_train.iloc[:,-5:].columns, index=['coefficient']))\n", "\n", "print('Alpha : ', SLG_linear_models['Lasso'].alpha)\n", "display(pd.DataFrame(SLG_linear_models['Lasso'].coef_.reshape(-1, 5), columns=SLG_train.iloc[:,-5:].columns, index=['coefficient']))" ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ ".." ] }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "from sklearn.linear_model import lars_path # LASSO의 계산 측면에서의 단점을 극복\n", "\n", "plt.figure(figsize=(15,4.8)) # 그래프 크기 지정\n", "\n", "plt.subplot(1,2,1) \n", "# OBP 모델의 alpha 값의 변화에 따른 계수의 변화를 alpha, coefs에 저장한다.\n", "alphas, _, coefs = lars_path(OBP_train.iloc[:,-5:].values, OBP_train['OBP'], method='lasso', verbose=True)\n", "xx = np.sum(np.abs(coefs.T), axis=1)# 피처별 alpha 값에 따른 선형 모델 계수의 절댓값의 합 \n", "xx /= xx[-1]# 계수의 절댓값 중 가장 큰 값으로 alpha에 따른 피처의 계수의 합을 나눈다. \n", "\n", "plt.plot(xx, coefs.T)\n", "plt.xlabel('|coef| / max|coef|')\n", "plt.ylabel('Coefficients')\n", "plt.title('OBP LASSO Path')\n", "plt.axis('tight')\n", "plt.legend(OBP_train.iloc[:,-5:].columns) \n", "\n", "\n", "plt.subplot(1,2,2)\n", "# SLG 모델에서 alpha 값의 변화에 따른 계수의 변화를 alpha, coefs에 저장한다.\n", "alphas, _, coefs = lars_path(SLG_train.iloc[:,-5:].values, SLG_train['SLG'], method='lasso', verbose=True)\n", "xx = np.sum(np.abs(coefs.T), axis=1)\n", "xx /= xx[-1]\n", "\n", "plt.plot(xx, coefs.T)\n", "plt.xlabel('|coef| / max|coef|')\n", "plt.ylabel('Coefficients')\n", "plt.title('SLG LASSO Path')\n", "plt.axis('tight')\n", "plt.legend(OBP_train.iloc[:,-5:].columns)\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1.5. 성능 향상을 위한 방법\n", "### 1.5.1. 앙상블" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "OBP model averaging: 0.3324574652010582\n", "SLG model averaging: 0.6684541138633259\n" ] } ], "source": [ "print('OBP model averaging: ',\n", " wrmse(OBP_test['OBP'], OBP_test['AB'], (Lasso_OBP + RF_OBP) / 2))\n", "print('SLG model averaging: ',\n", " wrmse(SLG_test['SLG'], OBP_test['AB'], (Lasso_SLG + RF_SLG) / 2))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 1.5.2. 
df 단순화" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:18: FutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.\n" ] } ], "source": [ "# Keep a copy of the already preprocessed data\n", "sum_hf_yr_OBP_origin = sum_hf_yr_OBP.copy()\n", "\n", "# Sacrifice flies, recovered from the OBP definition:\n", "# OBP = (H + BB + HBP) / (AB + BB + HBP + SF) => SF = (H + BB + HBP) / OBP - (AB + BB + HBP)\n", "# The parentheses keep the subtraction inside the same statement as the division.\n", "regular_season_df['SF'] = (regular_season_df[['H','BB','HBP']].sum(axis=1) / regular_season_df['OBP']\n", " - regular_season_df[['AB','BB','HBP']].sum(axis=1))\n", "regular_season_df['SF'] = regular_season_df['SF'].fillna(0)\n", "regular_season_df['SF'] = regular_season_df['SF'].apply(lambda x : round(x,0))\n", "\n", "# Average sacrifice flies per at-bat\n", "regular_season_df['SF_1'] = regular_season_df['SF'] / regular_season_df['AB']\n", "regular_season_df_SF = regular_season_df[['batter_name','year','SF_1']]\n", "\n", "# First-half OBP-related stats per player and year, plus RBI and the extra-base-hit columns.\n", "# Columns are selected with a list: tuple-style selection on a groupby is deprecated.\n", "sum_hf_yr_OBP = day_by_day_df.loc[day_by_day_df['date'] <= 7.18].groupby(['batter_name','year'])[['AB','H','BB','HBP','RBI', '2B', '3B', 'HR']].sum().reset_index()\n", "sum_hf_yr_OBP = sum_hf_yr_OBP.merge(regular_season_df_SF, how = 'left',on=['batter_name','year'])\n", "\n", "# Estimate first-half SF by scaling the per-at-bat rate by first-half AB\n", "sum_hf_yr_OBP['SF'] = (sum_hf_yr_OBP['SF_1']*sum_hf_yr_OBP['AB']).apply(lambda x: round(x,0))\n", "sum_hf_yr_OBP.drop('SF_1',axis = 1, inplace = True)\n", "\n", "# First-half OBP\n", "sum_hf_yr_OBP['OBP'] = sum_hf_yr_OBP[['H', 'BB', 'HBP']].sum(axis = 1) / sum_hf_yr_OBP[['AB', 'BB', 'HBP','SF']].sum(axis = 1)\n", "sum_hf_yr_OBP['OBP'] = sum_hf_yr_OBP['OBP'].fillna(0)\n", "\n", "# Total bases: H already counts every hit once, so extra-base hits add only their extra bases\n", "sum_hf_yr_OBP['TB'] = sum_hf_yr_OBP['H'] + sum_hf_yr_OBP['2B'] + sum_hf_yr_OBP['3B']*2 + sum_hf_yr_OBP['HR']*3\n", "sum_hf_yr_OBP = sum_hf_yr_OBP[['batter_name','year','AB','OBP', 'BB', 'TB', 'RBI']]\n", "\n", "# Age\n", "sum_hf_yr_OBP = sum_hf_yr_OBP.merge(regular_season_df[['batter_name','year','age']], how = 
'left', on=['batter_name','year'])\n", "\n", "# 평균 OBP \n", "sum_hf_yr_OBP = sum_hf_yr_OBP.merge(player_OBP_mean[['batter_name', 'mean_OBP']], how ='left', on=\"batter_name\")\n", "sum_hf_yr_OBP = sum_hf_yr_OBP.loc[~sum_hf_yr_OBP['mean_OBP'].isna()].reset_index(drop=True)" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [], "source": [ "# 각 변수에 대한 1년 전 성적 생성\n", "sum_hf_yr_OBP = lag_function(sum_hf_yr_OBP, \"BB\", 1)\n", "sum_hf_yr_OBP = lag_function(sum_hf_yr_OBP, \"TB\", 1)\n", "sum_hf_yr_OBP = lag_function(sum_hf_yr_OBP, \"RBI\", 1)\n", "sum_hf_yr_OBP = lag_function(sum_hf_yr_OBP, \"OBP\", 1)\n", "\n", "sum_hf_yr_OBP = sum_hf_yr_OBP.dropna() # 결측치 포함한 행 제거\n", "\n", "# 변수 리스트 지정\n", "feature_list_1 = ['age', 'lag1_OBP', 'mean_OBP']\n", "feature_list_2 = ['age', 'lag1_BB', 'lag1_TB', 'lag1_RBI','lag1_OBP', 'mean_OBP']" ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [], "source": [ "# 학습시킬 데이터 30타수 이상만 학습\n", "sum_hf_yr_OBP= sum_hf_yr_OBP.loc[sum_hf_yr_OBP['AB']>=30] \n", "\n", "# 2018년 test로 나누고 나머지는 학습\n", "OBP_train = sum_hf_yr_OBP.loc[sum_hf_yr_OBP['year'] != 2018]\n", "OBP_test = sum_hf_yr_OBP.loc[sum_hf_yr_OBP['year'] == 2018]\n", "\n", "# grid search를 이용해 학습한다.\n", "OBP_RF_models_1 = {\n", " 'RF': GridSearchCV(\n", " RandomForestRegressor(random_state=42), param_grid=RF_params, n_jobs=-1).fit(OBP_train.loc[:,feature_list_1], OBP_train['OBP']).best_estimator_}\n", "\n", "OBP_RF_models_2 = {\n", " 'RF': GridSearchCV(\n", " RandomForestRegressor(random_state=42), param_grid=RF_params, n_jobs=-1).fit(OBP_train.loc[:,feature_list_2], OBP_train['OBP']).best_estimator_}" ] }, { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [
 "# Predict the 2018 test set with the model of each feature list\n",
 "RF_OBP_1 = OBP_RF_models_1['RF'].predict(OBP_test.loc[:, feature_list_1])\n",
 "RF_OBP_2 = OBP_RF_models_2['RF'].predict(OBP_test.loc[:, feature_list_2])\n",
 "\n",
 "# wrmse of both models, in the same order as the bar labels\n",
 "x_lab = ['simple', 'complicate']\n",
 "wrmse_score = [wrmse(OBP_test['OBP'], OBP_test['AB'], prediction)\n",
 "               for prediction in (RF_OBP_1, RF_OBP_2)]\n",
 "\n",
 "plt.bar(x_lab, wrmse_score)\n",
 "plt.title('WRMSE of OBP', fontsize=20)\n",
 "plt.xlabel('model', fontsize=18)\n",
 "plt.ylabel('', fontsize=18)\n",
 "plt.ylim(0, 0.5)\n",
 "# Print every score slightly above its bar (x position, y position, text)\n",
 "for position, score in zip(range(len(wrmse_score)), wrmse_score):\n",
 "    plt.text(position - 0.1, score + 0.01, str(np.round(score, 3)))\n",
 "plt.show()"
] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [], "source": [
 "# Restore the original preprocessed data for the final submission\n",
 "sum_hf_yr_OBP = sum_hf_yr_OBP_origin.copy()"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 1.5.3. 테스트 데이터 정제" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
batter_idbatter_nameyearyear_bornage
01강경학20191992년 08월 11일27
12강구성20191993년 06월 09일26
23강민국20191992년 01월 10일27
34강민호20191985년 08월 18일34
45강백호20191999년 07월 29일20
\n", "
" ], "text/plain": [ " batter_id batter_name year year_born age\n", "0 1 강경학 2019 1992년 08월 11일 27\n", "1 2 강구성 2019 1993년 06월 09일 26\n", "2 3 강민국 2019 1992년 01월 10일 27\n", "3 4 강민호 2019 1985년 08월 18일 34\n", "4 5 강백호 2019 1999년 07월 29일 20" ] }, "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "submission = pd.read_csv('./input/submission.csv')\n",
 "submission['year'] = 2019  # season that is being predicted\n",
 "\n",
 "# One row per distinct (batter_id, batter_name, year_born), used to compute the age in 2019\n",
 "batter_year_born = (\n",
 "    regular_season_df[['batter_id', 'batter_name', 'year_born']]\n",
 "    .drop_duplicates()\n",
 "    .reset_index(drop=True)\n",
 ")\n",
 "\n",
 "submission = submission.merge(batter_year_born, how='left', on=['batter_id', 'batter_name'])\n",
 "# Age = prediction year minus the birth year (first four characters of year_born)\n",
 "submission['age'] = submission['year'] - submission['year_born'].map(lambda born: int(born[:4]))\n",
 "submission.head()"
] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [], "source": [
 "# Independent working copies for the OBP and the SLG features\n",
 "submission_OBP = submission.copy()\n",
 "submission_SLG = submission.copy()"
] }, { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
batter_idbatter_nameyearyear_bornagemean_OBPlag1_OBPlag2_OBPlag3_OBP
01강경학20191992년 08월 11일270.3378800.4236110.2857140.222222
12강구성20191993년 06월 09일26NaNNaNNaNNaN
23강민국20191992년 01월 10일27NaNNaNNaNNaN
34강민호20191985년 08월 18일340.3581870.3289900.3860760.441860
45강백호20191999년 07월 29일200.3561640.355685NaNNaN
\n", "
" ], "text/plain": [ " batter_id batter_name year year_born age mean_OBP lag1_OBP \\\n", "0 1 강경학 2019 1992년 08월 11일 27 0.337880 0.423611 \n", "1 2 강구성 2019 1993년 06월 09일 26 NaN NaN \n", "2 3 강민국 2019 1992년 01월 10일 27 NaN NaN \n", "3 4 강민호 2019 1985년 08월 18일 34 0.358187 0.328990 \n", "4 5 강백호 2019 1999년 07월 29일 20 0.356164 0.355685 \n", "\n", " lag2_OBP lag3_OBP \n", "0 0.285714 0.222222 \n", "1 NaN NaN \n", "2 NaN NaN \n", "3 0.386076 0.441860 \n", "4 NaN NaN " ] }, "execution_count": 69, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Attach the per-player mean OBP from the preprocessed data\n",
 "submission_OBP = submission_OBP.merge(\n",
 "    sum_hf_yr_OBP[['batter_name', 'mean_OBP']].drop_duplicates().reset_index(drop=True),\n",
 "    how='left', on='batter_name')\n",
 "\n",
 "# Attach the OBP of 1, 2 and 3 seasons ago (seasons with at least 30 at-bats only)\n",
 "for i in [1, 2, 3]:\n",
 "    temp_lag_df = sum_hf_yr_OBP.loc[\n",
 "        (sum_hf_yr_OBP['year'] == (2019 - i)) & (sum_hf_yr_OBP['AB'] >= 30),\n",
 "        ['batter_name', 'OBP']\n",
 "    ].rename(columns={'OBP': 'lag' + str(i) + '_OBP'})\n",
 "    submission_OBP = submission_OBP.merge(temp_lag_df, how='left', on='batter_name')\n",
 "\n",
 "submission_OBP.head()"
] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['강구성', '강민국', '강상원', '고명성', '김응민', '김종덕', '김주찬', '김철호', '김태연',\n", " '김태진', '김형준', '나원탁', '남태혁', '박광열', '박기혁', '백민기', '샌즈', '신범수',\n", " '신성현', '양종민', '윤정우', '이동훈', '이범호', '이병휘', '이성곤', '이인행', '이종욱',\n", " '이진영', '이창진', '장승현', '장시윤', '전민재', '전병우', '정경운', '정성훈', '조홍석',\n", " '최원제', '홍창기'], dtype=object)" ] }, "execution_count": 70, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Batters that still have no mean OBP\n",
 "submission_OBP.loc[submission_OBP['mean_OBP'].isna(), 'batter_name'].values"
] }, { "cell_type": "code", "execution_count": 71, "metadata": {}, "outputs": [], "source": [
 "for batter_name in ['김주찬', '이범호']:\n",
 "    # Regular-season rows of this batter with at least 30 at-bats\n",
 "    cond_regular = (regular_season_df['AB'] >= 30) & (regular_season_df['batter_name'] == batter_name)\n",
 "    player_seasons = regular_season_df.loc[cond_regular]\n",
 "    if player_seasons.empty:\n",
 "        continue  # nothing to fill from; the NaNs are left for the fallback cells below\n",
 "\n",
 "    cond_sub = submission_OBP['batter_name'] == batter_name\n",
 "\n",
 "    # Mean OBP weighted by at-bats\n",
 "    mean_OBP = np.average(player_seasons['OBP'], weights=player_seasons['AB'])\n",
 "    submission_OBP.loc[cond_sub, 'mean_OBP'] = mean_OBP\n",
 "\n",
 "    # OBP of 1, 2 and 3 seasons ago, taken from the regular-season table\n",
 "    for lag in [1, 2, 3]:\n",
 "        season_OBP = player_seasons.loc[player_seasons['year'] == 2019 - lag, 'OBP'].values\n",
 "        if len(season_OBP):  # a season without a qualifying row stays NaN\n",
 "            submission_OBP.loc[cond_sub, f'lag{lag}_OBP'] = season_OBP[0]"
] }, { "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [], "source": [
 "# These batters get the 2018 season mean OBP as their mean OBP.\n",
 "# The value is extracted as a scalar: assigning the filtered Series itself is\n",
 "# aligned on the index by .loc and would write NaN into the selected rows.\n",
 "season_mean_names = ['고명성', '전민재', '김철호', '신범수', '이병휘']\n",
 "season_mean_2018 = season_OBP_mean.loc[season_OBP_mean['year'] == 2018, 'mean_OBP'].values[0]\n",
 "submission_OBP.loc[submission_OBP['batter_name'].isin(season_mean_names), 'mean_OBP'] = season_mean_2018"
] }, { "cell_type": "code", "execution_count": 73, "metadata": {}, "outputs": [], "source": [
 "# The whole body is inside the loop so that every listed batter is processed,\n",
 "# not only the last one.\n",
 "for batter_name in ['전병우', '샌즈']:\n",
 "    # Regular-season rows of this batter with at least 30 at-bats\n",
 "    cond_regular = (regular_season_df['AB'] >= 30) & (regular_season_df['batter_name'] == batter_name)\n",
 "    player_seasons = regular_season_df.loc[cond_regular]\n",
 "    if player_seasons.empty:\n",
 "        continue  # nothing to fill from; the NaNs are left for the fallback cells below\n",
 "\n",
 "    cond_sub = submission_OBP['batter_name'] == batter_name\n",
 "\n",
 "    # Mean OBP weighted by at-bats\n",
 "    mean_OBP = np.average(player_seasons['OBP'], weights=player_seasons['AB'])\n",
 "    submission_OBP.loc[cond_sub, 'mean_OBP'] = mean_OBP\n",
 "\n",
 "    # Only the 2018 value (one season before 2019) is filled for these batters\n",
 "    season_OBP = player_seasons.loc[player_seasons['year'] == 2018, 'OBP'].values\n",
 "    if len(season_OBP):\n",
 "        submission_OBP.loc[cond_sub, 'lag1_OBP'] = season_OBP[0]"
] }, { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [], "source": [
 "# Batters that still have no mean OBP get the lower quartile (25%) of the per-player mean OBP\n",
 "submission_OBP.loc[submission_OBP['mean_OBP'].isna(), 'mean_OBP'] = np.quantile(player_OBP_mean['mean_OBP'], 0.25)"
] }, { "cell_type": "code", "execution_count": 75, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
batter_idbatter_nameyearyear_bornagemean_OBPlag1_OBPlag2_OBPlag3_OBP
01강경학20191992년 08월 11일270.3378800.4236110.2857140.222222
12강구성20191993년 06월 09일260.3041240.3299910.3302970.336224
23강민국20191992년 01월 10일270.3041240.3299910.3302970.336224
34강민호20191985년 08월 18일340.3581870.3289900.3860760.441860
45강백호20191999년 07월 29일200.3561640.3556850.3563170.362245
\n", "
" ], "text/plain": [ " batter_id batter_name year year_born age mean_OBP lag1_OBP \\\n", "0 1 강경학 2019 1992년 08월 11일 27 0.337880 0.423611 \n", "1 2 강구성 2019 1993년 06월 09일 26 0.304124 0.329991 \n", "2 3 강민국 2019 1992년 01월 10일 27 0.304124 0.329991 \n", "3 4 강민호 2019 1985년 08월 18일 34 0.358187 0.328990 \n", "4 5 강백호 2019 1999년 07월 29일 20 0.356164 0.355685 \n", "\n", " lag2_OBP lag3_OBP \n", "0 0.285714 0.222222 \n", "1 0.330297 0.336224 \n", "2 0.330297 0.336224 \n", "3 0.386076 0.441860 \n", "4 0.356317 0.362245 " ] }, "execution_count": 75, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Fill the missing OBP of 1, 2 and 3 seasons ago\n",
 "for i in [1, 2, 3]:\n",
 "    submission_OBP = lag_na_fill(submission_OBP, 'OBP', i, season_OBP_mean)\n",
 "submission_OBP.head()"
] }, { "cell_type": "code", "execution_count": 76, "metadata": {}, "outputs": [], "source": [
 "# Attach the per-player mean SLG from the preprocessed data\n",
 "submission_SLG = submission_SLG.merge(\n",
 "    sum_hf_yr_SLG[['batter_name', 'mean_SLG']].drop_duplicates().reset_index(drop=True),\n",
 "    how='left', on='batter_name')\n",
 "\n",
 "# Attach the SLG of 1, 2 and 3 seasons ago (seasons with at least 30 at-bats only)\n",
 "for i in [1, 2, 3]:\n",
 "    temp_lag_df = sum_hf_yr_SLG.loc[\n",
 "        (sum_hf_yr_SLG['year'] == (2019 - i)) & (sum_hf_yr_SLG['AB'] >= 30),\n",
 "        ['batter_name', 'SLG']\n",
 "    ].rename(columns={'SLG': 'lag' + str(i) + '_SLG'})\n",
 "    submission_SLG = submission_SLG.merge(temp_lag_df, how='left', on='batter_name')"
] }, { "cell_type": "code", "execution_count": 77, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['강구성', '강민국', '강상원', '고명성', '김응민', '김종덕', '김주찬', '김철호', '김태연',\n", " '김태진', '김형준', '나원탁', '남태혁', '박광열', '박기혁', '백민기', '샌즈', '신범수',\n", " '신성현', '양종민', '윤정우', '이동훈', '이범호', '이병휘', '이성곤', '이인행', '이종욱',\n", " '이진영', '이창진', '장승현', '장시윤', '전민재', '전병우', '정경운', '정성훈', '조홍석',\n", " '최원제', '홍창기'], dtype=object)" ] }, "execution_count": 77, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Batters that still have no mean SLG\n",
 "submission_SLG.loc[submission_SLG['mean_SLG'].isna(), 'batter_name'].values"
] }, { "cell_type": "code", "execution_count": 78, "metadata": {}, "outputs": [], "source": [
 "for batter_name in ['김주찬', '이범호']:\n",
 "    # Regular-season rows of this batter with at least 30 at-bats\n",
 "    cond_regular = (regular_season_df['AB'] >= 30) & (regular_season_df['batter_name'] == batter_name)\n",
 "    player_seasons = regular_season_df.loc[cond_regular]\n",
 "    if player_seasons.empty:\n",
 "        continue  # nothing to fill from; the NaNs are left for the fallback cells below\n",
 "\n",
 "    cond_sub = submission_SLG['batter_name'] == batter_name\n",
 "\n",
 "    # Mean SLG weighted by at-bats\n",
 "    mean_SLG = np.average(player_seasons['SLG'], weights=player_seasons['AB'])\n",
 "    submission_SLG.loc[cond_sub, 'mean_SLG'] = mean_SLG\n",
 "\n",
 "    # SLG of 1, 2 and 3 seasons ago, taken from the regular-season table\n",
 "    for lag in [1, 2, 3]:\n",
 "        season_SLG = player_seasons.loc[player_seasons['year'] == 2019 - lag, 'SLG'].values\n",
 "        if len(season_SLG):  # a season without a qualifying row stays NaN\n",
 "            submission_SLG.loc[cond_sub, f'lag{lag}_SLG'] = season_SLG[0]"
] }, { "cell_type": "code", "execution_count": 79, "metadata": {}, "outputs": [], "source": [
 "# These batters get the 2018 season mean SLG as their mean SLG.\n",
 "# The value is extracted as a scalar: assigning the filtered Series itself is\n",
 "# aligned on the index by .loc and would write NaN into the selected rows.\n",
 "season_mean_names = ['고명성', '전민재', '김철호', '신범수', '이병휘']\n",
 "season_mean_2018 = season_SLG_mean.loc[season_SLG_mean['year'] == 2018, 'mean_SLG'].values[0]\n",
 "submission_SLG.loc[submission_SLG['batter_name'].isin(season_mean_names), 'mean_SLG'] = season_mean_2018"
] }, { "cell_type": "code", "execution_count": 80, "metadata": {}, "outputs": [], "source": [
 "# The whole body is inside the loop so that every listed batter is processed,\n",
 "# not only the last one.\n",
 "for batter_name in ['전병우', '샌즈']:\n",
 "    # Regular-season rows of this batter with at least 30 at-bats\n",
 "    cond_regular = (regular_season_df['AB'] >= 30) & (regular_season_df['batter_name'] == batter_name)\n",
 "    player_seasons = regular_season_df.loc[cond_regular]\n",
 "    if player_seasons.empty:\n",
 "        continue  # nothing to fill from; the NaNs are left for the fallback cells below\n",
 "\n",
 "    cond_sub = submission_SLG['batter_name'] == batter_name\n",
 "\n",
 "    # Mean SLG weighted by at-bats\n",
 "    mean_SLG = np.average(player_seasons['SLG'], weights=player_seasons['AB'])\n",
 "    submission_SLG.loc[cond_sub, 'mean_SLG'] = mean_SLG\n",
 "\n",
 "    # Only the 2018 value (one season before 2019) is filled for these batters\n",
 "    season_SLG = player_seasons.loc[player_seasons['year'] == 2018, 'SLG'].values\n",
 "    if len(season_SLG):\n",
 "        submission_SLG.loc[cond_sub, 'lag1_SLG'] = season_SLG[0]"
] }, { "cell_type": "code", "execution_count": 81, "metadata": {}, "outputs": [], "source": [
 "# Batters that still have no mean SLG get the lower quartile (25%) of the per-player mean SLG\n",
 "submission_SLG.loc[submission_SLG['mean_SLG'].isna(), 'mean_SLG'] = np.quantile(player_SLG_mean['mean_SLG'], 0.25)"
] }, { "cell_type": "code", "execution_count": 82, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
batter_idbatter_nameyearyear_bornagemean_SLGlag1_SLGlag2_SLGlag3_SLG
01강경학20191992년 08월 11일270.3325270.5238100.2560980.222222
12강구성20191993년 06월 09일260.3269230.3914290.3857540.385397
23강민국20191992년 01월 10일270.3269230.3914290.3857540.385397
34강민호20191985년 08월 18일340.4665400.4872730.5487360.577689
45강백호20191999년 07월 29일200.5237190.5320510.4841520.483795
\n", "
" ], "text/plain": [ " batter_id batter_name year year_born age mean_SLG lag1_SLG \\\n", "0 1 강경학 2019 1992년 08월 11일 27 0.332527 0.523810 \n", "1 2 강구성 2019 1993년 06월 09일 26 0.326923 0.391429 \n", "2 3 강민국 2019 1992년 01월 10일 27 0.326923 0.391429 \n", "3 4 강민호 2019 1985년 08월 18일 34 0.466540 0.487273 \n", "4 5 강백호 2019 1999년 07월 29일 20 0.523719 0.532051 \n", "\n", " lag2_SLG lag3_SLG \n", "0 0.256098 0.222222 \n", "1 0.385754 0.385397 \n", "2 0.385754 0.385397 \n", "3 0.548736 0.577689 \n", "4 0.484152 0.483795 " ] }, "execution_count": 82, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Fill the missing SLG of 1, 2 and 3 seasons ago\n",
 "for lag in range(1, 4):\n",
 "    submission_SLG = lag_na_fill(submission_SLG, 'SLG', lag, season_SLG_mean)\n",
 "submission_SLG.head()"
] }, { "cell_type": "code", "execution_count": 83, "metadata": {}, "outputs": [], "source": [
 "# Both models are fed the last five columns of their submission frame\n",
 "OBP_features = submission_OBP.iloc[:, -5:]\n",
 "SLG_features = submission_SLG.iloc[:, -5:]\n",
 "\n",
 "predict_OBP = OBP_RF_models['RF'].predict(OBP_features)  # OBP from the Random Forest\n",
 "predict_SLG = SLG_linear_models['Lasso'].predict(SLG_features)  # SLG from the Lasso"
] }, { "cell_type": "code", "execution_count": 84, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ ":2: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " final_submission['OPS'] = predict_SLG + predict_OBP # OBP + SLG = OPS\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
batter_idbatter_nameOPS
01강경학0.503957
12강구성0.687933
23강민국0.696609
34강민호0.958395
45강백호0.751592
58강상원0.661807
69강승호0.505642
711강진성0.656007
812강한울0.672859
916고명성0.640507
\n", "
" ], "text/plain": [ " batter_id batter_name OPS\n", "0 1 강경학 0.503957\n", "1 2 강구성 0.687933\n", "2 3 강민국 0.696609\n", "3 4 강민호 0.958395\n", "4 5 강백호 0.751592\n", "5 8 강상원 0.661807\n", "6 9 강승호 0.505642\n", "7 11 강진성 0.656007\n", "8 12 강한울 0.672859\n", "9 16 고명성 0.640507" ] }, "execution_count": 84, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Build the result as a new frame (assign returns a copy) instead of writing a\n",
 "# column into a slice of `submission`, which raises SettingWithCopyWarning.\n",
 "final_submission = submission[['batter_id', 'batter_name']].assign(\n",
 "    OPS=predict_SLG + predict_OBP  # OPS = SLG + OBP\n",
 ")\n",
 "final_submission.head(10)"
] }, { "cell_type": "code", "execution_count": 88, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ ":1: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " final_submission['OPS'] = final_submission['OPS'] - 0.038\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
batter_idbatter_nameOPS
01강경학0.465957
12강구성0.649933
23강민국0.658609
34강민호0.920395
45강백호0.713592
58강상원0.623807
69강승호0.467642
711강진성0.618007
812강한울0.634859
916고명성0.602507
\n", "
" ], "text/plain": [ " batter_id batter_name OPS\n", "0 1 강경학 0.465957\n", "1 2 강구성 0.649933\n", "2 3 강민국 0.658609\n", "3 4 강민호 0.920395\n", "4 5 강백호 0.713592\n", "5 8 강상원 0.623807\n", "6 9 강승호 0.467642\n", "7 11 강진성 0.618007\n", "8 12 강한울 0.634859\n", "9 16 고명성 0.602507" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
 "# Constant subtracted from every predicted OPS.\n",
 "# NOTE(review): the rationale for 0.038 is not documented in this notebook - confirm.\n",
 "OPS_ADJUSTMENT = 0.038\n",
 "\n",
 "# The adjusted OPS is recomputed from the raw predictions, so re-running this\n",
 "# cell gives the same values instead of subtracting the constant again.\n",
 "final_submission = final_submission.assign(OPS=predict_SLG + predict_OBP - OPS_ADJUSTMENT)\n",
 "display(final_submission.head(10))\n",
 "final_submission.to_csv('submission.csv', index=False)  # write the final submission file"
] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.13" }, "varInspector": { "cols": { "lenName": 16, "lenType": 16, "lenVar": 40 }, "kernels_config": { "python": { "delete_cmd_postfix": "", "delete_cmd_prefix": "del ", "library": "var_list.py", "varRefreshCmd": "print(var_dic_list())" }, "r": { "delete_cmd_postfix": ") ", "delete_cmd_prefix": "rm(", "library": "var_list.r", "varRefreshCmd": "cat(var_dic_list()) " } }, "types_to_exclude": [ "module", "function", "builtin_function_or_method", "instance", "_Feature" ], "window_display": false } }, "nbformat": 4, "nbformat_minor": 4 }