{ "cells": [ { "cell_type": "markdown", "id": "68cd31ff", "metadata": {}, "source": [ "# 단국대 따릉이 대여량 \n", "\n", "### train/test\n", "- train : 2018-01-01 ~ 2020-12-31(3년치) (1095, 13) IN:2020-02-29, Out:2019-09-07\n", "- test : 2021-01-01 ~ 2021-12-31(1년치) (365, 12)\n", "\n", "\n", "### 독립변수\n", "+ date: 날짜\n", "+ precipitation: 강수량(mm)\n", "+ temp_mean: 평균 기온(℃)\n", "+ temp_highest: 최고 기온(℃)\n", "+ temp_lowest: 최저 기온(℃)\n", "+ PM10: 미세먼지(㎍/㎥)\n", "+ PM2.5: 초미세먼지(㎍/㎥)\n", "+ humidity: 습도(%rh)\n", "+ sunshine_sum: 일조합\n", "+ sunshine_rate: 일조율\n", "+ wind_mean: 평균 풍속(m/s)\n", "+ wind_max: 최대 풍속(m/s)\n", "\n", "### 종속변수\n", "+ rentals : 따릉이 대여량" ] }, { "cell_type": "code", "execution_count": 104, "id": "96caa9a0", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
dateprecipitationtemp_meantemp_highesttemp_lowestPM10PM2.5humiditysunshine_sumsunshine_ratewind_meanwind_maxrental
02018-01-01NaN-1.33.8-5.134.017.039.18.386.51.43.84950
12018-01-02NaN-1.81.8-4.336.022.042.07.982.31.84.97136
22018-01-03NaN-4.7-0.4-7.131.019.042.38.688.72.23.57156
32018-01-04NaN-4.7-0.7-8.739.024.043.06.263.91.43.57102
42018-01-05NaN-3.01.6-5.651.035.048.48.284.51.73.67705
..........................................
10902020-12-270.05.810.01.470.042.062.95.961.51.82.837103
10912020-12-281.36.711.44.266.044.072.18.083.31.43.146912
10922020-12-290.20.14.3-6.269.046.070.80.00.02.96.135747
10932020-12-30NaN-10.9-6.2-12.939.015.055.58.386.54.16.222488
10942020-12-310.0-8.9-5.0-12.928.012.053.96.062.52.44.224535
\n", "

1095 rows × 13 columns

\n", "
" ], "text/plain": [ " date precipitation temp_mean temp_highest temp_lowest PM10 \\\n", "0 2018-01-01 NaN -1.3 3.8 -5.1 34.0 \n", "1 2018-01-02 NaN -1.8 1.8 -4.3 36.0 \n", "2 2018-01-03 NaN -4.7 -0.4 -7.1 31.0 \n", "3 2018-01-04 NaN -4.7 -0.7 -8.7 39.0 \n", "4 2018-01-05 NaN -3.0 1.6 -5.6 51.0 \n", "... ... ... ... ... ... ... \n", "1090 2020-12-27 0.0 5.8 10.0 1.4 70.0 \n", "1091 2020-12-28 1.3 6.7 11.4 4.2 66.0 \n", "1092 2020-12-29 0.2 0.1 4.3 -6.2 69.0 \n", "1093 2020-12-30 NaN -10.9 -6.2 -12.9 39.0 \n", "1094 2020-12-31 0.0 -8.9 -5.0 -12.9 28.0 \n", "\n", " PM2.5 humidity sunshine_sum sunshine_rate wind_mean wind_max \\\n", "0 17.0 39.1 8.3 86.5 1.4 3.8 \n", "1 22.0 42.0 7.9 82.3 1.8 4.9 \n", "2 19.0 42.3 8.6 88.7 2.2 3.5 \n", "3 24.0 43.0 6.2 63.9 1.4 3.5 \n", "4 35.0 48.4 8.2 84.5 1.7 3.6 \n", "... ... ... ... ... ... ... \n", "1090 42.0 62.9 5.9 61.5 1.8 2.8 \n", "1091 44.0 72.1 8.0 83.3 1.4 3.1 \n", "1092 46.0 70.8 0.0 0.0 2.9 6.1 \n", "1093 15.0 55.5 8.3 86.5 4.1 6.2 \n", "1094 12.0 53.9 6.0 62.5 2.4 4.2 \n", "\n", " rental \n", "0 4950 \n", "1 7136 \n", "2 7156 \n", "3 7102 \n", "4 7705 \n", "... ... \n", "1090 37103 \n", "1091 46912 \n", "1092 35747 \n", "1093 22488 \n", "1094 24535 \n", "\n", "[1095 rows x 13 columns]" ] }, "execution_count": 104, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "\n", "train = pd.read_csv('./input/dankook/train.csv')\n", "test = pd.read_csv('./input/dankook/test.csv')\n", "train" ] }, { "cell_type": "code", "execution_count": null, "id": "e7630f34", "metadata": {}, "outputs": [], "source": [ "test.info()" ] }, { "cell_type": "code", "execution_count": 105, "id": "ea356a45", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 1095 entries, 0 to 1094\n", "Data columns (total 13 columns):\n", "date 1095 non-null object\n", "precipitation 417 non-null float64\n", "temp_mean 1095 non-null float64\n", "temp_highest 1095 non-null float64\n", "temp_lowest 1095 non-null float64\n", "PM10 1028 non-null float64\n", "PM2.5 1027 non-null float64\n", "humidity 1095 non-null float64\n", "sunshine_sum 1090 non-null float64\n", "sunshine_rate 1095 non-null float64\n", "wind_mean 1095 non-null float64\n", "wind_max 1095 non-null float64\n", "rental 1095 non-null int64\n", "dtypes: float64(11), int64(1), object(1)\n", "memory usage: 111.3+ KB\n" ] } ], "source": [ "train.info()" ] }, { "cell_type": "code", "execution_count": 106, "id": "74a88b13", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "date 0\n", "precipitation 678\n", "temp_mean 0\n", "temp_highest 0\n", "temp_lowest 0\n", "PM10 67\n", "PM2.5 68\n", "humidity 0\n", "sunshine_sum 5\n", "sunshine_rate 0\n", "wind_mean 0\n", "wind_max 0\n", "rental 0\n", "dtype: int64" ] }, "execution_count": 106, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train.isnull().sum()" ] }, { "cell_type": "markdown", "id": "81c27fc6", "metadata": {}, "source": [ "### 결측치 처리\n" ] }, { "cell_type": "code", "execution_count": 107, "id": "70557936", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
dateprecipitationtemp_meantemp_highesttemp_lowestPM10PM2.5humiditysunshine_sumsunshine_ratewind_meanwind_maxrental
02018-01-010.0-1.33.8-5.134.017.039.18.386.51.43.84950
12018-01-020.0-1.81.8-4.336.022.042.07.982.31.84.97136
22018-01-030.0-4.7-0.4-7.131.019.042.38.688.72.23.57156
32018-01-040.0-4.7-0.7-8.739.024.043.06.263.91.43.57102
42018-01-050.0-3.01.6-5.651.035.048.48.284.51.73.67705
\n", "
" ], "text/plain": [ " date precipitation temp_mean temp_highest temp_lowest PM10 \\\n", "0 2018-01-01 0.0 -1.3 3.8 -5.1 34.0 \n", "1 2018-01-02 0.0 -1.8 1.8 -4.3 36.0 \n", "2 2018-01-03 0.0 -4.7 -0.4 -7.1 31.0 \n", "3 2018-01-04 0.0 -4.7 -0.7 -8.7 39.0 \n", "4 2018-01-05 0.0 -3.0 1.6 -5.6 51.0 \n", "\n", " PM2.5 humidity sunshine_sum sunshine_rate wind_mean wind_max rental \n", "0 17.0 39.1 8.3 86.5 1.4 3.8 4950 \n", "1 22.0 42.0 7.9 82.3 1.8 4.9 7136 \n", "2 19.0 42.3 8.6 88.7 2.2 3.5 7156 \n", "3 24.0 43.0 6.2 63.9 1.4 3.5 7102 \n", "4 35.0 48.4 8.2 84.5 1.7 3.6 7705 " ] }, "execution_count": 107, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 결측치 처리\n", "def null(df): \n", " df['precipitation'] = df['precipitation'].fillna(0)\n", " df['PM10'] = df['PM10'].fillna(df['PM10'].mean())\n", " df['PM2.5'] = df['PM2.5'].fillna(df['PM2.5'].mean())\n", " df['sunshine_sum'] = df['sunshine_sum'].fillna(df['sunshine_sum'].mean())\n", " return df\n", "\n", "train = null(train)\n", "test = null(test)\n", "train.head()" ] }, { "cell_type": "markdown", "id": "412c16cf", "metadata": {}, "source": [ "### 날짜 변환" ] }, { "cell_type": "code", "execution_count": 108, "id": "05f9cdbc", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
precipitationtemp_meantemp_highesttemp_lowestPM10PM2.5humiditysunshine_sumsunshine_ratewind_meanwind_maxyearmonthday
00.0-4.21.6-9.830.017.064.06.567.72.04.1202111
10.0-5.0-1.4-8.434.012.038.59.093.82.65.4202112
20.0-5.6-2.0-9.139.014.045.05.556.72.04.5202113
30.0-3.50.3-8.440.023.051.44.647.41.73.2202114
40.0-5.5-2.1-9.930.017.052.88.688.72.95.7202115
.............................................
3600.0-7.6-3.9-12.933.020.060.93.839.61.73.120211227
3610.0-4.1-0.9-8.551.038.073.81.717.72.23.120211228
3620.20.45.9-3.866.049.072.91.818.82.65.920211229
3630.0-3.90.2-6.830.017.048.57.376.03.36.620211230
3640.0-6.7-3.9-8.823.07.035.99.093.83.55.420211231
\n", "

365 rows × 14 columns

\n", "
" ], "text/plain": [ " precipitation temp_mean temp_highest temp_lowest PM10 PM2.5 \\\n", "0 0.0 -4.2 1.6 -9.8 30.0 17.0 \n", "1 0.0 -5.0 -1.4 -8.4 34.0 12.0 \n", "2 0.0 -5.6 -2.0 -9.1 39.0 14.0 \n", "3 0.0 -3.5 0.3 -8.4 40.0 23.0 \n", "4 0.0 -5.5 -2.1 -9.9 30.0 17.0 \n", ".. ... ... ... ... ... ... \n", "360 0.0 -7.6 -3.9 -12.9 33.0 20.0 \n", "361 0.0 -4.1 -0.9 -8.5 51.0 38.0 \n", "362 0.2 0.4 5.9 -3.8 66.0 49.0 \n", "363 0.0 -3.9 0.2 -6.8 30.0 17.0 \n", "364 0.0 -6.7 -3.9 -8.8 23.0 7.0 \n", "\n", " humidity sunshine_sum sunshine_rate wind_mean wind_max year month \\\n", "0 64.0 6.5 67.7 2.0 4.1 2021 1 \n", "1 38.5 9.0 93.8 2.6 5.4 2021 1 \n", "2 45.0 5.5 56.7 2.0 4.5 2021 1 \n", "3 51.4 4.6 47.4 1.7 3.2 2021 1 \n", "4 52.8 8.6 88.7 2.9 5.7 2021 1 \n", ".. ... ... ... ... ... ... ... \n", "360 60.9 3.8 39.6 1.7 3.1 2021 12 \n", "361 73.8 1.7 17.7 2.2 3.1 2021 12 \n", "362 72.9 1.8 18.8 2.6 5.9 2021 12 \n", "363 48.5 7.3 76.0 3.3 6.6 2021 12 \n", "364 35.9 9.0 93.8 3.5 5.4 2021 12 \n", "\n", " day \n", "0 1 \n", "1 2 \n", "2 3 \n", "3 4 \n", "4 5 \n", ".. ... \n", "360 27 \n", "361 28 \n", "362 29 \n", "363 30 \n", "364 31 \n", "\n", "[365 rows x 14 columns]" ] }, "execution_count": 108, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train['date'] = pd.to_datetime(train['date'])\n", "test['date'] = pd.to_datetime(test['date'])\n", "train['year'] = train['date'].dt.year\n", "test['year'] = test['date'].dt.year\n", "train['month'] = train['date'].dt.month\n", "test['month'] = test['date'].dt.month\n", "train['day'] = train['date'].dt.day\n", "test['day'] = test['date'].dt.day\n", "train = train.drop(columns='date')\n", "test = test.drop(columns='date')\n", "test" ] }, { "cell_type": "markdown", "id": "85dfddfd", "metadata": {}, "source": [ "### val 시도" ] }, { "cell_type": "markdown", "id": "6ad40639", "metadata": { "heading_collapsed": true }, "source": [ "#### rf" ] }, { "cell_type": "code", "execution_count": 131, "id": "fe10731d", "metadata": { "hidden": true }, "outputs": [], "source": [ "from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler \n", "from sklearn.model_selection import train_test_split, GridSearchCV \n", "from sklearn.linear_model import LinearRegression\n", "from sklearn.ensemble import RandomForestRegressor \n", "\n", "X = train.drop(['rental'],axis=1)\n", "y = train[\"rental\"]\n", "X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, shuffle=True)" ] }, { "cell_type": "code", "execution_count": 132, "id": "ee334c47", "metadata": { "hidden": true }, "outputs": [], "source": [ "from sklearn.ensemble import RandomForestRegressor\n", "model = RandomForestRegressor(max_features=8, min_samples_leaf=1, n_estimators=590)\n", "model.fit(X_train, y_train)\n", "pred = model.predict(X_val)" ] }, { "cell_type": "code", "execution_count": 123, "id": "0860335e", "metadata": { "hidden": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Fitting 5 folds for each of 49 candidates, totalling 245 fits\n" ] }, { "ename": "ValueError", "evalue": "Invalid parameter C for estimator RandomForestRegressor(). Check the list of available parameters with `estimator.get_params().keys()`.", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", "\u001b[1;32m~\\AppData\\Local\\Temp\\ipykernel_14784\\1767163570.py\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 8\u001b[0m \u001b[0mmodel\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mRandomForestRegressor\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 9\u001b[0m \u001b[0mgrid_search\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mGridSearchCV\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mparam\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcv\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m5\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mscoring\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'neg_mean_squared_error'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mverbose\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m2\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 10\u001b[1;33m \u001b[0mgrid_search\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX_train\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 11\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 12\u001b[0m \u001b[0mgrid_search\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mbest_params_\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\anaconda3\\envs\\dankook\\lib\\site-packages\\sklearn\\model_selection\\_search.py\u001b[0m in \u001b[0;36mfit\u001b[1;34m(self, X, y, groups, **fit_params)\u001b[0m\n\u001b[0;32m 889\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mresults\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 890\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 891\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_run_search\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mevaluate_candidates\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 892\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 893\u001b[0m \u001b[1;31m# multimetric is determined here because in the case of a callable\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\anaconda3\\envs\\dankook\\lib\\site-packages\\sklearn\\model_selection\\_search.py\u001b[0m in \u001b[0;36m_run_search\u001b[1;34m(self, evaluate_candidates)\u001b[0m\n\u001b[0;32m 1390\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_run_search\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mevaluate_candidates\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1391\u001b[0m \u001b[1;34m\"\"\"Search all candidates in param_grid\"\"\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1392\u001b[1;33m \u001b[0mevaluate_candidates\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mParameterGrid\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mparam_grid\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1393\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1394\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\anaconda3\\envs\\dankook\\lib\\site-packages\\sklearn\\model_selection\\_search.py\u001b[0m in \u001b[0;36mevaluate_candidates\u001b[1;34m(candidate_params, cv, more_results)\u001b[0m\n\u001b[0;32m 849\u001b[0m )\n\u001b[0;32m 850\u001b[0m for (cand_idx, parameters), (split_idx, (train, test)) in product(\n\u001b[1;32m--> 851\u001b[1;33m \u001b[0menumerate\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcandidate_params\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0menumerate\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcv\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mgroups\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 852\u001b[0m )\n\u001b[0;32m 853\u001b[0m )\n", "\u001b[1;32m~\\anaconda3\\envs\\dankook\\lib\\site-packages\\joblib\\parallel.py\u001b[0m in \u001b[0;36m__call__\u001b[1;34m(self, iterable)\u001b[0m\n\u001b[0;32m 1041\u001b[0m \u001b[1;31m# remaining jobs.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1042\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_iterating\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mFalse\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1043\u001b[1;33m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdispatch_one_batch\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0miterator\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1044\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_iterating\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_original_iterator\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1045\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\anaconda3\\envs\\dankook\\lib\\site-packages\\joblib\\parallel.py\u001b[0m in \u001b[0;36mdispatch_one_batch\u001b[1;34m(self, iterator)\u001b[0m\n\u001b[0;32m 859\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[1;32mFalse\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 860\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 861\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_dispatch\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtasks\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 862\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 863\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\anaconda3\\envs\\dankook\\lib\\site-packages\\joblib\\parallel.py\u001b[0m in \u001b[0;36m_dispatch\u001b[1;34m(self, batch)\u001b[0m\n\u001b[0;32m 777\u001b[0m \u001b[1;32mwith\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_lock\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 778\u001b[0m \u001b[0mjob_idx\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_jobs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 779\u001b[1;33m \u001b[0mjob\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_backend\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mapply_async\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mbatch\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcallback\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mcb\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 780\u001b[0m \u001b[1;31m# A job can complete so quickly than its callback is\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 781\u001b[0m \u001b[1;31m# called before we get here, causing self._jobs to\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\anaconda3\\envs\\dankook\\lib\\site-packages\\joblib\\_parallel_backends.py\u001b[0m in \u001b[0;36mapply_async\u001b[1;34m(self, func, callback)\u001b[0m\n\u001b[0;32m 206\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mapply_async\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcallback\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 207\u001b[0m \u001b[1;34m\"\"\"Schedule a func to be run\"\"\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 208\u001b[1;33m \u001b[0mresult\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mImmediateResult\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfunc\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 209\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mcallback\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 210\u001b[0m \u001b[0mcallback\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\anaconda3\\envs\\dankook\\lib\\site-packages\\joblib\\_parallel_backends.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, batch)\u001b[0m\n\u001b[0;32m 570\u001b[0m \u001b[1;31m# Don't delay the application, to avoid keeping the input\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 571\u001b[0m \u001b[1;31m# arguments in memory\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 572\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mresults\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mbatch\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 573\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 574\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\anaconda3\\envs\\dankook\\lib\\site-packages\\joblib\\parallel.py\u001b[0m in \u001b[0;36m__call__\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 261\u001b[0m \u001b[1;32mwith\u001b[0m \u001b[0mparallel_backend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_backend\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mn_jobs\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_n_jobs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 262\u001b[0m return [func(*args, **kwargs)\n\u001b[1;32m--> 263\u001b[1;33m for func, args, kwargs in self.items]\n\u001b[0m\u001b[0;32m 264\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 265\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m__reduce__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\anaconda3\\envs\\dankook\\lib\\site-packages\\joblib\\parallel.py\u001b[0m in \u001b[0;36m\u001b[1;34m(.0)\u001b[0m\n\u001b[0;32m 261\u001b[0m \u001b[1;32mwith\u001b[0m \u001b[0mparallel_backend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_backend\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mn_jobs\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_n_jobs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 262\u001b[0m return [func(*args, **kwargs)\n\u001b[1;32m--> 263\u001b[1;33m for func, args, kwargs in self.items]\n\u001b[0m\u001b[0;32m 264\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 265\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m__reduce__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\anaconda3\\envs\\dankook\\lib\\site-packages\\sklearn\\utils\\fixes.py\u001b[0m in \u001b[0;36m__call__\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 214\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m__call__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 215\u001b[0m \u001b[1;32mwith\u001b[0m \u001b[0mconfig_context\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m**\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mconfig\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 216\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfunction\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 217\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 218\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\anaconda3\\envs\\dankook\\lib\\site-packages\\sklearn\\model_selection\\_validation.py\u001b[0m in \u001b[0;36m_fit_and_score\u001b[1;34m(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, split_progress, candidate_progress, error_score)\u001b[0m\n\u001b[0;32m 666\u001b[0m \u001b[0mcloned_parameters\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mk\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mclone\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mv\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msafe\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mFalse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 667\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 668\u001b[1;33m \u001b[0mestimator\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mestimator\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mset_params\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m**\u001b[0m\u001b[0mcloned_parameters\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 669\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 670\u001b[0m \u001b[0mstart_time\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\anaconda3\\envs\\dankook\\lib\\site-packages\\sklearn\\base.py\u001b[0m in \u001b[0;36mset_params\u001b[1;34m(self, **params)\u001b[0m\n\u001b[0;32m 246\u001b[0m \u001b[1;34m\"Invalid parameter %s for estimator %s. \"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 247\u001b[0m \u001b[1;34m\"Check the list of available parameters \"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 248\u001b[1;33m \u001b[1;34m\"with `estimator.get_params().keys()`.\"\u001b[0m \u001b[1;33m%\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 249\u001b[0m )\n\u001b[0;32m 250\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;31mValueError\u001b[0m: Invalid parameter C for estimator RandomForestRegressor(). Check the list of available parameters with `estimator.get_params().keys()`." ] } ], "source": [ "# 11-2. 예시1\n", "from sklearn.model_selection import GridSearchCV\n", "\n", "param = [{'kernel' : ['linear'],\n", "{'kernel' : ['rbf'], \n", "'gamma' : [0.01, 0.03, 0.1, 0.3, 1.0, 3.0]}]\n", "\n", "model = RandomForestRegressor()\n", "grid_search = GridSearchCV(model, param, cv=5, scoring='neg_mean_squared_error', verbose=2)\n", "grid_search.fit(X_train, y_train)\n", "\n", "grid_search.best_params_\n", "\n", "neg = grid_search.best_score_ # Best score: -0.737\n", "rmse = np.sqrt(-neg)\n", "\n", "for param_name in sorted(param.keys()):\n", " print(\"\\t%s: %r\" % (param_name, model.best_estimator_.get_params()[param_name]))" ] }, { "cell_type": "code", "execution_count": 130, "id": "55f5bb8c", "metadata": { "hidden": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "최적 하이퍼파라미터: {'max_features': 8, 'min_samples_leaf': 1, 'n_estimators': 590}\n", "최적 하이퍼파라미터의 성능(RMSE): 0.9561\n", "Wall time: 45.2 s\n" ] } ], "source": [ "%%time\n", "import scipy.stats as stats\n", "from sklearn.model_selection import RandomizedSearchCV\n", "\n", "rf = RandomForestRegressor(random_state=1217)\n", "params = {'n_estimators': stats.randint(200, 1000),\n", " 'max_features': stats.randint(4, 12),\n", " 'min_samples_leaf': stats.randint(1, 5)}\n", "\n", "# RandomizedSearchCV를 이용해 최적의 하이퍼파라미터 탐색\n", "rand_cv = RandomizedSearchCV(rf, param_distributions = params, n_iter=10, \n", " cv = 3, random_state = 1217, n_jobs=-1)\n", "rand_cv.fit(X_train, y_train)\n", "\n", "# 최적의 하이퍼파라미터 값과 성능 출력\n", "print('최적 하이퍼파라미터: ', rand_cv.best_params_)\n", "print('최적 하이퍼파라미터의 성능(RMSE): {:.4f}'.format(np.sqrt(rand_cv.best_score_)))" ] }, { "cell_type": "code", "execution_count": 133, "id": "0a3c484e", "metadata": { "hidden": true }, "outputs": [], "source": [ "import numpy as np\n", "\n", "def NMAE(true, pred):\n", " score = np.mean(np.abs(true-pred) / true)\n", " return score" ] }, { "cell_type": "code", "execution_count": 134, "id": "02e05177", "metadata": { "hidden": true }, "outputs": [ { "data": { "text/plain": [ "0.21381712106313697" ] }, "execution_count": 134, "metadata": {}, "output_type": "execute_result" } ], "source": [ "NMAE(y_val, pred)" ] }, { "cell_type": "markdown", "id": "cf2caebe", "metadata": {}, "source": [ "### xgb" ] }, { "cell_type": "markdown", "id": "6dbb4094", "metadata": {}, "source": [ "#### 기본 xgb" ] }, { "cell_type": "code", "execution_count": 147, "id": "45945ae9", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Wall time: 166 ms\n" ] }, { "data": { "text/plain": [ "0.1929053595279806" ] }, "execution_count": 147, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "from xgboost import XGBRegressor\n", "\n", "xgb_model = XGBRegressor(random_state=1217)\n", "xgb_model.fit(X_train, y_train)\n", "pred1 = xgb_model.predict(X_val)\n", "NMAE(y_val, pred1)" ] }, { "cell_type": "markdown", "id": "babc5094", "metadata": {}, "source": [ "#### xgb + 랜덤서치" ] }, { "cell_type": "code", "execution_count": 145, "id": "a4a78ab2", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\HOME\\anaconda3\\envs\\dankook\\lib\\site-packages\\joblib\\externals\\loky\\process_executor.py:705: UserWarning: A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.\n", " \"timeout or by a memory leak.\", UserWarning\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "최적 하이퍼파라미터: {'subsample': 0.9, 'n_estimators': 1000, 'max_depth': 6, 'learning_rate': 0.01, 'colsample_bytree': 0.8}\n", "최적 하이퍼파라미터의 성능(RMSE): 0.9671\n", "Wall time: 17 s\n" ] } ], "source": [ "%%time\n", "import scipy.stats as stats\n", "from sklearn.model_selection import RandomizedSearchCV\n", "\n", "xgb_model = XGBRegressor(random_state=1217)\n", "params = {\n", " 'n_estimators': [200, 500, 1000, 2000], \n", " 'learning_rate': [0.1, 0.05, 0.01], \n", " 'max_depth': [6, 7, 8], \n", " 'colsample_bytree': [0.8, 0.9, 1.0], \n", " 'subsample': [0.8, 0.9, 1.0],\n", "}\n", "\n", "# RandomizedSearchCV를 이용해 최적의 하이퍼파라미터 탐색\n", "rand_cv = RandomizedSearchCV(xgb_model, param_distributions = params, n_iter=10, cv = 3, random_state = 1217, n_jobs=-1)\n", "rand_cv.fit(X_train, y_train)\n", "\n", "# 최적의 하이퍼파라미터 값과 성능 출력\n", "print('최적 하이퍼파라미터: ', rand_cv.best_params_)\n", "print('최적 하이퍼파라미터의 성능(RMSE): {:.4f}'.format(np.sqrt(rand_cv.best_score_)))" ] }, { "cell_type": "code", "execution_count": 148, "id": "929508d3", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Wall time: 1.18 s\n" ] }, { "data": { "text/plain": [ "0.19554718042361574" ] }, "execution_count": 148, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "import lightgbm as lgb\n", "\n", "xgb = XGBRegressor(subsample=0.9, n_estimators=1000, max_depth=6, learning_rate=0.01, colsample_bytree=0.8,random_state=1217)\n", "xgb.fit(X_train, y_train)\n", "pred2 = xgb.predict(X_val)\n", "NMAE(y_val, pred2)" ] }, { "cell_type": "code", "execution_count": 149, "id": "9910195f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.19190113819849675" ] }, "execution_count": 149, "metadata": {}, "output_type": "execute_result" } ], "source": [ "NMAE(y_val, (pred1+pred2)/2)" ] }, { "cell_type": "code", "execution_count": 156, "id": "04b97254", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.18276694939078872" ] }, "execution_count": 156, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# https://dacon.io/competitions/official/235736/codeshare/2877?page=1&dtype=recent\n", "xgb_params = pd.read_csv('./input/dankook/hyperparameter_xgb.csv')\n", "xgb_reg = XGBRegressor(n_estimators = 10000, eta = xgb_params.iloc[47,1], min_child_weight = xgb_params.iloc[47,2], \n", " max_depth = xgb_params.iloc[47,3], colsample_bytree = xgb_params.iloc[47,4], \n", " subsample = xgb_params.iloc[47,5], seed=0)\n", "\n", "xgb_reg.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)],\n", " early_stopping_rounds=300,\n", " verbose=False)\n", "\n", "pred5 = xgb_reg.predict(X_val)\n", "NMAE(y_val, pred5)" ] }, { "cell_type": "code", "execution_count": 158, "id": "cc8f207d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.18735950479526048" ] }, "execution_count": 158, "metadata": {}, "output_type": "execute_result" } ], "source": [ "NMAE(y_val, (pred1+pred2+pred3+pred4+pred5)/5)" ] }, { "cell_type": "markdown", "id": "0e0684df", "metadata": { "heading_collapsed": true }, "source": [ "### lgbm" ] }, { "cell_type": "markdown", "id": "3fd699b0", "metadata": { "hidden": true }, "source": [ "#### 기본 lgbm" ] }, { "cell_type": "code", "execution_count": 150, "id": "f5d7ea75", "metadata": { "hidden": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Wall time: 85.8 ms\n" ] }, { "data": { "text/plain": [ "0.19887754523448536" ] }, "execution_count": 150, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "import lightgbm as lgb\n", "\n", "lgbm = lgb.LGBMRegressor(random_state=1217)\n", "lgbm.fit(X_train, y_train)\n", "pred3 = lgbm.predict(X_val)\n", "NMAE(y_val, pred3)" ] }, { "cell_type": "markdown", "id": "783b8085", "metadata": { "hidden": true }, "source": [ "#### lgbm + 랜덤서치" ] }, { "cell_type": "code", "execution_count": 142, "id": "e2945ab5", "metadata": { "hidden": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "최적 하이퍼파라미터: {'subsample': 0.9, 'n_estimators': 1000, 'max_depth': 6, 'learning_rate': 0.01, 'colsample_bytree': 0.8}\n", "최적 하이퍼파라미터의 성능(RMSE): 0.9653\n", "Wall time: 6.76 s\n" ] } ], "source": [ "%%time\n", "import scipy.stats as stats\n", "from sklearn.model_selection import RandomizedSearchCV\n", "\n", "lgbm = lgb.LGBMRegressor(random_state=1217)\n", "params = {\n", " 'n_estimators': [200, 500, 1000, 2000], \n", " 'learning_rate': [0.1, 0.05, 0.01], \n", " 'max_depth': [6, 7, 8], \n", " 'colsample_bytree': [0.8, 0.9, 1.0], \n", " 'subsample': [0.8, 0.9, 1.0],\n", "}\n", "\n", "# RandomizedSearchCV를 이용해 최적의 하이퍼파라미터 탐색\n", "rand_cv = RandomizedSearchCV(lgbm, param_distributions = params, n_iter=10, cv = 3, random_state = 1217, n_jobs=-1)\n", "rand_cv.fit(X_train, y_train)\n", "\n", "# 최적의 하이퍼파라미터 값과 성능 출력\n", "print('최적 하이퍼파라미터: ', rand_cv.best_params_)\n", "print('최적 하이퍼파라미터의 성능(RMSE): {:.4f}'.format(np.sqrt(rand_cv.best_score_)))" ] }, { "cell_type": "code", "execution_count": 151, "id": "2dfc2300", "metadata": { "hidden": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Wall time: 466 ms\n" ] }, { "data": { "text/plain": [ "0.19640525966787967" ] }, "execution_count": 151, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "import lightgbm as lgb\n", "\n", "lgbm = lgb.LGBMRegressor(subsample=0.9, n_estimators=1000, max_depth=6, learning_rate=0.01, colsample_bytree=0.8,random_state=1217)\n", "lgbm.fit(X_train, y_train)\n", "pred4 = lgbm.predict(X_val)\n", "NMAE(y_val, pred4)" ] }, { "cell_type": "code", "execution_count": 152, "id": "b205587b", "metadata": { "hidden": true }, "outputs": [ { "data": { "text/plain": [ "0.18975235731772788" ] }, "execution_count": 152, "metadata": {}, "output_type": "execute_result" } ], "source": [ "NMAE(y_val, (pred1+pred2+pred3+pred4)/4)" ] }, { "cell_type": "markdown", "id": "c17c85fe", "metadata": { "heading_collapsed": true }, "source": [ "### 모델링" ] }, { "cell_type": "code", "execution_count": 109, "id": "a1a4550c", "metadata": { "hidden": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
precipitationtemp_meantemp_highesttemp_lowestPM10PM2.5humiditysunshine_sumsunshine_ratewind_meanwind_maxyearmonthday
00.0-1.33.8-5.134.017.039.18.386.51.43.8201811
10.0-1.81.8-4.336.022.042.07.982.31.84.9201812
20.0-4.7-0.4-7.131.019.042.38.688.72.23.5201813
30.0-4.7-0.7-8.739.024.043.06.263.91.43.5201814
40.0-3.01.6-5.651.035.048.48.284.51.73.6201815
.............................................
10900.05.810.01.470.042.062.95.961.51.82.820201227
10911.36.711.44.266.044.072.18.083.31.43.120201228
10920.20.14.3-6.269.046.070.80.00.02.96.120201229
10930.0-10.9-6.2-12.939.015.055.58.386.54.16.220201230
10940.0-8.9-5.0-12.928.012.053.96.062.52.44.220201231
\n", "

1095 rows × 14 columns

\n", "
" ], "text/plain": [ " precipitation temp_mean temp_highest temp_lowest PM10 PM2.5 \\\n", "0 0.0 -1.3 3.8 -5.1 34.0 17.0 \n", "1 0.0 -1.8 1.8 -4.3 36.0 22.0 \n", "2 0.0 -4.7 -0.4 -7.1 31.0 19.0 \n", "3 0.0 -4.7 -0.7 -8.7 39.0 24.0 \n", "4 0.0 -3.0 1.6 -5.6 51.0 35.0 \n", "... ... ... ... ... ... ... \n", "1090 0.0 5.8 10.0 1.4 70.0 42.0 \n", "1091 1.3 6.7 11.4 4.2 66.0 44.0 \n", "1092 0.2 0.1 4.3 -6.2 69.0 46.0 \n", "1093 0.0 -10.9 -6.2 -12.9 39.0 15.0 \n", "1094 0.0 -8.9 -5.0 -12.9 28.0 12.0 \n", "\n", " humidity sunshine_sum sunshine_rate wind_mean wind_max year month \\\n", "0 39.1 8.3 86.5 1.4 3.8 2018 1 \n", "1 42.0 7.9 82.3 1.8 4.9 2018 1 \n", "2 42.3 8.6 88.7 2.2 3.5 2018 1 \n", "3 43.0 6.2 63.9 1.4 3.5 2018 1 \n", "4 48.4 8.2 84.5 1.7 3.6 2018 1 \n", "... ... ... ... ... ... ... ... \n", "1090 62.9 5.9 61.5 1.8 2.8 2020 12 \n", "1091 72.1 8.0 83.3 1.4 3.1 2020 12 \n", "1092 70.8 0.0 0.0 2.9 6.1 2020 12 \n", "1093 55.5 8.3 86.5 4.1 6.2 2020 12 \n", "1094 53.9 6.0 62.5 2.4 4.2 2020 12 \n", "\n", " day \n", "0 1 \n", "1 2 \n", "2 3 \n", "3 4 \n", "4 5 \n", "... ... \n", "1090 27 \n", "1091 28 \n", "1092 29 \n", "1093 30 \n", "1094 31 \n", "\n", "[1095 rows x 14 columns]" ] }, "execution_count": 109, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train = train.drop(columns='rental')\n", "X_train" ] }, { "cell_type": "code", "execution_count": 110, "id": "ec6b8cfc", "metadata": { "hidden": true, "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "array([ 23995.04141794, 25114.30285569, 25044.12427381, 27222.419789 ,\n", " 24516.07706993, 20830.85491642, 23514.62439376, 23742.38631221,\n", " 25015.91206993, 27688.58285177, 24385.59253468, 17293.66857662,\n", " 25520.53267066, 25295.23865876, 24571.40777911, 24408.9975808 ,\n", " 24330.53367531, 18551.36556406, 25473.67642943, 26599.33872407,\n", " 15461.76027814, 18199.378 , 30460.54666667, 39573.1818839 ,\n", " 40097.43560612, 15625.87903788, 24469.58893131, 17143.67764481,\n", " 22598.88946275, 20060.01806907, 27537.0308064 , 22472.5970724 ,\n", " 24400.67061969, 18644.46270228, 20914.3338991 , 22194.11052598,\n", " 28747.77035583, 26201.89200722, 24535.38816334, 23207.87597965,\n", " 23315.51049661, 34755.21192245, 38936.6106016 , 45059.99618277,\n", " 37932.89877778, 18013.62198752, 17820.85031247, 23442.2353181 ,\n", " 24312.33006275, 26428.06074606, 44372.33488909, 58103.8858692 ,\n", " 30156.41338866, 24954.13177276, 32451.55878474, 26804.40256593,\n", " 54977.83255556, 62574.42018788, 35516.50905301, 31464.938 ,\n", " 46203.15503175, 41141.95337932, 45360.33906882, 63516.54641367,\n", " 41923.52961821, 50756.34603866, 49103.4562782 , 44173.73079211,\n", " 52838.53243277, 52466.22517265, 46460.08500482, 50553.75860869,\n", " 49774.70260976, 44763.95710101, 51880.96027236, 60929.44727347,\n", " 60915.5473046 , 79219.51836022, 38623.98576648, 35387.94070118,\n", " 46896.6765696 , 62300.72223882, 62244.87875715, 71112.52954317,\n", " 57255.40350015, 27909.58958333, 30813.33333333, 48431.48466317,\n", " 67817.00122678, 77033.06603362, 88555.04478011, 84150.87570126,\n", " 28686.16902381, 66937.93154123, 76382.91928156, 86796.41464315,\n", " 85401.72377367, 78569.22767604, 85692.87268051, 79928.31946175,\n", " 84021.26011138, 28123.30691017, 50506.34489177, 65596.26156233,\n", " 75082.01896996, 56442.47665454, 58175.52503673, 72427.57255021,\n", " 79121.54034654, 83477.54677506, 84928.37847806, 82220.147776 ,\n", " 84469.46634693, 85356.98071915, 90478.51189965, 89675.00103018,\n", " 74743.20366264, 56509.58614954, 61119.86197078, 43324.89871068,\n", " 42673.88803922, 82260.82175564, 91401.68047238, 31369.11490552,\n", " 88083.12368976, 91976.85523871, 57347.36576623, 90210.49571021,\n", " 92520.04224477, 73861.80957728, 96083.68161079, 98794.50415417,\n", " 99884.70451602, 99805.34329914, 44263.67429679, 14551.74969251,\n", " 44295.99674816, 95524.91444335, 96654.94145436, 78218.78865259,\n", " 50471.80146786, 95922.24510959, 96851.65152669, 97435.65129437,\n", " 66481.78745077, 92978.74187309, 48380.12813492, 53922.7664596 ,\n", " 84531.68869804, 67446.38452725, 66745.98894248, 70912.22370335,\n", " 102880.08481666, 41933.60085849, 98743.71539247, 101331.85176144,\n", " 103228.89146393, 100780.95106263, 102646.22555382, 102135.54008719,\n", " 59114.30962701, 68300.79596273, 101828.88463665, 100032.40118458,\n", " 102907.83158329, 78555.3320138 , 103820.45362018, 101503.18450466,\n", " 40377.53781349, 102163.16281927, 100496.09217989, 103332.35194636,\n", " 103225.88727937, 62223.60126551, 102653.31490098, 102891.91168286,\n", " 67601.30351455, 65578.59867496, 96089.98295017, 66817.58229798,\n", " 71288.9536063 , 101188.61458895, 93777.35117785, 57048.40350496,\n", " 17207.02457347, 97160.87497121, 100448.01008316, 50698.91850061,\n", " 71251.23685756, 88279.51864172, 84324.07186338, 72358.55017871,\n", " 88735.61340717, 85218.57218411, 83849.47625647, 86759.0864303 ,\n", " 89920.70084861, 89617.17055259, 87659.22738717, 54161.43130087,\n", " 89217.82008702, 84828.10767504, 82220.70826566, 81863.73922484,\n", " 81790.16922484, 81778.13992652, 81184.78325986, 81085.84728367,\n", " 81457.4983429 , 85871.76447138, 83762.88089726, 62936.77721998,\n", " 36807.24605254, 72981.6600285 , 82950.02212236, 86024.07448147,\n", " 85581.7142695 , 85229.83597485, 94253.12417123, 51224.6948961 ,\n", " 96440.29332208, 91965.96949692, 91956.8198552 , 92444.54909586,\n", " 95475.04903252, 94794.07109471, 100882.60297193, 97732.19978376,\n", " 58735.43874278, 73045.38374308, 100881.63342041, 97286.46053473,\n", " 47527.83726054, 89963.74850945, 44114.17503144, 21627.64454545,\n", " 80905.10660213, 73025.13942645, 73181.92006422, 94231.28548648,\n", " 77036.01685713, 95139.37504511, 21159.64983754, 74151.98341978,\n", " 107068.7501419 , 109069.04050355, 110363.5845785 , 110405.71638032,\n", " 60832.27680133, 13701.20004966, 81332.35480641, 102273.74847883,\n", " 98848.9948061 , 102728.39992302, 109362.72830349, 108894.16671347,\n", " 110337.95485211, 110926.63101576, 111539.98068047, 111274.45241102,\n", " 110620.91886259, 109077.72903981, 111307.30270665, 64360.40792038,\n", " 73541.82416872, 111196.82056228, 111041.05675038, 98718.61633967,\n", " 111348.91512491, 110126.70249865, 77367.10157582, 21094.33714706,\n", " 96738.41931311, 65771.59906061, 100010.41274859, 107679.53638952,\n", " 62264.08256067, 87653.55709401, 49358.05116621, 50371.3181191 ,\n", " 52114.80848268, 78125.87690675, 35967.94680704, 82386.31379393,\n", " 97566.24547912, 109442.76039862, 102678.71450175, 93774.1351478 ,\n", " 86898.53354639, 66084.21735065, 58515.34953379, 54767.13918499,\n", " 87133.98395957, 86093.75104653, 88692.37962222, 90226.21656889,\n", " 90683.48071481, 89893.69638262, 91021.81515461, 92747.01108662,\n", " 92554.20521734, 95891.3352229 , 93282.47680688, 78457.94603626,\n", " 83597.96066271, 83909.70990683, 87709.01576228, 86033.90797393,\n", " 88638.95648335, 91766.91900643, 90975.36267539, 38393.73189286,\n", " 47478.1037535 , 57532.81105264, 64277.98746098, 63531.97762044,\n", " 65117.88924303, 86634.08102404, 83115.08265655, 68128.97328445,\n", " 61091.86174511, 84127.84639097, 81503.8222258 , 78664.00450378,\n", " 40535.14278705, 45560.65343617, 38331.92911538, 53624.00268884,\n", " 57997.83526985, 53986.23589798, 54279.11831785, 51323.61173266,\n", " 59949.73359928, 34385.98776984, 34351.48122222, 47924.54770907,\n", " 51020.48016511, 52359.31667476, 51368.84608689, 48271.7137043 ,\n", " 56560.41629295, 60034.52250915, 50842.70500406, 42805.53278397,\n", " 48919.83220788, 41338.60141987, 27420.34261111, 46206.81099434,\n", " 43135.05691059, 46604.48397677, 28944.76981685, 26806.97152381,\n", " 37440.78409216, 43905.25999771, 45768.91072093, 45969.64616837,\n", " 44592.58817437, 30330.7114245 , 24448.10049206, 27434.07834921,\n", " 28268.06674242, 27186.2237381 , 39766.06321809, 24602.55590476,\n", " 27132.66559524])" ] }, "execution_count": 110, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.ensemble import RandomForestRegressor\n", "model = RandomForestRegressor(max_depth=10, n_estimators=100, random_state=0)\n", "model.fit(X_train, train['rental'])\n", "pred = model.predict(test)\n", "pred" ] }, { "cell_type": "code", "execution_count": 111, "id": "7bf44867", "metadata": { "hidden": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
daterental
02021-01-0123995.041418
12021-01-0225114.302856
22021-01-0325044.124274
32021-01-0427222.419789
42021-01-0524516.077070
.........
3602021-12-2728268.066742
3612021-12-2827186.223738
3622021-12-2939766.063218
3632021-12-3024602.555905
3642021-12-3127132.665595
\n", "

365 rows × 2 columns

\n", "
" ], "text/plain": [ " date rental\n", "0 2021-01-01 23995.041418\n", "1 2021-01-02 25114.302856\n", "2 2021-01-03 25044.124274\n", "3 2021-01-04 27222.419789\n", "4 2021-01-05 24516.077070\n", ".. ... ...\n", "360 2021-12-27 28268.066742\n", "361 2021-12-28 27186.223738\n", "362 2021-12-29 39766.063218\n", "363 2021-12-30 24602.555905\n", "364 2021-12-31 27132.665595\n", "\n", "[365 rows x 2 columns]" ] }, "execution_count": 111, "metadata": {}, "output_type": "execute_result" } ], "source": [ "submission = pd.read_csv('./input/dankook/sample_submission.csv')\n", "submission['rental'] = pred\n", "submission.to_csv('./output/dankook/dankook6.csv', index=False)\n", "submission" ] }, { "cell_type": "code", "execution_count": 112, "id": "54f8009c", "metadata": { "hidden": true }, "outputs": [], "source": [ "import numpy as np\n", "\n", "def NMAE(true, pred):\n", " score = np.mean(np.abs(true-pred) / true)\n", " return score" ] }, { "cell_type": "code", "execution_count": 113, "id": "df94ba2b", "metadata": { "hidden": true }, "outputs": [ { "ename": "NameError", "evalue": "name 'val_y' is not defined", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", "\u001b[1;32m~\\AppData\\Local\\Temp\\ipykernel_14784\\3435665192.py\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mNMAE\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mval_y\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mval_pred\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[1;31mNameError\u001b[0m: name 'val_y' is not defined" ] } ], "source": [ "NMAE(, pred)" ] }, { "cell_type": "code", "execution_count": null, "id": "68d0bd1f", "metadata": { "hidden": true }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "a3050632", "metadata": { "hidden": true }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "081adb8d", "metadata": { "hidden": true }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "3d9fb256", "metadata": { "hidden": true }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "a472873f", "metadata": { "hidden": true }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "c9885c0f", "metadata": { "hidden": true }, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "9231d114", "metadata": { "heading_collapsed": true }, "source": [ "### Time_Series로 변환" ] }, { "cell_type": "code", "execution_count": 54, "id": "5fb83590", "metadata": { "hidden": true }, "outputs": [], "source": [ "# 데이터프레임을 Series로 변환하는 함수\n", "def time_series(df):\n", " index1 = pd.date_range(min(df['date']), '2019-09-06')\n", " index2 = pd.date_range('2019-09-08', max(df['date']))\n", " ts1 = pd.Series(df['rental'][:len(index1)].values, index=index1)\n", " ts2 = pd.Series(df['rental'][len(index1):len(train)].values, index=index2)\n", " ts = pd.concat([ts1, ts2])\n", " return ts" ] }, { "cell_type": "code", "execution_count": 55, "id": "5260ee0c", "metadata": { "hidden": true }, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "#plt.figure(figsize=(15,6))\n", "time_series(train).plot()" ] }, { "cell_type": "markdown", "id": "36258fdb", "metadata": { "heading_collapsed": true }, "source": [ "### 정상성 확인(차분 필요여부)" ] }, { "cell_type": "code", "execution_count": 6, "id": "2f6e1951", "metadata": { "hidden": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "p_val : 0.218787 , should_diff : True\n" ] } ], "source": [ "# pmdarima 패키지에 있는 ADFTest 클래스를 임포트\n", "from pmdarima.arima import ADFTest\n", "\n", "# ADF-Test 시행\n", "p_val, should_diff = ADFTest().should_diff(time_series(train))\n", "print('p_val : %f , should_diff : %s' %(p_val, should_diff))" ] }, { "cell_type": "markdown", "id": "2608cc66", "metadata": { "heading_collapsed": true }, "source": [ "### 파이썬에서 R 시계열 패키지 forecast를 통한 모델링" ] }, { "cell_type": "code", "execution_count": 7, "id": "e944bafd", "metadata": { "hidden": true }, "outputs": [ { "data": { "text/plain": [ "rpy2.rinterface.NULL" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from rpy2.robjects.packages import importr # rpy2 내의 패키지를 불러올 importr 클래스\n", "\n", "utils = importr('utils') # utils 패키지를 임포트\n", "utils.install_packages('forecast') # r의 forecast 패키지 설치.\n", "utils.install_packages('forecastHybrid') # r의 forecastHybrid 패키지 설치\n" ] }, { "cell_type": "code", "execution_count": 18, "id": "4a22bb15", "metadata": { "hidden": true }, "outputs": [ { "data": { "text/plain": [ "rpy2.rinterface.NULL" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "utils.install_packages('Zoo')" ] }, { "cell_type": "code", "execution_count": 37, "id": "a34732cc", "metadata": { "hidden": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\HOME\\anaconda3\\envs\\dankook\\lib\\site-packages\\rpy2-2.9.4-py3.7-win-amd64.egg\\rpy2\\robjects\\pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.\n", " res = PandasDataFrame.from_items(items)\n" ] }, { "data": { "text/plain": [ "array([[ 39933.10411361, 13637.0960067 , 66229.11222053,\n", " -283.16982525, 80149.37805248],\n", " [ 42188.28994381, 14296.49793224, 70080.08195539,\n", " -468.52487092, 84845.10475855],\n", " [ 37130.67852744, 9094.15804744, 65167.19900745,\n", " -5747.47937897, 80008.83643385],\n", " ...,\n", " [ 47758.05378832, 8751.00021173, 86765.10736491,\n", " -11898.08663444, 107414.19421108],\n", " [ 34499.05378832, -4531.8832869 , 73529.99086354,\n", " -25193.61329353, 94191.72087017],\n", " [ 36546.05378832, -2508.75217986, 75600.8597565 ,\n", " -23183.11761518, 96275.22519182]])" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "import rpy2.robjects as robjects # r 함수를 파이썬에서 사용 가능하게 변환하는 모듈\n", "from rpy2.robjects import pandas2ri # 파이썬 자료형과 R 자료형의 호환을 도와주는 모듈\n", "\n", "# pandas2ri를 활성화 \n", "pandas2ri.activate()\n", "\n", "auto_arima = \"\"\"\n", " function(ts){\n", " library(forecast) # forecast 패키지 로드\n", " d_params = ndiffs(ts) # 시계열 자료의 차분 횟수 계산\n", " model = auto.arima(ts, max.p=2, d=d_params) # auto.arima 모델 생성\n", " forecasted_data = forecast(model, h=365) # 이후 3개월(h=3)을 예측\n", " out_df = data.frame(forecasted_data$mean) # 예측값을 R의 데이터프레임으로 변환\n", " colnames(out_df) = c('rental') # amount라는 열로 이름을 지정\n", " out_df\n", " }\n", "\"\"\"\n", "# r() 함수로 r 자료형을 파이썬에서 사용 가능\n", "auto_arima = robjects.r(auto_arima)\n", "ts = robjects.r('ts')# r 자료형 time series 자료형으로 만들어주는 함수\n", "c = robjects.r('c') # r 자료형 벡터를 만들어주는 함수\n", "#zoo = robjects.r('zoo')\n", "\n", "start_year = int(min(train['date'])[:4]) # 영업 시작 년도\n", "start_month = int(min(train['date'])[5:7]) # 영업 시작 월\n", " \n", "# R의 ts 함수로 r의 time series 자료형으로 변환\n", "train = ts(train['rental'], start=c(start_year, 0), frequency=365) \n", "\n", "#ensemble model\n", "forecast = auto_arima(train)\n", "a = (pandas2ri.ri2py(forecast).values) # 3개월 매출을 합산" ] }, { "cell_type": "code", "execution_count": 45, "id": "70785b19", "metadata": { "hidden": true, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Wall time: 1.41 ms\n" ] }, { "data": { "text/plain": [ "array([ 39933.10411361, 42188.28994381, 37130.67852744, 34717.09634585,\n", " 26958.11822573, 13556.55226067, 33209.39453015, 40442.43062743,\n", " 41164.45484698, 35048.31274316, 31672.70097892, 38641.22868614,\n", " 40311.42062058, 40285.73049888, 40936.55026345, 41550.21856802,\n", " 36571.02800686, 29628.23138259, 40566.04711697, 42580.66392681,\n", " 44980.24486377, 46440.93080476, 31209.84346766, 27217.08801696,\n", " 34672.75531494, 28826.92386678, 44091.66150108, 49986.02682188,\n", " 50077.07046317, 47856.8361728 , 39152.36174899, 34386.67984947,\n", " 41684.81869058, 35010.80265128, 32316.65279504, 34407.38732073,\n", " 38547.0219522 , 34053.5702749 , 33107.04402666, 45964.45334901,\n", " 52571.80700423, 22707.11256296, 46384.37656623, 53238.60466551,\n", " 41846.80174369, 18807.97201958, 25787.11913825, 31783.24624903,\n", " 44731.35607297, 48037.45096104, 41789.5329445 , 30300.60377836,\n", " 37169.66497893, 48104.71785633, 27949.7635425 , 47783.80301542,\n", " 50536.8371201 , 24654.8665866 , 47707.89204571, 36441.91404242,\n", " 48651.93304762, 51457.94946815, 43279.96365551, 47861.97591341,\n", " 52907.98650427, 54389.99565478, 66256.00356084, 65031.01039169,\n", " 22617.01629356, 57256.02139278, 62105.02579852, 55565.02960508,\n", " 54580.03289396, 45226.03573556, 54452.0381907 , 59252.04031195,\n", " 74130.04214471, 36267.04372822, 65850.04509638, 75770.04627847,\n", " 76447.04729979, 75697.04818222, 82009.04894464, 85471.04960337,\n", " 59644.05017252, 72194.05066426, 76688.05108913, 80441.05145621,\n", " 81304.05177338, 86632.0520474 , 82269.05228417, 84140.05248873,\n", " 86867.05266547, 78416.05281818, 76122.05295011, 80425.05306411,\n", " 85696.0531626 , 84874.0532477 , 85393.05332122, 87032.05338475,\n", " 87648.05343963, 75346.05348705, 89615.05352802, 95809.05356342,\n", " 101800.05359401, 95981.05362043, 47667.05364327, 104840.05366299,\n", " 41386.05368004, 68140.05369476, 70757.05370749, 63348.05371848,\n", " 76776.05372798, 84894.05373618, 90430.05374327, 94611.0537494 ,\n", " 89160.05375469, 98552.05375927, 100838.05376322, 108888.05376663,\n", " 105260.05376958, 100519.05377213, 101298.05377433, 105590.05377623,\n", " 100230.05377788, 100572.0537793 , 102656.05378052, 75485.05378158,\n", " 19611.0537825 , 77943.05378329, 88053.05378397, 90211.05378457,\n", " 101194.05378508, 101160.05378552, 25793.0537859 , 76560.05378623,\n", " 101460.05378651, 48772.05378676, 67567.05378697, 100268.05378715,\n", " 105941.05378731, 101328.05378745, 113267.05378757, 92368.05378767,\n", " 104542.05378776, 80509.05378783, 113960.0537879 , 111803.05378796,\n", " 116010.05378801, 117183.05378805, 105558.05378809, 111941.05378812,\n", " 91844.05378814, 115898.05378817, 109667.05378819, 119354.05378821,\n", " 117044.05378822, 113859.05378823, 115593.05378825, 114095.05378826,\n", " 70144.05378826, 112042.05378827, 121221.05378828, 102407.05378828,\n", " 106417.05378829, 120331.05378829, 118530.0537883 , 119734.0537883 ,\n", " 122388.0537883 , 121135.0537883 , 114344.05378831, 107464.05378831,\n", " 110405.05378831, 115225.05378831, 31767.05378831, 58426.05378831,\n", " 108161.05378831, 119012.05378831, 110579.05378831, 82064.05378832,\n", " 50097.05378832, 117207.05378832, 119853.05378832, 114764.05378832,\n", " 110499.05378832, 116675.05378832, 118836.05378832, 103418.05378832,\n", " 117887.05378832, 114839.05378832, 102891.05378832, 115668.05378832,\n", " 69378.05378832, 16870.05378832, 97497.05378832, 118960.05378832,\n", " 114752.05378832, 111960.05378832, 110320.05378832, 72815.05378832,\n", " 84302.05378832, 112836.05378832, 58459.05378832, 22750.05378832,\n", " 89006.05378832, 114509.05378832, 106813.05378832, 64933.05378832,\n", " 80225.05378832, 43735.05378832, 104479.05378832, 107716.05378832,\n", " 47009.05378832, 23502.05378832, 29586.05378832, 61389.05378832,\n", " 40762.05378832, 63467.05378832, 90082.05378832, 52031.05378832,\n", " 24395.05378832, 35530.05378832, 62985.05378832, 90272.05378832,\n", " 91364.05378832, 84886.05378832, 38278.05378832, 78022.05378832,\n", " 85379.05378832, 94213.05378832, 96676.05378832, 97431.05378832,\n", " 90712.05378832, 52137.05378832, 92605.05378832, 102308.05378832,\n", " 99905.05378832, 52180.05378832, 44756.05378832, 74056.05378832,\n", " 45808.05378832, 64915.05378832, 99673.05378832, 99903.05378832,\n", " 25755.05378832, 83535.05378832, 127070.05378832, 125264.05378832,\n", " 66136.05378832, 15155.05378832, 116617.05378832, 87423.05378832,\n", " 119161.05378832, 73485.05378832, 56714.05378832, 130932.05378832,\n", " 118476.05378832, 123923.05378832, 100756.05378832, 122099.05378832,\n", " 130374.05378832, 129644.05378832, 122809.05378832, 120380.05378832,\n", " 124630.05378832, 123354.05378832, 124005.05378832, 126731.05378832,\n", " 127122.05378832, 121621.05378832, 119725.05378832, 125834.05378832,\n", " 83687.05378832, 94110.05378832, 87286.05378832, 109737.05378832,\n", " 98102.05378832, 105167.05378832, 112380.05378832, 115359.05378832,\n", " 117520.05378832, 116098.05378832, 116689.05378832, 107416.05378832,\n", " 109732.05378832, 109171.05378832, 105009.05378832, 104566.05378832,\n", " 104620.05378832, 106851.05378832, 101412.05378832, 107121.05378832,\n", " 110588.05378832, 108577.05378832, 104111.05378832, 96683.05378832,\n", " 89463.05378832, 92508.05378832, 104385.05378832, 107422.05378832,\n", " 107044.05378832, 104278.05378832, 109099.05378832, 101455.05378832,\n", " 44936.05378832, 98716.05378832, 81450.05378832, 86170.05378832,\n", " 97101.05378832, 103826.05378832, 96162.05378832, 74255.05378832,\n", " 85493.05378832, 93976.05378832, 98606.05378832, 100204.05378832,\n", " 102765.05378832, 93564.05378832, 81109.05378832, 99594.05378832,\n", " 93269.05378832, 56463.05378832, 46601.05378832, 72343.05378832,\n", " 72301.05378832, 46839.05378832, 70777.05378832, 73876.05378832,\n", " 76932.05378832, 76308.05378832, 70103.05378832, 50503.05378832,\n", " 44222.05378832, 61936.05378832, 65896.05378832, 67622.05378832,\n", " 63544.05378832, 62926.05378832, 54964.05378832, 52627.05378832,\n", " 61820.05378832, 59625.05378832, 62487.05378832, 65432.05378832,\n", " 65547.05378832, 54157.05378832, 26373.05378832, 40041.05378832,\n", " 40039.05378832, 39689.05378832, 44326.05378832, 42206.05378832,\n", " 36141.05378832, 35453.05378832, 49297.05378832, 54497.05378832,\n", " 54229.05378832, 53113.05378832, 40611.05378832, 49148.05378832,\n", " 49114.05378832, 58923.05378832, 47758.05378832, 34499.05378832,\n", " 36546.05378832])" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "a = (pandas2ri.ri2py(forecast).values)\n", "a.shape\n", "a = a.mean(axis=1)\n", "a" ] }, { "cell_type": "code", "execution_count": 46, "id": "469a1846", "metadata": { "hidden": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
daterental
02021-01-0139933.104114
12021-01-0242188.289944
22021-01-0337130.678527
32021-01-0434717.096346
42021-01-0526958.118226
.........
3602021-12-2749114.053788
3612021-12-2858923.053788
3622021-12-2947758.053788
3632021-12-3034499.053788
3642021-12-3136546.053788
\n", "

365 rows × 2 columns

\n", "
" ], "text/plain": [ " date rental\n", "0 2021-01-01 39933.104114\n", "1 2021-01-02 42188.289944\n", "2 2021-01-03 37130.678527\n", "3 2021-01-04 34717.096346\n", "4 2021-01-05 26958.118226\n", ".. ... ...\n", "360 2021-12-27 49114.053788\n", "361 2021-12-28 58923.053788\n", "362 2021-12-29 47758.053788\n", "363 2021-12-30 34499.053788\n", "364 2021-12-31 36546.053788\n", "\n", "[365 rows x 2 columns]" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "submission = pd.read_csv('./input/dankook/sample_submission.csv')\n", "submission['rental'] = a\n", "submission.to_csv('./output/dankook/dankook1.csv', index=False)\n", "submission" ] }, { "cell_type": "markdown", "id": "ce6f082f", "metadata": { "heading_collapsed": true }, "source": [ "### 시계열 모델 선택과 검증\n", "#### 자기회귀누적이동평균 모델" ] }, { "cell_type": "markdown", "id": "9a73491a", "metadata": { "hidden": true }, "source": [ "#### 지수평활법" ] }, { "cell_type": "code", "execution_count": 56, "id": "94ed587c", "metadata": { "hidden": true }, "outputs": [], "source": [ "import rpy2.robjects as robjects # r 함수를 파이썬에서 사용 가능하게 변환하는 모듈\n", "from rpy2.robjects import pandas2ri # 파이썬 자료형과 R 자료형의 호환을 도와주는 모듈\n", "\n", "# pandas2ri를 활성화 \n", "pandas2ri.activate()\n", "\n", "ets = \"\"\"\n", " function(ts){\n", " library(forecast) # forecast 패키지 로드\n", " model = ets(ts) # AIC가 낮은 지수평활 모델을 찾음 \n", " forecasted_data = forecast(model, h=365) # 이후 3개월(h=3)을 예측\n", " out_df = data.frame(forecasted_data$mean) # 예측값을 R의 데이터프레임으로 변환\n", " colnames(out_df) = c('amount') # amount라는 열로 이름을 지정\n", " out_df\n", " }\n", "\"\"\"\n", "# r() 함수로 r 자료형을 파이썬에서 사용 가능\n", "ets = robjects.r(ets)# str 형식으로 정의된 ets\n", "ts = robjects.r('ts')# r 자료형 time series 자료형으로 만들어주는 함수\n", "c = robjects.r('c') # r 자료형 벡터를 만들어주는 함수\n", "\n", " \n", "# R의 ts 함수로 r의 time series 자료형으로 변환\n", "ts_train2 = ts(train['rental'], start=c(2018, 0), frequency=365) \n", "\n", "#ensemble model\n", "forecast = auto_arima(ts_train2)\n", "b = (pandas2ri.ri2py(forecast).values) # 3개월 매출을 합산" ] }, { "cell_type": "code", "execution_count": 57, "id": "0a33a60f", "metadata": { "hidden": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
daterental
02021-01-0139933.104114
12021-01-0242188.289944
22021-01-0337130.678527
32021-01-0434717.096346
42021-01-0526958.118226
.........
3602021-12-2749114.053788
3612021-12-2858923.053788
3622021-12-2947758.053788
3632021-12-3034499.053788
3642021-12-3136546.053788
\n", "

365 rows × 2 columns

\n", "
" ], "text/plain": [ " date rental\n", "0 2021-01-01 39933.104114\n", "1 2021-01-02 42188.289944\n", "2 2021-01-03 37130.678527\n", "3 2021-01-04 34717.096346\n", "4 2021-01-05 26958.118226\n", ".. ... ...\n", "360 2021-12-27 49114.053788\n", "361 2021-12-28 58923.053788\n", "362 2021-12-29 47758.053788\n", "363 2021-12-30 34499.053788\n", "364 2021-12-31 36546.053788\n", "\n", "[365 rows x 2 columns]" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "submission = pd.read_csv('./input/dankook/sample_submission.csv')\n", "submission['rental'] = b\n", "submission.to_csv('./output/dankook/dankook2.csv', index=False)\n", "submission" ] }, { "cell_type": "markdown", "id": "3b3367f3", "metadata": { "hidden": true }, "source": [ "#### STL 분해를 적용한 지수평활법" ] }, { "cell_type": "code", "execution_count": 58, "id": "bf943d15", "metadata": { "hidden": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\HOME\\anaconda3\\envs\\dankook\\lib\\site-packages\\ipykernel_launcher.py:6: FutureWarning: the 'freq' keyword is deprecated, use 'period' instead\n", " \n" ] }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "from statsmodels.tsa.seasonal import seasonal_decompose\n", "import matplotlib.pyplot as plt\n", "\n", "t_train = time_series(train)\n", "# STL 분해\n", "stl = seasonal_decompose(t_train.values, freq=12)\n", "stl.plot()\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 59, "id": "ca96a122", "metadata": { "hidden": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Wall time: 39min 30s\n" ] } ], "source": [ "%%time\n", "\n", "import rpy2.robjects as robjects # r 함수를 파이썬에서 사용 가능하게 변환하는 모듈\n", "from rpy2.robjects import pandas2ri # 파이썬 자료형과 R 자료형의 호환을 도와주는 모듈\n", "\n", "# pandas2ri를 활성화 \n", "pandas2ri.activate()\n", "stlm = \"\"\"\n", " function(ts){\n", " library(forecast) # forecast 패키지 로드\n", " model = stlm(ts, s.window=\"periodic\", method='ets') # STL 분해 후 지수평활법을 통한 예측 \n", " forecasted_data = forecast(model, h=365) # 이후 3개월(h=3)을 예측\n", " out_df = data.frame(forecasted_data$mean) # 예측값을 R의 데이터프레임으로 변환\n", " colnames(out_df) = c('amount') # amount라는 열로 이름을 지정\n", " out_df\n", " }\n", "\"\"\"\n", "ets = \"\"\"\n", " function(ts){\n", " library(forecast) # forecast 패키지 로드\n", " model = ets(ts) # AIC가 낮은 지수평활 모델을 찾음 \n", " forecasted_data = forecast(model, h=365) # 이후 3개월(h=3)을 예측\n", " out_df = data.frame(forecasted_data$mean) # 예측값을 R의 데이터프레임으로 변환\n", " colnames(out_df) = c('amount') # amount라는 열로 이름을 지정\n", " out_df\n", " }\n", "\"\"\"\n", "# r() 함수로 r을 파이썬에서 사용 가능\n", "stlm = robjects.r(stlm)# str 형식으로 정의된 stlm\n", "ets = robjects.r(ets)# str 형식으로 정의된 ets\n", "ts = robjects.r('ts')# r 자료형 time series 자료형으로 만들어주는 함수\n", "c = robjects.r('c') # r 자료형 벡터를 만들어주는 함수\n", "\n", "# R의 ts 함수로 r의 time series 자료형으로 변환\n", "ts_train3 = ts(train['rental'], start=c(2018, 0), frequency=365) \n", "\n", "#ensemble model\n", "forecast = auto_arima(ts_train3)\n", "c = (pandas2ri.ri2py(forecast).values) # 3개월 매출을 합산" ] }, { "cell_type": "code", "execution_count": 60, "id": "fd2a5dcd", "metadata": { "hidden": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
daterental
02021-01-0139933.104114
12021-01-0242188.289944
22021-01-0337130.678527
32021-01-0434717.096346
42021-01-0526958.118226
.........
3602021-12-2749114.053788
3612021-12-2858923.053788
3622021-12-2947758.053788
3632021-12-3034499.053788
3642021-12-3136546.053788
\n", "

365 rows × 2 columns

\n", "
" ], "text/plain": [ " date rental\n", "0 2021-01-01 39933.104114\n", "1 2021-01-02 42188.289944\n", "2 2021-01-03 37130.678527\n", "3 2021-01-04 34717.096346\n", "4 2021-01-05 26958.118226\n", ".. ... ...\n", "360 2021-12-27 49114.053788\n", "361 2021-12-28 58923.053788\n", "362 2021-12-29 47758.053788\n", "363 2021-12-30 34499.053788\n", "364 2021-12-31 36546.053788\n", "\n", "[365 rows x 2 columns]" ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ "submission = pd.read_csv('./input/dankook/sample_submission.csv')\n", "submission['rental'] = c\n", "submission.to_csv('./output/dankook/dankook3.csv', index=False)\n", "submission" ] }, { "cell_type": "code", "execution_count": 61, "id": "c7569c54", "metadata": { "hidden": true }, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "submission.plot()" ] }, { "cell_type": "markdown", "id": "d8948aa7", "metadata": { "heading_collapsed": true }, "source": [ "## 성능 향상을 위한 방법\n", "### 상점 매출액의 로그 정규화" ] }, { "cell_type": "code", "execution_count": 51, "id": "a5ed6e1a", "metadata": { "hidden": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "log-regularization mae: 2401.9664819482714\n", "mae: 25197.053143496625\n" ] } ], "source": [ "import rpy2.robjects as robjects # r 함수를 파이썬에서 사용 가능하게 변환하는 모듈\n", "from rpy2.robjects import pandas2ri # 파이썬 자료형과 R 자료형의 호환을 도와주는 모듈\n", "import numpy as np\n", "\n", "# pandas2ri를 활성화 \n", "pandas2ri.activate()\n", "\n", "auto_arima = \"\"\"\n", " function(ts){\n", " library(forecast) # forecast 패키지 로드\n", " d_params = ndiffs(ts) # 시계열 자료의 차분 횟수 계산\n", " model = auto.arima(ts, max.p=2, d=d_params) # auto.arima 모델 생성\n", " forecasted_data = forecast(model, h=365) # 이후 3개월(h=3)을 예측\n", " out_df = data.frame(forecasted_data$mean) # 예측값을 R의 데이터프레임으로 변환\n", " colnames(out_df) = c('amount') # amount라는 열로 이름을 지정\n", " out_df\n", " }\n", "\"\"\"\n", "\n", "# r() 함수로 r 자료형을 파이썬에서 사용 가능\n", "auto_arima = robjects.r(auto_arima)\n", "ts = robjects.r('ts')# r 자료형 time series 자료형으로 만들어주는 함수\n", "c = robjects.r('c') # r 자료형 벡터를 만들어주는 함수\n", "log = robjects.r('log')# 로그 변환 함수\n", "exp = robjects.r('exp')# 로그 역변환 함수\n", "\n", "# 0번 상점 추출\n", "store_0 = resampling_data[resampling_data['store_id']==0]\n", "start_year = int(min(store_0['year_month'])[:4]) # 영업 시작 년도\n", "start_month = int(min(store_0['year_month'])[5:]) # 영업 시작 월\n", "\n", "# train, test 분리\n", "train = store_0[store_0.index <= len(store_0)-4]\n", "test = store_0[store_0.index > len(store_0)-4]\n", "\n", "# R의 ts 함수로 r의 time series 자료형으로 변환\n", "train_log = ts(log(train['amount']), start=c(start_year, start_month), frequency=12) # log 정규화 \n", "train = ts(train['amount'], start=c(start_year, start_month), frequency=12) # log 정규화를 하지 않음\n", "\n", "# model arima\n", "forecast_log = auto_arima(train_log)\n", "forecast = auto_arima(train)\n", "\n", "# pred\n", "pred_log = np.sum(pandas2ri.ri2py(exp(forecast_log)).values) #로그 역변환 후 3개월 합산\n", "pred = np.sum(pandas2ri.ri2py(forecast).values) #3개월 매출을 합산\n", "\n", "# test(2018-12~2019-02)\n", "test = np.sum(test['amount'])\n", "\n", "# mae\n", "print('log-regularization mae: ', abs(test-pred_log))\n", "print('mae:', abs(test-pred))" ] }, { "cell_type": "code", "execution_count": 52, "id": "a1fa5967", "metadata": { "hidden": true }, "outputs": [], "source": [ "# 매출 변동 계수를 구하는 함수\n", "def coefficient_variation(df, i):\n", " cv_data = df.groupby(['store_id']).amount.std()/df.groupby(['store_id']).amount.mean()\n", " cv = cv_data[i]\n", " return cv" ] }, { "cell_type": "code", "execution_count": 53, "id": "f030b1f5", "metadata": { "hidden": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████████████████████████████████████████████████████████████████████████| 1967/1967 [48:06<00:00, 1.47s/it]\n" ] } ], "source": [ "import rpy2.robjects as robjects # r 함수를 파이썬에서 사용 가능하게 변환하는 모듈\n", "from rpy2.robjects import pandas2ri # 파이썬 자료형과 R 자료형의 호환을 도와주는 모듈\n", "import numpy as np\n", "\n", "# pandas2ri를 활성화 \n", "pandas2ri.activate()\n", "\n", "ets = \"\"\"\n", " function(ts){\n", " library(forecast) # forecast 패키지 로드\n", " model = ets(ts) # AIC가 낮은 지수평활 모델을 찾음 \n", " forecasted_data = forecast(model, h=3) # 이후 3개월(h=3)을 예측\n", " out_df = data.frame(forecasted_data$mean) # 예측값을 R의 데이터프레임으로 변환\n", " colnames(out_df) = c('amount') # amount라는 열로 이름을 지정\n", " out_df\n", " }\n", "\"\"\"\n", "\n", "# r() 함수로 r 자료형을 파이썬에서 사용 가능\n", "ets = robjects.r(ets)\n", "ts = robjects.r('ts') # r 자료형 time series 자료형으로 만들어주는 함수\n", "c = robjects.r('c') # r 자료형 벡터를 만들어주는 함수\n", "log = robjects.r('log') # 로그 변환 함수\n", "exp = robjects.r('exp')# 로그 역변환 함수\n", "\n", "final_pred = []\n", "\n", "for i in tqdm(resampling_data.store_id.unique()):\n", " store = resampling_data[resampling_data['store_id']==i]\n", " start_year = int(min(store['year_month'])[:4]) # 영업 시작 년도\n", " start_month = int(min(store['year_month'])[5:]) # 영업 시작 월\n", " \n", " cv = coefficient_variation(resampling_data, i)\n", " # 매출액 변동 계수가 0.3 미만인 경우만 log를 씌움\n", " if cv < 0.3:\n", " train_log = ts(log(store['amount']), start=c(start_year,start_month), frequency=12) \n", " # ets model\n", " forecast_log = ets(train_log)\n", " final_pred.append(np.sum(pandas2ri.ri2py(exp(forecast_log)).values))\n", " # 매출액 변동 계수가 0.3 이상인 경우\n", " else:\n", " train = ts(store['amount'], start=c(start_year,start_month), frequency=12)\n", " # 지수평활법\n", " forecast = ets(train)\n", " final_pred.append(np.sum(pandas2ri.ri2py(forecast).values)) " ] }, { "cell_type": "code", "execution_count": 54, "id": "dc640ec8", "metadata": { "hidden": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
store_idamount
001.963401e+06
112.483335e+05
221.260318e+06
342.660373e+06
457.091162e+05
.........
196221322.122653e+06
196321333.488475e+05
196421344.249576e+05
196521351.603989e+06
196621366.504026e+06
\n", "

1967 rows × 2 columns

\n", "
" ], "text/plain": [ " store_id amount\n", "0 0 1.963401e+06\n", "1 1 2.483335e+05\n", "2 2 1.260318e+06\n", "3 4 2.660373e+06\n", "4 5 7.091162e+05\n", "... ... ...\n", "1962 2132 2.122653e+06\n", "1963 2133 3.488475e+05\n", "1964 2134 4.249576e+05\n", "1965 2135 1.603989e+06\n", "1966 2136 6.504026e+06\n", "\n", "[1967 rows x 2 columns]" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "submission = pd.read_csv('./submission.csv')\n", "submission['amount'] = final_pred\n", "submission.to_csv('submission4.csv', index=False)\n", "submission" ] }, { "cell_type": "markdown", "id": "447b24e2", "metadata": { "hidden": true }, "source": [ "### 4.5.2.\t파이썬에서 R 시계열 패키지 forecastHybrid를 통한 앙상블" ] }, { "cell_type": "code", "execution_count": 55, "id": "d621b190", "metadata": { "hidden": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████████████████████████████████████████████████████████████████████████| 1967/1967 [38:40<00:00, 1.18s/it]\n" ] } ], "source": [ "import rpy2.robjects as robjects # r 함수를 파이썬에서 사용 가능하게 변환하는 모듈\n", "from rpy2.robjects import pandas2ri # 파이썬 자료형과 R 자료형의 호환을 도와주는 모듈\n", "import numpy as np\n", "\n", "# pandas2ri를 활성화 \n", "pandas2ri.activate()\n", "\n", "hybridModel = \"\"\"\n", " function(ts){\n", " library(forecast)\n", " library(forecastHybrid)\n", " d_params=ndiffs(ts)\n", " hb_mdl<-hybridModel(ts, models=\"aes\", # auto_arima, ets, stlm\n", " a.arg=list(max.p=2, d=d_params), # auto_arima parameter\n", " weight=\"equal\") # 가중치를 동일하게 줌(평균)\n", " forecasted_data<-forecast(hb_mdl, h=365) # 이후 3개월(h=3)을 예측\n", " outdf<-data.frame(forecasted_data$mean)\n", " colnames(outdf)<-c('amount')\n", " outdf\n", " }\n", "\"\"\" \n", "\n", "# r() 함수로 r 자료형을 파이썬에서 사용 가능\n", "hybridModel = robjects.r(hybridModel)\n", "ts = robjects.r('ts') # r 자료형 time series 자료형으로 만들어주는 함수\n", "c = robjects.r('c') # r 자료형 벡터를 만들어주는 함수\n", "log = robjects.r('log') # 로그 변환 함수\n", "exp = robjects.r('exp')# 로그 역변환 함수\n", "\n", "final_pred = []\n", "\n", "for i in tqdm(resampling_data.store_id.unique()):\n", " store = resampling_data[resampling_data['store_id']==i]\n", " start_year = int(min(store['year_month'])[:4]) # 영업 시작 년도\n", " start_month = int(min(store['year_month'])[5:]) # 영업 시작 월\n", " \n", " cv = coefficient_variation(resampling_data, i)\n", " # 매출액 변동 계수가 0.3 미만인 경우만 log를 씌움\n", " if cv < 0.3:\n", " train_log = ts(log(store['amount']), start=c(start_year,start_month), frequency=12) \n", " # 앙상블 예측\n", " forecast_log = hybridModel(train_log)\n", " final_pred.append(np.sum(pandas2ri.ri2py(exp(forecast_log)).values)) \n", " # 매출액 변동 계수가 0.3 이상인 경우\n", " else:\n", " train = ts(store['amount'], start=c(start_year,start_month), frequency=12)\n", " # 앙상블 예측\n", " forecast = hybridModel(train)\n", " final_pred.append(np.sum(pandas2ri.ri2py(forecast).values))\n" ] }, { "cell_type": "code", "execution_count": 57, "id": "dd7cad3b", "metadata": { "hidden": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
store_idamount
001.973673e+06
112.700469e+05
221.279552e+06
342.695279e+06
458.240620e+05
.........
196221322.115933e+06
196321336.311052e+05
196421343.021292e+05
196521351.626442e+06
196621366.504046e+06
\n", "

1967 rows × 2 columns

\n", "
" ], "text/plain": [ " store_id amount\n", "0 0 1.973673e+06\n", "1 1 2.700469e+05\n", "2 2 1.279552e+06\n", "3 4 2.695279e+06\n", "4 5 8.240620e+05\n", "... ... ...\n", "1962 2132 2.115933e+06\n", "1963 2133 6.311052e+05\n", "1964 2134 3.021292e+05\n", "1965 2135 1.626442e+06\n", "1966 2136 6.504046e+06\n", "\n", "[1967 rows x 2 columns]" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "submission = pd.read_csv('./submission.csv')\n", "submission['amount'] = final_pred\n", "submission.to_csv('submission5.csv', index=False)\n", "submission" ] }, { "cell_type": "code", "execution_count": null, "id": "2a052333", "metadata": { "hidden": true }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "1d3edfe3", "metadata": { "hidden": true }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "1d653b73", "metadata": { "hidden": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "[dankook]", "language": "python", "name": "dankook" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.13" }, "varInspector": { "cols": { "lenName": 16, "lenType": 16, "lenVar": 40 }, "kernels_config": { "python": { "delete_cmd_postfix": "", "delete_cmd_prefix": "del ", "library": "var_list.py", "varRefreshCmd": "print(var_dic_list())" }, "r": { "delete_cmd_postfix": ") ", "delete_cmd_prefix": "rm(", "library": "var_list.r", "varRefreshCmd": "cat(var_dic_list()) " } }, "types_to_exclude": [ "module", "function", "builtin_function_or_method", "instance", "_Feature" ], "window_display": false } }, "nbformat": 4, "nbformat_minor": 5 }