{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os\n", "import pandas as pd\n", "import numpy as np\n", "from sklearn.model_selection import train_test_split" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idfeat_1feat_2feat_3feat_4feat_5feat_6feat_7feat_8feat_9...feat_85feat_86feat_87feat_88feat_89feat_90feat_91feat_92feat_93target
01100000000...100000000Class_1
12000000010...000000000Class_1
23000000010...000000000Class_1
34100161500...012000000Class_1
45000000000...100001000Class_1
\n", "

5 rows × 95 columns

\n", "
" ], "text/plain": [ " id feat_1 feat_2 feat_3 feat_4 feat_5 feat_6 feat_7 feat_8 feat_9 \\\n", "0 1 1 0 0 0 0 0 0 0 0 \n", "1 2 0 0 0 0 0 0 0 1 0 \n", "2 3 0 0 0 0 0 0 0 1 0 \n", "3 4 1 0 0 1 6 1 5 0 0 \n", "4 5 0 0 0 0 0 0 0 0 0 \n", "\n", " ... feat_85 feat_86 feat_87 feat_88 feat_89 feat_90 feat_91 \\\n", "0 ... 1 0 0 0 0 0 0 \n", "1 ... 0 0 0 0 0 0 0 \n", "2 ... 0 0 0 0 0 0 0 \n", "3 ... 0 1 2 0 0 0 0 \n", "4 ... 1 0 0 0 0 1 0 \n", "\n", " feat_92 feat_93 target \n", "0 0 0 Class_1 \n", "1 0 0 Class_1 \n", "2 0 0 Class_1 \n", "3 0 0 Class_1 \n", "4 0 0 Class_1 \n", "\n", "[5 rows x 95 columns]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = pd.read_csv('./Data/otto_train.csv')\n", "data.head()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "data = data.drop(['id'], axis=1)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "mapping_dict = {'Class_1' : 1,\n", " 'Class_2' : 2,\n", " 'Class_3' : 3,\n", " 'Class_4' : 4,\n", " 'Class_5' : 5,\n", " 'Class_6' : 6,\n", " 'Class_7' : 7,\n", " 'Class_8' : 8,\n", " 'Class_9' : 9}" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "after_mapping_target = data['target'].apply(lambda x : mapping_dict[x])" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "feature_columns = list(data.columns.difference(['target']))\n", "X = data[feature_columns]\n", "y = after_mapping_target\n", "\n", "train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.6771170006464124\n" ] } ], "source": [ "from sklearn.ensemble import AdaBoostClassifier\n", "from sklearn.tree import DecisionTreeClassifier\n", "from sklearn.metrics import accuracy_score\n", "\n", "clf = AdaBoostClassifier(n_estimators=100, random_state=1)\n", "clf.fit(train_x, train_y)\n", "pred1 = clf.predict(test_x)\n", "print(accuracy_score(test_y, pred1))" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.6085972850678733\n" ] } ], "source": [ "tree_model = DecisionTreeClassifier(max_depth=5)\n", "clf1 = AdaBoostClassifier(base_estimator=tree_model, n_estimators=100, random_state=0)\n", "clf1.fit(train_x, train_y)\n", "pred2 = clf1.predict(test_x)\n", "print(accuracy_score(test_y, pred2))" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.7427278603749192\n" ] } ], "source": [ "tree_model = DecisionTreeClassifier(max_depth=20)\n", "clf1 = AdaBoostClassifier(base_estimator=tree_model, n_estimators=10, random_state=0)\n", "clf1.fit(train_x, train_y)\n", "pred2 = clf1.predict(test_x)\n", "print(accuracy_score(test_y, pred2))" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[20:55:09] WARNING: /Users/runner/miniforge3/conda-bld/xgboost_1593723618214/work/src/learner.cc:480: \n", "Parameters: { n_estimators } might not be used.\n", "\n", " This may not be accurate due to some parameters are only used in language bindings but\n", " passed down to XGBoost core. Or some parameters are not used but slip through this\n", " verification. 
Please open an issue if you find above cases.\n", "\n", "\n", "Accuracy : 76.67 %\n", "Time : 10.01 seconds\n" ] } ], "source": [ "import xgboost as xgb\n", "import time\n", "\n", "start = time.time()\n", "xgb_dtrain = xgb.DMatrix(data = train_x, label = train_y) # convert the training data into XGBoost's DMatrix format\n", "xgb_dtest = xgb.DMatrix(data=test_x)\n", "xgb_param = {'max_depth' : 10, # tree depth\n", "             'learning_rate' : 0.01, # step size\n", "             'n_estimators' : 100, # ignored by xgb.train (hence the warning above); num_boost_round controls the number of rounds\n", "             'num_class' : len(set(train_y)) + 1} # labels are 1..9, so num_class must be 10\n", "\n", "xgb_model = xgb.train(params = xgb_param, dtrain=xgb_dtrain) # train the model\n", "xgb_model_predict = xgb_model.predict(xgb_dtest) # predict on the test data\n", "\n", "print('Accuracy : %.2f' %(accuracy_score(test_y, xgb_model_predict) * 100), '%')\n", "print('Time : %.2f' %(time.time() - start), 'seconds')" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([5., 3., 6., ..., 9., 2., 7.], dtype=float32)" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "xgb_model_predict # predicted class labels" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).\n", "[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).\n", "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021106 seconds.\n", "You can set `force_row_wise=true` to remove the overhead.\n", "And if memory is not enough, you can set `force_col_wise=true`.\n", "[LightGBM] [Info] Total Bins 3110\n", "[LightGBM] [Info] Number of data points in the train set: 49502, number of used features: 93\n", "[LightGBM] [Info] Start training from score -34.538776\n", "[LightGBM] [Info] Start training from score -3.476745\n", "[LightGBM] [Info] Start training from score -1.341381\n", "[LightGBM] [Info] Start training from score -2.039019\n", "[LightGBM] [Info] Start training from score -3.135151\n", "[LightGBM] [Info] Start training from score -3.125444\n", "[LightGBM] [Info] Start training from score -1.481556\n", "[LightGBM] [Info] Start training from score -3.074772\n", "[LightGBM] [Info] Start training from score -1.986562\n", "[LightGBM] [Info] Start training from score -2.533374\n", "Accuracy: 76.28 %\n", "Time: 8.22 seconds\n" ] } ], "source": [ "import lightgbm as lgb\n", "start = time.time() # record the start time\n", "lgb_dtrain = lgb.Dataset(data = train_x, label = train_y) # convert the training data into LightGBM's Dataset format\n", "lgb_param = {'max_depth': 10, # tree depth\n", "             'learning_rate': 0.01, # step size\n", "             'n_estimators': 100, # number of trees\n", "             'objective': 'multiclass', # objective function\n", "             'num_class': len(set(train_y)) + 1} # labels must lie in [0, num_class); ours are 1..9, so num_class must be one larger\n", "lgb_model = lgb.train(params = lgb_param, train_set = lgb_dtrain) # train the model\n", "lgb_model_predict = np.argmax(lgb_model.predict(test_x), axis = 1) # predict: take the class with the largest softmax output\n", "print(\"Accuracy: %.2f\" % (accuracy_score(test_y, lgb_model_predict) * 100), \"%\") # accuracy in percent\n", "print(\"Time: %.2f\" % (time.time() - start), \"seconds\") # elapsed time" ] },
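{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# A hedged sketch, not part of the original run: LightGBM expects multiclass labels in\n", "# [0, num_class), so the cell above reserves an unused class 0 by setting num_class = 10.\n", "# Shifting the labels to 0..8 avoids the dummy class; the *0 variable names are illustrative.\n", "lgb_dtrain0 = lgb.Dataset(data = train_x, label = train_y - 1) # labels 1..9 -> 0..8\n", "lgb_param0 = dict(lgb_param, num_class = 9) # same parameters, no wasted class slot\n", "lgb_model0 = lgb.train(params = lgb_param0, train_set = lgb_dtrain0)\n", "pred0 = np.argmax(lgb_model0.predict(test_x), axis = 1) + 1 # shift back to 1..9\n", "print(\"Accuracy: %.2f\" % (accuracy_score(test_y, pred0) * 100), \"%\")" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[1.01734061e-15, 2.25081693e-02, 3.62193933e-01, ...,\n", "        3.24234521e-02, 5.82126692e-02, 3.67722414e-02],\n", "       [1.14084116e-15, 5.36978636e-02, 1.90687128e-01, ...,\n", "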
3.25081119e-01, 9.38028846e-02, 6.50463131e-02],\n", " [5.94595781e-16, 9.66842220e-03, 5.82817482e-02, ...,\n", " 1.42318289e-02, 3.40230275e-02, 2.14919364e-02],\n", " ...,\n", " [7.09105769e-16, 4.63740004e-02, 1.08297559e-01, ...,\n", " 5.46934960e-02, 7.24513712e-02, 5.74635996e-01],\n", " [9.88127136e-16, 1.54895684e-02, 5.45515599e-01, ...,\n", " 2.45870954e-02, 5.65410617e-02, 3.62344513e-02],\n", " [7.59617500e-16, 1.49480877e-02, 7.44570300e-02, ...,\n", " 5.76695793e-01, 1.43227106e-01, 2.74567219e-02]])" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lgb_model.predict(test_x)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0:\tlearn: 0.5907034\ttotal: 390ms\tremaining: 38.6s\n", "1:\tlearn: 0.6356107\ttotal: 755ms\tremaining: 37s\n", "2:\tlearn: 0.6411256\ttotal: 1.12s\tremaining: 36.3s\n", "3:\tlearn: 0.6480344\ttotal: 1.49s\tremaining: 35.7s\n", "4:\tlearn: 0.6508222\ttotal: 1.86s\tremaining: 35.4s\n", "5:\tlearn: 0.6499939\ttotal: 2.24s\tremaining: 35.1s\n", "6:\tlearn: 0.6507818\ttotal: 2.61s\tremaining: 34.7s\n", "7:\tlearn: 0.6548422\ttotal: 2.98s\tremaining: 34.3s\n", "8:\tlearn: 0.6559533\ttotal: 3.35s\tremaining: 33.9s\n", "9:\tlearn: 0.6560947\ttotal: 3.71s\tremaining: 33.4s\n", "10:\tlearn: 0.6568421\ttotal: 4.07s\tremaining: 32.9s\n", "11:\tlearn: 0.6588219\ttotal: 4.45s\tremaining: 32.6s\n", "12:\tlearn: 0.6592259\ttotal: 4.83s\tremaining: 32.3s\n", "13:\tlearn: 0.6611248\ttotal: 5.23s\tremaining: 32.1s\n", "14:\tlearn: 0.6625591\ttotal: 5.6s\tremaining: 31.7s\n", "15:\tlearn: 0.6631853\ttotal: 5.96s\tremaining: 31.3s\n", "16:\tlearn: 0.6639328\ttotal: 6.32s\tremaining: 30.9s\n", "17:\tlearn: 0.6668821\ttotal: 6.71s\tremaining: 30.5s\n", "18:\tlearn: 0.6669630\ttotal: 7.09s\tremaining: 30.2s\n", "19:\tlearn: 0.6675286\ttotal: 7.46s\tremaining: 29.8s\n", "20:\tlearn: 0.6673266\ttotal: 7.83s\tremaining: 29.4s\n", "21:\tlearn: 0.6677104\ttotal: 8.2s\tremaining: 29.1s\n", "22:\tlearn: 0.6682558\ttotal: 8.57s\tremaining: 28.7s\n", "23:\tlearn: 0.6683972\ttotal: 8.93s\tremaining: 28.3s\n", "24:\tlearn: 0.6686599\ttotal: 9.3s\tremaining: 27.9s\n", "25:\tlearn: 0.6681952\ttotal: 9.67s\tremaining: 27.5s\n", "26:\tlearn: 0.6684982\ttotal: 10s\tremaining: 27.1s\n", "27:\tlearn: 0.6692053\ttotal: 10.4s\tremaining: 26.8s\n", "28:\tlearn: 0.6696699\ttotal: 10.8s\tremaining: 26.5s\n", "29:\tlearn: 0.6699325\ttotal: 11.2s\tremaining: 26.2s\n", "30:\tlearn: 0.6705992\ttotal: 11.6s\tremaining: 25.8s\n", "31:\tlearn: 0.6709426\ttotal: 12s\tremaining: 25.4s\n", "32:\tlearn: 0.6708012\ttotal: 12.3s\tremaining: 25.1s\n", "33:\tlearn: 0.6709426\ttotal: 12.7s\tremaining: 24.7s\n", "34:\tlearn: 0.6707002\ttotal: 13.1s\tremaining: 24.3s\n", "35:\tlearn: 0.6715082\ttotal: 13.5s\tremaining: 24s\n", "36:\tlearn: 0.6705992\ttotal: 13.9s\tremaining: 23.6s\n", "37:\tlearn: 0.6725991\ttotal: 14.3s\tremaining: 23.3s\n", "38:\tlearn: 0.6729829\ttotal: 14.6s\tremaining: 22.9s\n", "39:\tlearn: 0.6725991\ttotal: 15s\tremaining: 22.5s\n", "40:\tlearn: 0.6734273\ttotal: 15.4s\tremaining: 22.2s\n", "41:\tlearn: 0.6738314\ttotal: 15.8s\tremaining: 21.8s\n", "42:\tlearn: 0.6741546\ttotal: 16.2s\tremaining: 21.5s\n", "43:\tlearn: 0.6739728\ttotal: 16.6s\tremaining: 21.1s\n", "44:\tlearn: 0.6741950\ttotal: 17s\tremaining: 20.7s\n", "45:\tlearn: 0.6750636\ttotal: 17.4s\tremaining: 20.4s\n", "46:\tlearn: 0.6758919\ttotal: 17.8s\tremaining: 20s\n", "47:\tlearn: 0.6757707\ttotal: 
18.1s\tremaining: 19.6s\n", "48:\tlearn: 0.6762151\ttotal: 18.5s\tremaining: 19.3s\n", "49:\tlearn: 0.6774474\ttotal: 18.9s\tremaining: 18.9s\n", "50:\tlearn: 0.6777100\ttotal: 19.3s\tremaining: 18.6s\n", "51:\tlearn: 0.6786594\ttotal: 19.7s\tremaining: 18.2s\n", "52:\tlearn: 0.6789827\ttotal: 20.1s\tremaining: 17.8s\n", "53:\tlearn: 0.6804372\ttotal: 20.6s\tremaining: 17.5s\n", "54:\tlearn: 0.6804372\ttotal: 20.9s\tremaining: 17.1s\n", "55:\tlearn: 0.6809220\ttotal: 21.3s\tremaining: 16.8s\n", "56:\tlearn: 0.6812250\ttotal: 21.7s\tremaining: 16.4s\n", "57:\tlearn: 0.6813058\ttotal: 22.2s\tremaining: 16s\n", "58:\tlearn: 0.6811846\ttotal: 22.6s\tremaining: 15.7s\n", "59:\tlearn: 0.6813260\ttotal: 23s\tremaining: 15.3s\n", "60:\tlearn: 0.6816694\ttotal: 23.4s\tremaining: 14.9s\n", "61:\tlearn: 0.6823159\ttotal: 23.8s\tremaining: 14.6s\n", "62:\tlearn: 0.6832653\ttotal: 24.2s\tremaining: 14.2s\n", "63:\tlearn: 0.6840734\ttotal: 24.6s\tremaining: 13.8s\n", "64:\tlearn: 0.6840734\ttotal: 25s\tremaining: 13.5s\n", "65:\tlearn: 0.6846592\ttotal: 25.4s\tremaining: 13.1s\n", "66:\tlearn: 0.6843360\ttotal: 25.8s\tremaining: 12.7s\n", "67:\tlearn: 0.6846390\ttotal: 26.2s\tremaining: 12.3s\n", "68:\tlearn: 0.6854269\ttotal: 26.7s\tremaining: 12s\n", "69:\tlearn: 0.6858309\ttotal: 27.1s\tremaining: 11.6s\n", "70:\tlearn: 0.6858309\ttotal: 27.5s\tremaining: 11.2s\n", "71:\tlearn: 0.6865783\ttotal: 28s\tremaining: 10.9s\n", "72:\tlearn: 0.6864167\ttotal: 28.4s\tremaining: 10.5s\n", "73:\tlearn: 0.6868611\ttotal: 28.8s\tremaining: 10.1s\n", "74:\tlearn: 0.6869217\ttotal: 29.3s\tremaining: 9.75s\n", "75:\tlearn: 0.6870429\ttotal: 29.6s\tremaining: 9.36s\n", "76:\tlearn: 0.6875278\ttotal: 30s\tremaining: 8.97s\n", "77:\tlearn: 0.6881136\ttotal: 30.5s\tremaining: 8.59s\n", "78:\tlearn: 0.6883762\ttotal: 30.9s\tremaining: 8.2s\n", "79:\tlearn: 0.6888207\ttotal: 31.3s\tremaining: 7.82s\n", "80:\tlearn: 0.6892449\ttotal: 31.7s\tremaining: 7.43s\n", "81:\tlearn: 0.6898509\ttotal: 32.1s\tremaining: 7.05s\n", "82:\tlearn: 0.6897095\ttotal: 32.5s\tremaining: 6.67s\n", "83:\tlearn: 0.6902549\ttotal: 33s\tremaining: 6.28s\n", "84:\tlearn: 0.6909822\ttotal: 33.4s\tremaining: 5.9s\n", "85:\tlearn: 0.6910832\ttotal: 33.8s\tremaining: 5.51s\n", "86:\tlearn: 0.6914468\ttotal: 34.2s\tremaining: 5.12s\n", "87:\tlearn: 0.6916084\ttotal: 34.7s\tremaining: 4.73s\n", "88:\tlearn: 0.6919922\ttotal: 35.1s\tremaining: 4.34s\n", "89:\tlearn: 0.6925579\ttotal: 35.5s\tremaining: 3.94s\n", "90:\tlearn: 0.6928407\ttotal: 35.9s\tremaining: 3.55s\n", "91:\tlearn: 0.6930427\ttotal: 36.4s\tremaining: 3.16s\n", "92:\tlearn: 0.6935073\ttotal: 36.8s\tremaining: 2.77s\n", "93:\tlearn: 0.6940932\ttotal: 37.2s\tremaining: 2.37s\n", "94:\tlearn: 0.6944972\ttotal: 37.6s\tremaining: 1.98s\n", "95:\tlearn: 0.6948810\ttotal: 38s\tremaining: 1.58s\n", "96:\tlearn: 0.6951840\ttotal: 38.4s\tremaining: 1.19s\n", "97:\tlearn: 0.6954264\ttotal: 38.8s\tremaining: 793ms\n", "98:\tlearn: 0.6955881\ttotal: 39.2s\tremaining: 396ms\n", "99:\tlearn: 0.6956285\ttotal: 39.6s\tremaining: 0us\n", "Accuracy: 69.64 %\n", "Time: 39.81 seconds\n" ] } ], "source": [ "import catboost as cb\n", "start = time.time() # record the start time\n", "cb_dtrain = cb.Pool(data = train_x, label = train_y) # convert the training data into CatBoost's Pool format\n", "cb_param = {'max_depth': 10, # tree depth\n", "            'learning_rate': 0.01, # step size\n", "            'n_estimators': 100, # number of trees\n", "            'eval_metric': 'Accuracy', # evaluation metric\n", "            'loss_function': 'MultiClass'} # loss (objective) function\n", "cb_model = cb.train(pool = cb_dtrain, params = cb_param) # train the model\n", "cb_model_predict = np.argmax(cb_model.predict(test_x), axis = 1) + 1 # predict: argmax over the raw class scores; +1 maps the 0-based index back to labels 1..9\n", "print(\"Accuracy: %.2f\" % (accuracy_score(test_y, cb_model_predict) * 100), \"%\") # accuracy in percent\n", "print(\"Time: %.2f\" % (time.time() - start), \"seconds\") # elapsed time" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[-0.35426047,  1.22109587,  0.44230101, ..., -0.1698448 ,\n", "        -0.02059177, -0.2130643 ],\n", "       [-0.07235138,  0.42535181,  0.20060428, ...,  0.21863604,\n", "         0.2719157 ,  0.25089315],\n", "       [-0.3315885 , -0.31862353, -0.31279765, ..., -0.29798357,\n", "        -0.24018767, -0.32984969],\n", "       ...,\n", "       [ 0.05304325,  0.02500267, -0.14752573, ..., -0.20741963,\n", "         0.12789417,  1.51166757],\n", "       [-0.55093666,  1.7691278 ,  0.99746884, ..., -0.3420542 ,\n", "        -0.49799871, -0.38136323],\n", "       [-0.3033724 ,  0.09352675, -0.11808658, ...,  0.65825036,\n", "         1.05515787, -0.20799899]])" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cb_model.predict(test_x) # raw per-class scores, not probabilities" ] },
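{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# A hedged sketch, not part of the original run: the same model through CatBoost's\n", "# scikit-learn-style wrapper. CatBoostClassifier.predict returns the original class\n", "# labels directly, so the argmax/+1 step above is not needed; verbose=False silences\n", "# the per-iteration log.\n", "from catboost import CatBoostClassifier\n", "\n", "cb_clf = CatBoostClassifier(max_depth = 10, learning_rate = 0.01, n_estimators = 100,\n", "                            loss_function = 'MultiClass', eval_metric = 'Accuracy', verbose = False)\n", "cb_clf.fit(train_x, train_y)\n", "cb_pred = cb_clf.predict(test_x).ravel() # predict() returns shape (n, 1)\n", "print(\"Accuracy: %.2f\" % (accuracy_score(test_y, cb_pred) * 100), \"%\")" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": {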
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
iddatepricebedroomsbathroomsfloorswaterfrontconditiongradeyr_builtyr_renovatedzipcodelatlong
0712930052020141013T000000221900.031.001.0037195509817847.5112-122.257
1641410019220141209T000000538000.032.252.0037195119919812547.7210-122.319
2563150040020150225T000000180000.021.001.0036193309802847.7379-122.233
3248720087520141209T000000604000.043.001.0057196509813647.5208-122.393
4195440051020150218T000000510000.032.001.0038198709807447.6168-122.045
\n", "
" ], "text/plain": [ " id date price bedrooms bathrooms floors \\\n", "0 7129300520 20141013T000000 221900.0 3 1.00 1.0 \n", "1 6414100192 20141209T000000 538000.0 3 2.25 2.0 \n", "2 5631500400 20150225T000000 180000.0 2 1.00 1.0 \n", "3 2487200875 20141209T000000 604000.0 4 3.00 1.0 \n", "4 1954400510 20150218T000000 510000.0 3 2.00 1.0 \n", "\n", " waterfront condition grade yr_built yr_renovated zipcode lat \\\n", "0 0 3 7 1955 0 98178 47.5112 \n", "1 0 3 7 1951 1991 98125 47.7210 \n", "2 0 3 6 1933 0 98028 47.7379 \n", "3 0 5 7 1965 0 98136 47.5208 \n", "4 0 3 8 1987 0 98074 47.6168 \n", "\n", " long \n", "0 -122.257 \n", "1 -122.319 \n", "2 -122.233 \n", "3 -122.393 \n", "4 -122.045 " ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 데이터 불러오기\n", "data = pd.read_csv(\"./data/kc_house_data.csv\") \n", "data.head() # 데이터 확인" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "data = data.drop(['id', 'date', 'zipcode', 'lat', 'long'], axis = 1) # id, date, zipcode, lat, long 제거" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(15129, 8) (6484, 8) (15129,) (6484,)\n" ] } ], "source": [ "feature_columns = list(data.columns.difference(['price'])) # Price를 제외한 모든 행\n", "X = data[feature_columns]\n", "y = data['price']\n", "train_x, test_x, train_y, test_y = train_test_split(X, y, test_size = 0.3, random_state = 42) # 학습데이터와 평가데이터의 비율을 7:3\n", "print(train_x.shape, test_x.shape, train_y.shape, test_y.shape) # 데이터 개수 확인" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/minseok/opt/anaconda3/lib/python3.8/site-packages/lightgbm/engine.py:151: UserWarning: Found `n_estimators` in params. Will use it instead of argument\n", " warnings.warn(\"Found `{}` in params. Will use it instead of argument\".format(alias))\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).\n", "[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. 
(num_leaves=31).\n", "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001018 seconds.\n", "You can set `force_row_wise=true` to remove the overhead.\n", "And if memory is not enough, you can set `force_col_wise=true`.\n", "[LightGBM] [Info] Total Bins 237\n", "[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8\n", "[LightGBM] [Info] Start training from score 537729.263666\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n" ] } ], "source": [ "import lightgbm as lgb\n", "start = time.time() # record the start time\n", "lgb_dtrain = lgb.Dataset(data = train_x, label = train_y) # convert the training data into LightGBM's Dataset format\n", "lgb_param = {'max_depth': 10, # tree depth\n", "             'learning_rate': 0.01, # step size\n", "             'n_estimators': 500, # number of trees; lgb.train reads this from params, hence the UserWarning above\n", "             'objective': 'regression'} # regression objective\n", "lgb_model = lgb.train(params = lgb_param, train_set = lgb_dtrain) # train the model" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "210904.17249451784" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.metrics import mean_squared_error, r2_score\n", "from math import sqrt\n", "\n", "sqrt(mean_squared_error(lgb_model.predict(test_x),test_y)) # RMSE of a single model on the test set" ] },
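{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# A hedged sketch, not part of the original run: rather than fixing 500 trees up front,\n", "# hold out part of the training data and let early stopping choose the number of rounds.\n", "# lgb.early_stopping is assumed to be available (LightGBM 3.x callbacks API); older\n", "# versions pass early_stopping_rounds to lgb.train instead.\n", "tr_x, val_x, tr_y, val_y = train_test_split(train_x, train_y, test_size = 0.2, random_state = 42)\n", "es_dtrain = lgb.Dataset(tr_x, label = tr_y)\n", "es_dvalid = lgb.Dataset(val_x, label = val_y, reference = es_dtrain)\n", "es_param = {'max_depth': 10, 'learning_rate': 0.01, 'objective': 'regression', 'metric': 'rmse'}\n", "es_model = lgb.train(es_param, es_dtrain, num_boost_round = 2000, valid_sets = [es_dvalid],\n", "                     callbacks = [lgb.early_stopping(stopping_rounds = 50)])\n", "print(sqrt(mean_squared_error(es_model.predict(test_x), test_y)))" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "9543\n", "[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).\n", "[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).\n", "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000886 seconds.\n", "You can set `force_row_wise=true` to remove the overhead.\n", "And if memory is not enough, you can set `force_col_wise=true`.\n", "[LightGBM] [Info] Total Bins 229\n", "[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8\n", "[LightGBM] [Info] Start training from score 538982.435984\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/Users/minseok/opt/anaconda3/lib/python3.8/site-packages/lightgbm/engine.py:151: UserWarning: Found `n_estimators` in params. Will use it instead of argument\n", "  warnings.warn(\"Found `{}` in params. 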
Will use it instead of argument\".format(alias))\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "9596\n", "[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).\n", "[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).\n", "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000733 seconds.\n", "You can set `force_row_wise=true` to remove the overhead.\n", "And if memory is not enough, you can set `force_col_wise=true`.\n", "[LightGBM] [Info] Total Bins 235\n", "[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8\n", "[LightGBM] [Info] Start training from score 537744.524754\n", "9530\n", "[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).\n", "[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).\n", "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000739 seconds.\n", "You can set `force_row_wise=true` to remove the overhead.\n", "And if memory is not enough, you can set `force_col_wise=true`.\n", "[LightGBM] [Info] Total Bins 236\n", "[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8\n", "[LightGBM] [Info] Start training from score 539889.654571\n", "9565\n", "[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).\n", "[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).\n", "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000966 seconds.\n", "You can set `force_row_wise=true` to remove the overhead.\n", "And if memory is not enough, you can set `force_col_wise=true`.\n", "[LightGBM] [Info] Total Bins 229\n", "[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8\n", "[LightGBM] [Info] Start training from score 535155.529843\n", "9560\n", "[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).\n", "[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).\n", "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000796 seconds.\n", "You can set `force_row_wise=true` to remove the overhead.\n", "And if memory is not enough, you can set `force_col_wise=true`.\n", "[LightGBM] [Info] Total Bins 233\n", "[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8\n", "[LightGBM] [Info] Start training from score 535273.912420\n", "9491\n", "[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).\n", "[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. 
(num_leaves=31).\n", "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000685 seconds.\n", "You can set `force_row_wise=true` to remove the overhead.\n", "And if memory is not enough, you can set `force_col_wise=true`.\n", "[LightGBM] [Info] Total Bins 231\n", "[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8\n", "[LightGBM] [Info] Start training from score 537581.127966\n", "9542\n", "[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).\n", "[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).\n", "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001022 seconds.\n", "You can set `force_row_wise=true` to remove the overhead.\n", "And if memory is not enough, you can set `force_col_wise=true`.\n", "[LightGBM] [Info] Total Bins 235\n", "[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8\n", "[LightGBM] [Info] Start training from score 539255.693106\n", "9535\n", "[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).\n", "[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).\n", "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000735 seconds.\n", "You can set `force_row_wise=true` to remove the overhead.\n", "And if memory is not enough, you can set `force_col_wise=true`.\n", "[LightGBM] [Info] Total Bins 235\n", "[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8\n", "[LightGBM] [Info] Start training from score 536829.666138\n", "9551\n", "[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).\n", "[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).\n", "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000727 seconds.\n", "You can set `force_row_wise=true` to remove the overhead.\n", "And if memory is not enough, you can set `force_col_wise=true`.\n", "[LightGBM] [Info] Total Bins 234\n", "[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8\n", "[LightGBM] [Info] Start training from score 540163.022606\n", "9539\n", "[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).\n", "[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. 
(num_leaves=31).\n", "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000740 seconds.\n", "You can set `force_row_wise=true` to remove the overhead.\n", "And if memory is not enough, you can set `force_col_wise=true`.\n", "[LightGBM] [Info] Total Bins 234\n", "[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8\n", "[LightGBM] [Info] Start training from score 534035.500297\n" ] } ], "source": [ "import random\n", "bagging_predict_result = [] # list to collect each model's test-set predictions\n", "for _ in range(10):\n", "    data_index = [data_index for data_index in range(train_x.shape[0])] # indices of the training rows\n", "    random_data_index = np.random.choice(data_index, train_x.shape[0]) # bootstrap sample: draw train_x.shape[0] indices with replacement\n", "    print(len(set(random_data_index))) # number of distinct rows in this bootstrap sample\n", "    lgb_dtrain = lgb.Dataset(data = train_x.iloc[random_data_index,], label = train_y.iloc[random_data_index,]) # convert the bootstrap sample into LightGBM's Dataset format\n", "    lgb_param = {'max_depth': 14, # tree depth\n", "                 'learning_rate': 0.01, # step size\n", "                 'n_estimators': 500, # number of trees\n", "                 'objective': 'regression'} # regression objective\n", "    lgb_model = lgb.train(params = lgb_param, train_set = lgb_dtrain) # train the model\n", "    predict1 = lgb_model.predict(test_x) # predict on the test data\n", "    bagging_predict_result.append(predict1) # store this model's predictions" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[array([501112.91616712, 676014.78617208, 968790.26447095, ...,\n", "        337320.99244146, 932608.91026449, 463099.00220264]),\n", " array([522860.14214257, 658589.440505  , 900829.44610988, ...,\n", "        343306.42246959, 818990.10366725, 461393.16910036]),\n", " array([525472.78175825, 658555.76576486, 948198.33071618, ...,\n", "        346584.48020485, 894716.05405641, 468379.68198623]),\n", " array([512302.18528008, 622401.23750153, 888062.40223491, ...,\n", "        338982.58114998, 831565.77278752, 450965.10476254]),\n", " array([502058.92315601, 662044.74513498, 932452.48613691, ...,\n", "        347367.42201304, 937286.20211666, 458730.39103716]),\n", " array([508134.49741987, 574676.69633686, 937651.53259611, ...,\n", "        342633.98851126, 895013.8530175 , 472839.70904548]),\n", " array([514024.84727545, 639033.55079825, 962081.56382748, ...,\n", "        334140.57592373, 977231.25212313, 456209.00250602]),\n", " array([493958.54727075, 621934.169044  , 974356.48628427, ...,\n", "        332475.83750523, 937747.27835049, 464330.59216067]),\n", " array([516574.02784263, 651575.32097768, 946510.81537713, ...,\n", "        339958.66410864, 938510.58408088, 454251.75749311]),\n", " array([486084.0251391 , 646353.70202063, 986406.40778855, ...,\n", "        346966.47909864, 976376.07245224, 455730.69620078])]" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "bagging_predict_result" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "# average the 10 bagged models' predictions for each test row\n", "bagging_predict = [] # final averaged predictions\n", "for lst2_index in range(test_x.shape[0]): # iterate over test rows\n", "    temp_predict = [] # the bagged models' predictions for this row\n", "    for lst_index in range(len(bagging_predict_result)): # iterate over the bagged models\n", "        temp_predict.append(bagging_predict_result[lst_index][lst2_index]) # collect each model's prediction for this row\n", "    bagging_predict.append(np.mean(temp_predict)) # append the mean of the 10 predictions for this row" ] },
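{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# A hedged sketch, not part of the original run: since bagging_predict_result is a list of\n", "# equal-length prediction arrays, the row-by-row loop above collapses to one NumPy call.\n", "bagging_predict_np = np.mean(bagging_predict_result, axis = 0) # column-wise mean over the 10 models\n", "print(\"RMSE: {}\".format(sqrt(mean_squared_error(bagging_predict_np, test_y))))" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {},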
"outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "RMSE: 211129.04999030154\n" ] } ], "source": [ "# 예측한 결과값들의 평균을 계산하여 실제 테스트 데이트의 타겟변수와 비교하여 성능 평가\n", "\n", "print(\"RMSE: {}\".format(sqrt(mean_squared_error(bagging_predict, test_y)))) # RMSE" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.2" } }, "nbformat": 4, "nbformat_minor": 4 }