{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os\n", "import pandas as pd\n", "import numpy as np\n", "from sklearn.model_selection import train_test_split" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idfeat_1feat_2feat_3feat_4feat_5feat_6feat_7feat_8feat_9...feat_85feat_86feat_87feat_88feat_89feat_90feat_91feat_92feat_93target
01100000000...100000000Class_1
12000000010...000000000Class_1
23000000010...000000000Class_1
34100161500...012000000Class_1
45000000000...100001000Class_1
\n", "

5 rows × 95 columns

\n", "
" ], "text/plain": [ " id feat_1 feat_2 feat_3 feat_4 feat_5 feat_6 feat_7 feat_8 feat_9 \\\n", "0 1 1 0 0 0 0 0 0 0 0 \n", "1 2 0 0 0 0 0 0 0 1 0 \n", "2 3 0 0 0 0 0 0 0 1 0 \n", "3 4 1 0 0 1 6 1 5 0 0 \n", "4 5 0 0 0 0 0 0 0 0 0 \n", "\n", " ... feat_85 feat_86 feat_87 feat_88 feat_89 feat_90 feat_91 \\\n", "0 ... 1 0 0 0 0 0 0 \n", "1 ... 0 0 0 0 0 0 0 \n", "2 ... 0 0 0 0 0 0 0 \n", "3 ... 0 1 2 0 0 0 0 \n", "4 ... 1 0 0 0 0 1 0 \n", "\n", " feat_92 feat_93 target \n", "0 0 0 Class_1 \n", "1 0 0 Class_1 \n", "2 0 0 Class_1 \n", "3 0 0 Class_1 \n", "4 0 0 Class_1 \n", "\n", "[5 rows x 95 columns]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = pd.read_csv('./Data/otto_train.csv')\n", "data.head()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "data = data.drop(['id'], axis=1)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "mapping_dict = {'Class_1' : 1,\n", " 'Class_2' : 2,\n", " 'Class_3' : 3,\n", " 'Class_4' : 4,\n", " 'Class_5' : 5,\n", " 'Class_6' : 6,\n", " 'Class_7' : 7,\n", " 'Class_8' : 8,\n", " 'Class_9' : 9}" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "after_mapping_target = data['target'].apply(lambda x : mapping_dict[x])" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "feature_columns = list(data.columns.difference(['target']))\n", "X = data[feature_columns]\n", "y = after_mapping_target\n", "\n", "train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.6771170006464124\n" ] } ], "source": [ "from sklearn.ensemble import AdaBoostClassifier\n", "from sklearn.tree import DecisionTreeClassifier\n", "from sklearn.metrics import accuracy_score\n", "\n", "clf = AdaBoostClassifier(n_estimators=100, random_state=1)\n", "clf.fit(train_x, train_y)\n", "pred1 = clf.predict(test_x)\n", "print(accuracy_score(test_y, pred1))" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.6085972850678733\n" ] } ], "source": [ "tree_model = DecisionTreeClassifier(max_depth=5)\n", "clf1 = AdaBoostClassifier(base_estimator=tree_model, n_estimators=100, random_state=0)\n", "clf1.fit(train_x, train_y)\n", "pred2 = clf1.predict(test_x)\n", "print(accuracy_score(test_y, pred2))" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.7427278603749192\n" ] } ], "source": [ "tree_model = DecisionTreeClassifier(max_depth=20)\n", "clf1 = AdaBoostClassifier(base_estimator=tree_model, n_estimators=10, random_state=0)\n", "clf1.fit(train_x, train_y)\n", "pred2 = clf1.predict(test_x)\n", "print(accuracy_score(test_y, pred2))" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[20:55:09] WARNING: /Users/runner/miniforge3/conda-bld/xgboost_1593723618214/work/src/learner.cc:480: \n", "Parameters: { n_estimators } might not be used.\n", "\n", " This may not be accurate due to some parameters are only used in language bindings but\n", " passed down to XGBoost core. Or some parameters are not used but slip through this\n", " verification. 
Please open an issue if you find above cases.\n", "\n", "\n", "Accuracy : 76.67 %\n", "Time : 10.01 seconds\n" ] } ], "source": [ "import xgboost as xgb\n", "import time\n", "\n", "start = time.time()\n", "xgb_dtrain = xgb.DMatrix(data = train_x, label = train_y) # convert the training data into XGBoost's DMatrix format\n", "xgb_dtest = xgb.DMatrix(data=test_x)\n", "xgb_param = {'max_depth' : 10, # tree depth\n", "             'learning_rate' : 0.01, # step size\n", "             'n_estimators' : 100, # ignored by xgb.train (hence the warning above); num_boost_round controls the number of rounds\n", "             'num_class' : len(set(train_y)) + 1} # labels are 1..9, so num_class must be 10\n", "\n", "xgb_model = xgb.train(params = xgb_param, dtrain=xgb_dtrain) # train the model\n", "xgb_model_predict = xgb_model.predict(xgb_dtest) # predict on the test data\n", "\n", "print('Accuracy : %.2f' %(accuracy_score(test_y, xgb_model_predict) * 100), '%')\n", "print('Time : %.2f' %(time.time() - start), 'seconds')" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([5., 3., 6., ..., 9., 2., 7.], dtype=float32)" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "xgb_model_predict # predicted class labels" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).\n", "[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).\n", "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021106 seconds.\n", "You can set `force_row_wise=true` to remove the overhead.\n", "And if memory is not enough, you can set `force_col_wise=true`.\n", "[LightGBM] [Info] Total Bins 3110\n", "[LightGBM] [Info] Number of data points in the train set: 49502, number of used features: 93\n", "[LightGBM] [Info] Start training from score -34.538776\n", "[LightGBM] [Info] Start training from score -3.476745\n", "[LightGBM] [Info] Start training from score -1.341381\n", "[LightGBM] [Info] Start training from score -2.039019\n", "[LightGBM] [Info] Start training from score -3.135151\n", "[LightGBM] [Info] Start training from score -3.125444\n", "[LightGBM] [Info] Start training from score -1.481556\n", "[LightGBM] [Info] Start training from score -3.074772\n", "[LightGBM] [Info] Start training from score -1.986562\n", "[LightGBM] [Info] Start training from score -2.533374\n", "Accuracy: 76.28 %\n", "Time: 8.22 seconds\n" ] } ], "source": [ "import lightgbm as lgb\n", "start = time.time() # record the start time\n", "lgb_dtrain = lgb.Dataset(data = train_x, label = train_y) # convert the training data into LightGBM's Dataset format\n", "lgb_param = {'max_depth': 10, # tree depth\n", "             'learning_rate': 0.01, # step size\n", "             'n_estimators': 100, # number of trees\n", "             'objective': 'multiclass', # objective function\n", "             'num_class': len(set(train_y)) + 1} # labels must lie in [0, num_class); ours are 1..9, so num_class must be one larger\n", "lgb_model = lgb.train(params = lgb_param, train_set = lgb_dtrain) # train the model\n", "lgb_model_predict = np.argmax(lgb_model.predict(test_x), axis = 1) # predict: take the class with the largest softmax output\n", "print(\"Accuracy: %.2f\" % (accuracy_score(test_y, lgb_model_predict) * 100), \"%\") # accuracy in percent\n", "print(\"Time: %.2f\" % (time.time() - start), \"seconds\") # elapsed time" ] },
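{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# A hedged sketch, not part of the original run: LightGBM expects multiclass labels in\n", "# [0, num_class), so the cell above reserves an unused class 0 by setting num_class = 10.\n", "# Shifting the labels to 0..8 avoids the dummy class; the *0 variable names are illustrative.\n", "lgb_dtrain0 = lgb.Dataset(data = train_x, label = train_y - 1) # labels 1..9 -> 0..8\n", "lgb_param0 = dict(lgb_param, num_class = 9) # same parameters, no wasted class slot\n", "lgb_model0 = lgb.train(params = lgb_param0, train_set = lgb_dtrain0)\n", "pred0 = np.argmax(lgb_model0.predict(test_x), axis = 1) + 1 # shift back to 1..9\n", "print(\"Accuracy: %.2f\" % (accuracy_score(test_y, pred0) * 100), \"%\")" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[1.01734061e-15, 2.25081693e-02, 3.62193933e-01, ...,\n", "        3.24234521e-02, 5.82126692e-02, 3.67722414e-02],\n", "       [1.14084116e-15, 5.36978636e-02, 1.90687128e-01, ...,\n", "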
3.25081119e-01, 9.38028846e-02, 6.50463131e-02],\n", " [5.94595781e-16, 9.66842220e-03, 5.82817482e-02, ...,\n", " 1.42318289e-02, 3.40230275e-02, 2.14919364e-02],\n", " ...,\n", " [7.09105769e-16, 4.63740004e-02, 1.08297559e-01, ...,\n", " 5.46934960e-02, 7.24513712e-02, 5.74635996e-01],\n", " [9.88127136e-16, 1.54895684e-02, 5.45515599e-01, ...,\n", " 2.45870954e-02, 5.65410617e-02, 3.62344513e-02],\n", " [7.59617500e-16, 1.49480877e-02, 7.44570300e-02, ...,\n", " 5.76695793e-01, 1.43227106e-01, 2.74567219e-02]])" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lgb_model.predict(test_x)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0:\tlearn: 0.5907034\ttotal: 390ms\tremaining: 38.6s\n", "1:\tlearn: 0.6356107\ttotal: 755ms\tremaining: 37s\n", "2:\tlearn: 0.6411256\ttotal: 1.12s\tremaining: 36.3s\n", "3:\tlearn: 0.6480344\ttotal: 1.49s\tremaining: 35.7s\n", "4:\tlearn: 0.6508222\ttotal: 1.86s\tremaining: 35.4s\n", "5:\tlearn: 0.6499939\ttotal: 2.24s\tremaining: 35.1s\n", "6:\tlearn: 0.6507818\ttotal: 2.61s\tremaining: 34.7s\n", "7:\tlearn: 0.6548422\ttotal: 2.98s\tremaining: 34.3s\n", "8:\tlearn: 0.6559533\ttotal: 3.35s\tremaining: 33.9s\n", "9:\tlearn: 0.6560947\ttotal: 3.71s\tremaining: 33.4s\n", "10:\tlearn: 0.6568421\ttotal: 4.07s\tremaining: 32.9s\n", "11:\tlearn: 0.6588219\ttotal: 4.45s\tremaining: 32.6s\n", "12:\tlearn: 0.6592259\ttotal: 4.83s\tremaining: 32.3s\n", "13:\tlearn: 0.6611248\ttotal: 5.23s\tremaining: 32.1s\n", "14:\tlearn: 0.6625591\ttotal: 5.6s\tremaining: 31.7s\n", "15:\tlearn: 0.6631853\ttotal: 5.96s\tremaining: 31.3s\n", "16:\tlearn: 0.6639328\ttotal: 6.32s\tremaining: 30.9s\n", "17:\tlearn: 0.6668821\ttotal: 6.71s\tremaining: 30.5s\n", "18:\tlearn: 0.6669630\ttotal: 7.09s\tremaining: 30.2s\n", "19:\tlearn: 0.6675286\ttotal: 7.46s\tremaining: 29.8s\n", "20:\tlearn: 0.6673266\ttotal: 7.83s\tremaining: 29.4s\n", "21:\tlearn: 0.6677104\ttotal: 8.2s\tremaining: 29.1s\n", "22:\tlearn: 0.6682558\ttotal: 8.57s\tremaining: 28.7s\n", "23:\tlearn: 0.6683972\ttotal: 8.93s\tremaining: 28.3s\n", "24:\tlearn: 0.6686599\ttotal: 9.3s\tremaining: 27.9s\n", "25:\tlearn: 0.6681952\ttotal: 9.67s\tremaining: 27.5s\n", "26:\tlearn: 0.6684982\ttotal: 10s\tremaining: 27.1s\n", "27:\tlearn: 0.6692053\ttotal: 10.4s\tremaining: 26.8s\n", "28:\tlearn: 0.6696699\ttotal: 10.8s\tremaining: 26.5s\n", "29:\tlearn: 0.6699325\ttotal: 11.2s\tremaining: 26.2s\n", "30:\tlearn: 0.6705992\ttotal: 11.6s\tremaining: 25.8s\n", "31:\tlearn: 0.6709426\ttotal: 12s\tremaining: 25.4s\n", "32:\tlearn: 0.6708012\ttotal: 12.3s\tremaining: 25.1s\n", "33:\tlearn: 0.6709426\ttotal: 12.7s\tremaining: 24.7s\n", "34:\tlearn: 0.6707002\ttotal: 13.1s\tremaining: 24.3s\n", "35:\tlearn: 0.6715082\ttotal: 13.5s\tremaining: 24s\n", "36:\tlearn: 0.6705992\ttotal: 13.9s\tremaining: 23.6s\n", "37:\tlearn: 0.6725991\ttotal: 14.3s\tremaining: 23.3s\n", "38:\tlearn: 0.6729829\ttotal: 14.6s\tremaining: 22.9s\n", "39:\tlearn: 0.6725991\ttotal: 15s\tremaining: 22.5s\n", "40:\tlearn: 0.6734273\ttotal: 15.4s\tremaining: 22.2s\n", "41:\tlearn: 0.6738314\ttotal: 15.8s\tremaining: 21.8s\n", "42:\tlearn: 0.6741546\ttotal: 16.2s\tremaining: 21.5s\n", "43:\tlearn: 0.6739728\ttotal: 16.6s\tremaining: 21.1s\n", "44:\tlearn: 0.6741950\ttotal: 17s\tremaining: 20.7s\n", "45:\tlearn: 0.6750636\ttotal: 17.4s\tremaining: 20.4s\n", "46:\tlearn: 0.6758919\ttotal: 17.8s\tremaining: 20s\n", "47:\tlearn: 0.6757707\ttotal: 
18.1s\tremaining: 19.6s\n", "48:\tlearn: 0.6762151\ttotal: 18.5s\tremaining: 19.3s\n", "49:\tlearn: 0.6774474\ttotal: 18.9s\tremaining: 18.9s\n", "50:\tlearn: 0.6777100\ttotal: 19.3s\tremaining: 18.6s\n", "51:\tlearn: 0.6786594\ttotal: 19.7s\tremaining: 18.2s\n", "52:\tlearn: 0.6789827\ttotal: 20.1s\tremaining: 17.8s\n", "53:\tlearn: 0.6804372\ttotal: 20.6s\tremaining: 17.5s\n", "54:\tlearn: 0.6804372\ttotal: 20.9s\tremaining: 17.1s\n", "55:\tlearn: 0.6809220\ttotal: 21.3s\tremaining: 16.8s\n", "56:\tlearn: 0.6812250\ttotal: 21.7s\tremaining: 16.4s\n", "57:\tlearn: 0.6813058\ttotal: 22.2s\tremaining: 16s\n", "58:\tlearn: 0.6811846\ttotal: 22.6s\tremaining: 15.7s\n", "59:\tlearn: 0.6813260\ttotal: 23s\tremaining: 15.3s\n", "60:\tlearn: 0.6816694\ttotal: 23.4s\tremaining: 14.9s\n", "61:\tlearn: 0.6823159\ttotal: 23.8s\tremaining: 14.6s\n", "62:\tlearn: 0.6832653\ttotal: 24.2s\tremaining: 14.2s\n", "63:\tlearn: 0.6840734\ttotal: 24.6s\tremaining: 13.8s\n", "64:\tlearn: 0.6840734\ttotal: 25s\tremaining: 13.5s\n", "65:\tlearn: 0.6846592\ttotal: 25.4s\tremaining: 13.1s\n", "66:\tlearn: 0.6843360\ttotal: 25.8s\tremaining: 12.7s\n", "67:\tlearn: 0.6846390\ttotal: 26.2s\tremaining: 12.3s\n", "68:\tlearn: 0.6854269\ttotal: 26.7s\tremaining: 12s\n", "69:\tlearn: 0.6858309\ttotal: 27.1s\tremaining: 11.6s\n", "70:\tlearn: 0.6858309\ttotal: 27.5s\tremaining: 11.2s\n", "71:\tlearn: 0.6865783\ttotal: 28s\tremaining: 10.9s\n", "72:\tlearn: 0.6864167\ttotal: 28.4s\tremaining: 10.5s\n", "73:\tlearn: 0.6868611\ttotal: 28.8s\tremaining: 10.1s\n", "74:\tlearn: 0.6869217\ttotal: 29.3s\tremaining: 9.75s\n", "75:\tlearn: 0.6870429\ttotal: 29.6s\tremaining: 9.36s\n", "76:\tlearn: 0.6875278\ttotal: 30s\tremaining: 8.97s\n", "77:\tlearn: 0.6881136\ttotal: 30.5s\tremaining: 8.59s\n", "78:\tlearn: 0.6883762\ttotal: 30.9s\tremaining: 8.2s\n", "79:\tlearn: 0.6888207\ttotal: 31.3s\tremaining: 7.82s\n", "80:\tlearn: 0.6892449\ttotal: 31.7s\tremaining: 7.43s\n", "81:\tlearn: 0.6898509\ttotal: 32.1s\tremaining: 7.05s\n", "82:\tlearn: 0.6897095\ttotal: 32.5s\tremaining: 6.67s\n", "83:\tlearn: 0.6902549\ttotal: 33s\tremaining: 6.28s\n", "84:\tlearn: 0.6909822\ttotal: 33.4s\tremaining: 5.9s\n", "85:\tlearn: 0.6910832\ttotal: 33.8s\tremaining: 5.51s\n", "86:\tlearn: 0.6914468\ttotal: 34.2s\tremaining: 5.12s\n", "87:\tlearn: 0.6916084\ttotal: 34.7s\tremaining: 4.73s\n", "88:\tlearn: 0.6919922\ttotal: 35.1s\tremaining: 4.34s\n", "89:\tlearn: 0.6925579\ttotal: 35.5s\tremaining: 3.94s\n", "90:\tlearn: 0.6928407\ttotal: 35.9s\tremaining: 3.55s\n", "91:\tlearn: 0.6930427\ttotal: 36.4s\tremaining: 3.16s\n", "92:\tlearn: 0.6935073\ttotal: 36.8s\tremaining: 2.77s\n", "93:\tlearn: 0.6940932\ttotal: 37.2s\tremaining: 2.37s\n", "94:\tlearn: 0.6944972\ttotal: 37.6s\tremaining: 1.98s\n", "95:\tlearn: 0.6948810\ttotal: 38s\tremaining: 1.58s\n", "96:\tlearn: 0.6951840\ttotal: 38.4s\tremaining: 1.19s\n", "97:\tlearn: 0.6954264\ttotal: 38.8s\tremaining: 793ms\n", "98:\tlearn: 0.6955881\ttotal: 39.2s\tremaining: 396ms\n", "99:\tlearn: 0.6956285\ttotal: 39.6s\tremaining: 0us\n", "Accuracy: 69.64 %\n", "Time: 39.81 seconds\n" ] } ], "source": [ "import catboost as cb\n", "start = time.time() # record the start time\n", "cb_dtrain = cb.Pool(data = train_x, label = train_y) # convert the training data into CatBoost's Pool format\n", "cb_param = {'max_depth': 10, # tree depth\n", "            'learning_rate': 0.01, # step size\n", "            'n_estimators': 100, # number of trees\n", "            'eval_metric': 'Accuracy', # evaluation metric\n", "            'loss_function': 'MultiClass'} # loss (objective) function\n", "cb_model = cb.train(pool = cb_dtrain, params = cb_param) # train the model\n", "cb_model_predict = np.argmax(cb_model.predict(test_x), axis = 1) + 1 # predict: argmax over the raw class scores; +1 maps the 0-based index back to labels 1..9\n", "print(\"Accuracy: %.2f\" % (accuracy_score(test_y, cb_model_predict) * 100), \"%\") # accuracy in percent\n", "print(\"Time: %.2f\" % (time.time() - start), \"seconds\") # elapsed time" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[-0.35426047,  1.22109587,  0.44230101, ..., -0.1698448 ,\n", "        -0.02059177, -0.2130643 ],\n", "       [-0.07235138,  0.42535181,  0.20060428, ...,  0.21863604,\n", "         0.2719157 ,  0.25089315],\n", "       [-0.3315885 , -0.31862353, -0.31279765, ..., -0.29798357,\n", "        -0.24018767, -0.32984969],\n", "       ...,\n", "       [ 0.05304325,  0.02500267, -0.14752573, ..., -0.20741963,\n", "         0.12789417,  1.51166757],\n", "       [-0.55093666,  1.7691278 ,  0.99746884, ..., -0.3420542 ,\n", "        -0.49799871, -0.38136323],\n", "       [-0.3033724 ,  0.09352675, -0.11808658, ...,  0.65825036,\n", "         1.05515787, -0.20799899]])" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cb_model.predict(test_x) # raw per-class scores, not probabilities" ] },
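{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# A hedged sketch, not part of the original run: the same model through CatBoost's\n", "# scikit-learn-style wrapper. CatBoostClassifier.predict returns the original class\n", "# labels directly, so the argmax/+1 step above is not needed; verbose=False silences\n", "# the per-iteration log.\n", "from catboost import CatBoostClassifier\n", "\n", "cb_clf = CatBoostClassifier(max_depth = 10, learning_rate = 0.01, n_estimators = 100,\n", "                            loss_function = 'MultiClass', eval_metric = 'Accuracy', verbose = False)\n", "cb_clf.fit(train_x, train_y)\n", "cb_pred = cb_clf.predict(test_x).ravel() # predict() returns shape (n, 1)\n", "print(\"Accuracy: %.2f\" % (accuracy_score(test_y, cb_pred) * 100), \"%\")" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": {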
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
iddatepricebedroomsbathroomsfloorswaterfrontconditiongradeyr_builtyr_renovatedzipcodelatlong
0712930052020141013T000000221900.031.001.0037195509817847.5112-122.257
1641410019220141209T000000538000.032.252.0037195119919812547.7210-122.319
2563150040020150225T000000180000.021.001.0036193309802847.7379-122.233
3248720087520141209T000000604000.043.001.0057196509813647.5208-122.393
4195440051020150218T000000510000.032.001.0038198709807447.6168-122.045
\n", "
" ], "text/plain": [ " id date price bedrooms bathrooms floors \\\n", "0 7129300520 20141013T000000 221900.0 3 1.00 1.0 \n", "1 6414100192 20141209T000000 538000.0 3 2.25 2.0 \n", "2 5631500400 20150225T000000 180000.0 2 1.00 1.0 \n", "3 2487200875 20141209T000000 604000.0 4 3.00 1.0 \n", "4 1954400510 20150218T000000 510000.0 3 2.00 1.0 \n", "\n", " waterfront condition grade yr_built yr_renovated zipcode lat \\\n", "0 0 3 7 1955 0 98178 47.5112 \n", "1 0 3 7 1951 1991 98125 47.7210 \n", "2 0 3 6 1933 0 98028 47.7379 \n", "3 0 5 7 1965 0 98136 47.5208 \n", "4 0 3 8 1987 0 98074 47.6168 \n", "\n", " long \n", "0 -122.257 \n", "1 -122.319 \n", "2 -122.233 \n", "3 -122.393 \n", "4 -122.045 " ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 데이터 불러오기\n", "data = pd.read_csv(\"./data/kc_house_data.csv\") \n", "data.head() # 데이터 확인" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "data = data.drop(['id', 'date', 'zipcode', 'lat', 'long'], axis = 1) # id, date, zipcode, lat, long 제거" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(15129, 8) (6484, 8) (15129,) (6484,)\n" ] } ], "source": [ "feature_columns = list(data.columns.difference(['price'])) # Price를 제외한 모든 행\n", "X = data[feature_columns]\n", "y = data['price']\n", "train_x, test_x, train_y, test_y = train_test_split(X, y, test_size = 0.3, random_state = 42) # 학습데이터와 평가데이터의 비율을 7:3\n", "print(train_x.shape, test_x.shape, train_y.shape, test_y.shape) # 데이터 개수 확인" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/minseok/opt/anaconda3/lib/python3.8/site-packages/lightgbm/engine.py:151: UserWarning: Found `n_estimators` in params. Will use it instead of argument\n", " warnings.warn(\"Found `{}` in params. Will use it instead of argument\".format(alias))\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).\n", "[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. 
(num_leaves=31).\n", "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001018 seconds.\n", "You can set `force_row_wise=true` to remove the overhead.\n", "And if memory is not enough, you can set `force_col_wise=true`.\n", "[LightGBM] [Info] Total Bins 237\n", "[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8\n", "[LightGBM] [Info] Start training from score 537729.263666\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n" ] } ], "source": [ "import lightgbm as lgb\n", "start = time.time() # record the start time\n", "lgb_dtrain = lgb.Dataset(data = train_x, label = train_y) # convert the training data into LightGBM's Dataset format\n", "lgb_param = {'max_depth': 10, # tree depth\n", "             'learning_rate': 0.01, # step size\n", "             'n_estimators': 500, # number of trees; lgb.train reads this from params, hence the UserWarning above\n", "             'objective': 'regression'} # regression objective\n", "lgb_model = lgb.train(params = lgb_param, train_set = lgb_dtrain) # train the model" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "210904.17249451784" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.metrics import mean_squared_error, r2_score\n", "from math import sqrt\n", "\n", "sqrt(mean_squared_error(lgb_model.predict(test_x),test_y)) # RMSE of a single model on the test set" ] },
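{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# A hedged sketch, not part of the original run: rather than fixing 500 trees up front,\n", "# hold out part of the training data and let early stopping choose the number of rounds.\n", "# lgb.early_stopping is assumed to be available (LightGBM 3.x callbacks API); older\n", "# versions pass early_stopping_rounds to lgb.train instead.\n", "tr_x, val_x, tr_y, val_y = train_test_split(train_x, train_y, test_size = 0.2, random_state = 42)\n", "es_dtrain = lgb.Dataset(tr_x, label = tr_y)\n", "es_dvalid = lgb.Dataset(val_x, label = val_y, reference = es_dtrain)\n", "es_param = {'max_depth': 10, 'learning_rate': 0.01, 'objective': 'regression', 'metric': 'rmse'}\n", "es_model = lgb.train(es_param, es_dtrain, num_boost_round = 2000, valid_sets = [es_dvalid],\n", "                     callbacks = [lgb.early_stopping(stopping_rounds = 50)])\n", "print(sqrt(mean_squared_error(es_model.predict(test_x), test_y)))" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "9543\n", "[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).\n", "[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).\n", "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000886 seconds.\n", "You can set `force_row_wise=true` to remove the overhead.\n", "And if memory is not enough, you can set `force_col_wise=true`.\n", "[LightGBM] [Info] Total Bins 229\n", "[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8\n", "[LightGBM] [Info] Start training from score 538982.435984\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/Users/minseok/opt/anaconda3/lib/python3.8/site-packages/lightgbm/engine.py:151: UserWarning: Found `n_estimators` in params. Will use it instead of argument\n", "  warnings.warn(\"Found `{}` in params. 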
Will use it instead of argument\".format(alias))\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "9596\n", "[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).\n", "[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).\n", "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000733 seconds.\n", "You can set `force_row_wise=true` to remove the overhead.\n", "And if memory is not enough, you can set `force_col_wise=true`.\n", "[LightGBM] [Info] Total Bins 235\n", "[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8\n", "[LightGBM] [Info] Start training from score 537744.524754\n", "9530\n", "[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).\n", "[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).\n", "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000739 seconds.\n", "You can set `force_row_wise=true` to remove the overhead.\n", "And if memory is not enough, you can set `force_col_wise=true`.\n", "[LightGBM] [Info] Total Bins 236\n", "[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8\n", "[LightGBM] [Info] Start training from score 539889.654571\n", "9565\n", "[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).\n", "[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).\n", "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000966 seconds.\n", "You can set `force_row_wise=true` to remove the overhead.\n", "And if memory is not enough, you can set `force_col_wise=true`.\n", "[LightGBM] [Info] Total Bins 229\n", "[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8\n", "[LightGBM] [Info] Start training from score 535155.529843\n", "9560\n", "[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).\n", "[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).\n", "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000796 seconds.\n", "You can set `force_row_wise=true` to remove the overhead.\n", "And if memory is not enough, you can set `force_col_wise=true`.\n", "[LightGBM] [Info] Total Bins 233\n", "[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8\n", "[LightGBM] [Info] Start training from score 535273.912420\n", "9491\n", "[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).\n", "[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. 
(num_leaves=31).\n", "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000685 seconds.\n", "You can set `force_row_wise=true` to remove the overhead.\n", "And if memory is not enough, you can set `force_col_wise=true`.\n", "[LightGBM] [Info] Total Bins 231\n", "[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8\n", "[LightGBM] [Info] Start training from score 537581.127966\n", "9542\n", "[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).\n", "[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).\n", "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001022 seconds.\n", "You can set `force_row_wise=true` to remove the overhead.\n", "And if memory is not enough, you can set `force_col_wise=true`.\n", "[LightGBM] [Info] Total Bins 235\n", "[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8\n", "[LightGBM] [Info] Start training from score 539255.693106\n", "9535\n", "[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).\n", "[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).\n", "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000735 seconds.\n", "You can set `force_row_wise=true` to remove the overhead.\n", "And if memory is not enough, you can set `force_col_wise=true`.\n", "[LightGBM] [Info] Total Bins 235\n", "[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8\n", "[LightGBM] [Info] Start training from score 536829.666138\n", "9551\n", "[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).\n", "[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).\n", "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000727 seconds.\n", "You can set `force_row_wise=true` to remove the overhead.\n", "And if memory is not enough, you can set `force_col_wise=true`.\n", "[LightGBM] [Info] Total Bins 234\n", "[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8\n", "[LightGBM] [Info] Start training from score 540163.022606\n", "9539\n", "[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).\n", "[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. 
(num_leaves=31).\n", "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000740 seconds.\n", "You can set `force_row_wise=true` to remove the overhead.\n", "And if memory is not enough, you can set `force_col_wise=true`.\n", "[LightGBM] [Info] Total Bins 234\n", "[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8\n", "[LightGBM] [Info] Start training from score 534035.500297\n" ] } ], "source": [ "import random\n", "bagging_predict_result = [] # list to collect each model's test-set predictions\n", "for _ in range(10):\n", "    data_index = [data_index for data_index in range(train_x.shape[0])] # indices of the training rows\n", "    random_data_index = np.random.choice(data_index, train_x.shape[0]) # bootstrap sample: draw train_x.shape[0] indices with replacement\n", "    print(len(set(random_data_index))) # number of distinct rows in this bootstrap sample\n", "    lgb_dtrain = lgb.Dataset(data = train_x.iloc[random_data_index,], label = train_y.iloc[random_data_index,]) # convert the bootstrap sample into LightGBM's Dataset format\n", "    lgb_param = {'max_depth': 14, # tree depth\n", "                 'learning_rate': 0.01, # step size\n", "                 'n_estimators': 500, # number of trees\n", "                 'objective': 'regression'} # regression objective\n", "    lgb_model = lgb.train(params = lgb_param, train_set = lgb_dtrain) # train the model\n", "    predict1 = lgb_model.predict(test_x) # predict on the test data\n", "    bagging_predict_result.append(predict1) # store this model's predictions" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[array([501112.91616712, 676014.78617208, 968790.26447095, ...,\n", "        337320.99244146, 932608.91026449, 463099.00220264]),\n", " array([522860.14214257, 658589.440505  , 900829.44610988, ...,\n", "        343306.42246959, 818990.10366725, 461393.16910036]),\n", " array([525472.78175825, 658555.76576486, 948198.33071618, ...,\n", "        346584.48020485, 894716.05405641, 468379.68198623]),\n", " array([512302.18528008, 622401.23750153, 888062.40223491, ...,\n", "        338982.58114998, 831565.77278752, 450965.10476254]),\n", " array([502058.92315601, 662044.74513498, 932452.48613691, ...,\n", "        347367.42201304, 937286.20211666, 458730.39103716]),\n", " array([508134.49741987, 574676.69633686, 937651.53259611, ...,\n", "        342633.98851126, 895013.8530175 , 472839.70904548]),\n", " array([514024.84727545, 639033.55079825, 962081.56382748, ...,\n", "        334140.57592373, 977231.25212313, 456209.00250602]),\n", " array([493958.54727075, 621934.169044  , 974356.48628427, ...,\n", "        332475.83750523, 937747.27835049, 464330.59216067]),\n", " array([516574.02784263, 651575.32097768, 946510.81537713, ...,\n", "        339958.66410864, 938510.58408088, 454251.75749311]),\n", " array([486084.0251391 , 646353.70202063, 986406.40778855, ...,\n", "        346966.47909864, 976376.07245224, 455730.69620078])]" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "bagging_predict_result" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "# average the 10 bagged models' predictions for each test row\n", "bagging_predict = [] # final averaged predictions\n", "for lst2_index in range(test_x.shape[0]): # iterate over test rows\n", "    temp_predict = [] # the bagged models' predictions for this row\n", "    for lst_index in range(len(bagging_predict_result)): # iterate over the bagged models\n", "        temp_predict.append(bagging_predict_result[lst_index][lst2_index]) # collect each model's prediction for this row\n", "    bagging_predict.append(np.mean(temp_predict)) # append the mean of the 10 predictions for this row" ] },
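{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# A hedged sketch, not part of the original run: since bagging_predict_result is a list of\n", "# equal-length prediction arrays, the row-by-row loop above collapses to one NumPy call.\n", "bagging_predict_np = np.mean(bagging_predict_result, axis = 0) # column-wise mean over the 10 models\n", "print(\"RMSE: {}\".format(sqrt(mean_squared_error(bagging_predict_np, test_y))))" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {},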
"outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "RMSE: 211129.04999030154\n" ] } ], "source": [ "# 예측한 결과값들의 평균을 계산하여 실제 테스트 데이트의 타겟변수와 비교하여 성능 평가\n", "\n", "print(\"RMSE: {}\".format(sqrt(mean_squared_error(bagging_predict, test_y)))) # RMSE" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.2" } }, "nbformat": 4, "nbformat_minor": 4 }