{ "cells": [ { "cell_type": "markdown", "id": "d95fae49", "metadata": {}, "source": [ "## CatBoost 인코딩 조건\n", "\n", "- 문자형 변수(train 변수 중 하나)와 수치형 변수(y)가 필요하다.\n", "- ex) `color`와 `grade`\n", "\n", "## CatBoost 계산식\n", "\n", "`(TargetSum + Prior) / (FeatureCount + 1)`\n", "\n", "ex) `color`가 `red`인 경우\n", "\n", "- `color`가 `red`인 행의 `grade` 값은 1, 3, 1 → TargetSum = 1 + 3 + 1 = 5\n", "- `red`의 개수는 3개 → FeatureCount = 3\n", "- Prior(= y 전체의 평균) = 25 / 10 = 2.5\n", "\n", "따라서 `red`는 (5 + 2.5) / (3 + 1) = 1.875로 변환된다.\n", "\n", "참고: 분모의 1은 `CatBoostEncoder`의 smoothing 파라미터 `a`의 기본값이며, 이 식은 y 없이 `transform`을 호출할 때 적용된다.\n" ] }, { "cell_type": "code", "execution_count": 1, "id": "0782f9a8", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
colorinterestsheight
0redsketching68
1bluepainting64
2blueinstruments87
3greensketching45
4redpainting54
5redvideo games64
6blackpainting67
7blackinstruments98
8bluesketching90
9greensketching87
\n", "
" ], "text/plain": [ " color interests height\n", "0 red sketching 68\n", "1 blue painting 64\n", "2 blue instruments 87\n", "3 green sketching 45\n", "4 red painting 54\n", "5 red video games 64\n", "6 black painting 67\n", "7 black instruments 98\n", "8 blue sketching 90\n", "9 green sketching 87" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# import libraries\n", "import category_encoders as ce\n", "import pandas as pd\n", "\n", "# Make dataset\n", "train = pd.DataFrame({\n", " 'color': [\"red\", \"blue\", \"blue\", \"green\", \"red\", \"red\", \"black\", \"black\", \"blue\", \"green\"],\n", " 'interests': [\"sketching\", \"painting\", \"instruments\", \"sketching\", \"painting\", \"video games\", \n", " \"painting\", \"instruments\", \"sketching\", \"sketching\"],\n", " 'height': [68, 64, 87, 45, 54, 64, 67, 98, 90, 87],\n", " 'grade': [1, 2, 3, 2, 3, 1, 4, 4, 2, 3], \n", " })\n", "\n", "# Define train and target\n", "target = train[['grade']]\n", "train = train.drop('grade', axis = 1)\n", "train" ] }, { "cell_type": "code", "execution_count": 2, "id": "94260dc7", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
colorinterestsheight
01.8752.10000068
12.3752.87500064
22.3753.16666787
32.5002.10000045
41.8752.87500054
51.8752.50000064
63.5002.87500067
73.5003.16666798
82.3752.10000090
92.5002.10000087
\n", "
" ], "text/plain": [ " color interests height\n", "0 1.875 2.100000 68\n", "1 2.375 2.875000 64\n", "2 2.375 3.166667 87\n", "3 2.500 2.100000 45\n", "4 1.875 2.875000 54\n", "5 1.875 2.500000 64\n", "6 3.500 2.875000 67\n", "7 3.500 3.166667 98\n", "8 2.375 2.100000 90\n", "9 2.500 2.100000 87" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Define catboost encoder\n", "cbe_encoder = ce.cat_boost.CatBoostEncoder()\n", "\n", "# Fit encoder and transform the features\n", "cbe_encoder.fit(train, target)\n", "train_cbe = cbe_encoder.transform(train)\n", "train_cbe" ] }, { "cell_type": "code", "execution_count": null, "id": "17ac413e", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.7" }, "varInspector": { "cols": { "lenName": 16, "lenType": 16, "lenVar": 40 }, "kernels_config": { "python": { "delete_cmd_postfix": "", "delete_cmd_prefix": "del ", "library": "var_list.py", "varRefreshCmd": "print(var_dic_list())" }, "r": { "delete_cmd_postfix": ") ", "delete_cmd_prefix": "rm(", "library": "var_list.r", "varRefreshCmd": "cat(var_dic_list()) " } }, "types_to_exclude": [ "module", "function", "builtin_function_or_method", "instance", "_Feature" ], "window_display": false } }, "nbformat": 4, "nbformat_minor": 5 }