{
"cells": [
{
"cell_type": "markdown",
"id": "d95fae49",
"metadata": {},
"source": [
"Catboost 조건\n",
"\n",
"문자형 변수(train 변수 중 하나)와 수치형 변수(y)\n",
"ex) color와 grade\n",
"\n",
"Catboost 계산식\n",
"(TargetSum + Prior) / (FeatureCount + 1)\n",
"ex) color의 red 위치는 grade의 1, 3, 1 위치이고 -> TargetSum=(1+3+1)=5\n",
"red의 개수는 3개 => FeatureCount=3\n",
"Prior(=y의 평균) = (25)/10 = 2.5\n",
"\n",
"따라서 red는 (5+2.5)/(3+1) = 1.875로 변환된다.\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "0782f9a8",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" color | \n",
" interests | \n",
" height | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" red | \n",
" sketching | \n",
" 68 | \n",
"
\n",
" \n",
" 1 | \n",
" blue | \n",
" painting | \n",
" 64 | \n",
"
\n",
" \n",
" 2 | \n",
" blue | \n",
" instruments | \n",
" 87 | \n",
"
\n",
" \n",
" 3 | \n",
" green | \n",
" sketching | \n",
" 45 | \n",
"
\n",
" \n",
" 4 | \n",
" red | \n",
" painting | \n",
" 54 | \n",
"
\n",
" \n",
" 5 | \n",
" red | \n",
" video games | \n",
" 64 | \n",
"
\n",
" \n",
" 6 | \n",
" black | \n",
" painting | \n",
" 67 | \n",
"
\n",
" \n",
" 7 | \n",
" black | \n",
" instruments | \n",
" 98 | \n",
"
\n",
" \n",
" 8 | \n",
" blue | \n",
" sketching | \n",
" 90 | \n",
"
\n",
" \n",
" 9 | \n",
" green | \n",
" sketching | \n",
" 87 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" color interests height\n",
"0 red sketching 68\n",
"1 blue painting 64\n",
"2 blue instruments 87\n",
"3 green sketching 45\n",
"4 red painting 54\n",
"5 red video games 64\n",
"6 black painting 67\n",
"7 black instruments 98\n",
"8 blue sketching 90\n",
"9 green sketching 87"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# import libraries\n",
"import category_encoders as ce\n",
"import pandas as pd\n",
"\n",
"# Make dataset\n",
"train = pd.DataFrame({\n",
" 'color': [\"red\", \"blue\", \"blue\", \"green\", \"red\", \"red\", \"black\", \"black\", \"blue\", \"green\"],\n",
" 'interests': [\"sketching\", \"painting\", \"instruments\", \"sketching\", \"painting\", \"video games\", \n",
" \"painting\", \"instruments\", \"sketching\", \"sketching\"],\n",
" 'height': [68, 64, 87, 45, 54, 64, 67, 98, 90, 87],\n",
" 'grade': [1, 2, 3, 2, 3, 1, 4, 4, 2, 3], \n",
" })\n",
"\n",
"# Define train and target\n",
"target = train[['grade']]\n",
"train = train.drop('grade', axis = 1)\n",
"train"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "94260dc7",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" color | \n",
" interests | \n",
" height | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1.875 | \n",
" 2.100000 | \n",
" 68 | \n",
"
\n",
" \n",
" 1 | \n",
" 2.375 | \n",
" 2.875000 | \n",
" 64 | \n",
"
\n",
" \n",
" 2 | \n",
" 2.375 | \n",
" 3.166667 | \n",
" 87 | \n",
"
\n",
" \n",
" 3 | \n",
" 2.500 | \n",
" 2.100000 | \n",
" 45 | \n",
"
\n",
" \n",
" 4 | \n",
" 1.875 | \n",
" 2.875000 | \n",
" 54 | \n",
"
\n",
" \n",
" 5 | \n",
" 1.875 | \n",
" 2.500000 | \n",
" 64 | \n",
"
\n",
" \n",
" 6 | \n",
" 3.500 | \n",
" 2.875000 | \n",
" 67 | \n",
"
\n",
" \n",
" 7 | \n",
" 3.500 | \n",
" 3.166667 | \n",
" 98 | \n",
"
\n",
" \n",
" 8 | \n",
" 2.375 | \n",
" 2.100000 | \n",
" 90 | \n",
"
\n",
" \n",
" 9 | \n",
" 2.500 | \n",
" 2.100000 | \n",
" 87 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" color interests height\n",
"0 1.875 2.100000 68\n",
"1 2.375 2.875000 64\n",
"2 2.375 3.166667 87\n",
"3 2.500 2.100000 45\n",
"4 1.875 2.875000 54\n",
"5 1.875 2.500000 64\n",
"6 3.500 2.875000 67\n",
"7 3.500 3.166667 98\n",
"8 2.375 2.100000 90\n",
"9 2.500 2.100000 87"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Define catboost encoder\n",
"cbe_encoder = ce.cat_boost.CatBoostEncoder()\n",
"\n",
"# Fit encoder and transform the features\n",
"cbe_encoder.fit(train, target)\n",
"train_cbe = cbe_encoder.transform(train)\n",
"train_cbe"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "17ac413e",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
},
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 5
}