{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 데이터 로딩, 확인\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "# feature 확인\n",
    "feature_name_df = pd.read_csv(\n",
    "    \"../data/human_activity/features.txt\"\n",
    "    , header = None\n",
    "    , sep = '\\s+' # white space => 공백, \\, \\t\n",
    "    , names = ['column_index', 'column_name']\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>column_index</th>\n",
       "      <th>column_name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>tBodyAcc-mean()-X</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>tBodyAcc-mean()-Y</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>tBodyAcc-mean()-Z</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>tBodyAcc-std()-X</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>tBodyAcc-std()-Y</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   column_index        column_name\n",
       "0             1  tBodyAcc-mean()-X\n",
       "1             2  tBodyAcc-mean()-Y\n",
       "2             3  tBodyAcc-mean()-Z\n",
       "3             4   tBodyAcc-std()-X\n",
       "4             5   tBodyAcc-std()-Y"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "feature_name_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "pandas.core.series.Series"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 컬럼명을 10개 추출\n",
    "type(feature_name_df.iloc[:,1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    tBodyAcc-mean()-X\n",
       "1    tBodyAcc-mean()-Y\n",
       "2    tBodyAcc-mean()-Z\n",
       "3     tBodyAcc-std()-X\n",
       "4     tBodyAcc-std()-Y\n",
       "5     tBodyAcc-std()-Z\n",
       "6     tBodyAcc-mad()-X\n",
       "7     tBodyAcc-mad()-Y\n",
       "8     tBodyAcc-mad()-Z\n",
       "9     tBodyAcc-max()-X\n",
       "Name: column_name, dtype: object"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "feature_name_df.iloc[:,1][:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['tBodyAcc-mean()-X', 'tBodyAcc-mean()-Y', 'tBodyAcc-mean()-Z',\n",
       "       'tBodyAcc-std()-X', 'tBodyAcc-std()-Y', 'tBodyAcc-std()-Z',\n",
       "       'tBodyAcc-mad()-X', 'tBodyAcc-mad()-Y', 'tBodyAcc-mad()-Z',\n",
       "       'tBodyAcc-max()-X'], dtype=object)"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "feature_name_df.iloc[:,1].values[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "column_index    42\n",
      "dtype: int64\n"
     ]
    }
   ],
   "source": [
    "# 중복 여부 확인\n",
    "feature_dup_df = feature_name_df.groupby('column_name').count()\n",
    "print(feature_dup_df[feature_dup_df['column_index']>1].count())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>column_index</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>column_name</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>fBodyAcc-bandsEnergy()-1,16</th>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>fBodyAcc-bandsEnergy()-1,24</th>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>fBodyAcc-bandsEnergy()-1,8</th>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>fBodyAcc-bandsEnergy()-17,24</th>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>fBodyAcc-bandsEnergy()-17,32</th>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                              column_index\n",
       "column_name                               \n",
       "fBodyAcc-bandsEnergy()-1,16              3\n",
       "fBodyAcc-bandsEnergy()-1,24              3\n",
       "fBodyAcc-bandsEnergy()-1,8               3\n",
       "fBodyAcc-bandsEnergy()-17,24             3\n",
       "fBodyAcc-bandsEnergy()-17,32             3"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "feature_dup_df[feature_dup_df['column_index']>1].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_new_feature_name_df(old_feature_name_df):\n",
    "    feature_dup_df = pd.DataFrame(\n",
    "          data = old_feature_name_df.groupby('column_name').cumcount()\n",
    "        , columns = ['dup_cnt']\n",
    "        )\n",
    "    feature_dup_df = feature_dup_df.reset_index()\n",
    "    new_feature_name_df = pd.merge(\n",
    "          old_feature_name_df.reset_index()\n",
    "        , feature_dup_df\n",
    "        , how = 'outer'\n",
    "        )\n",
    "    new_feature_name_df['column_name'] = \\\n",
    "        new_feature_name_df[['column_name', 'dup_cnt']].apply(lambda x : x[0]+'_'+str(x[1]) if x[1] > 0 else x[0], axis =1)\n",
    "    new_feature_name_df = new_feature_name_df.drop(['index'], axis=1)\n",
    "    return new_feature_name_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.9.13 ('ml-dev')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "8500c1516a682a4e8776c4457fab4ff1f1d739e261725eabc29b69f9c9445475"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}