# %%
# Data loading and inspection
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# %%
# Load the feature list: one row per feature, parsed into two columns
# named 'column_index' and 'column_name' (the file itself has no header row).
feature_name_df = pd.read_csv(
    "../data/human_activity/features.txt",
    header=None,
    # Raw string: a plain '\s+' is an invalid escape sequence and triggers a
    # DeprecationWarning / SyntaxWarning on recent Python versions.
    # The pattern splits fields on any run of whitespace (spaces, tabs).
    sep=r'\s+',
    names=['column_index', 'column_name'],
)
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
column_indexcolumn_name
01tBodyAcc-mean()-X
12tBodyAcc-mean()-Y
23tBodyAcc-mean()-Z
34tBodyAcc-std()-X
45tBodyAcc-std()-Y
\n", "
# %%
# Preview the first rows of the feature-name table.
feature_name_df.head()

# %%
# Selecting a single column by position yields a pandas Series.
type(feature_name_df.iloc[:, 1])

# %%
# First ten feature names, as a Series.
feature_name_df.iloc[:, 1].head(10)

# %%
# The same ten names, as a bare NumPy array.
feature_name_df.iloc[:, 1].values[:10]

# %%
# Duplicate check: count rows per feature name, then count how many names
# occur more than once.
feature_dup_df = feature_name_df.groupby('column_name').count()
has_duplicates = feature_dup_df['column_index'] > 1
print(feature_dup_df.loc[has_duplicates].count())
"outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
column_index
column_name
fBodyAcc-bandsEnergy()-1,163
fBodyAcc-bandsEnergy()-1,243
fBodyAcc-bandsEnergy()-1,83
fBodyAcc-bandsEnergy()-17,243
fBodyAcc-bandsEnergy()-17,323
\n", "
def get_new_feature_name_df(old_feature_name_df):
    """Return a copy of the feature-name table with duplicate names made unique.

    The first occurrence of each ``column_name`` is kept as is; every later
    occurrence gets ``_<n>`` appended, where ``n`` counts the earlier
    occurrences of that name (``'a', 'a', 'a'`` -> ``'a', 'a_1', 'a_2'``).

    Parameters
    ----------
    old_feature_name_df : pandas.DataFrame
        Table containing a string ``column_name`` column. Any other columns
        (e.g. ``column_index``) are carried over unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh ``0..n-1`` index, rows in input order, the
        de-duplicated ``column_name`` column and an additional integer
        ``dup_cnt`` column as the last column. The input frame is not modified.
    """
    # reset_index(drop=True) hands back a new frame, so the caller's data is
    # never mutated and the result always carries a clean positional index.
    new_feature_name_df = old_feature_name_df.reset_index(drop=True)

    # 0 for the first occurrence of a name, 1 for the second, and so on.
    dup_cnt = new_feature_name_df.groupby('column_name').cumcount()
    new_feature_name_df['dup_cnt'] = dup_cnt

    # Label-based, vectorised suffixing: avoids the row-wise apply and the
    # deprecated positional lookups (x[0] / x[1]) on a label-indexed Series.
    is_repeat = dup_cnt > 0
    new_feature_name_df.loc[is_repeat, 'column_name'] = (
        new_feature_name_df.loc[is_repeat, 'column_name']
        + '_'
        + dup_cnt[is_repeat].astype(str)
    )
    return new_feature_name_df