{
"cells": [
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"# Data loading and inspection\n",
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"# Inspect the features: load the feature list as (column_index, column_name) rows\n",
"feature_name_df = pd.read_csv(\n",
"    \"../data/human_activity/features.txt\"\n",
"    , header = None\n",
"    , sep = r'\\s+'  # raw-string regex (avoids the invalid '\\s' escape): split on runs of whitespace\n",
"    , names = ['column_index', 'column_name']\n",
"    )"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" column_index | \n",
" column_name | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" tBodyAcc-mean()-X | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" tBodyAcc-mean()-Y | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" tBodyAcc-mean()-Z | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
" tBodyAcc-std()-X | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
" tBodyAcc-std()-Y | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" column_index column_name\n",
"0 1 tBodyAcc-mean()-X\n",
"1 2 tBodyAcc-mean()-Y\n",
"2 3 tBodyAcc-mean()-Z\n",
"3 4 tBodyAcc-std()-X\n",
"4 5 tBodyAcc-std()-Y"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the first five rows of the feature table\n",
"feature_name_df.head(5)"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"pandas.core.series.Series"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Before extracting the first 10 column names, check what selecting the name column returns\n",
"type(feature_name_df['column_name'])"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 tBodyAcc-mean()-X\n",
"1 tBodyAcc-mean()-Y\n",
"2 tBodyAcc-mean()-Z\n",
"3 tBodyAcc-std()-X\n",
"4 tBodyAcc-std()-Y\n",
"5 tBodyAcc-std()-Z\n",
"6 tBodyAcc-mad()-X\n",
"7 tBodyAcc-mad()-Y\n",
"8 tBodyAcc-mad()-Z\n",
"9 tBodyAcc-max()-X\n",
"Name: column_name, dtype: object"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# First 10 feature names, kept as a Series\n",
"feature_name_df['column_name'].head(10)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['tBodyAcc-mean()-X', 'tBodyAcc-mean()-Y', 'tBodyAcc-mean()-Z',\n",
" 'tBodyAcc-std()-X', 'tBodyAcc-std()-Y', 'tBodyAcc-std()-Z',\n",
" 'tBodyAcc-mad()-X', 'tBodyAcc-mad()-Y', 'tBodyAcc-mad()-Z',\n",
" 'tBodyAcc-max()-X'], dtype=object)"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# First 10 feature names as a plain NumPy array\n",
"feature_name_df['column_name'].values[:10]"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"column_index 42\n",
"dtype: int64\n"
]
}
],
"source": [
"# Check for duplicates: count how many rows share each feature name\n",
"feature_dup_df = feature_name_df.groupby('column_name').count()\n",
"# Number of distinct names that appear more than once\n",
"print(feature_dup_df.query('column_index > 1').count())"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" column_index | \n",
"
\n",
" \n",
" column_name | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" fBodyAcc-bandsEnergy()-1,16 | \n",
" 3 | \n",
"
\n",
" \n",
" fBodyAcc-bandsEnergy()-1,24 | \n",
" 3 | \n",
"
\n",
" \n",
" fBodyAcc-bandsEnergy()-1,8 | \n",
" 3 | \n",
"
\n",
" \n",
" fBodyAcc-bandsEnergy()-17,24 | \n",
" 3 | \n",
"
\n",
" \n",
" fBodyAcc-bandsEnergy()-17,32 | \n",
" 3 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" column_index\n",
"column_name \n",
"fBodyAcc-bandsEnergy()-1,16 3\n",
"fBodyAcc-bandsEnergy()-1,24 3\n",
"fBodyAcc-bandsEnergy()-1,8 3\n",
"fBodyAcc-bandsEnergy()-17,24 3\n",
"fBodyAcc-bandsEnergy()-17,32 3"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the feature names that occur more than once, with their occurrence counts\n",
"feature_dup_df.query('column_index > 1').head()"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
"def get_new_feature_name_df(old_feature_name_df):\n",
"    \"\"\"Return a copy of the feature table with duplicated names made unique.\n",
"\n",
"    The first occurrence of a name is kept unchanged; every later occurrence\n",
"    gets a ``_<k>`` suffix, where ``k`` is the number of earlier rows with the\n",
"    same name (``name``, ``name_1``, ``name_2``, ...).\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    old_feature_name_df : pandas.DataFrame\n",
"        Table containing a ``column_name`` column. It is not modified.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        The input columns plus ``dup_cnt`` (0 for a first occurrence), in the\n",
"        input row order and with a fresh 0..n-1 index.\n",
"    \"\"\"\n",
"    # reset_index(drop=True) returns a new frame, so the caller's data is untouched.\n",
"    new_feature_name_df = old_feature_name_df.reset_index(drop=True)\n",
"    names = new_feature_name_df['column_name']\n",
"    # cumcount numbers the rows inside each name group: 0, 1, 2, ...\n",
"    dup_cnt = new_feature_name_df.groupby('column_name').cumcount()\n",
"    suffixed = names.astype(str) + '_' + dup_cnt.astype(str)\n",
"    new_feature_name_df['dup_cnt'] = dup_cnt\n",
"    # Vectorised rename. The previous row-wise lambda indexed each row with\n",
"    # x[0] / x[1], which relies on deprecated positional Series indexing.\n",
"    new_feature_name_df['column_name'] = suffixed.where(dup_cnt > 0, names)\n",
"    return new_feature_name_df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.9.13 ('ml-dev')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "8500c1516a682a4e8776c4457fab4ff1f1d739e261725eabc29b69f9c9445475"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}