# 라이브러리 import
import numpy as np
import pandas as pd
import missingno as msno
import datetime as dt
from matplotlib import pyplot as plt
import seaborn as sns


# Load the training data from the CSV file into a DataFrame.
# The path is relative, so "train.csv" must be in the current working directory.
df = pd.read_csv("train.csv")


# Preview only the first five rows.
df.head()


# List the column names of df.
df.columns

Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count'],
      dtype='object')


# Check the shape of df.
df.shape
# Recorded result: 10886 rows, 12 columns.

(10886, 12)


# Count the missing values in each column.
df.isnull().sum()
# Recorded result: every column reports 0, i.e. there are no missing values.

datetime      0
season        0
holiday       0
workingday    0
weather       0
temp          0
atemp         0
humidity      0
windspeed     0
casual        0
registered    0
count         0
dtype: int64


# Visualise where missing values occur.
msno.matrix(df)
# The missingno matrix is used as a visual cross-check of the null counts
# computed earlier -> no missing values were found.

<Axes: >


df.shape
# There were no missing values, so the shape is unchanged after preprocessing.

(10886, 12)


# Print the basic structure of df (dtypes, non-null counts, memory usage).
df.info()
# Exactly one column has dtype object (`datetime` in the recorded output);
# it needs a type conversion later, done with pd.to_datetime further down.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   datetime    10886 non-null  object 
 1   season      10886 non-null  int64  
 2   holiday     10886 non-null  int64  
 3   workingday  10886 non-null  int64  
 4   weather     10886 non-null  int64  
 5   temp        10886 non-null  float64
 6   atemp       10886 non-null  float64
 7   humidity    10886 non-null  int64  
 8   windspeed   10886 non-null  float64
 9   casual      10886 non-null  int64  
 10  registered  10886 non-null  int64  
 11  count       10886 non-null  int64  
dtypes: float64(3), int64(8), object(1)
memory usage: 1020.7+ KB


# Show descriptive statistics for the columns describe() summarises by default.
df.describe()


# Normality check
#
# Build a per-column summary table: the transposed describe() output extended
# with median, missing-value count, skewness and kurtosis, so the degree of
# asymmetry / tail weight of each column can be read off in one place.
df_stats = df.describe().T

# describe() decides which columns are summarised; reuse exactly that set so
# each added statistic is a Series indexed by column name and aligns with
# df_stats' index on assignment.
summary_cols = df_stats.index

df_stats['median'] = df[summary_cols].median()
df_stats['missing'] = df[summary_cols].isnull().sum()
df_stats['skewness'] = df[summary_cols].skew()
df_stats['kurtosis'] = df[summary_cols].kurtosis()

df_stats
# Recorded values for the target column `count`:
#   skewness: 1.242066
#   kurtosis: 1.300093
# By these numbers `count` shows no severe skewness or kurtosis.
#
# Author's rule of thumb: skewness below 2 and kurtosis below 7 is treated as
# "not far from a normal distribution", so `count` is considered acceptably
# normal here.
# NOTE(review): this is a heuristic threshold, not a formal normality test.


# Parse the `datetime` column with pd.to_datetime so it is treated as
# timestamps rather than plain strings (required for the .dt accessor below).
df['datetime'] = pd.to_datetime(df['datetime'])


# Derive calendar features from the parsed `datetime` column.
# Each name below is both the `.dt` accessor attribute that is read and the
# new column that is written; minute/second extraction is left disabled.
# `dayofweek` encodes the weekday as
#   Mon(0) Tue(1) Wed(2) Thu(3) Fri(4) Sat(5) Sun(6)
timestamp_parts = df['datetime'].dt
for part_name in ('year', 'month', 'day', 'hour', 'dayofweek'):
    df[part_name] = getattr(timestamp_parts, part_name)


# Confirm that the derived date-part columns were appended to df.
df.columns

Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count',
       'year', 'month', 'day', 'hour', 'dayofweek'],
      dtype='object')


df['hour']
# The hour was extracted correctly as well.

0         0
1         1
2         2
3         3
4         4
         ..
10881    19
10882    20
10883    21
10884    22
10885    23
Name: hour, Length: 10886, dtype: int64


df['dayofweek']
# The day of the week was extracted correctly as well.

0        5
1        5
2        5
3        5
4        5
        ..
10881    2
10882    2
10883    2
10884    2
10885    2
Name: dayofweek, Length: 10886, dtype: int64


df.info()
# After the pd.to_datetime conversion no object-typed column remains: in the
# recorded output `datetime` is datetime64[ns] and the derived
# year/month/day/hour/dayofweek columns are integer-typed, so the date
# information is now available as numeric features.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   datetime    10886 non-null  datetime64[ns]
 1   season      10886 non-null  int64         
 2   holiday     10886 non-null  int64         
 3   workingday  10886 non-null  int64         
 4   weather     10886 non-null  int64         
 5   temp        10886 non-null  float64       
 6   atemp       10886 non-null  float64       
 7   humidity    10886 non-null  int64         
 8   windspeed   10886 non-null  float64       
 9   casual      10886 non-null  int64         
 10  registered  10886 non-null  int64         
 11  count       10886 non-null  int64         
 12  year        10886 non-null  int64         
 13  month       10886 non-null  int64         
 14  day         10886 non-null  int64         
 15  hour        10886 non-null  int64         
 16  dayofweek   10886 non-null  int64         
dtypes: datetime64[ns](1), float64(3), int64(13)
memory usage: 1.4 MB


# Histogram grid for every column except the raw `datetime` timestamp.
#
# The font size is set *before* plotting: matplotlib applies rc settings to
# text created afterwards, so calling plt.rc() after df.hist() (as this cell
# originally did) leaves this figure unchanged and only affects later figures.
plt.rc('font', size=14)
df.hist(column=['season', 'holiday', 'workingday', 'weather', 'temp', 'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count', 'year', 'month', 'day', 'hour', 'dayofweek'], figsize=(15, 12), bins=16)
plt.tight_layout()  # adjust spacing between the subplots
plt.show()


# Number of rows per `season` value.
sns.countplot(data=df,x='season')
plt.show()

# Same frequencies as numbers, then `count` aggregated per season
# (seaborn's barplot shows the mean of y per category by default).
print(df['season'].value_counts())
sns.barplot(data=df,x='season',y='count')
plt.show()
# The `season` values are almost evenly represented (see the printed counts).
# Author's reading of the x='season', y='count' bar plot: fall has the
# highest rental volume.

4    2734
2    2733
3    2733
1    2686
Name: season, dtype: int64


# Number of rows per `holiday` flag.
sns.countplot(data=df,x='holiday')
plt.show()

# Same frequencies as numbers, then `count` aggregated per holiday flag
# (seaborn's barplot shows the mean of y per category by default).
print(df['holiday'].value_counts())
sns.barplot(data=df,x='holiday',y='count')
plt.show()
# Non-holiday rows outnumber holiday rows by roughly 34x (10575 vs 311).
# Author's reading of the x='holiday', y='count' bar plot: rental volume is
# similar for both groups.

0    10575
1      311
Name: holiday, dtype: int64


# Number of rows per `year`, followed by the same frequencies as numbers.
sns.countplot(data=df,x='year')
plt.show()
print(df['year'].value_counts())

# `count` aggregated per year (seaborn's barplot shows the mean by default).
sns.barplot(data=df,x='year',y='count')
plt.show()
# Both years are almost evenly represented (5464 vs 5422 rows).
# Author's reading of the x='year', y='count' bar plot: rentals rose from
# 2011 to 2012.

2012    5464
2011    5422
Name: year, dtype: int64


# Number of rows per `weather` code, followed by the same frequencies as numbers.
sns.countplot(data=df,x='weather')
plt.show()
print(df['weather'].value_counts())

# `count` aggregated per weather code (seaborn's barplot shows the mean by default).
sns.barplot(data=df,x='weather',y='count')
plt.show()
# Weather code 1 (clear, per the author) has by far the most rows.
# Author's reading of the x='weather', y='count' bar plot: rentals are also
# highest for weather code 1.
# NOTE(review): weather code 4 has a single row, so its bar is not reliable.

1    7192
2    2834
3     859
4       1
Name: weather, dtype: int64


# Distribution of `temp`: histogram of row counts per bin with a KDE overlay.
sns.histplot(data=df,x='temp',kde=True)
plt.show()

# Author's reading: `temp` is distributed close to a normal distribution.
# NOTE(review): the original note described this as an x=temp, y=count
# analysis with rentals peaking at 14 degrees. The y-axis of this plot is the
# number of rows per temperature bin, not the `count` (rentals) column, so the
# peak only says which temperature was observed most often - confirm.


# Distribution of `atemp`: histogram of row counts per bin with a KDE overlay.
sns.histplot(data=df,x='atemp',kde=True)
plt.show()

# NOTE(review): the original note described this as an x=atemp, y=count
# analysis with rentals peaking at 31 degrees. The y-axis of this plot is the
# number of rows per bin, not the `count` (rentals) column, so the peak only
# says which felt temperature was observed most often - confirm.


# `count` aggregated per humidity value (seaborn's barplot shows the mean of
# y per category by default). A small font and a wide figure are used because
# every distinct humidity value gets its own tick on the x-axis.
# NOTE: plt.rc() changes the global default, so the size-7 font also applies
# to figures created after this cell.
plt.rc('font', size=7)
plt.figure(figsize=(15, 6))
sns.barplot(data = df, x = 'humidity', y = 'count')
# Render explicitly, consistent with the other plot cells; without it the
# figure is only displayed as a side effect of the notebook echoing the Axes.
plt.show()

# Author's reading of the x='humidity', y='count' bar plot: the lower the
# humidity, the higher the rental volume.

<Axes: xlabel='humidity', ylabel='count'>


# Distribution of `windspeed`: histogram of row counts per bin with a KDE overlay.
sns.histplot(data=df,x='windspeed',kde=True)
plt.show()

# NOTE(review): the original note described this as an x=windspeed, y=count
# analysis with rentals highest at wind speed 0. The y-axis of this plot is
# the number of rows per bin, not the `count` (rentals) column, so the peak
# only says which wind speed was recorded most often - confirm.


# NOTE: this cell repeats the earlier `year` analysis verbatim (section 5-2-8).
# Number of rows per `year`, followed by the same frequencies as numbers.
sns.countplot(data=df,x='year')
plt.show()
print(df['year'].value_counts())

# `count` aggregated per year (seaborn's barplot shows the mean by default).
sns.barplot(data=df,x='year',y='count')
plt.show()

# Both years are almost evenly represented (5464 vs 5422 rows).
# Author's reading of the x='year', y='count' bar plot: rentals rose from
# 2011 to 2012.

2012    5464
2011    5422
Name: year, dtype: int64


# Number of rows per `month`, followed by the same frequencies as numbers.
sns.countplot(data=df,x='month')
plt.show()
print(df['month'].value_counts())

# `count` aggregated per month (seaborn's barplot shows the mean by default).
sns.barplot(data=df,x='month',y='count')
plt.show()

# The months are almost evenly represented (884 to 912 rows each).
# Author's reading of the x='month', y='count' bar plot: June has the
# highest rental volume.

5     912
6     912
7     912
8     912
12    912
10    911
11    911
4     909
9     909
2     901
3     901
1     884
Name: month, dtype: int64


# Number of rows per `hour`, followed by the same frequencies as numbers.
sns.countplot(data=df,x='hour')
plt.show()
print(df['hour'].value_counts())

# `count` aggregated per hour (seaborn's barplot shows the mean by default).
sns.barplot(data=df,x='hour',y='count')
plt.show()

# The hours are almost evenly represented (433 to 456 rows each).

# Author's reading of the x='hour', y='count' bar plot:
# in the morning rentals peak at 8 o'clock and in the afternoon at 17 o'clock;
# both peaks fall within commuting hours.

12    456
13    456
22    456
21    456
20    456
19    456
18    456
17    456
16    456
15    456
14    456
23    456
11    455
10    455
9     455
8     455
7     455
6     455
0     455
1     454
5     452
2     448
4     442
3     433
Name: hour, dtype: int64


# Pearson correlation matrix of the numeric columns.
# numeric_only=True is passed explicitly because df also holds the
# datetime64 `datetime` column: relying on the default triggers the
# FutureWarning recorded below, and newer pandas versions no longer drop
# non-numeric columns automatically.
df.corr(method = 'pearson', numeric_only=True)

/var/folders/zq/q1l7ks592572ylj1c8lqbvmm0000gn/T/ipykernel_33668/3294333281.py:1: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
  df.corr(method = 'pearson')


# Correlation matrix of the numeric columns (default method: Pearson).
# numeric_only=True excludes the datetime64 `datetime` column explicitly
# instead of relying on the deprecated default behaviour.
df_C = df.corr(numeric_only=True)

# Figure size
fig, ax = plt.subplots( figsize=(14,10) )

# Triangular mask: hide the upper triangle including the diagonal, because
# the matrix is symmetric and the diagonal is always 1.
mask = np.zeros_like(df_C, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Heatmap drawn on the axes created above, with the colour scale fixed to the
# full correlation range [-1, 1].
sns.heatmap(df_C,
            annot = True,
            annot_kws={"size": 10},
            mask=mask,
            linewidths=.5,
            cbar_kws={"shrink": .5},
            vmin = -1,vmax = 1,
            ax=ax)
plt.show()

/var/folders/zq/q1l7ks592572ylj1c8lqbvmm0000gn/T/ipykernel_33668/3300754389.py:1: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
  df_C = df.corr()

	datetime	season	weather	temp	atemp	humidity	casual	registered	count
0	2011-01-01 00:00:00	1	1	9.84	14.395	81	3	13	16
1	2011-01-01 01:00:00	1	1	9.02	13.635	80	8	32	40
2	2011-01-01 02:00:00	1	1	9.02	13.635	80	5	27	32
3	2011-01-01 03:00:00	1	1	9.84	14.395	75	3	10	13
4	2011-01-01 04:00:00	1	1	9.84	14.395	75	0	1	1

	season	holiday	workingday	weather	temp	atemp	humidity	windspeed	casual	registered	count
count	10886.000000	10886.000000	10886.000000	10886.000000	10886.00000	10886.000000	10886.000000	10886.000000	10886.000000	10886.000000	10886.000000
mean	2.506614	0.028569	0.680875	1.418427	20.23086	23.655084	61.886460	12.799395	36.021955	155.552177	191.574132
std	1.116174	0.166599	0.466159	0.633839	7.79159	8.474601	19.245033	8.164537	49.960477	151.039033	181.144454
min	1.000000	0.000000	0.000000	1.000000	0.82000	0.760000	0.000000	0.000000	0.000000	0.000000	1.000000
25%	2.000000	0.000000	0.000000	1.000000	13.94000	16.665000	47.000000	7.001500	4.000000	36.000000	42.000000
50%	3.000000	0.000000	1.000000	1.000000	20.50000	24.240000	62.000000	12.998000	17.000000	118.000000	145.000000
75%	4.000000	0.000000	1.000000	2.000000	26.24000	31.060000	77.000000	16.997900	49.000000	222.000000	284.000000
max	4.000000	1.000000	1.000000	4.000000	41.00000	45.455000	100.000000	56.996900	367.000000	886.000000	977.000000

	count	mean	std	min	25%	50%	75%	max	median	skewness	kurtosis
season	10886.0	2.506614	1.116174	1.00	2.0000	3.000	4.0000	4.0000	3.000	-0.007076	-1.355661
holiday	10886.0	0.028569	0.166599	0.00	0.0000	0.000	0.0000	1.0000	0.000	5.660517	30.046975
workingday	10886.0	0.680875	0.466159	0.00	0.0000	1.000	1.0000	1.0000	1.000	-0.776163	-1.397828
weather	10886.0	1.418427	0.633839	1.00	1.0000	1.000	2.0000	4.0000	1.000	1.243484	0.395533
temp	10886.0	20.230860	7.791590	0.82	13.9400	20.500	26.2400	41.0000	20.500	0.003691	-0.914530
atemp	10886.0	23.655084	8.474601	0.76	16.6650	24.240	31.0600	45.4550	24.240	-0.102560	-0.850076
humidity	10886.0	61.886460	19.245033	0.00	47.0000	62.000	77.0000	100.0000	62.000	-0.086335	-0.759818
windspeed	10886.0	12.799395	8.164537	0.00	7.0015	12.998	16.9979	56.9969	12.998	0.588767	0.630133
casual	10886.0	36.021955	49.960477	0.00	4.0000	17.000	49.0000	367.0000	17.000	2.495748	7.551629
registered	10886.0	155.552177	151.039033	0.00	36.0000	118.000	222.0000	886.0000	118.000	1.524805	2.626081
count	10886.0	191.574132	181.144454	1.00	42.0000	145.000	284.0000	977.0000	145.000	1.242066	1.300093

	season	holiday	workingday	weather	temp	atemp	humidity	windspeed	casual	registered	count	year	month	day	hour	dayofweek
season	1.000000	0.029368	-0.008126	0.008879	0.258689	0.264744	0.190610	-0.147121	0.096758	0.164011	0.163439	-0.004797	0.971524	0.001729	-0.006546	-0.010553
holiday	0.029368	1.000000	-0.250491	-0.007074	0.000295	-0.005215	0.001929	0.008409	0.043799	-0.020956	-0.005393	0.012021	0.001731	-0.015877	-0.000354	-0.191832
workingday	-0.008126	-0.250491	1.000000	0.033772	0.029966	0.024660	-0.010880	0.013373	-0.319111	0.119460	0.011594	-0.002482	-0.003394	0.009829	0.002780	-0.704267
weather	0.008879	-0.007074	0.033772	1.000000	-0.055035	-0.055376	0.406244	0.007261	-0.135918	-0.109340	-0.128655	-0.012548	0.012144	-0.007890	-0.022740	-0.047692
temp	0.258689	0.000295	0.029966	-0.055035	1.000000	0.984948	-0.064949	-0.017852	0.467097	0.318571	0.394454	0.061226	0.257589	0.015551	0.145430	-0.038466
atemp	0.264744	-0.005215	0.024660	-0.055376	0.984948	1.000000	-0.043536	-0.057473	0.462067	0.314635	0.389784	0.058540	0.264173	0.011866	0.140343	-0.040235
humidity	0.190610	0.001929	-0.010880	0.406244	-0.064949	-0.043536	1.000000	-0.318607	-0.348187	-0.265458	-0.317371	-0.078606	0.204537	-0.011335	-0.278011	-0.026507
windspeed	-0.147121	0.008409	0.013373	0.007261	-0.017852	-0.057473	-0.318607	1.000000	0.092276	0.091052	0.101369	-0.015221	-0.150192	0.036157	0.146631	-0.024804
casual	0.096758	0.043799	-0.319111	-0.135918	0.467097	0.462067	-0.348187	0.092276	1.000000	0.497250	0.690414	0.145241	0.092722	0.014109	0.302045	0.246959
registered	0.164011	-0.020956	0.119460	-0.109340	0.318571	0.314635	-0.265458	0.091052	0.497250	1.000000	0.970948	0.264265	0.169451	0.019111	0.380540	-0.084427
count	0.163439	-0.005393	0.011594	-0.128655	0.394454	0.389784	-0.317371	0.101369	0.690414	0.970948	1.000000	0.260403	0.166862	0.019826	0.400601	-0.002283
year	-0.004797	0.012021	-0.002482	-0.012548	0.061226	0.058540	-0.078606	-0.015221	0.145241	0.264265	0.260403	1.000000	-0.004932	0.001800	-0.004234	-0.003785
month	0.971524	0.001731	-0.003394	0.012144	0.257589	0.264173	0.204537	-0.150192	0.092722	0.169451	0.166862	-0.004932	1.000000	0.001974	-0.006818	-0.002266
day	0.001729	-0.015877	0.009829	-0.007890	0.015551	0.011866	-0.011335	0.036157	0.014109	0.019111	0.019826	0.001800	0.001974	1.000000	0.001132	-0.011070
hour	-0.006546	-0.000354	0.002780	-0.022740	0.145430	0.140343	-0.278011	0.146631	0.302045	0.380540	0.400601	-0.004234	-0.006818	0.001132	1.000000	-0.002925
dayofweek	-0.010553	-0.191832	-0.704267	-0.047692	-0.038466	-0.040235	-0.026507	-0.024804	0.246959	-0.084427	-0.002283	-0.003785	-0.002266	-0.011070	-0.002925	1.000000

1. Problem Definition¶

2. Libraries Setting¶

3. Data Collection¶

4. Data Preprocessing¶

5. EDA¶

5-1 히스토그램¶

5-2 독립변수와 종속변수 탐색¶

5-2-1 Season (계절)¶

5-2-2 Holiday (공휴일)¶

5-2-3 Workingday (주중)¶

5-2-4 Weather (날씨)¶

5-2-5 Temp (온도)¶

5-2-6 atemp (체감 온도)¶

5-2-7 Humidity (습도)¶

5-2-8 windspeed (풍속)¶

5-2-9 Year (년도)¶

5-2-10 Month (달)¶

5-2-11 Hour (시간)¶

5-3 Heatmap¶

해석¶

최종 인사이트¶

향후 분석 방안¶