# 라이브러리 import

import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import LabelEncoder

from matplotlib import pyplot as plt
import seaborn as sns
import missingno as msno

%matplotlib inline

import warnings
warnings.filterwarnings('ignore')


#1. Load the CSV file (the variable must be named df).
df = pd.read_csv('train.csv')


#2. Display only the first 5 rows of df.
df.head()


# Inspect the column labels of df
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


#4. Check the shape of df.
# 891 rows, 12 columns
df.shape

(891, 12)


# Make the row labels start at 1 instead of 0 so they are easier to read.
# DataFrame.reset_index() returns a new frame instead of mutating df, so the
# two bare reset_index calls that used to precede this line had their results
# discarded and did nothing; shifting the existing index is all that is needed.
df.index += 1
df.head()


#5. Use the missingno module to check whether the data has missing values. (5 points)
# missingno: a package that gives an intuitive, visual overview of missing data
# White areas in the matrix mark null values
# Age, Cabin and Embarked visibly contain missing values
# Cabin in particular has far more missing values than the other columns
msno.matrix(df, figsize=(16,8))

<AxesSubplot:>


# Per-column count of missing values (a quick numeric check)
# Age has 177 missing values, Cabin has 687 and Embarked has 2
# Next step: decide how each of these should be handled
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


#6. Looking at the missing values above, describe how you want to preprocess them. (5 points)
#7. Write and run the code for that missing-value preprocessing. (5 points)


# Cabin is mostly null, so the whole column is removed
df = df.drop(columns=['Cabin'])


# Embarked: look at the port frequencies; the nulls will be filled with the most common one (S)
df['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64


# Fill the missing Embarked values with 'S', the most frequent port
df['Embarked'] = df['Embarked'].fillna('S')


# Remove only the rows whose Age is missing.
# A bare dropna() discards rows with a null in *any* column; naming the column
# makes the code do exactly what is intended and keeps it safe if another
# column with nulls is ever present at this point.
df = df.dropna(subset=['Age'])


# Confirm that no missing values remain
df.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64


df.shape
# After preprocessing: (891, 12) -> (714, 11)

(714, 11)


#8. Print the basic information of df.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 1 to 891
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  714 non-null    int64  
 1   Survived     714 non-null    int64  
 2   Pclass       714 non-null    int64  
 3   Name         714 non-null    object 
 4   Sex          714 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        714 non-null    int64  
 7   Parch        714 non-null    int64  
 8   Ticket       714 non-null    object 
 9   Fare         714 non-null    float64
 10  Embarked     714 non-null    object 
dtypes: float64(2), int64(5), object(4)
memory usage: 66.9+ KB


df.describe()


# describe() gives the standard descriptive statistics, but it does not report
# missing-value counts, skewness or kurtosis (and shows the median only as the
# "50%" row). Build an extended, transposed summary that adds those columns.

df_stats = df.describe().T

# Restrict the extra statistics to exactly the columns describe() summarised
stat_columns = df_stats.index

# Each reduction returns a Series indexed by column name, which aligns with
# df_stats' row index on assignment -- no per-column loop is needed.
df_stats['median'] = df[stat_columns].median()
df_stats['missing'] = df[stat_columns].isnull().sum()
df_stats['skewness'] = df[stat_columns].skew()
df_stats['kurtosis'] = df[stat_columns].kurtosis()

df_stats


#10. Use groupby() to show survival against Pclass, Sex, SibSp and Parch, one at a time. (5 points each)
#11. From the output, describe in which cases the survival probability is high. (10 points)

# Ticket class (Pclass): first class has the highest survival rate
df.groupby('Pclass')[['Survived']].mean().sort_values(by='Survived', ascending=False)


# Sex: women have the higher survival rate
df.groupby('Sex')[['Survived']].mean().sort_values(by='Survived', ascending=False)


# Siblings/spouses aboard (SibSp): the survival rate peaks at 1
df.groupby('SibSp')[['Survived']].mean().sort_values(by='Survived', ascending=False)


# Parents/children aboard (Parch): the survival rate peaks at 3
df.groupby('Parch')[['Survived']].mean().sort_values(by='Survived', ascending=False)


# Check whether the dependent variable (y), Survived, is balanced.
# When one class of the target heavily outnumbers the other, a classifier
# trained on that data tends to perform worse -- the "imbalanced problem".
# Here the two classes are reasonably balanced.
# Typical remedies when they are not: resampling (over-, under- or hybrid sampling).
df['Survived'].value_counts()

0    424
1    290
Name: Survived, dtype: int64


# Share of each Survived class, as a percentage rounded to two decimals
survived_counts = df['Survived'].value_counts()
total = survived_counts[0] + survived_counts[1]

pct_not_survived = round(survived_counts[0] / total*100, 2)
pct_survived = round(survived_counts[1] / total*100, 2)

print("Survived = 0은", pct_not_survived, '퍼센트')
print("Survived = 1은", pct_survived, '퍼센트')

Survived = 0은 59.38 퍼센트
Survived = 1은 40.62 퍼센트


#12. Print the basic information of df.
# The columns are a mix of int, object and float dtypes
# object columns cannot be fed to the model, so they must be converted to numeric types
# Categorical columns: Name, Sex, Ticket, Embarked
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 1 to 891
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  714 non-null    int64  
 1   Survived     714 non-null    int64  
 2   Pclass       714 non-null    int64  
 3   Name         714 non-null    object 
 4   Sex          714 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        714 non-null    int64  
 7   Parch        714 non-null    int64  
 8   Ticket       714 non-null    object 
 9   Fare         714 non-null    float64
 10  Embarked     714 non-null    object 
dtypes: float64(2), int64(5), object(4)
memory usage: 66.9+ KB


#13. The data contains categorical columns, which should not be present when the model is fitted later. Describe the preprocessing you want to apply. (10 points)
#14. Implement the process described in #13 in code. (5 points)


# Name preprocessing
# Split on the comma and the period to isolate the title in between (Miss, Mr, Mrs, ...)
df['Name'].value_counts()

Braund, Mr. Owen Harris                            1
Kimball, Mr. Edwin Nelson Jr                       1
Chapman, Mr. John Henry                            1
Van Impe, Mr. Jean Baptiste                        1
Johnson, Mr. Alfred                                1
                                                  ..
Allison, Miss. Helen Loraine                       1
Baxter, Mrs. James (Helene DeLaudeniere Chaput)    1
Johnson, Mr. William Cahoone Jr                    1
Allison, Master. Hudson Trevor                     1
Dooley, Mr. Patrick                                1
Name: Name, Length: 714, dtype: int64


# A name looks like "Surname, Title. Given names": keep the text between the
# first comma and the following period, with all spaces removed.
arr_title = [
    full_name.split(",")[1].split(".")[0].replace(" ", "")
    for full_name in df['Name'].values
]

df['Title'] = arr_title
df.tail()


# Count how often each title occurs
df['Title'].value_counts()

Mr             398
Miss           146
Mrs            108
Master          36
Rev              6
Dr               6
Mlle             2
Major            2
Col              2
theCountess      1
Capt             1
Ms               1
Sir              1
Lady             1
Mme              1
Don              1
Jonkheer         1
Name: Title, dtype: int64


# Every title other than Mr, Miss, Mrs and Master is grouped into 'Others'
title_map = dict.fromkeys(
    ['Rev','Dr','Col','Major','Sir','Don','Lady','theCountess',
     'Jonkheer','Dona','Capt'],
    'Others',
)

# French titles and the English "Ms" are folded into their closest equivalent:
# Mlle -> Miss  / Mme -> Mrs / Ms -> Miss
title_map.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

# No replacement value is itself a key, so one pass gives the same result as
# applying the substitutions one after another.
df['Title'] = df['Title'].replace(title_map)
df['Title'].value_counts()

Mr        398
Miss      149
Mrs       109
Master     36
Others     22
Name: Title, dtype: int64


# Sex preprocessing
# Label-encode as female = 0, male = 1
# scikit-learn's LabelEncoder makes this conversion easy
df['Sex'].value_counts()

male      453
female    261
Name: Sex, dtype: int64


# Learn the Sex categories and replace them with their integer codes in one call
enc = LabelEncoder()
df['Sex'] = enc.fit_transform(df['Sex'])
df['Sex']

1      1
2      0
3      0
4      0
5      1
      ..
886    0
887    1
888    0
890    1
891    1
Name: Sex, Length: 714, dtype: int32


# Original labels known to the encoder (the ones that were turned into numbers)
enc.classes_

array(['female', 'male'], dtype=object)


# Map the integer codes 0 and 1 back to the labels they stand for
enc.inverse_transform([0,1])

array(['female', 'male'], dtype=object)


# Embarked (port of embarkation) takes the values S, C and Q
# They are label-encoded as C = 0, Q = 1, S = 2
df['Embarked'].value_counts()

S    556
C    130
Q     28
Name: Embarked, dtype: int64


# Label-encode Embarked in two explicit steps: learn the classes, then map them
enc = LabelEncoder()
enc.fit(df['Embarked'])
df['Embarked'] = enc.transform(df['Embarked'])
df.head()


# One-hot ("dummy") encode the categorical variables Pclass, Sex, Title and Embarked.
# Unlike the label encoding used so far, this creates one indicator column per category.

# Pclass dummies
ohe_pclass = pd.get_dummies(df['Pclass'], prefix='Pclass')
ohe_pclass.head()


df = pd.concat([df, ohe_pclass], axis=1)
df.head()


# Sex, Title and Embarked are encoded the same way. The three dummy frames are
# built first and appended in a single concat, which yields the same columns in
# the same order as appending them one at a time.
ohe_sex = pd.get_dummies(df['Sex'], prefix='Sex')
ohe_title = pd.get_dummies(df['Title'], prefix='Title')
ohe_embarked = pd.get_dummies(df['Embarked'], prefix='Embarked')

df = pd.concat([df, ohe_sex, ohe_title, ohe_embarked], axis=1)

df.head()


#15. Analyse the correlations between variables with a heatmap (15 points in total)
# Condition 1. Show values to two decimal places only (2 points)
# Condition 2. The plot must be triangular (3 points)
# Condition 3. Keep only correlations whose positive / negative magnitude is at least 0.2
# Condition 4. Draw one heatmap for positive correlations >= 0.2 and one for negative correlations <= -0.2 (5 points each)


# Explore the data visually: correlation between every pair of numeric variables
cmap = sns.diverging_palette(240, 10, n=9, as_cmap=True)

plt.figure(figsize=(20,16))
# df still carries text columns (Name, Ticket, Title). Older pandas dropped them
# silently inside corr(); pandas >= 2.0 raises instead, so pick the numeric and
# boolean columns explicitly (bool covers the dtype newer get_dummies produces).
sns.heatmap(df.select_dtypes(include=['number', 'bool']).corr(), annot=True, cmap=cmap, linewidths=.5, fmt='.2f', annot_kws={"size":10})
plt.show()


# Hide the mirrored (upper) triangle of the correlation matrix.
# In the heatmap calls, fmt controls how the annotated values are printed;
# '.2f' means two decimal places.

# Text columns (Name, Ticket, Title) are excluded explicitly: pandas >= 2.0 no
# longer drops them silently inside corr() and would raise instead.
df_corr = df.select_dtypes(include=['number', 'bool']).corr()

# The builtin bool replaces np.bool, an alias that was deprecated in NumPy 1.20
# and removed in 1.24.
mask = np.zeros_like(df_corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True  # upper triangle incl. the diagonal

# Cells that fail the threshold become NaN and are left blank by the heatmap.
# (The variable name keeps its original spelling because later cells use it.)
df_corr_positie = df_corr[df_corr >= 0.2]
# Only correlations of -0.2 or below. The previous condition also kept cells
# equal to 1.0, which let perfectly correlated pairs (e.g. Sex and Sex_1) show
# up in the "negative" heatmap and stretch its colour scale; the diagonal it
# was presumably meant to keep is hidden by the mask anyway.
df_corr_negative = df_corr[df_corr <= -0.2]


# Drawing options shared by both thresholded heatmaps
heatmap_options = dict(
    annot=True,
    mask=mask,
    cmap=cmap,
    linewidths=.6,
    fmt='.2f',
    annot_kws={'size':10},
)

# Positive correlations of 0.2 or more
plt.figure(figsize=(20, 16))
sns.heatmap(df_corr_positie, **heatmap_options)
plt.show()

# The variables most positively correlated with Survived are, in order,
# Sex_0 (female), Title_Miss, Title_Mrs, Pclass_1 and Fare.
# In other words, sex has the strongest positive relationship with survival.


# Negative correlations of -0.2 or less
plt.figure(figsize=(20, 16))
sns.heatmap(df_corr_negative, **heatmap_options)
plt.show()

# The strongest negative correlations with Survived come from Title_Mr,
# Sex / Sex_1 (male), Pclass and Pclass_3.
# This suggests that most men died and that a lower ticket class went with a
# higher chance of dying.


#16. What information can be learned from the visualisations above? (5 points)


#17. Based on the results above, visualise any relationship between the dependent and independent variables you are interested in and derive insights. (10 points)


# Correlation of every variable with Survived, strongest positive first:
# the sex-related columns sit at both ends of the ranking
df_corr[['Survived']].sort_values(by='Survived', ascending=False)


# Survival rate by Sex (0 = female, 1 = male): women survive far more often than men
sns.barplot(x='Sex', y='Survived', data=df)

<AxesSubplot:xlabel='Sex', ylabel='Survived'>


# Women have a much higher survival rate than men
# Female titles: Mrs, Miss; male title: Mr
sns.barplot(x='Title', y='Survived', data=df)

<AxesSubplot:xlabel='Title', ylabel='Survived'>


# The higher the ticket class, the higher the survival rate
# Author's explanation (not verifiable from this data): first-class cabins were mostly mid-ship and close to the deck, which made escape easier
sns.barplot(x='Pclass', y='Survived', data=df)

<AxesSubplot:xlabel='Pclass', ylabel='Survived'>


# Looking at title and sex together, women still show the higher survival rate
sns.barplot(x='Sex',y='Survived', hue='Title', data=df)

<AxesSubplot:xlabel='Sex', ylabel='Survived'>


# Ticket class and sex together, as read from the plot by the author: classes 1 and 2 show high survival rates, and first-class men in particular survive more than twice as often as men in classes 2 and 3
sns.barplot(x='Pclass',y='Survived', hue='Sex', data=df)

<AxesSubplot:xlabel='Pclass', ylabel='Survived'>


# Survivor counts broken down by Sex, with row/column totals
pd.crosstab(df['Sex'], df['Survived'], margins=True).style.background_gradient(cmap='Blues')


# Survivor counts broken down by Pclass, with row/column totals
pd.crosstab(df['Pclass'], df['Survived'], margins=True).style.background_gradient(cmap='Blues')


# Survivor counts broken down by Title, with row/column totals
pd.crosstab(df['Title'], df['Survived'], margins=True).style.background_gradient(cmap='Blues')


#18. 왜 이 모델에 로지스틱 회귀를 사용하면 좋을까?
# 종속변수를 봐보자!
# 종속변수는 (생존 한다, 안 한다.) 이진 분류이다.
# 로지스틱 회귀 분석은 종속 변수를 0과 1 사이로 산출하게 하는 로지스틱 함수를 사용하는 분류 방법론이다.

# 독립변수로는 Pclass, Sex, Age, SibSp, Parch, Fare
# 종속변수는 Survived
# 이렇게 변수를 설정해서 로지스틱 회귀 모델을 사용하면 생존 여부를 예측할 수 있을 것이다.

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	A/5 21171	7.2500	NaN	S
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	PC 17599	71.2833	C85	C
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	STON/O2. 3101282	7.9250	NaN	S
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	113803	53.1000	C123	S
4	5	0	3	Allen, Mr. William Henry	male	35.0	0	373450	8.0500	NaN	S

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Ticket	Fare	Cabin	Embarked
1	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	A/5 21171	7.2500	NaN	S
2	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	PC 17599	71.2833	C85	C
3	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	STON/O2. 3101282	7.9250	NaN	S
4	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	113803	53.1000	C123	S
5	5	0	3	Allen, Mr. William Henry	male	35.0	0	373450	8.0500	NaN	S

	PassengerId	Survived	Pclass	Age	SibSp	Parch	Fare
count	714.000000	714.000000	714.000000	714.000000	714.000000	714.000000	714.000000
mean	448.582633	0.406162	2.236695	29.699118	0.512605	0.431373	34.694514
std	259.119524	0.491460	0.838250	14.526497	0.929783	0.853289	52.918930
min	1.000000	0.000000	1.000000	0.420000	0.000000	0.000000	0.000000
25%	222.250000	0.000000	1.000000	20.125000	0.000000	0.000000	8.050000
50%	445.000000	0.000000	2.000000	28.000000	0.000000	0.000000	15.741700
75%	677.750000	1.000000	3.000000	38.000000	1.000000	1.000000	33.375000
max	891.000000	1.000000	3.000000	80.000000	5.000000	6.000000	512.329200

	count	mean	std	min	25%	50%	75%	max	median	skewness	kurtosis
PassengerId	714.0	448.582633	259.119524	1.00	222.250	445.0000	677.750	891.0000	445.0000	-0.000609	-1.224109
Survived	714.0	0.406162	0.491460	0.00	0.000	0.0000	1.000	1.0000	0.0000	0.382945	-1.858567
Pclass	714.0	2.236695	0.838250	1.00	1.000	2.0000	3.000	3.0000	2.0000	-0.468543	-1.419558
Age	714.0	29.699118	14.526497	0.42	20.125	28.0000	38.000	80.0000	28.0000	0.389108	0.178274
SibSp	714.0	0.512605	0.929783	0.00	0.000	0.0000	1.000	5.0000	0.0000	2.519577	7.044951
Parch	714.0	0.431373	0.853289	0.00	0.000	0.0000	1.000	6.0000	0.0000	2.618914	8.853126
Fare	714.0	34.694514	52.918930	0.00	8.050	15.7417	33.375	512.3292	15.7417	4.653630	30.924249

	Survived
Pclass
1	0.655914
2	0.479769
3	0.239437

발표자: 최보현

EDA란?

데이터 수집 및 정제 과정

EDA 왜 할까?

EDA의 단계?

1. Problem Define

2. Libraries Setting

3. Data Collection

4. Data Preprocessing

5. EDA

수고하셨습니다 :)

	Survived
Parch
3	0.600000
2	0.573529
1	0.554545
0	0.357006
5	0.200000
4	0.000000
6	0.000000

	PassengerId	Survived	Pclass	Name	Sex	Age	Parch	Ticket	Fare	Embarked	Title
886	886	0	3	Rice, Mrs. William (Margaret Norton)	female	39.0	5	382652	29.125	Q	Mrs
887	887	0	2	Montvila, Rev. Juozas	male	27.0	0	211536	13.000	S	Rev
888	888	1	1	Graham, Miss. Margaret Edith	female	19.0	0	112053	30.000	S	Miss
890	890	1	1	Behr, Mr. Karl Howell	male	26.0	0	111369	30.000	C	Mr
891	891	0	3	Dooley, Mr. Patrick	male	32.0	0	370376	7.750	Q	Mr

	Survived
Survived	1.000000
Sex_0	0.538826
Title_Miss	0.333216
Title_Mrs	0.330869
Pclass_1	0.301831
Fare	0.268189
Embarked_0	0.193607
Parch	0.093317
Pclass_2	0.084753
Title_Master	0.083128
PassengerId	0.029340
Title_Others	-0.015439
SibSp	-0.017358
Embarked_1	-0.049549
Age	-0.077221
Embarked_2	-0.156815
Embarked	-0.179775
Pclass_3	-0.337587
Pclass	-0.359653
Sex_1	-0.538826
Sex	-0.538826
Title_Mr	-0.543456

Survived	0	1	All
Title
Master	15	21	36
Miss	41	108	149
Mr	331	67	398
Mrs	23	86	109
Others	14	8	22
All	424	290	714

	Survived
SibSp
1	0.530055
2	0.440000
0	0.371550
3	0.333333
4	0.166667
5	0.000000