In [1]:
import pandas as pd
In [2]:
df = pd.read_csv("../data_set/2.데이터 클린징/loan_train_quiz_cleaning.csv")
df.head()
Out[2]:
Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area Loan_Status
0 0 1 0 0 0 0 5849 0.0 128.0 360.0 1.0 2 1
1 1 1 1 1 0 0 4583 1508.0 128.0 360.0 1.0 0 0
2 2 1 1 0 0 1 3000 0.0 66.0 360.0 1.0 2 1
3 3 1 1 0 1 0 2583 2358.0 120.0 360.0 1.0 2 1
4 4 1 0 0 0 0 6000 0.0 141.0 360.0 1.0 2 1

변수¶

  • label : 정답, 종속변수
  • feature : 속성, 독립변수
In [3]:
df.columns
Out[3]:
Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')
In [25]:
features = ['Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area']
label = 'Loan_Status'
# Loan_Status : 대출이 가능한지 불가능한지 여부

# features 는 결과에 요인을 주는 요소
# label 은 알고자 하는 결과 값
X, y = df[features], df[label]
In [5]:
y
Out[5]:
0      1
1      0
2      1
3      1
4      1
      ..
609    1
610    1
611    1
612    1
613    0
Name: Loan_Status, Length: 614, dtype: int64
In [6]:
X
Out[6]:
Gender Married Dependents Education Self_Employed ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area
0 1 0 0 0 0 5849 0.0 128.0 360.0 1.0 2
1 1 1 1 0 0 4583 1508.0 128.0 360.0 1.0 0
2 1 1 0 0 1 3000 0.0 66.0 360.0 1.0 2
3 1 1 0 1 0 2583 2358.0 120.0 360.0 1.0 2
4 1 0 0 0 0 6000 0.0 141.0 360.0 1.0 2
... ... ... ... ... ... ... ... ... ... ... ...
609 0 0 0 0 0 2900 0.0 71.0 360.0 1.0 0
610 1 1 3 0 0 4106 0.0 40.0 180.0 1.0 0
611 1 1 1 0 0 8072 240.0 253.0 360.0 1.0 2
612 1 1 2 0 0 7583 0.0 187.0 360.0 1.0 2
613 0 0 0 0 1 4583 0.0 133.0 360.0 0.0 1

614 rows × 11 columns

In [8]:
# 전체 데이터 갯수와 컬럼 갯수 확인
df.shape
Out[8]:
(614, 13)
In [9]:
from sklearn.model_selection import train_test_split
In [12]:
# 학습용 데이터로 80% 를 할당
# 검증용 데이터로 20% 를 할당
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
In [11]:
X_train.shape, X_test.shape, len(y_train), len(y_test)
Out[11]:
((491, 11), (123, 11), 491, 123)
In [13]:
# 데이터를 학습 시킨다
# X(요소)일 경우 y(결과) 가 나온다 를 학습 시킨다
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)
Out[13]:
DecisionTreeClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier()
In [18]:
# 테스트용으로 빼놓은 20% 의 요소를 집어넣어
# 예측값을 가져와 pred 변수에 저장 후 출력
pred = dtc.predict(X_test)
pred
Out[18]:
array([1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1,
       1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1], dtype=int64)
In [19]:
y_test
Out[19]:
75     0
278    1
512    1
217    1
154    1
      ..
573    0
503    0
120    1
352    1
230    1
Name: Loan_Status, Length: 123, dtype: int64
In [20]:
# score 함수를 사용하여
# 테스트 용 변수와 결과 값을 넣었을 때
# 정답률을 도출할 수 있음
dtc.score(X_test, y_test)
Out[20]:
0.7479674796747967
In [22]:
from sklearn.metrics import accuracy_score

# accuracy_score 를 사용해서 정답률을 확인할 수도 있다
accuracy_score(pred, y_test)
Out[22]:
0.7479674796747967
In [32]:
# 새로운 Dataset 을 가져옴
# 결과 값(대출 가능 여부)이 없는 데이터
# 이 데이터는 실제 학습된 머신에 데이터를 집어넣어 결과 값을 사용할 경우를
# 가정하고 가져온 데이터이다
# 고객의 정보를 기준으로 대출이 나올지 안나올지 확인해보기
df_test = pd.read_csv("../data_set/2.데이터 클린징/loan_test_cleaning.csv")
df_test.head(3)
Out[32]:
Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area
0 0 1 1 0 0 0 5720 0 110.0 360.0 1.0 2
1 1 1 1 1 0 0 3076 1500 126.0 360.0 1.0 2
2 2 1 1 2 0 0 5000 1800 208.0 360.0 1.0 2
In [28]:
# Loan_ID 컬럼을 제외한 값이 X_test 에 저장된다
features = ['Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area']
X_test = df_test[features]
X_test
Out[28]:
Gender Married Dependents Education Self_Employed ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area
0 1 1 0 0 0 5720 0 110.0 360.0 1.0 2
1 1 1 1 0 0 3076 1500 126.0 360.0 1.0 2
2 1 1 2 0 0 5000 1800 208.0 360.0 1.0 2
3 1 1 2 0 0 2340 2546 100.0 360.0 1.0 2
4 1 0 0 1 0 3276 0 78.0 360.0 1.0 2
... ... ... ... ... ... ... ... ... ... ... ...
362 1 1 3 1 1 4009 1777 113.0 360.0 1.0 2
363 1 1 0 0 0 4158 709 115.0 360.0 1.0 2
364 1 0 0 0 0 3250 1993 126.0 360.0 1.0 1
365 1 1 0 0 0 5000 2393 158.0 360.0 1.0 0
366 1 0 0 0 1 9200 0 98.0 180.0 1.0 0

367 rows × 11 columns

In [30]:
# 예측 정답을 가져와서 처리한다
y_test = dtc.predict(X_test)
y_test
Out[30]:
array([0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0,
       0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0,
       1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1], dtype=int64)
In [33]:
# 결과 값을 다시 Dataset 에 넣었음
# 다수의 고객이 발생했다고 가정하면
# 결과 값을 한번에 도출하여 처리하기 때문에
# 해당 상황을 가정하고 test 진행하였음
df_test['Loan_Status'] = y_test
df_test
Out[33]:
Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area Loan_Status
0 0 1 1 0 0 0 5720 0 110.0 360.0 1.0 2 0
1 1 1 1 1 0 0 3076 1500 126.0 360.0 1.0 2 1
2 2 1 1 2 0 0 5000 1800 208.0 360.0 1.0 2 1
3 3 1 1 2 0 0 2340 2546 100.0 360.0 1.0 2 1
4 4 1 0 0 1 0 3276 0 78.0 360.0 1.0 2 1
... ... ... ... ... ... ... ... ... ... ... ... ... ...
362 362 1 1 3 1 1 4009 1777 113.0 360.0 1.0 2 1
363 363 1 1 0 0 0 4158 709 115.0 360.0 1.0 2 1
364 364 1 0 0 0 0 3250 1993 126.0 360.0 1.0 1 1
365 365 1 1 0 0 0 5000 2393 158.0 360.0 1.0 0 0
366 366 1 0 0 0 1 9200 0 98.0 180.0 1.0 0 1

367 rows × 13 columns

In [34]:
# 현재 유저의 상황을 user 라는 변수에 담아
# 예측 값을 출력하여 고객에게 전달
user = [[1,1,0,0,0,5720,0,110.0,360.0,1.0,2]]
dtc.predict(user)
C:\Users\user\anaconda3\Lib\site-packages\sklearn\base.py:439: UserWarning: X does not have valid feature names, but DecisionTreeClassifier was fitted with feature names
  warnings.warn(
Out[34]:
array([0], dtype=int64)
In [35]:
if dtc.predict(user)[0] == 1:
    print("대출이 가능")
else:
    print("은행에 문의하세요")
은행에 문의하세요
C:\Users\user\anaconda3\Lib\site-packages\sklearn\base.py:439: UserWarning: X does not have valid feature names, but DecisionTreeClassifier was fitted with feature names
  warnings.warn(