Classification problems: used for tasks such as telecom customer churn, credit-card issuance decisions, and loan approval decisions.
Problem-solving workflow
if(!require(caret)) {install.packages("caret");library(caret)} #Classification and Regression Training
# Misc Functions of the Department of Statistics, Probability Theory Group (Formerly: E1071), TU Wien
if(!require(e1071)) {install.packages("e1071");library(e1071)}
#Fast Nearest Neighbour Search (Wraps ANN Library) Using L2 Metric
if(!require(RANN)) {install.packages("RANN");library(RANN)}
#Project MOSAIC Statistics and Mathematics Teaching Utilities
if(!require(mosaic)) {install.packages("mosaic") ; library(mosaic)}
if(!require(tidyverse)) {install.packages("tidyverse") ; library(tidyverse)}
#Graphical User Interface for Data Science in R
if(!require(rattle)) { install.packages('rattle', dependencies = TRUE) ; library(rattle)}
if(!require(rpart)) { install.packages('rpart'); library(rpart) }
#Harrell Miscellaneous
if(!require(Hmisc)) {install.packages("Hmisc") ; library(Hmisc)}
str(mobile)
## 'data.frame': 20000 obs. of 14 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ COLLEGE : int 0 1 1 0 1 0 0 1 0 0 ...
## $ INCOME : int 31953 36147 27273 120070 29215 133728 42052 84744 38171 105824 ...
## $ OVERAGE : int 0 0 230 38 208 64 224 0 0 174 ...
## $ LEFTOVER : int 6 13 0 33 85 48 0 20 7 18 ...
## $ HOUSE : int 313378 800586 305049 788235 224784 632969 697949 688098 274218 153560 ...
## $ HANDSET_PRICE : int 161 244 201 780 241 626 191 357 190 687 ...
## $ OVER_15MINS_CALLS_PER_MONTH: int 0 0 16 3 21 3 10 0 0 25 ...
## $ AVERAGE_CALL_DURATION : int 4 6 15 2 1 2 5 5 5 4 ...
## $ REPORTED_SATISFACTION : Factor w/ 5 levels "avg","sat","unsat",..: 3 3 3 3 5 3 5 5 4 4 ...
## $ REPORTED_USAGE_LEVEL : Factor w/ 5 levels "avg","high","little",..: 3 3 5 4 3 2 3 3 3 3 ...
## $ CONSIDERING_CHANGE_OF_PLAN : Factor w/ 5 levels "actively_looking_into_it",..: 4 2 5 2 3 4 1 2 1 3 ...
## $ CHURN : Factor w/ 2 levels "LEAVE","STAY": 2 2 2 1 2 2 2 2 2 1 ...
#1. Verify variable types: separate factor and numeric columns
# Check the distinct-value count; columns with fewer than 10 distinct values are candidates for factor conversion
# In the exam itself a helper function like the one below was not used
# Only a few variables were checked before skipping ahead for time; this step is worth preparing in advance for the exam room
perf_col <- matrix(nrow = ncol(mobile), ncol = 4) # per-column summary
colnames(perf_col) <- c("col_name","NA_CNT","Distinct", "class")
data_info <- Hmisc::describe(mobile) # key attributes of each column
colX <- sapply(mobile, class) # column types
i <- 1
for (a1 in data_info) {
  # column name, # missing, # distinct, class
  perf_col[i, ] <- c(a1[['descript']], a1[['counts']][[2]], a1[['counts']][[3]], colX[i])
  i <- i + 1
}
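The same summary can also be built without the explicit counter; a minimal sketch using sapply (the column labels here are chosen for illustration):
data.frame(
  NA_CNT   = sapply(mobile, function(x) sum(is.na(x))),  # missing values per column
  Distinct = sapply(mobile, function(x) length(unique(x))),  # distinct values per column
  class    = sapply(mobile, class)  # column type
)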
# Column attributes: verify which columns with Distinct <= 10 should be converted to factor
# A column whose distinct count equals the record count carries no information for analysis
perf_col
## col_name NA_CNT Distinct class
## [1,] "X" "0" "20000" "integer"
## [2,] "COLLEGE" "0" "2" "factor"
## [3,] "INCOME" "0" "18541" "integer"
## [4,] "OVERAGE" "0" "284" "integer"
## [5,] "LEFTOVER" "0" "86" "integer"
## [6,] "HOUSE" "0" "19703" "integer"
## [7,] "HANDSET_PRICE" "0" "770" "integer"
## [8,] "OVER_15MINS_CALLS_PER_MONTH" "0" "25" "integer"
## [9,] "AVERAGE_CALL_DURATION" "0" "13" "integer"
## [10,] "REPORTED_SATISFACTION" "0" "5" "factor"
## [11,] "REPORTED_USAGE_LEVEL" "0" "5" "factor"
## [12,] "CONSIDERING_CHANGE_OF_PLAN" "0" "5" "factor"
## [13,] "CHURN" "0" "2" "factor"
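A minimal sketch of the conversion the comments describe, assuming the cutoff of 10 distinct values stated above (among the integer columns in this data, only COLLEGE qualifies):
to_factor <- names(mobile)[sapply(mobile, function(x) is.numeric(x) && length(unique(x)) <= 10)]
mobile[to_factor] <- lapply(mobile[to_factor], as.factor)  # convert low-cardinality numerics to factors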
# 1) First, look at the distribution of the target variable.
table(mobile$CHURN); prop.table(table(mobile$CHURN)) # check for class imbalance (nothing unusual in this distribution)
##
## LEAVE STAY
## 9852 10148
##
## LEAVE STAY
## 0.493 0.507
Categorical variables vs. the target: checking the distributions
### 2) Check the categorical variables.
## Baseline rate: overall proportion of STAY (2nd level), used as the reference line below
CHURN.Rate <- table(mobile$CHURN)[2] / sum(table(mobile$CHURN))
# list of categorical predictors
colNames.factor <- names(mobile[, sapply(mobile, is.factor)])
colNames.factor <- colNames.factor[colNames.factor != "CHURN"] # exclude the target
## Draw each mosaic plot with X and Y swapped (predictor ~ CHURN)
## and compare it against the baseline rate
for (colname in colNames.factor) {
x1 <- paste0(colname , ' ~ CHURN')
mosaicplot(as.formula(x1), data = mobile, color = TRUE, main = colname)
abline(a= CHURN.Rate, b=0, col = 'red')
}
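As a numeric companion to the mosaic plots, a sketch that prints the LEAVE/STAY split for each level of every categorical predictor:
for (colname in colNames.factor) {
  cat("\n##", colname, "\n")
  # row-wise proportions: churn rate per level of the predictor
  print(prop.table(table(mobile[[colname]], mobile$CHURN), margin = 1))
}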
Check the distributions of the numeric variables; if a distribution looks unusual, be sure to write down your interpretation.
## [1] "X" "INCOME"
## [3] "OVERAGE" "LEFTOVER"
## [5] "HOUSE" "HANDSET_PRICE"
## [7] "OVER_15MINS_CALLS_PER_MONTH" "AVERAGE_CALL_DURATION"
# The data needs interpretation:
# zero usage, a cluster near 70, and a cluster near 200 ==> points to a business driver (some event around those two amounts)
qplot(OVERAGE, data = mobile, geom = 'histogram', bins = 50)
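To put numbers on the three clusters the histogram suggests, a sketch with eyeballed breakpoints (0 and 130 are assumptions read off the plot, not values from the source):
table(cut(mobile$OVERAGE, breaks = c(-Inf, 0, 130, Inf),
          labels = c("no overage", "cluster near 70", "cluster near 200")))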
# 4) Relationship with the target variable
# continuous vs. categorical
#==> How should INCOME be interpreted around the 100,000 mark (the point where the density curves cross)?
qplot(INCOME, data = mobile, geom = 'density', colour = CHURN)
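A quick check of what the crossover implies, splitting at an assumed threshold of 100,000 read off the plot:
prop.table(table(INCOME_over_100k = mobile$INCOME > 100000, mobile$CHURN), margin = 1)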
# Correlation matrix of the numeric columns
# (abbreviations: HSET = HANDSET_PRICE, OVER15 = OVER_15MINS_CALLS_PER_MONTH, AVGDUR = AVERAGE_CALL_DURATION)
##                 X   INCOME  OVERAGE LEFTOVER    HOUSE     HSET   OVER15   AVGDUR
## X         1.00000  0.00369 -0.00605  0.00607  0.01135 -0.00784  0.00125 -0.00583
## INCOME    0.00369  1.00000  0.00046  0.00652 -0.01096  0.72720  0.00214 -0.00722
## OVERAGE  -0.00605  0.00046  1.00000 -0.00312  0.00241  0.00032  0.77056  0.00065
## LEFTOVER  0.00607  0.00652 -0.00312  1.00000  0.00653  0.00400 -0.01041 -0.66029
## HOUSE     0.01135 -0.01096  0.00241  0.00653  1.00000 -0.00776  0.00741 -0.00936
## HSET     -0.00784  0.72720  0.00032  0.00400 -0.00776  1.00000  0.00268 -0.00519
## OVER15    0.00125  0.00214  0.77056 -0.01041  0.00741  0.00268  1.00000  0.00777
## AVGDUR   -0.00583 -0.00722  0.00065 -0.66029 -0.00936 -0.00519  0.00777  1.00000
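Three pairs stand out: INCOME ~ HANDSET_PRICE (0.73), OVERAGE ~ OVER_15MINS_CALLS_PER_MONTH (0.77), and LEFTOVER ~ AVERAGE_CALL_DURATION (-0.66). A sketch that flags such pairs automatically, assuming a |r| cutoff of 0.6:
num_cols <- sapply(mobile, is.numeric)
# returns the names of predictors caret would suggest removing
caret::findCorrelation(cor(mobile[, num_cols]), cutoff = 0.6, names = TRUE)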
#tr_idx <- sample(nrow(mobile), size=0.7 * nrow(mobile))
# createDataPartition: Data Splitting functions
tr_idx <- caret::createDataPartition(mobile$CHURN, p=0.7, list = FALSE)
trainData <- mobile[tr_idx,]
testData <- mobile[-tr_idx,]
# The LEAVE/STAY ratio is preserved across train and test: 0.493 / 0.507
prop.table(table(mobile$CHURN)) # ratio before the split
##
## LEAVE STAY
## 0.493 0.507
prop.table(table(trainData$CHURN)) # ratio in the training set
##
## LEAVE STAY
## 0.493 0.507
prop.table(table(testData$CHURN)) # ratio in the test set
##
## LEAVE STAY
## 0.493 0.507
## [1] "X" "COLLEGE"
## [3] "INCOME" "OVERAGE"
## [5] "LEFTOVER" "HOUSE"
## [7] "HANDSET_PRICE" "OVER_15MINS_CALLS_PER_MONTH"
## [9] "AVERAGE_CALL_DURATION" "REPORTED_SATISFACTION"
## [11] "REPORTED_USAGE_LEVEL" "CONSIDERING_CHANGE_OF_PLAN"
## [13] "CHURN"
#preproc <- preProcess(trainData[,-12], method= 'range') # target excluded by position
# Select by name instead, so the code is unaffected if column positions change
#TODO add material comparing pre-processing fitted before vs. after the train/test split
# (fitting on the training set only, as below, avoids leaking test-set information)
preproc <- preProcess(select(trainData, -CHURN), method = 'range') # exclude the target
preproc
## Created from 14001 samples and 12 variables
##
## Pre-processing:
## - ignored (4)
## - re-scaling to [0, 1] (8)
# As with models, the transformation is applied via predict()
# Apply the fitted pre-processing object to trainData and testData
trainData_sc <- predict(preproc, newdata = trainData)
testData_sc <- predict(preproc, newdata = testData)
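One caveat worth checking: the min/max used for scaling come from the training set only, so scaled test values can fall outside [0, 1]; a quick sketch:
range(testData_sc$INCOME) # may extend slightly below 0 or above 1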
summary(trainData_sc)
## X COLLEGE INCOME OVERAGE LEFTOVER
## Min. :0.000 0:6980 Min. :0.000 Min. :0.000 Min. :0.000
## 1st Qu.:0.250 1:7021 1st Qu.:0.158 1st Qu.:0.006 1st Qu.:0.000
## Median :0.498 Median :0.394 Median :0.181 Median :0.169
## Mean :0.499 Mean :0.430 Mean :0.260 Mean :0.269
## 3rd Qu.:0.747 3rd Qu.:0.686 3rd Qu.:0.534 3rd Qu.:0.461
## Max. :1.000 Max. :1.000 Max. :1.000 Max. :1.000
## HOUSE HANDSET_PRICE OVER_15MINS_CALLS_PER_MONTH
## Min. :0.000 Min. :0.000 Min. :0.000
## 1st Qu.:0.134 1st Qu.:0.116 1st Qu.:0.034
## Median :0.358 Median :0.254 Median :0.138
## Mean :0.405 Mean :0.337 Mean :0.275
## 3rd Qu.:0.653 3rd Qu.:0.525 3rd Qu.:0.517
## Max. :1.000 Max. :1.000 Max. :1.000
## AVERAGE_CALL_DURATION REPORTED_SATISFACTION REPORTED_USAGE_LEVEL
## Min. :0.000 avg :1403 avg : 689
## 1st Qu.:0.071 sat : 719 high :1396
## Median :0.286 unsat :2792 little :5518
## Mean :0.356 very_sat :3523 very_high :3575
## 3rd Qu.:0.643 very_unsat:5564 very_little:2823
## Max. :1.000
## CONSIDERING_CHANGE_OF_PLAN CHURN
## actively_looking_into_it:3495 LEAVE:6897
## considering :5524 STAY :7104
## never_thought :1375
## no :2849
## perhaps : 758
##
fitControl <- trainControl(method = "cv", number = 3)
# logistic regression : glm
model_glm <- train(CHURN ~ . , data = trainData_sc
, method = 'glm'
, trControl = fitControl
, tuneLength = 3) # glm has no tuning parameters, so tuneLength has no effect here
result_glm <- predict(model_glm, newdata = testData_sc)
cm_glm <- confusionMatrix(result_glm, testData_sc$CHURN, positive = "LEAVE")
# knn : knn
model_knn <- train(CHURN ~ . , data = trainData_sc
, method = 'knn'
, trControl = fitControl
, tuneLength = 3)
result_knn <- predict(model_knn, newdata = testData_sc)
cm_knn <- confusionMatrix(result_knn, testData_sc$CHURN, positive = "LEAVE")
# decision tree : rpart2
model_rpart <- train(CHURN ~ . , data = trainData_sc
, method = 'rpart2'
, trControl = fitControl
, tuneLength = 3)
result_rpart <- predict(model_rpart, newdata = testData_sc)
cm_rpart <- confusionMatrix(result_rpart, testData_sc$CHURN, positive = "LEAVE")
# randomForest : rf
model_rf <- train(CHURN ~ . , data = trainData_sc
, method = 'rf'
, trControl = fitControl
, tuneLength = 3
)
result_rf <- predict(model_rf, newdata = testData_sc)
cm_rf <- confusionMatrix(result_rf, testData_sc$CHURN, positive = "LEAVE")
# XGBoost : xgbTree
model_xgbT <- train(CHURN ~ . , data = trainData_sc
, method = 'xgbTree'
, trControl = fitControl
, tuneLength = 3)
result_xgbT <- predict(model_xgbT, newdata = testData_sc)
cm_xgbT <- confusionMatrix(result_xgbT, testData_sc$CHURN, positive = "LEAVE")
cm_xgbT
## Confusion Matrix and Statistics
##
## Reference
## Prediction LEAVE STAY
## LEAVE 2110 971
## STAY 845 2073
##
## Accuracy : 0.697
## 95% CI : (0.685, 0.709)
## No Information Rate : 0.507
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.395
##
## Mcnemar's Test P-Value : 0.00335
##
## Sensitivity : 0.714
## Specificity : 0.681
## Pos Pred Value : 0.685
## Neg Pred Value : 0.710
## Prevalence : 0.493
## Detection Rate : 0.352
## Detection Prevalence : 0.514
## Balanced Accuracy : 0.698
##
## 'Positive' Class : LEAVE
##
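Before moving on, the five models can also be compared directly on their cross-validation resamples, without touching the test set; a sketch (the list names are arbitrary labels):
res <- caret::resamples(list(glm = model_glm, knn = model_knn, rpart = model_rpart,
                             rf = model_rf, xgb = model_xgbT))
summary(res) # distribution of Accuracy and Kappa across the CV folds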
#? caret::varImp==>Calculation of variable importance for regression and classification models
# Linear Models --> the absolute value of the t-statistic
# logistic regression : glm
vi_glm <- varImp(model_glm, scale = T)
plot(vi_glm, top = 20)
library(dplyr)
## x2[order(x2$Overall, decreasing = T), ] on a one-column data frame drops to a vector and loses the row names
varImp_map <- matrix(0, nrow = 5, ncol = 1)
colnames(varImp_map) <- c("Top5_Variables")
rownames(varImp_map) <- c("GLM", "KNN", "rpart", "Random Forest", "XGbt")
# extract the top 5 most important variables
varImp_eval <- function(vi_df) {
  # subset with drop = FALSE so the row names survive the reordering
  # (dplyr::arrange silently discards data-frame row names)
  temp_df <- vi_df[order(vi_df$Overall, decreasing = TRUE), , drop = FALSE]
  return(paste0(rownames(temp_df)[1:5], collapse = " --> "))
}
varImp_map[1,] <- varImp_eval(vi_glm[[1]])
# importance for the remaining models, computed the same way as vi_glm
vi_knn <- varImp(model_knn, scale = T); vi_rpart <- varImp(model_rpart, scale = T)
vi_rf <- varImp(model_rf, scale = T); vi_xgbT <- varImp(model_xgbT, scale = T)
# knn importance is reported per class; use the LEAVE column as "Overall"
temp <- vi_knn[[1]]
temp <- select(temp, "LEAVE")
colnames(temp) <- c("Overall")
varImp_map[2,] <- varImp_eval(temp)
varImp_map[3,] <- varImp_eval(vi_rpart[[1]])
varImp_map[4,] <- varImp_eval(vi_rf[[1]])
varImp_map[5,] <- varImp_eval(vi_xgbT[[1]])
## Top 5 important variables per model
varImp_map
## Top5_Variables
## GLM "HOUSE --> OVERAGE --> LEFTOVER --> INCOME --> AVERAGE_CALL_DURATION"
## KNN "OVERAGE --> OVER_15MINS_CALLS_PER_MONTH --> HOUSE --> INCOME --> HANDSET_PRICE"
## rpart "OVERAGE --> OVER_15MINS_CALLS_PER_MONTH --> HOUSE --> INCOME --> HANDSET_PRICE"
## Random Forest "HOUSE --> INCOME --> OVERAGE --> HANDSET_PRICE --> X"
## XGbt "HOUSE --> OVERAGE --> INCOME --> LEFTOVER --> OVER_15MINS_CALLS_PER_MONTH"
#1> Comparison of the models built with each method
perf_map <- matrix(0, nrow = 5, ncol = 6)
colnames(perf_map) <- c("Accuracy", "Kappa", "Sensitivity", "Specificity", "Balanced Accuracy", "F1")
rownames(perf_map) <- c("GLM", "KNN", "rpart", "Random Forest", "XGbt")
# detailed performance metrics per model
perf_eval <- function(cm) {
Accuracy = cm$overall[1]
Kappa = cm$overall[2]
Sensitivity = cm$byClass[1]
Specificity = cm$byClass[2]
BalancedAccuracy = cm$byClass[11]
# "F1" here is the harmonic mean of sensitivity and specificity, not the
# standard precision/recall F1 (that one is available as cm$byClass["F1"])
F1 <- 2*Sensitivity*Specificity/(Sensitivity+Specificity)
return (c(Accuracy, Kappa, Sensitivity, Specificity, BalancedAccuracy, F1))
}
perf_map[1,] <- perf_eval(cm_glm)
perf_map[2,] <- perf_eval(cm_knn)
perf_map[3,] <- perf_eval(cm_rpart)
perf_map[4,] <- perf_eval(cm_rf)
perf_map[5,] <- perf_eval(cm_xgbT)
## Performance summary per model
perf_map
## Accuracy Kappa Sensitivity Specificity Balanced Accuracy F1
## GLM 0.635 0.270 0.618 0.651 0.635 0.634
## KNN 0.609 0.217 0.569 0.648 0.608 0.606
## rpart 0.689 0.380 0.824 0.557 0.691 0.665
## Random Forest 0.696 0.394 0.737 0.657 0.697 0.695
## XGbt 0.697 0.395 0.714 0.681 0.698 0.697
Assume a business cost/value for each prediction outcome and evaluate each model's expected profit.
Business value [assumptions]
# Encode the business values above as a matrix, laid out like the confusion
# matrix: rows = Prediction (LEAVE, STAY), columns = Reference (LEAVE, STAY)
bv <- matrix(c(30, 0, 30, 50), nrow = 2)
# Convert each model's report card (confusion matrix) into proportions
## prop.table turns the counts into rates
prop.table(cm_glm$table)
## Reference
## Prediction LEAVE STAY
## LEAVE 0.305 0.177
## STAY 0.188 0.331
prop.table(cm_knn$table)
## Reference
## Prediction LEAVE STAY
## LEAVE 0.280 0.179
## STAY 0.213 0.329
prop.table(cm_rpart$table)
## Reference
## Prediction LEAVE STAY
## LEAVE 0.4061 0.2249
## STAY 0.0865 0.2825
prop.table(cm_rf$table)
## Reference
## Prediction LEAVE STAY
## LEAVE 0.363 0.174
## STAY 0.129 0.333
prop.table(cm_xgbT$table)
## Reference
## Prediction LEAVE STAY
## LEAVE 0.352 0.162
## STAY 0.141 0.346
# Expected value per customer for each model: sum over cells of (proportion x business value)
sum(prop.table(cm_glm$table) * bv)
## [1] 31
sum(prop.table(cm_knn$table) * bv)
## [1] 30.2
sum(prop.table(cm_rpart$table) * bv)
## [1] 33.1
sum(prop.table(cm_rf$table) * bv)
## [1] 32.8
sum(prop.table(cm_xgbT$table) * bv)
## [1] 32.7
# Under these cost assumptions, rpart yields the highest expected value (33.1)