241014
Machine Learning
In [4]:
# 0. Split the data
# 1. Import modules
# 2. Declare the model
# 3. Fit the model
# 4. Predict
# 5. Evaluate
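Every dataset below walks through this same recipe. As a minimal sketch of the pattern (here `data` and the target column `'y'` are placeholders, not one of the datasets used later):

# Generic 0-5 workflow (sketch; data and 'y' are placeholders)
from sklearn.model_selection import train_test_split   # 0. split the data
from sklearn.linear_model import LinearRegression      # 1. import modules
from sklearn.metrics import mean_absolute_error

x_train, x_test, y_train, y_test = train_test_split(
    data.drop(columns = 'y'), data['y'], test_size = 0.3, random_state = 1)

model = LinearRegression()                  # 2. declare the model
model.fit(x_train, y_train)                 # 3. fit on the training split
y_pred = model.predict(x_test)              # 4. predict on the test split
print(mean_absolute_error(y_test, y_pred))  # 5. evaluate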
In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings(action='ignore')
%config InlineBackend.figure_format = 'retina'
1. Ozone Data (Regression)
In [7]:
path = 'https://raw.githubusercontent.com/Jangrae/csv/master/airquality_simple.csv'
data = pd.read_csv(path)
In [8]:
data.head()
Out[8]:
| | Ozone | Solar.R | Wind | Temp | Month | Day |
---|---|---|---|---|---|---|
0 | 41 | 190.0 | 7.4 | 67 | 5 | 1 |
1 | 36 | 118.0 | 8.0 | 72 | 5 | 2 |
2 | 12 | 149.0 | 12.6 | 74 | 5 | 3 |
3 | 18 | 313.0 | 11.5 | 62 | 5 | 4 |
4 | 19 | NaN | 14.3 | 56 | 5 | 5 |
In [9]:
data.corr().style.background_gradient()
Out[9]:
| | Ozone | Solar.R | Wind | Temp | Month | Day |
---|---|---|---|---|---|---|
Ozone | 1.000000 | 0.280068 | -0.605478 | 0.683372 | 0.174197 | 0.004419 |
Solar.R | 0.280068 | 1.000000 | -0.056792 | 0.275840 | -0.075301 | -0.150275 |
Wind | -0.605478 | -0.056792 | 1.000000 | -0.457988 | -0.178293 | 0.027181 |
Temp | 0.683372 | 0.275840 | -0.457988 | 1.000000 | 0.420947 | -0.130593 |
Month | 0.174197 | -0.075301 | -0.178293 | 0.420947 | 1.000000 | -0.007962 |
Day | 0.004419 | -0.150275 | 0.027181 | -0.130593 | -0.007962 | 1.000000 |
In [10]:
sns.heatmap(data.corr(numeric_only = True),
            annot = True,
            fmt = '.2f')
plt.show()
In [11]:
data.isnull().sum()
Out[11]:
Ozone      0
Solar.R    7
Wind       0
Temp       0
Month      0
Day        0
dtype: int64
In [12]:
# Fill missing values (forward fill: carry the previous day's reading forward)
data['Solar.R'] = data['Solar.R'].ffill()
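Forward fill suits this data because the rows are ordered by day, so the previous day's reading is a reasonable stand-in. Other common strategies, as a sketch (pick one per column, not all at once):

# Alternative missing-value strategies (sketch)
data_drop = data.dropna()                                     # drop rows with any NaN
data_mean = data.fillna({'Solar.R': data['Solar.R'].mean()})  # fill with the column mean
data_interp = data.interpolate()                              # linear interpolation between neighbors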
In [13]:
# Drop columns not needed as features
data.drop(columns = ['Month', 'Day'], inplace = True)
In [14]:
# 0. Split the data
from sklearn.model_selection import train_test_split
target = 'Ozone'
x = data.drop(columns = target)
y = data[target]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 1)
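test_size=0.3 reserves 30% of the rows for evaluation, and random_state=1 makes the split reproducible. A quick shape check confirms what the split produced (sketch):

# Verify the 70/30 split
print(x_train.shape, x_test.shape)   # remaining features: Solar.R, Wind, Temp
print(y_train.shape, y_test.shape)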
In [15]:
# 1. Import modules
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
In [16]:
# 2. Declare the model
model = LinearRegression()
In [17]:
# 3. Fit the model
model.fit(x_train, y_train)
Out[17]:
LinearRegression()
In [18]:
# 4. Predict
y_pred = model.predict(x_test)
In [19]:
# 5. Evaluate
print(mean_absolute_error(y_test, y_pred))
print()
print('Predict:', y_pred[:10])
print('Actual:', y_test.values[:10])
13.976843190385708

Predict: [13.84003067 5.82919112 81.93563027 58.41267418 50.86150737 31.52971121 66.8083547 -8.56411529 50.2136544 39.13346172]
Actual: [24 18 97 47 34 22 66 18 69 27]
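An MAE of about 14 means the predicted ozone level is off by roughly 14 units on average. The metric is just the mean of the absolute differences, which can be verified by hand (sketch):

# Recompute MAE manually: mean of |actual - predicted|
mae_manual = np.mean(np.abs(y_test.values - y_pred))
print(mae_manual)   # matches mean_absolute_error(y_test, y_pred)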
In [20]:
# 5-1. Visualize
plt.plot(y_pred, label = 'Predict')
plt.plot(y_test.values, label = 'Actual')
plt.legend(loc = 'lower center')
plt.show()
2. Boston Housing Data (Regression)
In [22]:
path = 'https://raw.githubusercontent.com/Jangrae/csv/master/boston.csv'
data = pd.read_csv(path)
In [23]:
data.head()
Out[23]:
| | crim | zn | indus | chas | nox | rm | age | dis | rad | tax | ptratio | black | lstat | medv |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.00632 | 18.0 | 2.31 | 0 | 0.538 | 6.575 | 65.2 | 4.0900 | 1 | 296 | 15.3 | 396.90 | 4.98 | 24.0 |
1 | 0.02731 | 0.0 | 7.07 | 0 | 0.469 | 6.421 | 78.9 | 4.9671 | 2 | 242 | 17.8 | 396.90 | 9.14 | 21.6 |
2 | 0.02729 | 0.0 | 7.07 | 0 | 0.469 | 7.185 | 61.1 | 4.9671 | 2 | 242 | 17.8 | 392.83 | 4.03 | 34.7 |
3 | 0.03237 | 0.0 | 2.18 | 0 | 0.458 | 6.998 | 45.8 | 6.0622 | 3 | 222 | 18.7 | 394.63 | 2.94 | 33.4 |
4 | 0.06905 | 0.0 | 2.18 | 0 | 0.458 | 7.147 | 54.2 | 6.0622 | 3 | 222 | 18.7 | 396.90 | 5.33 | 36.2 |
In [24]:
data.corr(numeric_only = True).style.background_gradient()
Out[24]:
| | crim | zn | indus | chas | nox | rm | age | dis | rad | tax | ptratio | black | lstat | medv |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
crim | 1.000000 | -0.200469 | 0.406583 | -0.055892 | 0.420972 | -0.219247 | 0.352734 | -0.379670 | 0.625505 | 0.582764 | 0.289946 | -0.385064 | 0.455621 | -0.388305 |
zn | -0.200469 | 1.000000 | -0.533828 | -0.042697 | -0.516604 | 0.311991 | -0.569537 | 0.664408 | -0.311948 | -0.314563 | -0.391679 | 0.175520 | -0.412995 | 0.360445 |
indus | 0.406583 | -0.533828 | 1.000000 | 0.062938 | 0.763651 | -0.391676 | 0.644779 | -0.708027 | 0.595129 | 0.720760 | 0.383248 | -0.356977 | 0.603800 | -0.483725 |
chas | -0.055892 | -0.042697 | 0.062938 | 1.000000 | 0.091203 | 0.091251 | 0.086518 | -0.099176 | -0.007368 | -0.035587 | -0.121515 | 0.048788 | -0.053929 | 0.175260 |
nox | 0.420972 | -0.516604 | 0.763651 | 0.091203 | 1.000000 | -0.302188 | 0.731470 | -0.769230 | 0.611441 | 0.668023 | 0.188933 | -0.380051 | 0.590879 | -0.427321 |
rm | -0.219247 | 0.311991 | -0.391676 | 0.091251 | -0.302188 | 1.000000 | -0.240265 | 0.205246 | -0.209847 | -0.292048 | -0.355501 | 0.128069 | -0.613808 | 0.695360 |
age | 0.352734 | -0.569537 | 0.644779 | 0.086518 | 0.731470 | -0.240265 | 1.000000 | -0.747881 | 0.456022 | 0.506456 | 0.261515 | -0.273534 | 0.602339 | -0.376955 |
dis | -0.379670 | 0.664408 | -0.708027 | -0.099176 | -0.769230 | 0.205246 | -0.747881 | 1.000000 | -0.494588 | -0.534432 | -0.232471 | 0.291512 | -0.496996 | 0.249929 |
rad | 0.625505 | -0.311948 | 0.595129 | -0.007368 | 0.611441 | -0.209847 | 0.456022 | -0.494588 | 1.000000 | 0.910228 | 0.464741 | -0.444413 | 0.488676 | -0.381626 |
tax | 0.582764 | -0.314563 | 0.720760 | -0.035587 | 0.668023 | -0.292048 | 0.506456 | -0.534432 | 0.910228 | 1.000000 | 0.460853 | -0.441808 | 0.543993 | -0.468536 |
ptratio | 0.289946 | -0.391679 | 0.383248 | -0.121515 | 0.188933 | -0.355501 | 0.261515 | -0.232471 | 0.464741 | 0.460853 | 1.000000 | -0.177383 | 0.374044 | -0.507787 |
black | -0.385064 | 0.175520 | -0.356977 | 0.048788 | -0.380051 | 0.128069 | -0.273534 | 0.291512 | -0.444413 | -0.441808 | -0.177383 | 1.000000 | -0.366087 | 0.333461 |
lstat | 0.455621 | -0.412995 | 0.603800 | -0.053929 | 0.590879 | -0.613808 | 0.602339 | -0.496996 | 0.488676 | 0.543993 | 0.374044 | -0.366087 | 1.000000 | -0.737663 |
medv | -0.388305 | 0.360445 | -0.483725 | 0.175260 | -0.427321 | 0.695360 | -0.376955 | 0.249929 | -0.381626 | -0.468536 | -0.507787 | 0.333461 | -0.737663 | 1.000000 |
In [25]:
# 0. Split the data
from sklearn.model_selection import train_test_split
target = 'medv'
x = data.drop(columns = target)
y = data[[target]]  # double brackets keep y as a DataFrame (2-D)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 1)
In [26]:
# 1. Import modules
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
In [27]:
# 2. Declare the model
model = LinearRegression()
In [28]:
# 3. Fit the model
model.fit(x_train, y_train)
Out[28]:
LinearRegression()
In [29]:
# 4. Predict
y_pred = model.predict(x_test)
In [30]:
# 5. Evaluate
mean_absolute_error(y_test, y_pred)
Out[30]:
3.3446655035987605
In [31]:
# 5-1. Visualize
plt.plot(y_pred, label = 'Predict')
plt.plot(y_test.values, label = 'Actual')
plt.axhline(y_train.values.mean(), label = 'AVG', color = 'red')
plt.legend(loc = 'upper right')
plt.show()
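The red AVG line doubles as a sanity check: a regression model should at least beat the naive strategy of always predicting the training-set mean. That baseline's MAE can be computed for comparison (sketch):

# Naive baseline: always predict the training-set mean (the red AVG line)
baseline_pred = np.full(len(y_test), y_train.values.mean())
print(mean_absolute_error(y_test, baseline_pred))   # compare with the model's ~3.34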
3. Graduate Admission Data (Classification)
In [33]:
path = 'https://raw.githubusercontent.com/Jangrae/csv/master/admission_simple.csv'
data = pd.read_csv(path)
In [34]:
data.corr(numeric_only = True).style.background_gradient()
Out[34]:
| | GRE | TOEFL | RANK | SOP | LOR | GPA | RESEARCH | ADMIT |
---|---|---|---|---|---|---|---|---|
GRE | 1.000000 | 0.827200 | 0.635376 | 0.613498 | 0.524679 | 0.825878 | 0.563398 | 0.701671 |
TOEFL | 0.827200 | 1.000000 | 0.649799 | 0.644410 | 0.541563 | 0.810574 | 0.467012 | 0.680503 |
RANK | 0.635376 | 0.649799 | 1.000000 | 0.728024 | 0.608651 | 0.705254 | 0.427047 | 0.618367 |
SOP | 0.613498 | 0.644410 | 0.728024 | 1.000000 | 0.663707 | 0.712154 | 0.408116 | 0.606876 |
LOR | 0.524679 | 0.541563 | 0.608651 | 0.663707 | 1.000000 | 0.637469 | 0.372526 | 0.536527 |
GPA | 0.825878 | 0.810574 | 0.705254 | 0.712154 | 0.637469 | 1.000000 | 0.501311 | 0.752196 |
RESEARCH | 0.563398 | 0.467012 | 0.427047 | 0.408116 | 0.372526 | 0.501311 | 1.000000 | 0.503104 |
ADMIT | 0.701671 | 0.680503 | 0.618367 | 0.606876 | 0.536527 | 0.752196 | 0.503104 | 1.000000 |
In [35]:
sns.heatmap(data.corr(numeric_only = True),
            annot = True,
            fmt = '.2f')
plt.show()
In [36]:
# 0. Split the data
from sklearn.model_selection import train_test_split
target = 'ADMIT'
x = data.drop(columns = target)
y = data[[target]]
x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                    shuffle = True,    # never shuffle time-series data
                                                    stratify = y,      # preserve the class ratio of y in both splits
                                                    train_size = 0.7,  # fraction of rows used for training
                                                    random_state = 1)  # fixed seed for a reproducible split
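Because stratify=y was passed, the admit/reject ratio is preserved in both splits, which keeps the test set representative when classes are imbalanced. A quick check (sketch):

# Class proportions should be (nearly) identical across the splits
print(y_train['ADMIT'].value_counts(normalize = True))
print(y_test['ADMIT'].value_counts(normalize = True))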
In [37]:
# 1. Import modules
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
In [38]:
# 2. Declare the model
model = KNeighborsClassifier()
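KNeighborsClassifier predicts by majority vote among the k nearest training points (k=5 by default). Since it is distance-based, features on large scales such as GRE can dominate the distance; a common refinement, sketched below with scikit-learn's MinMaxScaler and not done in this notebook, is to normalize the features first:

# Optional refinement: scale features before KNN
# (fit the scaler on the training split only, to avoid leakage)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
x_train_s = scaler.fit_transform(x_train)
x_test_s = scaler.transform(x_test)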
In [39]:
# 3. Fit the model
model.fit(x_train, y_train)
Out[39]:
KNeighborsClassifier()
In [40]:
# 4. Predict
y_pred = model.predict(x_test)
In [41]:
# 5. Evaluate
accuracy_score(y_test, y_pred)
Out[41]:
0.8466666666666667
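Accuracy alone does not show which class the remaining ~15% of errors fall on; a confusion matrix and per-class report make that visible (sketch):

# Break the errors down by class: rows = actual, columns = predicted
from sklearn.metrics import confusion_matrix, classification_report
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))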
In [42]:
print('Predict:', y_pred[:10])
# y_test was saved as a DataFrame, so its values are 2-D; method 1: flatten by hand
nlist = list()
for i in y_test.values[:10]:
    nlist.extend(i)
nlist = np.array(nlist)
# Method 2 (equivalent one-liner):
# nlist = y_test.values[:10].flatten()
print('Actual:', nlist)
Predict: [1 1 0 1 0 1 0 1 1 1]
Actual: [1 0 0 1 0 1 1 1 1 1]
4. Iris Classification (Classification)
In [44]:
path = 'https://raw.githubusercontent.com/Jangrae/csv/master/iris.csv'
data = pd.read_csv(path)
In [45]:
data.corr(numeric_only = True).style.background_gradient()
Out[45]:
| | Sepal.Length | Sepal.Width | Petal.Length | Petal.Width |
---|---|---|---|---|
Sepal.Length | 1.000000 | -0.117570 | 0.871754 | 0.817941 |
Sepal.Width | -0.117570 | 1.000000 | -0.428440 | -0.366126 |
Petal.Length | 0.871754 | -0.428440 | 1.000000 | 0.962865 |
Petal.Width | 0.817941 | -0.366126 | 0.962865 | 1.000000 |
In [46]:
data
Out[46]:
| | Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species |
---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
... | ... | ... | ... | ... | ... |
145 | 6.7 | 3.0 | 5.2 | 2.3 | virginica |
146 | 6.3 | 2.5 | 5.0 | 1.9 | virginica |
147 | 6.5 | 3.0 | 5.2 | 2.0 | virginica |
148 | 6.2 | 3.4 | 5.4 | 2.3 | virginica |
149 | 5.9 | 3.0 | 5.1 | 1.8 | virginica |
150 rows × 5 columns
In [47]:
sns.heatmap(data.corr(numeric_only = True),
            annot = True,
            fmt = '.2f',
            vmin = -1,
            vmax = 1)
plt.show()
In [48]:
# 0. Split the data
from sklearn.model_selection import train_test_split
target = 'Species'
x = data.drop(columns = target)
y = data[[target]]  # double brackets keep y as a DataFrame
x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                    stratify = y,  # preserve the class ratio of y
                                                    test_size = 0.3,
                                                    random_state = 1)
In [49]:
# 1. Import modules
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
In [50]:
# 2. Declare the model
model = DecisionTreeClassifier()
In [51]:
# 3. Fit the model
model.fit(x_train, y_train)
Out[51]:
DecisionTreeClassifier()
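By default DecisionTreeClassifier grows until every leaf is pure, which risks overfitting; limiting growth with max_depth is the usual guard. The fitted tree's split rules can also be inspected with scikit-learn's plot_tree (sketch):

# Visualize the fitted tree's split rules
from sklearn.tree import plot_tree
plt.figure(figsize = (12, 8))
plot_tree(model, feature_names = list(x.columns),
          class_names = sorted(y[target].unique()), filled = True)
plt.show()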
In [52]:
# 4. Predict
y_pred = model.predict(x_test)
In [53]:
# 5. Evaluate
accuracy_score(y_test, y_pred)
Out[53]:
0.9777777777777777
In [54]:
print(y_pred[:10])
print(y_test.values[:10].flatten())  # y_test is a DataFrame (not a Series), so flatten its 2-D values
['virginica' 'setosa' 'setosa' 'versicolor' 'versicolor' 'versicolor' 'virginica' 'versicolor' 'virginica' 'setosa']
['virginica' 'setosa' 'setosa' 'virginica' 'versicolor' 'versicolor' 'virginica' 'versicolor' 'virginica' 'setosa']
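The one visible mismatch (4th sample: predicted versicolor, actually virginica) is consistent with the 0.978 accuracy, i.e. 44 of the 45 test samples classified correctly. For a per-class breakdown (sketch):

# Per-class precision and recall for the three species
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))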