241014

Machine Learning

In [4]:
# 0. Split the data
# 1. Import modules
# 2. Declare the model
# 3. Train
# 4. Predict
# 5. Evaluate
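These five steps repeat in every example below. As a quick reference, here is a minimal, self-contained sketch of the same workflow on a toy synthetic dataset (the data and variable values here are illustrative only, not part of this notebook's examples):

# Minimal sketch of the 5-step workflow on toy data (illustrative only)
import numpy as np
from sklearn.model_selection import train_test_split     # 0. split
from sklearn.linear_model import LinearRegression        # 1. import
from sklearn.metrics import mean_absolute_error

x = np.arange(100).reshape(-1, 1)                        # toy feature
y = 3 * x.ravel() + 5                                    # toy target
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)

model = LinearRegression()                               # 2. declare
model.fit(x_train, y_train)                              # 3. train
y_pred = model.predict(x_test)                           # 4. predict
print(mean_absolute_error(y_test, y_pred))               # 5. evaluate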
In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings(action='ignore')
%config InlineBackend.figure_format = 'retina'

1. Ozone Data (Regression)

In [7]:
path = 'https://raw.githubusercontent.com/Jangrae/csv/master/airquality_simple.csv'
data = pd.read_csv(path)
In [8]:
data.head()
Out[8]:
Ozone Solar.R Wind Temp Month Day
0 41 190.0 7.4 67 5 1
1 36 118.0 8.0 72 5 2
2 12 149.0 12.6 74 5 3
3 18 313.0 11.5 62 5 4
4 19 NaN 14.3 56 5 5
In [9]:
data.corr().style.background_gradient()
Out[9]:
  Ozone Solar.R Wind Temp Month Day
Ozone 1.000000 0.280068 -0.605478 0.683372 0.174197 0.004419
Solar.R 0.280068 1.000000 -0.056792 0.275840 -0.075301 -0.150275
Wind -0.605478 -0.056792 1.000000 -0.457988 -0.178293 0.027181
Temp 0.683372 0.275840 -0.457988 1.000000 0.420947 -0.130593
Month 0.174197 -0.075301 -0.178293 0.420947 1.000000 -0.007962
Day 0.004419 -0.150275 0.027181 -0.130593 -0.007962 1.000000
In [10]:
sns.heatmap(data.corr(numeric_only = True),
            annot = True,
            fmt = '.2f')
plt.show()
[Figure: annotated correlation heatmap of the ozone data]
In [11]:
data.isnull().sum()
Out[11]:
Ozone      0
Solar.R    7
Wind       0
Temp       0
Month      0
Day        0
dtype: int64
In [12]:
# Fill missing values with forward fill
# (assignment instead of chained inplace=True, which is unreliable on a column selection)
data['Solar.R'] = data['Solar.R'].ffill()
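Forward fill propagates the last valid observation downward into each NaN, which suits this daily time series. A tiny self-contained illustration with toy values:

# ffill copies the previous non-missing value into each NaN
import numpy as np
import pandas as pd

s = pd.Series([190.0, np.nan, 149.0, np.nan, np.nan])
print(s.ffill().tolist())   # [190.0, 190.0, 149.0, 149.0, 149.0]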
In [13]:
# Drop columns we don't need
data.drop(columns = ['Month', 'Day'], inplace = True)
In [14]:
# 0. Split the data
from sklearn.model_selection import train_test_split

target = 'Ozone'

x = data.drop(columns = target)
y = data[target]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 1)
In [15]:
# 1. Import modules
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
In [16]:
# 2. Declare the model
model = LinearRegression()
In [17]:
# 3. Train
model.fit(x_train, y_train)
Out[17]:
LinearRegression()
In [18]:
# 4. Predict
y_pred = model.predict(x_test)
In [19]:
# 5. Evaluate
print(mean_absolute_error(y_test, y_pred))
print()

print('Predict:', y_pred[:10])
print('Actual:', y_test.values[:10])
13.976843190385708

Predict: [13.84003067  5.82919112 81.93563027 58.41267418 50.86150737 31.52971121
 66.8083547  -8.56411529 50.2136544  39.13346172]
Actual: [24 18 97 47 34 22 66 18 69 27]
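The score printed first is the mean absolute error: the average absolute gap between the two arrays above. Recomputing it by hand (a sketch reusing y_test and y_pred from the cells above):

# MAE by hand: mean of |actual - predicted|; should match the value above
print(np.mean(np.abs(y_test.values - y_pred)))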
In [20]:
# 5-1. Visualize
plt.plot(y_pred, label = 'Predict')
plt.plot(y_test.values, label = 'Actual')
plt.legend(loc = 'lower center')
plt.show()
[Figure: line plot of predicted vs. actual Ozone values]

2. Boston Housing Data (Regression)

In [22]:
path = 'https://raw.githubusercontent.com/Jangrae/csv/master/boston.csv'
data = pd.read_csv(path)
In [23]:
data.head()
Out[23]:
crim zn indus chas nox rm age dis rad tax ptratio black lstat medv
0 0.00632 18.0 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98 24.0
1 0.02731 0.0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14 21.6
2 0.02729 0.0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03 34.7
3 0.03237 0.0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94 33.4
4 0.06905 0.0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33 36.2
In [24]:
data.corr(numeric_only = True).style.background_gradient()
Out[24]:
  crim zn indus chas nox rm age dis rad tax ptratio black lstat medv
crim 1.000000 -0.200469 0.406583 -0.055892 0.420972 -0.219247 0.352734 -0.379670 0.625505 0.582764 0.289946 -0.385064 0.455621 -0.388305
zn -0.200469 1.000000 -0.533828 -0.042697 -0.516604 0.311991 -0.569537 0.664408 -0.311948 -0.314563 -0.391679 0.175520 -0.412995 0.360445
indus 0.406583 -0.533828 1.000000 0.062938 0.763651 -0.391676 0.644779 -0.708027 0.595129 0.720760 0.383248 -0.356977 0.603800 -0.483725
chas -0.055892 -0.042697 0.062938 1.000000 0.091203 0.091251 0.086518 -0.099176 -0.007368 -0.035587 -0.121515 0.048788 -0.053929 0.175260
nox 0.420972 -0.516604 0.763651 0.091203 1.000000 -0.302188 0.731470 -0.769230 0.611441 0.668023 0.188933 -0.380051 0.590879 -0.427321
rm -0.219247 0.311991 -0.391676 0.091251 -0.302188 1.000000 -0.240265 0.205246 -0.209847 -0.292048 -0.355501 0.128069 -0.613808 0.695360
age 0.352734 -0.569537 0.644779 0.086518 0.731470 -0.240265 1.000000 -0.747881 0.456022 0.506456 0.261515 -0.273534 0.602339 -0.376955
dis -0.379670 0.664408 -0.708027 -0.099176 -0.769230 0.205246 -0.747881 1.000000 -0.494588 -0.534432 -0.232471 0.291512 -0.496996 0.249929
rad 0.625505 -0.311948 0.595129 -0.007368 0.611441 -0.209847 0.456022 -0.494588 1.000000 0.910228 0.464741 -0.444413 0.488676 -0.381626
tax 0.582764 -0.314563 0.720760 -0.035587 0.668023 -0.292048 0.506456 -0.534432 0.910228 1.000000 0.460853 -0.441808 0.543993 -0.468536
ptratio 0.289946 -0.391679 0.383248 -0.121515 0.188933 -0.355501 0.261515 -0.232471 0.464741 0.460853 1.000000 -0.177383 0.374044 -0.507787
black -0.385064 0.175520 -0.356977 0.048788 -0.380051 0.128069 -0.273534 0.291512 -0.444413 -0.441808 -0.177383 1.000000 -0.366087 0.333461
lstat 0.455621 -0.412995 0.603800 -0.053929 0.590879 -0.613808 0.602339 -0.496996 0.488676 0.543993 0.374044 -0.366087 1.000000 -0.737663
medv -0.388305 0.360445 -0.483725 0.175260 -0.427321 0.695360 -0.376955 0.249929 -0.381626 -0.468536 -0.507787 0.333461 -0.737663 1.000000
In [25]:
# 0. Split the data
from sklearn.model_selection import train_test_split

target = 'medv'

x = data.drop(columns = target)
y = data[[target]]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 1)
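Note the double brackets: data[[target]] keeps y as a one-column DataFrame, whereas data[target] (as in the ozone example) gives a Series. The difference matters later, because a DataFrame's .values is 2-D and needs flattening before printing. A quick check:

print(type(data[target]))     # pandas Series    -> .values is 1-D
print(type(data[[target]]))   # pandas DataFrame -> .values is 2-D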
In [26]:
# 1. Import modules
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
In [27]:
# 2. Declare the model
model = LinearRegression()
In [28]:
# 3. Train
model.fit(x_train, y_train)
Out[28]:
LinearRegression()
In [29]:
# 4. Predict
y_pred = model.predict(x_test)
In [30]:
# 5. Evaluate
mean_absolute_error(y_test, y_pred)
Out[30]:
3.3446655035987605
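To judge whether an MAE of about 3.34 is good, compare it with a naive baseline that always predicts the training-set mean (the same 'AVG' line drawn in the next plot). A sketch reusing y_train and y_test from the cells above:

# Baseline: predict the training mean for every test row; the model should beat this
baseline = np.full(len(y_test), y_train.values.mean())
print(mean_absolute_error(y_test, baseline))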
In [31]:
# 5-1. Visualize
plt.plot(y_pred, label = 'Predict')
plt.plot(y_test.values, label = 'Actual')
plt.axhline(y_train.values.mean(), label = 'AVG', color = 'red')
plt.legend(loc = 'upper right')

plt.show()
[Figure: predicted vs. actual medv with the training-mean baseline]

3. Graduate Admission Data (Classification)

In [33]:
path = 'https://raw.githubusercontent.com/Jangrae/csv/master/admission_simple.csv'
data = pd.read_csv(path)
In [34]:
data.corr(numeric_only = True).style.background_gradient()
Out[34]:
  GRE TOEFL RANK SOP LOR GPA RESEARCH ADMIT
GRE 1.000000 0.827200 0.635376 0.613498 0.524679 0.825878 0.563398 0.701671
TOEFL 0.827200 1.000000 0.649799 0.644410 0.541563 0.810574 0.467012 0.680503
RANK 0.635376 0.649799 1.000000 0.728024 0.608651 0.705254 0.427047 0.618367
SOP 0.613498 0.644410 0.728024 1.000000 0.663707 0.712154 0.408116 0.606876
LOR 0.524679 0.541563 0.608651 0.663707 1.000000 0.637469 0.372526 0.536527
GPA 0.825878 0.810574 0.705254 0.712154 0.637469 1.000000 0.501311 0.752196
RESEARCH 0.563398 0.467012 0.427047 0.408116 0.372526 0.501311 1.000000 0.503104
ADMIT 0.701671 0.680503 0.618367 0.606876 0.536527 0.752196 0.503104 1.000000
In [35]:
sns.heatmap(data.corr(numeric_only = True),
            annot = True,
            fmt = '.2f')
plt.show()
[Figure: annotated correlation heatmap of the admission data]
In [36]:
# 0. Split the data
from sklearn.model_selection import train_test_split

target = 'ADMIT'

x = data.drop(columns = target)
y = data[[target]]

x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                    shuffle = True,    # never shuffle time-series data
                                                    stratify = y,    # preserve y's class ratio in both splits
                                                    train_size = 0.7,    # fraction that goes to train
                                                    random_state = 1)    # fixed seed for a reproducible split
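stratify=y makes the class proportions in train and test match the full data, which keeps accuracy comparable across splits. A quick check of the ratios:

# Class ratios should be (nearly) identical across the three sets
print(y['ADMIT'].value_counts(normalize=True))
print(y_train['ADMIT'].value_counts(normalize=True))
print(y_test['ADMIT'].value_counts(normalize=True))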
In [37]:
# 1. Import modules
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
In [38]:
# 2. Declare the model
model = KNeighborsClassifier()
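KNeighborsClassifier votes among the n_neighbors=5 nearest points by Euclidean distance, so features on large scales (e.g., GRE) can drown out small ones (e.g., GPA). This notebook skips scaling; a sketch of the common fix with MinMaxScaler, fit on the training set only:

# Optional preprocessing sketch (not used in this notebook): scale before distance-based KNN
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
x_train_s = scaler.fit_transform(x_train)   # learn min/max from train only
x_test_s = scaler.transform(x_test)         # apply the same scaling to test
# model.fit(x_train_s, y_train) would then train on the scaled features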
In [39]:
# 3. Train
model.fit(x_train, y_train)
Out[39]:
KNeighborsClassifier()
In [40]:
# 4. Predict
pred_y = model.predict(x_test)
In [41]:
# 5. Evaluate
accuracy_score(y_test, pred_y)    # signature is (y_true, y_pred); accuracy itself is symmetric
Out[41]:
0.8466666666666667
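Accuracy alone hides which class the errors land on. A sketch of a fuller report using the same predictions (both functions are in sklearn.metrics):

from sklearn.metrics import confusion_matrix, classification_report

print(confusion_matrix(y_test, pred_y))        # rows: actual class, columns: predicted class
print(classification_report(y_test, pred_y))   # per-class precision, recall, F1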
In [42]:
print('Predict:', pred_y[:10])

# y_test was stored as a DataFrame, so .values is 2-D; method 1: flatten by hand
nlist = list()
for i in y_test.values[:10]:
    nlist.extend(i)
nlist = np.array(nlist)

# Method 2: let NumPy flatten it
# nlist = y_test.values[:10].flatten()

print('Actual:', nlist)
Predict: [1 1 0 1 0 1 0 1 1 1]
Actual: [1 0 0 1 0 1 1 1 1 1]

4. Iris Data (Classification)

In [44]:
path = 'https://raw.githubusercontent.com/Jangrae/csv/master/iris.csv'
data = pd.read_csv(path)
In [45]:
data.corr(numeric_only = True).style.background_gradient()
Out[45]:
  Sepal.Length Sepal.Width Petal.Length Petal.Width
Sepal.Length 1.000000 -0.117570 0.871754 0.817941
Sepal.Width -0.117570 1.000000 -0.428440 -0.366126
Petal.Length 0.871754 -0.428440 1.000000 0.962865
Petal.Width 0.817941 -0.366126 0.962865 1.000000
In [46]:
data
Out[46]:
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa
... ... ... ... ... ...
145 6.7 3.0 5.2 2.3 virginica
146 6.3 2.5 5.0 1.9 virginica
147 6.5 3.0 5.2 2.0 virginica
148 6.2 3.4 5.4 2.3 virginica
149 5.9 3.0 5.1 1.8 virginica

150 rows × 5 columns

In [47]:
sns.heatmap(data.corr(numeric_only = True),
            annot = True,
            fmt = '.2f',
            vmin = -1,
            vmax = 1)
plt.show()
[Figure: annotated correlation heatmap with color scale fixed to [-1, 1]]
In [48]:
# 0. Split the data
from sklearn.model_selection import train_test_split

target = 'Species'

x = data.drop(columns = target)
y = data[[target]]    # keep y as a DataFrame

x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                    stratify = y,    # preserve class ratios
                                                    test_size = 0.3,
                                                    random_state = 1)
In [49]:
# 1. Import modules
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
In [50]:
# 2. Declare the model
model = DecisionTreeClassifier()
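With no arguments, DecisionTreeClassifier keeps splitting until its leaves are pure, which can overfit noisier data. Iris is small and clean, so the default works here; for reference, a sketch of a depth-limited tree (shallow_model is an illustrative name):

# Sketch: cap tree depth to reduce overfitting (not used in this notebook)
shallow_model = DecisionTreeClassifier(max_depth = 3, random_state = 1)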
In [51]:
# 3. Train
model.fit(x_train, y_train)
Out[51]:
DecisionTreeClassifier()
In [52]:
# 4. Predict
y_pred = model.predict(x_test)
In [53]:
# 5. Evaluate
accuracy_score(y_test, y_pred)    # signature is (y_true, y_pred)
Out[53]:
0.9777777777777777
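A fitted decision tree can be drawn, which is one of its main attractions. A sketch using sklearn.tree.plot_tree on the model trained above:

# Visualize the fitted tree: split rules, sample counts, and class per node
from sklearn.tree import plot_tree

plt.figure(figsize = (12, 8))
plot_tree(model, feature_names = list(x.columns), class_names = list(model.classes_), filled = True)
plt.show()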
In [54]:
print(y_pred[:10])
print(y_test.values[:10].flatten())    # y_test is a DataFrame (not a Series), so .values needs flattening
['virginica' 'setosa' 'setosa' 'versicolor' 'versicolor' 'versicolor'
 'virginica' 'versicolor' 'virginica' 'setosa']
['virginica' 'setosa' 'setosa' 'virginica' 'versicolor' 'versicolor'
 'virginica' 'versicolor' 'virginica' 'setosa']