241014

Machine Learning

In [4]:
# 0. Split the data
# 1. Import modules
# 2. Declare the model
# 3. Train
# 4. Predict
# 5. Evaluate
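These five steps repeat in every example below. As a quick reference, here is a minimal, self-contained sketch of the same workflow on a toy synthetic dataset (the data and variable values here are illustrative only, not part of this notebook's examples):

# Minimal sketch of the 5-step workflow on toy data (illustrative only)
import numpy as np
from sklearn.model_selection import train_test_split     # 0. split
from sklearn.linear_model import LinearRegression        # 1. import
from sklearn.metrics import mean_absolute_error

x = np.arange(100).reshape(-1, 1)                        # toy feature
y = 3 * x.ravel() + 5                                    # toy target
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)

model = LinearRegression()                               # 2. declare
model.fit(x_train, y_train)                              # 3. train
y_pred = model.predict(x_test)                           # 4. predict
print(mean_absolute_error(y_test, y_pred))               # 5. evaluate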
In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings(action='ignore')
%config InlineBackend.figure_format = 'retina'

1. Ozone Data (Regression)

In [7]:
path = 'https://raw.githubusercontent.com/Jangrae/csv/master/airquality_simple.csv'
data = pd.read_csv(path)
In [8]:
data.head()
Out[8]:
Ozone Solar.R Wind Temp Month Day
0 41 190.0 7.4 67 5 1
1 36 118.0 8.0 72 5 2
2 12 149.0 12.6 74 5 3
3 18 313.0 11.5 62 5 4
4 19 NaN 14.3 56 5 5
In [9]:
data.corr().style.background_gradient()
Out[9]:
  Ozone Solar.R Wind Temp Month Day
Ozone 1.000000 0.280068 -0.605478 0.683372 0.174197 0.004419
Solar.R 0.280068 1.000000 -0.056792 0.275840 -0.075301 -0.150275
Wind -0.605478 -0.056792 1.000000 -0.457988 -0.178293 0.027181
Temp 0.683372 0.275840 -0.457988 1.000000 0.420947 -0.130593
Month 0.174197 -0.075301 -0.178293 0.420947 1.000000 -0.007962
Day 0.004419 -0.150275 0.027181 -0.130593 -0.007962 1.000000
In [10]:
sns.heatmap(data.corr(numeric_only = True),
            annot = True,
            fmt = '.2f')
plt.show()
[Figure: annotated correlation heatmap of the ozone data]
In [11]:
data.isnull().sum()
Out[11]:
Ozone      0
Solar.R    7
Wind       0
Temp       0
Month      0
Day        0
dtype: int64
In [12]:
# Fill missing values with forward fill
# (assignment instead of chained inplace=True, which is unreliable on a column selection)
data['Solar.R'] = data['Solar.R'].ffill()
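Forward fill propagates the last valid observation downward into each NaN, which suits this daily time series. A tiny self-contained illustration with toy values:

# ffill copies the previous non-missing value into each NaN
import numpy as np
import pandas as pd

s = pd.Series([190.0, np.nan, 149.0, np.nan, np.nan])
print(s.ffill().tolist())   # [190.0, 190.0, 149.0, 149.0, 149.0]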
In [13]:
# Drop columns we don't need
data.drop(columns = ['Month', 'Day'], inplace = True)
In [14]:
# 0. Split the data
from sklearn.model_selection import train_test_split

target = 'Ozone'

x = data.drop(columns = target)
y = data[target]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 1)
In [15]:
# 1. Import modules
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
In [16]:
# 2. Declare the model
model = LinearRegression()
In [17]:
# 3. Train
model.fit(x_train, y_train)
Out[17]:
LinearRegression()
In [18]:
# 4. Predict
y_pred = model.predict(x_test)
In [19]:
# 5. Evaluate
print(mean_absolute_error(y_test, y_pred))
print()

print('Predict:', y_pred[:10])
print('Actual:', y_test.values[:10])
13.976843190385708

Predict: [13.84003067  5.82919112 81.93563027 58.41267418 50.86150737 31.52971121
 66.8083547  -8.56411529 50.2136544  39.13346172]
Actual: [24 18 97 47 34 22 66 18 69 27]
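The score printed first is the mean absolute error: the average absolute gap between the two arrays above. Recomputing it by hand (a sketch reusing y_test and y_pred from the cells above):

# MAE by hand: mean of |actual - predicted|; should match the value above
print(np.mean(np.abs(y_test.values - y_pred)))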
In [20]:
# 5-1. Visualize
plt.plot(y_pred, label = 'Predict')
plt.plot(y_test.values, label = 'Actual')
plt.legend(loc = 'lower center')
plt.show()
[Figure: line plot of predicted vs. actual Ozone values]

2. Boston Housing Data (Regression)

In [22]:
path = 'https://raw.githubusercontent.com/Jangrae/csv/master/boston.csv'
data = pd.read_csv(path)
In [23]:
data.head()
Out[23]:
crim zn indus chas nox rm age dis rad tax ptratio black lstat medv
0 0.00632 18.0 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98 24.0
1 0.02731 0.0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14 21.6
2 0.02729 0.0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03 34.7
3 0.03237 0.0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94 33.4
4 0.06905 0.0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33 36.2
In [24]:
data.corr(numeric_only = True).style.background_gradient()
Out[24]:
  crim zn indus chas nox rm age dis rad tax ptratio black lstat medv
crim 1.000000 -0.200469 0.406583 -0.055892 0.420972 -0.219247 0.352734 -0.379670 0.625505 0.582764 0.289946 -0.385064 0.455621 -0.388305
zn -0.200469 1.000000 -0.533828 -0.042697 -0.516604 0.311991 -0.569537 0.664408 -0.311948 -0.314563 -0.391679 0.175520 -0.412995 0.360445
indus 0.406583 -0.533828 1.000000 0.062938 0.763651 -0.391676 0.644779 -0.708027 0.595129 0.720760 0.383248 -0.356977 0.603800 -0.483725
chas -0.055892 -0.042697 0.062938 1.000000 0.091203 0.091251 0.086518 -0.099176 -0.007368 -0.035587 -0.121515 0.048788 -0.053929 0.175260
nox 0.420972 -0.516604 0.763651 0.091203 1.000000 -0.302188 0.731470 -0.769230 0.611441 0.668023 0.188933 -0.380051 0.590879 -0.427321
rm -0.219247 0.311991 -0.391676 0.091251 -0.302188 1.000000 -0.240265 0.205246 -0.209847 -0.292048 -0.355501 0.128069 -0.613808 0.695360
age 0.352734 -0.569537 0.644779 0.086518 0.731470 -0.240265 1.000000 -0.747881 0.456022 0.506456 0.261515 -0.273534 0.602339 -0.376955
dis -0.379670 0.664408 -0.708027 -0.099176 -0.769230 0.205246 -0.747881 1.000000 -0.494588 -0.534432 -0.232471 0.291512 -0.496996 0.249929
rad 0.625505 -0.311948 0.595129 -0.007368 0.611441 -0.209847 0.456022 -0.494588 1.000000 0.910228 0.464741 -0.444413 0.488676 -0.381626
tax 0.582764 -0.314563 0.720760 -0.035587 0.668023 -0.292048 0.506456 -0.534432 0.910228 1.000000 0.460853 -0.441808 0.543993 -0.468536
ptratio 0.289946 -0.391679 0.383248 -0.121515 0.188933 -0.355501 0.261515 -0.232471 0.464741 0.460853 1.000000 -0.177383 0.374044 -0.507787
black -0.385064 0.175520 -0.356977 0.048788 -0.380051 0.128069 -0.273534 0.291512 -0.444413 -0.441808 -0.177383 1.000000 -0.366087 0.333461
lstat 0.455621 -0.412995 0.603800 -0.053929 0.590879 -0.613808 0.602339 -0.496996 0.488676 0.543993 0.374044 -0.366087 1.000000 -0.737663
medv -0.388305 0.360445 -0.483725 0.175260 -0.427321 0.695360 -0.376955 0.249929 -0.381626 -0.468536 -0.507787 0.333461 -0.737663 1.000000
In [25]:
# 0. Split the data
from sklearn.model_selection import train_test_split

target = 'medv'

x = data.drop(columns = target)
y = data[[target]]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 1)
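Note the double brackets: data[[target]] keeps y as a one-column DataFrame, whereas data[target] (as in the ozone example) gives a Series. The difference matters later, because a DataFrame's .values is 2-D and needs flattening before printing. A quick check:

print(type(data[target]))     # pandas Series    -> .values is 1-D
print(type(data[[target]]))   # pandas DataFrame -> .values is 2-D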
In [26]:
# 1. Import modules
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
In [27]:
# 2. Declare the model
model = LinearRegression()
In [28]:
# 3. Train
model.fit(x_train, y_train)
Out[28]:
LinearRegression()
In [29]:
# 4. Predict
y_pred = model.predict(x_test)
In [30]:
# 5. Evaluate
mean_absolute_error(y_test, y_pred)
Out[30]:
3.3446655035987605
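To judge whether an MAE of about 3.34 is good, compare it with a naive baseline that always predicts the training-set mean (the same 'AVG' line drawn in the next plot). A sketch reusing y_train and y_test from the cells above:

# Baseline: predict the training mean for every test row; the model should beat this
baseline = np.full(len(y_test), y_train.values.mean())
print(mean_absolute_error(y_test, baseline))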
In [31]:
# 5-1. Visualize
plt.plot(y_pred, label = 'Predict')
plt.plot(y_test.values, label = 'Actual')
plt.axhline(y_train.values.mean(), label = 'AVG', color = 'red')
plt.legend(loc = 'upper right')

plt.show()
[Figure: predicted vs. actual medv with the training-mean baseline]

3. Graduate Admission Data (Classification)

In [33]:
path = 'https://raw.githubusercontent.com/Jangrae/csv/master/admission_simple.csv'
data = pd.read_csv(path)
In [34]:
data.corr(numeric_only = True).style.background_gradient()
Out[34]:
  GRE TOEFL RANK SOP LOR GPA RESEARCH ADMIT
GRE 1.000000 0.827200 0.635376 0.613498 0.524679 0.825878 0.563398 0.701671
TOEFL 0.827200 1.000000 0.649799 0.644410 0.541563 0.810574 0.467012 0.680503
RANK 0.635376 0.649799 1.000000 0.728024 0.608651 0.705254 0.427047 0.618367
SOP 0.613498 0.644410 0.728024 1.000000 0.663707 0.712154 0.408116 0.606876
LOR 0.524679 0.541563 0.608651 0.663707 1.000000 0.637469 0.372526 0.536527
GPA 0.825878 0.810574 0.705254 0.712154 0.637469 1.000000 0.501311 0.752196
RESEARCH 0.563398 0.467012 0.427047 0.408116 0.372526 0.501311 1.000000 0.503104
ADMIT 0.701671 0.680503 0.618367 0.606876 0.536527 0.752196 0.503104 1.000000
In [35]:
sns.heatmap(data.corr(numeric_only = True),
            annot = True,
            fmt = '.2f')
plt.show()
[Figure: annotated correlation heatmap of the admission data]
In [36]:
# 0. Split the data
from sklearn.model_selection import train_test_split

target = 'ADMIT'

x = data.drop(columns = target)
y = data[[target]]

x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                    shuffle = True,    # never shuffle time-series data
                                                    stratify = y,    # preserve y's class ratio in both splits
                                                    train_size = 0.7,    # fraction that goes to train
                                                    random_state = 1)    # fixed seed for a reproducible split
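stratify=y makes the class proportions in train and test match the full data, which keeps accuracy comparable across splits. A quick check of the ratios:

# Class ratios should be (nearly) identical across the three sets
print(y['ADMIT'].value_counts(normalize=True))
print(y_train['ADMIT'].value_counts(normalize=True))
print(y_test['ADMIT'].value_counts(normalize=True))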
In [37]:
# 1. Import modules
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
In [38]:
# 2. Declare the model
model = KNeighborsClassifier()
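KNeighborsClassifier votes among the n_neighbors=5 nearest points by Euclidean distance, so features on large scales (e.g., GRE) can drown out small ones (e.g., GPA). This notebook skips scaling; a sketch of the common fix with MinMaxScaler, fit on the training set only:

# Optional preprocessing sketch (not used in this notebook): scale before distance-based KNN
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
x_train_s = scaler.fit_transform(x_train)   # learn min/max from train only
x_test_s = scaler.transform(x_test)         # apply the same scaling to test
# model.fit(x_train_s, y_train) would then train on the scaled features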
In [39]:
# 3. Train
model.fit(x_train, y_train)
Out[39]:
KNeighborsClassifier()
In [40]:
# 4. Predict
pred_y = model.predict(x_test)
In [41]:
# 5. Evaluate
accuracy_score(y_test, pred_y)    # signature is (y_true, y_pred); accuracy itself is symmetric
Out[41]:
0.8466666666666667
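Accuracy alone hides which class the errors land on. A sketch of a fuller report using the same predictions (both functions are in sklearn.metrics):

from sklearn.metrics import confusion_matrix, classification_report

print(confusion_matrix(y_test, pred_y))        # rows: actual class, columns: predicted class
print(classification_report(y_test, pred_y))   # per-class precision, recall, F1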
In [42]:
print('Predict:', pred_y[:10])

# y_test was stored as a DataFrame, so .values is 2-D; method 1: flatten by hand
nlist = list()
for i in y_test.values[:10]:
    nlist.extend(i)
nlist = np.array(nlist)

# Method 2: let NumPy flatten it
# nlist = y_test.values[:10].flatten()

print('Actual:', nlist)
Predict: [1 1 0 1 0 1 0 1 1 1]
Actual: [1 0 0 1 0 1 1 1 1 1]

4. Iris Data (Classification)

In [44]:
path = 'https://raw.githubusercontent.com/Jangrae/csv/master/iris.csv'
data = pd.read_csv(path)
In [45]:
data.corr(numeric_only = True).style.background_gradient()
Out[45]:
  Sepal.Length Sepal.Width Petal.Length Petal.Width
Sepal.Length 1.000000 -0.117570 0.871754 0.817941
Sepal.Width -0.117570 1.000000 -0.428440 -0.366126
Petal.Length 0.871754 -0.428440 1.000000 0.962865
Petal.Width 0.817941 -0.366126 0.962865 1.000000
In [46]:
data
Out[46]:
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa
... ... ... ... ... ...
145 6.7 3.0 5.2 2.3 virginica
146 6.3 2.5 5.0 1.9 virginica
147 6.5 3.0 5.2 2.0 virginica
148 6.2 3.4 5.4 2.3 virginica
149 5.9 3.0 5.1 1.8 virginica

150 rows × 5 columns

In [47]:
sns.heatmap(data.corr(numeric_only = True),
            annot = True,
            fmt = '.2f',
            vmin = -1,
            vmax = 1)
plt.show()
[Figure: annotated correlation heatmap with color scale fixed to [-1, 1]]
In [48]:
# 0. Split the data
from sklearn.model_selection import train_test_split

target = 'Species'

x = data.drop(columns = target)
y = data[[target]]    # keep y as a DataFrame

x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                    stratify = y,    # preserve class ratios
                                                    test_size = 0.3,
                                                    random_state = 1)
In [49]:
# 1. Import modules
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
In [50]:
# 2. Declare the model
model = DecisionTreeClassifier()
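With no arguments, DecisionTreeClassifier keeps splitting until its leaves are pure, which can overfit noisier data. Iris is small and clean, so the default works here; for reference, a sketch of a depth-limited tree (shallow_model is an illustrative name):

# Sketch: cap tree depth to reduce overfitting (not used in this notebook)
shallow_model = DecisionTreeClassifier(max_depth = 3, random_state = 1)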
In [51]:
# 3. Train
model.fit(x_train, y_train)
Out[51]:
DecisionTreeClassifier()
In [52]:
# 4. Predict
y_pred = model.predict(x_test)
In [53]:
# 5. Evaluate
accuracy_score(y_test, y_pred)    # signature is (y_true, y_pred)
Out[53]:
0.9777777777777777
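A fitted decision tree can be drawn, which is one of its main attractions. A sketch using sklearn.tree.plot_tree on the model trained above:

# Visualize the fitted tree: split rules, sample counts, and class per node
from sklearn.tree import plot_tree

plt.figure(figsize = (12, 8))
plot_tree(model, feature_names = list(x.columns), class_names = list(model.classes_), filled = True)
plt.show()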
In [54]:
print(y_pred[:10])
print(y_test.values[:10].flatten())    # y_test is a DataFrame (not a Series), so .values needs flattening
['virginica' 'setosa' 'setosa' 'versicolor' 'versicolor' 'versicolor'
 'virginica' 'versicolor' 'virginica' 'setosa']
['virginica' 'setosa' 'setosa' 'virginica' 'versicolor' 'versicolor'
 'virginica' 'versicolor' 'virginica' 'setosa']