seaborn을 사용한 데이터 분포 시각화¶

Matplotlib을 기반으로 다양한 색상 테마와 통계용 차트 등의 기능을 추가한 시각화 패키지
기본적인 시각화 기능은 Matplotlib 패키지에 의존하며 통계 기능은 Statsmodels 패키지에 의존
Matplotlib과 함께 사용
https://seaborn.pydata.org/index.html
https://datascienceschool.net/view-notebook/4c2d5ff1caab4b21a708cc662137bc65/

# pip install seaborn

Collecting seaborn
  Downloading seaborn-0.10.1-py3-none-any.whl (215 kB)
Requirement already satisfied: numpy>=1.13.3 in c:\users\205\.conda\envs\r_study\lib\site-packages (from seaborn) (1.19.0)
Requirement already satisfied: matplotlib>=2.1.2 in c:\users\205\.conda\envs\r_study\lib\site-packages (from seaborn) (3.3.0)
Collecting scipy>=1.0.1
  Downloading scipy-1.5.1-cp38-cp38-win_amd64.whl (31.4 MB)
Requirement already satisfied: pandas>=0.22.0 in c:\users\205\.conda\envs\r_study\lib\site-packages (from seaborn) (1.0.5)
Requirement already satisfied: pillow>=6.2.0 in c:\users\205\.conda\envs\r_study\lib\site-packages (from matplotlib>=2.1.2->seaborn) (7.2.0)
Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\205\.conda\envs\r_study\lib\site-packages (from matplotlib>=2.1.2->seaborn) (1.2.0)
Requirement already satisfied: cycler>=0.10 in c:\users\205\.conda\envs\r_study\lib\site-packages (from matplotlib>=2.1.2->seaborn) (0.10.0)
Requirement already satisfied: python-dateutil>=2.1 in c:\users\205\.conda\envs\r_study\lib\site-packages (from matplotlib>=2.1.2->seaborn) (2.8.1)
Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in c:\users\205\.conda\envs\r_study\lib\site-packages (from matplotlib>=2.1.2->seaborn) (2.4.7)
Requirement already satisfied: pytz>=2017.2 in c:\users\205\.conda\envs\r_study\lib\site-packages (from pandas>=0.22.0->seaborn) (2020.1)
Requirement already satisfied: six in c:\users\205\.conda\envs\r_study\lib\site-packages (from cycler>=0.10->matplotlib>=2.1.2->seaborn) (1.15.0)
Installing collected packages: scipy, seaborn
Successfully installed scipy-1.5.1 seaborn-0.10.1
Note: you may need to restart the kernel to use updated packages.

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

import numpy as np

# Sample 100 evenly spaced points on [0, 14] and build four sine curves.
# Curve k has amplitude k and is phase-shifted by 0.5 * (k - 1).
x = np.linspace(0, 14, num=100)
y1, y2, y3, y4 = (
    amplitude * np.sin(x + phase)
    for amplitude, phase in ((1, 0.0), (2, 0.5), (3, 1.0), (4, 1.5))
)

# Draw the four sine curves on one 10x6-inch figure with the current style.
plt.figure(figsize=(10, 6))
for curve in (y1, y2, y3, y4):
    plt.plot(x, curve)
plt.show()

# The plot background can be changed via the seaborn style
# (e.g. "white", "whitegrid", ...); here the "dark" style is used.
sns.set_style("dark")

plt.figure(figsize=(10, 6))
for curve in (y1, y2, y3, y4):
    plt.plot(x, curve)

# Remove the axis borders that are not needed.
sns.despine()

plt.show()

# Same curves again, this time on the "whitegrid" style.
sns.set_style("whitegrid")

plt.figure(figsize=(10, 6))
for curve in (y1, y2, y3, y4):
    plt.plot(x, curve)

# offset: option that moves the x/y axis lines away from the plot area.
sns.despine(offset=10)

plt.show()

예제) 레스토랑 tip 통계¶

tips : 요일, 식사 시간(점심/저녁), 흡연 여부별로 식사 금액과 팁을 정리한 데이터

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

sns.set_style("whitegrid")
%matplotlib inline

# tips: meal bills and tips, recorded together with the day of the week,
# the meal time (lunch/dinner) and whether the party smoked.
tips = sns.load_dataset("tips")
tips.head()  # preview the first 5 rows (head's default)

데이터 이상치 분석¶

sns.set_style("whitegrid")

# Box plot of the total_bill column on its own, to check for outliers.
plt.figure(figsize=(8, 6))
sns.boxplot(data=tips, x="total_bill")
plt.show()

# One total_bill box per day of the week.
plt.figure(figsize=(8, 6))
sns.boxplot(data=tips, x="day", y="total_bill")
plt.show()

# Observation: total bills are higher on weekends.

plt.figure(figsize=(8, 6))

# hue: splits each day's box into groups (here by smoker status).
sns.boxplot(
    data=tips,
    x="day",
    y="total_bill",
    hue="smoker",
    palette="Set3",
)

plt.show()

plt.figure(figsize=(8, 6))

# swarmplot is similar to stripplot, but shifts the points sideways
# so that the markers do not overlap.
sns.swarmplot(data=tips, x="day", y="total_bill", color=".5")
plt.show()

# Overlay the individual observations (swarm) on top of the per-day boxes.
plt.figure(figsize=(8, 6))
sns.boxplot(data=tips, x="day", y="total_bill")
sns.swarmplot(data=tips, x="day", y="total_bill", color=".25")
plt.show()

데이터 회귀 분석¶

https://stricky.tistory.com/124

# Regression analysis: plot tip against total_bill with seaborn's lmplot.
sns.set_style("darkgrid")
# seaborn renamed the `size` parameter to `height`; passing `size` triggers
# a "please update your code" UserWarning, so the new name is used here.
sns.lmplot(x="total_bill", y="tip", data=tips, height=7)
plt.show()

C:\Users\205\.conda\envs\r_study\lib\site-packages\seaborn\regression.py:573: UserWarning: The `size` parameter has been renamed to `height`; please update your code.
  warnings.warn(msg, UserWarning)

# hue: option that groups the observations (here by smoker status).
# `height` is the current name of the former `size` parameter, which
# seaborn deprecated with a UserWarning.
sns.lmplot(x="total_bill", y="tip", hue="smoker", data=tips, height=7)
plt.show()

C:\Users\205\.conda\envs\r_study\lib\site-packages\seaborn\regression.py:573: UserWarning: The `size` parameter has been renamed to `height`; please update your code.
  warnings.warn(msg, UserWarning)

# palette: use one of the predefined color sets (here "Set1").
# `height` is the current name of the former `size` parameter, which
# seaborn deprecated with a UserWarning.
sns.lmplot(
    x="total_bill",
    y="tip",
    hue="smoker",
    data=tips,
    palette="Set1",
    height=7,
)
plt.show()

C:\Users\205\.conda\envs\r_study\lib\site-packages\seaborn\regression.py:573: UserWarning: The `size` parameter has been renamed to `height`; please update your code.
  warnings.warn(msg, UserWarning)

heatmap 그리기¶

https://rfriend.tistory.com/419
sns.heatmap()은 2차원 배열 또는 2차원 피벗 테이블 형태로 집계된 DataFrame을 입력으로 받음 -> 데이터를 이 형태로 만들어 두면 sns.heatmap() 함수로 매우 간단하게 히트맵을 그릴 수 있음

# Draw random data: a 10x12 array of uniform samples.
uniform_data = np.random.random_sample((10, 12))
uniform_data

array([[0.76521694, 0.8509849 , 0.81710593, 0.09330002, 0.39397471,
        0.47531479, 0.51715782, 0.58600019, 0.4039119 , 0.73746222,
        0.51223502, 0.21077428],
       [0.63214211, 0.51223726, 0.34767351, 0.07271042, 0.29931021,
        0.35465967, 0.84116496, 0.19761945, 0.55309399, 0.10456428,
        0.98481457, 0.98329438],
       [0.24504171, 0.48692868, 0.84778432, 0.24167111, 0.6908898 ,
        0.44471907, 0.43938796, 0.06797726, 0.44640081, 0.98473936,
        0.95377681, 0.84856707],
       [0.7044088 , 0.42607806, 0.0631966 , 0.86203469, 0.31118263,
        0.56147755, 0.44264132, 0.48731698, 0.81740754, 0.67588092,
        0.3249606 , 0.21319456],
       [0.87071639, 0.58983575, 0.3235015 , 0.36999525, 0.57332803,
        0.24369277, 0.9088101 , 0.13586679, 0.4690908 , 0.24945623,
        0.23434014, 0.52714381],
       [0.05931859, 0.64437612, 0.43216454, 0.79086532, 0.47078407,
        0.76850849, 0.85315332, 0.42076962, 0.11365723, 0.37509032,
        0.10270419, 0.27443537],
       [0.0080614 , 0.81633395, 0.51780829, 0.3954894 , 0.73282102,
        0.85383831, 0.24264645, 0.29747614, 0.01648424, 0.14058904,
        0.77662967, 0.85203049],
       [0.47455547, 0.55123389, 0.01211627, 0.41937937, 0.3805657 ,
        0.92490766, 0.64522336, 0.16551995, 0.88199035, 0.82814488,
        0.4914034 , 0.37729463],
       [0.45652452, 0.14607903, 0.7785055 , 0.63989596, 0.23531233,
        0.99776393, 0.71315095, 0.35081031, 0.65955451, 0.26087877,
        0.41570753, 0.47123292],
       [0.85603251, 0.73578901, 0.80848295, 0.24206133, 0.13425923,
        0.82133267, 0.65645067, 0.4848226 , 0.85330143, 0.34841877,
        0.68391862, 0.08778288]])

# Heatmap of the random matrix, with the color range chosen by seaborn.
sns.heatmap(data=uniform_data)
plt.show()

# vmin / vmax: range covered by the color bar.
sns.heatmap(data=uniform_data, vmin=0, vmax=1)
plt.show()

heatmap 활용해서 데이터 분석¶

# Load the flights dataset: airline passenger counts per month for each year.
flights = sns.load_dataset("flights")
flights.head()  # preview the first 5 rows (head's default)

# pivot reshapes the long table into a month (rows) x year (columns) grid
# of passenger counts.
# Keyword arguments are required here: pandas deprecated positional
# arguments to DataFrame.pivot and pandas 2.x rejects them.
# NOTE: this rebinds `flights`, so re-running this cell without reloading
# the dataset first will fail.
flights = flights.pivot(index="month", columns="year", values="passengers")
flights.head(5)

# Heatmap of the pivoted passenger table.
plt.figure(figsize=(10, 8))
sns.heatmap(data=flights)
plt.show()

plt.figure(figsize=(10, 8))

# annot=True : annotate each cell with its numeric value
# fmt="d"    : format the annotations as integers
sns.heatmap(data=flights, annot=True, fmt="d")

plt.show()

pairplot¶

데이터에 들어 있는 각 컬럼(열)들의 모든 상관 관계를 출력
3차원 이상의 데이터라면 pairplot 함수를 사용해 분포도를 그림
pairplot은 그리드(grid) 형태로 각 집합의 조합에 대해 히스토그램과 분포도를 그림

## iris 데이터 분석
iris : 머신러닝에서 기본 예제로 자주 사용하는 아이리스(붓꽃) 데이터
꽃잎, 꽃받침의 길이와 너비를 가지고 종(species)을 구분하기 위한 데이터

# Load the data.
# style="ticks": per the original note, this style drops the grid.
sns.set(style="ticks")

iris = sns.load_dataset("iris")
iris.head(n=10)

# pairplot(): grid of plots over the columns of iris.
sns.pairplot(data=iris)
plt.show()

# hue option: group the observations (here by species).
sns.pairplot(data=iris, hue="species")
plt.show()

# Restrict the grid to selected columns to study their relationship.
sns.pairplot(data=iris, vars=["sepal_width", "sepal_length"])
plt.show()

# Separate column lists for the two axes: sepal measurements on x,
# petal measurements on y.
sns.pairplot(
    data=iris,
    x_vars=["sepal_width", "sepal_length"],
    y_vars=["petal_width", "petal_length"],
)
plt.show()

	total_bill	tip	sex	smoker	day	time	size
0	16.99	1.01	Female	No	Sun	Dinner	2
1	10.34	1.66	Male	No	Sun	Dinner	3
2	21.01	3.50	Male	No	Sun	Dinner	3
3	23.68	3.31	Male	No	Sun	Dinner	2
4	24.59	3.61	Female	No	Sun	Dinner	4

	year	month	passengers
0	1949	January	112
1	1949	February	118
2	1949	March	132
3	1949	April	129
4	1949	May	121

year	1949	1950	1951	1952	1953	1954	1955	1956	1957	1958	1959	1960
month
January	112	115	145	171	196	204	242	284	315	340	360	417
February	118	126	150	180	196	188	233	277	301	318	342	391
March	132	141	178	193	236	235	267	317	356	362	406	419
April	129	135	163	181	235	227	269	313	348	348	396	461
May	121	125	172	183	229	234	270	318	355	363	420	472

	sepal_length	sepal_width	petal_length	petal_width	species
0	5.1	3.5	1.4	0.2	setosa
1	4.9	3.0	1.4	0.2	setosa
2	4.7	3.2	1.3	0.2	setosa
3	4.6	3.1	1.5	0.2	setosa
4	5.0	3.6	1.4	0.2	setosa
5	5.4	3.9	1.7	0.4	setosa
6	4.6	3.4	1.4	0.3	setosa
7	5.0	3.4	1.5	0.2	setosa
8	4.4	2.9	1.4	0.2	setosa
9	4.9	3.1	1.5	0.1	setosa