{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# 1.Data" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# import library\n", "import numpy as np \n", "import pandas as pd\n", "import json" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1.1)영화 정보 데이터" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Anaconda\\lib\\site-packages\\IPython\\core\\interactiveshell.py:3058: DtypeWarning: Columns (10) have mixed types.Specify dtype option on import or set low_memory=False.\n", " interactivity=interactivity, compiler=compiler, result=result)\n" ] } ], "source": [ "movie = pd.read_csv('the-movies-dataset/movies_metadata.csv', encoding = \"UTF-8\")" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
adultbelongs_to_collectionbudgetgenreshomepageidimdb_idoriginal_languageoriginal_titleoverview...release_daterevenueruntimespoken_languagesstatustaglinetitlevideovote_averagevote_count
0False{'id': 10194, 'name': 'Toy Story Collection', ...30000000[{'id': 16, 'name': 'Animation'}, {'id': 35, '...http://toystory.disney.com/toy-story862tt0114709enToy StoryLed by Woody, Andy's toys live happily in his ......1995-10-30373554033.081.0[{'iso_639_1': 'en', 'name': 'English'}]ReleasedNaNToy StoryFalse7.75415.0
1FalseNaN65000000[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...NaN8844tt0113497enJumanjiWhen siblings Judy and Peter discover an encha......1995-12-15262797249.0104.0[{'iso_639_1': 'en', 'name': 'English'}, {'iso...ReleasedRoll the dice and unleash the excitement!JumanjiFalse6.92413.0
2False{'id': 119050, 'name': 'Grumpy Old Men Collect...0[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...NaN15602tt0113228enGrumpier Old MenA family wedding reignites the ancient feud be......1995-12-220.0101.0[{'iso_639_1': 'en', 'name': 'English'}]ReleasedStill Yelling. Still Fighting. Still Ready for...Grumpier Old MenFalse6.592.0
3FalseNaN16000000[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...NaN31357tt0114885enWaiting to ExhaleCheated on, mistreated and stepped on, the wom......1995-12-2281452156.0127.0[{'iso_639_1': 'en', 'name': 'English'}]ReleasedFriends are the people who let you be yourself...Waiting to ExhaleFalse6.134.0
4False{'id': 96871, 'name': 'Father of the Bride Col...0[{'id': 35, 'name': 'Comedy'}]NaN11862tt0113041enFather of the Bride Part IIJust when George Banks has recovered from his ......1995-02-1076578911.0106.0[{'iso_639_1': 'en', 'name': 'English'}]ReleasedJust When His World Is Back To Normal... He's ...Father of the Bride Part IIFalse5.7173.0
\n", "

5 rows × 24 columns

\n", "
" ], "text/plain": [ " adult belongs_to_collection budget \\\n", "0 False {'id': 10194, 'name': 'Toy Story Collection', ... 30000000 \n", "1 False NaN 65000000 \n", "2 False {'id': 119050, 'name': 'Grumpy Old Men Collect... 0 \n", "3 False NaN 16000000 \n", "4 False {'id': 96871, 'name': 'Father of the Bride Col... 0 \n", "\n", " genres \\\n", "0 [{'id': 16, 'name': 'Animation'}, {'id': 35, '... \n", "1 [{'id': 12, 'name': 'Adventure'}, {'id': 14, '... \n", "2 [{'id': 10749, 'name': 'Romance'}, {'id': 35, ... \n", "3 [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam... \n", "4 [{'id': 35, 'name': 'Comedy'}] \n", "\n", " homepage id imdb_id original_language \\\n", "0 http://toystory.disney.com/toy-story 862 tt0114709 en \n", "1 NaN 8844 tt0113497 en \n", "2 NaN 15602 tt0113228 en \n", "3 NaN 31357 tt0114885 en \n", "4 NaN 11862 tt0113041 en \n", "\n", " original_title \\\n", "0 Toy Story \n", "1 Jumanji \n", "2 Grumpier Old Men \n", "3 Waiting to Exhale \n", "4 Father of the Bride Part II \n", "\n", " overview ... release_date \\\n", "0 Led by Woody, Andy's toys live happily in his ... ... 1995-10-30 \n", "1 When siblings Judy and Peter discover an encha... ... 1995-12-15 \n", "2 A family wedding reignites the ancient feud be... ... 1995-12-22 \n", "3 Cheated on, mistreated and stepped on, the wom... ... 1995-12-22 \n", "4 Just when George Banks has recovered from his ... ... 1995-02-10 \n", "\n", " revenue runtime spoken_languages \\\n", "0 373554033.0 81.0 [{'iso_639_1': 'en', 'name': 'English'}] \n", "1 262797249.0 104.0 [{'iso_639_1': 'en', 'name': 'English'}, {'iso... \n", "2 0.0 101.0 [{'iso_639_1': 'en', 'name': 'English'}] \n", "3 81452156.0 127.0 [{'iso_639_1': 'en', 'name': 'English'}] \n", "4 76578911.0 106.0 [{'iso_639_1': 'en', 'name': 'English'}] \n", "\n", " status tagline \\\n", "0 Released NaN \n", "1 Released Roll the dice and unleash the excitement! \n", "2 Released Still Yelling. Still Fighting. Still Ready for... 
\n", "3 Released Friends are the people who let you be yourself... \n", "4 Released Just When His World Is Back To Normal... He's ... \n", "\n", " title video vote_average vote_count \n", "0 Toy Story False 7.7 5415.0 \n", "1 Jumanji False 6.9 2413.0 \n", "2 Grumpier Old Men False 6.5 92.0 \n", "3 Waiting to Exhale False 6.1 34.0 \n", "4 Father of the Bride Part II False 5.7 173.0 \n", "\n", "[5 rows x 24 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movie.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1.2)유저 평점 데이터" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "ratings = pd.read_csv('the-movies-dataset/ratings_small.csv', encoding = \"UTF-8\")" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userIdmovieIdratingtimestamp
01312.51260759144
1110293.01260759179
2110613.01260759182
3111292.01260759185
4111724.01260759205
\n", "
" ], "text/plain": [ " userId movieId rating timestamp\n", "0 1 31 2.5 1260759144\n", "1 1 1029 3.0 1260759179\n", "2 1 1061 3.0 1260759182\n", "3 1 1129 2.0 1260759185\n", "4 1 1172 4.0 1260759205" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ratings.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 2.EDA(탐색적 데이터 분석)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2.1)movie" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2.1.1)변수 선택" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 45466 entries, 0 to 45465\n", "Data columns (total 24 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 adult 45466 non-null object \n", " 1 belongs_to_collection 4494 non-null object \n", " 2 budget 45466 non-null object \n", " 3 genres 45466 non-null object \n", " 4 homepage 7782 non-null object \n", " 5 id 45466 non-null object \n", " 6 imdb_id 45449 non-null object \n", " 7 original_language 45455 non-null object \n", " 8 original_title 45466 non-null object \n", " 9 overview 44512 non-null object \n", " 10 popularity 45461 non-null object \n", " 11 poster_path 45080 non-null object \n", " 12 production_companies 45463 non-null object \n", " 13 production_countries 45463 non-null object \n", " 14 release_date 45379 non-null object \n", " 15 revenue 45460 non-null float64\n", " 16 runtime 45203 non-null float64\n", " 17 spoken_languages 45460 non-null object \n", " 18 status 45379 non-null object \n", " 19 tagline 20412 non-null object \n", " 20 title 45460 non-null object \n", " 21 video 45460 non-null object \n", " 22 vote_average 45460 non-null float64\n", " 23 vote_count 45460 non-null float64\n", "dtypes: float64(4), object(20)\n", "memory usage: 8.3+ MB\n" ] } ], "source": [ "movie.info()" ] }, { "cell_type": "code", 
"execution_count": 7, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Int64Index: 32269 entries, 0 to 45465\n", "Data columns (total 5 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 movie_id 32269 non-null object\n", " 1 imdb_id 32256 non-null object\n", " 2 original_title 32269 non-null object\n", " 3 original_language 32269 non-null object\n", " 4 genres 32269 non-null object\n", "dtypes: object(5)\n", "memory usage: 1.5+ MB\n" ] } ], "source": [ "# 사용할 변수 선택\n", "movie = movie[['id', 'imdb_id', 'original_title', 'original_language', 'genres']]\n", "movie = movie.rename(columns={'id':'movie_id'})\n", "movie = movie[movie['original_language'] == 'en']\n", "movie.info()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2.1.2)데이터 사전[movie]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "| 변수 | 설명 | \n", "|----------|:-------------:|\n", "| movie_id | 영화 id |\n", "| imdb_id | imdb에 등록된 영화 id |\n", "| original_title | 영화 이름 |\n", "| original_language | 영화 언어 |\n", "| genres | 영화 장르 |" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2.1.3)탐색" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Int64Index: 32256 entries, 0 to 45465\n", "Data columns (total 5 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 movie_id 32256 non-null object\n", " 1 imdb_id 32256 non-null object\n", " 2 original_title 32256 non-null object\n", " 3 original_language 32256 non-null object\n", " 4 genres 32256 non-null object\n", "dtypes: object(5)\n", "memory usage: 1.5+ MB\n" ] } ], "source": [ "# null제거 -> 평균으로 처리할 수 없는 데이터이므로\n", "movie = movie.dropna(axis=0)\n", "movie.info()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movie_idimdb_idoriginal_titleoriginal_languagegenres
count3636363636
unique181818117
top152795tt0454792Confessions of a Dangerous Minden[{'id': 99, 'name': 'Documentary'}]
freq222364
\n", "
" ], "text/plain": [ " movie_id imdb_id original_title original_language \\\n", "count 36 36 36 36 \n", "unique 18 18 18 1 \n", "top 152795 tt0454792 Confessions of a Dangerous Mind en \n", "freq 2 2 2 36 \n", "\n", " genres \n", "count 36 \n", "unique 17 \n", "top [{'id': 99, 'name': 'Documentary'}] \n", "freq 4 " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 중복 movie_id 확인\n", "dup_movie = movie[movie.duplicated(subset=['movie_id'], keep=False)]\n", "dup_movie.describe()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movie_idimdb_idoriginal_titleoriginal_languagegenres
count3636363636
unique181818117
top152795tt0454792Confessions of a Dangerous Minden[{'id': 99, 'name': 'Documentary'}]
freq222364
\n", "
" ], "text/plain": [ " movie_id imdb_id original_title original_language \\\n", "count 36 36 36 36 \n", "unique 18 18 18 1 \n", "top 152795 tt0454792 Confessions of a Dangerous Mind en \n", "freq 2 2 2 36 \n", "\n", " genres \n", "count 36 \n", "unique 17 \n", "top [{'id': 99, 'name': 'Documentary'}] \n", "freq 4 " ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#중복 imdb_id 확인\n", "dup_movie2 = movie[movie.duplicated(subset=['imdb_id'], keep=False)]\n", "dup_movie2.describe()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movie_idimdb_idoriginal_titleoriginal_languagegenres
count31943194319431943194
unique3176317614201861
top119916tt0127834Hamleten[{'id': 18, 'name': 'Drama'}]
freq2283194303
\n", "
" ], "text/plain": [ " movie_id imdb_id original_title original_language \\\n", "count 3194 3194 3194 3194 \n", "unique 3176 3176 1420 1 \n", "top 119916 tt0127834 Hamlet en \n", "freq 2 2 8 3194 \n", "\n", " genres \n", "count 3194 \n", "unique 861 \n", "top [{'id': 18, 'name': 'Drama'}] \n", "freq 303 " ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#중복 title 확인\n", "dup_movie3 = movie[movie.duplicated(subset=['original_title'], keep=False)]\n", "dup_movie3.describe()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "#중복 id, title제거\n", "column_list = ['movie_id','imdb_id','original_title']\n", "for i in column_list:\n", " movie = movie.drop_duplicates(subset=i, keep=\"first\")" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Int64Index: 30482 entries, 0 to 45465\n", "Data columns (total 5 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 movie_id 30482 non-null object\n", " 1 imdb_id 30482 non-null object\n", " 2 original_title 30482 non-null object\n", " 3 original_language 30482 non-null object\n", " 4 genres 30482 non-null object\n", "dtypes: object(5)\n", "memory usage: 1.4+ MB\n" ] } ], "source": [ "movie.info()\n", "# 30,482 종류의 영화" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movie_idimdb_idoriginal_titleoriginal_languagegenres
count00000
unique00000
topNaNNaNNaNNaNNaN
freqNaNNaNNaNNaNNaN
\n", "
" ], "text/plain": [ " movie_id imdb_id original_title original_language genres\n", "count 0 0 0 0 0\n", "unique 0 0 0 0 0\n", "top NaN NaN NaN NaN NaN\n", "freq NaN NaN NaN NaN NaN" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#중복 movie_id 재확인\n", "dup_movie = movie[movie.duplicated(subset=['movie_id'], keep=False)]\n", "dup_movie.describe()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movie_idimdb_idoriginal_titleoriginal_languagegenres
count00000
unique00000
topNaNNaNNaNNaNNaN
freqNaNNaNNaNNaNNaN
\n", "
" ], "text/plain": [ " movie_id imdb_id original_title original_language genres\n", "count 0 0 0 0 0\n", "unique 0 0 0 0 0\n", "top NaN NaN NaN NaN NaN\n", "freq NaN NaN NaN NaN NaN" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#중복 imdb_id 재확인\n", "dup_movie2 = movie[movie.duplicated(subset=['imdb_id'], keep=False)]\n", "dup_movie2.describe()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movie_idimdb_idoriginal_titleoriginal_languagegenres
count00000
unique00000
topNaNNaNNaNNaNNaN
freqNaNNaNNaNNaNNaN
\n", "
" ], "text/plain": [ " movie_id imdb_id original_title original_language genres\n", "count 0 0 0 0 0\n", "unique 0 0 0 0 0\n", "top NaN NaN NaN NaN NaN\n", "freq NaN NaN NaN NaN NaN" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#중복 title 확인\n", "dup_movie3 = movie[movie.duplicated(subset=['original_title'], keep=False)]\n", "dup_movie3.describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2.2)ratings" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2.2.1)변수 선택" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 100004 entries, 0 to 100003\n", "Data columns (total 4 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 userId 100004 non-null int64 \n", " 1 movieId 100004 non-null int64 \n", " 2 rating 100004 non-null float64\n", " 3 timestamp 100004 non-null int64 \n", "dtypes: float64(1), int64(3)\n", "memory usage: 3.1 MB\n" ] } ], "source": [ "ratings.info()" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 100004 entries, 0 to 100003\n", "Data columns (total 3 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 userId 100004 non-null int64 \n", " 1 movie_id 100004 non-null int64 \n", " 2 rating 100004 non-null float64\n", "dtypes: float64(1), int64(2)\n", "memory usage: 2.3 MB\n" ] } ], "source": [ "# 사용할 변수 선택\n", "ratings = ratings[['userId', 'movieId', 'rating']]\n", "ratings = ratings.rename(columns={'movieId':'movie_id'})\n", "ratings.info()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userIdmovie_idrating
01312.5
1110293.0
2110613.0
3111292.0
4111724.0
\n", "
" ], "text/plain": [ " userId movie_id rating\n", "0 1 31 2.5\n", "1 1 1029 3.0\n", "2 1 1061 3.0\n", "3 1 1129 2.0\n", "4 1 1172 4.0" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ratings.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2.2.2)데이터 사전[ratings]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "| 변수 | 설명 | \n", "|----------|:-------------:|\n", "| userId | 유저 id |\n", "| movie_id | 영화 id |\n", "| rating | 영화 평점 |" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2.2.3)탐색" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userIdmovie_idrating
count0.00.00.0
meanNaNNaNNaN
stdNaNNaNNaN
minNaNNaNNaN
25%NaNNaNNaN
50%NaNNaNNaN
75%NaNNaNNaN
maxNaNNaNNaN
\n", "
" ], "text/plain": [ " userId movie_id rating\n", "count 0.0 0.0 0.0\n", "mean NaN NaN NaN\n", "std NaN NaN NaN\n", "min NaN NaN NaN\n", "25% NaN NaN NaN\n", "50% NaN NaN NaN\n", "75% NaN NaN NaN\n", "max NaN NaN NaN" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 한 유저가 같은 영화를 여러번 평가했는지 중복 확인\n", "dup_rating = ratings[ratings.duplicated(subset=['userId','movie_id'], keep=False)]\n", "dup_rating.describe()" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 671.000000\n", "mean 149.037258\n", "std 231.226948\n", "min 20.000000\n", "25% 37.000000\n", "50% 71.000000\n", "75% 161.000000\n", "max 2391.000000\n", "Name: movie_id, dtype: float64" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 몇 명의 유저가 몇개의 영화를 평가했는지 확인\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "user_count = ratings.groupby('userId').count()\n", "user_count = user_count['movie_id']\n", "user_count.describe()\n", "\n", "# <1> 671명의 유저가 총 100,004건의 평점을 남김(영화 10만 편이 아니라 평점 10만 건) -> 매니악층의 유저 평가를 모아놓은 데이터셋으로 추정\n", "\n", "# <2> 평균 : 149, 표준편차 : 231 -> 1인당 평균 149개의 영화 평가" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'\\n<참고문헌>\\n- https://pandas.pydata.org/pandas-docs/version/0.13/visualization.html\\n- http://jonathansoma.com/lede/data-studio/matplotlib/changing-the-background-of-a-pandas-matplotlib-graph/\\n'" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 671명의 시청 수 그래프\n", "plt.subplots(facecolor='white')\n", "plt.plot(user_count)\n", "plt.title(\"Number of movies watched by user\")\n", "plt.xlabel(\"User ID\")\n", "plt.ylabel(\"Watched Movie\")\n", "plt.show()\n", "\n", "'''\n", "<참고문헌>\n", "- https://pandas.pydata.org/pandas-docs/version/0.13/visualization.html\n", "- http://jonathansoma.com/lede/data-studio/matplotlib/changing-the-background-of-a-pandas-matplotlib-graph/\n", "'''" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'\\n<참고문헌>\\n- https://pandas.pydata.org/pandas-docs/version/0.13/visualization.html\\n- http://jonathansoma.com/lede/data-studio/matplotlib/changing-the-background-of-a-pandas-matplotlib-graph/\\n'" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Histogram of how many movies each user rated (user_count is indexed by userId)\n", "sns.set_style('white')\n", "# kde=False with norm_hist left at its default makes distplot draw raw bin counts,\n", "# so the y-axis is a number of users, not a density.\n", "hist = sns.distplot(user_count,hist=True,kde=False,bins=10,color='royalblue',\n", "                    hist_kws={'edgecolor': 'gray'},kde_kws={'linewidth': 3})\n", "hist.set_title('Histogram of UserMovieCount')\n", "hist.set_xlabel('Watched Movie')\n", "hist.set_ylabel('Number of users')\n", "plt.show()\n", "'''\n", "<참고문헌>\n", "- https://pandas.pydata.org/pandas-docs/version/0.13/visualization.html\n", "- http://jonathansoma.com/lede/data-studio/matplotlib/changing-the-background-of-a-pandas-matplotlib-graph/\n", "'''" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "userId\n", "1 20\n", "2 76\n", "3 51\n", "4 204\n", "5 100\n", "Name: movie_id, dtype: int64" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "user_count.head()" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "userId\n", "15 1700\n", "30 1011\n", "73 1610\n", "311 1019\n", "380 1063\n", "452 1340\n", "468 1291\n", "547 2391\n", "564 1868\n", "624 1735\n", "Name: movie_id, dtype: int64" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 시청 수가 1000개가 넘는 유저 탐색 \n", "maniac = user_count[user_count>=1000]\n", "maniac # 따로 이상치라고 판단x" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 3.Feature Engineering" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3.1)movie['movie_id']&ratings['movie_id']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 3.1.1) 데이터 타입 변경" ] }, { 
"cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "# movieId : object to int64\n", "movie.movie_id = pd.to_numeric(movie.movie_id, errors='coerce') #ValueError무시\n", "ratings.movie_id = pd.to_numeric(ratings.movie_id, errors='coerce')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3.2)movie['genres']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 3.2.1)json decoding" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 [{'id': 16, 'name': 'Animation'}, {'id': 35, '...\n", "1 [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...\n", "2 [{'id': 10749, 'name': 'Romance'}, {'id': 35, ...\n", "3 [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...\n", "4 [{'id': 35, 'name': 'Comedy'}]\n", "Name: genres, dtype: object\n" ] }, { "data": { "text/plain": [ "\"\\nmovie의 genre 객체들은 str인 상태. 각 객체를 str->dict형태로 변환 후, name value만 list에 담아야함\\nex) [{'id': 27, 'name': 'Horror'}, ..] -> [[Horror], ..]\\n\"" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(movie['genres'].head())\n", "'''\n", "movie의 genre 객체들은 str인 상태. 각 객체를 str->dict형태로 변환 후, name value만 list에 담아야함\n", "ex) [{'id': 27, 'name': 'Horror'}, ..] 
-> [[Horror], ..]\n", "'''" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'\\n- http://pythonstudy.xyz/python/article/205-JSON-%EB%8D%B0%EC%9D%B4%ED%83%80\\n- http://tcpschool.com/json/json_datatype_string\\n- https://galid1.tistory.com/405\\n'" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Decode each raw `genres` string into a list of dicts and keep only the value of every 'name' key.\n", "import ast  # imported in this cell so the cell runs without changes to the top import cell\n", "\n", "def parse_genres(each_genre):\n", "    '''Return the genre names found in one raw `genres` value.\n", "\n", "    Parameters\n", "    ----------\n", "    each_genre : str\n", "        Text of a list of dicts, e.g. [{'id': 35, 'name': 'Comedy'}].\n", "        Python-literal text (single quotes) and JSON text are both accepted.\n", "\n", "    Returns\n", "    -------\n", "    list\n", "        The 'name' value of every entry, in input order ([] for an empty list).\n", "\n", "    Raises\n", "    ------\n", "    ValueError\n", "        If the text is neither a Python literal nor valid JSON\n", "        (json.JSONDecodeError is a ValueError subclass).\n", "    '''\n", "    try:\n", "        # The column text is single-quoted Python-literal style, not JSON.\n", "        # Swapping every ' for a double quote and calling json.loads breaks as soon\n", "        # as a value contains an apostrophe, so read the literal directly.\n", "        genres = ast.literal_eval(each_genre)\n", "    except (ValueError, SyntaxError):\n", "        # Real JSON (null/true/false) is not a Python literal; keep accepting it.\n", "        genres = json.loads(each_genre)\n", "    return [g['name'] for g in genres]\n", "\n", "movie['genres'] = movie['genres'].apply(parse_genres)  # raw string -> list of genre names, row by row\n", "\n", "'''\n", "- http://pythonstudy.xyz/python/article/205-JSON-%EB%8D%B0%EC%9D%B4%ED%83%80\n", "- http://tcpschool.com/json/json_datatype_string\n", "- https://galid1.tistory.com/405\n", "'''" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 [Animation, Comedy, Family]\n", "1 [Adventure, Fantasy, Family]\n", "2 [Romance, Comedy]\n", "3 [Comedy, Drama, Romance]\n", "4 [Comedy]\n", " ... \n", "45457 [Mystery, Horror]\n", "45458 [Horror]\n", "45459 [Science Fiction]\n", "45464 []\n", "45465 []\n", "Name: genres, Length: 30482, dtype: object\n" ] } ], "source": [ "print(movie['genres'])" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movie_idimdb_idoriginal_titleoriginal_languagegenres
0862tt0114709Toy Storyen[Animation, Comedy, Family]
18844tt0113497Jumanjien[Adventure, Fantasy, Family]
215602tt0113228Grumpier Old Menen[Romance, Comedy]
331357tt0114885Waiting to Exhaleen[Comedy, Drama, Romance]
411862tt0113041Father of the Bride Part IIen[Comedy]
\n", "
" ], "text/plain": [ " movie_id imdb_id original_title original_language \\\n", "0 862 tt0114709 Toy Story en \n", "1 8844 tt0113497 Jumanji en \n", "2 15602 tt0113228 Grumpier Old Men en \n", "3 31357 tt0114885 Waiting to Exhale en \n", "4 11862 tt0113041 Father of the Bride Part II en \n", "\n", " genres \n", "0 [Animation, Comedy, Family] \n", "1 [Adventure, Fantasy, Family] \n", "2 [Romance, Comedy] \n", "3 [Comedy, Drama, Romance] \n", "4 [Comedy] " ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movie.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3.3)movie&ratings" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 3.3.1)병합" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Int64Index: 30482 entries, 0 to 45465\n", "Data columns (total 5 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 movie_id 30482 non-null int64 \n", " 1 imdb_id 30482 non-null object\n", " 2 original_title 30482 non-null object\n", " 3 original_language 30482 non-null object\n", " 4 genres 30482 non-null object\n", "dtypes: int64(1), object(4)\n", "memory usage: 1.4+ MB\n" ] } ], "source": [ "movie.info()" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 100004 entries, 0 to 100003\n", "Data columns (total 3 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 userId 100004 non-null int64 \n", " 1 movie_id 100004 non-null int64 \n", " 2 rating 100004 non-null float64\n", "dtypes: float64(1), int64(2)\n", "memory usage: 2.3 MB\n" ] } ], "source": [ "ratings.info()" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movie_idimdb_idoriginal_titleoriginal_languagegenres
0862tt0114709Toy Storyen[Animation, Comedy, Family]
18844tt0113497Jumanjien[Adventure, Fantasy, Family]
215602tt0113228Grumpier Old Menen[Romance, Comedy]
331357tt0114885Waiting to Exhaleen[Comedy, Drama, Romance]
411862tt0113041Father of the Bride Part IIen[Comedy]
\n", "
" ], "text/plain": [ " movie_id imdb_id original_title original_language \\\n", "0 862 tt0114709 Toy Story en \n", "1 8844 tt0113497 Jumanji en \n", "2 15602 tt0113228 Grumpier Old Men en \n", "3 31357 tt0114885 Waiting to Exhale en \n", "4 11862 tt0113041 Father of the Bride Part II en \n", "\n", " genres \n", "0 [Animation, Comedy, Family] \n", "1 [Adventure, Fantasy, Family] \n", "2 [Romance, Comedy] \n", "3 [Comedy, Drama, Romance] \n", "4 [Comedy] " ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movie.head()" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userIdmovie_idrating
01312.5
1110293.0
2110613.0
3111292.0
4111724.0
\n", "
" ], "text/plain": [ " userId movie_id rating\n", "0 1 31 2.5\n", "1 1 1029 3.0\n", "2 1 1061 3.0\n", "3 1 1129 2.0\n", "4 1 1172 4.0" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ratings.head()" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userIdmovie_idratingimdb_idoriginal_titleoriginal_languagegenres
0113712.5tt0084602Rocky IIIen[Drama]
1413714.0tt0084602Rocky IIIen[Drama]
2713713.0tt0084602Rocky IIIen[Drama]
31913714.0tt0084602Rocky IIIen[Drama]
42113713.0tt0084602Rocky IIIen[Drama]
\n", "
" ], "text/plain": [ " userId movie_id rating imdb_id original_title original_language \\\n", "0 1 1371 2.5 tt0084602 Rocky III en \n", "1 4 1371 4.0 tt0084602 Rocky III en \n", "2 7 1371 3.0 tt0084602 Rocky III en \n", "3 19 1371 4.0 tt0084602 Rocky III en \n", "4 21 1371 3.0 tt0084602 Rocky III en \n", "\n", " genres \n", "0 [Drama] \n", "1 [Drama] \n", "2 [Drama] \n", "3 [Drama] \n", "4 [Drama] " ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#movie + ratings inner join : movie_id기준으로 두 테이블 조인\n", "data = pd.merge(ratings, movie, on='movie_id', how='inner')\n", "data.head()" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Int64Index: 31854 entries, 0 to 31853\n", "Data columns (total 7 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 userId 31854 non-null int64 \n", " 1 movie_id 31854 non-null int64 \n", " 2 rating 31854 non-null float64\n", " 3 imdb_id 31854 non-null object \n", " 4 original_title 31854 non-null object \n", " 5 original_language 31854 non-null object \n", " 6 genres 31854 non-null object \n", "dtypes: float64(1), int64(2), object(4)\n", "memory usage: 1.9+ MB\n" ] }, { "data": { "text/plain": [ "'\\nratings = 10004개\\nmovie = 30482개\\njoin = 31854개?\\n'" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.info()\n", "'''\n", "ratings = 10004개\n", "movie = 30482개\n", "join = 31854개?\n", "'''" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Terminator 3: Rise of the Machines 324\n", "The Million Dollar Hotel 311\n", "The 39 Steps 291\n", "Once Were Warriors 244\n", "Men in Black II 224\n", " ... 
\n", "Repeaters 1\n", "Reservation Road 1\n", "Blood River 1\n", "Beowulf & Grendel 1\n", "Fire Birds 1\n", "Name: original_title, Length: 1973, dtype: int64" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data['original_title'].value_counts()\n", "#조인 테이블에 영화종류는 1973개" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "296 324\n", "318 311\n", "260 291\n", "527 244\n", "608 224\n", " ... \n", "90603 1\n", "8675 1\n", "2486 1\n", "2438 1\n", "2049 1\n", "Name: movie_id, Length: 1973, dtype: int64" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data['movie_id'].value_counts()\n", "#영화 테이블에 영화종류는 30482개" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "조인 테이블 -> 671명의 유저들이 30482개 중 1973개 영화만 평가한 31854rows 테이블" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 3.1.2)피벗 테이블" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Int64Index: 671 entries, 1 to 671\n", "Columns: 1973 entries, !Women Art Revolution to Мой сводный брат Франкенштейн\n", "dtypes: float64(1973)\n", "memory usage: 10.1 MB\n" ] } ], "source": [ "matrix = data.pivot_table(index='userId', columns='original_title', values='rating')\n", "matrix.info() # 671명의 유저 row" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
original_title!Women Art Revolution'Gator Bait'Twas the Night Before Christmas10 Items or Less10 Things I Hate About You10,000 BC11'09''01 - September 1112 + 112 Angry Men1408...Young and InnocentZaatZabriskie PointZapped Again!ZardozZodiaceXistenZxXx¡Three Amigos!Мой сводный брат Франкенштейн
userId
1NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
2NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
3NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...3.5NaNNaNNaNNaNNaNNaNNaNNaNNaN
4NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
5NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...3.5NaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", "

5 rows × 1973 columns

\n", "
" ], "text/plain": [ "original_title !Women Art Revolution 'Gator Bait \\\n", "userId \n", "1 NaN NaN \n", "2 NaN NaN \n", "3 NaN NaN \n", "4 NaN NaN \n", "5 NaN NaN \n", "\n", "original_title 'Twas the Night Before Christmas 10 Items or Less \\\n", "userId \n", "1 NaN NaN \n", "2 NaN NaN \n", "3 NaN NaN \n", "4 NaN NaN \n", "5 NaN NaN \n", "\n", "original_title 10 Things I Hate About You 10,000 BC \\\n", "userId \n", "1 NaN NaN \n", "2 NaN NaN \n", "3 NaN NaN \n", "4 NaN NaN \n", "5 NaN NaN \n", "\n", "original_title 11'09''01 - September 11 12 + 1 12 Angry Men 1408 ... \\\n", "userId ... \n", "1 NaN NaN NaN NaN ... \n", "2 NaN NaN NaN NaN ... \n", "3 NaN NaN NaN NaN ... \n", "4 NaN NaN NaN NaN ... \n", "5 NaN NaN NaN NaN ... \n", "\n", "original_title Young and Innocent Zaat Zabriskie Point Zapped Again! \\\n", "userId \n", "1 NaN NaN NaN NaN \n", "2 NaN NaN NaN NaN \n", "3 3.5 NaN NaN NaN \n", "4 NaN NaN NaN NaN \n", "5 3.5 NaN NaN NaN \n", "\n", "original_title Zardoz Zodiac eXistenZ xXx ¡Three Amigos! 
\\\n", "userId \n", "1 NaN NaN NaN NaN NaN \n", "2 NaN NaN NaN NaN NaN \n", "3 NaN NaN NaN NaN NaN \n", "4 NaN NaN NaN NaN NaN \n", "5 NaN NaN NaN NaN NaN \n", "\n", "original_title Мой сводный брат Франкенштейн \n", "userId \n", "1 NaN \n", "2 NaN \n", "3 NaN \n", "4 NaN \n", "5 NaN \n", "\n", "[5 rows x 1973 columns]" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "matrix.head()\n", "# row : 671명의 유저\n", "# column : 1973개의 영화" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 4.Recommendation[1st]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 4.1)Pearson correlation 함수" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "image/jpeg": "\n", "text/plain": [ "" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from IPython.display import Image\n", "Image(\"image/pearson.jpg\")" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [], "source": [
 "# Similarity score = Pearson correlation of user ratings + a bonus per shared genre\n",
 "GENRE_WEIGHT = 0.1\n",
 "\n",
 "def pearsonR(s1, s2):\n",
 "    \"\"\"Pearson-style correlation between two rating Series.\n",
 "\n",
 "    Each Series is centred on its own mean. With pandas Series inputs the sums\n",
 "    skip NaN (unrated) entries, so the numerator only covers users who rated\n",
 "    both movies. The result is NaN when the denominator is zero (0/0).\n",
 "    \"\"\"\n",
 "    s1_c = s1 - s1.mean()\n",
 "    s2_c = s2 - s2.mean()\n",
 "    return np.sum(s1_c * s2_c) / np.sqrt(np.sum(s1_c ** 2) * np.sum(s2_c ** 2))\n",
 "\n",
 "def recommend(input_movie, matrix, n, similar_genre=True):\n",
 "    \"\"\"Return the n movies whose ratings correlate best with input_movie.\n",
 "\n",
 "    input_movie   -- title present in matrix.columns and in the global movie table\n",
 "    matrix        -- rating pivot table (rows: users, columns: titles)\n",
 "    n             -- number of recommendations to return\n",
 "    similar_genre -- when True, add GENRE_WEIGHT for every genre shared with input_movie\n",
 "\n",
 "    Returns a list of (title, score formatted as '0.00', genres) tuples, best first.\n",
 "    Titles whose score is NaN are skipped.\n",
 "    \"\"\"\n",
 "    # A later cell renames 'original_title' to 'title'; accept both so re-runs still work.\n",
 "    title_col = 'original_title' if 'original_title' in movie.columns else 'title'\n",
 "    # Build the title -> genres lookup once (first match wins, as before)\n",
 "    # instead of scanning the whole movie table for every candidate title.\n",
 "    genres_by_title = movie.drop_duplicates(subset=title_col).set_index(title_col)['genres'].to_dict()\n",
 "    input_genres = genres_by_title[input_movie]\n",
 "    use_genre = similar_genre and len(input_genres) > 0\n",
 "\n",
 "    scored = []\n",
 "    for title in matrix.columns:\n",
 "        if title == input_movie:\n",
 "            continue\n",
 "\n",
 "        # rating comparison\n",
 "        cor = pearsonR(matrix[input_movie], matrix[title])\n",
 "\n",
 "        # Always look the genres up: they are part of the returned tuple even\n",
 "        # when the genre bonus is disabled.\n",
 "        temp_genres = genres_by_title[title]\n",
 "\n",
 "        # genre comparison\n",
 "        if use_genre:\n",
 "            same_count = np.sum(np.isin(input_genres, temp_genres))\n",
 "            cor += (GENRE_WEIGHT * same_count)\n",
 "\n",
 "        if np.isnan(cor):\n",
 "            continue\n",
 "        scored.append((cor, title, temp_genres))\n",
 "\n",
 "    # Sort on the numeric score; sorting the formatted strings is lexicographic\n",
 "    # and misorders negative correlations.\n",
 "    scored.sort(key=lambda r: r[0], reverse=True)\n",
 "\n",
 "    return [(title, '{:.2f}'.format(cor), genres) for cor, title, genres in scored[:n]]\n",
 "#https://namu.wiki/w/%EC%83%81%EA%B4%80%20%EA%B3%84%EC%88%98?from=%ED%94%BC%EC%96%B4%EC%8A%A8%20%EC%83%81%EA%B4%80%20%EA%B3%84%EC%88%98#s-2\n",
 "#https://wikidocs.net/46459" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 4.2)첫 번째 추천" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 4.2.1)list" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Anaconda\\lib\\site-packages\\ipykernel_launcher.py:7: RuntimeWarning: invalid value encountered in double_scalars\n", " import sys\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlecorrelationgenres
0Forbidden Planet0.53[Adventure, Science Fiction, Action]
1Wild Wild West0.48[Action, Adventure, Comedy, Science Fiction, W...
2Armageddon0.47[Action, Thriller, Science Fiction, Adventure]
3Lara Croft: Tomb Raider0.47[Adventure, Fantasy, Action, Thriller]
4Aliens vs Predator: Requiem0.46[Fantasy, Action, Science Fiction, Thriller, H...
5The Lord of the Rings: The Two Towers0.46[Adventure, Fantasy, Action]
6Pitch Black0.43[Thriller, Science Fiction, Action]
7Return of the Jedi0.43[Adventure, Action, Science Fiction]
8Star Trek VI: The Undiscovered Country0.43[Science Fiction, Action, Adventure, Thriller]
9Indiana Jones and the Temple of Doom0.40[Adventure, Action]
\n", "
" ], "text/plain": [ " title correlation \\\n", "0 Forbidden Planet 0.53 \n", "1 Wild Wild West 0.48 \n", "2 Armageddon 0.47 \n", "3 Lara Croft: Tomb Raider 0.47 \n", "4 Aliens vs Predator: Requiem 0.46 \n", "5 The Lord of the Rings: The Two Towers 0.46 \n", "6 Pitch Black 0.43 \n", "7 Return of the Jedi 0.43 \n", "8 Star Trek VI: The Undiscovered Country 0.43 \n", "9 Indiana Jones and the Temple of Doom 0.40 \n", "\n", " genres \n", "0 [Adventure, Science Fiction, Action] \n", "1 [Action, Adventure, Comedy, Science Fiction, W... \n", "2 [Action, Thriller, Science Fiction, Adventure] \n", "3 [Adventure, Fantasy, Action, Thriller] \n", "4 [Fantasy, Action, Science Fiction, Thriller, H... \n", "5 [Adventure, Fantasy, Action] \n", "6 [Thriller, Science Fiction, Action] \n", "7 [Adventure, Action, Science Fiction] \n", "8 [Science Fiction, Action, Adventure, Thriller] \n", "9 [Adventure, Action] " ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "recommend_result = recommend('Star Wars', matrix, 10, similar_genre=True) #영화 매개변수\n", "recommend_list = pd.DataFrame(recommend_result, columns = ['title', 'correlation', 'genres'])\n", "recommend_list #correlation이 비슷한 상황이 잦아서 분별성이 떨어짐" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 4.2.2)narrowing\n", " - imdb_id와 조인해서 추천 영화들의 실제 평균 평점 탐색" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 Forbidden Planet\n", "1 Wild Wild West\n", "2 Armageddon\n", "3 Lara Croft: Tomb Raider\n", "4 Aliens vs Predator: Requiem\n", "5 The Lord of the Rings: The Two Towers\n", "6 Pitch Black\n", "7 Return of the Jedi\n", "8 Star Trek VI: The Undiscovered Country\n", "9 Indiana Jones and the Temple of Doom\n", "Name: title, dtype: object" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sim_movie = recommend_list['title']\n", "sim_movie\n", "#movie랑 조인해서 imdbID 수집" ] }, { 
"cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Int64Index: 30482 entries, 0 to 45465\n", "Data columns (total 5 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 movie_id 30482 non-null int64 \n", " 1 imdb_id 30482 non-null object\n", " 2 title 30482 non-null object\n", " 3 original_language 30482 non-null object\n", " 4 genres 30482 non-null object\n", "dtypes: int64(1), object(4)\n", "memory usage: 1.4+ MB\n" ] } ], "source": [ "#title 이름 변경\n", "movie = movie.rename(columns={'original_title':'title'})\n", "movie.info()" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
title
0Forbidden Planet
1Wild Wild West
2Armageddon
3Lara Croft: Tomb Raider
4Aliens vs Predator: Requiem
5The Lord of the Rings: The Two Towers
6Pitch Black
7Return of the Jedi
8Star Trek VI: The Undiscovered Country
9Indiana Jones and the Temple of Doom
\n", "
" ], "text/plain": [ " title\n", "0 Forbidden Planet\n", "1 Wild Wild West\n", "2 Armageddon\n", "3 Lara Croft: Tomb Raider\n", "4 Aliens vs Predator: Requiem\n", "5 The Lord of the Rings: The Two Towers\n", "6 Pitch Black\n", "7 Return of the Jedi\n", "8 Star Trek VI: The Undiscovered Country\n", "9 Indiana Jones and the Temple of Doom" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "first_rc = pd.DataFrame(sim_movie)\n", "first_rc" ] }, { "cell_type": "code", "execution_count": 47, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlemovie_idimdb_idoriginal_languagegenres
0Forbidden Planet830tt0049223en[Adventure, Science Fiction, Action]
1Wild Wild West8487tt0120891en[Action, Adventure, Comedy, Science Fiction, W...
2Armageddon95tt0120591en[Action, Thriller, Science Fiction, Adventure]
3Lara Croft: Tomb Raider1995tt0146316en[Adventure, Fantasy, Action, Thriller]
4Aliens vs Predator: Requiem440tt0758730en[Fantasy, Action, Science Fiction, Thriller, H...
5The Lord of the Rings: The Two Towers121tt0167261en[Adventure, Fantasy, Action]
6Pitch Black2787tt0134847en[Thriller, Science Fiction, Action]
7Return of the Jedi1892tt0086190en[Adventure, Action, Science Fiction]
8Star Trek VI: The Undiscovered Country174tt0102975en[Science Fiction, Action, Adventure, Thriller]
9Indiana Jones and the Temple of Doom87tt0087469en[Adventure, Action]
\n", "
" ], "text/plain": [ " title movie_id imdb_id \\\n", "0 Forbidden Planet 830 tt0049223 \n", "1 Wild Wild West 8487 tt0120891 \n", "2 Armageddon 95 tt0120591 \n", "3 Lara Croft: Tomb Raider 1995 tt0146316 \n", "4 Aliens vs Predator: Requiem 440 tt0758730 \n", "5 The Lord of the Rings: The Two Towers 121 tt0167261 \n", "6 Pitch Black 2787 tt0134847 \n", "7 Return of the Jedi 1892 tt0086190 \n", "8 Star Trek VI: The Undiscovered Country 174 tt0102975 \n", "9 Indiana Jones and the Temple of Doom 87 tt0087469 \n", "\n", " original_language genres \n", "0 en [Adventure, Science Fiction, Action] \n", "1 en [Action, Adventure, Comedy, Science Fiction, W... \n", "2 en [Action, Thriller, Science Fiction, Adventure] \n", "3 en [Adventure, Fantasy, Action, Thriller] \n", "4 en [Fantasy, Action, Science Fiction, Thriller, H... \n", "5 en [Adventure, Fantasy, Action] \n", "6 en [Thriller, Science Fiction, Action] \n", "7 en [Adventure, Action, Science Fiction] \n", "8 en [Science Fiction, Action, Adventure, Thriller] \n", "9 en [Adventure, Action] " ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#movie, first_rc 조인 on title\n", "first_df = first_rc.merge(movie, left_on='title',right_on='title')\n", "first_df" ] }, { "cell_type": "code", "execution_count": 48, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "movie_id\n", "1 3.872470\n", "2 3.401869\n", "3 3.161017\n", "4 2.384615\n", "5 3.267857\n", " ... 
\n", "161944 5.000000\n", "162376 4.500000\n", "162542 5.000000\n", "162672 3.000000\n", "163949 5.000000\n", "Name: rating, Length: 9066, dtype: float64" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# movie_id별 평점 평균\n", "avg_rating = ratings.groupby('movie_id').mean()\n", "avg_rating = avg_rating['rating']\n", "avg_rating" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 4.2.3)1차 추천\n", " - 상관함수 기반 첫 추천 리스트 10개" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titleimdb_idrating
0Forbidden Planettt00492233.134615
1Wild Wild Westtt01208912.750000
2Armageddontt01205913.177419
3Lara Croft: Tomb Raidertt01463162.375000
4Aliens vs Predator: Requiemtt07587303.561111
5The Lord of the Rings: The Two Towerstt01672613.800000
6Pitch Blacktt01348473.250000
7Return of the Jeditt00861903.260870
8Star Trek VI: The Undiscovered Countrytt01029752.250000
9Indiana Jones and the Temple of Doomtt00874691.666667
\n", "
" ], "text/plain": [ " title imdb_id rating\n", "0 Forbidden Planet tt0049223 3.134615\n", "1 Wild Wild West tt0120891 2.750000\n", "2 Armageddon tt0120591 3.177419\n", "3 Lara Croft: Tomb Raider tt0146316 2.375000\n", "4 Aliens vs Predator: Requiem tt0758730 3.561111\n", "5 The Lord of the Rings: The Two Towers tt0167261 3.800000\n", "6 Pitch Black tt0134847 3.250000\n", "7 Return of the Jedi tt0086190 3.260870\n", "8 Star Trek VI: The Undiscovered Country tt0102975 2.250000\n", "9 Indiana Jones and the Temple of Doom tt0087469 1.666667" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "first_rating = first_df.merge(avg_rating, left_on='movie_id',right_on='movie_id')\n", "first_data = first_rating[['title','imdb_id','rating']]\n", "first_data" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [], "source": [ "#1차 추천 받은 영화의 title, imdb_id, rating\n", "first_data = first_rating[['title','imdb_id','rating']]\n", "first_data = first_data.sort_values(by='rating', ascending=False)[:5]" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titleimdb_idrating
5The Lord of the Rings: The Two Towerstt01672613.800000
4Aliens vs Predator: Requiemtt07587303.561111
7Return of the Jeditt00861903.260870
6Pitch Blacktt01348473.250000
2Armageddontt01205913.177419
\n", "
" ], "text/plain": [ " title imdb_id rating\n", "5 The Lord of the Rings: The Two Towers tt0167261 3.800000\n", "4 Aliens vs Predator: Requiem tt0758730 3.561111\n", "7 Return of the Jedi tt0086190 3.260870\n", "6 Pitch Black tt0134847 3.250000\n", "2 Armageddon tt0120591 3.177419" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "first_data # 5개 중 0,5,2가 내취향이야 -> 이 세 영화에 전부 고평점을 내린 나와 비슷한 유저 탐색" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 4.2.3)첫 번째 추천 함수의 문제점\n", " - 추천받은 10개의 영화는 모든 유저들의 평가로 계산된 결과 \n", " \n", " -> 다양한 취향의 유저들이 평가 \n", " \n", " -> 신뢰성 부족(하지만 다양한 취미의 유저들도 비슷한 평가를 내렸다는 점에서 신빈성 존재)\n", " \n", " -> 비슷한 취향의 유저 탐색 필요(크롤링)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 5.Searching similar user" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 5.1)크롤링 [user_name,user_rating]" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [], "source": [ "import urllib.request\n", "from urllib.request import urlopen\n", "from bs4 import BeautifulSoup" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['tt0167261', 'tt0758730', 'tt0086190', 'tt0134847', 'tt0120591']\n" ] } ], "source": [ "# 첫 추천 영화들의 imdb_id 리스트\n", "imdb_list = first_data['imdb_id'].tolist()\n", "print(imdb_list)" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [], "source": [ "user_name = [] #시리즈에 담을 리스트 선언\n", "user_rating = []\n", "#list를 사용하는 이유 -> dict는 NaN값을 생략 -> 순서 혼란\n", "for i in imdb_list: #10번 수행\n", " html = urlopen(\"https://www.imdb.com/title/\" + i + \"/reviews?ref_=tt_ql_3\") #접근할 html변수\n", " bs = BeautifulSoup(html, \"html.parser\") \n", " tag = bs.select(\"#main div.review-container\") #태그를 담을 변수\n", " i_name = []\n", " i_rating = []\n", " for j in range(len(tag)): #24번 수행(한 페이지에 24개 리뷰 표시)\n", " j += 1\n", " #name\n", " name_seq = \"#main 
div.lister-list div:nth-child(\" + str(j) + \") div.review-container span.display-name-link a\"\n", " name_tag = bs.select(str(name_seq))\n", " name_list = [] \n", " for k in name_tag:\n", " name_list.append(k.text) #24개의 유저 이름 담기\n", " i_name.append(name_list) # 첫 번째 영화에 대한 유저 이름 리스트를 1/10에 담기\n", " #rating\n", " rating_seq = \"#main div.lister-list div:nth-child(\" + str(j) + \") div.review-container div.ipl-ratings-bar span:nth-child(2)\"\n", " rating_tag = bs.select(str(rating_seq))\n", " rating_list = [] \n", " for k in rating_tag:\n", " rating_list.append(k.text) #24개의 유저 레이팅 담기\n", " i_rating.append(rating_list) # 첫 번째 영화에 대한 유저 레이팅 리스트를 1/10에 담기\n", "\n", " user_name.append(i_name)\n", " user_rating.append(i_rating)" ] }, { "cell_type": "code", "execution_count": 55, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "[[['Loving_Silence'],\n", " ['rc_whittle'],\n", " ['MR_Heraclius'],\n", " ['Mithrindir'],\n", " ['docmonster'],\n", " ['justinrsko'],\n", " ['joe_unander'],\n", " ['calcat91355'],\n", " ['jedsalazar'],\n", " ['JohnLennon1985'],\n", " ['Boba_Fett1138'],\n", " ['tfrizzell'],\n", " ['otisfirefly2001'],\n", " ['classicsoncall'],\n", " ['Barky44'],\n", " ['grendelkhan'],\n", " ['jasonmg99'],\n", " ['dcastor'],\n", " ['bopdog'],\n", " ['dreamlanzerl'],\n", " ['sundog1'],\n", " ['John_Mclaren'],\n", " ['fearfulofspiders'],\n", " ['Theo Robertson'],\n", " ['FilmCreature']],\n", " [['call_me_grudge'],\n", " ['nadjadiamond'],\n", " ['LordBlacklist'],\n", " ['emerald_dark'],\n", " ['frich71-1'],\n", " ['luke-346'],\n", " ['mik30'],\n", " ['ignorepeter'],\n", " ['matches81'],\n", " ['Joe_ollie'],\n", " ['mrforestranger'],\n", " ['chechrissie'],\n", " ['alex-1250'],\n", " ['SUNLION777'],\n", " ['tibetanpunk'],\n", " ['littlejimmy835'],\n", " ['Caustic Pulp'],\n", " ['pmaglinger'],\n", " ['ma-cortes'],\n", " ['agentgates'],\n", " ['rijoenpial-1'],\n", " ['alex_dbs'],\n", " ['twilight-zone-1'],\n", " ['Joshy-3'],\n", " 
['mail-1030']],\n", " [['UniqueParticle'],\n", " ['avenatticlint'],\n", " ['ivo-cobra8'],\n", " ['P97'],\n", " ['evanston_dad'],\n", " ['Aaron_Kyle'],\n", " ['ladysolo'],\n", " ['waynegavin1'],\n", " ['Gen S2rt'],\n", " ['ismailkardelen'],\n", " ['exterminator_99'],\n", " ['ccthemovieman-1'],\n", " ['TxMike'],\n", " ['JTurner82'],\n", " ['deepthinker566'],\n", " ['mjw2305'],\n", " ['AlsExGal'],\n", " ['Med-Jasta'],\n", " ['hothfreeze'],\n", " ['simonp-43115'],\n", " ['Lady_Targaryen'],\n", " ['DKosty123'],\n", " ['Sargebri'],\n", " ['luke-a-mcgowan'],\n", " ['Lancer-7']],\n", " [['ParanoidAndroidMarvin'],\n", " ['NightfallRaven'],\n", " ['charles000'],\n", " ['trehling'],\n", " ['atalanta-3'],\n", " ['HAZEL-5'],\n", " ['MovieAddict2016'],\n", " ['mstomaso'],\n", " ['RaziaK'],\n", " ['stephie-12'],\n", " ['lopcar1993'],\n", " ['KristinaElora'],\n", " ['bjmarsha'],\n", " ['hostagtj'],\n", " ['PlanecrazyIkarus'],\n", " ['claudio_carvalho'],\n", " ['LivingDog'],\n", " ['charchuk'],\n", " ['jamesrupert2014'],\n", " ['allegra-sloman'],\n", " ['fractalmama'],\n", " ['sol-'],\n", " ['sddavis63'],\n", " ['malmroes'],\n", " ['efrench']],\n", " [['madmaxmedia'],\n", " ['markokhoward-854-530441'],\n", " ['michael-stenlund'],\n", " ['gavin6942'],\n", " ['0w0'],\n", " ['XJoey'],\n", " ['mjw2305'],\n", " ['NoArrow'],\n", " ['bah_cacatule'],\n", " ['buzznzipp1995'],\n", " ['ccthemovieman-1'],\n", " ['ercfunk-445-950046'],\n", " ['polkadotlegwarmers'],\n", " ['olikomerc-513-868262'],\n", " ['LanieG'],\n", " ['aero_nut'],\n", " ['fermat1313'],\n", " ['bluejamie'],\n", " ['syncopation'],\n", " ['Flinx-2'],\n", " ['Dave-448'],\n", " ['AngelLiloLopez'],\n", " ['Lumpenprole'],\n", " ['MAYESY-44'],\n", " ['Pistol219']]]" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "user_name # [[24명], [24명], [24명], [24명], [24명]] " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 5.2)정규화" ] }, { "cell_type": "markdown", "metadata": {}, 
"source": [ "### 5.2.1) user_name" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
01234
0[Loving_Silence][call_me_grudge][UniqueParticle][ParanoidAndroidMarvin][madmaxmedia]
1[rc_whittle][nadjadiamond][avenatticlint][NightfallRaven][markokhoward-854-530441]
2[MR_Heraclius][LordBlacklist][ivo-cobra8][charles000][michael-stenlund]
3[Mithrindir][emerald_dark][P97][trehling][gavin6942]
4[docmonster][frich71-1][evanston_dad][atalanta-3][0w0]
5[justinrsko][luke-346][Aaron_Kyle][HAZEL-5][XJoey]
6[joe_unander][mik30][ladysolo][MovieAddict2016][mjw2305]
7[calcat91355][ignorepeter][waynegavin1][mstomaso][NoArrow]
8[jedsalazar][matches81][Gen S2rt][RaziaK][bah_cacatule]
9[JohnLennon1985][Joe_ollie][ismailkardelen][stephie-12][buzznzipp1995]
10[Boba_Fett1138][mrforestranger][exterminator_99][lopcar1993][ccthemovieman-1]
11[tfrizzell][chechrissie][ccthemovieman-1][KristinaElora][ercfunk-445-950046]
12[otisfirefly2001][alex-1250][TxMike][bjmarsha][polkadotlegwarmers]
13[classicsoncall][SUNLION777][JTurner82][hostagtj][olikomerc-513-868262]
14[Barky44][tibetanpunk][deepthinker566][PlanecrazyIkarus][LanieG]
15[grendelkhan][littlejimmy835][mjw2305][claudio_carvalho][aero_nut]
16[jasonmg99][Caustic Pulp][AlsExGal][LivingDog][fermat1313]
17[dcastor][pmaglinger][Med-Jasta][charchuk][bluejamie]
18[bopdog][ma-cortes][hothfreeze][jamesrupert2014][syncopation]
19[dreamlanzerl][agentgates][simonp-43115][allegra-sloman][Flinx-2]
20[sundog1][rijoenpial-1][Lady_Targaryen][fractalmama][Dave-448]
21[John_Mclaren][alex_dbs][DKosty123][sol-][AngelLiloLopez]
22[fearfulofspiders][twilight-zone-1][Sargebri][sddavis63][Lumpenprole]
23[Theo Robertson][Joshy-3][luke-a-mcgowan][malmroes][MAYESY-44]
24[FilmCreature][mail-1030][Lancer-7][efrench][Pistol219]
\n", "
" ], "text/plain": [ " 0 1 2 \\\n", "0 [Loving_Silence] [call_me_grudge] [UniqueParticle] \n", "1 [rc_whittle] [nadjadiamond] [avenatticlint] \n", "2 [MR_Heraclius] [LordBlacklist] [ivo-cobra8] \n", "3 [Mithrindir] [emerald_dark] [P97] \n", "4 [docmonster] [frich71-1] [evanston_dad] \n", "5 [justinrsko] [luke-346] [Aaron_Kyle] \n", "6 [joe_unander] [mik30] [ladysolo] \n", "7 [calcat91355] [ignorepeter] [waynegavin1] \n", "8 [jedsalazar] [matches81] [Gen S2rt] \n", "9 [JohnLennon1985] [Joe_ollie] [ismailkardelen] \n", "10 [Boba_Fett1138] [mrforestranger] [exterminator_99] \n", "11 [tfrizzell] [chechrissie] [ccthemovieman-1] \n", "12 [otisfirefly2001] [alex-1250] [TxMike] \n", "13 [classicsoncall] [SUNLION777] [JTurner82] \n", "14 [Barky44] [tibetanpunk] [deepthinker566] \n", "15 [grendelkhan] [littlejimmy835] [mjw2305] \n", "16 [jasonmg99] [Caustic Pulp] [AlsExGal] \n", "17 [dcastor] [pmaglinger] [Med-Jasta] \n", "18 [bopdog] [ma-cortes] [hothfreeze] \n", "19 [dreamlanzerl] [agentgates] [simonp-43115] \n", "20 [sundog1] [rijoenpial-1] [Lady_Targaryen] \n", "21 [John_Mclaren] [alex_dbs] [DKosty123] \n", "22 [fearfulofspiders] [twilight-zone-1] [Sargebri] \n", "23 [Theo Robertson] [Joshy-3] [luke-a-mcgowan] \n", "24 [FilmCreature] [mail-1030] [Lancer-7] \n", "\n", " 3 4 \n", "0 [ParanoidAndroidMarvin] [madmaxmedia] \n", "1 [NightfallRaven] [markokhoward-854-530441] \n", "2 [charles000] [michael-stenlund] \n", "3 [trehling] [gavin6942] \n", "4 [atalanta-3] [0w0] \n", "5 [HAZEL-5] [XJoey] \n", "6 [MovieAddict2016] [mjw2305] \n", "7 [mstomaso] [NoArrow] \n", "8 [RaziaK] [bah_cacatule] \n", "9 [stephie-12] [buzznzipp1995] \n", "10 [lopcar1993] [ccthemovieman-1] \n", "11 [KristinaElora] [ercfunk-445-950046] \n", "12 [bjmarsha] [polkadotlegwarmers] \n", "13 [hostagtj] [olikomerc-513-868262] \n", "14 [PlanecrazyIkarus] [LanieG] \n", "15 [claudio_carvalho] [aero_nut] \n", "16 [LivingDog] [fermat1313] \n", "17 [charchuk] [bluejamie] \n", "18 [jamesrupert2014] [syncopation] 
\n", "19 [allegra-sloman] [Flinx-2] \n", "20 [fractalmama] [Dave-448] \n", "21 [sol-] [AngelLiloLopez] \n", "22 [sddavis63] [Lumpenprole] \n", "23 [malmroes] [MAYESY-44] \n", "24 [efrench] [Pistol219] " ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import re\n", "name_to_df = pd.DataFrame.from_records(user_name,columns=None)\n", "name_to_df = name_to_df.T\n", "name_to_df # 25x5 데이터프레임" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
01234
0Loving_Silencecall_me_grudgeUniqueParticleParanoidAndroidMarvinmadmaxmedia
1rc_whittlenadjadiamondavenatticlintNightfallRavenmarkokhoward-854-530441
2MR_HeracliusLordBlacklistivo-cobra8charles000michael-stenlund
3Mithrindiremerald_darkP97trehlinggavin6942
4docmonsterfrich71-1evanston_dadatalanta-30w0
5justinrskoluke-346Aaron_KyleHAZEL-5XJoey
6joe_unandermik30ladysoloMovieAddict2016mjw2305
7calcat91355ignorepeterwaynegavin1mstomasoNoArrow
8jedsalazarmatches81Gen S2rtRaziaKbah_cacatule
9JohnLennon1985Joe_ollieismailkardelenstephie-12buzznzipp1995
10Boba_Fett1138mrforestrangerexterminator_99lopcar1993ccthemovieman-1
11tfrizzellchechrissieccthemovieman-1KristinaEloraercfunk-445-950046
12otisfirefly2001alex-1250TxMikebjmarshapolkadotlegwarmers
13classicsoncallSUNLION777JTurner82hostagtjolikomerc-513-868262
14Barky44tibetanpunkdeepthinker566PlanecrazyIkarusLanieG
15grendelkhanlittlejimmy835mjw2305claudio_carvalhoaero_nut
16jasonmg99Caustic PulpAlsExGalLivingDogfermat1313
17dcastorpmaglingerMed-Jastacharchukbluejamie
18bopdogma-corteshothfreezejamesrupert2014syncopation
19dreamlanzerlagentgatessimonp-43115allegra-slomanFlinx-2
20sundog1rijoenpial-1Lady_TargaryenfractalmamaDave-448
21John_Mclarenalex_dbsDKosty123sol-AngelLiloLopez
22fearfulofspiderstwilight-zone-1Sargebrisddavis63Lumpenprole
23Theo RobertsonJoshy-3luke-a-mcgowanmalmroesMAYESY-44
24FilmCreaturemail-1030Lancer-7efrenchPistol219
\n", "
" ], "text/plain": [ " 0 1 2 3 \\\n", "0 Loving_Silence call_me_grudge UniqueParticle ParanoidAndroidMarvin \n", "1 rc_whittle nadjadiamond avenatticlint NightfallRaven \n", "2 MR_Heraclius LordBlacklist ivo-cobra8 charles000 \n", "3 Mithrindir emerald_dark P97 trehling \n", "4 docmonster frich71-1 evanston_dad atalanta-3 \n", "5 justinrsko luke-346 Aaron_Kyle HAZEL-5 \n", "6 joe_unander mik30 ladysolo MovieAddict2016 \n", "7 calcat91355 ignorepeter waynegavin1 mstomaso \n", "8 jedsalazar matches81 Gen S2rt RaziaK \n", "9 JohnLennon1985 Joe_ollie ismailkardelen stephie-12 \n", "10 Boba_Fett1138 mrforestranger exterminator_99 lopcar1993 \n", "11 tfrizzell chechrissie ccthemovieman-1 KristinaElora \n", "12 otisfirefly2001 alex-1250 TxMike bjmarsha \n", "13 classicsoncall SUNLION777 JTurner82 hostagtj \n", "14 Barky44 tibetanpunk deepthinker566 PlanecrazyIkarus \n", "15 grendelkhan littlejimmy835 mjw2305 claudio_carvalho \n", "16 jasonmg99 Caustic Pulp AlsExGal LivingDog \n", "17 dcastor pmaglinger Med-Jasta charchuk \n", "18 bopdog ma-cortes hothfreeze jamesrupert2014 \n", "19 dreamlanzerl agentgates simonp-43115 allegra-sloman \n", "20 sundog1 rijoenpial-1 Lady_Targaryen fractalmama \n", "21 John_Mclaren alex_dbs DKosty123 sol- \n", "22 fearfulofspiders twilight-zone-1 Sargebri sddavis63 \n", "23 Theo Robertson Joshy-3 luke-a-mcgowan malmroes \n", "24 FilmCreature mail-1030 Lancer-7 efrench \n", "\n", " 4 \n", "0 madmaxmedia \n", "1 markokhoward-854-530441 \n", "2 michael-stenlund \n", "3 gavin6942 \n", "4 0w0 \n", "5 XJoey \n", "6 mjw2305 \n", "7 NoArrow \n", "8 bah_cacatule \n", "9 buzznzipp1995 \n", "10 ccthemovieman-1 \n", "11 ercfunk-445-950046 \n", "12 polkadotlegwarmers \n", "13 olikomerc-513-868262 \n", "14 LanieG \n", "15 aero_nut \n", "16 fermat1313 \n", "17 bluejamie \n", "18 syncopation \n", "19 Flinx-2 \n", "20 Dave-448 \n", "21 AngelLiloLopez \n", "22 Lumpenprole \n", "23 MAYESY-44 \n", "24 Pistol219 " ] }, "execution_count": 57, "metadata": {}, 
"output_type": "execute_result" } ], "source": [ "#괄호 제거\n", "for i in range(len(name_to_df.columns)):\n", " name_to_df[i] = name_to_df[i].str[0]\n", "name_to_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 5.2.2)user_rating" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
01234
0[10][2][10][10][8]
1[10][4][10][8][7]
2[10][1][8][][10]
3[10][1][9][8][7]
4[][1][10][10][7]
5[10][2][9][10][8]
6[10][1][9][8][8]
7[9][5][10][8][]
8[10][2][10][10][10]
9[10][4][10][10][8]
10[10][1][7][7][]
11[][1][9][10][8]
12[][1][][8][7]
13[10][1][10][10][9]
14[9][1][10][][]
15[][2][10][8][10]
16[10][9][7][10][]
17[9][1][10][8][2]
18[10][5][10][7][4]
19[10][1][10][7][1]
20[][1][10][10][10]
21[10][1][9][][10]
22[10][3][][7][3]
23[9][2][5][7][10]
24[10][1][10][10][10]
\n", "
" ], "text/plain": [ " 0 1 2 3 4\n", "0 [10] [2] [10] [10] [8]\n", "1 [10] [4] [10] [8] [7]\n", "2 [10] [1] [8] [] [10]\n", "3 [10] [1] [9] [8] [7]\n", "4 [] [1] [10] [10] [7]\n", "5 [10] [2] [9] [10] [8]\n", "6 [10] [1] [9] [8] [8]\n", "7 [9] [5] [10] [8] []\n", "8 [10] [2] [10] [10] [10]\n", "9 [10] [4] [10] [10] [8]\n", "10 [10] [1] [7] [7] []\n", "11 [] [1] [9] [10] [8]\n", "12 [] [1] [] [8] [7]\n", "13 [10] [1] [10] [10] [9]\n", "14 [9] [1] [10] [] []\n", "15 [] [2] [10] [8] [10]\n", "16 [10] [9] [7] [10] []\n", "17 [9] [1] [10] [8] [2]\n", "18 [10] [5] [10] [7] [4]\n", "19 [10] [1] [10] [7] [1]\n", "20 [] [1] [10] [10] [10]\n", "21 [10] [1] [9] [] [10]\n", "22 [10] [3] [] [7] [3]\n", "23 [9] [2] [5] [7] [10]\n", "24 [10] [1] [10] [10] [10]" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "rating_to_df = pd.DataFrame.from_records(user_rating,columns=None)\n", "rating_to_df = rating_to_df.T\n", "rating_to_df" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
01234
010210108
11041087
21018NaN10
3101987
4NaN110107
51029108
6101988
795108NaN
8102101010
910410108
1010177NaN
11NaN19108
12NaN1NaN87
1310110109
149110NaNNaN
15NaN210810
16109710NaN
17911082
181051074
191011071
20NaN1101010
211019NaN10
22103NaN73
23925710
24101101010
\n", "
" ], "text/plain": [ " 0 1 2 3 4\n", "0 10 2 10 10 8\n", "1 10 4 10 8 7\n", "2 10 1 8 NaN 10\n", "3 10 1 9 8 7\n", "4 NaN 1 10 10 7\n", "5 10 2 9 10 8\n", "6 10 1 9 8 8\n", "7 9 5 10 8 NaN\n", "8 10 2 10 10 10\n", "9 10 4 10 10 8\n", "10 10 1 7 7 NaN\n", "11 NaN 1 9 10 8\n", "12 NaN 1 NaN 8 7\n", "13 10 1 10 10 9\n", "14 9 1 10 NaN NaN\n", "15 NaN 2 10 8 10\n", "16 10 9 7 10 NaN\n", "17 9 1 10 8 2\n", "18 10 5 10 7 4\n", "19 10 1 10 7 1\n", "20 NaN 1 10 10 10\n", "21 10 1 9 NaN 10\n", "22 10 3 NaN 7 3\n", "23 9 2 5 7 10\n", "24 10 1 10 10 10" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "for i in range(len(rating_to_df.columns)):\n", " rating_to_df[i] = rating_to_df[i].str[0]\n", "rating_to_df\n", "#{ [], [] , []...}형식은 NaN값이 당겨짐 -> [[], [], []...]를 사용한 이유" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 5.3)병합" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [], "source": [ "# merge 함수\n", "def merge(i):\n", " df_dict = {\"user\" : name_to_df[i],\n", " \"rating\" : rating_to_df[i]}\n", " return pd.DataFrame(df_dict)" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userrating
0Loving_Silence10
1rc_whittle10
2MR_Heraclius10
3Mithrindir10
4docmonsterNaN
5justinrsko10
6joe_unander10
7calcat913559
8jedsalazar10
9JohnLennon198510
10Boba_Fett113810
11tfrizzellNaN
12otisfirefly2001NaN
13classicsoncall10
14Barky449
15grendelkhanNaN
16jasonmg9910
17dcastor9
18bopdog10
19dreamlanzerl10
20sundog1NaN
21John_Mclaren10
22fearfulofspiders10
23Theo Robertson9
24FilmCreature10
\n", "
" ], "text/plain": [ " user rating\n", "0 Loving_Silence 10\n", "1 rc_whittle 10\n", "2 MR_Heraclius 10\n", "3 Mithrindir 10\n", "4 docmonster NaN\n", "5 justinrsko 10\n", "6 joe_unander 10\n", "7 calcat91355 9\n", "8 jedsalazar 10\n", "9 JohnLennon1985 10\n", "10 Boba_Fett1138 10\n", "11 tfrizzell NaN\n", "12 otisfirefly2001 NaN\n", "13 classicsoncall 10\n", "14 Barky44 9\n", "15 grendelkhan NaN\n", "16 jasonmg99 10\n", "17 dcastor 9\n", "18 bopdog 10\n", "19 dreamlanzerl 10\n", "20 sundog1 NaN\n", "21 John_Mclaren 10\n", "22 fearfulofspiders 10\n", "23 Theo Robertson 9\n", "24 FilmCreature 10" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# merges에 5개의 df 만들어 담기 -> df(username,rating) x 5\n", "merges = {}\n", "for i in range(5):\n", " merges[i] = pd.DataFrame(merge(i))\n", "\n", "merges[0] # 첫 번째 추천 영화에 대한 유저,평점 확인" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 5.4)매니악 유저 탐색 with concat" ] }, { "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userrating
0Loving_Silence10
1rc_whittle10
2MR_Heraclius10
3Mithrindir10
4docmonsterNaN
.........
20Dave-44810
21AngelLiloLopez10
22Lumpenprole3
23MAYESY-4410
24Pistol21910
\n", "

125 rows × 2 columns

\n", "
" ], "text/plain": [ " user rating\n", "0 Loving_Silence 10\n", "1 rc_whittle 10\n", "2 MR_Heraclius 10\n", "3 Mithrindir 10\n", "4 docmonster NaN\n", ".. ... ...\n", "20 Dave-448 10\n", "21 AngelLiloLopez 10\n", "22 Lumpenprole 3\n", "23 MAYESY-44 10\n", "24 Pistol219 10\n", "\n", "[125 rows x 2 columns]" ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "merge_row = pd.concat([merges[0],merges[1],merges[2],merges[3],merges[4]],axis=0)\n", "merge_row" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "mjw2305 2\n", "ccthemovieman-1 2\n", "chechrissie 1\n", "tibetanpunk 1\n", "John_Mclaren 1\n", " ..\n", "twilight-zone-1 1\n", "otisfirefly2001 1\n", "Lancer-7 1\n", "ivo-cobra8 1\n", "jedsalazar 1\n", "Name: user, Length: 123, dtype: int64" ] }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "merge_row['user'].value_counts() \n", "# 5개의 영화 중 2개의 영화를 시청한 mjw2305, ccthemovieman-1유저에 대한 세부 탐색 필요" ] }, { "cell_type": "code", "execution_count": 64, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userratinguserratinguserratinguserratinguserrating
0Loving_Silence10call_me_grudge2UniqueParticle10ParanoidAndroidMarvin10madmaxmedia8
1rc_whittle10nadjadiamond4avenatticlint10NightfallRaven8markokhoward-854-5304417
2MR_Heraclius10LordBlacklist1ivo-cobra88charles000NaNmichael-stenlund10
3Mithrindir10emerald_dark1P979trehling8gavin69427
4docmonsterNaNfrich71-11evanston_dad10atalanta-3100w07
5justinrsko10luke-3462Aaron_Kyle9HAZEL-510XJoey8
6joe_unander10mik301ladysolo9MovieAddict20168mjw23058
7calcat913559ignorepeter5waynegavin110mstomaso8NoArrowNaN
8jedsalazar10matches812Gen S2rt10RaziaK10bah_cacatule10
9JohnLennon198510Joe_ollie4ismailkardelen10stephie-1210buzznzipp19958
10Boba_Fett113810mrforestranger1exterminator_997lopcar19937ccthemovieman-1NaN
11tfrizzellNaNchechrissie1ccthemovieman-19KristinaElora10ercfunk-445-9500468
12otisfirefly2001NaNalex-12501TxMikeNaNbjmarsha8polkadotlegwarmers7
13classicsoncall10SUNLION7771JTurner8210hostagtj10olikomerc-513-8682629
14Barky449tibetanpunk1deepthinker56610PlanecrazyIkarusNaNLanieGNaN
15grendelkhanNaNlittlejimmy8352mjw230510claudio_carvalho8aero_nut10
16jasonmg9910Caustic Pulp9AlsExGal7LivingDog10fermat1313NaN
17dcastor9pmaglinger1Med-Jasta10charchuk8bluejamie2
18bopdog10ma-cortes5hothfreeze10jamesrupert20147syncopation4
19dreamlanzerl10agentgates1simonp-4311510allegra-sloman7Flinx-21
20sundog1NaNrijoenpial-11Lady_Targaryen10fractalmama10Dave-44810
21John_Mclaren10alex_dbs1DKosty1239sol-NaNAngelLiloLopez10
22fearfulofspiders10twilight-zone-13SargebriNaNsddavis637Lumpenprole3
23Theo Robertson9Joshy-32luke-a-mcgowan5malmroes7MAYESY-4410
24FilmCreature10mail-10301Lancer-710efrench10Pistol21910
\n", "
" ], "text/plain": [ " user rating user rating user rating \\\n", "0 Loving_Silence 10 call_me_grudge 2 UniqueParticle 10 \n", "1 rc_whittle 10 nadjadiamond 4 avenatticlint 10 \n", "2 MR_Heraclius 10 LordBlacklist 1 ivo-cobra8 8 \n", "3 Mithrindir 10 emerald_dark 1 P97 9 \n", "4 docmonster NaN frich71-1 1 evanston_dad 10 \n", "5 justinrsko 10 luke-346 2 Aaron_Kyle 9 \n", "6 joe_unander 10 mik30 1 ladysolo 9 \n", "7 calcat91355 9 ignorepeter 5 waynegavin1 10 \n", "8 jedsalazar 10 matches81 2 Gen S2rt 10 \n", "9 JohnLennon1985 10 Joe_ollie 4 ismailkardelen 10 \n", "10 Boba_Fett1138 10 mrforestranger 1 exterminator_99 7 \n", "11 tfrizzell NaN chechrissie 1 ccthemovieman-1 9 \n", "12 otisfirefly2001 NaN alex-1250 1 TxMike NaN \n", "13 classicsoncall 10 SUNLION777 1 JTurner82 10 \n", "14 Barky44 9 tibetanpunk 1 deepthinker566 10 \n", "15 grendelkhan NaN littlejimmy835 2 mjw2305 10 \n", "16 jasonmg99 10 Caustic Pulp 9 AlsExGal 7 \n", "17 dcastor 9 pmaglinger 1 Med-Jasta 10 \n", "18 bopdog 10 ma-cortes 5 hothfreeze 10 \n", "19 dreamlanzerl 10 agentgates 1 simonp-43115 10 \n", "20 sundog1 NaN rijoenpial-1 1 Lady_Targaryen 10 \n", "21 John_Mclaren 10 alex_dbs 1 DKosty123 9 \n", "22 fearfulofspiders 10 twilight-zone-1 3 Sargebri NaN \n", "23 Theo Robertson 9 Joshy-3 2 luke-a-mcgowan 5 \n", "24 FilmCreature 10 mail-1030 1 Lancer-7 10 \n", "\n", " user rating user rating \n", "0 ParanoidAndroidMarvin 10 madmaxmedia 8 \n", "1 NightfallRaven 8 markokhoward-854-530441 7 \n", "2 charles000 NaN michael-stenlund 10 \n", "3 trehling 8 gavin6942 7 \n", "4 atalanta-3 10 0w0 7 \n", "5 HAZEL-5 10 XJoey 8 \n", "6 MovieAddict2016 8 mjw2305 8 \n", "7 mstomaso 8 NoArrow NaN \n", "8 RaziaK 10 bah_cacatule 10 \n", "9 stephie-12 10 buzznzipp1995 8 \n", "10 lopcar1993 7 ccthemovieman-1 NaN \n", "11 KristinaElora 10 ercfunk-445-950046 8 \n", "12 bjmarsha 8 polkadotlegwarmers 7 \n", "13 hostagtj 10 olikomerc-513-868262 9 \n", "14 PlanecrazyIkarus NaN LanieG NaN \n", "15 claudio_carvalho 8 aero_nut 
10 \n", "16 LivingDog 10 fermat1313 NaN \n", "17 charchuk 8 bluejamie 2 \n", "18 jamesrupert2014 7 syncopation 4 \n", "19 allegra-sloman 7 Flinx-2 1 \n", "20 fractalmama 10 Dave-448 10 \n", "21 sol- NaN AngelLiloLopez 10 \n", "22 sddavis63 7 Lumpenprole 3 \n", "23 malmroes 7 MAYESY-44 10 \n", "24 efrench 10 Pistol219 10 " ] }, "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ "merge_col = pd.concat([merges[0],merges[1],merges[2],merges[3],merges[4]],axis=1)\n", "merge_col" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userratinguserratinguserratinguserratinguserrating
0NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
2NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
3NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
5NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
6NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
7NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
8NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
9NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
10NaNNaNNaNNaNNaNNaNNaNNaNccthemovieman-1NaN
11NaNNaNNaNNaNccthemovieman-1NaNNaNNaNNaNNaN
12NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
13NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
14NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
15NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
16NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
17NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
18NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
19NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
20NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
21NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
22NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
23NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
24NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", "
" ], "text/plain": [ " user rating user rating user rating user rating \\\n", "0 NaN NaN NaN NaN NaN NaN NaN NaN \n", "1 NaN NaN NaN NaN NaN NaN NaN NaN \n", "2 NaN NaN NaN NaN NaN NaN NaN NaN \n", "3 NaN NaN NaN NaN NaN NaN NaN NaN \n", "4 NaN NaN NaN NaN NaN NaN NaN NaN \n", "5 NaN NaN NaN NaN NaN NaN NaN NaN \n", "6 NaN NaN NaN NaN NaN NaN NaN NaN \n", "7 NaN NaN NaN NaN NaN NaN NaN NaN \n", "8 NaN NaN NaN NaN NaN NaN NaN NaN \n", "9 NaN NaN NaN NaN NaN NaN NaN NaN \n", "10 NaN NaN NaN NaN NaN NaN NaN NaN \n", "11 NaN NaN NaN NaN ccthemovieman-1 NaN NaN NaN \n", "12 NaN NaN NaN NaN NaN NaN NaN NaN \n", "13 NaN NaN NaN NaN NaN NaN NaN NaN \n", "14 NaN NaN NaN NaN NaN NaN NaN NaN \n", "15 NaN NaN NaN NaN NaN NaN NaN NaN \n", "16 NaN NaN NaN NaN NaN NaN NaN NaN \n", "17 NaN NaN NaN NaN NaN NaN NaN NaN \n", "18 NaN NaN NaN NaN NaN NaN NaN NaN \n", "19 NaN NaN NaN NaN NaN NaN NaN NaN \n", "20 NaN NaN NaN NaN NaN NaN NaN NaN \n", "21 NaN NaN NaN NaN NaN NaN NaN NaN \n", "22 NaN NaN NaN NaN NaN NaN NaN NaN \n", "23 NaN NaN NaN NaN NaN NaN NaN NaN \n", "24 NaN NaN NaN NaN NaN NaN NaN NaN \n", "\n", " user rating \n", "0 NaN NaN \n", "1 NaN NaN \n", "2 NaN NaN \n", "3 NaN NaN \n", "4 NaN NaN \n", "5 NaN NaN \n", "6 NaN NaN \n", "7 NaN NaN \n", "8 NaN NaN \n", "9 NaN NaN \n", "10 ccthemovieman-1 NaN \n", "11 NaN NaN \n", "12 NaN NaN \n", "13 NaN NaN \n", "14 NaN NaN \n", "15 NaN NaN \n", "16 NaN NaN \n", "17 NaN NaN \n", "18 NaN NaN \n", "19 NaN NaN \n", "20 NaN NaN \n", "21 NaN NaN \n", "22 NaN NaN \n", "23 NaN NaN \n", "24 NaN NaN " ] }, "execution_count": 65, "metadata": {}, "output_type": "execute_result" } ], "source": [ "merge_col[merge_col == 'ccthemovieman-1'] \n", "#ccthemovieman-1유저는 3,5번 째 영화를 평가" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userratinguserratinguserratinguserratinguserrating
0NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
2NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
3NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
5NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
6NaNNaNNaNNaNNaNNaNNaNNaNmjw2305NaN
7NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
8NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
9NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
10NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
11NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
12NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
13NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
14NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
15NaNNaNNaNNaNmjw2305NaNNaNNaNNaNNaN
16NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
17NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
18NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
19NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
20NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
21NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
22NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
23NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
24NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", "
" ], "text/plain": [ " user rating user rating user rating user rating user rating\n", "0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN\n", "1 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN\n", "2 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN\n", "3 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN\n", "4 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN\n", "5 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN\n", "6 NaN NaN NaN NaN NaN NaN NaN NaN mjw2305 NaN\n", "7 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN\n", "8 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN\n", "9 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN\n", "10 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN\n", "11 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN\n", "12 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN\n", "13 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN\n", "14 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN\n", "15 NaN NaN NaN NaN mjw2305 NaN NaN NaN NaN NaN\n", "16 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN\n", "17 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN\n", "18 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN\n", "19 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN\n", "20 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN\n", "21 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN\n", "22 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN\n", "23 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN\n", "24 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN" ] }, "execution_count": 66, "metadata": {}, "output_type": "execute_result" } ], "source": [ "merge_col[merge_col == 'mjw2305'] \n", "#mjw2305유저도 3,5번 째 영화를 평가" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userrating_xrating_y
0ccthemovieman-19NaN
1mjw2305108
\n", "
" ], "text/plain": [ " user rating_x rating_y\n", "0 ccthemovieman-1 9 NaN\n", "1 mjw2305 10 8" ] }, "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#조인하여 해당 유저들이 몇 점을 부여했는지 관찰\n", "mergedStuff = pd.merge(merges[2], merges[4], on=['user'], how='inner')\n", "mergedStuff.head()\n", "#ccthemovieman-1유저는 5번째 영화에는 평점을 주지 않음 -> 부정적인 평가라고 판단\n", "#두 영화 모두에 고평가를 내린 mjw2305가 해당 분야의 매니악 유저일 것으로 유력\n", "#mjw2305유저 세부 탐색 필요" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 6.Recommendation[2nd]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 6.1)유저 리뷰 분석 함수 \n", "<참고> https://wikidocs.net/24586" ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Anaconda\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:516: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", " _np_qint8 = np.dtype([(\"qint8\", np.int8, 1)])\n", "C:\\Anaconda\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:517: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", " _np_quint8 = np.dtype([(\"quint8\", np.uint8, 1)])\n", "C:\\Anaconda\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:518: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", " _np_qint16 = np.dtype([(\"qint16\", np.int16, 1)])\n", "C:\\Anaconda\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:519: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", " _np_quint16 = np.dtype([(\"quint16\", 
np.uint16, 1)])\n", "C:\\Anaconda\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:520: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", " _np_qint32 = np.dtype([(\"qint32\", np.int32, 1)])\n", "C:\\Anaconda\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:525: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", " np_resource = np.dtype([(\"resource\", np.ubyte, 1)])\n", "C:\\Anaconda\\lib\\site-packages\\tensorboard\\compat\\tensorflow_stub\\dtypes.py:541: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", " _np_qint8 = np.dtype([(\"qint8\", np.int8, 1)])\n", "C:\\Anaconda\\lib\\site-packages\\tensorboard\\compat\\tensorflow_stub\\dtypes.py:542: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", " _np_quint8 = np.dtype([(\"quint8\", np.uint8, 1)])\n", "C:\\Anaconda\\lib\\site-packages\\tensorboard\\compat\\tensorflow_stub\\dtypes.py:543: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", " _np_qint16 = np.dtype([(\"qint16\", np.int16, 1)])\n", "C:\\Anaconda\\lib\\site-packages\\tensorboard\\compat\\tensorflow_stub\\dtypes.py:544: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", " _np_quint16 = np.dtype([(\"quint16\", np.uint16, 1)])\n", "C:\\Anaconda\\lib\\site-packages\\tensorboard\\compat\\tensorflow_stub\\dtypes.py:545: FutureWarning: Passing (type, 1) or 
'1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", " _np_qint32 = np.dtype([(\"qint32\", np.int32, 1)])\n", "C:\\Anaconda\\lib\\site-packages\\tensorboard\\compat\\tensorflow_stub\\dtypes.py:550: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", " np_resource = np.dtype([(\"resource\", np.ubyte, 1)])\n" ] } ], "source": [ "from tensorflow.keras.datasets import imdb\n", "%matplotlib inline\n", "import matplotlib.pyplot as plt\n", "import numpy as np" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 6.1.1)데이터셋" ] }, { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [], "source": [ "(X_train, y_train), (X_test, y_test) = imdb.load_data()\n", "# (X_train, y_train), (X_test, y_test) = imdb.load_data(num_words = 10000) #등장 빈도 순위 1~10,000제한" ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "((25000,), (25000,))" ] }, "execution_count": 70, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train.shape, y_train.shape #25000개의 훈련 데이터" ] }, { "cell_type": "code", "execution_count": 71, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "((25000,), (25000,))" ] }, "execution_count": 71, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_test.shape, y_test.shape #25000개의 테스트 데이터" ] }, { "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "훈련용 리뷰 개수 : 25000\n", "테스트용 리뷰 개수 : 25000\n", "카테고리 : 2\n" ] } ], "source": [ "print('훈련용 리뷰 개수 : {}'.format(len(X_train)))\n", "print('테스트용 리뷰 개수 : {}'.format(len(X_test)))\n", "num_classes = max(y_train) + 1\n", "print('카테고리 : {}'.format(num_classes))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 6.1.2)상위 빈도수 단어 확인" ] 
}, { "cell_type": "code", "execution_count": 73, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "빈도수 상위 1번 단어 : liked\n" ] } ], "source": [ "word_to_index = imdb.get_word_index()\n", "index_to_word={}\n", "for key, value in word_to_index.items():\n", " index_to_word[value] = key\n", " \n", "print('빈도수 상위 1번 단어 : {}'.format(index_to_word[420]))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 6.1.3)훈련 데이터 디코딩 확인" ] }, { "cell_type": "code", "execution_count": 74, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "the as you with out themselves powerful lets loves their becomes reaching had journalist of lot from anyone to have after out atmosphere never more room titillate it so heart shows to years of every never going villaronga help moments or of every chest visual movie except her was several of enough more with is now current film as you of mine potentially unfortunately of you than him that with out themselves her get for was camp of you movie sometimes movie that with scary but pratfalls to story wonderful that in seeing in character to of 70s musicians with heart had shadows they of here that with her serious to have does when from why what have critics they is you that isn't one will very to as itself with other tricky in of seen over landed for anyone of gilmore's br show's to whether from than out themselves history he name half some br of 'n odd was two most of mean for 1 any an boat she he should is thought frog but of script you not while history he heart to real at barrel but when from one bit then have two of script their with her nobody most that with wasn't to with armed acting watch an for with heartfelt film want an\n" ] } ], "source": [ "#token화 되기 전 훈련데이터[0]\n", "print(' '.join([index_to_word[X] for X in X_train[0]]))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 6.1.4)모델링" ] }, { "cell_type": "code", 
"execution_count": 75, "metadata": {}, "outputs": [], "source": [ "from tensorflow.keras.datasets import imdb\n", "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", "from tensorflow.keras.models import Sequential\n", "from tensorflow.keras.layers import Dense, LSTM, Embedding\n", "from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint\n", "from tensorflow.keras.models import load_model" ] }, { "cell_type": "code", "execution_count": 76, "metadata": {}, "outputs": [], "source": [ "max_len = 300 #문장 길이 맞추기 -> 학습이 부족하면 0이 0.57등으로 편향될 확률 존재\n", "#정한 길이를 초과하면 초과분을 삭제하고, 부족하면 0으로 보충\n", "X_train = pad_sequences(X_train, maxlen=max_len)\n", "X_test = pad_sequences(X_test, maxlen=max_len)" ] }, { "cell_type": "code", "execution_count": 77, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "WARNING: Logging before flag parsing goes to stderr.\n", "W0626 19:16:06.041751 11044 deprecation.py:506] From C:\\Anaconda\\lib\\site-packages\\tensorflow\\python\\keras\\initializers.py:119: calling RandomUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Call initializer instance with the dtype argument instead of passing it to the constructor\n", "W0626 19:16:06.063730 11044 deprecation.py:506] From C:\\Anaconda\\lib\\site-packages\\tensorflow\\python\\ops\\init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Call initializer instance with the dtype argument instead of passing it to the constructor\n" ] } ], "source": [ "model = Sequential()\n", "model.add(Embedding(100000, 120)) #단어 집합 크기, 임베딩 후 벡터크기\n", "model.add(LSTM(120))\n", "model.add(Dense(1, activation='sigmoid')) #긍정/부정 이진 분류이므로 활성화 함수로 시그모이드 함수 채택" ] }, { "cell_type": "code", "execution_count": 78, "metadata": {}, 
"outputs": [], "source": [ "es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)\n", "mc = ModelCheckpoint('sentiment_model', monitor='val_acc', mode='max', verbose=1, save_best_only=True)" ] }, { "cell_type": "code", "execution_count": 79, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "W0626 19:16:06.405204 11044 deprecation.py:323] From C:\\Anaconda\\lib\\site-packages\\tensorflow\\python\\ops\\nn_impl.py:180: add_dispatch_support..wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Use tf.where in 2.0, which has the same broadcast rule as np.where\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Train on 25000 samples, validate on 25000 samples\n", "24960/25000 [============================>.] - ETA: 0s - loss: 0.4267 - acc: 0.7988\n", "Epoch 00001: val_acc improved from -inf to 0.85832, saving model to sentiment_model\n", "25000/25000 [==============================] - 261s 10ms/sample - loss: 0.4264 - acc: 0.7991 - val_loss: 0.3335 - val_acc: 0.8583\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 79, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])\n", "model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=1, batch_size=64, callbacks=[es, mc])" ] }, { "cell_type": "code", "execution_count": 80, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "W0626 19:20:28.609368 11044 deprecation.py:506] From C:\\Anaconda\\lib\\site-packages\\tensorflow\\python\\ops\\init_ops.py:97: calling GlorotUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Call initializer instance with the dtype argument instead of passing it to the constructor\n", "W0626 
19:20:28.611304 11044 deprecation.py:506] From C:\\Anaconda\\lib\\site-packages\\tensorflow\\python\\ops\\init_ops.py:97: calling Orthogonal.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Call initializer instance with the dtype argument instead of passing it to the constructor\n", "W0626 19:20:28.611551 11044 deprecation.py:506] From C:\\Anaconda\\lib\\site-packages\\tensorflow\\python\\ops\\init_ops.py:97: calling Zeros.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Call initializer instance with the dtype argument instead of passing it to the constructor\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "25000/25000 [==============================] - 57s 2ms/sample - loss: 0.3335 - acc: 0.8583\n", "\n", " 테스트 정확도: 0.8583\n" ] } ], "source": [ "loaded_model = load_model('sentiment_model')\n", "print(\"\\n 테스트 정확도: %.4f\" % (loaded_model.evaluate(X_test, y_test)[1]))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 6.2)유저 리뷰 크롤링\n", "<참고> https://wikidocs.net/24586" ] }, { "cell_type": "code", "execution_count": 81, "metadata": {}, "outputs": [], "source": [ "maniac_url = 'https://www.imdb.com/user/ur3174947/?ref_=tt_urv'\n", "page = urlopen(maniac_url)\n", "bs = BeautifulSoup(page, \"html.parser\")\n", "#main > div.widgets > div.widget.article.reviews > div > div:nth-child(2) > h3 > a\n", "tag = bs.select(\"#main > div.widgets > div.widget.article.reviews > div\")" ] }, { "cell_type": "code", "execution_count": 82, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Next'" ] }, "execution_count": 82, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#영화 이름\n", "movie_tags = bs.select(\"h3 > a\")\n", "movie_names = [i.text for i in movie_tags]\n", "movie_names[24]" ] }, { "cell_type": "code", "execution_count": 
83, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Cris Johnson (Nicolas Cage) earns a living in a Las Vegas magic act, where the magic is no illusion, he really can see up to two minutes into the future, which has drawn the attention of FBI agent Callie Ferris (Julianne Moore) and she wants him to help foil a plot to detonate a nuclear bomb in the heart of Los Angles. Johnson and his unsuspecting girl (Jessica Biel), who is destined to be in his future, become caught up in race against time that only he can stop.Despite its flaws, Next becomes a very compelling action thriller, that sees a really good concept play out with style. I found it was great fun to watch, and i particularly enjoyed the way the directors utilised the gift in Cages character; its was pretty original and very cool; so much so that it glossed over any flaws very well.8/10 great fun action'" ] }, "execution_count": 83, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#영화 리뷰\n", "review_tags = bs.find('div', class_='reviews')\n", "review_tags = bs.findAll('p')\n", "reviews = [i.text for i in review_tags]\n", "reviews = reviews[:25]\n", "reviews[24]" ] }, { "cell_type": "code", "execution_count": 84, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
moviereview
0Toy Story 4This forth, and hopefully, final outing for Wo...
1Toy Story 3I have always loved these characters and once ...
2Toy StoryWow, is this movie really 25 years old now!I r...
3I Am LegendI watched this movie years ago, and very delib...
4Star Wars: Episode III - Revenge of the SithRevenge of the Sith is now a classic piece of ...
\n", "
" ], "text/plain": [ " movie \\\n", "0 Toy Story 4 \n", "1 Toy Story 3 \n", "2 Toy Story \n", "3 I Am Legend \n", "4 Star Wars: Episode III - Revenge of the Sith \n", "\n", " review \n", "0 This forth, and hopefully, final outing for Wo... \n", "1 I have always loved these characters and once ... \n", "2 Wow, is this movie really 25 years old now!I r... \n", "3 I watched this movie years ago, and very delib... \n", "4 Revenge of the Sith is now a classic piece of ... " ] }, "execution_count": 84, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#df[name,reivew]\n", "name_review_data = {\"movie\" : movie_names,\n", " \"review\" : reviews}\n", "name_review = pd.DataFrame(name_review_data)\n", "#훈련 데이터 사전에 등록되지 않은 단어 제거\n", "name_review['review'] = name_review['review'].str.replace('vengenance','')\n", "name_review['review'] = name_review['review'].str.replace('enought','')\n", "name_review['review'] = name_review['review'].str.replace('boobfest','')\n", "name_review['review'] = name_review['review'].str.replace('lol9','')\n", "name_review['review'] = name_review['review'].str.replace('annimation','')\n", "name_review['review'] = name_review['review'].str.replace('me4','')\n", "name_review['review'] = name_review['review'].str.replace('dispells','')\n", "name_review['review'] = name_review['review'].str.replace('Salvavation','')\n", "name_review['review'] = name_review['review'].str.replace('adifferent','')\n", "name_review['review'] = name_review['review'].str.replace('Upload','')\n", "name_review['review'] = name_review['review'].str.replace('seemlessly','')\n", "name_review['review'] = name_review['review'].str.replace('far','')\n", "name_review['review'] = name_review['review'].str.replace('attrocity','')\n", "name_review['review'] = name_review['review'].str.replace('testiment','')\n", "name_review['review'] = name_review['review'].str.replace('daren','')\n", "name_review['review'] = name_review['review'].str.replace('Hypercube','')\n", 
"name_review['review'] = name_review['review'].str.replace('lockdown','')\n", "name_review['review'] = name_review['review'].str.replace('Skipp','')\n", "name_review['review'] = name_review['review'].str.replace('Kotzwinkle','')\n", "name_review['review'] = name_review['review'].str.replace('Callie','')\n", "name_review['review'] = name_review['review'].str.replace('foreword','')\n", "name_review['review'] = name_review['review'].str.replace('Nia','')\n", "name_review['review'] = name_review['review'].str.replace('Vardalos','')\n", "name_review['review'] = name_review['review'].str.replace('Kindgom','')\n", "name_review['review'] = name_review['review'].str.replace('Matterson','')\n", "name_review.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 6.3)감정 분석" ] }, { "cell_type": "code", "execution_count": 85, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "101.8" ] }, "execution_count": 85, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#리뷰의 평균 단어 개수\n", "length = [len(i.split()) for i in name_review['review']]\n", "length = np.array(length)\n", "length.mean() #maxlen에 참고" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 6.3.1) 리뷰 정규화" ] }, { "cell_type": "code", "execution_count": 86, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "\"This forth, and hopefully, final outing for Woody and the gang dissapointed me greatly I am sad to say!My issue is that the trilogy as it stood was genius and had run full circle to a truely brilliant conclusion!Toy Story 4 adds nothing new or fresh, it lacks the heart of the other films where it truely matters and it felt like the first milking of the franchise!Don't get me wrong, I laughed in parts, some of the scenes were pretty good and it was simply brilliantly animated.But, for me, it didn't need to be made and left me feeling extremely MEH!!!!\"" ] }, "execution_count": 86, "metadata": {}, "output_type": "execute_result" } ], "source": [ 
"name_review[\"review\"][0]" ] }, { "cell_type": "code", "execution_count": 87, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(25,)" ] }, "execution_count": 87, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#정규화\n", "indexs = []\n", "for i in name_review[\"review\"]:\n", " i = re.sub(\"[^\\w]\", \" \",i).split()\n", " i = [x.lower() for x in i]\n", " i = [word_to_index[x] for x in i]\n", " i = np.array(i) # i(list) -> array\n", " indexs.append(i)\n", "indexs = np.array(indexs) # indexs(list) -> array\n", "\n", "indexs.shape" ] }, { "cell_type": "code", "execution_count": 88, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 11, 2586, 2, 2360, 474, 5672, 15, 2289, 2,\n", " 1, 1363, 33534, 69, 3055, 10, 241, 616, 5,\n", " 132, 58, 1831, 6, 12, 1, 2352, 14, 9,\n", " 3402, 13, 1259, 2, 66, 518, 365, 4243, 5,\n", " 3, 21519, 527, 1171, 2885, 62, 467, 1605, 161,\n", " 159, 39, 1473, 9, 1500, 1, 480, 4, 1,\n", " 82, 105, 118, 9, 21519, 2291, 2, 9, 418,\n", " 37, 1, 83, 19283, 4, 1, 3132, 1558, 827,\n", " 76, 69, 352, 10, 1495, 8, 528, 46, 4,\n", " 1, 136, 68, 181, 49, 2, 9, 13, 328,\n", " 2102, 1122, 18, 15, 69, 9, 15496, 827, 356,\n", " 5, 27, 90, 2, 314, 69, 544, 573, 26147])" ] }, "execution_count": 88, "metadata": {}, "output_type": "execute_result" } ], "source": [ "indexs[0]" ] }, { "cell_type": "code", "execution_count": 89, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(25, 101)" ] }, "execution_count": 89, "metadata": {}, "output_type": "execute_result" } ], "source": [ "index_maxlen = pad_sequences(indexs, maxlen=101) # 평균으로 문장 길이 균일화\n", "index_maxlen.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 6.3.2) 예측" ] }, { "cell_type": "code", "execution_count": 90, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "array([[0.5069408 ],\n", " [0.5050207 ],\n", " [0.4961085 ],\n", " [0.5054615 ],\n", " [0.48894867],\n", " [0.34563628],\n", " [0.4690467 ],\n", " 
[0.40974882],\n", " [0.4961085 ],\n", " [0.5954045 ],\n", " [0.48020768],\n", " [0.48169833],\n", " [0.4707434 ],\n", " [0.7006737 ],\n", " [0.5062654 ],\n", " [0.31814855],\n", " [0.25153908],\n", " [0.53862554],\n", " [0.5046823 ],\n", " [0.5234926 ],\n", " [0.5578947 ],\n", " [0.5151209 ],\n", " [0.56781495],\n", " [0.5954045 ],\n", " [0.61896557],\n", " [0.4697098 ],\n", " [0.5425703 ],\n", " [0.47174564],\n", " [0.49604207],\n", " [0.54297024],\n", " [0.4961085 ],\n", " [0.5101451 ],\n", " [0.63096577],\n", " [0.37623864],\n", " [0.4973281 ],\n", " [0.53862554],\n", " [0.5029195 ],\n", " [0.453186 ],\n", " [0.29633033],\n", " [0.33290666],\n", " [0.5080796 ],\n", " [0.49131298],\n", " [0.5183239 ],\n", " [0.41750246],\n", " [0.38687968],\n", " [0.49087515],\n", " [0.4848033 ],\n", " [0.598925 ],\n", " [0.5425703 ],\n", " [0.45481098],\n", " [0.5954045 ],\n", " [0.75629056],\n", " [0.5370675 ],\n", " [0.5954045 ],\n", " [0.63496625],\n", " [0.45359102],\n", " [0.72590345],\n", " [0.5425703 ],\n", " [0.453186 ],\n", " [0.34848672],\n", " [0.4961085 ],\n", " [0.5425703 ],\n", " [0.48397982],\n", " [0.49980178],\n", " [0.5954045 ],\n", " [0.49329454],\n", " [0.5695524 ],\n", " [0.5370675 ],\n", " [0.5954045 ],\n", " [0.5464714 ],\n", " [0.23836029],\n", " [0.49221566],\n", " [0.40408167],\n", " [0.4707434 ],\n", " [0.4383876 ],\n", " [0.5062654 ],\n", " [0.5345975 ],\n", " [0.4959167 ],\n", " [0.5711426 ],\n", " [0.5151 ],\n", " [0.5370675 ],\n", " [0.5954045 ],\n", " [0.5151059 ],\n", " [0.50105476],\n", " [0.39824963],\n", " [0.46728298],\n", " [0.4961085 ],\n", " [0.5425703 ],\n", " [0.49604207],\n", " [0.49729204],\n", " [0.55748713],\n", " [0.5788277 ],\n", " [0.4973612 ],\n", " [0.4690467 ],\n", " [0.4707434 ],\n", " [0.5425703 ],\n", " [0.51483226],\n", " [0.49221566],\n", " [0.7074171 ],\n", " [0.53862554],\n", " [0.45910323],\n", " [0.48900685],\n", " [0.4961085 ],\n", " [0.53254175],\n", " [0.4707434 ],\n", " [0.5491598 ],\n", " [0.5648199 ],\n", " 
[0.4067978 ]], dtype=float32)" ] }, "execution_count": 90, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#예측\n", "predicts = [loaded_model.predict(i) for i in indexs]\n", "predicts[0]" ] }, { "cell_type": "code", "execution_count": 91, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.5051343" ] }, "execution_count": 91, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#점수\n", "scores = [i.mean() for i in predicts]\n", "scores[0]" ] }, { "cell_type": "code", "execution_count": 92, "metadata": {}, "outputs": [], "source": [ "#데이터 프레임에 삽입\n", "name_review['sentiment_score'] = scores" ] }, { "cell_type": "code", "execution_count": 93, "metadata": {}, "outputs": [], "source": [ "#평가 함수 \n", "def evaluate(review):\n", " test_score = review.mean()\n", " if(test_score > 0.5):\n", " return 1 #긍정\n", " else:\n", " return 0 #부정" ] }, { "cell_type": "code", "execution_count": 94, "metadata": {}, "outputs": [], "source": [ "evaluation = [evaluate(i) for i in predicts]" ] }, { "cell_type": "code", "execution_count": 107, "metadata": {}, "outputs": [], "source": [ "#add evaluate to df\n", "name_review['evaluation'] = evaluation" ] }, { "cell_type": "code", "execution_count": 120, "metadata": {}, "outputs": [], "source": [ "#유저가 리뷰에 평가한 점수 ex) it was nice movie 8/10\n", "real_score = [None,None,None,8,10,10,None,None,7,6,None,10,4,None,None,8,None,None,10,8,4,4,5,7,8]\n", "name_review['user_score'] = real_score" ] }, { "cell_type": "code", "execution_count": 121, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
moviereviewsentiment_scoreevaluationuser_score
0Toy Story 4This forth, and hopefully, final outing for Wo...0.5051341NaN
1Toy Story 3I have always loved these characters and once ...0.5119471NaN
2Toy StoryWow, is this movie really 25 years old now!I r...0.5111381NaN
3I Am LegendI watched this movie years ago, and very delib...0.50231218.0
4Star Wars: Episode III - Revenge of the SithRevenge of the Sith is now a classic piece of ...0.506502110.0
5True RomanceWith a Stellar cast, slick direction, terrific...0.513968110.0
6Best of the Best 3: No Turning BackThis outing only has Phillip Rhee from the ori...0.5068171NaN
7Best of the Best III really enjoyed the first film in the series ...0.5057631NaN
8TeethMy expectations were very low, but i was intri...0.50524417.0
9Piranha 3DExpectations were low, alcohol to blood level ...0.53289516.0
10Repo MenDidn't have a clue this movie would be as good...0.5011791NaN
11District 910 minutes in and my wife and I were like, \"ar...0.506789110.0
12The Addams FamilyI don't know if it was the transition to or w...0.49714604.0
13Terminator: Dark FateSeriously, I have enjoyed the whole franchise!...0.5019721NaN
14Terminator GenisysThe burning question is did this really need t...0.5015181NaN
15Terminator SalvationTerminator takes us to the inevitable and let...0.49354808.0
16UploadI've not quite finished season 1, but I am alr...0.5005361NaN
17The MistI have only watched the first 5 episodes, but ...0.4920610NaN
18Eden LakeFirstly, Eden Lake is not for everyone! It hit...0.505571110.0
19OnwardI new nothing about this film and went in to i...0.50189518.0
20Cube ZeroSurprisingly I did actually quite enjoy this m...0.50304514.0
21A Nightmare on Elm Street: The Dream ChildThis time Stephen Hopkins (Director) and (John...0.49827804.0
22A Nightmare on Elm Street 4: The Dream MasterAfter huge success with the first outing, Wes ...0.49852305.0
23Night at the MuseumWhen Larry Daley (Ben Stiller) is hired as the...0.50663717.0
24NextCris Johnson (Nicolas Cage) earns a living in ...0.50898618.0
\n", "
" ], "text/plain": [ " movie \\\n", "0 Toy Story 4 \n", "1 Toy Story 3 \n", "2 Toy Story \n", "3 I Am Legend \n", "4 Star Wars: Episode III - Revenge of the Sith \n", "5 True Romance \n", "6 Best of the Best 3: No Turning Back \n", "7 Best of the Best II \n", "8 Teeth \n", "9 Piranha 3D \n", "10 Repo Men \n", "11 District 9 \n", "12 The Addams Family \n", "13 Terminator: Dark Fate \n", "14 Terminator Genisys \n", "15 Terminator Salvation \n", "16 Upload \n", "17 The Mist \n", "18 Eden Lake \n", "19 Onward \n", "20 Cube Zero \n", "21 A Nightmare on Elm Street: The Dream Child \n", "22 A Nightmare on Elm Street 4: The Dream Master \n", "23 Night at the Museum \n", "24 Next \n", "\n", " review sentiment_score \\\n", "0 This forth, and hopefully, final outing for Wo... 0.505134 \n", "1 I have always loved these characters and once ... 0.511947 \n", "2 Wow, is this movie really 25 years old now!I r... 0.511138 \n", "3 I watched this movie years ago, and very delib... 0.502312 \n", "4 Revenge of the Sith is now a classic piece of ... 0.506502 \n", "5 With a Stellar cast, slick direction, terrific... 0.513968 \n", "6 This outing only has Phillip Rhee from the ori... 0.506817 \n", "7 I really enjoyed the first film in the series ... 0.505763 \n", "8 My expectations were very low, but i was intri... 0.505244 \n", "9 Expectations were low, alcohol to blood level ... 0.532895 \n", "10 Didn't have a clue this movie would be as good... 0.501179 \n", "11 10 minutes in and my wife and I were like, \"ar... 0.506789 \n", "12 I don't know if it was the transition to or w... 0.497146 \n", "13 Seriously, I have enjoyed the whole franchise!... 0.501972 \n", "14 The burning question is did this really need t... 0.501518 \n", "15 Terminator takes us to the inevitable and let... 0.493548 \n", "16 I've not quite finished season 1, but I am alr... 0.500536 \n", "17 I have only watched the first 5 episodes, but ... 0.492061 \n", "18 Firstly, Eden Lake is not for everyone! It hit... 
0.505571 \n", "19 I new nothing about this film and went in to i... 0.501895 \n", "20 Surprisingly I did actually quite enjoy this m... 0.503045 \n", "21 This time Stephen Hopkins (Director) and (John... 0.498278 \n", "22 After huge success with the first outing, Wes ... 0.498523 \n", "23 When Larry Daley (Ben Stiller) is hired as the... 0.506637 \n", "24 Cris Johnson (Nicolas Cage) earns a living in ... 0.508986 \n", "\n", " evaluation user_score \n", "0 1 NaN \n", "1 1 NaN \n", "2 1 NaN \n", "3 1 8.0 \n", "4 1 10.0 \n", "5 1 10.0 \n", "6 1 NaN \n", "7 1 NaN \n", "8 1 7.0 \n", "9 1 6.0 \n", "10 1 NaN \n", "11 1 10.0 \n", "12 0 4.0 \n", "13 1 NaN \n", "14 1 NaN \n", "15 0 8.0 \n", "16 1 NaN \n", "17 0 NaN \n", "18 1 10.0 \n", "19 1 8.0 \n", "20 1 4.0 \n", "21 0 4.0 \n", "22 0 5.0 \n", "23 1 7.0 \n", "24 1 8.0 " ] }, "execution_count": 121, "metadata": {}, "output_type": "execute_result" } ], "source": [ "name_review" ] }, { "cell_type": "code", "execution_count": 122, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
moviereviewsentiment_scoreevaluationuser_score
9Piranha 3DExpectations were low, alcohol to blood level ...0.53289516.0
5True RomanceWith a Stellar cast, slick direction, terrific...0.513968110.0
1Toy Story 3I have always loved these characters and once ...0.5119471NaN
2Toy StoryWow, is this movie really 25 years old now!I r...0.5111381NaN
24NextCris Johnson (Nicolas Cage) earns a living in ...0.50898618.0
6Best of the Best 3: No Turning BackThis outing only has Phillip Rhee from the ori...0.5068171NaN
11District 910 minutes in and my wife and I were like, \"ar...0.506789110.0
23Night at the MuseumWhen Larry Daley (Ben Stiller) is hired as the...0.50663717.0
4Star Wars: Episode III - Revenge of the SithRevenge of the Sith is now a classic piece of ...0.506502110.0
7Best of the Best III really enjoyed the first film in the series ...0.5057631NaN
18Eden LakeFirstly, Eden Lake is not for everyone! It hit...0.505571110.0
8TeethMy expectations were very low, but i was intri...0.50524417.0
0Toy Story 4This forth, and hopefully, final outing for Wo...0.5051341NaN
20Cube ZeroSurprisingly I did actually quite enjoy this m...0.50304514.0
3I Am LegendI watched this movie years ago, and very delib...0.50231218.0
13Terminator: Dark FateSeriously, I have enjoyed the whole franchise!...0.5019721NaN
19OnwardI new nothing about this film and went in to i...0.50189518.0
14Terminator GenisysThe burning question is did this really need t...0.5015181NaN
10Repo MenDidn't have a clue this movie would be as good...0.5011791NaN
16UploadI've not quite finished season 1, but I am alr...0.5005361NaN
22A Nightmare on Elm Street 4: The Dream MasterAfter huge success with the first outing, Wes ...0.49852305.0
21A Nightmare on Elm Street: The Dream ChildThis time Stephen Hopkins (Director) and (John...0.49827804.0
12The Addams FamilyI don't know if it was the transition to or w...0.49714604.0
15Terminator SalvationTerminator takes us to the inevitable and let...0.49354808.0
17The MistI have only watched the first 5 episodes, but ...0.4920610NaN
\n", "
" ], "text/plain": [ " movie \\\n", "9 Piranha 3D \n", "5 True Romance \n", "1 Toy Story 3 \n", "2 Toy Story \n", "24 Next \n", "6 Best of the Best 3: No Turning Back \n", "11 District 9 \n", "23 Night at the Museum \n", "4 Star Wars: Episode III - Revenge of the Sith \n", "7 Best of the Best II \n", "18 Eden Lake \n", "8 Teeth \n", "0 Toy Story 4 \n", "20 Cube Zero \n", "3 I Am Legend \n", "13 Terminator: Dark Fate \n", "19 Onward \n", "14 Terminator Genisys \n", "10 Repo Men \n", "16 Upload \n", "22 A Nightmare on Elm Street 4: The Dream Master \n", "21 A Nightmare on Elm Street: The Dream Child \n", "12 The Addams Family \n", "15 Terminator Salvation \n", "17 The Mist \n", "\n", " review sentiment_score \\\n", "9 Expectations were low, alcohol to blood level ... 0.532895 \n", "5 With a Stellar cast, slick direction, terrific... 0.513968 \n", "1 I have always loved these characters and once ... 0.511947 \n", "2 Wow, is this movie really 25 years old now!I r... 0.511138 \n", "24 Cris Johnson (Nicolas Cage) earns a living in ... 0.508986 \n", "6 This outing only has Phillip Rhee from the ori... 0.506817 \n", "11 10 minutes in and my wife and I were like, \"ar... 0.506789 \n", "23 When Larry Daley (Ben Stiller) is hired as the... 0.506637 \n", "4 Revenge of the Sith is now a classic piece of ... 0.506502 \n", "7 I really enjoyed the first film in the series ... 0.505763 \n", "18 Firstly, Eden Lake is not for everyone! It hit... 0.505571 \n", "8 My expectations were very low, but i was intri... 0.505244 \n", "0 This forth, and hopefully, final outing for Wo... 0.505134 \n", "20 Surprisingly I did actually quite enjoy this m... 0.503045 \n", "3 I watched this movie years ago, and very delib... 0.502312 \n", "13 Seriously, I have enjoyed the whole franchise!... 0.501972 \n", "19 I new nothing about this film and went in to i... 0.501895 \n", "14 The burning question is did this really need t... 0.501518 \n", "10 Didn't have a clue this movie would be as good... 
0.501179 \n", "16 I've not quite finished season 1, but I am alr... 0.500536 \n", "22 After huge success with the first outing, Wes ... 0.498523 \n", "21 This time Stephen Hopkins (Director) and (John... 0.498278 \n", "12 I don't know if it was the transition to or w... 0.497146 \n", "15 Terminator takes us to the inevitable and let... 0.493548 \n", "17 I have only watched the first 5 episodes, but ... 0.492061 \n", "\n", " evaluation user_score \n", "9 1 6.0 \n", "5 1 10.0 \n", "1 1 NaN \n", "2 1 NaN \n", "24 1 8.0 \n", "6 1 NaN \n", "11 1 10.0 \n", "23 1 7.0 \n", "4 1 10.0 \n", "7 1 NaN \n", "18 1 10.0 \n", "8 1 7.0 \n", "0 1 NaN \n", "20 1 4.0 \n", "3 1 8.0 \n", "13 1 NaN \n", "19 1 8.0 \n", "14 1 NaN \n", "10 1 NaN \n", "16 1 NaN \n", "22 0 5.0 \n", "21 0 4.0 \n", "12 0 4.0 \n", "15 0 8.0 \n", "17 0 NaN " ] }, "execution_count": 122, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#감정 분석 기준 정렬\n", "maniac_favorite_sentiment = name_review.sort_values(by='sentiment_score', ascending=False)\n", "maniac_favorite_sentiment" ] }, { "cell_type": "code", "execution_count": 127, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
moviereviewsentiment_scoreevaluationuser_score
4Star Wars: Episode III - Revenge of the SithRevenge of the Sith is now a classic piece of ...0.506502110.0
5True RomanceWith a Stellar cast, slick direction, terrific...0.513968110.0
11District 910 minutes in and my wife and I were like, \"ar...0.506789110.0
18Eden LakeFirstly, Eden Lake is not for everyone! It hit...0.505571110.0
3I Am LegendI watched this movie years ago, and very delib...0.50231218.0
15Terminator SalvationTerminator takes us to the inevitable and let...0.49354808.0
19OnwardI new nothing about this film and went in to i...0.50189518.0
24NextCris Johnson (Nicolas Cage) earns a living in ...0.50898618.0
8TeethMy expectations were very low, but i was intri...0.50524417.0
23Night at the MuseumWhen Larry Daley (Ben Stiller) is hired as the...0.50663717.0
9Piranha 3DExpectations were low, alcohol to blood level ...0.53289516.0
22A Nightmare on Elm Street 4: The Dream MasterAfter huge success with the first outing, Wes ...0.49852305.0
12The Addams FamilyI don't know if it was the transition to or w...0.49714604.0
20Cube ZeroSurprisingly I did actually quite enjoy this m...0.50304514.0
21A Nightmare on Elm Street: The Dream ChildThis time Stephen Hopkins (Director) and (John...0.49827804.0
0Toy Story 4This forth, and hopefully, final outing for Wo...0.5051341NaN
1Toy Story 3I have always loved these characters and once ...0.5119471NaN
2Toy StoryWow, is this movie really 25 years old now!I r...0.5111381NaN
6Best of the Best 3: No Turning BackThis outing only has Phillip Rhee from the ori...0.5068171NaN
7Best of the Best III really enjoyed the first film in the series ...0.5057631NaN
10Repo MenDidn't have a clue this movie would be as good...0.5011791NaN
13Terminator: Dark FateSeriously, I have enjoyed the whole franchise!...0.5019721NaN
14Terminator GenisysThe burning question is did this really need t...0.5015181NaN
16UploadI've not quite finished season 1, but I am alr...0.5005361NaN
17The MistI have only watched the first 5 episodes, but ...0.4920610NaN
\n", "
" ], "text/plain": [ " movie \\\n", "4 Star Wars: Episode III - Revenge of the Sith \n", "5 True Romance \n", "11 District 9 \n", "18 Eden Lake \n", "3 I Am Legend \n", "15 Terminator Salvation \n", "19 Onward \n", "24 Next \n", "8 Teeth \n", "23 Night at the Museum \n", "9 Piranha 3D \n", "22 A Nightmare on Elm Street 4: The Dream Master \n", "12 The Addams Family \n", "20 Cube Zero \n", "21 A Nightmare on Elm Street: The Dream Child \n", "0 Toy Story 4 \n", "1 Toy Story 3 \n", "2 Toy Story \n", "6 Best of the Best 3: No Turning Back \n", "7 Best of the Best II \n", "10 Repo Men \n", "13 Terminator: Dark Fate \n", "14 Terminator Genisys \n", "16 Upload \n", "17 The Mist \n", "\n", " review sentiment_score \\\n", "4 Revenge of the Sith is now a classic piece of ... 0.506502 \n", "5 With a Stellar cast, slick direction, terrific... 0.513968 \n", "11 10 minutes in and my wife and I were like, \"ar... 0.506789 \n", "18 Firstly, Eden Lake is not for everyone! It hit... 0.505571 \n", "3 I watched this movie years ago, and very delib... 0.502312 \n", "15 Terminator takes us to the inevitable and let... 0.493548 \n", "19 I new nothing about this film and went in to i... 0.501895 \n", "24 Cris Johnson (Nicolas Cage) earns a living in ... 0.508986 \n", "8 My expectations were very low, but i was intri... 0.505244 \n", "23 When Larry Daley (Ben Stiller) is hired as the... 0.506637 \n", "9 Expectations were low, alcohol to blood level ... 0.532895 \n", "22 After huge success with the first outing, Wes ... 0.498523 \n", "12 I don't know if it was the transition to or w... 0.497146 \n", "20 Surprisingly I did actually quite enjoy this m... 0.503045 \n", "21 This time Stephen Hopkins (Director) and (John... 0.498278 \n", "0 This forth, and hopefully, final outing for Wo... 0.505134 \n", "1 I have always loved these characters and once ... 0.511947 \n", "2 Wow, is this movie really 25 years old now!I r... 0.511138 \n", "6 This outing only has Phillip Rhee from the ori... 
0.506817 \n", "7 I really enjoyed the first film in the series ... 0.505763 \n", "10 Didn't have a clue this movie would be as good... 0.501179 \n", "13 Seriously, I have enjoyed the whole franchise!... 0.501972 \n", "14 The burning question is did this really need t... 0.501518 \n", "16 I've not quite finished season 1, but I am alr... 0.500536 \n", "17 I have only watched the first 5 episodes, but ... 0.492061 \n", "\n", " evaluation user_score \n", "4 1 10.0 \n", "5 1 10.0 \n", "11 1 10.0 \n", "18 1 10.0 \n", "3 1 8.0 \n", "15 0 8.0 \n", "19 1 8.0 \n", "24 1 8.0 \n", "8 1 7.0 \n", "23 1 7.0 \n", "9 1 6.0 \n", "22 0 5.0 \n", "12 0 4.0 \n", "20 1 4.0 \n", "21 0 4.0 \n", "0 1 NaN \n", "1 1 NaN \n", "2 1 NaN \n", "6 1 NaN \n", "7 1 NaN \n", "10 1 NaN \n", "13 1 NaN \n", "14 1 NaN \n", "16 1 NaN \n", "17 0 NaN " ] }, "execution_count": 127, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#실제 유저 평점 기준 정렬\n", "maniac_favorite_score = name_review.sort_values(by='user_score', ascending=False)\n", "maniac_favorite_score" ] }, { "cell_type": "code", "execution_count": 128, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movieevaluation
4Star Wars: Episode III - Revenge of the Sith1
5True Romance1
11District 91
18Eden Lake1
3I Am Legend1
15Terminator Salvation0
19Onward1
24Next1
8Teeth1
23Night at the Museum1
9Piranha 3D1
22A Nightmare on Elm Street 4: The Dream Master0
12The Addams Family0
20Cube Zero1
21A Nightmare on Elm Street: The Dream Child0
0Toy Story 41
1Toy Story 31
2Toy Story1
6Best of the Best 3: No Turning Back1
7Best of the Best II1
10Repo Men1
13Terminator: Dark Fate1
14Terminator Genisys1
16Upload1
17The Mist0
\n", "
" ], "text/plain": [ " movie evaluation\n", "4 Star Wars: Episode III - Revenge of the Sith 1\n", "5 True Romance 1\n", "11 District 9 1\n", "18 Eden Lake 1\n", "3 I Am Legend 1\n", "15 Terminator Salvation 0\n", "19 Onward 1\n", "24 Next 1\n", "8 Teeth 1\n", "23 Night at the Museum 1\n", "9 Piranha 3D 1\n", "22 A Nightmare on Elm Street 4: The Dream Master 0\n", "12 The Addams Family 0\n", "20 Cube Zero 1\n", "21 A Nightmare on Elm Street: The Dream Child 0\n", "0 Toy Story 4 1\n", "1 Toy Story 3 1\n", "2 Toy Story 1\n", "6 Best of the Best 3: No Turning Back 1\n", "7 Best of the Best II 1\n", "10 Repo Men 1\n", "13 Terminator: Dark Fate 1\n", "14 Terminator Genisys 1\n", "16 Upload 1\n", "17 The Mist 0" ] }, "execution_count": 128, "metadata": {}, "output_type": "execute_result" } ], "source": [ "maniac_movie = maniac_favorite_score[['movie','evaluation']]\n", "maniac_movie = pd.DataFrame(maniac_movie)\n", "maniac_movie" ] }, { "cell_type": "code", "execution_count": 129, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
moviegenres
0Toy Story[Animation, Comedy, Family]
1Jumanji[Adventure, Fantasy, Family]
2Grumpier Old Men[Romance, Comedy]
3Waiting to Exhale[Comedy, Drama, Romance]
4Father of the Bride Part II[Comedy]
.........
45457Shadow of the Blair Witch[Mystery, Horror]
45458The Burkittsville 7[Horror]
45459Caged Heat 3000[Science Fiction]
45464Satana likuyushchiy[]
45465Queerama[]
\n", "

30482 rows × 2 columns

\n", "
" ], "text/plain": [ " movie genres\n", "0 Toy Story [Animation, Comedy, Family]\n", "1 Jumanji [Adventure, Fantasy, Family]\n", "2 Grumpier Old Men [Romance, Comedy]\n", "3 Waiting to Exhale [Comedy, Drama, Romance]\n", "4 Father of the Bride Part II [Comedy]\n", "... ... ...\n", "45457 Shadow of the Blair Witch [Mystery, Horror]\n", "45458 The Burkittsville 7 [Horror]\n", "45459 Caged Heat 3000 [Science Fiction]\n", "45464 Satana likuyushchiy []\n", "45465 Queerama []\n", "\n", "[30482 rows x 2 columns]" ] }, "execution_count": 129, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movie_genres = movie[['title','genres']]\n", "movie_genres = movie_genres.rename(columns={'title':'movie'})\n", "movie_genres" ] }, { "cell_type": "code", "execution_count": 130, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movieevaluationgenres
0Star Wars: Episode III - Revenge of the Sith1[Science Fiction, Adventure, Action]
1True Romance1[Action, Thriller, Crime, Romance]
2District 91[Science Fiction]
3Eden Lake1[Horror, Thriller]
4I Am Legend1[Drama, Horror, Action, Thriller, Science Fict...
5Terminator Salvation0[Action, Science Fiction, Thriller]
6Next1[Action, Science Fiction, Thriller]
7Teeth1[Comedy, Horror]
8Night at the Museum1[Action, Adventure, Comedy, Family, Fantasy]
9Piranha 3D1[Comedy, Horror]
10A Nightmare on Elm Street 4: The Dream Master0[Horror, Thriller]
11The Addams Family0[Horror, Comedy, Fantasy]
12Cube Zero1[Mystery, Science Fiction, Thriller]
13Toy Story 31[Animation, Family, Comedy]
14Toy Story1[Animation, Comedy, Family]
15Best of the Best 3: No Turning Back1[Action]
16Repo Men1[Action, Science Fiction, Thriller, Crime]
17Terminator Genisys1[Science Fiction, Action, Thriller, Adventure]
18The Mist0[Science Fiction, Horror, Thriller]
\n", "
" ], "text/plain": [ " movie evaluation \\\n", "0 Star Wars: Episode III - Revenge of the Sith 1 \n", "1 True Romance 1 \n", "2 District 9 1 \n", "3 Eden Lake 1 \n", "4 I Am Legend 1 \n", "5 Terminator Salvation 0 \n", "6 Next 1 \n", "7 Teeth 1 \n", "8 Night at the Museum 1 \n", "9 Piranha 3D 1 \n", "10 A Nightmare on Elm Street 4: The Dream Master 0 \n", "11 The Addams Family 0 \n", "12 Cube Zero 1 \n", "13 Toy Story 3 1 \n", "14 Toy Story 1 \n", "15 Best of the Best 3: No Turning Back 1 \n", "16 Repo Men 1 \n", "17 Terminator Genisys 1 \n", "18 The Mist 0 \n", "\n", " genres \n", "0 [Science Fiction, Adventure, Action] \n", "1 [Action, Thriller, Crime, Romance] \n", "2 [Science Fiction] \n", "3 [Horror, Thriller] \n", "4 [Drama, Horror, Action, Thriller, Science Fict... \n", "5 [Action, Science Fiction, Thriller] \n", "6 [Action, Science Fiction, Thriller] \n", "7 [Comedy, Horror] \n", "8 [Action, Adventure, Comedy, Family, Fantasy] \n", "9 [Comedy, Horror] \n", "10 [Horror, Thriller] \n", "11 [Horror, Comedy, Fantasy] \n", "12 [Mystery, Science Fiction, Thriller] \n", "13 [Animation, Family, Comedy] \n", "14 [Animation, Comedy, Family] \n", "15 [Action] \n", "16 [Action, Science Fiction, Thriller, Crime] \n", "17 [Science Fiction, Action, Thriller, Adventure] \n", "18 [Science Fiction, Horror, Thriller] " ] }, "execution_count": 130, "metadata": {}, "output_type": "execute_result" } ], "source": [ "inter_movie = pd.merge(maniac_movie, movie_genres, on='movie', how='inner')\n", "inter_movie" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## <결과>\n", "1. 피어슨 상관계수를 이용한 유저 평점 기반 추천 함수로 1차 추천 \n", "\n", "2. 1차 추천을 기반으로 크롤링을 통해 나와 비슷한 유저 발견\n", "\n", "3. 
해당 유저에 대한 상세 크롤링과 감정 분석을 통한 2차 추천" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" } }, "nbformat": 4, "nbformat_minor": 2 }