{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "텍스트 요약.ipynb", "provenance": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "Bl2oQgNJ5ePd" }, "source": [ "출처: https://wikidocs.net/72820\n", "# 아마존 리뷰 데이터" ] }, { "cell_type": "code", "metadata": { "id": "a8dao_uW5wX1" }, "source": [ "import numpy as np\n", "import pandas as pd\n", "import re\n", "import matplotlib.pyplot as plt\n", "from nltk.corpus import stopwords\n", "from bs4 import BeautifulSoup \n", "from tensorflow.keras.preprocessing.text import Tokenizer \n", "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", "import urllib.request\n", "np.random.seed(seed=0)" ], "execution_count": 2, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "_kOSiXbk_JaP" }, "source": [ "# 1) 데이터 로드하기" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "olKVHdCn71go", "outputId": "fc9bfd7a-3a87-4ee1-caf2-6d2e09e13128" }, "source": [ "# Load the review data.\n", "# DATA_PATH and N_ROWS are named here so the input file and the number of rows\n", "# read can be changed in one place; nrows=None makes pandas read the whole file.\n", "DATA_PATH = \"/content/Reviews.csv\"\n", "N_ROWS = 10000\n", "\n", "data = pd.read_csv(DATA_PATH, nrows=N_ROWS)\n", "# Report how many reviews were loaded (the label is printed in Korean).\n", "print('전체 리뷰 개수 :', len(data))" ], "execution_count": 18, "outputs": [ { "output_type": "stream", "text": [ "전체 리뷰 개수 : 10000\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 476 }, "id": "dgQ-o18_814y", "outputId": "b843dd5b-94b7-4eee-c5c0-a060753505b5" }, "source": [ "data.head()" ], "execution_count": 19, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", " | Id | \n", "ProductId | \n", "UserId | \n", "ProfileName | \n", "HelpfulnessNumerator | \n", "HelpfulnessDenominator | \n", "Score | \n", "Time | \n", "Summary | \n", "Text | \n", "
---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "1 | \n", "B001E4KFG0 | \n", "A3SGXH7AUHU8GW | \n", "delmartian | \n", "1 | \n", "1 | \n", "5 | \n", "1303862400 | \n", "Good Quality Dog Food | \n", "I have bought several of the Vitality canned d... | \n", "
1 | \n", "2 | \n", "B00813GRG4 | \n", "A1D87F6ZCVE5NK | \n", "dll pa | \n", "0 | \n", "0 | \n", "1 | \n", "1346976000 | \n", "Not as Advertised | \n", "Product arrived labeled as Jumbo Salted Peanut... | \n", "
2 | \n", "3 | \n", "B000LQOCH0 | \n", "ABXLMWJIXXAIN | \n", "Natalia Corres \"Natalia Corres\" | \n", "1 | \n", "1 | \n", "4 | \n", "1219017600 | \n", "\"Delight\" says it all | \n", "This is a confection that has been around a fe... | \n", "
3 | \n", "4 | \n", "B000UA0QIQ | \n", "A395BORC6FGVXV | \n", "Karl | \n", "3 | \n", "3 | \n", "2 | \n", "1307923200 | \n", "Cough Medicine | \n", "If you are looking for the secret ingredient i... | \n", "
4 | \n", "5 | \n", "B006K2ZZ7K | \n", "A1UQRSCLF8GW1T | \n", "Michael D. Bigham \"M. Wassir\" | \n", "0 | \n", "0 | \n", "5 | \n", "1350777600 | \n", "Great taffy | \n", "Great taffy at a great price. There was a wid... | \n", "
\n", " | Text | \n", "Summary | \n", "
---|---|---|
0 | \n", "I have bought several of the Vitality canned d... | \n", "Good Quality Dog Food | \n", "
1 | \n", "Product arrived labeled as Jumbo Salted Peanut... | \n", "Not as Advertised | \n", "
2 | \n", "This is a confection that has been around a fe... | \n", "\"Delight\" says it all | \n", "
3 | \n", "If you are looking for the secret ingredient i... | \n", "Cough Medicine | \n", "
4 | \n", "Great taffy at a great price. There was a wid... | \n", "Great taffy | \n", "
\n", " | Text | \n", "Summary | \n", "
---|---|---|
0 | \n", "bought several vitality canned dog food produc... | \n", "good quality dog food | \n", "
1 | \n", "product arrived labeled jumbo salted peanuts p... | \n", "not as advertised | \n", "
2 | \n", "confection around centuries light pillowy citr... | \n", "delight says it all | \n", "
3 | \n", "looking secret ingredient robitussin believe f... | \n", "cough medicine | \n", "
4 | \n", "great taffy great price wide assortment yummy ... | \n", "great taffy | \n", "
\n", " | Text | \n", "Summary | \n", "decoder_input | \n", "decoder_target | \n", "
---|---|---|---|---|
0 | \n", "bought several vitality canned dog food produc... | \n", "good quality dog food | \n", "sostoken good quality dog food | \n", "good quality dog food eostoken | \n", "
1 | \n", "product arrived labeled jumbo salted peanuts p... | \n", "not as advertised | \n", "sostoken not as advertised | \n", "not as advertised eostoken | \n", "
2 | \n", "confection around centuries light pillowy citr... | \n", "delight says it all | \n", "sostoken delight says it all | \n", "delight says it all eostoken | \n", "
3 | \n", "looking secret ingredient robitussin believe f... | \n", "cough medicine | \n", "sostoken cough medicine | \n", "cough medicine eostoken | \n", "
4 | \n", "great taffy great price wide assortment yummy ... | \n", "great taffy | \n", "sostoken great taffy | \n", "great taffy eostoken | \n", "