{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "adjacent-steel",
   "metadata": {},
   "source": [
    "2. IMDB 리뷰 토큰화하기"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "experienced-resolution",
   "metadata": {},
   "outputs": [],
   "source": [
    "import sentencepiece as spm\n",
    "import pandas as pd\n",
    "import urllib.request\n",
    "import csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "studied-advantage",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('IMDb_Reviews.csv', <http.client.HTTPMessage at 0x2b4fbe9ed60>)"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "urllib.request.urlretrieve(\"https://raw.githubusercontent.com/LawrenceDuan/IMDb-Review-Analysis/master/IMDb_Reviews.csv\", filename=\"IMDb_Reviews.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "greek-creation",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0        My family and I normally do not watch local mo...\n",
       "1        Believe it or not, this was at one time the wo...\n",
       "2        After some internet surfing, I found the \"Home...\n",
       "3        One of the most unheralded great works of anim...\n",
       "4        It was the Sixties, and anyone with long hair ...\n",
       "                               ...                        \n",
       "49995    the people who came up with this are SICK AND ...\n",
       "49996    The script is so so laughable... this in turn,...\n",
       "49997    \"So there's this bride, you see, and she gets ...\n",
       "49998    Your mind will not be satisfied by this nobud...\n",
       "49999    The chaser's war on everything is a weekly sho...\n",
       "Name: review, Length: 50000, dtype: object"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_df = pd.read_csv('IMDb_Reviews.csv')\n",
    "train_df['review']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "imported-milwaukee",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "리뷰 개수 : 50000\n"
     ]
    }
   ],
   "source": [
    "print('리뷰 개수 :',len(train_df)) # 리뷰 개수 출력"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "portuguese-supplier",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 센텐스피스의 입력으로 사용하기 위해서 데이터프레임을 txt 파일로 저장\n",
    "\n",
    "with open('imdb_review.txt', 'w', encoding='utf8') as f:\n",
    "    f.write('\\n'.join(train_df['review']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "progressive-skirt",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 센텐스피스로 단어 집합과 각 단어에 고유한 정수를 부여\n",
    "\n",
    "spm.SentencePieceTrainer.Train('--input=imdb_review.txt --model_prefix=imdb --vocab_size=5000 --model_type=bpe --max_sentence_length=9999')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "through-touch",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2471</th>\n",
       "      <td>▁stere</td>\n",
       "      <td>-2468</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3692</th>\n",
       "      <td>mark</td>\n",
       "      <td>-3689</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3687</th>\n",
       "      <td>asty</td>\n",
       "      <td>-3684</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3388</th>\n",
       "      <td>vert</td>\n",
       "      <td>-3385</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4480</th>\n",
       "      <td>▁Sus</td>\n",
       "      <td>-4477</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2319</th>\n",
       "      <td>▁...</td>\n",
       "      <td>-2316</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>740</th>\n",
       "      <td>ically</td>\n",
       "      <td>-737</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3050</th>\n",
       "      <td>▁cru</td>\n",
       "      <td>-3047</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1167</th>\n",
       "      <td>▁stupid</td>\n",
       "      <td>-1164</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>953</th>\n",
       "      <td>ither</td>\n",
       "      <td>-950</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            0     1\n",
       "2471   ▁stere -2468\n",
       "3692     mark -3689\n",
       "3687     asty -3684\n",
       "3388     vert -3385\n",
       "4480     ▁Sus -4477\n",
       "2319     ▁... -2316\n",
       "740    ically  -737\n",
       "3050     ▁cru -3047\n",
       "1167  ▁stupid -1164\n",
       "953     ither  -950"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# vocab 생성이 완료되면 imdb.model, imdb.vocab 파일 두개가 생성\n",
    "# 단어 집합의 크기를 확인하기 위해 vocab 파일을 데이터프레임에 저장\n",
    "\n",
    "vocab_list = pd.read_csv('imdb.vocab', sep='\\t', header=None, quoting=csv.QUOTE_NONE)\n",
    "vocab_list.sample(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "nearby-andrews",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5000"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 단어 집합의 크기는 5,000개\n",
    "\n",
    "len(vocab_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "average-bookmark",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# model 파일을 로드\n",
    "\n",
    "sp = spm.SentencePieceProcessor()\n",
    "vocab_file = \"imdb.model\"\n",
    "sp.load(vocab_file)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "adolescent-cherry",
   "metadata": {},
   "source": [
    "- encode_as_pieces : 문장을 입력하면 서브 워드 시퀀스로 변환합니다.\n",
    "- encode_as_ids : 문장을 입력하면 정수 시퀀스로 변환합니다."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "fifteen-acrylic",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "I didn't at all think of it this way.\n",
      "['▁I', '▁didn', \"'\", 't', '▁at', '▁all', '▁think', '▁of', '▁it', '▁this', '▁way', '.']\n",
      "[41, 623, 4950, 4926, 138, 169, 378, 30, 58, 73, 413, 4945]\n",
      "\n",
      "I have waited a long time for someone to film\n",
      "['▁I', '▁have', '▁wa', 'ited', '▁a', '▁long', '▁time', '▁for', '▁someone', '▁to', '▁film']\n",
      "[41, 141, 1364, 1120, 4, 666, 285, 92, 1078, 33, 91]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "lines = [\n",
    "  \"I didn't at all think of it this way.\",\n",
    "  \"I have waited a long time for someone to film\"\n",
    "]\n",
    "for line in lines:\n",
    "  print(line)\n",
    "  print(sp.encode_as_pieces(line))\n",
    "  print(sp.encode_as_ids(line))\n",
    "  print()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "living-heaven",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5000"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# GetPieceSize() : 단어 집합의 크기를 확인\n",
    "\n",
    "sp.GetPieceSize()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "amazing-advocate",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'▁character'"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# idToPiece : 정수로부터 맵핑되는 서브 워드로 변환\n",
    "\n",
    "sp.IdToPiece(430)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "marine-candidate",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "430"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# PieceToId : 서브워드로부터 맵핑되는 정수로 변환\n",
    "\n",
    "sp.PieceToId('▁character')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "known-carter",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'I have waited a long time for someone to film'"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# DecodeIds : 정수 시퀀스로부터 문장으로 변환\n",
    "\n",
    "sp.DecodeIds([41, 141, 1364, 1120, 4, 666, 285, 92, 1078, 33, 91])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "southwest-residence",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'I have waited a long time for someone to film'"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# DecodePieces : 서브워드 시퀀스로부터 문장으로 변환\n",
    "\n",
    "sp.DecodePieces(['▁I', '▁have', '▁wa', 'ited', '▁a', '▁long', '▁time', '▁for', '▁someone', '▁to', '▁film'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "enormous-marriage",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['▁I', '▁have', '▁wa', 'ited', '▁a', '▁long', '▁time', '▁for', '▁someone', '▁to', '▁film']\n",
      "[41, 141, 1364, 1120, 4, 666, 285, 92, 1078, 33, 91]\n"
     ]
    }
   ],
   "source": [
    "print(sp.encode('I have waited a long time for someone to film', out_type=str))\n",
    "print(sp.encode('I have waited a long time for someone to film', out_type=int))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "informal-interpretation",
   "metadata": {},
   "source": [
    "3. 네이버 영화 리뷰 토큰화하기"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "spare-mississippi",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('ratings.txt', <http.client.HTTPMessage at 0x2b49b2db790>)"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "urllib.request.urlretrieve(\"https://raw.githubusercontent.com/e9t/nsmc/master/ratings.txt\", filename=\"ratings.txt\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "perceived-junction",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>document</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>8112052</td>\n",
       "      <td>어릴때보고 지금다시봐도 재밌어요ㅋㅋ</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>8132799</td>\n",
       "      <td>디자인을 배우는 학생으로, 외국디자이너와 그들이 일군 전통을 통해 발전해가는 문화산...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4655635</td>\n",
       "      <td>폴리스스토리 시리즈는 1부터 뉴까지 버릴께 하나도 없음.. 최고.</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>9251303</td>\n",
       "      <td>와.. 연기가 진짜 개쩔구나.. 지루할거라고 생각했는데 몰입해서 봤다.. 그래 이런...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>10067386</td>\n",
       "      <td>안개 자욱한 밤하늘에 떠 있는 초승달 같은 영화.</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         id                                           document  label\n",
       "0   8112052                                어릴때보고 지금다시봐도 재밌어요ㅋㅋ      1\n",
       "1   8132799  디자인을 배우는 학생으로, 외국디자이너와 그들이 일군 전통을 통해 발전해가는 문화산...      1\n",
       "2   4655635               폴리스스토리 시리즈는 1부터 뉴까지 버릴께 하나도 없음.. 최고.      1\n",
       "3   9251303  와.. 연기가 진짜 개쩔구나.. 지루할거라고 생각했는데 몰입해서 봤다.. 그래 이런...      1\n",
       "4  10067386                        안개 자욱한 밤하늘에 떠 있는 초승달 같은 영화.      1"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "naver_df = pd.read_table('ratings.txt')\n",
    "naver_df[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "closing-secretary",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "리뷰 개수 : 200000\n"
     ]
    }
   ],
   "source": [
    "print('리뷰 개수 :',len(naver_df)) # 리뷰 개수 출력"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "higher-slave",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "True\n"
     ]
    }
   ],
   "source": [
    "# null 값 제거\n",
    "\n",
    "print(naver_df.isnull().values.any())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "strong-blackjack",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "False\n"
     ]
    }
   ],
   "source": [
    "naver_df = naver_df.dropna(how = 'any') # Null 값이 존재하는 행 제거\n",
    "print(naver_df.isnull().values.any()) # Null 값이 존재하는지 확인"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "julian-group",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "리뷰 개수 : 199992\n"
     ]
    }
   ],
   "source": [
    "print('리뷰 개수 :',len(naver_df)) # 리뷰 개수 출력"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "standing-university",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 199,992개의 샘플을 naver_review.txt 파일에 저장한 후에 센텐스피스를 통해 단어 집합을 생성\n",
    "\n",
    "with open('naver_review.txt', 'w', encoding='utf8') as f:\n",
    "    f.write('\\n'.join(naver_df['document']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "demonstrated-neighbor",
   "metadata": {},
   "outputs": [],
   "source": [
    "spm.SentencePieceTrainer.Train('--input=naver_review.txt --model_prefix=naver --vocab_size=5000 --model_type=bpe --max_sentence_length=9999')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "worth-fabric",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>&lt;unk&gt;</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>&lt;s&gt;</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>&lt;/s&gt;</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>..</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>영화</td>\n",
       "      <td>-1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>▁영화</td>\n",
       "      <td>-2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>▁이</td>\n",
       "      <td>-3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>▁아</td>\n",
       "      <td>-4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>...</td>\n",
       "      <td>-5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>▁그</td>\n",
       "      <td>-6</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       0  1\n",
       "0  <unk>  0\n",
       "1    <s>  0\n",
       "2   </s>  0\n",
       "3     ..  0\n",
       "4     영화 -1\n",
       "5    ▁영화 -2\n",
       "6     ▁이 -3\n",
       "7     ▁아 -4\n",
       "8    ... -5\n",
       "9     ▁그 -6"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vocab_list = pd.read_csv('naver.vocab', sep='\\t', header=None, quoting=csv.QUOTE_NONE)\n",
    "vocab_list[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "third-wages",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>4476</th>\n",
       "      <td>숲</td>\n",
       "      <td>-4473</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3751</th>\n",
       "      <td>뒤</td>\n",
       "      <td>-3748</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4892</th>\n",
       "      <td>궤</td>\n",
       "      <td>-4889</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2856</th>\n",
       "      <td>▁부터</td>\n",
       "      <td>-2853</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1490</th>\n",
       "      <td>▁각본</td>\n",
       "      <td>-1487</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>523</th>\n",
       "      <td>중에</td>\n",
       "      <td>-520</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2570</th>\n",
       "      <td>▁기본</td>\n",
       "      <td>-2567</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3612</th>\n",
       "      <td>져</td>\n",
       "      <td>-3609</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3409</th>\n",
       "      <td>랑</td>\n",
       "      <td>-3406</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1802</th>\n",
       "      <td>▁짧</td>\n",
       "      <td>-1799</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        0     1\n",
       "4476    숲 -4473\n",
       "3751    뒤 -3748\n",
       "4892    궤 -4889\n",
       "2856  ▁부터 -2853\n",
       "1490  ▁각본 -1487\n",
       "523    중에  -520\n",
       "2570  ▁기본 -2567\n",
       "3612    져 -3609\n",
       "3409    랑 -3406\n",
       "1802   ▁짧 -1799"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vocab_list.sample(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "incredible-kruger",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sp = spm.SentencePieceProcessor()\n",
    "vocab_file = \"naver.model\"\n",
    "sp.load(vocab_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "anticipated-penguin",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "뭐 이딴 것도 영화냐.\n",
      "['▁뭐', '▁이딴', '▁것도', '▁영화냐', '.']\n",
      "[132, 966, 1296, 2590, 3276]\n",
      "\n",
      "진짜 최고의 영화입니다 ㅋㅋ\n",
      "['▁진짜', '▁최고의', '▁영화입니다', '▁ᄏᄏ']\n",
      "[54, 200, 821, 85]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "lines = [\n",
    "  \"뭐 이딴 것도 영화냐.\",\n",
    "  \"진짜 최고의 영화입니다 ㅋㅋ\",\n",
    "]\n",
    "for line in lines:\n",
    "  print(line)\n",
    "  print(sp.encode_as_pieces(line))\n",
    "  print(sp.encode_as_ids(line))\n",
    "  print()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "killing-saint",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5000"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sp.GetPieceSize()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "serial-pixel",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'영화'"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sp.IdToPiece(4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "heavy-starter",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sp.PieceToId('영화')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "adjustable-skill",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'진짜 최고의 영화입니다 ᄏᄏ'"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sp.DecodeIds([54, 200, 821, 85])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "accomplished-helicopter",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}