{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "raised-question",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from math import log"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "better-diagnosis",
   "metadata": {},
   "outputs": [],
   "source": [
    "docs = [\n",
    "  '먹고 싶은 사과',\n",
    "  '먹고 싶은 바나나',\n",
    "  '길고 노란 바나나 바나나',\n",
    "  '저는 과일이 좋아요'\n",
    "]\n",
    "\n",
    "vocab = list(set(w for doc in docs for w in doc.split()))\n",
    "vocab.sort()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "exact-appraisal",
   "metadata": {},
   "outputs": [],
   "source": [
    "# TF, IDF, 그리고 TF-IDF 값을 구하는 함수를 구현"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "baking-polymer",
   "metadata": {},
   "outputs": [],
   "source": [
    "N = len(docs) # 총 문서의 수\n",
    "\n",
    "def tf(t, d):\n",
    "    return d.count(t)\n",
    "\n",
    "def idf(t):\n",
    "    df = 0\n",
    "    for doc in docs:\n",
    "        df += t in doc\n",
    "    return log(N/(df + 1))\n",
    "\n",
    "def tfidf(t, d):\n",
    "    return tf(t, d) * idf(t)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "electrical-december",
   "metadata": {},
   "outputs": [],
   "source": [
    "# TF 구현 : 다시 말해 DTM을 데이터프레임에 저장하여 출력"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "rubber-incidence",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>과일이</th>\n",
       "      <th>길고</th>\n",
       "      <th>노란</th>\n",
       "      <th>먹고</th>\n",
       "      <th>바나나</th>\n",
       "      <th>사과</th>\n",
       "      <th>싶은</th>\n",
       "      <th>저는</th>\n",
       "      <th>좋아요</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   과일이  길고  노란  먹고  바나나  사과  싶은  저는  좋아요\n",
       "0    0   0   0   1    0   1   1   0    0\n",
       "1    0   0   0   1    1   0   1   0    0\n",
       "2    0   1   1   0    2   0   0   0    0\n",
       "3    1   0   0   0    0   0   0   1    1"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "result = []\n",
    "for i in range(N):  # 각 문서에 대해 아래 명령 수행\n",
    "    result.append([])\n",
    "    d = docs[i]\n",
    "    for j in range(len(vocab)):\n",
    "        t = vocab[j]\n",
    "        result[-1].append(tf(t, d))\n",
    "        \n",
    "tf_ = pd.DataFrame(result, columns = vocab)\n",
    "tf_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "accompanied-anthony",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 각 단어에 대한 IDF 값을 구하기"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "proud-swing",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>IDF</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>과일이</th>\n",
       "      <td>0.693147</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>길고</th>\n",
       "      <td>0.693147</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>노란</th>\n",
       "      <td>0.693147</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>먹고</th>\n",
       "      <td>0.287682</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>바나나</th>\n",
       "      <td>0.287682</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>사과</th>\n",
       "      <td>0.693147</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>싶은</th>\n",
       "      <td>0.287682</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>저는</th>\n",
       "      <td>0.693147</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>좋아요</th>\n",
       "      <td>0.693147</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          IDF\n",
       "과일이  0.693147\n",
       "길고   0.693147\n",
       "노란   0.693147\n",
       "먹고   0.287682\n",
       "바나나  0.287682\n",
       "사과   0.693147\n",
       "싶은   0.287682\n",
       "저는   0.693147\n",
       "좋아요  0.693147"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "result = []\n",
    "for j in range(len(vocab)):\n",
    "    t = vocab[j]\n",
    "    result.append(idf(t))\n",
    "    \n",
    "idf_ = pd.DataFrame(result, index = vocab, columns = ['IDF'])\n",
    "idf_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "spiritual-burlington",
   "metadata": {},
   "outputs": [],
   "source": [
    "# TF-IDF 행렬을 출력"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "compact-vegetation",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>과일이</th>\n",
       "      <th>길고</th>\n",
       "      <th>노란</th>\n",
       "      <th>먹고</th>\n",
       "      <th>바나나</th>\n",
       "      <th>사과</th>\n",
       "      <th>싶은</th>\n",
       "      <th>저는</th>\n",
       "      <th>좋아요</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.287682</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.693147</td>\n",
       "      <td>0.287682</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.287682</td>\n",
       "      <td>0.287682</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.287682</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.693147</td>\n",
       "      <td>0.693147</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.575364</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.693147</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.693147</td>\n",
       "      <td>0.693147</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        과일이        길고        노란        먹고       바나나        사과        싶은  \\\n",
       "0  0.000000  0.000000  0.000000  0.287682  0.000000  0.693147  0.287682   \n",
       "1  0.000000  0.000000  0.000000  0.287682  0.287682  0.000000  0.287682   \n",
       "2  0.000000  0.693147  0.693147  0.000000  0.575364  0.000000  0.000000   \n",
       "3  0.693147  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   \n",
       "\n",
       "         저는       좋아요  \n",
       "0  0.000000  0.000000  \n",
       "1  0.000000  0.000000  \n",
       "2  0.000000  0.000000  \n",
       "3  0.693147  0.693147  "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "result = []\n",
    "for i in range(N):\n",
    "    result.append([])\n",
    "    d = docs[i]\n",
    "    \n",
    "    for j in range(len(vocab)):\n",
    "        t = vocab[j]\n",
    "        \n",
    "        result[-1].append(tfidf(t, d))\n",
    "        \n",
    "tfidf_ = pd.DataFrame(result, columns = vocab)\n",
    "tfidf_"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "adolescent-prophet",
   "metadata": {},
   "source": [
    "- 사이킷런을 이용한 DTM과 TF-IDF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "thorough-monroe",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[0 1 0 1 0 1 0 1 1]\n",
      " [0 0 1 0 0 0 0 1 0]\n",
      " [1 0 0 0 1 0 1 0 0]]\n",
      "{'you': 7, 'know': 1, 'want': 5, 'your': 8, 'love': 3, 'like': 2, 'what': 6, 'should': 4, 'do': 0}\n"
     ]
    }
   ],
   "source": [
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "corpus = [\n",
    "    'you know I want your love',\n",
    "    'I like you',\n",
    "    'what should I do ',\n",
    "]\n",
    "\n",
    "# DTM\n",
    "\n",
    "vector = CountVectorizer()\n",
    "print(vector.fit_transform(corpus).toarray())  # 코퍼스로부터 각 단어의 빈도 수를 기록한다.\n",
    "print(vector.vocabulary_)  # 각 단어의 인덱스가 어떻게 부여되었는지를 보여준다."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "comfortable-arlington",
   "metadata": {},
   "outputs": [],
   "source": [
    "# TF-IDF 구하기"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "distant-pursuit",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[0.         0.46735098 0.         0.46735098 0.         0.46735098\n",
      "  0.         0.35543247 0.46735098]\n",
      " [0.         0.         0.79596054 0.         0.         0.\n",
      "  0.         0.60534851 0.        ]\n",
      " [0.57735027 0.         0.         0.         0.57735027 0.\n",
      "  0.57735027 0.         0.        ]]\n",
      "{'you': 7, 'know': 1, 'want': 5, 'your': 8, 'love': 3, 'like': 2, 'what': 6, 'should': 4, 'do': 0}\n"
     ]
    }
   ],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "\n",
    "corpus = [\n",
    "    'you know I want your love',\n",
    "    'I like you',\n",
    "    'what should I do ',    \n",
    "]\n",
    "\n",
    "tfidfv = TfidfVectorizer().fit(corpus)\n",
    "print(tfidfv.transform(corpus).toarray())\n",
    "print(tfidfv.vocabulary_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "analyzed-police",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}