{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "valued-patrol", "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "import scipy as sp" ] }, { "cell_type": "code", "execution_count": null, "id": "transsexual-assistant", "metadata": {}, "outputs": [], "source": [ "# 교차엔트로피\n", "# 분포1 : p=[1/4,1/4,1/4,1/4]\n", "# 분포2 : q=[1/2,1/4,1/8,1/8]" ] }, { "cell_type": "code", "execution_count": 2, "id": "acquired-reynolds", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2.25" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "- 1/4 * np.log2(1/2) - 1/4 * np.log2(1/4) - 1/4 * np.log2(1/8) - 1/4 * np.log2(1/8) " ] }, { "cell_type": "code", "execution_count": null, "id": "interim-mentor", "metadata": {}, "outputs": [], "source": [ "# 교차엔트로피를 사용한 분류성능 측정\n", "# 꽃받침 길이 5.4cm 기준" ] }, { "cell_type": "code", "execution_count": 3, "id": "found-chicago", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal length (cm)sepal width (cm)petal length (cm)petal width (cm)yy_hat
955.73.04.21.211
965.72.94.21.311
976.22.94.31.311
985.12.53.01.110
995.72.84.11.311
\n", "
" ], "text/plain": [ " sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) \\\n", "95 5.7 3.0 4.2 1.2 \n", "96 5.7 2.9 4.2 1.3 \n", "97 6.2 2.9 4.3 1.3 \n", "98 5.1 2.5 3.0 1.1 \n", "99 5.7 2.8 4.1 1.3 \n", "\n", " y y_hat \n", "95 1 1 \n", "96 1 1 \n", "97 1 1 \n", "98 1 0 \n", "99 1 1 " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.datasets import load_iris\n", "\n", "iris = load_iris()\n", "idx = np.in1d(iris.target, [0, 1])\n", "X = iris.data[idx, :]\n", "y = iris.target[idx]\n", "df = pd.DataFrame(X, columns=iris.feature_names)\n", "df[\"y\"] = iris.target[idx]\n", "df[\"y_hat\"] = (df[\"sepal length (cm)\"] > 5.4).astype(int)\n", "df.tail()" ] }, { "cell_type": "code", "execution_count": null, "id": "competent-valley", "metadata": {}, "outputs": [], "source": [ "# 로그손실값 계산" ] }, { "cell_type": "code", "execution_count": 4, "id": "framed-access", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "3.799305383311686" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.metrics import log_loss\n", "\n", "log_loss(df[\"y\"], df[\"y_hat\"])" ] }, { "cell_type": "code", "execution_count": null, "id": "peaceful-ordinary", "metadata": {}, "outputs": [], "source": [ "# 가변길이 인코딩과 쿨백-라이블러 발산" ] }, { "cell_type": "code", "execution_count": 5, "id": "official-privacy", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'BAAACAAAADDCABAABABADDBDCABDADACAABABBBCACBBCCABBCDDAAAAABCAAAAAAABBBDACAADCDDAAAAAABABBAAAABAABBADAABAABDBBBCADABBBABAAABACBAABAAAAAACABCDADCDCBAAAAAAAAABBBCAACBBBBDADBADCAAACADAAAAABBAADACAACABBADAC'" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "N = 200\n", "p = [1/2, 1/4, 1/8, 1/8]\n", "doc0 = list(\"\".join([int(N * p[i]) * c for i, c in enumerate(\"ABCD\")]))\n", "np.random.shuffle(doc0)\n", "doc = \"\".join(doc0)\n", "doc" ] }, { "cell_type": "code", 
"execution_count": null, "id": "seven-governor", "metadata": {}, "outputs": [], "source": [ "# 글자의 확률분포" ] }, { "cell_type": "code", "execution_count": 6, "id": "located-member", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([0.25 , 0.5 , 0.125, 0.125])" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from collections import Counter\n", "\n", "p = np.array(list(Counter(doc).values())) / len(doc)\n", "p" ] }, { "cell_type": "code", "execution_count": null, "id": "emerging-camera", "metadata": {}, "outputs": [], "source": [ "# 엔트로피" ] }, { "cell_type": "code", "execution_count": 7, "id": "broke-peoples", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1.75" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sp.stats.entropy([1/2, 1/4, 1/8, 1/8], base=2)" ] }, { "cell_type": "code", "execution_count": null, "id": "naval-metropolitan", "metadata": {}, "outputs": [], "source": [ "# 실제로 인코딩된 글자수에서 확인" ] }, { "cell_type": "code", "execution_count": 8, "id": "interested-entrance", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1.75" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "vl_encoder = {\"A\": \"0\", \"B\": \"10\", \"C\": \"110\", \"D\": \"111\"}\n", "vl_encoded_doc = \"\".join([vl_encoder[c] for c in doc])\n", "len(vl_encoded_doc) / len(doc)" ] }, { "cell_type": "code", "execution_count": null, "id": "consistent-protein", "metadata": {}, "outputs": [], "source": [ "# 실제로 한글자당 인코딩된 글자수" ] }, { "cell_type": "code", "execution_count": 9, "id": "sharp-investigator", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2.0" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "encoder = {\"A\": \"00\", \"B\": \"01\", \"C\": \"10\", \"D\": \"11\"}\n", "encoded_doc = \"\".join([encoder[c] for c in doc])\n", "len(encoded_doc) / len(doc)" ] }, { "cell_type": 
"code", "execution_count": null, "id": "composite-refrigerator", "metadata": {}, "outputs": [], "source": [ "# 사이파이로 쿨백-라이블러 발산 계산" ] }, { "cell_type": "code", "execution_count": 10, "id": "british-pantyhose", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.24999999999999997" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sp.stats.entropy([1/2, 1/4, 1/8, 1/8], [1/4, 1/4, 1/4, 1/4], base=2)" ] }, { "cell_type": "code", "execution_count": null, "id": "unnecessary-muscle", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.8" } }, "nbformat": 4, "nbformat_minor": 5 }