{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.13","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"none","dataSources":[{"sourceId":3494,"sourceType":"datasetVersion","datasetId":2050}],"dockerImageVersionId":30673,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":false}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n# For example, here are several helpful packages to load\n\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n\n# Input data files are available in the read-only \"../input/\" directory\n# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n\nimport os\nfor dirname, _, filenames in os.walk('/kaggle/input'):\n    for filename in filenames:\n        print(os.path.join(dirname, filename))\n\n# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current 
session","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","execution":{"iopub.status.busy":"2024-04-08T04:34:12.257659Z","iopub.execute_input":"2024-04-08T04:34:12.258439Z","iopub.status.idle":"2024-04-08T04:34:13.436477Z","shell.execute_reply.started":"2024-04-08T04:34:12.258399Z","shell.execute_reply":"2024-04-08T04:34:13.435083Z"},"trusted":true},"execution_count":1,"outputs":[{"name":"stdout","text":"/kaggle/input/spam-text-message-classification/SPAM text message 20170820 - Data.csv\n","output_type":"stream"}]},{"cell_type":"code","source":"df = pd.read_csv(\"/kaggle/input/spam-text-message-classification/SPAM text message 20170820 - Data.csv\")\ndf.head()","metadata":{"execution":{"iopub.status.busy":"2024-04-08T04:36:51.380500Z","iopub.execute_input":"2024-04-08T04:36:51.381669Z","iopub.status.idle":"2024-04-08T04:36:51.432817Z","shell.execute_reply.started":"2024-04-08T04:36:51.381631Z","shell.execute_reply":"2024-04-08T04:36:51.431188Z"},"trusted":true},"execution_count":2,"outputs":[{"execution_count":2,"output_type":"execute_result","data":{"text/plain":" Category Message\n0 ham Go until jurong point, crazy.. Available only ...\n1 ham Ok lar... Joking wif u oni...\n2 spam Free entry in 2 a wkly comp to win FA Cup fina...\n3 ham U dun say so early hor... U c already then say...\n4 ham Nah I don't think he goes to usf, he lives aro...","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
CategoryMessage
0hamGo until jurong point, crazy.. Available only ...
1hamOk lar... Joking wif u oni...
2spamFree entry in 2 a wkly comp to win FA Cup fina...
3hamU dun say so early hor... U c already then say...
4hamNah I don't think he goes to usf, he lives aro...
\n
"},"metadata":{}}]},{"cell_type":"code","source":"df['Category'].value_counts()","metadata":{"execution":{"iopub.status.busy":"2024-04-08T04:37:14.844345Z","iopub.execute_input":"2024-04-08T04:37:14.844747Z","iopub.status.idle":"2024-04-08T04:37:14.860461Z","shell.execute_reply.started":"2024-04-08T04:37:14.844716Z","shell.execute_reply":"2024-04-08T04:37:14.859522Z"},"trusted":true},"execution_count":3,"outputs":[{"execution_count":3,"output_type":"execute_result","data":{"text/plain":"Category\nham 4825\nspam 747\nName: count, dtype: int64"},"metadata":{}}]},{"cell_type":"code","source":"X = df['Message']\ny = df['Category']\nlen(X)","metadata":{"execution":{"iopub.status.busy":"2024-04-08T04:38:22.105311Z","iopub.execute_input":"2024-04-08T04:38:22.105700Z","iopub.status.idle":"2024-04-08T04:38:22.113297Z","shell.execute_reply.started":"2024-04-08T04:38:22.105670Z","shell.execute_reply":"2024-04-08T04:38:22.112143Z"},"trusted":true},"execution_count":5,"outputs":[{"execution_count":5,"output_type":"execute_result","data":{"text/plain":"5572"},"metadata":{}}]},{"cell_type":"code","source":"from sklearn.model_selection import train_test_split\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
random_state=22)","metadata":{"execution":{"iopub.status.busy":"2024-04-08T04:39:49.684198Z","iopub.execute_input":"2024-04-08T04:39:49.684583Z","iopub.status.idle":"2024-04-08T04:39:51.252500Z","shell.execute_reply.started":"2024-04-08T04:39:49.684555Z","shell.execute_reply":"2024-04-08T04:39:51.251275Z"},"trusted":true},"execution_count":6,"outputs":[]},{"cell_type":"code","source":"len(X_train)","metadata":{"execution":{"iopub.status.busy":"2024-04-08T04:40:27.246913Z","iopub.execute_input":"2024-04-08T04:40:27.247303Z","iopub.status.idle":"2024-04-08T04:40:27.254879Z","shell.execute_reply.started":"2024-04-08T04:40:27.247274Z","shell.execute_reply":"2024-04-08T04:40:27.253395Z"},"trusted":true},"execution_count":7,"outputs":[{"execution_count":7,"output_type":"execute_result","data":{"text/plain":"4457"},"metadata":{}}]},{"cell_type":"code","source":"from sklearn.pipeline import Pipeline\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.naive_bayes import MultinomialNB, ComplementNB\nfrom sklearn.svm import LinearSVC\nfrom sklearn.metrics import accuracy_score, classification_report","metadata":{"execution":{"iopub.status.busy":"2024-04-08T04:47:34.347399Z","iopub.execute_input":"2024-04-08T04:47:34.347760Z","iopub.status.idle":"2024-04-08T04:47:34.353701Z","shell.execute_reply.started":"2024-04-08T04:47:34.347733Z","shell.execute_reply":"2024-04-08T04:47:34.352296Z"},"trusted":true},"execution_count":18,"outputs":[]},{"cell_type":"code","source":"pipeMNB = Pipeline([('tfidf', TfidfVectorizer(stop_words='english', ngram_range=(1,3))), ('clf', MultinomialNB())])\npipeCNB = Pipeline([('tfidf', TfidfVectorizer(ngram_range=(1,3))), ('clf', ComplementNB())])\npipeSVC = Pipeline([('tfidf', TfidfVectorizer(ngram_range=(1,3))), ('clf', 
LinearSVC())])","metadata":{"execution":{"iopub.status.busy":"2024-04-08T04:52:24.603052Z","iopub.execute_input":"2024-04-08T04:52:24.603501Z","iopub.status.idle":"2024-04-08T04:52:24.624488Z","shell.execute_reply.started":"2024-04-08T04:52:24.603470Z","shell.execute_reply":"2024-04-08T04:52:24.623184Z"},"trusted":true},"execution_count":31,"outputs":[]},{"cell_type":"code","source":"pipeMNB.fit(X_train, y_train)\npredictMNB = pipeMNB.predict(X_test)\nprint(f\"MNB: {accuracy_score(y_test, predictMNB):.2f}\")\npipeCNB.fit(X_train, y_train)\npredictCNB = pipeCNB.predict(X_test)\nprint(f\"CNB: {accuracy_score(y_test, predictCNB):.2f}\")\npipeSVC.fit(X_train, y_train)\npredictSVC = pipeSVC.predict(X_test)\nprint(f\"SVC: {accuracy_score(y_test, predictSVC):.2f}\")","metadata":{"execution":{"iopub.status.busy":"2024-04-08T04:52:29.351761Z","iopub.execute_input":"2024-04-08T04:52:29.352170Z","iopub.status.idle":"2024-04-08T04:52:31.250027Z","shell.execute_reply.started":"2024-04-08T04:52:29.352138Z","shell.execute_reply":"2024-04-08T04:52:31.248843Z"},"trusted":true},"execution_count":32,"outputs":[{"name":"stdout","text":"MNB: 0.95\nCNB: 0.98\nSVC: 0.99\n","output_type":"stream"}]},{"cell_type":"code","source":"print(classification_report(y_test, predictSVC))","metadata":{"execution":{"iopub.status.busy":"2024-04-08T04:50:07.340093Z","iopub.execute_input":"2024-04-08T04:50:07.340505Z","iopub.status.idle":"2024-04-08T04:50:07.416987Z","shell.execute_reply.started":"2024-04-08T04:50:07.340473Z","shell.execute_reply":"2024-04-08T04:50:07.415855Z"},"trusted":true},"execution_count":25,"outputs":[{"name":"stdout","text":" precision recall f1-score support\n\n ham 0.98 1.00 0.99 965\n spam 0.99 0.90 0.94 150\n\n accuracy 0.99 1115\n macro avg 0.99 0.95 0.97 1115\nweighted avg 0.99 0.99 0.99 1115\n\n","output_type":"stream"}]},{"cell_type":"code","source":"msg = \"you have won a $10000 prize! 
contact us for eh reward!\"\nclsf = pipeSVC.predict([msg])\nprint(clsf[0])","metadata":{"execution":{"iopub.status.busy":"2024-04-08T04:54:56.006597Z","iopub.execute_input":"2024-04-08T04:54:56.007014Z","iopub.status.idle":"2024-04-08T04:54:56.016088Z","shell.execute_reply.started":"2024-04-08T04:54:56.006982Z","shell.execute_reply":"2024-04-08T04:54:56.014879Z"},"trusted":true},"execution_count":34,"outputs":[{"name":"stdout","text":"spam\n","output_type":"stream"}]}]}