In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!ls

drive  sample_data


In [3]:
%cd drive/MyDrive/비컴최

/content/drive/MyDrive/비컴최


In [4]:
import torch
print(torch.__version__)


2.0.1+cu118


In [5]:
import numpy as np
import tensorflow as tf
from tensorflow import keras

import string
import pandas as pd
import plotly.express as px
import tensorflow.data as tfd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.utils import class_weight
from tensorflow.keras import callbacks
from tensorflow.keras import Model, layers

In [45]:
#하이퍼 파라미터 수정
num_heads = 4
embed_dim = 256
ff_dim = 128
vocab_size = 10000
max_seq_len = 40

learning_rate = 0.003
epochs = 100
batch_size = 32


In [7]:
# Set up random seed for reproducibility
random_seed = 123
np.random.seed(random_seed)
tf.random.set_seed(random_seed)


In [30]:
data_frame = pd.read_csv("SPAM_text_message.csv")

data_frame.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [31]:
#데이터셋에서 가장 긴 메시지의 길이를 계산하고, 최대 시퀀스 길이를 출력합니다.
max_len = max([len(text) for text in data_frame.Message])
print(f"Maximum Length Of Input Sequence : {max_len}")

Maximum Length Of Input Sequence : 910


In [32]:
# 데이터셋에서 'Message' 열을 X로, 'Category' 열을 y로 추출합니다.
X = data_frame['Message'].tolist()
y = data_frame['Category'].tolist()


# 레이블 인코더를 초기화하고, y값을 숫자로 변환합니다.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

print(f'X[:5]: \n{X[:5]}\n')
print(f'y[:5]: {y[:5]}\n')
print(f"Label Mapping : {label_encoder.inverse_transform(y[:5])}")

X[:5]: 
['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...', 'Ok lar... Joking wif u oni...', "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's", 'U dun say so early hor... U c already then say...', "Nah I don't think he goes to usf, he lives around here though"]

y[:5]: [0 0 1 0 0]

Label Mapping : ['ham' 'ham' 'spam' 'ham' 'ham']


In [33]:
# 클래스의 불균형을 보정하기 위해 클래스 가중치를 계산합니다. 스팸과 비스팸 클래스 간의 비율을 고려하여 가중치를 계산하고 출력합니다.
class_weights = class_weight.compute_class_weight(class_weight='balanced', classes=data_frame.Category.unique(), y=label_encoder.inverse_transform(y))
class_weights = {number: weight for number, weight in enumerate(class_weights)}

print(f"Associated class weights: {class_weights}")

Associated class weights: {0: 0.5774093264248704, 1: 3.7295850066934406}


In [35]:
# Define a function to preprocess the text
def preprocess_text(text: str) -> str:
    """
    데이터 전처리 파트
    """
    # 구두점을 공백으로 대체
    text = tf.strings.regex_replace(text, f"[{string.punctuation}]", " ")
    
    # 소문자로변환
    text = tf.strings.lower(text)
    
    # 앞뒤 공백을 제거
    text = tf.strings.strip(text)
    
    return text
    

# TextVectorization layer 생성합니다.
text_vectorizer = layers.TextVectorization(
    max_tokens=vocab_size,                       # 최대 어휘 크기
    output_sequence_length=max_seq_len,          # 최대 시퀀스 길이
    standardize=preprocess_text,                 # 전처리 함수
    pad_to_max_tokens=True,                      # Pad sequences to maximum length
    output_mode='int'                            # integer-encoded sequences
)

# TextVectorization 레이어를 데이터에 적용하여 어휘를 생성하고, 어휘 사전을 구축합니다.
text_vectorizer.adapt(X)

In [36]:
# Data set size
N_SAMPLES = len(data_frame)

print(f"Total Number of Samples : {N_SAMPLES}")

Total Number of Samples : 5572


In [37]:
for _ in range(5):
    # Send a text to randomly.
    text_temp = X[np.random.randint(N_SAMPLES)]

    # vectorization을 적용.
    text_vec_temp = text_vectorizer(text_temp)

    # results
    print(f"Original Text: {text_temp}")
    print(f"Vectorized Text: {text_vec_temp}\n")

Original Text: What was she looking for?
Vectorized Text: [ 51  65  82 445  14   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0]

Original Text: What does the dance river do?
Vectorized Text: [  51  404    6 3946 3270   32    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0]

Original Text: K I'll be sure to get up before noon and see what's what
Vectorized Text: [ 99   2  57  39 192   3  36  48 210 977   8  89  51  20  51   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0]

Original Text: I don wake since. I checked that stuff and saw that its true no available spaces. Pls call the embassy or send a mail to them.
Vectorized Text: [   2   92  484  470    2 1622   18  290    8  519   18   68  456   42
  671 5190  109   17    6 

In [38]:
# 어휘 사전을 가져와서 토큰을 확인합니다
VOCAB = text_vectorizer.get_vocabulary()

print(f"Vocabulary size: {len(VOCAB)}")
print(f"Vocabulary: {VOCAB[150:200]}")

Vocabulary size: 8841
Vocabulary: ['number', 'message', 'e', 've', 'tomorrow', 'say', 'won', 'right', 'prize', 'already', 'after', 'said', 'ask', 'doing', 'cash', 'amp', '3', 'yeah', 'really', 'im', 'why', 'b', 'life', 'them', 'meet', 'find', 'very', 'miss', 'morning', 'let', 'babe', 'last', 'would', 'win', 'thanks', 'cos', 'anything', 'uk', 'lol', 'also', 'care', 'every', 'sure', 'pick', 'com', '150p', 'sent', 'nokia', 'keep', 'urgent']


In [40]:
# training and testing으로 데이터를 분리함.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42, shuffle=True)

# Apply the Text Vectorization
X_train = text_vectorizer(X_train)
X_test = text_vectorizer(X_test)

# One Hot Vectors
Xoh_train = tf.one_hot(X_train, depth=1000)
Xoh_test  = tf.one_hot(X_test, depth=1000)

In [41]:
class TokenAndPositionalEmbedding(layers.Layer):
    
    def __init__(self, embedding_dims, vocab_size, seq_len, **kwargs):
        super(TokenAndPositionalEmbedding, self).__init__(**kwargs)
        
        # Initialize parameters
        self.seq_len = seq_len # 시퀀스의 길이
        self.vocab_size = vocab_size # 어휘 사전의 크기
        self.embedding_dims = embedding_dims # 임베딩 벡터의 차원
        # 임베딩 벡터에 곱해지는 스케일 값을 계산하기 위함.
        self.embed_scale = tf.math.sqrt(tf.cast(embedding_dims, tf.float32))
        
        # Define layers
        # token 임베딩을 위한 임베딩 레이어, 어휘 사전의 크기와 임베딩 벡터의 차원을 입력으로 받음.
        self.token_embedding = layers.Embedding(
            input_dim=vocab_size, 
            output_dim=embedding_dims,
            name="token_embedding"
        )
        # 위칭 임베딩을 위한 임베딩 레이어, 시퀀스의 길이와 임베딩 벡터의 차원을 입력으로 받음 
        self.positional_embedding = layers.Embedding(
            input_dim=seq_len, 
            output_dim=embedding_dims,
            name="positional_embedding"
        )
    
    def call(self, inputs): # 순방향 전파 연산을 정의함. 입력으로 들어온 시퀀스에 대해 토큰 임베딩과 위치 임베딩을 계산하여 결합한 결과를 반환함.
        seq_len = tf.shape(inputs)[1]
        
        # Token Embedding
        token_embedding = self.token_embedding(inputs)
        token_embedding *= self.embed_scale
        
        # Positional Embedding
        positions = tf.range(start=0, limit=seq_len, delta=1)
        positional_embedding = self.positional_embedding(positions)
        
        # Add Token and Positional Embedding
        embeddings = token_embedding + positional_embedding
        
        return embeddings
        
    
    def get_config(self): # 층의 설정을 반환함. 모델을 저장하거나 load 할 때, 층의 구성을 저장하고 복원하는데 사용.
        config = super(TokenAndPositionalEmbedding, self).get_config()
        config.update({
            'embedding_dims': self.embedding_dims,
            'vocab_size': self.vocab_size,
            'seq_len': self.seq_len,
        })
        return config

In [42]:
temp_embeds = TokenAndPositionalEmbedding(embed_dim, vocab_size, max_seq_len)(X_train[:1])
temp_embeds


<tf.Tensor: shape=(1, 40, 256), dtype=float32, numpy=
array([[[-0.3880964 , -0.47288898, -0.24537694, ...,  0.40310794,
         -0.44804114, -0.29323986],
        [-0.40660465, -0.04319959, -0.03655719, ..., -0.52695304,
         -0.13978596, -0.2532973 ],
        [ 0.16900317,  0.70011663, -0.00110105, ...,  0.02513746,
          0.4199832 , -0.13565637],
        ...,
        [ 0.63882357, -0.31927913,  0.103112  , ..., -0.3371863 ,
         -0.11610547, -0.4128485 ],
        [ 0.7090536 , -0.3397284 ,  0.15972938, ..., -0.31687164,
         -0.1631437 , -0.37359652],
        [ 0.6810206 , -0.27514577,  0.16277978, ..., -0.30336446,
         -0.19594531, -0.3800351 ]]], dtype=float32)>

In [43]:
class TransformerLayer(layers.Layer):
    
    def __init__(self, num_heads: int, dropout_rate: float, embedding_dims: int, ff_dim: int, **kwargs):
        super(TransformerLayer, self).__init__(**kwargs)
        
        # Initialize Parameters
        self.num_heads = num_heads # layer head 개수
        self.dropout_rate = dropout_rate # drop out 비율
        self.embedding_dims = embedding_dims # 임베딩 차원
        self.ff_dim = ff_dim # feedforward 네트워크 차원
        
        # Initialize Layers
        # 레이어를 생성함. multi-head attetnion을 구현하는 부분. input을 세번 사용하여, query,key,value를 받음.
        self.mha = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dims, dropout=dropout_rate)
        # 어텐션의 정규화 및 연결을 담당하는 layer
        self.ln1 = layers.LayerNormalization(epsilon=1e-6)
        # feedforward 내트워크를 구성하는 Sequential model. 두개의 연결 layer로 구성되어있으며, 활성화함수로 ReLU를 사용함.
        self.ffn = keras.Sequential([
            layers.Dense(ff_dim, activation='relu', kernel_initializer='he_normal'),
            layers.Dense(embedding_dims)
        ])
        # feedforward 네트워크 출력과 정규화를 담당하는 레이어.
        self.ln2 = layers.LayerNormalization(epsilon=1e-6)
    
    def call(self, inputs): # 순반향 전파 연산을 정의함.

        # Multi-Head Attention
        attention = self.mha(inputs, inputs, inputs)
        
        # Layer Normalization and Residual Connection
        normalized1 = self.ln1(attention + inputs)
        
        # feedforward Network
        ffn_out = self.ffn(normalized1)
        
        # Layer Normalization and Residual Connection
        normalized2 = self.ln2(ffn_out + normalized1)
        
        return normalized2
    
    def get_config(self):

        config = super(TransformerLayer, self).get_config()
        config.update({
            "num_heads": self.num_heads,
            "dropout_rate": self.dropout_rate,
            "embedding_dims": self.embedding_dims,
            "ff_dim": self.ff_dim
        })
        return config

In [21]:
# Transformer layers execution
TransformerLayer(num_heads=num_heads, embedding_dims=embed_dim, ff_dim=ff_dim, dropout_rate=0.1)(temp_embeds)

<tf.Tensor: shape=(1, 40, 256), dtype=float32, numpy=
array([[[-0.5493694 , -0.23213398, -1.488012  , ..., -0.7851608 ,
          0.5061727 ,  0.4329842 ],
        [ 1.5696354 ,  1.0594529 , -2.4565094 , ...,  0.52035755,
          0.7263887 , -0.67543054],
        [-0.5230986 , -0.7380692 ,  0.04876898, ...,  1.7040778 ,
         -0.4524423 , -0.8817183 ],
        ...,
        [ 1.8601035 ,  1.100482  , -1.7476187 , ...,  1.6482563 ,
          0.15755595,  0.48714718],
        [ 1.6561365 ,  0.9935242 , -1.8066171 , ...,  1.4416547 ,
          0.1746852 ,  0.43709984],
        [ 1.7067384 ,  1.0015274 , -1.8400493 , ...,  1.5454199 ,
          0.12127464,  0.4311695 ]]], dtype=float32)>

In [20]:
# Input layer
InputLayer = layers.Input(shape=(max_seq_len,), name="InputLayer")

# Embedding Layer
embeddings = TokenAndPositionalEmbedding(embed_dim, vocab_size, max_seq_len, name="EmbeddingLayer")(InputLayer)

# Transformer Layer
encodings = TransformerLayer(num_heads=num_heads, embedding_dims=embed_dim, ff_dim=ff_dim, dropout_rate=0.1, name="TransformerLayer")(embeddings)

# Classifier
gap = layers.GlobalAveragePooling1D(name="GlobalAveragePooling")(encodings)
drop = layers.Dropout(0.5, name="Dropout")(gap)
OutputLayer = layers.Dense(1, activation='sigmoid', name="OutputLayer")(drop)

# Model
model = keras.Model(InputLayer, OutputLayer, name="TransformerNet")

# Model Architecture Summary
model.summary()

Model: "TransformerNet"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 InputLayer (InputLayer)     [(None, 40)]              0         
                                                                 
 EmbeddingLayer (TokenAndPos  (None, 40, 256)          2570240   
 itionalEmbedding)                                               
                                                                 
 TransformerLayer (Transform  (None, 40, 256)          1118848   
 erLayer)                                                        
                                                                 
 GlobalAveragePooling (Globa  (None, 256)              0         
 lAveragePooling1D)                                              
                                                                 
 Dropout (Dropout)           (None, 256)               0         
                                                    

In [46]:
# Model 컴파일
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=[
        keras.metrics.BinaryAccuracy(name='accuracy'),
        keras.metrics.Precision(name='precision'),
        keras.metrics.Recall(name='recall'),
        keras.metrics.AUC(name='auc'),
    ]
)

# Train Model
history = model.fit(
    X_train, y_train,
    validation_split=0.2,
    batch_size=batch_size,
    epochs=epochs,
    class_weight=class_weights
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [22]:
# Evaluate model performance on test data
loss, acc, precision, recall, auc = model.evaluate(X_test, y_test, verbose=0)

# model performance
print('Test loss      :', loss)
print('Test accuracy  :', acc*100)
print('Test precision :', precision*100)
print('Test recall    :', recall*100)
print('Test AUC       :', auc*100)

Test loss      : 0.20169119536876678
Test accuracy  : 97.75784611701965
Test precision : 93.05555820465088
Test recall    : 89.9328887462616
Test AUC       : 96.21215462684631


In [44]:
def decode_tokens(tokens):

    text = " ".join(VOCAB[int(token)] for token in tokens).strip()
    return text

In [24]:
for _ in range(10):
    # Randomly select a text from the testing data.
    index = np.random.randint(1,len(X_test))
    tokens = X_test[index-1:index]
    label = y_test[index]

    # Feed the tokens to the model
    print(f"\nModel Prediction\n{'-'*100}")
    proba = 1 if model.predict(tokens, verbose=0)[0][0]>0.5 else 0
    pred = label_encoder.inverse_transform([proba])
    print(f"Message: '{decode_tokens(tokens[0])}' | Prediction: {pred[0].title()} | True : {label_encoder.inverse_transform([label])[0].title()}\n")


Model Prediction
----------------------------------------------------------------------------------------------------
Message: 'well thats nice too bad i cant eat it' | Prediction: Ham | True : Ham


Model Prediction
----------------------------------------------------------------------------------------------------
Message: 'height of oh shit situation a guy throws a luv letter on a gal but falls on her brothers head whos a gay d' | Prediction: Ham | True : Ham


Model Prediction
----------------------------------------------------------------------------------------------------
Message: 'on the way to office da' | Prediction: Ham | True : Ham


Model Prediction
----------------------------------------------------------------------------------------------------
Message: 'i had a good time too its nice to do something a bit different with my weekends for a change see ya soon' | Prediction: Ham | True : Ham


Model Prediction
------------------------------------------------------------

In [27]:
# Custom Input
text = input("Enter your Msg: ")

# Convert into tokens
tokens = text_vectorizer([text])

# Feed the tokens to the model
print(f"\nModel Predictions\n{'-'*100}")
proba = 1 if model.predict(tokens, verbose=0)[0][0]>0.5 else 0
pred = label_encoder.inverse_transform([proba])
print(f"Message: '{text}' | Prediction: {pred[0].title()}")


Enter your Msg: This is the 2nd time we have tried 2 contact u. U have won the £750 Pound prize. 2 claim is easy, call 087187272008 NOW1! Only 10p per minute. BT-national-rate.

Model Predictions
----------------------------------------------------------------------------------------------------
Message: 'This is the 2nd time we have tried 2 contact u. U have won the £750 Pound prize. 2 claim is easy, call 087187272008 NOW1! Only 10p per minute. BT-national-rate.' | Prediction: Spam
