SPAM MESSAGE DETECTION

October 8, 2022

Dataset Cleaning:
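
The script below loads the SMS Spam Collection dataset from Kaggle, maps the labels ham and spam to 0 and 1, drops duplicate rows, strips punctuation and digits, lowercases and tokenizes the text, removes English stopwords, lemmatizes the remaining tokens, and writes the result to Cleaned_Dataset.csv.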

# importing required libraries
# http://www.pillalamarri.in/python/spam-message-detection/
import pandas as pd
import string
import nltk
import warnings
import re
warnings.filterwarnings("ignore")
nltk.download('stopwords')
nltk.download('wordnet')

# reading the dataset
# dataset: https://www.kaggle.com/uciml/sms-spam-collection-dataset
msg = pd.read_csv("./Message_Spam_Detection/dataset.csv", encoding='latin-1')
msg.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True)
msg.rename(columns={"v1": "label", "v2": "text"}, inplace=True)

# mapping ham=0 and spam=1
msg['label'] = msg['label'].map({'ham': 0, 'spam': 1})

# dropping duplicate rows
msg = msg.drop_duplicates()

# data cleaning/preprocessing - removing punctuation and digits
msg['cleaned_text'] = ""
for i in msg.index:
    updated_list = []
    for ch in msg['text'][i]:
        if ch not in string.punctuation and not ch.isdigit():
            updated_list.append(ch)
    msg.at[i, 'cleaned_text'] = "".join(updated_list)
msg.drop(['text'], axis=1, inplace=True)

# data cleaning/preprocessing - tokenization and conversion to lower case
msg['token'] = ""
for i in msg.index:
    msg.at[i, 'token'] = re.split(r"\W+", msg['cleaned_text'][i].lower())

# data cleaning/preprocessing - removing stopwords
msg['updated_token'] = ""
stopwords = nltk.corpus.stopwords.words('english')
for i in msg.index:
    msg.at[i, 'updated_token'] = [tok for tok in msg['token'][i]
                                  if tok not in stopwords]
msg.drop(['token'], axis=1, inplace=True)

# data cleaning/preprocessing - lemmatizing
msg['lem_text'] = ""
wordlem = nltk.WordNetLemmatizer()
for i in msg.index:
    msg.at[i, 'lem_text'] = [wordlem.lemmatize(tok)
                             for tok in msg['updated_token'][i]]
msg.drop(['updated_token'], axis=1, inplace=True)

# data cleaning/preprocessing - merging tokens back into a single string
msg['final_text'] = ""
for i in msg.index:
    msg.at[i, 'final_text'] = " ".join(msg['lem_text'][i])
msg.drop(['cleaned_text', 'lem_text'], axis=1, inplace=True)

# saving the cleaned dataset
msg.to_csv('Cleaned_Dataset.csv')
# http://www.pillalamarri.in/python/spam-message-detection/
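
For reference, the entire cleaning pipeline above can be condensed into a single function and applied with pandas. The sketch below is an equivalent refactoring of the same steps (punctuation/digit removal, lowercasing, tokenization, stopword removal, lemmatization); the function name clean_text is my own, not from the original script.

# sketch: the same cleaning pipeline as a single function
import re
import string
import nltk

stopwords = set(nltk.corpus.stopwords.words('english'))
wordlem = nltk.WordNetLemmatizer()

def clean_text(text):
    # strip punctuation and digits
    text = "".join(ch for ch in text
                   if ch not in string.punctuation and not ch.isdigit())
    # lowercase and tokenize, then drop stopwords and lemmatize
    tokens = re.split(r"\W+", text.lower())
    return " ".join(wordlem.lemmatize(tok) for tok in tokens
                    if tok not in stopwords)

# usage: msg['final_text'] = msg['text'].apply(clean_text)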

Spam Detection Code:
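
This second script turns the cleaned messages into features with a bag-of-words CountVectorizer capped at 5,000 terms, re-weights the counts with a TfidfTransformer, and trains a RandomForestClassifier on the full dataset. A message typed by the user is then run through the same cleaning steps and the same fitted vectorizers before prediction. It assumes the NLTK stopwords and WordNet data were already downloaded by the cleaning script.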

# importing required libraries
# http://www.pillalamarri.in/python/spam-message-detection/
import pandas as pd
import string
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
import warnings
import re
warnings.filterwarnings("ignore")

# reading the cleaned dataset
msg = pd.read_csv(
    "./Message_Spam_Detection/Cleaned_Dataset.csv", encoding='latin-1')
msg.drop(['Unnamed: 0'], axis=1, inplace=True)

# separating target and features
y = pd.DataFrame(msg.label)
x = msg.drop(['label'], axis=1)

# count vectorization followed by TF-IDF weighting
cv = CountVectorizer(max_features=5000)
temp1 = cv.fit_transform(x['final_text'].values.astype('U')).toarray()
tf = TfidfTransformer()
temp1 = tf.fit_transform(temp1)
temp1 = pd.DataFrame(temp1.toarray(), index=x.index)
x = pd.concat([x, temp1], axis=1, sort=False)
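# note: CountVectorizer followed by TfidfTransformer is equivalent to
# sklearn's TfidfVectorizer applied in a single step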

# drop final_text col
x.drop(['final_text'], axis=1, inplace=True)

# converting to int datatype
y = y.astype(int)

# RandomForestClassifier model
model = RandomForestClassifier(n_estimators=100, random_state=0)
model.fit(x, y.values.ravel())

# User input
text = input("Enter text: ")

# data cleaning/preprocessing - removing punctuation and digits
updated_text = ''
for ch in text:
    if ch not in string.punctuation and not ch.isdigit():
        updated_text = updated_text + ch
text = updated_text

# data cleaning/preprocessing - tokenization and conversion to lower case
text = re.split(r"\W+", text.lower())

# data cleaning/preprocessing - removing stopwords
stopwords = nltk.corpus.stopwords.words('english')
text = [tok for tok in text if tok not in stopwords]

# data cleaning/preprocessing - lemmatizing
wordlem = nltk.WordNetLemmatizer()
text = [wordlem.lemmatize(tok) for tok in text]

# data cleaning/preprocessing - merging tokens back into a single string
text = " ".join(text)

# vectorizing the user input and predicting
text = cv.transform([text])
text = tf.transform(text)
pred = model.predict(text)
if pred[0] == 0:
    print("Not Spam")
else:
    print("Spam")
# http://www.pillalamarri.in/python/spam-message-detection/
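
The script above trains on the full dataset and never reports an accuracy figure, and the fitted model is lost when the program exits. As a minimal sketch, a held-out evaluation and model persistence could look like the following; the 80/20 split, the eval_model name, and the spam_detector.pkl file name are my own choices, not from the original post.

# sketch: hold out a test set and report accuracy (assumed 80/20 split)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0, stratify=y)
eval_model = RandomForestClassifier(n_estimators=100, random_state=0)
eval_model.fit(x_train, y_train.values.ravel())
print("test accuracy:", accuracy_score(y_test, eval_model.predict(x_test)))

# sketch: persisting the fitted transformers and model with pickle
import pickle
with open("spam_detector.pkl", "wb") as f:
    pickle.dump({"cv": cv, "tf": tf, "model": model}, f)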